Diffstat (limited to 'net/sched')
-rw-r--r--  net/sched/Kconfig          |   91
-rw-r--r--  net/sched/Makefile         |    7
-rw-r--r--  net/sched/act_api.c        |   57
-rw-r--r--  net/sched/act_connmark.c   |  107
-rw-r--r--  net/sched/act_ct.c         |  141
-rw-r--r--  net/sched/act_gate.c       |   30
-rw-r--r--  net/sched/act_mirred.c     |   23
-rw-r--r--  net/sched/act_nat.c        |   72
-rw-r--r--  net/sched/act_pedit.c      |  300
-rw-r--r--  net/sched/cls_api.c        |  304
-rw-r--r--  net/sched/cls_flower.c     |   80
-rw-r--r--  net/sched/cls_matchall.c   |    6
-rw-r--r--  net/sched/cls_rsvp.c       |   26
-rw-r--r--  net/sched/cls_rsvp.h       |  764
-rw-r--r--  net/sched/cls_rsvp6.c      |   26
-rw-r--r--  net/sched/cls_tcindex.c    |  742
-rw-r--r--  net/sched/sch_api.c        |   87
-rw-r--r--  net/sched/sch_atm.c        |  706
-rw-r--r--  net/sched/sch_cake.c       |    2
-rw-r--r--  net/sched/sch_cbq.c        | 1727
-rw-r--r--  net/sched/sch_dsmark.c     |  518
-rw-r--r--  net/sched/sch_mqprio.c     |  291
-rw-r--r--  net/sched/sch_mqprio_lib.c |  117
-rw-r--r--  net/sched/sch_mqprio_lib.h |   18
-rw-r--r--  net/sched/sch_taprio.c     |  745
25 files changed, 1556 insertions, 5431 deletions
diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 777d6b50505c..4b95cb1ac435 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -45,23 +45,6 @@ if NET_SCHED comment "Queueing/Scheduling" -config NET_SCH_CBQ - tristate "Class Based Queueing (CBQ)" - help - Say Y here if you want to use the Class-Based Queueing (CBQ) packet - scheduling algorithm. This algorithm classifies the waiting packets - into a tree-like hierarchy of classes; the leaves of this tree are - in turn scheduled by separate algorithms. - - See the top of <file:net/sched/sch_cbq.c> for more details. - - CBQ is a commonly used scheduler, so if you're unsure, you should - say Y here. Then say Y to all the queueing algorithms below that you - want to use as leaf disciplines. - - To compile this code as a module, choose M here: the - module will be called sch_cbq. - config NET_SCH_HTB tristate "Hierarchical Token Bucket (HTB)" help @@ -85,20 +68,6 @@ config NET_SCH_HFSC To compile this code as a module, choose M here: the module will be called sch_hfsc. -config NET_SCH_ATM - tristate "ATM Virtual Circuits (ATM)" - depends on ATM - help - Say Y here if you want to use the ATM pseudo-scheduler. This - provides a framework for invoking classifiers, which in turn - select classes of this queuing discipline. Each class maps - the flow(s) it is handling to a given virtual circuit. - - See the top of <file:net/sched/sch_atm.c> for more details. - - To compile this code as a module, choose M here: the - module will be called sch_atm. - config NET_SCH_PRIO tristate "Multi Band Priority Queueing (PRIO)" help @@ -195,8 +164,14 @@ config NET_SCH_ETF To compile this code as a module, choose M here: the module will be called sch_etf. +config NET_SCH_MQPRIO_LIB + tristate + help + Common library for manipulating mqprio queue configurations. + config NET_SCH_TAPRIO tristate "Time Aware Priority (taprio) Scheduler" + select NET_SCH_MQPRIO_LIB help Say Y here if you want to use the Time Aware Priority (taprio) packet scheduling algorithm. @@ -217,17 +192,6 @@ config NET_SCH_GRED To compile this code as a module, choose M here: the module will be called sch_gred. -config NET_SCH_DSMARK - tristate "Differentiated Services marker (DSMARK)" - help - Say Y if you want to schedule packets according to the - Differentiated Services architecture proposed in RFC 2475. - Technical information on this method, with pointers to associated - RFCs, is available at <http://www.gta.ufrj.br/diffserv/>. - - To compile this code as a module, choose M here: the - module will be called sch_dsmark. - config NET_SCH_NETEM tristate "Network emulator (NETEM)" help @@ -253,6 +217,7 @@ config NET_SCH_DRR config NET_SCH_MQPRIO tristate "Multi-queue priority scheduler (MQPRIO)" + select NET_SCH_MQPRIO_LIB help Say Y here if you want to use the Multi-queue Priority scheduler. This scheduler allows QOS to be offloaded on NICs that have support @@ -337,7 +302,7 @@ config NET_SCH_FQ Say Y here if you want to use the FQ packet scheduling algorithm. FQ does flow separation, and is able to respect pacing requirements - set by TCP stack into sk->sk_pacing_rate (for localy generated + set by TCP stack into sk->sk_pacing_rate (for locally generated traffic) To compile this driver as a module, choose M here: the module @@ -503,17 +468,6 @@ config NET_CLS_BASIC To compile this code as a module, choose M here: the module will be called cls_basic. 
-config NET_CLS_TCINDEX - tristate "Traffic-Control Index (TCINDEX)" - select NET_CLS - help - Say Y here if you want to be able to classify packets based on - traffic control indices. You will want this feature if you want - to implement Differentiated Services together with DSMARK. - - To compile this code as a module, choose M here: the - module will be called cls_tcindex. - config NET_CLS_ROUTE4 tristate "Routing decision (ROUTE)" depends on INET @@ -559,34 +513,6 @@ config CLS_U32_MARK help Say Y here to be able to use netfilter marks as u32 key. -config NET_CLS_RSVP - tristate "IPv4 Resource Reservation Protocol (RSVP)" - select NET_CLS - help - The Resource Reservation Protocol (RSVP) permits end systems to - request a minimum and maximum data flow rate for a connection; this - is important for real time data such as streaming sound or video. - - Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests. - - To compile this code as a module, choose M here: the - module will be called cls_rsvp. - -config NET_CLS_RSVP6 - tristate "IPv6 Resource Reservation Protocol (RSVP6)" - select NET_CLS - help - The Resource Reservation Protocol (RSVP) permits end systems to - request a minimum and maximum data flow rate for a connection; this - is important for real time data such as streaming sound or video. - - Say Y here if you want to be able to classify outgoing packets based - on their RSVP requests and you are using the IPv6 protocol. - - To compile this code as a module, choose M here: the - module will be called cls_rsvp6. - config NET_CLS_FLOW tristate "Flow classifier" select NET_CLS @@ -977,6 +903,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE + select NF_CONNTRACK_OVS select NF_NAT_OVS if NF_NAT help Say Y here to allow sending the packets to conntrack module. 
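The only new Kconfig symbol in this hunk, NET_SCH_MQPRIO_LIB, deliberately has no prompt: it is built only when another option selects it, which both NET_SCH_MQPRIO and NET_SCH_TAPRIO now do, so the shared mqprio queue-configuration handling lives in one module instead of being duplicated. As a rough idea of the kind of helper such a shared library exposes, here is a minimal sketch of a qopt validation routine; the function name and the exact checks are assumptions made for illustration, not copied from sch_mqprio_lib.c.

/* Illustrative only: the sort of shared validation helper a common
 * mqprio library provides.  Name and checks are assumed for the sake
 * of the example, not taken from sch_mqprio_lib.c.
 */
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>

static int example_mqprio_validate_qopt(struct net_device *dev,
					const struct tc_mqprio_qopt *qopt)
{
	int i;

	if (qopt->num_tc > TC_QOPT_MAX_QUEUE)
		return -EINVAL;

	/* Every traffic class must map to a non-empty queue range that
	 * fits inside the device's real TX queue count.
	 */
	for (i = 0; i < qopt->num_tc; i++) {
		unsigned int last = qopt->offset[i] + qopt->count[i];

		if (!qopt->count[i] || last > dev->real_num_tx_queues)
			return -EINVAL;
	}

	return 0;
}

Both qdiscs then link against sch_mqprio_lib.o, which is what the Makefile hunk below adds.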
diff --git a/net/sched/Makefile b/net/sched/Makefile index dd14ef413fda..b5fd49641d91 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -33,25 +33,23 @@ obj-$(CONFIG_NET_ACT_TUNNEL_KEY)+= act_tunnel_key.o obj-$(CONFIG_NET_ACT_CT) += act_ct.o obj-$(CONFIG_NET_ACT_GATE) += act_gate.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o -obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o obj-$(CONFIG_NET_SCH_RED) += sch_red.o obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o -obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o -obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o obj-$(CONFIG_NET_SCH_ETS) += sch_ets.o obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o +obj-$(CONFIG_NET_SCH_MQPRIO_LIB) += sch_mqprio_lib.o obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o @@ -69,9 +67,6 @@ obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o obj-$(CONFIG_NET_CLS_FW) += cls_fw.o -obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o -obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o -obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o obj-$(CONFIG_NET_CLS_FLOW) += cls_flow.o obj-$(CONFIG_NET_CLS_CGROUP) += cls_cgroup.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 5b3c0ac495be..fce522886099 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -125,7 +125,7 @@ static void free_tcf(struct tc_action *p) free_percpu(p->cpu_bstats_hw); free_percpu(p->cpu_qstats); - tcf_set_action_cookie(&p->act_cookie, NULL); + tcf_set_action_cookie(&p->user_cookie, NULL); if (chain) tcf_chain_put_by_act(chain); @@ -169,11 +169,6 @@ static bool tc_act_skip_sw(u32 flags) return (flags & TCA_ACT_FLAGS_SKIP_SW) ? true : false; } -static bool tc_act_in_hw(struct tc_action *act) -{ - return !!act->in_hw_count; -} - /* SKIP_HW and SKIP_SW are mutually exclusive flags. 
*/ static bool tc_act_flags_valid(u32 flags) { @@ -192,6 +187,7 @@ static int offload_action_init(struct flow_offload_action *fl_action, fl_action->extack = extack; fl_action->command = cmd; fl_action->index = act->tcfa_index; + fl_action->cookie = (unsigned long)act; if (act->ops->offload_act_setup) { spin_lock_bh(&act->tcfa_lock); @@ -272,7 +268,7 @@ static int tcf_action_offload_add_ex(struct tc_action *action, if (err) goto fl_err; - err = tc_setup_action(&fl_action->action, actions, extack); + err = tc_setup_action(&fl_action->action, actions, 0, extack); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to setup tc actions for offload"); @@ -307,9 +303,6 @@ int tcf_action_update_hw_stats(struct tc_action *action) struct flow_offload_action fl_act = {}; int err; - if (!tc_act_in_hw(action)) - return -EOPNOTSUPP; - err = offload_action_init(&fl_act, action, FLOW_ACT_STATS, NULL); if (err) return err; @@ -438,14 +431,14 @@ EXPORT_SYMBOL(tcf_idr_release); static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { - struct tc_cookie *act_cookie; + struct tc_cookie *user_cookie; u32 cookie_len = 0; rcu_read_lock(); - act_cookie = rcu_dereference(act->act_cookie); + user_cookie = rcu_dereference(act->user_cookie); - if (act_cookie) - cookie_len = nla_total_size(act_cookie->len); + if (user_cookie) + cookie_len = nla_total_size(user_cookie->len); rcu_read_unlock(); return nla_total_size(0) /* action number nested */ @@ -495,7 +488,7 @@ tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a, bool from_act) goto nla_put_failure; rcu_read_lock(); - cookie = rcu_dereference(a->act_cookie); + cookie = rcu_dereference(a->user_cookie); if (cookie) { if (nla_put(skb, TCA_ACT_COOKIE, cookie->len, cookie->data)) { rcu_read_unlock(); @@ -539,6 +532,8 @@ static int tcf_dump_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, (unsigned long)p->tcfa_tm.lastuse)) continue; + tcf_action_update_hw_stats(p); + nest = nla_nest_start_noflag(skb, n_i); if (!nest) { index--; @@ -1367,9 +1362,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, { bool police = flags & TCA_ACT_FLAGS_POLICE; struct nla_bitfield32 userflags = { 0, 0 }; + struct tc_cookie *user_cookie = NULL; u8 hw_stats = TCA_ACT_HW_STATS_ANY; struct nlattr *tb[TCA_ACT_MAX + 1]; - struct tc_cookie *cookie = NULL; struct tc_action *a; int err; @@ -1380,8 +1375,8 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, if (err < 0) return ERR_PTR(err); if (tb[TCA_ACT_COOKIE]) { - cookie = nla_memdup_cookie(tb); - if (!cookie) { + user_cookie = nla_memdup_cookie(tb); + if (!user_cookie) { NL_SET_ERR_MSG(extack, "No memory to generate TC cookie"); err = -ENOMEM; goto err_out; @@ -1407,7 +1402,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, *init_res = err; if (!police && tb[TCA_ACT_COOKIE]) - tcf_set_action_cookie(&a->act_cookie, cookie); + tcf_set_action_cookie(&a->user_cookie, user_cookie); if (!police) a->hw_stats = hw_stats; @@ -1415,9 +1410,9 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, return a; err_out: - if (cookie) { - kfree(cookie->data); - kfree(cookie); + if (user_cookie) { + kfree(user_cookie->data); + kfree(user_cookie); } return ERR_PTR(err); } @@ -1539,9 +1534,6 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (p == NULL) goto errout; - /* update hw stats for this action */ - tcf_action_update_hw_stats(p); - /* compat_mode being true specifies a call that is supposed * to add 
additional backward compatibility statistic TLVs. */ @@ -1582,7 +1574,7 @@ errout: static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], u32 portid, u32 seq, u16 flags, int event, int bind, - int ref) + int ref, struct netlink_ext_ack *extack) { struct tcamsg *t; struct nlmsghdr *nlh; @@ -1606,7 +1598,12 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[], nla_nest_end(skb, nest); + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1625,7 +1622,7 @@ tcf_get_notify(struct net *net, u32 portid, struct nlmsghdr *n, if (!skb) return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, event, - 0, 1) <= 0) { + 0, 1, NULL) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; @@ -1799,7 +1796,7 @@ tcf_reoffload_del_notify(struct net *net, struct tc_action *action) if (!skb) return -ENOBUFS; - if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1) <= 0) { + if (tca_get_fill(skb, actions, 0, 0, 0, RTM_DELACTION, 0, 1, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1886,7 +1883,7 @@ tcf_del_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, 0, RTM_DELACTION, - 0, 2) <= 0) { + 0, 2, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink TC action attributes"); kfree_skb(skb); return -EINVAL; @@ -1965,7 +1962,7 @@ tcf_add_notify(struct net *net, struct nlmsghdr *n, struct tc_action *actions[], return -ENOBUFS; if (tca_get_fill(skb, actions, portid, n->nlmsg_seq, n->nlmsg_flags, - RTM_NEWACTION, 0, 0) <= 0) { + RTM_NEWACTION, 0, 0, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to fill netlink attributes while adding TC action"); kfree_skb(skb); return -EINVAL; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 7e63ff7e3ed7..8dabfb52ea3d 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -36,13 +36,15 @@ TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb, struct nf_conntrack_tuple tuple; enum ip_conntrack_info ctinfo; struct tcf_connmark_info *ca = to_connmark(a); + struct tcf_connmark_parms *parms; struct nf_conntrack_zone zone; struct nf_conn *c; int proto; - spin_lock(&ca->tcf_lock); tcf_lastuse_update(&ca->tcf_tm); - bstats_update(&ca->tcf_bstats, skb); + tcf_action_update_bstats(&ca->common, skb); + + parms = rcu_dereference_bh(ca->parms); switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): @@ -64,31 +66,29 @@ TC_INDIRECT_SCOPE int tcf_connmark_act(struct sk_buff *skb, c = nf_ct_get(skb, &ctinfo); if (c) { skb->mark = READ_ONCE(c->mark); - /* using overlimits stats to count how many packets marked */ - ca->tcf_qstats.overlimits++; - goto out; + goto count; } - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - proto, ca->net, &tuple)) + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, parms->net, + &tuple)) goto out; - zone.id = ca->zone; + zone.id = parms->zone; zone.dir = NF_CT_DEFAULT_ZONE_DIR; - thash = nf_conntrack_find_get(ca->net, &zone, &tuple); + thash = nf_conntrack_find_get(parms->net, &zone, &tuple); if (!thash) goto out; c = nf_ct_tuplehash_to_ctrack(thash); - /* using overlimits stats to count how many packets marked */ - ca->tcf_qstats.overlimits++; skb->mark = READ_ONCE(c->mark); nf_ct_put(c); +count: + /* using overlimits stats to count 
how many packets marked */ + tcf_action_inc_overlimit_qstats(&ca->common); out: - spin_unlock(&ca->tcf_lock); - return ca->tcf_action; + return READ_ONCE(ca->tcf_action); } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { @@ -101,6 +101,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); + struct tcf_connmark_parms *nparms, *oparms; struct nlattr *tb[TCA_CONNMARK_MAX + 1]; bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; @@ -120,52 +121,66 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!tb[TCA_CONNMARK_PARMS]) return -EINVAL; + nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + if (!nparms) + return -ENOMEM; + parm = nla_data(tb[TCA_CONNMARK_PARMS]); index = parm->index; ret = tcf_idr_check_alloc(tn, &index, a, bind); if (!ret) { - ret = tcf_idr_create(tn, index, est, a, - &act_connmark_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_connmark_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); - return ret; + err = ret; + goto out_free; } ci = to_connmark(*a); - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, - extack); - if (err < 0) - goto release_idr; - tcf_action_set_ctrlact(*a, parm->action, goto_ch); - ci->net = net; - ci->zone = parm->zone; + + nparms->net = net; + nparms->zone = parm->zone; ret = ACT_P_CREATED; } else if (ret > 0) { ci = to_connmark(*a); - if (bind) - return 0; - if (!(flags & TCA_ACT_FLAGS_REPLACE)) { - tcf_idr_release(*a, bind); - return -EEXIST; + if (bind) { + err = 0; + goto out_free; } - err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, - extack); - if (err < 0) + if (!(flags & TCA_ACT_FLAGS_REPLACE)) { + err = -EEXIST; goto release_idr; - /* replacing action and zone */ - spin_lock_bh(&ci->tcf_lock); - goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); - ci->zone = parm->zone; - spin_unlock_bh(&ci->tcf_lock); - if (goto_ch) - tcf_chain_put_by_act(goto_ch); + } + + nparms->net = rtnl_dereference(ci->parms)->net; + nparms->zone = parm->zone; + ret = 0; } + err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); + if (err < 0) + goto release_idr; + + spin_lock_bh(&ci->tcf_lock); + goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); + spin_unlock_bh(&ci->tcf_lock); + + if (goto_ch) + tcf_chain_put_by_act(goto_ch); + + if (oparms) + kfree_rcu(oparms, rcu); + return ret; + release_idr: tcf_idr_release(*a, bind); +out_free: + kfree(nparms); return err; } @@ -179,11 +194,14 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; + struct tcf_connmark_parms *parms; struct tcf_t t; spin_lock_bh(&ci->tcf_lock); + parms = rcu_dereference_protected(ci->parms, lockdep_is_held(&ci->tcf_lock)); + opt.action = ci->tcf_action; - opt.zone = ci->zone; + opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -201,6 +219,16 @@ nla_put_failure: return -1; } +static void tcf_connmark_cleanup(struct tc_action *a) +{ + struct tcf_connmark_info *ci = to_connmark(a); + struct tcf_connmark_parms *parms; + + parms = rcu_dereference_protected(ci->parms, 1); + if (parms) + kfree_rcu(parms, rcu); +} + static struct tc_action_ops act_connmark_ops = { 
.kind = "connmark", .id = TCA_ID_CONNMARK, @@ -208,6 +236,7 @@ static struct tc_action_ops act_connmark_ops = { .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, + .cleanup = tcf_connmark_cleanup, .size = sizeof(struct tcf_connmark_info), }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 0ca2bb8ed026..9cc0bc7c71ed 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -170,11 +170,11 @@ tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple, static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, enum ip_conntrack_dir dir, + enum ip_conntrack_info ctinfo, struct flow_action *action) { struct nf_conn_labels *ct_labels; struct flow_action_entry *entry; - enum ip_conntrack_info ctinfo; u32 *act_ct_labels; entry = tcf_ct_flow_table_flow_action_get_next(action); @@ -182,8 +182,6 @@ static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct, #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) entry->ct_metadata.mark = READ_ONCE(ct->mark); #endif - ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED : - IP_CT_ESTABLISHED_REPLY; /* aligns with the CT reference on the SKB nf_ct_set */ entry->ct_metadata.cookie = (unsigned long)ct | ctinfo; entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL; @@ -237,22 +235,28 @@ static int tcf_ct_flow_table_add_action_nat(struct net *net, } static int tcf_ct_flow_table_fill_actions(struct net *net, - const struct flow_offload *flow, + struct flow_offload *flow, enum flow_offload_tuple_dir tdir, struct nf_flow_rule *flow_rule) { struct flow_action *action = &flow_rule->rule->action; int num_entries = action->num_entries; struct nf_conn *ct = flow->ct; + enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; int i, err; switch (tdir) { case FLOW_OFFLOAD_DIR_ORIGINAL: dir = IP_CT_DIR_ORIGINAL; + ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 
+ IP_CT_ESTABLISHED : IP_CT_NEW; + if (ctinfo == IP_CT_ESTABLISHED) + set_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags); break; case FLOW_OFFLOAD_DIR_REPLY: dir = IP_CT_DIR_REPLY; + ctinfo = IP_CT_ESTABLISHED_REPLY; break; default: return -EOPNOTSUPP; @@ -262,7 +266,7 @@ static int tcf_ct_flow_table_fill_actions(struct net *net, if (err) goto err_nat; - tcf_ct_flow_table_add_action_meta(ct, dir, action); + tcf_ct_flow_table_add_action_meta(ct, dir, ctinfo, action); return 0; err_nat: @@ -365,7 +369,7 @@ static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry, static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, - bool tcp) + bool tcp, bool bidirectional) { struct nf_conn_act_ct_ext *act_ct_ext; struct flow_offload *entry; @@ -384,6 +388,8 @@ static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft, ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } + if (bidirectional) + __set_bit(NF_FLOW_HW_BIDIRECTIONAL, &entry->flags); act_ct_ext = nf_conn_act_ct_ext_find(ct); if (act_ct_ext) { @@ -407,26 +413,34 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - bool tcp = false; - - if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) || - !test_bit(IPS_ASSURED_BIT, &ct->status)) - return; + bool tcp = false, bidirectional = true; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: - tcp = true; - if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) + if ((ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) || + !test_bit(IPS_ASSURED_BIT, &ct->status) || + ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) return; + + tcp = true; break; case IPPROTO_UDP: + if (!nf_ct_is_confirmed(ct)) + return; + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) + bidirectional = false; break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: { struct nf_conntrack_tuple *tuple; - if (ct->status & IPS_NAT_MASK) + if ((ctinfo != IP_CT_ESTABLISHED && + ctinfo != IP_CT_ESTABLISHED_REPLY) || + !test_bit(IPS_ASSURED_BIT, &ct->status) || + ct->status & IPS_NAT_MASK) return; + tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; /* No support for GRE v1 */ if (tuple->src.u.gre.key || tuple->dst.u.gre.key) @@ -442,7 +456,7 @@ static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft, ct->status & IPS_SEQ_ADJUST) return; - tcf_ct_flow_table_add(ct_ft, ct, tcp); + tcf_ct_flow_table_add(ct_ft, ct, tcp, bidirectional); } static bool @@ -621,13 +635,30 @@ static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); ct = flow->ct; + if (dir == FLOW_OFFLOAD_DIR_REPLY && + !test_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags)) { + /* Only offload reply direction after connection became + * assured. + */ + if (test_bit(IPS_ASSURED_BIT, &ct->status)) + set_bit(NF_FLOW_HW_BIDIRECTIONAL, &flow->flags); + else if (test_bit(NF_FLOW_HW_ESTABLISHED, &flow->flags)) + /* If flow_table flow has already been updated to the + * established state, then don't refresh. + */ + return false; + } + if (tcph && (unlikely(tcph->fin || tcph->rst))) { flow_offload_teardown(flow); return false; } - ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED : - IP_CT_ESTABLISHED_REPLY; + if (dir == FLOW_OFFLOAD_DIR_ORIGINAL) + ctinfo = test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ? 
+ IP_CT_ESTABLISHED : IP_CT_NEW; + else + ctinfo = IP_CT_ESTABLISHED_REPLY; flow_offload_refresh(nf_ft, flow); nf_conntrack_get(&ct->ct_general); @@ -695,31 +726,6 @@ drop_ct: return false; } -/* Trim the skb to the length specified by the IP/IPv6 header, - * removing any trailing lower-layer padding. This prepares the skb - * for higher-layer processing that assumes skb->len excludes padding - * (such as nf_ip_checksum). The caller needs to pull the skb to the - * network header, and ensure ip_hdr/ipv6_hdr points to valid data. - */ -static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) -{ - unsigned int len; - - switch (family) { - case NFPROTO_IPV4: - len = ntohs(ip_hdr(skb)->tot_len); - break; - case NFPROTO_IPV6: - len = sizeof(struct ipv6hdr) - + ntohs(ipv6_hdr(skb)->payload_len); - break; - default: - len = skb->len; - } - - return pskb_trim_rcsum(skb, len); -} - static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) { u8 family = NFPROTO_UNSPEC; @@ -779,6 +785,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, struct nf_conn *ct; int err = 0; bool frag; + u8 proto; u16 mru; /* Previously seen (loopback)? Ignore. */ @@ -794,50 +801,14 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb, return err; skb_get(skb); - mru = tc_skb_cb(skb)->mru; - - if (family == NFPROTO_IPV4) { - enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone; - - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - local_bh_disable(); - err = ip_defrag(net, skb, user); - local_bh_enable(); - if (err && err != -EINPROGRESS) - return err; - - if (!err) { - *defrag = true; - mru = IPCB(skb)->frag_max_size; - } - } else { /* NFPROTO_IPV6 */ -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6) - enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone; - - memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm)); - err = nf_ct_frag6_gather(net, skb, user); - if (err && err != -EINPROGRESS) - goto out_free; - - if (!err) { - *defrag = true; - mru = IP6CB(skb)->frag_max_size; - } -#else - err = -EOPNOTSUPP; - goto out_free; -#endif - } + err = nf_ct_handle_fragments(net, skb, zone, family, &proto, &mru); + if (err) + return err; - if (err != -EINPROGRESS) - tc_skb_cb(skb)->mru = mru; - skb_clear_hash(skb); - skb->ignore_df = 1; - return err; + *defrag = true; + tc_skb_cb(skb)->mru = mru; -out_free: - kfree_skb(skb); - return err; + return 0; } static void tcf_ct_params_free(struct tcf_ct_params *params) @@ -980,7 +951,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, if (err) goto drop; - err = tcf_ct_skb_network_trim(skb, family); + err = nf_ct_skb_network_trim(skb, family); if (err) goto drop; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index 9b8def0be41e..c9a811f4c7ee 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -119,35 +119,37 @@ TC_INDIRECT_SCOPE int tcf_gate_act(struct sk_buff *skb, struct tcf_result *res) { struct tcf_gate *gact = to_gate(a); - - spin_lock(&gact->tcf_lock); + int action = READ_ONCE(gact->tcf_action); tcf_lastuse_update(&gact->tcf_tm); - bstats_update(&gact->tcf_bstats, skb); + tcf_action_update_bstats(&gact->common, skb); + spin_lock(&gact->tcf_lock); if (unlikely(gact->current_gate_status & GATE_ACT_PENDING)) { spin_unlock(&gact->tcf_lock); - return gact->tcf_action; + return action; } - if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) + if (!(gact->current_gate_status & GATE_ACT_GATE_OPEN)) { + spin_unlock(&gact->tcf_lock); goto drop; + } if (gact->current_max_octets >= 
0) { gact->current_entry_octets += qdisc_pkt_len(skb); if (gact->current_entry_octets > gact->current_max_octets) { - gact->tcf_qstats.overlimits++; - goto drop; + spin_unlock(&gact->tcf_lock); + goto overlimit; } } - spin_unlock(&gact->tcf_lock); - return gact->tcf_action; -drop: - gact->tcf_qstats.drops++; - spin_unlock(&gact->tcf_lock); + return action; +overlimit: + tcf_action_inc_overlimit_qstats(&gact->common); +drop: + tcf_action_inc_drop_qstats(&gact->common); return TC_ACT_SHOT; } @@ -357,8 +359,8 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, return 0; if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_gate_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_gate_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7284bcea7b0b..8037ec9b1d31 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,8 +29,8 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_RECURSION_LIMIT 4 -static DEFINE_PER_CPU(unsigned int, mirred_rec_level); +#define MIRRED_NEST_LIMIT 4 +static DEFINE_PER_CPU(unsigned int, mirred_nest_level); static bool tcf_mirred_is_act_redirect(int action) { @@ -206,12 +206,19 @@ release_idr: return err; } +static bool is_mirred_nested(void) +{ + return unlikely(__this_cpu_read(mirred_nest_level) > 1); +} + static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb) { int err; if (!want_ingress) err = tcf_dev_queue_xmit(skb, dev_queue_xmit); + else if (is_mirred_nested()) + err = netif_rx(skb); else err = netif_receive_skb(skb); @@ -226,7 +233,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, struct sk_buff *skb2 = skb; bool m_mac_header_xmit; struct net_device *dev; - unsigned int rec_level; + unsigned int nest_level; int retval, err = 0; bool use_reinsert; bool want_ingress; @@ -237,11 +244,11 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, int mac_len; bool at_nh; - rec_level = __this_cpu_inc_return(mirred_rec_level); - if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) { + nest_level = __this_cpu_inc_return(mirred_nest_level); + if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return TC_ACT_SHOT; } @@ -310,7 +317,7 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, err = tcf_mirred_forward(want_ingress, skb); if (err) tcf_action_inc_overlimit_qstats(&m->common); - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return TC_ACT_CONSUMED; } } @@ -322,7 +329,7 @@ out: if (tcf_mirred_is_act_redirect(m_eaction)) retval = TC_ACT_SHOT; } - __this_cpu_dec(mirred_rec_level); + __this_cpu_dec(mirred_nest_level); return retval; } diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 74c74be33048..4184af5abbf3 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -38,6 +38,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, { struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; + struct tcf_nat_parms *nparm, *oparm; struct nlattr *tb[TCA_NAT_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_nat *parm; @@ -59,8 +60,8 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, index = parm->index; err = tcf_idr_check_alloc(tn, 
&index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_nat_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, &act_nat_ops, + bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; @@ -79,19 +80,31 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; + + nparm = kzalloc(sizeof(*nparm), GFP_KERNEL); + if (!nparm) { + err = -ENOMEM; + goto release_idr; + } + + nparm->old_addr = parm->old_addr; + nparm->new_addr = parm->new_addr; + nparm->mask = parm->mask; + nparm->flags = parm->flags; + p = to_tcf_nat(*a); spin_lock_bh(&p->tcf_lock); - p->old_addr = parm->old_addr; - p->new_addr = parm->new_addr; - p->mask = parm->mask; - p->flags = parm->flags; - goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparm = rcu_replace_pointer(p->parms, nparm, lockdep_is_held(&p->tcf_lock)); spin_unlock_bh(&p->tcf_lock); + if (goto_ch) tcf_chain_put_by_act(goto_ch); + if (oparm) + kfree_rcu(oparm, rcu); + return ret; release_idr: tcf_idr_release(*a, bind); @@ -103,6 +116,7 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, struct tcf_result *res) { struct tcf_nat *p = to_tcf_nat(a); + struct tcf_nat_parms *parms; struct iphdr *iph; __be32 old_addr; __be32 new_addr; @@ -113,18 +127,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, int ihl; int noff; - spin_lock(&p->tcf_lock); - tcf_lastuse_update(&p->tcf_tm); - old_addr = p->old_addr; - new_addr = p->new_addr; - mask = p->mask; - egress = p->flags & TCA_NAT_FLAG_EGRESS; - action = p->tcf_action; + tcf_action_update_bstats(&p->common, skb); - bstats_update(&p->tcf_bstats, skb); + action = READ_ONCE(p->tcf_action); - spin_unlock(&p->tcf_lock); + parms = rcu_dereference_bh(p->parms); + old_addr = parms->old_addr; + new_addr = parms->new_addr; + mask = parms->mask; + egress = parms->flags & TCA_NAT_FLAG_EGRESS; if (unlikely(action == TC_ACT_SHOT)) goto drop; @@ -248,9 +260,7 @@ out: return action; drop: - spin_lock(&p->tcf_lock); - p->tcf_qstats.drops++; - spin_unlock(&p->tcf_lock); + tcf_action_inc_drop_qstats(&p->common); return TC_ACT_SHOT; } @@ -264,15 +274,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; + struct tcf_nat_parms *parms; struct tcf_t t; spin_lock_bh(&p->tcf_lock); - opt.old_addr = p->old_addr; - opt.new_addr = p->new_addr; - opt.mask = p->mask; - opt.flags = p->flags; + opt.action = p->tcf_action; + parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock)); + + opt.old_addr = parms->old_addr; + opt.new_addr = parms->new_addr; + opt.mask = parms->mask; + opt.flags = parms->flags; + if (nla_put(skb, TCA_NAT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -289,6 +304,16 @@ nla_put_failure: return -1; } +static void tcf_nat_cleanup(struct tc_action *a) +{ + struct tcf_nat *p = to_tcf_nat(a); + struct tcf_nat_parms *parms; + + parms = rcu_dereference_protected(p->parms, 1); + if (parms) + kfree_rcu(parms, rcu); +} + static struct tc_action_ops act_nat_ops = { .kind = "nat", .id = TCA_ID_NAT, @@ -296,6 +321,7 @@ static struct tc_action_ops act_nat_ops = { .act = tcf_nat_act, .dump = tcf_nat_dump, .init = tcf_nat_init, + .cleanup = tcf_nat_cleanup, .size = sizeof(struct tcf_nat), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index a0378e9f0121..77d288d384ae 100644 --- 
a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -134,6 +134,17 @@ nla_failure: return -EINVAL; } +static void tcf_pedit_cleanup_rcu(struct rcu_head *head) +{ + struct tcf_pedit_parms *parms = + container_of(head, struct tcf_pedit_parms, rcu); + + kfree(parms->tcfp_keys_ex); + kfree(parms->tcfp_keys); + + kfree(parms); +} + static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, @@ -141,10 +152,9 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, { struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; - struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; - struct tc_pedit_key *keys = NULL; - struct tcf_pedit_key_ex *keys_ex; + struct tcf_pedit_parms *oparms, *nparms; + struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; @@ -181,18 +191,25 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, return -EINVAL; } - keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); - if (IS_ERR(keys_ex)) - return PTR_ERR(keys_ex); + nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); + if (!nparms) + return -ENOMEM; + + nparms->tcfp_keys_ex = + tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); + if (IS_ERR(nparms->tcfp_keys_ex)) { + ret = PTR_ERR(nparms->tcfp_keys_ex); + goto out_free; + } index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { - ret = tcf_idr_create(tn, index, est, a, - &act_pedit_ops, bind, false, flags); + ret = tcf_idr_create_from_flags(tn, index, est, a, + &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); - goto out_free; + goto out_free_ex; } ret = ACT_P_CREATED; } else if (err > 0) { @@ -204,7 +221,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } } else { ret = err; - goto out_free; + goto out_free_ex; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); @@ -212,48 +229,50 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, ret = err; goto out_release; } - p = to_pedit(*a); - spin_lock_bh(&p->tcf_lock); - if (ret == ACT_P_CREATED || - (p->tcfp_nkeys && p->tcfp_nkeys != parm->nkeys)) { - keys = kmalloc(ksize, GFP_ATOMIC); - if (!keys) { - spin_unlock_bh(&p->tcf_lock); - ret = -ENOMEM; - goto put_chain; - } - kfree(p->tcfp_keys); - p->tcfp_keys = keys; - p->tcfp_nkeys = parm->nkeys; + nparms->tcfp_off_max_hint = 0; + nparms->tcfp_flags = parm->flags; + nparms->tcfp_nkeys = parm->nkeys; + + nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL); + if (!nparms->tcfp_keys) { + ret = -ENOMEM; + goto put_chain; } - memcpy(p->tcfp_keys, parm->keys, ksize); - p->tcfp_off_max_hint = 0; - for (i = 0; i < p->tcfp_nkeys; ++i) { - u32 cur = p->tcfp_keys[i].off; + + memcpy(nparms->tcfp_keys, parm->keys, ksize); + + for (i = 0; i < nparms->tcfp_nkeys; ++i) { + u32 cur = nparms->tcfp_keys[i].off; /* sanitize the shift value for any later use */ - p->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, - p->tcfp_keys[i].shift); + nparms->tcfp_keys[i].shift = min_t(size_t, + BITS_PER_TYPE(int) - 1, + nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. 
*/ - cur += (0xff & p->tcfp_keys[i].offmask) >> p->tcfp_keys[i].shift; + cur += (0xff & nparms->tcfp_keys[i].offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ - p->tcfp_off_max_hint = max(p->tcfp_off_max_hint, cur + 4); + nparms->tcfp_off_max_hint = + max(nparms->tcfp_off_max_hint, cur + 4); } - p->tcfp_flags = parm->flags; + p = to_pedit(*a); + + spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); + oparms = rcu_replace_pointer(p->parms, nparms, 1); + spin_unlock_bh(&p->tcf_lock); - kfree(p->tcfp_keys_ex); - p->tcfp_keys_ex = keys_ex; + if (oparms) + call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); - spin_unlock_bh(&p->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); + return ret; put_chain: @@ -261,19 +280,22 @@ put_chain: tcf_chain_put_by_act(goto_ch); out_release: tcf_idr_release(*a, bind); +out_free_ex: + kfree(nparms->tcfp_keys_ex); out_free: - kfree(keys_ex); + kfree(nparms); return ret; - } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); - struct tc_pedit_key *keys = p->tcfp_keys; + struct tcf_pedit_parms *parms; - kfree(keys); - kfree(p->tcfp_keys_ex); + parms = rcu_dereference_protected(p->parms, 1); + + if (parms) + call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) @@ -324,109 +346,105 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { + enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; + enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); + struct tcf_pedit_key_ex *tkey_ex; + struct tcf_pedit_parms *parms; + struct tc_pedit_key *tkey; u32 max_offset; int i; - spin_lock(&p->tcf_lock); + parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? 
skb_transport_offset(skb) : skb_network_offset(skb)) + - p->tcfp_off_max_hint; + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) - goto unlock; + goto done; tcf_lastuse_update(&p->tcf_tm); + tcf_action_update_bstats(&p->common, skb); - if (p->tcfp_nkeys > 0) { - struct tc_pedit_key *tkey = p->tcfp_keys; - struct tcf_pedit_key_ex *tkey_ex = p->tcfp_keys_ex; - enum pedit_header_type htype = - TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; - enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; - - for (i = p->tcfp_nkeys; i > 0; i--, tkey++) { - u32 *ptr, hdata; - int offset = tkey->off; - int hoffset; - u32 val; - int rc; - - if (tkey_ex) { - htype = tkey_ex->htype; - cmd = tkey_ex->cmd; - - tkey_ex++; - } + tkey = parms->tcfp_keys; + tkey_ex = parms->tcfp_keys_ex; - rc = pedit_skb_hdr_offset(skb, htype, &hoffset); - if (rc) { - pr_info("tc action pedit bad header type specified (0x%x)\n", - htype); - goto bad; - } + for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { + int offset = tkey->off; + u32 *ptr, hdata; + int hoffset; + u32 val; + int rc; - if (tkey->offmask) { - u8 *d, _d; - - if (!offset_valid(skb, hoffset + tkey->at)) { - pr_info("tc action pedit 'at' offset %d out of bounds\n", - hoffset + tkey->at); - goto bad; - } - d = skb_header_pointer(skb, hoffset + tkey->at, - sizeof(_d), &_d); - if (!d) - goto bad; - offset += (*d & tkey->offmask) >> tkey->shift; - } + if (tkey_ex) { + htype = tkey_ex->htype; + cmd = tkey_ex->cmd; - if (offset % 4) { - pr_info("tc action pedit offset must be on 32 bit boundaries\n"); - goto bad; - } + tkey_ex++; + } - if (!offset_valid(skb, hoffset + offset)) { - pr_info("tc action pedit offset %d out of bounds\n", - hoffset + offset); - goto bad; - } + rc = pedit_skb_hdr_offset(skb, htype, &hoffset); + if (rc) { + pr_info("tc action pedit bad header type specified (0x%x)\n", + htype); + goto bad; + } - ptr = skb_header_pointer(skb, hoffset + offset, - sizeof(hdata), &hdata); - if (!ptr) - goto bad; - /* just do it, baby */ - switch (cmd) { - case TCA_PEDIT_KEY_EX_CMD_SET: - val = tkey->val; - break; - case TCA_PEDIT_KEY_EX_CMD_ADD: - val = (*ptr + tkey->val) & ~tkey->mask; - break; - default: - pr_info("tc action pedit bad command (%d)\n", - cmd); + if (tkey->offmask) { + u8 *d, _d; + + if (!offset_valid(skb, hoffset + tkey->at)) { + pr_info("tc action pedit 'at' offset %d out of bounds\n", + hoffset + tkey->at); goto bad; } + d = skb_header_pointer(skb, hoffset + tkey->at, + sizeof(_d), &_d); + if (!d) + goto bad; + offset += (*d & tkey->offmask) >> tkey->shift; + } - *ptr = ((*ptr & tkey->mask) ^ val); - if (ptr == &hdata) - skb_store_bits(skb, hoffset + offset, ptr, 4); + if (offset % 4) { + pr_info("tc action pedit offset must be on 32 bit boundaries\n"); + goto bad; } - goto done; - } else { - WARN(1, "pedit BUG: index %d\n", p->tcf_index); + if (!offset_valid(skb, hoffset + offset)) { + pr_info("tc action pedit offset %d out of bounds\n", + hoffset + offset); + goto bad; + } + + ptr = skb_header_pointer(skb, hoffset + offset, + sizeof(hdata), &hdata); + if (!ptr) + goto bad; + /* just do it, baby */ + switch (cmd) { + case TCA_PEDIT_KEY_EX_CMD_SET: + val = tkey->val; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + val = (*ptr + tkey->val) & ~tkey->mask; + break; + default: + pr_info("tc action pedit bad command (%d)\n", + cmd); + goto bad; + } + + *ptr = ((*ptr & tkey->mask) ^ val); + if (ptr == &hdata) + skb_store_bits(skb, hoffset + offset, ptr, 4); } + goto done; + bad: - p->tcf_qstats.overlimits++; + 
tcf_action_inc_overlimit_qstats(&p->common); done: - bstats_update(&p->tcf_bstats, skb); -unlock: - spin_unlock(&p->tcf_lock); return p->tcf_action; } @@ -445,30 +463,33 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); + struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - s = struct_size(opt, keys, p->tcfp_nkeys); + spin_lock_bh(&p->tcf_lock); + parms = rcu_dereference_protected(p->parms, 1); + s = struct_size(opt, keys, parms->tcfp_nkeys); - /* netlink spinlocks held above us - must use ATOMIC */ opt = kzalloc(s, GFP_ATOMIC); - if (unlikely(!opt)) + if (unlikely(!opt)) { + spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; + } - spin_lock_bh(&p->tcf_lock); - memcpy(opt->keys, p->tcfp_keys, flex_array_size(opt, keys, p->tcfp_nkeys)); + memcpy(opt->keys, parms->tcfp_keys, + flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; - opt->nkeys = p->tcfp_nkeys; - opt->flags = p->tcfp_flags; + opt->nkeys = parms->tcfp_nkeys; + opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; - if (p->tcfp_keys_ex) { - if (tcf_pedit_key_ex_dump(skb, - p->tcfp_keys_ex, - p->tcfp_nkeys)) + if (parms->tcfp_keys_ex) { + if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, + parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) @@ -522,7 +543,28 @@ static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, } *index_inc = k; } else { - return -EOPNOTSUPP; + struct flow_offload_action *fl_action = entry_data; + u32 cmd = tcf_pedit_cmd(act, 0); + int k; + + switch (cmd) { + case TCA_PEDIT_KEY_EX_CMD_SET: + fl_action->id = FLOW_ACTION_MANGLE; + break; + case TCA_PEDIT_KEY_EX_CMD_ADD: + fl_action->id = FLOW_ACTION_ADD; + break; + default: + NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); + return -EOPNOTSUPP; + } + + for (k = 1; k < tcf_pedit_nkeys(act); k++) { + if (cmd != tcf_pedit_cmd(act, k)) { + NL_SET_ERR_MSG_MOD(extack, "Unsupported pedit command offload"); + return -EOPNOTSUPP; + } + } } return 0; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 668130f08903..3569e2c3660c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -22,6 +22,7 @@ #include <linux/idr.h> #include <linux/jhash.h> #include <linux/rculist.h> +#include <linux/rhashtable.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/netlink.h> @@ -50,6 +51,109 @@ static LIST_HEAD(tcf_proto_base); /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(cls_mod_lock); +static struct xarray tcf_exts_miss_cookies_xa; +struct tcf_exts_miss_cookie_node { + const struct tcf_chain *chain; + const struct tcf_proto *tp; + const struct tcf_exts *exts; + u32 chain_index; + u32 tp_prio; + u32 handle; + u32 miss_cookie_base; + struct rcu_head rcu; +}; + +/* Each tc action entry cookie will be comprised of 32bit miss_cookie_base + + * action index in the exts tc actions array. 
+ */ +union tcf_exts_miss_cookie { + struct { + u32 miss_cookie_base; + u32 act_index; + }; + u64 miss_cookie; +}; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) +static int +tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, + u32 handle) +{ + struct tcf_exts_miss_cookie_node *n; + static u32 next; + int err; + + if (WARN_ON(!handle || !tp->ops->get_exts)) + return -EINVAL; + + n = kzalloc(sizeof(*n), GFP_KERNEL); + if (!n) + return -ENOMEM; + + n->chain_index = tp->chain->index; + n->chain = tp->chain; + n->tp_prio = tp->prio; + n->tp = tp; + n->exts = exts; + n->handle = handle; + + err = xa_alloc_cyclic(&tcf_exts_miss_cookies_xa, &n->miss_cookie_base, + n, xa_limit_32b, &next, GFP_KERNEL); + if (err) + goto err_xa_alloc; + + exts->miss_cookie_node = n; + return 0; + +err_xa_alloc: + kfree(n); + return err; +} + +static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) +{ + struct tcf_exts_miss_cookie_node *n; + + if (!exts->miss_cookie_node) + return; + + n = exts->miss_cookie_node; + xa_erase(&tcf_exts_miss_cookies_xa, n->miss_cookie_base); + kfree_rcu(n, rcu); +} + +static struct tcf_exts_miss_cookie_node * +tcf_exts_miss_cookie_lookup(u64 miss_cookie, int *act_index) +{ + union tcf_exts_miss_cookie mc = { .miss_cookie = miss_cookie, }; + + *act_index = mc.act_index; + return xa_load(&tcf_exts_miss_cookies_xa, mc.miss_cookie_base); +} +#else /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */ +static int +tcf_exts_miss_cookie_base_alloc(struct tcf_exts *exts, struct tcf_proto *tp, + u32 handle) +{ + return 0; +} + +static void tcf_exts_miss_cookie_base_destroy(struct tcf_exts *exts) +{ +} +#endif /* IS_ENABLED(CONFIG_NET_TC_SKB_EXT) */ + +static u64 tcf_exts_miss_cookie_get(u32 miss_cookie_base, int act_index) +{ + union tcf_exts_miss_cookie mc = { .act_index = act_index, }; + + if (!miss_cookie_base) + return 0; + + mc.miss_cookie_base = miss_cookie_base; + return mc.miss_cookie; +} + #ifdef CONFIG_NET_CLS_ACT DEFINE_STATIC_KEY_FALSE(tc_skb_ext_tc); EXPORT_SYMBOL(tc_skb_ext_tc); @@ -488,7 +592,8 @@ static struct tcf_chain *tcf_chain_lookup_rcu(const struct tcf_block *block, #endif static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast); + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack); static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, u32 chain_index, bool create, @@ -521,7 +626,7 @@ static struct tcf_chain *__tcf_chain_get(struct tcf_block *block, */ if (is_first_reference && !by_act) tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, NULL); return chain; @@ -1548,6 +1653,8 @@ static inline int __tcf_classify(struct sk_buff *skb, const struct tcf_proto *orig_tp, struct tcf_result *res, bool compat_mode, + struct tcf_exts_miss_cookie_node *n, + int act_index, u32 *last_executed_chain) { #ifdef CONFIG_NET_CLS_ACT @@ -1559,13 +1666,36 @@ reclassify: #endif for (; tp; tp = rcu_dereference_bh(tp->next)) { __be16 protocol = skb_protocol(skb, false); - int err; + int err = 0; - if (tp->protocol != protocol && - tp->protocol != htons(ETH_P_ALL)) - continue; + if (n) { + struct tcf_exts *exts; + + if (n->tp_prio != tp->prio) + continue; + + /* We re-lookup the tp and chain based on index instead + * of having hard refs and locks to them, so do a sanity + * check if any of tp,chain,exts was replaced by the + * time we got here with a cookie from hardware. 
+ */ + if (unlikely(n->tp != tp || n->tp->chain != n->chain || + !tp->ops->get_exts)) + return TC_ACT_SHOT; + + exts = tp->ops->get_exts(tp, n->handle); + if (unlikely(!exts || n->exts != exts)) + return TC_ACT_SHOT; - err = tc_classify(skb, tp, res); + n = NULL; + err = tcf_exts_exec_ex(skb, exts, act_index, res); + } else { + if (tp->protocol != protocol && + tp->protocol != htons(ETH_P_ALL)) + continue; + + err = tc_classify(skb, tp, res); + } #ifdef CONFIG_NET_CLS_ACT if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode)) { first_tp = orig_tp; @@ -1581,6 +1711,9 @@ reclassify: return err; } + if (unlikely(n)) + return TC_ACT_SHOT; + return TC_ACT_UNSPEC; /* signal: continue lookup */ #ifdef CONFIG_NET_CLS_ACT reset: @@ -1605,21 +1738,35 @@ int tcf_classify(struct sk_buff *skb, #if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) u32 last_executed_chain = 0; - return __tcf_classify(skb, tp, tp, res, compat_mode, + return __tcf_classify(skb, tp, tp, res, compat_mode, NULL, 0, &last_executed_chain); #else u32 last_executed_chain = tp ? tp->chain->index : 0; + struct tcf_exts_miss_cookie_node *n = NULL; const struct tcf_proto *orig_tp = tp; struct tc_skb_ext *ext; + int act_index = 0; int ret; if (block) { ext = skb_ext_find(skb, TC_SKB_EXT); - if (ext && ext->chain) { + if (ext && (ext->chain || ext->act_miss)) { struct tcf_chain *fchain; + u32 chain; + + if (ext->act_miss) { + n = tcf_exts_miss_cookie_lookup(ext->act_miss_cookie, + &act_index); + if (!n) + return TC_ACT_SHOT; - fchain = tcf_chain_lookup_rcu(block, ext->chain); + chain = n->chain_index; + } else { + chain = ext->chain; + } + + fchain = tcf_chain_lookup_rcu(block, chain); if (!fchain) return TC_ACT_SHOT; @@ -1631,7 +1778,7 @@ int tcf_classify(struct sk_buff *skb, } } - ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, + ret = __tcf_classify(skb, tp, orig_tp, res, compat_mode, n, act_index, &last_executed_chain); if (tc_skb_ext_tc_enabled()) { @@ -1817,7 +1964,8 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, u32 portid, u32 seq, u16 flags, int event, - bool terse_dump, bool rtnl_held) + bool terse_dump, bool rtnl_held, + struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1857,7 +2005,13 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb, tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0) goto nla_put_failure; } + + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto nla_put_failure; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1871,7 +2025,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct tcf_proto *tp, struct tcf_block *block, struct Qdisc *q, u32 parent, void *fh, int event, bool unicast, - bool rtnl_held) + bool rtnl_held, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; @@ -1883,7 +2037,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, event, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -1912,7 +2066,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid, n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER, - false, rtnl_held) <= 0) { + false, rtnl_held, extack) <= 0) { NL_SET_ERR_MSG(extack, "Failed to build del event notification"); kfree_skb(skb); return -EINVAL; @@ -1938,14 +2092,15 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb, static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb, struct tcf_block *block, struct Qdisc *q, u32 parent, struct nlmsghdr *n, - struct tcf_chain *chain, int event) + struct tcf_chain *chain, int event, + struct netlink_ext_ack *extack) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) - tfilter_notify(net, oskb, n, tp, block, - q, parent, NULL, event, false, true); + tfilter_notify(net, oskb, n, tp, block, q, parent, NULL, + event, false, true, extack); } static void tfilter_put(struct tcf_proto *tp, void *fh) @@ -2156,7 +2311,7 @@ replay: flags, extack); if (err == 0) { tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_NEWTFILTER, false, rtnl_held); + RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); /* q pointer is NULL for shared blocks */ if (q) @@ -2284,7 +2439,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, if (prio == 0) { tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); tcf_chain_flush(chain, rtnl_held); err = 0; goto errout; @@ -2308,7 +2463,7 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n, tcf_proto_put(tp, rtnl_held, NULL); tfilter_notify(net, skb, n, tp, block, q, parent, fh, - RTM_DELTFILTER, false, rtnl_held); + RTM_DELTFILTER, false, rtnl_held, extack); err = 0; goto errout; } @@ -2452,7 +2607,7 @@ static int tc_get_tfilter(struct sk_buff *skb, struct nlmsghdr *n, err = -ENOENT; } else { err = tfilter_notify(net, skb, n, tp, block, q, parent, - fh, RTM_NEWTFILTER, true, rtnl_held); + fh, RTM_NEWTFILTER, true, rtnl_held, NULL); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send filter notify message"); } @@ -2490,7 +2645,7 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg) return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent, n, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, a->terse_dump, true); + RTM_NEWTFILTER, a->terse_dump, true, NULL); } static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, @@ -2524,7 +2679,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent, if (tcf_fill_node(net, skb, tp, block, q, parent, NULL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTFILTER, false, true) <= 0) + RTM_NEWTFILTER, false, true, NULL) <= 0) goto errout; cb->args[1] = 1; } @@ -2667,7 +2822,8 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, void *tmplt_priv, u32 chain_index, struct net *net, struct sk_buff *skb, struct tcf_block *block, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct 
netlink_ext_ack *extack) { unsigned char *b = skb_tail_pointer(skb); const struct tcf_proto_ops *ops; @@ -2704,7 +2860,12 @@ static int tc_chain_fill_node(const struct tcf_proto_ops *tmplt_ops, goto nla_put_failure; } + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -2714,7 +2875,8 @@ nla_put_failure: } static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, - u32 seq, u16 flags, int event, bool unicast) + u32 seq, u16 flags, int event, bool unicast, + struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct tcf_block *block = chain->block; @@ -2728,7 +2890,7 @@ static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb, if (tc_chain_fill_node(chain->tmplt_ops, chain->tmplt_priv, chain->index, net, skb, block, portid, - seq, flags, event) <= 0) { + seq, flags, event, extack) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2756,7 +2918,7 @@ static int tc_chain_notify_delete(const struct tcf_proto_ops *tmplt_ops, return -ENOBUFS; if (tc_chain_fill_node(tmplt_ops, tmplt_priv, chain_index, net, skb, - block, portid, seq, flags, RTM_DELCHAIN) <= 0) { + block, portid, seq, flags, RTM_DELCHAIN, NULL) <= 0) { kfree_skb(skb); return -EINVAL; } @@ -2908,11 +3070,11 @@ replay: } tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL, - RTM_NEWCHAIN, false); + RTM_NEWCHAIN, false, extack); break; case RTM_DELCHAIN: tfilter_notify_chain(net, skb, block, q, parent, n, - chain, RTM_DELTFILTER); + chain, RTM_DELTFILTER, extack); /* Flush the chain first as the user requested chain removal. */ tcf_chain_flush(chain, true); /* In case the chain was successfully deleted, put a reference @@ -2922,7 +3084,7 @@ replay: break; case RTM_GETCHAIN: err = tc_chain_notify(chain, skb, n->nlmsg_seq, - n->nlmsg_flags, n->nlmsg_type, true); + n->nlmsg_flags, n->nlmsg_type, true, extack); if (err < 0) NL_SET_ERR_MSG(extack, "Failed to send chain notify message"); break; @@ -3022,7 +3184,7 @@ static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb) chain->index, net, skb, block, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWCHAIN); + RTM_NEWCHAIN, NULL); if (err <= 0) break; index++; @@ -3040,9 +3202,48 @@ out: return skb->len; } +int tcf_exts_init_ex(struct tcf_exts *exts, struct net *net, int action, + int police, struct tcf_proto *tp, u32 handle, + bool use_action_miss) +{ + int err = 0; + +#ifdef CONFIG_NET_CLS_ACT + exts->type = 0; + exts->nr_actions = 0; + /* Note: we do not own yet a reference on net. + * This reference might be taken later from tcf_exts_get_net(). 
+ */ + exts->net = net; + exts->actions = kcalloc(TCA_ACT_MAX_PRIO, sizeof(struct tc_action *), + GFP_KERNEL); + if (!exts->actions) + return -ENOMEM; +#endif + + exts->action = action; + exts->police = police; + + if (!use_action_miss) + return 0; + + err = tcf_exts_miss_cookie_base_alloc(exts, tp, handle); + if (err) + goto err_miss_alloc; + + return 0; + +err_miss_alloc: + tcf_exts_destroy(exts); + return err; +} +EXPORT_SYMBOL(tcf_exts_init_ex); + void tcf_exts_destroy(struct tcf_exts *exts) { #ifdef CONFIG_NET_CLS_ACT + tcf_exts_miss_cookie_base_destroy(exts); + if (exts->actions) { tcf_action_destroy(exts->actions, TCA_ACT_UNBIND); kfree(exts->actions); @@ -3474,28 +3675,28 @@ int tc_setup_cb_reoffload(struct tcf_block *block, struct tcf_proto *tp, } EXPORT_SYMBOL(tc_setup_cb_reoffload); -static int tcf_act_get_cookie(struct flow_action_entry *entry, - const struct tc_action *act) +static int tcf_act_get_user_cookie(struct flow_action_entry *entry, + const struct tc_action *act) { - struct tc_cookie *cookie; + struct tc_cookie *user_cookie; int err = 0; rcu_read_lock(); - cookie = rcu_dereference(act->act_cookie); - if (cookie) { - entry->cookie = flow_action_cookie_create(cookie->data, - cookie->len, - GFP_ATOMIC); - if (!entry->cookie) + user_cookie = rcu_dereference(act->user_cookie); + if (user_cookie) { + entry->user_cookie = flow_action_cookie_create(user_cookie->data, + user_cookie->len, + GFP_ATOMIC); + if (!entry->user_cookie) err = -ENOMEM; } rcu_read_unlock(); return err; } -static void tcf_act_put_cookie(struct flow_action_entry *entry) +static void tcf_act_put_user_cookie(struct flow_action_entry *entry) { - flow_action_cookie_destroy(entry->cookie); + flow_action_cookie_destroy(entry->user_cookie); } void tc_cleanup_offload_action(struct flow_action *flow_action) @@ -3504,7 +3705,7 @@ void tc_cleanup_offload_action(struct flow_action *flow_action) int i; flow_action_for_each(i, entry, flow_action) { - tcf_act_put_cookie(entry); + tcf_act_put_user_cookie(entry); if (entry->destructor) entry->destructor(entry->destructor_priv); } @@ -3531,6 +3732,7 @@ static int tc_setup_offload_act(struct tc_action *act, int tc_setup_action(struct flow_action *flow_action, struct tc_action *actions[], + u32 miss_cookie_base, struct netlink_ext_ack *extack) { int i, j, k, index, err = 0; @@ -3549,7 +3751,7 @@ int tc_setup_action(struct flow_action *flow_action, entry = &flow_action->entries[j]; spin_lock_bh(&act->tcfa_lock); - err = tcf_act_get_cookie(entry, act); + err = tcf_act_get_user_cookie(entry, act); if (err) goto err_out_locked; @@ -3561,6 +3763,9 @@ int tc_setup_action(struct flow_action *flow_action, for (k = 0; k < index ; k++) { entry[k].hw_stats = tc_act_hw_stats(act->hw_stats); entry[k].hw_index = act->tcfa_index; + entry[k].cookie = (unsigned long)act; + entry[k].miss_cookie = + tcf_exts_miss_cookie_get(miss_cookie_base, i); } j += index; @@ -3583,10 +3788,15 @@ int tc_setup_offload_action(struct flow_action *flow_action, struct netlink_ext_ack *extack) { #ifdef CONFIG_NET_CLS_ACT + u32 miss_cookie_base; + if (!exts) return 0; - return tc_setup_action(flow_action, exts->actions, extack); + miss_cookie_base = exts->miss_cookie_node ? 
+ exts->miss_cookie_node->miss_cookie_base : 0; + return tc_setup_action(flow_action, exts->actions, miss_cookie_base, + extack); #else return 0; #endif @@ -3754,6 +3964,8 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; + xa_init_flags(&tcf_exts_miss_cookies_xa, XA_FLAGS_ALLOC1); + rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0b15698b3531..e960a46b0520 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -502,12 +502,7 @@ static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f, tc_setup_cb_call(block, TC_SETUP_CLSFLOWER, &cls_flower, false, rtnl_held); - tcf_exts_hw_stats_update(&f->exts, cls_flower.stats.bytes, - cls_flower.stats.pkts, - cls_flower.stats.drops, - cls_flower.stats.lastused, - cls_flower.stats.used_hw_stats, - cls_flower.stats.used_hw_stats_valid); + tcf_exts_hw_stats_update(&f->exts, &cls_flower.stats, cls_flower.use_act_stats); } static void __fl_put(struct cls_fl_filter *f) @@ -534,6 +529,15 @@ static struct cls_fl_filter *__fl_get(struct cls_fl_head *head, u32 handle) return f; } +static struct tcf_exts *fl_get_exts(const struct tcf_proto *tp, u32 handle) +{ + struct cls_fl_head *head = rcu_dereference_bh(tp->root); + struct cls_fl_filter *f; + + f = idr_find(&head->handle_idr, handle); + return f ? &f->exts : NULL; +} + static int __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) @@ -2192,10 +2196,6 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, INIT_LIST_HEAD(&fnew->hw_list); refcount_set(&fnew->refcnt, 1); - err = tcf_exts_init(&fnew->exts, net, TCA_FLOWER_ACT, 0); - if (err < 0) - goto errout; - if (tb[TCA_FLOWER_FLAGS]) { fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); @@ -2205,15 +2205,46 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, } } + if (!fold) { + spin_lock(&tp->lock); + if (!handle) { + handle = 1; + err = idr_alloc_u32(&head->handle_idr, fnew, &handle, + INT_MAX, GFP_ATOMIC); + } else { + err = idr_alloc_u32(&head->handle_idr, fnew, &handle, + handle, GFP_ATOMIC); + + /* Filter with specified handle was concurrently + * inserted after initial check in cls_api. This is not + * necessarily an error if NLM_F_EXCL is not set in + * message flags. Returning EAGAIN will cause cls_api to + * try to update concurrently inserted rule. + */ + if (err == -ENOSPC) + err = -EAGAIN; + } + spin_unlock(&tp->lock); + + if (err) + goto errout; + } + fnew->handle = handle; + + err = tcf_exts_init_ex(&fnew->exts, net, TCA_FLOWER_ACT, 0, tp, handle, + !tc_skip_hw(fnew->flags)); + if (err < 0) + goto errout_idr; + err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], tp->chain->tmplt_priv, flags, fnew->flags, extack); if (err) - goto errout; + goto errout_idr; err = fl_check_assign_mask(head, fnew, fold, mask); if (err) - goto errout; + goto errout_idr; err = fl_ht_insert_unique(fnew, fold, &in_ht); if (err) @@ -2279,29 +2310,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, refcount_dec(&fold->refcnt); __fl_put(fold); } else { - if (handle) { - /* user specifies a handle and it doesn't exist */ - err = idr_alloc_u32(&head->handle_idr, fnew, &handle, - handle, GFP_ATOMIC); - - /* Filter with specified handle was concurrently - * inserted after initial check in cls_api. 
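[Editor's note, not part of the patch: the hunks above add tcf_exts_init_ex(), the per-classifier get_exts() callback, and the miss cookie attached to each offloaded action, so that a miss reported by hardware via tc_skb_ext->act_miss_cookie can be resolved back to the owning filter and classification resumed with tcf_exts_exec_ex() at the right action index. A minimal sketch of how a classifier is expected to wire this up follows; the foo_* identifiers and TCA_FOO_ACT are hypothetical, only the tcf_* / tc_* helpers come from the patch itself.

	/* Illustrative sketch only -- foo_* names are hypothetical. */
	static struct tcf_exts *foo_get_exts(const struct tcf_proto *tp, u32 handle)
	{
		/* Map a miss-cookie handle back to the filter's actions,
		 * mirroring fl_get_exts() above.
		 */
		struct foo_filter *f = foo_lookup_filter(tp, handle);

		return f ? &f->exts : NULL;
	}

	static int foo_init_filter_exts(struct net *net, struct tcf_proto *tp,
					struct foo_filter *f, u32 handle, u32 flags)
	{
		/* Allocate a miss-cookie base only when the filter may be
		 * offloaded, as cls_flower does with !tc_skip_hw(flags).
		 */
		return tcf_exts_init_ex(&f->exts, net, TCA_FOO_ACT, 0, tp, handle,
					!tc_skip_hw(flags));
	}

The miss cookie that tc_setup_action() stores in entry[k].miss_cookie is what the driver hands back on a miss, letting tcf_classify() look it up with tcf_exts_miss_cookie_lookup() and continue from the matching filter instead of reclassifying the packet from scratch. End of editor's note.]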
This is not - * necessarily an error if NLM_F_EXCL is not set in - * message flags. Returning EAGAIN will cause cls_api to - * try to update concurrently inserted rule. - */ - if (err == -ENOSPC) - err = -EAGAIN; - } else { - handle = 1; - err = idr_alloc_u32(&head->handle_idr, fnew, &handle, - INT_MAX, GFP_ATOMIC); - } - if (err) - goto errout_hw; + idr_replace(&head->handle_idr, fnew, fnew->handle); refcount_inc(&fnew->refcnt); - fnew->handle = handle; list_add_tail_rcu(&fnew->list, &fnew->mask->filters); spin_unlock(&tp->lock); } @@ -2324,6 +2335,8 @@ errout_hw: fnew->mask->filter_ht_params); errout_mask: fl_mask_put(head, fnew->mask); +errout_idr: + idr_remove(&head->handle_idr, fnew->handle); errout: __fl_put(fnew); errout_tb: @@ -3441,6 +3454,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .tmplt_create = fl_tmplt_create, .tmplt_destroy = fl_tmplt_destroy, .tmplt_dump = fl_tmplt_dump, + .get_exts = fl_get_exts, .owner = THIS_MODULE, .flags = TCF_PROTO_OPS_DOIT_UNLOCKED, }; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 705f63da2c21..fa3bbd187eb9 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -331,11 +331,7 @@ static void mall_stats_hw_filter(struct tcf_proto *tp, tc_setup_cb_call(block, TC_SETUP_CLSMATCHALL, &cls_mall, false, true); - tcf_exts_hw_stats_update(&head->exts, cls_mall.stats.bytes, - cls_mall.stats.pkts, cls_mall.stats.drops, - cls_mall.stats.lastused, - cls_mall.stats.used_hw_stats, - cls_mall.stats.used_hw_stats_valid); + tcf_exts_hw_stats_update(&head->exts, &cls_mall.stats, cls_mall.use_act_stats); } static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c deleted file mode 100644 index 03d8619bd9c6..000000000000 --- a/net/sched/cls_rsvp.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/ip.h> -#include <net/netlink.h> -#include <net/act_api.h> -#include <net/pkt_cls.h> -#include <net/tc_wrapper.h> - -#define RSVP_DST_LEN 1 -#define RSVP_ID "rsvp" -#define RSVP_OPS cls_rsvp_ops -#define RSVP_CLS rsvp_classify - -#include "cls_rsvp.h" -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h deleted file mode 100644 index 869efba9f834..000000000000 --- a/net/sched/cls_rsvp.h +++ /dev/null @@ -1,764 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. - * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -/* - Comparing to general packet classification problem, - RSVP needs only several relatively simple rules: - - * (dst, protocol) are always specified, - so that we are able to hash them. - * src may be exact, or may be wildcard, so that - we can keep a hash table plus one wildcard entry. - * source port (or flow label) is important only if src is given. - - IMPLEMENTATION. - - We use a two level hash table: The top level is keyed by - destination address and protocol ID, every bucket contains a list - of "rsvp sessions", identified by destination address, protocol and - DPI(="Destination Port ID"): triple (key, mask, offset). - - Every bucket has a smaller hash table keyed by source address - (cf. 
RSVP flowspec) and one wildcard entry for wildcard reservations. - Every bucket is again a list of "RSVP flows", selected by - source address and SPI(="Source Port ID" here rather than - "security parameter index"): triple (key, mask, offset). - - - NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) - and all fragmented packets go to the best-effort traffic class. - - - NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires - only one "Generalized Port Identifier". So that for classic - ah, esp (and udp,tcp) both *pi should coincide or one of them - should be wildcard. - - At first sight, this redundancy is just a waste of CPU - resources. But DPI and SPI add the possibility to assign different - priorities to GPIs. Look also at note 4 about tunnels below. - - - NOTE 3. One complication is the case of tunneled packets. - We implement it as following: if the first lookup - matches a special session with "tunnelhdr" value not zero, - flowid doesn't contain the true flow ID, but the tunnel ID (1...255). - In this case, we pull tunnelhdr bytes and restart lookup - with tunnel ID added to the list of keys. Simple and stupid 8)8) - It's enough for PIMREG and IPIP. - - - NOTE 4. Two GPIs make it possible to parse even GRE packets. - F.e. DPI can select ETH_P_IP (and necessary flags to make - tunnelhdr correct) in GRE protocol field and SPI matches - GRE key. Is it not nice? 8)8) - - - Well, as result, despite its simplicity, we get a pretty - powerful classification engine. */ - - -struct rsvp_head { - u32 tmap[256/32]; - u32 hgenerator; - u8 tgenerator; - struct rsvp_session __rcu *ht[256]; - struct rcu_head rcu; -}; - -struct rsvp_session { - struct rsvp_session __rcu *next; - __be32 dst[RSVP_DST_LEN]; - struct tc_rsvp_gpi dpi; - u8 protocol; - u8 tunnelid; - /* 16 (src,sport) hash slots, and one wildcard source slot */ - struct rsvp_filter __rcu *ht[16 + 1]; - struct rcu_head rcu; -}; - - -struct rsvp_filter { - struct rsvp_filter __rcu *next; - __be32 src[RSVP_DST_LEN]; - struct tc_rsvp_gpi spi; - u8 tunnelhdr; - - struct tcf_result res; - struct tcf_exts exts; - - u32 handle; - struct rsvp_session *sess; - struct rcu_work rwork; -}; - -static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) -{ - unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1]; - - h ^= h>>16; - h ^= h>>8; - return (h ^ protocol ^ tunnelid) & 0xFF; -} - -static inline unsigned int hash_src(__be32 *src) -{ - unsigned int h = (__force __u32)src[RSVP_DST_LEN-1]; - - h ^= h>>16; - h ^= h>>8; - h ^= h>>4; - return h & 0xF; -} - -#define RSVP_APPLY_RESULT() \ -{ \ - int r = tcf_exts_exec(skb, &f->exts, res); \ - if (r < 0) \ - continue; \ - else if (r > 0) \ - return r; \ -} - -TC_INDIRECT_SCOPE int RSVP_CLS(struct sk_buff *skb, const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct rsvp_head *head = rcu_dereference_bh(tp->root); - struct rsvp_session *s; - struct rsvp_filter *f; - unsigned int h1, h2; - __be32 *dst, *src; - u8 protocol; - u8 tunnelid = 0; - u8 *xprt; -#if RSVP_DST_LEN == 4 - struct ipv6hdr *nhptr; - - if (!pskb_network_may_pull(skb, sizeof(*nhptr))) - return -1; - nhptr = ipv6_hdr(skb); -#else - struct iphdr *nhptr; - - if (!pskb_network_may_pull(skb, sizeof(*nhptr))) - return -1; - nhptr = ip_hdr(skb); -#endif -restart: - -#if RSVP_DST_LEN == 4 - src = &nhptr->saddr.s6_addr32[0]; - dst = &nhptr->daddr.s6_addr32[0]; - protocol = nhptr->nexthdr; - xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr); -#else - src = &nhptr->saddr; - dst = &nhptr->daddr; - 
protocol = nhptr->protocol; - xprt = ((u8 *)nhptr) + (nhptr->ihl<<2); - if (ip_is_fragment(nhptr)) - return -1; -#endif - - h1 = hash_dst(dst, protocol, tunnelid); - h2 = hash_src(src); - - for (s = rcu_dereference_bh(head->ht[h1]); s; - s = rcu_dereference_bh(s->next)) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] && - protocol == s->protocol && - !(s->dpi.mask & - (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) && -#if RSVP_DST_LEN == 4 - dst[0] == s->dst[0] && - dst[1] == s->dst[1] && - dst[2] == s->dst[2] && -#endif - tunnelid == s->tunnelid) { - - for (f = rcu_dereference_bh(s->ht[h2]); f; - f = rcu_dereference_bh(f->next)) { - if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] && - !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key)) -#if RSVP_DST_LEN == 4 - && - src[0] == f->src[0] && - src[1] == f->src[1] && - src[2] == f->src[2] -#endif - ) { - *res = f->res; - RSVP_APPLY_RESULT(); - -matched: - if (f->tunnelhdr == 0) - return 0; - - tunnelid = f->res.classid; - nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr)); - goto restart; - } - } - - /* And wildcard bucket... */ - for (f = rcu_dereference_bh(s->ht[16]); f; - f = rcu_dereference_bh(f->next)) { - *res = f->res; - RSVP_APPLY_RESULT(); - goto matched; - } - return -1; - } - } - return -1; -} - -static void rsvp_replace(struct tcf_proto *tp, struct rsvp_filter *n, u32 h) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_session *s; - struct rsvp_filter __rcu **ins; - struct rsvp_filter *pins; - unsigned int h1 = h & 0xFF; - unsigned int h2 = (h >> 8) & 0xFF; - - for (s = rtnl_dereference(head->ht[h1]); s; - s = rtnl_dereference(s->next)) { - for (ins = &s->ht[h2], pins = rtnl_dereference(*ins); ; - ins = &pins->next, pins = rtnl_dereference(*ins)) { - if (pins->handle == h) { - RCU_INIT_POINTER(n->next, pins->next); - rcu_assign_pointer(*ins, n); - return; - } - } - } - - /* Something went wrong if we are trying to replace a non-existent - * node. Mind as well halt instead of silently failing. 
- */ - BUG_ON(1); -} - -static void *rsvp_get(struct tcf_proto *tp, u32 handle) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_session *s; - struct rsvp_filter *f; - unsigned int h1 = handle & 0xFF; - unsigned int h2 = (handle >> 8) & 0xFF; - - if (h2 > 16) - return NULL; - - for (s = rtnl_dereference(head->ht[h1]); s; - s = rtnl_dereference(s->next)) { - for (f = rtnl_dereference(s->ht[h2]); f; - f = rtnl_dereference(f->next)) { - if (f->handle == handle) - return f; - } - } - return NULL; -} - -static int rsvp_init(struct tcf_proto *tp) -{ - struct rsvp_head *data; - - data = kzalloc(sizeof(struct rsvp_head), GFP_KERNEL); - if (data) { - rcu_assign_pointer(tp->root, data); - return 0; - } - return -ENOBUFS; -} - -static void __rsvp_delete_filter(struct rsvp_filter *f) -{ - tcf_exts_destroy(&f->exts); - tcf_exts_put_net(&f->exts); - kfree(f); -} - -static void rsvp_delete_filter_work(struct work_struct *work) -{ - struct rsvp_filter *f = container_of(to_rcu_work(work), - struct rsvp_filter, - rwork); - rtnl_lock(); - __rsvp_delete_filter(f); - rtnl_unlock(); -} - -static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) -{ - tcf_unbind_filter(tp, &f->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (tcf_exts_get_net(&f->exts)) - tcf_queue_work(&f->rwork, rsvp_delete_filter_work); - else - __rsvp_delete_filter(f); -} - -static void rsvp_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - int h1, h2; - - if (data == NULL) - return; - - for (h1 = 0; h1 < 256; h1++) { - struct rsvp_session *s; - - while ((s = rtnl_dereference(data->ht[h1])) != NULL) { - RCU_INIT_POINTER(data->ht[h1], s->next); - - for (h2 = 0; h2 <= 16; h2++) { - struct rsvp_filter *f; - - while ((f = rtnl_dereference(s->ht[h2])) != NULL) { - rcu_assign_pointer(s->ht[h2], f->next); - rsvp_delete_filter(tp, f); - } - } - kfree_rcu(s, rcu); - } - } - kfree_rcu(data, rcu); -} - -static int rsvp_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - struct rsvp_filter *nfp, *f = arg; - struct rsvp_filter __rcu **fp; - unsigned int h = f->handle; - struct rsvp_session __rcu **sp; - struct rsvp_session *nsp, *s = f->sess; - int i, h1; - - fp = &s->ht[(h >> 8) & 0xFF]; - for (nfp = rtnl_dereference(*fp); nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) { - if (nfp == f) { - RCU_INIT_POINTER(*fp, f->next); - rsvp_delete_filter(tp, f); - - /* Strip tree */ - - for (i = 0; i <= 16; i++) - if (s->ht[i]) - goto out; - - /* OK, session has no flows */ - sp = &head->ht[h & 0xFF]; - for (nsp = rtnl_dereference(*sp); nsp; - sp = &nsp->next, nsp = rtnl_dereference(*sp)) { - if (nsp == s) { - RCU_INIT_POINTER(*sp, s->next); - kfree_rcu(s, rcu); - goto out; - } - } - - break; - } - } - -out: - *last = true; - for (h1 = 0; h1 < 256; h1++) { - if (rcu_access_pointer(head->ht[h1])) { - *last = false; - break; - } - } - - return 0; -} - -static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - int i = 0xFFFF; - - while (i-- > 0) { - u32 h; - - if ((data->hgenerator += 0x10000) == 0) - data->hgenerator = 0x10000; - h = data->hgenerator|salt; - if (!rsvp_get(tp, h)) - return h; - } - return 0; -} - -static int 
tunnel_bts(struct rsvp_head *data) -{ - int n = data->tgenerator >> 5; - u32 b = 1 << (data->tgenerator & 0x1F); - - if (data->tmap[n] & b) - return 0; - data->tmap[n] |= b; - return 1; -} - -static void tunnel_recycle(struct rsvp_head *data) -{ - struct rsvp_session __rcu **sht = data->ht; - u32 tmap[256/32]; - int h1, h2; - - memset(tmap, 0, sizeof(tmap)); - - for (h1 = 0; h1 < 256; h1++) { - struct rsvp_session *s; - for (s = rtnl_dereference(sht[h1]); s; - s = rtnl_dereference(s->next)) { - for (h2 = 0; h2 <= 16; h2++) { - struct rsvp_filter *f; - - for (f = rtnl_dereference(s->ht[h2]); f; - f = rtnl_dereference(f->next)) { - if (f->tunnelhdr == 0) - continue; - data->tgenerator = f->res.classid; - tunnel_bts(data); - } - } - } - } - - memcpy(data->tmap, tmap, sizeof(tmap)); -} - -static u32 gen_tunnel(struct rsvp_head *data) -{ - int i, k; - - for (k = 0; k < 2; k++) { - for (i = 255; i > 0; i--) { - if (++data->tgenerator == 0) - data->tgenerator = 1; - if (tunnel_bts(data)) - return data->tgenerator; - } - tunnel_recycle(data); - } - return 0; -} - -static const struct nla_policy rsvp_policy[TCA_RSVP_MAX + 1] = { - [TCA_RSVP_CLASSID] = { .type = NLA_U32 }, - [TCA_RSVP_DST] = { .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_SRC] = { .len = RSVP_DST_LEN * sizeof(u32) }, - [TCA_RSVP_PINFO] = { .len = sizeof(struct tc_rsvp_pinfo) }, -}; - -static int rsvp_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, - u32 handle, struct nlattr **tca, - void **arg, u32 flags, - struct netlink_ext_ack *extack) -{ - struct rsvp_head *data = rtnl_dereference(tp->root); - struct rsvp_filter *f, *nfp; - struct rsvp_filter __rcu **fp; - struct rsvp_session *nsp, *s; - struct rsvp_session __rcu **sp; - struct tc_rsvp_pinfo *pinfo = NULL; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_RSVP_MAX + 1]; - struct tcf_exts e; - unsigned int h1, h2; - __be32 *dst; - int err; - - if (opt == NULL) - return handle ? -EINVAL : 0; - - err = nla_parse_nested_deprecated(tb, TCA_RSVP_MAX, opt, rsvp_policy, - NULL); - if (err < 0) - return err; - - err = tcf_exts_init(&e, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &e, flags, - extack); - if (err < 0) - goto errout2; - - f = *arg; - if (f) { - /* Node exists: adjust only classid */ - struct rsvp_filter *n; - - if (f->handle != handle && handle) - goto errout2; - - n = kmemdup(f, sizeof(*f), GFP_KERNEL); - if (!n) { - err = -ENOMEM; - goto errout2; - } - - err = tcf_exts_init(&n->exts, net, TCA_RSVP_ACT, - TCA_RSVP_POLICE); - if (err < 0) { - kfree(n); - goto errout2; - } - - if (tb[TCA_RSVP_CLASSID]) { - n->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - tcf_bind_filter(tp, &n->res, base); - } - - tcf_exts_change(&n->exts, &e); - rsvp_replace(tp, n, handle); - return 0; - } - - /* Now more serious part... 
*/ - err = -EINVAL; - if (handle) - goto errout2; - if (tb[TCA_RSVP_DST] == NULL) - goto errout2; - - err = -ENOBUFS; - f = kzalloc(sizeof(struct rsvp_filter), GFP_KERNEL); - if (f == NULL) - goto errout2; - - err = tcf_exts_init(&f->exts, net, TCA_RSVP_ACT, TCA_RSVP_POLICE); - if (err < 0) - goto errout; - h2 = 16; - if (tb[TCA_RSVP_SRC]) { - memcpy(f->src, nla_data(tb[TCA_RSVP_SRC]), sizeof(f->src)); - h2 = hash_src(f->src); - } - if (tb[TCA_RSVP_PINFO]) { - pinfo = nla_data(tb[TCA_RSVP_PINFO]); - f->spi = pinfo->spi; - f->tunnelhdr = pinfo->tunnelhdr; - } - if (tb[TCA_RSVP_CLASSID]) - f->res.classid = nla_get_u32(tb[TCA_RSVP_CLASSID]); - - dst = nla_data(tb[TCA_RSVP_DST]); - h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); - - err = -ENOMEM; - if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) - goto errout; - - if (f->tunnelhdr) { - err = -EINVAL; - if (f->res.classid > 255) - goto errout; - - err = -ENOMEM; - if (f->res.classid == 0 && - (f->res.classid = gen_tunnel(data)) == 0) - goto errout; - } - - for (sp = &data->ht[h1]; - (s = rtnl_dereference(*sp)) != NULL; - sp = &s->next) { - if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && - pinfo && pinfo->protocol == s->protocol && - memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && -#if RSVP_DST_LEN == 4 - dst[0] == s->dst[0] && - dst[1] == s->dst[1] && - dst[2] == s->dst[2] && -#endif - pinfo->tunnelid == s->tunnelid) { - -insert: - /* OK, we found appropriate session */ - - fp = &s->ht[h2]; - - f->sess = s; - if (f->tunnelhdr == 0) - tcf_bind_filter(tp, &f->res, base); - - tcf_exts_change(&f->exts, &e); - - fp = &s->ht[h2]; - for (nfp = rtnl_dereference(*fp); nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) { - __u32 mask = nfp->spi.mask & f->spi.mask; - - if (mask != f->spi.mask) - break; - } - RCU_INIT_POINTER(f->next, nfp); - rcu_assign_pointer(*fp, f); - - *arg = f; - return 0; - } - } - - /* No session found. Create new one. 
*/ - - err = -ENOBUFS; - s = kzalloc(sizeof(struct rsvp_session), GFP_KERNEL); - if (s == NULL) - goto errout; - memcpy(s->dst, dst, sizeof(s->dst)); - - if (pinfo) { - s->dpi = pinfo->dpi; - s->protocol = pinfo->protocol; - s->tunnelid = pinfo->tunnelid; - } - sp = &data->ht[h1]; - for (nsp = rtnl_dereference(*sp); nsp; - sp = &nsp->next, nsp = rtnl_dereference(*sp)) { - if ((nsp->dpi.mask & s->dpi.mask) != s->dpi.mask) - break; - } - RCU_INIT_POINTER(s->next, nsp); - rcu_assign_pointer(*sp, s); - - goto insert; - -errout: - tcf_exts_destroy(&f->exts); - kfree(f); -errout2: - tcf_exts_destroy(&e); - return err; -} - -static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg, - bool rtnl_held) -{ - struct rsvp_head *head = rtnl_dereference(tp->root); - unsigned int h, h1; - - if (arg->stop) - return; - - for (h = 0; h < 256; h++) { - struct rsvp_session *s; - - for (s = rtnl_dereference(head->ht[h]); s; - s = rtnl_dereference(s->next)) { - for (h1 = 0; h1 <= 16; h1++) { - struct rsvp_filter *f; - - for (f = rtnl_dereference(s->ht[h1]); f; - f = rtnl_dereference(f->next)) { - if (!tc_cls_stats_dump(tp, arg, f)) - return; - } - } - } - } -} - -static int rsvp_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct rsvp_filter *f = fh; - struct rsvp_session *s; - struct nlattr *nest; - struct tc_rsvp_pinfo pinfo; - - if (f == NULL) - return skb->len; - s = f->sess; - - t->tcm_handle = f->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (nla_put(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst)) - goto nla_put_failure; - pinfo.dpi = s->dpi; - pinfo.spi = f->spi; - pinfo.protocol = s->protocol; - pinfo.tunnelid = s->tunnelid; - pinfo.tunnelhdr = f->tunnelhdr; - pinfo.pad = 0; - if (nla_put(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo)) - goto nla_put_failure; - if (f->res.classid && - nla_put_u32(skb, TCA_RSVP_CLASSID, f->res.classid)) - goto nla_put_failure; - if (((f->handle >> 8) & 0xFF) != 16 && - nla_put(skb, TCA_RSVP_SRC, sizeof(f->src), f->src)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &f->exts) < 0) - goto nla_put_failure; - - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &f->exts) < 0) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, - unsigned long base) -{ - struct rsvp_filter *f = fh; - - tc_cls_bind_class(classid, cl, q, &f->res, base); -} - -static struct tcf_proto_ops RSVP_OPS __read_mostly = { - .kind = RSVP_ID, - .classify = RSVP_CLS, - .init = rsvp_init, - .destroy = rsvp_destroy, - .get = rsvp_get, - .change = rsvp_change, - .delete = rsvp_delete, - .walk = rsvp_walk, - .dump = rsvp_dump, - .bind_class = rsvp_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_rsvp(void) -{ - return register_tcf_proto_ops(&RSVP_OPS); -} - -static void __exit exit_rsvp(void) -{ - unregister_tcf_proto_ops(&RSVP_OPS); -} - -module_init(init_rsvp) -module_exit(exit_rsvp) diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c deleted file mode 100644 index e627cc32d633..000000000000 --- a/net/sched/cls_rsvp6.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. 
- * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/ipv6.h> -#include <linux/skbuff.h> -#include <net/act_api.h> -#include <net/pkt_cls.h> -#include <net/netlink.h> -#include <net/tc_wrapper.h> - -#define RSVP_DST_LEN 4 -#define RSVP_ID "rsvp6" -#define RSVP_OPS cls_rsvp6_ops -#define RSVP_CLS rsvp6_classify - -#include "cls_rsvp.h" -MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c deleted file mode 100644 index 6640e75eaa02..000000000000 --- a/net/sched/cls_tcindex.c +++ /dev/null @@ -1,742 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * net/sched/cls_tcindex.c Packet classifier for skb->tc_index - * - * Written 1998,1999 by Werner Almesberger, EPFL ICA - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <linux/slab.h> -#include <linux/refcount.h> -#include <linux/rcupdate.h> -#include <net/act_api.h> -#include <net/netlink.h> -#include <net/pkt_cls.h> -#include <net/sch_generic.h> -#include <net/tc_wrapper.h> - -/* - * Passing parameters to the root seems to be done more awkwardly than really - * necessary. At least, u32 doesn't seem to use such dirty hacks. To be - * verified. FIXME. - */ - -#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ -#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ - - -struct tcindex_data; - -struct tcindex_filter_result { - struct tcf_exts exts; - struct tcf_result res; - struct tcindex_data *p; - struct rcu_work rwork; -}; - -struct tcindex_filter { - u16 key; - struct tcindex_filter_result result; - struct tcindex_filter __rcu *next; - struct rcu_work rwork; -}; - - -struct tcindex_data { - struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ - struct tcindex_filter __rcu **h; /* imperfect hash; */ - struct tcf_proto *tp; - u16 mask; /* AND key with mask */ - u32 shift; /* shift ANDed key to the right */ - u32 hash; /* hash table size; 0 if undefined */ - u32 alloc_hash; /* allocated size */ - u32 fall_through; /* 0: only classify if explicit match */ - refcount_t refcnt; /* a temporary refcnt for perfect hash */ - struct rcu_work rwork; -}; - -static inline int tcindex_filter_is_set(struct tcindex_filter_result *r) -{ - return tcf_exts_has_actions(&r->exts) || r->res.classid; -} - -static void tcindex_data_get(struct tcindex_data *p) -{ - refcount_inc(&p->refcnt); -} - -static void tcindex_data_put(struct tcindex_data *p) -{ - if (refcount_dec_and_test(&p->refcnt)) { - kfree(p->perfect); - kfree(p->h); - kfree(p); - } -} - -static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p, - u16 key) -{ - if (p->perfect) { - struct tcindex_filter_result *f = p->perfect + key; - - return tcindex_filter_is_set(f) ? 
f : NULL; - } else if (p->h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *f; - - fp = &p->h[key % p->hash]; - for (f = rcu_dereference_bh_rtnl(*fp); - f; - fp = &f->next, f = rcu_dereference_bh_rtnl(*fp)) - if (f->key == key) - return &f->result; - } - - return NULL; -} - -TC_INDIRECT_SCOPE int tcindex_classify(struct sk_buff *skb, - const struct tcf_proto *tp, - struct tcf_result *res) -{ - struct tcindex_data *p = rcu_dereference_bh(tp->root); - struct tcindex_filter_result *f; - int key = (skb->tc_index & p->mask) >> p->shift; - - pr_debug("tcindex_classify(skb %p,tp %p,res %p),p %p\n", - skb, tp, res, p); - - f = tcindex_lookup(p, key); - if (!f) { - struct Qdisc *q = tcf_block_q(tp->chain->block); - - if (!p->fall_through) - return -1; - res->classid = TC_H_MAKE(TC_H_MAJ(q->handle), key); - res->class = 0; - pr_debug("alg 0x%x\n", res->classid); - return 0; - } - *res = f->res; - pr_debug("map 0x%x\n", res->classid); - - return tcf_exts_exec(skb, &f->exts, res); -} - - -static void *tcindex_get(struct tcf_proto *tp, u32 handle) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r; - - pr_debug("tcindex_get(tp %p,handle 0x%08x)\n", tp, handle); - if (p->perfect && handle >= p->alloc_hash) - return NULL; - r = tcindex_lookup(p, handle); - return r && tcindex_filter_is_set(r) ? r : NULL; -} - -static int tcindex_init(struct tcf_proto *tp) -{ - struct tcindex_data *p; - - pr_debug("tcindex_init(tp %p)\n", tp); - p = kzalloc(sizeof(struct tcindex_data), GFP_KERNEL); - if (!p) - return -ENOMEM; - - p->mask = 0xffff; - p->hash = DEFAULT_HASH_SIZE; - p->fall_through = 1; - refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - rcu_assign_pointer(tp->root, p); - return 0; -} - -static void __tcindex_destroy_rexts(struct tcindex_filter_result *r) -{ - tcf_exts_destroy(&r->exts); - tcf_exts_put_net(&r->exts); - tcindex_data_put(r->p); -} - -static void tcindex_destroy_rexts_work(struct work_struct *work) -{ - struct tcindex_filter_result *r; - - r = container_of(to_rcu_work(work), - struct tcindex_filter_result, - rwork); - rtnl_lock(); - __tcindex_destroy_rexts(r); - rtnl_unlock(); -} - -static void __tcindex_destroy_fexts(struct tcindex_filter *f) -{ - tcf_exts_destroy(&f->result.exts); - tcf_exts_put_net(&f->result.exts); - kfree(f); -} - -static void tcindex_destroy_fexts_work(struct work_struct *work) -{ - struct tcindex_filter *f = container_of(to_rcu_work(work), - struct tcindex_filter, - rwork); - - rtnl_lock(); - __tcindex_destroy_fexts(f); - rtnl_unlock(); -} - -static int tcindex_delete(struct tcf_proto *tp, void *arg, bool *last, - bool rtnl_held, struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = arg; - struct tcindex_filter __rcu **walk; - struct tcindex_filter *f = NULL; - - pr_debug("tcindex_delete(tp %p,arg %p),p %p\n", tp, arg, p); - if (p->perfect) { - if (!r->res.class) - return -ENOENT; - } else { - int i; - - for (i = 0; i < p->hash; i++) { - walk = p->h + i; - for (f = rtnl_dereference(*walk); f; - walk = &f->next, f = rtnl_dereference(*walk)) { - if (&f->result == r) - goto found; - } - } - return -ENOENT; - -found: - rcu_assign_pointer(*walk, rtnl_dereference(f->next)); - } - tcf_unbind_filter(tp, &r->res); - /* all classifiers are required to call tcf_exts_destroy() after rcu - * grace period, since converted-to-rcu actions are relying on that - * in cleanup() callback - */ - if (f) { - if 
(tcf_exts_get_net(&f->result.exts)) - tcf_queue_work(&f->rwork, tcindex_destroy_fexts_work); - else - __tcindex_destroy_fexts(f); - } else { - tcindex_data_get(p); - - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - - *last = false; - return 0; -} - -static void tcindex_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - tcindex_data_put(p); -} - -static inline int -valid_perfect_hash(struct tcindex_data *p) -{ - return p->hash > (p->mask >> p->shift); -} - -static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = { - [TCA_TCINDEX_HASH] = { .type = NLA_U32 }, - [TCA_TCINDEX_MASK] = { .type = NLA_U16 }, - [TCA_TCINDEX_SHIFT] = { .type = NLA_U32 }, - [TCA_TCINDEX_FALL_THROUGH] = { .type = NLA_U32 }, - [TCA_TCINDEX_CLASSID] = { .type = NLA_U32 }, -}; - -static int tcindex_filter_result_init(struct tcindex_filter_result *r, - struct tcindex_data *p, - struct net *net) -{ - memset(r, 0, sizeof(*r)); - r->p = p; - return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT, - TCA_TCINDEX_POLICE); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp); - -static void tcindex_partial_destroy_work(struct work_struct *work) -{ - struct tcindex_data *p = container_of(to_rcu_work(work), - struct tcindex_data, - rwork); - - rtnl_lock(); - if (p->perfect) - tcindex_free_perfect_hash(p); - kfree(p); - rtnl_unlock(); -} - -static void tcindex_free_perfect_hash(struct tcindex_data *cp) -{ - int i; - - for (i = 0; i < cp->hash; i++) - tcf_exts_destroy(&cp->perfect[i].exts); - kfree(cp->perfect); -} - -static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp) -{ - int i, err = 0; - - cp->perfect = kcalloc(cp->hash, sizeof(struct tcindex_filter_result), - GFP_KERNEL | __GFP_NOWARN); - if (!cp->perfect) - return -ENOMEM; - - for (i = 0; i < cp->hash; i++) { - err = tcf_exts_init(&cp->perfect[i].exts, net, - TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - goto errout; - cp->perfect[i].p = cp; - } - - return 0; - -errout: - tcindex_free_perfect_hash(cp); - return err; -} - -static int -tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, - u32 handle, struct tcindex_data *p, - struct tcindex_filter_result *r, struct nlattr **tb, - struct nlattr *est, u32 flags, struct netlink_ext_ack *extack) -{ - struct tcindex_filter_result new_filter_result; - struct tcindex_data *cp = NULL, *oldp; - struct tcindex_filter *f = NULL; /* make gcc behave */ - struct tcf_result cr = {}; - int err, balloc = 0; - struct tcf_exts e; - bool update_h = false; - - err = tcf_exts_init(&e, net, TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE); - if (err < 0) - return err; - err = tcf_exts_validate(net, tp, tb, est, &e, flags, extack); - if (err < 0) - goto errout; - - err = -ENOMEM; - /* tcindex_data attributes must look atomic to classifier/lookup so - * allocate new tcindex data and RCU assign it onto root. Keeping - * perfect hash and hash pointers from old data. 
- */ - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) - goto errout; - - cp->mask = p->mask; - cp->shift = p->shift; - cp->hash = p->hash; - cp->alloc_hash = p->alloc_hash; - cp->fall_through = p->fall_through; - cp->tp = tp; - refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */ - - if (tb[TCA_TCINDEX_HASH]) - cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]); - - if (tb[TCA_TCINDEX_MASK]) - cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - - if (tb[TCA_TCINDEX_SHIFT]) { - cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - if (cp->shift > 16) { - err = -EINVAL; - goto errout; - } - } - if (!cp->hash) { - /* Hash not specified, use perfect hash if the upper limit - * of the hashing index is below the threshold. - */ - if ((cp->mask >> cp->shift) < PERFECT_HASH_THRESHOLD) - cp->hash = (cp->mask >> cp->shift) + 1; - else - cp->hash = DEFAULT_HASH_SIZE; - } - - if (p->perfect) { - int i; - - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout; - cp->alloc_hash = cp->hash; - for (i = 0; i < min(cp->hash, p->hash); i++) - cp->perfect[i].res = p->perfect[i].res; - balloc = 1; - } - cp->h = p->h; - - err = tcindex_filter_result_init(&new_filter_result, cp, net); - if (err < 0) - goto errout_alloc; - if (r) - cr = r->res; - - err = -EBUSY; - - /* Hash already allocated, make sure that we still meet the - * requirements for the allocated hash. - */ - if (cp->perfect) { - if (!valid_perfect_hash(cp) || - cp->hash > cp->alloc_hash) - goto errout_alloc; - } else if (cp->h && cp->hash != cp->alloc_hash) { - goto errout_alloc; - } - - err = -EINVAL; - if (tb[TCA_TCINDEX_FALL_THROUGH]) - cp->fall_through = nla_get_u32(tb[TCA_TCINDEX_FALL_THROUGH]); - - if (!cp->perfect && !cp->h) - cp->alloc_hash = cp->hash; - - /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) - * but then, we'd fail handles that may become valid after some future - * mask change. While this is extremely unlikely to ever matter, - * the check below is safer (and also more backwards-compatible). 
- */ - if (cp->perfect || valid_perfect_hash(cp)) - if (handle >= cp->alloc_hash) - goto errout_alloc; - - - err = -ENOMEM; - if (!cp->perfect && !cp->h) { - if (valid_perfect_hash(cp)) { - if (tcindex_alloc_perfect_hash(net, cp) < 0) - goto errout_alloc; - balloc = 1; - } else { - struct tcindex_filter __rcu **hash; - - hash = kcalloc(cp->hash, - sizeof(struct tcindex_filter *), - GFP_KERNEL); - - if (!hash) - goto errout_alloc; - - cp->h = hash; - balloc = 2; - } - } - - if (cp->perfect) { - r = cp->perfect + handle; - } else { - /* imperfect area is updated in-place using rcu */ - update_h = !!tcindex_lookup(cp, handle); - r = &new_filter_result; - } - - if (r == &new_filter_result) { - f = kzalloc(sizeof(*f), GFP_KERNEL); - if (!f) - goto errout_alloc; - f->key = handle; - f->next = NULL; - err = tcindex_filter_result_init(&f->result, cp, net); - if (err < 0) { - kfree(f); - goto errout_alloc; - } - } - - if (tb[TCA_TCINDEX_CLASSID]) { - cr.classid = nla_get_u32(tb[TCA_TCINDEX_CLASSID]); - tcf_bind_filter(tp, &cr, base); - } - - oldp = p; - r->res = cr; - tcf_exts_change(&r->exts, &e); - - rcu_assign_pointer(tp->root, cp); - - if (update_h) { - struct tcindex_filter __rcu **fp; - struct tcindex_filter *cf; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - /* imperfect area bucket */ - fp = cp->h + (handle % cp->hash); - - /* lookup the filter, guaranteed to exist */ - for (cf = rcu_dereference_bh_rtnl(*fp); cf; - fp = &cf->next, cf = rcu_dereference_bh_rtnl(*fp)) - if (cf->key == (u16)handle) - break; - - f->next = cf->next; - - cf = rcu_replace_pointer(*fp, f, 1); - tcf_exts_get_net(&cf->result.exts); - tcf_queue_work(&cf->rwork, tcindex_destroy_fexts_work); - } else if (r == &new_filter_result) { - struct tcindex_filter *nfp; - struct tcindex_filter __rcu **fp; - - f->result.res = r->res; - tcf_exts_change(&f->result.exts, &r->exts); - - fp = cp->h + (handle % cp->hash); - for (nfp = rtnl_dereference(*fp); - nfp; - fp = &nfp->next, nfp = rtnl_dereference(*fp)) - ; /* nothing */ - - rcu_assign_pointer(*fp, f); - } else { - tcf_exts_destroy(&new_filter_result.exts); - } - - if (oldp) - tcf_queue_work(&oldp->rwork, tcindex_partial_destroy_work); - return 0; - -errout_alloc: - if (balloc == 1) - tcindex_free_perfect_hash(cp); - else if (balloc == 2) - kfree(cp->h); - tcf_exts_destroy(&new_filter_result.exts); -errout: - kfree(cp); - tcf_exts_destroy(&e); - return err; -} - -static int -tcindex_change(struct net *net, struct sk_buff *in_skb, - struct tcf_proto *tp, unsigned long base, u32 handle, - struct nlattr **tca, void **arg, u32 flags, - struct netlink_ext_ack *extack) -{ - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_TCINDEX_MAX + 1]; - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = *arg; - int err; - - pr_debug("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," - "p %p,r %p,*arg %p\n", - tp, handle, tca, arg, opt, p, r, *arg); - - if (!opt) - return 0; - - err = nla_parse_nested_deprecated(tb, TCA_TCINDEX_MAX, opt, - tcindex_policy, NULL); - if (err < 0) - return err; - - return tcindex_set_parms(net, tp, base, handle, p, r, tb, - tca[TCA_RATE], flags, extack); -} - -static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, - bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter *f, *next; - int i; - - pr_debug("tcindex_walk(tp %p,walker %p),p %p\n", tp, walker, p); - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - if 
(!p->perfect[i].res.class) - continue; - if (!tc_cls_stats_dump(tp, walker, p->perfect + i)) - return; - } - } - if (!p->h) - return; - for (i = 0; i < p->hash; i++) { - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - if (!tc_cls_stats_dump(tp, walker, &f->result)) - return; - } - } -} - -static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held, - struct netlink_ext_ack *extack) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - int i; - - pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); - - if (p->perfect) { - for (i = 0; i < p->hash; i++) { - struct tcindex_filter_result *r = p->perfect + i; - - /* tcf_queue_work() does not guarantee the ordering we - * want, so we have to take this refcnt temporarily to - * ensure 'p' is freed after all tcindex_filter_result - * here. Imperfect hash does not need this, because it - * uses linked lists rather than an array. - */ - tcindex_data_get(p); - - tcf_unbind_filter(tp, &r->res); - if (tcf_exts_get_net(&r->exts)) - tcf_queue_work(&r->rwork, - tcindex_destroy_rexts_work); - else - __tcindex_destroy_rexts(r); - } - } - - for (i = 0; p->h && i < p->hash; i++) { - struct tcindex_filter *f, *next; - bool last; - - for (f = rtnl_dereference(p->h[i]); f; f = next) { - next = rtnl_dereference(f->next); - tcindex_delete(tp, &f->result, &last, rtnl_held, NULL); - } - } - - tcf_queue_work(&p->rwork, tcindex_destroy_work); -} - - -static int tcindex_dump(struct net *net, struct tcf_proto *tp, void *fh, - struct sk_buff *skb, struct tcmsg *t, bool rtnl_held) -{ - struct tcindex_data *p = rtnl_dereference(tp->root); - struct tcindex_filter_result *r = fh; - struct nlattr *nest; - - pr_debug("tcindex_dump(tp %p,fh %p,skb %p,t %p),p %p,r %p\n", - tp, fh, skb, t, p, r); - pr_debug("p->perfect %p p->h %p\n", p->perfect, p->h); - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (!fh) { - t->tcm_handle = ~0; /* whatever ... 
*/ - if (nla_put_u32(skb, TCA_TCINDEX_HASH, p->hash) || - nla_put_u16(skb, TCA_TCINDEX_MASK, p->mask) || - nla_put_u32(skb, TCA_TCINDEX_SHIFT, p->shift) || - nla_put_u32(skb, TCA_TCINDEX_FALL_THROUGH, p->fall_through)) - goto nla_put_failure; - nla_nest_end(skb, nest); - } else { - if (p->perfect) { - t->tcm_handle = r - p->perfect; - } else { - struct tcindex_filter *f; - struct tcindex_filter __rcu **fp; - int i; - - t->tcm_handle = 0; - for (i = 0; !t->tcm_handle && i < p->hash; i++) { - fp = &p->h[i]; - for (f = rtnl_dereference(*fp); - !t->tcm_handle && f; - fp = &f->next, f = rtnl_dereference(*fp)) { - if (&f->result == r) - t->tcm_handle = f->key; - } - } - } - pr_debug("handle = %d\n", t->tcm_handle); - if (r->res.class && - nla_put_u32(skb, TCA_TCINDEX_CLASSID, r->res.classid)) - goto nla_put_failure; - - if (tcf_exts_dump(skb, &r->exts) < 0) - goto nla_put_failure; - nla_nest_end(skb, nest); - - if (tcf_exts_dump_stats(skb, &r->exts) < 0) - goto nla_put_failure; - } - - return skb->len; - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, - void *q, unsigned long base) -{ - struct tcindex_filter_result *r = fh; - - tc_cls_bind_class(classid, cl, q, &r->res, base); -} - -static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { - .kind = "tcindex", - .classify = tcindex_classify, - .init = tcindex_init, - .destroy = tcindex_destroy, - .get = tcindex_get, - .change = tcindex_change, - .delete = tcindex_delete, - .walk = tcindex_walk, - .dump = tcindex_dump, - .bind_class = tcindex_bind_class, - .owner = THIS_MODULE, -}; - -static int __init init_tcindex(void) -{ - return register_tcf_proto_ops(&cls_tcindex_ops); -} - -static void __exit exit_tcindex(void) -{ - unregister_tcf_proto_ops(&cls_tcindex_ops); -} - -module_init(init_tcindex) -module_exit(exit_tcindex) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 72d2c204d5f3..aba789c30a2e 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -902,7 +902,8 @@ static void qdisc_offload_graft_root(struct net_device *dev, } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, - u32 portid, u32 seq, u16 flags, int event) + u32 portid, u32 seq, u16 flags, int event, + struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; @@ -970,7 +971,12 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -991,7 +997,8 @@ static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1002,12 +1009,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, - 0, RTM_DELQDISC) < 0) + 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, - old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } @@ -1022,10 +1029,11 @@ err_out: static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, - struct Qdisc *old, struct Qdisc *new) + struct Qdisc *old, struct Qdisc *new, + struct netlink_ext_ack *extack) { if (new || old) - qdisc_notify(net, skb, n, clid, old, new); + qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); @@ -1105,12 +1113,12 @@ skip: qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); } else { - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } if (dev->flags & IFF_UP) @@ -1141,7 +1149,7 @@ skip: err = cops->graft(parent, cl, new, &old, extack); if (err) return err; - notify_and_destroy(net, skb, n, classid, old, new); + notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } @@ -1274,20 +1282,21 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (err) goto err_out3; - if (ops->init) { - err = ops->init(sch, tca[TCA_OPTIONS], extack); - if (err != 0) - goto err_out5; - } - if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) { err = PTR_ERR(stab); - goto err_out4; + goto err_out3; } rcu_assign_pointer(sch->stab, stab); } + + if (ops->init) { + err = ops->init(sch, tca[TCA_OPTIONS], extack); + if (err != 0) + goto err_out4; + } + if (tca[TCA_RATE]) { err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { @@ -1312,10 +1321,13 @@ static struct Qdisc *qdisc_create(struct net_device *dev, return sch; -err_out5: - /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */ +err_out4: + /* Even if ops->init() failed, we call ops->destroy() + * like qdisc_create_dflt(). + */ if (ops->destroy) ops->destroy(sch); + qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); @@ -1324,16 +1336,6 @@ err_out2: err_out: *errp = err; return NULL; - -err_out4: - /* - * Any broken qdiscs that would require a ops->reset() here? - * The qdisc was never in action so it shouldn't be necessary. 
- */ - qdisc_put_stab(rtnl_dereference(sch->stab)); - if (ops->destroy) - ops->destroy(sch); - goto err_out3; } static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, @@ -1509,7 +1511,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, if (err != 0) return err; } else { - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, NULL); } return 0; } @@ -1648,7 +1650,7 @@ replay: } err = qdisc_change(q, tca, extack); if (err == 0) - qdisc_notify(net, skb, n, clid, NULL, q); + qdisc_notify(net, skb, n, clid, NULL, q, extack); return err; create_n_graft: @@ -1715,7 +1717,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1737,7 +1739,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWQDISC) <= 0) + RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } @@ -1810,8 +1812,8 @@ done: ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, - unsigned long cl, - u32 portid, u32 seq, u16 flags, int event) + unsigned long cl, u32 portid, u32 seq, u16 flags, + int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; @@ -1846,7 +1848,12 @@ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; + if (extack && extack->_msg && + nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) + goto out_nlmsg_trim; + nlh->nlmsg_len = skb_tail_pointer(skb) - b; + return skb->len; out_nlmsg_trim: @@ -1857,7 +1864,7 @@ nla_put_failure: static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, - unsigned long cl, int event) + unsigned long cl, int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; @@ -1866,7 +1873,7 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) { + if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -1893,7 +1900,7 @@ static int tclass_del_notify(struct net *net, return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, - RTM_DELTCLASS) < 0) { + RTM_DELTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } @@ -2100,7 +2107,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: - err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS); + err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack); goto out; default: err = -EINVAL; @@ -2118,7 +2125,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { - tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS); + tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just create a new class, need to do reverse binding. 
*/ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); @@ -2140,7 +2147,7 @@ static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, - RTM_NEWTCLASS); + RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c deleted file mode 100644 index 4a981ca90b0b..000000000000 --- a/net/sched/sch_atm.c +++ /dev/null @@ -1,706 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* net/sched/sch_atm.c - ATM VC selection "queueing discipline" */ - -/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/interrupt.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/atmdev.h> -#include <linux/atmclip.h> -#include <linux/rtnetlink.h> -#include <linux/file.h> /* for fput */ -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> - -/* - * The ATM queuing discipline provides a framework for invoking classifiers - * (aka "filters"), which in turn select classes of this queuing discipline. - * Each class maps the flow(s) it is handling to a given VC. Multiple classes - * may share the same VC. - * - * When creating a class, VCs are specified by passing the number of the open - * socket descriptor by which the calling process references the VC. The kernel - * keeps the VC open at least until all classes using it are removed. - * - * In this file, most functions are named atm_tc_* to avoid confusion with all - * the atm_* in net/atm. This naming convention differs from what's used in the - * rest of net/sched. - * - * Known bugs: - * - sometimes messes up the IP stack - * - any manipulations besides the few operations described in the README, are - * untested and likely to crash the system - * - should lock the flow while there is data in the queue (?) - */ - -#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) - -struct atm_flow_data { - struct Qdisc_class_common common; - struct Qdisc *q; /* FIFO, TBF, etc. 
*/ - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ - void (*old_pop)(struct atm_vcc *vcc, - struct sk_buff *skb); /* chaining */ - struct atm_qdisc_data *parent; /* parent qdisc */ - struct socket *sock; /* for closing */ - int ref; /* reference count */ - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct list_head list; - struct atm_flow_data *excess; /* flow for excess traffic; - NULL to set CLP instead */ - int hdr_len; - unsigned char hdr[]; /* header data; MUST BE LAST */ -}; - -struct atm_qdisc_data { - struct atm_flow_data link; /* unclassified skbs go here */ - struct list_head flows; /* NB: "link" is also on this - list */ - struct tasklet_struct task; /* dequeue tasklet */ -}; - -/* ------------------------- Class/flow operations ------------------------- */ - -static inline struct atm_flow_data *lookup_flow(struct Qdisc *sch, u32 classid) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - list_for_each_entry(flow, &p->flows, list) { - if (flow->common.classid == classid) - return flow; - } - return NULL; -} - -static int atm_tc_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - pr_debug("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n", - sch, p, flow, new, old); - if (list_empty(&flow->list)) - return -EINVAL; - if (!new) - new = &noop_qdisc; - *old = flow->q; - flow->q = new; - if (*old) - qdisc_reset(*old); - return 0; -} - -static struct Qdisc *atm_tc_leaf(struct Qdisc *sch, unsigned long cl) -{ - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_leaf(sch %p,flow %p)\n", sch, flow); - return flow ? flow->q : NULL; -} - -static unsigned long atm_tc_find(struct Qdisc *sch, u32 classid) -{ - struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); - flow = lookup_flow(sch, classid); - pr_debug("%s: flow %p\n", __func__, flow); - return (unsigned long)flow; -} - -static unsigned long atm_tc_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) -{ - struct atm_qdisc_data *p __maybe_unused = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", __func__, sch, p, classid); - flow = lookup_flow(sch, classid); - if (flow) - flow->ref++; - pr_debug("%s: flow %p\n", __func__, flow); - return (unsigned long)flow; -} - -/* - * atm_tc_put handles all destructions, including the ones that are explicitly - * requested (atm_tc_destroy, etc.). The assumption here is that we never drop - * anything that still seems to be in use. 
- */ -static void atm_tc_put(struct Qdisc *sch, unsigned long cl) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_put(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (--flow->ref) - return; - pr_debug("atm_tc_put: destroying\n"); - list_del_init(&flow->list); - pr_debug("atm_tc_put: qdisc %p\n", flow->q); - qdisc_put(flow->q); - tcf_block_put(flow->block); - if (flow->sock) { - pr_debug("atm_tc_put: f_count %ld\n", - file_count(flow->sock->file)); - flow->vcc->pop = flow->old_pop; - sockfd_put(flow->sock); - } - if (flow->excess) - atm_tc_put(sch, (unsigned long)flow->excess); - if (flow != &p->link) - kfree(flow); - /* - * If flow == &p->link, the qdisc no longer works at this point and - * needs to be removed. (By the caller of atm_tc_put.) - */ -} - -static void sch_atm_pop(struct atm_vcc *vcc, struct sk_buff *skb) -{ - struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; - - pr_debug("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n", vcc, skb, p); - VCC2FLOW(vcc)->old_pop(vcc, skb); - tasklet_schedule(&p->task); -} - -static const u8 llc_oui_ip[] = { - 0xaa, /* DSAP: non-ISO */ - 0xaa, /* SSAP: non-ISO */ - 0x03, /* Ctrl: Unnumbered Information Command PDU */ - 0x00, /* OUI: EtherType */ - 0x00, 0x00, - 0x08, 0x00 -}; /* Ethertype IP (0800) */ - -static const struct nla_policy atm_policy[TCA_ATM_MAX + 1] = { - [TCA_ATM_FD] = { .type = NLA_U32 }, - [TCA_ATM_EXCESS] = { .type = NLA_U32 }, -}; - -static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)*arg; - struct atm_flow_data *excess = NULL; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_ATM_MAX + 1]; - struct socket *sock; - int fd, error, hdr_len; - void *hdr; - - pr_debug("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," - "flow %p,opt %p)\n", sch, p, classid, parent, flow, opt); - /* - * The concept of parents doesn't apply for this qdisc. - */ - if (parent && parent != TC_H_ROOT && parent != sch->handle) - return -EINVAL; - /* - * ATM classes cannot be changed. In order to change properties of the - * ATM connection, that socket needs to be modified directly (via the - * native ATM API. In order to send a flow to a different VC, the old - * class needs to be removed and a new one added. (This may be changed - * later.) 
- */ - if (flow) - return -EBUSY; - if (opt == NULL) - return -EINVAL; - - error = nla_parse_nested_deprecated(tb, TCA_ATM_MAX, opt, atm_policy, - NULL); - if (error < 0) - return error; - - if (!tb[TCA_ATM_FD]) - return -EINVAL; - fd = nla_get_u32(tb[TCA_ATM_FD]); - pr_debug("atm_tc_change: fd %d\n", fd); - if (tb[TCA_ATM_HDR]) { - hdr_len = nla_len(tb[TCA_ATM_HDR]); - hdr = nla_data(tb[TCA_ATM_HDR]); - } else { - hdr_len = RFC1483LLC_LEN; - hdr = NULL; /* default LLC/SNAP for IP */ - } - if (!tb[TCA_ATM_EXCESS]) - excess = NULL; - else { - excess = (struct atm_flow_data *) - atm_tc_find(sch, nla_get_u32(tb[TCA_ATM_EXCESS])); - if (!excess) - return -ENOENT; - } - pr_debug("atm_tc_change: type %d, payload %d, hdr_len %d\n", - opt->nla_type, nla_len(opt), hdr_len); - sock = sockfd_lookup(fd, &error); - if (!sock) - return error; /* f_count++ */ - pr_debug("atm_tc_change: f_count %ld\n", file_count(sock->file)); - if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { - error = -EPROTOTYPE; - goto err_out; - } - /* @@@ should check if the socket is really operational or we'll crash - on vcc->send */ - if (classid) { - if (TC_H_MAJ(classid ^ sch->handle)) { - pr_debug("atm_tc_change: classid mismatch\n"); - error = -EINVAL; - goto err_out; - } - } else { - int i; - unsigned long cl; - - for (i = 1; i < 0x8000; i++) { - classid = TC_H_MAKE(sch->handle, 0x8000 | i); - cl = atm_tc_find(sch, classid); - if (!cl) - break; - } - } - pr_debug("atm_tc_change: new id %x\n", classid); - flow = kzalloc(sizeof(struct atm_flow_data) + hdr_len, GFP_KERNEL); - pr_debug("atm_tc_change: flow %p\n", flow); - if (!flow) { - error = -ENOBUFS; - goto err_out; - } - - error = tcf_block_get(&flow->block, &flow->filter_list, sch, - extack); - if (error) { - kfree(flow); - goto err_out; - } - - flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, - extack); - if (!flow->q) - flow->q = &noop_qdisc; - pr_debug("atm_tc_change: qdisc %p\n", flow->q); - flow->sock = sock; - flow->vcc = ATM_SD(sock); /* speedup */ - flow->vcc->user_back = flow; - pr_debug("atm_tc_change: vcc %p\n", flow->vcc); - flow->old_pop = flow->vcc->pop; - flow->parent = p; - flow->vcc->pop = sch_atm_pop; - flow->common.classid = classid; - flow->ref = 1; - flow->excess = excess; - list_add(&flow->list, &p->link.list); - flow->hdr_len = hdr_len; - if (hdr) - memcpy(flow->hdr, hdr, hdr_len); - else - memcpy(flow->hdr, llc_oui_ip, sizeof(llc_oui_ip)); - *arg = (unsigned long)flow; - return 0; -err_out: - sockfd_put(sock); - return error; -} - -static int atm_tc_delete(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - pr_debug("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - if (list_empty(&flow->list)) - return -EINVAL; - if (rcu_access_pointer(flow->filter_list) || flow == &p->link) - return -EBUSY; - /* - * Reference count must be 2: one for "keepalive" (set at class - * creation), and one for the reference held when calling delete. - */ - if (flow->ref < 2) { - pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref); - return -EINVAL; - } - if (flow->ref > 2) - return -EBUSY; /* catch references via excess, etc. 
*/ - atm_tc_put(sch, arg); - return 0; -} - -static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); - if (walker->stop) - return; - list_for_each_entry(flow, &p->flows, list) { - if (!tc_qdisc_stats_dump(sch, (unsigned long)flow, walker)) - break; - } -} - -static struct tcf_block *atm_tc_tcf_block(struct Qdisc *sch, unsigned long cl, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - - pr_debug("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n", sch, p, flow); - return flow ? flow->block : p->link.block; -} - -/* --------------------------- Qdisc operations ---------------------------- */ - -static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - struct tcf_result res; - int result; - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - - pr_debug("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); - result = TC_ACT_OK; /* be nice to gcc */ - flow = NULL; - if (TC_H_MAJ(skb->priority) != sch->handle || - !(flow = (struct atm_flow_data *)atm_tc_find(sch, skb->priority))) { - struct tcf_proto *fl; - - list_for_each_entry(flow, &p->flows, list) { - fl = rcu_dereference_bh(flow->filter_list); - if (fl) { - result = tcf_classify(skb, NULL, fl, &res, true); - if (result < 0) - continue; - if (result == TC_ACT_SHOT) - goto done; - - flow = (struct atm_flow_data *)res.class; - if (!flow) - flow = lookup_flow(sch, res.classid); - goto drop; - } - } - flow = NULL; -done: - ; - } - if (!flow) { - flow = &p->link; - } else { - if (flow->vcc) - ATM_SKB(skb)->atm_options = flow->vcc->atm_options; - /*@@@ looks good ... but it's not supposed to work :-) */ -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - __qdisc_drop(skb, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - case TC_ACT_SHOT: - __qdisc_drop(skb, to_free); - goto drop; - case TC_ACT_RECLASSIFY: - if (flow->excess) - flow = flow->excess; - else - ATM_SKB(skb)->atm_options |= ATM_ATMOPT_CLP; - break; - } -#endif - } - - ret = qdisc_enqueue(skb, flow->q, to_free); - if (ret != NET_XMIT_SUCCESS) { -drop: __maybe_unused - if (net_xmit_drop_count(ret)) { - qdisc_qstats_drop(sch); - if (flow) - flow->qstats.drops++; - } - return ret; - } - /* - * Okay, this may seem weird. We pretend we've dropped the packet if - * it goes via ATM. The reason for this is that the outer qdisc - * expects to be able to q->dequeue the packet later on if we return - * success at this place. Also, sch->q.qdisc needs to reflect whether - * there is a packet egligible for dequeuing or not. Note that the - * statistics of the outer qdisc are necessarily wrong because of all - * this. There's currently no correct solution for this. - */ - if (flow == &p->link) { - sch->q.qlen++; - return NET_XMIT_SUCCESS; - } - tasklet_schedule(&p->task); - return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -} - -/* - * Dequeue packets and send them over ATM. Note that we quite deliberately - * avoid checking net_device's flow control here, simply because sch_atm - * uses its own channels, which have nothing to do with any CLIP/LANE/or - * non-ATM interfaces. 
- */ - -static void sch_atm_dequeue(struct tasklet_struct *t) -{ - struct atm_qdisc_data *p = from_tasklet(p, t, task); - struct Qdisc *sch = qdisc_from_priv(p); - struct atm_flow_data *flow; - struct sk_buff *skb; - - pr_debug("sch_atm_dequeue(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - if (flow == &p->link) - continue; - /* - * If traffic is properly shaped, this won't generate nasty - * little bursts. Otherwise, it may ... (but that's okay) - */ - while ((skb = flow->q->ops->peek(flow->q))) { - if (!atm_may_send(flow->vcc, skb->truesize)) - break; - - skb = qdisc_dequeue_peeked(flow->q); - if (unlikely(!skb)) - break; - - qdisc_bstats_update(sch, skb); - bstats_update(&flow->bstats, skb); - pr_debug("atm_tc_dequeue: sending on class %p\n", flow); - /* remove any LL header somebody else has attached */ - skb_pull(skb, skb_network_offset(skb)); - if (skb_headroom(skb) < flow->hdr_len) { - struct sk_buff *new; - - new = skb_realloc_headroom(skb, flow->hdr_len); - dev_kfree_skb(skb); - if (!new) - continue; - skb = new; - } - pr_debug("sch_atm_dequeue: ip %p, data %p\n", - skb_network_header(skb), skb->data); - ATM_SKB(skb)->vcc = flow->vcc; - memcpy(skb_push(skb, flow->hdr_len), flow->hdr, - flow->hdr_len); - refcount_add(skb->truesize, - &sk_atm(flow->vcc)->sk_wmem_alloc); - /* atm.atm_options are already set by atm_tc_enqueue */ - flow->vcc->send(flow->vcc, skb); - } - } -} - -static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct sk_buff *skb; - - pr_debug("atm_tc_dequeue(sch %p,[qdisc %p])\n", sch, p); - tasklet_schedule(&p->task); - skb = qdisc_dequeue_peeked(p->link.q); - if (skb) - sch->q.qlen--; - return skb; -} - -static struct sk_buff *atm_tc_peek(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - - pr_debug("atm_tc_peek(sch %p,[qdisc %p])\n", sch, p); - - return p->link.q->ops->peek(p->link.q); -} - -static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - int err; - - pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); - INIT_LIST_HEAD(&p->flows); - INIT_LIST_HEAD(&p->link.list); - gnet_stats_basic_sync_init(&p->link.bstats); - list_add(&p->link.list, &p->flows); - p->link.q = qdisc_create_dflt(sch->dev_queue, - &pfifo_qdisc_ops, sch->handle, extack); - if (!p->link.q) - p->link.q = &noop_qdisc; - pr_debug("atm_tc_init: link (%p) qdisc %p\n", &p->link, p->link.q); - p->link.vcc = NULL; - p->link.sock = NULL; - p->link.common.classid = sch->handle; - p->link.ref = 1; - - err = tcf_block_get(&p->link.block, &p->link.filter_list, sch, - extack); - if (err) - return err; - - tasklet_setup(&p->task, sch_atm_dequeue); - return 0; -} - -static void atm_tc_reset(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow; - - pr_debug("atm_tc_reset(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) - qdisc_reset(flow->q); -} - -static void atm_tc_destroy(struct Qdisc *sch) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow, *tmp; - - pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p); - list_for_each_entry(flow, &p->flows, list) { - tcf_block_put(flow->block); - flow->block = NULL; - } - - list_for_each_entry_safe(flow, tmp, &p->flows, list) { - if (flow->ref > 1) - pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref); - atm_tc_put(sch, (unsigned long)flow); - } - 
tasklet_kill(&p->task); -} - -static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct atm_qdisc_data *p = qdisc_priv(sch); - struct atm_flow_data *flow = (struct atm_flow_data *)cl; - struct nlattr *nest; - - pr_debug("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", - sch, p, flow, skb, tcm); - if (list_empty(&flow->list)) - return -EINVAL; - tcm->tcm_handle = flow->common.classid; - tcm->tcm_info = flow->q->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - - if (nla_put(skb, TCA_ATM_HDR, flow->hdr_len, flow->hdr)) - goto nla_put_failure; - if (flow->vcc) { - struct sockaddr_atmpvc pvc; - int state; - - memset(&pvc, 0, sizeof(pvc)); - pvc.sap_family = AF_ATMPVC; - pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; - pvc.sap_addr.vpi = flow->vcc->vpi; - pvc.sap_addr.vci = flow->vcc->vci; - if (nla_put(skb, TCA_ATM_ADDR, sizeof(pvc), &pvc)) - goto nla_put_failure; - state = ATM_VF2VS(flow->vcc->flags); - if (nla_put_u32(skb, TCA_ATM_STATE, state)) - goto nla_put_failure; - } - if (flow->excess) { - if (nla_put_u32(skb, TCA_ATM_EXCESS, flow->common.classid)) - goto nla_put_failure; - } else { - if (nla_put_u32(skb, TCA_ATM_EXCESS, 0)) - goto nla_put_failure; - } - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} -static int -atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) -{ - struct atm_flow_data *flow = (struct atm_flow_data *)arg; - - if (gnet_stats_copy_basic(d, NULL, &flow->bstats, true) < 0 || - gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) - return -1; - - return 0; -} - -static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - return 0; -} - -static const struct Qdisc_class_ops atm_class_ops = { - .graft = atm_tc_graft, - .leaf = atm_tc_leaf, - .find = atm_tc_find, - .change = atm_tc_change, - .delete = atm_tc_delete, - .walk = atm_tc_walk, - .tcf_block = atm_tc_tcf_block, - .bind_tcf = atm_tc_bind_filter, - .unbind_tcf = atm_tc_put, - .dump = atm_tc_dump_class, - .dump_stats = atm_tc_dump_class_stats, -}; - -static struct Qdisc_ops atm_qdisc_ops __read_mostly = { - .cl_ops = &atm_class_ops, - .id = "atm", - .priv_size = sizeof(struct atm_qdisc_data), - .enqueue = atm_tc_enqueue, - .dequeue = atm_tc_dequeue, - .peek = atm_tc_peek, - .init = atm_tc_init, - .reset = atm_tc_reset, - .destroy = atm_tc_destroy, - .dump = atm_tc_dump, - .owner = THIS_MODULE, -}; - -static int __init atm_init(void) -{ - return register_qdisc(&atm_qdisc_ops); -} - -static void __exit atm_exit(void) -{ - unregister_qdisc(&atm_qdisc_ops); -} - -module_init(atm_init) -module_exit(atm_exit) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 3ed0c3342189..7970217b565a 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1209,7 +1209,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, iph_check->daddr != iph->daddr) continue; - seglen = ntohs(iph_check->tot_len) - + seglen = iph_totlen(skb, iph_check) - (4 * iph_check->ihl); } else if (iph_check->version == 6) { ipv6h = (struct ipv6hdr *)iph; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c deleted file mode 100644 index 36db5f6782f2..000000000000 --- a/net/sched/sch_cbq.c +++ /dev/null @@ -1,1727 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * net/sched/sch_cbq.c Class-Based Queueing discipline. 
- * - * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <net/netlink.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> - - -/* Class-Based Queueing (CBQ) algorithm. - ======================================= - - Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource - Management Models for Packet Networks", - IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 - - [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 - - [3] Sally Floyd, "Notes on Class-Based Queueing: Setting - Parameters", 1996 - - [4] Sally Floyd and Michael Speer, "Experimental Results - for Class-Based Queueing", 1998, not published. - - ----------------------------------------------------------------------- - - Algorithm skeleton was taken from NS simulator cbq.cc. - If someone wants to check this code against the LBL version, - he should take into account that ONLY the skeleton was borrowed, - the implementation is different. Particularly: - - --- The WRR algorithm is different. Our version looks more - reasonable (I hope) and works when quanta are allowed to be - less than MTU, which is always the case when real time classes - have small rates. Note, that the statement of [3] is - incomplete, delay may actually be estimated even if class - per-round allotment is less than MTU. Namely, if per-round - allotment is W*r_i, and r_1+...+r_k = r < 1 - - delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B - - In the worst case we have IntServ estimate with D = W*r+k*MTU - and C = MTU*r. The proof (if correct at all) is trivial. - - - --- It seems that cbq-2.0 is not very accurate. At least, I cannot - interpret some places, which look like wrong translations - from NS. Anyone is advised to find these differences - and explain to me, why I am wrong 8). - - --- Linux has no EOI event, so that we cannot estimate true class - idle time. Workaround is to consider the next dequeue event - as sign that previous packet is finished. This is wrong because of - internal device queueing, but on a permanently loaded link it is true. - Moreover, combined with clock integrator, this scheme looks - very close to an ideal solution. */ - -struct cbq_sched_data; - - -struct cbq_class { - struct Qdisc_class_common common; - struct cbq_class *next_alive; /* next class with backlog in this priority band */ - -/* Parameters */ - unsigned char priority; /* class priority */ - unsigned char priority2; /* priority to be used after overlimit */ - unsigned char ewma_log; /* time constant for idle time calculation */ - - u32 defmap; - - /* Link-sharing scheduler parameters */ - long maxidle; /* Class parameters: see below. 
*/ - long offtime; - long minidle; - u32 avpkt; - struct qdisc_rate_table *R_tab; - - /* General scheduler (WRR) parameters */ - long allot; - long quantum; /* Allotment per WRR round */ - long weight; /* Relative allotment: see below */ - - struct Qdisc *qdisc; /* Ptr to CBQ discipline */ - struct cbq_class *split; /* Ptr to split node */ - struct cbq_class *share; /* Ptr to LS parent in the class tree */ - struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ - struct cbq_class *borrow; /* NULL if class is bandwidth limited; - parent otherwise */ - struct cbq_class *sibling; /* Sibling chain */ - struct cbq_class *children; /* Pointer to children chain */ - - struct Qdisc *q; /* Elementary queueing discipline */ - - -/* Variables */ - unsigned char cpriority; /* Effective priority */ - unsigned char delayed; - unsigned char level; /* level of the class in hierarchy: - 0 for leaf classes, and maximal - level of children + 1 for nodes. - */ - - psched_time_t last; /* Last end of service */ - psched_time_t undertime; - long avgidle; - long deficit; /* Saved deficit for WRR */ - psched_time_t penalized; - struct gnet_stats_basic_sync bstats; - struct gnet_stats_queue qstats; - struct net_rate_estimator __rcu *rate_est; - struct tc_cbq_xstats xstats; - - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - - int filters; - - struct cbq_class *defaults[TC_PRIO_MAX + 1]; -}; - -struct cbq_sched_data { - struct Qdisc_class_hash clhash; /* Hash table of all classes */ - int nclasses[TC_CBQ_MAXPRIO + 1]; - unsigned int quanta[TC_CBQ_MAXPRIO + 1]; - - struct cbq_class link; - - unsigned int activemask; - struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes - with backlog */ - -#ifdef CONFIG_NET_CLS_ACT - struct cbq_class *rx_class; -#endif - struct cbq_class *tx_class; - struct cbq_class *tx_borrowed; - int tx_len; - psched_time_t now; /* Cached timestamp */ - unsigned int pmask; - - struct qdisc_watchdog watchdog; /* Watchdog timer, - started when CBQ has - backlog, but cannot - transmit just now */ - psched_tdiff_t wd_expires; - int toplevel; - u32 hgenerator; -}; - - -#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len) - -static inline struct cbq_class * -cbq_class_lookup(struct cbq_sched_data *q, u32 classid) -{ - struct Qdisc_class_common *clc; - - clc = qdisc_class_find(&q->clhash, classid); - if (clc == NULL) - return NULL; - return container_of(clc, struct cbq_class, common); -} - -#ifdef CONFIG_NET_CLS_ACT - -static struct cbq_class * -cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) -{ - struct cbq_class *cl; - - for (cl = this->tparent; cl; cl = cl->tparent) { - struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT]; - - if (new != NULL && new != this) - return new; - } - return NULL; -} - -#endif - -/* Classify packet. The procedure is pretty complicated, but - * it allows us to combine link sharing and priority scheduling - * transparently. - * - * Namely, you can put link sharing rules (f.e. route based) at root of CBQ, - * so that it resolves to split nodes. Then packets are classified - * by logical priority, or a more specific classifier may be attached - * to the split node. - */ - -static struct cbq_class * -cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *head = &q->link; - struct cbq_class **defmap; - struct cbq_class *cl = NULL; - u32 prio = skb->priority; - struct tcf_proto *fl; - struct tcf_result res; - - /* - * Step 1. 
If skb->priority points to one of our classes, use it. - */ - if (TC_H_MAJ(prio ^ sch->handle) == 0 && - (cl = cbq_class_lookup(q, prio)) != NULL) - return cl; - - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; - for (;;) { - int result = 0; - defmap = head->defaults; - - fl = rcu_dereference_bh(head->filter_list); - /* - * Step 2+n. Apply classifier. - */ - result = tcf_classify(skb, NULL, fl, &res, true); - if (!fl || result < 0) - goto fallback; - if (result == TC_ACT_SHOT) - return NULL; - - cl = (void *)res.class; - if (!cl) { - if (TC_H_MAJ(res.classid)) - cl = cbq_class_lookup(q, res.classid); - else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL) - cl = defmap[TC_PRIO_BESTEFFORT]; - - if (cl == NULL) - goto fallback; - } - if (cl->level >= head->level) - goto fallback; -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - fallthrough; - case TC_ACT_RECLASSIFY: - return cbq_reclassify(skb, cl); - } -#endif - if (cl->level == 0) - return cl; - - /* - * Step 3+n. If classifier selected a link sharing class, - * apply agency specific classifier. - * Repeat this procedure until we hit a leaf node. - */ - head = cl; - } - -fallback: - cl = head; - - /* - * Step 4. No success... - */ - if (TC_H_MAJ(prio) == 0 && - !(cl = head->defaults[prio & TC_PRIO_MAX]) && - !(cl = head->defaults[TC_PRIO_BESTEFFORT])) - return head; - - return cl; -} - -/* - * A packet has just been enqueued on the empty class. - * cbq_activate_class adds it to the tail of active class list - * of its priority band. - */ - -static inline void cbq_activate_class(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - int prio = cl->cpriority; - struct cbq_class *cl_tail; - - cl_tail = q->active[prio]; - q->active[prio] = cl; - - if (cl_tail != NULL) { - cl->next_alive = cl_tail->next_alive; - cl_tail->next_alive = cl; - } else { - cl->next_alive = cl; - q->activemask |= (1<<prio); - } -} - -/* - * Unlink class from active chain. - * Note that this same procedure is done directly in cbq_dequeue* - * during round-robin procedure. 
- */ - -static void cbq_deactivate_class(struct cbq_class *this) -{ - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - int prio = this->cpriority; - struct cbq_class *cl; - struct cbq_class *cl_prev = q->active[prio]; - - do { - cl = cl_prev->next_alive; - if (cl == this) { - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - - if (cl == q->active[prio]) { - q->active[prio] = cl_prev; - if (cl == q->active[prio]) { - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - return; - } - } - return; - } - } while ((cl_prev = cl) != q->active[prio]); -} - -static void -cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) -{ - int toplevel = q->toplevel; - - if (toplevel > cl->level) { - psched_time_t now = psched_get_time(); - - do { - if (cl->undertime < now) { - q->toplevel = cl->level; - return; - } - } while ((cl = cl->borrow) != NULL && toplevel > cl->level); - } -} - -static int -cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - int ret; - struct cbq_class *cl = cbq_classify(skb, sch, &ret); - -#ifdef CONFIG_NET_CLS_ACT - q->rx_class = cl; -#endif - if (cl == NULL) { - if (ret & __NET_XMIT_BYPASS) - qdisc_qstats_drop(sch); - __qdisc_drop(skb, to_free); - return ret; - } - - ret = qdisc_enqueue(skb, cl->q, to_free); - if (ret == NET_XMIT_SUCCESS) { - sch->q.qlen++; - cbq_mark_toplevel(q, cl); - if (!cl->next_alive) - cbq_activate_class(cl); - return ret; - } - - if (net_xmit_drop_count(ret)) { - qdisc_qstats_drop(sch); - cbq_mark_toplevel(q, cl); - cl->qstats.drops++; - } - return ret; -} - -/* Overlimit action: penalize leaf class by adding offtime */ -static void cbq_overlimit(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - psched_tdiff_t delay = cl->undertime - q->now; - - if (!cl->delayed) { - delay += cl->offtime; - - /* - * Class goes to sleep, so that it will have no - * chance to work avgidle. Let's forgive it 8) - * - * BTW cbq-2.0 has a crap in this - * place, apparently they forgot to shift it by cl->ewma_log. - */ - if (cl->avgidle < 0) - delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); - if (cl->avgidle < cl->minidle) - cl->avgidle = cl->minidle; - if (delay <= 0) - delay = 1; - cl->undertime = q->now + delay; - - cl->xstats.overactions++; - cl->delayed = 1; - } - if (q->wd_expires == 0 || q->wd_expires > delay) - q->wd_expires = delay; - - /* Dirty work! We must schedule wakeups based on - * real available rate, rather than leaf rate, - * which may be tiny (even zero). - */ - if (q->toplevel == TC_CBQ_MAXLEVEL) { - struct cbq_class *b; - psched_tdiff_t base_delay = q->wd_expires; - - for (b = cl->borrow; b; b = b->borrow) { - delay = b->undertime - q->now; - if (delay < base_delay) { - if (delay <= 0) - delay = 1; - base_delay = delay; - } - } - - q->wd_expires = base_delay; - } -} - -/* - * It is mission critical procedure. - * - * We "regenerate" toplevel cutoff, if transmitting class - * has backlog and it is not regulated. It is not part of - * original CBQ description, but looks more reasonable. - * Probably, it is wrong. This question needs further investigation. 
- */ - -static inline void -cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, - struct cbq_class *borrowed) -{ - if (cl && q->toplevel >= borrowed->level) { - if (cl->q->q.qlen > 1) { - do { - if (borrowed->undertime == PSCHED_PASTPERFECT) { - q->toplevel = borrowed->level; - return; - } - } while ((borrowed = borrowed->borrow) != NULL); - } -#if 0 - /* It is not necessary now. Uncommenting it - will save CPU cycles, but decrease fairness. - */ - q->toplevel = TC_CBQ_MAXLEVEL; -#endif - } -} - -static void -cbq_update(struct cbq_sched_data *q) -{ - struct cbq_class *this = q->tx_class; - struct cbq_class *cl = this; - int len = q->tx_len; - psched_time_t now; - - q->tx_class = NULL; - /* Time integrator. We calculate EOS time - * by adding expected packet transmission time. - */ - now = q->now + L2T(&q->link, len); - - for ( ; cl; cl = cl->share) { - long avgidle = cl->avgidle; - long idle; - - _bstats_update(&cl->bstats, len, 1); - - /* - * (now - last) is total time between packet right edges. - * (last_pktlen/rate) is "virtual" busy time, so that - * - * idle = (now - last) - last_pktlen/rate - */ - - idle = now - cl->last; - if ((unsigned long)idle > 128*1024*1024) { - avgidle = cl->maxidle; - } else { - idle -= L2T(cl, len); - - /* true_avgidle := (1-W)*true_avgidle + W*idle, - * where W=2^{-ewma_log}. But cl->avgidle is scaled: - * cl->avgidle == true_avgidle/W, - * hence: - */ - avgidle += idle - (avgidle>>cl->ewma_log); - } - - if (avgidle <= 0) { - /* Overlimit or at-limit */ - - if (avgidle < cl->minidle) - avgidle = cl->minidle; - - cl->avgidle = avgidle; - - /* Calculate expected time, when this class - * will be allowed to send. - * It will occur, when: - * (1-W)*true_avgidle + W*delay = 0, i.e. - * idle = (1/W - 1)*(-true_avgidle) - * or - * idle = (1 - W)*(-cl->avgidle); - */ - idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); - - /* - * That is not all. - * To maintain the rate allocated to the class, - * we add to undertime virtual clock, - * necessary to complete transmitted packet. - * (len/phys_bandwidth has been already passed - * to the moment of cbq_update) - */ - - idle -= L2T(&q->link, len); - idle += L2T(cl, len); - - cl->undertime = now + idle; - } else { - /* Underlimit */ - - cl->undertime = PSCHED_PASTPERFECT; - if (avgidle > cl->maxidle) - cl->avgidle = cl->maxidle; - else - cl->avgidle = avgidle; - } - if ((s64)(now - cl->last) > 0) - cl->last = now; - } - - cbq_update_toplevel(q, this, q->tx_borrowed); -} - -static inline struct cbq_class * -cbq_under_limit(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *this_cl = cl; - - if (cl->tparent == NULL) - return cl; - - if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { - cl->delayed = 0; - return cl; - } - - do { - /* It is very suspicious place. Now overlimit - * action is generated for not bounded classes - * only if link is completely congested. - * Though it is in agree with ancestor-only paradigm, - * it looks very stupid. Particularly, - * it means that this chunk of code will either - * never be called or result in strong amplification - * of burstiness. Dangerous, silly, and, however, - * no another solution exists. 
- */ - cl = cl->borrow; - if (!cl) { - this_cl->qstats.overlimits++; - cbq_overlimit(this_cl); - return NULL; - } - if (cl->level > q->toplevel) - return NULL; - } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); - - cl->delayed = 0; - return cl; -} - -static inline struct sk_buff * -cbq_dequeue_prio(struct Qdisc *sch, int prio) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl_tail, *cl_prev, *cl; - struct sk_buff *skb; - int deficit; - - cl_tail = cl_prev = q->active[prio]; - cl = cl_prev->next_alive; - - do { - deficit = 0; - - /* Start round */ - do { - struct cbq_class *borrow = cl; - - if (cl->q->q.qlen && - (borrow = cbq_under_limit(cl)) == NULL) - goto skip_class; - - if (cl->deficit <= 0) { - /* Class exhausted its allotment per - * this round. Switch to the next one. - */ - deficit = 1; - cl->deficit += cl->quantum; - goto next_class; - } - - skb = cl->q->dequeue(cl->q); - - /* Class did not give us any skb :-( - * It could occur even if cl->q->q.qlen != 0 - * f.e. if cl->q == "tbf" - */ - if (skb == NULL) - goto skip_class; - - cl->deficit -= qdisc_pkt_len(skb); - q->tx_class = cl; - q->tx_borrowed = borrow; - if (borrow != cl) { -#ifndef CBQ_XSTATS_BORROWS_BYTES - borrow->xstats.borrows++; - cl->xstats.borrows++; -#else - borrow->xstats.borrows += qdisc_pkt_len(skb); - cl->xstats.borrows += qdisc_pkt_len(skb); -#endif - } - q->tx_len = qdisc_pkt_len(skb); - - if (cl->deficit <= 0) { - q->active[prio] = cl; - cl = cl->next_alive; - cl->deficit += cl->quantum; - } - return skb; - -skip_class: - if (cl->q->q.qlen == 0 || prio != cl->cpriority) { - /* Class is empty or penalized. - * Unlink it from active chain. - */ - cl_prev->next_alive = cl->next_alive; - cl->next_alive = NULL; - - /* Did cl_tail point to it? */ - if (cl == cl_tail) { - /* Repair it! */ - cl_tail = cl_prev; - - /* Was it the last class in this band? */ - if (cl == cl_tail) { - /* Kill the band! */ - q->active[prio] = NULL; - q->activemask &= ~(1<<prio); - if (cl->q->q.qlen) - cbq_activate_class(cl); - return NULL; - } - - q->active[prio] = cl_tail; - } - if (cl->q->q.qlen) - cbq_activate_class(cl); - - cl = cl_prev; - } - -next_class: - cl_prev = cl; - cl = cl->next_alive; - } while (cl_prev != cl_tail); - } while (deficit); - - q->active[prio] = cl_prev; - - return NULL; -} - -static inline struct sk_buff * -cbq_dequeue_1(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct sk_buff *skb; - unsigned int activemask; - - activemask = q->activemask & 0xFF; - while (activemask) { - int prio = ffz(~activemask); - activemask &= ~(1<<prio); - skb = cbq_dequeue_prio(sch, prio); - if (skb) - return skb; - } - return NULL; -} - -static struct sk_buff * -cbq_dequeue(struct Qdisc *sch) -{ - struct sk_buff *skb; - struct cbq_sched_data *q = qdisc_priv(sch); - psched_time_t now; - - now = psched_get_time(); - - if (q->tx_class) - cbq_update(q); - - q->now = now; - - for (;;) { - q->wd_expires = 0; - - skb = cbq_dequeue_1(sch); - if (skb) { - qdisc_bstats_update(sch, skb); - sch->q.qlen--; - return skb; - } - - /* All the classes are overlimit. - * - * It is possible, if: - * - * 1. Scheduler is empty. - * 2. Toplevel cutoff inhibited borrowing. - * 3. Root class is overlimit. - * - * Reset 2d and 3d conditions and retry. - * - * Note, that NS and cbq-2.0 are buggy, peeking - * an arbitrary class is appropriate for ancestor-only - * sharing, but not for toplevel algorithm. 
- * - * Our version is better, but slower, because it requires - * two passes, but it is unavoidable with top-level sharing. - */ - - if (q->toplevel == TC_CBQ_MAXLEVEL && - q->link.undertime == PSCHED_PASTPERFECT) - break; - - q->toplevel = TC_CBQ_MAXLEVEL; - q->link.undertime = PSCHED_PASTPERFECT; - } - - /* No packets in scheduler or nobody wants to give them to us :-( - * Sigh... start watchdog timer in the last case. - */ - - if (sch->q.qlen) { - qdisc_qstats_overlimit(sch); - if (q->wd_expires) - qdisc_watchdog_schedule(&q->watchdog, - now + q->wd_expires); - } - return NULL; -} - -/* CBQ class maintenance routines */ - -static void cbq_adjust_levels(struct cbq_class *this) -{ - if (this == NULL) - return; - - do { - int level = 0; - struct cbq_class *cl; - - cl = this->children; - if (cl) { - do { - if (cl->level > level) - level = cl->level; - } while ((cl = cl->sibling) != this->children); - } - this->level = level + 1; - } while ((this = this->tparent) != NULL); -} - -static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) -{ - struct cbq_class *cl; - unsigned int h; - - if (q->quanta[prio] == 0) - return; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - /* BUGGGG... Beware! This expression suffer of - * arithmetic overflows! - */ - if (cl->priority == prio) { - cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ - q->quanta[prio]; - } - if (cl->quantum <= 0 || - cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) { - pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n", - cl->common.classid, cl->quantum); - cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; - } - } - } -} - -static void cbq_sync_defmap(struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - struct cbq_class *split = cl->split; - unsigned int h; - int i; - - if (split == NULL) - return; - - for (i = 0; i <= TC_PRIO_MAX; i++) { - if (split->defaults[i] == cl && !(cl->defmap & (1<<i))) - split->defaults[i] = NULL; - } - - for (i = 0; i <= TC_PRIO_MAX; i++) { - int level = split->level; - - if (split->defaults[i]) - continue; - - for (h = 0; h < q->clhash.hashsize; h++) { - struct cbq_class *c; - - hlist_for_each_entry(c, &q->clhash.hash[h], - common.hnode) { - if (c->split == split && c->level < level && - c->defmap & (1<<i)) { - split->defaults[i] = c; - level = c->level; - } - } - } - } -} - -static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) -{ - struct cbq_class *split = NULL; - - if (splitid == 0) { - split = cl->split; - if (!split) - return; - splitid = split->common.classid; - } - - if (split == NULL || split->common.classid != splitid) { - for (split = cl->tparent; split; split = split->tparent) - if (split->common.classid == splitid) - break; - } - - if (split == NULL) - return; - - if (cl->split != split) { - cl->defmap = 0; - cbq_sync_defmap(cl); - cl->split = split; - cl->defmap = def & mask; - } else - cl->defmap = (cl->defmap & ~mask) | (def & mask); - - cbq_sync_defmap(cl); -} - -static void cbq_unlink_class(struct cbq_class *this) -{ - struct cbq_class *cl, **clp; - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - - qdisc_class_hash_remove(&q->clhash, &this->common); - - if (this->tparent) { - clp = &this->sibling; - cl = *clp; - do { - if (cl == this) { - *clp = cl->sibling; - break; - } - clp = &cl->sibling; - } while ((cl = *clp) != this->sibling); - - if (this->tparent->children == this) { - this->tparent->children = this->sibling; - if (this->sibling == this) 
- this->tparent->children = NULL; - } - } else { - WARN_ON(this->sibling != this); - } -} - -static void cbq_link_class(struct cbq_class *this) -{ - struct cbq_sched_data *q = qdisc_priv(this->qdisc); - struct cbq_class *parent = this->tparent; - - this->sibling = this; - qdisc_class_hash_insert(&q->clhash, &this->common); - - if (parent == NULL) - return; - - if (parent->children == NULL) { - parent->children = this; - } else { - this->sibling = parent->children->sibling; - parent->children->sibling = this; - } -} - -static void -cbq_reset(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - int prio; - unsigned int h; - - q->activemask = 0; - q->pmask = 0; - q->tx_class = NULL; - q->tx_borrowed = NULL; - qdisc_watchdog_cancel(&q->watchdog); - q->toplevel = TC_CBQ_MAXLEVEL; - q->now = psched_get_time(); - - for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) - q->active[prio] = NULL; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - qdisc_reset(cl->q); - - cl->next_alive = NULL; - cl->undertime = PSCHED_PASTPERFECT; - cl->avgidle = cl->maxidle; - cl->deficit = cl->quantum; - cl->cpriority = cl->priority; - } - } -} - - -static void cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) -{ - if (lss->change & TCF_CBQ_LSS_FLAGS) { - cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; - cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; - } - if (lss->change & TCF_CBQ_LSS_EWMA) - cl->ewma_log = lss->ewma_log; - if (lss->change & TCF_CBQ_LSS_AVPKT) - cl->avpkt = lss->avpkt; - if (lss->change & TCF_CBQ_LSS_MINIDLE) - cl->minidle = -(long)lss->minidle; - if (lss->change & TCF_CBQ_LSS_MAXIDLE) { - cl->maxidle = lss->maxidle; - cl->avgidle = lss->maxidle; - } - if (lss->change & TCF_CBQ_LSS_OFFTIME) - cl->offtime = lss->offtime; -} - -static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) -{ - q->nclasses[cl->priority]--; - q->quanta[cl->priority] -= cl->weight; - cbq_normalize_quanta(q, cl->priority); -} - -static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) -{ - q->nclasses[cl->priority]++; - q->quanta[cl->priority] += cl->weight; - cbq_normalize_quanta(q, cl->priority); -} - -static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) -{ - struct cbq_sched_data *q = qdisc_priv(cl->qdisc); - - if (wrr->allot) - cl->allot = wrr->allot; - if (wrr->weight) - cl->weight = wrr->weight; - if (wrr->priority) { - cl->priority = wrr->priority - 1; - cl->cpriority = cl->priority; - if (cl->priority >= cl->priority2) - cl->priority2 = TC_CBQ_MAXPRIO - 1; - } - - cbq_addprio(q, cl); - return 0; -} - -static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) -{ - cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); - return 0; -} - -static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { - [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, - [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, - [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, - [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, - [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, - [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, - [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, -}; - -static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], - struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - int err; - - if (!opt) { - NL_SET_ERR_MSG(extack, 
"CBQ options are required for this operation"); - return -EINVAL; - } - - err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt, - cbq_policy, extack); - if (err < 0) - return err; - - if (tb[TCA_CBQ_WRROPT]) { - const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]); - - if (wrr->priority > TC_CBQ_MAXPRIO) { - NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO"); - err = -EINVAL; - } - } - return err; -} - -static int cbq_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct nlattr *tb[TCA_CBQ_MAX + 1]; - struct tc_ratespec *r; - int err; - - qdisc_watchdog_init(&q->watchdog, sch); - - err = cbq_opt_parse(tb, opt, extack); - if (err < 0) - return err; - - if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) { - NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete"); - return -EINVAL; - } - - r = nla_data(tb[TCA_CBQ_RATE]); - - q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack); - if (!q->link.R_tab) - return -EINVAL; - - err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack); - if (err) - goto put_rtab; - - err = qdisc_class_hash_init(&q->clhash); - if (err < 0) - goto put_block; - - q->link.sibling = &q->link; - q->link.common.classid = sch->handle; - q->link.qdisc = sch; - q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle, NULL); - if (!q->link.q) - q->link.q = &noop_qdisc; - else - qdisc_hash_add(q->link.q, true); - - q->link.priority = TC_CBQ_MAXPRIO - 1; - q->link.priority2 = TC_CBQ_MAXPRIO - 1; - q->link.cpriority = TC_CBQ_MAXPRIO - 1; - q->link.allot = psched_mtu(qdisc_dev(sch)); - q->link.quantum = q->link.allot; - q->link.weight = q->link.R_tab->rate.rate; - - q->link.ewma_log = TC_CBQ_DEF_EWMA; - q->link.avpkt = q->link.allot/2; - q->link.minidle = -0x7FFFFFFF; - - q->toplevel = TC_CBQ_MAXLEVEL; - q->now = psched_get_time(); - - cbq_link_class(&q->link); - - if (tb[TCA_CBQ_LSSOPT]) - cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); - - cbq_addprio(q, &q->link); - return 0; - -put_block: - tcf_block_put(q->link.block); - -put_rtab: - qdisc_put_rtab(q->link.R_tab); - return err; -} - -static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - - if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_lssopt opt; - - opt.flags = 0; - if (cl->borrow == NULL) - opt.flags |= TCF_CBQ_LSS_BOUNDED; - if (cl->share == NULL) - opt.flags |= TCF_CBQ_LSS_ISOLATED; - opt.ewma_log = cl->ewma_log; - opt.level = cl->level; - opt.avpkt = cl->avpkt; - opt.maxidle = cl->maxidle; - opt.minidle = (u32)(-cl->minidle); - opt.offtime = cl->offtime; - opt.change = ~0; - if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_wrropt opt; - - memset(&opt, 0, sizeof(opt)); - opt.flags = 0; - opt.allot = cl->allot; - opt.priority = cl->priority + 1; - opt.cpriority = cl->cpriority + 1; - opt.weight = cl->weight; - if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt)) - goto nla_put_failure; - return skb->len; - 
-nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) -{ - unsigned char *b = skb_tail_pointer(skb); - struct tc_cbq_fopt opt; - - if (cl->split || cl->defmap) { - opt.split = cl->split ? cl->split->common.classid : 0; - opt.defmap = cl->defmap; - opt.defchange = ~0; - if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt)) - goto nla_put_failure; - } - return skb->len; - -nla_put_failure: - nlmsg_trim(skb, b); - return -1; -} - -static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) -{ - if (cbq_dump_lss(skb, cl) < 0 || - cbq_dump_rate(skb, cl) < 0 || - cbq_dump_wrr(skb, cl) < 0 || - cbq_dump_fopt(skb, cl) < 0) - return -1; - return 0; -} - -static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct nlattr *nest; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - if (cbq_dump_attr(skb, &q->link) < 0) - goto nla_put_failure; - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static int -cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - q->link.xstats.avgidle = q->link.avgidle; - return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); -} - -static int -cbq_dump_class(struct Qdisc *sch, unsigned long arg, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - struct nlattr *nest; - - if (cl->tparent) - tcm->tcm_parent = cl->tparent->common.classid; - else - tcm->tcm_parent = TC_H_ROOT; - tcm->tcm_handle = cl->common.classid; - tcm->tcm_info = cl->q->handle; - - nest = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (nest == NULL) - goto nla_put_failure; - if (cbq_dump_attr(skb, cl) < 0) - goto nla_put_failure; - return nla_nest_end(skb, nest); - -nla_put_failure: - nla_nest_cancel(skb, nest); - return -1; -} - -static int -cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, - struct gnet_dump *d) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - __u32 qlen; - - cl->xstats.avgidle = cl->avgidle; - cl->xstats.undertime = 0; - qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog); - - if (cl->undertime != PSCHED_PASTPERFECT) - cl->xstats.undertime = cl->undertime - q->now; - - if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || - gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || - gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) - return -1; - - return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); -} - -static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, - struct Qdisc **old, struct netlink_ext_ack *extack) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - cl->common.classid, extack); - if (new == NULL) - return -ENOBUFS; - } - - *old = qdisc_replace(sch, new, &cl->q); - return 0; -} - -static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - return cl->q; -} - -static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - cbq_deactivate_class(cl); -} - -static unsigned long cbq_find(struct Qdisc *sch, u32 classid) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - return (unsigned long)cbq_class_lookup(q, classid); -} 
- -static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - - WARN_ON(cl->filters); - - tcf_block_put(cl->block); - qdisc_put(cl->q); - qdisc_put_rtab(cl->R_tab); - gen_kill_estimator(&cl->rate_est); - if (cl != &q->link) - kfree(cl); -} - -static void cbq_destroy(struct Qdisc *sch) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct hlist_node *next; - struct cbq_class *cl; - unsigned int h; - -#ifdef CONFIG_NET_CLS_ACT - q->rx_class = NULL; -#endif - /* - * Filters must be destroyed first because we don't destroy the - * classes from root to leafs which means that filters can still - * be bound to classes which have been destroyed already. --TGR '04 - */ - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - tcf_block_put(cl->block); - cl->block = NULL; - } - } - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h], - common.hnode) - cbq_destroy_class(sch, cl); - } - qdisc_class_hash_destroy(&q->clhash); -} - -static int -cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, - unsigned long *arg, struct netlink_ext_ack *extack) -{ - int err; - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)*arg; - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_CBQ_MAX + 1]; - struct cbq_class *parent; - struct qdisc_rate_table *rtab = NULL; - - err = cbq_opt_parse(tb, opt, extack); - if (err < 0) - return err; - - if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) { - NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params"); - return -EOPNOTSUPP; - } - - if (cl) { - /* Check parent */ - if (parentid) { - if (cl->tparent && - cl->tparent->common.classid != parentid) { - NL_SET_ERR_MSG(extack, "Invalid parent id"); - return -EINVAL; - } - if (!cl->tparent && parentid != TC_H_ROOT) { - NL_SET_ERR_MSG(extack, "Parent must be root"); - return -EINVAL; - } - } - - if (tb[TCA_CBQ_RATE]) { - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), - tb[TCA_CBQ_RTAB], extack); - if (rtab == NULL) - return -EINVAL; - } - - if (tca[TCA_RATE]) { - err = gen_replace_estimator(&cl->bstats, NULL, - &cl->rate_est, - NULL, - true, - tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); - qdisc_put_rtab(rtab); - return err; - } - } - - /* Change class parameters */ - sch_tree_lock(sch); - - if (cl->next_alive != NULL) - cbq_deactivate_class(cl); - - if (rtab) { - qdisc_put_rtab(cl->R_tab); - cl->R_tab = rtab; - } - - if (tb[TCA_CBQ_LSSOPT]) - cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - - if (tb[TCA_CBQ_WRROPT]) { - cbq_rmprio(q, cl); - cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - } - - if (tb[TCA_CBQ_FOPT]) - cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); - - if (cl->q->q.qlen) - cbq_activate_class(cl); - - sch_tree_unlock(sch); - - return 0; - } - - if (parentid == TC_H_ROOT) - return -EINVAL; - - if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) { - NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing"); - return -EINVAL; - } - - rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB], - extack); - if (rtab == NULL) - return -EINVAL; - - if (classid) { - err = -EINVAL; - if (TC_H_MAJ(classid ^ sch->handle) || - cbq_class_lookup(q, classid)) { - NL_SET_ERR_MSG(extack, "Specified 
class not found"); - goto failure; - } - } else { - int i; - classid = TC_H_MAKE(sch->handle, 0x8000); - - for (i = 0; i < 0x8000; i++) { - if (++q->hgenerator >= 0x8000) - q->hgenerator = 1; - if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) - break; - } - err = -ENOSR; - if (i >= 0x8000) { - NL_SET_ERR_MSG(extack, "Unable to generate classid"); - goto failure; - } - classid = classid|q->hgenerator; - } - - parent = &q->link; - if (parentid) { - parent = cbq_class_lookup(q, parentid); - err = -EINVAL; - if (!parent) { - NL_SET_ERR_MSG(extack, "Failed to find parentid"); - goto failure; - } - } - - err = -ENOBUFS; - cl = kzalloc(sizeof(*cl), GFP_KERNEL); - if (cl == NULL) - goto failure; - - gnet_stats_basic_sync_init(&cl->bstats); - err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); - if (err) { - kfree(cl); - goto failure; - } - - if (tca[TCA_RATE]) { - err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, - NULL, true, tca[TCA_RATE]); - if (err) { - NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); - tcf_block_put(cl->block); - kfree(cl); - goto failure; - } - } - - cl->R_tab = rtab; - rtab = NULL; - cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid, - NULL); - if (!cl->q) - cl->q = &noop_qdisc; - else - qdisc_hash_add(cl->q, true); - - cl->common.classid = classid; - cl->tparent = parent; - cl->qdisc = sch; - cl->allot = parent->allot; - cl->quantum = cl->allot; - cl->weight = cl->R_tab->rate.rate; - - sch_tree_lock(sch); - cbq_link_class(cl); - cl->borrow = cl->tparent; - if (cl->tparent != &q->link) - cl->share = cl->tparent; - cbq_adjust_levels(parent); - cl->minidle = -0x7FFFFFFF; - cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); - cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); - if (cl->ewma_log == 0) - cl->ewma_log = q->link.ewma_log; - if (cl->maxidle == 0) - cl->maxidle = q->link.maxidle; - if (cl->avpkt == 0) - cl->avpkt = q->link.avpkt; - if (tb[TCA_CBQ_FOPT]) - cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); - sch_tree_unlock(sch); - - qdisc_class_hash_grow(sch, &q->clhash); - - *arg = (unsigned long)cl; - return 0; - -failure: - qdisc_put_rtab(rtab); - return err; -} - -static int cbq_delete(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - - if (cl->filters || cl->children || cl == &q->link) - return -EBUSY; - - sch_tree_lock(sch); - - qdisc_purge_queue(cl->q); - - if (cl->next_alive) - cbq_deactivate_class(cl); - - if (q->tx_borrowed == cl) - q->tx_borrowed = q->tx_class; - if (q->tx_class == cl) { - q->tx_class = NULL; - q->tx_borrowed = NULL; - } -#ifdef CONFIG_NET_CLS_ACT - if (q->rx_class == cl) - q->rx_class = NULL; -#endif - - cbq_unlink_class(cl); - cbq_adjust_levels(cl->tparent); - cl->defmap = 0; - cbq_sync_defmap(cl); - - cbq_rmprio(q, cl); - sch_tree_unlock(sch); - - cbq_destroy_class(sch, cl); - return 0; -} - -static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl = (struct cbq_class *)arg; - - if (cl == NULL) - cl = &q->link; - - return cl->block; -} - -static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, - u32 classid) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *p = (struct cbq_class *)parent; - struct cbq_class *cl = cbq_class_lookup(q, classid); - - if (cl) { - if (p && p->level <= cl->level) - return 0; - 
cl->filters++; - return (unsigned long)cl; - } - return 0; -} - -static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) -{ - struct cbq_class *cl = (struct cbq_class *)arg; - - cl->filters--; -} - -static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) -{ - struct cbq_sched_data *q = qdisc_priv(sch); - struct cbq_class *cl; - unsigned int h; - - if (arg->stop) - return; - - for (h = 0; h < q->clhash.hashsize; h++) { - hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) - return; - } - } -} - -static const struct Qdisc_class_ops cbq_class_ops = { - .graft = cbq_graft, - .leaf = cbq_leaf, - .qlen_notify = cbq_qlen_notify, - .find = cbq_find, - .change = cbq_change_class, - .delete = cbq_delete, - .walk = cbq_walk, - .tcf_block = cbq_tcf_block, - .bind_tcf = cbq_bind_filter, - .unbind_tcf = cbq_unbind_filter, - .dump = cbq_dump_class, - .dump_stats = cbq_dump_class_stats, -}; - -static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { - .next = NULL, - .cl_ops = &cbq_class_ops, - .id = "cbq", - .priv_size = sizeof(struct cbq_sched_data), - .enqueue = cbq_enqueue, - .dequeue = cbq_dequeue, - .peek = qdisc_peek_dequeued, - .init = cbq_init, - .reset = cbq_reset, - .destroy = cbq_destroy, - .change = NULL, - .dump = cbq_dump, - .dump_stats = cbq_dump_stats, - .owner = THIS_MODULE, -}; - -static int __init cbq_module_init(void) -{ - return register_qdisc(&cbq_qdisc_ops); -} -static void __exit cbq_module_exit(void) -{ - unregister_qdisc(&cbq_qdisc_ops); -} -module_init(cbq_module_init) -module_exit(cbq_module_exit) -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c deleted file mode 100644 index 401ffaf87d62..000000000000 --- a/net/sched/sch_dsmark.c +++ /dev/null @@ -1,518 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* net/sched/sch_dsmark.c - Differentiated Services field marker */ - -/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ - - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/skbuff.h> -#include <linux/rtnetlink.h> -#include <linux/bitops.h> -#include <net/pkt_sched.h> -#include <net/pkt_cls.h> -#include <net/dsfield.h> -#include <net/inet_ecn.h> -#include <asm/byteorder.h> - -/* - * classid class marking - * ------- ----- ------- - * n/a 0 n/a - * x:0 1 use entry [0] - * ... ... ... - * x:y y>0 y+1 use entry [y] - * ... ... ... - * x:indices-1 indices use entry [indices-1] - * ... ... ... - * x:y y+1 use entry [y & (indices-1)] - * ... ... ... 
- * 0xffff 0x10000 use entry [indices-1] - */ - - -#define NO_DEFAULT_INDEX (1 << 16) - -struct mask_value { - u8 mask; - u8 value; -}; - -struct dsmark_qdisc_data { - struct Qdisc *q; - struct tcf_proto __rcu *filter_list; - struct tcf_block *block; - struct mask_value *mv; - u16 indices; - u8 set_tc_index; - u32 default_index; /* index range is 0...0xffff */ -#define DSMARK_EMBEDDED_SZ 16 - struct mask_value embedded[DSMARK_EMBEDDED_SZ]; -}; - -static inline int dsmark_valid_index(struct dsmark_qdisc_data *p, u16 index) -{ - return index <= p->indices && index > 0; -} - -/* ------------------------- Class/flow operations ------------------------- */ - -static int dsmark_graft(struct Qdisc *sch, unsigned long arg, - struct Qdisc *new, struct Qdisc **old, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p],new %p,old %p)\n", - __func__, sch, p, new, old); - - if (new == NULL) { - new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, - sch->handle, NULL); - if (new == NULL) - new = &noop_qdisc; - } - - *old = qdisc_replace(sch, new, &p->q); - return 0; -} - -static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - return p->q; -} - -static unsigned long dsmark_find(struct Qdisc *sch, u32 classid) -{ - return TC_H_MIN(classid) + 1; -} - -static unsigned long dsmark_bind_filter(struct Qdisc *sch, - unsigned long parent, u32 classid) -{ - pr_debug("%s(sch %p,[qdisc %p],classid %x)\n", - __func__, sch, qdisc_priv(sch), classid); - - return dsmark_find(sch, classid); -} - -static void dsmark_unbind_filter(struct Qdisc *sch, unsigned long cl) -{ -} - -static const struct nla_policy dsmark_policy[TCA_DSMARK_MAX + 1] = { - [TCA_DSMARK_INDICES] = { .type = NLA_U16 }, - [TCA_DSMARK_DEFAULT_INDEX] = { .type = NLA_U16 }, - [TCA_DSMARK_SET_TC_INDEX] = { .type = NLA_FLAG }, - [TCA_DSMARK_MASK] = { .type = NLA_U8 }, - [TCA_DSMARK_VALUE] = { .type = NLA_U8 }, -}; - -static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, - struct nlattr **tca, unsigned long *arg, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opt = tca[TCA_OPTIONS]; - struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; - - pr_debug("%s(sch %p,[qdisc %p],classid %x,parent %x), arg 0x%lx\n", - __func__, sch, p, classid, parent, *arg); - - if (!dsmark_valid_index(p, *arg)) { - err = -ENOENT; - goto errout; - } - - if (!opt) - goto errout; - - err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, - dsmark_policy, NULL); - if (err < 0) - goto errout; - - if (tb[TCA_DSMARK_VALUE]) - p->mv[*arg - 1].value = nla_get_u8(tb[TCA_DSMARK_VALUE]); - - if (tb[TCA_DSMARK_MASK]) - p->mv[*arg - 1].mask = nla_get_u8(tb[TCA_DSMARK_MASK]); - - err = 0; - -errout: - return err; -} - -static int dsmark_delete(struct Qdisc *sch, unsigned long arg, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - if (!dsmark_valid_index(p, arg)) - return -EINVAL; - - p->mv[arg - 1].mask = 0xff; - p->mv[arg - 1].value = 0; - - return 0; -} - -static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - int i; - - pr_debug("%s(sch %p,[qdisc %p],walker %p)\n", - __func__, sch, p, walker); - - if (walker->stop) - return; - - for (i = 0; i < p->indices; i++) { - if (p->mv[i].mask == 0xff && !p->mv[i].value) { - walker->count++; - continue; - } - if 
(!tc_qdisc_stats_dump(sch, i + 1, walker)) - break; - } -} - -static struct tcf_block *dsmark_tcf_block(struct Qdisc *sch, unsigned long cl, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - return p->block; -} - -/* --------------------------- Qdisc operations ---------------------------- */ - -static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch, - struct sk_buff **to_free) -{ - unsigned int len = qdisc_pkt_len(skb); - struct dsmark_qdisc_data *p = qdisc_priv(sch); - int err; - - pr_debug("%s(skb %p,sch %p,[qdisc %p])\n", __func__, skb, sch, p); - - if (p->set_tc_index) { - int wlen = skb_network_offset(skb); - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - wlen += sizeof(struct iphdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) - goto drop; - - skb->tc_index = ipv4_get_dsfield(ip_hdr(skb)) - & ~INET_ECN_MASK; - break; - - case htons(ETH_P_IPV6): - wlen += sizeof(struct ipv6hdr); - if (!pskb_may_pull(skb, wlen) || - skb_try_make_writable(skb, wlen)) - goto drop; - - skb->tc_index = ipv6_get_dsfield(ipv6_hdr(skb)) - & ~INET_ECN_MASK; - break; - default: - skb->tc_index = 0; - break; - } - } - - if (TC_H_MAJ(skb->priority) == sch->handle) - skb->tc_index = TC_H_MIN(skb->priority); - else { - struct tcf_result res; - struct tcf_proto *fl = rcu_dereference_bh(p->filter_list); - int result = tcf_classify(skb, NULL, fl, &res, false); - - pr_debug("result %d class 0x%04x\n", result, res.classid); - - switch (result) { -#ifdef CONFIG_NET_CLS_ACT - case TC_ACT_QUEUED: - case TC_ACT_STOLEN: - case TC_ACT_TRAP: - __qdisc_drop(skb, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - - case TC_ACT_SHOT: - goto drop; -#endif - case TC_ACT_OK: - skb->tc_index = TC_H_MIN(res.classid); - break; - - default: - if (p->default_index != NO_DEFAULT_INDEX) - skb->tc_index = p->default_index; - break; - } - } - - err = qdisc_enqueue(skb, p->q, to_free); - if (err != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(err)) - qdisc_qstats_drop(sch); - return err; - } - - sch->qstats.backlog += len; - sch->q.qlen++; - - return NET_XMIT_SUCCESS; - -drop: - qdisc_drop(skb, sch, to_free); - return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; -} - -static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct sk_buff *skb; - u32 index; - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - skb = qdisc_dequeue_peeked(p->q); - if (skb == NULL) - return NULL; - - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - index = skb->tc_index & (p->indices - 1); - pr_debug("index %d->%d\n", skb->tc_index, index); - - switch (skb_protocol(skb, true)) { - case htons(ETH_P_IP): - ipv4_change_dsfield(ip_hdr(skb), p->mv[index].mask, - p->mv[index].value); - break; - case htons(ETH_P_IPV6): - ipv6_change_dsfield(ipv6_hdr(skb), p->mv[index].mask, - p->mv[index].value); - break; - default: - /* - * Only complain if a change was actually attempted. - * This way, we can send non-IP traffic through dsmark - * and don't need yet another qdisc as a bypass. 
- */ - if (p->mv[index].mask != 0xff || p->mv[index].value) - pr_warn("%s: unsupported protocol %d\n", - __func__, ntohs(skb_protocol(skb, true))); - break; - } - - return skb; -} - -static struct sk_buff *dsmark_peek(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - return p->q->ops->peek(p->q); -} - -static int dsmark_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *tb[TCA_DSMARK_MAX + 1]; - int err = -EINVAL; - u32 default_index = NO_DEFAULT_INDEX; - u16 indices; - int i; - - pr_debug("%s(sch %p,[qdisc %p],opt %p)\n", __func__, sch, p, opt); - - if (!opt) - goto errout; - - err = tcf_block_get(&p->block, &p->filter_list, sch, extack); - if (err) - return err; - - err = nla_parse_nested_deprecated(tb, TCA_DSMARK_MAX, opt, - dsmark_policy, NULL); - if (err < 0) - goto errout; - - err = -EINVAL; - if (!tb[TCA_DSMARK_INDICES]) - goto errout; - indices = nla_get_u16(tb[TCA_DSMARK_INDICES]); - - if (hweight32(indices) != 1) - goto errout; - - if (tb[TCA_DSMARK_DEFAULT_INDEX]) - default_index = nla_get_u16(tb[TCA_DSMARK_DEFAULT_INDEX]); - - if (indices <= DSMARK_EMBEDDED_SZ) - p->mv = p->embedded; - else - p->mv = kmalloc_array(indices, sizeof(*p->mv), GFP_KERNEL); - if (!p->mv) { - err = -ENOMEM; - goto errout; - } - for (i = 0; i < indices; i++) { - p->mv[i].mask = 0xff; - p->mv[i].value = 0; - } - p->indices = indices; - p->default_index = default_index; - p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - - p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle, - NULL); - if (p->q == NULL) - p->q = &noop_qdisc; - else - qdisc_hash_add(p->q, true); - - pr_debug("%s: qdisc %p\n", __func__, p->q); - - err = 0; -errout: - return err; -} - -static void dsmark_reset(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - if (p->q) - qdisc_reset(p->q); -} - -static void dsmark_destroy(struct Qdisc *sch) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - - pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p); - - tcf_block_put(p->block); - qdisc_put(p->q); - if (p->mv != p->embedded) - kfree(p->mv); -} - -static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opts = NULL; - - pr_debug("%s(sch %p,[qdisc %p],class %ld\n", __func__, sch, p, cl); - - if (!dsmark_valid_index(p, cl)) - return -EINVAL; - - tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1); - tcm->tcm_info = p->q->handle; - - opts = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (opts == NULL) - goto nla_put_failure; - if (nla_put_u8(skb, TCA_DSMARK_MASK, p->mv[cl - 1].mask) || - nla_put_u8(skb, TCA_DSMARK_VALUE, p->mv[cl - 1].value)) - goto nla_put_failure; - - return nla_nest_end(skb, opts); - -nla_put_failure: - nla_nest_cancel(skb, opts); - return -EMSGSIZE; -} - -static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) -{ - struct dsmark_qdisc_data *p = qdisc_priv(sch); - struct nlattr *opts = NULL; - - opts = nla_nest_start_noflag(skb, TCA_OPTIONS); - if (opts == NULL) - goto nla_put_failure; - if (nla_put_u16(skb, TCA_DSMARK_INDICES, p->indices)) - goto nla_put_failure; - - if (p->default_index != NO_DEFAULT_INDEX && - nla_put_u16(skb, TCA_DSMARK_DEFAULT_INDEX, p->default_index)) - goto nla_put_failure; - - if 
(p->set_tc_index && - nla_put_flag(skb, TCA_DSMARK_SET_TC_INDEX)) - goto nla_put_failure; - - return nla_nest_end(skb, opts); - -nla_put_failure: - nla_nest_cancel(skb, opts); - return -EMSGSIZE; -} - -static const struct Qdisc_class_ops dsmark_class_ops = { - .graft = dsmark_graft, - .leaf = dsmark_leaf, - .find = dsmark_find, - .change = dsmark_change, - .delete = dsmark_delete, - .walk = dsmark_walk, - .tcf_block = dsmark_tcf_block, - .bind_tcf = dsmark_bind_filter, - .unbind_tcf = dsmark_unbind_filter, - .dump = dsmark_dump_class, -}; - -static struct Qdisc_ops dsmark_qdisc_ops __read_mostly = { - .next = NULL, - .cl_ops = &dsmark_class_ops, - .id = "dsmark", - .priv_size = sizeof(struct dsmark_qdisc_data), - .enqueue = dsmark_enqueue, - .dequeue = dsmark_dequeue, - .peek = dsmark_peek, - .init = dsmark_init, - .reset = dsmark_reset, - .destroy = dsmark_destroy, - .change = NULL, - .dump = dsmark_dump, - .owner = THIS_MODULE, -}; - -static int __init dsmark_module_init(void) -{ - return register_qdisc(&dsmark_qdisc_ops); -} - -static void __exit dsmark_module_exit(void) -{ - unregister_qdisc(&dsmark_qdisc_ops); -} - -module_init(dsmark_module_init) -module_exit(dsmark_module_exit) - -MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 4c68abaa289b..48ed87b91086 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -17,6 +17,8 @@ #include <net/sch_generic.h> #include <net/pkt_cls.h> +#include "sch_mqprio_lib.h" + struct mqprio_sched { struct Qdisc **qdiscs; u16 mode; @@ -27,6 +29,62 @@ struct mqprio_sched { u64 max_rate[TC_QOPT_MAX_QUEUE]; }; +static int mqprio_enable_offload(struct Qdisc *sch, + const struct tc_mqprio_qopt *qopt, + struct netlink_ext_ack *extack) +{ + struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; + struct mqprio_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int err, i; + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + if (priv->shaper != TC_MQPRIO_SHAPER_DCB) + return -EINVAL; + break; + case TC_MQPRIO_MODE_CHANNEL: + mqprio.flags = priv->flags; + if (priv->flags & TC_MQPRIO_F_MODE) + mqprio.mode = priv->mode; + if (priv->flags & TC_MQPRIO_F_SHAPER) + mqprio.shaper = priv->shaper; + if (priv->flags & TC_MQPRIO_F_MIN_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.min_rate[i] = priv->min_rate[i]; + if (priv->flags & TC_MQPRIO_F_MAX_RATE) + for (i = 0; i < mqprio.qopt.num_tc; i++) + mqprio.max_rate[i] = priv->max_rate[i]; + break; + default: + return -EINVAL; + } + + err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, + &mqprio); + if (err) + return err; + + priv->hw_offload = mqprio.qopt.hw; + + return 0; +} + +static void mqprio_disable_offload(struct Qdisc *sch) +{ + struct tc_mqprio_qopt_offload mqprio = { { 0 } }; + struct mqprio_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + switch (priv->mode) { + case TC_MQPRIO_MODE_DCB: + case TC_MQPRIO_MODE_CHANNEL: + dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO, + &mqprio); + break; + } +} + static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); @@ -41,37 +99,17 @@ static void mqprio_destroy(struct Qdisc *sch) kfree(priv->qdiscs); } - if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { - struct tc_mqprio_qopt_offload mqprio = { { 0 } }; - - switch (priv->mode) { - case TC_MQPRIO_MODE_DCB: - case TC_MQPRIO_MODE_CHANNEL: - dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_MQPRIO, - &mqprio); - break; - default: - return; - } - } 
else { + if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) + mqprio_disable_offload(sch); + else netdev_set_num_tc(dev, 0); - } } -static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) +static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + const struct tc_mqprio_caps *caps, + struct netlink_ext_ack *extack) { - int i, j; - - /* Verify num_tc is not out of max range */ - if (qopt->num_tc > TC_MAX_QUEUE) - return -EINVAL; - - /* Verify priority mapping uses valid tcs */ - for (i = 0; i < TC_BITMASK + 1; i++) { - if (qopt->prio_tc_map[i] >= qopt->num_tc) - return -EINVAL; - } + int err; /* Limit qopt->hw to maximum supported offload value. Drivers have * the option of overriding this later if they don't support the a @@ -80,31 +118,23 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; - /* If hardware offload is requested we will leave it to the device - * to either populate the queue counts itself or to validate the - * provided queue counts. If ndo_setup_tc is not present then - * hardware doesn't support offload and we should return an error. + /* If hardware offload is requested, we will leave 3 options to the + * device driver: + * - populate the queue counts itself (and ignore what was requested) + * - validate the provided queue counts by itself (and apply them) + * - request queue count validation here (and apply them) */ - if (qopt->hw) - return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL; - - for (i = 0; i < qopt->num_tc; i++) { - unsigned int last = qopt->offset[i] + qopt->count[i]; - - /* Verify the queue count is in tx range being equal to the - * real_num_tx_queues indicates the last queue is in use. - */ - if (qopt->offset[i] >= dev->real_num_tx_queues || - !qopt->count[i] || - last > dev->real_num_tx_queues) - return -EINVAL; - - /* Verify that the offset and counts do not overlap */ - for (j = i + 1; j < qopt->num_tc; j++) { - if (last > qopt->offset[j]) - return -EINVAL; - } - } + err = mqprio_validate_qopt(dev, qopt, + !qopt->hw || caps->validate_queue_counts, + false, extack); + if (err) + return err; + + /* If ndo_setup_tc is not present then hardware doesn't support offload + * and we should return an error. 
+ */ + if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) + return -EINVAL; return 0; } @@ -130,6 +160,67 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, return 0; } +static int mqprio_parse_nlattr(struct Qdisc *sch, struct tc_mqprio_qopt *qopt, + struct nlattr *opt) +{ + struct mqprio_sched *priv = qdisc_priv(sch); + struct nlattr *tb[TCA_MQPRIO_MAX + 1]; + struct nlattr *attr; + int i, rem, err; + + err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, + sizeof(*qopt)); + if (err < 0) + return err; + + if (!qopt->hw) + return -EINVAL; + + if (tb[TCA_MQPRIO_MODE]) { + priv->flags |= TC_MQPRIO_F_MODE; + priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); + } + + if (tb[TCA_MQPRIO_SHAPER]) { + priv->flags |= TC_MQPRIO_F_SHAPER; + priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); + } + + if (tb[TCA_MQPRIO_MIN_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->min_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MIN_RATE; + } + + if (tb[TCA_MQPRIO_MAX_RATE64]) { + if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) + return -EINVAL; + i = 0; + nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], + rem) { + if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) + return -EINVAL; + if (i >= qopt->num_tc) + break; + priv->max_rate[i] = *(u64 *)nla_data(attr); + i++; + } + priv->flags |= TC_MQPRIO_F_MAX_RATE; + } + + return 0; +} + static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -139,9 +230,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, struct Qdisc *qdisc; int i, err = -EOPNOTSUPP; struct tc_mqprio_qopt *qopt = NULL; - struct nlattr *tb[TCA_MQPRIO_MAX + 1]; - struct nlattr *attr; - int rem; + struct tc_mqprio_caps caps; int len; BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE); @@ -160,61 +249,18 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, if (!opt || nla_len(opt) < sizeof(*qopt)) return -EINVAL; + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO, + &caps, sizeof(caps)); + qopt = nla_data(opt); - if (mqprio_parse_opt(dev, qopt)) + if (mqprio_parse_opt(dev, qopt, &caps, extack)) return -EINVAL; len = nla_len(opt) - NLA_ALIGN(sizeof(*qopt)); if (len > 0) { - err = parse_attr(tb, TCA_MQPRIO_MAX, opt, mqprio_policy, - sizeof(*qopt)); - if (err < 0) + err = mqprio_parse_nlattr(sch, qopt, opt); + if (err) return err; - - if (!qopt->hw) - return -EINVAL; - - if (tb[TCA_MQPRIO_MODE]) { - priv->flags |= TC_MQPRIO_F_MODE; - priv->mode = *(u16 *)nla_data(tb[TCA_MQPRIO_MODE]); - } - - if (tb[TCA_MQPRIO_SHAPER]) { - priv->flags |= TC_MQPRIO_F_SHAPER; - priv->shaper = *(u16 *)nla_data(tb[TCA_MQPRIO_SHAPER]); - } - - if (tb[TCA_MQPRIO_MIN_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MIN_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MIN_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->min_rate[i] = *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MIN_RATE; - } - - if (tb[TCA_MQPRIO_MAX_RATE64]) { - if (priv->shaper != TC_MQPRIO_SHAPER_BW_RATE) - return -EINVAL; - i = 0; - nla_for_each_nested(attr, tb[TCA_MQPRIO_MAX_RATE64], - rem) { - if (nla_type(attr) != TCA_MQPRIO_MAX_RATE64) - return -EINVAL; - if (i >= qopt->num_tc) - break; - priv->max_rate[i] 
= *(u64 *)nla_data(attr); - i++; - } - priv->flags |= TC_MQPRIO_F_MAX_RATE; - } } /* pre-allocate qdisc, attachment can't fail */ @@ -241,36 +287,9 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt, * supplied and verified mapping */ if (qopt->hw) { - struct tc_mqprio_qopt_offload mqprio = {.qopt = *qopt}; - - switch (priv->mode) { - case TC_MQPRIO_MODE_DCB: - if (priv->shaper != TC_MQPRIO_SHAPER_DCB) - return -EINVAL; - break; - case TC_MQPRIO_MODE_CHANNEL: - mqprio.flags = priv->flags; - if (priv->flags & TC_MQPRIO_F_MODE) - mqprio.mode = priv->mode; - if (priv->flags & TC_MQPRIO_F_SHAPER) - mqprio.shaper = priv->shaper; - if (priv->flags & TC_MQPRIO_F_MIN_RATE) - for (i = 0; i < mqprio.qopt.num_tc; i++) - mqprio.min_rate[i] = priv->min_rate[i]; - if (priv->flags & TC_MQPRIO_F_MAX_RATE) - for (i = 0; i < mqprio.qopt.num_tc; i++) - mqprio.max_rate[i] = priv->max_rate[i]; - break; - default: - return -EINVAL; - } - err = dev->netdev_ops->ndo_setup_tc(dev, - TC_SETUP_QDISC_MQPRIO, - &mqprio); + err = mqprio_enable_offload(sch, qopt, extack); if (err) return err; - - priv->hw_offload = mqprio.qopt.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -387,7 +406,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nla = (struct nlattr *)skb_tail_pointer(skb); struct tc_mqprio_qopt opt = { 0 }; struct Qdisc *qdisc; - unsigned int ntx, tc; + unsigned int ntx; sch->q.qlen = 0; gnet_stats_basic_sync_init(&sch->bstats); @@ -411,15 +430,9 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } - opt.num_tc = netdev_get_num_tc(dev); - memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + mqprio_qopt_reconstruct(dev, &opt); opt.hw = priv->hw_offload; - for (tc = 0; tc < netdev_get_num_tc(dev); tc++) { - opt.count[tc] = dev->tc_to_txq[tc].count; - opt.offset[tc] = dev->tc_to_txq[tc].offset; - } - if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; diff --git a/net/sched/sch_mqprio_lib.c b/net/sched/sch_mqprio_lib.c new file mode 100644 index 000000000000..c58a533b8ec5 --- /dev/null +++ b/net/sched/sch_mqprio_lib.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/net.h> +#include <linux/netdevice.h> +#include <linux/netlink.h> +#include <linux/types.h> +#include <net/pkt_sched.h> + +#include "sch_mqprio_lib.h" + +/* Returns true if the intervals [a, b) and [c, d) overlap. */ +static bool intervals_overlap(int a, int b, int c, int d) +{ + int left = max(a, c), right = min(b, d); + + return left < right; +} + +static int mqprio_validate_queue_counts(struct net_device *dev, + const struct tc_mqprio_qopt *qopt, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack) +{ + int i, j; + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + if (!qopt->count[i]) { + NL_SET_ERR_MSG_FMT_MOD(extack, "No queues for TC %d", + i); + return -EINVAL; + } + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. 
+ */ + if (qopt->offset[i] >= dev->real_num_tx_queues || + last > dev->real_num_tx_queues) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "Queues %d:%d for TC %d exceed the %d TX queues available", + qopt->count[i], qopt->offset[i], + i, dev->real_num_tx_queues); + return -EINVAL; + } + + if (allow_overlapping_txqs) + continue; + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (intervals_overlap(qopt->offset[i], last, + qopt->offset[j], + qopt->offset[j] + + qopt->count[j])) { + NL_SET_ERR_MSG_FMT_MOD(extack, + "TC %d queues %d@%d overlap with TC %d queues %d@%d", + i, qopt->count[i], qopt->offset[i], + j, qopt->count[j], qopt->offset[j]); + return -EINVAL; + } + } + } + + return 0; +} + +int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + bool validate_queue_counts, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack) +{ + int i, err; + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) { + NL_SET_ERR_MSG(extack, + "Number of traffic classes is outside valid range"); + return -EINVAL; + } + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i <= TC_BITMASK; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) { + NL_SET_ERR_MSG(extack, + "Invalid traffic class in priority to traffic class mapping"); + return -EINVAL; + } + } + + if (validate_queue_counts) { + err = mqprio_validate_queue_counts(dev, qopt, + allow_overlapping_txqs, + extack); + if (err) + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(mqprio_validate_qopt); + +void mqprio_qopt_reconstruct(struct net_device *dev, struct tc_mqprio_qopt *qopt) +{ + int tc, num_tc = netdev_get_num_tc(dev); + + qopt->num_tc = num_tc; + memcpy(qopt->prio_tc_map, dev->prio_tc_map, sizeof(qopt->prio_tc_map)); + + for (tc = 0; tc < num_tc; tc++) { + qopt->count[tc] = dev->tc_to_txq[tc].count; + qopt->offset[tc] = dev->tc_to_txq[tc].offset; + } +} +EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct); + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_mqprio_lib.h b/net/sched/sch_mqprio_lib.h new file mode 100644 index 000000000000..63f725ab8761 --- /dev/null +++ b/net/sched/sch_mqprio_lib.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __SCH_MQPRIO_LIB_H +#define __SCH_MQPRIO_LIB_H + +#include <linux/types.h> + +struct net_device; +struct netlink_ext_ack; +struct tc_mqprio_qopt; + +int mqprio_validate_qopt(struct net_device *dev, struct tc_mqprio_qopt *qopt, + bool validate_queue_counts, + bool allow_overlapping_txqs, + struct netlink_ext_ack *extack); +void mqprio_qopt_reconstruct(struct net_device *dev, + struct tc_mqprio_qopt *qopt); + +#endif diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c322a61eaeea..1f469861eae3 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -26,7 +26,11 @@ #include <net/sock.h> #include <net/tcp.h> +#include "sch_mqprio_lib.h" + static LIST_HEAD(taprio_list); +static struct static_key_false taprio_have_broken_mqprio; +static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 @@ -35,15 +39,19 @@ static LIST_HEAD(taprio_list); #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { - struct list_head list; - - /* The instant that this entry "closes" and the next one - * should open, the qdisc will make some effort so that no - * packet leaves after this time. 
+ /* Durations between this GCL entry and the GCL entry where the + * respective traffic class gate closes */ - ktime_t close_time; + u64 gate_duration[TC_MAX_QUEUE]; + atomic_t budget[TC_MAX_QUEUE]; + /* The qdisc makes some effort so that no packet leaves + * after this time + */ + ktime_t gate_close_time[TC_MAX_QUEUE]; + struct list_head list; + /* Used to calculate when to advance the schedule */ + ktime_t end_time; ktime_t next_txtime; - atomic_t budget; int index; u32 gate_mask; u32 interval; @@ -51,10 +59,16 @@ struct sched_entry { }; struct sched_gate_list { + /* Longest non-zero contiguous gate durations per traffic class, + * or 0 if a traffic class gate never opens during the schedule. + */ + u64 max_open_gate_duration[TC_MAX_QUEUE]; + u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ + u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; - ktime_t cycle_close_time; + ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; @@ -67,6 +81,8 @@ struct taprio_sched { enum tk_offsets tk_offset; int clockid; bool offloaded; + bool detected_mqprio; + bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ @@ -78,8 +94,8 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ - u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */ + int cur_txq[TC_MAX_QUEUE]; + u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 txtime_delay; }; @@ -88,6 +104,57 @@ struct __tc_taprio_qopt_offload { struct tc_taprio_qopt_offload offload; }; +static void taprio_calculate_gate_durations(struct taprio_sched *q, + struct sched_gate_list *sched) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + struct sched_entry *entry, *cur; + int tc; + + list_for_each_entry(entry, &sched->entries, list) { + u32 gates_still_open = entry->gate_mask; + + /* For each traffic class, calculate each open gate duration, + * starting at this schedule entry and ending at the schedule + * entry containing a gate close event for that TC. + */ + cur = entry; + + do { + if (!gates_still_open) + break; + + for (tc = 0; tc < num_tc; tc++) { + if (!(gates_still_open & BIT(tc))) + continue; + + if (cur->gate_mask & BIT(tc)) + entry->gate_duration[tc] += cur->interval; + else + gates_still_open &= ~BIT(tc); + } + + cur = list_next_entry_circular(cur, &sched->entries, list); + } while (cur != entry); + + /* Keep track of the maximum gate duration for each traffic + * class, taking care to not confuse a traffic class which is + * temporarily closed with one that is always closed. 
+ */ + for (tc = 0; tc < num_tc; tc++) + if (entry->gate_duration[tc] && + sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) + sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; + } +} + +static bool taprio_entry_allows_tx(ktime_t skb_end_time, + struct sched_entry *entry, int tc) +{ + return ktime_before(skb_end_time, entry->gate_close_time[tc]); +} + static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) @@ -180,6 +247,63 @@ static int length_to_duration(struct taprio_sched *q, int len) return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } +static int duration_to_length(struct taprio_sched *q, u64 duration) +{ + return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); +} + +/* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the + * q->max_sdu[] requested by the user and the max_sdu dynamically determined by + * the maximum open gate durations at the given link speed. + */ +static void taprio_update_queue_max_sdu(struct taprio_sched *q, + struct sched_gate_list *sched, + struct qdisc_size_table *stab) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + u32 max_sdu_from_user; + u32 max_sdu_dynamic; + u32 max_sdu; + int tc; + + for (tc = 0; tc < num_tc; tc++) { + max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; + + /* TC gate never closes => keep the queueMaxSDU + * selected by the user + */ + if (sched->max_open_gate_duration[tc] == sched->cycle_time) { + max_sdu_dynamic = U32_MAX; + } else { + u32 max_frm_len; + + max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); + /* Compensate for L1 overhead from size table, + * but don't let the frame size go negative + */ + if (stab) { + max_frm_len -= stab->szopts.overhead; + max_frm_len = max_t(int, max_frm_len, + dev->hard_header_len + 1); + } + max_sdu_dynamic = max_frm_len - dev->hard_header_len; + if (max_sdu_dynamic > dev->max_mtu) + max_sdu_dynamic = U32_MAX; + } + + max_sdu = min(max_sdu_dynamic, max_sdu_from_user); + + if (max_sdu != U32_MAX) { + sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; + sched->max_sdu[tc] = max_sdu; + } else { + sched->max_frm_len[tc] = U32_MAX; /* never oversized */ + sched->max_sdu[tc] = 0; + } + } +} + /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. @@ -413,14 +537,33 @@ done: return txtime; } -static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, - struct Qdisc *child, struct sk_buff **to_free) +/* Devices with full offload are expected to honor this in hardware */ +static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, + struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *sched; int prio = skb->priority; + bool exceeds = false; u8 tc; + tc = netdev_get_prio_tc_map(dev, prio); + + rcu_read_lock(); + sched = rcu_dereference(q->oper_sched); + if (sched && skb->len > sched->max_frm_len[tc]) + exceeds = true; + rcu_read_unlock(); + + return exceeds; +} + +static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + /* sk_flags are only safe to use on full sockets. 
*/ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) @@ -431,17 +574,53 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } - /* Devices with full offload are expected to honor this in hardware */ - tc = netdev_get_prio_tc_map(dev, prio); - if (skb->len > q->max_frm_len[tc]) - return qdisc_drop(skb, sch, to_free); - qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } +static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, + struct Qdisc *child, + struct sk_buff **to_free) +{ + unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); + netdev_features_t features = netif_skb_features(skb); + struct sk_buff *segs, *nskb; + int ret; + + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(segs)) + return qdisc_drop(skb, sch, to_free); + + skb_list_walk_safe(segs, segs, nskb) { + skb_mark_not_on_list(segs); + qdisc_skb_cb(segs)->pkt_len = segs->len; + slen += segs->len; + + /* FIXME: we should be segmenting to a smaller size + * rather than dropping these + */ + if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) + ret = qdisc_drop(segs, sch, to_free); + else + ret = taprio_enqueue_one(segs, sch, child, to_free); + + if (ret != NET_XMIT_SUCCESS) { + if (net_xmit_drop_count(ret)) + qdisc_qstats_drop(sch); + } else { + numsegs++; + } + } + + if (numsegs > 1) + qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); + consume_skb(skb); + + return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +} + /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ @@ -458,97 +637,190 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); - /* Large packets might not be transmitted when the transmission duration - * exceeds any configured interval. Therefore, segment the skb into - * smaller chunks. Drivers with full offload are expected to handle - * this in hardware. - */ - if (skb_is_gso(skb)) { - unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); - netdev_features_t features = netif_skb_features(skb); - struct sk_buff *segs, *nskb; - int ret; - - segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); - if (IS_ERR_OR_NULL(segs)) - return qdisc_drop(skb, sch, to_free); + if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { + /* Large packets might not be transmitted when the transmission + * duration exceeds any configured interval. Therefore, segment + * the skb into smaller chunks. Drivers with full offload are + * expected to handle this in hardware. 
+ */ + if (skb_is_gso(skb)) + return taprio_enqueue_segmented(skb, sch, child, + to_free); - skb_list_walk_safe(segs, segs, nskb) { - skb_mark_not_on_list(segs); - qdisc_skb_cb(segs)->pkt_len = segs->len; - slen += segs->len; + return qdisc_drop(skb, sch, to_free); + } - ret = taprio_enqueue_one(segs, sch, child, to_free); - if (ret != NET_XMIT_SUCCESS) { - if (net_xmit_drop_count(ret)) - qdisc_qstats_drop(sch); - } else { - numsegs++; - } - } + return taprio_enqueue_one(skb, sch, child, to_free); +} - if (numsegs > 1) - qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); - consume_skb(skb); +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); + return NULL; +} + +static void taprio_set_budgets(struct taprio_sched *q, + struct sched_gate_list *sched, + struct sched_entry *entry) +{ + struct net_device *dev = qdisc_dev(q->root); + int num_tc = netdev_get_num_tc(dev); + int tc, budget; + + for (tc = 0; tc < num_tc; tc++) { + /* Traffic classes which never close have infinite budget */ + if (entry->gate_duration[tc] == sched->cycle_time) + budget = INT_MAX; + else + budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, + atomic64_read(&q->picos_per_byte)); + + atomic_set(&entry->budget[tc], budget); + } +} - return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; +/* When an skb is sent, it consumes from the budget of all traffic classes */ +static int taprio_update_budgets(struct sched_entry *entry, size_t len, + int tc_consumed, int num_tc) +{ + int tc, budget, new_budget = 0; + + for (tc = 0; tc < num_tc; tc++) { + budget = atomic_read(&entry->budget[tc]); + /* Don't consume from infinite budget */ + if (budget == INT_MAX) { + if (tc == tc_consumed) + new_budget = budget; + continue; + } + + if (tc == tc_consumed) + new_budget = atomic_sub_return(len, &entry->budget[tc]); + else + atomic_sub(len, &entry->budget[tc]); } - return taprio_enqueue_one(skb, sch, child, to_free); + return new_budget; } -/* Will not be called in the full offload case, since the TX queues are - * attached to the Qdisc created using qdisc_create_dflt() - */ -static struct sk_buff *taprio_peek(struct Qdisc *sch) +static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, + struct sched_entry *entry, + u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); - struct sched_entry *entry; + struct Qdisc *child = q->qdiscs[txq]; + int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; - u32 gate_mask; - int i; + ktime_t guard; + int prio; + int len; + u8 tc; - rcu_read_lock(); - entry = rcu_dereference(q->current_entry); - gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - rcu_read_unlock(); + if (unlikely(!child)) + return NULL; - if (!gate_mask) + if (TXTIME_ASSIST_IS_ENABLED(q->flags)) + goto skip_peek_checks; + + skb = child->ops->peek(child); + if (!skb) return NULL; - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - int prio; - u8 tc; + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); - if (unlikely(!child)) - continue; + if (!(gate_mask & BIT(tc))) + return NULL; - skb = child->ops->peek(child); - if (!skb) - continue; + len = qdisc_pkt_len(skb); + guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) - return skb; + /* In the case that there's no gate entry, there's no + * guard band ... 
+ */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + !taprio_entry_allows_tx(guard, entry, tc)) + return NULL; - prio = skb->priority; - tc = netdev_get_prio_tc_map(dev, prio); + /* ... and no budget. */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + taprio_update_budgets(entry, len, tc, num_tc) < 0) + return NULL; + +skip_peek_checks: + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + return NULL; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; +} + +static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) +{ + int offset = dev->tc_to_txq[tc].offset; + int count = dev->tc_to_txq[tc].count; + + (*txq)++; + if (*txq == offset + count) + *txq = offset; +} + +/* Prioritize higher traffic classes, and select among TXQs belonging to the + * same TC using round robin + */ +static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, + struct sched_entry *entry, + u32 gate_mask) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int num_tc = netdev_get_num_tc(dev); + struct sk_buff *skb; + int tc; + + for (tc = num_tc - 1; tc >= 0; tc--) { + int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; - return skb; + do { + skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], + entry, gate_mask); + + taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); + + if (skb) + return skb; + } while (q->cur_txq[tc] != first_txq); } return NULL; } -static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) +/* Broken way of prioritizing smaller TXQ indices and ignoring the traffic + * class other than to determine whether the gate is open or not + */ +static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, + struct sched_entry *entry, + u32 gate_mask) { - atomic_set(&entry->budget, - div64_u64((u64)entry->interval * PSEC_PER_NSEC, - atomic64_read(&q->picos_per_byte))); + struct net_device *dev = qdisc_dev(sch); + struct sk_buff *skb; + int i; + + for (i = 0; i < dev->num_tx_queues; i++) { + skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); + if (skb) + return skb; + } + + return NULL; } /* Will not be called in the full offload case, since the TX queues are @@ -557,11 +829,9 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); - struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; - int i; rcu_read_lock(); entry = rcu_dereference(q->current_entry); @@ -571,69 +841,23 @@ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; - if (!gate_mask) goto done; - for (i = 0; i < dev->num_tx_queues; i++) { - struct Qdisc *child = q->qdiscs[i]; - ktime_t guard; - int prio; - int len; - u8 tc; - - if (unlikely(!child)) - continue; - - if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { - skb = child->ops->dequeue(child); - if (!skb) - continue; - goto skb_found; - } - - skb = child->ops->peek(child); - if (!skb) - continue; - - prio = skb->priority; - tc = netdev_get_prio_tc_map(dev, prio); - - if (!(gate_mask & BIT(tc))) { - skb = NULL; - continue; - } - - len = qdisc_pkt_len(skb); - guard = ktime_add_ns(taprio_get_time(q), - length_to_duration(q, len)); - - /* In the case that there's no gate entry, there's no - * guard band ... 
- */ - if (gate_mask != TAPRIO_ALL_GATES_OPEN && - ktime_after(guard, entry->close_time)) { - skb = NULL; - continue; - } - - /* ... and no budget. */ - if (gate_mask != TAPRIO_ALL_GATES_OPEN && - atomic_sub_return(len, &entry->budget) < 0) { - skb = NULL; - continue; - } - - skb = child->ops->dequeue(child); - if (unlikely(!skb)) - goto done; - -skb_found: - qdisc_bstats_update(sch, skb); - qdisc_qstats_backlog_dec(sch, skb); - sch->q.qlen--; - - goto done; + if (static_branch_unlikely(&taprio_have_broken_mqprio) && + !static_branch_likely(&taprio_have_working_mqprio)) { + /* Single NIC kind which is broken */ + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); + } else if (static_branch_likely(&taprio_have_working_mqprio) && + !static_branch_unlikely(&taprio_have_broken_mqprio)) { + /* Single NIC kind which prioritizes properly */ + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); + } else { + /* Mixed NIC kinds present in system, need dynamic testing */ + if (q->broken_mqprio) + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); + else + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: @@ -648,7 +872,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper, if (list_is_last(&entry->list, &oper->entries)) return true; - if (ktime_compare(entry->close_time, oper->cycle_close_time) == 0) + if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; @@ -656,7 +880,7 @@ static bool should_restart_cycle(const struct sched_gate_list *oper, static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, - ktime_t close_time) + ktime_t end_time) { ktime_t next_base_time, extension_time; @@ -665,18 +889,18 @@ static bool should_change_schedules(const struct sched_gate_list *admin, next_base_time = sched_base_time(admin); - /* This is the simple case, the close_time would fall after + /* This is the simple case, the end_time would fall after * the next schedule base_time. */ - if (ktime_compare(next_base_time, close_time) <= 0) + if (ktime_compare(next_base_time, end_time) <= 0) return true; - /* This is the cycle_time_extension case, if the close_time + /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ - extension_time = ktime_add_ns(close_time, oper->cycle_time_extension); + extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after @@ -692,10 +916,13 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); + struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; + int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; - ktime_t close_time; + ktime_t end_time; + int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, @@ -714,41 +941,49 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer) * entry of all schedules are pre-calculated during the * schedule initialization. 
 	 */
-	if (unlikely(!entry || entry->close_time == oper->base_time)) {
+	if (unlikely(!entry || entry->end_time == oper->base_time)) {
 		next = list_first_entry(&oper->entries, struct sched_entry,
 					list);
-		close_time = next->close_time;
+		end_time = next->end_time;
 		goto first_run;
 	}
 
 	if (should_restart_cycle(oper, entry)) {
 		next = list_first_entry(&oper->entries, struct sched_entry,
 					list);
-		oper->cycle_close_time = ktime_add_ns(oper->cycle_close_time,
-						      oper->cycle_time);
+		oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time,
+						    oper->cycle_time);
 	} else {
 		next = list_next_entry(entry, list);
 	}
 
-	close_time = ktime_add_ns(entry->close_time, next->interval);
-	close_time = min_t(ktime_t, close_time, oper->cycle_close_time);
+	end_time = ktime_add_ns(entry->end_time, next->interval);
+	end_time = min_t(ktime_t, end_time, oper->cycle_end_time);
+
+	for (tc = 0; tc < num_tc; tc++) {
+		if (next->gate_duration[tc] == oper->cycle_time)
+			next->gate_close_time[tc] = KTIME_MAX;
+		else
+			next->gate_close_time[tc] = ktime_add_ns(entry->end_time,
+								 next->gate_duration[tc]);
+	}
 
-	if (should_change_schedules(admin, oper, close_time)) {
+	if (should_change_schedules(admin, oper, end_time)) {
 		/* Set things so the next time this runs, the new
 		 * schedule runs.
 		 */
-		close_time = sched_base_time(admin);
+		end_time = sched_base_time(admin);
 		switch_schedules(q, &admin, &oper);
 	}
 
-	next->close_time = close_time;
-	taprio_set_budget(q, next);
+	next->end_time = end_time;
+	taprio_set_budgets(q, oper, next);
 
 first_run:
 	rcu_assign_pointer(q->current_entry, next);
 	spin_unlock(&q->current_entry_lock);
 
-	hrtimer_set_expires(&q->advance_timer, close_time);
+	hrtimer_set_expires(&q->advance_timer, end_time);
 
 	rcu_read_lock();
 	__netif_schedule(sch);
@@ -916,6 +1151,8 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb,
 		new->cycle_time = cycle;
 	}
 
+	taprio_calculate_gate_durations(q, new);
+
 	return 0;
 }
 
@@ -924,7 +1161,7 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
 				   struct netlink_ext_ack *extack,
 				   u32 taprio_flags)
 {
-	int i, j;
+	bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags);
 
 	if (!qopt && !dev->num_tc) {
 		NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary");
@@ -937,52 +1174,17 @@ static int taprio_parse_mqprio_opt(struct net_device *dev,
 	if (dev->num_tc)
 		return 0;
 
-	/* Verify num_tc is not out of max range */
-	if (qopt->num_tc > TC_MAX_QUEUE) {
-		NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range");
-		return -EINVAL;
-	}
-
 	/* taprio imposes that traffic classes map 1:n to tx queues */
 	if (qopt->num_tc > dev->num_tx_queues) {
 		NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues");
 		return -EINVAL;
 	}
 
-	/* Verify priority mapping uses valid tcs */
-	for (i = 0; i <= TC_BITMASK; i++) {
-		if (qopt->prio_tc_map[i] >= qopt->num_tc) {
-			NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping");
-			return -EINVAL;
-		}
-	}
-
-	for (i = 0; i < qopt->num_tc; i++) {
-		unsigned int last = qopt->offset[i] + qopt->count[i];
-
-		/* Verify the queue count is in tx range being equal to the
-		 * real_num_tx_queues indicates the last queue is in use.
-		 */
-		if (qopt->offset[i] >= dev->num_tx_queues ||
-		    !qopt->count[i] ||
-		    last > dev->real_num_tx_queues) {
-			NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping");
-			return -EINVAL;
-		}
-
-		if (TXTIME_ASSIST_IS_ENABLED(taprio_flags))
-			continue;
-
-		/* Verify that the offset and counts do not overlap */
-		for (j = i + 1; j < qopt->num_tc; j++) {
-			if (last > qopt->offset[j]) {
-				NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping");
-				return -EINVAL;
-			}
-		}
-	}
-
-	return 0;
+	/* For some reason, in txtime-assist mode, we allow TXQ ranges for
+	 * different TCs to overlap, and just validate the TXQ ranges.
+	 */
+	return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs,
+				    extack);
 }
 
 static int taprio_get_start_time(struct Qdisc *sch,
@@ -1019,11 +1221,14 @@ static int taprio_get_start_time(struct Qdisc *sch,
 	return 0;
 }
 
-static void setup_first_close_time(struct taprio_sched *q,
-				   struct sched_gate_list *sched, ktime_t base)
+static void setup_first_end_time(struct taprio_sched *q,
+				 struct sched_gate_list *sched, ktime_t base)
 {
+	struct net_device *dev = qdisc_dev(q->root);
+	int num_tc = netdev_get_num_tc(dev);
 	struct sched_entry *first;
 	ktime_t cycle;
+	int tc;
 
 	first = list_first_entry(&sched->entries,
 				 struct sched_entry, list);
@@ -1031,10 +1236,18 @@ static void setup_first_close_time(struct taprio_sched *q,
 	cycle = sched->cycle_time;
 
 	/* FIXME: find a better place to do this */
-	sched->cycle_close_time = ktime_add_ns(base, cycle);
+	sched->cycle_end_time = ktime_add_ns(base, cycle);
+
+	first->end_time = ktime_add_ns(base, first->interval);
+	taprio_set_budgets(q, sched, first);
+
+	for (tc = 0; tc < num_tc; tc++) {
+		if (first->gate_duration[tc] == sched->cycle_time)
+			first->gate_close_time[tc] = KTIME_MAX;
+		else
+			first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]);
+	}
 
-	first->close_time = ktime_add_ns(base, first->interval);
-	taprio_set_budget(q, first);
 	rcu_assign_pointer(q->current_entry, NULL);
 }
 
@@ -1088,6 +1301,8 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
 			       void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+	struct sched_gate_list *oper, *admin;
+	struct qdisc_size_table *stab;
 	struct taprio_sched *q;
 
 	ASSERT_RTNL();
@@ -1100,6 +1315,17 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event,
 			continue;
 
 		taprio_set_picos_per_byte(dev, q);
+
+		stab = rtnl_dereference(q->root->stab);
+
+		oper = rtnl_dereference(q->oper_sched);
+		if (oper)
+			taprio_update_queue_max_sdu(q, oper, stab);
+
+		admin = rtnl_dereference(q->admin_sched);
+		if (admin)
+			taprio_update_queue_max_sdu(q, admin, stab);
+
 		break;
 	}
 
@@ -1203,7 +1429,8 @@ static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask)
 
 static void taprio_sched_to_offload(struct net_device *dev,
 				    struct sched_gate_list *sched,
-				    struct tc_taprio_qopt_offload *offload)
+				    struct tc_taprio_qopt_offload *offload,
+				    const struct tc_taprio_caps *caps)
 {
 	struct sched_entry *entry;
 	int i = 0;
@@ -1217,7 +1444,11 @@ static void taprio_sched_to_offload(struct net_device *dev,
 
 		e->command = entry->command;
 		e->interval = entry->interval;
-		e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask);
+		if (caps->gate_mask_per_txq)
+			e->gate_mask = tc_map_to_queue_mask(dev,
+							    entry->gate_mask);
+		else
+			e->gate_mask = entry->gate_mask;
 
 		i++;
 	}
@@ -1225,6 +1456,34 @@ static void taprio_sched_to_offload(struct net_device *dev,
 	offload->num_entries = i;
 }
 
+static void taprio_detect_broken_mqprio(struct taprio_sched *q)
+{
+	struct net_device *dev = qdisc_dev(q->root);
+	struct tc_taprio_caps caps;
+
+	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO,
+				 &caps, sizeof(caps));
+
+	q->broken_mqprio = caps.broken_mqprio;
+	if (q->broken_mqprio)
+		static_branch_inc(&taprio_have_broken_mqprio);
+	else
+		static_branch_inc(&taprio_have_working_mqprio);
+
+	q->detected_mqprio = true;
+}
+
+static void taprio_cleanup_broken_mqprio(struct taprio_sched *q)
+{
+	if (!q->detected_mqprio)
+		return;
+
+	if (q->broken_mqprio)
+		static_branch_dec(&taprio_have_broken_mqprio);
+	else
+		static_branch_dec(&taprio_have_working_mqprio);
+}
+
 static int taprio_enable_offload(struct net_device *dev,
 				 struct taprio_sched *q,
 				 struct sched_gate_list *sched,
@@ -1261,7 +1520,8 @@ static int taprio_enable_offload(struct net_device *dev,
 		return -ENOMEM;
 	}
 	offload->enable = 1;
-	taprio_sched_to_offload(dev, sched, offload);
+	mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt);
+	taprio_sched_to_offload(dev, sched, offload, &caps);
 
 	for (tc = 0; tc < TC_MAX_QUEUE; tc++)
 		offload->max_sdu[tc] = q->max_sdu[tc];
@@ -1452,7 +1712,6 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
 				   struct netlink_ext_ack *extack)
 {
 	struct taprio_sched *q = qdisc_priv(sch);
-	struct net_device *dev = qdisc_dev(sch);
 	u32 max_sdu[TC_QOPT_MAX_QUEUE];
 	unsigned long seen_tcs = 0;
 	struct nlattr *n;
@@ -1466,18 +1725,14 @@ static int taprio_parse_tc_entries(struct Qdisc *sch,
 		if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY)
 			continue;
 
-		err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack);
+		err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs,
+					    extack);
 		if (err)
 			goto out;
 	}
 
-	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
 		q->max_sdu[tc] = max_sdu[tc];
-		if (max_sdu[tc])
-			q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len;
-		else
-			q->max_frm_len[tc] = U32_MAX; /* never oversized */
-	}
 
 out:
 	return err;
@@ -1533,6 +1788,7 @@ static int taprio_new_flags(const struct nlattr *attr, u32 old,
 static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 			 struct netlink_ext_ack *extack)
 {
+	struct qdisc_size_table *stab = rtnl_dereference(sch->stab);
 	struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { };
 	struct sched_gate_list *oper, *admin, *new_admin;
 	struct taprio_sched *q = qdisc_priv(sch);
@@ -1585,6 +1841,23 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 		goto free_sched;
 	}
 
+	if (mqprio) {
+		err = netdev_set_num_tc(dev, mqprio->num_tc);
+		if (err)
+			goto free_sched;
+		for (i = 0; i < mqprio->num_tc; i++) {
+			netdev_set_tc_queue(dev, i,
+					    mqprio->count[i],
+					    mqprio->offset[i]);
+			q->cur_txq[i] = mqprio->offset[i];
+		}
+
+		/* Always use supplied priority mappings */
+		for (i = 0; i <= TC_BITMASK; i++)
+			netdev_set_prio_tc_map(dev, i,
+					       mqprio->prio_tc_map[i]);
+	}
+
 	err = parse_taprio_schedule(q, tb, new_admin, extack);
 	if (err < 0)
 		goto free_sched;
@@ -1600,21 +1873,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 		goto free_sched;
 
 	taprio_set_picos_per_byte(dev, q);
-
-	if (mqprio) {
-		err = netdev_set_num_tc(dev, mqprio->num_tc);
-		if (err)
-			goto free_sched;
-		for (i = 0; i < mqprio->num_tc; i++)
-			netdev_set_tc_queue(dev, i,
-					    mqprio->count[i],
-					    mqprio->offset[i]);
-
-		/* Always use supplied priority mappings */
-		for (i = 0; i <= TC_BITMASK; i++)
-			netdev_set_prio_tc_map(dev, i,
-					       mqprio->prio_tc_map[i]);
-	}
+	taprio_update_queue_max_sdu(q, new_admin, stab);
 
 	if (FULL_OFFLOAD_IS_ENABLED(q->flags))
 		err = taprio_enable_offload(dev, q, new_admin, extack);
@@ -1663,7 +1922,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 		if (admin)
 			call_rcu(&admin->rcu, taprio_free_sched_cb);
 	} else {
-		setup_first_close_time(q, new_admin, start);
+		setup_first_end_time(q, new_admin, start);
 
 		/* Protects against advance_sched() */
 		spin_lock_irqsave(&q->current_entry_lock, flags);
@@ -1683,6 +1942,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt,
 	new_admin = NULL;
 	err = 0;
 
+	if (!stab)
+		NL_SET_ERR_MSG_MOD(extack,
+				   "Size table not specified, frame length estimations may be inaccurate");
+
 unlock:
 	spin_unlock_bh(qdisc_lock(sch));
 
@@ -1743,6 +2006,8 @@ static void taprio_destroy(struct Qdisc *sch)
 
 	if (admin)
 		call_rcu(&admin->rcu, taprio_free_sched_cb);
+
+	taprio_cleanup_broken_mqprio(q);
 }
 
 static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
@@ -1807,6 +2072,8 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt,
 		q->qdiscs[i] = qdisc;
 	}
 
+	taprio_detect_broken_mqprio(q);
+
 	return taprio_change(sch, opt, extack);
 }
 
@@ -1947,7 +2214,8 @@ error_nest:
 	return -1;
 }
 
-static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
+static int taprio_dump_tc_entries(struct sk_buff *skb,
+				  struct sched_gate_list *sched)
 {
 	struct nlattr *n;
 	int tc;
@@ -1961,7 +2229,7 @@ static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb)
 			goto nla_put_failure;
 
 		if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU,
-				q->max_sdu[tc]))
+				sched->max_sdu[tc]))
 			goto nla_put_failure;
 
 		nla_nest_end(skb, n);
@@ -1981,18 +2249,11 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	struct sched_gate_list *oper, *admin;
 	struct tc_mqprio_qopt opt = { 0 };
 	struct nlattr *nest, *sched_nest;
-	unsigned int i;
 
 	oper = rtnl_dereference(q->oper_sched);
 	admin = rtnl_dereference(q->admin_sched);
 
-	opt.num_tc = netdev_get_num_tc(dev);
-	memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
-
-	for (i = 0; i < netdev_get_num_tc(dev); i++) {
-		opt.count[i] = dev->tc_to_txq[i].count;
-		opt.offset[i] = dev->tc_to_txq[i].offset;
-	}
+	mqprio_qopt_reconstruct(dev, &opt);
 
 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
 	if (!nest)
@@ -2012,7 +2273,7 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay))
 		goto options_error;
 
-	if (taprio_dump_tc_entries(q, skb))
+	if (oper && taprio_dump_tc_entries(skb, oper))
 		goto options_error;
 
 	if (oper && dump_schedule(skb, oper))