Diffstat (limited to 'net')
65 files changed, 1043 insertions, 622 deletions
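One recurring theme in the TCP hunks below is the replacement of the old per-MSS rcvmem estimate with a per-socket scaling_ratio, measured in tcp_measure_rcv_mss() as (skb->len << TCP_RMEM_TO_WIN_SCALE) / skb->truesize and then used to convert between receive-buffer space and advertised window. The following is a standalone sketch of that arithmetic, not the kernel code: it assumes TCP_RMEM_TO_WIN_SCALE == 7 (the ratio expressed in 1/128ths), and the two helpers are modeled on the __tcp_win_from_space()/__tcp_space_from_win() names that appear in the tcp.c and mptcp hunks.

```c
/* Sketch of the scaling_ratio arithmetic used by the tcp_measure_rcv_mss()
 * and tcp_set_rcvlowat() hunks below. TCP_RMEM_TO_WIN_SCALE == 7 is an
 * assumption here; win_from_space()/space_from_win() mirror what
 * __tcp_win_from_space()/__tcp_space_from_win() are used for in the diff.
 */
#include <stdint.h>
#include <stdio.h>

#define TCP_RMEM_TO_WIN_SCALE 7

/* advertised window that a given amount of rcvbuf space can back */
static uint64_t win_from_space(uint8_t scaling_ratio, uint64_t space)
{
	return (space * scaling_ratio) >> TCP_RMEM_TO_WIN_SCALE;
}

/* inverse: rcvbuf space needed to back a desired window */
static uint64_t space_from_win(uint8_t scaling_ratio, uint64_t win)
{
	return (win << TCP_RMEM_TO_WIN_SCALE) / scaling_ratio;
}

int main(void)
{
	/* per-skb measurement, as in tcp_measure_rcv_mss(): e.g. a
	 * 1448-byte payload carried by an skb with truesize 2304 */
	uint64_t val = (1448ULL << TCP_RMEM_TO_WIN_SCALE) / 2304;
	uint8_t ratio = val ? val : 1;	/* 80/128, i.e. ~62% efficiency */

	printf("ratio=%u, 128KiB rcvbuf backs a %llu-byte window\n",
	       ratio, (unsigned long long)win_from_space(ratio, 128 * 1024));
	printf("a 64KiB window needs %llu bytes of rcvbuf\n",
	       (unsigned long long)space_from_win(ratio, 64 * 1024));
	return 0;
}
```

In the tcp_rcv_space_adjust() hunk this replaces the old loop that grew rcvmem in 128-byte steps until tcp_win_from_space() covered advmss; the MPTCP hunks take the minimum ratio across subflows so the aggregate conversion stays conservative.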
diff --git a/net/Kconfig b/net/Kconfig index 2fb25b534df5..d532ec33f1fe 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -52,6 +52,11 @@ config NET_INGRESS config NET_EGRESS bool +config NET_XGRESS + select NET_INGRESS + select NET_EGRESS + bool + config NET_REDIRECT bool diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 2321bd2f9964..7d47f53f20c1 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -555,12 +555,23 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } +void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +{ +} + __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } +__bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, + void *e, char f, int g) +{ + *b += 1; + return a + *b + c + d + (long)e + f + g; +} + int noinline bpf_fentry_shadow_test(int a) { return a + 1; @@ -596,6 +607,7 @@ __diag_pop(); BTF_SET8_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) +BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_SET8_END(bpf_test_modify_return_ids) @@ -663,7 +675,11 @@ int bpf_prog_test_run_tracing(struct bpf_prog *prog, case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) - side_effect = 1; + side_effect++; + b = 2; + ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); + if (b != 2) + side_effect++; break; default: goto out; diff --git a/net/bridge/br.c b/net/bridge/br.c index 4f5098d33a46..a6e94ceb7c9a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -234,6 +234,14 @@ static int br_switchdev_blocking_event(struct notifier_block *nb, br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; + case SWITCHDEV_BRPORT_REPLAY: + brport_info = ptr; + b = &brport_info->brport; + + err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, + b->blocking_nb, extack); + err = notifier_from_errno(err); + break; } out: diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 6116eba1bd89..9d7bc8b96b53 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -154,6 +154,7 @@ void br_forward(const struct net_bridge_port *to, backup_port = rcu_dereference(to->backup_port); if (unlikely(!backup_port)) goto out; + BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); to = backup_port; } diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 05c5863d2e20..10f0d33d8ccf 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -211,6 +211,7 @@ static inline size_t br_port_info_size(void) + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ + 0; } @@ -319,6 +320,10 @@ static int br_port_fill_attrs(struct sk_buff *skb, backup_p->dev->ifindex); rcu_read_unlock(); + if (p->backup_nhid && + nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) + return -EMSGSIZE; + return 0; } @@ -895,6 +900,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), + [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, }; /* Change the state of the port and notify spanning tree */ @@ -1065,6 +1071,12 @@ static int br_setport(struct net_bridge_port *p, struct nlattr 
*tb[], return err; } + if (tb[IFLA_BRPORT_BACKUP_NHID]) { + u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); + + WRITE_ONCE(p->backup_nhid, backup_nhid); + } + return 0; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index a63b32c1638e..51e4ca54b537 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -387,6 +387,7 @@ struct net_bridge_port { struct net_bridge_vlan_group __rcu *vlgrp; #endif struct net_bridge_port __rcu *backup_port; + u32 backup_nhid; /* STP */ u8 priority; @@ -605,6 +606,8 @@ struct br_input_skb_cb { */ unsigned long fwd_hwdoms; #endif + + u32 backup_nhid; }; #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb) @@ -2115,6 +2118,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, struct notifier_block *atomic_nb, struct notifier_block *blocking_nb); +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack); + bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb); void br_switchdev_frame_set_offload_fwd_mark(struct sk_buff *skb); @@ -2165,6 +2174,16 @@ br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, { } +static inline int +br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return -EOPNOTSUPP; +} + static inline bool br_switchdev_frame_uses_tx_fwd_offload(struct sk_buff *skb) { return false; diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index ba95c4d74a60..ee84e783e1df 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -727,6 +727,8 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev, err = br_switchdev_mdb_replay_one(nb, dev, SWITCHDEV_OBJ_PORT_MDB(obj), action, ctx, extack); + if (err == -EOPNOTSUPP) + err = 0; if (err) goto out_free_mdb; } @@ -759,8 +761,10 @@ static int nbp_switchdev_sync_objs(struct net_bridge_port *p, const void *ctx, err = br_switchdev_mdb_replay(br_dev, dev, ctx, true, blocking_nb, extack); - if (err && err != -EOPNOTSUPP) + if (err) { + /* -EOPNOTSUPP not propagated from MDB replay. 
*/ return err; + } err = br_switchdev_fdb_replay(br_dev, ctx, true, atomic_nb); if (err && err != -EOPNOTSUPP) @@ -825,3 +829,12 @@ void br_switchdev_port_unoffload(struct net_bridge_port *p, const void *ctx, nbp_switchdev_del(p); } + +int br_switchdev_port_replay(struct net_bridge_port *p, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + return nbp_switchdev_sync_objs(p, ctx, atomic_nb, blocking_nb, extack); +} diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6399a8a69d07..81833ca7a2c7 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -201,6 +201,21 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err; + if (BR_INPUT_SKB_CB(skb)->backup_nhid) { + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, + tunnel_id, 0); + if (!tunnel_dst) + return -ENOMEM; + + tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | + IP_TUNNEL_INFO_BRIDGE; + tunnel_dst->u.tun_info.key.nhid = + BR_INPUT_SKB_CB(skb)->backup_nhid; + skb_dst_set(skb, &tunnel_dst->dst); + + return 0; + } + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); diff --git a/net/core/dev.c b/net/core/dev.c index 69a3e544676c..e7ffcfa037f7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -107,6 +107,7 @@ #include <net/pkt_cls.h> #include <net/checksum.h> #include <net/xfrm.h> +#include <net/tcx.h> #include <linux/highmem.h> #include <linux/init.h> #include <linux/module.h> @@ -154,7 +155,6 @@ #include "dev.h" #include "net-sysfs.h" - static DEFINE_SPINLOCK(ptype_lock); struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; struct list_head ptype_all __read_mostly; /* Taps */ @@ -2384,8 +2384,7 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, struct xps_map *map = NULL; int pos; - if (dev_maps) - map = xmap_dereference(dev_maps->attr_map[tci]); + map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) return false; @@ -3882,69 +3881,198 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(dev_loopback_xmit); #ifdef CONFIG_NET_EGRESS -static struct sk_buff * -sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +static struct netdev_queue * +netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) +{ + int qm = skb_get_queue_mapping(skb); + + return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); +} + +static bool netdev_xmit_txqueue_skipped(void) +{ + return __this_cpu_read(softnet_data.xmit.skip_txqueue); +} + +void netdev_xmit_skip_txqueue(bool skip) +{ + __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); +} +EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); +#endif /* CONFIG_NET_EGRESS */ + +#ifdef CONFIG_NET_XGRESS +static int tc_run(struct tcx_entry *entry, struct sk_buff *skb) { + int ret = TC_ACT_UNSPEC; #ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress); - struct tcf_result cl_res; + struct mini_Qdisc *miniq = rcu_dereference_bh(entry->miniq); + struct tcf_result res; if (!miniq) - return skb; + return ret; - /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. 
*/ tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; - mini_qdisc_bstats_cpu_update(miniq, skb); - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { + mini_qdisc_bstats_cpu_update(miniq, skb); + ret = tcf_classify(skb, miniq->block, miniq->filter_list, &res, false); + /* Only tcf related quirks below. */ + switch (ret) { + case TC_ACT_SHOT: + mini_qdisc_qstats_cpu_drop(miniq); + break; case TC_ACT_OK: case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); + skb->tc_index = TC_H_MIN(res.classid); break; + } +#endif /* CONFIG_NET_CLS_ACT */ + return ret; +} + +static DEFINE_STATIC_KEY_FALSE(tcx_needed_key); + +void tcx_inc(void) +{ + static_branch_inc(&tcx_needed_key); +} + +void tcx_dec(void) +{ + static_branch_dec(&tcx_needed_key); +} + +static __always_inline enum tcx_action_base +tcx_run(const struct bpf_mprog_entry *entry, struct sk_buff *skb, + const bool needs_mac) +{ + const struct bpf_mprog_fp *fp; + const struct bpf_prog *prog; + int ret = TCX_NEXT; + + if (needs_mac) + __skb_push(skb, skb->mac_len); + bpf_mprog_foreach_prog(entry, fp, prog) { + bpf_compute_data_pointers(skb); + ret = bpf_prog_run(prog, skb); + if (ret != TCX_NEXT) + break; + } + if (needs_mac) + __skb_pull(skb, skb->mac_len); + return tcx_action_code(skb, ret); +} + +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(skb->dev->tcx_ingress); + int sch_ret; + + if (!entry) + return skb; + if (*pt_prev) { + *ret = deliver_skb(skb, *pt_prev, orig_dev); + *pt_prev = NULL; + } + + qdisc_skb_cb(skb)->pkt_len = skb->len; + tcx_set_ingress(skb, true); + + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, true); + if (sch_ret != TC_ACT_UNSPEC) + goto ingress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +ingress_verdict: + switch (sch_ret) { + case TC_ACT_REDIRECT: + /* skb_mac_header check was done by BPF, so we can safely + * push the L2 header back before redirecting to another + * netdev. + */ + __skb_push(skb, skb->mac_len); + if (skb_do_redirect(skb) == -EAGAIN) { + __skb_pull(skb, skb->mac_len); + *another = true; + break; + } + *ret = NET_RX_SUCCESS; + return NULL; case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - *ret = NET_XMIT_DROP; - kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; + /* used by tc_run */ case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: - *ret = NET_XMIT_SUCCESS; consume_skb(skb); + fallthrough; + case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; + } + + return skb; +} + +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) +{ + struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); + int sch_ret; + + if (!entry) + return skb; + + /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was + * already set by the caller. + */ + if (static_branch_unlikely(&tcx_needed_key)) { + sch_ret = tcx_run(entry, skb, false); + if (sch_ret != TC_ACT_UNSPEC) + goto egress_verdict; + } + sch_ret = tc_run(tcx_entry(entry), skb); +egress_verdict: + switch (sch_ret) { case TC_ACT_REDIRECT: /* No need to push/pop skb's mac_header here on egress! 
*/ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; return NULL; - default: - break; + case TC_ACT_SHOT: + kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS); + *ret = NET_XMIT_DROP; + return NULL; + /* used by tc_run */ + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + *ret = NET_XMIT_SUCCESS; + return NULL; } -#endif /* CONFIG_NET_CLS_ACT */ return skb; } - -static struct netdev_queue * -netdev_tx_queue_mapping(struct net_device *dev, struct sk_buff *skb) -{ - int qm = skb_get_queue_mapping(skb); - - return netdev_get_tx_queue(dev, netdev_cap_txqueue(dev, qm)); -} - -static bool netdev_xmit_txqueue_skipped(void) +#else +static __always_inline struct sk_buff * +sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, + struct net_device *orig_dev, bool *another) { - return __this_cpu_read(softnet_data.xmit.skip_txqueue); + return skb; } -void netdev_xmit_skip_txqueue(bool skip) +static __always_inline struct sk_buff * +sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { - __this_cpu_write(softnet_data.xmit.skip_txqueue, skip); + return skb; } -EXPORT_SYMBOL_GPL(netdev_xmit_skip_txqueue); -#endif /* CONFIG_NET_EGRESS */ +#endif /* CONFIG_NET_XGRESS */ #ifdef CONFIG_XPS static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, @@ -4128,9 +4256,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); qdisc_pkt_len_init(skb); -#ifdef CONFIG_NET_CLS_ACT - skb->tc_at_ingress = 0; -#endif + tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) { @@ -5064,72 +5190,6 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev, EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); #endif -static inline struct sk_buff * -sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, - struct net_device *orig_dev, bool *another) -{ -#ifdef CONFIG_NET_CLS_ACT - struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress); - struct tcf_result cl_res; - - /* If there's at least one ingress present somewhere (so - * we get here via enabled static key), remaining devices - * that are not configured with an ingress qdisc will bail - * out here. 
- */ - if (!miniq) - return skb; - - if (*pt_prev) { - *ret = deliver_skb(skb, *pt_prev, orig_dev); - *pt_prev = NULL; - } - - qdisc_skb_cb(skb)->pkt_len = skb->len; - tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; - skb->tc_at_ingress = 1; - mini_qdisc_bstats_cpu_update(miniq, skb); - - switch (tcf_classify(skb, miniq->block, miniq->filter_list, &cl_res, false)) { - case TC_ACT_OK: - case TC_ACT_RECLASSIFY: - skb->tc_index = TC_H_MIN(cl_res.classid); - break; - case TC_ACT_SHOT: - mini_qdisc_qstats_cpu_drop(miniq); - kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); - *ret = NET_RX_DROP; - return NULL; - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - case TC_ACT_TRAP: - consume_skb(skb); - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_REDIRECT: - /* skb_mac_header check was done by cls/act_bpf, so - * we can safely push the L2 header back before - * redirecting to another netdev - */ - __skb_push(skb, skb->mac_len); - if (skb_do_redirect(skb) == -EAGAIN) { - __skb_pull(skb, skb->mac_len); - *another = true; - break; - } - *ret = NET_RX_SUCCESS; - return NULL; - case TC_ACT_CONSUMED: - *ret = NET_RX_SUCCESS; - return NULL; - default: - break; - } -#endif /* CONFIG_NET_CLS_ACT */ - return skb; -} - /** * netdev_is_rx_handler_busy - check if receive handler is registered * @dev: device to check @@ -6316,12 +6376,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded) * softirq mode will happen in the next round of napi_schedule(). * This should not cause hiccups/stalls to the live traffic. */ - list_for_each_entry(napi, &dev->napi_list, dev_list) { - if (threaded) - set_bit(NAPI_STATE_THREADED, &napi->state); - else - clear_bit(NAPI_STATE_THREADED, &napi->state); - } + list_for_each_entry(napi, &dev->napi_list, dev_list) + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); return err; } @@ -10617,6 +10673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); dev->gso_max_size = GSO_LEGACY_MAX_SIZE; + dev->xdp_zc_max_segs = 1; dev->gso_max_segs = GSO_MAX_SEGS; dev->gro_max_size = GRO_LEGACY_MAX_SIZE; dev->gso_ipv4_max_size = GSO_LEGACY_MAX_SIZE; @@ -10838,7 +10895,7 @@ void unregister_netdevice_many_notify(struct list_head *head, /* Shutdown queueing discipline. */ dev_shutdown(dev); - + dev_tcx_uninstall(dev); dev_xdp_uninstall(dev); bpf_dev_bound_netdev_unregister(dev); diff --git a/net/core/filter.c b/net/core/filter.c index 06ba0e56e369..797e8f039696 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4345,13 +4345,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); enum bpf_map_type map_type = ri->map_type; - if (map_type == BPF_MAP_TYPE_XSKMAP) { - /* XDP_REDIRECT is not supported AF_XDP yet. */ - if (unlikely(xdp_buff_has_frags(xdp))) - return -EOPNOTSUPP; - + if (map_type == BPF_MAP_TYPE_XSKMAP) return __xdp_do_redirect_xsk(ri, dev, xdp, xdp_prog); - } return __xdp_do_redirect_frame(ri, dev, xdp_convert_buff_to_frame(xdp), xdp_prog); @@ -9312,7 +9307,7 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, __u8 value_reg = si->dst_reg; __u8 skb_reg = si->src_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, read skb->tstamp as is if tstamp_type_access is true. 
@@ -9346,7 +9341,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, __u8 value_reg = si->src_reg; __u8 skb_reg = si->dst_reg; -#ifdef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NET_XGRESS /* If the tstamp_type is read, * the bpf prog is aware the tstamp could have delivery time. * Thus, write skb->tstamp as is if tstamp_type_access is true. diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index a4270fafdf11..65ef4867fc49 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -25,6 +25,14 @@ netdev_nl_dev_fill(struct net_device *netdev, struct sk_buff *rsp, return -EINVAL; } + if (netdev->xdp_features & NETDEV_XDP_ACT_XSK_ZEROCOPY) { + if (nla_put_u32(rsp, NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + netdev->xdp_zc_max_segs)) { + genlmsg_cancel(rsp, hdr); + return -EINVAL; + } + } + genlmsg_end(rsp, hdr); return 0; diff --git a/net/core/page_pool.c b/net/core/page_pool.c index a3e12a61d456..7ca456bfab71 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -492,7 +492,7 @@ static s32 page_pool_inflight(struct page_pool *pool) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_release_page(struct page_pool *pool, struct page *page) +static void page_pool_return_page(struct page_pool *pool, struct page *page) { dma_addr_t dma; int count; @@ -518,13 +518,6 @@ skip_dma_unmap: */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); trace_page_pool_state_release(pool, page, count); -} -EXPORT_SYMBOL(page_pool_release_page); - -/* Return a page to the page allocator, cleaning up our state */ -static void page_pool_return_page(struct page_pool *pool, struct page *page) -{ - page_pool_release_page(pool, page); put_page(page); /* An optimization would be to call __free_pages(page, pool->p.order) @@ -616,9 +609,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. 
*/ recycle_stat_inc(pool, released_refcnt); - /* Do not replace this with page_pool_return_page() */ - page_pool_release_page(pool, page); - put_page(page); + page_pool_return_page(pool, page); return NULL; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 3ad4e030846d..1777a5e1830b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -61,7 +61,7 @@ #include "dev.h" #define RTNL_MAX_TYPE 50 -#define RTNL_SLAVE_MAX_TYPE 43 +#define RTNL_SLAVE_MAX_TYPE 44 struct rtnl_link { rtnl_doit_func doit; @@ -1273,7 +1273,6 @@ static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, - struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; @@ -1343,7 +1342,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) - goto nla_put_vfinfo_failure; + return -EMSGSIZE; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || @@ -1414,8 +1413,6 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_vf_failure: nla_nest_cancel(skb, vf); -nla_put_vfinfo_failure: - nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } @@ -1441,8 +1438,10 @@ static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { - if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) + if (rtnl_fill_vfinfo(skb, dev, i, ext_filter_mask)) { + nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; + } } nla_nest_end(skb, vfinfo); diff --git a/net/core/sock.c b/net/core/sock.c index 9370fd50aa2c..ab1e8d1bd5a1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1244,17 +1244,11 @@ set_sndbuf: break; case SO_PASSCRED: - if (valbool) - set_bit(SOCK_PASSCRED, &sock->flags); - else - clear_bit(SOCK_PASSCRED, &sock->flags); + assign_bit(SOCK_PASSCRED, &sock->flags, valbool); break; case SO_PASSPIDFD: - if (valbool) - set_bit(SOCK_PASSPIDFD, &sock->flags); - else - clear_bit(SOCK_PASSPIDFD, &sock->flags); + assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool); break; case SO_TIMESTAMP_OLD: @@ -1358,10 +1352,7 @@ set_sndbuf: break; case SO_PASSSEC: - if (valbool) - set_bit(SOCK_PASSSEC, &sock->flags); - else - clear_bit(SOCK_PASSSEC, &sock->flags); + assign_bit(SOCK_PASSSEC, &sock->flags, valbool); break; case SO_MARK: if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index fa8079303cb0..8e919cfe6e23 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -474,7 +474,8 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, .flowi4_oif = inet_iif(skb), .daddr = iph->saddr, .saddr = iph->daddr, - .flowi4_tos = RT_CONN_FLAGS(sk), + .flowi4_tos = ip_sock_rt_tos(sk), + .flowi4_scope = ip_sock_rt_scope(sk), .flowi4_proto = sk->sk_protocol, .fl4_sport = dccp_hdr(skb)->dccph_dport, .fl4_dport = dccp_hdr(skb)->dccph_sport, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 7249ef218178..e03b5331df6d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1056,6 +1056,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), + .ipv6_pinfo_offset = offsetof(struct dccp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, 
.rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h index 7e4c2a3b322b..c5d14c48def1 100644 --- a/net/dccp/ipv6.h +++ b/net/dccp/ipv6.h @@ -13,10 +13,6 @@ struct dccp6_sock { struct dccp_sock dccp; - /* - * ipv6_pinfo has to be the last member of dccp6_sock, - * see inet6_sk_generic. - */ struct ipv6_pinfo inet6; }; diff --git a/net/devlink/leftover.c b/net/devlink/leftover.c index 1f00f874471f..5128b9c7eea8 100644 --- a/net/devlink/leftover.c +++ b/net/devlink/leftover.c @@ -3946,7 +3946,7 @@ static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->get || devlink->reload_failed) + if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } @@ -3955,7 +3955,7 @@ static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { - if (!param->set || devlink->reload_failed) + if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } diff --git a/net/dsa/port.c b/net/dsa/port.c index 0ce8fd311c78..c63cbfbe6489 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1568,27 +1568,6 @@ static void dsa_port_phylink_validate(struct phylink_config *config, phylink_generic_validate(config, supported, state); } -static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, - struct phylink_link_state *state) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - int err; - - /* Only called for inband modes */ - if (!ds->ops->phylink_mac_link_state) { - state->link = 0; - return; - } - - err = ds->ops->phylink_mac_link_state(ds, dp->index, state); - if (err < 0) { - dev_err(ds->dev, "p%d: phylink_mac_link_state() failed: %d\n", - dp->index, err); - state->link = 0; - } -} - static struct phylink_pcs * dsa_port_phylink_mac_select_pcs(struct phylink_config *config, phy_interface_t interface) @@ -1646,17 +1625,6 @@ static int dsa_port_phylink_mac_finish(struct phylink_config *config, return err; } -static void dsa_port_phylink_mac_an_restart(struct phylink_config *config) -{ - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_mac_an_restart) - return; - - ds->ops->phylink_mac_an_restart(ds, dp->index); -} - static void dsa_port_phylink_mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface) @@ -1700,11 +1668,9 @@ static void dsa_port_phylink_mac_link_up(struct phylink_config *config, static const struct phylink_mac_ops dsa_port_phylink_mac_ops = { .validate = dsa_port_phylink_validate, .mac_select_pcs = dsa_port_phylink_mac_select_pcs, - .mac_pcs_get_state = dsa_port_phylink_mac_pcs_get_state, .mac_prepare = dsa_port_phylink_mac_prepare, .mac_config = dsa_port_phylink_mac_config, .mac_finish = dsa_port_phylink_mac_finish, - .mac_an_restart = dsa_port_phylink_mac_an_restart, .mac_link_down = dsa_port_phylink_mac_link_down, .mac_link_up = dsa_port_phylink_mac_link_up, }; @@ -1720,13 +1686,6 @@ int dsa_port_phylink_create(struct dsa_port *dp) if (err) mode = PHY_INTERFACE_MODE_NA; - /* Presence of phylink_mac_link_state or phylink_mac_an_restart is - * an indicator of a legacy phylink driver. 
- */ - if (ds->ops->phylink_mac_link_state || - ds->ops->phylink_mac_an_restart) - dp->pl_config.legacy_pre_march2020 = true; - if (ds->ops->phylink_get_caps) ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 527b1d576460..48db91b33390 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -21,6 +21,7 @@ #include <linux/if_hsr.h> #include <net/dcbnl.h> #include <linux/netpoll.h> +#include <linux/string.h> #include "dsa.h" #include "port.h" @@ -1056,10 +1057,10 @@ static void dsa_slave_get_strings(struct net_device *dev, if (stringset == ETH_SS_STATS) { int len = ETH_GSTRING_LEN; - strncpy(data, "tx_packets", len); - strncpy(data + len, "tx_bytes", len); - strncpy(data + 2 * len, "rx_packets", len); - strncpy(data + 3 * len, "rx_bytes", len); + strscpy_pad(data, "tx_packets", len); + strscpy_pad(data + len, "tx_bytes", len); + strscpy_pad(data + 2 * len, "rx_packets", len); + strscpy_pad(data + 3 * len, "rx_bytes", len); if (ds->ops->get_strings) ds->ops->get_strings(ds, dp->index, stringset, data + 4 * len); diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 4406d796cc2f..39dcccf0f174 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -51,8 +51,6 @@ static bool is_unsupported(u32 member_offset) return false; } -extern struct btf *btf_vmlinux; - static bool bpf_tcp_ca_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index f95142e56da0..93f14d39fef6 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -1152,41 +1152,64 @@ static bool ipv4_good_nh(const struct fib_nh *nh) return !!(state & NUD_VALID); } -static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) +static bool nexthop_is_good_nh(const struct nexthop *nh) +{ + struct nh_info *nhi = rcu_dereference(nh->nh_info); + + switch (nhi->family) { + case AF_INET: + return ipv4_good_nh(&nhi->fib_nh); + case AF_INET6: + return ipv6_good_nh(&nhi->fib6_nh); + } + + return false; +} + +static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { - struct nexthop *rc = NULL; int i; - for (i = 0; i < nhg->num_nh; ++i) { + for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; - struct nh_info *nhi; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; - nhi = rcu_dereference(nhge->nh->nh_info); - if (nhi->fdb_nh) - return nhge->nh; + return nhge->nh; + } + + WARN_ON_ONCE(1); + return NULL; +} + +static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) +{ + struct nexthop *rc = NULL; + int i; + + if (nhg->fdb_nh) + return nexthop_select_path_fdb(nhg, hash); + + for (i = 0; i < nhg->num_nh; ++i) { + struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ - switch (nhi->family) { - case AF_INET: - if (ipv4_good_nh(&nhi->fib_nh)) - return nhge->nh; - break; - case AF_INET6: - if (ipv6_good_nh(&nhi->fib6_nh)) - return nhge->nh; - break; - } + if (!nexthop_is_good_nh(nhge->nh)) + continue; if (!rc) rc = nhge->nh; + + if (hash > atomic_read(&nhge->hthr.upper_bound)) + continue; + + return nhge->nh; } - return rc; + return rc ? 
: nhg->nh_entries[0].nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8ed52e1e3c99..aca5620cf3ba 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); @@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { - int cap; + int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; @@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - val <<= 1; - if (val > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, val); - tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + space = tcp_space_from_win(sk, val); + if (space > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, space); + tcp_sk(sk)->window_clamp = val; } return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c1..670c3dab24f2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { + /* Note: divides are still a bit expensive. + * For the moment, only adjust scaling_ratio + * when we update icsk_ack.rcv_mss. + */ + if (unlikely(len != icsk->icsk_ack.rcv_mss)) { + u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, skb->truesize); + tcp_sk(sk)->scaling_ratio = val ? val : 1; + } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ @@ -287,7 +297,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks) icsk->icsk_ack.quick = quickacks; } -void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) +static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -295,7 +305,6 @@ void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks) inet_csk_exit_pingpong_mode(sk); icsk->icsk_ack.ato = TCP_ATO_MIN; } -EXPORT_SYMBOL(tcp_enter_quickack_mode); /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. @@ -727,8 +736,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. 
@@ -740,12 +749,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(grow, tp->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < tp->advmss) - rcvmem += 128; - - do_div(rcvwin, tp->advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); @@ -4308,10 +4312,16 @@ static inline bool tcp_paws_discard(const struct sock *sk, * (borrowed from freebsd) */ -static inline bool tcp_sequence(const struct tcp_sock *tp, u32 seq, u32 end_seq) +static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, + u32 seq, u32 end_seq) { - return !before(end_seq, tp->rcv_wup) && - !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); + if (before(end_seq, tp->rcv_wup)) + return SKB_DROP_REASON_TCP_OLD_SEQUENCE; + + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + return SKB_NOT_DROPPED_YET; } /* When we get a reset we do this. */ @@ -5734,7 +5744,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } /* Step 1: check sequence number */ - if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." * And page 69: "If an incoming segment is not acceptable, @@ -5751,7 +5762,6 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, } else if (tcp_reset_check(sk, skb)) { goto reset; } - SKB_DR_SET(reason, TCP_INVALID_SEQUENCE); goto discard; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 069642014636..09cffbc82d32 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -57,6 +57,7 @@ #include <linux/init.h> #include <linux/times.h> #include <linux/slab.h> +#include <linux/sched.h> #include <net/net_namespace.h> #include <net/icmp.h> @@ -2448,6 +2449,8 @@ static void *established_get_first(struct seq_file *seq) struct hlist_nulls_node *node; spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); + cond_resched(); + /* Lockless fast path for the common case of empty buckets */ if (empty_bucket(hinfo, st)) continue; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 42a96b3547c9..8c3ebd95f5b9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1553,7 +1553,7 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk); + INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); busylock_release(busy); return 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 94cec2075eee..94d1fdb0393f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -202,6 +202,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_rtr_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -262,6 +263,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .ra_defrtr_metric = IP6_RT_PRIO_USER, .accept_ra_from_local = 0, .accept_ra_min_hop_limit= 1, + .accept_ra_min_rtr_lft = 0, .accept_ra_pinfo = 1, #ifdef CONFIG_IPV6_ROUTER_PREF .accept_ra_rtr_pref = 1, @@ -5602,6 +5604,7 @@ 
static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_IOAM6_ID_WIDE] = cnf->ioam6_id_wide; array[DEVCONF_NDISC_EVICT_NOCARRIER] = cnf->ndisc_evict_nocarrier; array[DEVCONF_ACCEPT_UNTRACKED_NA] = cnf->accept_untracked_na; + array[DEVCONF_ACCEPT_RA_MIN_RTR_LFT] = cnf->accept_ra_min_rtr_lft; } static inline size_t inet6_ifla6_size(void) @@ -6796,6 +6799,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "accept_ra_min_rtr_lft", + .data = &ipv6_devconf.accept_ra_min_rtr_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "accept_ra_pinfo", .data = &ipv6_devconf.accept_ra_pinfo, .maxlen = sizeof(int), diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 5d593ddc0347..9f9c4b838664 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -102,9 +102,9 @@ bool ipv6_mod_enabled(void) } EXPORT_SYMBOL_GPL(ipv6_mod_enabled); -static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +static struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) { - const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + const int offset = sk->sk_prot->ipv6_pinfo_offset; return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 9b6818453afe..d80d6024cafa 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -38,10 +38,11 @@ static bool ipv6_mapped_addr_any(const struct in6_addr *a) return ipv6_addr_v4mapped(a) && (a->s6_addr32[3] == 0); } -static void ip6_datagram_flow_key_init(struct flowi6 *fl6, struct sock *sk) +static void ip6_datagram_flow_key_init(struct flowi6 *fl6, + const struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); int oif = sk->sk_bound_dev_if; memset(fl6, 0, sizeof(*fl6)); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 202fc3aaa83c..f4bfccae003c 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -612,8 +612,6 @@ looped_back: kfree(buf); - skb_dst_drop(skb); - ip6_route_input(skb); if (skb_dst(skb)->error) { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 65fa5014bc85..6d88f5248c1f 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1034,11 +1034,9 @@ drop_no_count: return 0; } -void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, - u8 type, +void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, - const struct in6_addr *daddr, - int oif) + const struct in6_addr *daddr, int oif) { memset(fl6, 0, sizeof(*fl6)); fl6->saddr = *saddr; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 714cdc9e2b8e..5ce25bcb9974 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1699,11 +1699,9 @@ mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) return scount; } -static void ip6_mc_hdr(struct sock *sk, struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - int proto, int len) +static void ip6_mc_hdr(const struct sock *sk, struct sk_buff *skb, + struct net_device *dev, const struct in6_addr *saddr, + const struct in6_addr *daddr, int proto, int len) { struct ipv6hdr *hdr; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 18634ebd20a4..29ddad1c1a2f 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1280,6 +1280,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct 
sk_buff *skb) if (!ndisc_parse_options(skb->dev, opt, optlen, &ndopts)) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + if (!ipv6_accept_ra(in6_dev)) { ND_PRINTK(2, info, "RA: %s, did not accept ra for dev: %s\n", @@ -1287,6 +1289,13 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_linkparms; } + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_rtr_lft) { + ND_PRINTK(2, info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto skip_linkparms; + } + #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { @@ -1339,8 +1348,6 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) goto skip_defrtr; } - lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); - #ifdef CONFIG_IPV6_ROUTER_PREF pref = ra_msg->icmph.icmp6_router_pref; /* 10b is handled as if it were 00b (medium) */ @@ -1492,6 +1499,13 @@ skip_linkparms: goto out; } + if (lifetime != 0 && lifetime < in6_dev->cnf.accept_ra_min_rtr_lft) { + ND_PRINTK(2, info, + "RA: router lifetime (%ds) is too short: %s\n", + lifetime, skb->dev->name); + goto out; + } + #ifdef CONFIG_IPV6_ROUTE_INFO if (!in6_dev->cnf.accept_ra_from_local && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index f804c11e2146..2a0e8bc07398 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -215,6 +215,7 @@ struct proto pingv6_prot = { .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), }; EXPORT_SYMBOL_GPL(pingv6_prot); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index ac1cef094c5f..0fcf1b890807 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1216,6 +1216,7 @@ struct proto rawv6_prot = { .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), + .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index b1c028df686e..a013b92cbb86 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -272,8 +272,6 @@ static int rpl_input(struct sk_buff *skb) dst = dst_cache_get(&rlwt->cache); preempt_enable(); - skb_dst_drop(skb); - if (!dst) { ip6_route_input(skb); dst = skb_dst(skb); @@ -284,6 +282,7 @@ static int rpl_input(struct sk_buff *skb) preempt_enable(); } } else { + skb_dst_drop(skb); skb_dst_set(skb, dst); } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 4714eb695913..1b4529e833a1 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2175,6 +2175,7 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b7c972aa09a7..95c75d8f73d5 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1798,6 +1798,7 @@ struct proto udpv6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct 
udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = NULL, .diag_destroy = udp_abort, }; diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index 8e010d07917a..267d491e9707 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -67,6 +67,7 @@ struct proto udplitev6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), + .ipv6_pinfo_offset = offsetof(struct udp6_sock, inet6), .h.udp_table = &udplite_table, }; diff --git a/net/key/af_key.c b/net/key/af_key.c index ede3c6a60353..542439b6a59c 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1281,7 +1281,6 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } - memset(&natt->encap_oa, 0, sizeof(natt->encap_oa)); } err = xfrm_init_state(x); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index b1623f9c4f92..2eee95a00c05 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -36,9 +36,6 @@ struct l2tp_ip6_sock { u32 conn_id; u32 peer_conn_id; - /* ipv6_pinfo has to be the last member of l2tp_ip6_sock, see - * inet6_sk_generic - */ struct ipv6_pinfo inet6; }; @@ -730,6 +727,7 @@ static struct proto l2tp_ip6_prot = { .hash = l2tp_ip6_hash, .unhash = l2tp_ip6_unhash, .obj_size = sizeof(struct l2tp_ip6_sock), + .ipv6_pinfo_offset = offsetof(struct l2tp_ip6_sock, inet6), }; static const struct proto_ops l2tp_ip6_ops = { diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 3317d1cca156..65ee949a8a44 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -90,6 +90,7 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) if (err) return err; + msk->scaling_ratio = tcp_sk(ssock->sk)->scaling_ratio; WRITE_ONCE(msk->first, ssock->sk); WRITE_ONCE(msk->subflow, ssock); subflow = mptcp_subflow_ctx(ssock->sk); @@ -1881,6 +1882,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) { struct mptcp_subflow_context *subflow; struct sock *sk = (struct sock *)msk; + u8 scaling_ratio = U8_MAX; u32 time, advmss = 1; u64 rtt_us, mstamp; @@ -1911,9 +1913,11 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) rtt_us = max(sf_rtt_us, rtt_us); advmss = max(sf_advmss, advmss); + scaling_ratio = min(tp->scaling_ratio, scaling_ratio); } msk->rcvq_space.rtt_us = rtt_us; + msk->scaling_ratio = scaling_ratio; if (time < (rtt_us >> 3) || rtt_us == 0) return; @@ -1922,8 +1926,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; @@ -1932,18 +1936,13 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) do_div(grow, msk->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < advmss) - rcvmem += 128; - - do_div(rcvwin, advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, __tcp_space_from_win(scaling_ratio, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { u32 window_clamp; - window_clamp = tcp_win_from_space(sk, rcvbuf); + window_clamp = __tcp_win_from_space(scaling_ratio, rcvbuf); WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); /* Make subflows 
follow along. If we do not do this, we @@ -3987,6 +3986,7 @@ int __init mptcp_proto_v6_init(void) strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); + mptcp_v6_prot.ipv6_pinfo_offset = offsetof(struct mptcp6_sock, np); err = proto_register(&mptcp_v6_prot, 1); if (err) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 37fbe22e2433..795f422e8597 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -321,6 +321,7 @@ struct mptcp_sock { u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; + u8 scaling_ratio; u32 subflow_id; u32 setsockopt_seq; @@ -351,9 +352,14 @@ static inline int __mptcp_rmem(const struct sock *sk) return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); } +static inline int mptcp_win_from_space(const struct sock *sk, int space) +{ + return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); +} + static inline int __mptcp_space(const struct sock *sk) { - return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); } static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 9ee3b7abbaf6..ad7080fabb2f 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1359,7 +1359,7 @@ void mptcp_space(const struct sock *ssk, int *space, int *full_space) const struct sock *sk = subflow->conn; *space = __mptcp_space(sk); - *full_space = tcp_full_space(sk); + *full_space = mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } void __mptcp_error_report(struct sock *sk) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 992393102d5f..9f6f2e643575 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1756,7 +1756,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, cnet = nf_ct_pernet(net); if (cnet->expect_count) { spin_lock_bh(&nf_conntrack_expect_lock); - exp = nf_ct_find_expectation(net, zone, tuple); + exp = nf_ct_find_expectation(net, zone, tuple, !tmpl || nf_ct_is_confirmed(tmpl)); if (exp) { /* Welcome, Mr. Bond. We've been expecting you... 
*/ __set_bit(IPS_EXPECTED_BIT, &ct->status); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 96948e98ec53..81ca348915c9 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, - const struct nf_conntrack_tuple *tuple) + const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; @@ -211,7 +211,7 @@ nf_ct_find_expectation(struct net *net, !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; - if (exp->flags & NF_CT_EXPECT_PERMANENT) { + if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 38958e067aa8..e87fd4314c68 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -262,6 +262,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr, regs->verdict.code = NF_DROP; return; } + __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); @@ -368,6 +369,7 @@ static bool nft_ct_tmpl_alloc_pcpu(void) return false; } + __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 383631873748..96c605e45235 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -677,6 +677,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sock, unsigned long *groups); int err = 0; sock->state = SS_UNCONNECTED; @@ -704,6 +705,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; + release = nl_table[protocol].release; netlink_unlock_table(); if (err < 0) @@ -719,6 +721,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol, nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; + nlk->netlink_release = release; out: return err; @@ -763,6 +766,8 @@ static int netlink_release(struct socket *sock) * OK. Socket is unlinked, any packets that arrive now * will be purged. 
*/ + if (nlk->netlink_release) + nlk->netlink_release(sk, nlk->groups); /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock @@ -1432,6 +1437,8 @@ struct netlink_broadcast_data { int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; + int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); + void *tx_data; }; static void do_one_broadcast(struct sock *sk, @@ -1485,6 +1492,13 @@ static void do_one_broadcast(struct sock *sk, p->delivery_failure = 1; goto out; } + + if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { + kfree_skb(p->skb2); + p->skb2 = NULL; + goto out; + } + if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; @@ -1507,8 +1521,12 @@ out: sock_put(sk); } -int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, - u32 group, gfp_t allocation) +int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, + u32 portid, + u32 group, gfp_t allocation, + int (*filter)(struct sock *dsk, + struct sk_buff *skb, void *data), + void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; @@ -1527,6 +1545,8 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, info.allocation = allocation; info.skb = skb; info.skb2 = NULL; + info.tx_filter = filter; + info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ @@ -1552,6 +1572,14 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, } return -ESRCH; } +EXPORT_SYMBOL(netlink_broadcast_filtered); + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, + u32 group, gfp_t allocation) +{ + return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, + NULL, NULL); +} EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { @@ -1629,10 +1657,7 @@ static void netlink_update_socket_mc(struct netlink_sock *nlk, old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; - if (new) - __set_bit(group - 1, nlk->groups); - else - __clear_bit(group - 1, nlk->groups); + __assign_bit(group - 1, nlk->groups, new); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } @@ -2069,6 +2094,7 @@ __netlink_kernel_create(struct net *net, int unit, struct module *module, if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; + nl_table[unit].release = cfg->release; nl_table[unit].flags = cfg->flags; } nl_table[unit].registered = 1; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 90a3198a9b7f..fd424cd63f31 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -42,6 +42,8 @@ struct netlink_sock { void (*netlink_rcv)(struct sk_buff *skb); int (*netlink_bind)(struct net *net, int group); void (*netlink_unbind)(struct net *net, int group); + void (*netlink_release)(struct sock *sk, + unsigned long *groups); struct module *module; struct rhash_head node; @@ -64,6 +66,8 @@ struct netlink_table { struct module *module; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); + void (*release)(struct sock *sk, + unsigned long *groups); int registered; }; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index a157247a1e45..6bd2ce51271f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -593,8 +593,12 @@ static int genl_validate_ops(const struct genl_family *family) return -EINVAL; /* Check sort 
order */ - if (a->cmd < b->cmd) + if (a->cmd < b->cmd) { continue; + } else if (a->cmd > b->cmd) { + WARN_ON(1); + return -EINVAL; + } if (a->internal_flags != b->internal_flags || ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 331730fd3580..fa955e892210 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -455,45 +455,6 @@ static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, return 0; } -static struct nf_conntrack_expect * -ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, - u16 proto, const struct sk_buff *skb) -{ - struct nf_conntrack_tuple tuple; - struct nf_conntrack_expect *exp; - - if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) - return NULL; - - exp = __nf_ct_expect_find(net, zone, &tuple); - if (exp) { - struct nf_conntrack_tuple_hash *h; - - /* Delete existing conntrack entry, if it clashes with the - * expectation. This can happen since conntrack ALGs do not - * check for clashes between (new) expectations and existing - * conntrack entries. nf_conntrack_in() will check the - * expectations only if a conntrack entry can not be found, - * which can lead to OVS finding the expectation (here) in the - * init direction, but which will not be removed by the - * nf_conntrack_in() call, if a matching conntrack entry is - * found instead. In this case all init direction packets - * would be reported as new related packets, while reply - * direction packets would be reported as un-related - * established packets. - */ - h = nf_conntrack_find_get(net, zone, &tuple); - if (h) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - - nf_ct_delete(ct, 0, 0); - nf_ct_put(ct); - } - } - - return exp; -} - /* This replicates logic from nf_conntrack_core.c that is not exported. */ static enum ip_conntrack_info ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) @@ -852,36 +813,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb) { - struct nf_conntrack_expect *exp; - - /* If we pass an expected packet through nf_conntrack_in() the - * expectation is typically removed, but the packet could still be - * lost in upcall processing. To prevent this from happening we - * perform an explicit expectation lookup. Expected connections are - * always new, and will be passed through conntrack only when they are - * committed, as it is OK to remove the expectation at that time. - */ - exp = ovs_ct_expect_find(net, &info->zone, info->family, skb); - if (exp) { - u8 state; - - /* NOTE: New connections are NATted and Helped only when - * committed, so we are not calling into NAT here. 
- */ - state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED; - __ovs_ct_update_key(key, state, &info->zone, exp->master); - } else { - struct nf_conn *ct; - int err; + struct nf_conn *ct; + int err; - err = __ovs_ct_lookup(net, key, info, skb); - if (err) - return err; + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; - ct = (struct nf_conn *)skb_nfct(skb); - if (ct) - nf_ct_deliver_cached_events(ct); - } + ct = (struct nf_conn *)skb_nfct(skb); + if (ct) + nf_ct_deliver_cached_events(ct); return 0; } @@ -1460,7 +1401,8 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, if (err) goto err_free_ct; - __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + if (ct_info.commit) + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); return 0; err_free_ct: __ovs_ct_free_action(&ct_info); diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 78beb74146e7..41ece61eb57a 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -23,6 +23,8 @@ #define QRTR_EPH_PORT_RANGE \ XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) +#define QRTR_PORT_CTRL_LEGACY 0xffff + /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 * @version: protocol version @@ -495,6 +497,9 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) goto err; } + if (cb->dst_port == QRTR_PORT_CTRL_LEGACY) + cb->dst_port = QRTR_PORT_CTRL; + if (!size || len != ALIGN(size, 4) + hdrlen) goto err; diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 0f7a729f1a1f..b1db0b519179 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -16,7 +16,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/qrtr.h> -static RADIX_TREE(nodes, GFP_KERNEL); +static DEFINE_XARRAY(nodes); static struct { struct socket *sock; @@ -66,14 +66,14 @@ struct qrtr_server { struct qrtr_node { unsigned int id; - struct radix_tree_root servers; + struct xarray servers; }; static struct qrtr_node *node_get(unsigned int node_id) { struct qrtr_node *node; - node = radix_tree_lookup(&nodes, node_id); + node = xa_load(&nodes, node_id); if (node) return node; @@ -83,8 +83,9 @@ static struct qrtr_node *node_get(unsigned int node_id) return NULL; node->id = node_id; + xa_init(&node->servers); - if (radix_tree_insert(&nodes, node_id, node)) { + if (xa_store(&nodes, node_id, node, GFP_KERNEL)) { kfree(node); return NULL; } @@ -193,40 +194,23 @@ static void lookup_notify(struct sockaddr_qrtr *to, struct qrtr_server *srv, static int announce_servers(struct sockaddr_qrtr *sq) { - struct radix_tree_iter iter; struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; int ret; node = node_get(qrtr_ns.local_node); if (!node) return 0; - rcu_read_lock(); /* Announce the list of servers registered in this node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&node->servers, index, srv) { ret = service_announce_new(sq, srv); if (ret < 0) { pr_err("failed to announce new service\n"); return ret; } - - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -256,14 +240,17 @@ static struct qrtr_server *server_add(unsigned int service, goto err; /* Delete the old server on the same port */ - old = radix_tree_lookup(&node->servers, port); + old = xa_store(&node->servers, port, srv, GFP_KERNEL); if (old) { - 
radix_tree_delete(&node->servers, port); - kfree(old); + if (xa_is_err(old)) { + pr_err("failed to add server [0x%x:0x%x] ret:%d\n", + srv->service, srv->instance, xa_err(old)); + goto err; + } else { + kfree(old); + } } - radix_tree_insert(&node->servers, port, srv); - trace_qrtr_ns_server_add(srv->service, srv->instance, srv->node, srv->port); @@ -280,11 +267,11 @@ static int server_del(struct qrtr_node *node, unsigned int port, bool bcast) struct qrtr_server *srv; struct list_head *li; - srv = radix_tree_lookup(&node->servers, port); + srv = xa_load(&node->servers, port); if (!srv) return -ENOENT; - radix_tree_delete(&node->servers, port); + xa_erase(&node->servers, port); /* Broadcast the removal of local servers */ if (srv->node == qrtr_ns.local_node && bcast) @@ -344,13 +331,12 @@ static int ctrl_cmd_hello(struct sockaddr_qrtr *sq) static int ctrl_cmd_bye(struct sockaddr_qrtr *from) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_ctrl_pkt pkt; struct qrtr_server *srv; struct sockaddr_qrtr sq; struct msghdr msg = { }; struct qrtr_node *node; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -361,22 +347,9 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) if (!node) return 0; - rcu_read_lock(); /* Advertise removal of this client to all servers of remote node */ - radix_tree_for_each_slot(slot, &node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); + xa_for_each(&node->servers, index, srv) server_del(node, srv->port, true); - rcu_read_lock(); - } - rcu_read_unlock(); /* Advertise the removal of this client to all local servers */ local_node = node_get(qrtr_ns.local_node); @@ -387,18 +360,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pkt.cmd = cpu_to_le32(QRTR_TYPE_BYE); pkt.client.node = cpu_to_le32(from->sq_node); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -411,11 +373,7 @@ static int ctrl_cmd_bye(struct sockaddr_qrtr *from) pr_err("failed to send bye cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -423,7 +381,6 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, unsigned int node_id, unsigned int port) { struct qrtr_node *local_node; - struct radix_tree_iter iter; struct qrtr_lookup *lookup; struct qrtr_ctrl_pkt pkt; struct msghdr msg = { }; @@ -432,7 +389,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, struct qrtr_node *node; struct list_head *tmp; struct list_head *li; - void __rcu **slot; + unsigned long index; struct kvec iv; int ret; @@ -477,18 +434,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pkt.client.node = cpu_to_le32(node_id); pkt.client.port = cpu_to_le32(port); - rcu_read_lock(); - radix_tree_for_each_slot(slot, &local_node->servers, &iter, 0) { - srv = radix_tree_deref_slot(slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - 
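The name-service conversion replaces every radix_tree_for_each_slot() walk, with its deref_retry/iter_resume boilerplate and manual rcu_read_lock() juggling, by xa_for_each(), and replaces lookup/insert/delete with xa_load()/xa_store()/xa_erase(). The store-and-displace idiom server_add() now relies on, as a standalone sketch (map and value types hypothetical):

#include <linux/slab.h>
#include <linux/xarray.h>

static DEFINE_XARRAY(sample_map);	/* plays the role of node->servers */

static int sample_store(unsigned long port, void *srv)
{
	void *old = xa_store(&sample_map, port, srv, GFP_KERNEL);

	if (xa_is_err(old))	/* e.g. allocation failure */
		return xa_err(old);
	kfree(old);		/* displaced entry, if any; kfree(NULL) is a no-op */
	return 0;
}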
rcu_read_unlock(); - + xa_for_each(&local_node->servers, index, srv) { sq.sq_family = AF_QIPCRTR; sq.sq_node = srv->node; sq.sq_port = srv->port; @@ -501,11 +447,7 @@ static int ctrl_cmd_del_client(struct sockaddr_qrtr *from, pr_err("failed to send del client cmd\n"); return ret; } - rcu_read_lock(); } - - rcu_read_unlock(); - return 0; } @@ -576,13 +518,12 @@ static int ctrl_cmd_del_server(struct sockaddr_qrtr *from, static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, unsigned int service, unsigned int instance) { - struct radix_tree_iter node_iter; struct qrtr_server_filter filter; - struct radix_tree_iter srv_iter; struct qrtr_lookup *lookup; + struct qrtr_server *srv; struct qrtr_node *node; - void __rcu **node_slot; - void __rcu **srv_slot; + unsigned long node_idx; + unsigned long srv_idx; /* Accept only local observers */ if (from->sq_node != qrtr_ns.local_node) @@ -601,40 +542,14 @@ static int ctrl_cmd_new_lookup(struct sockaddr_qrtr *from, filter.service = service; filter.instance = instance; - rcu_read_lock(); - radix_tree_for_each_slot(node_slot, &nodes, &node_iter, 0) { - node = radix_tree_deref_slot(node_slot); - if (!node) - continue; - if (radix_tree_deref_retry(node)) { - node_slot = radix_tree_iter_retry(&node_iter); - continue; - } - node_slot = radix_tree_iter_resume(node_slot, &node_iter); - - radix_tree_for_each_slot(srv_slot, &node->servers, - &srv_iter, 0) { - struct qrtr_server *srv; - - srv = radix_tree_deref_slot(srv_slot); - if (!srv) - continue; - if (radix_tree_deref_retry(srv)) { - srv_slot = radix_tree_iter_retry(&srv_iter); - continue; - } - + xa_for_each(&nodes, node_idx, node) { + xa_for_each(&node->servers, srv_idx, srv) { if (!server_match(srv, &filter)) continue; - srv_slot = radix_tree_iter_resume(srv_slot, &srv_iter); - - rcu_read_unlock(); lookup_notify(from, srv, true); - rcu_read_lock(); } } - rcu_read_unlock(); /* Empty notification, to indicate end of listing */ lookup_notify(from, NULL, true); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4b95cb1ac435..470c70deffe2 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -347,8 +347,7 @@ config NET_SCH_FQ_PIE config NET_SCH_INGRESS tristate "Ingress/classifier-action Qdisc" depends on NET_CLS_ACT - select NET_INGRESS - select NET_EGRESS + select NET_XGRESS help Say Y here if you want to use classifiers for incoming and/or outgoing packets. This qdisc doesn't do anything else besides running classifiers, @@ -679,6 +678,7 @@ config NET_EMATCH_IPT config NET_CLS_ACT bool "Actions" select NET_CLS + select NET_XGRESS help Say Y here if you want to use traffic control actions. 
Actions get attached to classifiers and are invoked after a successful diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index abc71a06d634..7c652d14528b 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1238,7 +1238,8 @@ static int tcf_ct_fill_params(struct net *net, } } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); + if (p->ct_action & TCA_CT_ACT_COMMIT) + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; err: nf_ct_put(p->tmpl); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 325c29041c7d..333800a7d4eb 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1810,10 +1810,6 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, NL_SET_ERR_MSG(extack, "HTB offload doesn't support the mpu parameter"); goto failure; } - if (hopt->quantum) { - NL_SET_ERR_MSG(extack, "HTB offload doesn't support the quantum parameter"); - goto failure; - } } /* Keeping backward compatible with rate_table based iproute2 tc */ @@ -1910,6 +1906,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -1931,6 +1928,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); @@ -2017,6 +2015,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, .rate = max_t(u64, hopt->rate.rate, rate64), .ceil = max_t(u64, hopt->ceil.rate, ceil64), .prio = hopt->prio, + .quantum = hopt->quantum, .extack = extack, }; err = htb_offload(dev, &offload_opt); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index e43a45499372..a463a63192c3 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -13,6 +13,7 @@ #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> +#include <net/tcx.h> struct ingress_sched_data { struct tcf_block *block; @@ -78,6 +79,8 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, { struct ingress_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_INGRESS) @@ -85,7 +88,13 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); - mini_qdisc_pair_init(&q->miniqp, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->block_info.chain_head_change = clsact_chain_head_change; @@ -103,11 +112,22 @@ static int ingress_init(struct Qdisc *sch, struct nlattr *opt, static void ingress_destroy(struct Qdisc *sch) { struct ingress_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry = rtnl_dereference(dev->tcx_ingress); if (sch->parent != TC_H_INGRESS) return; tcf_block_put_ext(q->block, sch, &q->block_info); + + if (entry) { + tcx_miniq_set_active(entry, false); + if (!tcx_entry_is_active(entry)) { + tcx_entry_update(dev, NULL, true); + tcx_entry_free(entry); + } + } + net_dec_ingress_queue(); } @@ -223,6 +243,8 @@ static int 
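HTB offload used to reject a user-supplied quantum outright; the hunks above now forward hopt->quantum to the driver in the node create/modify commands instead. A driver-side sketch of consuming it (handler and programming helper hypothetical; the field name follows this series):

static int sample_htb_node_create(struct tc_htb_qopt_offload *opt)
{
	/* 0 means the user left quantum unset; a driver may then derive
	 * it from the rate, as the software datapath does. */
	u32 quantum = opt->quantum;

	return sample_program_class(opt->classid, opt->rate, quantum);
}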
clsact_init(struct Qdisc *sch, struct nlattr *opt, { struct clsact_sched_data *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *entry; + bool created; int err; if (sch->parent != TC_H_CLSACT) @@ -231,7 +253,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, net_inc_ingress_queue(); net_inc_egress_queue(); - mini_qdisc_pair_init(&q->miniqp_ingress, sch, &dev->miniq_ingress); + entry = tcx_entry_fetch_or_create(dev, true, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_ingress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, true); q->ingress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS; q->ingress_block_info.chain_head_change = clsact_chain_head_change; @@ -244,7 +272,13 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, mini_qdisc_pair_block_init(&q->miniqp_ingress, q->ingress_block); - mini_qdisc_pair_init(&q->miniqp_egress, sch, &dev->miniq_egress); + entry = tcx_entry_fetch_or_create(dev, false, &created); + if (!entry) + return -ENOMEM; + tcx_miniq_set_active(entry, true); + mini_qdisc_pair_init(&q->miniqp_egress, sch, &tcx_entry(entry)->miniq); + if (created) + tcx_entry_update(dev, entry, false); q->egress_block_info.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS; q->egress_block_info.chain_head_change = clsact_chain_head_change; @@ -256,12 +290,31 @@ static int clsact_init(struct Qdisc *sch, struct nlattr *opt, static void clsact_destroy(struct Qdisc *sch) { struct clsact_sched_data *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct bpf_mprog_entry *ingress_entry = rtnl_dereference(dev->tcx_ingress); + struct bpf_mprog_entry *egress_entry = rtnl_dereference(dev->tcx_egress); if (sch->parent != TC_H_CLSACT) return; - tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); tcf_block_put_ext(q->ingress_block, sch, &q->ingress_block_info); + tcf_block_put_ext(q->egress_block, sch, &q->egress_block_info); + + if (ingress_entry) { + tcx_miniq_set_active(ingress_entry, false); + if (!tcx_entry_is_active(ingress_entry)) { + tcx_entry_update(dev, NULL, true); + tcx_entry_free(ingress_entry); + } + } + + if (egress_entry) { + tcx_miniq_set_active(egress_entry, false); + if (!tcx_entry_is_active(egress_entry)) { + tcx_entry_update(dev, NULL, false); + tcx_entry_free(egress_entry); + } + } net_dec_ingress_queue(); net_dec_egress_queue(); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 274d07bd774f..33c0895e101c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -435,7 +435,8 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = RT_CONN_FLAGS_TOS(asoc->base.sk, tos); + fl4->flowi4_tos = RT_TOS(tos); + fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9388d98aebc0..6e3d28aa587c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9732,6 +9732,7 @@ struct proto sctpv6_prot = { .unhash = sctp_unhash, .no_autobind = true, .obj_size = sizeof(struct sctp6_sock), + .ipv6_pinfo_offset = offsetof(struct sctp6_sock, inet6), .useroffset = offsetof(struct sctp6_sock, sctp.subscribe), .usersize = offsetof(struct sctp6_sock, sctp.initmsg) - offsetof(struct sctp6_sock, 
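RT_CONN_FLAGS_TOS() folded the SOCK_LOCALROUTE hint into the tos bits via RTO_ONLINK; SCTP now passes a plain RT_TOS() and carries the hint in the flow's dedicated scope field instead. ip_sock_rt_scope() amounts to the following (per include/net/route.h at the time of this series):

static inline int ip_sock_rt_scope(const struct sock *sk)
{
	if (sock_flag(sk, SOCK_LOCALROUTE))
		return RT_SCOPE_LINK;

	return RT_SCOPE_UNIVERSE;
}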
sctp.subscribe) + diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 8cc42aea19c7..5b045284849e 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -862,3 +862,28 @@ void switchdev_bridge_port_unoffload(struct net_device *brport_dev, NULL); } EXPORT_SYMBOL_GPL(switchdev_bridge_port_unoffload); + +int switchdev_bridge_port_replay(struct net_device *brport_dev, + struct net_device *dev, const void *ctx, + struct notifier_block *atomic_nb, + struct notifier_block *blocking_nb, + struct netlink_ext_ack *extack) +{ + struct switchdev_notifier_brport_info brport_info = { + .brport = { + .dev = dev, + .ctx = ctx, + .atomic_nb = atomic_nb, + .blocking_nb = blocking_nb, + }, + }; + int err; + + ASSERT_RTNL(); + + err = call_switchdev_blocking_notifiers(SWITCHDEV_BRPORT_REPLAY, + brport_dev, &brport_info.info, + extack); + return notifier_to_errno(err); +} +EXPORT_SYMBOL_GPL(switchdev_bridge_port_replay); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index b769fc258931..352d042b130b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -348,37 +348,34 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, size_t len) { struct virtio_vsock_sock *vvs = vsk->trans; - size_t bytes, total = 0, off; - struct sk_buff *skb, *tmp; - int err = -EFAULT; + struct sk_buff *skb; + size_t total = 0; + int err; spin_lock_bh(&vvs->rx_lock); - skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { - off = 0; + skb_queue_walk(&vvs->rx_queue, skb) { + size_t bytes; - if (total == len) - break; + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; - while (total < len && off < skb->len) { - bytes = len - total; - if (bytes > skb->len - off) - bytes = skb->len - off; + spin_unlock_bh(&vvs->rx_lock); - /* sk_lock is held by caller so no one else can dequeue. - * Unlock rx_lock since memcpy_to_msg() may sleep. - */ - spin_unlock_bh(&vvs->rx_lock); + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + goto out; - err = memcpy_to_msg(msg, skb->data + off, bytes); - if (err) - goto out; + total += bytes; - spin_lock_bh(&vvs->rx_lock); + spin_lock_bh(&vvs->rx_lock); - total += bytes; - off += bytes; - } + if (total == len) + break; } spin_unlock_bh(&vvs->rx_lock); @@ -463,6 +460,63 @@ out: return err; } +static ssize_t +virtio_transport_seqpacket_do_peek(struct vsock_sock *vsk, + struct msghdr *msg) +{ + struct virtio_vsock_sock *vvs = vsk->trans; + struct sk_buff *skb; + size_t total, len; + + spin_lock_bh(&vvs->rx_lock); + + if (!vvs->msg_count) { + spin_unlock_bh(&vvs->rx_lock); + return 0; + } + + total = 0; + len = msg_data_left(msg); + + skb_queue_walk(&vvs->rx_queue, skb) { + struct virtio_vsock_hdr *hdr; + + if (total < len) { + size_t bytes; + int err; + + bytes = len - total; + if (bytes > skb->len) + bytes = skb->len; + + spin_unlock_bh(&vvs->rx_lock); + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. 
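switchdev_bridge_port_replay() lets a driver ask the bridge to replay its current configuration against freshly registered notifier blocks, which matters when offload starts only after the port has already joined the bridge (e.g. a port joining an offloaded LAG that is itself a bridge port). A hedged caller sketch, with driver-local names hypothetical:

err = switchdev_bridge_port_replay(brport_dev, priv->netdev, priv,
				   &priv->atomic_nb, &priv->blocking_nb,
				   extack);
if (err)
	goto err_unoffload;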
+ */ + err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + return err; + + spin_lock_bh(&vvs->rx_lock); + } + + total += skb->len; + hdr = virtio_vsock_hdr(skb); + + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; + + break; + } + } + + spin_unlock_bh(&vvs->rx_lock); + + return total; +} + static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, struct msghdr *msg, int flags) @@ -557,9 +611,9 @@ virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk, int flags) { if (flags & MSG_PEEK) - return -EOPNOTSUPP; - - return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); + return virtio_transport_seqpacket_do_peek(vsk, msg); + else + return virtio_transport_seqpacket_do_dequeue(vsk, msg, flags); } EXPORT_SYMBOL_GPL(virtio_transport_seqpacket_dequeue); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 31dca4ecb2c5..4f1e0599146e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -135,14 +135,14 @@ int xsk_reg_pool_at_qid(struct net_device *dev, struct xsk_buff_pool *pool, return 0; } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff_xsk *xskb, u32 len, + u32 flags) { - struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); u64 addr; int err; addr = xp_get_handle(xskb); - err = xskq_prod_reserve_desc(xs->rx, addr, len); + err = xskq_prod_reserve_desc(xs->rx, addr, len, flags); if (err) { xs->rx_queue_full++; return err; @@ -152,48 +152,138 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) return 0; } -static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) +static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *from_buf, *to_buf; - u32 metalen; + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u32 frags = xdp_buff_has_frags(xdp); + struct xdp_buff_xsk *pos, *tmp; + struct list_head *xskb_list; + u32 contd = 0; + int err; - if (unlikely(xdp_data_meta_unsupported(from))) { - from_buf = from->data; - to_buf = to->data; - metalen = 0; - } else { - from_buf = from->data_meta; - metalen = from->data - from->data_meta; - to_buf = to->data - metalen; + if (frags) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); + if (err || likely(!frags)) + goto out; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { + if (list_is_singular(xskb_list)) + contd = 0; + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) + return err; + list_del(&pos->xskb_list_node); } - memcpy(to_buf, from_buf, len + metalen); +out: + return err; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static void *xsk_copy_xdp_start(struct xdp_buff *from) { + if (unlikely(xdp_data_meta_unsupported(from))) + return from->data; + else + return from->data_meta; +} + +static u32 xsk_copy_xdp(void *to, void **from, u32 to_len, + u32 *from_len, skb_frag_t **frag, u32 rem) +{ + u32 copied = 0; + + while (1) { + u32 copy_len = min_t(u32, *from_len, to_len); + + memcpy(to, *from, copy_len); + copied += copy_len; + if (rem == copied) + return copied; + + if (*from_len == copy_len) { + *from = skb_frag_address(*frag); + *from_len = skb_frag_size((*frag)++); + } else { + *from += copy_len; + *from_len -= copy_len; + } + if (to_len == copy_len) + return copied; + + to_len -= copy_len; + to += 
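SEQPACKET vsock sockets previously returned -EOPNOTSUPP for MSG_PEEK; the new do_peek walks the rx queue without dequeuing anything, copies at most msg_data_left(msg) bytes, and sets MSG_EOR when the peeked record ends. Userspace sketch:

#include <sys/socket.h>

char buf[256];
ssize_t n;

n = recv(fd, buf, sizeof(buf), MSG_PEEK);	/* record stays queued */
if (n >= 0)
	n = recv(fd, buf, sizeof(buf), 0);	/* consume it for real */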
copy_len; + } +} + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +{ + u32 frame_size = xsk_pool_get_rx_frame_size(xs->pool); + void *copy_from = xsk_copy_xdp_start(xdp), *copy_to; + u32 from_len, meta_len, rem, num_desc; + struct xdp_buff_xsk *xskb; struct xdp_buff *xsk_xdp; - int err; - u32 len; + skb_frag_t *frag; - len = xdp->data_end - xdp->data; - if (len > xsk_pool_get_rx_frame_size(xs->pool)) { - xs->rx_dropped++; - return -ENOSPC; + from_len = xdp->data_end - copy_from; + meta_len = xdp->data - copy_from; + rem = len + meta_len; + + if (len <= frame_size && !xdp_buff_has_frags(xdp)) { + int err; + + xsk_xdp = xsk_buff_alloc(xs->pool); + if (!xsk_xdp) { + xs->rx_dropped++; + return -ENOMEM; + } + memcpy(xsk_xdp->data - meta_len, copy_from, rem); + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + err = __xsk_rcv_zc(xs, xskb, len, 0); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + + return 0; } - xsk_xdp = xsk_buff_alloc(xs->pool); - if (!xsk_xdp) { + num_desc = (len - 1) / frame_size + 1; + + if (!xsk_buff_can_alloc(xs->pool, num_desc)) { xs->rx_dropped++; return -ENOMEM; } + if (xskq_prod_nb_free(xs->rx, num_desc) < num_desc) { + xs->rx_queue_full++; + return -ENOBUFS; + } - xsk_copy_xdp(xsk_xdp, xdp, len); - err = __xsk_rcv_zc(xs, xsk_xdp, len); - if (err) { - xsk_buff_free(xsk_xdp); - return err; + if (xdp_buff_has_frags(xdp)) { + struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(xdp); + frag = &sinfo->frags[0]; } + + do { + u32 to_len = frame_size + meta_len; + u32 copied; + + xsk_xdp = xsk_buff_alloc(xs->pool); + copy_to = xsk_xdp->data - meta_len; + + copied = xsk_copy_xdp(copy_to, ©_from, to_len, &from_len, &frag, rem); + rem -= copied; + + xskb = container_of(xsk_xdp, struct xdp_buff_xsk, xdp); + __xsk_rcv_zc(xs, xskb, copied - meta_len, rem ? 
XDP_PKT_CONTD : 0); + meta_len = 0; + } while (rem); + return 0; } @@ -215,7 +305,7 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { if (!xsk_is_bound(xs)) return -ENXIO; @@ -223,6 +313,11 @@ static int xsk_rcv_check(struct xdp_sock *xs, struct xdp_buff *xdp) if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) return -EINVAL; + if (len > xsk_pool_get_rx_frame_size(xs->pool) && !xs->sg) { + xs->rx_dropped++; + return -ENOSPC; + } + sk_mark_napi_id_once_xdp(&xs->sk, xdp); return 0; } @@ -236,12 +331,13 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; spin_lock_bh(&xs->rx_lock); - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (!err) { - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); xsk_flush(xs); } spin_unlock_bh(&xs->rx_lock); @@ -250,19 +346,19 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp_get_buff_len(xdp); int err; - u32 len; - err = xsk_rcv_check(xs, xdp); + err = xsk_rcv_check(xs, xdp, len); if (err) return err; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) { len = xdp->data_end - xdp->data; - return __xsk_rcv_zc(xs, xdp, len); + return xsk_rcv_zc(xs, xdp, len); } - err = __xsk_rcv(xs, xdp); + err = __xsk_rcv(xs, xdp, len); if (!err) xdp_return_buff(xdp); return err; @@ -321,7 +417,8 @@ bool xsk_tx_peek_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) rcu_read_lock(); list_for_each_entry_rcu(xs, &pool->xsk_tx_list, tx_list) { if (!xskq_cons_peek_desc(xs->tx, desc, pool)) { - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) + xskq_cons_release(xs->tx); continue; } @@ -408,37 +505,91 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static void xsk_destruct_skb(struct sk_buff *skb) +static int xsk_cq_reserve_addr_locked(struct xdp_sock *xs, u64 addr) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&xs->pool->cq_lock, flags); + ret = xskq_prod_reserve_addr(xs->pool->cq, addr); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + + return ret; +} + +static void xsk_cq_submit_locked(struct xdp_sock *xs, u32 n) { - u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; - struct xdp_sock *xs = xdp_sk(skb->sk); unsigned long flags; spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_submit_addr(xs->pool->cq, addr); + xskq_prod_submit_n(xs->pool->cq, n); spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xdp_sock *xs, u32 n) +{ + unsigned long flags; + spin_lock_irqsave(&xs->pool->cq_lock, flags); + xskq_prod_cancel_n(xs->pool->cq, n); + spin_unlock_irqrestore(&xs->pool->cq_lock, flags); +} + +static u32 xsk_get_num_desc(struct sk_buff *skb) +{ + return skb ? 
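On the Rx ring each fragment of a multi-buffer frame now occupies its own descriptor; XDP_PKT_CONTD in desc->options means "more fragments of this frame follow" and is clear on the last one. A consumer-side sketch using the libxdp-style ring accessors (ring/umem handles and process_frag() hypothetical):

const struct xdp_desc *desc;
bool eop;

do {
	desc = xsk_ring_cons__rx_desc(&rx_ring, idx++);
	process_frag(xsk_umem__get_data(umem_area, desc->addr), desc->len);
	eop = !(desc->options & XDP_PKT_CONTD);
} while (!eop);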
(long)skb_shinfo(skb)->destructor_arg : 0; +} + +static void xsk_destruct_skb(struct sk_buff *skb) +{ + xsk_cq_submit_locked(xdp_sk(skb->sk), xsk_get_num_desc(skb)); sock_wfree(skb); } +static void xsk_set_destructor_arg(struct sk_buff *skb) +{ + long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; + + skb_shinfo(skb)->destructor_arg = (void *)num; +} + +static void xsk_consume_skb(struct sk_buff *skb) +{ + struct xdp_sock *xs = xdp_sk(skb->sk); + + skb->destructor = sock_wfree; + xsk_cq_cancel_locked(xs, xsk_get_num_desc(skb)); + /* Free skb without triggering the perf drop trace */ + consume_skb(skb); + xs->skb = NULL; +} + +static void xsk_drop_skb(struct sk_buff *skb) +{ + xdp_sk(skb->sk)->tx->invalid_descs += xsk_get_num_desc(skb); + xsk_consume_skb(skb); +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { struct xsk_buff_pool *pool = xs->pool; u32 hr, len, ts, offset, copy, copied; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; struct page *page; void *buffer; int err, i; u64 addr; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); - skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err); + if (unlikely(!skb)) + return ERR_PTR(err); - skb_reserve(skb, hr); + skb_reserve(skb, hr); + } addr = desc->addr; len = desc->len; @@ -448,7 +599,10 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, offset = offset_in_page(buffer); addr = buffer - pool->addrs; - for (copied = 0, i = 0; copied < len; i++) { + for (copied = 0, i = skb_shinfo(skb)->nr_frags; copied < len; i++) { + if (unlikely(i >= MAX_SKB_FRAGS)) + return ERR_PTR(-EFAULT); + page = pool->umem->pgs[addr >> PAGE_SHIFT]; get_page(page); @@ -473,43 +627,77 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { struct net_device *dev = xs->dev; - struct sk_buff *skb; + struct sk_buff *skb = xs->skb; + int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); - if (IS_ERR(skb)) - return skb; + if (IS_ERR(skb)) { + err = PTR_ERR(skb); + goto free_err; + } } else { u32 hr, tr, len; void *buffer; - int err; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); - tr = dev->needed_tailroom; + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); len = desc->len; - skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); - if (unlikely(!skb)) - return ERR_PTR(err); + if (!skb) { + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); + tr = dev->needed_tailroom; + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); + if (unlikely(!skb)) + goto free_err; - skb_reserve(skb, hr); - skb_put(skb, len); + skb_reserve(skb, hr); + skb_put(skb, len); - buffer = xsk_buff_raw_get_data(xs->pool, desc->addr); - err = skb_store_bits(skb, 0, buffer, len); - if (unlikely(err)) { - kfree_skb(skb); - return ERR_PTR(err); + err = skb_store_bits(skb, 0, buffer, len); + if (unlikely(err)) + goto free_err; + } else { + int nr_frags = skb_shinfo(skb)->nr_frags; + struct page *page; + u8 *vaddr; + + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { + err = -EFAULT; + goto free_err; + } + + page = alloc_page(xs->sk.sk_allocation); + if (unlikely(!page)) { + err = -EAGAIN; + goto free_err; + } + + vaddr = kmap_local_page(page); + memcpy(vaddr, buffer, len); + kunmap_local(vaddr); + + 
skb_add_rx_frag(skb, nr_frags, page, 0, len, 0); } } skb->dev = dev; skb->priority = xs->sk.sk_priority; skb->mark = xs->sk.sk_mark; - skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr; skb->destructor = xsk_destruct_skb; + xsk_set_destructor_arg(skb); return skb; + +free_err: + if (err == -EAGAIN) { + xsk_cq_cancel_locked(xs, 1); + } else { + xsk_set_destructor_arg(skb); + xsk_drop_skb(skb); + xskq_cons_release(xs->tx); + } + + return ERR_PTR(err); } static int __xsk_generic_xmit(struct sock *sk) @@ -519,7 +707,6 @@ static int __xsk_generic_xmit(struct sock *sk) bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; - unsigned long flags; int err = 0; mutex_lock(&xs->mutex); @@ -544,47 +731,51 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. This avoids having to implement * any buffering in the Tx path. */ - spin_lock_irqsave(&xs->pool->cq_lock, flags); - if (xskq_prod_reserve(xs->pool->cq)) { - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); + if (xsk_cq_reserve_addr_locked(xs, desc.addr)) goto out; - } - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); skb = xsk_build_skb(xs, &desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - goto out; + if (err == -EAGAIN) + goto out; + err = 0; + continue; + } + + xskq_cons_release(xs->tx); + + if (xp_mb_desc(&desc)) { + xs->skb = skb; + continue; } err = __dev_direct_xmit(skb, xs->queue_id); if (err == NETDEV_TX_BUSY) { /* Tell user-space to retry the send */ - skb->destructor = sock_wfree; - spin_lock_irqsave(&xs->pool->cq_lock, flags); - xskq_prod_cancel(xs->pool->cq); - spin_unlock_irqrestore(&xs->pool->cq_lock, flags); - /* Free skb without triggering the perf drop trace */ - consume_skb(skb); + xskq_cons_cancel_n(xs->tx, xsk_get_num_desc(skb)); + xsk_consume_skb(skb); err = -EAGAIN; goto out; } - xskq_cons_release(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP) { /* SKB completed but not sent */ err = -EBUSY; + xs->skb = NULL; goto out; } sent_frame = true; + xs->skb = NULL; } - xs->tx->queue_empty_descs++; + if (xskq_has_descs(xs->tx)) { + if (xs->skb) + xsk_drop_skb(xs->skb); + xskq_cons_release(xs->tx); + } out: if (sent_frame) @@ -834,6 +1025,9 @@ static int xsk_release(struct socket *sock) net = sock_net(sk); + if (xs->skb) + xsk_drop_skb(xs->skb); + mutex_lock(&net->xdp.lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->xdp.lock); @@ -897,7 +1091,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) flags = sxdp->sxdp_flags; if (flags & ~(XDP_SHARED_UMEM | XDP_COPY | XDP_ZEROCOPY | - XDP_USE_NEED_WAKEUP)) + XDP_USE_NEED_WAKEUP | XDP_USE_SG)) return -EINVAL; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); @@ -929,7 +1123,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct socket *sock; if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY) || - (flags & XDP_USE_NEED_WAKEUP)) { + (flags & XDP_USE_NEED_WAKEUP) || (flags & XDP_USE_SG)) { /* Cannot specify flags for shared sockets. 
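Multi-buffer operation is opt-in at bind time through the new XDP_USE_SG flag and, like the other flags, it cannot be set on a shared-umem socket. Userspace sketch (ifindex assumed resolved earlier):

#include <linux/if_xdp.h>
#include <stdio.h>
#include <sys/socket.h>

struct sockaddr_xdp sxdp = {
	.sxdp_family = AF_XDP,
	.sxdp_ifindex = ifindex,
	.sxdp_queue_id = 0,
	.sxdp_flags = XDP_COPY | XDP_USE_SG,
};

if (bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp)) < 0)
	perror("bind");		/* kernels without XDP_USE_SG reject with EINVAL */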
*/ err = -EINVAL; goto out_unlock; @@ -1028,6 +1222,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) xs->dev = dev; xs->zc = xs->umem->zc; + xs->sg = !!(flags & XDP_USE_SG); xs->queue_id = qid; xp_add_xsk(xs->pool, xs); diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 26f6d304451e..b3f7b310811e 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -86,6 +86,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, pool->umem = umem; pool->addrs = umem->addrs; INIT_LIST_HEAD(&pool->free_list); + INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); spin_lock_init(&pool->cq_lock); @@ -99,6 +100,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, xskb->pool = pool; xskb->xdp.frame_sz = umem->chunk_size - umem->headroom; INIT_LIST_HEAD(&xskb->free_list_node); + INIT_LIST_HEAD(&xskb->xskb_list_node); if (pool->unaligned) pool->free_heads[i] = xskb; else @@ -187,6 +189,11 @@ int xp_assign_dev(struct xsk_buff_pool *pool, goto err_unreg_pool; } + if (netdev->xdp_zc_max_segs == 1 && (flags & XDP_USE_SG)) { + err = -EOPNOTSUPP; + goto err_unreg_pool; + } + bpf.command = XDP_SETUP_XSK_POOL; bpf.xsk.pool = pool; bpf.xsk.queue_id = queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 6d40a77fccbe..13354a1e4280 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -48,6 +48,11 @@ struct xsk_queue { size_t ring_vmalloc_size; }; +struct parsed_desc { + u32 mb; + u32 valid; +}; + /* The structure of the shared state of the rings are a simple * circular buffer, as outlined in * Documentation/core-api/circular-buffers.rst. For the Rx and @@ -130,18 +135,26 @@ static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) return false; } +static inline bool xp_unused_options_set(u32 options) +{ + return options & ~XDP_PKT_CONTD; +} + static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { u64 offset = desc->addr & (pool->chunk_size - 1); + if (!desc->len) + return false; + if (offset + desc->len > pool->chunk_size) return false; if (desc->addr >= pool->addrs_cnt) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -151,6 +164,9 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, { u64 addr = xp_unaligned_add_offset_to_addr(desc->addr); + if (!desc->len) + return false; + if (desc->len > pool->chunk_size) return false; @@ -158,7 +174,7 @@ static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) return false; - if (desc->options) + if (xp_unused_options_set(desc->options)) return false; return true; } @@ -170,6 +186,11 @@ static inline bool xp_validate_desc(struct xsk_buff_pool *pool, xp_aligned_validate_desc(pool, desc); } +static inline bool xskq_has_descs(struct xsk_queue *q) +{ + return q->cached_cons != q->cached_prod; +} + static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, struct xdp_desc *d, struct xsk_buff_pool *pool) @@ -185,17 +206,15 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xsk_buff_pool *pool) { - while (q->cached_cons != q->cached_prod) { + if (q->cached_cons != q->cached_prod) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = q->cached_cons & q->ring_mask; *desc = ring->desc[idx]; - if (xskq_cons_is_valid_desc(q, 
desc, pool)) - return true; - - q->cached_cons++; + return xskq_cons_is_valid_desc(q, desc, pool); } + q->queue_empty_descs++; return false; } @@ -204,30 +223,52 @@ static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) q->cached_cons += cnt; } -static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, - u32 max) +static inline void parse_desc(struct xsk_queue *q, struct xsk_buff_pool *pool, + struct xdp_desc *desc, struct parsed_desc *parsed) +{ + parsed->valid = xskq_cons_is_valid_desc(q, desc, pool); + parsed->mb = xp_mb_desc(desc); +} + +static inline +u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, + u32 max) { u32 cached_cons = q->cached_cons, nb_entries = 0; struct xdp_desc *descs = pool->tx_descs; + u32 total_descs = 0, nr_frags = 0; + /* track first entry, if stumble upon *any* invalid descriptor, rewind + * current packet that consists of frags and stop the processing + */ while (cached_cons != q->cached_prod && nb_entries < max) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx = cached_cons & q->ring_mask; + struct parsed_desc parsed; descs[nb_entries] = ring->desc[idx]; - if (unlikely(!xskq_cons_is_valid_desc(q, &descs[nb_entries], pool))) { - /* Skip the entry */ - cached_cons++; - continue; + cached_cons++; + parse_desc(q, pool, &descs[nb_entries], &parsed); + if (unlikely(!parsed.valid)) + break; + + if (likely(!parsed.mb)) { + total_descs += (nr_frags + 1); + nr_frags = 0; + } else { + nr_frags++; + if (nr_frags == pool->netdev->xdp_zc_max_segs) { + nr_frags = 0; + break; + } } - nb_entries++; - cached_cons++; } + cached_cons -= nr_frags; /* Release valid plus any invalid entries */ xskq_cons_release_n(q, cached_cons - q->cached_cons); - return nb_entries; + return total_descs; } /* Functions for consumers */ @@ -292,6 +333,11 @@ static inline void xskq_cons_release(struct xsk_queue *q) q->cached_cons++; } +static inline void xskq_cons_cancel_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons -= cnt; +} + static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -319,9 +365,9 @@ static inline bool xskq_prod_is_full(struct xsk_queue *q) return xskq_prod_nb_free(q, 1) ? 
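On Tx the same bit flows the other way: userspace sets XDP_PKT_CONTD on every descriptor of a frame except the last, and the batch reader above releases only whole frames, rewinding nr_frags when a frame is cut short by an invalid descriptor or by the device's xdp_zc_max_segs limit. Producer sketch for a two-fragment frame (ring handle, index, and addresses hypothetical):

struct xdp_desc *d0 = xsk_ring_prod__tx_desc(&tx_ring, idx);
struct xdp_desc *d1 = xsk_ring_prod__tx_desc(&tx_ring, idx + 1);

d0->addr = frag0_addr;
d0->len = frag0_len;
d0->options = XDP_PKT_CONTD;	/* more fragments follow */

d1->addr = frag1_addr;
d1->len = frag1_len;
d1->options = 0;		/* end of packet */

xsk_ring_prod__submit(&tx_ring, 2);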
false : true; } -static inline void xskq_prod_cancel(struct xsk_queue *q) +static inline void xskq_prod_cancel_n(struct xsk_queue *q, u32 cnt) { - q->cached_prod--; + q->cached_prod -= cnt; } static inline int xskq_prod_reserve(struct xsk_queue *q) @@ -360,7 +406,7 @@ static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_de } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, - u64 addr, u32 len) + u64 addr, u32 len, u32 flags) { struct xdp_rxtx_ring *ring = (struct xdp_rxtx_ring *)q->ring; u32 idx; @@ -372,6 +418,7 @@ static inline int xskq_prod_reserve_desc(struct xsk_queue *q, idx = q->cached_prod++ & q->ring_mask; ring->desc[idx].addr = addr; ring->desc[idx].len = len; + ring->desc[idx].options = flags; return 0; } @@ -386,16 +433,6 @@ static inline void xskq_prod_submit(struct xsk_queue *q) __xskq_prod_submit(q, q->cached_prod); } -static inline void xskq_prod_submit_addr(struct xsk_queue *q, u64 addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 idx = q->ring->producer; - - ring->desc[idx++ & q->ring_mask] = addr; - - __xskq_prod_submit(q, idx); -} - static inline void xskq_prod_submit_n(struct xsk_queue *q, u32 nb_entries) { __xskq_prod_submit(q, q->ring->producer + nb_entries); diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 533697e2488f..3784534c9185 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -247,12 +247,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - /* We don't yet support UDP encapsulation and TFC padding. */ - if (x->encap || x->tfcpad) { - NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); - return -EINVAL; - } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) { NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); @@ -260,6 +254,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } is_packet_offload = xuo->flags & XFRM_OFFLOAD_PACKET; + + /* We don't yet support UDP encapsulation and TFC padding. */ + if ((!is_packet_offload && x->encap) || x->tfcpad) { + NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); + return -EINVAL; + } + dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { |
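Moving the encap/tfcpad check below the flag parsing lets packet-mode offload accept UDP-encapsulated (NAT-T) states, while crypto-mode offload still rejects them and TFC padding remains unsupported in either mode:

/* x->encap,  crypto offload -> -EINVAL ("can't be offloaded")
 * x->encap,  packet offload -> accepted
 * x->tfcpad, either mode    -> -EINVAL
 */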