author    | Ingo Molnar <mingo@kernel.org> | 2020-02-05 08:44:22 +0100
committer | Ingo Molnar <mingo@kernel.org> | 2020-02-05 08:44:22 +0100
commit    | fdff7c21ea00787e3f70a1a00b40b88eb998c6ad (patch)
tree      | 03016a8375e849e2c39dec8a15d660055bb16a8c /net/netfilter
parent    | f1ec3a517b4352e78dbef6b1e591f43202ecb3fe (diff)
parent    | b3a6082223369203d7e7db7e81253ac761377644 (diff)
Merge branch 'linus' into perf/urgent, to synchronize with upstream
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'net/netfilter')
-rw-r--r-- | net/netfilter/Makefile                |    3
-rw-r--r-- | net/netfilter/nf_conntrack_core.c     |    1
-rw-r--r-- | net/netfilter/nf_conntrack_extend.c   |    1
-rw-r--r-- | net/netfilter/nf_flow_table_core.c    |   31
-rw-r--r-- | net/netfilter/nf_flow_table_ip.c      |   21
-rw-r--r-- | net/netfilter/nf_flow_table_offload.c |  164
-rw-r--r-- | net/netfilter/nf_tables_api.c         |  260
-rw-r--r-- | net/netfilter/nf_tables_set_core.c    |    2
-rw-r--r-- | net/netfilter/nfnetlink_queue.c       |    8
-rw-r--r-- | net/netfilter/nft_bitwise.c           |  224
-rw-r--r-- | net/netfilter/nft_dynset.c            |    2
-rw-r--r-- | net/netfilter/nft_meta.c              |  440
-rw-r--r-- | net/netfilter/nft_set_bitmap.c        |    4
-rw-r--r-- | net/netfilter/nft_set_hash.c          |    2
-rw-r--r-- | net/netfilter/nft_set_pipapo.c        | 2102
-rw-r--r-- | net/netfilter/nft_set_rbtree.c        |    3
-rw-r--r-- | net/netfilter/nft_tunnel.c            |   52
-rw-r--r-- | net/netfilter/xt_hashlimit.c          |   22
18 files changed, 2973 insertions, 369 deletions
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 5e9b2eb24349..3f572e5a975e 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -81,7 +81,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
 		nft_chain_route.o nf_tables_offload.o
 
 nf_tables_set-objs := nf_tables_set_core.o \
-		nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
+		nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \
+		nft_set_pipapo.o
 
 obj-$(CONFIG_NF_TABLES)		+= nf_tables.o
 obj-$(CONFIG_NF_TABLES_SET)	+= nf_tables_set.o
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f475fec84536..f4c4b467c87e 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -2334,7 +2334,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
 
 	return nf_conntrack_hash_resize(hashsize);
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
 
 static __always_inline unsigned int total_extension_size(void)
 {
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index c24e5b64b00c..3dbe2329c3f1 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -37,7 +37,6 @@ void nf_ct_ext_destroy(struct nf_conn *ct)
 
 	kfree(ct->ext);
 }
-EXPORT_SYMBOL(nf_ct_ext_destroy);
 
 void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp)
 {
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index e33a73cb1f42..7e91989a1b55 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -61,9 +61,9 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
 
 	if (ct->status & IPS_SRC_NAT)
-		flow->flags |= FLOW_OFFLOAD_SNAT;
+		__set_bit(NF_FLOW_SNAT, &flow->flags);
 	if (ct->status & IPS_DST_NAT)
-		flow->flags |= FLOW_OFFLOAD_DNAT;
+		__set_bit(NF_FLOW_DNAT, &flow->flags);
 
 	return flow;
 
@@ -182,8 +182,6 @@ void flow_offload_free(struct flow_offload *flow)
 	default:
 		break;
 	}
-	if (flow->flags & FLOW_OFFLOAD_DYING)
-		nf_ct_delete(flow->ct, 0, 0);
 	nf_ct_put(flow->ct);
 	kfree_rcu(flow, rcu_head);
 }
@@ -245,8 +243,10 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
 		return err;
 	}
 
-	if (flow_table->flags & NF_FLOWTABLE_HW_OFFLOAD)
+	if (nf_flowtable_hw_offload(flow_table)) {
+		__set_bit(NF_FLOW_HW, &flow->flags);
 		nf_flow_offload_add(flow_table, flow);
+	}
 
 	return 0;
 }
@@ -271,7 +271,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 	if (nf_flow_has_expired(flow))
 		flow_offload_fixup_ct(flow->ct);
-	else if (flow->flags & FLOW_OFFLOAD_TEARDOWN)
+	else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
 		flow_offload_fixup_ct_timeout(flow->ct);
 
 	flow_offload_free(flow);
@@ -279,7 +279,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
 
 void flow_offload_teardown(struct flow_offload *flow)
 {
-	flow->flags |= FLOW_OFFLOAD_TEARDOWN;
+	set_bit(NF_FLOW_TEARDOWN, &flow->flags);
 
 	flow_offload_fixup_ct_state(flow->ct);
 }
@@ -300,7 +300,7 @@ flow_offload_lookup(struct nf_flowtable *flow_table,
 	dir = tuplehash->tuple.dir;
 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
-	if (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))
+	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
 		return NULL;
 
 	if (unlikely(nf_ct_is_dying(flow->ct)))
@@ -348,19 +348,18 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data)
 {
 	struct nf_flowtable *flow_table = data;
 
-	if (flow->flags & FLOW_OFFLOAD_HW)
-		nf_flow_offload_stats(flow_table, flow);
-
 	if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) ||
-	    (flow->flags & (FLOW_OFFLOAD_DYING | FLOW_OFFLOAD_TEARDOWN))) {
-		if (flow->flags & FLOW_OFFLOAD_HW) {
-			if (!(flow->flags & FLOW_OFFLOAD_HW_DYING))
+	    test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+		if (test_bit(NF_FLOW_HW, &flow->flags)) {
+			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
 				nf_flow_offload_del(flow_table, flow);
-			else if (flow->flags & FLOW_OFFLOAD_HW_DEAD)
+			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
 				flow_offload_del(flow_table, flow);
 		} else {
 			flow_offload_del(flow_table, flow);
 		}
+	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
+		nf_flow_offload_stats(flow_table, flow);
 	}
 }
@@ -524,7 +523,7 @@ static void nf_flow_table_do_cleanup(struct flow_offload *flow, void *data)
 	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
 	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
 	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
-		flow_offload_dead(flow);
+		flow_offload_teardown(flow);
 }
 
 static void nf_flow_table_iterate_cleanup(struct nf_flowtable *flowtable,
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 7ea2ddc2aa93..9e563fd3da0f 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -144,11 +144,11 @@ static int nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb,
 {
 	struct iphdr *iph = ip_hdr(skb);
 
-	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
 	    (nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
 	     nf_flow_snat_ip(flow, skb, iph, thoff, dir) < 0))
 		return -1;
-	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
 	    (nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir) < 0 ||
 	     nf_flow_dnat_ip(flow, skb, iph, thoff, dir) < 0))
 		return -1;
@@ -232,6 +232,13 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
 	return NF_STOLEN;
 }
 
+static bool nf_flow_offload_refresh(struct nf_flowtable *flow_table,
+				    struct flow_offload *flow)
+{
+	return nf_flowtable_hw_offload(flow_table) &&
+	       test_and_clear_bit(NF_FLOW_HW_REFRESH, &flow->flags);
+}
+
 unsigned int
 nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 			const struct nf_hook_state *state)
@@ -272,6 +279,9 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb,
 	if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff))
 		return NF_ACCEPT;
 
+	if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
+		nf_flow_offload_add(flow_table, flow);
+
 	if (nf_flow_offload_dst_check(&rt->dst)) {
 		flow_offload_teardown(flow);
 		return NF_ACCEPT;
@@ -414,11 +424,11 @@ static int nf_flow_nat_ipv6(const struct flow_offload *flow,
 	struct ipv6hdr *ip6h = ipv6_hdr(skb);
 	unsigned int thoff = sizeof(*ip6h);
 
-	if (flow->flags & FLOW_OFFLOAD_SNAT &&
+	if (test_bit(NF_FLOW_SNAT, &flow->flags) &&
 	    (nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
 	     nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
 		return -1;
-	if (flow->flags & FLOW_OFFLOAD_DNAT &&
+	if (test_bit(NF_FLOW_DNAT, &flow->flags) &&
 	    (nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir) < 0 ||
 	     nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir) < 0))
 		return -1;
@@ -498,6 +508,9 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
 				sizeof(*ip6h)))
 		return NF_ACCEPT;
 
+	if (unlikely(nf_flow_offload_refresh(flow_table, flow)))
+		nf_flow_offload_add(flow_table, flow);
+
 	if (nf_flow_offload_dst_check(&rt->dst)) {
 		flow_offload_teardown(flow);
 		return NF_ACCEPT;
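The flowtable changes above replace open-coded tests like `flow->flags & FLOW_OFFLOAD_TEARDOWN` with `test_bit()`/`set_bit()` on an `unsigned long` bitmap, so the packet path, the GC step and the offload workqueue can update flags concurrently without a lock; the non-atomic `__set_bit()` survives only in `flow_offload_alloc()`, where the flow is not yet visible to other CPUs. A minimal user-space sketch of the same pattern (GCC atomic builtins stand in for the kernel's bitops; the bit numbers are illustrative):

```c
/* Sketch of the NF_FLOW_* flag scheme adopted above. The real enum
 * lives in include/net/netfilter/nf_flow_table.h; these helpers only
 * mimic the kernel's atomic set_bit()/test_bit() semantics. */
#include <stdio.h>

enum { NF_FLOW_SNAT, NF_FLOW_DNAT, NF_FLOW_TEARDOWN, NF_FLOW_HW,
       NF_FLOW_HW_REFRESH, NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD };

struct flow { unsigned long flags; };

static void set_bit_atomic(int nr, unsigned long *addr)
{
	__atomic_fetch_or(addr, 1UL << nr, __ATOMIC_RELAXED); /* atomic RMW */
}

static int test_bit_atomic(int nr, const unsigned long *addr)
{
	return (__atomic_load_n(addr, __ATOMIC_RELAXED) >> nr) & 1;
}

int main(void)
{
	struct flow flow = { 0 };

	/* packet path and GC may race on these; each update stays atomic */
	set_bit_atomic(NF_FLOW_HW, &flow.flags);
	set_bit_atomic(NF_FLOW_TEARDOWN, &flow.flags);

	if (test_bit_atomic(NF_FLOW_TEARDOWN, &flow.flags))
		printf("teardown pending, flags=%#lx\n", flow.flags);
	return 0;
}
```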
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index d06969af1085..c8b70ffeef0c 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -24,6 +24,7 @@ struct flow_offload_work {
 };
 
 struct nf_flow_key {
+	struct flow_dissector_key_meta			meta;
 	struct flow_dissector_key_control		control;
 	struct flow_dissector_key_basic			basic;
 	union {
@@ -55,6 +56,7 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
 	struct nf_flow_key *mask = &match->mask;
 	struct nf_flow_key *key = &match->key;
 
+	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_META, meta);
 	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_CONTROL, control);
 	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_BASIC, basic);
 	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
@@ -62,6 +64,9 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
 	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_TCP, tcp);
 	NF_FLOW_DISSECTOR(match, FLOW_DISSECTOR_KEY_PORTS, tp);
 
+	key->meta.ingress_ifindex = tuple->iifidx;
+	mask->meta.ingress_ifindex = 0xffffffff;
+
 	switch (tuple->l3proto) {
 	case AF_INET:
 		key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
@@ -105,7 +110,8 @@ static int nf_flow_rule_match(struct nf_flow_match *match,
 	key->tp.dst = tuple->dst_port;
 	mask->tp.dst = 0xffff;
 
-	match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_CONTROL) |
+	match->dissector.used_keys |= BIT(FLOW_DISSECTOR_KEY_META) |
+				      BIT(FLOW_DISSECTOR_KEY_CONTROL) |
 				      BIT(FLOW_DISSECTOR_KEY_BASIC) |
 				      BIT(FLOW_DISSECTOR_KEY_PORTS);
 	return 0;
@@ -444,16 +450,16 @@ int nf_flow_rule_route_ipv4(struct net *net, const struct flow_offload *flow,
 	    flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
 		return -1;
 
-	if (flow->flags & FLOW_OFFLOAD_SNAT) {
+	if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
 		flow_offload_ipv4_snat(net, flow, dir, flow_rule);
 		flow_offload_port_snat(net, flow, dir, flow_rule);
 	}
-	if (flow->flags & FLOW_OFFLOAD_DNAT) {
+	if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
 		flow_offload_ipv4_dnat(net, flow, dir, flow_rule);
 		flow_offload_port_dnat(net, flow, dir, flow_rule);
 	}
-	if (flow->flags & FLOW_OFFLOAD_SNAT ||
-	    flow->flags & FLOW_OFFLOAD_DNAT)
+	if (test_bit(NF_FLOW_SNAT, &flow->flags) ||
+	    test_bit(NF_FLOW_DNAT, &flow->flags))
 		flow_offload_ipv4_checksum(net, flow, flow_rule);
 
 	flow_offload_redirect(flow, dir, flow_rule);
@@ -470,11 +476,11 @@ int nf_flow_rule_route_ipv6(struct net *net, const struct flow_offload *flow,
 	    flow_offload_eth_dst(net, flow, dir, flow_rule) < 0)
 		return -1;
 
-	if (flow->flags & FLOW_OFFLOAD_SNAT) {
+	if (test_bit(NF_FLOW_SNAT, &flow->flags)) {
 		flow_offload_ipv6_snat(net, flow, dir, flow_rule);
 		flow_offload_port_snat(net, flow, dir, flow_rule);
 	}
-	if (flow->flags & FLOW_OFFLOAD_DNAT) {
+	if (test_bit(NF_FLOW_DNAT, &flow->flags)) {
 		flow_offload_ipv6_dnat(net, flow, dir, flow_rule);
 		flow_offload_port_dnat(net, flow, dir, flow_rule);
 	}
@@ -586,23 +592,25 @@ static void nf_flow_offload_init(struct flow_cls_offload *cls_flow,
 	cls_flow->cookie = (unsigned long)tuple;
 }
 
-static int flow_offload_tuple_add(struct flow_offload_work *offload,
-				  struct nf_flow_rule *flow_rule,
-				  enum flow_offload_tuple_dir dir)
+static int nf_flow_offload_tuple(struct nf_flowtable *flowtable,
+				 struct flow_offload *flow,
+				 struct nf_flow_rule *flow_rule,
+				 enum flow_offload_tuple_dir dir,
+				 int priority, int cmd,
+				 struct list_head *block_cb_list)
 {
-	struct nf_flowtable *flowtable = offload->flowtable;
 	struct flow_cls_offload cls_flow = {};
 	struct flow_block_cb *block_cb;
 	struct netlink_ext_ack extack;
 	__be16 proto = ETH_P_ALL;
 	int err, i = 0;
 
-	nf_flow_offload_init(&cls_flow, proto, offload->priority,
-			     FLOW_CLS_REPLACE,
-			     &offload->flow->tuplehash[dir].tuple, &extack);
-	cls_flow.rule = flow_rule->rule;
+	nf_flow_offload_init(&cls_flow, proto, priority, cmd,
+			     &flow->tuplehash[dir].tuple, &extack);
+	if (cmd == FLOW_CLS_REPLACE)
+		cls_flow.rule = flow_rule->rule;
 
-	list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list) {
+	list_for_each_entry(block_cb, block_cb_list, list) {
 		err = block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow,
 				   block_cb->cb_priv);
 		if (err < 0)
@@ -614,23 +622,22 @@ static int flow_offload_tuple_add(struct flow_offload_work *offload,
 	return i;
 }
 
+static int flow_offload_tuple_add(struct flow_offload_work *offload,
+				  struct nf_flow_rule *flow_rule,
+				  enum flow_offload_tuple_dir dir)
+{
+	return nf_flow_offload_tuple(offload->flowtable, offload->flow,
+				     flow_rule, dir, offload->priority,
+				     FLOW_CLS_REPLACE,
+				     &offload->flowtable->flow_block.cb_list);
+}
+
 static void flow_offload_tuple_del(struct flow_offload_work *offload,
 				   enum flow_offload_tuple_dir dir)
 {
-	struct nf_flowtable *flowtable = offload->flowtable;
-	struct flow_cls_offload cls_flow = {};
-	struct flow_block_cb *block_cb;
-	struct netlink_ext_ack extack;
-	__be16 proto = ETH_P_ALL;
-
-	nf_flow_offload_init(&cls_flow, proto, offload->priority,
-			     FLOW_CLS_DESTROY,
-			     &offload->flow->tuplehash[dir].tuple, &extack);
-
-	list_for_each_entry(block_cb, &flowtable->flow_block.cb_list, list)
-		block_cb->cb(TC_SETUP_CLSFLOWER, &cls_flow, block_cb->cb_priv);
-
-	offload->flow->flags |= FLOW_OFFLOAD_HW_DEAD;
+	nf_flow_offload_tuple(offload->flowtable, offload->flow, NULL, dir,
+			      offload->priority, FLOW_CLS_DESTROY,
+			      &offload->flowtable->flow_block.cb_list);
 }
 
 static int flow_offload_rule_add(struct flow_offload_work *offload,
@@ -648,20 +655,20 @@ static int flow_offload_rule_add(struct flow_offload_work *offload,
 	return 0;
 }
 
-static int flow_offload_work_add(struct flow_offload_work *offload)
+static void flow_offload_work_add(struct flow_offload_work *offload)
 {
 	struct nf_flow_rule *flow_rule[FLOW_OFFLOAD_DIR_MAX];
 	int err;
 
 	err = nf_flow_offload_alloc(offload, flow_rule);
 	if (err < 0)
-		return -ENOMEM;
+		return;
 
 	err = flow_offload_rule_add(offload, flow_rule);
+	if (err < 0)
+		set_bit(NF_FLOW_HW_REFRESH, &offload->flow->flags);
 
 	nf_flow_offload_destroy(flow_rule);
-
-	return err;
 }
 
 static void flow_offload_work_del(struct flow_offload_work *offload)
@@ -706,7 +713,6 @@ static void flow_offload_work_handler(struct work_struct *work)
 {
 	struct flow_offload_work *offload, *next;
 	LIST_HEAD(offload_pending_list);
-	int ret;
 
 	spin_lock_bh(&flow_offload_pending_list_lock);
 	list_replace_init(&flow_offload_pending_list, &offload_pending_list);
@@ -715,9 +721,7 @@ static void flow_offload_work_handler(struct work_struct *work)
 	list_for_each_entry_safe(offload, next, &offload_pending_list, list) {
 		switch (offload->cmd) {
 		case FLOW_CLS_REPLACE:
-			ret = flow_offload_work_add(offload);
-			if (ret < 0)
-				offload->flow->flags &= ~FLOW_OFFLOAD_HW;
+			flow_offload_work_add(offload);
 			break;
 		case FLOW_CLS_DESTROY:
 			flow_offload_work_del(offload);
@@ -742,20 +746,33 @@ static void flow_offload_queue_work(struct flow_offload_work *offload)
 	schedule_work(&nf_flow_offload_work);
 }
 
-void nf_flow_offload_add(struct nf_flowtable *flowtable,
-			 struct flow_offload *flow)
+static struct flow_offload_work *
+nf_flow_offload_work_alloc(struct nf_flowtable *flowtable,
+			   struct flow_offload *flow, unsigned int cmd)
 {
 	struct flow_offload_work *offload;
 
 	offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
 	if (!offload)
-		return;
+		return NULL;
 
-	offload->cmd = FLOW_CLS_REPLACE;
+	offload->cmd = cmd;
 	offload->flow = flow;
 	offload->priority = flowtable->priority;
 	offload->flowtable = flowtable;
-	flow->flags |= FLOW_OFFLOAD_HW;
+
+	return offload;
+}
+
+void nf_flow_offload_add(struct nf_flowtable *flowtable,
+			 struct flow_offload *flow)
+{
+	struct flow_offload_work *offload;
+
+	offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_REPLACE);
+	if (!offload)
+		return;
 
 	flow_offload_queue_work(offload);
 }
@@ -765,15 +782,11 @@ void nf_flow_offload_del(struct nf_flowtable *flowtable,
 {
 	struct flow_offload_work *offload;
 
-	offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+	offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_DESTROY);
 	if (!offload)
 		return;
 
-	offload->cmd = FLOW_CLS_DESTROY;
-	offload->flow = flow;
-	offload->flow->flags |= FLOW_OFFLOAD_HW_DYING;
-	offload->flowtable = flowtable;
-
+	set_bit(NF_FLOW_HW_DYING, &flow->flags);
 	flow_offload_queue_work(offload);
 }
@@ -784,24 +797,19 @@ void nf_flow_offload_stats(struct nf_flowtable *flowtable,
 	__s32 delta;
 
 	delta = nf_flow_timeout_delta(flow->timeout);
-	if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10) ||
-	    flow->flags & FLOW_OFFLOAD_HW_DYING)
+	if ((delta >= (9 * NF_FLOW_TIMEOUT) / 10))
 		return;
 
-	offload = kzalloc(sizeof(struct flow_offload_work), GFP_ATOMIC);
+	offload = nf_flow_offload_work_alloc(flowtable, flow, FLOW_CLS_STATS);
 	if (!offload)
 		return;
 
-	offload->cmd = FLOW_CLS_STATS;
-	offload->flow = flow;
-	offload->flowtable = flowtable;
-
 	flow_offload_queue_work(offload);
 }
 
 void nf_flow_table_offload_flush(struct nf_flowtable *flowtable)
 {
-	if (flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)
+	if (nf_flowtable_hw_offload(flowtable))
 		flush_work(&nf_flow_offload_work);
 }
@@ -830,28 +838,44 @@ static int nf_flow_table_block_setup(struct nf_flowtable *flowtable,
 	return err;
 }
 
-int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
-				struct net_device *dev,
-				enum flow_block_command cmd)
+static int nf_flow_table_offload_cmd(struct flow_block_offload *bo,
+				     struct nf_flowtable *flowtable,
+				     struct net_device *dev,
+				     enum flow_block_command cmd,
+				     struct netlink_ext_ack *extack)
 {
-	struct netlink_ext_ack extack = {};
-	struct flow_block_offload bo = {};
 	int err;
 
-	if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD))
+	if (!nf_flowtable_hw_offload(flowtable))
 		return 0;
 
 	if (!dev->netdev_ops->ndo_setup_tc)
 		return -EOPNOTSUPP;
 
-	bo.net = dev_net(dev);
-	bo.block = &flowtable->flow_block;
-	bo.command = cmd;
-	bo.binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
-	bo.extack = &extack;
-	INIT_LIST_HEAD(&bo.cb_list);
+	memset(bo, 0, sizeof(*bo));
+	bo->net = dev_net(dev);
+	bo->block = &flowtable->flow_block;
+	bo->command = cmd;
+	bo->binder_type = FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS;
+	bo->extack = extack;
+	INIT_LIST_HEAD(&bo->cb_list);
+
+	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, bo);
+	if (err < 0)
+		return err;
+
+	return 0;
+}
+
+int nf_flow_table_offload_setup(struct nf_flowtable *flowtable,
+				struct net_device *dev,
+				enum flow_block_command cmd)
+{
+	struct netlink_ext_ack extack = {};
+	struct flow_block_offload bo;
+	int err;
 
-	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_FT, &bo);
+	err = nf_flow_table_offload_cmd(&bo, flowtable, dev, cmd, &extack);
 	if (err < 0)
 		return err;
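The new `FLOW_DISSECTOR_KEY_META` match above follows the flow dissector's key/mask convention: a rule constrains only the bits set in the mask, so the all-ones mask on `ingress_ifindex` pins the offloaded flow to one ingress device. A small stand-alone illustration of that convention (the struct is a stand-in for `flow_dissector_key_meta`, not the kernel type):

```c
/* Key/mask matching as used by the META dissector key added above:
 * an all-ones mask means "must equal the key", 0 means "don't care". */
#include <stdio.h>
#include <stdint.h>

struct meta_key { uint32_t ingress_ifindex; };

static int meta_match(const struct meta_key *key, const struct meta_key *mask,
		      uint32_t pkt_ifindex)
{
	return (pkt_ifindex & mask->ingress_ifindex) ==
	       (key->ingress_ifindex & mask->ingress_ifindex);
}

int main(void)
{
	struct meta_key key = { .ingress_ifindex = 3 };
	struct meta_key mask = { .ingress_ifindex = 0xffffffff };

	printf("ifindex 3: %s\n", meta_match(&key, &mask, 3) ? "hit" : "miss");
	printf("ifindex 7: %s\n", meta_match(&key, &mask, 7) ? "hit" : "miss");
	return 0;
}
```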
100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3391,6 +3391,7 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
 
 static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
 	[NFTA_SET_DESC_SIZE]	= { .type = NLA_U32 },
+	[NFTA_SET_DESC_CONCAT]	= { .type = NLA_NESTED },
 };
 
 static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
@@ -3557,6 +3558,33 @@ static __be64 nf_jiffies64_to_msecs(u64 input)
 	return cpu_to_be64(jiffies64_to_msecs(input));
 }
 
+static int nf_tables_fill_set_concat(struct sk_buff *skb,
+				     const struct nft_set *set)
+{
+	struct nlattr *concat, *field;
+	int i;
+
+	concat = nla_nest_start_noflag(skb, NFTA_SET_DESC_CONCAT);
+	if (!concat)
+		return -ENOMEM;
+
+	for (i = 0; i < set->field_count; i++) {
+		field = nla_nest_start_noflag(skb, NFTA_LIST_ELEM);
+		if (!field)
+			return -ENOMEM;
+
+		if (nla_put_be32(skb, NFTA_SET_FIELD_LEN,
+				 htonl(set->field_len[i])))
+			return -ENOMEM;
+
+		nla_nest_end(skb, field);
+	}
+
+	nla_nest_end(skb, concat);
+
+	return 0;
+}
+
 static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 			      const struct nft_set *set, u16 event, u16 flags)
 {
@@ -3620,11 +3648,17 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
 		goto nla_put_failure;
 
 	desc = nla_nest_start_noflag(skb, NFTA_SET_DESC);
+
 	if (desc == NULL)
 		goto nla_put_failure;
 	if (set->size &&
 	    nla_put_be32(skb, NFTA_SET_DESC_SIZE, htonl(set->size)))
 		goto nla_put_failure;
+
+	if (set->field_count > 1 &&
+	    nf_tables_fill_set_concat(skb, set))
+		goto nla_put_failure;
+
 	nla_nest_end(skb, desc);
 
 	nlmsg_end(skb, nlh);
@@ -3797,6 +3831,53 @@ err:
 	return err;
 }
 
+static const struct nla_policy nft_concat_policy[NFTA_SET_FIELD_MAX + 1] = {
+	[NFTA_SET_FIELD_LEN]	= { .type = NLA_U32 },
+};
+
+static int nft_set_desc_concat_parse(const struct nlattr *attr,
+				     struct nft_set_desc *desc)
+{
+	struct nlattr *tb[NFTA_SET_FIELD_MAX + 1];
+	u32 len;
+	int err;
+
+	err = nla_parse_nested_deprecated(tb, NFTA_SET_FIELD_MAX, attr,
+					  nft_concat_policy, NULL);
+	if (err < 0)
+		return err;
+
+	if (!tb[NFTA_SET_FIELD_LEN])
+		return -EINVAL;
+
+	len = ntohl(nla_get_be32(tb[NFTA_SET_FIELD_LEN]));
+
+	if (len * BITS_PER_BYTE / 32 > NFT_REG32_COUNT)
+		return -E2BIG;
+
+	desc->field_len[desc->field_count++] = len;
+
+	return 0;
+}
+
+static int nft_set_desc_concat(struct nft_set_desc *desc,
+			       const struct nlattr *nla)
+{
+	struct nlattr *attr;
+	int rem, err;
+
+	nla_for_each_nested(attr, nla, rem) {
+		if (nla_type(attr) != NFTA_LIST_ELEM)
+			return -EINVAL;
+
+		err = nft_set_desc_concat_parse(attr, desc);
+		if (err < 0)
+			return err;
+	}
+
+	return 0;
+}
+
 static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
 				    const struct nlattr *nla)
 {
@@ -3810,8 +3891,10 @@ static int nf_tables_set_desc_parse(struct nft_set_desc *desc,
 	if (da[NFTA_SET_DESC_SIZE] != NULL)
 		desc->size = ntohl(nla_get_be32(da[NFTA_SET_DESC_SIZE]));
+	if (da[NFTA_SET_DESC_CONCAT])
+		err = nft_set_desc_concat(desc, da[NFTA_SET_DESC_CONCAT]);
 
-	return 0;
+	return err;
 }
 
 static int nf_tables_newset(struct net *net, struct sock *nlsk,
@@ -3834,6 +3917,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	unsigned char *udata;
 	u16 udlen;
 	int err;
+	int i;
 
 	if (nla[NFTA_SET_TABLE] == NULL ||
 	    nla[NFTA_SET_NAME] == NULL ||
@@ -4012,6 +4096,10 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
 	set->gc_int = gc_int;
 	set->handle = nf_tables_alloc_handle(table);
 
+	set->field_count = desc.field_count;
+	for (i = 0; i < desc.field_count; i++)
+		set->field_len[i] = desc.field_len[i];
+
 	err = ops->init(set, &desc, nla);
 	if (err < 0)
 		goto err3;
@@ -4215,6 +4303,9 @@ const struct nft_set_ext_type nft_set_ext_types[] = {
 		.len = sizeof(struct nft_userdata),
 		.align = __alignof__(struct nft_userdata),
 	},
+	[NFT_SET_EXT_KEY_END] = {
+		.align = __alignof__(u32),
+	},
 };
 EXPORT_SYMBOL_GPL(nft_set_ext_types);
@@ -4233,6 +4324,7 @@ static const struct nla_policy nft_set_elem_policy[NFTA_SET_ELEM_MAX + 1] = {
 	[NFTA_SET_ELEM_EXPR]	= { .type = NLA_NESTED },
 	[NFTA_SET_ELEM_OBJREF]	= { .type = NLA_STRING,
 				    .len = NFT_OBJ_MAXNAMELEN - 1 },
+	[NFTA_SET_ELEM_KEY_END]	= { .type = NLA_NESTED },
 };
 
 static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX + 1] = {
@@ -4282,6 +4374,11 @@ static int nf_tables_fill_setelem(struct sk_buff *skb,
 			  NFT_DATA_VALUE, set->klen) < 0)
 		goto nla_put_failure;
 
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END) &&
+	    nft_data_dump(skb, NFTA_SET_ELEM_KEY_END, nft_set_ext_key_end(ext),
+			  NFT_DATA_VALUE, set->klen) < 0)
+		goto nla_put_failure;
+
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
 	    nft_data_dump(skb, NFTA_SET_ELEM_DATA, nft_set_ext_data(ext),
 			  set->dtype == NFT_DATA_VERDICT ? NFT_DATA_VERDICT : NFT_DATA_VALUE,
@@ -4524,11 +4621,28 @@ static int nft_setelem_parse_flags(const struct nft_set *set,
 	return 0;
 }
 
+static int nft_setelem_parse_key(struct nft_ctx *ctx, struct nft_set *set,
+				 struct nft_data *key, struct nlattr *attr)
+{
+	struct nft_data_desc desc;
+	int err;
+
+	err = nft_data_init(ctx, key, NFT_DATA_VALUE_MAXLEN, &desc, attr);
+	if (err < 0)
+		return err;
+
+	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
+		nft_data_release(key, desc.type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			    const struct nlattr *attr)
 {
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
-	struct nft_data_desc desc;
 	struct nft_set_elem elem;
 	struct sk_buff *skb;
 	uint32_t flags = 0;
@@ -4547,15 +4661,16 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (err < 0)
 		return err;
 
-	err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
-			    nla[NFTA_SET_ELEM_KEY]);
+	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+				    nla[NFTA_SET_ELEM_KEY]);
 	if (err < 0)
 		return err;
 
-	err = -EINVAL;
-	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) {
-		nft_data_release(&elem.key.val, desc.type);
-		return err;
+	if (nla[NFTA_SET_ELEM_KEY_END]) {
+		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+					    nla[NFTA_SET_ELEM_KEY_END]);
+		if (err < 0)
+			return err;
 	}
 
 	priv = set->ops->get(ctx->net, set, &elem, flags);
@@ -4683,8 +4798,8 @@ static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx,
 
 void *nft_set_elem_init(const struct nft_set *set,
 			const struct nft_set_ext_tmpl *tmpl,
-			const u32 *key, const u32 *data,
-			u64 timeout, u64 expiration, gfp_t gfp)
+			const u32 *key, const u32 *key_end,
+			const u32 *data, u64 timeout, u64 expiration, gfp_t gfp)
 {
 	struct nft_set_ext *ext;
 	void *elem;
@@ -4697,6 +4812,8 @@ void *nft_set_elem_init(const struct nft_set *set,
 	nft_set_ext_init(ext, tmpl);
 
 	memcpy(nft_set_ext_key(ext), key, set->klen);
+	if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END))
+		memcpy(nft_set_ext_key_end(ext), key_end, set->klen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA))
 		memcpy(nft_set_ext_data(ext), data, set->dlen);
 	if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) {
@@ -4756,13 +4873,13 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 {
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
 	u8 genmask = nft_genmask_next(ctx->net);
-	struct nft_data_desc d1, d2;
 	struct nft_set_ext_tmpl tmpl;
 	struct nft_set_ext *ext, *ext2;
 	struct nft_set_elem elem;
 	struct nft_set_binding *binding;
 	struct nft_object *obj = NULL;
 	struct nft_userdata *udata;
+	struct nft_data_desc desc;
 	struct nft_data data;
 	enum nft_registers dreg;
 	struct nft_trans *trans;
@@ -4828,15 +4945,22 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			return err;
 	}
 
-	err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &d1,
-			    nla[NFTA_SET_ELEM_KEY]);
+	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+				    nla[NFTA_SET_ELEM_KEY]);
 	if (err < 0)
-		goto err1;
-	err = -EINVAL;
-	if (d1.type != NFT_DATA_VALUE || d1.len != set->klen)
-		goto err2;
+		return err;
+
+	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
+
+	if (nla[NFTA_SET_ELEM_KEY_END]) {
+		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+					    nla[NFTA_SET_ELEM_KEY_END]);
+		if (err < 0)
+			goto err_parse_key;
+
+		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+	}
 
-	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, d1.len);
 	if (timeout > 0) {
 		nft_set_ext_add(&tmpl, NFT_SET_EXT_EXPIRATION);
 		if (timeout != set->timeout)
@@ -4846,27 +4970,27 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	if (nla[NFTA_SET_ELEM_OBJREF] != NULL) {
 		if (!(set->flags & NFT_SET_OBJECT)) {
 			err = -EINVAL;
-			goto err2;
+			goto err_parse_key_end;
 		}
 		obj = nft_obj_lookup(ctx->net, ctx->table,
 				     nla[NFTA_SET_ELEM_OBJREF],
 				     set->objtype, genmask);
 		if (IS_ERR(obj)) {
 			err = PTR_ERR(obj);
-			goto err2;
+			goto err_parse_key_end;
 		}
 		nft_set_ext_add(&tmpl, NFT_SET_EXT_OBJREF);
 	}
 
 	if (nla[NFTA_SET_ELEM_DATA] != NULL) {
-		err = nft_data_init(ctx, &data, sizeof(data), &d2,
+		err = nft_data_init(ctx, &data, sizeof(data), &desc,
 				    nla[NFTA_SET_ELEM_DATA]);
 		if (err < 0)
-			goto err2;
+			goto err_parse_key_end;
 
 		err = -EINVAL;
-		if (set->dtype != NFT_DATA_VERDICT && d2.len != set->dlen)
-			goto err3;
+		if (set->dtype != NFT_DATA_VERDICT && desc.len != set->dlen)
+			goto err_parse_data;
 
 		dreg = nft_type_to_reg(set->dtype);
 		list_for_each_entry(binding, &set->bindings, list) {
@@ -4882,18 +5006,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 			err = nft_validate_register_store(&bind_ctx, dreg,
 							  &data,
-							  d2.type, d2.len);
+							  desc.type, desc.len);
 			if (err < 0)
-				goto err3;
+				goto err_parse_data;
 
-			if (d2.type == NFT_DATA_VERDICT &&
+			if (desc.type == NFT_DATA_VERDICT &&
 			    (data.verdict.code == NFT_GOTO ||
 			     data.verdict.code == NFT_JUMP))
 				nft_validate_state_update(ctx->net,
 							  NFT_VALIDATE_NEED);
 		}
 
-		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len);
+		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, desc.len);
 	}
 
 	/* The full maximum length of userdata can exceed the maximum
@@ -4909,10 +5033,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 	}
 
 	err = -ENOMEM;
-	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, data.data,
+	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
+				      elem.key_end.val.data, data.data,
 				      timeout, expiration, GFP_KERNEL);
 	if (elem.priv == NULL)
-		goto err3;
+		goto err_parse_data;
 
 	ext = nft_set_elem_ext(set, elem.priv);
 	if (flags)
@@ -4929,7 +5054,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
 	if (trans == NULL)
-		goto err4;
+		goto err_trans;
 
 	ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK;
 	err = set->ops->insert(ctx->net, set, &elem, &ext2);
@@ -4940,7 +5065,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		    nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^
 		    nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) {
 			err = -EBUSY;
-			goto err5;
+			goto err_element_clash;
 		}
 		if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) &&
 		     nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) &&
@@ -4953,33 +5078,35 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
 		else if (!(nlmsg_flags & NLM_F_EXCL))
 			err = 0;
 		}
-		goto err5;
+		goto err_element_clash;
 	}
 
 	if (set->size &&
 	    !atomic_add_unless(&set->nelems, 1, set->size + set->ndeact)) {
 		err = -ENFILE;
-		goto err6;
+		goto err_set_full;
 	}
 
 	nft_trans_elem(trans) = elem;
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 	return 0;
 
-err6:
+err_set_full:
 	set->ops->remove(ctx->net, set, &elem);
-err5:
+err_element_clash:
 	kfree(trans);
-err4:
+err_trans:
 	if (obj)
 		obj->use--;
 	kfree(elem.priv);
-err3:
+err_parse_data:
 	if (nla[NFTA_SET_ELEM_DATA] != NULL)
-		nft_data_release(&data, d2.type);
-err2:
-	nft_data_release(&elem.key.val, d1.type);
-err1:
+		nft_data_release(&data, desc.type);
+err_parse_key_end:
+	nft_data_release(&elem.key_end.val, NFT_DATA_VALUE);
+err_parse_key:
+	nft_data_release(&elem.key.val, NFT_DATA_VALUE);
+
 	return err;
 }
@@ -5074,7 +5201,6 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 {
 	struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
 	struct nft_set_ext_tmpl tmpl;
-	struct nft_data_desc desc;
 	struct nft_set_elem elem;
 	struct nft_set_ext *ext;
 	struct nft_trans *trans;
@@ -5085,11 +5211,10 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	err = nla_parse_nested_deprecated(nla, NFTA_SET_ELEM_MAX, attr,
 					  nft_set_elem_policy, NULL);
 	if (err < 0)
-		goto err1;
+		return err;
 
-	err = -EINVAL;
 	if (nla[NFTA_SET_ELEM_KEY] == NULL)
-		goto err1;
+		return -EINVAL;
 
 	nft_set_ext_prepare(&tmpl);
 
@@ -5099,37 +5224,41 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	if (flags != 0)
 		nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
 
-	err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
-			    nla[NFTA_SET_ELEM_KEY]);
+	err = nft_setelem_parse_key(ctx, set, &elem.key.val,
+				    nla[NFTA_SET_ELEM_KEY]);
 	if (err < 0)
-		goto err1;
+		return err;
 
-	err = -EINVAL;
-	if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
-		goto err2;
+	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, set->klen);
 
-	nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
+	if (nla[NFTA_SET_ELEM_KEY_END]) {
+		err = nft_setelem_parse_key(ctx, set, &elem.key_end.val,
+					    nla[NFTA_SET_ELEM_KEY_END]);
+		if (err < 0)
+			return err;
+
+		nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY_END, set->klen);
+	}
 
 	err = -ENOMEM;
-	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
-				      0, GFP_KERNEL);
+	elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data,
+				      elem.key_end.val.data, NULL, 0, 0,
+				      GFP_KERNEL);
 	if (elem.priv == NULL)
-		goto err2;
+		goto fail_elem;
 
 	ext = nft_set_elem_ext(set, elem.priv);
 	if (flags)
 		*nft_set_ext_flags(ext) = flags;
 
 	trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
-	if (trans == NULL) {
-		err = -ENOMEM;
-		goto err3;
-	}
+	if (trans == NULL)
+		goto fail_trans;
 
 	priv = set->ops->deactivate(ctx->net, set, &elem);
 	if (priv == NULL) {
 		err = -ENOENT;
-		goto err4;
+		goto fail_ops;
 	}
 	kfree(elem.priv);
 	elem.priv = priv;
@@ -5140,13 +5269,12 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 	return 0;
 
-err4:
+fail_ops:
 	kfree(trans);
-err3:
+fail_trans:
 	kfree(elem.priv);
-err2:
-	nft_data_release(&elem.key.val, desc.type);
-err1:
+fail_elem:
+	nft_data_release(&elem.key.val, NFT_DATA_VALUE);
 	return err;
 }
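`nft_set_desc_concat_parse()` above caps each concatenated field by the size of the nft register file: a field of `len` bytes needs `len * 8 / 32` 32-bit registers and is rejected with `-E2BIG` beyond `NFT_REG32_COUNT` (16 in this kernel). A small user-space check of that arithmetic (constants mirror the kernel's; the field lengths are illustrative):

```c
/* The bound enforced by nft_set_desc_concat_parse() above. */
#include <stdio.h>

#define BITS_PER_BYTE	8
#define NFT_REG32_COUNT	16

static int field_len_ok(unsigned int len)
{
	return len * BITS_PER_BYTE / 32 <= NFT_REG32_COUNT;
}

int main(void)
{
	printf("IPv4 addr, 4 bytes:  %s\n", field_len_ok(4) ? "ok" : "-E2BIG");
	printf("IPv6 addr, 16 bytes: %s\n", field_len_ok(16) ? "ok" : "-E2BIG");
	printf("128 bytes:           %s\n", field_len_ok(128) ? "ok" : "-E2BIG");
	return 0;
}
```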
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
index a9fce8d10051..586b621007eb 100644
--- a/net/netfilter/nf_tables_set_core.c
+++ b/net/netfilter/nf_tables_set_core.c
@@ -9,12 +9,14 @@ static int __init nf_tables_set_module_init(void)
 	nft_register_set(&nft_set_rhash_type);
 	nft_register_set(&nft_set_bitmap_type);
 	nft_register_set(&nft_set_rbtree_type);
+	nft_register_set(&nft_set_pipapo_type);
 
 	return 0;
 }
 
 static void __exit nf_tables_set_module_exit(void)
 {
+	nft_unregister_set(&nft_set_pipapo_type);
 	nft_unregister_set(&nft_set_rbtree_type);
 	nft_unregister_set(&nft_set_bitmap_type);
 	nft_unregister_set(&nft_set_rhash_type);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index feabdfb22920..76535fd9278c 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -778,7 +778,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 {
 	unsigned int queued;
 	struct nfqnl_instance *queue;
-	struct sk_buff *skb, *segs;
+	struct sk_buff *skb, *segs, *nskb;
 	int err = -ENOBUFS;
 	struct net *net = entry->state.net;
 	struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -815,8 +815,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 		goto out_err;
 	queued = 0;
 	err = 0;
-	do {
-		struct sk_buff *nskb = segs->next;
+	skb_list_walk_safe(segs, segs, nskb) {
 		if (err == 0)
 			err = __nfqnl_enqueue_packet_gso(net, queue,
 							segs, entry);
@@ -824,8 +823,7 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
 			queued++;
 		else
 			kfree_skb(segs);
-		segs = nskb;
-	} while (segs);
+	}
 
 	if (queued) {
 		if (err) /* some segments are already queued */
diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c
index 10e9d50e4e19..0ed2281f03be 100644
--- a/net/netfilter/nft_bitwise.c
+++ b/net/netfilter/nft_bitwise.c
@@ -18,21 +18,66 @@
 struct nft_bitwise {
 	enum nft_registers	sreg:8;
 	enum nft_registers	dreg:8;
+	enum nft_bitwise_ops	op:8;
 	u8			len;
 	struct nft_data		mask;
 	struct nft_data		xor;
+	struct nft_data		data;
 };
 
+static void nft_bitwise_eval_bool(u32 *dst, const u32 *src,
+				  const struct nft_bitwise *priv)
+{
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
+		dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
+}
+
+static void nft_bitwise_eval_lshift(u32 *dst, const u32 *src,
+				    const struct nft_bitwise *priv)
+{
+	u32 shift = priv->data.data[0];
+	unsigned int i;
+	u32 carry = 0;
+
+	for (i = DIV_ROUND_UP(priv->len, sizeof(u32)); i > 0; i--) {
+		dst[i - 1] = (src[i - 1] << shift) | carry;
+		carry = src[i - 1] >> (BITS_PER_TYPE(u32) - shift);
+	}
+}
+
+static void nft_bitwise_eval_rshift(u32 *dst, const u32 *src,
+				    const struct nft_bitwise *priv)
+{
+	u32 shift = priv->data.data[0];
+	unsigned int i;
+	u32 carry = 0;
+
+	for (i = 0; i < DIV_ROUND_UP(priv->len, sizeof(u32)); i++) {
+		dst[i] = carry | (src[i] >> shift);
+		carry = src[i] << (BITS_PER_TYPE(u32) - shift);
+	}
+}
+
 void nft_bitwise_eval(const struct nft_expr *expr,
 		      struct nft_regs *regs, const struct nft_pktinfo *pkt)
 {
 	const struct nft_bitwise *priv = nft_expr_priv(expr);
 	const u32 *src = &regs->data[priv->sreg];
 	u32 *dst = &regs->data[priv->dreg];
-	unsigned int i;
 
-	for (i = 0; i < DIV_ROUND_UP(priv->len, 4); i++)
-		dst[i] = (src[i] & priv->mask.data[i]) ^ priv->xor.data[i];
+	switch (priv->op) {
+	case NFT_BITWISE_BOOL:
+		nft_bitwise_eval_bool(dst, src, priv);
+		break;
+	case NFT_BITWISE_LSHIFT:
+		nft_bitwise_eval_lshift(dst, src, priv);
+		break;
+	case NFT_BITWISE_RSHIFT:
+		nft_bitwise_eval_rshift(dst, src, priv);
+		break;
+	}
 }
 
 static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
@@ -41,40 +86,22 @@ static const struct nla_policy nft_bitwise_policy[NFTA_BITWISE_MAX + 1] = {
 	[NFTA_BITWISE_LEN]	= { .type = NLA_U32 },
 	[NFTA_BITWISE_MASK]	= { .type = NLA_NESTED },
 	[NFTA_BITWISE_XOR]	= { .type = NLA_NESTED },
+	[NFTA_BITWISE_OP]	= { .type = NLA_U32 },
+	[NFTA_BITWISE_DATA]	= { .type = NLA_NESTED },
 };
 
-static int nft_bitwise_init(const struct nft_ctx *ctx,
-			    const struct nft_expr *expr,
-			    const struct nlattr * const tb[])
+static int nft_bitwise_init_bool(struct nft_bitwise *priv,
+				 const struct nlattr *const tb[])
 {
-	struct nft_bitwise *priv = nft_expr_priv(expr);
 	struct nft_data_desc d1, d2;
-	u32 len;
 	int err;
 
-	if (tb[NFTA_BITWISE_SREG] == NULL ||
-	    tb[NFTA_BITWISE_DREG] == NULL ||
-	    tb[NFTA_BITWISE_LEN] == NULL ||
-	    tb[NFTA_BITWISE_MASK] == NULL ||
-	    tb[NFTA_BITWISE_XOR] == NULL)
+	if (tb[NFTA_BITWISE_DATA])
 		return -EINVAL;
 
-	err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
-	if (err < 0)
-		return err;
-
-	priv->len = len;
-
-	priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
-	err = nft_validate_register_load(priv->sreg, priv->len);
-	if (err < 0)
-		return err;
-
-	priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
-	err = nft_validate_register_store(ctx, priv->dreg, NULL,
-					  NFT_DATA_VALUE, priv->len);
-	if (err < 0)
-		return err;
+	if (!tb[NFTA_BITWISE_MASK] ||
+	    !tb[NFTA_BITWISE_XOR])
+		return -EINVAL;
 
 	err = nft_data_init(NULL, &priv->mask, sizeof(priv->mask), &d1,
 			    tb[NFTA_BITWISE_MASK]);
@@ -102,40 +129,151 @@ err1:
 	return err;
 }
 
-static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static int nft_bitwise_init_shift(struct nft_bitwise *priv,
+				  const struct nlattr *const tb[])
 {
-	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	struct nft_data_desc d;
+	int err;
 
-	if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
-		goto nla_put_failure;
-	if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
-		goto nla_put_failure;
-	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
-		goto nla_put_failure;
+	if (tb[NFTA_BITWISE_MASK] ||
+	    tb[NFTA_BITWISE_XOR])
+		return -EINVAL;
+
+	if (!tb[NFTA_BITWISE_DATA])
+		return -EINVAL;
 
+	err = nft_data_init(NULL, &priv->data, sizeof(priv->data), &d,
+			    tb[NFTA_BITWISE_DATA]);
+	if (err < 0)
+		return err;
+	if (d.type != NFT_DATA_VALUE || d.len != sizeof(u32) ||
+	    priv->data.data[0] >= BITS_PER_TYPE(u32)) {
+		nft_data_release(&priv->data, d.type);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nft_bitwise_init(const struct nft_ctx *ctx,
+			    const struct nft_expr *expr,
+			    const struct nlattr * const tb[])
+{
+	struct nft_bitwise *priv = nft_expr_priv(expr);
+	u32 len;
+	int err;
+
+	if (!tb[NFTA_BITWISE_SREG] ||
+	    !tb[NFTA_BITWISE_DREG] ||
+	    !tb[NFTA_BITWISE_LEN])
+		return -EINVAL;
+
+	err = nft_parse_u32_check(tb[NFTA_BITWISE_LEN], U8_MAX, &len);
+	if (err < 0)
+		return err;
+
+	priv->len = len;
+
+	priv->sreg = nft_parse_register(tb[NFTA_BITWISE_SREG]);
+	err = nft_validate_register_load(priv->sreg, priv->len);
+	if (err < 0)
+		return err;
+
+	priv->dreg = nft_parse_register(tb[NFTA_BITWISE_DREG]);
+	err = nft_validate_register_store(ctx, priv->dreg, NULL,
+					  NFT_DATA_VALUE, priv->len);
+	if (err < 0)
+		return err;
+
+	if (tb[NFTA_BITWISE_OP]) {
+		priv->op = ntohl(nla_get_be32(tb[NFTA_BITWISE_OP]));
+		switch (priv->op) {
+		case NFT_BITWISE_BOOL:
+		case NFT_BITWISE_LSHIFT:
+		case NFT_BITWISE_RSHIFT:
+			break;
+		default:
+			return -EOPNOTSUPP;
+		}
+	} else {
+		priv->op = NFT_BITWISE_BOOL;
+	}
+
+	switch(priv->op) {
+	case NFT_BITWISE_BOOL:
+		err = nft_bitwise_init_bool(priv, tb);
+		break;
+	case NFT_BITWISE_LSHIFT:
+	case NFT_BITWISE_RSHIFT:
+		err = nft_bitwise_init_shift(priv, tb);
+		break;
+	}
+
+	return err;
+}
+
+static int nft_bitwise_dump_bool(struct sk_buff *skb,
+				 const struct nft_bitwise *priv)
+{
 	if (nft_data_dump(skb, NFTA_BITWISE_MASK, &priv->mask,
 			  NFT_DATA_VALUE, priv->len) < 0)
-		goto nla_put_failure;
+		return -1;
 
 	if (nft_data_dump(skb, NFTA_BITWISE_XOR, &priv->xor,
 			  NFT_DATA_VALUE, priv->len) < 0)
-		goto nla_put_failure;
+		return -1;
 
 	return 0;
+}
+
+static int nft_bitwise_dump_shift(struct sk_buff *skb,
+				  const struct nft_bitwise *priv)
+{
+	if (nft_data_dump(skb, NFTA_BITWISE_DATA, &priv->data,
+			  NFT_DATA_VALUE, sizeof(u32)) < 0)
+		return -1;
+	return 0;
+}
+
+static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+	const struct nft_bitwise *priv = nft_expr_priv(expr);
+	int err = 0;
+
+	if (nft_dump_register(skb, NFTA_BITWISE_SREG, priv->sreg))
+		return -1;
+	if (nft_dump_register(skb, NFTA_BITWISE_DREG, priv->dreg))
+		return -1;
+	if (nla_put_be32(skb, NFTA_BITWISE_LEN, htonl(priv->len)))
+		return -1;
+	if (nla_put_be32(skb, NFTA_BITWISE_OP, htonl(priv->op)))
+		return -1;
+
+	switch (priv->op) {
+	case NFT_BITWISE_BOOL:
+		err = nft_bitwise_dump_bool(skb, priv);
+		break;
+	case NFT_BITWISE_LSHIFT:
+	case NFT_BITWISE_RSHIFT:
+		err = nft_bitwise_dump_shift(skb, priv);
+		break;
+	}
 
-nla_put_failure:
-	return -1;
+	return err;
 }
 
 static struct nft_data zero;
 
 static int nft_bitwise_offload(struct nft_offload_ctx *ctx,
-			      struct nft_flow_rule *flow,
-			      const struct nft_expr *expr)
+			       struct nft_flow_rule *flow,
+			       const struct nft_expr *expr)
 {
 	const struct nft_bitwise *priv = nft_expr_priv(expr);
 	struct nft_offload_reg *reg = &ctx->regs[priv->dreg];
 
+	if (priv->op != NFT_BITWISE_BOOL)
+		return -EOPNOTSUPP;
+
 	if (memcmp(&priv->xor, &zero, sizeof(priv->xor)) ||
 	    priv->sreg != priv->dreg || priv->len != reg->len)
 		return -EOPNOTSUPP;
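The new shift helpers treat a register as an array of u32 words with the most significant word first: for a left shift, bits pushed out of the top of word `i` become the `carry` merged into word `i - 1` on the next iteration. A user-space check of that loop against a native 64-bit shift (the two-word layout and sample value are illustrative; the demo keeps the shift in 1..31, which also avoids the undefined `u32 >> 32` that the init path's `>= 32` rejection guards against):

```c
/* Multi-word left shift with carry, mirroring the loop in
 * nft_bitwise_eval_lshift() above. */
#include <stdio.h>
#include <stdint.h>

#define WORDS 2

static void lshift(uint32_t *dst, const uint32_t *src, unsigned int shift)
{
	uint32_t carry = 0;
	int i;

	for (i = WORDS; i > 0; i--) {
		dst[i - 1] = (src[i - 1] << shift) | carry;
		carry = src[i - 1] >> (32 - shift);	/* bits for word i-2 */
	}
}

int main(void)
{
	uint64_t v = 0x0123456789abcdefULL;
	uint32_t src[WORDS] = { (uint32_t)(v >> 32), (uint32_t)v }; /* MSW first */
	uint32_t dst[WORDS];
	unsigned int shift = 12;

	lshift(dst, src, shift);
	printf("expect %016llx\n", (unsigned long long)(v << shift));
	printf("got    %08x%08x\n", dst[0], dst[1]);
	return 0;
}
```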
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 8887295414dc..683785225a3e 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -54,7 +54,7 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
 
 	timeout = priv->timeout ? : set->timeout;
 	elem = nft_set_elem_init(set, &priv->tmpl,
-				 &regs->data[priv->sreg_key],
+				 &regs->data[priv->sreg_key], NULL,
 				 &regs->data[priv->sreg_data],
 				 timeout, 0, GFP_ATOMIC);
 	if (elem == NULL)
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 9740b554fdb3..951b6e87ed5d 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -17,6 +17,7 @@
 #include <linux/smp.h>
 #include <linux/static_key.h>
 #include <net/dst.h>
+#include <net/ip.h>
 #include <net/sock.h>
 #include <net/tcp_states.h> /* for TCP_TIME_WAIT */
 #include <net/netfilter/nf_tables.h>
@@ -33,8 +34,9 @@
 
 static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
 
-static u8 nft_meta_weekday(time64_t secs)
+static u8 nft_meta_weekday(void)
 {
+	time64_t secs = ktime_get_real_seconds();
 	unsigned int dse;
 	u8 wday;
 
@@ -56,14 +58,264 @@ static u32 nft_meta_hour(time64_t secs)
 		+ tm.tm_sec;
 }
 
+static noinline_for_stack void
+nft_meta_get_eval_time(enum nft_meta_keys key,
+		       u32 *dest)
+{
+	switch (key) {
+	case NFT_META_TIME_NS:
+		nft_reg_store64(dest, ktime_get_real_ns());
+		break;
+	case NFT_META_TIME_DAY:
+		nft_reg_store8(dest, nft_meta_weekday());
+		break;
+	case NFT_META_TIME_HOUR:
+		*dest = nft_meta_hour(ktime_get_real_seconds());
+		break;
+	default:
+		break;
+	}
+}
+
+static noinline bool
+nft_meta_get_eval_pkttype_lo(const struct nft_pktinfo *pkt,
+			     u32 *dest)
+{
+	const struct sk_buff *skb = pkt->skb;
+
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		if (ipv4_is_multicast(ip_hdr(skb)->daddr))
+			nft_reg_store8(dest, PACKET_MULTICAST);
+		else
+			nft_reg_store8(dest, PACKET_BROADCAST);
+		break;
+	case NFPROTO_IPV6:
+		nft_reg_store8(dest, PACKET_MULTICAST);
+		break;
+	case NFPROTO_NETDEV:
+		switch (skb->protocol) {
+		case htons(ETH_P_IP): {
+			int noff = skb_network_offset(skb);
+			struct iphdr *iph, _iph;
+
+			iph = skb_header_pointer(skb, noff,
+						 sizeof(_iph), &_iph);
+			if (!iph)
+				return false;
+
+			if (ipv4_is_multicast(iph->daddr))
+				nft_reg_store8(dest, PACKET_MULTICAST);
+			else
+				nft_reg_store8(dest, PACKET_BROADCAST);
+
+			break;
+		}
+		case htons(ETH_P_IPV6):
+			nft_reg_store8(dest, PACKET_MULTICAST);
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return false;
+		}
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return false;
+	}
+
+	return true;
+}
+
+static noinline bool
+nft_meta_get_eval_skugid(enum nft_meta_keys key,
+			 u32 *dest,
+			 const struct nft_pktinfo *pkt)
+{
+	struct sock *sk = skb_to_full_sk(pkt->skb);
+	struct socket *sock;
+
+	if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+		return false;
+
+	read_lock_bh(&sk->sk_callback_lock);
+	sock = sk->sk_socket;
+	if (!sock || !sock->file) {
+		read_unlock_bh(&sk->sk_callback_lock);
+		return false;
+	}
+
+	switch (key) {
+	case NFT_META_SKUID:
+		*dest = from_kuid_munged(&init_user_ns,
+					 sock->file->f_cred->fsuid);
+		break;
+	case NFT_META_SKGID:
+		*dest = from_kgid_munged(&init_user_ns,
+					 sock->file->f_cred->fsgid);
+		break;
+	default:
+		break;
+	}
+
+	read_unlock_bh(&sk->sk_callback_lock);
+	return true;
+}
+
+#ifdef CONFIG_CGROUP_NET_CLASSID
+static noinline bool
+nft_meta_get_eval_cgroup(u32 *dest, const struct nft_pktinfo *pkt)
+{
+	struct sock *sk = skb_to_full_sk(pkt->skb);
+
+	if (!sk || !sk_fullsock(sk) || !net_eq(nft_net(pkt), sock_net(sk)))
+		return false;
+
+	*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
+	return true;
+}
+#endif
+
+static noinline bool nft_meta_get_eval_kind(enum nft_meta_keys key,
+					    u32 *dest,
+					    const struct nft_pktinfo *pkt)
+{
+	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
+
+	switch (key) {
+	case NFT_META_IIFKIND:
+		if (!in || !in->rtnl_link_ops)
+			return false;
+		strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
+		break;
+	case NFT_META_OIFKIND:
+		if (!out || !out->rtnl_link_ops)
+			return false;
+		strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static void nft_meta_store_ifindex(u32 *dest, const struct net_device *dev)
+{
+	*dest = dev ? dev->ifindex : 0;
+}
+
+static void nft_meta_store_ifname(u32 *dest, const struct net_device *dev)
+{
+	strncpy((char *)dest, dev ? dev->name : "", IFNAMSIZ);
+}
+
+static bool nft_meta_store_iftype(u32 *dest, const struct net_device *dev)
+{
+	if (!dev)
+		return false;
+
+	nft_reg_store16(dest, dev->type);
+	return true;
+}
+
+static bool nft_meta_store_ifgroup(u32 *dest, const struct net_device *dev)
+{
+	if (!dev)
+		return false;
+
+	*dest = dev->group;
+	return true;
+}
+
+static bool nft_meta_get_eval_ifname(enum nft_meta_keys key, u32 *dest,
+				     const struct nft_pktinfo *pkt)
+{
+	switch (key) {
+	case NFT_META_IIFNAME:
+		nft_meta_store_ifname(dest, nft_in(pkt));
+		break;
+	case NFT_META_OIFNAME:
+		nft_meta_store_ifname(dest, nft_out(pkt));
+		break;
+	case NFT_META_IIF:
+		nft_meta_store_ifindex(dest, nft_in(pkt));
+		break;
+	case NFT_META_OIF:
+		nft_meta_store_ifindex(dest, nft_out(pkt));
+		break;
+	case NFT_META_IIFTYPE:
+		if (!nft_meta_store_iftype(dest, nft_in(pkt)))
+			return false;
+		break;
+	case NFT_META_OIFTYPE:
+		if (!nft_meta_store_iftype(dest, nft_out(pkt)))
+			return false;
+		break;
+	case NFT_META_IIFGROUP:
+		if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+			return false;
+		break;
+	case NFT_META_OIFGROUP:
+		if (!nft_meta_store_ifgroup(dest, nft_out(pkt)))
+			return false;
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+static noinline u32 nft_prandom_u32(void)
+{
+	struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+
+	return prandom_u32_state(state);
+}
+
+#ifdef CONFIG_IP_ROUTE_CLASSID
+static noinline bool
+nft_meta_get_eval_rtclassid(const struct sk_buff *skb, u32 *dest)
+{
+	const struct dst_entry *dst = skb_dst(skb);
+
+	if (!dst)
+		return false;
+
+	*dest = dst->tclassid;
+	return true;
+}
+#endif
+
+static noinline u32 nft_meta_get_eval_sdif(const struct nft_pktinfo *pkt)
+{
+	switch (nft_pf(pkt)) {
+	case NFPROTO_IPV4:
+		return inet_sdif(pkt->skb);
+	case NFPROTO_IPV6:
+		return inet6_sdif(pkt->skb);
+	}
+
+	return 0;
+}
+
+static noinline void
+nft_meta_get_eval_sdifname(u32 *dest, const struct nft_pktinfo *pkt)
+{
+	u32 sdif = nft_meta_get_eval_sdif(pkt);
+	const struct net_device *dev;
+
+	dev = sdif ? dev_get_by_index_rcu(nft_net(pkt), sdif) : NULL;
+	nft_meta_store_ifname(dest, dev);
+}
+
 void nft_meta_get_eval(const struct nft_expr *expr,
 		       struct nft_regs *regs,
 		       const struct nft_pktinfo *pkt)
 {
 	const struct nft_meta *priv = nft_expr_priv(expr);
 	const struct sk_buff *skb = pkt->skb;
-	const struct net_device *in = nft_in(pkt), *out = nft_out(pkt);
-	struct sock *sk;
 	u32 *dest = &regs->data[priv->dreg];
 
 	switch (priv->key) {
@@ -88,69 +340,26 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 		*dest = skb->mark;
 		break;
 	case NFT_META_IIF:
-		*dest = in ? in->ifindex : 0;
-		break;
 	case NFT_META_OIF:
-		*dest = out ? out->ifindex : 0;
-		break;
 	case NFT_META_IIFNAME:
-		strncpy((char *)dest, in ? in->name : "", IFNAMSIZ);
-		break;
 	case NFT_META_OIFNAME:
-		strncpy((char *)dest, out ? out->name : "", IFNAMSIZ);
-		break;
 	case NFT_META_IIFTYPE:
-		if (in == NULL)
-			goto err;
-		nft_reg_store16(dest, in->type);
-		break;
 	case NFT_META_OIFTYPE:
-		if (out == NULL)
+	case NFT_META_IIFGROUP:
+	case NFT_META_OIFGROUP:
+		if (!nft_meta_get_eval_ifname(priv->key, dest, pkt))
 			goto err;
-		nft_reg_store16(dest, out->type);
 		break;
 	case NFT_META_SKUID:
-		sk = skb_to_full_sk(skb);
-		if (!sk || !sk_fullsock(sk) ||
-		    !net_eq(nft_net(pkt), sock_net(sk)))
-			goto err;
-
-		read_lock_bh(&sk->sk_callback_lock);
-		if (sk->sk_socket == NULL ||
-		    sk->sk_socket->file == NULL) {
-			read_unlock_bh(&sk->sk_callback_lock);
-			goto err;
-		}
-
-		*dest = from_kuid_munged(&init_user_ns,
-				sk->sk_socket->file->f_cred->fsuid);
-		read_unlock_bh(&sk->sk_callback_lock);
-		break;
 	case NFT_META_SKGID:
-		sk = skb_to_full_sk(skb);
-		if (!sk || !sk_fullsock(sk) ||
-		    !net_eq(nft_net(pkt), sock_net(sk)))
+		if (!nft_meta_get_eval_skugid(priv->key, dest, pkt))
 			goto err;
-
-		read_lock_bh(&sk->sk_callback_lock);
-		if (sk->sk_socket == NULL ||
-		    sk->sk_socket->file == NULL) {
-			read_unlock_bh(&sk->sk_callback_lock);
-			goto err;
-		}
-		*dest = from_kgid_munged(&init_user_ns,
-				 sk->sk_socket->file->f_cred->fsgid);
-		read_unlock_bh(&sk->sk_callback_lock);
 		break;
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	case NFT_META_RTCLASSID: {
-		const struct dst_entry *dst = skb_dst(skb);
-
-		if (dst == NULL)
+	case NFT_META_RTCLASSID:
+		if (!nft_meta_get_eval_rtclassid(skb, dest))
 			goto err;
-		*dest = dst->tclassid;
 		break;
-	}
 #endif
 #ifdef CONFIG_NETWORK_SECMARK
 	case NFT_META_SECMARK:
@@ -163,97 +372,41 @@ void nft_meta_get_eval(const struct nft_expr *expr,
 			break;
 		}
 
-		switch (nft_pf(pkt)) {
-		case NFPROTO_IPV4:
-			if (ipv4_is_multicast(ip_hdr(skb)->daddr))
-				nft_reg_store8(dest, PACKET_MULTICAST);
-			else
-				nft_reg_store8(dest, PACKET_BROADCAST);
-			break;
-		case NFPROTO_IPV6:
-			nft_reg_store8(dest, PACKET_MULTICAST);
-			break;
-		case NFPROTO_NETDEV:
-			switch (skb->protocol) {
-			case htons(ETH_P_IP): {
-				int noff = skb_network_offset(skb);
-				struct iphdr *iph, _iph;
-
-				iph = skb_header_pointer(skb, noff,
-							 sizeof(_iph), &_iph);
-				if (!iph)
-					goto err;
-
-				if (ipv4_is_multicast(iph->daddr))
-					nft_reg_store8(dest, PACKET_MULTICAST);
-				else
-					nft_reg_store8(dest, PACKET_BROADCAST);
-
-				break;
-			}
-			case htons(ETH_P_IPV6):
-				nft_reg_store8(dest, PACKET_MULTICAST);
-				break;
-			default:
-				WARN_ON_ONCE(1);
-				goto err;
-			}
-			break;
-		default:
-			WARN_ON_ONCE(1);
+		if (!nft_meta_get_eval_pkttype_lo(pkt, dest))
 			goto err;
-		}
 		break;
 	case NFT_META_CPU:
 		*dest = raw_smp_processor_id();
 		break;
-	case NFT_META_IIFGROUP:
-		if (in == NULL)
-			goto err;
-		*dest = in->group;
-		break;
-	case NFT_META_OIFGROUP:
-		if (out == NULL)
-			goto err;
-		*dest = out->group;
-		break;
 #ifdef CONFIG_CGROUP_NET_CLASSID
 	case NFT_META_CGROUP:
-		sk = skb_to_full_sk(skb);
-		if (!sk || !sk_fullsock(sk) ||
-		    !net_eq(nft_net(pkt), sock_net(sk)))
+		if (!nft_meta_get_eval_cgroup(dest, pkt))
 			goto err;
-		*dest = sock_cgroup_classid(&sk->sk_cgrp_data);
 		break;
 #endif
-	case NFT_META_PRANDOM: {
-		struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
-		*dest = prandom_u32_state(state);
+	case NFT_META_PRANDOM:
+		*dest = nft_prandom_u32();
 		break;
-	}
 #ifdef CONFIG_XFRM
 	case NFT_META_SECPATH:
 		nft_reg_store8(dest, secpath_exists(skb));
 		break;
 #endif
 	case NFT_META_IIFKIND:
-		if (in == NULL || in->rtnl_link_ops == NULL)
-			goto err;
-		strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ);
-		break;
 	case NFT_META_OIFKIND:
-		if (out == NULL || out->rtnl_link_ops == NULL)
+		if (!nft_meta_get_eval_kind(priv->key, dest, pkt))
 			goto err;
-		strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ);
 		break;
 	case NFT_META_TIME_NS:
-		nft_reg_store64(dest, ktime_get_real_ns());
-		break;
 	case NFT_META_TIME_DAY:
-		nft_reg_store8(dest, nft_meta_weekday(ktime_get_real_seconds()));
-		break;
 	case NFT_META_TIME_HOUR:
-		*dest = nft_meta_hour(ktime_get_real_seconds());
+		nft_meta_get_eval_time(priv->key, dest);
+		break;
+	case NFT_META_SDIF:
+		*dest = nft_meta_get_eval_sdif(pkt);
+		break;
+	case NFT_META_SDIFNAME:
+		nft_meta_get_eval_sdifname(dest, pkt);
 		break;
 	default:
 		WARN_ON(1);
@@ -335,6 +488,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_MARK:
 	case NFT_META_IIF:
 	case NFT_META_OIF:
+	case NFT_META_SDIF:
 	case NFT_META_SKUID:
 	case NFT_META_SKGID:
 #ifdef CONFIG_IP_ROUTE_CLASSID
@@ -356,6 +510,7 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
 	case NFT_META_OIFNAME:
 	case NFT_META_IIFKIND:
 	case NFT_META_OIFKIND:
+	case NFT_META_SDIFNAME:
 		len = IFNAMSIZ;
 		break;
 	case NFT_META_PRANDOM:
@@ -386,16 +541,28 @@
 }
 EXPORT_SYMBOL_GPL(nft_meta_get_init);
 
-static int nft_meta_get_validate(const struct nft_ctx *ctx,
-				 const struct nft_expr *expr,
-				 const struct nft_data **data)
+static int nft_meta_get_validate_sdif(const struct nft_ctx *ctx)
 {
-#ifdef CONFIG_XFRM
-	const struct nft_meta *priv = nft_expr_priv(expr);
 	unsigned int hooks;
 
-	if (priv->key != NFT_META_SECPATH)
-		return 0;
+	switch (ctx->family) {
+	case NFPROTO_IPV4:
+	case NFPROTO_IPV6:
+	case NFPROTO_INET:
+		hooks = (1 << NF_INET_LOCAL_IN) |
+			(1 << NF_INET_FORWARD);
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
+static int nft_meta_get_validate_xfrm(const struct nft_ctx *ctx)
+{
+#ifdef CONFIG_XFRM
+	unsigned int hooks;
 
 	switch (ctx->family) {
 	case NFPROTO_NETDEV:
@@ -418,6 +585,25 @@ static int nft_meta_get_validate(const struct nft_ctx *ctx,
 #endif
 }
 
+static int nft_meta_get_validate(const struct nft_ctx *ctx,
+				 const struct nft_expr *expr,
+				 const struct nft_data **data)
+{
+	const struct nft_meta *priv = nft_expr_priv(expr);
+
+	switch (priv->key) {
+	case NFT_META_SECPATH:
+		return nft_meta_get_validate_xfrm(ctx);
+	case NFT_META_SDIF:
+	case NFT_META_SDIFNAME:
+		return nft_meta_get_validate_sdif(ctx);
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 int nft_meta_set_validate(const struct nft_ctx *ctx,
 			  const struct nft_expr *expr,
 			  const struct nft_data **data)
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 087a056e34d1..87e8d9ba0c9b 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -259,8 +259,8 @@ static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
 }
 
 static int nft_bitmap_init(const struct nft_set *set,
-			 const struct nft_set_desc *desc,
-			 const struct nlattr * const nla[])
+			   const struct nft_set_desc *desc,
+			   const struct nlattr * const nla[])
 {
 	struct nft_bitmap *priv = nft_set_priv(set);
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index b331a3c9a3a8..d350a7cd3af0 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -645,7 +645,7 @@ static bool nft_hash_estimate(const struct nft_set_desc *desc, u32 features,
 }
 
 static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features,
-			      struct nft_set_estimate *est)
+				   struct nft_set_estimate *est)
 {
 	if (!desc->size)
 		return false;
b/net/netfilter/nft_set_pipapo.c new file index 000000000000..f0cb1e13af50 --- /dev/null +++ b/net/netfilter/nft_set_pipapo.c @@ -0,0 +1,2102 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* PIPAPO: PIle PAcket POlicies: set for arbitrary concatenations of ranges + * + * Copyright (c) 2019-2020 Red Hat GmbH + * + * Author: Stefano Brivio <sbrivio@redhat.com> + */ + +/** + * DOC: Theory of Operation + * + * + * Problem + * ------- + * + * Match packet bytes against entries composed of ranged or non-ranged packet + * field specifiers, mapping them to arbitrary references. For example: + * + * :: + * + * --- fields ---> + * | [net],[port],[net]... => [reference] + * entries [net],[port],[net]... => [reference] + * | [net],[port],[net]... => [reference] + * V ... + * + * where [net] fields can be IP ranges or netmasks, and [port] fields are port + * ranges. Arbitrary packet fields can be matched. + * + * + * Algorithm Overview + * ------------------ + * + * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally + * relies on the consideration that every contiguous range in a space of b bits + * can be converted into b * 2 netmasks, from Theorem 3 in [Rottenstreich 2010], + * as also illustrated in Section 9 of [Kogan 2014]. + * + * Classification against a number of entries that require matching given bits + * of a packet field is performed by grouping those bits in sets of arbitrary + * size, and classifying packet bits one group at a time. + * + * Example: + * to match the source port (16 bits) of a packet, we can divide those 16 bits + * into 4 groups of 4 bits each. Given the entry: + * 0000 0001 0101 1001 + * and a packet with source port: + * 0000 0001 1010 1001 + * the first and second groups match, but the third doesn't. We conclude that + * the packet doesn't match the given entry. + * + * Translate the set to a sequence of lookup tables, one per field. Each table + * has two dimensions: bit groups to be matched for a single packet field, and + * all the possible values of said groups (buckets). Input entries are + * represented as one or more rules, depending on the number of composing + * netmasks for the given field specifier, and a group match is indicated as a + * set bit, with number corresponding to the rule index, in all the buckets + * whose value matches the entry for a given group. + * + * Rules are mapped between fields through an array of x, n pairs, with each + * item mapping a matched rule to one or more rules. The position of the pair in + * the array indicates the matched rule to be mapped to the next field, x + * indicates the first rule index in the next field, and n the amount of + * next-field rules the current rule maps to. + * + * The mapping array for the last field maps to the desired references. + * + * To match, we perform table lookups using the values of grouped packet bits, + * and use a sequence of bitwise operations to progressively evaluate rule + * matching.
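+ *
+ * Editor's note (illustrative addition, not part of the original patch):
+ * as a tiny instance of the range-to-netmask conversion cited above, the
+ * 4-bit range 9-15 splits into three netmasks:
+ *
+ * ::
+ *
+ *      1001 (exactly 9), 101x (10-11), 11xx (12-15)
+ *
+ * staying well within the b * 2 bound for b = 4.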
+ * + * A stand-alone, reference implementation, also including notes about possible + * future optimisations, is available at: + * https://pipapo.lameexcu.se/ + * + * Insertion + * --------- + * + * - For each packet field: + * + * - divide the b packet bits we want to classify into groups of size t, + * obtaining ceil(b / t) groups + * + * Example: match on destination IP address, with t = 4: 32 bits, 8 groups + * of 4 bits each + * + * - allocate a lookup table with one column ("bucket") for each possible + * value of a group, and with one row for each group + * + * Example: 8 groups, 2^4 buckets: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 + * 1 + * 2 + * 3 + * 4 + * 5 + * 6 + * 7 + * + * - map the bits we want to classify for the current field, for a given + * entry, to a single rule for non-ranged and netmask set items, and to one + * or multiple rules for ranges. Ranges are expanded to composing netmasks + * by pipapo_expand(). + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048 + * - rule #0: 10.0.0.5 + * - rule #1: 192.168.1.0/24 + * - rule #2: 192.168.2.0/31 + * + * - insert references to the rules in the lookup table, selecting buckets + * according to bit values of a rule in the given group. This is done by + * pipapo_insert(). + * + * Example: given: + * - rule #0: 10.0.0.5 mapping to buckets + * < 0 10 0 0 0 0 0 5 > + * - rule #1: 192.168.1.0/24 mapping to buckets + * < 12 0 10 8 0 1 < 0..15 > < 0..15 > > + * - rule #2: 192.168.2.0/31 mapping to buckets + * < 12 0 10 8 0 2 0 < 0..1 > > + * + * these bits are set in the lookup table: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * - if this is not the last field in the set, fill a mapping array that maps + * rules from the lookup table to rules belonging to the same entry in + * the next lookup table, done by pipapo_map(). + * + * Note that as rules map to contiguous ranges of rules, given how netmask + * expansion and insertion is performed, &union nft_pipapo_map_bucket stores + * this information as pairs of first rule index, rule count. + * + * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048, + * given lookup table #0 for field 0 (see example above): + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * and lookup table #1 for field 1 with: + * - rule #0: 1024 mapping to buckets + * < 0 0 4 0 > + * - rule #1: 2048 mapping to buckets + * < 0 0 5 0 > + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024 + * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1 + * (rules #1, #2) to 2048 in lookup table #1 (rule #1): + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * - if this is the last field in the set, fill a mapping array that maps + * rules from the last lookup table to element pointers, also done by + * pipapo_map(). + * + * Note that, in this implementation, we have two elements (start, end) for + * each entry.
The pointer to the end element is stored in this array, and + * the pointer to the start element is linked from it. + * + * Example: entry 10.0.0.5:1024 has a corresponding &struct nft_pipapo_elem + * pointer, 0x66, and element for 192.168.1.0-192.168.2.1:2048 is at 0x42. + * From the rules of lookup table #1 as mapped above: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * + * Matching + * -------- + * + * We use a result bitmap, with the size of a single lookup table bucket, to + * represent the matching state that applies at every algorithm step. This is + * done by pipapo_lookup(). + * + * - For each packet field: + * + * - start with an all-ones result bitmap (res_map in pipapo_lookup()) + * + * - perform a lookup into the table corresponding to the current field, + * for each group, and at every group, AND the current result bitmap with + * the value from the lookup table bucket + * + * :: + * + * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from + * insertion examples. + * Lookup table buckets are at least 3 bits wide, we'll assume 8 bits for + * convenience in this example. Initial result bitmap is 0xff, the steps + * below show the value of the result bitmap after each group is processed: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6 + * + * 1 1,2 0 + * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6 + * + * 2 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6 + * + * 3 0 1,2 + * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6 + * + * 4 0,1,2 + * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6 + * + * 5 0 1 2 + * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2 + * + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2 + * + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2 + * + * - at the next field, start with a new, all-zeroes result bitmap. For each + * bit set in the previous result bitmap, fill the new result bitmap + * (fill_map in pipapo_lookup()) with the rule indices from the + * corresponding buckets of the mapping field for this field, done by + * pipapo_refill() + * + * Example: with mapping table from insertion examples, with the current + * result bitmap from the previous example, 0x02: + * + * :: + * + * rule indices in current field: 0 1 2 + * map to rules in next field: 0 1 1 + * + * the new result bitmap will be 0x02: rule 1 was set, and rule 1 will be + * set. + * + * We can now extend this example to cover the second iteration of the step + * above (lookup and AND bitmap): assuming the port field is + * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table + * for "port" field from pre-computation example: + * + * :: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0,1 + * 1 0,1 + * 2 0 1 + * 3 0,1 + * + * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5] + * & 0x3 [bucket 0], resulting bitmap is 0x2. + * + * - if this is the last field in the set, look up the value from the mapping + * array corresponding to the final result bitmap + * + * Example: 0x2 resulting bitmap from 192.168.1.5:2048, mapping array for + * last field from insertion example: + * + * :: + * + * rule indices in last field: 0 1 + * map to elements: 0x42 0x66 + * + * the matching element is at 0x42. 
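+ *
+ * Editor's note (illustrative addition, not part of the original patch):
+ * the lookup cost sketched above depends on the key width, not on the
+ * entry count: for the IPv4 address and port concatenation used in these
+ * examples (48 bits, 12 groups), a match always performs 12 bucket ANDs
+ * plus one refill per field; growing the set only widens the buckets
+ * being ANDed.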
+ * + * + * References + * ---------- + * + * [Ligatti 2010] + * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with + * Automatic Time-space Tradeoffs + * Jay Ligatti, Josh Kuhn, and Chris Gage. + * Proceedings of the IEEE International Conference on Computer + * Communication Networks (ICCCN), August 2010. + * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf + * + * [Rottenstreich 2010] + * Worst-Case TCAM Rule Expansion + * Ori Rottenstreich and Isaac Keslassy. + * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010. + * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf + * + * [Kogan 2014] + * SAX-PAC (Scalable And eXpressive PAcket Classification) + * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane, + * and Patrick Eugster. + * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014. + * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/log2.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <uapi/linux/netfilter/nf_tables.h> +#include <net/ipv6.h> /* For the maximum length of a field */ +#include <linux/bitmap.h> +#include <linux/bitops.h> + +/* Count of concatenated fields depends on count of 32-bit nftables registers */ +#define NFT_PIPAPO_MAX_FIELDS NFT_REG32_COUNT + +/* Largest supported field size */ +#define NFT_PIPAPO_MAX_BYTES (sizeof(struct in6_addr)) +#define NFT_PIPAPO_MAX_BITS (NFT_PIPAPO_MAX_BYTES * BITS_PER_BYTE) + +/* Number of bits to be grouped together in lookup table buckets, arbitrary */ +#define NFT_PIPAPO_GROUP_BITS 4 +#define NFT_PIPAPO_GROUPS_PER_BYTE (BITS_PER_BYTE / NFT_PIPAPO_GROUP_BITS) + +/* Fields are padded to 32 bits in input registers */ +#define NFT_PIPAPO_GROUPS_PADDED_SIZE(x) \ + (round_up((x) / NFT_PIPAPO_GROUPS_PER_BYTE, sizeof(u32))) +#define NFT_PIPAPO_GROUPS_PADDING(x) \ + (NFT_PIPAPO_GROUPS_PADDED_SIZE((x)) - (x) / NFT_PIPAPO_GROUPS_PER_BYTE) + +/* Number of buckets, given by 2 ^ n, with n grouped bits */ +#define NFT_PIPAPO_BUCKETS (1 << NFT_PIPAPO_GROUP_BITS) + +/* Each n-bit range maps to up to n * 2 rules */ +#define NFT_PIPAPO_MAP_NBITS (const_ilog2(NFT_PIPAPO_MAX_BITS * 2)) + +/* Use the rest of mapping table buckets for rule indices, but it makes no sense + * to exceed 32 bits + */ +#if BITS_PER_LONG == 64 +#define NFT_PIPAPO_MAP_TOBITS 32 +#else +#define NFT_PIPAPO_MAP_TOBITS (BITS_PER_LONG - NFT_PIPAPO_MAP_NBITS) +#endif + +/* ...which gives us the highest allowed index for a rule */ +#define NFT_PIPAPO_RULE0_MAX ((1UL << (NFT_PIPAPO_MAP_TOBITS - 1)) \ + - (1UL << NFT_PIPAPO_MAP_NBITS)) + +#define nft_pipapo_for_each_field(field, index, match) \ + for ((field) = (match)->f, (index) = 0; \ + (index) < (match)->field_count; \ + (index)++, (field)++) + +/** + * union nft_pipapo_map_bucket - Bucket of mapping table + * @to: First rule number (in next field) this rule maps to + * @n: Number of rules (in next field) this rule maps to + * @e: If there's no next field, pointer to element this rule maps to + */ +union nft_pipapo_map_bucket { + struct { +#if BITS_PER_LONG == 64 + static_assert(NFT_PIPAPO_MAP_TOBITS <= 32); + u32 to; + + static_assert(NFT_PIPAPO_MAP_NBITS <= 32); + u32 n; +#else + unsigned long to:NFT_PIPAPO_MAP_TOBITS; + unsigned long n:NFT_PIPAPO_MAP_NBITS; +#endif + }; + struct nft_pipapo_elem *e; +}; + 
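Editor's illustration, not part of the original patch: a minimal userspace sketch (standard C stand-ins for the kernel types, hypothetical names) showing that the (to, n) pair above packs into the same storage as the element pointer on a common 64-bit build:

#include <assert.h>
#include <stdint.h>

/* Userspace stand-in for union nft_pipapo_map_bucket, 64-bit build. */
union map_bucket {
	struct {
		uint32_t to;	/* first rule in the next field */
		uint32_t n;	/* how many next-field rules it maps to */
	};
	void *e;		/* element pointer, last field only */
};

int main(void)
{
	union map_bucket b = { .to = 2, .n = 3 };

	/* The pair shares its 8 bytes with the pointer member, so the
	 * union adds no per-rule overhead on LP64 targets.
	 */
	assert(sizeof(b) == sizeof(void *));

	/* This bucket maps a rule to rules 2, 3 and 4 of the next field. */
	assert(b.to + b.n - 1 == 4);
	return 0;
}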
+/** + * struct nft_pipapo_field - Lookup, mapping tables and related data for a field + * @groups: Amount of 4-bit groups + * @rules: Number of inserted rules + * @bsize: Size of each bucket in lookup table, in longs + * @lt: Lookup table: 'groups' rows of NFT_PIPAPO_BUCKETS buckets + * @mt: Mapping table: one bucket per rule + */ +struct nft_pipapo_field { + int groups; + unsigned long rules; + size_t bsize; + unsigned long *lt; + union nft_pipapo_map_bucket *mt; +}; + +/** + * struct nft_pipapo_match - Data used for lookup and matching + * @field_count: Amount of fields in set + * @scratch: Preallocated per-CPU maps for partial matching results + * @bsize_max: Maximum lookup table bucket size of all fields, in longs + * @rcu: Matching data is swapped on commits + * @f: Fields, with lookup and mapping tables + */ +struct nft_pipapo_match { + int field_count; + unsigned long * __percpu *scratch; + size_t bsize_max; + struct rcu_head rcu; + struct nft_pipapo_field f[0]; +}; + +/* Current working bitmap index, toggled between field matches */ +static DEFINE_PER_CPU(bool, nft_pipapo_scratch_index); + +/** + * struct nft_pipapo - Representation of a set + * @match: Currently in-use matching data + * @clone: Copy where pending insertions and deletions are kept + * @groups: Total amount of 4-bit groups for fields in this set + * @width: Total bytes to be matched for one packet, including padding + * @dirty: Working copy has pending insertions or deletions + * @last_gc: Timestamp of last garbage collection run, jiffies + */ +struct nft_pipapo { + struct nft_pipapo_match __rcu *match; + struct nft_pipapo_match *clone; + int groups; + int width; + bool dirty; + unsigned long last_gc; +}; + +struct nft_pipapo_elem; + +/** + * struct nft_pipapo_elem - API-facing representation of single set element + * @ext: nftables API extensions + */ +struct nft_pipapo_elem { + struct nft_set_ext ext; +}; + +/** + * pipapo_refill() - For each set bit, set bits from selected mapping table item + * @map: Bitmap to be scanned for set bits + * @len: Length of bitmap in longs + * @rules: Number of rules in field + * @dst: Destination bitmap + * @mt: Mapping table containing bit set specifiers + * @match_only: Find a single bit and return, don't fill + * + * Iteration over set bits with __builtin_ctzl(): Daniel Lemire, public domain. + * + * For each bit set in map, select the bucket from mapping table with index + * corresponding to the position of the bit set. Use start bit and amount of + * bits specified in bucket to fill region in dst. + * + * Return: -1 on no match, bit position on 'match_only', 0 otherwise.
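+ *
+ * Example (editor's addition, hypothetical values, not part of the original
+ * patch): with only bit 1 set in @map and mt[1] = { .to = 2, .n = 3 }, a
+ * call with @match_only false sets bits 2-4 in @dst and returns 0, while
+ * the same call with @match_only true returns 1 without touching @dst.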
+ */ +static int pipapo_refill(unsigned long *map, int len, int rules, + unsigned long *dst, union nft_pipapo_map_bucket *mt, + bool match_only) +{ + unsigned long bitset; + int k, ret = -1; + + for (k = 0; k < len; k++) { + bitset = map[k]; + while (bitset) { + unsigned long t = bitset & -bitset; + int r = __builtin_ctzl(bitset); + int i = k * BITS_PER_LONG + r; + + if (unlikely(i >= rules)) { + map[k] = 0; + return -1; + } + + if (unlikely(match_only)) { + bitmap_clear(map, i, 1); + return i; + } + + ret = 0; + + bitmap_set(dst, mt[i].to, mt[i].n); + + bitset ^= t; + } + map[k] = 0; + } + + return ret; +} + +/** + * nft_pipapo_lookup() - Lookup function + * @net: Network namespace + * @set: nftables API set representation + * @key: Key data to be matched against existing elements + * @ext: nftables API extension pointer, filled with matching reference + * + * For more details, see DOC: Theory of Operation. + * + * Return: true on match, false otherwise. + */ +static bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext) +{ + struct nft_pipapo *priv = nft_set_priv(set); + unsigned long *res_map, *fill_map; + u8 genmask = nft_genmask_cur(net); + const u8 *rp = (const u8 *)key; + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + bool map_index; + int i; + + local_bh_disable(); + + map_index = raw_cpu_read(nft_pipapo_scratch_index); + + m = rcu_dereference(priv->match); + + if (unlikely(!m || !*raw_cpu_ptr(m->scratch))) + goto out; + + res_map = *raw_cpu_ptr(m->scratch) + (map_index ? m->bsize_max : 0); + fill_map = *raw_cpu_ptr(m->scratch) + (map_index ? 0 : m->bsize_max); + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group += 2) { + u8 v; + + v = *rp >> 4; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + + v = *rp & 0x0f; + rp++; + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) { + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return false; + } + + if (last) { + *ext = &f->mt[b].e->ext; + if (unlikely(nft_set_elem_expired(*ext) || + !nft_set_elem_active(*ext, genmask))) + goto next_match; + + /* Last field: we're just returning the key without + * filling the initial bitmap for the next field, so the + * current inactive bitmap is clean and can be reused as + * *next* bitmap (not initial) for the next packet. + */ + raw_cpu_write(nft_pipapo_scratch_index, map_index); + local_bh_enable(); + + return true; + } + + /* Swap bitmap indices: res_map is the initial bitmap for the + * next field, and fill_map is guaranteed to be all-zeroes at + * this point.
+ */ + map_index = !map_index; + swap(res_map, fill_map); + + rp += NFT_PIPAPO_GROUPS_PADDING(f->groups); + } + +out: + local_bh_enable(); + return false; +} + +/** + * pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * + * This is essentially the same as the lookup function, except that it matches + * key data against the uncommitted copy and doesn't use preallocated maps for + * bitmap results. + * + * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + */ +static struct nft_pipapo_elem *pipapo_get(const struct net *net, + const struct nft_set *set, + const u8 *data, u8 genmask) +{ + struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT); + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + unsigned long *res_map, *fill_map = NULL; + struct nft_pipapo_field *f; + int i; + + res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!res_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + fill_map = kcalloc(m->bsize_max, sizeof(*res_map), GFP_ATOMIC); + if (!fill_map) { + ret = ERR_PTR(-ENOMEM); + goto out; + } + + memset(res_map, 0xff, m->bsize_max * sizeof(*res_map)); + + nft_pipapo_for_each_field(f, i, m) { + bool last = i == m->field_count - 1; + unsigned long *lt = f->lt; + int b, group; + + /* For each 4-bit group: select lookup table bucket depending on + * packet bytes value, then AND bucket value + */ + for (group = 0; group < f->groups; group++) { + u8 v; + + if (group % 2) { + v = *data & 0x0f; + data++; + } else { + v = *data >> 4; + } + __bitmap_and(res_map, res_map, lt + v * f->bsize, + f->bsize * BITS_PER_LONG); + + lt += f->bsize * NFT_PIPAPO_BUCKETS; + } + + /* Now populate the bitmap for the next field, unless this is + * the last field, in which case return the matched 'ext' + * pointer if any. + * + * Now res_map contains the matching bitmap, and fill_map is the + * bitmap for the next field. + */ +next_match: + b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt, + last); + if (b < 0) + goto out; + + if (last) { + if (nft_set_elem_expired(&f->mt[b].e->ext) || + (genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + + ret = f->mt[b].e; + goto out; + } + + data += NFT_PIPAPO_GROUPS_PADDING(f->groups); + + /* Swap bitmap indices: fill_map will be the initial bitmap for + * the next field (i.e. the new res_map), and res_map is + * guaranteed to be all-zeroes at this point, ready to be filled + * according to the next mapping table. 
+ */ + swap(res_map, fill_map); + } + +out: + kfree(fill_map); + kfree(res_map); + return ret; +} + +/** + * nft_pipapo_get() - Get matching element reference given key data + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @flags: Unused + * + * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise. + */ +static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) +{ + return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); +} + +/** + * pipapo_resize() - Resize lookup or mapping table, or both + * @f: Field containing lookup and mapping tables + * @old_rules: Previous amount of rules in field + * @rules: New amount of rules + * + * Increase, decrease or maintain tables size depending on new amount of rules, + * and copy data over. In case the new size is smaller, throw away data for + * highest-numbered rules. + * + * Return: 0 on success, -ENOMEM on allocation failure. + */ +static int pipapo_resize(struct nft_pipapo_field *f, int old_rules, int rules) +{ + long *new_lt = NULL, *new_p, *old_lt = f->lt, *old_p; + union nft_pipapo_map_bucket *new_mt, *old_mt = f->mt; + size_t new_bucket_size, copy; + int group, bucket; + + new_bucket_size = DIV_ROUND_UP(rules, BITS_PER_LONG); + + if (new_bucket_size == f->bsize) + goto mt; + + if (new_bucket_size > f->bsize) + copy = f->bsize; + else + copy = new_bucket_size; + + new_lt = kvzalloc(f->groups * NFT_PIPAPO_BUCKETS * new_bucket_size * + sizeof(*new_lt), GFP_KERNEL); + if (!new_lt) + return -ENOMEM; + + new_p = new_lt; + old_p = old_lt; + for (group = 0; group < f->groups; group++) { + for (bucket = 0; bucket < NFT_PIPAPO_BUCKETS; bucket++) { + memcpy(new_p, old_p, copy * sizeof(*new_p)); + new_p += copy; + old_p += copy; + + if (new_bucket_size > f->bsize) + new_p += new_bucket_size - f->bsize; + else + old_p += f->bsize - new_bucket_size; + } + } + +mt: + new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL); + if (!new_mt) { + kvfree(new_lt); + return -ENOMEM; + } + + memcpy(new_mt, f->mt, min(old_rules, rules) * sizeof(*new_mt)); + if (rules > old_rules) { + memset(new_mt + old_rules, 0, + (rules - old_rules) * sizeof(*new_mt)); + } + + if (new_lt) { + f->bsize = new_bucket_size; + f->lt = new_lt; + kvfree(old_lt); + } + + f->mt = new_mt; + kvfree(old_mt); + + return 0; +} + +/** + * pipapo_bucket_set() - Set rule bit in bucket given group and group value + * @f: Field containing lookup table + * @rule: Rule index + * @group: Group index + * @v: Value of bit group + */ +static void pipapo_bucket_set(struct nft_pipapo_field *f, int rule, int group, + int v) +{ + unsigned long *pos; + + pos = f->lt + f->bsize * NFT_PIPAPO_BUCKETS * group; + pos += f->bsize * v; + + __set_bit(rule, pos); +} + +/** + * pipapo_insert() - Insert new rule in field given input key and mask length + * @f: Field containing lookup table + * @k: Input key for classification, without nftables padding + * @mask_bits: Length of mask; matches field length for non-ranged entry + * + * Insert a new rule reference in lookup buckets corresponding to k and + * mask_bits. + * + * Return: 1 on success (one rule inserted), negative error code on failure.
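+ *
+ * Example (editor's addition, not part of the original patch): inserting
+ * rule #1 from the DOC examples, 192.168.1.0/24, means k = { 0xc0, 0xa8,
+ * 0x01, 0x00 } and @mask_bits 24: groups 0-5 are unmasked and get a single
+ * bucket bit each, groups 6 and 7 are fully masked and get all
+ * NFT_PIPAPO_BUCKETS bits, giving the < 12 0 10 8 0 1 < 0..15 > < 0..15 > >
+ * row shown in the overview.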
+ */ +static int pipapo_insert(struct nft_pipapo_field *f, const uint8_t *k, + int mask_bits) +{ + int rule = f->rules++, group, ret; + + ret = pipapo_resize(f, f->rules - 1, f->rules); + if (ret) + return ret; + + for (group = 0; group < f->groups; group++) { + int i, v; + u8 mask; + + if (group % 2) + v = k[group / 2] & 0x0f; + else + v = k[group / 2] >> 4; + + if (mask_bits >= (group + 1) * 4) { + /* Not masked */ + pipapo_bucket_set(f, rule, group, v); + } else if (mask_bits <= group * 4) { + /* Completely masked */ + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) + pipapo_bucket_set(f, rule, group, i); + } else { + /* The mask limit falls on this group */ + mask = 0x0f >> (mask_bits - group * 4); + for (i = 0; i < NFT_PIPAPO_BUCKETS; i++) { + if ((i & ~mask) == (v & ~mask)) + pipapo_bucket_set(f, rule, group, i); + } + } + } + + return 1; +} + +/** + * pipapo_step_diff() - Check if setting @step bit in netmask would change it + * @base: Mask we are expanding + * @step: Step bit for given expansion step + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if step bit changes mask (i.e. isn't set), false otherwise. + */ +static bool pipapo_step_diff(u8 *base, int step, int len) +{ + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + return !(BIT(step % BITS_PER_BYTE) & base[step / BITS_PER_BYTE]); +#else + return !(BIT(step % BITS_PER_BYTE) & + base[len - 1 - step / BITS_PER_BYTE]); +#endif +} + +/** + * pipapo_step_after_end() - Check if mask exceeds range end with given step + * @base: Mask we are expanding + * @end: End of range + * @step: Step bit for given expansion step, highest bit to be set + * @len: Total length of mask space (set and unset bits), bytes + * + * Convenience function for mask expansion. + * + * Return: true if mask exceeds range setting step bits, false otherwise. + */ +static bool pipapo_step_after_end(const u8 *base, const u8 *end, int step, + int len) +{ + u8 tmp[NFT_PIPAPO_MAX_BYTES]; + int i; + + memcpy(tmp, base, len); + + /* Network order, byte-addressed */ + for (i = 0; i <= step; i++) +#ifdef __BIG_ENDIAN__ + tmp[i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#else + tmp[len - 1 - i / BITS_PER_BYTE] |= BIT(i % BITS_PER_BYTE); +#endif + + return memcmp(tmp, end, len) > 0; +} + +/** + * pipapo_base_sum() - Sum step bit to given len-sized netmask base with carry + * @base: Netmask base + * @step: Step bit to sum + * @len: Netmask length, bytes + */ +static void pipapo_base_sum(u8 *base, int step, int len) +{ + bool carry = false; + int i; + + /* Network order, byte-addressed */ +#ifdef __BIG_ENDIAN__ + for (i = step / BITS_PER_BYTE; i < len; i++) { +#else + for (i = len - 1 - step / BITS_PER_BYTE; i >= 0; i--) { +#endif + if (carry) + base[i]++; + else + base[i] += 1 << (step % BITS_PER_BYTE); + + if (base[i]) + break; + + carry = true; + } +} + +/** + * pipapo_expand() - Expand to composing netmasks, insert into lookup table + * @f: Field containing lookup table + * @start: Start of range + * @end: End of range + * @len: Length of value in bits + * + * Expand range to composing netmasks and insert corresponding rule references + * in lookup buckets. + * + * Return: number of inserted rules on success, negative error code on failure. 
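+ *
+ * Example (editor's addition, not part of the original patch): the range
+ * 192.168.1.0-192.168.2.1 from the DOC examples expands to the two
+ * composing netmasks 192.168.1.0/24 and 192.168.2.0/31, so this function
+ * calls pipapo_insert() twice and returns 2.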
+ */ +static int pipapo_expand(struct nft_pipapo_field *f, + const u8 *start, const u8 *end, int len) +{ + int step, masks = 0, bytes = DIV_ROUND_UP(len, BITS_PER_BYTE); + u8 base[NFT_PIPAPO_MAX_BYTES]; + + memcpy(base, start, bytes); + while (memcmp(base, end, bytes) <= 0) { + int err; + + step = 0; + while (pipapo_step_diff(base, step, bytes)) { + if (pipapo_step_after_end(base, end, step, bytes)) + break; + + step++; + if (step >= len) { + if (!masks) { + pipapo_insert(f, base, 0); + masks = 1; + } + goto out; + } + } + + err = pipapo_insert(f, base, len - step); + + if (err < 0) + return err; + + masks++; + pipapo_base_sum(base, step, bytes); + } +out: + return masks; +} + +/** + * pipapo_map() - Insert rules in mapping tables, mapping them between fields + * @m: Matching data, including mapping table + * @map: Table of rule maps: array of first rule and amount of rules + * in next field a given rule maps to, for each field + * @e: For last field, pointer to element the matching rules map to + */ +static void pipapo_map(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket map[NFT_PIPAPO_MAX_FIELDS], + struct nft_pipapo_elem *e) +{ + struct nft_pipapo_field *f; + int i, j; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) { + for (j = 0; j < map[i].n; j++) { + f->mt[map[i].to + j].to = map[i + 1].to; + f->mt[map[i].to + j].n = map[i + 1].n; + } + } + + /* Last field: map to ext instead of mapping to next field */ + for (j = 0; j < map[i].n; j++) + f->mt[map[i].to + j].e = e; +} + +/** + * pipapo_realloc_scratch() - Reallocate scratch maps for partial match results + * @clone: Copy of matching data with pending insertions and deletions + * @bsize_max: Maximum bucket size, scratch maps cover two buckets + * + * Return: 0 on success, -ENOMEM on failure. + */ +static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, + unsigned long bsize_max) +{ + int i; + + for_each_possible_cpu(i) { + unsigned long *scratch; + + scratch = kzalloc_node(bsize_max * sizeof(*scratch) * 2, + GFP_KERNEL, cpu_to_node(i)); + if (!scratch) { + /* On failure, there's no need to undo previous + * allocations: this means that some scratch maps have + * a bigger allocated size now (this is only called on + * insertion), but the extra space won't be used by any + * CPU as new elements are not inserted and m->bsize_max + * is not updated. + */ + return -ENOMEM; + } + + kfree(*per_cpu_ptr(clone->scratch, i)); + + *per_cpu_ptr(clone->scratch, i) = scratch; + } + + return 0; +} + +/** + * nft_pipapo_insert() - Validate and insert ranged elements + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * @ext2: Filled with pointer to &struct nft_set_ext in inserted element + * + * Return: 0 on success, negative error code on failure.
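+ *
+ * Editor's note (not part of the original patch): a ranged entry reaches
+ * this function as a start key in @elem and an end key in the
+ * NFT_SET_EXT_KEY_END extension, and both are checked against existing
+ * elements before any lookup or mapping table is grown.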
+ */ +static int nft_pipapo_insert(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, + struct nft_set_ext **ext2) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *start = (const u8 *)elem->key.val.data, *end; + struct nft_pipapo_elem *e = elem->priv, *dup; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + u8 genmask = nft_genmask_next(net); + struct nft_pipapo_field *f; + int i, bsize_max, err = 0; + + dup = pipapo_get(net, set, start, genmask); + if (PTR_ERR(dup) == -ENOENT) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) { + end = (const u8 *)nft_set_ext_key_end(ext)->data; + dup = pipapo_get(net, set, end, nft_genmask_next(net)); + } else { + end = start; + } + } + + if (PTR_ERR(dup) != -ENOENT) { + if (IS_ERR(dup)) + return PTR_ERR(dup); + *ext2 = &dup->ext; + return -EEXIST; + } + + /* Validate */ + nft_pipapo_for_each_field(f, i, m) { + const u8 *start_p = start, *end_p = end; + + if (f->rules >= (unsigned long)NFT_PIPAPO_RULE0_MAX) + return -ENOSPC; + + if (memcmp(start_p, end_p, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) > 0) + return -EINVAL; + + start_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end_p += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + /* Insert */ + priv->dirty = true; + + bsize_max = m->bsize_max; + + nft_pipapo_for_each_field(f, i, m) { + int ret; + + rulemap[i].to = f->rules; + + ret = memcmp(start, end, + f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); + if (!ret) { + ret = pipapo_insert(f, start, + f->groups * NFT_PIPAPO_GROUP_BITS); + } else { + ret = pipapo_expand(f, start, end, + f->groups * NFT_PIPAPO_GROUP_BITS); + } + + if (f->bsize > bsize_max) + bsize_max = f->bsize; + + rulemap[i].n = ret; + + start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (!*this_cpu_ptr(m->scratch) || bsize_max > m->bsize_max) { + err = pipapo_realloc_scratch(m, bsize_max); + if (err) + return err; + + this_cpu_write(nft_pipapo_scratch_index, false); + + m->bsize_max = bsize_max; + } + + *ext2 = &e->ext; + + pipapo_map(m, rulemap, e); + + return 0; +} + +/** + * pipapo_clone() - Clone matching data to create new working copy + * @old: Existing matching data + * + * Return: copy of matching data passed as 'old', error pointer on failure + */ +static struct nft_pipapo_match *pipapo_clone(struct nft_pipapo_match *old) +{ + struct nft_pipapo_field *dst, *src; + struct nft_pipapo_match *new; + int i; + + new = kmalloc(sizeof(*new) + sizeof(*dst) * old->field_count, + GFP_KERNEL); + if (!new) + return ERR_PTR(-ENOMEM); + + new->field_count = old->field_count; + new->bsize_max = old->bsize_max; + + new->scratch = alloc_percpu(*new->scratch); + if (!new->scratch) + goto out_scratch; + + rcu_head_init(&new->rcu); + + src = old->f; + dst = new->f; + + for (i = 0; i < old->field_count; i++) { + memcpy(dst, src, offsetof(struct nft_pipapo_field, lt)); + + dst->lt = kvzalloc(src->groups * NFT_PIPAPO_BUCKETS * + src->bsize * sizeof(*dst->lt), + GFP_KERNEL); + if (!dst->lt) + goto out_lt; + + memcpy(dst->lt, src->lt, + src->bsize * sizeof(*dst->lt) * + src->groups * NFT_PIPAPO_BUCKETS); + + dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL); + if (!dst->mt) + goto out_mt; + + memcpy(dst->mt, src->mt, src->rules * sizeof(*src->mt)); + src++; + dst++; + } + + return new; + +out_mt: + kvfree(dst->lt); +out_lt: + for (dst--; i > 0; i--) { + 
kvfree(dst->mt); + kvfree(dst->lt); + dst--; + } + free_percpu(new->scratch); +out_scratch: + kfree(new); + + return ERR_PTR(-ENOMEM); +} + +/** + * pipapo_rules_same_key() - Get number of rules originated from the same entry + * @f: Field containing mapping table + * @first: Index of first rule in set of rules mapping to same entry + * + * Using the fact that all rules in a field that originated from the same entry + * will map to the same set of rules in the next field, or to the same element + * reference, return the cardinality of the set of rules that originated from + * the same entry as the rule with index @first, @first rule included. + * + * In pictures: + * rules + * field #0 0 1 2 3 4 + * map to: 0 1 2-4 2-4 5-9 + * . . ....... . ... + * | | | | \ \ + * | | | | \ \ + * | | | | \ \ + * ' ' ' ' ' \ + * in field #1 0 1 2 3 4 5 ... + * + * if this is called for rule 2 on field #0, it will return 2, as rules 2 + * and 3 in field 0 map to the same set of rules (2, 3, 4) in the next field. + * + * For the last field in a set, we can rely on associated entries to map to the + * same element references. + * + * Return: Number of rules that originated from the same entry as @first. + */ +static int pipapo_rules_same_key(struct nft_pipapo_field *f, int first) +{ + struct nft_pipapo_elem *e = NULL; /* Keep gcc happy */ + int r; + + for (r = first; r < f->rules; r++) { + if (r != first && e != f->mt[r].e) + return r - first; + + e = f->mt[r].e; + } + + if (r != first) + return r - first; + + return 0; +} + +/** + * pipapo_unmap() - Remove rules from mapping tables, renumber remaining ones + * @mt: Mapping array + * @rules: Original amount of rules in mapping table + * @start: First rule index to be removed + * @n: Amount of rules to be removed + * @to_offset: First rule index, in next field, this group of rules maps to + * @is_last: If this is the last field, delete reference from mapping array + * + * This is used to unmap rules from the mapping table for a single field, + * maintaining consistency and compactness for the existing ones. + * + * In pictures: let's assume that we want to delete rules 2 and 3 from the + * following mapping array: + * + * rules + * 0 1 2 3 4 + * map to: 4-10 4-10 11-15 11-15 16-18 + * + * the result will be: + * + * rules + * 0 1 2 + * map to: 4-10 4-10 11-13 + * + * for fields before the last one. In case this is the mapping table for the + * last field in a set, and rules map to pointers to &struct nft_pipapo_elem: + * + * rules + * 0 1 2 3 4 + * element pointers: 0x42 0x42 0x33 0x33 0x44 + * + * the result will be: + * + * rules + * 0 1 2 + * element pointers: 0x42 0x42 0x44 + */ +static void pipapo_unmap(union nft_pipapo_map_bucket *mt, int rules, + int start, int n, int to_offset, bool is_last) +{ + int i; + + memmove(mt + start, mt + start + n, (rules - start - n) * sizeof(*mt)); + memset(mt + rules - n, 0, n * sizeof(*mt)); + + if (is_last) + return; + + for (i = start; i < rules - n; i++) + mt[i].to -= to_offset; +} + +/** + * pipapo_drop() - Delete entry from lookup and mapping tables, given rule map + * @m: Matching data + * @rulemap: Table of rule maps, arrays of first rule and amount of rules + * in next field a given entry maps to, for each field + * + * For each rule in lookup table buckets mapping to this set of rules, drop + * all bits set in lookup table mapping.
In pictures, assuming we want to drop + * rules 0 and 1 from this lookup table: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 1,2 + * 1 1,2 0 + * 2 0 1,2 + * 3 0 1,2 + * 4 0,1,2 + * 5 0 1 2 + * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1 + * + * rule 2 becomes rule 0, and the result will be: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 0 + * 1 0 + * 2 0 + * 3 0 + * 4 0 + * 5 0 + * 6 0 + * 7 0 0 + * + * once this is done, call unmap() to drop all the corresponding rule references + * from mapping tables. + */ +static void pipapo_drop(struct nft_pipapo_match *m, + union nft_pipapo_map_bucket rulemap[]) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + int g; + + for (g = 0; g < f->groups; g++) { + unsigned long *pos; + int b; + + pos = f->lt + g * NFT_PIPAPO_BUCKETS * f->bsize; + + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + bitmap_cut(pos, pos, rulemap[i].to, + rulemap[i].n, + f->bsize * BITS_PER_LONG); + + pos += f->bsize; + } + } + + pipapo_unmap(f->mt, f->rules, rulemap[i].to, rulemap[i].n, + rulemap[i + 1].n, i == m->field_count - 1); + if (pipapo_resize(f, f->rules, f->rules - rulemap[i].n)) { + /* We can ignore this, a failure to shrink tables down + * doesn't make tables invalid. + */ + ; + } + f->rules -= rulemap[i].n; + } +} + +/** + * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * @set: nftables API set representation + * @m: Matching data + */ +static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) +{ + struct nft_pipapo *priv = nft_set_priv(set); + int rules_f0, first_rule = 0; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + struct nft_pipapo_field *f; + struct nft_pipapo_elem *e; + int i, start, rules_fx; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + if (i < m->field_count - 1) { + rules_fx = f->mt[start].n; + start = f->mt[start].to; + } + } + + /* Pick the last field, and its last index */ + f--; + i--; + e = f->mt[rulemap[i].to].e; + if (nft_set_elem_expired(&e->ext) && + !nft_set_elem_mark_busy(&e->ext)) { + priv->dirty = true; + pipapo_drop(m, rulemap); + + rcu_barrier(); + nft_set_elem_destroy(set, e, true); + + /* And check again current first rule, which is now the + * first we haven't checked. 
+ */ + } else { + first_rule += rules_f0; + } + } + + priv->last_gc = jiffies; +} + +/** + * pipapo_free_fields() - Free per-field tables contained in matching data + * @m: Matching data + */ +static void pipapo_free_fields(struct nft_pipapo_match *m) +{ + struct nft_pipapo_field *f; + int i; + + nft_pipapo_for_each_field(f, i, m) { + kvfree(f->lt); + kvfree(f->mt); + } +} + +/** + * pipapo_reclaim_match - RCU callback to free fields from old matching data + * @rcu: RCU head + */ +static void pipapo_reclaim_match(struct rcu_head *rcu) +{ + struct nft_pipapo_match *m; + int i; + + m = container_of(rcu, struct nft_pipapo_match, rcu); + + for_each_possible_cpu(i) + kfree(*per_cpu_ptr(m->scratch, i)); + + free_percpu(m->scratch); + + pipapo_free_fields(m); + + kfree(m); +} + +/** + * pipapo_commit() - Replace lookup data with current working copy + * @set: nftables API set representation + * + * While at it, check if we should perform garbage collection on the working + * copy before committing it for lookup, and don't replace the table if the + * working copy doesn't have pending changes. + * + * We also need to create a new working copy for subsequent insertions and + * deletions. + */ +static void pipapo_commit(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *old; + + if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) + pipapo_gc(set, priv->clone); + + if (!priv->dirty) + return; + + new_clone = pipapo_clone(priv->clone); + if (IS_ERR(new_clone)) + return; + + priv->dirty = false; + + old = rcu_access_pointer(priv->match); + rcu_assign_pointer(priv->match, priv->clone); + if (old) + call_rcu(&old->rcu, pipapo_reclaim_match); + + priv->clone = new_clone; +} + +/** + * nft_pipapo_activate() - Mark element reference as active given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * On insertion, elements are added to a copy of the matching data currently + * in use for lookups, and not directly inserted into current lookup data, so + * we'll take care of that by calling pipapo_commit() here. Both + * nft_pipapo_insert() and nft_pipapo_activate() are called once for each + * element, hence we can't purpose either one as a real commit operation. + */ +static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); + if (IS_ERR(e)) + return; + + nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); + + pipapo_commit(set); +} + +/** + * pipapo_deactivate() - Check that element is in set, mark as inactive + * @net: Network namespace + * @set: nftables API set representation + * @data: Input key data + * @ext: nftables API extension pointer, used to check for end element + * + * This is a convenience function that can be called from both + * nft_pipapo_deactivate() and nft_pipapo_flush(), as they are in fact the same + * operation. + * + * Return: deactivated element if found, NULL otherwise. 
+ */ +static void *pipapo_deactivate(const struct net *net, const struct nft_set *set, + const u8 *data, const struct nft_set_ext *ext) +{ + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, nft_genmask_next(net)); + if (IS_ERR(e)) + return NULL; + + nft_set_elem_change_active(net, set, &e->ext); + + return e; +} + +/** + * nft_pipapo_deactivate() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Return: deactivated element if found, NULL otherwise. + */ +static void *nft_pipapo_deactivate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + + return pipapo_deactivate(net, set, (const u8 *)elem->key.val.data, ext); +} + +/** + * nft_pipapo_flush() - Call pipapo_deactivate() to make element inactive + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * This is functionally the same as nft_pipapo_deactivate(), with a slightly + * different interface, and it's also called once for each element in a set + * being flushed, so we can't implement, strictly speaking, a flush operation, + * which would otherwise be as simple as allocating an empty copy of the + * matching data. + * + * Note that we could in theory do that, mark the set as flushed, and ignore + * subsequent calls, but we would leak all the elements after the first one, + * because they wouldn't then be freed as result of API calls. + * + * Return: true if element was found and deactivated. + */ +static bool nft_pipapo_flush(const struct net *net, const struct nft_set *set, + void *elem) +{ + struct nft_pipapo_elem *e = elem; + + return pipapo_deactivate(net, set, (const u8 *)nft_set_ext_key(&e->ext), + &e->ext); +} + +/** + * pipapo_get_boundaries() - Get byte interval for associated rules + * @f: Field including lookup table + * @first_rule: First rule (lowest index) + * @rule_count: Number of associated rules + * @left: Byte expression for left boundary (start of range) + * @right: Byte expression for right boundary (end of range) + * + * Given the first rule and amount of rules that originated from the same entry, + * build the original range associated with the entry, and calculate the length + * of the originating netmask. + * + * In pictures: + * + * bucket + * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + * 0 1,2 + * 1 1,2 + * 2 1,2 + * 3 1,2 + * 4 1,2 + * 5 1 2 + * 6 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * 7 1,2 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 + * + * this is the lookup table corresponding to the IPv4 range + * 192.168.1.0-192.168.2.1, which was expanded to the two composing netmasks, + * rule #1: 192.168.1.0/24, and rule #2: 192.168.2.0/31. + * + * This function fills @left and @right with the byte values of the leftmost + * and rightmost bucket indices for the lowest and highest rule indices, + * respectively. If @first_rule is 1 and @rule_count is 2, we obtain, in + * nibbles: + * left: < 12, 0, 10, 8, 0, 1, 0, 0 > + * right: < 12, 0, 10, 8, 0, 2, 2, 1 > + * corresponding to bytes: + * left: < 192, 168, 1, 0 > + * right: < 192, 168, 2, 1 > + * with mask length irrelevant here, unused on return, as the range is already + * defined by its start and end points. 
The mask length is relevant for a single + * ranged entry instead: if @first_rule is 1 and @rule_count is 1, we ignore + * rule 2 above: @left becomes < 192, 168, 1, 0 >, @right becomes + * < 192, 168, 1, 255 >, and the mask length, calculated from the distances + * between leftmost and rightmost bucket indices for each group, would be 24. + * + * Return: mask length, in bits. + */ +static int pipapo_get_boundaries(struct nft_pipapo_field *f, int first_rule, + int rule_count, u8 *left, u8 *right) +{ + u8 *l = left, *r = right; + int g, mask_len = 0; + + for (g = 0; g < f->groups; g++) { + int b, x0, x1; + + x0 = -1; + x1 = -1; + for (b = 0; b < NFT_PIPAPO_BUCKETS; b++) { + unsigned long *pos; + + pos = f->lt + (g * NFT_PIPAPO_BUCKETS + b) * f->bsize; + if (test_bit(first_rule, pos) && x0 == -1) + x0 = b; + if (test_bit(first_rule + rule_count - 1, pos)) + x1 = b; + } + + if (g % 2) { + *(l++) |= x0 & 0x0f; + *(r++) |= x1 & 0x0f; + } else { + *l |= x0 << 4; + *r |= x1 << 4; + } + + if (x1 - x0 == 0) + mask_len += 4; + else if (x1 - x0 == 1) + mask_len += 3; + else if (x1 - x0 == 3) + mask_len += 2; + else if (x1 - x0 == 7) + mask_len += 1; + } + + return mask_len; +} + +/** + * pipapo_match_field() - Match rules against byte ranges + * @f: Field including the lookup table + * @first_rule: First of associated rules originating from same entry + * @rule_count: Amount of associated rules + * @start: Start of range to be matched + * @end: End of range to be matched + * + * Return: true on match, false otherwise. + */ +static bool pipapo_match_field(struct nft_pipapo_field *f, + int first_rule, int rule_count, + const u8 *start, const u8 *end) +{ + u8 right[NFT_PIPAPO_MAX_BYTES] = { 0 }; + u8 left[NFT_PIPAPO_MAX_BYTES] = { 0 }; + + pipapo_get_boundaries(f, first_rule, rule_count, left, right); + + return !memcmp(start, left, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE) && + !memcmp(end, right, f->groups / NFT_PIPAPO_GROUPS_PER_BYTE); +} + +/** + * nft_pipapo_remove() - Remove element given key, commit + * @net: Network namespace + * @set: nftables API set representation + * @elem: nftables API element representation containing key data + * + * Similarly to nft_pipapo_activate(), this is used as commit operation by the + * API, but it's called once per element in the pending transaction, so we can't + * implement this as a single commit operation. Closest we can get is to remove + * the matched element here, if any, and commit the updated matching data. 
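+ *
+ * Editor's note (not part of the original patch): the loop below walks
+ * groups of rules that originated from one entry, as returned by
+ * pipapo_rules_same_key(), checks each group against the start/end keys
+ * with pipapo_match_field(), and drops and commits only the group that
+ * matches across all fields.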
+ */ +static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem) +{ + const u8 *data = (const u8 *)elem->key.val.data; + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m = priv->clone; + int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; + + e = pipapo_get(net, set, data, 0); + if (IS_ERR(e)) + return; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *match_start, *match_end; + struct nft_pipapo_field *f; + int i, start, rules_fx; + + match_start = data; + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + start = first_rule; + rules_fx = rules_f0; + + nft_pipapo_for_each_field(f, i, m) { + if (!pipapo_match_field(f, start, rules_fx, + match_start, match_end)) + break; + + rulemap[i].to = start; + rulemap[i].n = rules_fx; + + rules_fx = f->mt[start].n; + start = f->mt[start].to; + + match_start += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + match_end += NFT_PIPAPO_GROUPS_PADDED_SIZE(f->groups); + } + + if (i == m->field_count) { + priv->dirty = true; + pipapo_drop(m, rulemap); + pipapo_commit(set); + return; + } + + first_rule += rules_f0; + } +} + +/** + * nft_pipapo_walk() - Walk over elements + * @ctx: nftables API context + * @set: nftables API set representation + * @iter: Iterator + * + * As elements are referenced in the mapping array for the last field, directly + * scan that array: there's no need to follow rule mappings from the first + * field. + */ +static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r; + + rcu_read_lock(); + m = rcu_dereference(priv->match); + + if (unlikely(!m)) + goto out; + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + struct nft_set_elem elem; + + if (r < f->rules - 1 && f->mt[r + 1].e == f->mt[r].e) + continue; + + if (iter->count < iter->skip) + goto cont; + + e = f->mt[r].e; + if (nft_set_elem_expired(&e->ext)) + goto cont; + + elem.priv = e; + + iter->err = iter->fn(ctx, set, iter, &elem); + if (iter->err < 0) + goto out; + +cont: + iter->count++; + } + +out: + rcu_read_unlock(); +} + +/** + * nft_pipapo_privsize() - Return the size of private data for the set + * @nla: netlink attributes, ignored as size doesn't depend on them + * @desc: Set description, ignored as size doesn't depend on it + * + * Return: size of private data for this set implementation, in bytes + */ +static u64 nft_pipapo_privsize(const struct nlattr * const nla[], + const struct nft_set_desc *desc) +{ + return sizeof(struct nft_pipapo); +} + +/** + * nft_pipapo_estimate() - Estimate set size, space and lookup complexity + * @desc: Set description, element count and field description used here + * @features: Flags: NFT_SET_INTERVAL needs to be there + * @est: Storage for estimation data + * + * The size for this set type can vary dramatically, as it depends on the number + * of rules (composing netmasks) the entries expand to. We compute the worst + * case here. + * + * In general, for a non-ranged entry or a single composing netmask, we need + * one bit in each of the sixteen NFT_PIPAPO_BUCKETS, for each 4-bit group (that + * is, each input bit needs four bits of matching data), plus a bucket in the + * mapping table for each field. 
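+ *
+ * Editor's worked example (not part of the original patch), assuming a
+ * 64-bit build where sizeof(union nft_pipapo_map_bucket) is 8: for an IPv4
+ * address plus port concatenation, the address field costs at most
+ * ilog2(32) * 2 = 10 rules, i.e. 10 * 16 / 8 + 10 * 8 = 100 bytes, and the
+ * port field ilog2(16) * 2 = 8 rules, i.e. 8 * 16 / 8 + 8 * 8 = 80 bytes,
+ * so the worst-case estimate comes to roughly 180 bytes per entry.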
+ * + * Return: true only for compatible range concatenations + */ +static bool nft_pipapo_estimate(const struct nft_set_desc *desc, u32 features, + struct nft_set_estimate *est) +{ + unsigned long entry_size; + int i; + + if (!(features & NFT_SET_INTERVAL) || desc->field_count <= 1) + return false; + + for (i = 0, entry_size = 0; i < desc->field_count; i++) { + unsigned long rules; + + if (desc->field_len[i] > NFT_PIPAPO_MAX_BYTES) + return false; + + /* Worst-case ranges for each concatenated field: each n-bit + * field can expand to up to n * 2 rules in each bucket, and + * each rule also needs a mapping bucket. + */ + rules = ilog2(desc->field_len[i] * BITS_PER_BYTE) * 2; + entry_size += rules * NFT_PIPAPO_BUCKETS / BITS_PER_BYTE; + entry_size += rules * sizeof(union nft_pipapo_map_bucket); + } + + /* Rules in lookup and mapping tables are needed for each entry */ + est->size = desc->size * entry_size; + if (est->size && div_u64(est->size, desc->size) != entry_size) + return false; + + est->size += sizeof(struct nft_pipapo) + + sizeof(struct nft_pipapo_match) * 2; + + est->size += sizeof(struct nft_pipapo_field) * desc->field_count; + + est->lookup = NFT_SET_CLASS_O_LOG_N; + + est->space = NFT_SET_CLASS_O_N; + + return true; +} + +/** + * nft_pipapo_init() - Initialise data for a set instance + * @set: nftables API set representation + * @desc: Set description + * @nla: netlink attributes + * + * Validate number and size of fields passed as NFTA_SET_DESC_CONCAT netlink + * attributes, initialise internal set parameters, current instance of matching + * data and a copy for subsequent insertions. + * + * Return: 0 on success, negative error code on failure. + */ +static int nft_pipapo_init(const struct nft_set *set, + const struct nft_set_desc *desc, + const struct nlattr * const nla[]) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int err, i; + + if (desc->field_count > NFT_PIPAPO_MAX_FIELDS) + return -EINVAL; + + m = kmalloc(sizeof(*priv->match) + sizeof(*f) * desc->field_count, + GFP_KERNEL); + if (!m) + return -ENOMEM; + + m->field_count = desc->field_count; + m->bsize_max = 0; + + m->scratch = alloc_percpu(unsigned long *); + if (!m->scratch) { + err = -ENOMEM; + goto out_free; + } + for_each_possible_cpu(i) + *per_cpu_ptr(m->scratch, i) = NULL; + + rcu_head_init(&m->rcu); + + nft_pipapo_for_each_field(f, i, m) { + f->groups = desc->field_len[i] * NFT_PIPAPO_GROUPS_PER_BYTE; + priv->groups += f->groups; + + priv->width += round_up(desc->field_len[i], sizeof(u32)); + + f->bsize = 0; + f->rules = 0; + f->lt = NULL; + f->mt = NULL; + } + + /* Create an initial clone of matching data for next insertion */ + priv->clone = pipapo_clone(m); + if (IS_ERR(priv->clone)) { + err = PTR_ERR(priv->clone); + goto out_free; + } + + priv->dirty = false; + + rcu_assign_pointer(priv->match, m); + + return 0; + +out_free: + free_percpu(m->scratch); + kfree(m); + + return err; +} + +/** + * nft_pipapo_destroy() - Free private data for set and all committed elements + * @set: nftables API set representation + */ +static void nft_pipapo_destroy(const struct nft_set *set) +{ + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *m; + struct nft_pipapo_field *f; + int i, r, cpu; + + m = rcu_dereference_protected(priv->match, true); + if (m) { + rcu_barrier(); + + for (i = 0, f = m->f; i < m->field_count - 1; i++, f++) + ; + + for (r = 0; r < f->rules; r++) { + struct nft_pipapo_elem *e; + + if (r < f->rules - 1 && f->mt[r + 
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index a9f804f7a04a..5000b938ab1e 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -466,6 +466,9 @@ static void nft_rbtree_destroy(const struct nft_set *set)
 static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
 				struct nft_set_estimate *est)
 {
+	if (desc->field_count > 1)
+		return false;
+
 	if (desc->size)
 		est->size = sizeof(struct nft_rbtree) +
 			    desc->size * sizeof(struct nft_rbtree_elem);
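With the hunk above, rbtree declines any description with more than one field, leaving concatenated ranges to pipapo. A minimal model of that selection flow, assuming a simplified estimate signature (the kernel's actual backend selection also weighs the reported size, lookup and space classes):

/* Simplified sketch of backend selection via ->estimate() callbacks:
 * each backend may decline a set description, and the first backend
 * that accepts it wins. Types and names are invented for the sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct set_desc {
	unsigned int field_count;
	bool interval;
};

struct set_backend {
	const char *name;
	bool (*estimate)(const struct set_desc *desc);
};

static bool rbtree_estimate(const struct set_desc *desc)
{
	return desc->field_count <= 1;	/* single ranges only */
}

static bool pipapo_estimate(const struct set_desc *desc)
{
	return desc->interval && desc->field_count > 1;
}

int main(void)
{
	const struct set_backend backends[] = {
		{ "rbtree", rbtree_estimate },
		{ "pipapo", pipapo_estimate },
	};
	struct set_desc desc = { .field_count = 2, .interval = true };
	unsigned int i;

	/* A two-field interval set: rbtree declines, pipapo accepts */
	for (i = 0; i < 2; i++) {
		if (backends[i].estimate(&desc)) {
			printf("selected backend: %s\n", backends[i].name);
			break;
		}
	}
	return 0;
}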
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 5284fcf16be7..4c3f2e24c7cb 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -248,8 +248,9 @@ static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
 }
 
 static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+	[NFTA_TUNNEL_KEY_ERSPAN_VERSION] = { .type = NLA_U32 },
 	[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 },
-	[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
+	[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]	= { .type = NLA_U8 },
 	[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 },
 };
 
@@ -445,10 +446,15 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
 		if (!nest)
 			return -1;
 
-		if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
-		    nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
-		    nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
+		if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC,
+				     &info->key.u.ipv6.src) < 0 ||
+		    nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST,
+				     &info->key.u.ipv6.dst) < 0 ||
+		    nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL,
+				 info->key.label)) {
+			nla_nest_cancel(skb, nest);
 			return -1;
+		}
 
 		nla_nest_end(skb, nest);
 	} else {
@@ -456,9 +462,13 @@ static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
 		if (!nest)
 			return -1;
 
-		if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
-		    nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
+		if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC,
+				    info->key.u.ipv4.src) < 0 ||
+		    nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST,
+				    info->key.u.ipv4.dst) < 0) {
+			nla_nest_cancel(skb, nest);
 			return -1;
+		}
 
 		nla_nest_end(skb, nest);
 	}
@@ -470,42 +480,58 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
 				struct nft_tunnel_obj *priv)
 {
 	struct nft_tunnel_opts *opts = &priv->opts;
-	struct nlattr *nest;
+	struct nlattr *nest, *inner;
 
 	nest = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS);
 	if (!nest)
 		return -1;
 
 	if (opts->flags & TUNNEL_VXLAN_OPT) {
+		inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_VXLAN);
+		if (!inner)
+			goto failure;
 		if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
 				 htonl(opts->u.vxlan.gbp)))
-			return -1;
+			goto inner_failure;
+		nla_nest_end(skb, inner);
 	} else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+		inner = nla_nest_start_noflag(skb, NFTA_TUNNEL_KEY_OPTS_ERSPAN);
+		if (!inner)
+			goto failure;
+		if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_VERSION,
+				 htonl(opts->u.erspan.version)))
+			goto inner_failure;
 		switch (opts->u.erspan.version) {
 		case ERSPAN_VERSION:
 			if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
 					 opts->u.erspan.u.index))
-				return -1;
+				goto inner_failure;
 			break;
 		case ERSPAN_VERSION2:
 			if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
 				       get_hwid(&opts->u.erspan.u.md2)) ||
 			    nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
 				       opts->u.erspan.u.md2.dir))
-				return -1;
+				goto inner_failure;
 			break;
 		}
+		nla_nest_end(skb, inner);
 	}
 	nla_nest_end(skb, nest);
-
 	return 0;
+
+inner_failure:
+	nla_nest_cancel(skb, inner);
+failure:
+	nla_nest_cancel(skb, nest);
+	return -1;
 }
 
 static int nft_tunnel_ports_dump(struct sk_buff *skb,
 				 struct ip_tunnel_info *info)
 {
-	if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
-	    nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+	if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, info->key.tp_src) < 0 ||
+	    nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, info->key.tp_dst) < 0)
 		return -1;
 
 	return 0;
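All three dump fixes above follow the same unwinding rule: once nla_nest_start_noflag() has succeeded, any failing put must nla_nest_cancel() the partially built nest (innermost first) before returning, so the skb is left exactly as it was. A minimal kernel-context sketch of the pattern; the nla_* helpers are the real in-kernel API, while MY_OPTS and MY_OPT_VALUE are hypothetical attribute numbers:

#include <net/netlink.h>

/* Sketch of the cancel-on-error pattern for nested attributes.
 * MY_OPTS/MY_OPT_VALUE are made-up attribute types for illustration.
 */
static int my_opts_dump(struct sk_buff *skb, u32 value)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, MY_OPTS);
	if (!nest)
		return -1;

	if (nla_put_be32(skb, MY_OPT_VALUE, htonl(value)))
		goto cancel;

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);	/* trims the skb back to before the nest */
	return -1;
}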
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index ced3fc8fad7c..bccd47cd7190 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -357,21 +357,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg,
 	return 0;
 }
 
-static bool select_all(const struct xt_hashlimit_htable *ht,
-		       const struct dsthash_ent *he)
-{
-	return true;
-}
-
-static bool select_gc(const struct xt_hashlimit_htable *ht,
-		      const struct dsthash_ent *he)
-{
-	return time_after_eq(jiffies, he->expires);
-}
-
-static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
-				     bool (*select)(const struct xt_hashlimit_htable *ht,
-						    const struct dsthash_ent *he))
+static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select_all)
 {
 	unsigned int i;
 
@@ -381,7 +367,7 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht,
 
 		spin_lock_bh(&ht->lock);
 		hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
-			if ((*select)(ht, dh))
+			if (time_after_eq(jiffies, dh->expires) || select_all)
 				dsthash_free(ht, dh);
 		}
 		spin_unlock_bh(&ht->lock);
@@ -395,7 +381,7 @@ static void htable_gc(struct work_struct *work)
 
 	ht = container_of(work, struct xt_hashlimit_htable, gc_work.work);
 
-	htable_selective_cleanup(ht, select_gc);
+	htable_selective_cleanup(ht, false);
 
 	queue_delayed_work(system_power_efficient_wq,
 			   &ht->gc_work, msecs_to_jiffies(ht->cfg.gc_interval));
@@ -419,7 +405,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo)
 {
 	cancel_delayed_work_sync(&hinfo->gc_work);
 	htable_remove_proc_entry(hinfo);
-	htable_selective_cleanup(hinfo, select_all);
+	htable_selective_cleanup(hinfo, true);
 	kfree(hinfo->name);
 	vfree(hinfo);
 }
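The xt_hashlimit change folds the two selector callbacks into a single boolean that widens the expiry predicate from "expired entries only" (the periodic GC pass) to "everything" (table destruction). A standalone model of the new contract; the entry layout and timestamps are invented for illustration:

/* Standalone model of htable_selective_cleanup() after the refactor:
 * one function, one boolean, the expiry test inlined. Types are
 * invented for the sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct ent {
	unsigned long expires;
	bool freed;
};

static void cleanup(struct ent *tbl, int n, unsigned long now, bool all)
{
	int i;

	for (i = 0; i < n; i++)
		if (all || now >= tbl[i].expires)	/* the test the kernel now inlines */
			tbl[i].freed = true;
}

int main(void)
{
	struct ent tbl[] = { { 10, false }, { 30, false } };

	cleanup(tbl, 2, 20, false);	/* GC pass: only the first entry has expired */
	printf("%d %d\n", tbl[0].freed, tbl[1].freed);

	cleanup(tbl, 2, 20, true);	/* destroy pass: everything goes */
	printf("%d %d\n", tbl[0].freed, tbl[1].freed);
	return 0;
}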