summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorJohn Fastabend <john.fastabend@gmail.com>2017-07-17 09:29:40 -0700
committerDavid S. Miller <davem@davemloft.net>2017-07-17 09:48:06 -0700
commit11393cc9b9be2a1f61559e6fb9c27bc8fa20b1ff (patch)
tree9983db89954e9d34d04d16fa023e5df752efd90d /kernel
parent97f91a7cf04ff605845c20948b8a80e54cbd3376 (diff)
xdp: Add batching support to redirect map
For performance reasons we want to avoid updating the tail pointer in the driver tx ring as much as possible. To accomplish this we add batching support to the redirect path in XDP. This adds another ndo op "xdp_flush" that is used to inform the driver that it should bump the tail pointer on the TX ring. Signed-off-by: John Fastabend <john.fastabend@gmail.com> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com> Acked-by: Daniel Borkmann <daniel@iogearbox.net> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/devmap.c84
1 files changed, 83 insertions, 1 deletions
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 36dc13deb2e1..b2ef04a1c86a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -53,6 +53,7 @@ struct bpf_dtab_netdev {
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
+ unsigned long int __percpu *flush_needed;
};
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+ cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
if (cost >= U32_MAX - PAGE_SIZE)
goto free_dtab;
@@ -97,6 +99,14 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (err)
goto free_dtab;
+ /* A per cpu bitfield with a bit per possible net device */
+ dtab->flush_needed = __alloc_percpu(
+ BITS_TO_LONGS(attr->max_entries) *
+ sizeof(unsigned long),
+ __alignof__(unsigned long));
+ if (!dtab->flush_needed)
+ goto free_dtab;
+
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *));
if (!dtab->netdev_map)
@@ -105,6 +115,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
return &dtab->map;
free_dtab:
+ free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}
@@ -112,7 +123,7 @@ free_dtab:
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- int i;
+ int i, cpu;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
@@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map)
*/
synchronize_rcu();
+ /* To ensure all pending flush operations have completed wait for flush
+ * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+ * Because the above synchronize_rcu() ensures the map is disconnected
+ * from the program we can assume no new bits will be set.
+ */
+ for_each_online_cpu(cpu) {
+ unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+ while (!bitmap_empty(bitmap, dtab->map.max_entries))
+ cpu_relax();
+ }
+
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
@@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map)
/* At this point bpf program is detached and all pending operations
* _must_ be complete
*/
+ free_percpu(dtab->flush_needed);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
@@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
return 0;
}
+void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+ __set_bit(key, bitmap);
+}
+
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
return dev ? dev->dev : NULL;
}
+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+ unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+ u32 bit;
+
+ for_each_set_bit(bit, bitmap, map->max_entries) {
+ struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct net_device *netdev;
+
+ /* This is possible if the dev entry is removed by user space
+ * between xdp redirect and flush op.
+ */
+ if (unlikely(!dev))
+ continue;
+
+ netdev = dev->dev;
+
+ __clear_bit(bit, bitmap);
+ if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
+ continue;
+
+ netdev->netdev_ops->ndo_xdp_flush(netdev);
+ }
+}
+
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
@@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
return dev ? &dev->dev->ifindex : NULL;
}
+static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+{
+ if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
+ struct net_device *fl = old_dev->dev;
+ unsigned long *bitmap;
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
+ __clear_bit(old_dev->key, bitmap);
+
+ fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+ }
+ }
+}
+
static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *old_dev;
old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+ dev_map_flush_old(old_dev);
dev_put(old_dev->dev);
kfree(old_dev);
}