diff options
| author | Alexei Starovoitov <ast@kernel.org> | 2023-08-21 15:21:16 -0700 | 
|---|---|---|
| committer | Alexei Starovoitov <ast@kernel.org> | 2023-08-21 15:21:16 -0700 | 
| commit | 5bebd3e3a37dc00c6dbbcd0eb5cebd58e3f18f64 (patch) | |
| tree | a3e9247534fe1fcb2809caabc9ba62daa7e6e381 | |
| parent | 0a55264cf966fb95ebf9d03d9f81fa992f069312 (diff) | |
| parent | c2e42ddf26cad03ea92400c88b024e8ce1601dff (diff) | |
Merge branch 'remove-unnecessary-synchronizations-in-cpumap'
Hou Tao says:
====================
Remove unnecessary synchronizations in cpumap
From: Hou Tao <houtao1@huawei.com>
Hi,
This is the formal patchset to remove unnecessary synchronizations in
cpu-map after address comments and collect Rvb tags from Toke
Høiland-Jørgensen (Big thanks to Toke). Patch #1 removes the unnecessary
rcu_barrier() when freeing bpf_cpu_map_entry and replaces it by
queue_rcu_work(). Patch #2 removes the unnecessary call_rcu() and
queue_work() when destroying cpu-map and does the freeing directly.
Test the patchset by using xdp_redirect_cpu and virtio-net. Both
xdp-mode and skb-mode have been exercised and no issues were reported.
As ususal, comments and suggestions are always welcome.
Change Log:
v1:
  * address comments from Toke Høiland-Jørgensen
  * add Rvb tags from Toke Høiland-Jørgensen
  * update outdated comment in cpu_map_delete_elem()
RFC: https://lore.kernel.org/bpf/20230728023030.1906124-1-houtao@huaweicloud.com
====================
Link: https://lore.kernel.org/r/20230816045959.358059-1-houtao@huaweicloud.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
| -rw-r--r-- | kernel/bpf/cpumap.c | 113 | 
1 files changed, 35 insertions, 78 deletions
| diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ef28c64f1eb1..e42a1bdb7f53 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -68,11 +68,8 @@ struct bpf_cpu_map_entry {  	struct bpf_cpumap_val value;  	struct bpf_prog *prog; -	atomic_t refcnt; /* Control when this struct can be free'ed */ -	struct rcu_head rcu; - -	struct work_struct kthread_stop_wq;  	struct completion kthread_running; +	struct rcu_work free_work;  };  struct bpf_cpu_map { @@ -117,11 +114,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)  	return &cmap->map;  } -static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ -	atomic_inc(&rcpu->refcnt); -} -  static void __cpu_map_ring_cleanup(struct ptr_ring *ring)  {  	/* The tear-down procedure should have made sure that queue is @@ -142,35 +134,6 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring)  	}  } -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ -	if (atomic_dec_and_test(&rcpu->refcnt)) { -		if (rcpu->prog) -			bpf_prog_put(rcpu->prog); -		/* The queue should be empty at this point */ -		__cpu_map_ring_cleanup(rcpu->queue); -		ptr_ring_cleanup(rcpu->queue, NULL); -		kfree(rcpu->queue); -		kfree(rcpu); -	} -} - -/* called from workqueue, to workaround syscall using preempt_disable */ -static void cpu_map_kthread_stop(struct work_struct *work) -{ -	struct bpf_cpu_map_entry *rcpu; - -	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq); - -	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier, -	 * as it waits until all in-flight call_rcu() callbacks complete. -	 */ -	rcu_barrier(); - -	/* kthread_stop will wake_up_process and wait for it to complete */ -	kthread_stop(rcpu->kthread); -} -  static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,  				     struct list_head *listp,  				     struct xdp_cpumap_stats *stats) @@ -395,7 +358,6 @@ static int cpu_map_kthread_run(void *data)  	}  	__set_current_state(TASK_RUNNING); -	put_cpu_map_entry(rcpu);  	return 0;  } @@ -472,9 +434,6 @@ __cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value,  	if (IS_ERR(rcpu->kthread))  		goto free_prog; -	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */ -	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */ -  	/* Make sure kthread runs on a single CPU */  	kthread_bind(rcpu->kthread, cpu);  	wake_up_process(rcpu->kthread); @@ -501,40 +460,40 @@ free_rcu:  	return NULL;  } -static void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct work_struct *work)  {  	struct bpf_cpu_map_entry *rcpu;  	/* This cpu_map_entry have been disconnected from map and one -	 * RCU grace-period have elapsed.  Thus, XDP cannot queue any +	 * RCU grace-period have elapsed. Thus, XDP cannot queue any  	 * new packets and cannot change/set flush_needed that can  	 * find this entry.  	 */ -	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu); +	rcpu = container_of(to_rcu_work(work), struct bpf_cpu_map_entry, free_work); +	/* kthread_stop will wake_up_process and wait for it to complete. +	 * cpu_map_kthread_run() makes sure the pointer ring is empty +	 * before exiting. +	 */ +	kthread_stop(rcpu->kthread); + +	if (rcpu->prog) +		bpf_prog_put(rcpu->prog); +	/* The queue should be empty at this point */ +	__cpu_map_ring_cleanup(rcpu->queue); +	ptr_ring_cleanup(rcpu->queue, NULL); +	kfree(rcpu->queue);  	free_percpu(rcpu->bulkq); -	/* Cannot kthread_stop() here, last put free rcpu resources */ -	put_cpu_map_entry(rcpu); +	kfree(rcpu);  } -/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to - * ensure any driver rcu critical sections have completed, but this - * does not guarantee a flush has happened yet. Because driver side - * rcu_read_lock/unlock only protects the running XDP program.  The - * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a - * pending flush op doesn't fail. - * - * The bpf_cpu_map_entry is still used by the kthread, and there can - * still be pending packets (in queue and percpu bulkq).  A refcnt - * makes sure to last user (kthread_stop vs. call_rcu) free memory - * resources. - * - * The rcu callback __cpu_map_entry_free flush remaining packets in - * percpu bulkq to queue.  Due to caller map_delete_elem() disable - * preemption, cannot call kthread_stop() to make sure queue is empty. - * Instead a work_queue is started for stopping kthread, - * cpu_map_kthread_stop, which waits for an RCU grace period before - * stopping kthread, emptying the queue. +/* After the xchg of the bpf_cpu_map_entry pointer, we need to make sure the old + * entry is no longer in use before freeing. We use queue_rcu_work() to call + * __cpu_map_entry_free() in a separate workqueue after waiting for an RCU grace + * period. This means that (a) all pending enqueue and flush operations have + * completed (because of the RCU callback), and (b) we are in a workqueue + * context where we can stop the kthread and wait for it to exit before freeing + * everything.   */  static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,  				    u32 key_cpu, struct bpf_cpu_map_entry *rcpu) @@ -543,9 +502,8 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,  	old_rcpu = unrcu_pointer(xchg(&cmap->cpu_map[key_cpu], RCU_INITIALIZER(rcpu)));  	if (old_rcpu) { -		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free); -		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop); -		schedule_work(&old_rcpu->kthread_stop_wq); +		INIT_RCU_WORK(&old_rcpu->free_work, __cpu_map_entry_free); +		queue_rcu_work(system_wq, &old_rcpu->free_work);  	}  } @@ -557,7 +515,7 @@ static long cpu_map_delete_elem(struct bpf_map *map, void *key)  	if (key_cpu >= map->max_entries)  		return -EINVAL; -	/* notice caller map_delete_elem() use preempt_disable() */ +	/* notice caller map_delete_elem() uses rcu_read_lock() */  	__cpu_map_entry_replace(cmap, key_cpu, NULL);  	return 0;  } @@ -608,16 +566,15 @@ static void cpu_map_free(struct bpf_map *map)  	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,  	 * so the bpf programs (can be more than one that used this map) were  	 * disconnected from events. Wait for outstanding critical sections in -	 * these programs to complete. The rcu critical section only guarantees -	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map. -	 * It does __not__ ensure pending flush operations (if any) are -	 * complete. +	 * these programs to complete. synchronize_rcu() below not only +	 * guarantees no further "XDP/bpf-side" reads against +	 * bpf_cpu_map->cpu_map, but also ensure pending flush operations +	 * (if any) are completed.  	 */ -  	synchronize_rcu(); -	/* For cpu_map the remote CPUs can still be using the entries -	 * (struct bpf_cpu_map_entry). +	/* The only possible user of bpf_cpu_map_entry is +	 * cpu_map_kthread_run().  	 */  	for (i = 0; i < cmap->map.max_entries; i++) {  		struct bpf_cpu_map_entry *rcpu; @@ -626,8 +583,8 @@ static void cpu_map_free(struct bpf_map *map)  		if (!rcpu)  			continue; -		/* bq flush and cleanup happens after RCU grace-period */ -		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ +		/* Stop kthread and cleanup entry directly */ +		__cpu_map_entry_free(&rcpu->free_work.work);  	}  	bpf_map_area_free(cmap->cpu_map);  	bpf_map_area_free(cmap); | 
