Diffstat (limited to 'kernel/locking')
-rw-r--r--  kernel/locking/qspinlock.c            |  82
-rw-r--r--  kernel/locking/qspinlock_paravirt.h   | 252
-rw-r--r--  kernel/locking/qspinlock_stat.h       | 300
3 files changed, 576 insertions, 58 deletions
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a63c5..393d1874b9e0 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -14,8 +14,9 @@   * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P.   * (C) Copyright 2013-2014 Red Hat, Inc.   * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP   * - * Authors: Waiman Long <waiman.long@hp.com> + * Authors: Waiman Long <waiman.long@hpe.com>   *          Peter Zijlstra <peterz@infradead.org>   */ @@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)  {  	struct __qspinlock *l = (void *)lock; -	return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; +	/* +	 * Use release semantics to make sure that the MCS node is properly +	 * initialized before changing the tail code. +	 */ +	return (u32)xchg_release(&l->tail, +				 tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;  }  #else /* _Q_PENDING_BITS == 8 */ @@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)  	for (;;) {  		new = (val & _Q_LOCKED_PENDING_MASK) | tail; -		old = atomic_cmpxchg(&lock->val, val, new); +		/* +		 * Use release semantics to make sure that the MCS node is +		 * properly initialized before changing the tail code. +		 */ +		old = atomic_cmpxchg_release(&lock->val, val, new);  		if (old == val)  			break; @@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock)   */  static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node, +					   struct mcs_spinlock *prev) { }  static __always_inline void __pv_kick_node(struct qspinlock *lock,  					   struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, -					   struct mcs_spinlock *node) { } +static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock, +						   struct mcs_spinlock *node) +						   { return 0; }  #define pv_enabled()		false  #define pv_init_node		__pv_init_node  #define pv_wait_node		__pv_wait_node  #define pv_kick_node		__pv_kick_node -#define pv_wait_head		__pv_wait_head +#define pv_wait_head_or_lock	__pv_wait_head_or_lock  #ifdef CONFIG_PARAVIRT_SPINLOCKS  #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath @@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)  		if (val == new)  			new |= _Q_PENDING_VAL; -		old = atomic_cmpxchg(&lock->val, val, new); +		/* +		 * Acquire semantic is required here as the function may +		 * return immediately if the lock was free. +		 */ +		old = atomic_cmpxchg_acquire(&lock->val, val, new);  		if (old == val)  			break; @@ -382,6 +398,7 @@ queue:  	 * p,*,* -> n,*,*  	 */  	old = xchg_tail(lock, tail); +	next = NULL;  	/*  	 * if there was a previous node; link it and wait until reaching the @@ -391,8 +408,18 @@ queue:  		prev = decode_tail(old);  		WRITE_ONCE(prev->next, node); -		pv_wait_node(node); +		pv_wait_node(node, prev);  		arch_mcs_spin_lock_contended(&node->locked); + +		/* +		 * While waiting for the MCS lock, the next pointer may have +		 * been set by another lock waiter. We optimistically load +		 * the next pointer & prefetch the cacheline for writing +		 * to reduce latency in the upcoming MCS unlock operation. 
+		 */ +		next = READ_ONCE(node->next); +		if (next) +			prefetchw(next);  	}  	/* @@ -406,11 +433,22 @@ queue:  	 * sequentiality; this is because the set_locked() function below  	 * does not imply a full barrier.  	 * +	 * The PV pv_wait_head_or_lock function, if active, will acquire +	 * the lock and return a non-zero value. So we have to skip the +	 * smp_load_acquire() call. As the next PV queue head hasn't been +	 * designated yet, there is no way for the locked value to become +	 * _Q_SLOW_VAL. So both the set_locked() and the +	 * atomic_cmpxchg_relaxed() calls will be safe. +	 * +	 * If PV isn't active, 0 will be returned instead. +	 *  	 */ -	pv_wait_head(lock, node); -	while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) -		cpu_relax(); +	if ((val = pv_wait_head_or_lock(lock, node))) +		goto locked; +	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + +locked:  	/*  	 * claim the lock:  	 * @@ -422,11 +460,17 @@ queue:  	 * to grab the lock.  	 */  	for (;;) { -		if (val != tail) { +		/* In the PV case we might already have _Q_LOCKED_VAL set */ +		if ((val & _Q_TAIL_MASK) != tail) {  			set_locked(lock);  			break;  		} -		old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); +		/* +		 * The smp_load_acquire() call above has provided the necessary +		 * acquire semantics required for locking. At most two +		 * iterations of this loop may be ran. +		 */ +		old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);  		if (old == val)  			goto release;	/* No contention */ @@ -434,10 +478,12 @@ queue:  	}  	/* -	 * contended path; wait for next, release. +	 * contended path; wait for next if not observed yet, release.  	 */ -	while (!(next = READ_ONCE(node->next))) -		cpu_relax(); +	if (!next) { +		while (!(next = READ_ONCE(node->next))) +			cpu_relax(); +	}  	arch_mcs_spin_unlock_contended(&next->locked);  	pv_kick_node(lock, next); @@ -462,7 +508,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);  #undef pv_init_node  #undef pv_wait_node  #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock  #undef  queued_spin_lock_slowpath  #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff4829b..87bb235c3448 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -23,6 +23,20 @@  #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)  /* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guest with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. + */ +#define PV_PREV_CHECK_MASK	0xff + +/*   * Queue node uses: vcpu_running & vcpu_halted.   * Queue head uses: vcpu_running & vcpu_hashed.   */ @@ -41,6 +55,94 @@ struct pv_node {  };  /* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. 
+ */ +#define queued_spin_trylock(l)	pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ +	struct __qspinlock *l = (void *)lock; + +	return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && +		(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ +	struct __qspinlock *l = (void *)lock; + +	WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ +	struct __qspinlock *l = (void *)lock; + +	WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ +	struct __qspinlock *l = (void *)lock; + +	return !READ_ONCE(l->locked) && +	       (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) +			== _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ +	atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ +	atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ +	int val = atomic_read(&lock->val); + +	for (;;) { +		int old, new; + +		if (val  & _Q_LOCKED_MASK) +			break; + +		/* +		 * Try to clear pending bit & set locked bit +		 */ +		old = val; +		new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; +		val = atomic_cmpxchg(&lock->val, old, new); + +		if (val == old) +			return 1; +	} +	return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/*   * Lock and MCS node addresses hash table for fast lookup   *   * Hashing is done on a per-cacheline basis to minimize the need to access @@ -100,10 +202,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)  {  	unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);  	struct pv_hash_entry *he; +	int hopcnt = 0;  	for_each_hash_entry(he, offset, hash) { +		hopcnt++;  		if (!cmpxchg(&he->lock, NULL, lock)) {  			WRITE_ONCE(he->node, node); +			qstat_hop(hopcnt);  			return &he->lock;  		}  	} @@ -144,6 +249,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)  }  /* + * Return true if when it is time to check the previous node which is not + * in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + +	if ((loop & PV_PREV_CHECK_MASK) != 0) +		return false; + +	return READ_ONCE(prev->state) != vcpu_running; +} + +/*   * Initialize the PV part of the mcs_spinlock node.   */  static void pv_init_node(struct mcs_spinlock *node) @@ -161,15 +280,23 @@ static void pv_init_node(struct mcs_spinlock *node)   * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its   * behalf.   
*/ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)  {  	struct pv_node *pn = (struct pv_node *)node; +	struct pv_node *pp = (struct pv_node *)prev; +	int waitcnt = 0;  	int loop; +	bool wait_early; -	for (;;) { -		for (loop = SPIN_THRESHOLD; loop; loop--) { +	/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ +	for (;; waitcnt++) { +		for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {  			if (READ_ONCE(node->locked))  				return; +			if (pv_wait_early(pp, loop)) { +				wait_early = true; +				break; +			}  			cpu_relax();  		} @@ -184,12 +311,17 @@ static void pv_wait_node(struct mcs_spinlock *node)  		 */  		smp_store_mb(pn->state, vcpu_halted); -		if (!READ_ONCE(node->locked)) +		if (!READ_ONCE(node->locked)) { +			qstat_inc(qstat_pv_wait_node, true); +			qstat_inc(qstat_pv_wait_again, waitcnt); +			qstat_inc(qstat_pv_wait_early, wait_early);  			pv_wait(&pn->state, vcpu_halted); +		}  		/* -		 * If pv_kick_node() changed us to vcpu_hashed, retain that value -		 * so that pv_wait_head() knows to not also try to hash this lock. +		 * If pv_kick_node() changed us to vcpu_hashed, retain that +		 * value so that pv_wait_head_or_lock() knows to not also try +		 * to hash this lock.  		 */  		cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +332,7 @@ static void pv_wait_node(struct mcs_spinlock *node)  		 * So it is better to spin for a while in the hope that the  		 * MCS lock will be released soon.  		 */ +		qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));  	}  	/* @@ -212,8 +345,9 @@ static void pv_wait_node(struct mcs_spinlock *node)  /*   * Called after setting next->locked = 1 when we're the lock owner.   * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle.   */  static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)  { @@ -242,14 +376,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)  }  /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin.   * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing.   */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)  {  	struct pv_node *pn = (struct pv_node *)node;  	struct __qspinlock *l = (void *)lock;  	struct qspinlock **lp = NULL; +	int waitcnt = 0;  	int loop;  	/* @@ -259,12 +398,25 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)  	if (READ_ONCE(pn->state) == vcpu_hashed)  		lp = (struct qspinlock **)1; -	for (;;) { +	for (;; waitcnt++) { +		/* +		 * Set correct vCPU state to be used by queue node wait-early +		 * mechanism. +		 */ +		WRITE_ONCE(pn->state, vcpu_running); + +		/* +		 * Set the pending bit in the active lock spinning loop to +		 * disable lock stealing before attempting to acquire the lock. 
+		 */ +		set_pending(lock);  		for (loop = SPIN_THRESHOLD; loop; loop--) { -			if (!READ_ONCE(l->locked)) -				return; +			if (trylock_clear_pending(lock)) +				goto gotlock;  			cpu_relax();  		} +		clear_pending(lock); +  		if (!lp) { /* ONCE */  			lp = pv_hash(lock, pn); @@ -280,51 +432,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)  			 *  			 * Matches the smp_rmb() in __pv_queued_spin_unlock().  			 */ -			if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { +			if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {  				/* -				 * The lock is free and _Q_SLOW_VAL has never -				 * been set. Therefore we need to unhash before -				 * getting the lock. +				 * The lock was free and now we own the lock. +				 * Change the lock value back to _Q_LOCKED_VAL +				 * and unhash the table.  				 */ +				WRITE_ONCE(l->locked, _Q_LOCKED_VAL);  				WRITE_ONCE(*lp, NULL); -				return; +				goto gotlock;  			}  		} +		WRITE_ONCE(pn->state, vcpu_halted); +		qstat_inc(qstat_pv_wait_head, true); +		qstat_inc(qstat_pv_wait_again, waitcnt);  		pv_wait(&l->locked, _Q_SLOW_VAL);  		/*  		 * The unlocker should have freed the lock before kicking the  		 * CPU. So if the lock is still not free, it is a spurious -		 * wakeup and so the vCPU should wait again after spinning for -		 * a while. +		 * wakeup or another vCPU has stolen the lock. The current +		 * vCPU should spin again.  		 */ +		qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));  	}  	/* -	 * Lock is unlocked now; the caller will acquire it without waiting. -	 * As with pv_wait_node() we rely on the caller to do a load-acquire -	 * for us. +	 * The cmpxchg() or xchg() call before coming here provides the +	 * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL +	 * here is to indicate to the compiler that the value will always +	 * be nozero to enable better code optimization.  	 */ +gotlock: +	return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);  }  /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock().   */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)  {  	struct __qspinlock *l = (void *)lock;  	struct pv_node *node; -	u8 locked; - -	/* -	 * We must not unlock if SLOW, because in that case we must first -	 * unhash. Otherwise it would be possible to have multiple @lock -	 * entries, which would be BAD. -	 */ -	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); -	if (likely(locked == _Q_LOCKED_VAL)) -		return;  	if (unlikely(locked != _Q_SLOW_VAL)) {  		WARN(!debug_locks_silent, @@ -338,7 +489,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)  	 * so we need a barrier to order the read of the node data in  	 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.  	 * -	 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. +	 * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.  	 */  	smp_rmb(); @@ -361,14 +512,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)  	 * vCPU is harmless other than the additional latency in completing  	 * the unlock.  	 */ +	qstat_inc(qstat_pv_kick_unlock, true);  	pv_kick(node->cpu);  } +  /*   * Include the architecture specific callee-save thunk of the   * __pv_queued_spin_unlock(). 
This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined.   */  #include <asm/qspinlock_paravirt.h> +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ +	struct __qspinlock *l = (void *)lock; +	u8 locked; + +	/* +	 * We must not unlock if SLOW, because in that case we must first +	 * unhash. Otherwise it would be possible to have multiple @lock +	 * entries, which would be BAD. +	 */ +	locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); +	if (likely(locked == _Q_LOCKED_VAL)) +		return; + +	__pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 000000000000..640dcecdd1df --- /dev/null +++ b/kernel/locking/qspinlock_stat.h @@ -0,0 +1,300 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * When queued spinlock statistical counters are enabled, the following + * debugfs files will be created for reporting the counter values: + * + * <debugfs>/qlockstat/ + *   pv_hash_hops	- average # of hops per hashing operation + *   pv_kick_unlock	- # of vCPU kicks issued at unlock time + *   pv_kick_wake	- # of vCPU kicks used for computing pv_latency_wake + *   pv_latency_kick	- average latency (ns) of vCPU kick operation + *   pv_latency_wake	- average latency (ns) from vCPU kick to wakeup + *   pv_lock_stealing	- # of lock stealing operations + *   pv_spurious_wakeup	- # of spurious wakeups + *   pv_wait_again	- # of vCPU wait's that happened after a vCPU kick + *   pv_wait_early	- # of early vCPU wait's + *   pv_wait_head	- # of vCPU wait's at the queue head + *   pv_wait_node	- # of vCPU wait's at a non-head queue node + * + * Writing to the "reset_counters" file will reset all the above counter + * values. + * + * These statistical counters are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counters usable even in a production + * environment. + * + * There may be slight difference between pv_kick_wake and pv_kick_unlock. 
+ */ +enum qlock_stats { +	qstat_pv_hash_hops, +	qstat_pv_kick_unlock, +	qstat_pv_kick_wake, +	qstat_pv_latency_kick, +	qstat_pv_latency_wake, +	qstat_pv_lock_stealing, +	qstat_pv_spurious_wakeup, +	qstat_pv_wait_again, +	qstat_pv_wait_early, +	qstat_pv_wait_head, +	qstat_pv_wait_node, +	qstat_num,	/* Total number of statistical counters */ +	qstat_reset_cnts = qstat_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statistics + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/fs.h> + +static const char * const qstat_names[qstat_num + 1] = { +	[qstat_pv_hash_hops]	   = "pv_hash_hops", +	[qstat_pv_kick_unlock]     = "pv_kick_unlock", +	[qstat_pv_kick_wake]       = "pv_kick_wake", +	[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", +	[qstat_pv_latency_kick]	   = "pv_latency_kick", +	[qstat_pv_latency_wake]    = "pv_latency_wake", +	[qstat_pv_lock_stealing]   = "pv_lock_stealing", +	[qstat_pv_wait_again]      = "pv_wait_again", +	[qstat_pv_wait_early]      = "pv_wait_early", +	[qstat_pv_wait_head]       = "pv_wait_head", +	[qstat_pv_wait_node]       = "pv_wait_node", +	[qstat_reset_cnts]         = "reset_counters", +}; + +/* + * Per-cpu counters + */ +static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Function to read and return the qlock statistical counter values + * + * The following counters are handled specially: + * 1. qstat_pv_latency_kick + *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock + * 2. qstat_pv_latency_wake + *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake + * 3. qstat_pv_hash_hops + *    Average hops/hash = pv_hash_hops/pv_kick_unlock + */ +static ssize_t qstat_read(struct file *file, char __user *user_buf, +			  size_t count, loff_t *ppos) +{ +	char buf[64]; +	int cpu, counter, len; +	u64 stat = 0, kicks = 0; + +	/* +	 * Get the counter ID stored in file->f_inode->i_private +	 */ +	if (!file->f_inode) { +		WARN_ON_ONCE(1); +		return -EBADF; +	} +	counter = (long)(file->f_inode->i_private); + +	if (counter >= qstat_num) +		return -EBADF; + +	for_each_possible_cpu(cpu) { +		stat += per_cpu(qstats[counter], cpu); +		/* +		 * Need to sum additional counter for some of them +		 */ +		switch (counter) { + +		case qstat_pv_latency_kick: +		case qstat_pv_hash_hops: +			kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); +			break; + +		case qstat_pv_latency_wake: +			kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); +			break; +		} +	} + +	if (counter == qstat_pv_hash_hops) { +		u64 frac; + +		frac = 100ULL * do_div(stat, kicks); +		frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + +		/* +		 * Return a X.XX decimal number +		 */ +		len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); +	} else { +		/* +		 * Round to the nearest ns +		 */ +		if ((counter == qstat_pv_latency_kick) || +		    (counter == qstat_pv_latency_wake)) { +			stat = 0; +			if (kicks) +				stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); +		} +		len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); +	} + +	return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When counter = reset_cnts, reset all the counter values. + * Since the counter updates aren't atomic, the resetting is done twice + * to make sure that the counters are very likely to be all cleared. 
+ */ +static ssize_t qstat_write(struct file *file, const char __user *user_buf, +			   size_t count, loff_t *ppos) +{ +	int cpu; + +	/* +	 * Get the counter ID stored in file->f_inode->i_private +	 */ +	if (!file->f_inode) { +		WARN_ON_ONCE(1); +		return -EBADF; +	} +	if ((long)(file->f_inode->i_private) != qstat_reset_cnts) +		return count; + +	for_each_possible_cpu(cpu) { +		int i; +		unsigned long *ptr = per_cpu_ptr(qstats, cpu); + +		for (i = 0 ; i < qstat_num; i++) +			WRITE_ONCE(ptr[i], 0); +		for (i = 0 ; i < qstat_num; i++) +			WRITE_ONCE(ptr[i], 0); +	} +	return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_qstat = { +	.read = qstat_read, +	.write = qstat_write, +	.llseek = default_llseek, +}; + +/* + * Initialize debugfs for the qspinlock statistical counters + */ +static int __init init_qspinlock_stat(void) +{ +	struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); +	int i; + +	if (!d_qstat) { +		pr_warn("Could not create 'qlockstat' debugfs directory\n"); +		return 0; +	} + +	/* +	 * Create the debugfs files +	 * +	 * As reading from and writing to the stat files can be slow, only +	 * root is allowed to do the read/write to limit impact to system +	 * performance. +	 */ +	for (i = 0; i < qstat_num; i++) +		debugfs_create_file(qstat_names[i], 0400, d_qstat, +				   (void *)(long)i, &fops_qstat); + +	debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, +			   (void *)(long)qstat_reset_cnts, &fops_qstat); +	return 0; +} +fs_initcall(init_qspinlock_stat); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void qstat_inc(enum qlock_stats stat, bool cond) +{ +	if (cond) +		this_cpu_inc(qstats[stat]); +} + +/* + * PV hash hop count + */ +static inline void qstat_hop(int hopcnt) +{ +	this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ +	u64 start = sched_clock(); + +	per_cpu(pv_kick_time, cpu) = start; +	pv_kick(cpu); +	this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ +	u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + +	*pkick_time = 0; +	pv_wait(ptr, val); +	if (*pkick_time) { +		this_cpu_add(qstats[qstat_pv_latency_wake], +			     sched_clock() - *pkick_time); +		qstat_inc(qstat_pv_kick_wake, true); +	} +} + +#define pv_kick(c)	__pv_kick(c) +#define pv_wait(p, v)	__pv_wait(p, v) + +/* + * PV unfair trylock count tracking function + */ +static inline int qstat_spin_steal_lock(struct qspinlock *lock) +{ +	int ret = pv_queued_spin_steal_lock(lock); + +	qstat_inc(qstat_pv_lock_stealing, ret); +	return ret; +} +#undef  queued_spin_trylock +#define queued_spin_trylock(l)	qstat_spin_steal_lock(l) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +static inline void qstat_inc(enum qlock_stats stat, bool cond)	{ } +static inline void qstat_hop(int hopcnt)			{ } + +#endif /* CONFIG_QUEUED_LOCK_STAT */  | 
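
Note: with CONFIG_QUEUED_LOCK_STAT enabled, the counters added by this patch can be read from user space through the <debugfs>/qlockstat/ files documented in the qspinlock_stat.h header above. The stand-alone program below is an illustrative sketch, not part of the patch; it assumes debugfs is mounted at /sys/kernel/debug and must run as root, since the stat files are created with mode 0400.

/* qlockstat-dump.c: print the pvqspinlock event counters exported in
 * <debugfs>/qlockstat/ and then clear them via the reset_counters file.
 *
 * Build:  gcc -Wall -o qlockstat-dump qlockstat-dump.c
 * Run:    sudo ./qlockstat-dump
 */
#include <stdio.h>

#define QSTAT_DIR "/sys/kernel/debug/qlockstat"

static const char * const qstat_files[] = {
	"pv_hash_hops",       "pv_kick_unlock",   "pv_kick_wake",
	"pv_latency_kick",    "pv_latency_wake",  "pv_lock_stealing",
	"pv_spurious_wakeup", "pv_wait_again",    "pv_wait_early",
	"pv_wait_head",       "pv_wait_node",
};

int main(void)
{
	char path[256], line[64];
	unsigned int i;
	FILE *f;

	for (i = 0; i < sizeof(qstat_files) / sizeof(qstat_files[0]); i++) {
		snprintf(path, sizeof(path), QSTAT_DIR "/%s", qstat_files[i]);
		f = fopen(path, "r");	/* files are 0400, root only */
		if (!f) {
			perror(path);
			continue;
		}
		if (fgets(line, sizeof(line), f))
			printf("%-20s %s", qstat_files[i], line);
		fclose(f);
	}

	/* Any write to reset_counters zeroes all per-cpu counters. */
	f = fopen(QSTAT_DIR "/reset_counters", "w");
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}
	return 0;
}

As described in qstat_read() above, the two pv_latency_* files report averages in nanoseconds (divided by the matching kick count), and pv_hash_hops is reported as an X.XX decimal average per hashing operation.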
