diff options
Diffstat (limited to 'kernel/trace/ring_buffer.c')
| -rw-r--r-- | kernel/trace/ring_buffer.c | 78 | 
1 files changed, 55 insertions, 23 deletions
| diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f45fd9d5a45..a6268e09160a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,14 +438,16 @@ enum {  };  /*   * Used for which event context the event is in. - *  NMI     = 0 - *  IRQ     = 1 - *  SOFTIRQ = 2 - *  NORMAL  = 3 + *  TRANSITION = 0 + *  NMI     = 1 + *  IRQ     = 2 + *  SOFTIRQ = 3 + *  NORMAL  = 4   *   * See trace_recursive_lock() comment below for more details.   */  enum { +	RB_CTX_TRANSITION,  	RB_CTX_NMI,  	RB_CTX_IRQ,  	RB_CTX_SOFTIRQ, @@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)   * a bit of overhead in something as critical as function tracing,   * we use a bitmask trick.   * - *  bit 0 =  NMI context - *  bit 1 =  IRQ context - *  bit 2 =  SoftIRQ context - *  bit 3 =  normal context. + *  bit 1 =  NMI context + *  bit 2 =  IRQ context + *  bit 3 =  SoftIRQ context + *  bit 4 =  normal context.   *   * This works because this is the order of contexts that can   * preempt other contexts. A SoftIRQ never preempts an IRQ @@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)   * The least significant bit can be cleared this way, and it   * just so happens that it is the same bit corresponding to   * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. 
Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSITION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between context.   */  static __always_inline int @@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)  		bit = pc & NMI_MASK ? RB_CTX_NMI :  			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; -	if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) -		return 1; +	if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { +		/* +		 * It is possible that this was called by transitioning +		 * between interrupt context, and preempt_count() has not +		 * been updated yet. In this case, use the TRANSITION bit. 
+		 */ +		bit = RB_CTX_TRANSITION; +		if (val & (1 << (bit + cpu_buffer->nest))) +			return 1; +	}  	val |= (1 << (bit + cpu_buffer->nest));  	cpu_buffer->current_context = val; @@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)  		cpu_buffer->current_context - (1 << cpu_buffer->nest);  } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5  /**   * ring_buffer_nest_start - Allow to trace while nested @@ -3200,14 +3234,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  	/* See if we shot pass the end of this buffer page */  	if (unlikely(write > BUF_PAGE_SIZE)) { -		if (tail != w) { -			/* before and after may now different, fix it up*/ -			b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); -			a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); -			if (a_ok && b_ok && info->before != info->after) -				(void)rb_time_cmpxchg(&cpu_buffer->before_stamp, -						      info->before, info->after); -		} +		/* before and after may now different, fix it up*/ +		b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); +		a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); +		if (a_ok && b_ok && info->before != info->after) +			(void)rb_time_cmpxchg(&cpu_buffer->before_stamp, +					      info->before, info->after);  		return rb_move_tail(cpu_buffer, tail, info);  	} @@ -3253,11 +3285,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,  		ts = rb_time_stamp(cpu_buffer->buffer);  		barrier();   /*E*/		if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && -		    info->after < ts) { +		    info->after < ts && +		    rb_time_cmpxchg(&cpu_buffer->write_stamp, +				    info->after, ts)) {  			/* Nothing came after this event between C and E */  			info->delta = ts - info->after; -			(void)rb_time_cmpxchg(&cpu_buffer->write_stamp, -					      info->after, info->ts);  			info->ts = ts;  		} else 
{  			/* | 
