Diffstat (limited to 'arch/x86')
71 files changed, 2244 insertions, 1541 deletions
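The largest functional change below is the rework of the Debug Store (DS) interface in arch/x86/include/asm/ds.h and arch/x86/kernel/ds.c: the ds_request_*() calls no longer take an optional kernel-allocated buffer and return 0/-Eerrno, but instead take a caller-provided buffer plus an interrupt threshold and return an opaque tracer handle (or ERR_PTR() on failure) that all further calls operate on. The following is a minimal sketch of how a client might use the new BTS side of that interface; the caller, function name, and buffer are hypothetical, and per the new ds.h an overflow callback is not yet supported (ovfl must be NULL) while th == (size_t)-1 requests no interrupt threshold:

#include <linux/err.h>
#include <linux/sched.h>
#include <asm/ds.h>

/* Hypothetical example buffer; must be non-pageable per the ds.h comments. */
static unsigned char bts_buf[4096];

static int bts_trace_task(struct task_struct *task)
{
	struct bts_tracer *tracer;
	size_t index, end;
	int error;

	/* Request BTS recording; a tracer handle replaces the old 0/-Eerrno. */
	tracer = ds_request_bts(task, bts_buf, sizeof(bts_buf),
				NULL /* ovfl: not yet implemented */,
				(size_t)-1 /* no interrupt threshold */);
	if (IS_ERR(tracer))
		return PTR_ERR(tracer);

	/* All further accesses go through the handle, not the task. */
	error = ds_get_bts_index(tracer, &index);
	if (!error)
		error = ds_get_bts_end(tracer, &end);

	/* Releasing frees the tracer; the caller still owns bts_buf. */
	ds_release_bts(tracer);
	return error;
}

The same pattern applies to the PEBS variants (ds_request_pebs() and friends), which return a struct pebs_tracer instead.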
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac22bb7719f7..d99eeb7915c6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -29,11 +29,14 @@ config X86  	select HAVE_FTRACE_MCOUNT_RECORD  	select HAVE_DYNAMIC_FTRACE  	select HAVE_FUNCTION_TRACER +	select HAVE_FUNCTION_GRAPH_TRACER +	select HAVE_FUNCTION_TRACE_MCOUNT_TEST  	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)  	select HAVE_ARCH_KGDB if !X86_VOYAGER  	select HAVE_ARCH_TRACEHOOK  	select HAVE_GENERIC_DMA_COHERENT if X86_32  	select HAVE_EFFICIENT_UNALIGNED_ACCESS +	select USER_STACKTRACE_SUPPORT  config ARCH_DEFCONFIG  	string @@ -238,6 +241,16 @@ config X86_HAS_BOOT_CPU_ID  	def_bool y  	depends on X86_VOYAGER +config SPARSE_IRQ +	bool "Support sparse irq numbering" +	depends on (PCI_MSI || HT_IRQ) && SMP +	default y +	help +	  This enables support for sparse irqs, especially for msi/msi-x. You +	  may need it if you have lots of cards supporting msi-x installed. + +	  If you don't know what to do here, say Y. +  config X86_FIND_SMP_CONFIG  	def_bool y  	depends on X86_MPPARSE || X86_VOYAGER @@ -367,10 +380,10 @@ config X86_RDC321X  	  as R-8610-(G).  	  If you don't have one of these chips, you should say N here. -config SCHED_NO_NO_OMIT_FRAME_POINTER +config SCHED_OMIT_FRAME_POINTER  	def_bool y  	prompt "Single-depth WCHAN output" -	depends on X86_32 +	depends on X86  	help  	  Calculate simpler /proc/<PID>/wchan values. If this option  	  is disabled then wchan values will recurse back to the @@ -465,10 +478,6 @@ config X86_CYCLONE_TIMER  	def_bool y  	depends on X86_GENERICARCH -config ES7000_CLUSTERED_APIC -	def_bool y -	depends on SMP && X86_ES7000 && MPENTIUMIII -  source "arch/x86/Kconfig.cpu"  config HPET_TIMER @@ -1632,13 +1641,6 @@ config APM_ALLOW_INTS  	  many of the newer IBM Thinkpads.  If you experience hangs when you  	  suspend, try setting this to Y.  Otherwise, say N. -config APM_REAL_MODE_POWER_OFF -	bool "Use real mode APM BIOS call to power off" -	help -	  Use real mode APM BIOS calls to switch off the computer. This is -	  a work-around for a number of buggy BIOSes. Switch this option on if -	  your computer crashes instead of powering off properly. -  endif # APM  source "arch/x86/kernel/cpu/cpufreq/Kconfig" diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index b815664fe370..85a78575956c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -515,6 +515,7 @@ config CPU_SUP_UMC_32  config X86_DS  	def_bool X86_PTRACE_BTS  	depends on X86_DEBUGCTLMSR +	select HAVE_HW_BRANCH_TRACER  config X86_PTRACE_BTS  	bool "Branch Trace Store" diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2a3dfbd5e677..fa013f529b74 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -186,14 +186,10 @@ config IOMMU_LEAK  	  Add a simple leak tracer to the IOMMU code. This is useful when you  	  are debugging a buggy device driver that leaks IOMMU mappings. -config MMIOTRACE_HOOKS -	bool -  config MMIOTRACE  	bool "Memory mapped IO tracing"  	depends on DEBUG_KERNEL && PCI  	select TRACING -	select MMIOTRACE_HOOKS  	help  	  Mmiotrace traces Memory Mapped I/O access and is meant for  	  debugging and reverse engineering. 
It is called from the ioremap diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 3b1510b4fc57..25caa0738af5 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -193,6 +193,7 @@ extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);  static inline void lapic_shutdown(void) { }  #define local_apic_timer_c2_ok		1  static inline void init_apic_mappings(void) { } +static inline void disable_local_APIC(void) { }  #endif /* !CONFIG_X86_LOCAL_APIC */ diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h index 1d9543b9d358..ce547f24a1cd 100644 --- a/arch/x86/include/asm/bigsmp/apic.h +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -24,8 +24,6 @@ static inline cpumask_t target_cpus(void)  #define INT_DELIVERY_MODE	(dest_Fixed)  #define INT_DEST_MODE		(0)    /* phys delivery to target proc */  #define NO_BALANCE_IRQ		(0) -#define WAKE_SECONDARY_VIA_INIT -  static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)  { diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a95008457ea4..99b6c39774a4 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -7,13 +7,12 @@   *   * It manages:   * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done)   * - buffer access   *   * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks   *   *   * Copyright (C) 2007-2008 Intel Corporation. @@ -26,11 +25,18 @@  #include <linux/types.h>  #include <linux/init.h> +#include <linux/err.h>  #ifdef CONFIG_X86_DS  struct task_struct; +struct ds_tracer; +struct bts_tracer; +struct pebs_tracer; + +typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); +typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *);  /*   * Request BTS or PEBS @@ -38,60 +44,62 @@ struct task_struct;   * Due to alignment constraints, the actual buffer may be slightly   * smaller than the requested or provided buffer.   * - * Returns 0 on success; -Eerrno otherwise + * Returns a pointer to a tracer structure on success, or + * ERR_PTR(errcode) on failure. + * + * The interrupt threshold is independent of the overflow callback + * to allow users to use their own overflow interrupt handling mechanism.   *   * task: the task to request recording for;   *       NULL for per-cpu recording on the current cpu   * base: the base pointer for the (non-pageable) buffer; - *       NULL if buffer allocation requested - * size: the size of the requested or provided buffer + * size: the size of the provided buffer in bytes   * ovfl: pointer to a function to be called on buffer overflow;   *       NULL if cyclic buffer requested + * th: the interrupt threshold in records from the end of the buffer; + *     -1 if no interrupt threshold is requested.   
*/ -typedef void (*ds_ovfl_callback_t)(struct task_struct *); -extern int ds_request_bts(struct task_struct *task, void *base, size_t size, -			  ds_ovfl_callback_t ovfl); -extern int ds_request_pebs(struct task_struct *task, void *base, size_t size, -			   ds_ovfl_callback_t ovfl); +extern struct bts_tracer *ds_request_bts(struct task_struct *task, +					 void *base, size_t size, +					 bts_ovfl_callback_t ovfl, size_t th); +extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, +					   void *base, size_t size, +					   pebs_ovfl_callback_t ovfl, +					   size_t th);  /*   * Release BTS or PEBS resources   * - * Frees buffers allocated on ds_request. - *   * Returns 0 on success; -Eerrno otherwise   * - * task: the task to release resources for; - *       NULL to release resources for the current cpu + * tracer: the tracer handle returned from ds_request_~()   */ -extern int ds_release_bts(struct task_struct *task); -extern int ds_release_pebs(struct task_struct *task); +extern int ds_release_bts(struct bts_tracer *tracer); +extern int ds_release_pebs(struct pebs_tracer *tracer);  /* - * Return the (array) index of the write pointer. + * Get the (array) index of the write pointer.   * (assuming an array of BTS/PEBS records)   * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result   */ -extern int ds_get_bts_index(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_index(struct task_struct *task, size_t *pos); +extern int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos);  /* - * Return the (array) index one record beyond the end of the array. + * Get the (array) index one record beyond the end of the array.   * (assuming an array of BTS/PEBS records)   * - * Returns -Eerrno on error + * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu - * pos (out): if not NULL, will hold the result + * tracer: the tracer handle returned from ds_request_~() + * pos (out): will hold the result   */ -extern int ds_get_bts_end(struct task_struct *task, size_t *pos); -extern int ds_get_pebs_end(struct task_struct *task, size_t *pos); +extern int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos); +extern int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos);  /*   * Provide a pointer to the BTS/PEBS record at parameter index. @@ -102,14 +110,13 @@ extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);   *   * Returns the size of a single record on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   * index: the index of the requested record   * record (out): pointer to the requested record   */ -extern int ds_access_bts(struct task_struct *task, +extern int ds_access_bts(struct bts_tracer *tracer,  			 size_t index, const void **record); -extern int ds_access_pebs(struct task_struct *task, +extern int ds_access_pebs(struct pebs_tracer *tracer,  			  size_t index, const void **record);  /* @@ -129,38 +136,24 @@ extern int ds_access_pebs(struct task_struct *task,   *   * Returns the number of bytes written or -Eerrno.   
* - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   * buffer: the buffer to write   * size: the size of the buffer   */ -extern int ds_write_bts(struct task_struct *task, +extern int ds_write_bts(struct bts_tracer *tracer,  			const void *buffer, size_t size); -extern int ds_write_pebs(struct task_struct *task, +extern int ds_write_pebs(struct pebs_tracer *tracer,  			 const void *buffer, size_t size);  /* - * Same as ds_write_bts/pebs, but omit ownership checks. - * - * This is needed to have some other task than the owner of the - * BTS/PEBS buffer or the parameter task itself write into the - * respective buffer. - */ -extern int ds_unchecked_write_bts(struct task_struct *task, -				  const void *buffer, size_t size); -extern int ds_unchecked_write_pebs(struct task_struct *task, -				   const void *buffer, size_t size); - -/*   * Reset the write pointer of the BTS/PEBS buffer.   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   */ -extern int ds_reset_bts(struct task_struct *task); -extern int ds_reset_pebs(struct task_struct *task); +extern int ds_reset_bts(struct bts_tracer *tracer); +extern int ds_reset_pebs(struct pebs_tracer *tracer);  /*   * Clear the BTS/PEBS buffer and reset the write pointer. @@ -168,33 +161,30 @@ extern int ds_reset_pebs(struct task_struct *task);   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_~()   */ -extern int ds_clear_bts(struct task_struct *task); -extern int ds_clear_pebs(struct task_struct *task); +extern int ds_clear_bts(struct bts_tracer *tracer); +extern int ds_clear_pebs(struct pebs_tracer *tracer);  /*   * Provide the PEBS counter reset value.   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs()   * value (out): the counter reset value   */ -extern int ds_get_pebs_reset(struct task_struct *task, u64 *value); +extern int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value);  /*   * Set the PEBS counter reset value.   *   * Returns 0 on success; -Eerrno on error   * - * task: the task to access; - *       NULL to access the current cpu + * tracer: the tracer handle returned from ds_request_pebs()   * value: the new counter reset value   */ -extern int ds_set_pebs_reset(struct task_struct *task, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);  /*   * Initialization @@ -207,17 +197,13 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);  /*   * The DS context - part of struct thread_struct.   
*/ +#define MAX_SIZEOF_DS (12 * 8) +  struct ds_context {  	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ -	unsigned char *ds; +	unsigned char ds[MAX_SIZEOF_DS];  	/* the owner of the BTS and PEBS configuration, respectively */ -	struct task_struct *owner[2]; -	/* buffer overflow notification function for BTS and PEBS */ -	ds_ovfl_callback_t callback[2]; -	/* the original buffer address */ -	void *buffer[2]; -	/* the number of allocated pages for on-request allocated buffers */ -	unsigned int pages[2]; +	struct ds_tracer  *owner[2];  	/* use count */  	unsigned long count;  	/* a pointer to the context location inside the thread_struct diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h index 94826cf87455..cc70c1c78ca4 100644 --- a/arch/x86/include/asm/emergency-restart.h +++ b/arch/x86/include/asm/emergency-restart.h @@ -8,7 +8,9 @@ enum reboot_type {  	BOOT_BIOS = 'b',  #endif  	BOOT_ACPI = 'a', -	BOOT_EFI = 'e' +	BOOT_EFI = 'e', +	BOOT_CF9 = 'p', +	BOOT_CF9_COND = 'q',  };  extern enum reboot_type reboot_type; diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h index 380f0b4f17ed..e24ef876915f 100644 --- a/arch/x86/include/asm/es7000/apic.h +++ b/arch/x86/include/asm/es7000/apic.h @@ -9,31 +9,27 @@ static inline int apic_id_registered(void)  	        return (1);  } -static inline cpumask_t target_cpus(void) +static inline cpumask_t target_cpus_cluster(void)  { -#if defined CONFIG_ES7000_CLUSTERED_APIC  	return CPU_MASK_ALL; -#else +} + +static inline cpumask_t target_cpus(void) +{  	return cpumask_of_cpu(smp_processor_id()); -#endif  } -#if defined CONFIG_ES7000_CLUSTERED_APIC -#define APIC_DFR_VALUE		(APIC_DFR_CLUSTER) -#define INT_DELIVERY_MODE	(dest_LowestPrio) -#define INT_DEST_MODE		(1)    /* logical delivery broadcast to all procs */ -#define NO_BALANCE_IRQ		(1) -#undef  WAKE_SECONDARY_VIA_INIT -#define WAKE_SECONDARY_VIA_MIP -#else +#define APIC_DFR_VALUE_CLUSTER		(APIC_DFR_CLUSTER) +#define INT_DELIVERY_MODE_CLUSTER	(dest_LowestPrio) +#define INT_DEST_MODE_CLUSTER		(1) /* logical delivery broadcast to all procs */ +#define NO_BALANCE_IRQ_CLUSTER		(1) +  #define APIC_DFR_VALUE		(APIC_DFR_FLAT)  #define INT_DELIVERY_MODE	(dest_Fixed)  #define INT_DEST_MODE		(0)    /* phys delivery to target procs */  #define NO_BALANCE_IRQ		(0)  #undef  APIC_DEST_LOGICAL  #define APIC_DEST_LOGICAL	0x0 -#define WAKE_SECONDARY_VIA_INIT -#endif  static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)  { @@ -60,6 +56,16 @@ static inline unsigned long calculate_ldr(int cpu)   * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel   * document number 292116).  So here it goes...   
*/ +static inline void init_apic_ldr_cluster(void) +{ +	unsigned long val; +	int cpu = smp_processor_id(); + +	apic_write(APIC_DFR, APIC_DFR_VALUE_CLUSTER); +	val = calculate_ldr(cpu); +	apic_write(APIC_LDR, val); +} +  static inline void init_apic_ldr(void)  {  	unsigned long val; @@ -70,10 +76,6 @@ static inline void init_apic_ldr(void)  	apic_write(APIC_LDR, val);  } -#ifndef CONFIG_X86_GENERICARCH -extern void enable_apic_mode(void); -#endif -  extern int apic_version [MAX_APICS];  static inline void setup_apic_routing(void)  { @@ -144,7 +146,7 @@ static inline int check_phys_apicid_present(int cpu_physical_apicid)  	return (1);  } -static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +static inline unsigned int cpu_mask_to_apicid_cluster(cpumask_t cpumask)  {  	int num_bits_set;  	int cpus_found = 0; @@ -154,11 +156,7 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)  	num_bits_set = cpus_weight(cpumask);  	/* Return id to all */  	if (num_bits_set == NR_CPUS) -#if defined CONFIG_ES7000_CLUSTERED_APIC  		return 0xFF; -#else -		return cpu_to_logical_apicid(0); -#endif  	/*  	 * The cpus in the mask must all be on the apic cluster.  If are not  	 * on the same apicid cluster return default value of TARGET_CPUS. @@ -171,11 +169,40 @@ static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)  			if (apicid_cluster(apicid) !=  					apicid_cluster(new_apicid)){  				printk ("%s: Not a valid mask!\n", __func__); -#if defined CONFIG_ES7000_CLUSTERED_APIC  				return 0xFF; -#else +			} +			apicid = new_apicid; +			cpus_found++; +		} +		cpu++; +	} +	return apicid; +} + +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ +	int num_bits_set; +	int cpus_found = 0; +	int cpu; +	int apicid; + +	num_bits_set = cpus_weight(cpumask); +	/* Return id to all */ +	if (num_bits_set == NR_CPUS) +		return cpu_to_logical_apicid(0); +	/* +	 * The cpus in the mask must all be on the apic cluster.  If are not +	 * on the same apicid cluster return default value of TARGET_CPUS. +	 */ +	cpu = first_cpu(cpumask); +	apicid = cpu_to_logical_apicid(cpu); +	while (cpus_found < num_bits_set) { +		if (cpu_isset(cpu, cpumask)) { +			int new_apicid = cpu_to_logical_apicid(cpu); +			if (apicid_cluster(apicid) != +					apicid_cluster(new_apicid)){ +				printk ("%s: Not a valid mask!\n", __func__);  				return cpu_to_logical_apicid(0); -#endif  			}  			apicid = new_apicid;  			cpus_found++; diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h index 398493461913..78f0daaee436 100644 --- a/arch/x86/include/asm/es7000/wakecpu.h +++ b/arch/x86/include/asm/es7000/wakecpu.h @@ -1,36 +1,12 @@  #ifndef __ASM_ES7000_WAKECPU_H  #define __ASM_ES7000_WAKECPU_H -/* - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. 
- */ - -#ifdef CONFIG_ES7000_CLUSTERED_APIC -#define WAKE_SECONDARY_VIA_MIP -#else -#define WAKE_SECONDARY_VIA_INIT -#endif - -#ifdef WAKE_SECONDARY_VIA_MIP -extern int es7000_start_cpu(int cpu, unsigned long eip); -static inline int -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) -{ -	int boot_error = 0; -	boot_error = es7000_start_cpu(phys_apicid, start_eip); -	return boot_error; -} -#endif - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW	0x467 +#define TRAMPOLINE_PHYS_HIGH	0x469  static inline void wait_for_init_deassert(atomic_t *deassert)  { -#ifdef WAKE_SECONDARY_VIA_INIT +#ifndef CONFIG_ES7000_CLUSTERED_APIC  	while (!atomic_read(deassert))  		cpu_relax();  #endif @@ -50,9 +26,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)  {  } -#define inquire_remote_apic(apicid) do {		\ -		if (apic_verbosity >= APIC_DEBUG)	\ -			__inquire_remote_apic(apicid);	\ -	} while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ +	if (apic_verbosity >= APIC_DEBUG) +		__inquire_remote_apic(apicid); +}  #endif /* __ASM_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 9e8bc29b8b17..7e61b4ceb9a4 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -17,8 +17,40 @@ static inline unsigned long ftrace_call_adjust(unsigned long addr)  	 */  	return addr - 1;  } -#endif +#ifdef CONFIG_DYNAMIC_FTRACE + +struct dyn_arch_ftrace { +	/* No extra data needed for x86 */ +}; + +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* __ASSEMBLY__ */  #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifndef __ASSEMBLY__ + +/* + * Stack of return addresses for functions + * of a thread. + * Used in struct thread_info + */ +struct ftrace_ret_stack { +	unsigned long ret; +	unsigned long func; +	unsigned long long calltime; +}; + +/* + * Primary handler of a function return. + * It relies on ftrace_return_to_handler. + * Defined in entry_32.S + */ +extern void return_to_handler(void); + +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +  #endif /* _ASM_X86_FTRACE_H */ diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h index 5cbd4fcc06fd..0ac17d33a8c7 100644 --- a/arch/x86/include/asm/genapic_32.h +++ b/arch/x86/include/asm/genapic_32.h @@ -2,6 +2,7 @@  #define _ASM_X86_GENAPIC_32_H  #include <asm/mpspec.h> +#include <asm/atomic.h>  /*   * Generic APIC driver interface. 
@@ -65,6 +66,14 @@ struct genapic {  	void (*send_IPI_allbutself)(int vector);  	void (*send_IPI_all)(int vector);  #endif +	int (*wakeup_cpu)(int apicid, unsigned long start_eip); +	int trampoline_phys_low; +	int trampoline_phys_high; +	void (*wait_for_init_deassert)(atomic_t *deassert); +	void (*smp_callin_clear_local_apic)(void); +	void (*store_NMI_vector)(unsigned short *high, unsigned short *low); +	void (*restore_NMI_vector)(unsigned short *high, unsigned short *low); +	void (*inquire_remote_apic)(int apicid);  };  #define APICFUNC(x) .x = x, @@ -105,16 +114,24 @@ struct genapic {  	APICFUNC(get_apic_id)				\  	.apic_id_mask = APIC_ID_MASK,			\  	APICFUNC(cpu_mask_to_apicid)			\ -	APICFUNC(vector_allocation_domain)			\ +	APICFUNC(vector_allocation_domain)		\  	APICFUNC(acpi_madt_oem_check)			\  	IPIFUNC(send_IPI_mask)				\  	IPIFUNC(send_IPI_allbutself)			\  	IPIFUNC(send_IPI_all)				\  	APICFUNC(enable_apic_mode)			\  	APICFUNC(phys_pkg_id)				\ +	.trampoline_phys_low = TRAMPOLINE_PHYS_LOW,		\ +	.trampoline_phys_high = TRAMPOLINE_PHYS_HIGH,		\ +	APICFUNC(wait_for_init_deassert)		\ +	APICFUNC(smp_callin_clear_local_apic)		\ +	APICFUNC(store_NMI_vector)			\ +	APICFUNC(restore_NMI_vector)			\ +	APICFUNC(inquire_remote_apic)			\  }  extern struct genapic *genapic; +extern void es7000_update_genapic_to_cluster(void);  enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};  #define get_uv_system_type()		UV_NONE diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h index 13c4e96199ea..2cae011668b7 100644 --- a/arch/x86/include/asm/genapic_64.h +++ b/arch/x86/include/asm/genapic_64.h @@ -32,6 +32,8 @@ struct genapic {  	unsigned int (*get_apic_id)(unsigned long x);  	unsigned long (*set_apic_id)(unsigned int id);  	unsigned long apic_id_mask; +	/* wakeup_secondary_cpu */ +	int (*wakeup_cpu)(int apicid, unsigned long start_eip);  };  extern struct genapic *genapic; diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index 6afd9933a7dd..25d527ca1362 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -188,17 +188,14 @@ extern void restore_IO_APIC_setup(void);  extern void reinit_intr_remapped_IO_APIC(int);  #endif -extern int probe_nr_irqs(void); +extern void probe_nr_irqs_gsi(void);  #else  /* !CONFIG_X86_IO_APIC */  #define io_apic_assign_pci_irqs 0  static const int timer_through_8259 = 0; -static inline void ioapic_init_mappings(void) { } +static inline void ioapic_init_mappings(void)	{ } -static inline int probe_nr_irqs(void) -{ -	return NR_IRQS; -} +static inline void probe_nr_irqs_gsi(void)	{ }  #endif  #endif /* _ASM_X86_IO_APIC_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f941..f7ff65032b9d 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -101,12 +101,23 @@  #define LAST_VM86_IRQ		15  #define invalid_vm86_irq(irq)	((irq) < 3 || (irq) > 15) +#define NR_IRQS_LEGACY		16 +  #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) + +#ifndef CONFIG_SPARSE_IRQ  # if NR_CPUS < MAX_IO_APICS  #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))  # else  #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))  # endif +#else +# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) +#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) +# else +#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) +# endif +#endif  #elif defined(CONFIG_X86_VOYAGER) diff --git a/arch/x86/include/asm/mach-default/mach_apic.h 
b/arch/x86/include/asm/mach-default/mach_apic.h index ff3a6c236c00..6cb3a467e067 100644 --- a/arch/x86/include/asm/mach-default/mach_apic.h +++ b/arch/x86/include/asm/mach-default/mach_apic.h @@ -32,11 +32,13 @@ static inline cpumask_t target_cpus(void)  #define vector_allocation_domain    (genapic->vector_allocation_domain)  #define read_apic_id()  (GET_APIC_ID(apic_read(APIC_ID)))  #define send_IPI_self (genapic->send_IPI_self) +#define wakeup_secondary_cpu (genapic->wakeup_cpu)  extern void setup_apic_routing(void);  #else  #define INT_DELIVERY_MODE dest_LowestPrio  #define INT_DEST_MODE 1     /* logical delivery broadcast to all procs */  #define TARGET_CPUS (target_cpus()) +#define wakeup_secondary_cpu wakeup_secondary_cpu_via_init  /*   * Set up the logical destination ID.   * diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h index 9d80db91e992..ceb013660146 100644 --- a/arch/x86/include/asm/mach-default/mach_wakecpu.h +++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h @@ -1,17 +1,8 @@  #ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H  #define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H -/*  - * This file copes with machines that wakeup secondary CPUs by the - * INIT, INIT, STARTUP sequence. - */ - -#define WAKE_SECONDARY_VIA_INIT - -#define TRAMPOLINE_LOW phys_to_virt(0x467) -#define TRAMPOLINE_HIGH phys_to_virt(0x469) - -#define boot_cpu_apicid boot_cpu_physical_apicid +#define TRAMPOLINE_PHYS_LOW (0x467) +#define TRAMPOLINE_PHYS_HIGH (0x469)  static inline void wait_for_init_deassert(atomic_t *deassert)  { @@ -33,9 +24,12 @@ static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)  {  } -#define inquire_remote_apic(apicid) do {		\ -		if (apic_verbosity >= APIC_DEBUG)	\ -			__inquire_remote_apic(apicid);	\ -	} while (0) +extern void __inquire_remote_apic(int apicid); + +static inline void inquire_remote_apic(int apicid) +{ +	if (apic_verbosity >= APIC_DEBUG) +		__inquire_remote_apic(apicid); +}  #endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h index dbab36d64d48..23bf52103b89 100644 --- a/arch/x86/include/asm/mach-default/smpboot_hooks.h +++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h @@ -13,9 +13,11 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)  	CMOS_WRITE(0xa, 0xf);  	local_flush_tlb();  	pr_debug("1.\n"); -	*((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; +	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = +								 start_eip >> 4;  	pr_debug("2.\n"); -	*((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; +	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = +							 start_eip & 0xf;  	pr_debug("3.\n");  } @@ -32,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)  	 */  	CMOS_WRITE(0, 0xf); -	*((volatile long *) phys_to_virt(0x467)) = 0; +	*((volatile long *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;  }  static inline void __init smpboot_setup_io_apic(void) diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h index 5180bd7478fb..e430f47df667 100644 --- a/arch/x86/include/asm/mach-generic/mach_apic.h +++ b/arch/x86/include/asm/mach-generic/mach_apic.h @@ -27,6 +27,7 @@  #define vector_allocation_domain (genapic->vector_allocation_domain)  #define enable_apic_mode (genapic->enable_apic_mode)  #define phys_pkg_id 
(genapic->phys_pkg_id) +#define wakeup_secondary_cpu (genapic->wakeup_cpu)  extern void generic_bigsmp_probe(void); diff --git a/arch/x86/include/asm/mach-generic/mach_wakecpu.h b/arch/x86/include/asm/mach-generic/mach_wakecpu.h new file mode 100644 index 000000000000..1ab16b168c8a --- /dev/null +++ b/arch/x86/include/asm/mach-generic/mach_wakecpu.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H +#define _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H + +#define TRAMPOLINE_PHYS_LOW (genapic->trampoline_phys_low) +#define TRAMPOLINE_PHYS_HIGH (genapic->trampoline_phys_high) +#define wait_for_init_deassert (genapic->wait_for_init_deassert) +#define smp_callin_clear_local_apic (genapic->smp_callin_clear_local_apic) +#define store_NMI_vector (genapic->store_NMI_vector) +#define restore_NMI_vector (genapic->restore_NMI_vector) +#define inquire_remote_apic (genapic->inquire_remote_apic) + +#endif /* _ASM_X86_MACH_GENERIC_MACH_WAKECPU_H */ diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h index c577bda5b1c5..6f499df8eddb 100644 --- a/arch/x86/include/asm/numaq/wakecpu.h +++ b/arch/x86/include/asm/numaq/wakecpu.h @@ -3,12 +3,8 @@  /* This file copes with machines that wakeup secondary CPUs by NMIs */ -#define WAKE_SECONDARY_VIA_NMI - -#define TRAMPOLINE_LOW phys_to_virt(0x8) -#define TRAMPOLINE_HIGH phys_to_virt(0xa) - -#define boot_cpu_apicid boot_cpu_logical_apicid +#define TRAMPOLINE_PHYS_LOW (0x8) +#define TRAMPOLINE_PHYS_HIGH (0xa)  /* We don't do anything here because we use NMI's to boot instead */  static inline void wait_for_init_deassert(atomic_t *deassert) @@ -27,17 +23,23 @@ static inline void smp_callin_clear_local_apic(void)  static inline void store_NMI_vector(unsigned short *high, unsigned short *low)  {  	printk("Storing NMI vector\n"); -	*high = *((volatile unsigned short *) TRAMPOLINE_HIGH); -	*low = *((volatile unsigned short *) TRAMPOLINE_LOW); +	*high = +	  *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)); +	*low = +	  *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW));  }  static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)  {  	printk("Restoring NMI vector\n"); -	*((volatile unsigned short *) TRAMPOLINE_HIGH) = *high; -	*((volatile unsigned short *) TRAMPOLINE_LOW) = *low; +	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = +								 *high; +	*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = +								 *low;  } -#define inquire_remote_apic(apicid) {} +static inline void inquire_remote_apic(int apicid) +{ +}  #endif /* __ASM_NUMAQ_WAKECPU_H */ diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index f12d37237465..294daeb3a006 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -16,6 +16,8 @@ static inline void visws_early_detect(void) { }  static inline int is_visws_box(void) { return 0; }  #endif +extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); +extern int wakeup_secondary_cpu_via_init(int apicid, unsigned long start_eip);  /*   * Any setup quirks to be performed?   
*/ @@ -39,6 +41,7 @@ struct x86_quirks {  	void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,                                      unsigned short oemsize);  	int (*setup_ioapic_ids)(void); +	int (*update_genapic)(void);  };  extern struct x86_quirks *x86_quirks; diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h index 2ed3f0f44ff7..07c3e4048991 100644 --- a/arch/x86/include/asm/system.h +++ b/arch/x86/include/asm/system.h @@ -314,6 +314,8 @@ extern void free_init_pages(char *what, unsigned long begin, unsigned long end);  void default_idle(void); +void stop_this_cpu(void *dummy); +  /*   * Force strict CPU ordering.   * And yes, this is required on UP too when we're talking diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad2..0921b4018c11 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -20,6 +20,8 @@  struct task_struct;  struct exec_domain;  #include <asm/processor.h> +#include <asm/ftrace.h> +#include <asm/atomic.h>  struct thread_info {  	struct task_struct	*task;		/* main task structure */ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 35c54921b2e4..99192bb55a53 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -157,6 +157,7 @@ extern int __get_user_bad(void);  	int __ret_gu;							\  	unsigned long __val_gu;						\  	__chk_user_ptr(ptr);						\ +	might_fault();							\  	switch (sizeof(*(ptr))) {					\  	case 1:								\  		__get_user_x(1, __ret_gu, __val_gu, ptr);		\ @@ -241,6 +242,7 @@ extern void __put_user_8(void);  	int __ret_pu;						\  	__typeof__(*(ptr)) __pu_val;				\  	__chk_user_ptr(ptr);					\ +	might_fault();						\  	__pu_val = x;						\  	switch (sizeof(*(ptr))) {				\  	case 1:							\ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index d095a3aeea1b..5e06259e90e5 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -82,8 +82,8 @@ __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)  static __always_inline unsigned long __must_check  __copy_to_user(void __user *to, const void *from, unsigned long n)  { -       might_sleep(); -       return __copy_to_user_inatomic(to, from, n); +	might_fault(); +	return __copy_to_user_inatomic(to, from, n);  }  static __always_inline unsigned long @@ -137,7 +137,7 @@ __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)  static __always_inline unsigned long  __copy_from_user(void *to, const void __user *from, unsigned long n)  { -	might_sleep(); +	might_fault();  	if (__builtin_constant_p(n)) {  		unsigned long ret; @@ -159,7 +159,7 @@ __copy_from_user(void *to, const void __user *from, unsigned long n)  static __always_inline unsigned long __copy_from_user_nocache(void *to,  				const void __user *from, unsigned long n)  { -	might_sleep(); +	might_fault();  	if (__builtin_constant_p(n)) {  		unsigned long ret; diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index f8cfd00db450..84210c479fca 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -29,6 +29,8 @@ static __always_inline __must_check  int __copy_from_user(void *dst, const void __user *src, unsigned size)  {  	int ret = 0; + +	might_fault();  	if (!__builtin_constant_p(size))  		return copy_user_generic(dst, (__force void *)src, size);  	switch (size) { @@ -71,6 +73,8 @@ static __always_inline 
__must_check  int __copy_to_user(void __user *dst, const void *src, unsigned size)  {  	int ret = 0; + +	might_fault();  	if (!__builtin_constant_p(size))  		return copy_user_generic((__force void *)dst, src, size);  	switch (size) { @@ -113,6 +117,8 @@ static __always_inline __must_check  int __copy_in_user(void __user *dst, const void __user *src, unsigned size)  {  	int ret = 0; + +	might_fault();  	if (!__builtin_constant_p(size))  		return copy_user_generic((__force void *)dst,  					 (__force void *)src, size); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index b62a7667828e..1cad9318d217 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,7 @@ CFLAGS_tsc.o		:= $(nostackp)  obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o  obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y			+= time_$(BITS).o ioport.o ldt.o +obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o  obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o  obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o  obj-$(CONFIG_X86_32)	+= probe_roms_32.o @@ -65,6 +65,7 @@ obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o  obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o  obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o  obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o  obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o  obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o  obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4c51a2f8fd31..65d0b72777ea 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1360,6 +1360,17 @@ static void __init acpi_process_madt(void)  			disable_acpi();  		}  	} + +	/* +	 * ACPI supports both logical (e.g. Hyper-Threading) and physical +	 * processors, where MPS only supports physical. 
+	 */ +	if (acpi_lapic && acpi_ioapic) +		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " +		       "information\n"); +	else if (acpi_lapic) +		printk(KERN_INFO "Using ACPI for processor (LAPIC) " +		       "configuration information\n");  #endif  	return;  } diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5145a6e72bbb..3a26525a3f31 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -391,11 +391,7 @@ static int power_off;  #else  static int power_off = 1;  #endif -#ifdef CONFIG_APM_REAL_MODE_POWER_OFF -static int realmode_power_off = 1; -#else  static int realmode_power_off; -#endif  #ifdef CONFIG_APM_ALLOW_INTS  static int allow_ints = 1;  #else diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 8e48c5d4467d..88ea02dcb622 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,6 +33,7 @@  #include <linux/cpufreq.h>  #include <linux/compiler.h>  #include <linux/dmi.h> +#include <linux/ftrace.h>  #include <linux/acpi.h>  #include <acpi/processor.h> @@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,  	unsigned int next_perf_state = 0; /* Index into perf table */  	unsigned int i;  	int result = 0; +	struct power_trace it;  	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,  		}  	} +	trace_power_mark(&it, POWER_PSTATE, next_perf_state); +  	switch (data->cpu_feature) {  	case SYSTEM_INTEL_MSR_CAPABLE:  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cce0b6118d55..816f27f289b1 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -307,12 +307,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)  		set_cpu_cap(c, X86_FEATURE_P4);  	if (c->x86 == 6)  		set_cpu_cap(c, X86_FEATURE_P3); +#endif  	if (cpu_has_bts)  		ptrace_bts_init_intel(c); -#endif -  	detect_extended_topology(c);  	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {  		/* diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index a2d1176c38ee..19a8c2c0389f 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -7,13 +7,12 @@   *   * It manages:   * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - buffer overflow handling (to be done)   * - buffer access   *   * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * - get_task_struct on all traced tasks + * - current is allowed to trace tasks   *   *   * Copyright (C) 2007-2008 Intel Corporation. @@ -28,6 +27,7 @@  #include <linux/slab.h>  #include <linux/sched.h>  #include <linux/mm.h> +#include <linux/kernel.h>  /* @@ -44,6 +44,33 @@ struct ds_configuration {  };  static struct ds_configuration ds_cfg; +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. 
+ */ +struct ds_tracer { +	/* the DS context (partially) owned by this tracer */ +	struct ds_context *context; +	/* the buffer provided on ds_request() and its size in bytes */ +	void *buffer; +	size_t size; +}; + +struct bts_tracer { +	/* the common DS part */ +	struct ds_tracer ds; +	/* buffer overflow notification function */ +	bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { +	/* the common DS part */ +	struct ds_tracer ds; +	/* buffer overflow notification function */ +	pebs_ovfl_callback_t ovfl; +};  /*   * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -107,34 +134,13 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,  	(*(unsigned long *)base) = value;  } +#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */ -/* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. - */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);  /* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. + * Locking is done only for allocating BTS or PEBS resources.   */ -static inline int ds_validate_access(struct ds_context *context, -				     enum ds_qualifier qual) -{ -	if (!context) -		return -EPERM; - -	if (context->owner[qual] == current) -		return 0; - -	return -EPERM; -} +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);  /* @@ -183,51 +189,13 @@ static inline int check_tracer(struct task_struct *task)   *   * Contexts are use-counted. They are allocated on first access and   * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - *   requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - *   non-existing context indicates an error. It acquires and releases - *   the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size.   */  static DEFINE_PER_CPU(struct ds_context *, system_context);  #define this_system_context per_cpu(system_context, smp_processor_id()) -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. - */  static inline struct ds_context *ds_get_context(struct task_struct *task)  { -	struct ds_context *context; -	unsigned long irq; - -	spin_lock_irqsave(&ds_lock, irq); - -	context = (task ? task->thread.ds_ctx : this_system_context); -	if (context) -		context->count++; - -	spin_unlock_irqrestore(&ds_lock, irq); - -	return context; -} - -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) -{  	struct ds_context **p_context =  		(task ? 
&task->thread.ds_ctx : &this_system_context);  	struct ds_context *context = *p_context; @@ -238,16 +206,9 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)  		if (!context)  			return NULL; -		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); -		if (!context->ds) { -			kfree(context); -			return NULL; -		} -  		spin_lock_irqsave(&ds_lock, irq);  		if (*p_context) { -			kfree(context->ds);  			kfree(context);  			context = *p_context; @@ -272,10 +233,6 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task)  	return context;  } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */  static inline void ds_put_context(struct ds_context *context)  {  	unsigned long irq; @@ -296,13 +253,6 @@ static inline void ds_put_context(struct ds_context *context)  	if (!context->task || (context->task == current))  		wrmsrl(MSR_IA32_DS_AREA, 0); -	put_tracer(context->task); - -	/* free any leftover buffers from tracers that did not -	 * deallocate them properly. */ -	kfree(context->buffer[ds_bts]); -	kfree(context->buffer[ds_pebs]); -	kfree(context->ds);  	kfree(context);   out:  	spin_unlock_irqrestore(&ds_lock, irq); @@ -312,345 +262,342 @@ static inline void ds_put_context(struct ds_context *context)  /*   * Handle a buffer overflow   * - * task: the task whose buffers are overflowing; - *       NULL for a buffer overflow on the current cpu   * context: the ds context   * qual: the buffer type   */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, -			enum ds_qualifier qual) +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)  { -	if (!context) -		return; - -	if (context->callback[qual]) -		(*context->callback[qual])(task); - -	/* todo: do some more overflow handling */ +	switch (qual) { +	case ds_bts: { +		struct bts_tracer *tracer = +			container_of(context->owner[qual], +				     struct bts_tracer, ds); +		if (tracer->ovfl) +			tracer->ovfl(tracer); +	} +		break; +	case ds_pebs: { +		struct pebs_tracer *tracer = +			container_of(context->owner[qual], +				     struct pebs_tracer, ds); +		if (tracer->ovfl) +			tracer->ovfl(tracer); +	} +		break; +	}  } -/* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. - * - * Returns the buffer, if successful; - *         NULL, if out of memory or rlimit exceeded. - * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved - */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) +static void ds_install_ds_config(struct ds_context *context, +				 enum ds_qualifier qual, +				 void *base, size_t size, size_t ith)  { -	unsigned long rlim, vm, pgsz; -	void *buffer; - -	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - -	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; -	vm   = current->mm->total_vm  + pgsz; -	if (rlim < vm) -		return NULL; +	unsigned long buffer, adj; -	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; -	vm   = current->mm->locked_vm  + pgsz; -	if (rlim < vm) -		return NULL; +	/* adjust the buffer address and size to meet alignment +	 * constraints: +	 * - buffer is double-word aligned +	 * - size is multiple of record size +	 * +	 * We checked the size at the very beginning; we have enough +	 * space to do the adjustment. 
+	 */ +	buffer = (unsigned long)base; -	buffer = kzalloc(size, GFP_KERNEL); -	if (!buffer) -		return NULL; +	adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; +	buffer += adj; +	size   -= adj; -	current->mm->total_vm  += pgsz; -	current->mm->locked_vm += pgsz; +	size /= ds_cfg.sizeof_rec[qual]; +	size *= ds_cfg.sizeof_rec[qual]; -	if (pages) -		*pages = pgsz; +	ds_set(context->ds, qual, ds_buffer_base, buffer); +	ds_set(context->ds, qual, ds_index, buffer); +	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); -	return buffer; +	/* The value for 'no threshold' is -1, which will set the +	 * threshold outside of the buffer, just like we want it. +	 */ +	ds_set(context->ds, qual, +	       ds_interrupt_threshold, buffer + size - ith);  } -static int ds_request(struct task_struct *task, void *base, size_t size, -		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual) +static int ds_request(struct ds_tracer *tracer, enum ds_qualifier qual, +		      struct task_struct *task, +		      void *base, size_t size, size_t th)  {  	struct ds_context *context; -	unsigned long buffer, adj; -	const unsigned long alignment = (1 << 3);  	unsigned long irq; -	int error = 0; +	int error; +	error = -EOPNOTSUPP;  	if (!ds_cfg.sizeof_ds) -		return -EOPNOTSUPP; +		goto out; + +	error = -EINVAL; +	if (!base) +		goto out;  	/* we require some space to do alignment adjustments below */ -	if (size < (alignment + ds_cfg.sizeof_rec[qual])) -		return -EINVAL; +	error = -EINVAL; +	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) +		goto out; -	/* buffer overflow notification is not yet implemented */ -	if (ovfl) -		return -EOPNOTSUPP; +	if (th != (size_t)-1) { +		th *= ds_cfg.sizeof_rec[qual]; + +		error = -EINVAL; +		if (size <= th) +			goto out; +	} +	tracer->buffer = base; +	tracer->size = size; -	context = ds_alloc_context(task); +	error = -ENOMEM; +	context = ds_get_context(task);  	if (!context) -		return -ENOMEM; +		goto out; +	tracer->context = context; +  	spin_lock_irqsave(&ds_lock, irq);  	error = -EPERM;  	if (!check_tracer(task))  		goto out_unlock; -  	get_tracer(task); -	error = -EALREADY; -	if (context->owner[qual] == current) -		goto out_put_tracer;  	error = -EPERM; -	if (context->owner[qual] != NULL) +	if (context->owner[qual])  		goto out_put_tracer; -	context->owner[qual] = current; +	context->owner[qual] = tracer;  	spin_unlock_irqrestore(&ds_lock, irq); -	error = -ENOMEM; -	if (!base) { -		base = ds_allocate_buffer(size, &context->pages[qual]); -		if (!base) -			goto out_release; - -		context->buffer[qual]   = base; -	} -	error = 0; - -	context->callback[qual] = ovfl; +	ds_install_ds_config(context, qual, base, size, th); -	/* adjust the buffer address and size to meet alignment -	 * constraints: -	 * - buffer is double-word aligned -	 * - size is multiple of record size -	 * -	 * We checked the size at the very beginning; we have enough -	 * space to do the adjustment. 
-	 */ -	buffer = (unsigned long)base; - -	adj = ALIGN(buffer, alignment) - buffer; -	buffer += adj; -	size   -= adj; - -	size /= ds_cfg.sizeof_rec[qual]; -	size *= ds_cfg.sizeof_rec[qual]; - -	ds_set(context->ds, qual, ds_buffer_base, buffer); -	ds_set(context->ds, qual, ds_index, buffer); -	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); - -	if (ovfl) { -		/* todo: select a suitable interrupt threshold */ -	} else -		ds_set(context->ds, qual, -		       ds_interrupt_threshold, buffer + size + 1); - -	/* we keep the context until ds_release */ -	return error; - - out_release: -	context->owner[qual] = NULL; -	ds_put_context(context); -	put_tracer(task); -	return error; +	return 0;   out_put_tracer: -	spin_unlock_irqrestore(&ds_lock, irq); -	ds_put_context(context);  	put_tracer(task); -	return error; -   out_unlock:  	spin_unlock_irqrestore(&ds_lock, irq);  	ds_put_context(context); +	tracer->context = NULL; + out:  	return error;  } -int ds_request_bts(struct task_struct *task, void *base, size_t size, -		   ds_ovfl_callback_t ovfl) +struct bts_tracer *ds_request_bts(struct task_struct *task, +				  void *base, size_t size, +				  bts_ovfl_callback_t ovfl, size_t th)  { -	return ds_request(task, base, size, ovfl, ds_bts); -} +	struct bts_tracer *tracer; +	int error; -int ds_request_pebs(struct task_struct *task, void *base, size_t size, -		    ds_ovfl_callback_t ovfl) -{ -	return ds_request(task, base, size, ovfl, ds_pebs); +	/* buffer overflow notification is not yet implemented */ +	error = -EOPNOTSUPP; +	if (ovfl) +		goto out; + +	error = -ENOMEM; +	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); +	if (!tracer) +		goto out; +	tracer->ovfl = ovfl; + +	error = ds_request(&tracer->ds, ds_bts, task, base, size, th); +	if (error < 0) +		goto out_tracer; + +	return tracer; + + out_tracer: +	kfree(tracer); + out: +	return ERR_PTR(error);  } -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, +				    void *base, size_t size, +				    pebs_ovfl_callback_t ovfl, size_t th)  { -	struct ds_context *context; +	struct pebs_tracer *tracer;  	int error; -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) +	/* buffer overflow notification is not yet implemented */ +	error = -EOPNOTSUPP; +	if (ovfl) +		goto out; + +	error = -ENOMEM; +	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); +	if (!tracer)  		goto out; +	tracer->ovfl = ovfl; -	kfree(context->buffer[qual]); -	context->buffer[qual] = NULL; +	error = ds_request(&tracer->ds, ds_pebs, task, base, size, th); +	if (error < 0) +		goto out_tracer; -	current->mm->total_vm  -= context->pages[qual]; -	current->mm->locked_vm -= context->pages[qual]; -	context->pages[qual] = 0; -	context->owner[qual] = NULL; +	return tracer; -	/* -	 * we put the context twice: -	 *   once for the ds_get_context -	 *   once for the corresponding ds_request -	 */ -	ds_put_context(context); + out_tracer: +	kfree(tracer);   out: -	ds_put_context(context); -	return error; +	return ERR_PTR(error);  } -int ds_release_bts(struct task_struct *task) +static void ds_release(struct ds_tracer *tracer, enum ds_qualifier qual)  { -	return ds_release(task, ds_bts); +	BUG_ON(tracer->context->owner[qual] != tracer); +	tracer->context->owner[qual] = NULL; + +	put_tracer(tracer->context->task); +	ds_put_context(tracer->context);  } -int ds_release_pebs(struct task_struct *task) +int ds_release_bts(struct bts_tracer *tracer)  { -	return ds_release(task, 
ds_pebs); +	if (!tracer) +		return -EINVAL; + +	ds_release(&tracer->ds, ds_bts); +	kfree(tracer); + +	return 0;  } -static int ds_get_index(struct task_struct *task, size_t *pos, -			enum ds_qualifier qual) +int ds_release_pebs(struct pebs_tracer *tracer)  { -	struct ds_context *context; -	unsigned long base, index; -	int error; +	if (!tracer) +		return -EINVAL; -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out; +	ds_release(&tracer->ds, ds_pebs); +	kfree(tracer); + +	return 0; +} + +static size_t ds_get_index(struct ds_context *context, enum ds_qualifier qual) +{ +	unsigned long base, index;  	base  = ds_get(context->ds, qual, ds_buffer_base);  	index = ds_get(context->ds, qual, ds_index); -	error = ((index - base) / ds_cfg.sizeof_rec[qual]); -	if (pos) -		*pos = error; - out: -	ds_put_context(context); -	return error; +	return (index - base) / ds_cfg.sizeof_rec[qual];  } -int ds_get_bts_index(struct task_struct *task, size_t *pos) +int ds_get_bts_index(struct bts_tracer *tracer, size_t *pos)  { -	return ds_get_index(task, pos, ds_bts); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_index(tracer->ds.context, ds_bts); + +	return 0;  } -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +int ds_get_pebs_index(struct pebs_tracer *tracer, size_t *pos)  { -	return ds_get_index(task, pos, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_index(tracer->ds.context, ds_pebs); + +	return 0;  } -static int ds_get_end(struct task_struct *task, size_t *pos, -		      enum ds_qualifier qual) +static size_t ds_get_end(struct ds_context *context, enum ds_qualifier qual)  { -	struct ds_context *context; -	unsigned long base, end; -	int error; - -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out; +	unsigned long base, max;  	base = ds_get(context->ds, qual, ds_buffer_base); -	end  = ds_get(context->ds, qual, ds_absolute_maximum); +	max  = ds_get(context->ds, qual, ds_absolute_maximum); -	error = ((end - base) / ds_cfg.sizeof_rec[qual]); -	if (pos) -		*pos = error; - out: -	ds_put_context(context); -	return error; +	return (max - base) / ds_cfg.sizeof_rec[qual];  } -int ds_get_bts_end(struct task_struct *task, size_t *pos) +int ds_get_bts_end(struct bts_tracer *tracer, size_t *pos)  { -	return ds_get_end(task, pos, ds_bts); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_end(tracer->ds.context, ds_bts); + +	return 0;  } -int ds_get_pebs_end(struct task_struct *task, size_t *pos) +int ds_get_pebs_end(struct pebs_tracer *tracer, size_t *pos)  { -	return ds_get_end(task, pos, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	if (!pos) +		return -EINVAL; + +	*pos = ds_get_end(tracer->ds.context, ds_pebs); + +	return 0;  } -static int ds_access(struct task_struct *task, size_t index, -		     const void **record, enum ds_qualifier qual) +static int ds_access(struct ds_context *context, enum ds_qualifier qual, +		     size_t index, const void **record)  { -	struct ds_context *context;  	unsigned long base, idx; -	int error;  	if (!record)  		return -EINVAL; -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out; -  	base = ds_get(context->ds, qual, ds_buffer_base);  	idx = base + (index * ds_cfg.sizeof_rec[qual]); -	error = -EINVAL;  	if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) -		goto out; +		return -EINVAL;  	
*record = (const void *)idx; -	error = ds_cfg.sizeof_rec[qual]; - out: -	ds_put_context(context); -	return error; + +	return ds_cfg.sizeof_rec[qual];  } -int ds_access_bts(struct task_struct *task, size_t index, const void **record) +int ds_access_bts(struct bts_tracer *tracer, size_t index, +		  const void **record)  { -	return ds_access(task, index, record, ds_bts); +	if (!tracer) +		return -EINVAL; + +	return ds_access(tracer->ds.context, ds_bts, index, record);  } -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) +int ds_access_pebs(struct pebs_tracer *tracer, size_t index, +		   const void **record)  { -	return ds_access(task, index, record, ds_pebs); +	if (!tracer) +		return -EINVAL; + +	return ds_access(tracer->ds.context, ds_pebs, index, record);  } -static int ds_write(struct task_struct *task, const void *record, size_t size, -		    enum ds_qualifier qual, int force) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, +		    const void *record, size_t size)  { -	struct ds_context *context; -	int error; +	int bytes_written = 0;  	if (!record)  		return -EINVAL; -	error = -EPERM; -	context = ds_get_context(task); -	if (!context) -		goto out; - -	if (!force) { -		error = ds_validate_access(context, qual); -		if (error < 0) -			goto out; -	} - -	error = 0;  	while (size) {  		unsigned long base, index, end, write_end, int_th;  		unsigned long write_size, adj_write_size; @@ -678,14 +625,14 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,  			write_end = end;  		if (write_end <= index) -			goto out; +			break;  		write_size = min((unsigned long) size, write_end - index);  		memcpy((void *)index, record, write_size);  		record = (const char *)record + write_size; -		size  -= write_size; -		error += write_size; +		size -= write_size; +		bytes_written += write_size;  		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];  		adj_write_size *= ds_cfg.sizeof_rec[qual]; @@ -700,47 +647,32 @@ static int ds_write(struct task_struct *task, const void *record, size_t size,  		ds_set(context->ds, qual, ds_index, index);  		if (index >= int_th) -			ds_overflow(task, context, qual); +			ds_overflow(context, qual);  	} - out: -	ds_put_context(context); -	return error; +	return bytes_written;  } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +int ds_write_bts(struct bts_tracer *tracer, const void *record, size_t size)  { -	return ds_write(task, record, size, ds_bts, /* force = */ 0); -} +	if (!tracer) +		return -EINVAL; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ -	return ds_write(task, record, size, ds_pebs, /* force = */ 0); +	return ds_write(tracer->ds.context, ds_bts, record, size);  } -int ds_unchecked_write_bts(struct task_struct *task, -			   const void *record, size_t size) +int ds_write_pebs(struct pebs_tracer *tracer, const void *record, size_t size)  { -	return ds_write(task, record, size, ds_bts, /* force = */ 1); -} +	if (!tracer) +		return -EINVAL; -int ds_unchecked_write_pebs(struct task_struct *task, -			    const void *record, size_t size) -{ -	return ds_write(task, record, size, ds_pebs, /* force = */ 1); +	return ds_write(tracer->ds.context, ds_pebs, record, size);  } -static int ds_reset_or_clear(struct task_struct *task, -			     enum ds_qualifier qual, int clear) +static void ds_reset_or_clear(struct ds_context *context, +			      enum ds_qualifier qual, int clear)  { -	struct ds_context *context;  	unsigned long base, end; -	
int error; - -	context = ds_get_context(task); -	error = ds_validate_access(context, qual); -	if (error < 0) -		goto out;  	base = ds_get(context->ds, qual, ds_buffer_base);  	end  = ds_get(context->ds, qual, ds_absolute_maximum); @@ -749,70 +681,69 @@ static int ds_reset_or_clear(struct task_struct *task,  		memset((void *)base, 0, end - base);  	ds_set(context->ds, qual, ds_index, base); - -	error = 0; - out: -	ds_put_context(context); -	return error;  } -int ds_reset_bts(struct task_struct *task) +int ds_reset_bts(struct bts_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 0); + +	return 0;  } -int ds_reset_pebs(struct task_struct *task) +int ds_reset_pebs(struct pebs_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 0); + +	return 0;  } -int ds_clear_bts(struct task_struct *task) +int ds_clear_bts(struct bts_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_bts, /* clear = */ 1); + +	return 0;  } -int ds_clear_pebs(struct task_struct *task) +int ds_clear_pebs(struct pebs_tracer *tracer)  { -	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); +	if (!tracer) +		return -EINVAL; + +	ds_reset_or_clear(tracer->ds.context, ds_pebs, /* clear = */ 1); + +	return 0;  } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +int ds_get_pebs_reset(struct pebs_tracer *tracer, u64 *value)  { -	struct ds_context *context; -	int error; +	if (!tracer) +		return -EINVAL;  	if (!value)  		return -EINVAL; -	context = ds_get_context(task); -	error = ds_validate_access(context, ds_pebs); -	if (error < 0) -		goto out; +	*value = *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); -	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); - -	error = 0; - out: -	ds_put_context(context); -	return error; +	return 0;  } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)  { -	struct ds_context *context; -	int error; - -	context = ds_get_context(task); -	error = ds_validate_access(context, ds_pebs); -	if (error < 0) -		goto out; +	if (!tracer) +		return -EINVAL; -	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; +	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; -	error = 0; - out: -	ds_put_context(context); -	return error; +	return 0;  }  static const struct ds_configuration ds_cfg_var = { @@ -840,6 +771,10 @@ static inline void  ds_configure(const struct ds_configuration *cfg)  {  	ds_cfg = *cfg; + +	printk(KERN_INFO "DS available\n"); + +	BUG_ON(MAX_SIZEOF_DS < ds_cfg.sizeof_ds);  }  void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -847,17 +782,16 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)  	switch (c->x86) {  	case 0x6:  		switch (c->x86_model) { +		case 0 ... 0xC: +			/* sorry, don't know about them */ +			break;  		case 0xD:  		case 0xE: /* Pentium M */  			ds_configure(&ds_cfg_var);  			break; -		case 0xF: /* Core2 */ -		case 0x1C: /* Atom */ +		default: /* Core2, Atom, ... */  			ds_configure(&ds_cfg_64);  			break; -		default: -			/* sorry, don't know about them */ -			break;  		}  		break;  	case 0xF: @@ -884,6 +818,8 @@ void ds_free(struct ds_context *context)  	 * is dying. 
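With the conversion above, the DS interface hands out opaque tracer handles instead of keying everything on a task, and request failures come back as ERR_PTR() values rather than bare error codes. A minimal sketch of driving the reworked BTS side (the request call's name and cyclic-buffer arguments follow the conventions of this patch; the static buffer and the printk are illustrative only):

static char bts_buf[PAGE_SIZE];

static void bts_example(struct task_struct *task)
{
	struct bts_tracer *tracer;
	size_t pos, end;

	/* NULL ovfl callback and th == -1 request a plain cyclic buffer */
	tracer = ds_request_bts(task, bts_buf, sizeof(bts_buf), NULL, -1);
	if (IS_ERR(tracer))
		return;

	/* both accessors count in records: (index - base) / sizeof_rec */
	if (!ds_get_bts_index(tracer, &pos) && !ds_get_bts_end(tracer, &end))
		printk(KERN_DEBUG "bts: %zu of %zu records used\n", pos, end);

	ds_release_bts(tracer);
}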
There should not be any user of that context left  	 * to disturb us, anymore. */  	unsigned long leftovers = context->count; -	while (leftovers--) +	while (leftovers--) { +		put_tracer(context->task);  		ds_put_context(context); +	}  } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c new file mode 100644 index 000000000000..6b1f6f6f8661 --- /dev/null +++ b/arch/x86/kernel/dumpstack.c @@ -0,0 +1,351 @@ +/* + *  Copyright (C) 1991, 1992  Linus Torvalds + *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/uaccess.h> +#include <linux/utsname.h> +#include <linux/hardirq.h> +#include <linux/kdebug.h> +#include <linux/module.h> +#include <linux/ptrace.h> +#include <linux/kexec.h> +#include <linux/bug.h> +#include <linux/nmi.h> +#include <linux/sysfs.h> + +#include <asm/stacktrace.h> + +#include "dumpstack.h" + +int panic_on_unrecovered_nmi; +unsigned int code_bytes = 64; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ +	printk(" [<%p>] %s%pS\n", (void *) address, +			reliable ? "" : "? ", (void *) address); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static void +print_ftrace_graph_addr(unsigned long addr, void *data, +			const struct stacktrace_ops *ops, +			struct thread_info *tinfo, int *graph) +{ +	struct task_struct *task = tinfo->task; +	unsigned long ret_addr; +	int index = task->curr_ret_stack; + +	if (addr != (unsigned long)return_to_handler) +		return; + +	if (!task->ret_stack || index < *graph) +		return; + +	index -= *graph; +	ret_addr = task->ret_stack[index].ret; + +	ops->address(data, ret_addr, 1); + +	(*graph)++; +} +#else +static inline void +print_ftrace_graph_addr(unsigned long addr, void *data, +			const struct stacktrace_ops *ops, +			struct thread_info *tinfo, int *graph) +{ } +#endif + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, +			void *p, unsigned int size, void *end) +{ +	void *t = tinfo; +	if (end) { +		if (p < end && p >= (end-THREAD_SIZE)) +			return 1; +		else +			return 0; +	} +	return p > t && p < t + THREAD_SIZE - size; +} + +unsigned long +print_context_stack(struct thread_info *tinfo, +		unsigned long *stack, unsigned long bp, +		const struct stacktrace_ops *ops, void *data, +		unsigned long *end, int *graph) +{ +	struct stack_frame *frame = (struct stack_frame *)bp; + +	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { +		unsigned long addr; + +		addr = *stack; +		if (__kernel_text_address(addr)) { +			if ((unsigned long) stack == bp + sizeof(long)) { +				ops->address(data, addr, 1); +				frame = frame->next_frame; +				bp = (unsigned long) frame; +			} else { +				ops->address(data, addr, bp == 0); +			} +			print_ftrace_graph_addr(addr, data, ops, tinfo, graph); +		} +		stack++; +	} +	return bp; +} + + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ +	printk(data); +	print_symbol(msg, symbol); +	printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ +	printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ +	printk("%s <%s> ", (char *)data, name); +	return 0; +} + +/* + * Print one address/symbol entries per line. 
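The walker above only fully trusts a word that sits exactly one slot above the current frame pointer; everything else on the stack is reported as unreliable (the "? " prefix printk_address() emits). Stripped of the kernel plumbing, the same frame-pointer chain walk looks like this (a userspace sketch assuming x86-64 built with -fno-omit-frame-pointer and an ABI-conformant zeroed outermost %rbp; a real walker also bounds the loop to the stack region, as valid_stack_ptr() does):

#include <stdio.h>

struct frame {
	struct frame *next_frame;	/* saved caller %rbp */
	unsigned long return_address;	/* pushed by the call */
};

static void walk_frames(void)
{
	struct frame *fp;

	asm volatile("movq %%rbp, %0" : "=r" (fp));
	while (fp && fp->return_address) {
		printf(" [<%p>]\n", (void *)fp->return_address);
		fp = fp->next_frame;
	}
}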
+ */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ +	touch_nmi_watchdog(); +	printk(data); +	printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { +	.warning = print_trace_warning, +	.warning_symbol = print_trace_warning_symbol, +	.stack = print_trace_stack, +	.address = print_trace_address, +}; + +void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, +		unsigned long *stack, unsigned long bp, char *log_lvl) +{ +	printk("%sCall Trace:\n", log_lvl); +	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, +		unsigned long *stack, unsigned long bp) +{ +	show_trace_log_lvl(task, regs, stack, bp, ""); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ +	show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ +	unsigned long bp = 0; +	unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER +	if (!bp) +		get_bp(bp); +#endif + +	printk("Pid: %d, comm: %.20s %s %s %.*s\n", +		current->pid, current->comm, print_tainted(), +		init_utsname()->release, +		(int)strcspn(init_utsname()->version, " "), +		init_utsname()->version); +	show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ +	int cpu; +	unsigned long flags; + +	oops_enter(); + +	/* racy, but better than risking deadlock. */ +	raw_local_irq_save(flags); +	cpu = smp_processor_id(); +	if (!__raw_spin_trylock(&die_lock)) { +		if (cpu == die_owner) +			/* nested oops. should stop eventually */; +		else +			__raw_spin_lock(&die_lock); +	} +	die_nest_count++; +	die_owner = cpu; +	console_verbose(); +	bust_spinlocks(1); +	return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ +	if (regs && kexec_should_crash(current)) +		crash_kexec(regs); + +	bust_spinlocks(0); +	die_owner = -1; +	add_taint(TAINT_DIE); +	die_nest_count--; +	if (!die_nest_count) +		/* Nest count reaches zero, release the lock. 
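The die_lock handling now shared by both word sizes is a trylock-first pattern: a CPU that oopses while already holding the lock must keep going rather than deadlock on itself, and oops_end() only releases once the nest count drains to zero. The entry side of oops_begin() in isolation:

	/* sketch of the oops_begin() locking above */
	if (!__raw_spin_trylock(&die_lock)) {
		if (cpu == die_owner)
			;	/* recursive oops on this CPU: carry on unlocked */
		else
			__raw_spin_lock(&die_lock);	/* wait out the other CPU */
	}
	die_nest_count++;	/* oops_end() unlocks when this reaches zero */
	die_owner = cpu;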
*/ +		__raw_spin_unlock(&die_lock); +	raw_local_irq_restore(flags); +	oops_exit(); + +	if (!signr) +		return; +	if (in_interrupt()) +		panic("Fatal exception in interrupt"); +	if (panic_on_oops) +		panic("Fatal exception"); +	do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ +#ifdef CONFIG_X86_32 +	unsigned short ss; +	unsigned long sp; +#endif +	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT +	printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP +	printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC +	printk("DEBUG_PAGEALLOC"); +#endif +	printk("\n"); +	sysfs_printk_last_file(); +	if (notify_die(DIE_OOPS, str, regs, err, +			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) +		return 1; + +	show_registers(regs); +#ifdef CONFIG_X86_32 +	sp = (unsigned long) (&regs->sp); +	savesegment(ss, ss); +	if (user_mode(regs)) { +		sp = regs->sp; +		ss = regs->ss & 0xffff; +	} +	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); +	print_symbol("%s", regs->ip); +	printk(" SS:ESP %04x:%08lx\n", ss, sp); +#else +	/* Executive summary in case the oops scrolled away */ +	printk(KERN_ALERT "RIP "); +	printk_address(regs->ip, 1); +	printk(" RSP <%016lx>\n", regs->sp); +#endif +	return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ +	unsigned long flags = oops_begin(); +	int sig = SIGSEGV; + +	if (!user_mode_vm(regs)) +		report_bug(regs->ip, regs); + +	if (__die(str, regs, err)) +		sig = 0; +	oops_end(flags, regs, sig); +} + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ +	unsigned long flags; + +	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) +		return; + +	/* +	 * We are in trouble anyway, let's at least try +	 * to get a message out. 
+	 */ +	flags = oops_begin(); +	printk(KERN_EMERG "%s", str); +	printk(" on CPU%d, ip %08lx, registers:\n", +		smp_processor_id(), regs->ip); +	show_registers(regs); +	oops_end(flags, regs, 0); +	if (do_panic || panic_on_oops) +		panic("Non maskable interrupt"); +	nmi_exit(); +	local_irq_enable(); +	do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ +	if (!s) +		return -EINVAL; +	if (!strcmp(s, "panic")) +		panic_on_oops = 1; +	return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ +	if (!s) +		return -EINVAL; +	kstack_depth_to_print = simple_strtoul(s, NULL, 0); +	return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ +	code_bytes = simple_strtoul(s, NULL, 0); +	if (code_bytes > 8192) +		code_bytes = 8192; + +	return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h new file mode 100644 index 000000000000..da87590b8698 --- /dev/null +++ b/arch/x86/kernel/dumpstack.h @@ -0,0 +1,39 @@ +/* + *  Copyright (C) 1991, 1992  Linus Torvalds + *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + +#ifndef DUMPSTACK_H +#define DUMPSTACK_H + +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +extern unsigned long +print_context_stack(struct thread_info *tinfo, +		unsigned long *stack, unsigned long bp, +		const struct stacktrace_ops *ops, void *data, +		unsigned long *end, int *graph); + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, +		unsigned long *stack, unsigned long bp, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, +		unsigned long *sp, unsigned long bp, char *log_lvl); + +extern unsigned int code_bytes; +extern int kstack_depth_to_print; + +/* The form of the top of the frame on the stack */ +struct stack_frame { +	struct stack_frame *next_frame; +	unsigned long return_address; +}; +#endif diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b3614752197b..d593cd1f58dc 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -17,69 +17,14 @@  #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ -	printk(" [<%p>] %s%pS\n", (void *) address, -			reliable ? "" : "? 
", (void *) address); -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, -			void *p, unsigned int size, void *end) -{ -	void *t = tinfo; -	if (end) { -		if (p < end && p >= (end-THREAD_SIZE)) -			return 1; -		else -			return 0; -	} -	return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { -	struct stack_frame *next_frame; -	unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, -		unsigned long *stack, unsigned long bp, -		const struct stacktrace_ops *ops, void *data, -		unsigned long *end) -{ -	struct stack_frame *frame = (struct stack_frame *)bp; - -	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { -		unsigned long addr; - -		addr = *stack; -		if (__kernel_text_address(addr)) { -			if ((unsigned long) stack == bp + sizeof(long)) { -				ops->address(data, addr, 1); -				frame = frame->next_frame; -				bp = (unsigned long) frame; -			} else { -				ops->address(data, addr, bp == 0); -			} -		} -		stack++; -	} -	return bp; -} +#include "dumpstack.h"  void dump_trace(struct task_struct *task, struct pt_regs *regs,  		unsigned long *stack, unsigned long bp,  		const struct stacktrace_ops *ops, void *data)  { +	int graph = 0; +  	if (!task)  		task = current; @@ -107,7 +52,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  		context = (struct thread_info *)  			((unsigned long)stack & (~(THREAD_SIZE - 1))); -		bp = print_context_stack(context, stack, bp, ops, data, NULL); +		bp = print_context_stack(context, stack, bp, ops, +					 data, NULL, &graph);  		stack = (unsigned long *)context->previous_esp;  		if (!stack) @@ -119,57 +65,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  }  EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -	printk(data); -	print_symbol(msg, symbol); -	printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ -	printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ -	printk("%s <%s> ", (char *)data, name); -	return 0; -} - -/* - * Print one address/symbol entries per line. 
- */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ -	touch_nmi_watchdog(); -	printk(data); -	printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { -	.warning = print_trace_warning, -	.warning_symbol = print_trace_warning_symbol, -	.stack = print_trace_stack, -	.address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, -		unsigned long *stack, unsigned long bp, char *log_lvl) -{ -	printk("%sCall Trace:\n", log_lvl); -	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, -		unsigned long *stack, unsigned long bp) -{ -	show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void  show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  		unsigned long *sp, unsigned long bp, char *log_lvl)  { @@ -196,33 +92,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  	show_trace_log_lvl(task, regs, sp, bp, log_lvl);  } -void show_stack(struct task_struct *task, unsigned long *sp) -{ -	show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ -	unsigned long bp = 0; -	unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER -	if (!bp) -		get_bp(bp); -#endif - -	printk("Pid: %d, comm: %.20s %s %s %.*s\n", -		current->pid, current->comm, print_tainted(), -		init_utsname()->release, -		(int)strcspn(init_utsname()->version, " "), -		init_utsname()->version); -	show_trace(NULL, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack);  void show_registers(struct pt_regs *regs)  { @@ -283,167 +152,3 @@ int is_valid_bugaddr(unsigned long ip)  	return ud2 == 0x0b0f;  } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ -	unsigned long flags; - -	oops_enter(); - -	if (die_owner != raw_smp_processor_id()) { -		console_verbose(); -		raw_local_irq_save(flags); -		__raw_spin_lock(&die_lock); -		die_owner = smp_processor_id(); -		die_nest_count = 0; -		bust_spinlocks(1); -	} else { -		raw_local_irq_save(flags); -	} -	die_nest_count++; -	return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ -	bust_spinlocks(0); -	die_owner = -1; -	add_taint(TAINT_DIE); -	__raw_spin_unlock(&die_lock); -	raw_local_irq_restore(flags); - -	if (!regs) -		return; - -	if (kexec_should_crash(current)) -		crash_kexec(regs); -	if (in_interrupt()) -		panic("Fatal exception in interrupt"); -	if (panic_on_oops) -		panic("Fatal exception"); -	oops_exit(); -	do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ -	unsigned short ss; -	unsigned long sp; - -	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT -	printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP -	printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -	printk("DEBUG_PAGEALLOC"); -#endif -	printk("\n"); -	sysfs_printk_last_file(); -	if (notify_die(DIE_OOPS, str, regs, err, -			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) -		return 1; - -	show_registers(regs); -	/* Executive summary in case the oops scrolled away */ -	sp = (unsigned long) (&regs->sp); -	savesegment(ss, ss); -	if (user_mode(regs)) { -		sp = regs->sp; -		ss = regs->ss & 0xffff; -	} -	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); -	print_symbol("%s", 
regs->ip); -	printk(" SS:ESP %04x:%08lx\n", ss, sp); -	return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ -	unsigned long flags = oops_begin(); - -	if (die_nest_count < 3) { -		report_bug(regs->ip, regs); - -		if (__die(str, regs, err)) -			regs = NULL; -	} else { -		printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); -	} - -	oops_end(flags, regs, SIGSEGV); -} - -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ -	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) -		return; - -	spin_lock(&nmi_print_lock); -	/* -	* We are in trouble anyway, lets at least try -	* to get a message out: -	*/ -	bust_spinlocks(1); -	printk(KERN_EMERG "%s", str); -	printk(" on CPU%d, ip %08lx, registers:\n", -		smp_processor_id(), regs->ip); -	show_registers(regs); -	if (do_panic) -		panic("Non maskable interrupt"); -	console_silent(); -	spin_unlock(&nmi_print_lock); - -	/* -	 * If we are in kernel we are probably nested up pretty bad -	 * and might aswell get out now while we still can: -	 */ -	if (!user_mode_vm(regs)) { -		current->thread.trap_no = 2; -		crash_kexec(regs); -	} - -	bust_spinlocks(0); -	do_exit(SIGSEGV); -} - -static int __init oops_setup(char *s) -{ -	if (!s) -		return -EINVAL; -	if (!strcmp(s, "panic")) -		panic_on_oops = 1; -	return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ -	if (!s) -		return -EINVAL; -	kstack_depth_to_print = simple_strtoul(s, NULL, 0); -	return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ -	code_bytes = simple_strtoul(s, NULL, 0); -	if (code_bytes > 8192) -		code_bytes = 8192; - -	return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 96a5db7da8a7..c302d0707048 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,19 +17,7 @@  #include <asm/stacktrace.h> -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) - -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -static unsigned int code_bytes = 64; -static int die_counter; - -void printk_address(unsigned long address, int reliable) -{ -	printk(" [<%p>] %s%pS\n", (void *) address, -			reliable ? "" : "? 
", (void *) address); -} +#include "dumpstack.h"  static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,  					unsigned *usedp, char **idp) @@ -113,51 +101,6 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,   * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack   */ -static inline int valid_stack_ptr(struct thread_info *tinfo, -			void *p, unsigned int size, void *end) -{ -	void *t = tinfo; -	if (end) { -		if (p < end && p >= (end-THREAD_SIZE)) -			return 1; -		else -			return 0; -	} -	return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { -	struct stack_frame *next_frame; -	unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, -		unsigned long *stack, unsigned long bp, -		const struct stacktrace_ops *ops, void *data, -		unsigned long *end) -{ -	struct stack_frame *frame = (struct stack_frame *)bp; - -	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { -		unsigned long addr; - -		addr = *stack; -		if (__kernel_text_address(addr)) { -			if ((unsigned long) stack == bp + sizeof(long)) { -				ops->address(data, addr, 1); -				frame = frame->next_frame; -				bp = (unsigned long) frame; -			} else { -				ops->address(data, addr, bp == 0); -			} -		} -		stack++; -	} -	return bp; -} -  void dump_trace(struct task_struct *task, struct pt_regs *regs,  		unsigned long *stack, unsigned long bp,  		const struct stacktrace_ops *ops, void *data) @@ -166,6 +109,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;  	unsigned used = 0;  	struct thread_info *tinfo; +	int graph = 0;  	if (!task)  		task = current; @@ -206,7 +150,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  				break;  			bp = print_context_stack(tinfo, stack, bp, ops, -							data, estack_end); +						 data, estack_end, &graph);  			ops->stack(data, "<EOE>");  			/*  			 * We link to the next stack via the @@ -225,7 +169,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  				if (ops->stack(data, "IRQ") < 0)  					break;  				bp = print_context_stack(tinfo, stack, bp, -						ops, data, irqstack_end); +					ops, data, irqstack_end, &graph);  				/*  				 * We link to the next stack (which would be  				 * the process stack normally) the last @@ -243,62 +187,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,  	/*  	 * This handles the process stack:  	 */ -	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); +	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph);  	put_cpu();  }  EXPORT_SYMBOL(dump_trace); -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ -	printk(data); -	print_symbol(msg, symbol); -	printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ -	printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ -	printk("%s <%s> ", (char *)data, name); -	return 0; -} - -/* - * Print one address/symbol entries per line. 
- */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ -	touch_nmi_watchdog(); -	printk(data); -	printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { -	.warning = print_trace_warning, -	.warning_symbol = print_trace_warning_symbol, -	.stack = print_trace_stack, -	.address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, -		unsigned long *stack, unsigned long bp, char *log_lvl) -{ -	printk("%sCall Trace:\n", log_lvl); -	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, -		unsigned long *stack, unsigned long bp) -{ -	show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void +void  show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  		unsigned long *sp, unsigned long bp, char *log_lvl)  { @@ -342,33 +236,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,  	show_trace_log_lvl(task, regs, sp, bp, log_lvl);  } -void show_stack(struct task_struct *task, unsigned long *sp) -{ -	show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ -	unsigned long bp = 0; -	unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER -	if (!bp) -		get_bp(bp); -#endif - -	printk("Pid: %d, comm: %.20s %s %s %.*s\n", -		current->pid, current->comm, print_tainted(), -		init_utsname()->release, -		(int)strcspn(init_utsname()->version, " "), -		init_utsname()->version); -	show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); -  void show_registers(struct pt_regs *regs)  {  	int i; @@ -429,147 +296,3 @@ int is_valid_bugaddr(unsigned long ip)  	return ud2 == 0x0b0f;  } -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ -	int cpu; -	unsigned long flags; - -	oops_enter(); - -	/* racy, but better than risking deadlock. */ -	raw_local_irq_save(flags); -	cpu = smp_processor_id(); -	if (!__raw_spin_trylock(&die_lock)) { -		if (cpu == die_owner) -			/* nested oops. should stop eventually */; -		else -			__raw_spin_lock(&die_lock); -	} -	die_nest_count++; -	die_owner = cpu; -	console_verbose(); -	bust_spinlocks(1); -	return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ -	die_owner = -1; -	bust_spinlocks(0); -	die_nest_count--; -	if (!die_nest_count) -		/* Nest count reaches zero, release the lock. 
*/ -		__raw_spin_unlock(&die_lock); -	raw_local_irq_restore(flags); -	if (!regs) { -		oops_exit(); -		return; -	} -	if (in_interrupt()) -		panic("Fatal exception in interrupt"); -	if (panic_on_oops) -		panic("Fatal exception"); -	oops_exit(); -	do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ -	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT -	printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP -	printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC -	printk("DEBUG_PAGEALLOC"); -#endif -	printk("\n"); -	sysfs_printk_last_file(); -	if (notify_die(DIE_OOPS, str, regs, err, -			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) -		return 1; - -	show_registers(regs); -	add_taint(TAINT_DIE); -	/* Executive summary in case the oops scrolled away */ -	printk(KERN_ALERT "RIP "); -	printk_address(regs->ip, 1); -	printk(" RSP <%016lx>\n", regs->sp); -	if (kexec_should_crash(current)) -		crash_kexec(regs); -	return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ -	unsigned long flags = oops_begin(); - -	if (!user_mode(regs)) -		report_bug(regs->ip, regs); - -	if (__die(str, regs, err)) -		regs = NULL; -	oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ -	unsigned long flags; - -	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) -		return; - -	flags = oops_begin(); -	/* -	 * We are in trouble anyway, lets at least try -	 * to get a message out. -	 */ -	printk(KERN_EMERG "%s", str); -	printk(" on CPU%d, ip %08lx, registers:\n", -		smp_processor_id(), regs->ip); -	show_registers(regs); -	if (kexec_should_crash(current)) -		crash_kexec(regs); -	if (do_panic || panic_on_oops) -		panic("Non maskable interrupt"); -	oops_end(flags, NULL, SIGBUS); -	nmi_exit(); -	local_irq_enable(); -	do_exit(SIGBUS); -} - -static int __init oops_setup(char *s) -{ -	if (!s) -		return -EINVAL; -	if (!strcmp(s, "panic")) -		panic_on_oops = 1; -	return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ -	if (!s) -		return -EINVAL; -	kstack_depth_to_print = simple_strtoul(s, NULL, 0); -	return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ -	code_bytes = simple_strtoul(s, NULL, 0); -	if (code_bytes > 8192) -		code_bytes = 8192; - -	return 1; -} -__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 28b597ef9ca1..43ceb3f454bf 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1157,6 +1157,9 @@ ENTRY(mcount)  END(mcount)  ENTRY(ftrace_caller) +	cmpl $0, function_trace_stop +	jne  ftrace_stub +  	pushl %eax  	pushl %ecx  	pushl %edx @@ -1171,6 +1174,11 @@ ftrace_call:  	popl %edx  	popl %ecx  	popl %eax +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: +	jmp ftrace_stub +#endif  .globl ftrace_stub  ftrace_stub: @@ -1180,8 +1188,18 @@ END(ftrace_caller)  #else /* ! 
CONFIG_DYNAMIC_FTRACE */  ENTRY(mcount) +	cmpl $0, function_trace_stop +	jne  ftrace_stub +  	cmpl $ftrace_stub, ftrace_trace_function  	jnz trace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	cmpl $ftrace_stub, ftrace_graph_return +	jnz ftrace_graph_caller + +	cmpl $ftrace_graph_entry_stub, ftrace_graph_entry +	jnz ftrace_graph_caller +#endif  .globl ftrace_stub  ftrace_stub:  	ret @@ -1200,12 +1218,43 @@ trace:  	popl %edx  	popl %ecx  	popl %eax -  	jmp ftrace_stub  END(mcount)  #endif /* CONFIG_DYNAMIC_FTRACE */  #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) +	cmpl $0, function_trace_stop +	jne ftrace_stub + +	pushl %eax +	pushl %ecx +	pushl %edx +	movl 0xc(%esp), %edx +	lea 0x4(%ebp), %eax +	subl $MCOUNT_INSN_SIZE, %edx +	call prepare_ftrace_return +	popl %edx +	popl %ecx +	popl %eax +	ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: +	pushl $0 +	pushl %eax +	pushl %ecx +	pushl %edx +	call ftrace_return_to_handler +	movl %eax, 0xc(%esp) +	popl %edx +	popl %ecx +	popl %eax +	ret +#endif +  .section .rodata,"a"  #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b86f332c96a6..54e0bbdccb99 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -68,6 +68,8 @@ ENTRY(mcount)  END(mcount)  ENTRY(ftrace_caller) +	cmpl $0, function_trace_stop +	jne  ftrace_stub  	/* taken from glibc */  	subq $0x38, %rsp @@ -96,6 +98,12 @@ ftrace_call:  	movq (%rsp), %rax  	addq $0x38, %rsp +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: +	jmp ftrace_stub +#endif +  .globl ftrace_stub  ftrace_stub:  	retq @@ -103,8 +111,20 @@ END(ftrace_caller)  #else /* ! CONFIG_DYNAMIC_FTRACE */  ENTRY(mcount) +	cmpl $0, function_trace_stop +	jne  ftrace_stub +  	cmpq $ftrace_stub, ftrace_trace_function  	jnz trace + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +	cmpq $ftrace_stub, ftrace_graph_return +	jnz ftrace_graph_caller + +	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry +	jnz ftrace_graph_caller +#endif +  .globl ftrace_stub  ftrace_stub:  	retq @@ -140,6 +160,69 @@ END(mcount)  #endif /* CONFIG_DYNAMIC_FTRACE */  #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) +	cmpl $0, function_trace_stop +	jne ftrace_stub + +	subq $0x38, %rsp +	movq %rax, (%rsp) +	movq %rcx, 8(%rsp) +	movq %rdx, 16(%rsp) +	movq %rsi, 24(%rsp) +	movq %rdi, 32(%rsp) +	movq %r8, 40(%rsp) +	movq %r9, 48(%rsp) + +	leaq 8(%rbp), %rdi +	movq 0x38(%rsp), %rsi +	subq $MCOUNT_INSN_SIZE, %rsi + +	call	prepare_ftrace_return + +	movq 48(%rsp), %r9 +	movq 40(%rsp), %r8 +	movq 32(%rsp), %rdi +	movq 24(%rsp), %rsi +	movq 16(%rsp), %rdx +	movq 8(%rsp), %rcx +	movq (%rsp), %rax +	addq $0x38, %rsp +	retq +END(ftrace_graph_caller) + + +.globl return_to_handler +return_to_handler: +	subq  $80, %rsp + +	movq %rax, (%rsp) +	movq %rcx, 8(%rsp) +	movq %rdx, 16(%rsp) +	movq %rsi, 24(%rsp) +	movq %rdi, 32(%rsp) +	movq %r8, 40(%rsp) +	movq %r9, 48(%rsp) +	movq %r10, 56(%rsp) +	movq %r11, 64(%rsp) + +	call ftrace_return_to_handler + +	movq %rax, 72(%rsp) +	movq 64(%rsp), %r11 +	movq 56(%rsp), %r10 +	movq 48(%rsp), %r9 +	movq 40(%rsp), %r8 +	movq 32(%rsp), %rdi +	movq 24(%rsp), %rsi +	movq 16(%rsp), %rdx +	movq 8(%rsp), %rcx +	movq (%rsp), %rax +	addq $72, %rsp +	retq +#endif + +  #ifndef CONFIG_PREEMPT  #define retint_kernel retint_restore_args  #endif	 diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 
0aa2c443d600..53699c931ad4 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -38,8 +38,11 @@  #include <asm/io.h>  #include <asm/nmi.h>  #include <asm/smp.h> +#include <asm/atomic.h>  #include <asm/apicdef.h>  #include <mach_mpparse.h> +#include <asm/genapic.h> +#include <asm/setup.h>  /*   * ES7000 chipsets @@ -161,6 +164,43 @@ es7000_rename_gsi(int ioapic, int gsi)  	return gsi;  } +static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) +{ +	unsigned long vect = 0, psaival = 0; + +	if (psai == NULL) +		return -1; + +	vect = ((unsigned long)__pa(eip)/0x1000) << 16; +	psaival = (0x1000000 | vect | cpu); + +	while (*psai & 0x1000000) +		; + +	*psai = psaival; + +	return 0; +} + +static void noop_wait_for_deassert(atomic_t *deassert_not_used) +{ +} + +static int __init es7000_update_genapic(void) +{ +	genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; + +	/* MPENTIUMIII */ +	if (boot_cpu_data.x86 == 6 && +	    (boot_cpu_data.x86_model >= 7 || boot_cpu_data.x86_model <= 11)) { +		es7000_update_genapic_to_cluster(); +		genapic->wait_for_init_deassert = noop_wait_for_deassert; +		genapic->wakeup_cpu = wakeup_secondary_cpu_via_mip; +	} + +	return 0; +} +  void __init  setup_unisys(void)  { @@ -176,6 +216,8 @@ setup_unisys(void)  	else  		es7000_plat = ES7000_CLASSIC;  	ioapic_renumber_irq = es7000_rename_gsi; + +	x86_quirks->update_genapic = es7000_update_genapic;  }  /* @@ -317,26 +359,6 @@ es7000_mip_write(struct mip_reg *mip_reg)  	return status;  } -int -es7000_start_cpu(int cpu, unsigned long eip) -{ -	unsigned long vect = 0, psaival = 0; - -	if (psai == NULL) -		return -1; - -	vect = ((unsigned long)__pa(eip)/0x1000) << 16; -	psaival = (0x1000000 | vect | cpu); - -	while (*psai & 0x1000000) -                ; - -	*psai = psaival; - -	return 0; - -} -  void __init  es7000_sw_apic(void)  { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 50ea0ac8c9bf..1b43086b097a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -14,14 +14,17 @@  #include <linux/uaccess.h>  #include <linux/ftrace.h>  #include <linux/percpu.h> +#include <linux/sched.h>  #include <linux/init.h>  #include <linux/list.h>  #include <asm/ftrace.h> +#include <linux/ftrace.h>  #include <asm/nops.h> +#include <asm/nmi.h> -static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; +#ifdef CONFIG_DYNAMIC_FTRACE  union ftrace_code_union {  	char code[MCOUNT_INSN_SIZE]; @@ -31,18 +34,12 @@ union ftrace_code_union {  	} __attribute__((packed));  }; -  static int ftrace_calc_offset(long ip, long addr)  {  	return (int)(addr - ip);  } -unsigned char *ftrace_nop_replace(void) -{ -	return ftrace_nop; -} - -unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)  {  	static union ftrace_code_union calc; @@ -56,7 +53,142 @@ unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)  	return calc.code;  } -int +/* + * Modifying code must take extra care. On an SMP machine, if + * the code being modified is also being executed on another CPU + * that CPU will have undefined results and possibly take a GPF. + * We use kstop_machine to stop other CPUs from executing code. + * But this does not stop NMIs from happening. We still need + * to protect against that. We separate out the modification of + * the code to take care of this. + * + * Two buffers are added: An IP buffer and a "code" buffer. 
+ * + * 1) Put the instruction pointer into the IP buffer + *    and the new code into the "code" buffer. + * 2) Set a flag that says we are modifying code + * 3) Wait for any running NMIs to finish. + * 4) Write the code + * 5) clear the flag. + * 6) Wait for any running NMIs to finish. + * + * If an NMI is executed, the first thing it does is to call + * "ftrace_nmi_enter". This will check if the flag is set to write + * and if it is, it will write what is in the IP and "code" buffers. + * + * The trick is, it does not matter if everyone is writing the same + * content to the code location. Also, if a CPU is executing code + * it is OK to write to that code location if the contents being written + * are the same as what exists. + */ + +static atomic_t in_nmi = ATOMIC_INIT(0); +static int mod_code_status;		/* holds return value of text write */ +static int mod_code_write;		/* set when NMI should do the write */ +static void *mod_code_ip;		/* holds the IP to write to */ +static void *mod_code_newcode;		/* holds the text to write to the IP */ + +static unsigned nmi_wait_count; +static atomic_t nmi_update_count = ATOMIC_INIT(0); + +int ftrace_arch_read_dyn_info(char *buf, int size) +{ +	int r; + +	r = snprintf(buf, size, "%u %u", +		     nmi_wait_count, +		     atomic_read(&nmi_update_count)); +	return r; +} + +static void ftrace_mod_code(void) +{ +	/* +	 * Yes, more than one CPU process can be writing to mod_code_status. +	 *    (and the code itself) +	 * But if one were to fail, then they all should, and if one were +	 * to succeed, then they all should. +	 */ +	mod_code_status = probe_kernel_write(mod_code_ip, mod_code_newcode, +					     MCOUNT_INSN_SIZE); +} + +void ftrace_nmi_enter(void) +{ +	atomic_inc(&in_nmi); +	/* Must have in_nmi seen before reading write flag */ +	smp_mb(); +	if (mod_code_write) { +		ftrace_mod_code(); +		atomic_inc(&nmi_update_count); +	} +} + +void ftrace_nmi_exit(void) +{ +	/* Finish all executions before clearing in_nmi */ +	smp_wmb(); +	atomic_dec(&in_nmi); +} + +static void wait_for_nmi(void) +{ +	int waited = 0; + +	while (atomic_read(&in_nmi)) { +		waited = 1; +		cpu_relax(); +	} + +	if (waited) +		nmi_wait_count++; +} + +static int +do_ftrace_mod_code(unsigned long ip, void *new_code) +{ +	mod_code_ip = (void *)ip; +	mod_code_newcode = new_code; + +	/* The buffers need to be visible before we let NMIs write them */ +	smp_wmb(); + +	mod_code_write = 1; + +	/* Make sure write bit is visible before we wait on NMIs */ +	smp_mb(); + +	wait_for_nmi(); + +	/* Make sure all running NMIs have finished before we write the code */ +	smp_mb(); + +	ftrace_mod_code(); + +	/* Make sure the write happens before clearing the bit */ +	smp_wmb(); + +	mod_code_write = 0; + +	/* make sure NMIs see the cleared bit */ +	smp_mb(); + +	wait_for_nmi(); + +	return mod_code_status; +} + + + + +static unsigned char ftrace_nop[MCOUNT_INSN_SIZE]; + +static unsigned char *ftrace_nop_replace(void) +{ +	return ftrace_nop; +} + +static int  ftrace_modify_code(unsigned long ip, unsigned char *old_code,  		   unsigned char *new_code)  { @@ -81,7 +213,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,  		return -EINVAL;  	/* replace the text with the new text */ -	if (probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE)) +	if (do_ftrace_mod_code(ip, new_code))  		return -EPERM;  	sync_core(); @@ -89,6 +221,29 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,  	return 0;  } +int ftrace_make_nop(struct module *mod, +		    struct dyn_ftrace *rec, unsigned 
long addr) +{ +	unsigned char *new, *old; +	unsigned long ip = rec->ip; + +	old = ftrace_call_replace(ip, addr); +	new = ftrace_nop_replace(); + +	return ftrace_modify_code(rec->ip, old, new); +} + +int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) +{ +	unsigned char *new, *old; +	unsigned long ip = rec->ip; + +	old = ftrace_nop_replace(); +	new = ftrace_call_replace(ip, addr); + +	return ftrace_modify_code(rec->ip, old, new); +} +  int ftrace_update_ftrace_func(ftrace_func_t func)  {  	unsigned long ip = (unsigned long)(&ftrace_call); @@ -165,3 +320,218 @@ int __init ftrace_dyn_arch_init(void *data)  	return 0;  } +#endif + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE +extern void ftrace_graph_call(void); + +static int ftrace_mod_jmp(unsigned long ip, +			  int old_offset, int new_offset) +{ +	unsigned char code[MCOUNT_INSN_SIZE]; + +	if (probe_kernel_read(code, (void *)ip, MCOUNT_INSN_SIZE)) +		return -EFAULT; + +	if (code[0] != 0xe9 || old_offset != *(int *)(&code[1])) +		return -EINVAL; + +	*(int *)(&code[1]) = new_offset; + +	if (do_ftrace_mod_code(ip, &code)) +		return -EPERM; + +	return 0; +} + +int ftrace_enable_ftrace_graph_caller(void) +{ +	unsigned long ip = (unsigned long)(&ftrace_graph_call); +	int old_offset, new_offset; + +	old_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); +	new_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); + +	return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +int ftrace_disable_ftrace_graph_caller(void) +{ +	unsigned long ip = (unsigned long)(&ftrace_graph_call); +	int old_offset, new_offset; + +	old_offset = (unsigned long)(&ftrace_graph_caller) - (ip + MCOUNT_INSN_SIZE); +	new_offset = (unsigned long)(&ftrace_stub) - (ip + MCOUNT_INSN_SIZE); + +	return ftrace_mod_jmp(ip, old_offset, new_offset); +} + +#else /* CONFIG_DYNAMIC_FTRACE */ + +/* + * These functions are picked from those used on + * this page for dynamic ftrace. They have been + * simplified to ignore all traces in NMI context. 
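The simplified counter in this branch only tracks NMI nesting; the heavyweight path is needed when instructions are actually rewritten. In that case every patching entry point funnels into do_ftrace_mod_code(), whose six steps (spelled out in the comment before ftrace_mod_code() earlier) reduce to the ordering skeleton below. The write is idempotent, so it is harmless for the updater and an interrupting NMI to both store the same bytes:

	/* sketch of do_ftrace_mod_code(), barriers as in the patch */
	mod_code_ip = ip;		/* 1) publish IP and code buffers */
	mod_code_newcode = new_code;
	smp_wmb();			/* buffers visible before the flag */
	mod_code_write = 1;		/* 2) NMIs now write on our behalf */
	smp_mb();
	wait_for_nmi();			/* 3) NMIs that predate the flag drain */
	ftrace_mod_code();		/* 4) do the write ourselves, too */
	smp_wmb();
	mod_code_write = 0;		/* 5) clear the flag */
	smp_mb();
	wait_for_nmi();			/* 6) NMIs caught mid-write drain */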
+ */ +static atomic_t in_nmi; + +void ftrace_nmi_enter(void) +{ +	atomic_inc(&in_nmi); +} + +void ftrace_nmi_exit(void) +{ +	atomic_dec(&in_nmi); +} + +#endif /* !CONFIG_DYNAMIC_FTRACE */ + +/* Add a function return address to the trace stack on thread info.*/ +static int push_return_trace(unsigned long ret, unsigned long long time, +				unsigned long func, int *depth) +{ +	int index; + +	if (!current->ret_stack) +		return -EBUSY; + +	/* The return trace stack is full */ +	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { +		atomic_inc(&current->trace_overrun); +		return -EBUSY; +	} + +	index = ++current->curr_ret_stack; +	barrier(); +	current->ret_stack[index].ret = ret; +	current->ret_stack[index].func = func; +	current->ret_stack[index].calltime = time; +	*depth = index; + +	return 0; +} + +/* Retrieve a function return address from the trace stack on thread info.*/ +static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) +{ +	int index; + +	index = current->curr_ret_stack; + +	if (unlikely(index < 0)) { +		ftrace_graph_stop(); +		WARN_ON(1); +		/* Might as well panic, otherwise we have nowhere to go */ +		*ret = (unsigned long)panic; +		return; +	} + +	*ret = current->ret_stack[index].ret; +	trace->func = current->ret_stack[index].func; +	trace->calltime = current->ret_stack[index].calltime; +	trace->overrun = atomic_read(&current->trace_overrun); +	trace->depth = index; +	barrier(); +	current->curr_ret_stack--; + +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(void) +{ +	struct ftrace_graph_ret trace; +	unsigned long ret; + +	pop_return_trace(&trace, &ret); +	trace.rettime = cpu_clock(raw_smp_processor_id()); +	ftrace_graph_return(&trace); + +	if (unlikely(!ret)) { +		ftrace_graph_stop(); +		WARN_ON(1); +		/* Might as well panic. What else to do? */ +		ret = (unsigned long)panic; +	} + +	return ret; +} + +/* + * Hook the return address and push it in the stack of return addrs + * in current thread info. + */ +void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) +{ +	unsigned long old; +	unsigned long long calltime; +	int faulted; +	struct ftrace_graph_ent trace; +	unsigned long return_hooker = (unsigned long) +				&return_to_handler; + +	/* NMIs are currently unsupported */ +	if (unlikely(atomic_read(&in_nmi))) +		return; + +	if (unlikely(atomic_read(&current->tracing_graph_pause))) +		return; + +	/* +	 * Protect against fault, even if it shouldn't +	 * happen. This tool is too intrusive to +	 * forgo such protection. 
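The asm block that follows is just a fault-guarded version of two moves; with the _ASM_EXTABLE fixup machinery stripped away, its effect is:

	/* unguarded equivalent of the fault-protected asm below (sketch) */
	old = *parent;			/* caller's real return address */
	*parent = return_hooker;	/* divert the return into return_to_handler */
	faulted = 0;			/* the fixup path would set this to 1 */

If either access faults, control lands on the movl $1, %[faulted] fixup and the tracer shuts itself down rather than corrupt the stack.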
+	 */ +	asm volatile( +		"1: " _ASM_MOV " (%[parent_old]), %[old]\n" +		"2: " _ASM_MOV " %[return_hooker], (%[parent_replaced])\n" +		"   movl $0, %[faulted]\n" + +		".section .fixup, \"ax\"\n" +		"3: movl $1, %[faulted]\n" +		".previous\n" + +		_ASM_EXTABLE(1b, 3b) +		_ASM_EXTABLE(2b, 3b) + +		: [parent_replaced] "=r" (parent), [old] "=r" (old), +		  [faulted] "=r" (faulted) +		: [parent_old] "0" (parent), [return_hooker] "r" (return_hooker) +		: "memory" +	); + +	if (unlikely(faulted)) { +		ftrace_graph_stop(); +		WARN_ON(1); +		return; +	} + +	if (unlikely(!__kernel_text_address(old))) { +		ftrace_graph_stop(); +		*parent = old; +		WARN_ON(1); +		return; +	} + +	calltime = cpu_clock(raw_smp_processor_id()); + +	if (push_return_trace(old, calltime, +				self_addr, &trace.depth) == -EBUSY) { +		*parent = old; +		return; +	} + +	trace.func = self_addr; + +	/* Only trace if the calling function expects to */ +	if (!ftrace_graph_entry(&trace)) { +		current->curr_ret_stack--; +		*parent = old; +	} +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6c9bfc9e1e95..2bced78b0b8e 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -21,6 +21,7 @@  #include <asm/smp.h>  #include <asm/ipi.h>  #include <asm/genapic.h> +#include <asm/setup.h>  extern struct genapic apic_flat;  extern struct genapic apic_physflat; @@ -53,6 +54,9 @@ void __init setup_apic_routing(void)  			genapic = &apic_physflat;  		printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);  	} + +	if (x86_quirks->update_genapic) +		x86_quirks->update_genapic();  }  /* Same for both flat and physical. */ diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 1184210e6d0c..d7f0993b8056 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -108,8 +108,33 @@ static int __init parse_noapic(char *str)  early_param("noapic", parse_noapic);  struct irq_pin_list; + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { +	int apic, pin; +	struct irq_pin_list *next; +}; + +static struct irq_pin_list *get_one_free_irq_2_pin(int cpu) +{ +	struct irq_pin_list *pin; +	int node; + +	node = cpu_to_node(cpu); + +	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node); +	printk(KERN_DEBUG "  alloc irq_2_pin on cpu %d node %d\n", cpu, node); + +	return pin; +} +  struct irq_cfg { -	unsigned int irq;  	struct irq_pin_list *irq_2_pin;  	cpumask_t domain;  	cpumask_t old_domain; @@ -119,81 +144,95 @@ struct irq_cfg {  };  /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. 
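Note the allocation pattern introduced above: irq_2_pin entries (and, below, irq_cfg itself) are now carved out with kzalloc_node() on the node of the CPU taking the interrupt, instead of coming from a static pool sized at build time. The idiom, generically:

	/* node-local, atomic allocation as used by the helpers above */
	void *p = kzalloc_node(size, GFP_ATOMIC, cpu_to_node(cpu));

GFP_ATOMIC matters because these paths can run with interrupts off; the node argument keeps each IRQ's bookkeeping on the memory node that will actually touch it.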
*/ +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg irq_cfgx[] = { +#else  static struct irq_cfg irq_cfgx[NR_IRQS] = { -	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  }, -	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  }, -	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  }, -	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  }, -	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  }, -	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  }, -	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  }, -	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  }, -	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  }, -	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  }, -	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, -	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, -	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, -	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, -	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, -	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +#endif +	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  }, +	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  }, +	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  }, +	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  }, +	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  }, +	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  }, +	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  }, +	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  }, +	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  }, +	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  }, +	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, +	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, +	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, +	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, +	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, +	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },  }; -#define for_each_irq_cfg(irq, cfg)		\ -	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) - -static struct irq_cfg *irq_cfg(unsigned int irq) +void __init arch_early_irq_init(void)  { -	return irq < nr_irqs ? irq_cfgx + irq : NULL; +	struct irq_cfg *cfg; +	struct irq_desc *desc; +	int count; +	int i; + +	cfg = irq_cfgx; +	count = ARRAY_SIZE(irq_cfgx); + +	for (i = 0; i < count; i++) { +		desc = irq_to_desc(i); +		desc->chip_data = &cfg[i]; +	}  } -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +#ifdef CONFIG_SPARSE_IRQ +static struct irq_cfg *irq_cfg(unsigned int irq)  { -	return irq_cfg(irq); +	struct irq_cfg *cfg = NULL; +	struct irq_desc *desc; + +	desc = irq_to_desc(irq); +	if (desc) +		cfg = desc->chip_data; + +	return cfg;  } -/* - * Rough estimation of how many shared IRQs there are, can be changed - * anytime. - */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +static struct irq_cfg *get_one_free_irq_cfg(int cpu) +{ +	struct irq_cfg *cfg; +	int node; -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. 
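Under CONFIG_SPARSE_IRQ the lookup above can legitimately fail: there is no longer a slot for every number up to NR_IRQS, only descriptors that were actually created. Callers therefore follow this shape (a sketch of the pattern the sparse irq_cfg() above implies):

	struct irq_desc *desc = irq_to_desc(irq);
	struct irq_cfg *cfg = desc ? desc->chip_data : NULL;

	if (!cfg)
		return;		/* irq never set up under sparse numbering */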
- */ +	node = cpu_to_node(cpu); -struct irq_pin_list { -	int apic, pin; -	struct irq_pin_list *next; -}; +	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); +	printk(KERN_DEBUG "  alloc irq_cfg on cpu %d node %d\n", cpu, node); -static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; -static struct irq_pin_list *irq_2_pin_ptr; +	return cfg; +} -static void __init irq_2_pin_init(void) +void arch_init_chip_data(struct irq_desc *desc, int cpu)  { -	struct irq_pin_list *pin = irq_2_pin_head; -	int i; - -	for (i = 1; i < PIN_MAP_SIZE; i++) -		pin[i-1].next = &pin[i]; +	struct irq_cfg *cfg; -	irq_2_pin_ptr = &pin[0]; +	cfg = desc->chip_data; +	if (!cfg) { +		desc->chip_data = get_one_free_irq_cfg(cpu); +		if (!desc->chip_data) { +			printk(KERN_ERR "can not alloc irq_cfg\n"); +			BUG_ON(1); +		} +	}  } -static struct irq_pin_list *get_one_free_irq_2_pin(void) +#else +static struct irq_cfg *irq_cfg(unsigned int irq)  { -	struct irq_pin_list *pin = irq_2_pin_ptr; +	return irq < nr_irqs ? irq_cfgx + irq : NULL; +} -	if (!pin) -		panic("can not get more irq_2_pin\n"); +#endif -	irq_2_pin_ptr = pin->next; -	pin->next = NULL; -	return pin; +static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask) +{  }  struct io_apic { @@ -237,11 +276,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned  	writel(value, &io_apic->data);  } -static bool io_apic_level_ack_pending(unsigned int irq) +static bool io_apic_level_ack_pending(struct irq_cfg *cfg)  {  	struct irq_pin_list *entry;  	unsigned long flags; -	struct irq_cfg *cfg = irq_cfg(irq);  	spin_lock_irqsave(&ioapic_lock, flags);  	entry = cfg->irq_2_pin; @@ -323,13 +361,12 @@ static void ioapic_mask_entry(int apic, int pin)  }  #ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)  {  	int apic, pin; -	struct irq_cfg *cfg;  	struct irq_pin_list *entry; +	u8 vector = cfg->vector; -	cfg = irq_cfg(irq);  	entry = cfg->irq_2_pin;  	for (;;) {  		unsigned int reg; @@ -359,24 +396,27 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)  	}  } -static int assign_irq_vector(int irq, cpumask_t mask); +static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask); -static void set_ioapic_affinity_irq(unsigned int irq, -				    const struct cpumask *mask) +static void set_ioapic_affinity_irq_desc(struct irq_desc *desc, +					 const struct cpumask *mask)  {  	struct irq_cfg *cfg;  	unsigned long flags;  	unsigned int dest;  	cpumask_t tmp; -	struct irq_desc *desc; +	unsigned int irq;  	if (!cpumask_intersects(mask, cpu_online_mask))  		return; -	cfg = irq_cfg(irq); -	if (assign_irq_vector(irq, *mask)) +	irq = desc->irq; +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, *mask))  		return; +	set_extra_move_desc(desc, *mask); +  	cpumask_and(&tmp, &cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp);  	/* @@ -384,12 +424,21 @@ static void set_ioapic_affinity_irq(unsigned int irq,  	 */  	dest = SET_APIC_LOGICAL_ID(dest); -	desc = irq_to_desc(irq);  	spin_lock_irqsave(&ioapic_lock, flags); -	__target_IO_APIC_irq(irq, dest, cfg->vector); +	__target_IO_APIC_irq(irq, dest, cfg);  	cpumask_copy(&desc->affinity, mask);  	spin_unlock_irqrestore(&ioapic_lock, flags);  } + +static void set_ioapic_affinity_irq(unsigned int irq, +				    const struct cpumask *mask) +{ +	struct irq_desc *desc; + +	desc = irq_to_desc(irq); + +	
set_ioapic_affinity_irq_desc(desc, mask); +}  #endif /* CONFIG_SMP */  /* @@ -397,16 +446,18 @@ static void set_ioapic_affinity_irq(unsigned int irq,   * shared ISA-space IRQs, so we have to support them. We are super   * fast in the common case, and fast for shared ISA-space IRQs.   */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) +static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)  { -	struct irq_cfg *cfg;  	struct irq_pin_list *entry; -	/* first time to refer irq_cfg, so with new */ -	cfg = irq_cfg_alloc(irq);  	entry = cfg->irq_2_pin;  	if (!entry) { -		entry = get_one_free_irq_2_pin(); +		entry = get_one_free_irq_2_pin(cpu); +		if (!entry) { +			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n", +					apic, pin); +			return; +		}  		cfg->irq_2_pin = entry;  		entry->apic = apic;  		entry->pin = pin; @@ -421,7 +472,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)  		entry = entry->next;  	} -	entry->next = get_one_free_irq_2_pin(); +	entry->next = get_one_free_irq_2_pin(cpu);  	entry = entry->next;  	entry->apic = apic;  	entry->pin = pin; @@ -430,11 +481,10 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)  /*   * Reroute an IRQ to a different pin.   */ -static void __init replace_pin_at_irq(unsigned int irq, +static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,  				      int oldapic, int oldpin,  				      int newapic, int newpin)  { -	struct irq_cfg *cfg = irq_cfg(irq);  	struct irq_pin_list *entry = cfg->irq_2_pin;  	int replaced = 0; @@ -451,18 +501,16 @@ static void __init replace_pin_at_irq(unsigned int irq,  	/* why? call replace before add? */  	if (!replaced) -		add_pin_to_irq(irq, newapic, newpin); +		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);  } -static inline void io_apic_modify_irq(unsigned int irq, +static inline void io_apic_modify_irq(struct irq_cfg *cfg,  				int mask_and, int mask_or,  				void (*final)(struct irq_pin_list *entry))  {  	int pin; -	struct irq_cfg *cfg;  	struct irq_pin_list *entry; -	cfg = irq_cfg(irq);  	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {  		unsigned int reg;  		pin = entry->pin; @@ -475,9 +523,9 @@ static inline void io_apic_modify_irq(unsigned int irq,  	}  } -static void __unmask_IO_APIC_irq(unsigned int irq) +static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)  { -	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); +	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);  }  #ifdef CONFIG_X86_64 @@ -492,47 +540,64 @@ void io_apic_sync(struct irq_pin_list *entry)  	readl(&io_apic->data);  } -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg)  { -	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);  }  #else /* CONFIG_X86_32 */ -static void __mask_IO_APIC_irq(unsigned int irq) +static void __mask_IO_APIC_irq(struct irq_cfg *cfg)  { -	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); +	io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, NULL);  } -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)  { -	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, +	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,  			IO_APIC_REDIR_MASKED, NULL);  } -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)  { -	
io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, +	io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,  			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);  }  #endif /* CONFIG_X86_32 */ -static void mask_IO_APIC_irq (unsigned int irq) +static void mask_IO_APIC_irq_desc(struct irq_desc *desc)  { +	struct irq_cfg *cfg = desc->chip_data;  	unsigned long flags; +	BUG_ON(!cfg); +  	spin_lock_irqsave(&ioapic_lock, flags); -	__mask_IO_APIC_irq(irq); +	__mask_IO_APIC_irq(cfg);  	spin_unlock_irqrestore(&ioapic_lock, flags);  } -static void unmask_IO_APIC_irq (unsigned int irq) +static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)  { +	struct irq_cfg *cfg = desc->chip_data;  	unsigned long flags;  	spin_lock_irqsave(&ioapic_lock, flags); -	__unmask_IO_APIC_irq(irq); +	__unmask_IO_APIC_irq(cfg);  	spin_unlock_irqrestore(&ioapic_lock, flags);  } +static void mask_IO_APIC_irq(unsigned int irq) +{ +	struct irq_desc *desc = irq_to_desc(irq); + +	mask_IO_APIC_irq_desc(desc); +} +static void unmask_IO_APIC_irq(unsigned int irq) +{ +	struct irq_desc *desc = irq_to_desc(irq); + +	unmask_IO_APIC_irq_desc(desc); +} +  static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)  {  	struct IO_APIC_route_entry entry; @@ -809,7 +874,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);   */  static int EISA_ELCR(unsigned int irq)  { -	if (irq < 16) { +	if (irq < NR_IRQS_LEGACY) {  		unsigned int port = 0x4d0 + (irq >> 3);  		return (inb(port) >> (irq & 7)) & 1;  	} @@ -1034,7 +1099,7 @@ void unlock_vector_lock(void)  	spin_unlock(&vector_lock);  } -static int __assign_irq_vector(int irq, cpumask_t mask) +static int __assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)  {  	/*  	 * NOTE! The local APIC isn't very good at handling @@ -1050,16 +1115,13 @@ static int __assign_irq_vector(int irq, cpumask_t mask)  	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;  	unsigned int old_vector;  	int cpu; -	struct irq_cfg *cfg; -	cfg = irq_cfg(irq); +	if ((cfg->move_in_progress) || cfg->move_cleanup_count) +		return -EBUSY;  	/* Only try and allocate irqs on cpus that are present */  	cpus_and(mask, mask, cpu_online_map); -	if ((cfg->move_in_progress) || cfg->move_cleanup_count) -		return -EBUSY; -  	old_vector = cfg->vector;  	if (old_vector) {  		cpumask_t tmp; @@ -1113,24 +1175,22 @@ next:  	return -ENOSPC;  } -static int assign_irq_vector(int irq, cpumask_t mask) +static int assign_irq_vector(int irq, struct irq_cfg *cfg, cpumask_t mask)  {  	int err;  	unsigned long flags;  	spin_lock_irqsave(&vector_lock, flags); -	err = __assign_irq_vector(irq, mask); +	err = __assign_irq_vector(irq, cfg, mask);  	spin_unlock_irqrestore(&vector_lock, flags);  	return err;  } -static void __clear_irq_vector(int irq) +static void __clear_irq_vector(int irq, struct irq_cfg *cfg)  { -	struct irq_cfg *cfg;  	cpumask_t mask;  	int cpu, vector; -	cfg = irq_cfg(irq);  	BUG_ON(!cfg->vector);  	vector = cfg->vector; @@ -1162,9 +1222,13 @@ void __setup_vector_irq(int cpu)  	/* This function must be called with vector_lock held */  	int irq, vector;  	struct irq_cfg *cfg; +	struct irq_desc *desc;  	/* Mark the inuse vectors */ -	for_each_irq_cfg(irq, cfg) { +	for_each_irq_desc(irq, desc) { +		if (!desc) +			continue; +		cfg = desc->chip_data;  		if (!cpu_isset(cpu, cfg->domain))  			continue;  		vector = cfg->vector; @@ -1215,11 +1279,8 @@ static inline int IO_APIC_irq_trigger(int irq)  }  #endif -static void ioapic_register_intr(int irq, unsigned long trigger) +static void ioapic_register_intr(int irq, struct irq_desc *desc, 
unsigned long trigger)  { -	struct irq_desc *desc; - -	desc = irq_to_desc(irq);  	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||  	    trigger == IOAPIC_LEVEL) @@ -1311,7 +1372,7 @@ static int setup_ioapic_entry(int apic, int irq,  	return 0;  } -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, struct irq_desc *desc,  			      int trigger, int polarity)  {  	struct irq_cfg *cfg; @@ -1321,10 +1382,10 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,  	if (!IO_APIC_IRQ(irq))  		return; -	cfg = irq_cfg(irq); +	cfg = desc->chip_data;  	mask = TARGET_CPUS; -	if (assign_irq_vector(irq, mask)) +	if (assign_irq_vector(irq, cfg, mask))  		return;  	cpus_and(mask, cfg->domain, mask); @@ -1341,12 +1402,12 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,  			       cfg->vector)) {  		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",  		       mp_ioapics[apic].mp_apicid, pin); -		__clear_irq_vector(irq); +		__clear_irq_vector(irq, cfg);  		return;  	} -	ioapic_register_intr(irq, trigger); -	if (irq < 16) +	ioapic_register_intr(irq, desc, trigger); +	if (irq < NR_IRQS_LEGACY)  		disable_8259A_irq(irq);  	ioapic_write_entry(apic, pin, entry); @@ -1356,6 +1417,9 @@ static void __init setup_IO_APIC_irqs(void)  {  	int apic, pin, idx, irq;  	int notcon = 0; +	struct irq_desc *desc; +	struct irq_cfg *cfg; +	int cpu = boot_cpu_id;  	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); @@ -1387,9 +1451,15 @@ static void __init setup_IO_APIC_irqs(void)  			if (multi_timer_check(apic, irq))  				continue;  #endif -			add_pin_to_irq(irq, apic, pin); +			desc = irq_to_desc_alloc_cpu(irq, cpu); +			if (!desc) { +				printk(KERN_INFO "can not get irq_desc for %d\n", irq); +				continue; +			} +			cfg = desc->chip_data; +			add_pin_to_irq_cpu(cfg, cpu, apic, pin); -			setup_IO_APIC_irq(apic, pin, irq, +			setup_IO_APIC_irq(apic, pin, irq, desc,  					irq_trigger(idx), irq_polarity(idx));  		}  	} @@ -1448,6 +1518,7 @@ __apicdebuginit(void) print_IO_APIC(void)  	union IO_APIC_reg_03 reg_03;  	unsigned long flags;  	struct irq_cfg *cfg; +	struct irq_desc *desc;  	unsigned int irq;  	if (apic_verbosity == APIC_QUIET) @@ -1537,8 +1608,13 @@ __apicdebuginit(void) print_IO_APIC(void)  	}  	}  	printk(KERN_DEBUG "IRQ to pin mappings:\n"); -	for_each_irq_cfg(irq, cfg) { -		struct irq_pin_list *entry = cfg->irq_2_pin; +	for_each_irq_desc(irq, desc) { +		struct irq_pin_list *entry; + +		if (!desc) +			continue; +		cfg = desc->chip_data; +		entry = cfg->irq_2_pin;  		if (!entry)  			continue;  		printk(KERN_DEBUG "IRQ%d ", irq); @@ -2022,14 +2098,16 @@ static unsigned int startup_ioapic_irq(unsigned int irq)  {  	int was_pending = 0;  	unsigned long flags; +	struct irq_cfg *cfg;  	spin_lock_irqsave(&ioapic_lock, flags); -	if (irq < 16) { +	if (irq < NR_IRQS_LEGACY) {  		disable_8259A_irq(irq);  		if (i8259A_irq_pending(irq))  			was_pending = 1;  	} -	__unmask_IO_APIC_irq(irq); +	cfg = irq_cfg(irq); +	__unmask_IO_APIC_irq(cfg);  	spin_unlock_irqrestore(&ioapic_lock, flags);  	return was_pending; @@ -2092,35 +2170,37 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);   * as simple as edge triggered migration and we can do the irq migration   * with a simple atomic update to IO-APIC RTE.   
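 *
 * A condensed sketch of the desc-based flow implemented below
 * (locking, error handling and the RTE/IRTE field updates are
 * abridged; get_irte()/modify_irte() are the interrupt-remapping
 * helpers this file relies on):
 *
 *	cfg = desc->chip_data;
 *	get_irte(desc->irq, &irte);
 *	assign_irq_vector(desc->irq, cfg, mask);
 *	irte.vector = cfg->vector;
 *	modify_irte(desc->irq, &irte);
 *	desc->affinity = mask;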
*/ -static void migrate_ioapic_irq(int irq, cpumask_t mask) +static void migrate_ioapic_irq_desc(struct irq_desc *desc, cpumask_t mask)  {  	struct irq_cfg *cfg; -	struct irq_desc *desc;  	cpumask_t tmp, cleanup_mask;  	struct irte irte;  	int modify_ioapic_rte;  	unsigned int dest;  	unsigned long flags; +	unsigned int irq;  	cpus_and(tmp, mask, cpu_online_map);  	if (cpus_empty(tmp))  		return; +	irq = desc->irq;  	if (get_irte(irq, &irte))  		return; -	if (assign_irq_vector(irq, mask)) +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, mask))  		return; -	cfg = irq_cfg(irq); +	set_extra_move_desc(desc, mask); +  	cpus_and(tmp, cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp); -	desc = irq_to_desc(irq);  	modify_ioapic_rte = desc->status & IRQ_LEVEL;  	if (modify_ioapic_rte) {  		spin_lock_irqsave(&ioapic_lock, flags); -		__target_IO_APIC_irq(irq, dest, cfg->vector); +		__target_IO_APIC_irq(irq, dest, cfg);  		spin_unlock_irqrestore(&ioapic_lock, flags);  	} @@ -2142,14 +2222,14 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask)  	desc->affinity = mask;  } -static int migrate_irq_remapped_level(int irq) +static int migrate_irq_remapped_level_desc(struct irq_desc *desc)  {  	int ret = -1; -	struct irq_desc *desc = irq_to_desc(irq); +	struct irq_cfg *cfg = desc->chip_data; -	mask_IO_APIC_irq(irq); +	mask_IO_APIC_irq_desc(desc); -	if (io_apic_level_ack_pending(irq)) { +	if (io_apic_level_ack_pending(cfg)) {  		/*  		 * Interrupt in progress. Migrating irq now will change the  		 * vector information in the IO-APIC RTE and that will confuse @@ -2161,14 +2241,15 @@ static int migrate_irq_remapped_level(int irq)  	}  	/* everthing is clear. we have right of way */ -	migrate_ioapic_irq(irq, desc->pending_mask); +	migrate_ioapic_irq_desc(desc, desc->pending_mask);  	ret = 0;  	desc->status &= ~IRQ_MOVE_PENDING;  	cpus_clear(desc->pending_mask);  unmask: -	unmask_IO_APIC_irq(irq); +	unmask_IO_APIC_irq_desc(desc); +  	return ret;  } @@ -2178,6 +2259,9 @@ static void ir_irq_migration(struct work_struct *work)  	struct irq_desc *desc;  	for_each_irq_desc(irq, desc) { +		if (!desc) +			continue; +  		if (desc->status & IRQ_MOVE_PENDING) {  			unsigned long flags; @@ -2198,19 +2282,24 @@ static void ir_irq_migration(struct work_struct *work)  /*   * Migrates the IRQ destination in the process context.   
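 *
 * As with the other affinity setters in this patch, the work is split
 * into a desc-based worker plus a thin irq-number wrapper, so callers
 * that already hold the descriptor skip the irq_to_desc() lookup.
 * The wrapper pattern, with hypothetical names:
 *
 *	static void op_irq(unsigned int irq, const struct cpumask *m)
 *	{
 *		op_desc(irq_to_desc(irq), m);
 *	}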
*/ -static void set_ir_ioapic_affinity_irq(unsigned int irq, -				       const struct cpumask *mask) +static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, +					    const struct cpumask *mask)  { -	struct irq_desc *desc = irq_to_desc(irq); -  	if (desc->status & IRQ_LEVEL) {  		desc->status |= IRQ_MOVE_PENDING;  		cpumask_copy(&desc->pending_mask, mask); -		migrate_irq_remapped_level(irq); +		migrate_irq_remapped_level_desc(desc);  		return;  	} -	migrate_ioapic_irq(irq, *mask); +	migrate_ioapic_irq_desc(desc, mask); +} +static void set_ir_ioapic_affinity_irq(unsigned int irq, +				       const struct cpumask *mask) +{ +	struct irq_desc *desc = irq_to_desc(irq); + +	set_ir_ioapic_affinity_irq_desc(desc, mask);  }  #endif @@ -2230,6 +2319,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)  		struct irq_cfg *cfg;  		irq = __get_cpu_var(vector_irq)[vector]; +		if (irq == -1) +			continue; +  		desc = irq_to_desc(irq);  		if (!desc)  			continue; @@ -2251,9 +2343,10 @@ unlock:  	irq_exit();  } -static void irq_complete_move(unsigned int irq) +static void irq_complete_move(struct irq_desc **descp)  { -	struct irq_cfg *cfg = irq_cfg(irq); +	struct irq_desc *desc = *descp; +	struct irq_cfg *cfg = desc->chip_data;  	unsigned vector, me;  	if (likely(!cfg->move_in_progress)) @@ -2271,8 +2364,9 @@ static void irq_complete_move(unsigned int irq)  	}  }  #else -static inline void irq_complete_move(unsigned int irq) {} +static inline void irq_complete_move(struct irq_desc **descp) {}  #endif +  #ifdef CONFIG_INTR_REMAP  static void ack_x2apic_level(unsigned int irq)  { @@ -2283,11 +2377,14 @@ static void ack_x2apic_edge(unsigned int irq)  {  	ack_x2APIC_irq();  } +  #endif  static void ack_apic_edge(unsigned int irq)  { -	irq_complete_move(irq); +	struct irq_desc *desc = irq_to_desc(irq); + +	irq_complete_move(&desc);  	move_native_irq(irq);  	ack_APIC_irq();  } @@ -2296,18 +2393,21 @@ atomic_t irq_mis_count;  static void ack_apic_level(unsigned int irq)  { +	struct irq_desc *desc = irq_to_desc(irq); +  #ifdef CONFIG_X86_32  	unsigned long v;  	int i;  #endif +	struct irq_cfg *cfg;  	int do_unmask_irq = 0; -	irq_complete_move(irq); +	irq_complete_move(&desc);  #ifdef CONFIG_GENERIC_PENDING_IRQ  	/* If we are moving the irq we need to mask it */ -	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { +	if (unlikely(desc->status & IRQ_MOVE_PENDING)) {  		do_unmask_irq = 1; -		mask_IO_APIC_irq(irq); +		mask_IO_APIC_irq_desc(desc);  	}  #endif @@ -2331,7 +2431,8 @@ static void ack_apic_level(unsigned int irq)  	* operation to prevent an edge-triggered interrupt escaping meanwhile.  	* The idea is from Manfred Spraul.  --macro  	*/ -	i = irq_cfg(irq)->vector; +	cfg = desc->chip_data; +	i = cfg->vector;  	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));  #endif @@ -2370,17 +2471,18 @@ static void ack_apic_level(unsigned int irq)  		 * accurate and is causing problems then it is a hardware bug  		 * and you can go talk to the chipset vendor about it.  		 
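 *
 * io_apic_level_ack_pending() now takes the irq_cfg directly, so
 * desc->chip_data is re-read here instead of doing another lookup by
 * irq number.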
*/ -		if (!io_apic_level_ack_pending(irq)) +		cfg = desc->chip_data; +		if (!io_apic_level_ack_pending(cfg))  			move_masked_irq(irq); -		unmask_IO_APIC_irq(irq); +		unmask_IO_APIC_irq_desc(desc);  	}  #ifdef CONFIG_X86_32  	if (!(v & (1 << (i & 0x1f)))) {  		atomic_inc(&irq_mis_count);  		spin_lock(&ioapic_lock); -		__mask_and_edge_IO_APIC_irq(irq); -		__unmask_and_level_IO_APIC_irq(irq); +		__mask_and_edge_IO_APIC_irq(cfg); +		__unmask_and_level_IO_APIC_irq(cfg);  		spin_unlock(&ioapic_lock);  	}  #endif @@ -2431,20 +2533,22 @@ static inline void init_IO_APIC_traps(void)  	 * Also, we've got to be careful not to trash gate  	 * 0x80, because int 0x80 is hm, kind of importantish. ;)  	 */ -	for_each_irq_cfg(irq, cfg) { -		if (IO_APIC_IRQ(irq) && !cfg->vector) { +	for_each_irq_desc(irq, desc) { +		if (!desc) +			continue; + +		cfg = desc->chip_data; +		if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {  			/*  			 * Hmm.. We don't have an entry for this,  			 * so default to an old-fashioned 8259  			 * interrupt if we can..  			 */ -			if (irq < 16) +			if (irq < NR_IRQS_LEGACY)  				make_8259A_irq(irq); -			else { -				desc = irq_to_desc(irq); +			else  				/* Strange. Oh, well.. */  				desc->chip = &no_irq_chip; -			}  		}  	}  } @@ -2469,7 +2573,7 @@ static void unmask_lapic_irq(unsigned int irq)  	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);  } -static void ack_lapic_irq (unsigned int irq) +static void ack_lapic_irq(unsigned int irq)  {  	ack_APIC_irq();  } @@ -2481,11 +2585,8 @@ static struct irq_chip lapic_chip __read_mostly = {  	.ack		= ack_lapic_irq,  }; -static void lapic_register_intr(int irq) +static void lapic_register_intr(int irq, struct irq_desc *desc)  { -	struct irq_desc *desc; - -	desc = irq_to_desc(irq);  	desc->status &= ~IRQ_LEVEL;  	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,  				      "edge"); @@ -2589,7 +2690,9 @@ int timer_through_8259 __initdata;   */  static inline void __init check_timer(void)  { -	struct irq_cfg *cfg = irq_cfg(0); +	struct irq_desc *desc = irq_to_desc(0); +	struct irq_cfg *cfg = desc->chip_data; +	int cpu = boot_cpu_id;  	int apic1, pin1, apic2, pin2;  	unsigned long flags;  	unsigned int ver; @@ -2604,7 +2707,7 @@ static inline void __init check_timer(void)  	 * get/set the timer IRQ vector:  	 */  	disable_8259A_irq(0); -	assign_irq_vector(0, TARGET_CPUS); +	assign_irq_vector(0, cfg, TARGET_CPUS);  	/*  	 * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2655,10 +2758,10 @@ static inline void __init check_timer(void)  		 * Ok, does IRQ0 through the IOAPIC work?  		 */  		if (no_pin1) { -			add_pin_to_irq(0, apic1, pin1); +			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);  			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);  		} -		unmask_IO_APIC_irq(0); +		unmask_IO_APIC_irq_desc(desc);  		if (timer_irq_works()) {  			if (nmi_watchdog == NMI_IO_APIC) {  				setup_nmi(); @@ -2684,9 +2787,9 @@ static inline void __init check_timer(void)  		/*  		 * legacy devices should be connected to IO APIC #0  		 */ -		replace_pin_at_irq(0, apic1, pin1, apic2, pin2); +		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);  		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); -		unmask_IO_APIC_irq(0); +		unmask_IO_APIC_irq_desc(desc);  		enable_8259A_irq(0);  		if (timer_irq_works()) {  			apic_printk(APIC_QUIET, KERN_INFO "....... 
works.\n"); @@ -2718,7 +2821,7 @@ static inline void __init check_timer(void)  	apic_printk(APIC_QUIET, KERN_INFO  		    "...trying to set up timer as Virtual Wire IRQ...\n"); -	lapic_register_intr(0); +	lapic_register_intr(0, desc);  	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */  	enable_8259A_irq(0); @@ -2903,22 +3006,26 @@ unsigned int create_irq_nr(unsigned int irq_want)  	unsigned int irq;  	unsigned int new;  	unsigned long flags; -	struct irq_cfg *cfg_new; - -	irq_want = nr_irqs - 1; +	struct irq_cfg *cfg_new = NULL; +	int cpu = boot_cpu_id; +	struct irq_desc *desc_new = NULL;  	irq = 0;  	spin_lock_irqsave(&vector_lock, flags); -	for (new = irq_want; new > 0; new--) { +	for (new = irq_want; new < NR_IRQS; new++) {  		if (platform_legacy_irq(new))  			continue; -		cfg_new = irq_cfg(new); -		if (cfg_new && cfg_new->vector != 0) + +		desc_new = irq_to_desc_alloc_cpu(new, cpu); +		if (!desc_new) { +			printk(KERN_INFO "can not get irq_desc for %d\n", new); +			continue; +		} +		cfg_new = desc_new->chip_data; + +		if (cfg_new->vector != 0)  			continue; -		/* check if need to create one */ -		if (!cfg_new) -			cfg_new = irq_cfg_alloc(new); -		if (__assign_irq_vector(new, TARGET_CPUS) == 0) +		if (__assign_irq_vector(new, cfg_new, TARGET_CPUS) == 0)  			irq = new;  		break;  	} @@ -2926,15 +3033,21 @@ unsigned int create_irq_nr(unsigned int irq_want)  	if (irq > 0) {  		dynamic_irq_init(irq); +		/* restore it, in case dynamic_irq_init clear it */ +		if (desc_new) +			desc_new->chip_data = cfg_new;  	}  	return irq;  } +static int nr_irqs_gsi = NR_IRQS_LEGACY;  int create_irq(void)  { +	unsigned int irq_want;  	int irq; -	irq = create_irq_nr(nr_irqs - 1); +	irq_want = nr_irqs_gsi; +	irq = create_irq_nr(irq_want);  	if (irq == 0)  		irq = -1; @@ -2945,14 +3058,22 @@ int create_irq(void)  void destroy_irq(unsigned int irq)  {  	unsigned long flags; +	struct irq_cfg *cfg; +	struct irq_desc *desc; +	/* store it, in case dynamic_irq_cleanup clear it */ +	desc = irq_to_desc(irq); +	cfg = desc->chip_data;  	dynamic_irq_cleanup(irq); +	/* connect back irq_cfg */ +	if (desc) +		desc->chip_data = cfg;  #ifdef CONFIG_INTR_REMAP  	free_irte(irq);  #endif  	spin_lock_irqsave(&vector_lock, flags); -	__clear_irq_vector(irq); +	__clear_irq_vector(irq, cfg);  	spin_unlock_irqrestore(&vector_lock, flags);  } @@ -2967,12 +3088,12 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  	unsigned dest;  	cpumask_t tmp; +	cfg = irq_cfg(irq);  	tmp = TARGET_CPUS; -	err = assign_irq_vector(irq, tmp); +	err = assign_irq_vector(irq, cfg, tmp);  	if (err)  		return err; -	cfg = irq_cfg(irq);  	cpus_and(tmp, cfg->domain, tmp);  	dest = cpu_mask_to_apicid(tmp); @@ -3030,34 +3151,34 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms  #ifdef CONFIG_SMP  static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  { +	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg;  	struct msi_msg msg;  	unsigned int dest;  	cpumask_t tmp; -	struct irq_desc *desc;  	if (!cpumask_intersects(mask, cpu_online_mask))  		return; -	if (assign_irq_vector(irq, *mask)) +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, *mask))  		return; -	cfg = irq_cfg(irq); +	set_extra_move_desc(desc, *mask); +  	cpumask_and(&tmp, &cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp); -	read_msi_msg(irq, &msg); +	read_msi_msg_desc(desc, &msg);  	msg.data &= ~MSI_DATA_VECTOR_MASK;  	msg.data |= MSI_DATA_VECTOR(cfg->vector);  	
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;  	msg.address_lo |= MSI_ADDR_DEST_ID(dest); -	write_msi_msg(irq, &msg); -	desc = irq_to_desc(irq); +	write_msi_msg_desc(desc, &msg);  	cpumask_copy(&desc->affinity, mask);  } -  #ifdef CONFIG_INTR_REMAP  /*   * Migrate the MSI irq to another cpumask. This migration is @@ -3066,11 +3187,11 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)  static void ir_set_msi_irq_affinity(unsigned int irq,  				    const struct cpumask *mask)  { +	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg;  	unsigned int dest;  	cpumask_t tmp, cleanup_mask;  	struct irte irte; -	struct irq_desc *desc;  	if (!cpumask_intersects(mask, cpu_online_mask))  		return; @@ -3078,10 +3199,12 @@ static void ir_set_msi_irq_affinity(unsigned int irq,  	if (get_irte(irq, &irte))  		return; -	if (assign_irq_vector(irq, *mask)) +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, *mask))  		return; -	cfg = irq_cfg(irq); +	set_extra_move_desc(desc, mask); +  	cpumask_and(&tmp, &cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp); @@ -3105,9 +3228,9 @@ static void ir_set_msi_irq_affinity(unsigned int irq,  		cfg->move_in_progress = 0;  	} -	desc = irq_to_desc(irq);  	cpumask_copy(&desc->affinity, mask);  } +  #endif  #endif /* CONFIG_SMP */ @@ -3166,7 +3289,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)  }  #endif -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)  {  	int ret;  	struct msi_msg msg; @@ -3175,7 +3298,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)  	if (ret < 0)  		return ret; -	set_irq_msi(irq, desc); +	set_irq_msi(irq, msidesc);  	write_msi_msg(irq, &msg);  #ifdef CONFIG_INTR_REMAP @@ -3195,26 +3318,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)  	return 0;  } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ -	unsigned int irq; - -	irq = dev->bus->number; -	irq <<= 8; -	irq |= dev->devfn; -	irq <<= 12; - -	return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)  {  	unsigned int irq;  	int ret;  	unsigned int irq_want; -	irq_want = build_irq_for_pci_dev(dev) + 0x100; - +	irq_want = nr_irqs_gsi;  	irq = create_irq_nr(irq_want);  	if (irq == 0)  		return -1; @@ -3228,7 +3338,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)  		goto error;  no_ir:  #endif -	ret = setup_msi_irq(dev, desc, irq); +	ret = setup_msi_irq(dev, msidesc, irq);  	if (ret < 0) {  		destroy_irq(irq);  		return ret; @@ -3246,7 +3356,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  {  	unsigned int irq;  	int ret, sub_handle; -	struct msi_desc *desc; +	struct msi_desc *msidesc;  	unsigned int irq_want;  #ifdef CONFIG_INTR_REMAP @@ -3254,10 +3364,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  	int index = 0;  #endif -	irq_want = build_irq_for_pci_dev(dev) + 0x100; +	irq_want = nr_irqs_gsi;  	sub_handle = 0; -	list_for_each_entry(desc, &dev->msi_list, list) { -		irq = create_irq_nr(irq_want--); +	list_for_each_entry(msidesc, &dev->msi_list, list) { +		irq = create_irq_nr(irq_want); +		irq_want++;  		if (irq == 0)  			return -1;  #ifdef CONFIG_INTR_REMAP @@ -3289,7 +3400,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)  		}  no_ir:  #endif -		ret = 
setup_msi_irq(dev, desc, irq); +		ret = setup_msi_irq(dev, msidesc, irq);  		if (ret < 0)  			goto error;  		sub_handle++; @@ -3310,19 +3421,21 @@ void arch_teardown_msi_irq(unsigned int irq)  #ifdef CONFIG_SMP  static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  { +	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg;  	struct msi_msg msg;  	unsigned int dest;  	cpumask_t tmp; -	struct irq_desc *desc;  	if (!cpumask_intersects(mask, cpu_online_mask))  		return; -	if (assign_irq_vector(irq, *mask)) +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, *mask))  		return; -	cfg = irq_cfg(irq); +	set_extra_move_desc(desc, *mask); +  	cpumask_and(&tmp, &cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp); @@ -3334,9 +3447,9 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);  	dmar_msi_write(irq, &msg); -	desc = irq_to_desc(irq);  	cpumask_copy(&desc->affinity, mask);  } +  #endif /* CONFIG_SMP */  struct irq_chip dmar_msi_type = { @@ -3370,8 +3483,8 @@ int arch_setup_dmar_msi(unsigned int irq)  #ifdef CONFIG_SMP  static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  { +	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg; -	struct irq_desc *desc;  	struct msi_msg msg;  	unsigned int dest;  	cpumask_t tmp; @@ -3379,10 +3492,12 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	if (!cpumask_intersects(mask, cpu_online_mask))  		return; -	if (assign_irq_vector(irq, *mask)) +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, *mask))  		return; -	cfg = irq_cfg(irq); +	set_extra_move_desc(desc, *mask); +  	cpumask_and(&tmp, &cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp); @@ -3394,9 +3509,9 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)  	msg.address_lo |= MSI_ADDR_DEST_ID(dest);  	hpet_msi_write(irq, &msg); -	desc = irq_to_desc(irq);  	cpumask_copy(&desc->affinity, mask);  } +  #endif /* CONFIG_SMP */  struct irq_chip hpet_msi_type = { @@ -3451,25 +3566,27 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)  static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)  { +	struct irq_desc *desc = irq_to_desc(irq);  	struct irq_cfg *cfg;  	unsigned int dest;  	cpumask_t tmp; -	struct irq_desc *desc;  	if (!cpumask_intersects(mask, cpu_online_mask))  		return; -	if (assign_irq_vector(irq, *mask)) +	cfg = desc->chip_data; +	if (assign_irq_vector(irq, cfg, *mask))  		return; -	cfg = irq_cfg(irq); +	set_extra_move_desc(desc, *mask); +  	cpumask_and(&tmp, &cfg->domain, mask);  	dest = cpu_mask_to_apicid(tmp);  	target_ht_irq(irq, dest, cfg->vector); -	desc = irq_to_desc(irq);  	cpumask_copy(&desc->affinity, mask);  } +  #endif  static struct irq_chip ht_irq_chip = { @@ -3489,13 +3606,13 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)  	int err;  	cpumask_t tmp; +	cfg = irq_cfg(irq);  	tmp = TARGET_CPUS; -	err = assign_irq_vector(irq, tmp); +	err = assign_irq_vector(irq, cfg, tmp);  	if (!err) {  		struct ht_irq_msg msg;  		unsigned dest; -		cfg = irq_cfg(irq);  		cpus_and(tmp, cfg->domain, tmp);  		dest = cpu_mask_to_apicid(tmp); @@ -3541,7 +3658,9 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  	unsigned long flags;  	int err; -	err = assign_irq_vector(irq, *eligible_cpu); +	cfg = irq_cfg(irq); + +	err = assign_irq_vector(irq, cfg, *eligible_cpu);  	if (err != 0)  		
return err; @@ -3550,8 +3669,6 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,  				      irq_name);  	spin_unlock_irqrestore(&vector_lock, flags); -	cfg = irq_cfg(irq); -  	mmr_value = 0;  	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;  	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); @@ -3603,9 +3720,16 @@ int __init io_apic_get_redir_entries (int ioapic)  	return reg_01.bits.entries;  } -int __init probe_nr_irqs(void) +void __init probe_nr_irqs_gsi(void)  { -	return NR_IRQS; +	int idx; +	int nr = 0; + +	for (idx = 0; idx < nr_ioapics; idx++) +		nr += io_apic_get_redir_entries(idx) + 1; + +	if (nr > nr_irqs_gsi) +		nr_irqs_gsi = nr;  }  /* -------------------------------------------------------------------------- @@ -3704,19 +3828,31 @@ int __init io_apic_get_version(int ioapic)  int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)  { +	struct irq_desc *desc; +	struct irq_cfg *cfg; +	int cpu = boot_cpu_id; +  	if (!IO_APIC_IRQ(irq)) {  		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",  			ioapic);  		return -EINVAL;  	} +	desc = irq_to_desc_alloc_cpu(irq, cpu); +	if (!desc) { +		printk(KERN_INFO "can not get irq_desc %d\n", irq); +		return 0; +	} +  	/*  	 * IRQs < 16 are already in the irq_2_pin[] map  	 */ -	if (irq >= 16) -		add_pin_to_irq(irq, ioapic, pin); +	if (irq >= NR_IRQS_LEGACY) { +		cfg = desc->chip_data; +		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin); +	} -	setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); +	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);  	return 0;  } @@ -3770,9 +3906,10 @@ void __init setup_ioapic_dest(void)  			 * when you have too many devices, because at that time only boot  			 * cpu is online.  			 
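 *
 * The descriptor is looked up once here; its chip_data then serves
 * both the vector check and the affinity call further down.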
*/ -			cfg = irq_cfg(irq); +			desc = irq_to_desc(irq); +			cfg = desc->chip_data;  			if (!cfg->vector) { -				setup_IO_APIC_irq(ioapic, pin, irq, +				setup_IO_APIC_irq(ioapic, pin, irq, desc,  						  irq_trigger(irq_entry),  						  irq_polarity(irq_entry));  				continue; @@ -3782,7 +3919,6 @@ void __init setup_ioapic_dest(void)  			/*  			 * Honour affinities which have been set in early boot  			 */ -			desc = irq_to_desc(irq);  			if (desc->status &  			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))  				mask = desc->affinity; @@ -3791,10 +3927,10 @@ void __init setup_ioapic_dest(void)  #ifdef CONFIG_INTR_REMAP  			if (intr_remapping_enabled) -				set_ir_ioapic_affinity_irq(irq, &mask); +				set_ir_ioapic_affinity_irq_desc(desc, &mask);  			else  #endif -				set_ioapic_affinity_irq(irq, &mask); +				set_ioapic_affinity_irq_desc(desc, &mask);  		}  	} @@ -3843,7 +3979,6 @@ void __init ioapic_init_mappings(void)  	struct resource *ioapic_res;  	int i; -	irq_2_pin_init();  	ioapic_res = ioapic_setup_resources();  	for (i = 0; i < nr_ioapics; i++) {  		if (smp_found_config) { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f649..3f1d9d18df67 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)  	}  	desc = irq_to_desc(i); +	if (!desc) +		return 0; +  	spin_lock_irqsave(&desc->lock, flags);  #ifndef CONFIG_SMP  	any_count = kstat_irqs(i); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 87870a49be4e..9cf9cbbf7a02 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -242,6 +242,8 @@ void fixup_irqs(cpumask_t map)  	for_each_irq_desc(irq, desc) {  		cpumask_t mask; +		if (!desc) +			continue;  		if (irq == 2)  			continue; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 7d37f847544d..27f2307b0a34 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -94,6 +94,8 @@ void fixup_irqs(cpumask_t map)  		int break_affinity = 0;  		int set_affinity = 1; +		if (!desc) +			continue;  		if (irq == 2)  			continue; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 845aa9803e80..6a92f47c52e7 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -68,8 +68,7 @@ void __init init_ISA_irqs (void)  	/*  	 * 16 old-style INTA-cycle interrupts:  	 */ -	for (i = 0; i < 16; i++) { -		/* first time call this irq_desc */ +	for (i = 0; i < NR_IRQS_LEGACY; i++) {  		struct irq_desc *desc = irq_to_desc(i);  		desc->status = IRQ_DISABLED; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index ff0235391285..40c1e62ec785 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,8 +142,7 @@ void __init init_ISA_irqs(void)  	init_bsp_APIC();  	init_8259A(0); -	for (i = 0; i < 16; i++) { -		/* first time call this irq_desc */ +	for (i = 0; i < NR_IRQS_LEGACY; i++) {  		struct irq_desc *desc = irq_to_desc(i);  		desc->status = IRQ_DISABLED; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0f4c1fd5a1f4..45e3b69808ba 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -586,26 +586,23 @@ static void __init __get_smp_config(unsigned int early)  {  	struct intel_mp_floating *mpf = mpf_found; -	if (x86_quirks->mach_get_smp_config) { -		if (x86_quirks->mach_get_smp_config(early)) -			return; -	} +	if (!mpf) +		return; +  	if (acpi_lapic && early)  		return; +  	/* -	 * ACPI supports both logical (e.g. 
Hyper-Threading) and physical -	 * processors, where MPS only supports physical. +	 * MPS doesn't support hyperthreading, aka only have +	 * thread 0 apic id in MPS table  	 */ -	if (acpi_lapic && acpi_ioapic) { -		printk(KERN_INFO "Using ACPI (MADT) for SMP configuration " -		       "information\n"); +	if (acpi_lapic && acpi_ioapic)  		return; -	} else if (acpi_lapic) -		printk(KERN_INFO "Using ACPI for processor (LAPIC) " -		       "configuration information\n"); -	if (!mpf) -		return; +	if (x86_quirks->mach_get_smp_config) { +		if (x86_quirks->mach_get_smp_config(early)) +			return; +	}  	printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",  	       mpf->mpf_specification); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 4caff39078e0..0deea37a53cf 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -31,7 +31,7 @@  #include <asm/numaq.h>  #include <asm/topology.h>  #include <asm/processor.h> -#include <asm/mpspec.h> +#include <asm/genapic.h>  #include <asm/e820.h>  #include <asm/setup.h> @@ -235,6 +235,13 @@ static int __init numaq_setup_ioapic_ids(void)  	return 1;  } +static int __init numaq_update_genapic(void) +{ +	genapic->wakeup_cpu = wakeup_secondary_cpu_via_nmi; + +	return 0; +} +  static struct x86_quirks numaq_x86_quirks __initdata = {  	.arch_pre_time_init	= numaq_pre_time_init,  	.arch_time_init		= NULL, @@ -250,6 +257,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {  	.mpc_oem_pci_bus	= mpc_oem_pci_bus,  	.smp_read_mpc_oem	= smp_read_mpc_oem,  	.setup_ioapic_ids	= numaq_setup_ioapic_ids, +	.update_genapic		= numaq_update_genapic,  };  void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c622772744d8..95d811a9594f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,7 +7,9 @@  #include <linux/module.h>  #include <linux/pm.h>  #include <linux/clockchips.h> +#include <linux/ftrace.h>  #include <asm/system.h> +#include <asm/apic.h>  unsigned long idle_halt;  EXPORT_SYMBOL(idle_halt); @@ -100,6 +102,9 @@ static inline int hlt_use_halt(void)  void default_idle(void)  {  	if (hlt_use_halt()) { +		struct power_trace it; + +		trace_power_start(&it, POWER_CSTATE, 1);  		current_thread_info()->status &= ~TS_POLLING;  		/*  		 * TS_POLLING-cleared state must be visible before we @@ -112,6 +117,7 @@ void default_idle(void)  		else  			local_irq_enable();  		current_thread_info()->status |= TS_POLLING; +		trace_power_end(&it);  	} else {  		local_irq_enable();  		/* loop is done by the caller */ @@ -122,6 +128,21 @@ void default_idle(void)  EXPORT_SYMBOL(default_idle);  #endif +void stop_this_cpu(void *dummy) +{ +	local_irq_disable(); +	/* +	 * Remove this CPU: +	 */ +	cpu_clear(smp_processor_id(), cpu_online_map); +	disable_local_APIC(); + +	for (;;) { +		if (hlt_works(smp_processor_id())) +			halt(); +	} +} +  static void do_nothing(void *unused)  {  } @@ -154,24 +175,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);   */  void mwait_idle_with_hints(unsigned long ax, unsigned long cx)  { +	struct power_trace it; + +	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);  	if (!need_resched()) {  		__monitor((void *)¤t_thread_info()->flags, 0, 0);  		smp_mb();  		if (!need_resched())  			__mwait(ax, cx);  	} +	trace_power_end(&it);  }  /* Default MONITOR/MWAIT with no hints, used for default C1 state */  static void mwait_idle(void)  { +	struct power_trace it;  	if (!need_resched()) { +		trace_power_start(&it, POWER_CSTATE, 
1);  		__monitor((void *)¤t_thread_info()->flags, 0, 0);  		smp_mb();  		if (!need_resched())  			__sti_mwait(0, 0);  		else  			local_irq_enable(); +		trace_power_end(&it);  	} else  		local_irq_enable();  } @@ -183,9 +211,13 @@ static void mwait_idle(void)   */  static void poll_idle(void)  { +	struct power_trace it; + +	trace_power_start(&it, POWER_CSTATE, 0);  	local_irq_enable();  	while (!need_resched())  		cpu_relax(); +	trace_power_end(&it);  }  /* diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0a1302fe6d45..24c2276aa453 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,6 +38,7 @@  #include <linux/percpu.h>  #include <linux/prctl.h>  #include <linux/dmi.h> +#include <linux/ftrace.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h> @@ -548,7 +549,8 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,   * the task-switch, and shows up in ret_from_fork in entry.S,   * for example.   */ -struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +__notrace_funcgraph struct task_struct * +__switch_to(struct task_struct *prev_p, struct task_struct *next_p)  {  	struct thread_struct *prev = &prev_p->thread,  				 *next = &next_p->thread; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index c958120fb1b6..fbb321d53d34 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -39,6 +39,7 @@  #include <linux/prctl.h>  #include <linux/uaccess.h>  #include <linux/io.h> +#include <linux/ftrace.h>  #include <asm/pgtable.h>  #include <asm/system.h> @@ -551,8 +552,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,   * - could test fs/gs bitsliced   *   * Kprobes not supported here. Set the probe on schedule instead. + * Function graph tracer not supported too.   
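 *
 * __notrace_funcgraph expands to notrace when the graph tracer is
 * built in, which keeps the tracer's return-address rewriting away
 * from the context-switch path.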
*/ -struct task_struct * +__notrace_funcgraph struct task_struct *  __switch_to(struct task_struct *prev_p, struct task_struct *next_p)  {  	struct thread_struct *prev = &prev_p->thread; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 0a6d8c12e10d..2c8ec1ba75e6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -668,14 +668,14 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,  	size_t bts_index, bts_end;  	int error; -	error = ds_get_bts_end(child, &bts_end); +	error = ds_get_bts_end(child->bts, &bts_end);  	if (error < 0)  		return error;  	if (bts_end <= index)  		return -EINVAL; -	error = ds_get_bts_index(child, &bts_index); +	error = ds_get_bts_index(child->bts, &bts_index);  	if (error < 0)  		return error; @@ -684,7 +684,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,  	if (bts_end <= bts_index)  		bts_index -= bts_end; -	error = ds_access_bts(child, bts_index, &bts_record); +	error = ds_access_bts(child->bts, bts_index, &bts_record);  	if (error < 0)  		return error; @@ -705,14 +705,14 @@ static int ptrace_bts_drain(struct task_struct *child,  	size_t end, i;  	int error; -	error = ds_get_bts_index(child, &end); +	error = ds_get_bts_index(child->bts, &end);  	if (error < 0)  		return error;  	if (size < (end * sizeof(struct bts_struct)))  		return -EIO; -	error = ds_access_bts(child, 0, (const void **)&raw); +	error = ds_access_bts(child->bts, 0, (const void **)&raw);  	if (error < 0)  		return error; @@ -723,18 +723,13 @@ static int ptrace_bts_drain(struct task_struct *child,  			return -EFAULT;  	} -	error = ds_clear_bts(child); +	error = ds_clear_bts(child->bts);  	if (error < 0)  		return error;  	return end;  } -static void ptrace_bts_ovfl(struct task_struct *child) -{ -	send_sig(child->thread.bts_ovfl_signal, child, 0); -} -  static int ptrace_bts_config(struct task_struct *child,  			     long cfg_size,  			     const struct ptrace_bts_config __user *ucfg) @@ -760,23 +755,45 @@ static int ptrace_bts_config(struct task_struct *child,  		goto errout;  	if (cfg.flags & PTRACE_BTS_O_ALLOC) { -		ds_ovfl_callback_t ovfl = NULL; +		bts_ovfl_callback_t ovfl = NULL;  		unsigned int sig = 0; -		/* we ignore the error in case we were not tracing child */ -		(void)ds_release_bts(child); +		error = -EINVAL; +		if (cfg.size < (10 * bts_cfg.sizeof_bts)) +			goto errout;  		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {  			if (!cfg.signal)  				goto errout; +			error = -EOPNOTSUPP; +			goto errout; +  			sig  = cfg.signal; -			ovfl = ptrace_bts_ovfl;  		} -		error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); -		if (error < 0) +		if (child->bts) { +			(void)ds_release_bts(child->bts); +			kfree(child->bts_buffer); + +			child->bts = NULL; +			child->bts_buffer = NULL; +		} + +		error = -ENOMEM; +		child->bts_buffer = kzalloc(cfg.size, GFP_KERNEL); +		if (!child->bts_buffer) +			goto errout; + +		child->bts = ds_request_bts(child, child->bts_buffer, cfg.size, +					    ovfl, /* th = */ (size_t)-1); +		if (IS_ERR(child->bts)) { +			error = PTR_ERR(child->bts); +			kfree(child->bts_buffer); +			child->bts = NULL; +			child->bts_buffer = NULL;  			goto errout; +		}  		child->thread.bts_ovfl_signal = sig;  	} @@ -823,15 +840,15 @@ static int ptrace_bts_status(struct task_struct *child,  	if (cfg_size < sizeof(cfg))  		return -EIO; -	error = ds_get_bts_end(child, &end); +	error = ds_get_bts_end(child->bts, &end);  	if (error < 0)  		return error; -	error = ds_access_bts(child, /* index = 
*/ 0, &base); +	error = ds_access_bts(child->bts, /* index = */ 0, &base);  	if (error < 0)  		return error; -	error = ds_access_bts(child, /* index = */ end, &max); +	error = ds_access_bts(child->bts, /* index = */ end, &max);  	if (error < 0)  		return error; @@ -884,10 +901,7 @@ static int ptrace_bts_write_record(struct task_struct *child,  		return -EINVAL;  	} -	/* The writing task will be the switched-to task on a context -	 * switch. It needs to write into the switched-from task's BTS -	 * buffer. */ -	return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); +	return ds_write_bts(child->bts, bts_record, bts_cfg.sizeof_bts);  }  void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -929,17 +943,16 @@ void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)  	switch (c->x86) {  	case 0x6:  		switch (c->x86_model) { +		case 0 ... 0xC: +			/* sorry, don't know about them */ +			break;  		case 0xD:  		case 0xE: /* Pentium M */  			bts_configure(&bts_cfg_pentium_m);  			break; -		case 0xF: /* Core2 */ -        case 0x1C: /* Atom */ +		default: /* Core2, Atom, ... */  			bts_configure(&bts_cfg_core2);  			break; -		default: -			/* sorry, don't know about them */ -			break;  		}  		break;  	case 0xF: @@ -973,13 +986,17 @@ void ptrace_disable(struct task_struct *child)  	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);  #endif  #ifdef CONFIG_X86_PTRACE_BTS -	(void)ds_release_bts(child); +	if (child->bts) { +		(void)ds_release_bts(child->bts); +		kfree(child->bts_buffer); +		child->bts_buffer = NULL; -	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; -	if (!child->thread.debugctlmsr) -		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); +		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; +		if (!child->thread.debugctlmsr) +			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); -	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +	}  #endif /* CONFIG_X86_PTRACE_BTS */  } @@ -1111,9 +1128,16 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)  			(child, data, (struct ptrace_bts_config __user *)addr);  		break; -	case PTRACE_BTS_SIZE: -		ret = ds_get_bts_index(child, /* pos = */ NULL); +	case PTRACE_BTS_SIZE: { +		size_t size; + +		ret = ds_get_bts_index(child->bts, &size); +		if (ret == 0) { +			BUG_ON(size != (int) size); +			ret = (int) size; +		}  		break; +	}  	case PTRACE_BTS_GET:  		ret = ptrace_bts_read_record @@ -1121,7 +1145,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)  		break;  	case PTRACE_BTS_CLEAR: -		ret = ds_clear_bts(child); +		ret = ds_clear_bts(child->bts);  		break;  	case PTRACE_BTS_DRAIN: diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index cc5a2545dd41..0e3dbc7b2bdb 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -36,7 +36,10 @@ int reboot_force;  static int reboot_cpu = -1;  #endif -/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] +/* This is set by the PCI code if either type 1 or type 2 PCI is detected */ +bool port_cf9_safe = false; + +/* reboot=b[ios] | s[mp] | t[riple] | k[bd] | e[fi] [, [w]arm | [c]old] | p[ci]     warm   Don't set the cold reboot flag     cold   Set the cold reboot flag     bios   Reboot by jumping through the BIOS (only for X86_32) @@ -45,6 +48,7 @@ static int reboot_cpu = -1;     kbd    Use the keyboard controller. 
cold reset (default)     acpi   Use the RESET_REG in the FADT     efi    Use efi reset_system runtime service +   pci    Use the so-called "PCI reset register", CF9     force  Avoid anything that could hang.   */  static int __init reboot_setup(char *str) @@ -79,6 +83,7 @@ static int __init reboot_setup(char *str)  		case 'k':  		case 't':  		case 'e': +		case 'p':  			reboot_type = *str;  			break; @@ -404,12 +409,27 @@ static void native_machine_emergency_restart(void)  			reboot_type = BOOT_KBD;  			break; -  		case BOOT_EFI:  			if (efi_enabled) -				efi.reset_system(reboot_mode ? EFI_RESET_WARM : EFI_RESET_COLD, +				efi.reset_system(reboot_mode ? +						 EFI_RESET_WARM : +						 EFI_RESET_COLD,  						 EFI_SUCCESS, 0, NULL); +			reboot_type = BOOT_KBD; +			break; + +		case BOOT_CF9: +			port_cf9_safe = true; +			/* fall through */ +		case BOOT_CF9_COND: +			if (port_cf9_safe) { +				u8 cf9 = inb(0xcf9) & ~6; +				outb(cf9|2, 0xcf9); /* Request hard reset */ +				udelay(50); +				outb(cf9|6, 0xcf9); /* Actually do the reset */ +				udelay(50); +			}  			reboot_type = BOOT_KBD;  			break;  		} @@ -470,6 +490,11 @@ static void native_machine_restart(char *__unused)  static void native_machine_halt(void)  { +	/* stop other cpus and apics */ +	machine_shutdown(); + +	/* stop this cpu */ +	stop_this_cpu(NULL);  }  static void native_machine_power_off(void) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d5674f7b6cc..b9018955a04f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -583,7 +583,20 @@ static int __init setup_elfcorehdr(char *arg)  early_param("elfcorehdr", setup_elfcorehdr);  #endif -static struct x86_quirks default_x86_quirks __initdata; +static int __init default_update_genapic(void) +{ +#ifdef CONFIG_X86_SMP +# if defined(CONFIG_X86_GENERICARCH) || defined(CONFIG_X86_64) +	genapic->wakeup_cpu = wakeup_secondary_cpu_via_init; +# endif +#endif + +	return 0; +} + +static struct x86_quirks default_x86_quirks __initdata = { +	.update_genapic         = default_update_genapic, +};  struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; @@ -1082,7 +1095,7 @@ void __init setup_arch(char **cmdline_p)  	ioapic_init_mappings();  	/* need to wait for io_apic is mapped */ -	nr_irqs = probe_nr_irqs(); +	probe_nr_irqs_gsi();  	kvm_guest_init(); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 18f9b19f5f8f..3f92b134ab90 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -140,19 +140,6 @@ void native_send_call_func_ipi(cpumask_t mask)  		send_IPI_mask(mask, CALL_FUNCTION_VECTOR);  } -static void stop_this_cpu(void *dummy) -{ -	local_irq_disable(); -	/* -	 * Remove this CPU: -	 */ -	cpu_clear(smp_processor_id(), cpu_online_map); -	disable_local_APIC(); -	if (hlt_works(smp_processor_id())) -		for (;;) halt(); -	for (;;); -} -  /*   * this function calls the 'stop' function on all other CPUs in the system.   
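 *
 * stop_this_cpu() itself moved to process.c (see the hunk above), so
 * that native_machine_halt() can reuse it even when smp.c is not
 * built.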
*/ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 468c2f9d47ae..9d58134e0231 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -62,6 +62,7 @@  #include <asm/mtrr.h>  #include <asm/vmi.h>  #include <asm/genapic.h> +#include <asm/setup.h>  #include <linux/mc146818rtc.h>  #include <mach_apic.h> @@ -530,7 +531,7 @@ static void impress_friends(void)  	pr_debug("Before bogocount - setting activated=1.\n");  } -static inline void __inquire_remote_apic(int apicid) +void __inquire_remote_apic(int apicid)  {  	unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };  	char *names[] = { "ID", "VERSION", "SPIV" }; @@ -569,14 +570,13 @@ static inline void __inquire_remote_apic(int apicid)  	}  } -#ifdef WAKE_SECONDARY_VIA_NMI  /*   * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal   * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this   * won't ... remember to clear down the APIC, etc later.   */ -static int __devinit -wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)  {  	unsigned long send_status, accept_status = 0;  	int maxlvt; @@ -593,7 +593,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)  	 * Give the other CPU some time to accept the IPI.  	 */  	udelay(200); -	if (APIC_INTEGRATED(apic_version[phys_apicid])) { +	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {  		maxlvt = lapic_get_maxlvt();  		if (maxlvt > 3)			/* Due to the Pentium erratum 3AP.  */  			apic_write(APIC_ESR, 0); @@ -608,11 +608,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)  	return (send_status | accept_status);  } -#endif	/* WAKE_SECONDARY_VIA_NMI */ -#ifdef WAKE_SECONDARY_VIA_INIT -static int __devinit -wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) +int __devinit +wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)  {  	unsigned long send_status, accept_status = 0;  	int maxlvt, num_starts, j; @@ -731,7 +729,6 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)  	return (send_status | accept_status);  } -#endif	/* WAKE_SECONDARY_VIA_INIT */  struct create_idle {  	struct work_struct work; diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index a03e7f6d90c3..10786af95545 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -6,6 +6,7 @@  #include <linux/sched.h>  #include <linux/stacktrace.h>  #include <linux/module.h> +#include <linux/uaccess.h>  #include <asm/stacktrace.h>  static void save_stack_warning(void *data, char *msg) @@ -83,3 +84,66 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)  		trace->entries[trace->nr_entries++] = ULONG_MAX;  }  EXPORT_SYMBOL_GPL(save_stack_trace_tsk); + +/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ + +struct stack_frame { +	const void __user	*next_fp; +	unsigned long		ret_addr; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ +	int ret; + +	if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) +		return 0; + +	ret = 1; +	pagefault_disable(); +	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) +		ret = 0; +	pagefault_enable(); + +	return ret; +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) +{ +	const struct pt_regs *regs = task_pt_regs(current); +	const void __user *fp = (const void __user 
*)regs->bp; + +	if (trace->nr_entries < trace->max_entries) +		trace->entries[trace->nr_entries++] = regs->ip; + +	while (trace->nr_entries < trace->max_entries) { +		struct stack_frame frame; + +		frame.next_fp = NULL; +		frame.ret_addr = 0; +		if (!copy_stack_frame(fp, &frame)) +			break; +		if ((unsigned long)fp < regs->sp) +			break; +		if (frame.ret_addr) { +			trace->entries[trace->nr_entries++] = +				frame.ret_addr; +		} +		if (fp == frame.next_fp) +			break; +		fp = frame.next_fp; +	} +} + +void save_stack_trace_user(struct stack_trace *trace) +{ +	/* +	 * Trace user stack if we are not a kernel thread +	 */ +	if (current->mm) { +		__save_stack_trace_user(trace); +	} +	if (trace->nr_entries < trace->max_entries) +		trace->entries[trace->nr_entries++] = ULONG_MAX; +} + diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0b8b6690a86d..6f3d3d4cd973 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -17,6 +17,9 @@   *  want per guest time just set the kernel.vsyscall64 sysctl to 0.   */ +/* Disable profiling for userspace code: */ +#define DISABLE_BRANCH_PROFILING +  #include <linux/time.h>  #include <linux/init.h>  #include <linux/kernel.h> diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9e68075544f6..4a20b2f9a381 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -39,7 +39,7 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon  #define __do_strncpy_from_user(dst, src, count, res)			   \  do {									   \  	int __d0, __d1, __d2;						   \ -	might_sleep();							   \ +	might_fault();							   \  	__asm__ __volatile__(						   \  		"	testl %1,%1\n"					   \  		"	jz 2f\n"					   \ @@ -126,7 +126,7 @@ EXPORT_SYMBOL(strncpy_from_user);  #define __do_clear_user(addr,size)					\  do {									\  	int __d0;							\ -	might_sleep();							\ +	might_fault();							\  	__asm__ __volatile__(						\  		"0:	rep; stosl\n"					\  		"	movl %2,%0\n"					\ @@ -155,7 +155,7 @@ do {									\  unsigned long  clear_user(void __user *to, unsigned long n)  { -	might_sleep(); +	might_fault();  	if (access_ok(VERIFY_WRITE, to, n))  		__do_clear_user(to, n);  	return n; @@ -197,7 +197,7 @@ long strnlen_user(const char __user *s, long n)  	unsigned long mask = -__addr_ok(s);  	unsigned long res, tmp; -	might_sleep(); +	might_fault();  	__asm__ __volatile__(  		"	testl %0, %0\n" diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index f4df6e7c718b..64d6c84e6353 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -15,7 +15,7 @@  #define __do_strncpy_from_user(dst,src,count,res)			   \  do {									   \  	long __d0, __d1, __d2;						   \ -	might_sleep();							   \ +	might_fault();							   \  	__asm__ __volatile__(						   \  		"	testq %1,%1\n"					   \  		"	jz 2f\n"					   \ @@ -64,7 +64,7 @@ EXPORT_SYMBOL(strncpy_from_user);  unsigned long __clear_user(void __user *addr, unsigned long size)  {  	long __d0; -	might_sleep(); +	might_fault();  	/* no memory constraint because it doesn't change any memory gcc knows  	   about */  	asm volatile( diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 3c3b471ea496..3624a364b7f3 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -17,6 +17,7 @@  #include <asm/bigsmp/apic.h>  #include <asm/bigsmp/ipi.h>  #include <asm/mach-default/mach_mpparse.h> +#include <asm/mach-default/mach_wakecpu.h>  static int dmi_bigsmp; /* can be set 
by dmi scanners */ diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c index 9e835a11a13a..e63a4a76d8cd 100644 --- a/arch/x86/mach-generic/default.c +++ b/arch/x86/mach-generic/default.c @@ -16,6 +16,7 @@  #include <asm/mach-default/mach_apic.h>  #include <asm/mach-default/mach_ipi.h>  #include <asm/mach-default/mach_mpparse.h> +#include <asm/mach-default/mach_wakecpu.h>  /* should be called last. */  static int probe_default(void) diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 28459cab3ddb..7b4e6d0d1690 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -16,7 +16,19 @@  #include <asm/es7000/apic.h>  #include <asm/es7000/ipi.h>  #include <asm/es7000/mpparse.h> -#include <asm/es7000/wakecpu.h> +#include <asm/mach-default/mach_wakecpu.h> + +void __init es7000_update_genapic_to_cluster(void) +{ +	genapic->target_cpus = target_cpus_cluster; +	genapic->int_delivery_mode = INT_DELIVERY_MODE_CLUSTER; +	genapic->int_dest_mode = INT_DEST_MODE_CLUSTER; +	genapic->no_balance_irq = NO_BALANCE_IRQ_CLUSTER; + +	genapic->init_apic_ldr = init_apic_ldr_cluster; + +	genapic->cpu_mask_to_apicid = cpu_mask_to_apicid_cluster; +}  static int probe_es7000(void)  { diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c index 5a7e4619e1c4..c346d9d0226f 100644 --- a/arch/x86/mach-generic/probe.c +++ b/arch/x86/mach-generic/probe.c @@ -15,6 +15,7 @@  #include <asm/mpspec.h>  #include <asm/apicdef.h>  #include <asm/genapic.h> +#include <asm/setup.h>  extern struct genapic apic_numaq;  extern struct genapic apic_summit; @@ -57,6 +58,9 @@ static int __init parse_apic(char *arg)  		}  	} +	if (x86_quirks->update_genapic) +		x86_quirks->update_genapic(); +  	/* Parsed again by __setup for debug/verbose */  	return 0;  } @@ -72,12 +76,15 @@ void __init generic_bigsmp_probe(void)  	 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support  	 */ -	if (!cmdline_apic && genapic == &apic_default) +	if (!cmdline_apic && genapic == &apic_default) {  		if (apic_bigsmp.probe()) {  			genapic = &apic_bigsmp; +			if (x86_quirks->update_genapic) +				x86_quirks->update_genapic();  			printk(KERN_INFO "Overriding APIC driver with %s\n",  			       genapic->name);  		} +	}  #endif  } @@ -94,6 +101,9 @@ void __init generic_apic_probe(void)  		/* Not visible without early console */  		if (!apic_probe[i])  			panic("Didn't find an APIC driver"); + +		if (x86_quirks->update_genapic) +			x86_quirks->update_genapic();  	}  	printk(KERN_INFO "Using APIC driver %s\n", genapic->name);  } @@ -108,6 +118,8 @@ int __init mps_oem_check(struct mp_config_table *mpc, char *oem,  		if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {  			if (!cmdline_apic) {  				genapic = apic_probe[i]; +				if (x86_quirks->update_genapic) +					x86_quirks->update_genapic();  				printk(KERN_INFO "Switched to APIC driver `%s'.\n",  				       genapic->name);  			} @@ -124,6 +136,8 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)  		if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {  			if (!cmdline_apic) {  				genapic = apic_probe[i]; +				if (x86_quirks->update_genapic) +					x86_quirks->update_genapic();  				printk(KERN_INFO "Switched to APIC driver `%s'.\n",  				       genapic->name);  			} diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6272b5e69da6..2c6d234e0009 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -16,6 +16,7 @@  
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 6272b5e69da6..2c6d234e0009 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -16,6 +16,7 @@
 #include <asm/summit/apic.h>
 #include <asm/summit/ipi.h>
 #include <asm/summit/mpparse.h>
+#include <asm/mach-default/mach_wakecpu.h>
 
 static int probe_summit(void)
 {
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index fea4565ff576..d8cc96a2738f 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -8,9 +8,8 @@ obj-$(CONFIG_X86_PTDUMP)	+= dump_pagetables.o
 
 obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
-obj-$(CONFIG_MMIOTRACE_HOOKS)	+= kmmio.o
 obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
-mmiotrace-y			:= pf_in.o mmio-mod.o
+mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
 
 obj-$(CONFIG_NUMA)		+= numa_$(BITS).o
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 31e8730fa246..21e996a70d68 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -53,7 +53,7 @@
 
 static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
 {
-#ifdef CONFIG_MMIOTRACE_HOOKS
+#ifdef CONFIG_MMIOTRACE
 	if (unlikely(is_kmmio_active()))
 		if (kmmio_handler(regs, addr) == 1)
 			return -1;
@@ -413,6 +413,7 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 				 unsigned long error_code)
 {
 	unsigned long flags = oops_begin();
+	int sig = SIGKILL;
 	struct task_struct *tsk;
 
 	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
@@ -423,8 +424,8 @@ static noinline void pgtable_bad(unsigned long address, struct pt_regs *regs,
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
 	if (__die("Bad pagetable", regs, error_code))
-		regs = NULL;
-	oops_end(flags, regs, SIGKILL);
+		sig = 0;
+	oops_end(flags, regs, sig);
 }
 #endif
 
@@ -590,6 +591,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	int fault;
 #ifdef CONFIG_X86_64
 	unsigned long flags;
+	int sig;
 #endif
 
 	tsk = current;
@@ -849,11 +851,12 @@ no_context:
 	bust_spinlocks(0);
 	do_exit(SIGKILL);
 #else
+	sig = SIGKILL;
 	if (__die("Oops", regs, error_code))
-		regs = NULL;
+		sig = 0;
 
 	/* Executive summary in case the body of the oops scrolled away */
 	printk(KERN_EMERG "CR2: %016lx\n", address);
-	oops_end(flags, regs, SIGKILL);
+	oops_end(flags, regs, sig);
 #endif
 
 /*
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index 9915293500fb..9a5af6c8fbe9 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -173,7 +173,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus,
 
 #undef PCI_CONF2_ADDRESS
 
-static struct pci_raw_ops pci_direct_conf2 = {
+struct pci_raw_ops pci_direct_conf2 = {
 	.read =		pci_conf2_read,
 	.write =	pci_conf2_write,
 };
@@ -289,6 +289,7 @@ int __init pci_direct_probe(void)
 
 	if (pci_check_type1()) {
 		raw_pci_ops = &pci_direct_conf1;
+		port_cf9_safe = true;
 		return 1;
 	}
 	release_resource(region);
@@ -305,6 +306,7 @@ int __init pci_direct_probe(void)
 
 	if (pci_check_type2()) {
 		raw_pci_ops = &pci_direct_conf2;
+		port_cf9_safe = true;
 		return 2;
 	}
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 15b9cf6be729..1959018aac02 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -96,6 +96,7 @@ extern struct pci_raw_ops *raw_pci_ops;
 extern struct pci_raw_ops *raw_pci_ext_ops;
 
 extern struct pci_raw_ops pci_direct_conf1;
+extern bool port_cf9_safe;
 
 /* arch_initcall level */
 extern int pci_direct_probe(void);
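Note on port_cf9_safe: it is set in pci_direct_probe() above only after probing has confirmed that type 1 or type 2 configuration access works, i.e. that port 0xCF9 really is a PCI reset register rather than some unrelated device. A sketch of the intended consumer in the reboot path follows; that code is not part of this section, so treat the details as assumptions.

/* Sketch: fall back to a hard reset through the PCI reset port,
 * but only when probing has declared port 0xCF9 safe to touch. */
static void try_cf9_reset(void)
{
	if (port_cf9_safe) {
		u8 cf9 = inb(0xcf9) & ~6;

		outb(cf9 | 2, 0xcf9);	/* request a hard reset */
		udelay(50);
		outb(cf9 | 6, 0xcf9);	/* perform the reset */
		udelay(50);
	}
}
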
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 1ef0f90813d6..d9d35824c56f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,9 @@
  * Also alternative() doesn't work.
  */
 
+/* Disable profiling for userspace code: */
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
 #include <linux/time.h>
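Note on the two DISABLE_BRANCH_PROFILING sites above: vsyscall_64.c and vclock_gettime.c are built into pages that execute in user mode, while the branch profiler turns every likely()/unlikely() into an instrumented expression that writes to kernel data. A simplified sketch of what the instrumented form expands to is below; the real macro in the tracing patches this pairs with differs in detail.

/* Sketch of an instrumented unlikely(); the counter update below would
 * fault if executed from a user-mode mapping such as the vDSO. */
#define unlikely(cond) ({						\
	static struct ftrace_branch_data				\
		__attribute__((section("_ftrace_annotated_branch")))	\
		______f = {						\
			.func	= __func__,				\
			.file	= __FILE__,				\
			.line	= __LINE__,				\
		};							\
	int ______r = __builtin_expect(!!(cond), 0);			\
	ftrace_likely_update(&______f, ______r, 1);			\
	______r;							\
})

Defining DISABLE_BRANCH_PROFILING before any header is included restores the plain __builtin_expect() form for these two files.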
