Diffstat (limited to 'arch/x86/mm/numa.c')
-rw-r--r--	arch/x86/mm/numa.c	212
1 file changed, 209 insertions, 3 deletions
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index ebf6d7887a38..9559d360fde7 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -26,12 +26,50 @@ static __init int numa_setup(char *opt)
 early_param("numa", numa_setup);
 
 /*
- * Which logical CPUs are on which nodes
+ * apicid, cpu, node mappings
  */
+s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
 cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
 EXPORT_SYMBOL(node_to_cpumask_map);
 
 /*
+ * Map cpu index to node index
+ */
+DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
+EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
+
+void __cpuinit numa_set_node(int cpu, int node)
+{
+	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
+
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
+		cpu_to_node_map[cpu] = node;
+		return;
+	}
+
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
+
+	if (node != NUMA_NO_NODE)
+		set_cpu_numa_node(cpu, node);
+}
+
+void __cpuinit numa_clear_node(int cpu)
+{
+	numa_set_node(cpu, NUMA_NO_NODE);
+}
+
+/*
  * Allocate node_to_cpumask_map based on number of available nodes
  * Requires node_possible_map to be valid.
  *
@@ -57,7 +95,174 @@ void __init setup_node_to_cpumask_map(void)
 	pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
 }
 
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+/*
+ * There are unfortunately some poorly designed mainboards around that
+ * only connect memory to a single CPU. This breaks the 1:1 cpu->node
+ * mapping. To avoid this fill in the mapping for all possible CPUs,
+ * as the number of CPUs is not known yet. We round robin the existing
+ * nodes.
+ */
+void __init numa_init_array(void)
+{
+	int rr, i;
+
+	rr = first_node(node_online_map);
+	for (i = 0; i < nr_cpu_ids; i++) {
+		if (early_cpu_to_node(i) != NUMA_NO_NODE)
+			continue;
+		numa_set_node(i, rr);
+		rr = next_node(rr, node_online_map);
+		if (rr == MAX_NUMNODES)
+			rr = first_node(node_online_map);
+	}
+}
+
+static __init int find_near_online_node(int node)
+{
+	int n, val;
+	int min_val = INT_MAX;
+	int best_node = -1;
+
+	for_each_online_node(n) {
+		val = node_distance(node, n);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	return best_node;
+}
+
+/*
+ * Setup early cpu_to_node.
+ *
+ * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
+ * and apicid_to_node[] tables have valid entries for a CPU.
+ * This means we skip cpu_to_node[] initialisation for NUMA
+ * emulation and faking node case (when running a kernel compiled
+ * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
+ * is already initialized in a round robin manner at numa_init_array,
+ * prior to this call, and this initialization is good enough
+ * for the fake NUMA cases.
+ *
+ * Called before the per_cpu areas are setup.
+ */
+void __init init_cpu_to_node(void)
+{
+	int cpu;
+	u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
+
+	BUG_ON(cpu_to_apicid == NULL);
+
+	for_each_possible_cpu(cpu) {
+		int node = numa_cpu_node(cpu);
+
+		if (node == NUMA_NO_NODE)
+			continue;
+		if (!node_online(node))
+			node = find_near_online_node(node);
+		numa_set_node(cpu, node);
+	}
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+
+# ifndef CONFIG_NUMA_EMU
+void __cpuinit numa_add_cpu(int cpu)
+{
+	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
+#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
+
+int __cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
+		printk(KERN_WARNING
+			"cpu_to_node(%d): usage too early!\n", cpu);
+		dump_stack();
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+EXPORT_SYMBOL(__cpu_to_node);
+
+/*
+ * Same function as cpu_to_node() but used if called before the
+ * per_cpu areas are setup.
+ */
+int early_cpu_to_node(int cpu)
+{
+	if (early_per_cpu_ptr(x86_cpu_to_node_map))
+		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
+
+	if (!cpu_possible(cpu)) {
+		printk(KERN_WARNING
+			"early_cpu_to_node(%d): no per_cpu area!\n", cpu);
+		dump_stack();
+		return NUMA_NO_NODE;
+	}
+	return per_cpu(x86_cpu_to_node_map, cpu);
+}
+
+struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+	int node = early_cpu_to_node(cpu);
+	struct cpumask *mask;
+	char buf[64];
+
+	if (node == NUMA_NO_NODE) {
+		/* early_cpu_to_node() already emits a warning and trace */
+		return NULL;
+	}
+	mask = node_to_cpumask_map[node];
+	if (!mask) {
+		pr_err("node_to_cpumask_map[%i] NULL\n", node);
+		dump_stack();
+		return NULL;
+	}
+
+	cpulist_scnprintf(buf, sizeof(buf), mask);
+	printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+		enable ? "numa_add_cpu" : "numa_remove_cpu",
+		cpu, node, buf);
+	return mask;
+}
+
+# ifndef CONFIG_NUMA_EMU
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+	struct cpumask *mask;
+
+	mask = debug_cpumask_set_cpu(cpu, enable);
+	if (!mask)
+		return;
+
+	if (enable)
+		cpumask_set_cpu(cpu, mask);
+	else
+		cpumask_clear_cpu(cpu, mask);
+}
+
+void __cpuinit numa_add_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 1);
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+	numa_set_cpumask(cpu, 0);
+}
+# endif	/* !CONFIG_NUMA_EMU */
+
 /*
  * Returns a pointer to the bitmask of CPUs on Node 'node'.
  */
@@ -80,4 +285,5 @@ const struct cpumask *cpumask_of_node(int node)
 	return node_to_cpumask_map[node];
 }
 EXPORT_SYMBOL(cpumask_of_node);
-#endif
+
+#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
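
The mechanism the patch leans on is a two-phase cpu->node map: before the per-CPU areas exist, numa_set_node() writes into the static early array returned by early_per_cpu_ptr(x86_cpu_to_node_map); once the per-CPU areas have been set up that pointer is cleared and the per_cpu() variable is used instead, which is also the order early_cpu_to_node() checks on the read side. The following is a minimal, self-contained C sketch of that hand-off using plain arrays; it is an illustration only, and names such as fake_setup_per_cpu_areas(), set_node() and get_node() are hypothetical stand-ins, not kernel APIs.

#include <stdio.h>

#define NR_CPUS      4
#define NUMA_NO_NODE (-1)

/* "Early" map: valid only until the (simulated) per-CPU areas exist. */
static int early_cpu_to_node_map[NR_CPUS] = {
	[0 ... NR_CPUS - 1] = NUMA_NO_NODE	/* GNU range initializer, as in the patch */
};
static int *early_map_ptr = early_cpu_to_node_map;

/* Stand-in for the real per-CPU storage: one slot per CPU. */
static int percpu_cpu_to_node[NR_CPUS];

/* Mirrors the shape of numa_set_node(): early array first, per-CPU copy later. */
static void set_node(int cpu, int node)
{
	if (early_map_ptr) {			/* early boot: no per-CPU area yet */
		early_map_ptr[cpu] = node;
		return;
	}
	percpu_cpu_to_node[cpu] = node;		/* normal path after per-CPU setup */
}

/* Mirrors early_cpu_to_node(): usable both before and after per-CPU setup. */
static int get_node(int cpu)
{
	if (early_map_ptr)
		return early_map_ptr[cpu];
	return percpu_cpu_to_node[cpu];
}

/* Hypothetical helper: copy the early map into "per-CPU" storage and retire it. */
static void fake_setup_per_cpu_areas(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		percpu_cpu_to_node[cpu] = early_cpu_to_node_map[cpu];
	early_map_ptr = NULL;			/* from now on only the per-CPU copy is used */
}

int main(void)
{
	set_node(0, 0);				/* early assignment, lands in the static array */
	set_node(1, 1);

	fake_setup_per_cpu_areas();		/* simulates the per-CPU area switch-over */

	set_node(2, 0);				/* late assignment, lands in per-CPU storage */

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d -> node %d\n", cpu, get_node(cpu));
	return 0;
}

Built with gcc (the range initializer is a GNU extension), this prints cpu 0 -> node 0, cpu 1 -> node 1, cpu 2 -> node 0 and cpu 3 -> node -1: assignments made in either phase remain visible through the same lookup, which is why callers never have to know whether the per-CPU areas were already up when the mapping was recorded.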
