diff options
Diffstat (limited to 'arch/x86/mm/numa_64.c')
| -rw-r--r-- | arch/x86/mm/numa_64.c | 233 | 
1 files changed, 100 insertions, 133 deletions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 83bbc70d11b..3307ea8bd43 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -427,7 +427,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,  	 * Calculate the number of big nodes that can be allocated as a result  	 * of consolidating the remainder.  	 */ -	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) & nr_nodes) / +	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /  		FAKE_NODE_MIN_SIZE;  	size &= FAKE_NODE_MIN_HASH_MASK; @@ -502,77 +502,99 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,  }  /* - * Splits num_nodes nodes up equally starting at node_start.  The return value - * is the number of nodes split up and addr is adjusted to be at the end of the - * last node allocated. + * Returns the end address of a node so that there is at least `size' amount of + * non-reserved memory or `max_addr' is reached.   */ -static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, -				      int num_nodes) +static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)  { -	unsigned int big; -	u64 size; -	int i; - -	if (num_nodes <= 0) -		return -1; -	if (num_nodes > MAX_NUMNODES) -		num_nodes = MAX_NUMNODES; -	size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / -	       num_nodes; -	/* -	 * Calculate the number of big nodes that can be allocated as a result -	 * of consolidating the leftovers. -	 */ -	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / -	      FAKE_NODE_MIN_SIZE; - -	/* Round down to nearest FAKE_NODE_MIN_SIZE. */ -	size &= FAKE_NODE_MIN_HASH_MASK; -	if (!size) { -		printk(KERN_ERR "Not enough memory for each node.  " -		       "NUMA emulation disabled.\n"); -		return -1; -	} - -	for (i = node_start; i < num_nodes + node_start; i++) { -		u64 end = *addr + size; +	u64 end = start + size; -		if (i < big) -			end += FAKE_NODE_MIN_SIZE; -		/* -		 * The final node can have the remaining system RAM.  Other -		 * nodes receive roughly the same amount of available pages. -		 */ -		if (i == num_nodes + node_start - 1) +	while (end - start - e820_hole_size(start, end) < size) { +		end += FAKE_NODE_MIN_SIZE; +		if (end > max_addr) {  			end = max_addr; -		else -			while (end - *addr - e820_hole_size(*addr, end) < -			       size) { -				end += FAKE_NODE_MIN_SIZE; -				if (end > max_addr) { -					end = max_addr; -					break; -				} -			} -		if (setup_node_range(i, addr, end - *addr, max_addr) < 0)  			break; +		}  	} -	return i - node_start + 1; +	return end;  }  /* - * Splits the remaining system RAM into chunks of size.  The remaining memory is - * always assigned to a final node and can be asymmetric.  Returns the number of - * nodes split. + * Sets up fake nodes of `size' interleaved over physical nodes ranging from + * `addr' to `max_addr'.  The return value is the number of nodes allocated.   */ -static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, -				      u64 size) +static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)  { -	int i = node_start; -	size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; -	while (!setup_node_range(i++, addr, size, max_addr)) -		; -	return i - node_start; +	nodemask_t physnode_mask = NODE_MASK_NONE; +	u64 min_size; +	int ret = 0; +	int i; + +	if (!size) +		return -1; +	/* +	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is +	 * increased accordingly if the requested size is too small.  This +	 * creates a uniform distribution of node sizes across the entire +	 * machine (but not necessarily over physical nodes). +	 */ +	min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) / +						MAX_NUMNODES; +	min_size = max(min_size, FAKE_NODE_MIN_SIZE); +	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) +		min_size = (min_size + FAKE_NODE_MIN_SIZE) & +						FAKE_NODE_MIN_HASH_MASK; +	if (size < min_size) { +		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n", +			size >> 20, min_size >> 20); +		size = min_size; +	} +	size &= FAKE_NODE_MIN_HASH_MASK; + +	for (i = 0; i < MAX_NUMNODES; i++) +		if (physnodes[i].start != physnodes[i].end) +			node_set(i, physnode_mask); +	/* +	 * Fill physical nodes with fake nodes of size until there is no memory +	 * left on any of them. +	 */ +	while (nodes_weight(physnode_mask)) { +		for_each_node_mask(i, physnode_mask) { +			u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; +			u64 end; + +			end = find_end_of_node(physnodes[i].start, +						physnodes[i].end, size); +			/* +			 * If there won't be at least FAKE_NODE_MIN_SIZE of +			 * non-reserved memory in ZONE_DMA32 for the next node, +			 * this one must extend to the boundary. +			 */ +			if (end < dma32_end && dma32_end - end - +			    e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) +				end = dma32_end; + +			/* +			 * If there won't be enough non-reserved memory for the +			 * next node, this one must extend to the end of the +			 * physical node. +			 */ +			if (physnodes[i].end - end - +			    e820_hole_size(end, physnodes[i].end) < size) +				end = physnodes[i].end; + +			/* +			 * Setup the fake node that will be allocated as bootmem +			 * later.  If setup_node_range() returns non-zero, there +			 * is no more memory available on this physical node. +			 */ +			if (setup_node_range(ret++, &physnodes[i].start, +						end - physnodes[i].start, +						physnodes[i].end) < 0) +				node_clear(i, physnode_mask); +		} +	} +	return ret;  }  /* @@ -582,87 +604,32 @@ static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start,  static int __init numa_emulation(unsigned long start_pfn,  			unsigned long last_pfn, int acpi, int k8)  { -	u64 size, addr = start_pfn << PAGE_SHIFT; +	u64 addr = start_pfn << PAGE_SHIFT;  	u64 max_addr = last_pfn << PAGE_SHIFT; -	int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;  	int num_phys_nodes; +	int num_nodes; +	int i;  	num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);  	/* -	 * If the numa=fake command-line is just a single number N, split the -	 * system RAM into N fake nodes. +	 * If the numa=fake command-line contains a 'M' or 'G', it represents +	 * the fixed node size.  Otherwise, if it is just a single number N, +	 * split the system RAM into N fake nodes.  	 */ -	if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { -		long n = simple_strtol(cmdline, NULL, 0); +	if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) { +		u64 size; -		num_nodes = split_nodes_interleave(addr, max_addr, -							num_phys_nodes, n); -		if (num_nodes < 0) -			return num_nodes; -		goto out; -	} +		size = memparse(cmdline, &cmdline); +		num_nodes = split_nodes_size_interleave(addr, max_addr, size); +	} else { +		unsigned long n; -	/* Parse the command line. */ -	for (coeff_flag = 0; ; cmdline++) { -		if (*cmdline && isdigit(*cmdline)) { -			num = num * 10 + *cmdline - '0'; -			continue; -		} -		if (*cmdline == '*') { -			if (num > 0) -				coeff = num; -			coeff_flag = 1; -		} -		if (!*cmdline || *cmdline == ',') { -			if (!coeff_flag) -				coeff = 1; -			/* -			 * Round down to the nearest FAKE_NODE_MIN_SIZE. -			 * Command-line coefficients are in megabytes. -			 */ -			size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; -			if (size) -				for (i = 0; i < coeff; i++, num_nodes++) -					if (setup_node_range(num_nodes, &addr, -						size, max_addr) < 0) -						goto done; -			if (!*cmdline) -				break; -			coeff_flag = 0; -			coeff = -1; -		} -		num = 0; +		n = simple_strtoul(cmdline, NULL, 0); +		num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);  	} -done: -	if (!num_nodes) -		return -1; -	/* Fill remainder of system RAM, if appropriate. */ -	if (addr < max_addr) { -		if (coeff_flag && coeff < 0) { -			/* Split remaining nodes into num-sized chunks */ -			num_nodes += split_nodes_by_size(&addr, max_addr, -							 num_nodes, num); -			goto out; -		} -		switch (*(cmdline - 1)) { -		case '*': -			/* Split remaining nodes into coeff chunks */ -			if (coeff <= 0) -				break; -			num_nodes += split_nodes_equally(&addr, max_addr, -							 num_nodes, coeff); -			break; -		case ',': -			/* Do not allocate remaining system RAM */ -			break; -		default: -			/* Give one final node */ -			setup_node_range(num_nodes, &addr, max_addr - addr, -					 max_addr); -			num_nodes++; -		} -	} -out: + +	if (num_nodes < 0) +		return num_nodes;  	memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);  	if (memnode_shift < 0) {  		memnode_shift = 0;  |