Diffstat (limited to 'arch/powerpc/mm')
 arch/powerpc/mm/fsl_booke_mmu.c |   7
 arch/powerpc/mm/numa.c          | 122
 arch/powerpc/mm/pgtable.c       |   1
 arch/powerpc/mm/tlb_hash32.c    |  15
 arch/powerpc/mm/tlb_nohash.c    | 129
 5 files changed, 196 insertions(+), 78 deletions(-)
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index cdc7526e9c9..4b66a1ece6d 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -104,9 +104,10 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
 }

 /*
- * Set up one of the I/D BAT (block address translation) register pairs.
- * The parameters are not checked; in particular size must be a power
- * of 4 between 4k and 256M.
+ * Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
+ * in particular size must be a power of 4 between 4k and 256M (or 1G, for cpus
+ * that support extended page sizes).  Note that while some cpus support a
+ * page size of 4G, we don't allow its use here.
  */
 static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
 		unsigned long size, unsigned long flags, unsigned int pid)
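The size constraint the new comment documents is easy to sanity-check outside the kernel. A minimal standalone sketch; the helper name and the extended_sizes flag are illustrative, not part of the patch:

#include <stdbool.h>

#define SZ_4K	0x00001000UL
#define SZ_256M	0x10000000UL
#define SZ_1G	0x40000000UL

/* Hypothetical check of the settlbcam() precondition: size must be a
 * power of 4 between 4K and 256M (1G with extended page sizes).
 * settlbcam() itself deliberately does no checking. */
static bool tlbcam_size_valid(unsigned long size, bool extended_sizes)
{
	unsigned long max = extended_sizes ? SZ_1G : SZ_256M;

	if (size < SZ_4K || size > max)
		return false;
	if (size & (size - 1))		/* must be a power of two */
		return false;
	/* a power of 4 has its single set bit at an even position */
	return (size & 0x55555555UL) != 0;
}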
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index aa731af720c..002878ccf90 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data);

 static int min_common_depth;
 static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
+
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const unsigned int *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

 /*
  * Allocate node_to_cpumask_map based on number of available nodes
@@ -204,6 +210,39 @@ static const u32 *of_get_usable_memory(struct device_node *memory)
 	return prop;
 }

+int __node_distance(int a, int b)
+{
+	int i;
+	int distance = LOCAL_DISTANCE;
+
+	if (!form1_affinity)
+		return distance;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+			break;
+
+		/* Double the distance for each NUMA level */
+		distance *= 2;
+	}
+
+	return distance;
+}
+
+static void initialize_distance_lookup_table(int nid,
+		const unsigned int *associativity)
+{
+	int i;
+
+	if (!form1_affinity)
+		return;
+
+	for (i = 0; i < distance_ref_points_depth; i++) {
+		distance_lookup_table[nid][i] =
+			associativity[distance_ref_points[i]];
+	}
+}
+
 /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
  * info is found.
  */
@@ -225,6 +264,10 @@ static int of_node_to_nid_single(struct device_node *device)
 	/* POWER4 LPAR uses 0xffff as invalid node */
 	if (nid == 0xffff || nid >= MAX_NUMNODES)
 		nid = -1;
+
+	if (nid > 0 && tmp[0] >= distance_ref_points_depth)
+		initialize_distance_lookup_table(nid, tmp);
+
 out:
 	return nid;
 }
@@ -251,26 +294,10 @@ int of_node_to_nid(struct device_node *device)
 }
 EXPORT_SYMBOL_GPL(of_node_to_nid);

-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine.  This resource then has different associativity
- * characteristics relative to its multiple connections.  We ignore
- * this for now.  We also assume that all cpu and memory sets have
- * their distances represented at a common level.  This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
 static int __init find_min_common_depth(void)
 {
-	int depth, index;
-	const unsigned int *ref_points;
+	int depth;
 	struct device_node *rtas_root;
-	unsigned int len;
 	struct device_node *chosen;
 	const char *vec5;

@@ -280,18 +307,28 @@ static int __init find_min_common_depth(void)
 		return -1;

 	/*
-	 * this property is 2 32-bit integers, each representing a level of
-	 * depth in the associativity nodes.  The first is for an SMP
-	 * configuration (should be all 0's) and the second is for a normal
-	 * NUMA configuration.
+	 * This property is a set of 32-bit integers, each representing
+	 * an index into the ibm,associativity nodes.
+	 *
+	 * With form 0 affinity the first integer is for an SMP configuration
+	 * (should be all 0's) and the second is for a normal NUMA
+	 * configuration. We have only one level of NUMA.
+	 *
+	 * With form 1 affinity the first integer is the most significant
+	 * NUMA boundary and the following are progressively less significant
+	 * boundaries. There can be more than one level of NUMA.
 	 */
-	index = 1;
-	ref_points = of_get_property(rtas_root,
-			"ibm,associativity-reference-points", &len);
+	distance_ref_points = of_get_property(rtas_root,
+					"ibm,associativity-reference-points",
+					&distance_ref_points_depth);
+
+	if (!distance_ref_points) {
+		dbg("NUMA: ibm,associativity-reference-points not found.\n");
+		goto err;
+	}
+
+	distance_ref_points_depth /= sizeof(int);

-	/*
-	 * For form 1 affinity information we want the first field
-	 */
 #define VEC5_AFFINITY_BYTE	5
 #define VEC5_AFFINITY		0x80
 	chosen = of_find_node_by_path("/chosen");
@@ -299,19 +336,38 @@ static int __init find_min_common_depth(void)
 		vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
 		if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
 			dbg("Using form 1 affinity\n");
-			index = 0;
+			form1_affinity = 1;
 		}
 	}

-	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
-		depth = ref_points[index];
+	if (form1_affinity) {
+		depth = distance_ref_points[0];
 	} else {
-		dbg("NUMA: ibm,associativity-reference-points not found.\n");
-		depth = -1;
+		if (distance_ref_points_depth < 2) {
+			printk(KERN_WARNING "NUMA: "
+				"short ibm,associativity-reference-points\n");
+			goto err;
+		}
+
+		depth = distance_ref_points[1];
 	}

-	of_node_put(rtas_root);
+	/*
+	 * Warn and cap if the hardware supports more than
+	 * MAX_DISTANCE_REF_POINTS domains.
+	 */
+	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+		printk(KERN_WARNING "NUMA: distance array capped at "
+			"%d entries\n", MAX_DISTANCE_REF_POINTS);
+		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+	}
+
+	of_node_put(rtas_root);
 	return depth;
+
+err:
+	of_node_put(rtas_root);
+	return -1;
 }

 static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
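The effect of the distance-doubling loop in __node_distance() can be reproduced in a few lines of userspace C. The table contents below are invented; LOCAL_DISTANCE is 10, as in include/linux/topology.h:

#include <stdio.h>

#define LOCAL_DISTANCE		10
#define REF_POINTS_DEPTH	2

/* Made-up form 1 affinity domains for three nodes, most significant
 * NUMA boundary first, mirroring distance_lookup_table[][]. */
static const int lookup[3][REF_POINTS_DEPTH] = {
	{ 0, 5 },	/* node 0 */
	{ 1, 5 },	/* node 1: differs from node 0 at the top boundary */
	{ 2, 6 },	/* node 2: differs from node 0 at both boundaries */
};

static int node_distance(int a, int b)
{
	int i, distance = LOCAL_DISTANCE;

	/* Double the distance for each leading level at which a and b
	 * sit in different domains, exactly as __node_distance() does. */
	for (i = 0; i < REF_POINTS_DEPTH; i++) {
		if (lookup[a][i] == lookup[b][i])
			break;
		distance *= 2;
	}
	return distance;
}

int main(void)
{
	printf("%d %d %d\n", node_distance(0, 0),	/* 10: same node */
	       node_distance(0, 1),			/* 20: one boundary crossed */
	       node_distance(0, 2));			/* 40: two boundaries crossed */
	return 0;
}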
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index ebc2f38eb38..2c7e801ab20 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -92,7 +92,6 @@ static void pte_free_rcu_callback(struct rcu_head *head)

 static void pte_free_submit(struct pte_freelist_batch *batch)
 {
-	INIT_RCU_HEAD(&batch->rcu);
 	call_rcu(&batch->rcu, pte_free_rcu_callback);
 }

diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 8aaa8b7eb32..690566b66e8 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -89,17 +89,6 @@ void tlb_flush(struct mmu_gather *tlb)
  *    -- Cort
  */

-/*
- * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
- * the cache operations on the bus.  Hence we need to use an IPI
- * to get the other CPU(s) to invalidate their TLBs.
- */
-#ifdef CONFIG_SMP_750
-#define FINISH_FLUSH	smp_send_tlb_invalidate(0)
-#else
-#define FINISH_FLUSH	do { } while (0)
-#endif
-
 static void flush_range(struct mm_struct *mm, unsigned long start,
 			unsigned long end)
 {
@@ -138,7 +127,6 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
 	flush_range(&init_mm, start, end);
-	FINISH_FLUSH;
 }
 EXPORT_SYMBOL(flush_tlb_kernel_range);

@@ -162,7 +150,6 @@ void flush_tlb_mm(struct mm_struct *mm)
 	 */
 	for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
 		flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
-	FINISH_FLUSH;
 }
 EXPORT_SYMBOL(flush_tlb_mm);

@@ -179,7 +166,6 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
 	pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
 	if (!pmd_none(*pmd))
 		flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
-	FINISH_FLUSH;
 }
 EXPORT_SYMBOL(flush_tlb_page);

@@ -192,6 +178,5 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
 		     unsigned long end)
 {
 	flush_range(vma->vm_mm, start, end);
-	FINISH_FLUSH;
 }
 EXPORT_SYMBOL(flush_tlb_range);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index d8695b02a96..fe391e94252 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -46,6 +46,7 @@
 struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	[MMU_PAGE_4K] = {
 		.shift	= 12,
+		.ind	= 20,
 		.enc	= BOOK3E_PAGESZ_4K,
 	},
 	[MMU_PAGE_16K] = {
@@ -54,6 +55,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	},
 	[MMU_PAGE_64K] = {
 		.shift	= 16,
+		.ind	= 28,
 		.enc	= BOOK3E_PAGESZ_64K,
 	},
 	[MMU_PAGE_1M] = {
@@ -62,6 +64,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
 	},
 	[MMU_PAGE_16M] = {
 		.shift	= 24,
+		.ind	= 36,
 		.enc	= BOOK3E_PAGESZ_16M,
 	},
 	[MMU_PAGE_256M] = {
@@ -344,16 +347,108 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
 	}
 }

-/*
- * Early initialization of the MMU TLB code
- */
-static void __early_init_mmu(int boot_cpu)
+static void setup_page_sizes(void)
+{
+	unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+	unsigned int tlb0ps = mfspr(SPRN_TLB0PS);
+	unsigned int eptcfg = mfspr(SPRN_EPTCFG);
+	int i, psize;
+
+	/* Look for supported direct sizes */
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+		if (tlb0ps & (1U << (def->shift - 10)))
+			def->flags |= MMU_PAGE_SIZE_DIRECT;
+	}
+
+	/* Indirect page sizes supported? */
+	if ((tlb0cfg & TLBnCFG_IND) == 0)
+		goto no_indirect;
+
+	/* Now, we only deal with one IND page size for each
+	 * direct size. Hopefully all implementations today are
+	 * unambiguous, but we might want to be careful in the
+	 * future.
+	 */
+	for (i = 0; i < 3; i++) {
+		unsigned int ps, sps;
+
+		sps = eptcfg & 0x1f;
+		eptcfg >>= 5;
+		ps = eptcfg & 0x1f;
+		eptcfg >>= 5;
+		if (!ps || !sps)
+			continue;
+		for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+			struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+			if (ps == (def->shift - 10))
+				def->flags |= MMU_PAGE_SIZE_INDIRECT;
+			if (sps == (def->shift - 10))
+				def->ind = ps + 10;
+		}
+	}
+ no_indirect:
+
+	/* Cleanup array and print summary */
+	pr_info("MMU: Supported page sizes\n");
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		struct mmu_psize_def *def = &mmu_psize_defs[psize];
+		const char *__page_type_names[] = {
+			"unsupported",
+			"direct",
+			"indirect",
+			"direct & indirect"
+		};
+		if (def->flags == 0) {
+			def->shift = 0;
+			continue;
+		}
+		pr_info("  %8ld KB as %s\n", 1ul << (def->shift - 10),
+			__page_type_names[def->flags & 0x3]);
+	}
+}
+
+static void setup_mmu_htw(void)
 {
 	extern unsigned int interrupt_base_book3e;
 	extern unsigned int exc_data_tlb_miss_htw_book3e;
 	extern unsigned int exc_instruction_tlb_miss_htw_book3e;

 	unsigned int *ibase = &interrupt_base_book3e;
+
+	/* Check if HW tablewalk is present, and if yes, enable it by:
+	 *
+	 * - patching the TLB miss handlers to branch to the
+	 *   one dedicated to it
+	 *
+	 * - setting the global book3e_htw_enabled
+	 */
+	unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+
+	if ((tlb0cfg & TLBnCFG_IND) &&
+	    (tlb0cfg & TLBnCFG_PT)) {
+		/* Our exception vectors start with a NOP and -then- a branch
+		 * to deal with single stepping from userspace which stops on
+		 * the second instruction. Thus we need to patch the second
+		 * instruction of the exception, not the first one
+		 */
+		patch_branch(ibase + (0x1c0 / 4) + 1,
+			     (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
+		patch_branch(ibase + (0x1e0 / 4) + 1,
+			     (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
+		book3e_htw_enabled = 1;
+	}
+	pr_info("MMU: Book3E Page Tables %s\n",
+		book3e_htw_enabled ? "Enabled" : "Disabled");
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void __early_init_mmu(int boot_cpu)
+{
 	unsigned int mas4;

 	/* XXX This will have to be decided at runtime, but right
@@ -370,35 +465,17 @@ static void __early_init_mmu(int boot_cpu)
 	 */
 	mmu_vmemmap_psize = MMU_PAGE_16M;

-	/* Check if HW tablewalk is present, and if yes, enable it by:
-	 *
-	 * - patching the TLB miss handlers to branch to the
-	 *   one dedicates to it
-	 *
-	 * - setting the global book3e_htw_enabled
-	 *
-	 * - Set MAS4:INDD and default page size
-	 */
-
 	/* XXX This code only checks for TLB 0 capabilities and doesn't
 	 *     check what page size combos are supported by the HW. It
 	 *     also doesn't handle the case where a separate array holds
 	 *     the IND entries from the array loaded by the PT.
 	 */
 	if (boot_cpu) {
-		unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+		/* Look for supported page sizes */
+		setup_page_sizes();

-		/* Check if HW loader is supported */
-		if ((tlb0cfg & TLBnCFG_IND) &&
-		    (tlb0cfg & TLBnCFG_PT)) {
-			patch_branch(ibase + (0x1c0 / 4),
-			     (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
-			patch_branch(ibase + (0x1e0 / 4),
-			     (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
-			book3e_htw_enabled = 1;
-		}
-		pr_info("MMU: Book3E Page Tables %s\n",
-			book3e_htw_enabled ? "Enabled" : "Disabled");
+		/* Look for HW tablewalk support */
+		setup_mmu_htw();
 	}

 	/* Set MAS4 based on page table setting */
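For the EPTCFG decode in setup_page_sizes() above: the register packs up to three {SPS, PS} pairs low-to-high, five bits each, where a value encodes log2(page size) - 10. A standalone sketch of that unpacking with an invented register value (PS = 10, i.e. 1M indirect entries, backed by SPS = 2, i.e. 4K sub-pages, which lines up with the .ind = 20 the patch adds for MMU_PAGE_4K):

#include <stdio.h>

int main(void)
{
	/* Invented EPTCFG value: one valid {SPS = 2, PS = 10} pair,
	 * the other two pairs zero and therefore skipped. */
	unsigned int eptcfg = (10u << 5) | 2u;
	int i;

	for (i = 0; i < 3; i++) {
		unsigned int sps = eptcfg & 0x1f;	/* sub-page size */
		unsigned int ps;

		eptcfg >>= 5;
		ps = eptcfg & 0x1f;			/* indirect page size */
		eptcfg >>= 5;
		if (!ps || !sps)
			continue;
		/* prints: indirect entry: 1024 KB region, 4 KB sub-pages */
		printf("indirect entry: %u KB region, %u KB sub-pages\n",
		       1u << ps, 1u << sps);
	}
	return 0;
}

As a side note, the summary loop in setup_page_sizes() indexes the four __page_type_names entries with flags & 0x3, which implies MMU_PAGE_SIZE_DIRECT is bit 0 and MMU_PAGE_SIZE_INDIRECT is bit 1.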
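And one note on the "+ 1" in the patch_branch() calls: as the new comment in setup_mmu_htw() explains, each exception vector opens with a nop so that single-stepping from userspace (which stops on the second instruction) keeps working, so the live branch is the second word. A trivial sketch of the word arithmetic; only the 0x1c0/0x1e0 offsets come from the patch:

#include <stdio.h>

int main(void)
{
	/* interrupt_base_book3e is treated as an array of 32-bit
	 * instruction words, so byte offset 0x1c0 is word 0x70; the
	 * word being patched is the one after the leading nop. */
	printf("data TLB miss: patch word %#x\n", 0x1c0 / 4 + 1);	/* 0x71 */
	printf("insn TLB miss: patch word %#x\n", 0x1e0 / 4 + 1);	/* 0x79 */
	return 0;
}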