| author | Tony Lindgren <tony@atomide.com> | 2011-11-07 12:27:23 -0800 | 
|---|---|---|
| committer | Tony Lindgren <tony@atomide.com> | 2011-11-07 12:27:23 -0800 | 
| commit | d30cc16c8e48368e0518f4975a78711e53e14a0f (patch) | |
| tree | 26b57f7ab5a963cc3d6c57dff6951bd930875583 /arch/powerpc/mm/hugetlbpage.c | |
| parent | 41eb2d813f558900884e240c2f723e36c7bd151f (diff) | |
| parent | a1bcc1dcef8451b4291ea2a1b2677cb194102952 (diff) | |
| download | olio-linux-3.10-d30cc16c8e48368e0518f4975a78711e53e14a0f.tar.xz, olio-linux-3.10-d30cc16c8e48368e0518f4975a78711e53e14a0f.zip | |
Merge branch 'fixes-modulesplit' into fixes
Diffstat (limited to 'arch/powerpc/mm/hugetlbpage.c')
| -rw-r--r-- | arch/powerpc/mm/hugetlbpage.c | 400 |

1 file changed, 353 insertions, 47 deletions
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 0b9a5c1901b..5964371303a 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -1,7 +1,8 @@
 /*
- * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ * PPC Huge TLB Page Support for Kernel.
  *
  * Copyright (C) 2003 David Gibson, IBM Corporation.
+ * Copyright (C) 2011 Becky Bruce, Freescale Semiconductor
  *
  * Based on the IA-32 version:
  * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
@@ -11,24 +12,39 @@
 #include <linux/io.h>
 #include <linux/slab.h>
 #include <linux/hugetlb.h>
+#include <linux/of_fdt.h>
+#include <linux/memblock.h>
+#include <linux/bootmem.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
+#include <asm/setup.h>
 
 #define PAGE_SHIFT_64K	16
 #define PAGE_SHIFT_16M	24
 #define PAGE_SHIFT_16G	34
 
-#define MAX_NUMBER_GPAGES	1024
+unsigned int HPAGE_SHIFT;
 
-/* Tracks the 16G pages after the device tree is scanned and before the
- * huge_boot_pages list is ready.  */
-static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
+/*
+ * Tracks gpages after the device tree is scanned and before the
+ * huge_boot_pages list is ready.  On 64-bit implementations, this is
+ * just used to track 16G pages and so is a single array.  32-bit
+ * implementations may have more than one gpage size due to limitations
+ * of the memory allocators, so we need multiple arrays
+ */
+#ifdef CONFIG_PPC64
+#define MAX_NUMBER_GPAGES	1024
+static u64 gpage_freearray[MAX_NUMBER_GPAGES];
 static unsigned nr_gpages;
-
-/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
- * will choke on pointers to hugepte tables, which is handy for
- * catching screwups early. */
+#else
+#define MAX_NUMBER_GPAGES	128
+struct psize_gpages {
+	u64 gpage_list[MAX_NUMBER_GPAGES];
+	unsigned int nr_gpages;
+};
+static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT];
+#endif
 
 static inline int shift_to_mmu_psize(unsigned int shift)
 {
@@ -49,25 +65,6 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 
 #define hugepd_none(hpd)	((hpd).pd == 0)
 
-static inline pte_t *hugepd_page(hugepd_t hpd)
-{
-	BUG_ON(!hugepd_ok(hpd));
-	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
-}
-
-static inline unsigned int hugepd_shift(hugepd_t hpd)
-{
-	return hpd.pd & HUGEPD_SHIFT_MASK;
-}
-
-static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr, unsigned pdshift)
-{
-	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
-	pte_t *dir = hugepd_page(*hpdp);
-
-	return dir + idx;
-}
-
 pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
 {
 	pgd_t *pg;
@@ -93,7 +90,7 @@ pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift
 			if (is_hugepd(pm))
 				hpdp = (hugepd_t *)pm;
 			else if (!pmd_none(*pm)) {
-				return pte_offset_map(pm, ea);
+				return pte_offset_kernel(pm, ea);
 			}
 		}
 	}
@@ -114,8 +111,18 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 			   unsigned long address, unsigned pdshift, unsigned pshift)
 {
-	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
-				       GFP_KERNEL|__GFP_REPEAT);
+	struct kmem_cache *cachep;
+	pte_t *new;
+
+#ifdef CONFIG_PPC64
+	cachep = PGT_CACHE(pdshift - pshift);
+#else
+	int i;
+	int num_hugepd = 1 << (pshift - pdshift);
+	cachep = hugepte_cache;
+#endif
+
+	new = kmem_cache_zalloc(cachep, GFP_KERNEL|__GFP_REPEAT);
 
 	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
 	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);
@@ -124,10 +131,31 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		return -ENOMEM;
 
 	spin_lock(&mm->page_table_lock);
+#ifdef CONFIG_PPC64
 	if (!hugepd_none(*hpdp))
-		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
+		kmem_cache_free(cachep, new);
 	else
-		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
+		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+#else
+	/*
+	 * We have multiple higher-level entries that point to the same
+	 * actual pte location.  Fill in each as we go and backtrack on error.
+	 * We need all of these so the DTLB pgtable walk code can find the
+	 * right higher-level entry without knowing if it's a hugepage or not.
+	 */
+	for (i = 0; i < num_hugepd; i++, hpdp++) {
+		if (unlikely(!hugepd_none(*hpdp)))
+			break;
+		else
+			hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
+	}
+	/* If we bailed from the for loop early, an error occurred, clean up */
+	if (i < num_hugepd) {
+		for (i = i - 1 ; i >= 0; i--, hpdp--)
+			hpdp->pd = 0;
+		kmem_cache_free(cachep, new);
+	}
+#endif
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
@@ -169,11 +197,132 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz
 	return hugepte_offset(hpdp, addr, pdshift);
 }
 
+#ifdef CONFIG_PPC32
 /* Build list of addresses of gigantic pages.  This function is used in early
  * boot before the buddy or bootmem allocator is setup.
  */
-void add_gpage(unsigned long addr, unsigned long page_size,
-	unsigned long number_of_pages)
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
+{
+	unsigned int idx = shift_to_mmu_psize(__ffs(page_size));
+	int i;
+
+	if (addr == 0)
+		return;
+
+	gpage_freearray[idx].nr_gpages = number_of_pages;
+
+	for (i = 0; i < number_of_pages; i++) {
+		gpage_freearray[idx].gpage_list[i] = addr;
+		addr += page_size;
+	}
+}
+
+/*
+ * Moves the gigantic page addresses from the temporary list to the
+ * huge_boot_pages list.
+ */
+int alloc_bootmem_huge_page(struct hstate *hstate)
+{
+	struct huge_bootmem_page *m;
+	int idx = shift_to_mmu_psize(hstate->order + PAGE_SHIFT);
+	int nr_gpages = gpage_freearray[idx].nr_gpages;
+
+	if (nr_gpages == 0)
+		return 0;
+
+#ifdef CONFIG_HIGHMEM
+	/*
+	 * If gpages can be in highmem we can't use the trick of storing the
+	 * data structure in the page; allocate space for this
+	 */
+	m = alloc_bootmem(sizeof(struct huge_bootmem_page));
+	m->phys = gpage_freearray[idx].gpage_list[--nr_gpages];
+#else
+	m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]);
+#endif
+
+	list_add(&m->list, &huge_boot_pages);
+	gpage_freearray[idx].nr_gpages = nr_gpages;
+	gpage_freearray[idx].gpage_list[nr_gpages] = 0;
+	m->hstate = hstate;
+
+	return 1;
+}
+/*
+ * Scan the command line hugepagesz= options for gigantic pages; store those in
+ * a list that we use to allocate the memory once all options are parsed.
+ */
+
+unsigned long gpage_npages[MMU_PAGE_COUNT];
+
+static int __init do_gpage_early_setup(char *param, char *val)
+{
+	static phys_addr_t size;
+	unsigned long npages;
+
+	/*
+	 * The hugepagesz and hugepages cmdline options are interleaved.  We
+	 * use the size variable to keep track of whether or not this was done
+	 * properly and skip over instances where it is incorrect.  Other
+	 * command-line parsing code will issue warnings, so we don't need to.
+	 *
+	 */
+	if ((strcmp(param, "default_hugepagesz") == 0) ||
+	    (strcmp(param, "hugepagesz") == 0)) {
+		size = memparse(val, NULL);
+	} else if (strcmp(param, "hugepages") == 0) {
+		if (size != 0) {
+			if (sscanf(val, "%lu", &npages) <= 0)
+				npages = 0;
+			gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages;
+			size = 0;
+		}
+	}
+	return 0;
+}
+
+
+/*
+ * This function allocates physical space for pages that are larger than the
+ * buddy allocator can handle.  We want to allocate these in highmem because
+ * the amount of lowmem is limited.  This means that this function MUST be
+ * called before lowmem_end_addr is set up in MMU_init() in order for the lmb
+ * allocate to grab highmem.
+ */
+void __init reserve_hugetlb_gpages(void)
+{
+	static __initdata char cmdline[COMMAND_LINE_SIZE];
+	phys_addr_t size, base;
+	int i;
+
+	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
+	parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+
+	/*
+	 * Walk gpage list in reverse, allocating larger page sizes first.
+	 * Skip over unsupported sizes, or sizes that have 0 gpages allocated.
+	 * When we reach the point in the list where pages are no longer
+	 * considered gpages, we're done.
+	 */
+	for (i = MMU_PAGE_COUNT-1; i >= 0; i--) {
+		if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0)
+			continue;
+		else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT))
+			break;
+
+		size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i));
+		base = memblock_alloc_base(size * gpage_npages[i], size,
+					   MEMBLOCK_ALLOC_ANYWHERE);
+		add_gpage(base, size, gpage_npages[i]);
+	}
+}
+
+#else /* PPC64 */
+
+/* Build list of addresses of gigantic pages.  This function is used in early
+ * boot before the buddy or bootmem allocator is setup.
+ */
+void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages)
 {
 	if (!addr)
 		return;
@@ -199,19 +348,79 @@ int alloc_bootmem_huge_page(struct hstate *hstate)
 	m->hstate = hstate;
 	return 1;
 }
+#endif
 
 int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 {
 	return 0;
 }
 
+#ifdef CONFIG_PPC32
+#define HUGEPD_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t))
+
+struct hugepd_freelist {
+	struct rcu_head	rcu;
+	unsigned int index;
+	void *ptes[0];
+};
+
+static DEFINE_PER_CPU(struct hugepd_freelist *, hugepd_freelist_cur);
+
+static void hugepd_free_rcu_callback(struct rcu_head *head)
+{
+	struct hugepd_freelist *batch =
+		container_of(head, struct hugepd_freelist, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		kmem_cache_free(hugepte_cache, batch->ptes[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void hugepd_free(struct mmu_gather *tlb, void *hugepte)
+{
+	struct hugepd_freelist **batchp;
+
+	batchp = &__get_cpu_var(hugepd_freelist_cur);
+
+	if (atomic_read(&tlb->mm->mm_users) < 2 ||
+	    cpumask_equal(mm_cpumask(tlb->mm),
+			  cpumask_of(smp_processor_id()))) {
+		kmem_cache_free(hugepte_cache, hugepte);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct hugepd_freelist *)__get_free_page(GFP_ATOMIC);
+		(*batchp)->index = 0;
+	}
+
+	(*batchp)->ptes[(*batchp)->index++] = hugepte;
+	if ((*batchp)->index == HUGEPD_FREELIST_SIZE) {
+		call_rcu_sched(&(*batchp)->rcu, hugepd_free_rcu_callback);
+		*batchp = NULL;
+	}
+}
+#endif
+
 static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
 			      unsigned long start, unsigned long end,
 			      unsigned long floor, unsigned long ceiling)
 {
 	pte_t *hugepte = hugepd_page(*hpdp);
-	unsigned shift = hugepd_shift(*hpdp);
+	int i;
+
 	unsigned long pdmask = ~((1UL << pdshift) - 1);
+	unsigned int num_hugepd = 1;
+
+#ifdef CONFIG_PPC64
+	unsigned int shift = hugepd_shift(*hpdp);
+#else
+	/* Note: On 32-bit the hpdp may be the first of several */
+	num_hugepd = (1 << (hugepd_shift(*hpdp) - pdshift));
+#endif
 
 	start &= pdmask;
 	if (start < floor)
@@ -224,9 +433,15 @@ static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshif
 	if (end - 1 > ceiling - 1)
 		return;
 
-	hpdp->pd = 0;
+	for (i = 0; i < num_hugepd; i++, hpdp++)
+		hpdp->pd = 0;
+
 	tlb->need_flush = 1;
+#ifdef CONFIG_PPC64
 	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
+#else
+	hugepd_free(tlb, hugepte);
+#endif
 }
 
 static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -331,18 +546,27 @@ void hugetlb_free_pgd_range(struct mmu_gather *tlb,
 	 * too.
 	 */
 
-	pgd = pgd_offset(tlb->mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
+		pgd = pgd_offset(tlb->mm, addr);
 		if (!is_hugepd(pgd)) {
 			if (pgd_none_or_clear_bad(pgd))
 				continue;
 			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
 		} else {
+#ifdef CONFIG_PPC32
+			/*
+			 * Increment next by the size of the huge mapping since
+			 * on 32-bit there may be more than one entry at the pgd
+			 * level for a single hugepage, but all of them point to
+			 * the same kmem cache that holds the hugepte.
+			 */
+			next = addr + (1 << hugepd_shift(*(hugepd_t *)pgd));
+#endif
 			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
 					  addr, next, floor, ceiling);
 		}
-	} while (pgd++, addr = next, addr != end);
+	} while (addr = next, addr != end);
 }
 
 struct page *
@@ -390,7 +614,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 {
 	unsigned long mask;
 	unsigned long pte_end;
-	struct page *head, *page;
+	struct page *head, *page, *tail;
 	pte_t pte;
 	int refs;
 
@@ -413,6 +637,7 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 
 	head = pte_page(pte);
 	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+	tail = page;
 	do {
 		VM_BUG_ON(compound_head(page) != head);
 		pages[*nr] = page;
@@ -428,10 +653,20 @@ static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long add
 
 	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
 		/* Could be optimized better */
-		while (*nr) {
-			put_page(page);
-			(*nr)--;
-		}
+		*nr -= refs;
+		while (refs--)
+			put_page(head);
+		return 0;
+	}
+
+	/*
+	 * Any tail page need their mapcount reference taken before we
+	 * return.
+	 */
+	while (refs--) {
+		if (PageTail(tail))
+			get_huge_page_tail(tail);
+		tail++;
 	}
 
 	return 1;
@@ -466,17 +701,35 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
+#ifdef CONFIG_PPC_MM_SLICES
 	struct hstate *hstate = hstate_file(file);
 	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));
 
 	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
+#else
+	return get_unmapped_area(file, addr, len, pgoff, flags);
+#endif
 }
 
 unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
 {
+#ifdef CONFIG_PPC_MM_SLICES
 	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);
 
 	return 1UL << mmu_psize_to_shift(psize);
+#else
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	return huge_page_size(hstate_vma(vma));
+#endif
+}
+
+static inline bool is_power_of_4(unsigned long x)
+{
+	if (is_power_of_2(x))
+		return (__ilog2(x) % 2) ? false : true;
+	return false;
 }
 
 static int __init add_huge_page_size(unsigned long long size)
@@ -486,9 +739,14 @@ static int __init add_huge_page_size(unsigned long long size)
 
 	/* Check that it is a page size supported by the hardware and
 	 * that it fits within pagetable and slice limits. */
+#ifdef CONFIG_PPC_FSL_BOOK3E
+	if ((size < PAGE_SIZE) || !is_power_of_4(size))
+		return -EINVAL;
+#else
 	if (!is_power_of_2(size)
 	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
 		return -EINVAL;
+#endif
 
 	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
 		return -EINVAL;
@@ -525,6 +783,46 @@ static int __init hugepage_setup_sz(char *str)
 }
 __setup("hugepagesz=", hugepage_setup_sz);
 
+#ifdef CONFIG_FSL_BOOKE
+struct kmem_cache *hugepte_cache;
+static int __init hugetlbpage_init(void)
+{
+	int psize;
+
+	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+		unsigned shift;
+
+		if (!mmu_psize_defs[psize].shift)
+			continue;
+
+		shift = mmu_psize_to_shift(psize);
+
+		/* Don't treat normal page sizes as huge... */
+		if (shift != PAGE_SHIFT)
+			if (add_huge_page_size(1ULL << shift) < 0)
+				continue;
+	}
+
+	/*
+	 * Create a kmem cache for hugeptes.  The bottom bits in the pte have
+	 * size information encoded in them, so align them to allow this
+	 */
+	hugepte_cache =  kmem_cache_create("hugepte-cache", sizeof(pte_t),
+					   HUGEPD_SHIFT_MASK + 1, 0, NULL);
+	if (hugepte_cache == NULL)
+		panic("%s: Unable to create kmem cache for hugeptes\n",
+		      __func__);
+
+	/* Default hpage size = 4M */
+	if (mmu_psize_defs[MMU_PAGE_4M].shift)
+		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_4M].shift;
+	else
+		panic("%s: Unable to set default huge page size\n", __func__);
+
+
+	return 0;
+}
+#else
 static int __init hugetlbpage_init(void)
 {
 	int psize;
@@ -567,15 +865,23 @@ static int __init hugetlbpage_init(void)
 
 	return 0;
 }
-
+#endif
 module_init(hugetlbpage_init);
 
 void flush_dcache_icache_hugepage(struct page *page)
 {
 	int i;
+	void *start;
 
 	BUG_ON(!PageCompound(page));
 
-	for (i = 0; i < (1UL << compound_order(page)); i++)
-		__flush_dcache_icache(page_address(page+i));
+	for (i = 0; i < (1UL << compound_order(page)); i++) {
+		if (!PageHighMem(page)) {
+			__flush_dcache_icache(page_address(page+i));
+		} else {
+			start = kmap_atomic(page+i, KM_PPC_SYNC_ICACHE);
+			__flush_dcache_icache(start);
+			kunmap_atomic(start, KM_PPC_SYNC_ICACHE);
+		}
+	}
 }
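The 32-bit path in `__hugepte_alloc()` above has to point several consecutive page-directory entries at the same hugepte table, and it backtracks if it finds a slot already taken partway through. Below is a minimal userspace sketch of that fill-and-backtrack pattern; `fill_slots` and `pd` are illustrative names, and the kernel's locking and slab allocation are deliberately left out.

```c
#include <stdio.h>

#define NUM_SLOTS 8

static unsigned long pd[NUM_SLOTS];	/* 0 means "slot is empty" */

/* Point num consecutive slots at table; roll everything back on conflict. */
static int fill_slots(unsigned long *slot, int num, unsigned long table)
{
	int i;

	for (i = 0; i < num; i++) {
		if (slot[i] != 0)	/* lost a race: slot already in use */
			break;
		slot[i] = table;
	}

	if (i < num) {			/* bailed early: undo our own writes */
		while (--i >= 0)
			slot[i] = 0;
		return -1;		/* caller frees the table it allocated */
	}
	return 0;
}

int main(void)
{
	pd[2] = 0xbeef;	/* simulate a slot someone else already owns */

	/* Succeeds: slots 4..7 are all empty. */
	printf("fill at 4: %d\n", fill_slots(&pd[4], 4, 0x1000));
	/* Hits the busy slot 2, so slots 0..1 are rolled back. */
	printf("fill at 0: %d\n", fill_slots(&pd[0], 4, 0x2000));
	printf("pd[0]=%lx pd[1]=%lx (rolled back)\n", pd[0], pd[1]);
	return 0;
}
```

The rollback clears only the entries this call wrote, newest first, so a slot that another context already populated is never disturbed.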
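`do_gpage_early_setup()` in the patch depends on `hugepagesz=` and `hugepages=` arriving as interleaved pairs: a size must be pending before a count is accepted, and the first count that follows consumes the pending size. Here is a self-contained sketch of that pairing logic, assuming an already-split parameter list and using `parse_size()` as a simplified stand-in for the kernel's `memparse()`.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Crude memparse() substitute: a number with an optional K/M/G suffix. */
static unsigned long long parse_size(const char *val)
{
	char *end;
	unsigned long long size = strtoull(val, &end, 0);

	if (*end == 'K' || *end == 'k') size <<= 10;
	else if (*end == 'M' || *end == 'm') size <<= 20;
	else if (*end == 'G' || *end == 'g') size <<= 30;
	return size;
}

int main(void)
{
	/* A mock command line, already split into param/value pairs. */
	const char *params[][2] = {
		{ "hugepagesz", "4M" }, { "hugepages", "16" },
		{ "hugepages",  "8"  },	/* no pending size: skipped */
		{ "hugepagesz", "1G" }, { "hugepages", "2" },
	};
	unsigned long long pending_size = 0;
	size_t i;

	for (i = 0; i < sizeof(params) / sizeof(params[0]); i++) {
		const char *param = params[i][0], *val = params[i][1];

		if (strcmp(param, "hugepagesz") == 0) {
			pending_size = parse_size(val);
		} else if (strcmp(param, "hugepages") == 0 && pending_size) {
			printf("%llu pages of size %llu bytes\n",
			       strtoull(val, NULL, 10), pending_size);
			pending_size = 0;	/* the pairing is consumed */
		}
	}
	return 0;
}
```

The stray `hugepages=8` with no pending size is silently skipped, mirroring the patch's comment that other command-line parsing code will issue the warning.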
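`hugepd_free()` above parks freed hugepte tables in a per-CPU, page-sized batch and only hands a full batch to `call_rcu_sched()`, so lockless page-table walkers can never see a table freed under them. The sketch below keeps just the batching shape in userspace: one global batch instead of per-CPU state, a flexible array member sized to fill the "page", and a direct flush where the kernel defers to an RCU callback. All names are illustrative, not kernel API.

```c
#include <stdio.h>
#include <stdlib.h>

#define BATCH_BYTES 4096	/* stands in for PAGE_SIZE */
#define BATCH_SLOTS \
	((BATCH_BYTES - sizeof(struct batch)) / sizeof(void *))

struct batch {
	unsigned int index;
	void *ptes[];		/* flexible array member fills the "page" */
};

static struct batch *cur;
static unsigned int flushes;

static void flush_batch(struct batch *b)
{
	unsigned int i;

	for (i = 0; i < b->index; i++)
		free(b->ptes[i]);	/* kernel: kmem_cache_free() */
	free(b);
	flushes++;
}

static void batched_free(void *pte)
{
	if (cur == NULL) {
		cur = calloc(1, BATCH_BYTES);
		cur->index = 0;
	}

	cur->ptes[cur->index++] = pte;
	if (cur->index == BATCH_SLOTS) {
		flush_batch(cur);	/* kernel: call_rcu_sched() */
		cur = NULL;
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 3000; i++)	/* more than one batch worth */
		batched_free(malloc(8));
	if (cur) {			/* drain the final partial batch */
		flush_batch(cur);
		cur = NULL;
	}
	printf("flushed %u batches of up to %zu pointers\n",
	       flushes, BATCH_SLOTS);
	return 0;
}
```

Batching trades a page of memory for far fewer expensive release operations; the kernel also short-circuits straight to `kmem_cache_free()` when no other CPU can be walking the tables.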
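Finally, the FSL Book3E size check relies on `is_power_of_4()`: a power of 4 is exactly a power of 2 whose base-2 logarithm is even. A quick userspace check of that predicate, with GCC's `__builtin_ctzl()` standing in for the kernel's `__ilog2()`:

```c
#include <stdio.h>
#include <stdbool.h>

static bool is_power_of_2(unsigned long x)
{
	return x != 0 && (x & (x - 1)) == 0;
}

static bool is_power_of_4(unsigned long x)
{
	/* a power of 4 is a power of 2 whose log2 is even */
	if (is_power_of_2(x))
		return (__builtin_ctzl(x) % 2) == 0;
	return false;
}

int main(void)
{
	unsigned long sizes[] = { 4096, 8192, 4UL << 20, 16UL << 20 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("%lu -> %s\n", sizes[i],
		       is_power_of_4(sizes[i]) ? "power of 4" : "not");
	return 0;
}
```

So 4096 (4^6) and 4M (4^11) pass, while 8192 (2^13) is rejected, which matches the hardware's supported hugepage sizes on those parts.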