[PATCH] powerpc: Fix pagetable bloat for hugepages
authorDavid Gibson <david@gibson.dropbear.id.au>
Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
committerPaul Mackerras <paulus@samba.org>
Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
At present, ARCH=powerpc kernels can waste considerable space in
pagetables when making large hugepage mappings.  Hugepage PTEs go in
PMD pages, but each PMD page maps 256M and so contains only 16
hugepage PTEs (128 bytes of data), but takes up a 1024 byte
allocation.  With CONFIG_PPC_64K_PAGES enabled (64k base page size),
the situation is worse.  Now hugepage PTEs are at the PTE page level
(also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation.

The PowerPC MMU already means that any 256M region is either all
hugepage, or all normal pages.  Thus, with some care, we can use a
different allocation for the hugepage PTE tables and only allocate the
128 bytes necessary.

Signed-off-by: Paul Mackerras <paulus@samba.org>
arch/powerpc/mm/hugetlbpage.c
arch/powerpc/mm/init_64.c
include/asm-powerpc/page_64.h
include/asm-powerpc/pgalloc.h

index 7370f9f33e2943d038025a6d05ba6b4e02755beb..266b8b2ceac947071d225e4de981d2229d499fc2 100644 (file)
 #define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE     (PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE     (PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE       (1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE     (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT           (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE            (1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK            (~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache     (pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK      0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)       ((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+       BUG_ON(!(hpd.pd & HUGEPD_OK));
+       return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+       unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+       pte_t *dir = hugepd_page(*hpdp);
+
+       return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+                          unsigned long address)
+{
+       pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+                                     GFP_KERNEL|__GFP_REPEAT);
+
+       if (! new)
+               return -ENOMEM;
+
+       spin_lock(&mm->page_table_lock);
+       if (!hugepd_none(*hpdp))
+               kmem_cache_free(huge_pgtable_cache, new);
+       else
+               hpdp->pd = (unsigned long)new | HUGEPD_OK;
+       spin_unlock(&mm->page_table_lock);
+       return 0;
+}
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pg;
        pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
 
        BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        if (!pgd_none(*pg)) {
                pu = pud_offset(pg, addr);
                if (!pud_none(*pu)) {
-                       pm = pmd_offset(pu, addr);
 #ifdef CONFIG_PPC_64K_PAGES
-                       /* Currently, we use the normal PTE offset within full
-                        * size PTE pages, thus our huge PTEs are scattered in
-                        * the PTE page and we do waste some. We may change
-                        * that in the future, but the current mecanism keeps
-                        * things much simpler
-                        */
-                       if (!pmd_none(*pm)) {
-                               /* Note: pte_offset_* are all equivalent on
-                                * ppc64 as we don't have HIGHMEM
-                                */
-                               pt = pte_offset_kernel(pm, addr);
-                               return pt;
-                       }
-#else /* CONFIG_PPC_64K_PAGES */
-                       /* On 4k pages, we put huge PTEs in the PMD page */
-                       pt = (pte_t *)pm;
-                       return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+                       pmd_t *pm;
+                       pm = pmd_offset(pu, addr);
+                       if (!pmd_none(*pm))
+                               return hugepte_offset((hugepd_t *)pm, addr);
+#else
+                       return hugepte_offset((hugepd_t *)pu, addr);
+#endif
                }
        }
 
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pg;
        pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
+       hugepd_t *hpdp = NULL;
 
        BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        pu = pud_alloc(mm, pg, addr);
 
        if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+               pmd_t *pm;
                pm = pmd_alloc(mm, pu, addr);
-               if (pm) {
+               if (pm)
+                       hpdp = (hugepd_t *)pm;
+#else
+               hpdp = (hugepd_t *)pu;
+#endif
+       }
+
+       if (! hpdp)
+               return NULL;
+
+       if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+               return NULL;
+
+       return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+       pte_t *hugepte = hugepd_page(*hpdp);
+
+       hpdp->pd = 0;
+       tlb->need_flush = 1;
+       pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+                                                HUGEPTE_TABLE_SIZE-1));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
-                       /* See comment in huge_pte_offset. Note that if we ever
-                        * want to put the page size in the PMD, we would have
-                        * to open code our own pte_alloc* function in order
-                        * to populate and set the size atomically
-                        */
-                       pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-                       pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-                       return pt;
-               }
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+                                  unsigned long addr, unsigned long end,
+                                  unsigned long floor, unsigned long ceiling)
+{
+       pmd_t *pmd;
+       unsigned long next;
+       unsigned long start;
+
+       start = addr;
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+               if (pmd_none(*pmd))
+                       continue;
+               free_hugepte_range(tlb, (hugepd_t *)pmd);
+       } while (pmd++, addr = next, addr != end);
+
+       start &= PUD_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PUD_MASK;
+               if (!ceiling)
+                       return;
        }
+       if (end - 1 > ceiling - 1)
+               return;
 
-       return NULL;
+       pmd = pmd_offset(pud, start);
+       pud_clear(pud);
+       pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+                                  unsigned long addr, unsigned long end,
+                                  unsigned long floor, unsigned long ceiling)
+{
+       pud_t *pud;
+       unsigned long next;
+       unsigned long start;
+
+       start = addr;
+       pud = pud_offset(pgd, addr);
+       do {
+               next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+               if (pud_none_or_clear_bad(pud))
+                       continue;
+               hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+               if (pud_none(*pud))
+                       continue;
+               free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+       } while (pud++, addr = next, addr != end);
+
+       start &= PGDIR_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PGDIR_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               return;
+
+       pud = pud_offset(pgd, start);
+       pgd_clear(pgd);
+       pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+                           unsigned long addr, unsigned long end,
+                           unsigned long floor, unsigned long ceiling)
+{
+       pgd_t *pgd;
+       unsigned long next;
+       unsigned long start;
+
+       /*
+        * Comments below take from the normal free_pgd_range().  They
+        * apply here too.  The tests against HUGEPD_MASK below are
+        * essential, because we *don't* test for this at the bottom
+        * level.  Without them we'll attempt to free a hugepte table
+        * when we unmap just part of it, even if there are other
+        * active mappings using it.
+        *
+        * The next few lines have given us lots of grief...
+        *
+        * Why are we testing HUGEPD* at this top level?  Because
+        * often there will be no work to do at all, and we'd prefer
+        * not to go all the way down to the bottom just to discover
+        * that.
+        *
+        * Why all these "- 1"s?  Because 0 represents both the bottom
+        * of the address space and the top of it (using -1 for the
+        * top wouldn't help much: the masks would do the wrong thing).
+        * The rule is that addr 0 and floor 0 refer to the bottom of
+        * the address space, but end 0 and ceiling 0 refer to the top
+        * Comparisons need to use "end - 1" and "ceiling - 1" (though
+        * that end 0 case should be mythical).
+        *
+        * Wherever addr is brought up or ceiling brought down, we
+        * must be careful to reject "the opposite 0" before it
+        * confuses the subsequent tests.  But what about where end is
+        * brought down by HUGEPD_SIZE below? no, end can't go down to
+        * 0 there.
+        *
+        * Whereas we round start (addr) and ceiling down, by different
+        * masks at different levels, in order to test whether a table
+        * now has no other vmas using it, so can be freed, we don't
+        * bother to round floor or end up - the tests don't need that.
+        */
+
+       addr &= HUGEPD_MASK;
+       if (addr < floor) {
+               addr += HUGEPD_SIZE;
+               if (!addr)
+                       return;
+       }
+       if (ceiling) {
+               ceiling &= HUGEPD_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               end -= HUGEPD_SIZE;
+       if (addr > end - 1)
+               return;
+
+       start = addr;
+       pgd = pgd_offset((*tlb)->mm, addr);
+       do {
+               BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+               next = pgd_addr_end(addr, end);
+               if (pgd_none_or_clear_bad(pgd))
+                       continue;
+               hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+       } while (pgd++, addr = next, addr != end);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
  out:
        return err;
 }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+       memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+       if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+               return -ENODEV;
+
+       huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+                                              HUGEPTE_TABLE_SIZE,
+                                              HUGEPTE_TABLE_SIZE,
+                                              SLAB_HWCACHE_ALIGN |
+                                              SLAB_MUST_HWCACHE_ALIGN,
+                                              zero_ctor, NULL);
+       if (! huge_pgtable_cache)
+               panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+       return 0;
+}
+
+module_init(hugetlbpage_init);
index babebd15bdc49d7dc1fd905c8450948673c35876..9e30f968c184af90dfad58f5ff357edfe81e0ed3 100644 (file)
@@ -162,7 +162,14 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 };
 #endif /* CONFIG_PPC_64K_PAGES */
 
+#ifdef CONFIG_HUGETLB_PAGE
+/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
+ * can't put into the tables above, because HPAGE_SHIFT is not compile
+ * time constant. */
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+#else
 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+#endif
 
 void pgtable_cache_init(void)
 {
index 3fb061bab9ecd75f0fc912d23fc7cbb71f5c45c7..eab779c219958b18b82afd1ff5c55bc5de05f89b 100644 (file)
@@ -101,6 +101,7 @@ extern unsigned int HPAGE_SHIFT;
                                      - (1U << GET_HTLB_AREA(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+#define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 #define ARCH_HAS_SETCLEAR_HUGE_PTE
 
index a00ee002cd1165f9eb298a4112a4a74712c05966..9f0917c6865978a0c39dfd51a6194a06cb7f4a04 100644 (file)
@@ -17,11 +17,13 @@ extern kmem_cache_t *pgtable_cache[];
 #define PTE_CACHE_NUM  0
 #define PMD_CACHE_NUM  1
 #define PGD_CACHE_NUM  2
+#define HUGEPTE_CACHE_NUM 3
 #else
 #define PTE_CACHE_NUM  0
 #define PMD_CACHE_NUM  1
 #define PUD_CACHE_NUM  1
 #define PGD_CACHE_NUM  0
+#define HUGEPTE_CACHE_NUM 2
 #endif
 
 /*