arch/x86/xen/mmu.c

   1 /*
   2  * Xen mmu operations
   3  *
   4  * This file contains the various mmu fetch and update operations.
   5  * The most important job they must perform is the mapping between the
   6  * domain's pfn and the overall machine mfns.
   7  *
   8  * Xen allows guests to directly update the pagetable, in a controlled
   9  * fashion.  In other words, the guest modifies the same pagetable
  10  * that the CPU actually uses, which eliminates the overhead of having
  11  * a separate shadow pagetable.
  12  *
  13  * In order to allow this, it falls on the guest domain to map its
  14  * notion of a "physical" pfn - which is just a domain-local linear
  15  * address - into a real "machine address" which the CPU's MMU can
  16  * use.
  17  *
  18  * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be
  19  * inserted directly into the pagetable.  When creating a new
  20  * pte/pmd/pgd, it converts the passed pfn into an mfn.  Conversely,
  21  * when reading the content back with __(pgd|pmd|pte)_val, it converts
  22  * the mfn back into a pfn.
  23  *
  24  * The other constraint is that all pages which make up a pagetable
  25  * must be mapped read-only in the guest.  This prevents uncontrolled
  26  * guest updates to the pagetable.  Xen strictly enforces this, and
  27  * will disallow any pagetable update which will end up mapping a
  28  * pagetable page RW, and will disallow using any writable page as a
  29  * pagetable.
  30  *
  31  * Naively, when loading %cr3 with the base of a new pagetable, Xen
  32  * would need to validate the whole pagetable before going on.
  33  * Naturally, this is quite slow.  The solution is to "pin" a
  34  * pagetable, which enforces all the constraints on the pagetable even
  35  * when it is not actively in use.  This means that Xen can be assured
  36  * that it is still valid when you do load it into %cr3, and doesn't
  37  * need to revalidate it.
  38  *
  39  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
  40  */
  41 #include <linux/sched.h>
  42 #include <linux/highmem.h>
  43 #include <linux/debugfs.h>
  44 #include <linux/bug.h>
  45 #include <linux/vmalloc.h>
  46 #include <linux/module.h>
  47 #include <linux/gfp.h>
  48 #include <linux/memblock.h>
  49 #include <linux/seq_file.h>
  50
  51 #include <asm/pgtable.h>
  52 #include <asm/tlbflush.h>
  53 #include <asm/fixmap.h>
  54 #include <asm/mmu_context.h>
  55 #include <asm/setup.h>
  56 #include <asm/paravirt.h>
  57 #include <asm/e820.h>
  58 #include <asm/linkage.h>
  59 #include <asm/page.h>
  60 #include <asm/init.h>
  61 #include <asm/pat.h>
  62
  63 #include <asm/xen/hypercall.h>
  64 #include <asm/xen/hypervisor.h>
  65
  66 #include <xen/xen.h>
  67 #include <xen/page.h>
  68 #include <xen/interface/xen.h>
  69 #include <xen/interface/hvm/hvm_op.h>
  70 #include <xen/interface/version.h>
  71 #include <xen/interface/memory.h>
  72 #include <xen/hvc-console.h>
  73
  74 #include "multicalls.h"
  75 #include "mmu.h"
  76 #include "debugfs.h"
  77
/* Number of slots in the mmu_stats.mmu_update_histo[] batch-size histogram. */
#define MMU_UPDATE_HISTO        30

/*
 * Protects atomic reservation decrease/increase against concurrent increases.
 * Also protects non-atomic updates of current_pages and driver_pages, and
 * balloon lists.
 */
DEFINE_SPINLOCK(xen_reservation_lock);
  86
  87 #ifdef CONFIG_XEN_DEBUG_FS
  88
  89 static struct {
  90         u32 pgd_update;
  91         u32 pgd_update_pinned;
  92         u32 pgd_update_batched;
  93
  94         u32 pud_update;
  95         u32 pud_update_pinned;
  96         u32 pud_update_batched;
  97
  98         u32 pmd_update;
  99         u32 pmd_update_pinned;
 100         u32 pmd_update_batched;
 101
 102         u32 pte_update;
 103         u32 pte_update_pinned;
 104         u32 pte_update_batched;
 105
 106         u32 mmu_update;
 107         u32 mmu_update_extended;
 108         u32 mmu_update_histo[MMU_UPDATE_HISTO];
 109
 110         u32 prot_commit;
 111         u32 prot_commit_batched;
 112
 113         u32 set_pte_at;
 114         u32 set_pte_at_batched;
 115         u32 set_pte_at_pinned;
 116         u32 set_pte_at_current;
 117         u32 set_pte_at_kernel;
 118 } mmu_stats;
 119
 120 static u8 zero_stats;
 121
 122 static inline void check_zero(void)
 123 {
 124         if (unlikely(zero_stats)) {
 125                 memset(&mmu_stats, 0, sizeof(mmu_stats));
 126                 zero_stats = 0;
 127         }
 128 }
 129
 130 #define ADD_STATS(elem, val)                    \
 131         do { check_zero(); mmu_stats.elem += (val); } while(0)
 132
 133 #else  /* !CONFIG_XEN_DEBUG_FS */
 134
 135 #define ADD_STATS(elem, val)    do { (void)(val); } while(0)
 136
 137 #endif /* CONFIG_XEN_DEBUG_FS */
 138
 139
 140 /*
 141  * Identity map, in addition to plain kernel map.  This needs to be
 142  * large enough to allocate page table pages to allocate the rest.
 143  * Each page can map 2MB.
 144  */
 145 #define LEVEL1_IDENT_ENTRIES    (PTRS_PER_PTE * 4)
 146 static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
 147
 148 #ifdef CONFIG_X86_64
/*
 * l3 pud for userspace vsyscall mapping: one full pud table
 * (PTRS_PER_PUD entries) placed in the page-aligned BSS.
 */
static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
 151 #endif /* CONFIG_X86_64 */
 152
/*
 * Note about cr3 (pagetable base) values:
 *
 * xen_cr3 contains the current logical cr3 value; it contains the
 * last set cr3.  This may not be the current effective cr3, because
 * its update may be being lazily deferred.  However, a vcpu looking
 * at its own cr3 can use this value knowing that everything will
 * be self-consistent.
 *
 * xen_current_cr3 contains the actual vcpu cr3; it is set once the
 * hypercall to set the vcpu cr3 is complete (so it may be a little
 * out of date, but it will never be set early).  If one vcpu is
 * looking at another vcpu's cr3 value, it should use this variable.
 */
DEFINE_PER_CPU(unsigned long, xen_cr3);  /* cr3 stored as physaddr */
DEFINE_PER_CPU(unsigned long, xen_current_cr3);  /* actual vcpu cr3 */
 169
 170
/*
 * Just beyond the highest usermode address.  STACK_TOP_MAX has a
 * redzone above it, so round it up to a PGD boundary: add
 * PGDIR_SIZE - 1 and mask with PGDIR_MASK.
 */
#define USER_LIMIT      ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
 176
 177 unsigned long arbitrary_virt_to_mfn(void *vaddr)
 178 {
 179         xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
 180
 181         return PFN_DOWN(maddr.maddr);
 182 }
 183
 184 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 185 {
 186         unsigned long address = (unsigned long)vaddr;
 187         unsigned int level;
 188         pte_t *pte;
 189         unsigned offset;
 190
 191         /*
 192          * if the PFN is in the linear mapped vaddr range, we can just use
 193          * the (quick) virt_to_machine() p2m lookup
 194          */
 195         if (virt_addr_valid(vaddr))
 196                 return virt_to_machine(vaddr);
 197
 198         /* otherwise we have to do a (slower) full page-table walk */
 199
 200         pte = lookup_address(address, &level);
 201         BUG_ON(pte == NULL);
 202         offset = address & ~PAGE_MASK;
 203         return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
 204 }
 205 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
 206
 207 void make_lowmem_page_readonly(void *vaddr)
 208 {
 209         pte_t *pte, ptev;
 210         unsigned long address = (unsigned long)vaddr;
 211         unsigned int level;
 212
 213         pte = lookup_address(address, &level);
 214         if (pte == NULL)
 215                 return;         /* vaddr missing */
 216
 217         ptev = pte_wrprotect(*pte);
 218
 219         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 220                 BUG();
 221 }
 222
 223 void make_lowmem_page_readwrite(void *vaddr)
 224 {
 225         pte_t *pte, ptev;
 226         unsigned long address = (unsigned long)vaddr;
 227         unsigned int level;
 228
 229         pte = lookup_address(address, &level);
 230         if (pte == NULL)
 231                 return;         /* vaddr missing */
 232
 233         ptev = pte_mkwrite(*pte);
 234
 235         if (HYPERVISOR_update_va_mapping(address, ptev, 0))
 236                 BUG();
 237 }
 238
 239
 240 static bool xen_page_pinned(void *ptr)
 241 {
 242         struct page *page = virt_to_page(ptr);
 243
 244         return PagePinned(page);
 245 }
 246
 247 static bool xen_iomap_pte(pte_t pte)
 248 {
 249         return pte_flags(pte) & _PAGE_IOMAP;
 250 }
 251
 252 void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
 253 {
 254         struct multicall_space mcs;
 255         struct mmu_update *u;
 256
 257         mcs = xen_mc_entry(sizeof(*u));
 258         u = mcs.args;
 259
 260         /* ptep might be kmapped when using 32-bit HIGHPTE */
 261         u->ptr = arbitrary_virt_to_machine(ptep).maddr;
 262         u->val = pte_val_ma(pteval);
 263
 264         MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
 265
 266         xen_mc_issue(PARAVIRT_LAZY_MMU);
 267 }
 268 EXPORT_SYMBOL_GPL(xen_set_domain_pte);
 269
/* Install an I/O mapping pte: same as xen_set_domain_pte() with DOMID_IO. */
static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
{
        xen_set_domain_pte(ptep, pteval, DOMID_IO);
}
 274
 275 static void xen_extend_mmu_update(const struct mmu_update *update)
 276 {
 277         struct multicall_space mcs;
 278         struct mmu_update *u;
 279
 280         mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 281
 282         if (mcs.mc != NULL) {
 283                 ADD_STATS(mmu_update_extended, 1);
 284                 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
 285
 286                 mcs.mc->args[1]++;
 287
 288                 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
 289                         ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
 290                 else
 291                         ADD_STATS(mmu_update_histo[0], 1);
 292         } else {
 293                 ADD_STATS(mmu_update, 1);
 294                 mcs = __xen_mc_entry(sizeof(*u));
 295                 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
 296                 ADD_STATS(mmu_update_histo[1], 1);
 297         }
 298
 299         u = mcs.args;
 300         *u = *update;
 301 }
 302
 303 void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 304 {
 305         struct mmu_update u;
 306
 307         preempt_disable();
 308
 309         xen_mc_batch();
 310
 311         /* ptr may be ioremapped for 64-bit pagetable setup */
 312         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 313         u.val = pmd_val_ma(val);
 314         xen_extend_mmu_update(&u);
 315
 316         ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 317
 318         xen_mc_issue(PARAVIRT_LAZY_MMU);
 319
 320         preempt_enable();
 321 }
 322
 323 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 324 {
 325         ADD_STATS(pmd_update, 1);
 326
 327         /* If page is not pinned, we can just update the entry
 328            directly */
 329         if (!xen_page_pinned(ptr)) {
 330                 *ptr = val;
 331                 return;
 332         }
 333
 334         ADD_STATS(pmd_update_pinned, 1);
 335
 336         xen_set_pmd_hyper(ptr, val);
 337 }
 338
 339 /*
 340  * Associate a virtual page frame with a given physical page frame
 341  * and protection flags for that frame.
 342  */
 343 void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 344 {
 345         set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
 346 }
 347
 348 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 349                     pte_t *ptep, pte_t pteval)
 350 {
 351         if (xen_iomap_pte(pteval)) {
 352                 xen_set_iomap_pte(ptep, pteval);
 353                 goto out;
 354         }
 355
 356         ADD_STATS(set_pte_at, 1);
 357 //      ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 358         ADD_STATS(set_pte_at_current, mm == current->mm);
 359         ADD_STATS(set_pte_at_kernel, mm == &init_mm);
 360
 361         if (mm == current->mm || mm == &init_mm) {
 362                 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 363                         struct multicall_space mcs;
 364                         mcs = xen_mc_entry(0);
 365
 366                         MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
 367                         ADD_STATS(set_pte_at_batched, 1);
 368                         xen_mc_issue(PARAVIRT_LAZY_MMU);
 369                         goto out;
 370                 } else
 371                         if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
 372                                 goto out;
 373         }
 374         xen_set_pte(ptep, pteval);
 375
 376 out:    return;
 377 }
 378
/*
 * Start a protection change on *@ptep: returns the current pte value.
 * @mm and @addr are unused here.
 */
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
                                 unsigned long addr, pte_t *ptep)
{
        /* Just return the pte as-is.  We preserve the bits on commit */
        return *ptep;
}
 385
 386 void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 387                                  pte_t *ptep, pte_t pte)
 388 {
 389         struct mmu_update u;
 390
 391         xen_mc_batch();
 392
 393         u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 394         u.val = pte_val_ma(pte);
 395         xen_extend_mmu_update(&u);
 396
 397         ADD_STATS(prot_commit, 1);
 398         ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 399
 400         xen_mc_issue(PARAVIRT_LAZY_MMU);
 401 }
 402
 403 /* Assume pteval_t is equivalent to all the other *val_t types. */
 404 static pteval_t pte_mfn_to_pfn(pteval_t val)
 405 {
 406         if (val & _PAGE_PRESENT) {
 407                 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 408                 pteval_t flags = val & PTE_FLAGS_MASK;
 409                 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
 410         }
 411
 412         return val;
 413 }
 414
 415 static pteval_t pte_pfn_to_mfn(pteval_t val)
 416 {
 417         if (val & _PAGE_PRESENT) {
 418                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 419                 pteval_t flags = val & PTE_FLAGS_MASK;
 420                 unsigned long mfn;
 421
 422                 if (!xen_feature(XENFEAT_auto_translated_physmap))
 423                         mfn = get_phys_to_machine(pfn);
 424                 else
 425                         mfn = pfn;
 426                 /*
 427                  * If there's no mfn for the pfn, then just create an
 428                  * empty non-present pte.  Unfortunately this loses
 429                  * information about the original pfn, so
 430                  * pte_mfn_to_pfn is asymmetric.
 431                  */
 432                 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 433                         mfn = 0;
 434                         flags = 0;
 435                 } else {
 436                         /*
 437                          * Paramount to do this test _after_ the
 438                          * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
 439                          * IDENTITY_FRAME_BIT resolves to true.
 440                          */
 441                         mfn &= ~FOREIGN_FRAME_BIT;
 442                         if (mfn & IDENTITY_FRAME_BIT) {
 443                                 mfn &= ~IDENTITY_FRAME_BIT;
 444                                 flags |= _PAGE_IOMAP;
 445                         }
 446                 }
 447                 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 448         }
 449
 450         return val;
 451 }
 452
 453 static pteval_t iomap_pte(pteval_t val)
 454 {
 455         if (val & _PAGE_PRESENT) {
 456                 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
 457                 pteval_t flags = val & PTE_FLAGS_MASK;
 458
 459                 /* We assume the pte frame number is a MFN, so
 460                    just use it as-is. */
 461                 val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
 462         }
 463
 464         return val;
 465 }
 466
 467 pteval_t xen_pte_val(pte_t pte)
 468 {
 469         pteval_t pteval = pte.pte;
 470
 471         /* If this is a WC pte, convert back from Xen WC to Linux WC */
 472         if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
 473                 WARN_ON(!pat_enabled);
 474                 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 475         }
 476
 477         if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
 478                 return pteval;
 479
 480         return pte_mfn_to_pfn(pteval);
 481 }
 482 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
 483
/* Return the value of @pgd with its frame converted from mfn to pfn. */
pgdval_t xen_pgd_val(pgd_t pgd)
{
        return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
 489
 490 /*
 491  * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
 492  * are reserved for now, to correspond to the Intel-reserved PAT
 493  * types.
 494  *
 495  * We expect Linux's PAT set as follows:
 496  *
 497  * Idx  PTE flags        Linux    Xen    Default
 498  * 0                     WB       WB     WB
 499  * 1            PWT      WC       WT     WT
 500  * 2        PCD          UC-      UC-    UC-
 501  * 3        PCD PWT      UC       UC     UC
 502  * 4    PAT              WB       WC     WB
 503  * 5    PAT     PWT      WC       WP     WT
 504  * 6    PAT PCD          UC-      UC     UC-
 505  * 7    PAT PCD PWT      UC       UC     UC
 506  */
 507
/*
 * Check the PAT value Linux asks for.  Nothing is programmed here: the
 * function only warns when @pat is not the one layout this code
 * expects.
 */
void xen_set_pat(u64 pat)
{
        /* We expect Linux to use a PAT setting of
         * UC UC- WC WB (ignoring the PAT flag) */
        WARN_ON(pat != 0x0007010600070106ull);
}
 514
 515 pte_t xen_make_pte(pteval_t pte)
 516 {
 517         phys_addr_t addr = (pte & PTE_PFN_MASK);
 518
 519         /* If Linux is trying to set a WC pte, then map to the Xen WC.
 520          * If _PAGE_PAT is set, then it probably means it is really
 521          * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
 522          * things work out OK...
 523          *
 524          * (We should never see kernel mappings with _PAGE_PSE set,
 525          * but we could see hugetlbfs mappings, I think.).
 526          */
 527         if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
 528                 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
 529                         pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 530         }
 531
 532         /*
 533          * Unprivileged domains are allowed to do IOMAPpings for
 534          * PCI passthrough, but not map ISA space.  The ISA
 535          * mappings are just dummy local mappings to keep other
 536          * parts of the kernel happy.
 537          */
 538         if (unlikely(pte & _PAGE_IOMAP) &&
 539             (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 540                 pte = iomap_pte(pte);
 541         } else {
 542                 pte &= ~_PAGE_IOMAP;
 543                 pte = pte_pfn_to_mfn(pte);
 544         }
 545
 546         return native_make_pte(pte);
 547 }
 548 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
 549
 550 pgd_t xen_make_pgd(pgdval_t pgd)
 551 {
 552         pgd = pte_pfn_to_mfn(pgd);
 553         return native_make_pgd(pgd);
 554 }
 555 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
 556
/* Return the value of @pmd with its frame converted from mfn to pfn. */
pmdval_t xen_pmd_val(pmd_t pmd)
{
        return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
 562
 563 void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 564 {
 565         struct mmu_update u;
 566
 567         preempt_disable();
 568
 569         xen_mc_batch();
 570
 571         /* ptr may be ioremapped for 64-bit pagetable setup */
 572         u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 573         u.val = pud_val_ma(val);
 574         xen_extend_mmu_update(&u);
 575
 576         ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 577
 578         xen_mc_issue(PARAVIRT_LAZY_MMU);
 579
 580         preempt_enable();
 581 }
 582
 583 void xen_set_pud(pud_t *ptr, pud_t val)
 584 {
 585         ADD_STATS(pud_update, 1);
 586
 587         /* If page is not pinned, we can just update the entry
 588            directly */
 589         if (!xen_page_pinned(ptr)) {
 590                 *ptr = val;
 591                 return;
 592         }
 593
 594         ADD_STATS(pud_update_pinned, 1);
 595
 596         xen_set_pud_hyper(ptr, val);
 597 }
 598
 599 void xen_set_pte(pte_t *ptep, pte_t pte)
 600 {
 601         if (xen_iomap_pte(pte)) {
 602                 xen_set_iomap_pte(ptep, pte);
 603                 return;
 604         }
 605
 606         ADD_STATS(pte_update, 1);
 607 //      ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
 608         ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 609
 610 #ifdef CONFIG_X86_PAE
 611         ptep->pte_high = pte.pte_high;
 612         smp_wmb();
 613         ptep->pte_low = pte.pte_low;
 614 #else
 615         *ptep = pte;
 616 #endif
 617 }
 618
 619 #ifdef CONFIG_X86_PAE
 620 void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
 621 {
 622         if (xen_iomap_pte(pte)) {
 623                 xen_set_iomap_pte(ptep, pte);
 624                 return;
 625         }
 626
 627         set_64bit((u64 *)ptep, native_pte_val(pte));
 628 }
 629
/*
 * PAE: clear the pte at @ptep with two 32-bit stores, low word first,
 * separated by a write barrier.  @mm and @addr are unused here.
 */
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
        ptep->pte_low = 0;
        smp_wmb();              /* make sure low gets written first */
        ptep->pte_high = 0;
}
 636
/* PAE: clear a pmd entry by setting it to an all-zero pmd via set_pmd(). */
void xen_pmd_clear(pmd_t *pmdp)
{
        set_pmd(pmdp, __pmd(0));
}
 641 #endif  /* CONFIG_X86_PAE */
 642
 643 pmd_t xen_make_pmd(pmdval_t pmd)
 644 {
 645         pmd = pte_pfn_to_mfn(pmd);
 646         return native_make_pmd(pmd);
 647 }
 648 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 649
 650 #if PAGETABLE_LEVELS == 4
/* Return the value of @pud with its frame converted from mfn to pfn. */
pudval_t xen_pud_val(pud_t pud)
{
        return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
 656
 657 pud_t xen_make_pud(pudval_t pud)
 658 {
 659         pud = pte_pfn_to_mfn(pud);
 660
 661         return native_make_pud(pud);
 662 }
 663 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
 664
 665 pgd_t *xen_get_user_pgd(pgd_t *pgd)
 666 {
 667         pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
 668         unsigned offset = pgd - pgd_page;
 669         pgd_t *user_ptr = NULL;
 670
 671         if (offset < pgd_index(USER_LIMIT)) {
 672                 struct page *page = virt_to_page(pgd_page);
 673                 user_ptr = (pgd_t *)page->private;
 674                 if (user_ptr)
 675                         user_ptr += offset;
 676         }
 677
 678         return user_ptr;
 679 }
 680
 681 static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 682 {
 683         struct mmu_update u;
 684
 685         u.ptr = virt_to_machine(ptr).maddr;
 686         u.val = pgd_val_ma(val);
 687         xen_extend_mmu_update(&u);
 688 }
 689
/*
 * Raw hypercall-based set_pgd, intended for use in early boot, before
 * there's a page structure.  This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3. It has no user pagetable attached to it
 *
 * The update is queued and issued as its own multicall batch with
 * preemption disabled.
 */
void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
        preempt_disable();

        xen_mc_batch();

        __xen_set_pgd_hyper(ptr, val);

        xen_mc_issue(PARAVIRT_LAZY_MMU);

        preempt_enable();
}
 709
 710 void xen_set_pgd(pgd_t *ptr, pgd_t val)
 711 {
 712         pgd_t *user_ptr = xen_get_user_pgd(ptr);
 713
 714         ADD_STATS(pgd_update, 1);
 715
 716         /* If page is not pinned, we can just update the entry
 717            directly */
 718         if (!xen_page_pinned(ptr)) {
 719                 *ptr = val;
 720                 if (user_ptr) {
 721                         WARN_ON(xen_page_pinned(user_ptr));
 722                         *user_ptr = val;
 723                 }
 724                 return;
 725         }
 726
 727         ADD_STATS(pgd_update_pinned, 1);
 728         ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 729
 730         /* If it's pinned, then we can at least batch the kernel and
 731            user updates together. */
 732         xen_mc_batch();
 733
 734         __xen_set_pgd_hyper(ptr, val);
 735         if (user_ptr)
 736                 __xen_set_pgd_hyper(user_ptr, val);
 737
 738         xen_mc_issue(PARAVIRT_LAZY_MMU);
 739 }
 740 #endif  /* PAGETABLE_LEVELS == 4 */
 741
/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 *
 * @mm:    passed through unchanged to @func
 * @pgd:   top-level table to walk
 * @func:  callback invoked for every pud, pmd and pte page found, and
 *         finally for the pgd page itself
 * @limit: first address that is NOT covered by the walk
 *
 * Returns the bitwise OR of all callback return values, or 0 without
 * calling @func at all when XENFEAT_auto_translated_physmap is set.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
                          int (*func)(struct mm_struct *mm, struct page *,
                                      enum pt_level),
                          unsigned long limit)
{
        int flush = 0;
        unsigned hole_low, hole_high;
        unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
        unsigned pgdidx, pudidx, pmdidx;

        /* The limit is the last byte to be touched */
        limit--;
        BUG_ON(limit >= FIXADDR_TOP);

        if (xen_feature(XENFEAT_auto_translated_physmap))
                return 0;

        /*
         * 64-bit has a great big hole in the middle of the address
         * space, which contains the Xen mappings.  On 32-bit these
         * will end up making a zero-sized hole and so is a no-op.
         */
        hole_low = pgd_index(USER_LIMIT);
        hole_high = pgd_index(PAGE_OFFSET);

        /*
         * Index of the last entry to visit at each level; a folded
         * level (a single entry per table) gets a limit of 0.
         */
        pgdidx_limit = pgd_index(limit);
#if PTRS_PER_PUD > 1
        pudidx_limit = pud_index(limit);
#else
        pudidx_limit = 0;
#endif
#if PTRS_PER_PMD > 1
        pmdidx_limit = pmd_index(limit);
#else
        pmdidx_limit = 0;
#endif

        for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
                pud_t *pud;

                /* skip pgd slots inside the hole */
                if (pgdidx >= hole_low && pgdidx < hole_high)
                        continue;

                /* skip empty pgd slots */
                if (!pgd_val(pgd[pgdidx]))
                        continue;

                pud = pud_offset(&pgd[pgdidx], 0);

                if (PTRS_PER_PUD > 1) /* not folded */
                        flush |= (*func)(mm, virt_to_page(pud), PT_PUD);

                for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
                        pmd_t *pmd;

                        /* past the limit within the last pgd slot: done */
                        if (pgdidx == pgdidx_limit &&
                            pudidx > pudidx_limit)
                                goto out;

                        if (pud_none(pud[pudidx]))
                                continue;

                        pmd = pmd_offset(&pud[pudidx], 0);

                        if (PTRS_PER_PMD > 1) /* not folded */
                                flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);

                        for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
                                struct page *pte;

                                /* past the limit within the last pud slot: done */
                                if (pgdidx == pgdidx_limit &&
                                    pudidx == pudidx_limit &&
                                    pmdidx > pmdidx_limit)
                                        goto out;

                                if (pmd_none(pmd[pmdidx]))
                                        continue;

                                pte = pmd_page(pmd[pmdidx]);
                                flush |= (*func)(mm, pte, PT_PTE);
                        }
                }
        }

out:
        /* Do the top level last, so that the callbacks can use it as
           a cue to do final things like tlb flushes. */
        flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

        return flush;
}
 847
/*
 * Walk @mm's own pagetable (mm->pgd) with __xen_pgd_walk(), calling
 * @func on each pagetable page up to @limit.  Returns what
 * __xen_pgd_walk() returns.
 */
static int xen_pgd_walk(struct mm_struct *mm,
                        int (*func)(struct mm_struct *mm, struct page *,
                                    enum pt_level),
                        unsigned long limit)
{
        return __xen_pgd_walk(mm, mm->pgd, func, limit);
}
 855
 856 /* If we're using split pte locks, then take the page's lock and
 857    return a pointer to it.  Otherwise return NULL. */
 858 static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 859 {
 860         spinlock_t *ptl = NULL;
 861
 862 #if USE_SPLIT_PTLOCKS
 863         ptl = __pte_lockptr(page);
 864         spin_lock_nest_lock(ptl, &mm->page_table_lock);
 865 #endif
 866
 867         return ptl;
 868 }
 869
 870 static void xen_pte_unlock(void *v)
 871 {
 872         spinlock_t *ptl = v;
 873         spin_unlock(ptl);
 874 }
 875
 876 static void xen_do_pin(unsigned level, unsigned long pfn)
 877 {
 878         struct mmuext_op *op;
 879         struct multicall_space mcs;
 880
 881         mcs = __xen_mc_entry(sizeof(*op));
 882         op = mcs.args;
 883         op->cmd = level;
 884         op->arg1.mfn = pfn_to_mfn(pfn);
 885         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 886 }
 887
 888 static int xen_pin_page(struct mm_struct *mm, struct page *page,
 889                         enum pt_level level)
 890 {
 891         unsigned pgfl = TestSetPagePinned(page);
 892         int flush;
 893
 894         if (pgfl)
 895                 flush = 0;              /* already pinned */
 896         else if (PageHighMem(page))
 897                 /* kmaps need flushing if we found an unpinned
 898                    highpage */
 899                 flush = 1;
 900         else {
 901                 void *pt = lowmem_page_address(page);
 902                 unsigned long pfn = page_to_pfn(page);
 903                 struct multicall_space mcs = __xen_mc_entry(0);
 904                 spinlock_t *ptl;
 905
 906                 flush = 0;
 907
 908                 /*
 909                  * We need to hold the pagetable lock between the time
 910                  * we make the pagetable RO and when we actually pin
 911                  * it.  If we don't, then other users may come in and
 912                  * attempt to update the pagetable by writing it,
 913                  * which will fail because the memory is RO but not
 914                  * pinned, so Xen won't do the trap'n'emulate.
 915                  *
 916                  * If we're using split pte locks, we can't hold the
 917                  * entire pagetable's worth of locks during the
 918                  * traverse, because we may wrap the preempt count (8
 919                  * bits).  The solution is to mark RO and pin each PTE
 920                  * page while holding the lock.  This means the number
 921                  * of locks we end up holding is never more than a
 922                  * batch size (~32 entries, at present).
 923                  *
 924                  * If we're not using split pte locks, we needn't pin
 925                  * the PTE pages independently, because we're
 926                  * protected by the overall pagetable lock.
 927                  */
 928                 ptl = NULL;
 929                 if (level == PT_PTE)
 930                         ptl = xen_pte_lock(page, mm);
 931
 932                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 933                                         pfn_pte(pfn, PAGE_KERNEL_RO),
 934                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 935
 936                 if (ptl) {
 937                         xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 938
 939                         /* Queue a deferred unlock for when this batch
 940                            is completed. */
 941                         xen_mc_callback(xen_pte_unlock, ptl);
 942                 }
 943         }
 944
 945         return flush;
 946 }
 947
 948 /* This is called just after a mm has been created, but it has not
 949    been used yet.  We need to make sure that its pagetable is all
 950    read-only, and can be pinned. */
 951 static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 952 {
 953         xen_mc_batch();
 954
 955         if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
 956                 /* re-enable interrupts for flushing */
 957                 xen_mc_issue(0);
 958
 959                 kmap_flush_unused();
 960
 961                 xen_mc_batch();
 962         }
 963
 964 #ifdef CONFIG_X86_64
 965         {
 966                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
 967
 968                 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
 969
 970                 if (user_pgd) {
 971                         xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
 972                         xen_do_pin(MMUEXT_PIN_L4_TABLE,
 973                                    PFN_DOWN(__pa(user_pgd)));
 974                 }
 975         }
 976 #else /* CONFIG_X86_32 */
 977 #ifdef CONFIG_X86_PAE
 978         /* Need to make sure unshared kernel PMD is pinnable */
 979         xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
 980                      PT_PMD);
 981 #endif
 982         xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
 983 #endif /* CONFIG_X86_64 */
 984         xen_mc_issue(0);
 985 }
 986
 987 static void xen_pgd_pin(struct mm_struct *mm)
 988 {
 989         __xen_pgd_pin(mm, mm->pgd);
 990 }
 991
 992 /*
 993  * On save, we need to pin all pagetables to make sure they get their
 994  * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 995  * them (unpinned pgds are not currently in use, probably because the
 996  * process is under construction or destruction).
 997  *
 998  * Expected to be called in stop_machine() ("equivalent to taking
 999  * every spinlock in the system"), so the locking doesn't really
1000  * matter all that much.
1001  */
1002 void xen_mm_pin_all(void)
1003 {
1004         unsigned long flags;
1005         struct page *page;
1006
1007         spin_lock_irqsave(&pgd_lock, flags);
1008
1009         list_for_each_entry(page, &pgd_list, lru) {
1010                 if (!PagePinned(page)) {
1011                         __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
1012                         SetPageSavePinned(page);
1013                 }
1014         }
1015
1016         spin_unlock_irqrestore(&pgd_lock, flags);
1017 }
1018
1019 /*
1020  * The init_mm pagetable is really pinned as soon as its created, but
1021  * that's before we have page structures to store the bits.  So do all
1022  * the book-keeping now.
1023  */
1024 static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
1025                                   enum pt_level level)
1026 {
1027         SetPagePinned(page);
1028         return 0;
1029 }
1030
1031 static void __init xen_mark_init_mm_pinned(void)
1032 {
1033         xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
1034 }
1035
1036 static int xen_unpin_page(struct mm_struct *mm, struct page *page,
1037                           enum pt_level level)
1038 {
1039         unsigned pgfl = TestClearPagePinned(page);
1040
1041         if (pgfl && !PageHighMem(page)) {
1042                 void *pt = lowmem_page_address(page);
1043                 unsigned long pfn = page_to_pfn(page);
1044                 spinlock_t *ptl = NULL;
1045                 struct multicall_space mcs;
1046
1047                 /*
1048                  * Do the converse to pin_page.  If we're using split
1049                  * pte locks, we must be holding the lock for while
1050                  * the pte page is unpinned but still RO to prevent
1051                  * concurrent updates from seeing it in this
1052                  * partially-pinned state.
1053                  */
1054                 if (level == PT_PTE) {
1055                         ptl = xen_pte_lock(page, mm);
1056
1057                         if (ptl)
1058                                 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
1059                 }
1060
1061                 mcs = __xen_mc_entry(0);
1062
1063                 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
1064                                         pfn_pte(pfn, PAGE_KERNEL),
1065                                         level == PT_PGD ? UVMF_TLB_FLUSH : 0);
1066
1067                 if (ptl) {
1068                         /* unlock when batch completed */
1069                         xen_mc_callback(xen_pte_unlock, ptl);
1070                 }
1071         }
1072
1073         return 0;               /* never need to flush on unpin */
1074 }
1075
1076 /* Release a pagetables pages back as normal RW */
1077 static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
1078 {
1079         xen_mc_batch();
1080
1081         xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1082
1083 #ifdef CONFIG_X86_64
1084         {
1085                 pgd_t *user_pgd = xen_get_user_pgd(pgd);
1086
1087                 if (user_pgd) {
1088                         xen_do_pin(MMUEXT_UNPIN_TABLE,
1089                                    PFN_DOWN(__pa(user_pgd)));
1090                         xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
1091                 }
1092         }
1093 #endif
1094
1095 #ifdef CONFIG_X86_PAE
1096         /* Need to make sure unshared kernel PMD is unpinned */
1097         xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
1098                        PT_PMD);
1099 #endif
1100
1101         __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);
1102
1103         xen_mc_issue(0);
1104 }
1105
1106 static void xen_pgd_unpin(struct mm_struct *mm)
1107 {
1108         __xen_pgd_unpin(mm, mm->pgd);
1109 }
1110
1111 /*
1112  * On resume, undo any pinning done at save, so that the rest of the
1113  * kernel doesn't see any unexpected pinned pagetables.
1114  */
1115 void xen_mm_unpin_all(void)
1116 {
1117         unsigned long flags;
1118         struct page *page;
1119
1120         spin_lock_irqsave(&pgd_lock, flags);
1121
1122         list_for_each_entry(page, &pgd_list, lru) {
1123                 if (PageSavePinned(page)) {
1124                         BUG_ON(!PagePinned(page));
1125                         __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
1126                         ClearPageSavePinned(page);
1127                 }
1128         }
1129
1130         spin_unlock_irqrestore(&pgd_lock, flags);
1131 }
1132
1133 void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1134 {
1135         spin_lock(&next->page_table_lock);
1136         xen_pgd_pin(next);
1137         spin_unlock(&next->page_table_lock);
1138 }
1139
1140 void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1141 {
1142         spin_lock(&mm->page_table_lock);
1143         xen_pgd_pin(mm);
1144         spin_unlock(&mm->page_table_lock);
1145 }
1146
1147
1148 #ifdef CONFIG_SMP
1149 /* Another cpu may still have their %cr3 pointing at the pagetable, so
1150    we need to repoint it somewhere else before we can unpin it. */
1151 static void drop_other_mm_ref(void *info)
1152 {
1153         struct mm_struct *mm = info;
1154         struct mm_struct *active_mm;
1155
1156         active_mm = percpu_read(cpu_tlbstate.active_mm);
1157
1158         if (active_mm == mm)
1159                 leave_mm(smp_processor_id());
1160
1161         /* If this cpu still has a stale cr3 reference, then make sure
1162            it has been flushed. */
1163         if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
1164                 load_cr3(swapper_pg_dir);
1165 }
1166
1167 static void xen_drop_mm_ref(struct mm_struct *mm)
1168 {
1169         cpumask_var_t mask;
1170         unsigned cpu;
1171
1172         if (current->active_mm == mm) {
1173                 if (current->mm == mm)
1174                         load_cr3(swapper_pg_dir);
1175                 else
1176                         leave_mm(smp_processor_id());
1177         }
1178
1179         /* Get the "official" set of cpus referring to our pagetable. */
1180         if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
1181                 for_each_online_cpu(cpu) {
1182                         if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
1183                             && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
1184                                 continue;
1185                         smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
1186                 }
1187                 return;
1188         }
1189         cpumask_copy(mask, mm_cpumask(mm));
1190
1191         /* It's possible that a vcpu may have a stale reference to our
1192            cr3, because its in lazy mode, and it hasn't yet flushed
1193            its set of pending hypercalls yet.  In this case, we can
1194            look at its actual current cr3 value, and force it to flush
1195            if needed. */
1196         for_each_online_cpu(cpu) {
1197                 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
1198                         cpumask_set_cpu(cpu, mask);
1199         }
1200
1201         if (!cpumask_empty(mask))
1202                 smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
1203         free_cpumask_var(mask);
1204 }
1205 #else
1206 static void xen_drop_mm_ref(struct mm_struct *mm)
1207 {
1208         if (current->active_mm == mm)
1209                 load_cr3(swapper_pg_dir);
1210 }
1211 #endif
1212
1213 /*
1214  * While a process runs, Xen pins its pagetables, which means that the
1215  * hypervisor forces it to be read-only, and it controls all updates
1216  * to it.  This means that all pagetable updates have to go via the
1217  * hypervisor, which is moderately expensive.
1218  *
1219  * Since we're pulling the pagetable down, we switch to use init_mm,
1220  * unpin old process pagetable and mark it all read-write, which
1221  * allows further operations on it to be simple memory accesses.
1222  *
1223  * The only subtle point is that another CPU may be still using the
1224  * pagetable because of lazy tlb flushing.  This means we need need to
1225  * switch all CPUs off this pagetable before we can unpin it.
1226  */
1227 void xen_exit_mmap(struct mm_struct *mm)
1228 {
1229         get_cpu();              /* make sure we don't move around */
1230         xen_drop_mm_ref(mm);
1231         put_cpu();
1232
1233         spin_lock(&mm->page_table_lock);
1234
1235         /* pgd may not be pinned in the error exit path of execve */
1236         if (xen_page_pinned(mm->pgd))
1237                 xen_pgd_unpin(mm);
1238
1239         spin_unlock(&mm->page_table_lock);
1240 }
1241
1242 static __init void xen_pagetable_setup_start(pgd_t *base)
1243 {
1244 }
1245
1246 static void xen_post_allocator_init(void);
1247
1248 static __init void xen_pagetable_setup_done(pgd_t *base)
1249 {
1250         xen_setup_shared_info();
1251         xen_post_allocator_init();
1252 }
1253
1254 static void xen_write_cr2(unsigned long cr2)
1255 {
1256         percpu_read(xen_vcpu)->arch.cr2 = cr2;
1257 }
1258
1259 static unsigned long xen_read_cr2(void)
1260 {
1261         return percpu_read(xen_vcpu)->arch.cr2;
1262 }
1263
1264 unsigned long xen_read_cr2_direct(void)
1265 {
1266         return percpu_read(xen_vcpu_info.arch.cr2);
1267 }
1268
1269 static void xen_flush_tlb(void)
1270 {
1271         struct mmuext_op *op;
1272         struct multicall_space mcs;
1273
1274         preempt_disable();
1275
1276         mcs = xen_mc_entry(sizeof(*op));
1277
1278         op = mcs.args;
1279         op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
1280         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1281
1282         xen_mc_issue(PARAVIRT_LAZY_MMU);
1283
1284         preempt_enable();
1285 }
1286
1287 static void xen_flush_tlb_single(unsigned long addr)
1288 {
1289         struct mmuext_op *op;
1290         struct multicall_space mcs;
1291
1292         preempt_disable();
1293
1294         mcs = xen_mc_entry(sizeof(*op));
1295         op = mcs.args;
1296         op->cmd = MMUEXT_INVLPG_LOCAL;
1297         op->arg1.linear_addr = addr & PAGE_MASK;
1298         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1299
1300         xen_mc_issue(PARAVIRT_LAZY_MMU);
1301
1302         preempt_enable();
1303 }
1304
1305 static void xen_flush_tlb_others(const struct cpumask *cpus,
1306                                  struct mm_struct *mm, unsigned long va)
1307 {
1308         struct {
1309                 struct mmuext_op op;
1310                 DECLARE_BITMAP(mask, NR_CPUS);
1311         } *args;
1312         struct multicall_space mcs;
1313
1314         if (cpumask_empty(cpus))
1315                 return;         /* nothing to do */
1316
1317         mcs = xen_mc_entry(sizeof(*args));
1318         args = mcs.args;
1319         args->op.arg2.vcpumask = to_cpumask(args->mask);
1320
1321         /* Remove us, and any offline CPUS. */
1322         cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
1323         cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
1324
1325         if (va == TLB_FLUSH_ALL) {
1326                 args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
1327         } else {
1328                 args->op.cmd = MMUEXT_INVLPG_MULTI;
1329                 args->op.arg1.linear_addr = va;
1330         }
1331
1332         MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
1333
1334         xen_mc_issue(PARAVIRT_LAZY_MMU);
1335 }
1336
1337 static unsigned long xen_read_cr3(void)
1338 {
1339         return percpu_read(xen_cr3);
1340 }
1341
1342 static void set_current_cr3(void *v)
1343 {
1344         percpu_write(xen_current_cr3, (unsigned long)v);
1345 }
1346
1347 static void __xen_write_cr3(bool kernel, unsigned long cr3)
1348 {
1349         struct mmuext_op *op;
1350         struct multicall_space mcs;
1351         unsigned long mfn;
1352
1353         if (cr3)
1354                 mfn = pfn_to_mfn(PFN_DOWN(cr3));
1355         else
1356                 mfn = 0;
1357
1358         WARN_ON(mfn == 0 && kernel);
1359
1360         mcs = __xen_mc_entry(sizeof(*op));
1361
1362         op = mcs.args;
1363         op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
1364         op->arg1.mfn = mfn;
1365
1366         MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
1367
1368         if (kernel) {
1369                 percpu_write(xen_cr3, cr3);
1370
1371                 /* Update xen_current_cr3 once the batch has actually
1372                    been submitted. */
1373                 xen_mc_callback(set_current_cr3, (void *)cr3);
1374         }
1375 }
1376
1377 static void xen_write_cr3(unsigned long cr3)
1378 {
1379         BUG_ON(preemptible());
1380
1381         xen_mc_batch();  /* disables interrupts */
1382
1383         /* Update while interrupts are disabled, so its atomic with
1384            respect to ipis */
1385         percpu_write(xen_cr3, cr3);
1386
1387         __xen_write_cr3(true, cr3);
1388
1389 #ifdef CONFIG_X86_64
1390         {
1391                 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
1392                 if (user_pgd)
1393                         __xen_write_cr3(false, __pa(user_pgd));
1394                 else
1395                         __xen_write_cr3(false, 0);
1396         }
1397 #endif
1398
1399         xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
1400 }
1401
1402 static int xen_pgd_alloc(struct mm_struct *mm)
1403 {
1404         pgd_t *pgd = mm->pgd;
1405         int ret = 0;
1406
1407         BUG_ON(PagePinned(virt_to_page(pgd)));
1408
1409 #ifdef CONFIG_X86_64
1410         {
1411                 struct page *page = virt_to_page(pgd);
1412                 pgd_t *user_pgd;
1413
1414                 BUG_ON(page->private != 0);
1415
1416                 ret = -ENOMEM;
1417
1418                 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
1419                 page->private = (unsigned long)user_pgd;
1420
1421                 if (user_pgd != NULL) {
1422                         user_pgd[pgd_index(VSYSCALL_START)] =
1423                                 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
1424                         ret = 0;
1425                 }
1426
1427                 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
1428         }
1429 #endif
1430
1431         return ret;
1432 }
1433
1434 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1435 {
1436 #ifdef CONFIG_X86_64
1437         pgd_t *user_pgd = xen_get_user_pgd(pgd);
1438
1439         if (user_pgd)
1440                 free_page((unsigned long)user_pgd);
1441 #endif
1442 }
1443
1444 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1445 {
1446         unsigned long pfn = pte_pfn(pte);
1447
1448 #ifdef CONFIG_X86_32
1449         /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1450         if (pte_val_ma(*ptep) & _PAGE_PRESENT)
1451                 pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
1452                                pte_val_ma(pte));
1453 #endif
1454
1455         /*
1456          * If the new pfn is within the range of the newly allocated
1457          * kernel pagetable, and it isn't being mapped into an
1458          * early_ioremap fixmap slot, make sure it is RO.
1459          */
1460         if (!is_early_ioremap_ptep(ptep) &&
1461             pfn >= e820_table_start && pfn < e820_table_end)
1462                 pte = pte_wrprotect(pte);
1463
1464         return pte;
1465 }
1466
1467 /* Init-time set_pte while constructing initial pagetables, which
1468    doesn't allow RO pagetable pages to be remapped RW */
1469 static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
1470 {
1471         pte = mask_rw_pte(ptep, pte);
1472
1473         xen_set_pte(ptep, pte);
1474 }
1475
1476 static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1477 {
1478         struct mmuext_op op;
1479         op.cmd = cmd;
1480         op.arg1.mfn = pfn_to_mfn(pfn);
1481         if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
1482                 BUG();
1483 }
1484
1485 /* Early in boot, while setting up the initial pagetable, assume
1486    everything is pinned. */
1487 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1488 {
1489 #ifdef CONFIG_FLATMEM
1490         BUG_ON(mem_map);        /* should only be used early */
1491 #endif
1492         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1493         pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1494 }
1495
1496 /* Used for pmd and pud */
1497 static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1498 {
1499 #ifdef CONFIG_FLATMEM
1500         BUG_ON(mem_map);        /* should only be used early */
1501 #endif
1502         make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
1503 }
1504
1505 /* Early release_pte assumes that all pts are pinned, since there's
1506    only init_mm and anything attached to that is pinned. */
1507 static __init void xen_release_pte_init(unsigned long pfn)
1508 {
1509         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1510         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1511 }
1512
1513 static __init void xen_release_pmd_init(unsigned long pfn)
1514 {
1515         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1516 }
1517
1518 /* This needs to make sure the new pte page is pinned iff its being
1519    attached to a pinned pagetable. */
1520 static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
1521 {
1522         struct page *page = pfn_to_page(pfn);
1523
1524         if (PagePinned(virt_to_page(mm->pgd))) {
1525                 SetPagePinned(page);
1526
1527                 if (!PageHighMem(page)) {
1528                         make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
1529                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1530                                 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
1531                 } else {
1532                         /* make sure there are no stray mappings of
1533                            this page */
1534                         kmap_flush_unused();
1535                 }
1536         }
1537 }
1538
1539 static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1540 {
1541         xen_alloc_ptpage(mm, pfn, PT_PTE);
1542 }
1543
1544 static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1545 {
1546         xen_alloc_ptpage(mm, pfn, PT_PMD);
1547 }
1548
1549 /* This should never happen until we're OK to use struct page */
1550 static void xen_release_ptpage(unsigned long pfn, unsigned level)
1551 {
1552         struct page *page = pfn_to_page(pfn);
1553
1554         if (PagePinned(page)) {
1555                 if (!PageHighMem(page)) {
1556                         if (level == PT_PTE && USE_SPLIT_PTLOCKS)
1557                                 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1558                         make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1559                 }
1560                 ClearPagePinned(page);
1561         }
1562 }
1563
1564 static void xen_release_pte(unsigned long pfn)
1565 {
1566         xen_release_ptpage(pfn, PT_PTE);
1567 }
1568
1569 static void xen_release_pmd(unsigned long pfn)
1570 {
1571         xen_release_ptpage(pfn, PT_PMD);
1572 }
1573
1574 #if PAGETABLE_LEVELS == 4
1575 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1576 {
1577         xen_alloc_ptpage(mm, pfn, PT_PUD);
1578 }
1579
1580 static void xen_release_pud(unsigned long pfn)
1581 {
1582         xen_release_ptpage(pfn, PT_PUD);
1583 }
1584 #endif
1585
1586 void __init xen_reserve_top(void)
1587 {
1588 #ifdef CONFIG_X86_32
1589         unsigned long top = HYPERVISOR_VIRT_START;
1590         struct xen_platform_parameters pp;
1591
1592         if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1593                 top = pp.virt_start;
1594
1595         reserve_top_address(-top);
1596 #endif  /* CONFIG_X86_32 */
1597 }
1598
1599 /*
1600  * Like __va(), but returns address in the kernel mapping (which is
1601  * all we have until the physical memory mapping has been set up.
1602  */
1603 static void *__ka(phys_addr_t paddr)
1604 {
1605 #ifdef CONFIG_X86_64
1606         return (void *)(paddr + __START_KERNEL_map);
1607 #else
1608         return __va(paddr);
1609 #endif
1610 }
1611
1612 /* Convert a machine address to physical address */
1613 static unsigned long m2p(phys_addr_t maddr)
1614 {
1615         phys_addr_t paddr;
1616
1617         maddr &= PTE_PFN_MASK;
1618         paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1619
1620         return paddr;
1621 }
1622
1623 /* Convert a machine address to kernel virtual */
1624 static void *m2v(phys_addr_t maddr)
1625 {
1626         return __ka(m2p(maddr));
1627 }
1628
1629 /* Set the page permissions on an identity-mapped pages */
1630 static void set_page_prot(void *addr, pgprot_t prot)
1631 {
1632         unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1633         pte_t pte = pfn_pte(pfn, prot);
1634
1635         if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1636                 BUG();
1637 }
1638
1639 static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1640 {
1641         unsigned pmdidx, pteidx;
1642         unsigned ident_pte;
1643         unsigned long pfn;
1644
1645         level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1646                                       PAGE_SIZE);
1647
1648         ident_pte = 0;
1649         pfn = 0;
1650         for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1651                 pte_t *pte_page;
1652
1653                 /* Reuse or allocate a page of ptes */
1654                 if (pmd_present(pmd[pmdidx]))
1655                         pte_page = m2v(pmd[pmdidx].pmd);
1656                 else {
1657                         /* Check for free pte pages */
1658                         if (ident_pte == LEVEL1_IDENT_ENTRIES)
1659                                 break;
1660
1661                         pte_page = &level1_ident_pgt[ident_pte];
1662                         ident_pte += PTRS_PER_PTE;
1663
1664                         pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1665                 }
1666
1667                 /* Install mappings */
1668                 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1669                         pte_t pte;
1670
1671                         if (pfn > max_pfn_mapped)
1672                                 max_pfn_mapped = pfn;
1673
1674                         if (!pte_none(pte_page[pteidx]))
1675                                 continue;
1676
1677                         pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1678                         pte_page[pteidx] = pte;
1679                 }
1680         }
1681
1682         for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1683                 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1684
1685         set_page_prot(pmd, PAGE_KERNEL_RO);
1686 }
1687
1688 void __init xen_setup_machphys_mapping(void)
1689 {
1690         struct xen_machphys_mapping mapping;
1691         unsigned long machine_to_phys_nr_ents;
1692
1693         if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1694                 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1695                 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1696         } else {
1697                 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1698         }
1699         machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1700 }
1701
1702 #ifdef CONFIG_X86_64
1703 static void convert_pfn_mfn(void *v)
1704 {
1705         pte_t *pte = v;
1706         int i;
1707
1708         /* All levels are converted the same way, so just treat them
1709            as ptes. */
1710         for (i = 0; i < PTRS_PER_PTE; i++)
1711                 pte[i] = xen_make_pte(pte[i].pte);
1712 }
1713
1714 /*
1715  * Set up the inital kernel pagetable.
1716  *
1717  * We can construct this by grafting the Xen provided pagetable into
1718  * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
1719  * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
1720  * means that only the kernel has a physical mapping to start with -
1721  * but that's enough to get __va working.  We need to fill in the rest
1722  * of the physical mapping once some sort of allocator has been set
1723  * up.
1724  */
1725 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1726                                          unsigned long max_pfn)
1727 {
1728         pud_t *l3;
1729         pmd_t *l2;
1730
1731         /* Zap identity mapping */
1732         init_level4_pgt[0] = __pgd(0);
1733
1734         /* Pre-constructed entries are in pfn, so convert to mfn */
1735         convert_pfn_mfn(init_level4_pgt);
1736         convert_pfn_mfn(level3_ident_pgt);
1737         convert_pfn_mfn(level3_kernel_pgt);
1738
1739         l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1740         l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1741
1742         memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1743         memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1744
1745         l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1746         l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1747         memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1748
1749         /* Set up identity map */
1750         xen_map_identity_early(level2_ident_pgt, max_pfn);
1751
1752         /* Make pagetable pieces RO */
1753         set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1754         set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1755         set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1756         set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1757         set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1758         set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1759
1760         /* Pin down new L4 */
1761         pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1762                           PFN_DOWN(__pa_symbol(init_level4_pgt)));
1763
1764         /* Unpin Xen-provided one */
1765         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1766
1767         /* Switch over */
1768         pgd = init_level4_pgt;
1769
1770         /*
1771          * At this stage there can be no user pgd, and no page
1772          * structure to attach it to, so make sure we just set kernel
1773          * pgd.
1774          */
1775         xen_mc_batch();
1776         __xen_write_cr3(true, __pa(pgd));
1777         xen_mc_issue(PARAVIRT_LAZY_CPU);
1778
1779         memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1780                       __pa(xen_start_info->pt_base +
1781                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1782                       "XEN PAGETABLES");
1783
1784         return pgd;
1785 }
1786 #else   /* !CONFIG_X86_64 */
1787 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1788 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1789
1790 static __init void xen_write_cr3_init(unsigned long cr3)
1791 {
1792         unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1793
1794         BUG_ON(read_cr3() != __pa(initial_page_table));
1795         BUG_ON(cr3 != __pa(swapper_pg_dir));
1796
1797         /*
1798          * We are switching to swapper_pg_dir for the first time (from
1799          * initial_page_table) and therefore need to mark that page
1800          * read-only and then pin it.
1801          *
1802          * Xen disallows sharing of kernel PMDs for PAE
1803          * guests. Therefore we must copy the kernel PMD from
1804          * initial_page_table into a new kernel PMD to be used in
1805          * swapper_pg_dir.
1806          */
1807         swapper_kernel_pmd =
1808                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1809         memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1810                sizeof(pmd_t) * PTRS_PER_PMD);
1811         swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1812                 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1813         set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1814
1815         set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1816         xen_write_cr3(cr3);
1817         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1818
1819         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1820                           PFN_DOWN(__pa(initial_page_table)));
1821         set_page_prot(initial_page_table, PAGE_KERNEL);
1822         set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1823
1824         pv_mmu_ops.write_cr3 = &xen_write_cr3;
1825 }
1826
1827 __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1828                                          unsigned long max_pfn)
1829 {
1830         pmd_t *kernel_pmd;
1831
1832         initial_kernel_pmd =
1833                 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1834
1835         max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1836                                   xen_start_info->nr_pt_frames * PAGE_SIZE +
1837                                   512*1024);
1838
1839         kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1840         memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1841
1842         xen_map_identity_early(initial_kernel_pmd, max_pfn);
1843
1844         memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1845         initial_page_table[KERNEL_PGD_BOUNDARY] =
1846                 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1847
1848         set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1849         set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1850         set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1851
1852         pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1853
1854         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1855                           PFN_DOWN(__pa(initial_page_table)));
1856         xen_write_cr3(__pa(initial_page_table));
1857
1858         memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1859                       __pa(xen_start_info->pt_base +
1860                            xen_start_info->nr_pt_frames * PAGE_SIZE),
1861                       "XEN PAGETABLES");
1862
1863         return initial_page_table;
1864 }
1865 #endif  /* CONFIG_X86_64 */
1866
/*
 * Stand-in page that xen_set_fixmap() maps at the local APIC and IO APIC
 * fixmap slots; xen_init_mmu_ops() fills it with 0xff bytes.
 */
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1868
1869 static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1870 {
1871         pte_t pte;
1872
1873         phys >>= PAGE_SHIFT;
1874
1875         switch (idx) {
1876         case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
1877 #ifdef CONFIG_X86_F00F_BUG
1878         case FIX_F00F_IDT:
1879 #endif
1880 #ifdef CONFIG_X86_32
1881         case FIX_WP_TEST:
1882         case FIX_VDSO:
1883 # ifdef CONFIG_HIGHMEM
1884         case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1885 # endif
1886 #else
1887         case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1888 #endif
1889         case FIX_TEXT_POKE0:
1890         case FIX_TEXT_POKE1:
1891                 /* All local page mappings */
1892                 pte = pfn_pte(phys, prot);
1893                 break;
1894
1895 #ifdef CONFIG_X86_LOCAL_APIC
1896         case FIX_APIC_BASE:     /* maps dummy local APIC */
1897                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1898                 break;
1899 #endif
1900
1901 #ifdef CONFIG_X86_IO_APIC
1902         case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1903                 /*
1904                  * We just don't map the IO APIC - all access is via
1905                  * hypercalls.  Keep the address in the pte for reference.
1906                  */
1907                 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1908                 break;
1909 #endif
1910
1911         case FIX_PARAVIRT_BOOTMAP:
1912                 /* This is an MFN, but it isn't an IO mapping from the
1913                    IO domain */
1914                 pte = mfn_pte(phys, prot);
1915                 break;
1916
1917         default:
1918                 /* By default, set_fixmap is used for hardware mappings */
1919                 pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
1920                 break;
1921         }
1922
1923         __native_set_fixmap(idx, pte);
1924
1925 #ifdef CONFIG_X86_64
1926         /* Replicate changes to map the vsyscall page into the user
1927            pagetable vsyscall mapping. */
1928         if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1929                 unsigned long vaddr = __fix_to_virt(idx);
1930                 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1931         }
1932 #endif
1933 }
1934
1935 __init void xen_ident_map_ISA(void)
1936 {
1937         unsigned long pa;
1938
1939         /*
1940          * If we're dom0, then linear map the ISA machine addresses into
1941          * the kernel's address space.
1942          */
1943         if (!xen_initial_domain())
1944                 return;
1945
1946         xen_raw_printk("Xen: setup ISA identity maps\n");
1947
1948         for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1949                 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1950
1951                 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1952                         BUG();
1953         }
1954
1955         xen_flush_tlb();
1956 }
1957
1958 static __init void xen_post_allocator_init(void)
1959 {
1960         pv_mmu_ops.set_pte = xen_set_pte;
1961         pv_mmu_ops.set_pmd = xen_set_pmd;
1962         pv_mmu_ops.set_pud = xen_set_pud;
1963 #if PAGETABLE_LEVELS == 4
1964         pv_mmu_ops.set_pgd = xen_set_pgd;
1965 #endif
1966
1967         /* This will work as long as patching hasn't happened yet
1968            (which it hasn't) */
1969         pv_mmu_ops.alloc_pte = xen_alloc_pte;
1970         pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1971         pv_mmu_ops.release_pte = xen_release_pte;
1972         pv_mmu_ops.release_pmd = xen_release_pmd;
1973 #if PAGETABLE_LEVELS == 4
1974         pv_mmu_ops.alloc_pud = xen_alloc_pud;
1975         pv_mmu_ops.release_pud = xen_release_pud;
1976 #endif
1977
1978 #ifdef CONFIG_X86_64
1979         SetPagePinned(virt_to_page(level3_user_vsyscall));
1980 #endif
1981         xen_mark_init_mm_pinned();
1982 }
1983
/*
 * lazy_mode.leave hook installed through xen_mmu_ops.  With preemption
 * disabled, calls xen_mc_flush() (presumably issuing any multicalls
 * still queued - confirm against its definition) and then leaves lazy
 * MMU mode.
 */
static void xen_leave_lazy_mmu(void)
{
        preempt_disable();
        xen_mc_flush();
        paravirt_leave_lazy_mmu();
        preempt_enable();
}
1991
/*
 * Boot-time template for pv_mmu_ops under Xen.  It is __initdata:
 * xen_init_mmu_ops() copies it into pv_mmu_ops by value.  Several
 * entries are deliberately early-boot variants that get replaced later:
 *  - write_cr3 (32-bit) is swapped by xen_write_cr3_init() itself;
 *  - set_pte/set_pmd/set_pud/set_pgd and the alloc_/release_ hooks are
 *    swapped by xen_post_allocator_init().
 */
static const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .read_cr2 = xen_read_cr2,
        .write_cr2 = xen_write_cr2,

        .read_cr3 = xen_read_cr3,
#ifdef CONFIG_X86_32
        /* One-shot hook for the first switch to swapper_pg_dir. */
        .write_cr3 = xen_write_cr3_init,
#else
        .write_cr3 = xen_write_cr3,
#endif

        .flush_tlb_user = xen_flush_tlb,
        .flush_tlb_kernel = xen_flush_tlb,
        .flush_tlb_single = xen_flush_tlb_single,
        .flush_tlb_others = xen_flush_tlb_others,

        .pte_update = paravirt_nop,
        .pte_update_defer = paravirt_nop,

        .pgd_alloc = xen_pgd_alloc,
        .pgd_free = xen_pgd_free,

        /* Early variants; replaced in xen_post_allocator_init(). */
        .alloc_pte = xen_alloc_pte_init,
        .release_pte = xen_release_pte_init,
        .alloc_pmd = xen_alloc_pmd_init,
        .release_pmd = xen_release_pmd_init,

        /* set_pte and set_pmd are replaced in xen_post_allocator_init(). */
        .set_pte = xen_set_pte_init,
        .set_pte_at = xen_set_pte_at,
        .set_pmd = xen_set_pmd_hyper,

        .ptep_modify_prot_start = __ptep_modify_prot_start,
        .ptep_modify_prot_commit = __ptep_modify_prot_commit,

        .pte_val = PV_CALLEE_SAVE(xen_pte_val),
        .pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

        .make_pte = PV_CALLEE_SAVE(xen_make_pte),
        .make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
        .set_pte_atomic = xen_set_pte_atomic,
        .pte_clear = xen_pte_clear,
        .pmd_clear = xen_pmd_clear,
#endif  /* CONFIG_X86_PAE */
        /* Replaced in xen_post_allocator_init(). */
        .set_pud = xen_set_pud_hyper,

        .make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
        .pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if PAGETABLE_LEVELS == 4
        .pud_val = PV_CALLEE_SAVE(xen_pud_val),
        .make_pud = PV_CALLEE_SAVE(xen_make_pud),
        .set_pgd = xen_set_pgd_hyper,

        /* Early boot reuses the pmd init hooks for pud pages. */
        .alloc_pud = xen_alloc_pmd_init,
        .release_pud = xen_release_pmd_init,
#endif  /* PAGETABLE_LEVELS == 4 */

        .activate_mm = xen_activate_mm,
        .dup_mmap = xen_dup_mmap,
        .exit_mmap = xen_exit_mmap,

        .lazy_mode = {
                .enter = paravirt_enter_lazy_mmu,
                .leave = xen_leave_lazy_mmu,
        },

        .set_fixmap = xen_set_fixmap,
};
2062
2063 void __init xen_init_mmu_ops(void)
2064 {
2065         x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2066         x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2067         pv_mmu_ops = xen_mmu_ops;
2068
2069         memset(dummy_mapping, 0xff, PAGE_SIZE);
2070 }
2071
/*
 * Largest order accepted by xen_create_contiguous_region() and
 * xen_destroy_contiguous_region(); 2^9 pages.
 */
#define MAX_CONTIG_ORDER 9 /* 2MB */
/*
 * Frame list shared by both of those functions.
 * Protected by xen_reservation_lock.
 */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];
2075
2076 #define VOID_PTE (mfn_pte(0, __pgprot(0)))
2077 static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2078                                 unsigned long *in_frames,
2079                                 unsigned long *out_frames)
2080 {
2081         int i;
2082         struct multicall_space mcs;
2083
2084         xen_mc_batch();
2085         for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
2086                 mcs = __xen_mc_entry(0);
2087
2088                 if (in_frames)
2089                         in_frames[i] = virt_to_mfn(vaddr);
2090
2091                 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2092                 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2093
2094                 if (out_frames)
2095                         out_frames[i] = virt_to_pfn(vaddr);
2096         }
2097         xen_mc_issue(0);
2098 }
2099
2100 /*
2101  * Update the pfn-to-mfn mappings for a virtual address range, either to
2102  * point to an array of mfns, or contiguously from a single starting
2103  * mfn.
2104  */
2105 static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
2106                                      unsigned long *mfns,
2107                                      unsigned long first_mfn)
2108 {
2109         unsigned i, limit;
2110         unsigned long mfn;
2111
2112         xen_mc_batch();
2113
2114         limit = 1u << order;
2115         for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
2116                 struct multicall_space mcs;
2117                 unsigned flags;
2118
2119                 mcs = __xen_mc_entry(0);
2120                 if (mfns)
2121                         mfn = mfns[i];
2122                 else
2123                         mfn = first_mfn + i;
2124
2125                 if (i < (limit - 1))
2126                         flags = 0;
2127                 else {
2128                         if (order == 0)
2129                                 flags = UVMF_INVLPG | UVMF_ALL;
2130                         else
2131                                 flags = UVMF_TLB_FLUSH | UVMF_ALL;
2132                 }
2133
2134                 MULTI_update_va_mapping(mcs.mc, vaddr,
2135                                 mfn_pte(mfn, PAGE_KERNEL), flags);
2136
2137                 set_phys_to_machine(virt_to_pfn(vaddr), mfn);
2138         }
2139
2140         xen_mc_issue(0);
2141 }
2142
2143 /*
2144  * Perform the hypercall to exchange a region of our pfns to point to
2145  * memory with the required contiguous alignment.  Takes the pfns as
2146  * input, and populates mfns as output.
2147  *
2148  * Returns a success code indicating whether the hypervisor was able to
2149  * satisfy the request or not.
2150  */
2151 static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
2152                                unsigned long *pfns_in,
2153                                unsigned long extents_out,
2154                                unsigned int order_out,
2155                                unsigned long *mfns_out,
2156                                unsigned int address_bits)
2157 {
2158         long rc;
2159         int success;
2160
2161         struct xen_memory_exchange exchange = {
2162                 .in = {
2163                         .nr_extents   = extents_in,
2164                         .extent_order = order_in,
2165                         .extent_start = pfns_in,
2166                         .domid        = DOMID_SELF
2167                 },
2168                 .out = {
2169                         .nr_extents   = extents_out,
2170                         .extent_order = order_out,
2171                         .extent_start = mfns_out,
2172                         .address_bits = address_bits,
2173                         .domid        = DOMID_SELF
2174                 }
2175         };
2176
2177         BUG_ON(extents_in << order_in != extents_out << order_out);
2178
2179         rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
2180         success = (exchange.nr_exchanged == extents_in);
2181
2182         BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
2183         BUG_ON(success && (rc != 0));
2184
2185         return success;
2186 }
2187
2188 int xen_create_contiguous_region(unsigned long vstart, unsigned int order,
2189                                  unsigned int address_bits)
2190 {
2191         unsigned long *in_frames = discontig_frames, out_frame;
2192         unsigned long  flags;
2193         int            success;
2194
2195         /*
2196          * Currently an auto-translated guest will not perform I/O, nor will
2197          * it require PAE page directories below 4GB. Therefore any calls to
2198          * this function are redundant and can be ignored.
2199          */
2200
2201         if (xen_feature(XENFEAT_auto_translated_physmap))
2202                 return 0;
2203
2204         if (unlikely(order > MAX_CONTIG_ORDER))
2205                 return -ENOMEM;
2206
2207         memset((void *) vstart, 0, PAGE_SIZE << order);
2208
2209         spin_lock_irqsave(&xen_reservation_lock, flags);
2210
2211         /* 1. Zap current PTEs, remembering MFNs. */
2212         xen_zap_pfn_range(vstart, order, in_frames, NULL);
2213
2214         /* 2. Get a new contiguous memory extent. */
2215         out_frame = virt_to_pfn(vstart);
2216         success = xen_exchange_memory(1UL << order, 0, in_frames,
2217                                       1, order, &out_frame,
2218                                       address_bits);
2219
2220         /* 3. Map the new extent in place of old pages. */
2221         if (success)
2222                 xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
2223         else
2224                 xen_remap_exchanged_ptes(vstart, order, in_frames, 0);
2225
2226         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2227
2228         return success ? 0 : -ENOMEM;
2229 }
2230 EXPORT_SYMBOL_GPL(xen_create_contiguous_region);
2231
2232 void xen_destroy_contiguous_region(unsigned long vstart, unsigned int order)
2233 {
2234         unsigned long *out_frames = discontig_frames, in_frame;
2235         unsigned long  flags;
2236         int success;
2237
2238         if (xen_feature(XENFEAT_auto_translated_physmap))
2239                 return;
2240
2241         if (unlikely(order > MAX_CONTIG_ORDER))
2242                 return;
2243
2244         memset((void *) vstart, 0, PAGE_SIZE << order);
2245
2246         spin_lock_irqsave(&xen_reservation_lock, flags);
2247
2248         /* 1. Find start MFN of contiguous extent. */
2249         in_frame = virt_to_mfn(vstart);
2250
2251         /* 2. Zap current PTEs. */
2252         xen_zap_pfn_range(vstart, order, NULL, out_frames);
2253
2254         /* 3. Do the exchange for non-contiguous MFNs. */
2255         success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
2256                                         0, out_frames, 0);
2257
2258         /* 4. Map new pages in place of old pages. */
2259         if (success)
2260                 xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
2261         else
2262                 xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);
2263
2264         spin_unlock_irqrestore(&xen_reservation_lock, flags);
2265 }
2266 EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region);
2267
2268 #ifdef CONFIG_XEN_PVHVM
2269 static void xen_hvm_exit_mmap(struct mm_struct *mm)
2270 {
2271         struct xen_hvm_pagetable_dying a;
2272         int rc;
2273
2274         a.domid = DOMID_SELF;
2275         a.gpa = __pa(mm->pgd);
2276         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2277         WARN_ON_ONCE(rc < 0);
2278 }
2279
2280 static int is_pagetable_dying_supported(void)
2281 {
2282         struct xen_hvm_pagetable_dying a;
2283         int rc = 0;
2284
2285         a.domid = DOMID_SELF;
2286         a.gpa = 0x00;
2287         rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a);
2288         if (rc < 0) {
2289                 printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n");
2290                 return 0;
2291         }
2292         return 1;
2293 }
2294
2295 void __init xen_hvm_init_mmu_ops(void)
2296 {
2297         if (is_pagetable_dying_supported())
2298                 pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap;
2299 }
2300 #endif
2301
/* Number of mmu_update requests sent to the hypervisor per hypercall. */
#define REMAP_BATCH_SIZE 16

/*
 * Cursor state shared between xen_remap_domain_mfn_range() and its
 * per-pte callback remap_area_mfn_pte_fn().
 */
struct remap_data {
        unsigned long mfn;              /* next machine frame to map */
        pgprot_t prot;                  /* protection for every pte */
        struct mmu_update *mmu_update;  /* next free request slot */
};
2309
2310 static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2311                                  unsigned long addr, void *data)
2312 {
2313         struct remap_data *rmd = data;
2314         pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2315
2316         rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr;
2317         rmd->mmu_update->val = pte_val_ma(pte);
2318         rmd->mmu_update++;
2319
2320         return 0;
2321 }
2322
2323 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2324                                unsigned long addr,
2325                                unsigned long mfn, int nr,
2326                                pgprot_t prot, unsigned domid)
2327 {
2328         struct remap_data rmd;
2329         struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2330         int batch;
2331         unsigned long range;
2332         int err = 0;
2333
2334         prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2335
2336         BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2337                                 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2338
2339         rmd.mfn = mfn;
2340         rmd.prot = prot;
2341
2342         while (nr) {
2343                 batch = min(REMAP_BATCH_SIZE, nr);
2344                 range = (unsigned long)batch << PAGE_SHIFT;
2345
2346                 rmd.mmu_update = mmu_update;
2347                 err = apply_to_page_range(vma->vm_mm, addr, range,
2348                                           remap_area_mfn_pte_fn, &rmd);
2349                 if (err)
2350                         goto out;
2351
2352                 err = -EFAULT;
2353                 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2354                         goto out;
2355
2356                 nr -= batch;
2357                 addr += range;
2358         }
2359
2360         err = 0;
2361 out:
2362
2363         flush_tlb_all();
2364
2365         return err;
2366 }
2367 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2368
2369 #ifdef CONFIG_XEN_DEBUG_FS
2370
/*
 * open() handler for the debugfs "p2m" file: sets up a single-shot
 * seq_file whose contents are produced by p2m_dump_show(), with no
 * private data.
 */
static int p2m_dump_open(struct inode *inode, struct file *filp)
{
        return single_open(filp, p2m_dump_show, NULL);
}
2375
/*
 * File operations for the debugfs "p2m" file: a seq_file opened via
 * single_open(), so read/llseek/release use the matching seq_file
 * helpers.
 */
static const struct file_operations p2m_dump_fops = {
        .open           = p2m_dump_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
2382
/* The "mmu" debugfs directory, created by xen_mmu_debugfs(). */
static struct dentry *d_mmu_debug;
2384
2385 static int __init xen_mmu_debugfs(void)
2386 {
2387         struct dentry *d_xen = xen_init_debugfs();
2388
2389         if (d_xen == NULL)
2390                 return -ENOMEM;
2391
2392         d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2393
2394         debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2395
2396         debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2397         debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2398                            &mmu_stats.pgd_update_pinned);
2399         debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2400                            &mmu_stats.pgd_update_pinned);
2401
2402         debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2403         debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2404                            &mmu_stats.pud_update_pinned);
2405         debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2406                            &mmu_stats.pud_update_pinned);
2407
2408         debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2409         debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2410                            &mmu_stats.pmd_update_pinned);
2411         debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2412                            &mmu_stats.pmd_update_pinned);
2413
2414         debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2415 //      debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2416 //                         &mmu_stats.pte_update_pinned);
2417         debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2418                            &mmu_stats.pte_update_pinned);
2419
2420         debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2421         debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2422                            &mmu_stats.mmu_update_extended);
2423         xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2424                                      mmu_stats.mmu_update_histo, 20);
2425
2426         debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2427         debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2428                            &mmu_stats.set_pte_at_batched);
2429         debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2430                            &mmu_stats.set_pte_at_current);
2431         debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2432                            &mmu_stats.set_pte_at_kernel);
2433
2434         debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2435         debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2436                            &mmu_stats.prot_commit_batched);
2437
2438         debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2439         return 0;
2440 }
2441 fs_initcall(xen_mmu_debugfs);
2442
2443 #endif  /* CONFIG_XEN_DEBUG_FS */