From: Ingo Molnar Date: Mon, 21 Jul 2008 14:37:17 +0000 (+0200) Subject: Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core... X-Git-Tag: firefly_0821_release~19458^2~8 X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=acee709cab689ec7703770e8b8cb5cc3a4abcb31;p=firefly-linux-kernel-4.4.55.git Merge branches 'x86/urgent', 'x86/amd-iommu', 'x86/apic', 'x86/cleanups', 'x86/core', 'x86/cpu', 'x86/fixmap', 'x86/gart', 'x86/kprobes', 'x86/memtest', 'x86/modules', 'x86/nmi', 'x86/pat', 'x86/reboot', 'x86/setup', 'x86/step', 'x86/unify-pci', 'x86/uv', 'x86/xen' and 'xen-64bit' into x86/for-linus --- acee709cab689ec7703770e8b8cb5cc3a4abcb31 diff --cc arch/x86/ia32/ia32entry.S index 20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,b5e329da166c,20371d0635e4,20371d0635e4,20371d0635e4,20371d0635e4,8796d1905255,20371d0635e4,20371d0635e4,20371d0635e4,0ae1e77eae50..23d146ce676b --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@@@@@@@@@@@@@@@@@@@@ -136,14 -136,14 -136,14 -136,14 -136,14 -136,14 -136,14 -136,14 -136,14 -136,14 -123,13 -136,14 -136,14 -136,14 -136,14 -140,13 -136,14 -136,14 -136,14 -136,14 +140,13 @@@@@@@@@@@@@@@@@@@@@ ENTRY(ia32_sysenter_target .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + orl $TS_COMPAT,TI_status(%r10) ---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ ---------- ---- ---- TI_flags(%r10) +++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys --------------- ----sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys +++++++++++++++ ++++sysenter_do_call: IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@@@@@@@@@@@@@@@@@@@@ -241,9 -241,9 -241,9 
-241,9 -241,9 -241,9 -241,9 -241,9 -241,9 -241,9 -230,8 -241,9 -241,9 -241,9 -241,9 -244,8 -241,9 -241,9 -241,9 -241,9 +244,8 @@@@@@@@@@@@@@@@@@@@@ ENTRY(ia32_cstar_target .quad 1b,ia32_badarg .previous GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + orl $TS_COMPAT,TI_status(%r10) ---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ ---------- ---- ---- TI_flags(%r10) +++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: @@@@@@@@@@@@@@@@@@@@@ -321,7 -321,7 -321,7 -321,7 -321,7 -321,7 -321,7 -321,7 -321,7 -321,7 -310,7 -321,7 -321,7 -321,7 -321,7 -323,7 -321,7 -321,7 -321,7 -321,8 +323,8 @@@@@@@@@@@@@@@@@@@@@ ENTRY(ia32_syscall /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP - swapgs +++++++++++++++++++ PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS /* * No need to follow this irqs on/off section: the syscall * disabled irqs and here we enable it straight after entry: @@@@@@@@@@@@@@@@@@@@@ -335,9 -335,9 -335,9 -335,9 -335,9 -335,9 -335,9 -335,9 -335,9 -335,9 -324,8 -335,9 -335,9 -335,9 -335,9 -337,8 -335,9 -335,9 -335,9 -336,9 +338,8 @@@@@@@@@@@@@@@@@@@@@ this could be a problem. 
*/ SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) - orl $TS_COMPAT,threadinfo_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%r10) + orl $TS_COMPAT,TI_status(%r10) ---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ ---------- ---- ---- TI_flags(%r10) +++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax diff --cc arch/x86/kernel/amd_iommu.c index f2766d84c7a0,8c3deb027d3a,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,cf2f74bcde53,f2766d84c7a0,f2766d84c7a0,000000000000,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0,f2766d84c7a0..c25210e6ac88 mode 100644,100644,100644,100644,100644,100644,100644,100644,100644,100644,000000,100644,100644,100644,100644,100644,100644,100644,100644,100644..100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@@@@@@@@@@@@@@@@@@@@ -1,962 -1,1167 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 -1,0 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 -1,962 +1,1167 @@@@@@@@@@@@@@@@@@@@@ + /* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * Leo Duran + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + #include + #include + #include + #include + #include + #include ------- -- ---------#include +++++++ ++++++++++++#include + #include + #include + + #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) + + #define to_pages(addr, size) \ + (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) + + ++++++++++++++++++#define EXIT_LOOP_COUNT 10000000 + ++++++++++++++++++ + static DEFINE_RWLOCK(amd_iommu_devtable_lock); + - -------- ---------struct command { + ++++++++++++++++++/* + ++++++++++++++++++ * general struct to manage commands send to an IOMMU + ++++++++++++++++++ */ + ++++++++++++++++++struct iommu_cmd { + u32 data[4]; + }; + + static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, + struct unity_map_entry *e); + + ++++++++++++++++++/* returns !0 if the IOMMU is caching non-present entries in its TLB */ + static int iommu_has_npcache(struct amd_iommu *iommu) + { + return iommu->cap & IOMMU_CAP_NPCACHE; + } + - -------- ---------static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * IOMMU command queuing functions + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * Writes the command to the IOMMUs command buffer and informs the + ++++++++++++++++++ * hardware about the new command. Must be called with iommu->lock held. 
+ ++++++++++++++++++ */ + ++++++++++++++++++static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) + { + u32 tail, head; + u8 *target; + + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + target = (iommu->cmd_buf + tail); + memcpy_toio(target, cmd, sizeof(*cmd)); + tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + if (tail == head) + return -ENOMEM; + writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + return 0; + } + - -------- ---------static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) + ++++++++++++++++++/* + ++++++++++++++++++ * General queuing function for commands. Takes iommu->lock and calls + ++++++++++++++++++ * __iommu_queue_command(). + ++++++++++++++++++ */ + ++++++++++++++++++static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) + { + unsigned long flags; + int ret; + + spin_lock_irqsave(&iommu->lock, flags); + ret = __iommu_queue_command(iommu, cmd); + spin_unlock_irqrestore(&iommu->lock, flags); + + return ret; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function is called whenever we need to ensure that the IOMMU has + ++++++++++++++++++ * completed execution of all commands we sent. It sends a + ++++++++++++++++++ * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs + ++++++++++++++++++ * us about that by writing a value to a physical address we pass with + ++++++++++++++++++ * the command. 
+ ++++++++++++++++++ */ + static int iommu_completion_wait(struct amd_iommu *iommu) + { + int ret; - -------- --------- struct command cmd; + ++++++++++++++++++ struct iommu_cmd cmd; + volatile u64 ready = 0; + unsigned long ready_phys = virt_to_phys(&ready); + ++++++++++++++++++ unsigned long i = 0; + + memset(&cmd, 0, sizeof(cmd)); + cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - -------- --------- cmd.data[1] = HIGH_U32(ready_phys); + ++++++++++++++++++ cmd.data[1] = upper_32_bits(ready_phys); + cmd.data[2] = 1; /* value written to 'ready' */ + CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); + + iommu->need_sync = 0; + + ret = iommu_queue_command(iommu, &cmd); + + if (ret) + return ret; + - -------- --------- while (!ready) + ++++++++++++++++++ while (!ready && (i < EXIT_LOOP_COUNT)) { + ++++++++++++++++++ ++i; + cpu_relax(); + ++++++++++++++++++ } + ++++++++++++++++++ + ++++++++++++++++++ if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) + ++++++++++++++++++ printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Command send function for invalidating a device table entry + ++++++++++++++++++ */ + static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) + { - -------- --------- struct command cmd; + ++++++++++++++++++ struct iommu_cmd cmd; + + BUG_ON(iommu == NULL); + + memset(&cmd, 0, sizeof(cmd)); + CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); + cmd.data[0] = devid; + + iommu->need_sync = 1; + + return iommu_queue_command(iommu, &cmd); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Generic command send function for invalidaing TLB entries + ++++++++++++++++++ */ + static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, + u64 address, u16 domid, int pde, int s) + { - -------- --------- struct command cmd; + ++++++++++++++++++ struct iommu_cmd cmd; + + memset(&cmd, 0, sizeof(cmd)); + address &= PAGE_MASK; + CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); + 
cmd.data[1] |= domid; + cmd.data[2] = LOW_U32(address); - -------- --------- cmd.data[3] = HIGH_U32(address); - -------- --------- if (s) + ++++++++++++++++++ cmd.data[3] = upper_32_bits(address); + ++++++++++++++++++ if (s) /* size bit - we flush more than one 4kb page */ + cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - -------- --------- if (pde) + ++++++++++++++++++ if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ + cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + + iommu->need_sync = 1; + + return iommu_queue_command(iommu, &cmd); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * TLB invalidation function which is called from the mapping functions. + ++++++++++++++++++ * It invalidates a single PTE if the range to flush is within a single + ++++++++++++++++++ * page. Otherwise it flushes the whole TLB of the IOMMU. + ++++++++++++++++++ */ + static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, + u64 address, size_t size) + { + int s = 0; + unsigned pages = to_pages(address, size); + + address &= PAGE_MASK; + + if (pages > 1) { + /* + * If we have to flush more than one page, flush all + * TLB entries for this domain + */ + address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + s = 1; + } + + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s); + + return 0; + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The functions below are used the create the page table mappings for + ++++++++++++++++++ * unity mapped regions. + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * Generic mapping functions. It maps a physical address into a DMA + ++++++++++++++++++ * address space. It allocates the page table pages if necessary. 
+ ++++++++++++++++++ * In the future it can be extended to a generic mapping function + ++++++++++++++++++ * supporting all features of AMD IOMMU page tables like level skipping + ++++++++++++++++++ * and full 64 bit address spaces. + ++++++++++++++++++ */ + static int iommu_map(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long phys_addr, + int prot) + { + u64 __pte, *pte, *page; + + bus_addr = PAGE_ALIGN(bus_addr); + phys_addr = PAGE_ALIGN(bus_addr); + + /* only support 512GB address spaces for now */ + if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK)) + return -EINVAL; + + pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + *pte = IOMMU_L2_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)]; + + if (!IOMMU_PTE_PRESENT(*pte)) { + page = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + *pte = IOMMU_L1_PDE(virt_to_phys(page)); + } + + pte = IOMMU_PTE_PAGE(*pte); + pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)]; + + if (IOMMU_PTE_PRESENT(*pte)) + return -EBUSY; + + __pte = phys_addr | IOMMU_PTE_P; + if (prot & IOMMU_PROT_IR) + __pte |= IOMMU_PTE_IR; + if (prot & IOMMU_PROT_IW) + __pte |= IOMMU_PTE_IW; + + *pte = __pte; + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function checks if a specific unity mapping entry is needed for + ++++++++++++++++++ * this specific IOMMU. 
+ ++++++++++++++++++ */ + static int iommu_for_unity_map(struct amd_iommu *iommu, + struct unity_map_entry *entry) + { + u16 bdf, i; + + for (i = entry->devid_start; i <= entry->devid_end; ++i) { + bdf = amd_iommu_alias_table[i]; + if (amd_iommu_rlookup_table[bdf] == iommu) + return 1; + } + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Init the unity mappings for a specific IOMMU in the system + ++++++++++++++++++ * + ++++++++++++++++++ * Basically iterates over all unity mapping entries and applies them to + ++++++++++++++++++ * the default domain DMA of that IOMMU if necessary. + ++++++++++++++++++ */ + static int iommu_init_unity_mappings(struct amd_iommu *iommu) + { + struct unity_map_entry *entry; + int ret; + + list_for_each_entry(entry, &amd_iommu_unity_map, list) { + if (!iommu_for_unity_map(iommu, entry)) + continue; + ret = dma_ops_unity_map(iommu->default_dom, entry); + if (ret) + return ret; + } + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function actually applies the mapping to the page table of the + ++++++++++++++++++ * dma_ops domain. 
+ ++++++++++++++++++ */ + static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, + struct unity_map_entry *e) + { + u64 addr; + int ret; + + for (addr = e->address_start; addr < e->address_end; + addr += PAGE_SIZE) { + ret = iommu_map(&dma_dom->domain, addr, addr, e->prot); + if (ret) + return ret; + /* + * if unity mapping is in aperture range mark the page + * as allocated in the aperture + */ + if (addr < dma_dom->aperture_size) + __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap); + } + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Inits the unity mappings required for a specific device + ++++++++++++++++++ */ + static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, + u16 devid) + { + struct unity_map_entry *e; + int ret; + + list_for_each_entry(e, &amd_iommu_unity_map, list) { + if (!(devid >= e->devid_start && devid <= e->devid_end)) + continue; + ret = dma_ops_unity_map(dma_dom, e); + if (ret) + return ret; + } + + return 0; + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The next functions belong to the address allocator for the dma_ops + ++++++++++++++++++ * interface functions. They work like the allocators in the other IOMMU + ++++++++++++++++++ * drivers. Its basically a bitmap which marks the allocated pages in + ++++++++++++++++++ * the aperture. Maybe it could be enhanced in the future to a more + ++++++++++++++++++ * efficient allocator. + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + static unsigned long dma_mask_to_pages(unsigned long mask) + { + return (mask >> PAGE_SHIFT) + + (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The address allocator core function. 
+ ++++++++++++++++++ * + ++++++++++++++++++ * called with domain->lock held + ++++++++++++++++++ */ + static unsigned long dma_ops_alloc_addresses(struct device *dev, + struct dma_ops_domain *dom, + unsigned int pages) + { + unsigned long limit = dma_mask_to_pages(*dev->dma_mask); + unsigned long address; + unsigned long size = dom->aperture_size >> PAGE_SHIFT; + unsigned long boundary_size; + + boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + PAGE_SIZE) >> PAGE_SHIFT; + limit = limit < size ? limit : size; + + if (dom->next_bit >= limit) + dom->next_bit = 0; + + address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, + 0 , boundary_size, 0); + if (address == -1) + address = iommu_area_alloc(dom->bitmap, limit, 0, pages, + 0, boundary_size, 0); + + if (likely(address != -1)) { + dom->next_bit = address + pages; + address <<= PAGE_SHIFT; + } else + address = bad_dma_address; + + WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); + + return address; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The address free function. + ++++++++++++++++++ * + ++++++++++++++++++ * called with domain->lock held + ++++++++++++++++++ */ + static void dma_ops_free_addresses(struct dma_ops_domain *dom, + unsigned long address, + unsigned int pages) + { + address >>= PAGE_SHIFT; + iommu_area_free(dom->bitmap, address, pages); + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The next functions belong to the domain allocation. A domain is + ++++++++++++++++++ * allocated for every IOMMU as the default domain. If device isolation + ++++++++++++++++++ * is enabled, every device get its own domain. The most important thing + ++++++++++++++++++ * about domains is the page table mapping the DMA address space they + ++++++++++++++++++ * contain. 
+ ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + static u16 domain_id_alloc(void) + { + unsigned long flags; + int id; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + id = find_first_zero_bit(amd_iommu_pd_alloc_bitmap, MAX_DOMAIN_ID); + BUG_ON(id == 0); + if (id > 0 && id < MAX_DOMAIN_ID) + __set_bit(id, amd_iommu_pd_alloc_bitmap); + else + id = 0; + write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return id; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Used to reserve address ranges in the aperture (e.g. for exclusion + ++++++++++++++++++ * ranges. + ++++++++++++++++++ */ + static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, + unsigned long start_page, + unsigned int pages) + { + unsigned int last_page = dom->aperture_size >> PAGE_SHIFT; + + if (start_page + pages > last_page) + pages = last_page - start_page; + + set_bit_string(dom->bitmap, start_page, pages); + } + + static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) + { + int i, j; + u64 *p1, *p2, *p3; + + p1 = dma_dom->domain.pt_root; + + if (!p1) + return; + + for (i = 0; i < 512; ++i) { + if (!IOMMU_PTE_PRESENT(p1[i])) + continue; + + p2 = IOMMU_PTE_PAGE(p1[i]); + for (j = 0; j < 512; ++i) { + if (!IOMMU_PTE_PRESENT(p2[j])) + continue; + p3 = IOMMU_PTE_PAGE(p2[j]); + free_page((unsigned long)p3); + } + + free_page((unsigned long)p2); + } + + free_page((unsigned long)p1); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Free a domain, only used if something went wrong in the + ++++++++++++++++++ * allocation path and we need to free an already allocated page table + ++++++++++++++++++ */ + static void dma_ops_domain_free(struct dma_ops_domain *dom) + { + if (!dom) + return; + + dma_ops_free_pagetable(dom); + + kfree(dom->pte_pages); + + kfree(dom->bitmap); + + kfree(dom); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Allocates a new protection 
domain usable for the dma_ops functions. + ++++++++++++++++++ * It also intializes the page table and the address allocator data + ++++++++++++++++++ * structures required for the dma_ops interface + ++++++++++++++++++ */ + static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, + unsigned order) + { + struct dma_ops_domain *dma_dom; + unsigned i, num_pte_pages; + u64 *l2_pde; + u64 address; + + /* + * Currently the DMA aperture must be between 32 MB and 1GB in size + */ + if ((order < 25) || (order > 30)) + return NULL; + + dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL); + if (!dma_dom) + return NULL; + + spin_lock_init(&dma_dom->domain.lock); + + dma_dom->domain.id = domain_id_alloc(); + if (dma_dom->domain.id == 0) + goto free_dma_dom; + dma_dom->domain.mode = PAGE_MODE_3_LEVEL; + dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); + dma_dom->domain.priv = dma_dom; + if (!dma_dom->domain.pt_root) + goto free_dma_dom; + dma_dom->aperture_size = (1ULL << order); + dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8), + GFP_KERNEL); + if (!dma_dom->bitmap) + goto free_dma_dom; + /* + * mark the first page as allocated so we never return 0 as + * a valid dma-address. So we can use 0 as error value + */ + dma_dom->bitmap[0] = 1; + dma_dom->next_bit = 0; + + ++++++++++++++++++ /* Intialize the exclusion range if necessary */ + if (iommu->exclusion_start && + iommu->exclusion_start < dma_dom->aperture_size) { + unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; + int pages = to_pages(iommu->exclusion_start, + iommu->exclusion_length); + dma_ops_reserve_addresses(dma_dom, startpage, pages); + } + + ++++++++++++++++++ /* + ++++++++++++++++++ * At the last step, build the page tables so we don't need to + ++++++++++++++++++ * allocate page table pages in the dma_ops mapping/unmapping + ++++++++++++++++++ * path. 
+ ++++++++++++++++++ */ + num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); + dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), + GFP_KERNEL); + if (!dma_dom->pte_pages) + goto free_dma_dom; + + l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL); + if (l2_pde == NULL) + goto free_dma_dom; + + dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde)); + + for (i = 0; i < num_pte_pages; ++i) { + dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL); + if (!dma_dom->pte_pages[i]) + goto free_dma_dom; + address = virt_to_phys(dma_dom->pte_pages[i]); + l2_pde[i] = IOMMU_L1_PDE(address); + } + + return dma_dom; + + free_dma_dom: + dma_ops_domain_free(dma_dom); + + return NULL; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Find out the protection domain structure for a given PCI device. This + ++++++++++++++++++ * will give us the pointer to the page table root for example. + ++++++++++++++++++ */ + static struct protection_domain *domain_for_device(u16 devid) + { + struct protection_domain *dom; + unsigned long flags; + + read_lock_irqsave(&amd_iommu_devtable_lock, flags); + dom = amd_iommu_pd_table[devid]; + read_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + return dom; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * If a device is not yet associated with a domain, this function does + ++++++++++++++++++ * assigns it visible for the hardware + ++++++++++++++++++ */ + static void set_device_domain(struct amd_iommu *iommu, + struct protection_domain *domain, + u16 devid) + { + unsigned long flags; + + u64 pte_root = virt_to_phys(domain->pt_root); + + pte_root |= (domain->mode & 0x07) << 9; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; + + write_lock_irqsave(&amd_iommu_devtable_lock, flags); + amd_iommu_dev_table[devid].data[0] = pte_root; + amd_iommu_dev_table[devid].data[1] = pte_root >> 32; + amd_iommu_dev_table[devid].data[2] = domain->id; + + amd_iommu_pd_table[devid] = domain; + 
write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); + + iommu_queue_inv_dev_entry(iommu, devid); + + iommu->need_sync = 1; + } + + ++++++++++++++++++/***************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The next functions belong to the dma_ops mapping/unmapping code. + ++++++++++++++++++ * + ++++++++++++++++++ *****************************************************************************/ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * In the dma_ops path we only have the struct device. This function + ++++++++++++++++++ * finds the corresponding IOMMU, the protection domain and the + ++++++++++++++++++ * requestor id for a given device. + ++++++++++++++++++ * If the device is not yet associated with a domain this is also done + ++++++++++++++++++ * in this function. + ++++++++++++++++++ */ + static int get_device_resources(struct device *dev, + struct amd_iommu **iommu, + struct protection_domain **domain, + u16 *bdf) + { + struct dma_ops_domain *dma_dom; + struct pci_dev *pcidev; + u16 _bdf; + + BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); + + pcidev = to_pci_dev(dev); - -------- --------- _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + ++++++++++++++++++ _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + + ++++++++++++++++++ /* device not translated by any IOMMU in the system? 
*/ + if (_bdf >= amd_iommu_last_bdf) { + *iommu = NULL; + *domain = NULL; + *bdf = 0xffff; + return 0; + } + + *bdf = amd_iommu_alias_table[_bdf]; + + *iommu = amd_iommu_rlookup_table[*bdf]; + if (*iommu == NULL) + return 0; + dma_dom = (*iommu)->default_dom; + *domain = domain_for_device(*bdf); + if (*domain == NULL) { + *domain = &dma_dom->domain; + set_device_domain(*iommu, *domain, *bdf); + printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " + "device ", (*domain)->id); + print_devid(_bdf, 1); + } + + return 1; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This is the generic map function. It maps one 4kb page at paddr to + ++++++++++++++++++ * the given address in the DMA address space for the domain. + ++++++++++++++++++ */ + static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, + struct dma_ops_domain *dom, + unsigned long address, + phys_addr_t paddr, + int direction) + { + u64 *pte, __pte; + + WARN_ON(address > dom->aperture_size); + + paddr &= PAGE_MASK; + + pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte += IOMMU_PTE_L0_INDEX(address); + + __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; + + if (direction == DMA_TO_DEVICE) + __pte |= IOMMU_PTE_IR; + else if (direction == DMA_FROM_DEVICE) + __pte |= IOMMU_PTE_IW; + else if (direction == DMA_BIDIRECTIONAL) + __pte |= IOMMU_PTE_IR | IOMMU_PTE_IW; + + WARN_ON(*pte); + + *pte = __pte; + + return (dma_addr_t)address; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The generic unmapping function for on page in the DMA address space. 
+ ++++++++++++++++++ */ + static void dma_ops_domain_unmap(struct amd_iommu *iommu, + struct dma_ops_domain *dom, + unsigned long address) + { + u64 *pte; + + if (address >= dom->aperture_size) + return; + + WARN_ON(address & 0xfffULL || address > dom->aperture_size); + + pte = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)]; + pte += IOMMU_PTE_L0_INDEX(address); + + WARN_ON(!*pte); + + *pte = 0ULL; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function contains common code for mapping of a physically + ++++++++++++++++++ * contiguous memory region into DMA address space. It is uses by all + ++++++++++++++++++ * mapping functions provided by this IOMMU driver. + ++++++++++++++++++ * Must be called with the domain lock held. + ++++++++++++++++++ */ + static dma_addr_t __map_single(struct device *dev, + struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + phys_addr_t paddr, + size_t size, + int dir) + { + dma_addr_t offset = paddr & ~PAGE_MASK; + dma_addr_t address, start; + unsigned int pages; + int i; + + pages = to_pages(paddr, size); + paddr &= PAGE_MASK; + + address = dma_ops_alloc_addresses(dev, dma_dom, pages); + if (unlikely(address == bad_dma_address)) + goto out; + + start = address; + for (i = 0; i < pages; ++i) { + dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); + paddr += PAGE_SIZE; + start += PAGE_SIZE; + } + address += offset; + + out: + return address; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Does the reverse of the __map_single function. 
Must be called with + ++++++++++++++++++ * the domain lock held too + ++++++++++++++++++ */ + static void __unmap_single(struct amd_iommu *iommu, + struct dma_ops_domain *dma_dom, + dma_addr_t dma_addr, + size_t size, + int dir) + { + dma_addr_t i, start; + unsigned int pages; + + if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) + return; + + pages = to_pages(dma_addr, size); + dma_addr &= PAGE_MASK; + start = dma_addr; + + for (i = 0; i < pages; ++i) { + dma_ops_domain_unmap(iommu, dma_dom, start); + start += PAGE_SIZE; + } + + dma_ops_free_addresses(dma_dom, dma_addr, pages); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The exported map_single function for dma_ops. + ++++++++++++++++++ */ + static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, + size_t size, int dir) + { + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + dma_addr_t addr; + + get_device_resources(dev, &iommu, &domain, &devid); + + if (iommu == NULL || domain == NULL) + ++++++++++++++++++ /* device not handled by any AMD IOMMU */ + return (dma_addr_t)paddr; + + spin_lock_irqsave(&domain->lock, flags); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); + if (addr == bad_dma_address) + goto out; + + if (iommu_has_npcache(iommu)) + iommu_flush_pages(iommu, domain->id, addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + out: + spin_unlock_irqrestore(&domain->lock, flags); + + return addr; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The exported unmap_single function for dma_ops. 
+ ++++++++++++++++++ */ + static void unmap_single(struct device *dev, dma_addr_t dma_addr, + size_t size, int dir) + { + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + + if (!get_device_resources(dev, &iommu, &domain, &devid)) + ++++++++++++++++++ /* device not handled by any AMD IOMMU */ + return; + + spin_lock_irqsave(&domain->lock, flags); + + __unmap_single(iommu, domain->priv, dma_addr, size, dir); + + iommu_flush_pages(iommu, domain->id, dma_addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + spin_unlock_irqrestore(&domain->lock, flags); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This is a special map_sg function which is used if we should map a + ++++++++++++++++++ * device which is not handled by an AMD IOMMU in the system. + ++++++++++++++++++ */ + static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) + { + struct scatterlist *s; + int i; + + for_each_sg(sglist, s, nelems, i) { + s->dma_address = (dma_addr_t)sg_phys(s); + s->dma_length = s->length; + } + + return nelems; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The exported map_sg function for dma_ops (handles scatter-gather + ++++++++++++++++++ * lists). 
+ ++++++++++++++++++ */ + static int map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) + { + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + int i; + struct scatterlist *s; + phys_addr_t paddr; + int mapped_elems = 0; + + get_device_resources(dev, &iommu, &domain, &devid); + + if (!iommu || !domain) + return map_sg_no_iommu(dev, sglist, nelems, dir); + + spin_lock_irqsave(&domain->lock, flags); + + for_each_sg(sglist, s, nelems, i) { + paddr = sg_phys(s); + + s->dma_address = __map_single(dev, iommu, domain->priv, + paddr, s->length, dir); + + if (s->dma_address) { + s->dma_length = s->length; + mapped_elems++; + } else + goto unmap; + if (iommu_has_npcache(iommu)) + iommu_flush_pages(iommu, domain->id, s->dma_address, + s->dma_length); + } + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + out: + spin_unlock_irqrestore(&domain->lock, flags); + + return mapped_elems; + unmap: + for_each_sg(sglist, s, mapped_elems, i) { + if (s->dma_address) + __unmap_single(iommu, domain->priv, s->dma_address, + s->dma_length, dir); + s->dma_address = s->dma_length = 0; + } + + mapped_elems = 0; + + goto out; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The exported map_sg function for dma_ops (handles scatter-gather + ++++++++++++++++++ * lists). 
+ ++++++++++++++++++ */ + static void unmap_sg(struct device *dev, struct scatterlist *sglist, + int nelems, int dir) + { + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + struct scatterlist *s; + u16 devid; + int i; + + if (!get_device_resources(dev, &iommu, &domain, &devid)) + return; + + spin_lock_irqsave(&domain->lock, flags); + + for_each_sg(sglist, s, nelems, i) { + __unmap_single(iommu, domain->priv, s->dma_address, + s->dma_length, dir); + iommu_flush_pages(iommu, domain->id, s->dma_address, + s->dma_length); + s->dma_address = s->dma_length = 0; + } + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + spin_unlock_irqrestore(&domain->lock, flags); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The exported alloc_coherent function for dma_ops. + ++++++++++++++++++ */ + static void *alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) + { + unsigned long flags; + void *virt_addr; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + phys_addr_t paddr; + + virt_addr = (void *)__get_free_pages(flag, get_order(size)); + if (!virt_addr) + return 0; + + memset(virt_addr, 0, size); + paddr = virt_to_phys(virt_addr); + + get_device_resources(dev, &iommu, &domain, &devid); + + if (!iommu || !domain) { + *dma_addr = (dma_addr_t)paddr; + return virt_addr; + } + + spin_lock_irqsave(&domain->lock, flags); + + *dma_addr = __map_single(dev, iommu, domain->priv, paddr, + size, DMA_BIDIRECTIONAL); + + if (*dma_addr == bad_dma_address) { + free_pages((unsigned long)virt_addr, get_order(size)); + virt_addr = NULL; + goto out; + } + + if (iommu_has_npcache(iommu)) + iommu_flush_pages(iommu, domain->id, *dma_addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + out: + spin_unlock_irqrestore(&domain->lock, flags); + + return virt_addr; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * The exported free_coherent function for dma_ops. 
+ ++++++++++++++++++ * FIXME: fix the generic x86 DMA layer so that it actually calls that + ++++++++++++++++++ * function. + ++++++++++++++++++ */ + static void free_coherent(struct device *dev, size_t size, + void *virt_addr, dma_addr_t dma_addr) + { + unsigned long flags; + struct amd_iommu *iommu; + struct protection_domain *domain; + u16 devid; + + get_device_resources(dev, &iommu, &domain, &devid); + + if (!iommu || !domain) + goto free_mem; + + spin_lock_irqsave(&domain->lock, flags); + + __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); + iommu_flush_pages(iommu, domain->id, dma_addr, size); + + if (iommu->need_sync) + iommu_completion_wait(iommu); + + spin_unlock_irqrestore(&domain->lock, flags); + + free_mem: + free_pages((unsigned long)virt_addr, get_order(size)); + } + + /* + ++++++++++++++++++ * The function for pre-allocating protection domains. + ++++++++++++++++++ * + * If the driver core informs the DMA layer if a driver grabs a device + * we don't need to preallocate the protection domains anymore. + * For now we have to. 
+ */ + void prealloc_protection_domains(void) + { + struct pci_dev *dev = NULL; + struct dma_ops_domain *dma_dom; + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + u16 devid; + + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + devid = (dev->bus->number << 8) | dev->devfn; + if (devid >= amd_iommu_last_bdf) + continue; + devid = amd_iommu_alias_table[devid]; + if (domain_for_device(devid)) + continue; + iommu = amd_iommu_rlookup_table[devid]; + if (!iommu) + continue; + dma_dom = dma_ops_domain_alloc(iommu, order); + if (!dma_dom) + continue; + init_unity_mappings_for_device(dma_dom, devid); + set_device_domain(iommu, &dma_dom->domain, devid); + printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", + dma_dom->domain.id); + print_devid(devid, 1); + } + } + + static struct dma_mapping_ops amd_iommu_dma_ops = { + .alloc_coherent = alloc_coherent, + .free_coherent = free_coherent, + .map_single = map_single, + .unmap_single = unmap_single, + .map_sg = map_sg, + .unmap_sg = unmap_sg, + }; + + ++++++++++++++++++/* + ++++++++++++++++++ * The function which clues the AMD IOMMU driver into dma_ops. + ++++++++++++++++++ */ + int __init amd_iommu_init_dma_ops(void) + { + struct amd_iommu *iommu; + int order = amd_iommu_aperture_order; + int ret; + + ++++++++++++++++++ /* + ++++++++++++++++++ * first allocate a default protection domain for every IOMMU we + ++++++++++++++++++ * found in the system. Devices not assigned to any other + ++++++++++++++++++ * protection domain will be assigned to the default one. + ++++++++++++++++++ */ + list_for_each_entry(iommu, &amd_iommu_list, list) { + iommu->default_dom = dma_ops_domain_alloc(iommu, order); + if (iommu->default_dom == NULL) + return -ENOMEM; + ret = iommu_init_unity_mappings(iommu); + if (ret) + goto free_domains; + } + + ++++++++++++++++++ /* + ++++++++++++++++++ * If device isolation is enabled, pre-allocate the protection + ++++++++++++++++++ * domains for each device. 
+ ++++++++++++++++++ */ + if (amd_iommu_isolate) + prealloc_protection_domains(); + + iommu_detected = 1; + force_iommu = 1; + bad_dma_address = 0; + #ifdef CONFIG_GART_IOMMU + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; + #endif + + ++++++++++++++++++ /* Make the driver finally visible to the drivers */ + dma_ops = &amd_iommu_dma_ops; + + return 0; + + free_domains: + + list_for_each_entry(iommu, &amd_iommu_list, list) { + if (iommu->default_dom) + dma_ops_domain_free(iommu->default_dom); + } + + return ret; + } diff --cc arch/x86/kernel/amd_iommu_init.c index 2a13e430437d,7661b02d7208,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,66438284c699,2a13e430437d,2a13e430437d,000000000000,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d,2a13e430437d..c9d8ff2eb130 mode 100644,100644,100644,100644,100644,100644,100644,100644,100644,100644,000000,100644,100644,100644,100644,100644,100644,100644,100644,100644..100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@@@@@@@@@@@@@@@@@@@@ -1,875 -1,1060 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 -1,0 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 -1,875 +1,1060 @@@@@@@@@@@@@@@@@@@@@ + /* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * Leo Duran + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + + #include + #include + #include + #include + #include + #include + #include + #include ------- -- ---------#include +++++++ ++++++++++++#include + + /* + * definitions for the ACPI scanning code + */ - -------- ---------#define UPDATE_LAST_BDF(x) do {\ - -------- --------- if ((x) > amd_iommu_last_bdf) \ - -------- --------- amd_iommu_last_bdf = (x); \ - -------- --------- } while (0); - -------- --------- - -------- ---------#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) + #define PCI_BUS(x) (((x) >> 8) & 0xff) + #define IVRS_HEADER_LENGTH 48 - -------- ---------#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) + + #define ACPI_IVHD_TYPE 0x10 + #define ACPI_IVMD_TYPE_ALL 0x20 + #define ACPI_IVMD_TYPE 0x21 + #define ACPI_IVMD_TYPE_RANGE 0x22 + + #define IVHD_DEV_ALL 0x01 + #define IVHD_DEV_SELECT 0x02 + #define IVHD_DEV_SELECT_RANGE_START 0x03 + #define IVHD_DEV_RANGE_END 0x04 + #define IVHD_DEV_ALIAS 0x42 + #define IVHD_DEV_ALIAS_RANGE 0x43 + #define IVHD_DEV_EXT_SELECT 0x46 + #define IVHD_DEV_EXT_SELECT_RANGE 0x47 + + #define IVHD_FLAG_HT_TUN_EN 0x00 + #define IVHD_FLAG_PASSPW_EN 0x01 + #define IVHD_FLAG_RESPASSPW_EN 0x02 + #define IVHD_FLAG_ISOC_EN 0x03 + + #define IVMD_FLAG_EXCL_RANGE 0x08 + #define IVMD_FLAG_UNITY_MAP 0x01 + + #define ACPI_DEVFLAG_INITPASS 0x01 + #define ACPI_DEVFLAG_EXTINT 0x02 + #define ACPI_DEVFLAG_NMI 0x04 + #define ACPI_DEVFLAG_SYSMGT1 0x10 + #define ACPI_DEVFLAG_SYSMGT2 0x20 + #define ACPI_DEVFLAG_LINT0 0x40 + #define ACPI_DEVFLAG_LINT1 0x80 + #define ACPI_DEVFLAG_ATSDIS 0x10000000 + + ++++++++++++++++++/* + ++++++++++++++++++ * ACPI table definitions + ++++++++++++++++++ * + ++++++++++++++++++ * These data structures are laid over the table to parse the important values + 
++++++++++++++++++ * out of it. + ++++++++++++++++++ */ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * structure describing one IOMMU in the ACPI table. Typically followed by one + ++++++++++++++++++ * or more ivhd_entrys. + ++++++++++++++++++ */ + struct ivhd_header { + u8 type; + u8 flags; + u16 length; + u16 devid; + u16 cap_ptr; + u64 mmio_phys; + u16 pci_seg; + u16 info; + u32 reserved; + } __attribute__((packed)); + + ++++++++++++++++++/* + ++++++++++++++++++ * A device entry describing which devices a specific IOMMU translates and + ++++++++++++++++++ * which requestor ids they use. + ++++++++++++++++++ */ + struct ivhd_entry { + u8 type; + u16 devid; + u8 flags; + u32 ext; + } __attribute__((packed)); + + ++++++++++++++++++/* + ++++++++++++++++++ * An AMD IOMMU memory definition structure. It defines things like exclusion + ++++++++++++++++++ * ranges for devices and regions that should be unity mapped. + ++++++++++++++++++ */ + struct ivmd_header { + u8 type; + u8 flags; + u16 length; + u16 devid; + u16 aux; + u64 resv; + u64 range_start; + u64 range_length; + } __attribute__((packed)); + + static int __initdata amd_iommu_detected; + - -------- ---------u16 amd_iommu_last_bdf; - -------- ---------struct list_head amd_iommu_unity_map; - -------- ---------unsigned amd_iommu_aperture_order = 26; - -------- ---------int amd_iommu_isolate; + ++++++++++++++++++u16 amd_iommu_last_bdf; /* largest PCI device id we have + ++++++++++++++++++ to handle */ + ++++++++++++++++++LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings + ++++++++++++++++++ we find in ACPI */ + ++++++++++++++++++unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ + ++++++++++++++++++int amd_iommu_isolate; /* if 1, device isolation is enabled */ + ++++++++++++++++++ + ++++++++++++++++++LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the + ++++++++++++++++++ system */ + - -------- ---------struct list_head amd_iommu_list; + 
++++++++++++++++++/* + ++++++++++++++++++ * Pointer to the device table which is shared by all AMD IOMMUs + ++++++++++++++++++ * it is indexed by the PCI device id or the HT unit id and contains + ++++++++++++++++++ * information about the domain the device belongs to as well as the + ++++++++++++++++++ * page table root pointer. + ++++++++++++++++++ */ + struct dev_table_entry *amd_iommu_dev_table; + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * The alias table is a driver specific data structure which contains the + ++++++++++++++++++ * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + ++++++++++++++++++ * More than one device can share the same requestor id. + ++++++++++++++++++ */ + u16 *amd_iommu_alias_table; + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * The rlookup table is used to find the IOMMU which is responsible + ++++++++++++++++++ * for a specific device. It is also indexed by the PCI device id. + ++++++++++++++++++ */ + struct amd_iommu **amd_iommu_rlookup_table; + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * The pd table (protection domain table) is used to find the protection domain + ++++++++++++++++++ * data structure a device belongs to. Indexed with the PCI device id too. + ++++++++++++++++++ */ + struct protection_domain **amd_iommu_pd_table; + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap + ++++++++++++++++++ * to know which ones are already in use.
+ ++++++++++++++++++ */ + unsigned long *amd_iommu_pd_alloc_bitmap; + - -------- ---------static u32 dev_table_size; - -------- ---------static u32 alias_table_size; - -------- ---------static u32 rlookup_table_size; + ++++++++++++++++++static u32 dev_table_size; /* size of the device table */ + ++++++++++++++++++static u32 alias_table_size; /* size of the alias table */ + ++++++++++++++++++static u32 rlookup_table_size; /* size of the rlookup table */ + + ++++++++++++++++++static inline void update_last_devid(u16 devid) + ++++++++++++++++++{ + ++++++++++++++++++ if (devid > amd_iommu_last_bdf) + ++++++++++++++++++ amd_iommu_last_bdf = devid; + ++++++++++++++++++} + ++++++++++++++++++ + ++++++++++++++++++static inline unsigned long tbl_size(int entry_size) + ++++++++++++++++++{ + ++++++++++++++++++ unsigned shift = PAGE_SHIFT + + ++++++++++++++++++ get_order(amd_iommu_last_bdf * entry_size); + ++++++++++++++++++ + ++++++++++++++++++ return 1UL << shift; + ++++++++++++++++++} + ++++++++++++++++++ + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * AMD IOMMU MMIO register space handling functions + ++++++++++++++++++ * + ++++++++++++++++++ * These functions are used to program the IOMMU device registers in + ++++++++++++++++++ * MMIO space required for that driver. + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * This function sets the exclusion range in the IOMMU.
DMA accesses to the + ++++++++++++++++++ * exclusion range are passed through untranslated + ++++++++++++++++++ */ + static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) + { + u64 start = iommu->exclusion_start & PAGE_MASK; + u64 limit = (start + iommu->exclusion_length) & PAGE_MASK; + u64 entry; + + if (!iommu->exclusion_start) + return; + + entry = start | MMIO_EXCL_ENABLE_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET, + &entry, sizeof(entry)); + + entry = limit; + memcpy_toio(iommu->mmio_base + MMIO_EXCL_LIMIT_OFFSET, + &entry, sizeof(entry)); + } + + ++++++++++++++++++/* Programs the physical address of the device table into the IOMMU hardware */ + static void __init iommu_set_device_table(struct amd_iommu *iommu) + { + u32 entry; + + BUG_ON(iommu->mmio_base == NULL); + + entry = virt_to_phys(amd_iommu_dev_table); + entry |= (dev_table_size >> 12) - 1; + memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, + &entry, sizeof(entry)); + } + + ++++++++++++++++++/* Generic functions to enable/disable certain features of the IOMMU. */ + static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) + { + u32 ctrl; + + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl |= (1 << bit); + writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + } + + static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) + { + u32 ctrl; + + ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl &= ~(1 << bit); + writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); + } + + ++++++++++++++++++/* Function to enable the hardware */ + void __init iommu_enable(struct amd_iommu *iommu) + { + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); + print_devid(iommu->devid, 0); + printk(" cap 0x%hx\n", iommu->cap_ptr); + + iommu_feature_enable(iommu, CONTROL_IOMMU_EN); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * mapping and unmapping functions for the IOMMU MMIO space. 
Each AMD IOMMU in + ++++++++++++++++++ * the system has one. + ++++++++++++++++++ */ + static u8 * __init iommu_map_mmio_space(u64 address) + { + u8 *ret; + + if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) + return NULL; + + ret = ioremap_nocache(address, MMIO_REGION_LENGTH); + if (ret != NULL) + return ret; + + release_mem_region(address, MMIO_REGION_LENGTH); + + return NULL; + } + + static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) + { + if (iommu->mmio_base) + iounmap(iommu->mmio_base); + release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The functions below belong to the first pass of AMD IOMMU ACPI table + ++++++++++++++++++ * parsing. In this pass we try to find out the highest device id this + ++++++++++++++++++ * code has to handle. Upon this information the size of the shared data + ++++++++++++++++++ * structures is determined later. 
+ ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * This function reads the last device id the IOMMU has to handle from the PCI + ++++++++++++++++++ * capability header for this IOMMU + ++++++++++++++++++ */ + static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) + { + u32 cap; + + cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - -------- --------- UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + ++++++++++++++++++ update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * After reading the highest device id from the IOMMU PCI capability header + ++++++++++++++++++ * this function looks if there is a higher device id defined in the ACPI table + ++++++++++++++++++ */ + static int __init find_last_devid_from_ivhd(struct ivhd_header *h) + { + u8 *p = (void *)h, *end = (void *)h; + struct ivhd_entry *dev; + + p += sizeof(*h); + end += h->length; + + find_last_devid_on_pci(PCI_BUS(h->devid), + PCI_SLOT(h->devid), + PCI_FUNC(h->devid), + h->cap_ptr); + + while (p < end) { + dev = (struct ivhd_entry *)p; + switch (dev->type) { + case IVHD_DEV_SELECT: + case IVHD_DEV_RANGE_END: + case IVHD_DEV_ALIAS: + case IVHD_DEV_EXT_SELECT: - -------- --------- UPDATE_LAST_BDF(dev->devid); + ++++++++++++++++++ /* all the above subfield types refer to device ids */ + ++++++++++++++++++ update_last_devid(dev->devid); + break; + default: + break; + } + p += 0x04 << (*p >> 6); + } + + WARN_ON(p != end); + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Iterate over all IVHD entries in the ACPI table and find the highest device + ++++++++++++++++++ * id which we need to handle. This is the first of three functions which parse + ++++++++++++++++++ * the ACPI table. So we check the checksum here. 
+ ++++++++++++++++++ */ + static int __init find_last_devid_acpi(struct acpi_table_header *table) + { + int i; + u8 checksum = 0, *p = (u8 *)table, *end = (u8 *)table; + struct ivhd_header *h; + + /* + * Validate checksum here so we don't need to do it when + * we actually parse the table + */ + for (i = 0; i < table->length; ++i) + checksum += p[i]; + if (checksum != 0) + /* ACPI table corrupt */ + return -ENODEV; + + p += IVRS_HEADER_LENGTH; + + end += table->length; + while (p < end) { + h = (struct ivhd_header *)p; + switch (h->type) { + case ACPI_IVHD_TYPE: + find_last_devid_from_ivhd(h); + break; + default: + break; + } + p += h->length; + } + WARN_ON(p != end); + + return 0; + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The following functions belong to the code path which parses the ACPI table + ++++++++++++++++++ * the second time. In this ACPI parsing iteration we allocate IOMMU specific + ++++++++++++++++++ * data structures, initialize the device/alias/rlookup table and also + ++++++++++++++++++ * basically initialize the hardware. + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * Allocates the command buffer. This buffer is per AMD IOMMU.
We can + ++++++++++++++++++ * write commands to that buffer later and the IOMMU will execute them + ++++++++++++++++++ * asynchronously + ++++++++++++++++++ */ + static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) + { - -------- --------- u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + ++++++++++++++++++ u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(CMD_BUFFER_SIZE)); - -------- --------- u64 entry = 0; + ++++++++++++++++++ u64 entry; + + if (cmd_buf == NULL) + return NULL; + + iommu->cmd_buf_size = CMD_BUFFER_SIZE; + - -------- --------- memset(cmd_buf, 0, CMD_BUFFER_SIZE); - -------- --------- + entry = (u64)virt_to_phys(cmd_buf); + entry |= MMIO_CMD_SIZE_512; + memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, + &entry, sizeof(entry)); + + iommu_feature_enable(iommu, CONTROL_CMDBUF_EN); + + return cmd_buf; + } + + static void __init free_command_buffer(struct amd_iommu *iommu) + { - -------- --------- if (iommu->cmd_buf) - -------- --------- free_pages((unsigned long)iommu->cmd_buf, - -------- --------- get_order(CMD_BUFFER_SIZE)); + ++++++++++++++++++ free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); + } + + ++++++++++++++++++/* sets a specific bit in the device table entry. 
*/ + static void set_dev_entry_bit(u16 devid, u8 bit) + { + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + amd_iommu_dev_table[devid].data[i] |= (1 << _bit); + } + - -------- ---------static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) + ++++++++++++++++++/* Writes the specific IOMMU for a device into the rlookup table */ + ++++++++++++++++++static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) + ++++++++++++++++++{ + ++++++++++++++++++ amd_iommu_rlookup_table[devid] = iommu; + ++++++++++++++++++} + ++++++++++++++++++ + ++++++++++++++++++/* + ++++++++++++++++++ * This function takes the device specific flags read from the ACPI + ++++++++++++++++++ * table and sets up the device table entry with that information + ++++++++++++++++++ */ + ++++++++++++++++++static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + ++++++++++++++++++ u16 devid, u32 flags, u32 ext_flags) + { + if (flags & ACPI_DEVFLAG_INITPASS) + set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); + if (flags & ACPI_DEVFLAG_EXTINT) + set_dev_entry_bit(devid, DEV_ENTRY_EINT_PASS); + if (flags & ACPI_DEVFLAG_NMI) + set_dev_entry_bit(devid, DEV_ENTRY_NMI_PASS); + if (flags & ACPI_DEVFLAG_SYSMGT1) + set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1); + if (flags & ACPI_DEVFLAG_SYSMGT2) + set_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2); + if (flags & ACPI_DEVFLAG_LINT0) + set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); + if (flags & ACPI_DEVFLAG_LINT1) + set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); - -------- ---------} + - -------- ---------static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) - -------- ---------{ - -------- --------- amd_iommu_rlookup_table[devid] = iommu; + ++++++++++++++++++ set_iommu_for_device(iommu, devid); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Reads the device exclusion range from ACPI and initialize IOMMU with + ++++++++++++++++++ * it + ++++++++++++++++++ */ + static void __init 
set_device_exclusion_range(u16 devid, struct ivmd_header *m) + { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + + if (!(m->flags & IVMD_FLAG_EXCL_RANGE)) + return; + + if (iommu) { + ++++++++++++++++++ /* + ++++++++++++++++++ * We only can configure exclusion ranges per IOMMU, not + ++++++++++++++++++ * per device. But we can enable the exclusion range per + ++++++++++++++++++ * device. This is done here + ++++++++++++++++++ */ + set_dev_entry_bit(m->devid, DEV_ENTRY_EX); + iommu->exclusion_start = m->range_start; + iommu->exclusion_length = m->range_length; + } + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function reads some important data from the IOMMU PCI space and + ++++++++++++++++++ * initializes the driver data structure with it. It reads the hardware + ++++++++++++++++++ * capabilities and the first/last device entries + ++++++++++++++++++ */ + static void __init init_iommu_from_pci(struct amd_iommu *iommu) + { + int bus = PCI_BUS(iommu->devid); + int dev = PCI_SLOT(iommu->devid); + int fn = PCI_FUNC(iommu->devid); + int cap_ptr = iommu->cap_ptr; + u32 range; + + iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); + + range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - -------- --------- iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); - -------- --------- iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + ++++++++++++++++++ iommu->first_device = calc_devid(MMIO_GET_BUS(range), + ++++++++++++++++++ MMIO_GET_FD(range)); + ++++++++++++++++++ iommu->last_device = calc_devid(MMIO_GET_BUS(range), + ++++++++++++++++++ MMIO_GET_LD(range)); + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Takes a pointer to an AMD IOMMU entry in the ACPI table and + ++++++++++++++++++ * initializes the hardware and our data structures with it. 
+ ++++++++++++++++++ */ + static void __init init_iommu_from_acpi(struct amd_iommu *iommu, + struct ivhd_header *h) + { + u8 *p = (u8 *)h; + u8 *end = p, flags = 0; + u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; + u32 ext_flags = 0; - -------- --------- bool alias = 0; + ++++++++++++++++++ bool alias = false; + struct ivhd_entry *e; + + /* + * First set the recommended feature enable bits from ACPI + * into the IOMMU control registers + */ + h->flags & IVHD_FLAG_HT_TUN_EN ? + iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : + iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + + h->flags & IVHD_FLAG_PASSPW_EN ? + iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + + h->flags & IVHD_FLAG_RESPASSPW_EN ? + iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + + h->flags & IVHD_FLAG_ISOC_EN ? + iommu_feature_enable(iommu, CONTROL_ISOC_EN) : + iommu_feature_disable(iommu, CONTROL_ISOC_EN); + + /* + * make IOMMU memory accesses cache coherent + */ + iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + + /* + * Done. 
Now parse the device entries + */ + p += sizeof(struct ivhd_header); + end += h->length; + + while (p < end) { + e = (struct ivhd_entry *)p; + switch (e->type) { + case IVHD_DEV_ALL: + for (dev_i = iommu->first_device; + dev_i <= iommu->last_device; ++dev_i) - -------- --------- set_dev_entry_from_acpi(dev_i, e->flags, 0); + ++++++++++++++++++ set_dev_entry_from_acpi(iommu, dev_i, + ++++++++++++++++++ e->flags, 0); + break; + case IVHD_DEV_SELECT: + devid = e->devid; - -------- --------- set_dev_entry_from_acpi(devid, e->flags, 0); + ++++++++++++++++++ set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + break; + case IVHD_DEV_SELECT_RANGE_START: + devid_start = e->devid; + flags = e->flags; + ext_flags = 0; - -------- --------- alias = 0; + ++++++++++++++++++ alias = false; + break; + case IVHD_DEV_ALIAS: + devid = e->devid; + devid_to = e->ext >> 8; - -------- --------- set_dev_entry_from_acpi(devid, e->flags, 0); + ++++++++++++++++++ set_dev_entry_from_acpi(iommu, devid, e->flags, 0); + amd_iommu_alias_table[devid] = devid_to; + break; + case IVHD_DEV_ALIAS_RANGE: + devid_start = e->devid; + flags = e->flags; + devid_to = e->ext >> 8; + ext_flags = 0; - -------- --------- alias = 1; + ++++++++++++++++++ alias = true; + break; + case IVHD_DEV_EXT_SELECT: + devid = e->devid; - -------- --------- set_dev_entry_from_acpi(devid, e->flags, e->ext); + ++++++++++++++++++ set_dev_entry_from_acpi(iommu, devid, e->flags, + ++++++++++++++++++ e->ext); + break; + case IVHD_DEV_EXT_SELECT_RANGE: + devid_start = e->devid; + flags = e->flags; + ext_flags = e->ext; - -------- --------- alias = 0; + ++++++++++++++++++ alias = false; + break; + case IVHD_DEV_RANGE_END: + devid = e->devid; + for (dev_i = devid_start; dev_i <= devid; ++dev_i) { + if (alias) + amd_iommu_alias_table[dev_i] = devid_to; - -------- --------- set_dev_entry_from_acpi( + ++++++++++++++++++ set_dev_entry_from_acpi(iommu, + amd_iommu_alias_table[dev_i], + flags, ext_flags); + } + break; + default: + break; + 
} + + p += 0x04 << (e->type >> 6); + } + } + + ++++++++++++++++++/* Initializes the device->iommu mapping for the driver */ + static int __init init_iommu_devices(struct amd_iommu *iommu) + { + u16 i; + + for (i = iommu->first_device; i <= iommu->last_device; ++i) + set_iommu_for_device(iommu, i); + + return 0; + } + + static void __init free_iommu_one(struct amd_iommu *iommu) + { + free_command_buffer(iommu); + iommu_unmap_mmio_space(iommu); + } + + static void __init free_iommu_all(void) + { + struct amd_iommu *iommu, *next; + + list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) { + list_del(&iommu->list); + free_iommu_one(iommu); + kfree(iommu); + } + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function clues the initialization function for one IOMMU + ++++++++++++++++++ * together and also allocates the command buffer and programs the + ++++++++++++++++++ * hardware. It does NOT enable the IOMMU. This is done afterwards. + ++++++++++++++++++ */ + static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) + { + spin_lock_init(&iommu->lock); + list_add_tail(&iommu->list, &amd_iommu_list); + + /* + * Copy data from ACPI table entry to the iommu struct + */ + iommu->devid = h->devid; + iommu->cap_ptr = h->cap_ptr; + iommu->mmio_phys = h->mmio_phys; + iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); + if (!iommu->mmio_base) + return -ENOMEM; + + iommu_set_device_table(iommu); + iommu->cmd_buf = alloc_command_buffer(iommu); + if (!iommu->cmd_buf) + return -ENOMEM; + + init_iommu_from_pci(iommu); + init_iommu_from_acpi(iommu, h); + init_iommu_devices(iommu); + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * Iterates over all IOMMU entries in the ACPI table, allocates the + ++++++++++++++++++ * IOMMU structure and initializes it with init_iommu_one() + ++++++++++++++++++ */ + static int __init init_iommu_all(struct acpi_table_header *table) + { + u8 *p = (u8 *)table, *end = (u8 *)table; + struct 
ivhd_header *h; + struct amd_iommu *iommu; + int ret; + - -------- --------- INIT_LIST_HEAD(&amd_iommu_list); - -------- --------- + end += table->length; + p += IVRS_HEADER_LENGTH; + + while (p < end) { + h = (struct ivhd_header *)p; + switch (*p) { + case ACPI_IVHD_TYPE: + iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); + if (iommu == NULL) + return -ENOMEM; + ret = init_iommu_one(iommu, h); + if (ret) + return ret; + break; + default: + break; + } + p += h->length; + + } + WARN_ON(p != end); + + return 0; + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * The next functions belong to the third pass of parsing the ACPI + ++++++++++++++++++ * table. In this last pass the memory mapping requirements are + ++++++++++++++++++ * gathered (like exclusion and unity mapping ranges). + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + static void __init free_unity_maps(void) + { + struct unity_map_entry *entry, *next; + + list_for_each_entry_safe(entry, next, &amd_iommu_unity_map, list) { + list_del(&entry->list); + kfree(entry); + } + } + + ++++++++++++++++++/* called when we find an exclusion range definition in ACPI */ + static int __init init_exclusion_range(struct ivmd_header *m) + { + int i; + + switch (m->type) { + case ACPI_IVMD_TYPE: + set_device_exclusion_range(m->devid, m); + break; + case ACPI_IVMD_TYPE_ALL: + for (i = 0; i < amd_iommu_last_bdf; ++i) + set_device_exclusion_range(i, m); + break; + case ACPI_IVMD_TYPE_RANGE: + for (i = m->devid; i <= m->aux; ++i) + set_device_exclusion_range(i, m); + break; + default: + break; + } + + return 0; + } + + ++++++++++++++++++/* called for unity map ACPI definition */ + static int __init init_unity_map_range(struct ivmd_header *m) + { + struct unity_map_entry *e = 0; + + e = kzalloc(sizeof(*e), GFP_KERNEL); + if (e ==
NULL) + return -ENOMEM; + + switch (m->type) { + default: + case ACPI_IVMD_TYPE: + e->devid_start = e->devid_end = m->devid; + break; + case ACPI_IVMD_TYPE_ALL: + e->devid_start = 0; + e->devid_end = amd_iommu_last_bdf; + break; + case ACPI_IVMD_TYPE_RANGE: + e->devid_start = m->devid; + e->devid_end = m->aux; + break; + } + e->address_start = PAGE_ALIGN(m->range_start); + e->address_end = e->address_start + PAGE_ALIGN(m->range_length); + e->prot = m->flags >> 1; + + list_add_tail(&e->list, &amd_iommu_unity_map); + + return 0; + } + + ++++++++++++++++++/* iterates over all memory definitions we find in the ACPI table */ + static int __init init_memory_definitions(struct acpi_table_header *table) + { + u8 *p = (u8 *)table, *end = (u8 *)table; + struct ivmd_header *m; + - -------- --------- INIT_LIST_HEAD(&amd_iommu_unity_map); - -------- --------- + end += table->length; + p += IVRS_HEADER_LENGTH; + + while (p < end) { + m = (struct ivmd_header *)p; + if (m->flags & IVMD_FLAG_EXCL_RANGE) + init_exclusion_range(m); + else if (m->flags & IVMD_FLAG_UNITY_MAP) + init_unity_map_range(m); + + p += m->length; + } + + return 0; + } + + ++++++++++++++++++/* + ++++++++++++++++++ * This function finally enables all IOMMUs found in the system after + ++++++++++++++++++ * they have been initialized + ++++++++++++++++++ */ + static void __init enable_iommus(void) + { + struct amd_iommu *iommu; + + list_for_each_entry(iommu, &amd_iommu_list, list) { + iommu_set_exclusion_range(iommu); + iommu_enable(iommu); + } + } + + /* + * Suspend/Resume support + * disable suspend until real resume implemented + */ + + static int amd_iommu_resume(struct sys_device *dev) + { + return 0; + } + + static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) + { + return -EINVAL; + } + + static struct sysdev_class amd_iommu_sysdev_class = { + .name = "amd_iommu", + .suspend = amd_iommu_suspend, + .resume = amd_iommu_resume, + }; + + static struct sys_device device_amd_iommu = { + .id = 
0, + .cls = &amd_iommu_sysdev_class, + }; + + ++++++++++++++++++/* + ++++++++++++++++++ * This is the core init function for AMD IOMMU hardware in the system. + ++++++++++++++++++ * This function is called from the generic x86 DMA layer initialization + ++++++++++++++++++ * code. + ++++++++++++++++++ * + ++++++++++++++++++ * This function basically parses the ACPI table for AMD IOMMU (IVRS) + ++++++++++++++++++ * three times: + ++++++++++++++++++ * + ++++++++++++++++++ * 1 pass) Find the highest PCI device id the driver has to handle. + ++++++++++++++++++ * Upon this information the size of the data structures is + ++++++++++++++++++ * determined that needs to be allocated. + ++++++++++++++++++ * + ++++++++++++++++++ * 2 pass) Initialize the data structures just allocated with the + ++++++++++++++++++ * information in the ACPI table about available AMD IOMMUs + ++++++++++++++++++ * in the system. It also maps the PCI devices in the + ++++++++++++++++++ * system to specific IOMMUs + ++++++++++++++++++ * + ++++++++++++++++++ * 3 pass) After the basic data structures are allocated and + ++++++++++++++++++ * initialized we update them with information about memory + ++++++++++++++++++ * remapping requirements parsed out of the ACPI table in + ++++++++++++++++++ * this last pass. + ++++++++++++++++++ * + ++++++++++++++++++ * After that the hardware is initialized and ready to go. In the last + ++++++++++++++++++ * step we do some Linux specific things like registering the driver in + ++++++++++++++++++ * the dma_ops interface and initializing the suspend/resume support + ++++++++++++++++++ * functions. Finally it prints some information about AMD IOMMUs and + ++++++++++++++++++ * the driver state and enables the hardware. 
+ ++++++++++++++++++ */ + int __init amd_iommu_init(void) + { + int i, ret = 0; + + + if (no_iommu) { + printk(KERN_INFO "AMD IOMMU disabled by kernel command line\n"); + return 0; + } + + if (!amd_iommu_detected) + return -ENODEV; + + /* + * First parse ACPI tables to find the largest Bus/Dev/Func + * we need to handle. Upon this information the shared data + * structures for the IOMMUs in the system will be allocated + */ + if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) + return -ENODEV; + - -------- --------- dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); - -------- --------- alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); - -------- --------- rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + ++++++++++++++++++ dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); + ++++++++++++++++++ alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); + ++++++++++++++++++ rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); + + ret = -ENOMEM; + + /* Device table - directly used by all IOMMUs */ - -------- --------- amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + ++++++++++++++++++ amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(dev_table_size)); + if (amd_iommu_dev_table == NULL) + goto out; + + /* + * Alias table - map PCI Bus/Dev/Func to Bus/Dev/Func the + * IOMMU see for that device + */ + amd_iommu_alias_table = (void *)__get_free_pages(GFP_KERNEL, + get_order(alias_table_size)); + if (amd_iommu_alias_table == NULL) + goto free; + + /* IOMMU rlookup table - find the IOMMU for a specific device */ + amd_iommu_rlookup_table = (void *)__get_free_pages(GFP_KERNEL, + get_order(rlookup_table_size)); + if (amd_iommu_rlookup_table == NULL) + goto free; + + /* + * Protection Domain table - maps devices to protection domains + * This table has the same size as the rlookup_table + */ - -------- --------- amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + ++++++++++++++++++ amd_iommu_pd_table = (void 
*)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(rlookup_table_size)); + if (amd_iommu_pd_table == NULL) + goto free; + - -------- --------- amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + ++++++++++++++++++ amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( + ++++++++++++++++++ GFP_KERNEL | __GFP_ZERO, + get_order(MAX_DOMAIN_ID/8)); + if (amd_iommu_pd_alloc_bitmap == NULL) + goto free; + + /* - -------- --------- * memory is allocated now; initialize the device table with all zeroes - -------- --------- * and let all alias entries point to itself + ++++++++++++++++++ * let all alias entries point to itself + */ - -------- --------- memset(amd_iommu_dev_table, 0, dev_table_size); + for (i = 0; i < amd_iommu_last_bdf; ++i) + amd_iommu_alias_table[i] = i; + - -------- --------- memset(amd_iommu_pd_table, 0, rlookup_table_size); - -------- --------- memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); - -------- --------- + /* + * never allocate domain 0 because its used as the non-allocated and + * error value placeholder + */ + amd_iommu_pd_alloc_bitmap[0] = 1; + + /* + * now the data structures are allocated and basically initialized + * start the real acpi table scan + */ + ret = -ENODEV; + if (acpi_table_parse("IVRS", init_iommu_all) != 0) + goto free; + + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) + goto free; + + ret = amd_iommu_init_dma_ops(); + if (ret) + goto free; + + ret = sysdev_class_register(&amd_iommu_sysdev_class); + if (ret) + goto free; + + ret = sysdev_register(&device_amd_iommu); + if (ret) + goto free; + + enable_iommus(); + + printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n", + (1 << (amd_iommu_aperture_order-20))); + + printk(KERN_INFO "AMD IOMMU: device isolation "); + if (amd_iommu_isolate) + printk("enabled\n"); + else + printk("disabled\n"); + + out: + return ret; + + free: - -------- --------- if (amd_iommu_pd_alloc_bitmap) - -------- --------- free_pages((unsigned 
long)amd_iommu_pd_alloc_bitmap, 1); + ++++++++++++++++++ free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + - -------- --------- if (amd_iommu_pd_table) - -------- --------- free_pages((unsigned long)amd_iommu_pd_table, - -------- --------- get_order(rlookup_table_size)); + ++++++++++++++++++ free_pages((unsigned long)amd_iommu_pd_table, + ++++++++++++++++++ get_order(rlookup_table_size)); + - -------- --------- if (amd_iommu_rlookup_table) - -------- --------- free_pages((unsigned long)amd_iommu_rlookup_table, - -------- --------- get_order(rlookup_table_size)); + ++++++++++++++++++ free_pages((unsigned long)amd_iommu_rlookup_table, + ++++++++++++++++++ get_order(rlookup_table_size)); + - -------- --------- if (amd_iommu_alias_table) - -------- --------- free_pages((unsigned long)amd_iommu_alias_table, - -------- --------- get_order(alias_table_size)); + ++++++++++++++++++ free_pages((unsigned long)amd_iommu_alias_table, + ++++++++++++++++++ get_order(alias_table_size)); + - -------- --------- if (amd_iommu_dev_table) - -------- --------- free_pages((unsigned long)amd_iommu_dev_table, - -------- --------- get_order(dev_table_size)); + ++++++++++++++++++ free_pages((unsigned long)amd_iommu_dev_table, + ++++++++++++++++++ get_order(dev_table_size)); + + free_iommu_all(); + + free_unity_maps(); + + goto out; + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * Early detect code. This code runs at IOMMU detection time in the DMA + ++++++++++++++++++ * layer. 
It just looks if there is an IVRS ACPI table to detect AMD + ++++++++++++++++++ * IOMMUs + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + static int __init early_amd_iommu_detect(struct acpi_table_header *table) + { + return 0; + } + + void __init amd_iommu_detect(void) + { - -------- --------- if (swiotlb || no_iommu || iommu_detected) + ++++++++++++++++++ if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) + return; + + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { + iommu_detected = 1; + amd_iommu_detected = 1; + #ifdef CONFIG_GART_IOMMU + gart_iommu_aperture_disabled = 1; + gart_iommu_aperture = 0; + #endif + } + } + + ++++++++++++++++++/**************************************************************************** + ++++++++++++++++++ * + ++++++++++++++++++ * Parsing functions for the AMD IOMMU specific kernel command line + ++++++++++++++++++ * options. + ++++++++++++++++++ * + ++++++++++++++++++ ****************************************************************************/ + ++++++++++++++++++ + static int __init parse_amd_iommu_options(char *str) + { + for (; *str; ++str) { + if (strcmp(str, "isolate") == 0) + amd_iommu_isolate = 1; + } + + return 1; + } + + static int __init parse_amd_iommu_size_options(char *str) + { - -------- --------- for (; *str; ++str) { - -------- --------- if (strcmp(str, "32M") == 0) - -------- --------- amd_iommu_aperture_order = 25; - -------- --------- if (strcmp(str, "64M") == 0) - -------- --------- amd_iommu_aperture_order = 26; - -------- --------- if (strcmp(str, "128M") == 0) - -------- --------- amd_iommu_aperture_order = 27; - -------- --------- if (strcmp(str, "256M") == 0) - -------- --------- amd_iommu_aperture_order = 28; - -------- --------- if (strcmp(str, "512M") == 0) - -------- --------- amd_iommu_aperture_order = 29; - -------- --------- if (strcmp(str, "1G") == 0) - -------- --------- 
amd_iommu_aperture_order = 30; - -------- --------- } + ++++++++++++++++++ unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); + ++++++++++++++++++ + ++++++++++++++++++ if ((order > 24) && (order < 31)) + ++++++++++++++++++ amd_iommu_aperture_order = order; + + return 1; + } + + __setup("amd_iommu=", parse_amd_iommu_options); + __setup("amd_iommu_size=", parse_amd_iommu_size_options); diff --cc arch/x86/kernel/apic_32.c index a437d027f20b,a437d027f20b,7f30c0f3dbe4,a437d027f20b,e9a00e5074b2,a437d027f20b,3e58b676d23b,3e947208b9d9,a437d027f20b,a437d027f20b,4b99b1bdeb6c,a437d027f20b,3e58b676d23b,a437d027f20b,a437d027f20b,a437d027f20b,3e58b676d23b,a437d027f20b,a437d027f20b,a437d027f20b..d6c898358371 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@@@@@@@@@@@@@@@@@@@@ -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -74,7 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 -75,17 +75,17 @@@@@@@@@@@@@@@@@@@@@ char system_vectors[NR_VECTORS] = { [0 /* * Debug level, exported for io_apic.c */ -- -----------------int apic_verbosity; ++ +++++++++++++++++unsigned int apic_verbosity; + + int pic_mode; + + /* Have we found an MP table */ + int smp_found_config; + + static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, + }; static unsigned int calibration_result; @@@@@@@@@@@@@@@@@@@@@ -543,22 -543,22 -514,55 -543,22 -543,22 -543,22 -543,22 -543,22 -543,22 -543,22 -532,22 -543,22 -543,22 -543,22 -543,22 -543,22 -543,22 -543,22 -543,22 -543,22 +514,55 @@@@@@@@@@@@@@@@@@@@@ static int __init calibrate_APIC_clock( if (!local_apic_timer_verify_ok) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); ++ +++++++++++++++++ return -1; ++ +++++++++++++++++ } ++ +++++++++++++++++ ++ +++++++++++++++++ return 0; ++ +++++++++++++++++} ++ +++++++++++++++++ ++ +++++++++++++++++/* ++ +++++++++++++++++ * Setup the boot APIC ++ +++++++++++++++++ * ++ +++++++++++++++++ 
* Calibrate and verify the result. ++ +++++++++++++++++ */ ++ +++++++++++++++++void __init setup_boot_APIC_clock(void) ++ +++++++++++++++++{ ++ +++++++++++++++++ /* ++ +++++++++++++++++ * The local apic timer can be disabled via the kernel ++ +++++++++++++++++ * commandline or from the CPU detection code. Register the lapic ++ +++++++++++++++++ * timer as a dummy clock event source on SMP systems, so the ++ +++++++++++++++++ * broadcast mechanism is used. On UP systems simply ignore it. ++ +++++++++++++++++ */ ++ +++++++++++++++++ if (local_apic_timer_disabled) { /* No broadcast on UP ! */ -- ----------------- if (num_possible_cpus() == 1) -- ----------------- return; -- ----------------- } else { -- ----------------- /* -- ----------------- * If nmi_watchdog is set to IO_APIC, we need the -- ----------------- * PIT/HPET going. Otherwise register lapic as a dummy -- ----------------- * device. -- ----------------- */ -- ----------------- if (nmi_watchdog != NMI_IO_APIC) -- ----------------- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; -- ----------------- else -- ----------------- printk(KERN_WARNING "APIC timer registered as dummy," -- ------- --------- " due to nmi_watchdog=%d!\n", nmi_watchdog); - " due to nmi_watchdog=1!\n"); ++ +++++++++++++++++ if (num_possible_cpus() > 1) { ++ +++++++++++++++++ lapic_clockevent.mult = 1; ++ +++++++++++++++++ setup_APIC_timer(); ++ +++++++++++++++++ } ++ +++++++++++++++++ return; ++ + + + } ++ + + + ++ +++++++++++++++++ apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" ++ +++++++++++++++++ "calibrating APIC timer ...\n"); ++ +++++++++++++++++ ++ +++++++++++++++++ if (calibrate_APIC_clock()) { ++ +++++++++++++++++ /* No broadcast on UP ! 
*/ ++ +++++++++++++++++ if (num_possible_cpus() > 1) ++ +++++++++++++++++ setup_APIC_timer(); ++ +++++++++++++++++ return; ++ +++ ++ + +++ +++ } ++ +++ ++ + +++ +++ ++ +++++++++++++++++ /* ++ +++++++++++++++++ * If nmi_watchdog is set to IO_APIC, we need the ++ +++++++++++++++++ * PIT/HPET going. Otherwise register lapic as a dummy ++ +++++++++++++++++ * device. ++ +++++++++++++++++ */ ++ +++++++++++++++++ if (nmi_watchdog != NMI_IO_APIC) ++ +++++++++++++++++ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; ++ +++++++++++++++++ else ++ +++++++++++++++++ printk(KERN_WARNING "APIC timer registered as dummy," ++ +++++++++++++++++ " due to nmi_watchdog=%d!\n", nmi_watchdog); ++ +++++++++++++++++ /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } @@@@@@@@@@@@@@@@@@@@@ -1214,9 -1214,9 -1218,9 -1214,9 -1214,6 -1214,9 -1214,9 -1214,9 -1214,9 -1214,9 -1236,9 -1214,9 -1214,9 -1214,9 -1214,9 -1214,9 -1214,9 -1214,9 -1214,9 -1214,9 +1218,6 @@@@@@@@@@@@@@@@@@@@@ int apic_version[MAX_APICS] int __init APIC_init_uniprocessor(void) { ---- ----- --------- if (disable_apic) - if (enable_local_apic < 0) ---- --------------- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); ---- --------------- if (!smp_found_config && !cpu_has_apic) return -1; @@@@@@@@@@@@@@@@@@@@@ -1333,17 -1333,17 -1337,17 -1333,17 -1330,17 -1333,17 -1333,13 -1333,13 -1333,17 -1333,17 -1351,13 -1333,17 -1333,13 -1333,17 -1333,17 -1333,17 -1333,13 -1333,17 -1333,17 -1333,17 +1334,17 @@@@@@@@@@@@@@@@@@@@@ void __init smp_intr_init(void * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. 
*/ - set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); /* IPI for invalidation */ - set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); /* IPI for generic function call */ - set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); ++ + + + ++ + + + /* IPI for single call function */ ++ + + + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, ++ + + + call_function_single_interrupt); } #endif @@@@@@@@@@@@@@@@@@@@@ -1699,8 -1699,8 -1703,8 -1699,8 -1696,8 -1699,8 -1695,8 -1695,8 -1699,8 -1699,8 -1710,8 -1699,8 -1695,8 -1699,8 -1699,8 -1699,8 -1695,8 -1699,8 -1699,8 -1699,8 +1700,8 @@@@@@@@@@@@@@@@@@@@@ early_param("lapic", parse_lapic) static int __init parse_nolapic(char *arg) { - enable_local_apic = -1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + disable_apic = 1; ---- ----- --------- clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); ++++ +++++++++++++++ setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("nolapic", parse_nolapic); diff --cc arch/x86/kernel/apic_64.c index 1e3d32e27c14,1e3d32e27c14,98c70f044e19,1e3d32e27c14,16e586cacbdc,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,0633cfd0dc29,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14,1e3d32e27c14..7f1f030da7ee --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@@@@@@@@@@@@@@@@@@@@ -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,7 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 -54,10 +54,10 @@@@@@@@@@@@@@@@@@@@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o /* * Debug level, exported for io_apic.c */ -- -----------------int apic_verbosity; ++ +++++++++++++++++unsigned int apic_verbosity; + + /* Have we found an MP table */ + int smp_found_config; static struct resource 
lapic_resource = { .name = "Local APIC", diff --cc arch/x86/kernel/cpu/common_64.c index 7b8cc72feb40,7b8cc72feb40,7b8cc72feb40,2a4475beea4a,daee611f0140,7b8cc72feb40,7b8cc72feb40,751850235291,7b8cc72feb40,7b8cc72feb40,000000000000,7b8cc72feb40,7b8cc72feb40,7b8cc72feb40,7b8cc72feb40,7b8cc72feb40,36537ab9e56a,7b8cc72feb40,7b8cc72feb40,736f50fa433d..dd6e3f15017e mode 100644,100644,100644,100644,100644,100644,100644,100644,100644,100644,000000,100644,100644,100644,100644,100644,100644,100644,100644,100644..100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@@@@@@@@@@@@@@@@@@@@ -1,681 -1,681 -1,681 -1,678 -1,676 -1,681 -1,681 -1,676 -1,681 -1,681 -1,0 -1,681 -1,681 -1,681 -1,681 -1,681 -1,679 -1,681 -1,681 -1,678 +1,670 @@@@@@@@@@@@@@@@@@@@@ + #include + #include + #include + #include + #include + #include + #include + #include + #include --- ------ ---------#include + #include + #include --- ------ ---------#include + #include --- ------ ---------#include + #include + #include + #include +++++++++++++++++++ #include + #include + #include + #include + #include + #include + #ifdef CONFIG_X86_LOCAL_APIC + #include + #include + #include + #endif + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include "cpu.h" + + /* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ + /* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) 
*/ + DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, + } }; + EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + + __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; + + /* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ + void switch_to_new_gdt(void) + { + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + } + + struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; + + static void __cpuinit default_init(struct cpuinfo_x86 *c) + { + display_cacheinfo(c); + } + + static struct cpu_dev __cpuinitdata default_cpu = { + .c_init = default_init, + .c_vendor = "Unknown", + }; + static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; + + int __cpuinit get_model_name(struct cpuinfo_x86 *c) + { + unsigned int *v; + + if (c->extended_cpuid_level < 0x80000004) + return 0; + + v = (unsigned int *) c->x86_model_id; + cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); + cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); + cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); + c->x86_model_id[48] = 0; + return 1; + } + + + void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) + { - - unsigned int n, dummy, eax, ebx, ecx, edx; + + + unsigned int n, dummy, ebx, ecx, edx; + + n = c->extended_cpuid_level; + + if (n >= 0x80000005) { + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " + "D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); + /* On K8 L1 
TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; + } + + if (n >= 0x80000006) { + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + ecx = cpuid_ecx(0x80000006); + c->x86_cache_size = ecx >> 16; + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + c->x86_cache_size, ecx & 0xFF); + } - - if (n >= 0x80000008) { - - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - - c->x86_virt_bits = (eax >> 8) & 0xff; - - c->x86_phys_bits = eax & 0xff; - - } + } + + void __cpuinit detect_ht(struct cpuinfo_x86 *c) + { + #ifdef CONFIG_SMP + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + + if (!cpu_has(c, X86_FEATURE_HT)) + return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of " + "siblings %d", smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = phys_pkg_id(index_msb); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); + } + out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } + + #endif + } + + static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) + { + char *v = c->x86_vendor_id; + int i; + static int printed; + + for (i = 0; i < X86_VENDOR_NUM; i++) { + if (cpu_devs[i]) { + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, 
cpu_devs[i]->c_ident[1]))) { + c->x86_vendor = i; + this_cpu = cpu_devs[i]; + return; + } + } + } + if (!printed) { + printed++; + printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: Your system may be unstable.\n"); + } + c->x86_vendor = X86_VENDOR_UNKNOWN; + } + + static void __init early_cpu_support_print(void) + { + int i,j; + struct cpu_dev *cpu_devx; + + printk("KERNEL supported cpus:\n"); + for (i = 0; i < X86_VENDOR_NUM; i++) { + cpu_devx = cpu_devs[i]; + if (!cpu_devx) + continue; + for (j = 0; j < 2; j++) { + if (!cpu_devx->c_ident[j]) + continue; + printk(" %s %s\n", cpu_devx->c_vendor, + cpu_devx->c_ident[j]); + } + } + } + + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); + + void __init early_cpu_init(void) + { + struct cpu_vendor_dev *cvdev; + + for (cvdev = __x86cpuvendor_start ; + cvdev < __x86cpuvendor_end ; + cvdev++) + cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + early_cpu_support_print(); + early_identify_cpu(&boot_cpu_data); + } + + /* Do some early cpuid on the boot CPU to get some parameter that are + needed before check_bugs. Everything advanced is in identify_cpu + below. */ + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) + { + u32 tfms, xlvl; + + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... 
*/ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; + c->extended_cpuid_level = 0; + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + /* Get vendor name */ + cpuid(0x00000000, (unsigned int *)&c->cpuid_level, + (unsigned int *)&c->x86_vendor_id[0], + (unsigned int *)&c->x86_vendor_id[8], + (unsigned int *)&c->x86_vendor_id[4]); + + get_cpu_vendor(c); + + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + __u32 misc; + cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], + &c->x86_capability[0]); + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; + if (c->x86 == 0xf) + c->x86 += (tfms >> 20) & 0xff; + if (c->x86 >= 0x6) + c->x86_model += ((tfms >> 16) & 0xF) << 4; + if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) + c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + } else { + /* Have CPUID level 0 only - unheard of */ + c->x86 = 4; + } + + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; + #ifdef CONFIG_SMP + c->phys_proc_id = c->initial_apicid; + #endif + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); + } + if (xlvl >= 0x80000004) + get_model_name(c); /* Default name */ + } + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + /* Don't set x86_cpuid_level here for now to not confuse. 
*/ + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } + ---- ----- --------- c->extended_cpuid_level = cpuid_eax(0x80000000); + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); + + + if (c->extended_cpuid_level >= 0x80000008) { + + + u32 eax = cpuid_eax(0x80000008); + + + + + + c->x86_virt_bits = (eax >> 8) & 0xff; + + + c->x86_phys_bits = eax & 0xff; + + + } + + ------- -- ----- -- /* Assume all 64-bit CPUs support 32-bit syscall */ ------- -- ----- -- set_cpu_cap(c, X86_FEATURE_SYSCALL32); ------- -- ----- -- + if (c->x86_vendor != X86_VENDOR_UNKNOWN && + cpu_devs[c->x86_vendor]->c_early_init) + cpu_devs[c->x86_vendor]->c_early_init(c); + + validate_pat_support(c); ---- ----- --------- ---- ----- --------- /* early_param could clear that, but recall get it set again */ ---- ----- --------- if (disable_apic) ---- ----- --------- clear_cpu_cap(c, X86_FEATURE_APIC); + } + + /* + * This does the hard work of actually picking apart the CPU stuff... + */ + static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) + { + int i; + + early_identify_cpu(c); + + init_scattered_cpuid_features(c); + + c->apicid = phys_pkg_id(0); + + /* + * Vendor-specific initialization. In this section we + * canonicalize the feature flags, meaning if there are + * features a certain CPU supports which CPUID doesn't + * tell us, CPUID claiming incorrect flags, or other bugs, + * we handle them here. + * + * At the end of this section, c->x86_capability better + * indicate the features this CPU genuinely supports! + */ + if (this_cpu->c_init) + this_cpu->c_init(c); + + detect_ht(c); + + /* + * On SMP, boot_cpu_data holds the common feature set between + * all CPUs; so make sure that we indicate which features are + * common between the CPUs. The first time this routine gets + * executed, c == &boot_cpu_data. 
+ */ + if (c != &boot_cpu_data) { + /* AND the already accumulated flags with these */ + for (i = 0; i < NCAPINTS; i++) + boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + } + + /* Clear all flags overriden by options */ + for (i = 0; i < NCAPINTS; i++) + c->x86_capability[i] &= ~cleared_cpu_caps[i]; + + #ifdef CONFIG_X86_MCE + mcheck_init(c); + #endif + select_idle_routine(c); + + #ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); + #endif + + } + + void __cpuinit identify_boot_cpu(void) + { + identify_cpu(&boot_cpu_data); + } + + void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) + { + BUG_ON(c == &boot_cpu_data); + identify_cpu(c); + mtrr_ap_init(); + } + + static __init int setup_noclflush(char *arg) + { + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; + } + __setup("noclflush", setup_noclflush); + + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) + { + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); + + if (c->x86_mask || c->cpuid_level >= 0) + printk(KERN_CONT " stepping %02x\n", c->x86_mask); + else + printk(KERN_CONT "\n"); + } + + static __init int setup_disablecpuid(char *arg) + { + int bit; + if (get_option(&arg, &bit) && bit < NCAPINTS*32) + setup_clear_cpu_cap(bit); + else + return 0; + return 1; + } + __setup("clearcpuid=", setup_disablecpuid); + + cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; + + struct x8664_pda **_cpu_pda __read_mostly; + EXPORT_SYMBOL(_cpu_pda); + + struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + + char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + + unsigned long __supported_pte_mask __read_mostly = ~0UL; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + + static int do_not_nx __cpuinitdata; + + /* noexec=on|off + Control non executable mappings for 64bit processes. 
+ + on Enable(default) + off Disable + */ + static int __init nonx_setup(char *str) + { + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; + } + early_param("noexec", nonx_setup); + + int force_personality32; + + /* noexec32=on|off + Control non executable heap for 32bit processes. + To control the stack too use noexec=off + + on PROT_READ does not imply PROT_EXEC for 32bit processes (default) + off PROT_READ implies PROT_EXEC + */ + static int __init nonx32_setup(char *str) + { + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; + } + __setup("noexec32=", nonx32_setup); + + void pda_init(int cpu) + { + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + loadsegment(fs, 0); + loadsegment(gs, 0); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + } else { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", cpu); + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } + + pda->irqstackptr += IRQSTACKSIZE-64; + } + + char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + ---------- -------- DEBUG_STKSZ] ---------- -------- __attribute__((section(".bss.page_aligned"))); +++++++++++++++++++ DEBUG_STKSZ] 
__page_aligned_bss; + + extern asmlinkage void ignore_sysret(void); + + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + + #ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); + #endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); + } + + void __cpuinit check_efer(void) + { + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; + } + + unsigned long kernel_eflags; + + /* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ + DEFINE_PER_CPU(struct orig_ist, orig_ist); + + /* + * cpu_init() initializes state that is per-CPU. Some data is already + * initialized (naturally) in the bootstrap process, such as the GDT + * and IDT. We reload them nevertheless, this function acts as a + * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init. 
+ */ + void __cpuinit cpu_init(void) + { + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); + else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + + /* + * set up and load the per-CPU TSS + */ + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception stack %ld %d\n", + v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. 
+ */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + + #ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { + #endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); + #ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } + #endif + + fpu_init(); + + raw_local_save_flags(kernel_eflags); + + if (is_uv_system()) + uv_cpu_init(); + } diff --cc arch/x86/kernel/entry_32.S index 6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,53393c306e11,6bc07f0f1202,6bc07f0f1202,c778e4fa55a2,6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,6bc07f0f1202,cadf73f70d33,cfe28a715434,6bc07f0f1202,ad5264c29e9b,6bc07f0f1202..cdfd94cc6b14 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@@@@@@@@@@@@@@@@@@@@ -51,8 -51,8 -51,8 -51,8 -51,8 -51,8 -51,8 -51,7 -51,8 -51,8 -51,7 -51,8 -51,8 -51,8 -51,8 -51,8 -51,7 -51,8 -51,8 -51,8 +51,8 @@@@@@@@@@@@@@@@@@@@@ #include #include #include - #include "irq_vectors.h" + + + #include + #include /* * We use macros for low-level operations which need to be overridden @@@@@@@@@@@@@@@@@@@@@ -1024,8 -1024,8 -1024,8 -1024,8 -1024,8 -1024,8 -1024,8 -1023,7 -1024,8 -1024,8 -1023,7 -1024,8 -1024,8 -1024,8 -1024,8 -1015,8 -1023,8 -1024,8 -1024,9 -1024,8 +1015,9 
@@@@@@@@@@@@@@@@@@@@@ ENDPROC(kernel_thread_helper ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ ++++++++++++++++++ + CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp + + CFI_ENDPROC ENTRY(xen_hypervisor_callback) CFI_STARTPROC diff --cc arch/x86/kernel/entry_64.S index ae63e584c340,ae63e584c340,ae63e584c340,ae63e584c340,ae63e584c340,ae63e584c340,ba41bf42748d,466b9284ed2f,ae63e584c340,ae63e584c340,556a8df522a7,ae63e584c340,ba41bf42748d,ae63e584c340,ae63e584c340,63001c6ecf6d,466b9284ed2f,ae63e584c340,ae63e584c340,80d5663db3bc..8410e26f4183 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@@@@@@@@@@@@@@@@@@@@ -349,8 -349,8 -349,8 -349,8 -349,8 -349,8 -349,8 -243,8 -349,8 -349,8 -244,7 -349,8 -349,8 -349,8 -349,8 -349,7 -243,8 -349,8 -349,8 -349,8 +349,7 @@@@@@@@@@@@@@@@@@@@@ ENTRY(system_call_after_swapgs movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) ---------- ---- ---- testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ ---------- ---- ---- TI_flags(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) +++++++++++++++ ++++ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys diff --cc arch/x86/kernel/nmi.c index ec024b3baad0,ec024b3baad0,384b49fed598,ec024b3baad0,ec024b3baad0,ec024b3baad0,716b89284be0,8dfe9db87a9e,ec024b3baad0,ec024b3baad0,000000000000,e0b44b7b717a,716b89284be0,ec024b3baad0,ec024b3baad0,ec024b3baad0,716b89284be0,ec024b3baad0,ec024b3baad0,ec024b3baad0..ac6d51222e7d mode 100644,100644,100644,100644,100644,100644,100644,100644,100644,100644,000000,100644,100644,100644,100644,100644,100644,100644,100644,100644..100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@@@@@@@@@@@@@@@@@@@@ -1,516 -1,516 -1,516 -1,516 -1,516 -1,516 -1,516 -1,513 -1,516 -1,516 -1,0 -1,523 -1,516 -1,516 -1,516 -1,516 -1,516 -1,516 -1,516 -1,516 +1,523 @@@@@@@@@@@@@@@@@@@@@ + 
/* + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + + #include + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include + + #include + + #include + + int unknown_nmi_panic; + int nmi_watchdog_enabled; + + static cpumask_t backtrace_mask = CPU_MASK_NONE; + + /* nmi_active: + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled + */ + atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ + EXPORT_SYMBOL(nmi_active); + + unsigned int nmi_watchdog = NMI_NONE; + EXPORT_SYMBOL(nmi_watchdog); + + static int panic_on_timeout; + + static unsigned int nmi_hz = HZ; + static DEFINE_PER_CPU(short, wd_enabled); + static int endflag __initdata; + + static inline unsigned int get_nmi_count(int cpu) + { + #ifdef CONFIG_X86_64 + return cpu_pda(cpu)->__nmi_count; + #else + return nmi_count(cpu); + #endif + } + + static inline int mce_in_progress(void) + { + #if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + return atomic_read(&mce_entry) > 0; + #endif + return 0; + } + + /* + * Take the local apic timer and PIT/HPET into account. 
We don't + * know which one is active, when we have highres/dyntick on + */ + static inline unsigned int get_timer_irqs(int cpu) + { + #ifdef CONFIG_X86_64 + return read_pda(apic_timer_irqs) + read_pda(irq0_irqs); + #else + return per_cpu(irq_stat, cpu).apic_timer_irqs + + per_cpu(irq_stat, cpu).irq0_irqs; + #endif + } + + #ifdef CONFIG_SMP + /* + * The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ + static __init void nmi_cpu_busy(void *data) + { + local_irq_enable_in_hardirq(); + /* + * Intentionally don't use cpu_relax here. This is + * to make sure that the performance counter really ticks, + * even if there is a simulator or similar that catches the + * pause instruction. On a real HT machine this is fine because + * all other CPUs are busy with "useless" delay loops and don't + * care if they get somewhat less cycles. + */ + while (endflag == 0) + mb(); + } + #endif + + int __init check_nmi_watchdog(void) + { + unsigned int *prev_nmi_count; + int cpu; + + if (!nmi_watchdog_active() || !atomic_read(&nmi_active)) + return 0; + + prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) + goto error; + + printk(KERN_INFO "Testing NMI watchdog ... 
"); + + #ifdef CONFIG_SMP + if (nmi_watchdog == NMI_LOCAL_APIC) -- - - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); ++ + + + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); + #endif + + for_each_possible_cpu(cpu) + prev_nmi_count[cpu] = get_nmi_count(cpu); + local_irq_enable(); + mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ + + for_each_online_cpu(cpu) { + if (!per_cpu(wd_enabled, cpu)) + continue; + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk(KERN_WARNING "WARNING: CPU#%d: NMI " + "appears to be stuck (%d->%d)!\n", + cpu, + prev_nmi_count[cpu], + get_nmi_count(cpu)); + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); + } + } + endflag = 1; + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + goto error; + } + printk("OK.\n"); + + /* + * now that we know it works we can reduce NMI frequency to + * something more reasonable; makes a difference in some configs + */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); + + kfree(prev_nmi_count); + return 0; + error: + if (nmi_watchdog == NMI_IO_APIC && !timer_through_8259) + disable_8259A_irq(0); + + #ifdef CONFIG_X86_32 + + timer_ack = 0; + + #endif + return -1; + } + + static int __init setup_nmi_watchdog(char *str) + { + unsigned int nmi; + + if (!strncmp(str, "panic", 5)) { + panic_on_timeout = 1; + str = strchr(str, ','); + if (!str) + return 1; + ++str; + } + + get_option(&str, &nmi); + + if (nmi >= NMI_INVALID) + return 0; + + nmi_watchdog = nmi; + return 1; + } + __setup("nmi_watchdog=", setup_nmi_watchdog); + + /* + * Suspend/resume support + */ + #ifdef CONFIG_PM + + static int nmi_pm_active; /* nmi_active before suspend */ + + static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) + { + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); + return 0; + } + + static 
int lapic_nmi_resume(struct sys_device *dev) + { + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } + return 0; + } + + static struct sysdev_class nmi_sysclass = { + .name = "lapic_nmi", + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, + }; + + static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, + }; + + static int __init init_lapic_nmi_sysfs(void) + { + int error; + + /* + * should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if (atomic_read(&nmi_active) < 0) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; + } + + /* must come after the local APIC's device_initcall() */ + late_initcall(init_lapic_nmi_sysfs); + + #endif /* CONFIG_PM */ + + static void __acpi_nmi_enable(void *__unused) + { -- ------- --------- apic_write_around(APIC_LVT0, APIC_DM_NMI); ++ +++++++++++++++++ apic_write(APIC_LVT0, APIC_DM_NMI); + } + + /* + * Enable timer based NMIs on all CPUs: + */ + void acpi_nmi_enable(void) + { + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) -- - - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); ++ + + + on_each_cpu(__acpi_nmi_enable, NULL, 1); + } + + static void __acpi_nmi_disable(void *__unused) + { -- ------- --------- apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); ++ +++++++++++++++++ apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + } + + /* + * Disable timer based NMIs on all CPUs: + */ + void acpi_nmi_disable(void) + { + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) -- - - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); ++ + + + on_each_cpu(__acpi_nmi_disable, NULL, 1); + } + + void setup_apic_nmi_watchdog(void *unused) + { + if (__get_cpu_var(wd_enabled)) + return; + + /* cheap hack to support 
suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) + return; + + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + /* enable it before to avoid race with handler */ + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; + return; + } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + } + } + + void stop_apic_nmi_watchdog(void *unused) + { + /* only support LOCAL and IO APICs for now */ + if (!nmi_watchdog_active()) + return; + if (__get_cpu_var(wd_enabled) == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; + atomic_dec(&nmi_active); + } + + /* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up here too!] + */ + + static DEFINE_PER_CPU(unsigned, last_irq_sum); + static DEFINE_PER_CPU(local_t, alert_counter); + static DEFINE_PER_CPU(int, nmi_touch); + + void touch_nmi_watchdog(void) + { + if (nmi_watchdog_active()) { + unsigned cpu; + + /* + * Tell other CPUs to reset their alert counters. We cannot + * do it ourselves because the alert count increase is not + * atomic. 
+ */ + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } + } + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); + } + EXPORT_SYMBOL(touch_nmi_watchdog); + + notrace __kprobes int + nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) + { + /* + * Since current_thread_info()-> is always on the stack, and we + * always switch the stack NMI-atomically, it's safe to use + * smp_processor_id(). + */ + unsigned int sum; + int touched = 0; + int cpu = smp_processor_id(); + int rc = 0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } + + sum = get_timer_irqs(cpu); + + if (__get_cpu_var(nmi_touch)) { + __get_cpu_var(nmi_touch) = 0; + touched = 1; + } + + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + + /* Could check oops_in_progress here too, but it's safer not to */ + if (mce_in_progress()) + touched = 1; + + /* if the none of the timers isn't firing, this cpu isn't doing much */ + if (!touched && __get_cpu_var(last_irq_sum) == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + local_inc(&__get_cpu_var(alert_counter)); + if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. 
+ */ + die_nmi("BUG: NMI Watchdog detected LOCKUP", + regs, panic_on_timeout); + } else { + __get_cpu_var(last_irq_sum) = sum; + local_set(&__get_cpu_var(alert_counter), 0); + } + + /* see if the nmi watchdog went off */ + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* + * don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; + } + return rc; + } + + #ifdef CONFIG_SYSCTL + +++++++++++ ++++++++static int __init setup_unknown_nmi_panic(char *str) +++++++++++ ++++++++{ +++++++++++ ++++++++ unknown_nmi_panic = 1; +++++++++++ ++++++++ return 1; +++++++++++ ++++++++} +++++++++++ ++++++++__setup("unknown_nmi_panic", setup_unknown_nmi_panic); +++++++++++ ++++++++ + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) + { + unsigned char reason = get_nmi_reason(); + char buf[64]; + + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(buf, regs, 1); /* Always panic here */ + return 0; + } + + /* + * proc handler for /proc/sys/kernel/nmi + */ + int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) + { + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 
1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) { + printk(KERN_WARNING + "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else { + printk(KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; + } + + #endif /* CONFIG_SYSCTL */ + + int do_nmi_callback(struct pt_regs *regs, int cpu) + { + #ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); + #endif + return 0; + } + + void __trigger_all_cpu_backtrace(void) + { + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } + } diff --cc arch/x86/kernel/paravirt.c index e0f571d58c19,e0f571d58c19,5d7326a60b7c,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,74f0c5ea2a03,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,e0f571d58c19,2963ab5d91ee..b4564d089b43 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@@@@@@@@@@@@@@@@@@@@ -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,7 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,8 -29,9 +29,9 @@@@@@@@@@@@@@@@@@@@@ #include #include #include +++++++++++++++++++ #include #include + #include #include #include #include diff --cc arch/x86/kernel/pci-dma.c index 
8467ec2320f1,8467ec2320f1,8467ec2320f1,702714bd1511,8467ec2320f1,8467ec2320f1,8467ec2320f1,d12945de0565,8467ec2320f1,8467ec2320f1,dc00a1331ace,8467ec2320f1,8467ec2320f1,8467ec2320f1,8467ec2320f1,8467ec2320f1,8467ec2320f1,8467ec2320f1,8467ec2320f1,8467ec2320f1..a4213c00dffc --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@@@@@@@@@@@@@@@@@@@@ -5,12 -5,12 -5,12 -5,11 -5,12 -5,12 -5,12 -5,12 -5,12 -5,12 -5,11 -5,12 -5,12 -5,12 -5,12 -5,12 -5,12 -5,12 -5,12 -5,12 +5,11 @@@@@@@@@@@@@@@@@@@@@ #include #include ------- ------------#include +++++++ ++++++++++++#include #include + #include --- ----------------int forbid_dac __read_mostly; --- ----------------EXPORT_SYMBOL(forbid_dac); +++ ++++++++++++++++static int forbid_dac __read_mostly; const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); @@@@@@@@@@@@@@@@@@@@@ -123,12 -123,12 -123,12 -122,12 -123,12 -123,12 -123,12 -120,9 -123,12 -123,12 -121,10 -123,12 -123,12 -123,12 -123,12 -123,12 -123,12 -123,12 -123,12 -123,12 +119,9 @@@@@@@@@@@@@@@@@@@@@ void __init pci_iommu_alloc(void detect_intel_iommu(); - #ifdef CONFIG_SWIOTLB + amd_iommu_detect(); + ------- -- ---------#ifdef CONFIG_SWIOTLB pci_swiotlb_init(); ------- ------------#endif } #endif @@@@@@@@@@@@@@@@@@@@@ -505,12 -505,12 -505,12 -504,12 -505,12 -505,12 -505,12 -496,9 -505,12 -505,12 -501,10 -505,12 -505,12 -505,12 -505,12 -505,12 -505,12 -505,12 -505,12 -505,12 +495,9 @@@@@@@@@@@@@@@@@@@@@ static int __init pci_iommu_init(void intel_iommu_init(); - #ifdef CONFIG_GART_IOMMU + amd_iommu_init(); + ------- -- ---------#ifdef CONFIG_GART_IOMMU gart_iommu_init(); ------- ------------#endif no_iommu_init(); return 0; diff --cc arch/x86/kernel/process.c index 4d629c62f4f8,4d629c62f4f8,4d629c62f4f8,9f94bb1c8117,4d629c62f4f8,74f2d196adb4,4061d63aabe7,4061d63aabe7,7dceea947232,4d629c62f4f8,ba370dc8685b,4d629c62f4f8,4061d63aabe7,7dceea947232,4d629c62f4f8,4d629c62f4f8,4061d63aabe7,4d629c62f4f8,4d629c62f4f8,4d629c62f4f8..7fc4d5b0a6a0 --- 
a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@@@@@@@@@@@@@@@@@@@@ -6,15 -6,15 -6,15 -6,15 -6,15 -6,16 -6,9 -6,9 -6,9 -6,15 -6,8 -6,15 -6,9 -6,9 -6,15 -6,15 -6,9 -6,15 -6,15 -6,15 +6,16 @@@@@@@@@@@@@@@@@@@@@ #include #include #include + #include +++ + ++ + #include +++ + ++ + +++ + ++ + unsigned long idle_halt; +++ + ++ + EXPORT_SYMBOL(idle_halt); +++ + ++ + unsigned long idle_nomwait; +++ + ++ + EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +++++ ++++++++++++++static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { @@@@@@@@@@@@@@@@@@@@@ -199,15 -199,15 -199,15 -199,16 -199,15 -200,15 -193,15 -193,15 -193,15 -199,15 -122,8 -199,15 -193,15 -193,15 -199,15 -199,15 -193,15 -199,15 -199,15 -199,15 +200,16 @@@@@@@@@@@@@@@@@@@@@ static void poll_idle(void * * idle=mwait overrides this decision and forces the usage of mwait. */ +++ ++++++++++++++++static int __cpuinitdata force_mwait; + + #define MWAIT_INFO 0x05 + #define MWAIT_ECX_EXTENDED_INFO 0x01 + #define MWAIT_EDX_C1 0xf0 + static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) { + u32 eax, ebx, ecx, edx; + if (force_mwait) return 1; diff --cc arch/x86/kernel/setup.c index 531b55b8e81a,531b55b8e81a,531b55b8e81a,4064616cfa85,531b55b8e81a,531b55b8e81a,36c540d4ac4b,e5d208934bfc,36c540d4ac4b,531b55b8e81a,6f80b852a196,531b55b8e81a,36c540d4ac4b,36c540d4ac4b,531b55b8e81a,531b55b8e81a,987b6fde3a99,531b55b8e81a,531b55b8e81a,c9010f82141d..ec952aa5394a --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@@@@@@@@@@@@@@@@@@@@ -1,894 -1,894 -1,894 -1,885 -1,894 -1,894 -1,889 -1,880 -1,889 -1,894 -1,139 -1,894 -1,889 -1,889 -1,894 -1,894 -1,881 -1,894 -1,894 -1,897 +1,888 @@@@@@@@@@@@@@@@@@@@@ - #include + /* + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons , July-August 1999 + * + * Added E820 sanitization 
routine (removes overlapping memory regions); + * Brian Moyle , February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel , March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * + */ + + /* + * This file handles the architecture-dependent parts of initialization + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include #include + #include #include - #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + #include + #include + #include + #include + #include + #include + #include + #include --- ------ ---------#include + + #include --- ------ ---------#include --- ------ ---------#include --- ------ ---------#include + #include + #include + #include + #include + #include - #include - #include + #include + + #include