X-Git-Url: http://demsky.eecs.uci.edu/git/?a=blobdiff_plain;f=arch%2Fx86%2Fkvm%2Fmmu.c;h=3e893cd9038911e888b03eeab666801256e3e086;hb=f8f559422b6c6a05469dfde614b67789b6142cb5;hp=004cc87b781c2694a0f6428ef8b6614ff60711b7;hpb=e15e6119062d20cc96f95c8b345e361589a90244;p=firefly-linux-kernel-4.4.55.git diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 004cc87b781c..3e893cd90389 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -197,15 +197,60 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask) } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); -static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access) +/* + * spte bits of bit 3 ~ bit 11 are used as low 9 bits of generation number, + * the bits of bits 52 ~ bit 61 are used as high 10 bits of generation + * number. + */ +#define MMIO_SPTE_GEN_LOW_SHIFT 3 +#define MMIO_SPTE_GEN_HIGH_SHIFT 52 + +#define MMIO_GEN_SHIFT 19 +#define MMIO_GEN_LOW_SHIFT 9 +#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 1) +#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1) +#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1) + +static u64 generation_mmio_spte_mask(unsigned int gen) +{ + u64 mask; + + WARN_ON(gen > MMIO_MAX_GEN); + + mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT; + mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT; + return mask; +} + +static unsigned int get_mmio_spte_generation(u64 spte) +{ + unsigned int gen; + + spte &= ~shadow_mmio_mask; + + gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK; + gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT; + return gen; +} + +static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +{ + return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; +} + +static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, + unsigned access) { struct kvm_mmu_page *sp = page_header(__pa(sptep)); + unsigned int gen = kvm_current_mmio_generation(kvm); + u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; - + mask |= shadow_mmio_mask | access | gfn << PAGE_SHIFT; sp->mmio_cached = true; - trace_mark_mmio_spte(sptep, gfn, access); - mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT); + + trace_mark_mmio_spte(sptep, gfn, access, gen); + mmu_spte_set(sptep, mask); } static bool is_mmio_spte(u64 spte) @@ -215,24 +260,33 @@ static bool is_mmio_spte(u64 spte) static gfn_t get_mmio_spte_gfn(u64 spte) { - return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) >> PAGE_SHIFT; } static unsigned get_mmio_spte_access(u64 spte) { - return (spte & ~shadow_mmio_mask) & ~PAGE_MASK; + u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask; + return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) +static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } return false; } +static bool check_mmio_spte(struct kvm *kvm, u64 spte) +{ + return likely(get_mmio_spte_generation(spte) == + kvm_current_mmio_generation(kvm)); +} + static inline u64 rsvd_bits(int s, int e) { return ((1ULL << (e - s + 1)) - 1) << s; @@ -1511,6 +1565,12 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, if (!direct) sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); + + /* + * The active_mmu_pages list is the FIFO list, do not move the + * page until it is zapped. kvm_zap_obsolete_pages depends on + * this feature. See the comments in kvm_zap_obsolete_pages(). + */ list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); sp->parent_ptes = 0; mmu_page_add_parent_pte(vcpu, sp, parent_pte); @@ -1648,6 +1708,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); +/* + * NOTE: we should pay more attention on the zapped-obsolete page + * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk + * since it has been deleted from active_mmu_pages but still can be found + * at hast list. + * + * for_each_gfn_indirect_valid_sp has skipped that kind of page and + * kvm_mmu_get_page(), the only user of for_each_gfn_sp(), has skipped + * all the obsolete pages. + */ #define for_each_gfn_sp(_kvm, _sp, _gfn) \ hlist_for_each_entry(_sp, \ &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ @@ -1838,6 +1908,11 @@ static void clear_sp_write_flooding_count(u64 *spte) __clear_sp_write_flooding_count(sp); } +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -1864,6 +1939,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, role.quadrant = quadrant; } for_each_gfn_sp(vcpu->kvm, sp, gfn) { + if (is_obsolete_sp(vcpu->kvm, sp)) + continue; + if (!need_sync && sp->unsync) need_sync = true; @@ -1900,6 +1978,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, account_shadowed(vcpu->kvm, gfn); } + sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); trace_kvm_mmu_get_page(sp, true); return sp; @@ -2070,8 +2149,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, ret = mmu_zap_unsync_children(kvm, sp, invalid_list); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + if (!sp->role.invalid && !sp->role.direct) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { @@ -2081,7 +2162,13 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mod_used_mmu_pages(kvm, -1); } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); - kvm_reload_remote_mmus(kvm); + + /* + * The obsolete pages can not be used on any vcpus. + * See the comments in kvm_mmu_invalidate_zap_all_pages(). + */ + if (!sp->role.invalid && !is_obsolete_sp(kvm, sp)) + kvm_reload_remote_mmus(kvm); } sp->role.invalid = 1; @@ -2331,7 +2418,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2869,22 +2956,25 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; - spin_lock(&vcpu->kvm->mmu_lock); + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL && (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL || vcpu->arch.mmu.direct_map)) { hpa_t root = vcpu->arch.mmu.root_hpa; + spin_lock(&vcpu->kvm->mmu_lock); sp = page_header(root); --sp->root_count; if (!sp->root_count && sp->role.invalid) { kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } - vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.root_hpa = INVALID_PAGE; return; } + + spin_lock(&vcpu->kvm->mmu_lock); for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -3148,17 +3238,12 @@ static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) return spte; } -/* - * If it is a real mmio page fault, return 1 and emulat the instruction - * directly, return 0 to let CPU fault again on the address, -1 is - * returned if bug is detected. - */ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; if (quickly_check_mmio_pf(vcpu, addr, direct)) - return 1; + return RET_MMIO_PF_EMULATE; spte = walk_shadow_page_get_mmio_spte(vcpu, addr); @@ -3166,12 +3251,15 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); + if (!check_mmio_spte(vcpu->kvm, spte)) + return RET_MMIO_PF_INVALID; + if (direct) addr = 0; trace_handle_mmio_page_fault(addr, gfn, access); vcpu_cache_mmio_info(vcpu, addr, gfn, access); - return 1; + return RET_MMIO_PF_EMULATE; } /* @@ -3179,13 +3267,13 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) * it's a BUG if the gfn is not a mmio page. */ if (direct && !check_direct_spte_mmio_pf(spte)) - return -1; + return RET_MMIO_PF_BUG; /* * If the page table is zapped by other cpus, let CPU fault again on * the address. */ - return 0; + return RET_MMIO_PF_RETRY; } EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); @@ -3195,7 +3283,7 @@ static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, int ret; ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret < 0); + WARN_ON(ret == RET_MMIO_PF_BUG); return ret; } @@ -3207,8 +3295,12 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gva, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gva, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -3284,8 +3376,12 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); - if (unlikely(error_code & PFERR_RSVD_MASK)) - return handle_mmio_page_fault(vcpu, gpa, error_code, true); + if (unlikely(error_code & PFERR_RSVD_MASK)) { + r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + + if (likely(r != RET_MMIO_PF_INVALID)) + return r; + } r = mmu_topup_memory_caches(vcpu); if (r) @@ -3391,8 +3487,8 @@ static inline void protect_clean_gpte(unsigned *access, unsigned gpte) *access &= mask; } -static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, - int *nr_present) +static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, + unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { if (gfn != get_mmio_spte_gfn(*sptep)) { @@ -3401,7 +3497,7 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, } (*nr_present)++; - mark_mmio_spte(sptep, gfn, access); + mark_mmio_spte(kvm, sptep, gfn, access); return true; } @@ -3764,9 +3860,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; r = mmu_alloc_roots(vcpu); - spin_lock(&vcpu->kvm->mmu_lock); - mmu_sync_roots(vcpu); - spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_sync_roots(vcpu); if (r) goto out; /* set_cr3() should ensure TLB has been flushed */ @@ -4179,22 +4273,88 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_all(struct kvm *kvm) +#define BATCH_ZAP_PAGES 10 +static void kvm_zap_obsolete_pages(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; - LIST_HEAD(invalid_list); + int batch = 0; - spin_lock(&kvm->mmu_lock); restart: - list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) + list_for_each_entry_safe_reverse(sp, node, + &kvm->arch.active_mmu_pages, link) { + int ret; + + /* + * No obsolete page exists before new created page since + * active_mmu_pages is the FIFO list. + */ + if (!is_obsolete_sp(kvm, sp)) + break; + + /* + * Since we are reversely walking the list and the invalid + * list will be moved to the head, skip the invalid page + * can help us to avoid the infinity list walking. + */ + if (sp->role.invalid) + continue; + + /* + * Need not flush tlb since we only zap the sp with invalid + * generation number. + */ + if (batch >= BATCH_ZAP_PAGES && + cond_resched_lock(&kvm->mmu_lock)) { + batch = 0; goto restart; + } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + ret = kvm_mmu_prepare_zap_page(kvm, sp, + &kvm->arch.zapped_obsolete_pages); + batch += ret; + + if (ret) + goto restart; + } + + /* + * Should flush tlb before free page tables since lockless-walking + * may use the pages. + */ + kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages); +} + +/* + * Fast invalidate all shadow pages and use lock-break technique + * to zap obsolete pages. + * + * It's required when memslot is being deleted or VM is being + * destroyed, in these cases, we should ensure that KVM MMU does + * not use any resource of the being-deleted slot or all slots + * after calling the function. + */ +void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm) +{ + spin_lock(&kvm->mmu_lock); + trace_kvm_mmu_invalidate_zap_all_pages(kvm); + kvm->arch.mmu_valid_gen++; + + /* + * Notify all vcpus to reload its shadow page table + * and flush TLB. Then all vcpus will switch to new + * shadow page table with the new mmu_valid_gen. + * + * Note: we should do this under the protection of + * mmu-lock, otherwise, vcpu would purge shadow page + * but miss tlb flush. + */ + kvm_reload_remote_mmus(kvm); + + kvm_zap_obsolete_pages(kvm); spin_unlock(&kvm->mmu_lock); } -void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) +static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm) { struct kvm_mmu_page *sp, *node; LIST_HEAD(invalid_list); @@ -4212,6 +4372,24 @@ restart: spin_unlock(&kvm->mmu_lock); } +static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) +{ + return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); +} + +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +{ + /* + * The very rare case: if the generation-number is round, + * zap all shadow pages. + * + * The max value is MMIO_MAX_GEN - 1 since it is not called + * when mark memslot invalid. + */ + if (unlikely(kvm_current_mmio_generation(kvm) >= (MMIO_MAX_GEN - 1))) + kvm_mmu_zap_mmio_sptes(kvm); +} + static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; @@ -4240,15 +4418,23 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) * want to shrink a VM that only started to populate its MMU * anyway. */ - if (!kvm->arch.n_used_mmu_pages) + if (!kvm->arch.n_used_mmu_pages && + !kvm_has_zapped_obsolete_pages(kvm)) continue; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); + if (kvm_has_zapped_obsolete_pages(kvm)) { + kvm_mmu_commit_zap_page(kvm, + &kvm->arch.zapped_obsolete_pages); + goto unlock; + } + prepare_zap_oldest_mmu_page(kvm, &invalid_list); kvm_mmu_commit_zap_page(kvm, &invalid_list); +unlock: spin_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, idx);