Merge branch '81xx' into omap-for-v4.4/fixes
[firefly-linux-kernel-4.4.55.git] arch/x86/kvm/vmx.c
index 6a8bc64566abde57f8914f103a6b5d9d49ed8ae8..87acc5221740a588d256f9f0ed4059d366165872 100644
@@ -35,6 +35,7 @@
 #include "kvm_cache_regs.h"
 #include "x86.h"
 
+#include <asm/cpu.h>
 #include <asm/io.h>
 #include <asm/desc.h>
 #include <asm/vmx.h>
@@ -45,6 +46,7 @@
 #include <asm/debugreg.h>
 #include <asm/kexec.h>
 #include <asm/apic.h>
+#include <asm/irq_remapping.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -105,6 +107,8 @@ static u64 __read_mostly host_xss;
 static bool __read_mostly enable_pml = 1;
 module_param_named(pml, enable_pml, bool, S_IRUGO);
 
+#define KVM_VMX_TSC_MULTIPLIER_MAX     0xffffffffffffffffULL
+
 #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
 #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
 #define KVM_VM_CR0_ALWAYS_ON                                           \
@@ -424,6 +428,9 @@ struct nested_vmx {
        /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
        u64 vmcs01_debugctl;
 
+       u16 vpid02;
+       u16 last_vpid;
+
        u32 nested_vmx_procbased_ctls_low;
        u32 nested_vmx_procbased_ctls_high;
        u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +447,33 @@ struct nested_vmx {
        u32 nested_vmx_misc_low;
        u32 nested_vmx_misc_high;
        u32 nested_vmx_ept_caps;
+       u32 nested_vmx_vpid_caps;
 };
 
 #define POSTED_INTR_ON  0
+#define POSTED_INTR_SN  1
+
 /* Posted-Interrupt Descriptor */
 struct pi_desc {
        u32 pir[8];     /* Posted interrupt requested */
-       u32 control;    /* bit 0 of control is outstanding notification bit */
-       u32 rsvd[7];
+       union {
+               struct {
+                               /* bit 256 - Outstanding Notification */
+                       u16     on      : 1,
+                               /* bit 257 - Suppress Notification */
+                               sn      : 1,
+                               /* bit 271:258 - Reserved */
+                               rsvd_1  : 14;
+                               /* bit 279:272 - Notification Vector */
+                       u8      nv;
+                               /* bit 287:280 - Reserved */
+                       u8      rsvd_2;
+                               /* bit 319:288 - Notification Destination */
+                       u32     ndst;
+               };
+               u64 control;
+       };
+       u32 rsvd[6];
 } __aligned(64);
 
 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
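/*
 * Editor's sketch (illustration only, not part of the patch): how the new
 * 64-bit 'control' word of struct pi_desc above maps onto the ON/SN/NV/NDST
 * fields.  Bit numbers are relative to 'control', i.e. descriptor bit 256
 * is control bit 0.  Standalone C, compilable outside the kernel.
 */
#include <stdint.h>
#include <stdio.h>

#define PI_CTRL_ON(c)    (((c) >> 0) & 0x1)              /* Outstanding Notification */
#define PI_CTRL_SN(c)    (((c) >> 1) & 0x1)              /* Suppress Notification */
#define PI_CTRL_NV(c)    ((uint8_t)(((c) >> 16) & 0xff)) /* Notification Vector */
#define PI_CTRL_NDST(c)  ((uint32_t)((c) >> 32))         /* Notification Destination */

int main(void)
{
	/* ON set, notification vector 0xf2, destination APIC ID 3 */
	uint64_t control = (1ULL << 0) | (0xf2ULL << 16) | (3ULL << 32);

	printf("on=%u sn=%u nv=0x%02x ndst=%u\n",
	       (unsigned)PI_CTRL_ON(control), (unsigned)PI_CTRL_SN(control),
	       (unsigned)PI_CTRL_NV(control), (unsigned)PI_CTRL_NDST(control));
	return 0;
}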
@@ -467,6 +493,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
        return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 }
 
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+       return clear_bit(POSTED_INTR_SN,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+       return set_bit(POSTED_INTR_SN,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+       return test_bit(POSTED_INTR_ON,
+                       (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+       return test_bit(POSTED_INTR_SN,
+                       (unsigned long *)&pi_desc->control);
+}
+
 struct vcpu_vmx {
        struct kvm_vcpu       vcpu;
        unsigned long         host_rsp;
@@ -532,8 +582,6 @@ struct vcpu_vmx {
        s64 vnmi_blocked_time;
        u32 exit_reason;
 
-       bool rdtscp_enabled;
-
        /* Posted interrupt descriptor */
        struct pi_desc pi_desc;
 
@@ -563,6 +611,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
        return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+       return &(to_vmx(vcpu)->pi_desc);
+}
+
 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
 #define FIELD(number, name)    [number] = VMCS12_OFFSET(name)
 #define FIELD64(number, name)  [number] = VMCS12_OFFSET(name), \
@@ -809,7 +862,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
-static int vmx_vm_has_apicv(struct kvm *kvm);
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
                            struct kvm_segment *var, int seg);
@@ -831,6 +884,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
+/*
+ * We maintain a per-CPU list of vCPUs, so in wakeup_handler() we
+ * can find which vCPU should be woken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1006,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
        return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
 {
-       return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+       return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
 }
 
 static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1043,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 
 static inline bool cpu_has_vmx_posted_intr(void)
 {
-       return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+       return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+               vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
 }
 
 static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1123,9 @@ static inline bool cpu_has_vmx_ple(void)
                SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 {
-       return flexpriority_enabled && irqchip_in_kernel(kvm);
+       return flexpriority_enabled && lapic_in_kernel(vcpu);
 }
 
 static inline bool cpu_has_vmx_vpid(void)
@@ -1113,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void)
        return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
 }
 
+static inline bool cpu_has_vmx_tsc_scaling(void)
+{
+       return vmcs_config.cpu_based_2nd_exec_ctrl &
+               SECONDARY_EXEC_TSC_SCALING;
+}
+
 static inline bool report_flexpriority(void)
 {
        return flexpriority_enabled;
@@ -1157,6 +1224,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }
 
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+       return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
 {
        return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1409,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
                         __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(int vpid)
 {
-       if (vmx->vpid == 0)
+       if (vpid == 0)
                return;
 
        if (cpu_has_vmx_invvpid_single())
-               __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+               __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
 }
 
 static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1424,10 @@ static inline void vpid_sync_vcpu_global(void)
                __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 }
 
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+static inline void vpid_sync_context(int vpid)
 {
        if (cpu_has_vmx_invvpid_single())
-               vpid_sync_vcpu_single(vmx);
+               vpid_sync_vcpu_single(vpid);
        else
                vpid_sync_vcpu_global();
 }
@@ -1567,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
        u32 eb;
 
        eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
-            (1u << NM_VECTOR) | (1u << DB_VECTOR);
+            (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
        if ((vcpu->guest_debug &
             (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
            (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1895,6 +1967,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
        preempt_enable();
 }
 
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct pi_desc old, new;
+       unsigned int dest;
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP))
+               return;
+
+       do {
+               old.control = new.control = pi_desc->control;
+
+               /*
+                * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+                * are two possible cases:
+                * 1. After running 'pre_block', a context switch
+                *    happened. In this case, 'sn' was set in
+                *    vmx_vcpu_put(), so we need to clear it here.
+                * 2. After running 'pre_block', we were blocked and
+                *    then woken up by someone else. In this case we
+                *    don't need to do anything: 'pi_post_block' will
+                *    do everything for us. However, we cannot tell
+                *    whether it is case #1 or case #2 here (and we may
+                *    not need to), so we also clear 'sn' here, which
+                *    should be harmless.
+                */
+               if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+                       if (vcpu->cpu != cpu) {
+                               dest = cpu_physical_id(cpu);
+
+                               if (x2apic_enabled())
+                                       new.ndst = dest;
+                               else
+                                       new.ndst = (dest << 8) & 0xFF00;
+                       }
+
+                       /* set 'NV' to 'notification vector' */
+                       new.nv = POSTED_INTR_VECTOR;
+               }
+
+               /* Allow posting non-urgent interrupts */
+               new.sn = 0;
+       } while (cmpxchg(&pi_desc->control, old.control,
+                       new.control) != old.control);
+}
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
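/*
 * Editor's sketch (illustration only, not part of the patch): the lock-free
 * update pattern used by vmx_vcpu_pi_load() above, expressed with C11
 * atomics instead of the kernel's cmpxchg().  Read 'control', build the new
 * value, and retry if another agent (another CPU or the IOMMU) changed it
 * in the meantime.
 */
#include <stdatomic.h>
#include <stdint.h>

static void pi_update_control(_Atomic uint64_t *control,
			      uint64_t clear_bits, uint64_t set_bits)
{
	uint64_t old = atomic_load(control);
	uint64_t new;

	do {
		new = (old & ~clear_bits) | set_bits;
		/* On failure, 'old' is refreshed with the current value. */
	} while (!atomic_compare_exchange_weak(control, &old, new));
}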
@@ -1943,12 +2061,35 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
                rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
                vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
+
+               /* Setup TSC multiplier */
+               if (cpu_has_vmx_tsc_scaling())
+                       vmcs_write64(TSC_MULTIPLIER,
+                                    vcpu->arch.tsc_scaling_ratio);
+
                vmx->loaded_vmcs->cpu = cpu;
        }
+
+       vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP))
+               return;
+
+       /* Set SN when the vCPU is preempted */
+       if (vcpu->preempted)
+               pi_set_sn(pi_desc);
 }
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
+       vmx_vcpu_pi_put(vcpu);
+
        __vmx_load_host_state(to_vmx(vcpu));
        if (!vmm_exclusive) {
                __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2348,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
                if (index >= 0)
                        move_msr_up(vmx, index, save_nmsrs++);
                index = __find_msr_index(vmx, MSR_TSC_AUX);
-               if (index >= 0 && vmx->rdtscp_enabled)
+               if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
                        move_msr_up(vmx, index, save_nmsrs++);
                /*
                 * MSR_STAR is only needed on long mode guests, and only
@@ -2230,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 
 /*
  * reads and returns guest's timestamp counter "register"
- * guest_tsc = host_tsc + tsc_offset    -- 21.3
+ * guest_tsc = ((host_tsc * tsc_multiplier) >> 48) + tsc_offset
+ * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3
  */
-static u64 guest_read_tsc(void)
+static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
 {
        u64 host_tsc, tsc_offset;
 
        host_tsc = rdtsc();
        tsc_offset = vmcs_read64(TSC_OFFSET);
-       return host_tsc + tsc_offset;
+       return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset;
 }
 
 /*
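/*
 * Editor's sketch (illustration only, not part of the patch): the arithmetic
 * behind the comment above, i.e. what kvm_scale_tsc() is expected to compute
 * with a 48-bit fractional multiplier (kvm_tsc_scaling_ratio_frac_bits is
 * set to 48 in hardware_setup() below).  Shown with unsigned __int128 rather
 * than the kernel's 64x64->128 multiply helpers.
 */
#include <stdint.h>

static uint64_t scaled_guest_tsc(uint64_t host_tsc, uint64_t tsc_multiplier,
				 uint64_t tsc_offset)
{
	unsigned __int128 scaled = (unsigned __int128)host_tsc * tsc_multiplier;

	return (uint64_t)(scaled >> 48) + tsc_offset;
}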
@@ -2255,22 +2397,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
        return host_tsc + tsc_offset;
 }
 
-/*
- * Engage any workarounds for mis-matched TSC rates.  Currently limited to
- * software catchup for faster rates on slower CPUs.
- */
-static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
-{
-       if (!scale)
-               return;
-
-       if (user_tsc_khz > tsc_khz) {
-               vcpu->arch.tsc_catchup = 1;
-               vcpu->arch.tsc_always_catchup = 1;
-       } else
-               WARN(1, "user requested TSC rate below hardware speed\n");
-}
-
 static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
 {
        return vmcs_read64(TSC_OFFSET);
@@ -2302,7 +2428,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
        }
 }
 
-static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host)
+static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
 {
        u64 offset = vmcs_read64(TSC_OFFSET);
 
@@ -2315,11 +2441,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
                                           offset + adjustment);
 }
 
-static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
-{
-       return target_tsc - rdtsc();
-}
-
 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2377,7 +2498,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
        vmx->nested.nested_vmx_pinbased_ctls_high |=
                PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
                PIN_BASED_VMX_PREEMPTION_TIMER;
-       if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+       if (vmx_cpu_uses_apicv(&vmx->vcpu))
                vmx->nested.nested_vmx_pinbased_ctls_high |=
                        PIN_BASED_POSTED_INTR;
 
@@ -2471,10 +2592,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_RDTSCP |
                SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+               SECONDARY_EXEC_ENABLE_VPID |
                SECONDARY_EXEC_APIC_REGISTER_VIRT |
                SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                SECONDARY_EXEC_WBINVD_EXITING |
-               SECONDARY_EXEC_XSAVES;
+               SECONDARY_EXEC_XSAVES |
+               SECONDARY_EXEC_PCOMMIT;
 
        if (enable_ept) {
                /* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2616,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
        } else
                vmx->nested.nested_vmx_ept_caps = 0;
 
+       if (enable_vpid)
+               vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+                               VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+       else
+               vmx->nested.nested_vmx_vpid_caps = 0;
+
        if (enable_unrestricted_guest)
                vmx->nested.nested_vmx_secondary_ctls_high |=
                        SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2737,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
                break;
        case MSR_IA32_VMX_EPT_VPID_CAP:
                /* Currently, no nested vpid support */
-               *pdata = vmx->nested.nested_vmx_ept_caps;
+               *pdata = vmx->nested.nested_vmx_ept_caps |
+                       ((u64)vmx->nested.nested_vmx_vpid_caps << 32);
                break;
        default:
                return 1;
@@ -2642,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
        case MSR_EFER:
                return kvm_get_msr_common(vcpu, msr_info);
        case MSR_IA32_TSC:
-               msr_info->data = guest_read_tsc();
+               msr_info->data = guest_read_tsc(vcpu);
                break;
        case MSR_IA32_SYSENTER_CS:
                msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
@@ -2673,7 +2803,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                msr_info->data = vcpu->arch.ia32_xss;
                break;
        case MSR_TSC_AUX:
-               if (!to_vmx(vcpu)->rdtscp_enabled)
+               if (!guest_cpuid_has_rdtscp(vcpu))
                        return 1;
                /* Otherwise falls through */
        default:
@@ -2779,7 +2909,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
                        clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
                break;
        case MSR_TSC_AUX:
-               if (!vmx->rdtscp_enabled)
+               if (!guest_cpuid_has_rdtscp(vcpu))
                        return 1;
                /* Check reserved bit, higher 32 bits should be zero */
                if ((data >> 32) != 0)
@@ -2874,6 +3004,8 @@ static int hardware_enable(void)
                return -EBUSY;
 
        INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+       INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+       spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
        /*
         * Now we can enable the vmclear operation in kdump
@@ -3015,7 +3147,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
                        SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
                        SECONDARY_EXEC_SHADOW_VMCS |
                        SECONDARY_EXEC_XSAVES |
-                       SECONDARY_EXEC_ENABLE_PML;
+                       SECONDARY_EXEC_ENABLE_PML |
+                       SECONDARY_EXEC_PCOMMIT |
+                       SECONDARY_EXEC_TSC_SCALING;
                if (adjust_vmx_controls(min2, opt2,
                                        MSR_IA32_VMX_PROCBASED_CTLS2,
                                        &_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3575,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 #endif
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 {
-       vpid_sync_context(to_vmx(vcpu));
+       vpid_sync_context(vpid);
        if (enable_ept) {
                if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                        return;
@@ -3451,6 +3585,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
        }
 }
 
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+       __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
        ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3783,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                if (!is_paging(vcpu)) {
                        hw_cr4 &= ~X86_CR4_PAE;
                        hw_cr4 |= X86_CR4_PSE;
-                       /*
-                        * SMEP/SMAP is disabled if CPU is in non-paging mode
-                        * in hardware. However KVM always uses paging mode to
-                        * emulate guest non-paging mode with TDP.
-                        * To emulate this behavior, SMEP/SMAP needs to be
-                        * manually disabled when guest switches to non-paging
-                        * mode.
-                        */
-                       hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
                } else if (!(cr4 & X86_CR4_PAE)) {
                        hw_cr4 &= ~X86_CR4_PAE;
                }
        }
 
+       if (!enable_unrestricted_guest && !is_paging(vcpu))
+               /*
+                * SMEP/SMAP is disabled if CPU is in non-paging mode in
+                * hardware.  However KVM always uses paging mode without
+                * unrestricted guest.
+                * To emulate this behavior, SMEP/SMAP needs to be manually
+                * disabled when guest switches to non-paging mode.
+                */
+               hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+
        vmcs_writel(CR4_READ_SHADOW, cr4);
        vmcs_writel(GUEST_CR4, hw_cr4);
        return 0;
@@ -4146,29 +4286,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
        return r;
 }
 
-static void allocate_vpid(struct vcpu_vmx *vmx)
+static int allocate_vpid(void)
 {
        int vpid;
 
-       vmx->vpid = 0;
        if (!enable_vpid)
-               return;
+               return 0;
        spin_lock(&vmx_vpid_lock);
        vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-       if (vpid < VMX_NR_VPIDS) {
-               vmx->vpid = vpid;
+       if (vpid < VMX_NR_VPIDS)
                __set_bit(vpid, vmx_vpid_bitmap);
-       }
+       else
+               vpid = 0;
        spin_unlock(&vmx_vpid_lock);
+       return vpid;
 }
 
-static void free_vpid(struct vcpu_vmx *vmx)
+static void free_vpid(int vpid)
 {
-       if (!enable_vpid)
+       if (!enable_vpid || vpid == 0)
                return;
        spin_lock(&vmx_vpid_lock);
-       if (vmx->vpid != 0)
-               __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+       __clear_bit(vpid, vmx_vpid_bitmap);
        spin_unlock(&vmx_vpid_lock);
 }
 
@@ -4323,9 +4462,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
                        msr, MSR_TYPE_W);
 }
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
 {
-       return enable_apicv && irqchip_in_kernel(kvm);
+       return enable_apicv && lapic_in_kernel(vcpu);
 }
 
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4508,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_SMP
        if (vcpu->mode == IN_GUEST_MODE) {
+               struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+               /*
+                * Currently, we don't support urgent interrupts; all
+                * interrupts are treated as non-urgent, so we cannot
+                * post interrupts when 'SN' is set.
+                *
+                * If the vcpu is in guest mode, it is running rather
+                * than scheduled out and waiting on a run queue, and
+                * being scheduled out is currently the only case in
+                * which 'SN' is set, so warn if 'SN' is set here.
+                */
+               WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+
                apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
                                POSTED_INTR_VECTOR);
                return true;
@@ -4505,7 +4660,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 {
        u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 
-       if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+       if (!vmx_cpu_uses_apicv(&vmx->vcpu))
                pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
        return pin_based_exec_ctrl;
 }
@@ -4517,7 +4672,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
        if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
                exec_control &= ~CPU_BASED_MOV_DR_EXITING;
 
-       if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+       if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
                exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
                exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4689,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
        u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-       if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+       if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
                exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
        if (vmx->vpid == 0)
                exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4703,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
                exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
        if (!ple_gap)
                exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-       if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+       if (!vmx_cpu_uses_apicv(&vmx->vcpu))
                exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
        exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4713,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
           a current VMCS12
        */
        exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-       /* PML is enabled/disabled in creating/destorying vcpu */
-       exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+       if (!enable_pml)
+               exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+       /* Currently, we allow the L1 guest to execute pcommit directly. */
+       exec_control &= ~SECONDARY_EXEC_PCOMMIT;
 
        return exec_control;
 }
@@ -4604,12 +4763,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
        vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
-       if (cpu_has_secondary_exec_ctrls()) {
+       if (cpu_has_secondary_exec_ctrls())
                vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
                                vmx_secondary_exec_control(vmx));
-       }
 
-       if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
+       if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
                vmcs_write64(EOI_EXIT_BITMAP0, 0);
                vmcs_write64(EOI_EXIT_BITMAP1, 0);
                vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4911,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        if (cpu_has_vmx_tpr_shadow() && !init_event) {
                vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-               if (vm_need_tpr_shadow(vcpu->kvm))
+               if (cpu_need_tpr_shadow(vcpu))
                        vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
                                     __pa(vcpu->arch.apic->regs));
                vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4919,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
        kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
-       if (vmx_vm_has_apicv(vcpu->kvm))
+       if (vmx_cpu_uses_apicv(vcpu))
                memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
 
        if (vmx->vpid != 0)
@@ -4771,12 +4929,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
        vmx_set_cr0(vcpu, cr0); /* enter rmode */
        vmx->vcpu.arch.cr0 = cr0;
        vmx_set_cr4(vcpu, 0);
-       if (!init_event)
-               vmx_set_efer(vcpu, 0);
+       vmx_set_efer(vcpu, 0);
        vmx_fpu_activate(vcpu);
        update_exception_bitmap(vcpu);
 
-       vpid_sync_context(vmx);
+       vpid_sync_context(vmx->vpid);
 }
 
 /*
@@ -5104,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)
                return handle_rmode_exception(vcpu, ex_no, error_code);
 
        switch (ex_no) {
+       case AC_VECTOR:
+               kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+               return 1;
        case DB_VECTOR:
                dr6 = vmcs_readl(EXIT_QUALIFICATION);
                if (!(vcpu->guest_debug &
@@ -5296,7 +5456,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
                                u8 cr8 = (u8)val;
                                err = kvm_set_cr8(vcpu, cr8);
                                kvm_complete_insn_gp(vcpu, err);
-                               if (irqchip_in_kernel(vcpu->kvm))
+                               if (lapic_in_kernel(vcpu))
                                        return 1;
                                if (cr8_prev <= cr8)
                                        return 1;
@@ -5510,17 +5670,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
        kvm_make_request(KVM_REQ_EVENT, vcpu);
 
        ++vcpu->stat.irq_window_exits;
-
-       /*
-        * If the user space waits to inject interrupts, exit as soon as
-        * possible
-        */
-       if (!irqchip_in_kernel(vcpu->kvm) &&
-           vcpu->run->request_interrupt_window &&
-           !kvm_cpu_has_interrupt(vcpu)) {
-               vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-               return 0;
-       }
        return 1;
 }
 
@@ -5753,10 +5902,11 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
                skip_emulated_instruction(vcpu);
+               trace_kvm_fast_mmio(gpa);
                return 1;
        }
 
-       ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+       ret = handle_mmio_page_fault(vcpu, gpa, true);
        if (likely(ret == RET_MMIO_PF_EMULATE))
                return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
                                              EMULATE_DONE;
@@ -5910,6 +6060,25 @@ static void update_ple_window_actual_max(void)
                                            ple_window_grow, INT_MIN);
 }
 
+/*
+ * Handler for POSTED_INTR_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+       struct kvm_vcpu *vcpu;
+       int cpu = smp_processor_id();
+
+       spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+       list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+                       blocked_vcpu_list) {
+               struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+               if (pi_test_on(pi_desc) == 1)
+                       kvm_vcpu_kick(vcpu);
+       }
+       spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
 static __init int hardware_setup(void)
 {
        int r = -ENOMEM, i, msr;
@@ -6028,6 +6197,12 @@ static __init int hardware_setup(void)
        if (!cpu_has_vmx_apicv())
                enable_apicv = 0;
 
+       if (cpu_has_vmx_tsc_scaling()) {
+               kvm_has_tsc_control = true;
+               kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
+               kvm_tsc_scaling_ratio_frac_bits = 48;
+       }
+
        if (enable_apicv)
                kvm_x86_ops->update_cr8_intercept = NULL;
        else {
@@ -6096,6 +6271,8 @@ static __init int hardware_setup(void)
                kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
        }
 
+       kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+
        return alloc_kvm_area();
 
 out8:
@@ -6627,7 +6804,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
-       u32 exec_control;
        if (vmx->nested.current_vmptr == -1ull)
                return;
 
@@ -6640,9 +6816,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
                   they were modified */
                copy_shadow_to_vmcs12(vmx);
                vmx->nested.sync_shadow_vmcs = false;
-               exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-               exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-               vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+               vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+                               SECONDARY_EXEC_SHADOW_VMCS);
                vmcs_write64(VMCS_LINK_POINTER, -1ull);
        }
        vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6837,7 @@ static void free_nested(struct vcpu_vmx *vmx)
                return;
 
        vmx->nested.vmxon = false;
+       free_vpid(vmx->nested.vpid02);
        nested_release_vmcs12(vmx);
        if (enable_shadow_vmcs)
                free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7214,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        gpa_t vmptr;
-       u32 exec_control;
 
        if (!nested_vmx_check_permission(vcpu))
                return 1;
@@ -7070,9 +7245,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
                vmx->nested.current_vmcs12 = new_vmcs12;
                vmx->nested.current_vmcs12_page = page;
                if (enable_shadow_vmcs) {
-                       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-                       exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
-                       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+                       vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+                                     SECONDARY_EXEC_SHADOW_VMCS);
                        vmcs_write64(VMCS_LINK_POINTER,
                                     __pa(vmx->nested.current_shadow_vmcs));
                        vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7352,63 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
 static int handle_invvpid(struct kvm_vcpu *vcpu)
 {
-       kvm_queue_exception(vcpu, UD_VECTOR);
+       struct vcpu_vmx *vmx = to_vmx(vcpu);
+       u32 vmx_instruction_info;
+       unsigned long type, types;
+       gva_t gva;
+       struct x86_exception e;
+       int vpid;
+
+       if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+             SECONDARY_EXEC_ENABLE_VPID) ||
+                       !(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+               kvm_queue_exception(vcpu, UD_VECTOR);
+               return 1;
+       }
+
+       if (!nested_vmx_check_permission(vcpu))
+               return 1;
+
+       vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+       type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+       types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+
+       if (!(types & (1UL << type))) {
+               nested_vmx_failValid(vcpu,
+                       VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+               return 1;
+       }
+
+       /*
+        * According to the Intel VMX instruction reference, the memory
+        * operand is read even if it isn't needed (e.g., for type==global).
+        */
+       if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+                       vmx_instruction_info, false, &gva))
+               return 1;
+       if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
+                               sizeof(u32), &e)) {
+               kvm_inject_page_fault(vcpu, &e);
+               return 1;
+       }
+
+       switch (type) {
+       case VMX_VPID_EXTENT_ALL_CONTEXT:
+               if (get_vmcs12(vcpu)->virtual_processor_id == 0) {
+                       nested_vmx_failValid(vcpu,
+                               VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+                       return 1;
+               }
+               __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+               nested_vmx_succeed(vcpu);
+               break;
+       default:
+               /* Trap single context invalidation invvpid calls */
+               BUG_ON(1);
+               break;
+       }
+
+       skip_emulated_instruction(vcpu);
        return 1;
 }
 
@@ -7207,6 +7437,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
        return 1;
 }
 
+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+       /* We never intercept the pcommit instruction for the L1 guest. */
+       WARN_ON(1);
+       return 1;
+}
+
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7494,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
        [EXIT_REASON_XSAVES]                  = handle_xsaves,
        [EXIT_REASON_XRSTORS]                 = handle_xrstors,
        [EXIT_REASON_PML_FULL]                = handle_pml_full,
+       [EXIT_REASON_PCOMMIT]                 = handle_pcommit,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7796,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
                 * the XSS exit bitmap in vmcs12.
                 */
                return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+       case EXIT_REASON_PCOMMIT:
+               return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
        default:
                return true;
        }
@@ -7569,10 +7809,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
        *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 
-static int vmx_enable_pml(struct vcpu_vmx *vmx)
+static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
 {
        struct page *pml_pg;
-       u32 exec_control;
 
        pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
        if (!pml_pg)
@@ -7583,24 +7822,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
        vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
        vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 
-       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-       exec_control |= SECONDARY_EXEC_ENABLE_PML;
-       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-
        return 0;
 }
 
-static void vmx_disable_pml(struct vcpu_vmx *vmx)
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 {
-       u32 exec_control;
-
-       ASSERT(vmx->pml_pg);
-       __free_page(vmx->pml_pg);
-       vmx->pml_pg = NULL;
-
-       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-       exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
-       vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+       if (vmx->pml_pg) {
+               __free_page(vmx->pml_pg);
+               vmx->pml_pg = NULL;
+       }
 }
 
 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7782,6 +8012,9 @@ static void dump_vmcs(void)
               vmcs_read32(IDT_VECTORING_INFO_FIELD),
               vmcs_read32(IDT_VECTORING_ERROR_CODE));
        pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET));
+       if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
+               pr_err("TSC Multiplier = 0x%016lx\n",
+                      vmcs_readl(TSC_MULTIPLIER));
        if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW)
                pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
        if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
@@ -7924,10 +8157,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
         * apicv
         */
        if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-                               !vmx_vm_has_apicv(vcpu->kvm))
+                               !vmx_cpu_uses_apicv(vcpu))
                return;
 
-       if (!vm_need_tpr_shadow(vcpu->kvm))
+       if (!cpu_need_tpr_shadow(vcpu))
                return;
 
        sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8262,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
        }
 }
 
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 {
-       if (!vmx_vm_has_apicv(vcpu->kvm))
+       u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
+       if (!vmx_cpu_uses_apicv(vcpu))
                return;
 
        vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8477,8 +8711,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
        struct vcpu_vmx *vmx = to_vmx(vcpu);
 
        if (enable_pml)
-               vmx_disable_pml(vmx);
-       free_vpid(vmx);
+               vmx_destroy_pml_buffer(vmx);
+       free_vpid(vmx->vpid);
        leave_guest_mode(vcpu);
        vmx_load_vmcs01(vcpu);
        free_nested(vmx);
@@ -8497,7 +8731,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        if (!vmx)
                return ERR_PTR(-ENOMEM);
 
-       allocate_vpid(vmx);
+       vmx->vpid = allocate_vpid();
 
        err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
        if (err)
@@ -8530,7 +8764,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        put_cpu();
        if (err)
                goto free_vmcs;
-       if (vm_need_virtualize_apic_accesses(kvm)) {
+       if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
                err = alloc_apic_access_page(kvm);
                if (err)
                        goto free_vmcs;
@@ -8545,8 +8779,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
                        goto free_vmcs;
        }
 
-       if (nested)
+       if (nested) {
                nested_vmx_setup_ctls_msrs(vmx);
+               vmx->nested.vpid02 = allocate_vpid();
+       }
 
        vmx->nested.posted_intr_nv = -1;
        vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8795,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
         * for the guest, etc.
         */
        if (enable_pml) {
-               err = vmx_enable_pml(vmx);
+               err = vmx_create_pml_buffer(vmx);
                if (err)
                        goto free_vmcs;
        }
@@ -8567,13 +8803,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
        return &vmx->vcpu;
 
 free_vmcs:
+       free_vpid(vmx->nested.vpid02);
        free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
        kfree(vmx->guest_msrs);
 uninit_vcpu:
        kvm_vcpu_uninit(&vmx->vcpu);
 free_vcpu:
-       free_vpid(vmx);
+       free_vpid(vmx->vpid);
        kmem_cache_free(kvm_vcpu_cache, vmx);
        return ERR_PTR(err);
 }
@@ -8648,49 +8885,67 @@ static int vmx_get_lpage_level(void)
                return PT_PDPE_LEVEL;
 }
 
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+       /*
+        * These bits in the secondary execution controls field
+        * are dynamic, the others are mostly based on the hypervisor
+        * architecture and the guest's CPUID.  Do not touch the
+        * dynamic bits.
+        */
+       u32 mask =
+               SECONDARY_EXEC_SHADOW_VMCS |
+               SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+               SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+       u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+       vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+                    (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
        struct kvm_cpuid_entry2 *best;
        struct vcpu_vmx *vmx = to_vmx(vcpu);
-       u32 exec_control;
+       u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
 
-       vmx->rdtscp_enabled = false;
        if (vmx_rdtscp_supported()) {
-               exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-               if (exec_control & SECONDARY_EXEC_RDTSCP) {
-                       best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-                       if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
-                               vmx->rdtscp_enabled = true;
-                       else {
-                               exec_control &= ~SECONDARY_EXEC_RDTSCP;
-                               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-                                               exec_control);
-                       }
+               bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+               if (!rdtscp_enabled)
+                       secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
+
+               if (nested) {
+                       if (rdtscp_enabled)
+                               vmx->nested.nested_vmx_secondary_ctls_high |=
+                                       SECONDARY_EXEC_RDTSCP;
+                       else
+                               vmx->nested.nested_vmx_secondary_ctls_high &=
+                                       ~SECONDARY_EXEC_RDTSCP;
                }
-               if (nested && !vmx->rdtscp_enabled)
-                       vmx->nested.nested_vmx_secondary_ctls_high &=
-                               ~SECONDARY_EXEC_RDTSCP;
        }
 
        /* Exposing INVPCID only when PCID is exposed */
        best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
        if (vmx_invpcid_supported() &&
-           best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
-           guest_cpuid_has_pcid(vcpu)) {
-               exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-               exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
-               vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-                            exec_control);
-       } else {
-               if (cpu_has_secondary_exec_ctrls()) {
-                       exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-                       exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-                       vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-                                    exec_control);
-               }
+           (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+           !guest_cpuid_has_pcid(vcpu))) {
+               secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+
                if (best)
                        best->ebx &= ~bit(X86_FEATURE_INVPCID);
        }
+
+       vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+       if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
+               if (guest_cpuid_has_pcommit(vcpu))
+                       vmx->nested.nested_vmx_secondary_ctls_high |=
+                               SECONDARY_EXEC_PCOMMIT;
+               else
+                       vmx->nested.nested_vmx_secondary_ctls_high &=
+                               ~SECONDARY_EXEC_PCOMMIT;
+       }
 }
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
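/*
 * Editor's sketch (illustration only, not part of the patch): the
 * mask-preserving read-modify-write done by vmcs_set_secondary_exec_control()
 * above.  Dynamic bits keep their current VMCS value; everything else comes
 * from the freshly computed control word.
 */
#include <stdint.h>

static inline uint32_t merge_secondary_ctls(uint32_t cur_ctl, uint32_t new_ctl,
					    uint32_t dynamic_mask)
{
	return (new_ctl & ~dynamic_mask) | (cur_ctl & dynamic_mask);
}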
@@ -9298,13 +9553,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        if (cpu_has_secondary_exec_ctrls()) {
                exec_control = vmx_secondary_exec_control(vmx);
-               if (!vmx->rdtscp_enabled)
-                       exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
                /* Take the following fields only from vmcs12 */
                exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                                  SECONDARY_EXEC_RDTSCP |
                                  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                                 SECONDARY_EXEC_APIC_REGISTER_VIRT);
+                                 SECONDARY_EXEC_APIC_REGISTER_VIRT |
+                                 SECONDARY_EXEC_PCOMMIT);
                if (nested_cpu_has(vmcs12,
                                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                        exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9578,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
                                vmcs_write64(APIC_ACCESS_ADDR,
                                  page_to_phys(vmx->nested.apic_access_page));
                } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
-                           (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
+                           cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
                        exec_control |=
                                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
                        kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9688,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
        if (enable_vpid) {
                /*
-                * Trivially support vpid by letting L2s share their parent
-                * L1's vpid. TODO: move to a more elaborate solution, giving
-                * each L2 its own vpid and exposing the vpid feature to L1.
+                * There is no direct mapping between vpid02 and vpid12:
+                * vpid02 is per-vCPU for L0 and reused, while vpid12 is
+                * refreshed with a single invvpid during nested vmentry
+                * whenever its value changes.  vpid12 is allocated by L1
+                * for L2, so it does not consume entries in the global
+                * bitmap (used for vpid01 and vpid02 allocation) even if
+                * many nested vCPUs are spawned.
                 */
-               vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-               vmx_flush_tlb(vcpu);
+               if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+                       if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+                               vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+                               __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+                       }
+               } else {
+                       vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+                       vmx_flush_tlb(vcpu);
+               }
        }
 
        if (nested_cpu_has_ept(vmcs12)) {
@@ -10278,6 +10545,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
        kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+/*
+ * This routine does the following things for a vCPU that is about to
+ * block, if VT-d PI is enabled:
+ * - Add the vCPU to the wakeup list, so that when an interrupt arrives
+ *   we can find the right vCPU to wake up.
+ * - Change the posted-interrupt descriptor as follows:
+ *      'NDST' <-- vcpu->pre_pcpu
+ *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process (meaning at least one interrupt
+ *   has been posted for this vCPU), we cannot block it; in that case
+ *   return 1, otherwise return 0.
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+       unsigned long flags;
+       unsigned int dest;
+       struct pi_desc old, new;
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+
+       vcpu->pre_pcpu = vcpu->cpu;
+       spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+                         vcpu->pre_pcpu), flags);
+       list_add_tail(&vcpu->blocked_vcpu_list,
+                     &per_cpu(blocked_vcpu_on_cpu,
+                     vcpu->pre_pcpu));
+       spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+                              vcpu->pre_pcpu), flags);
+
+       do {
+               old.control = new.control = pi_desc->control;
+
+               /*
+                * We should not block the vCPU if
+                * an interrupt is posted for it.
+                */
+               if (pi_test_on(pi_desc) == 1) {
+                       spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+                                         vcpu->pre_pcpu), flags);
+                       list_del(&vcpu->blocked_vcpu_list);
+                       spin_unlock_irqrestore(
+                                       &per_cpu(blocked_vcpu_on_cpu_lock,
+                                       vcpu->pre_pcpu), flags);
+                       vcpu->pre_pcpu = -1;
+
+                       return 1;
+               }
+
+               WARN((pi_desc->sn == 1),
+                    "Warning: SN field of posted-interrupts "
+                    "is set before blocking\n");
+
+               /*
+                * Since the vCPU can be preempted during this process,
+                * vcpu->cpu could differ from pre_pcpu, so we need to
+                * set pre_pcpu as the destination of the wakeup
+                * notification event; then the wakeup handler can find
+                * the right vCPU to wake up if an interrupt arrives
+                * while the vCPU is blocked.
+                */
+               dest = cpu_physical_id(vcpu->pre_pcpu);
+
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+
+               /* set 'NV' to 'wakeup vector' */
+               new.nv = POSTED_INTR_WAKEUP_VECTOR;
+       } while (cmpxchg(&pi_desc->control, old.control,
+                       new.control) != old.control);
+
+       return 0;
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+       struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+       struct pi_desc old, new;
+       unsigned int dest;
+       unsigned long flags;
+
+       if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP))
+               return;
+
+       do {
+               old.control = new.control = pi_desc->control;
+
+               dest = cpu_physical_id(vcpu->cpu);
+
+               if (x2apic_enabled())
+                       new.ndst = dest;
+               else
+                       new.ndst = (dest << 8) & 0xFF00;
+
+               /* Allow posting non-urgent interrupts */
+               new.sn = 0;
+
+               /* set 'NV' to 'notification vector' */
+               new.nv = POSTED_INTR_VECTOR;
+       } while (cmpxchg(&pi_desc->control, old.control,
+                       new.control) != old.control);
+
+       if (vcpu->pre_pcpu != -1) {
+               spin_lock_irqsave(
+                       &per_cpu(blocked_vcpu_on_cpu_lock,
+                       vcpu->pre_pcpu), flags);
+               list_del(&vcpu->blocked_vcpu_list);
+               spin_unlock_irqrestore(
+                       &per_cpu(blocked_vcpu_on_cpu_lock,
+                       vcpu->pre_pcpu), flags);
+               vcpu->pre_pcpu = -1;
+       }
+}
+
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+                             uint32_t guest_irq, bool set)
+{
+       struct kvm_kernel_irq_routing_entry *e;
+       struct kvm_irq_routing_table *irq_rt;
+       struct kvm_lapic_irq irq;
+       struct kvm_vcpu *vcpu;
+       struct vcpu_data vcpu_info;
+       int idx, ret = -EINVAL;
+
+       if (!kvm_arch_has_assigned_device(kvm) ||
+               !irq_remapping_cap(IRQ_POSTING_CAP))
+               return 0;
+
+       idx = srcu_read_lock(&kvm->irq_srcu);
+       irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+       BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+       hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+               if (e->type != KVM_IRQ_ROUTING_MSI)
+                       continue;
+               /*
+                * VT-d PI cannot support posting multicast/broadcast
+                * interrupts to a vCPU; we still use interrupt remapping
+                * for these kinds of interrupts.
+                *
+                * For lowest-priority interrupts, we only support those
+                * with a single CPU as the destination, e.g. the user
+                * configures the interrupts via /proc/irq or uses
+                * irqbalance to make the interrupts single-CPU.
+                *
+                * We will support full lowest-priority interrupts later.
+                */
+
+               kvm_set_msi_irq(e, &irq);
+               if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+                       continue;
+
+               vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+               vcpu_info.vector = irq.vector;
+
+               trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+                               vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+               if (set)
+                       ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+               else {
+                       /* suppress notification event before unposting */
+                       pi_set_sn(vcpu_to_pi_desc(vcpu));
+                       ret = irq_set_vcpu_affinity(host_irq, NULL);
+                       pi_clear_sn(vcpu_to_pi_desc(vcpu));
+               }
+
+               if (ret < 0) {
+                       printk(KERN_INFO "%s: failed to update PI IRTE\n",
+                                       __func__);
+                       goto out;
+               }
+       }
+
+       ret = 0;
+out:
+       srcu_read_unlock(&kvm->irq_srcu, idx);
+       return ret;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
        .cpu_has_kvm_support = cpu_has_kvm_support,
        .disabled_by_bios = vmx_disabled_by_bios,
@@ -10297,7 +10759,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .vcpu_load = vmx_vcpu_load,
        .vcpu_put = vmx_vcpu_put,
 
-       .update_db_bp_intercept = update_exception_bitmap,
+       .update_bp_intercept = update_exception_bitmap,
        .get_msr = vmx_get_msr,
        .set_msr = vmx_set_msr,
        .get_segment_base = vmx_get_segment_base,
@@ -10347,7 +10809,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .update_cr8_intercept = update_cr8_intercept,
        .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
        .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
-       .vm_has_apicv = vmx_vm_has_apicv,
+       .cpu_uses_apicv = vmx_cpu_uses_apicv,
        .load_eoi_exitmap = vmx_load_eoi_exitmap,
        .hwapic_irr_update = vmx_hwapic_irr_update,
        .hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10371,11 +10833,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-       .set_tsc_khz = vmx_set_tsc_khz,
        .read_tsc_offset = vmx_read_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
-       .adjust_tsc_offset = vmx_adjust_tsc_offset,
-       .compute_tsc_offset = vmx_compute_tsc_offset,
+       .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
        .read_l1_tsc = vmx_read_l1_tsc,
 
        .set_tdp_cr3 = vmx_set_cr3,
@@ -10394,7 +10854,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .flush_log_dirty = vmx_flush_log_dirty,
        .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
 
+       .pre_block = vmx_pre_block,
+       .post_block = vmx_post_block,
+
        .pmu_ops = &intel_pmu_ops,
+
+       .update_pi_irte = vmx_update_pi_irte,
 };
 
 static int __init vmx_init(void)