unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
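+MODULE_PARM_DESC(kvmclock_periodic_sync,
+ "Periodically schedule kvmclock sync work for all vCPUs (default: Y)");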
+
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
- X86_CR4_PAE | X86_CR4_SMEP;
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP;
+
if (cr4 & CR4_RESERVED_BITS)
return 1;
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
- if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
- update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
-
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
kvm_update_cpuid(vcpu);
&guest_hv_clock, sizeof(guest_hv_clock))))
return 0;
- /*
- * The interface expects us to write an even number signaling that the
- * update is finished. Since the guest won't see the intermediate
- * state, we just increase by 2 at the end.
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
*/
- vcpu->hv_clock.version = guest_hv_clock.version + 2;
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ if (guest_hv_clock.version & 1)
+ ++guest_hv_clock.version; /* first time write, so random junk */
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
return 0;
}
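The three writes above implement the even/odd protocol that guest readers rely on. For reference, a guest-side consumer has to look roughly like the sketch below (mirroring the pvclock retry loop; the function name and the premapped src pointer are illustrative, not part of this patch):

	static u32 pvclock_snapshot(struct pvclock_vcpu_time_info *src,
				    struct pvclock_vcpu_time_info *dst)
	{
		u32 version;

		do {
			version = src->version;
			smp_rmb();	/* the copy must not pass the version read */
			*dst = *src;
			smp_rmb();	/* the re-check must not pass the copy */
		} while ((version & 1) || version != src->version);

		return version;
	}

An odd version, or a version that changed while the copy was in flight, means the snapshot may mix old and new fields and must be retried; that is exactly why the writes above keep the version odd for the whole payload update.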
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state;
+ unsigned char mtrr_enabled = mtrr_state->enabled;
+ gfn_t start, end, mask;
+ int index;
+ bool is_fixed = true;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
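+ /*
+ * mtrr_state->enabled caches (MTRRdefType >> 10) & 3: bit 0 is FE
+ * (fixed-range enable), bit 1 is E (MTRRs globally enabled).
+ */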
+ if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType)
+ return;
+
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ start = 0x0;
+ end = 0x80000;
+ break;
+ case MSR_MTRRfix16K_80000:
+ start = 0x80000;
+ end = 0xa0000;
+ break;
+ case MSR_MTRRfix16K_A0000:
+ start = 0xa0000;
+ end = 0xc0000;
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ start = 0xc0000 + index * (32 << 10);
+ end = start + (32 << 10);
+ break;
+ case MSR_MTRRdefType:
+ is_fixed = false;
+ start = 0x0;
+ end = ~0ULL;
+ break;
+ default:
+ /* Variable-range MTRRs: base/mask MSR pairs starting at 0x200. */
+ is_fixed = false;
+ index = (msr - 0x200) / 2;
+ start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) +
+ (mtrr_state->var_ranges[index].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) +
+ (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK);
+ mask |= ~0ULL << cpuid_maxphyaddr(vcpu);
+
+ end = ((start & mask) | ~mask) + 1;
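+ /*
+ * Example: with maxphyaddr == 36, base 0xc0000000 and a 256MB
+ * mask (mask_hi:mask_lo == 0xf:0xf0000000), the OR above widens
+ * the mask to 0xfffffffff0000000, so
+ * end = (0xc0000000 | 0x0fffffff) + 1 = 0xd0000000.
+ */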
+ }
+
+ if (is_fixed && !(mtrr_enabled & 0x1))
+ return;
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
*pt = data;
}
- kvm_mmu_reset_context(vcpu);
+ update_mtrr(vcpu, msr);
return 0;
}
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
return 0;
}
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
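For reference, enabling the new capability from userspace would look roughly like the sketch below (vm_fd and the chosen quirk bit are illustrative; the KVM_QUIRK_* constants come from the uapi header):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Sketch: disable the CD/NW-clearing quirk on an existing VM fd. */
	static int disable_cd_nw_quirk(int vm_fd)
	{
		struct kvm_enable_cap cap;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_DISABLE_QUIRKS;
		cap.args[0] = KVM_QUIRK_CD_NW_CLEARED; /* illustrative quirk bit */

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}

Note that cap.flags must be zero; the handler above rejects anything else with -EINVAL.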
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
r = 0;
break;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
kvm_set_mmio_spte_mask();
kvm_x86_ops = ops;
- kvm_init_msr_list();
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
return;
page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
/*
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- kvm_guest_enter();
+ __kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
return 0;
}
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
{
int err;
if (err)
return err;
- fpu_finit(&vcpu->arch.guest_fpu);
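+ /*
+ * Per the SDM (Table 9-1), INIT leaves x87/SSE state unchanged;
+ * only RESET reinitializes it, hence the init_event check.
+ */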
+ if (!init_event)
+ fpu_finit(&vcpu->arch.guest_fpu);
+
if (cpu_has_xsaves)
vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
{
kvm_put_guest_xcr0(vcpu);
- if (!vcpu->guest_fpu_loaded)
+ if (!vcpu->guest_fpu_loaded) {
+ vcpu->fpu_counter = 0;
return;
+ }
vcpu->guest_fpu_loaded = 0;
fpu_save_init(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ /*
+ * If using eager FPU mode, or if the guest is a frequent user
+ * of the FPU, just leave the FPU active for next time: after five
+ * consecutive reloads the deactivate request is no longer made.
+ * fpu_counter is an 8-bit counter, so it eventually rolls over to 0;
+ * a guest that uses the FPU only in bursts will thus revert to
+ * loading it on demand.
+ */
+ if (!vcpu->arch.eager_fpu) {
+ if (++vcpu->fpu_counter < 5)
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ }
trace_kvm_fpu(0);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
+ struct kvm_vcpu *vcpu;
+
if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
printk_once(KERN_WARNING
"kvm: SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
- return kvm_x86_ops->vcpu_create(kvm, id);
+
+ vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+ /*
+ * Activate the FPU unconditionally in case the guest needs eager FPU.
+ * It will be deactivated soon if the guest doesn't actually use it.
+ */
+ kvm_x86_ops->fpu_activate(vcpu);
+ return vcpu;
}
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
if (r)
return r;
- kvm_vcpu_reset(vcpu);
+ kvm_vcpu_reset(vcpu, false);
kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
kvm_x86_ops->vcpu_free(vcpu);
}
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- kvm_pmu_reset(vcpu);
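+ /* Like most MSRs, PMU state survives INIT; only a full RESET clears it. */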
+ if (!init_event)
+ kvm_pmu_reset(vcpu);
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_x86_ops->vcpu_reset(vcpu);
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
int kvm_arch_hardware_setup(void)
{
- return kvm_x86_ops->hardware_setup();
+ int r;
+
+ r = kvm_x86_ops->hardware_setup();
+ if (r != 0)
+ return r;
+
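+ /*
+ * The MSR list can depend on what ->hardware_setup() just probed,
+ * so build it only after hardware setup has succeeded.
+ */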
+ kvm_init_msr_list();
+ return 0;
}
void kvm_arch_hardware_unsetup(void)
goto fail_free_mce_banks;
}
- r = fx_init(vcpu);
+ r = fx_init(vcpu, false);
if (r)
goto fail_free_wbinvd_dirty_mask;
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
return -ENOMEM;
}
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
{
/*
* memslots->generation has been incremented.
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
/*
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
- if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+ if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
int ret;
ret = vm_munmap(old->userspace_addr,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- /* It's OK to get 'new' slot here as it has already been installed */
- new = id_to_memslot(kvm->memslots, mem->slot);
-
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
* sptes have to be split. If live migration is successful, the guest
* been zapped so no dirty logging tracking is needed for the old slot. For
* KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
* new and it's also covered when dealing with the new slot.
+ *
+ * FIXME: const-ify all uses of struct kvm_memory_slot.
*/
if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, new);
+ kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)