unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
+static bool __read_mostly kvmclock_periodic_sync = true;
+module_param(kvmclock_periodic_sync, bool, S_IRUGO);
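+MODULE_PARM_DESC(kvmclock_periodic_sync,
+ "Periodically schedule kvmclock sync work for all vCPUs (default: Y)");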
+
bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32 kvm_max_guest_tsc_khz;
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
- unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
- X86_CR0_CD | X86_CR0_NW;
+ unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
- X86_CR4_PAE | X86_CR4_SMEP;
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP;
+
if (cr4 & CR4_RESERVED_BITS)
return 1;
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
- if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
- update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
-
if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
kvm_update_cpuid(vcpu);
&guest_hv_clock, sizeof(guest_hv_clock))))
return 0;
- /*
- * The interface expects us to write an even number signaling that the
- * update is finished. Since the guest won't see the intermediate
- * state, we just increase by 2 at the end.
+ /* This VCPU is paused, but it's legal for a guest to read another
+ * VCPU's kvmclock, so we really have to follow the specification where
+ * it says that version is odd if data is being modified, and even after
+ * it is consistent.
+ *
+ * Version field updates must be kept separate. This is because
+ * kvm_write_guest_cached might use a "rep movs" instruction, and
+ * writes within a string instruction are weakly ordered. So there
+ * are three writes overall.
+ *
+ * As a small optimization, only write the version field in the first
+ * and third write. The vcpu->pv_time cache is still valid, because the
+ * version field is the first in the struct.
*/
- vcpu->hv_clock.version = guest_hv_clock.version + 2;
+ BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+ if (guest_hv_clock.version & 1)
+ ++guest_hv_clock.version; /* first time write, so random junk */
+
+ vcpu->hv_clock.version = guest_hv_clock.version + 1;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
+
+ smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
+
+ smp_wmb();
+
+ vcpu->hv_clock.version++;
+ kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
return 0;
}
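The three writes above implement the even/odd protocol that guest readers rely on. For reference, a guest-side consumer has to look roughly like the sketch below (mirroring the pvclock retry loop; the function name and the premapped src pointer are illustrative, not part of this patch):

	static u32 pvclock_snapshot(struct pvclock_vcpu_time_info *src,
				    struct pvclock_vcpu_time_info *dst)
	{
		u32 version;

		do {
			version = src->version;
			smp_rmb();	/* the copy must not pass the version read */
			*dst = *src;
			smp_rmb();	/* the re-check must not pass the copy */
		} while ((version & 1) || version != src->version);

		return version;
	}

An odd version, or a version that changed while the copy was in flight, means the snapshot may mix old and new fields and must be retried; that is exactly why the writes above keep the version odd for the whole payload update.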
kvmclock_sync_work);
struct kvm *kvm = container_of(ka, struct kvm, arch);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
+static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr)
+{
+ struct mtrr_state_type *mtrr_state = &vcpu->arch.mtrr_state;
+ unsigned char mtrr_enabled = mtrr_state->enabled;
+ gfn_t start, end, mask;
+ int index;
+ bool is_fixed = true;
+
+ if (msr == MSR_IA32_CR_PAT || !tdp_enabled ||
+ !kvm_arch_has_noncoherent_dma(vcpu->kvm))
+ return;
+
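+ /*
+ * mtrr_state->enabled caches (MTRRdefType >> 10) & 3: bit 0 is FE
+ * (fixed-range enable), bit 1 is E (MTRRs globally enabled).
+ */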
+ if (!(mtrr_enabled & 0x2) && msr != MSR_MTRRdefType)
+ return;
+
+ switch (msr) {
+ case MSR_MTRRfix64K_00000:
+ start = 0x0;
+ end = 0x80000;
+ break;
+ case MSR_MTRRfix16K_80000:
+ start = 0x80000;
+ end = 0xa0000;
+ break;
+ case MSR_MTRRfix16K_A0000:
+ start = 0xa0000;
+ end = 0xc0000;
+ break;
+ case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
+ index = msr - MSR_MTRRfix4K_C0000;
+ start = 0xc0000 + index * (32 << 10);
+ end = start + (32 << 10);
+ break;
+ case MSR_MTRRdefType:
+ is_fixed = false;
+ start = 0x0;
+ end = ~0ULL;
+ break;
+ default:
+ /* Variable-range MTRRs: base/mask MSR pairs starting at 0x200. */
+ is_fixed = false;
+ index = (msr - 0x200) / 2;
+ start = (((u64)mtrr_state->var_ranges[index].base_hi) << 32) +
+ (mtrr_state->var_ranges[index].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state->var_ranges[index].mask_hi) << 32) +
+ (mtrr_state->var_ranges[index].mask_lo & PAGE_MASK);
+ mask |= ~0ULL << cpuid_maxphyaddr(vcpu);
+
+ end = ((start & mask) | ~mask) + 1;
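+ /*
+ * Example: with maxphyaddr == 36, base 0xc0000000 and a 256MB
+ * mask (mask_hi:mask_lo == 0xf:0xf0000000), the OR above widens
+ * the mask to 0xfffffffff0000000, so
+ * end = (0xc0000000 | 0x0fffffff) + 1 = 0xd0000000.
+ */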
+ }
+
+ if (is_fixed && !(mtrr_enabled & 0x1))
+ return;
+
+ kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end));
+}
+
static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
*pt = data;
}
- kvm_mmu_reset_context(vcpu);
+ update_mtrr(vcpu, msr);
return 0;
}
case KVM_CAP_HYPERV_TIME:
case KVM_CAP_IOAPIC_POLARITY_IGNORED:
case KVM_CAP_TSC_DEADLINE_TIMER:
+ case KVM_CAP_ENABLE_CAP_VM:
+ case KVM_CAP_DISABLE_QUIRKS:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
return 0;
}
+static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
+{
+ int r;
+
+ if (cap->flags)
+ return -EINVAL;
+
+ switch (cap->cap) {
+ case KVM_CAP_DISABLE_QUIRKS:
+ kvm->arch.disabled_quirks = cap->args[0];
+ r = 0;
+ break;
+ default:
+ r = -EINVAL;
+ break;
+ }
+ return r;
+}
+
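For reference, enabling the new capability from userspace would look roughly like the sketch below (vm_fd and the chosen quirk bit are illustrative; the KVM_QUIRK_* constants come from the uapi header):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Sketch: disable the CD/NW-clearing quirk on an existing VM fd. */
	static int disable_cd_nw_quirk(int vm_fd)
	{
		struct kvm_enable_cap cap;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_DISABLE_QUIRKS;
		cap.args[0] = KVM_QUIRK_CD_NW_CLEARED; /* illustrative quirk bit */

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}

Note that cap.flags must be zero; the handler above rejects anything else with -EINVAL.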
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
r = 0;
break;
}
+ case KVM_ENABLE_CAP: {
+ struct kvm_enable_cap cap;
+ r = -EFAULT;
+ if (copy_from_user(&cap, argp, sizeof(cap)))
+ goto out;
+ r = kvm_vm_ioctl_enable_cap(kvm, &cap);
+ break;
+ }
default:
r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
kvm_set_mmio_spte_mask();
kvm_x86_ops = ops;
- kvm_init_msr_list();
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
lapic_irq.dest_id = apicid;
+ lapic_irq.msi_redir_hint = false;
lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
return;
page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
/*
if (req_immediate_exit)
smp_send_reschedule(vcpu->cpu);
- kvm_guest_enter();
+ __kvm_guest_enter();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
return 0;
}
-int fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu, bool init_event)
{
int err;
if (err)
return err;
- fpu_finit(&vcpu->arch.guest_fpu);
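+ /*
+ * Per the SDM (Table 9-1), INIT leaves x87/SSE state unchanged;
+ * only RESET reinitializes it, hence the init_event check.
+ */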
+ if (!init_event)
+ fpu_finit(&vcpu->arch.guest_fpu);
+
if (cpu_has_xsaves)
vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;
{
kvm_put_guest_xcr0(vcpu);
- if (!vcpu->guest_fpu_loaded)
+ if (!vcpu->guest_fpu_loaded) {
+ vcpu->fpu_counter = 0;
return;
+ }
vcpu->guest_fpu_loaded = 0;
fpu_save_init(&vcpu->arch.guest_fpu);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
- kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ /*
+ * If using eager FPU mode, or if the guest is a frequent user
+ * of the FPU, just leave the FPU active for next time: after five
+ * consecutive reloads the deactivate request is no longer made.
+ * fpu_counter is an 8-bit counter, so it eventually rolls over to 0;
+ * a guest that uses the FPU only in bursts will thus revert to
+ * loading it on demand.
+ */
+ if (!vcpu->arch.eager_fpu) {
+ if (++vcpu->fpu_counter < 5)
+ kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
+ }
trace_kvm_fpu(0);
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
+ struct kvm_vcpu *vcpu;
+
if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
printk_once(KERN_WARNING
"kvm: SMP vm created on host with unstable TSC; "
"guest TSC will not be reliable\n");
- return kvm_x86_ops->vcpu_create(kvm, id);
+
+ vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+
+ /*
+ * Activate the FPU unconditionally in case the guest needs eager FPU.
+ * It will be deactivated soon if the guest doesn't actually use it.
+ */
+ kvm_x86_ops->fpu_activate(vcpu);
+ return vcpu;
}
int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
if (r)
return r;
- kvm_vcpu_reset(vcpu);
+ kvm_vcpu_reset(vcpu, false);
kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
kvm_write_tsc(vcpu, &msr);
vcpu_put(vcpu);
+ if (!kvmclock_periodic_sync)
+ return;
+
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
KVMCLOCK_SYNC_PERIOD);
}
kvm_x86_ops->vcpu_free(vcpu);
}
-void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
+void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
- kvm_pmu_reset(vcpu);
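+ /* Like most MSRs, PMU state survives INIT; only a full RESET clears it. */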
+ if (!init_event)
+ kvm_pmu_reset(vcpu);
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
- kvm_x86_ops->vcpu_reset(vcpu);
+ kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
int kvm_arch_hardware_setup(void)
{
- return kvm_x86_ops->hardware_setup();
+ int r;
+
+ r = kvm_x86_ops->hardware_setup();
+ if (r != 0)
+ return r;
+
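+ /*
+ * The MSR list can depend on what ->hardware_setup() just probed,
+ * so build it only after hardware setup has succeeded.
+ */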
+ kvm_init_msr_list();
+ return 0;
}
void kvm_arch_hardware_unsetup(void)
goto fail_free_mce_banks;
}
- r = fx_init(vcpu);
+ r = fx_init(vcpu, false);
if (r)
goto fail_free_wbinvd_dirty_mask;
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
return -ENOMEM;
}
-void kvm_arch_memslots_updated(struct kvm *kvm)
+void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
{
/*
* memslots->generation has been incremented.
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
/*
}
void kvm_arch_commit_memory_region(struct kvm *kvm,
- struct kvm_userspace_memory_region *mem,
+ const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- struct kvm_memory_slot *new;
int nr_mmu_pages = 0;
- if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
+ if (change == KVM_MR_DELETE && old->id >= KVM_USER_MEM_SLOTS) {
int ret;
ret = vm_munmap(old->userspace_addr,
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
- /* It's OK to get 'new' slot here as it has already been installed */
- new = id_to_memslot(kvm->memslots, mem->slot);
-
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
* sptes have to be split. If live migration is successful, the guest
* been zapped so no dirty logging tracking is needed for the old slot. For
* KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
* new and it's also covered when dealing with the new slot.
+ *
+ * FIXME: const-ify all uses of struct kvm_memory_slot.
*/
if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, new);
+ kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)