1 /*
2 * vvmx.c: Support virtual VMX for nested virtualization.
3 *
4 * Copyright (c) 2010, Intel Corporation.
5 * Author: Qing He <qing.he@intel.com>
6 * Eddie Dong <eddie.dong@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms and conditions of the GNU General Public License,
10 * version 2, as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope it will be useful, but WITHOUT
13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
15 * more details.
16 *
17 * You should have received a copy of the GNU General Public License along with
18 * this program; If not, see <http://www.gnu.org/licenses/>.
19 *
20 */
21
22 #include <asm/types.h>
23 #include <asm/mtrr.h>
24 #include <asm/p2m.h>
25 #include <asm/hvm/ioreq.h>
26 #include <asm/hvm/vmx/vmx.h>
27 #include <asm/hvm/vmx/vvmx.h>
28 #include <asm/hvm/nestedhvm.h>
29
30 static DEFINE_PER_CPU(u64 *, vvmcs_buf);
31
32 static void nvmx_purge_vvmcs(struct vcpu *v);
33
34 static bool nvmx_vcpu_in_vmx(const struct vcpu *v)
35 {
36 return vcpu_2_nvmx(v).vmxon_region_pa != INVALID_PADDR;
37 }
38
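/*
 * Per-pCPU scratch buffer used by the bulk vvmcs <-> shadow VMCS copy
 * helpers below; VMCS_BUF_SIZE bounds the number of fields that can be
 * copied in a single batch.
 */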
39 #define VMCS_BUF_SIZE 100
40
41 int nvmx_cpu_up_prepare(unsigned int cpu)
42 {
43 if ( per_cpu(vvmcs_buf, cpu) != NULL )
44 return 0;
45
46 per_cpu(vvmcs_buf, cpu) = xzalloc_array(u64, VMCS_BUF_SIZE);
47
48 if ( per_cpu(vvmcs_buf, cpu) != NULL )
49 return 0;
50
51 return -ENOMEM;
52 }
53
54 void nvmx_cpu_dead(unsigned int cpu)
55 {
56 xfree(per_cpu(vvmcs_buf, cpu));
57 per_cpu(vvmcs_buf, cpu) = NULL;
58 }
59
60 int nvmx_vcpu_initialise(struct vcpu *v)
61 {
62 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
63 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
64 struct page_info *pg = alloc_domheap_page(NULL, 0);
65
66 if ( !pg )
67 {
68 gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n");
69 return -ENOMEM;
70 }
71 nvcpu->nv_n2vmcx_pa = page_to_maddr(pg);
72
73 /* non-root VMREAD/VMWRITE bitmap. */
74 if ( cpu_has_vmx_vmcs_shadowing )
75 {
76 struct page_info *vmread_bitmap, *vmwrite_bitmap;
77 unsigned long *vw;
78
79 vmread_bitmap = alloc_domheap_page(NULL, 0);
80 if ( !vmread_bitmap )
81 {
82 gdprintk(XENLOG_ERR, "nest: allocation for vmread bitmap failed\n");
83 return -ENOMEM;
84 }
85 v->arch.hvm_vmx.vmread_bitmap = vmread_bitmap;
86
87 clear_domain_page(_mfn(page_to_mfn(vmread_bitmap)));
88
89 vmwrite_bitmap = alloc_domheap_page(NULL, 0);
90 if ( !vmwrite_bitmap )
91 {
92 gdprintk(XENLOG_ERR, "nest: allocation for vmwrite bitmap failed\n");
93 return -ENOMEM;
94 }
95 v->arch.hvm_vmx.vmwrite_bitmap = vmwrite_bitmap;
96
97 vw = __map_domain_page(vmwrite_bitmap);
98 clear_page(vw);
99
100 /*
101 * The following 6 encodings need to be handled by the VMM,
102 * so let writes to them cause a vmexit as usual.
103 */
104 set_bit(IO_BITMAP_A, vw);
105 set_bit(VMCS_HIGH(IO_BITMAP_A), vw);
106 set_bit(IO_BITMAP_B, vw);
107 set_bit(VMCS_HIGH(IO_BITMAP_B), vw);
108 set_bit(MSR_BITMAP, vw);
109 set_bit(VMCS_HIGH(MSR_BITMAP), vw);
110
111 unmap_domain_page(vw);
112 }
113
114 nvmx->ept.enabled = 0;
115 nvmx->guest_vpid = 0;
116 nvmx->vmxon_region_pa = INVALID_PADDR;
117 nvcpu->nv_vvmcx = NULL;
118 nvcpu->nv_vvmcxaddr = INVALID_PADDR;
119 nvmx->intr.intr_info = 0;
120 nvmx->intr.error_code = 0;
121 nvmx->iobitmap[0] = NULL;
122 nvmx->iobitmap[1] = NULL;
123 nvmx->msrbitmap = NULL;
124 INIT_LIST_HEAD(&nvmx->launched_list);
125 return 0;
126 }
127
128 void nvmx_vcpu_destroy(struct vcpu *v)
129 {
130 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
131 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
132 struct vvmcs_list *item, *n;
133
134 /*
135 * When destroying the vCPU, it may still be running on behalf of an
136 * L2 guest. Therefore we need to switch the VMCS pointer back to the
137 * L1 VMCS, in order to avoid a double free of the L2 VMCS and a
138 * possible leak of the L1 VMCS page.
139 */
140 if ( nvcpu->nv_n1vmcx_pa )
141 v->arch.hvm_vmx.vmcs_pa = nvcpu->nv_n1vmcx_pa;
142
143 if ( nvcpu->nv_n2vmcx_pa )
144 {
145 __vmpclear(nvcpu->nv_n2vmcx_pa);
146 free_domheap_page(maddr_to_page(nvcpu->nv_n2vmcx_pa));
147 nvcpu->nv_n2vmcx_pa = 0;
148 }
149
150 /* Must also cope with nvmx_vcpu_initialise() not having got called. */
151 if ( nvmx->launched_list.next )
152 list_for_each_entry_safe(item, n, &nvmx->launched_list, node)
153 {
154 list_del(&item->node);
155 xfree(item);
156 }
157
158 if ( v->arch.hvm_vmx.vmread_bitmap )
159 {
160 free_domheap_page(v->arch.hvm_vmx.vmread_bitmap);
161 v->arch.hvm_vmx.vmread_bitmap = NULL;
162 }
163 if ( v->arch.hvm_vmx.vmwrite_bitmap )
164 {
165 free_domheap_page(v->arch.hvm_vmx.vmwrite_bitmap);
166 v->arch.hvm_vmx.vmwrite_bitmap = NULL;
167 }
168 }
169
170 void nvmx_domain_relinquish_resources(struct domain *d)
171 {
172 struct vcpu *v;
173
174 for_each_vcpu ( d, v )
175 nvmx_purge_vvmcs(v);
176 }
177
178 int nvmx_vcpu_reset(struct vcpu *v)
179 {
180 return 0;
181 }
182
183 uint64_t nvmx_vcpu_eptp_base(struct vcpu *v)
184 {
185 return get_vvmcs(v, EPT_POINTER) & PAGE_MASK;
186 }
187
188 bool_t nvmx_ept_enabled(struct vcpu *v)
189 {
190 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
191
192 return !!(nvmx->ept.enabled);
193 }
194
195 struct vmx_inst_decoded {
196 #define VMX_INST_MEMREG_TYPE_MEMORY 0
197 #define VMX_INST_MEMREG_TYPE_REG 1
198 int type;
199 union {
200 struct {
201 unsigned long mem;
202 unsigned int len;
203 };
204 enum vmx_regs_enc reg1;
205 };
206
207 enum vmx_regs_enc reg2;
208 };
209
210 enum vmx_ops_result {
211 VMSUCCEED,
212 VMFAIL_VALID,
213 VMFAIL_INVALID,
214 };
215
216 #define CASE_SET_REG(REG, reg) \
217 case VMX_REG_ ## REG: regs->reg = value; break
218 #define CASE_GET_REG(REG, reg) \
219 case VMX_REG_ ## REG: value = regs->reg; break
220
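/*
 * The virtual VMCS is kept as a flat array of u64 slots inside the guest
 * page.  A field's slot is derived from the width/type/index bits of its
 * encoding: e.g. GUEST_CS_SELECTOR (encoding 0x0802: width 0, type 2,
 * index 1) lands in slot 0x41.  Slot 0 holds the VMCS revision identifier,
 * so VPID (which would map to slot 0) is relocated to slot 0x3f, which no
 * architectural field maps to.
 */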
221 static int vvmcs_offset(u32 width, u32 type, u32 index)
222 {
223 int offset;
224
225 offset = (index & 0x1f) | type << 5 | width << 7;
226
227 if ( offset == 0 ) /* vpid */
228 offset = 0x3f;
229
230 return offset;
231 }
232
233 u64 get_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding)
234 {
235 union vmcs_encoding enc;
236 u64 *content = (u64 *) vvmcs;
237 int offset;
238 u64 res;
239
240 enc.word = vmcs_encoding;
241 offset = vvmcs_offset(enc.width, enc.type, enc.index);
242 res = content[offset];
243
244 switch ( enc.width ) {
245 case VVMCS_WIDTH_16:
246 res &= 0xffff;
247 break;
248 case VVMCS_WIDTH_64:
249 if ( enc.access_type )
250 res >>= 32;
251 break;
252 case VVMCS_WIDTH_32:
253 res &= 0xffffffff;
254 break;
255 case VVMCS_WIDTH_NATURAL:
256 default:
257 break;
258 }
259
260 return res;
261 }
262
263 u64 get_vvmcs_real(const struct vcpu *v, u32 encoding)
264 {
265 return virtual_vmcs_vmread(v, encoding);
266 }
267
268 enum vmx_insn_errno get_vvmcs_virtual_safe(void *vvmcs, u32 encoding, u64 *val)
269 {
270 *val = get_vvmcs_virtual(vvmcs, encoding);
271
272 /*
273 * TODO: This should not always succeed. Fields and values need to be
274 * audited against the features offered to the guest in the VT-x MSRs.
275 * This should be fixed when the MSR levelling work is started, at which
276 * point there will be a cpuid_policy-like object.
277 */
278 return VMX_INSN_SUCCEED;
279 }
280
281 enum vmx_insn_errno get_vvmcs_real_safe(const struct vcpu *v, u32 encoding,
282 u64 *val)
283 {
284 return virtual_vmcs_vmread_safe(v, encoding, val);
285 }
286
287 void set_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding, u64 val)
288 {
289 union vmcs_encoding enc;
290 u64 *content = (u64 *) vvmcs;
291 int offset;
292 u64 res;
293
294 enc.word = vmcs_encoding;
295 offset = vvmcs_offset(enc.width, enc.type, enc.index);
296 res = content[offset];
297
298 switch ( enc.width ) {
299 case VVMCS_WIDTH_16:
300 res = val & 0xffff;
301 break;
302 case VVMCS_WIDTH_64:
303 if ( enc.access_type )
304 {
305 res &= 0xffffffff;
306 res |= val << 32;
307 }
308 else
309 res = val;
310 break;
311 case VVMCS_WIDTH_32:
312 res = val & 0xffffffff;
313 break;
314 case VVMCS_WIDTH_NATURAL:
315 default:
316 res = val;
317 break;
318 }
319
320 content[offset] = res;
321 }
322
323 void set_vvmcs_real(const struct vcpu *v, u32 encoding, u64 val)
324 {
325 virtual_vmcs_vmwrite(v, encoding, val);
326 }
327
328 enum vmx_insn_errno set_vvmcs_virtual_safe(void *vvmcs, u32 encoding, u64 val)
329 {
330 set_vvmcs_virtual(vvmcs, encoding, val);
331
332 /*
333 * TODO: This should not always succeed. Fields and values need to be
334 * audited against the features offered to the guest in the VT-x MSRs.
335 * This should be fixed when the MSR levelling work is started, at which
336 * point there will be a cpuid_policy-like object.
337 */
338 return VMX_INSN_SUCCEED;
339 }
340
341 enum vmx_insn_errno set_vvmcs_real_safe(const struct vcpu *v, u32 encoding,
342 u64 val)
343 {
344 return virtual_vmcs_vmwrite_safe(v, encoding, val);
345 }
346
347 static unsigned long reg_read(struct cpu_user_regs *regs,
348 enum vmx_regs_enc index)
349 {
350 unsigned long *pval = decode_register(index, regs, 0);
351
352 return *pval;
353 }
354
355 static void reg_write(struct cpu_user_regs *regs,
356 enum vmx_regs_enc index,
357 unsigned long value)
358 {
359 unsigned long *pval = decode_register(index, regs, 0);
360
361 *pval = value;
362 }
363
364 static inline u32 __n2_pin_exec_control(struct vcpu *v)
365 {
366 return get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
367 }
368
369 static inline u32 __n2_exec_control(struct vcpu *v)
370 {
371 return get_vvmcs(v, CPU_BASED_VM_EXEC_CONTROL);
372 }
373
374 static inline u32 __n2_secondary_exec_control(struct vcpu *v)
375 {
376 u64 second_ctrl = 0;
377
378 if ( __n2_exec_control(v) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
379 second_ctrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
380
381 return second_ctrl;
382 }
383
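/*
 * Common privilege checks for emulated VMX instructions: for VMXON,
 * CR0.PE and CR4.VMXE must be set; for all other instructions the vCPU
 * must already be in (virtual) VMX operation.  Real/VM86 mode (and
 * compatibility mode when in long mode) is rejected with #UD, CPL > 0
 * raises #GP, and if the vCPU is currently running in L2 guest mode the
 * instruction is reflected to L1 as a vmexit.
 */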
384 static int vmx_inst_check_privilege(struct cpu_user_regs *regs, int vmxop_check)
385 {
386 struct vcpu *v = current;
387
388 if ( vmxop_check )
389 {
390 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ||
391 !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) )
392 goto invalid_op;
393 }
394 else if ( !nvmx_vcpu_in_vmx(v) )
395 goto invalid_op;
396
397 if ( vmx_guest_x86_mode(v) < (hvm_long_mode_active(v) ? 8 : 2) )
398 goto invalid_op;
399 else if ( nestedhvm_vcpu_in_guestmode(v) )
400 goto vmexit;
401
402 if ( vmx_get_cpl() > 0 )
403 goto gp_fault;
404
405 return X86EMUL_OKAY;
406
407 vmexit:
408 gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: vmexit\n");
409 vcpu_nestedhvm(v).nv_vmexit_pending = 1;
410 return X86EMUL_EXCEPTION;
411
412 invalid_op:
413 gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: invalid_op\n");
414 hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
415 return X86EMUL_EXCEPTION;
416
417 gp_fault:
418 gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: gp_fault\n");
419 hvm_inject_hw_exception(TRAP_gp_fault, 0);
420 return X86EMUL_EXCEPTION;
421 }
422
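/*
 * Decode the operand of a VMX instruction from the VM-exit instruction
 * information field.  Register operands are read directly from the guest
 * register file; memory operands are formed from base + index * scale plus
 * the displacement held in the exit qualification, checked against segment
 * limit / canonical form, and optionally copied into *poperandS.
 */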
423 static int decode_vmx_inst(struct cpu_user_regs *regs,
424 struct vmx_inst_decoded *decode,
425 unsigned long *poperandS, int vmxon_check)
426 {
427 struct vcpu *v = current;
428 union vmx_inst_info info;
429 struct segment_register seg;
430 unsigned long base, index, seg_base, disp, offset;
431 int scale, size;
432
433 if ( vmx_inst_check_privilege(regs, vmxon_check) != X86EMUL_OKAY )
434 return X86EMUL_EXCEPTION;
435
436 __vmread(VMX_INSTRUCTION_INFO, &offset);
437 info.word = offset;
438
439 if ( info.fields.memreg ) {
440 decode->type = VMX_INST_MEMREG_TYPE_REG;
441 decode->reg1 = info.fields.reg1;
442 if ( poperandS != NULL )
443 *poperandS = reg_read(regs, decode->reg1);
444 }
445 else
446 {
447 bool mode_64bit = (vmx_guest_x86_mode(v) == 8);
448
449 decode->type = VMX_INST_MEMREG_TYPE_MEMORY;
450
451 if ( info.fields.segment > x86_seg_gs )
452 goto gp_fault;
453 hvm_get_segment_register(v, info.fields.segment, &seg);
454 seg_base = seg.base;
455
456 base = info.fields.base_reg_invalid ? 0 :
457 reg_read(regs, info.fields.base_reg);
458
459 index = info.fields.index_reg_invalid ? 0 :
460 reg_read(regs, info.fields.index_reg);
461
462 scale = 1 << info.fields.scaling;
463
464 __vmread(EXIT_QUALIFICATION, &disp);
465
466 size = 1 << (info.fields.addr_size + 1);
467
468 offset = base + index * scale + disp;
469 base = !mode_64bit || info.fields.segment >= x86_seg_fs ?
470 seg_base + offset : offset;
471 if ( offset + size - 1 < offset ||
472 (mode_64bit ?
473 !is_canonical_address((long)base < 0 ? base :
474 base + size - 1) :
475 offset + size - 1 > seg.limit) )
476 goto gp_fault;
477
478 if ( poperandS != NULL )
479 {
480 pagefault_info_t pfinfo;
481 int rc = hvm_copy_from_guest_linear(poperandS, base, size,
482 0, &pfinfo);
483
484 if ( rc == HVMTRANS_bad_linear_to_gfn )
485 hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
486 if ( rc != HVMTRANS_okay )
487 return X86EMUL_EXCEPTION;
488 }
489 decode->mem = base;
490 decode->len = size;
491 }
492
493 decode->reg2 = info.fields.reg2;
494
495 return X86EMUL_OKAY;
496
497 gp_fault:
498 hvm_inject_hw_exception(TRAP_gp_fault, 0);
499 return X86EMUL_EXCEPTION;
500 }
501
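/*
 * Result reporting for emulated VMX instructions follows the architectural
 * convention: VMsucceed clears all of the arithmetic flags, VMfailInvalid
 * sets only CF, and VMfailValid sets only ZF and records the error number
 * in the current vVMCS's VM_INSTRUCTION_ERROR field.  vmfail() picks the
 * valid/invalid variant based on whether a current vVMCS exists.
 */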
502 static void vmsucceed(struct cpu_user_regs *regs)
503 {
504 regs->eflags &= ~X86_EFLAGS_ARITH_MASK;
505 }
506
507 static void vmfail_valid(struct cpu_user_regs *regs, enum vmx_insn_errno errno)
508 {
509 struct vcpu *v = current;
510 unsigned int eflags = regs->eflags;
511
512 regs->eflags = (eflags & ~X86_EFLAGS_ARITH_MASK) | X86_EFLAGS_ZF;
513 set_vvmcs(v, VM_INSTRUCTION_ERROR, errno);
514 }
515
516 static void vmfail_invalid(struct cpu_user_regs *regs)
517 {
518 unsigned int eflags = regs->eflags;
519
520 regs->eflags = (eflags & ~X86_EFLAGS_ARITH_MASK) | X86_EFLAGS_CF;
521 }
522
523 static void vmfail(struct cpu_user_regs *regs, enum vmx_insn_errno errno)
524 {
525 if ( errno == VMX_INSN_SUCCEED )
526 return;
527
528 if ( vcpu_nestedhvm(current).nv_vvmcxaddr != INVALID_PADDR &&
529 errno != VMX_INSN_FAIL_INVALID )
530 vmfail_valid(regs, errno);
531 else
532 vmfail_invalid(regs);
533 }
534
535 bool_t nvmx_intercepts_exception(
536 struct vcpu *v, unsigned int vector, int error_code)
537 {
538 u32 exception_bitmap, pfec_match=0, pfec_mask=0;
539 int r;
540
541 ASSERT(vector < 32);
542
543 exception_bitmap = get_vvmcs(v, EXCEPTION_BITMAP);
544 r = exception_bitmap & (1 << vector) ? 1: 0;
545
546 if ( vector == TRAP_page_fault )
547 {
548 pfec_match = get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MATCH);
549 pfec_mask = get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MASK);
550 if ( (error_code & pfec_mask) != pfec_match )
551 r = !r;
552 }
553 return r;
554 }
555
556 /*
557 * Nested VMX uses the "strict" condition: exit from the
558 * L2 guest if either the L1 VMM or the L0 VMM expects the exit.
559 */
560 static inline u32 __shadow_control(struct vcpu *v,
561 unsigned int field,
562 u32 host_value)
563 {
564 return get_vvmcs(v, field) | host_value;
565 }
566
567 static void set_shadow_control(struct vcpu *v,
568 unsigned int field,
569 u32 host_value)
570 {
571 __vmwrite(field, __shadow_control(v, field, host_value));
572 }
573
574 unsigned long *_shadow_io_bitmap(struct vcpu *v)
575 {
576 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
577 int port80, portED;
578 u8 *bitmap;
579
580 bitmap = nvmx->iobitmap[0];
581 port80 = bitmap[0x80 >> 3] & (1 << (0x80 & 0x7)) ? 1 : 0;
582 portED = bitmap[0xed >> 3] & (1 << (0xed & 0x7)) ? 1 : 0;
583
584 return nestedhvm_vcpu_iomap_get(port80, portED);
585 }
586
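/*
 * Merge the CPU-based execution controls requested by L1 with those
 * required by Xen (L0).  MSR and I/O bitmap interception cannot simply be
 * delegated to L1: if L1 asks for unconditional I/O exiting we keep that,
 * otherwise IO_BITMAP_A/B are pointed at a Xen-owned bitmap - either the
 * plain host bitmap, or a variant reflecting whether L1 traps ports
 * 0x80/0xED.
 */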
587 void nvmx_update_exec_control(struct vcpu *v, u32 host_cntrl)
588 {
589 u32 pio_cntrl = (CPU_BASED_ACTIVATE_IO_BITMAP
590 | CPU_BASED_UNCOND_IO_EXITING);
591 unsigned long *bitmap;
592 u32 shadow_cntrl;
593
594 shadow_cntrl = __n2_exec_control(v);
595 pio_cntrl &= shadow_cntrl;
596 /* Enforce the removed features */
597 shadow_cntrl &= ~(CPU_BASED_ACTIVATE_MSR_BITMAP
598 | CPU_BASED_ACTIVATE_IO_BITMAP
599 | CPU_BASED_UNCOND_IO_EXITING);
600 shadow_cntrl |= host_cntrl;
601 if ( pio_cntrl == CPU_BASED_UNCOND_IO_EXITING ) {
602 /* L1 VMM intercepts all I/O instructions */
603 shadow_cntrl |= CPU_BASED_UNCOND_IO_EXITING;
604 shadow_cntrl &= ~CPU_BASED_ACTIVATE_IO_BITMAP;
605 }
606 else {
607 /* Use IO_BITMAP in shadow */
608 if ( pio_cntrl == 0 ) {
609 /*
610 * L1 VMM doesn't intercept I/O instructions.
611 * Use the host configuration and reset IO_BITMAP.
612 */
613 bitmap = hvm_io_bitmap;
614 }
615 else {
616 /* use IO bitmap */
617 bitmap = _shadow_io_bitmap(v);
618 }
619 __vmwrite(IO_BITMAP_A, virt_to_maddr(bitmap));
620 __vmwrite(IO_BITMAP_B, virt_to_maddr(bitmap) + PAGE_SIZE);
621 }
622
623 /* TODO: change L0 intr window to MTF or NMI window */
624 __vmwrite(CPU_BASED_VM_EXEC_CONTROL, shadow_cntrl);
625 }
626
627 void nvmx_update_secondary_exec_control(struct vcpu *v,
628 unsigned long host_cntrl)
629 {
630 u32 shadow_cntrl;
631 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
632 u32 apicv_bit = SECONDARY_EXEC_APIC_REGISTER_VIRT |
633 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
634
635 host_cntrl &= ~apicv_bit;
636 host_cntrl &= ~SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
637 shadow_cntrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
638
639 /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
640 ASSERT(!(shadow_cntrl & apicv_bit));
641
642 nvmx->ept.enabled = !!(shadow_cntrl & SECONDARY_EXEC_ENABLE_EPT);
643 shadow_cntrl |= host_cntrl;
644 __vmwrite(SECONDARY_VM_EXEC_CONTROL, shadow_cntrl);
645 }
646
647 static void nvmx_update_pin_control(struct vcpu *v, unsigned long host_cntrl)
648 {
649 u32 shadow_cntrl;
650
651 host_cntrl &= ~PIN_BASED_POSTED_INTERRUPT;
652 shadow_cntrl = get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
653
654 /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
655 ASSERT(!(shadow_cntrl & PIN_BASED_POSTED_INTERRUPT));
656
657 shadow_cntrl |= host_cntrl;
658 __vmwrite(PIN_BASED_VM_EXEC_CONTROL, shadow_cntrl);
659 }
660
661 static void nvmx_update_exit_control(struct vcpu *v, unsigned long host_cntrl)
662 {
663 u32 shadow_cntrl;
664
665 shadow_cntrl = get_vvmcs(v, VM_EXIT_CONTROLS);
666 shadow_cntrl &= ~(VM_EXIT_SAVE_DEBUG_CNTRLS
667 | VM_EXIT_LOAD_HOST_PAT
668 | VM_EXIT_LOAD_HOST_EFER
669 | VM_EXIT_LOAD_PERF_GLOBAL_CTRL);
670 shadow_cntrl |= host_cntrl;
671 __vmwrite(VM_EXIT_CONTROLS, shadow_cntrl);
672 }
673
674 static void nvmx_update_entry_control(struct vcpu *v)
675 {
676 u32 shadow_cntrl;
677
678 shadow_cntrl = get_vvmcs(v, VM_ENTRY_CONTROLS);
679 shadow_cntrl &= ~(VM_ENTRY_LOAD_GUEST_PAT
680 | VM_ENTRY_LOAD_GUEST_EFER
681 | VM_ENTRY_LOAD_PERF_GLOBAL_CTRL);
682 __vmwrite(VM_ENTRY_CONTROLS, shadow_cntrl);
683 }
684
685 void nvmx_update_exception_bitmap(struct vcpu *v, unsigned long value)
686 {
687 set_shadow_control(v, EXCEPTION_BITMAP, value);
688 }
689
690 static void nvmx_update_apic_access_address(struct vcpu *v)
691 {
692 u32 ctrl;
693
694 ctrl = __n2_secondary_exec_control(v);
695 if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
696 {
697 p2m_type_t p2mt;
698 unsigned long apic_gpfn;
699 struct page_info *apic_pg;
700
701 apic_gpfn = get_vvmcs(v, APIC_ACCESS_ADDR) >> PAGE_SHIFT;
702 apic_pg = get_page_from_gfn(v->domain, apic_gpfn, &p2mt, P2M_ALLOC);
703 ASSERT(apic_pg && !p2m_is_paging(p2mt));
704 __vmwrite(APIC_ACCESS_ADDR, page_to_maddr(apic_pg));
705 put_page(apic_pg);
706 }
707 else
708 __vmwrite(APIC_ACCESS_ADDR, 0);
709 }
710
711 static void nvmx_update_virtual_apic_address(struct vcpu *v)
712 {
713 u32 ctrl;
714
715 ctrl = __n2_exec_control(v);
716 if ( ctrl & CPU_BASED_TPR_SHADOW )
717 {
718 p2m_type_t p2mt;
719 unsigned long vapic_gpfn;
720 struct page_info *vapic_pg;
721
722 vapic_gpfn = get_vvmcs(v, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
723 vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
724 ASSERT(vapic_pg && !p2m_is_paging(p2mt));
725 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
726 put_page(vapic_pg);
727 }
728 else
729 __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
730 }
731
732 static void nvmx_update_tpr_threshold(struct vcpu *v)
733 {
734 u32 ctrl = __n2_exec_control(v);
735
736 if ( ctrl & CPU_BASED_TPR_SHADOW )
737 __vmwrite(TPR_THRESHOLD, get_vvmcs(v, TPR_THRESHOLD));
738 else
739 __vmwrite(TPR_THRESHOLD, 0);
740 }
741
742 static void nvmx_update_pfec(struct vcpu *v)
743 {
744 __vmwrite(PAGE_FAULT_ERROR_CODE_MASK,
745 get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MASK));
746 __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH,
747 get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MATCH));
748 }
749
750 static void __clear_current_vvmcs(struct vcpu *v)
751 {
752 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
753
754 if ( nvcpu->nv_n2vmcx_pa )
755 __vmpclear(nvcpu->nv_n2vmcx_pa);
756 }
757
758 /*
759 * Refreshes the MSR bitmap mapping for the current nested vcpu. Returns true
760 * for a successful mapping, and returns false for MSR_BITMAP parameter errors
761 * or gfn mapping errors.
762 */
763 static bool __must_check _map_msr_bitmap(struct vcpu *v)
764 {
765 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
766 uint64_t gpa;
767
768 if ( nvmx->msrbitmap )
769 {
770 hvm_unmap_guest_frame(nvmx->msrbitmap, 1);
771 nvmx->msrbitmap = NULL;
772 }
773
774 gpa = get_vvmcs(v, MSR_BITMAP);
775
776 if ( !IS_ALIGNED(gpa, PAGE_SIZE) )
777 return false;
778
779 nvmx->msrbitmap = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);
780
781 return nvmx->msrbitmap != NULL;
782 }
783
784 static bool_t __must_check _map_io_bitmap(struct vcpu *v, u64 vmcs_reg)
785 {
786 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
787 unsigned long gpa;
788 int index;
789
790 index = vmcs_reg == IO_BITMAP_A ? 0 : 1;
791 if (nvmx->iobitmap[index])
792 hvm_unmap_guest_frame(nvmx->iobitmap[index], 1);
793 gpa = get_vvmcs(v, vmcs_reg);
794 nvmx->iobitmap[index] = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);
795
796 return nvmx->iobitmap[index] != NULL;
797 }
798
799 static inline bool_t __must_check map_io_bitmap_all(struct vcpu *v)
800 {
801 return _map_io_bitmap(v, IO_BITMAP_A) &&
802 _map_io_bitmap(v, IO_BITMAP_B);
803 }
804
805 static void nvmx_purge_vvmcs(struct vcpu *v)
806 {
807 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
808 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
809 int i;
810
811 __clear_current_vvmcs(v);
812 if ( nvcpu->nv_vvmcxaddr != INVALID_PADDR )
813 hvm_unmap_guest_frame(nvcpu->nv_vvmcx, 1);
814 nvcpu->nv_vvmcx = NULL;
815 nvcpu->nv_vvmcxaddr = INVALID_PADDR;
816 v->arch.hvm_vmx.vmcs_shadow_maddr = 0;
817 for (i=0; i<2; i++) {
818 if ( nvmx->iobitmap[i] ) {
819 hvm_unmap_guest_frame(nvmx->iobitmap[i], 1);
820 nvmx->iobitmap[i] = NULL;
821 }
822 }
823 if ( nvmx->msrbitmap ) {
824 hvm_unmap_guest_frame(nvmx->msrbitmap, 1);
825 nvmx->msrbitmap = NULL;
826 }
827 }
828
829 u64 nvmx_get_tsc_offset(struct vcpu *v)
830 {
831 u64 offset = 0;
832
833 if ( get_vvmcs(v, CPU_BASED_VM_EXEC_CONTROL) &
834 CPU_BASED_USE_TSC_OFFSETING )
835 offset = get_vvmcs(v, TSC_OFFSET);
836
837 return offset;
838 }
839
840 /*
841 * Context synchronized between shadow and virtual VMCS.
842 */
843 static const u16 vmcs_gstate_field[] = {
844 /* 16 BITS */
845 GUEST_ES_SELECTOR,
846 GUEST_CS_SELECTOR,
847 GUEST_SS_SELECTOR,
848 GUEST_DS_SELECTOR,
849 GUEST_FS_SELECTOR,
850 GUEST_GS_SELECTOR,
851 GUEST_LDTR_SELECTOR,
852 GUEST_TR_SELECTOR,
853 /* 64 BITS */
854 VMCS_LINK_POINTER,
855 GUEST_IA32_DEBUGCTL,
856 GUEST_PAT,
857 GUEST_EFER,
858 GUEST_PERF_GLOBAL_CTRL,
859 /* 32 BITS */
860 GUEST_ES_LIMIT,
861 GUEST_CS_LIMIT,
862 GUEST_SS_LIMIT,
863 GUEST_DS_LIMIT,
864 GUEST_FS_LIMIT,
865 GUEST_GS_LIMIT,
866 GUEST_LDTR_LIMIT,
867 GUEST_TR_LIMIT,
868 GUEST_GDTR_LIMIT,
869 GUEST_IDTR_LIMIT,
870 GUEST_ES_AR_BYTES,
871 GUEST_CS_AR_BYTES,
872 GUEST_SS_AR_BYTES,
873 GUEST_DS_AR_BYTES,
874 GUEST_FS_AR_BYTES,
875 GUEST_GS_AR_BYTES,
876 GUEST_LDTR_AR_BYTES,
877 GUEST_TR_AR_BYTES,
878 GUEST_INTERRUPTIBILITY_INFO,
879 GUEST_ACTIVITY_STATE,
880 GUEST_SYSENTER_CS,
881 GUEST_PREEMPTION_TIMER,
882 /* natural */
883 GUEST_ES_BASE,
884 GUEST_CS_BASE,
885 GUEST_SS_BASE,
886 GUEST_DS_BASE,
887 GUEST_FS_BASE,
888 GUEST_GS_BASE,
889 GUEST_LDTR_BASE,
890 GUEST_TR_BASE,
891 GUEST_GDTR_BASE,
892 GUEST_IDTR_BASE,
893 GUEST_DR7,
894 /*
895 * Following guest states are in local cache (cpu_user_regs)
896 GUEST_RSP,
897 GUEST_RIP,
898 */
899 GUEST_RFLAGS,
900 GUEST_PENDING_DBG_EXCEPTIONS,
901 GUEST_SYSENTER_ESP,
902 GUEST_SYSENTER_EIP,
903 };
904
905 static const u16 gpdpte_fields[] = {
906 GUEST_PDPTE(0),
907 GUEST_PDPTE(1),
908 GUEST_PDPTE(2),
909 GUEST_PDPTE(3),
910 };
911
912 /*
913 * Context: shadow -> virtual VMCS
914 */
915 static const u16 vmcs_ro_field[] = {
916 GUEST_PHYSICAL_ADDRESS,
917 VM_INSTRUCTION_ERROR,
918 VM_EXIT_REASON,
919 VM_EXIT_INTR_INFO,
920 VM_EXIT_INTR_ERROR_CODE,
921 IDT_VECTORING_INFO,
922 IDT_VECTORING_ERROR_CODE,
923 VM_EXIT_INSTRUCTION_LEN,
924 VMX_INSTRUCTION_INFO,
925 EXIT_QUALIFICATION,
926 GUEST_LINEAR_ADDRESS
927 };
928
929 static struct vmcs_host_to_guest {
930 u16 host_field;
931 u16 guest_field;
932 } const vmcs_h2g_field[] = {
933 {HOST_ES_SELECTOR, GUEST_ES_SELECTOR},
934 {HOST_CS_SELECTOR, GUEST_CS_SELECTOR},
935 {HOST_SS_SELECTOR, GUEST_SS_SELECTOR},
936 {HOST_DS_SELECTOR, GUEST_DS_SELECTOR},
937 {HOST_FS_SELECTOR, GUEST_FS_SELECTOR},
938 {HOST_GS_SELECTOR, GUEST_GS_SELECTOR},
939 {HOST_TR_SELECTOR, GUEST_TR_SELECTOR},
940 {HOST_SYSENTER_CS, GUEST_SYSENTER_CS},
941 {HOST_FS_BASE, GUEST_FS_BASE},
942 {HOST_GS_BASE, GUEST_GS_BASE},
943 {HOST_TR_BASE, GUEST_TR_BASE},
944 {HOST_GDTR_BASE, GUEST_GDTR_BASE},
945 {HOST_IDTR_BASE, GUEST_IDTR_BASE},
946 {HOST_SYSENTER_ESP, GUEST_SYSENTER_ESP},
947 {HOST_SYSENTER_EIP, GUEST_SYSENTER_EIP},
948 };
949
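/*
 * Helpers for synchronising fields between the virtual VMCS and the
 * shadow VMCS used to run L2.  vvmcs_to_shadow*() copies virtual VMCS
 * fields into the currently loaded shadow VMCS, shadow_to_vvmcs*() copies
 * them back.  The bulk variants stage the values in the per-pCPU vvmcs_buf
 * so that, with VMCS shadowing, each direction needs only one switch onto
 * the virtual VMCS.
 */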
950 static void vvmcs_to_shadow(const struct vcpu *v, unsigned int field)
951 {
952 __vmwrite(field, get_vvmcs(v, field));
953 }
954
955 static void vvmcs_to_shadow_bulk(struct vcpu *v, unsigned int n,
956 const u16 *field)
957 {
958 u64 *value = this_cpu(vvmcs_buf);
959 unsigned int i;
960
961 if ( !cpu_has_vmx_vmcs_shadowing )
962 goto fallback;
963
964 if ( !value || n > VMCS_BUF_SIZE )
965 {
966 gdprintk(XENLOG_DEBUG, "vmcs sync fall back to non-bulk mode, "
967 "buffer: %p, buffer size: %d, fields number: %d.\n",
968 value, VMCS_BUF_SIZE, n);
969 goto fallback;
970 }
971
972 virtual_vmcs_enter(v);
973 for ( i = 0; i < n; i++ )
974 __vmread(field[i], &value[i]);
975 virtual_vmcs_exit(v);
976
977 for ( i = 0; i < n; i++ )
978 __vmwrite(field[i], value[i]);
979
980 return;
981
982 fallback:
983 for ( i = 0; i < n; i++ )
984 vvmcs_to_shadow(v, field[i]);
985 }
986
987 static inline void shadow_to_vvmcs(const struct vcpu *v, unsigned int field)
988 {
989 unsigned long value;
990
991 if ( vmread_safe(field, &value) == 0 )
992 set_vvmcs(v, field, value);
993 }
994
995 static void shadow_to_vvmcs_bulk(struct vcpu *v, unsigned int n,
996 const u16 *field)
997 {
998 u64 *value = this_cpu(vvmcs_buf);
999 unsigned int i;
1000
1001 if ( !cpu_has_vmx_vmcs_shadowing )
1002 goto fallback;
1003
1004 if ( !value || n > VMCS_BUF_SIZE )
1005 {
1006 gdprintk(XENLOG_DEBUG, "vmcs sync fall back to non-bulk mode, "
1007 "buffer: %p, buffer size: %d, fields number: %d.\n",
1008 value, VMCS_BUF_SIZE, n);
1009 goto fallback;
1010 }
1011
1012 for ( i = 0; i < n; i++ )
1013 __vmread(field[i], &value[i]);
1014
1015 virtual_vmcs_enter(v);
1016 for ( i = 0; i < n; i++ )
1017 __vmwrite(field[i], value[i]);
1018 virtual_vmcs_exit(v);
1019
1020 return;
1021
1022 fallback:
1023 for ( i = 0; i < n; i++ )
1024 shadow_to_vvmcs(v, field[i]);
1025 }
1026
1027 static void load_shadow_control(struct vcpu *v)
1028 {
1029 /*
1030 * Set the shadow controls: PIN_BASED, CPU_BASED, EXIT, ENTRY
1031 * and EXCEPTION.  The features removed from L1's view are
1032 * still enforced here.
1033 */
1034 nvmx_update_pin_control(v, vmx_pin_based_exec_control);
1035 vmx_update_cpu_exec_control(v);
1036 vmx_update_secondary_exec_control(v);
1037 nvmx_update_exit_control(v, vmx_vmexit_control);
1038 nvmx_update_entry_control(v);
1039 vmx_update_exception_bitmap(v);
1040 nvmx_update_apic_access_address(v);
1041 nvmx_update_virtual_apic_address(v);
1042 nvmx_update_tpr_threshold(v);
1043 nvmx_update_pfec(v);
1044 }
1045
1046 static void load_shadow_guest_state(struct vcpu *v)
1047 {
1048 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1049 u32 control;
1050 u64 cr_gh_mask, cr_read_shadow;
1051 int rc;
1052
1053 static const u16 vmentry_fields[] = {
1054 VM_ENTRY_INTR_INFO,
1055 VM_ENTRY_EXCEPTION_ERROR_CODE,
1056 VM_ENTRY_INSTRUCTION_LEN,
1057 };
1058
1059 /* vvmcs.gstate to shadow vmcs.gstate */
1060 vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmcs_gstate_field),
1061 vmcs_gstate_field);
1062
1063 nvcpu->guest_cr[0] = get_vvmcs(v, CR0_READ_SHADOW);
1064 nvcpu->guest_cr[4] = get_vvmcs(v, CR4_READ_SHADOW);
1065
1066 rc = hvm_set_cr0(get_vvmcs(v, GUEST_CR0), 1);
1067 if ( rc == X86EMUL_EXCEPTION )
1068 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1069
1070 rc = hvm_set_cr4(get_vvmcs(v, GUEST_CR4), 1);
1071 if ( rc == X86EMUL_EXCEPTION )
1072 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1073
1074 rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), 1);
1075 if ( rc == X86EMUL_EXCEPTION )
1076 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1077
1078 control = get_vvmcs(v, VM_ENTRY_CONTROLS);
1079 if ( control & VM_ENTRY_LOAD_GUEST_PAT )
1080 hvm_set_guest_pat(v, get_vvmcs(v, GUEST_PAT));
1081 if ( control & VM_ENTRY_LOAD_PERF_GLOBAL_CTRL )
1082 {
1083 rc = hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
1084 get_vvmcs(v, GUEST_PERF_GLOBAL_CTRL), 0);
1085 if ( rc == X86EMUL_EXCEPTION )
1086 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1087 }
1088
1089 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);
1090
1091 vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmentry_fields), vmentry_fields);
1092
1093 /*
1094 * While emulating CR0 and CR4 for nested virtualization, set the CR0/CR4
1095 * guest/host masks to 0xffffffff in the shadow VMCS (following the host L1
1096 * VMCS), then calculate the corresponding read shadows separately for CR0 and CR4.
1097 */
1098 cr_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
1099 cr_read_shadow = (get_vvmcs(v, GUEST_CR0) & ~cr_gh_mask) |
1100 (get_vvmcs(v, CR0_READ_SHADOW) & cr_gh_mask);
1101 __vmwrite(CR0_READ_SHADOW, cr_read_shadow);
1102
1103 cr_gh_mask = get_vvmcs(v, CR4_GUEST_HOST_MASK);
1104 cr_read_shadow = (get_vvmcs(v, GUEST_CR4) & ~cr_gh_mask) |
1105 (get_vvmcs(v, CR4_READ_SHADOW) & cr_gh_mask);
1106 __vmwrite(CR4_READ_SHADOW, cr_read_shadow);
1107
1108 /* TODO: CR3 target control */
1109 }
1110
1111 uint64_t get_shadow_eptp(struct vcpu *v)
1112 {
1113 struct p2m_domain *p2m = p2m_get_nestedp2m(v);
1114 struct ept_data *ept = &p2m->ept;
1115
1116 ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
1117 return ept->eptp;
1118 }
1119
1120 static uint64_t get_host_eptp(struct vcpu *v)
1121 {
1122 return p2m_get_hostp2m(v->domain)->ept.eptp;
1123 }
1124
1125 static bool_t nvmx_vpid_enabled(const struct vcpu *v)
1126 {
1127 uint32_t second_cntl;
1128
1129 second_cntl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
1130 if ( second_cntl & SECONDARY_EXEC_ENABLE_VPID )
1131 return 1;
1132 return 0;
1133 }
1134
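/*
 * With hardware VMCS shadowing, mark the guest's vVMCS page as a shadow
 * VMCS (by setting the shadow-VMCS indicator, VMCS_RID_TYPE_MASK, in its
 * revision id word), link it via VMCS_LINK_POINTER and install the
 * VMREAD/VMWRITE bitmaps, so that most L1 VMREAD/VMWRITE accesses are
 * satisfied by hardware without causing a vmexit.
 */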
1135 static void nvmx_set_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs)
1136 {
1137 paddr_t vvmcs_maddr = v->arch.hvm_vmx.vmcs_shadow_maddr;
1138
1139 __vmpclear(vvmcs_maddr);
1140 vvmcs->vmcs_revision_id |= VMCS_RID_TYPE_MASK;
1141 __vmwrite(VMCS_LINK_POINTER, vvmcs_maddr);
1142 __vmwrite(VMREAD_BITMAP, page_to_maddr(v->arch.hvm_vmx.vmread_bitmap));
1143 __vmwrite(VMWRITE_BITMAP, page_to_maddr(v->arch.hvm_vmx.vmwrite_bitmap));
1144 }
1145
1146 static void nvmx_clear_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs)
1147 {
1148 paddr_t vvmcs_maddr = v->arch.hvm_vmx.vmcs_shadow_maddr;
1149
1150 __vmpclear(vvmcs_maddr);
1151 vvmcs->vmcs_revision_id &= ~VMCS_RID_TYPE_MASK;
1152 __vmwrite(VMCS_LINK_POINTER, ~0ul);
1153 __vmwrite(VMREAD_BITMAP, 0);
1154 __vmwrite(VMWRITE_BITMAP, 0);
1155 }
1156
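/*
 * Emulated VMEntry: switch from the L1 (n1) VMCS to the L2 (n2) VMCS,
 * load the execution controls and guest state derived from the virtual
 * VMCS, and transfer RIP/RSP/RFLAGS from it into the register frame.
 */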
1157 static void virtual_vmentry(struct cpu_user_regs *regs)
1158 {
1159 struct vcpu *v = current;
1160 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1161 unsigned long lm_l1, lm_l2;
1162
1163 vmx_vmcs_switch(v->arch.hvm_vmx.vmcs_pa, nvcpu->nv_n2vmcx_pa);
1164
1165 nestedhvm_vcpu_enter_guestmode(v);
1166 nvcpu->nv_vmentry_pending = 0;
1167 nvcpu->nv_vmswitch_in_progress = 1;
1168
1169 /*
1170 * EFER handling:
1171 * hvm_set_efer won't work if CR0.PG = 1, so we change the value
1172 * directly to make hvm_long_mode_active(v) work in L2.
1173 * An additional update_paging_modes is also needed if
1174 * there is a 32/64-bit switch. v->arch.hvm_vcpu.guest_efer doesn't
1175 * need to be saved, since its value on vmexit is determined by the
1176 * L1 exit_controls.
1177 */
1178 lm_l1 = hvm_long_mode_active(v);
1179 lm_l2 = !!(get_vvmcs(v, VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
1180
1181 if ( lm_l2 )
1182 v->arch.hvm_vcpu.guest_efer |= EFER_LMA | EFER_LME;
1183 else
1184 v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME);
1185
1186 load_shadow_control(v);
1187 load_shadow_guest_state(v);
1188
1189 if ( lm_l1 != lm_l2 )
1190 paging_update_paging_modes(v);
1191
1192 if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
1193 !(v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
1194 vvmcs_to_shadow_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);
1195
1196 regs->rip = get_vvmcs(v, GUEST_RIP);
1197 regs->rsp = get_vvmcs(v, GUEST_RSP);
1198 regs->rflags = get_vvmcs(v, GUEST_RFLAGS);
1199
1200 /* updating host cr0 to sync TS bit */
1201 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
1202
1203 /* Set up virtual EPT for the L2 guest. */
1204 if ( nestedhvm_paging_mode_hap(v) )
1205 /* This will setup the initial np2m for the nested vCPU */
1206 __vmwrite(EPT_POINTER, get_shadow_eptp(v));
1207 else
1208 __vmwrite(EPT_POINTER, get_host_eptp(v));
1209
1210 /* nested VPID support! */
1211 if ( cpu_has_vmx_vpid && nvmx_vpid_enabled(v) )
1212 {
1213 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1214 uint32_t new_vpid = get_vvmcs(v, VIRTUAL_PROCESSOR_ID);
1215
1216 if ( nvmx->guest_vpid != new_vpid )
1217 {
1218 hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid);
1219 nvmx->guest_vpid = new_vpid;
1220 }
1221 }
1222
1223 }
1224
1225 static void sync_vvmcs_guest_state(struct vcpu *v, struct cpu_user_regs *regs)
1226 {
1227 /* copy shadow vmcs.gstate back to vvmcs.gstate */
1228 shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_gstate_field),
1229 vmcs_gstate_field);
1230 /* RIP, RSP are in user regs */
1231 set_vvmcs(v, GUEST_RIP, regs->rip);
1232 set_vvmcs(v, GUEST_RSP, regs->rsp);
1233
1234 /* CR3 sync if exec doesn't want cr3 load exiting: i.e. nested EPT */
1235 if ( !(__n2_exec_control(v) & CPU_BASED_CR3_LOAD_EXITING) )
1236 shadow_to_vvmcs(v, GUEST_CR3);
1237 }
1238
1239 static void sync_vvmcs_ro(struct vcpu *v)
1240 {
1241 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1242
1243 shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_ro_field), vmcs_ro_field);
1244
1245 /* Adjust exit_reason/exit_qualification for the EPT violation case. */
1246 if ( get_vvmcs(v, VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION )
1247 {
1248 set_vvmcs(v, EXIT_QUALIFICATION, nvmx->ept.exit_qual);
1249 set_vvmcs(v, VM_EXIT_REASON, nvmx->ept.exit_reason);
1250 }
1251 }
1252
1253 static void load_vvmcs_host_state(struct vcpu *v)
1254 {
1255 int i, rc;
1256 u64 r;
1257 u32 control;
1258
1259 for ( i = 0; i < ARRAY_SIZE(vmcs_h2g_field); i++ )
1260 {
1261 r = get_vvmcs(v, vmcs_h2g_field[i].host_field);
1262 __vmwrite(vmcs_h2g_field[i].guest_field, r);
1263 }
1264
1265 rc = hvm_set_cr0(get_vvmcs(v, HOST_CR0), 1);
1266 if ( rc == X86EMUL_EXCEPTION )
1267 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1268
1269 rc = hvm_set_cr4(get_vvmcs(v, HOST_CR4), 1);
1270 if ( rc == X86EMUL_EXCEPTION )
1271 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1272
1273 rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), 1);
1274 if ( rc == X86EMUL_EXCEPTION )
1275 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1276
1277 control = get_vvmcs(v, VM_EXIT_CONTROLS);
1278 if ( control & VM_EXIT_LOAD_HOST_PAT )
1279 hvm_set_guest_pat(v, get_vvmcs(v, HOST_PAT));
1280 if ( control & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
1281 {
1282 rc = hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
1283 get_vvmcs(v, HOST_PERF_GLOBAL_CTRL), 1);
1284 if ( rc == X86EMUL_EXCEPTION )
1285 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1286 }
1287
1288 hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);
1289
1290 set_vvmcs(v, VM_ENTRY_INTR_INFO, 0);
1291 }
1292
1293 static void sync_exception_state(struct vcpu *v)
1294 {
1295 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1296
1297 if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) )
1298 return;
1299
1300 switch ( MASK_EXTR(nvmx->intr.intr_info, INTR_INFO_INTR_TYPE_MASK) )
1301 {
1302 case X86_EVENTTYPE_EXT_INTR:
1303 /* rename exit_reason to EXTERNAL_INTERRUPT */
1304 set_vvmcs(v, VM_EXIT_REASON, EXIT_REASON_EXTERNAL_INTERRUPT);
1305 set_vvmcs(v, EXIT_QUALIFICATION, 0);
1306 set_vvmcs(v, VM_EXIT_INTR_INFO,
1307 nvmx->intr.intr_info);
1308 break;
1309
1310 case X86_EVENTTYPE_HW_EXCEPTION:
1311 case X86_EVENTTYPE_SW_INTERRUPT:
1312 case X86_EVENTTYPE_SW_EXCEPTION:
1313 /* throw to L1 */
1314 set_vvmcs(v, VM_EXIT_INTR_INFO, nvmx->intr.intr_info);
1315 set_vvmcs(v, VM_EXIT_INTR_ERROR_CODE, nvmx->intr.error_code);
1316 break;
1317 case X86_EVENTTYPE_NMI:
1318 set_vvmcs(v, VM_EXIT_REASON, EXIT_REASON_EXCEPTION_NMI);
1319 set_vvmcs(v, EXIT_QUALIFICATION, 0);
1320 set_vvmcs(v, VM_EXIT_INTR_INFO, nvmx->intr.intr_info);
1321 break;
1322 default:
1323 gdprintk(XENLOG_ERR, "Exception state %lx not handled\n",
1324 nvmx->intr.intr_info);
1325 break;
1326 }
1327 }
1328
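/*
 * When the reflected vmexit is an external interrupt that originated from
 * the virtual LAPIC, acknowledge the vector there and recompute the
 * SVI/RVI pair in GUEST_INTR_STATUS, so that the virtual interrupt
 * delivery state seen on the next entry stays consistent.
 */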
1329 static void nvmx_update_apicv(struct vcpu *v)
1330 {
1331 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1332 unsigned long reason = get_vvmcs(v, VM_EXIT_REASON);
1333 uint32_t intr_info = get_vvmcs(v, VM_EXIT_INTR_INFO);
1334
1335 if ( reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
1336 nvmx->intr.source == hvm_intsrc_lapic &&
1337 (intr_info & INTR_INFO_VALID_MASK) )
1338 {
1339 uint16_t status;
1340 uint32_t rvi, ppr;
1341 uint32_t vector = intr_info & 0xff;
1342 struct vlapic *vlapic = vcpu_vlapic(v);
1343
1344 vlapic_ack_pending_irq(v, vector, 1);
1345
1346 ppr = vlapic_set_ppr(vlapic);
1347 WARN_ON((ppr & 0xf0) != (vector & 0xf0));
1348
1349 status = vector << VMX_GUEST_INTR_STATUS_SVI_OFFSET;
1350 rvi = vlapic_has_pending_irq(v);
1351 if ( rvi != -1 )
1352 status |= rvi & VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
1353
1354 __vmwrite(GUEST_INTR_STATUS, status);
1355 }
1356 }
1357
1358 static void virtual_vmexit(struct cpu_user_regs *regs)
1359 {
1360 struct vcpu *v = current;
1361 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1362 unsigned long lm_l1, lm_l2;
1363
1364 sync_vvmcs_ro(v);
1365 sync_vvmcs_guest_state(v, regs);
1366 sync_exception_state(v);
1367
1368 if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
1369 !(v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
1370 shadow_to_vvmcs_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);
1371
1372 /* This will clear current pCPU bit in p2m->dirty_cpumask */
1373 np2m_schedule(NP2M_SCHEDLE_OUT);
1374
1375 vmx_vmcs_switch(v->arch.hvm_vmx.vmcs_pa, nvcpu->nv_n1vmcx_pa);
1376
1377 nestedhvm_vcpu_exit_guestmode(v);
1378 nvcpu->nv_vmexit_pending = 0;
1379 nvcpu->nv_vmswitch_in_progress = 1;
1380
1381 lm_l2 = hvm_long_mode_active(v);
1382 lm_l1 = !!(get_vvmcs(v, VM_EXIT_CONTROLS) & VM_EXIT_IA32E_MODE);
1383
1384 if ( lm_l1 )
1385 v->arch.hvm_vcpu.guest_efer |= EFER_LMA | EFER_LME;
1386 else
1387 v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME);
1388
1389 vmx_update_cpu_exec_control(v);
1390 vmx_update_secondary_exec_control(v);
1391 vmx_update_exception_bitmap(v);
1392
1393 load_vvmcs_host_state(v);
1394
1395 if ( lm_l1 != lm_l2 )
1396 paging_update_paging_modes(v);
1397
1398 regs->rip = get_vvmcs(v, HOST_RIP);
1399 regs->rsp = get_vvmcs(v, HOST_RSP);
1400 /* VM exit clears all bits except bit 1 */
1401 regs->rflags = X86_EFLAGS_MBS;
1402
1403 /* updating host cr0 to sync TS bit */
1404 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
1405
1406 if ( cpu_has_vmx_virtual_intr_delivery )
1407 nvmx_update_apicv(v);
1408
1409 nvcpu->nv_vmswitch_in_progress = 0;
1410 vmsucceed(regs);
1411 }
1412
1413 static void nvmx_eptp_update(void)
1414 {
1415 struct vcpu *curr = current;
1416
1417 if ( !nestedhvm_vcpu_in_guestmode(curr) ||
1418 vcpu_nestedhvm(curr).nv_vmexit_pending ||
1419 !vcpu_nestedhvm(curr).stale_np2m ||
1420 !nestedhvm_paging_mode_hap(curr) )
1421 return;
1422
1423 /*
1424 * Interrupts are enabled here, so we need to clear stale_np2m
1425 * before we do the vmwrite. If we do it in the other order and an
1426 * IPI comes in changing the shadow eptp after the vmwrite,
1427 * we'll complete the vmenter with a stale eptp value.
1428 */
1429 vcpu_nestedhvm(curr).stale_np2m = false;
1430 __vmwrite(EPT_POINTER, get_shadow_eptp(curr));
1431 }
1432
1433 void nvmx_switch_guest(void)
1434 {
1435 struct vcpu *v = current;
1436 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1437 struct cpu_user_regs *regs = guest_cpu_user_regs();
1438
1439 nvmx_eptp_update();
1440
1441 /*
1442 * A pending IO emulation may still be unfinished. In that case no
1443 * virtual vmswitch is allowed, or else the subsequent IO emulation
1444 * would be handled in the wrong vCPU context. If there are no IO
1445 * backends - a PVH guest by itself or a PVH guest with an HVM guest
1446 * running inside - we don't want to continue, as this setup is
1447 * neither implemented nor supported as of right now.
1448 */
1449 if ( hvm_io_pending(v) )
1450 return;
1451 /*
1452 * A softirq may interrupt us between the point where a virtual
1453 * vmentry has just been handled and the real vmentry. If, during
1454 * this window, an L1 virtual interrupt causes another virtual
1455 * vmexit, VM_ENTRY_INTR_INFO would be lost, so we cannot let that happen.
1456 */
1457 if ( unlikely(nvcpu->nv_vmswitch_in_progress) )
1458 return;
1459
1460 if ( nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmexit_pending )
1461 virtual_vmexit(regs);
1462 else if ( !nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmentry_pending )
1463 virtual_vmentry(regs);
1464 }
1465
1466 /*
1467 * VMX instructions handling
1468 */
1469
1470 int nvmx_handle_vmxon(struct cpu_user_regs *regs)
1471 {
1472 struct vcpu *v=current;
1473 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1474 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1475 struct vmx_inst_decoded decode;
1476 unsigned long gpa = 0;
1477 uint32_t nvmcs_revid;
1478 int rc;
1479
1480 rc = decode_vmx_inst(regs, &decode, &gpa, 1);
1481 if ( rc != X86EMUL_OKAY )
1482 return rc;
1483
1484 if ( nvmx_vcpu_in_vmx(v) )
1485 {
1486 vmfail(regs, VMX_INSN_VMXON_IN_VMX_ROOT);
1487 return X86EMUL_OKAY;
1488 }
1489
1490 if ( (gpa & ~PAGE_MASK) || !gfn_valid(v->domain, _gfn(gpa >> PAGE_SHIFT)) )
1491 {
1492 vmfail_invalid(regs);
1493 return X86EMUL_OKAY;
1494 }
1495
1496 rc = hvm_copy_from_guest_phys(&nvmcs_revid, gpa, sizeof(nvmcs_revid));
1497 if ( rc != HVMTRANS_okay ||
1498 (nvmcs_revid & ~VMX_BASIC_REVISION_MASK) ||
1499 ((nvmcs_revid ^ vmx_basic_msr) & VMX_BASIC_REVISION_MASK) )
1500 {
1501 vmfail_invalid(regs);
1502 return X86EMUL_OKAY;
1503 }
1504
1505 nvmx->vmxon_region_pa = gpa;
1506
1507 /*
1508 * `fork' the host vmcs to shadow_vmcs
1509 * vmcs_lock is not needed since we are on current
1510 */
1511 nvcpu->nv_n1vmcx_pa = v->arch.hvm_vmx.vmcs_pa;
1512 __vmpclear(v->arch.hvm_vmx.vmcs_pa);
1513 copy_domain_page(_mfn(PFN_DOWN(nvcpu->nv_n2vmcx_pa)),
1514 _mfn(PFN_DOWN(v->arch.hvm_vmx.vmcs_pa)));
1515 __vmptrld(v->arch.hvm_vmx.vmcs_pa);
1516 v->arch.hvm_vmx.launched = 0;
1517 vmsucceed(regs);
1518
1519 return X86EMUL_OKAY;
1520 }
1521
1522 int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
1523 {
1524 struct vcpu *v=current;
1525 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1526 int rc;
1527
1528 rc = vmx_inst_check_privilege(regs, 0);
1529 if ( rc != X86EMUL_OKAY )
1530 return rc;
1531
1532 nvmx_purge_vvmcs(v);
1533 nvmx->vmxon_region_pa = INVALID_PADDR;
1534
1535 vmsucceed(regs);
1536 return X86EMUL_OKAY;
1537 }
1538
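/*
 * The launch state of a virtual VMCS cannot be read back from hardware,
 * so track it ourselves: the MFN of every launched vVMCS is kept on a
 * per-vCPU list, which the VMLAUNCH/VMRESUME/VMCLEAR handlers below
 * consult and update.
 */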
1539 static bool_t vvmcs_launched(struct list_head *launched_list,
1540 unsigned long vvmcs_mfn)
1541 {
1542 struct vvmcs_list *vvmcs;
1543 struct list_head *pos;
1544 bool_t launched = 0;
1545
1546 list_for_each(pos, launched_list)
1547 {
1548 vvmcs = list_entry(pos, struct vvmcs_list, node);
1549 if ( vvmcs_mfn == vvmcs->vvmcs_mfn )
1550 {
1551 launched = 1;
1552 break;
1553 }
1554 }
1555
1556 return launched;
1557 }
1558
1559 static int set_vvmcs_launched(struct list_head *launched_list,
1560 unsigned long vvmcs_mfn)
1561 {
1562 struct vvmcs_list *vvmcs;
1563
1564 if ( vvmcs_launched(launched_list, vvmcs_mfn) )
1565 return 0;
1566
1567 vvmcs = xzalloc(struct vvmcs_list);
1568 if ( !vvmcs )
1569 return -ENOMEM;
1570
1571 vvmcs->vvmcs_mfn = vvmcs_mfn;
1572 list_add(&vvmcs->node, launched_list);
1573
1574 return 0;
1575 }
1576
1577 static void clear_vvmcs_launched(struct list_head *launched_list,
1578 paddr_t vvmcs_mfn)
1579 {
1580 struct vvmcs_list *vvmcs;
1581 struct list_head *pos;
1582
1583 list_for_each(pos, launched_list)
1584 {
1585 vvmcs = list_entry(pos, struct vvmcs_list, node);
1586 if ( vvmcs_mfn == vvmcs->vvmcs_mfn )
1587 {
1588 list_del(&vvmcs->node);
1589 xfree(vvmcs);
1590 break;
1591 }
1592 }
1593 }
1594
1595 static int nvmx_vmresume(struct vcpu *v, struct cpu_user_regs *regs)
1596 {
1597 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1598 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1599
1600 /* Check that the VMCS is valid and that the I/O bitmaps, if used, are mapped. */
1601 if ( (nvcpu->nv_vvmcxaddr != INVALID_PADDR) &&
1602 ((nvmx->iobitmap[0] && nvmx->iobitmap[1]) ||
1603 !(__n2_exec_control(v) & CPU_BASED_ACTIVATE_IO_BITMAP) ) )
1604 nvcpu->nv_vmentry_pending = 1;
1605 else
1606 vmfail_invalid(regs);
1607
1608 return X86EMUL_OKAY;
1609 }
1610
1611 int nvmx_handle_vmresume(struct cpu_user_regs *regs)
1612 {
1613 bool_t launched;
1614 struct vcpu *v = current;
1615 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1616 unsigned long intr_shadow;
1617 int rc = vmx_inst_check_privilege(regs, 0);
1618
1619 if ( rc != X86EMUL_OKAY )
1620 return rc;
1621
1622 if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1623 {
1624 vmfail_invalid(regs);
1625 return X86EMUL_OKAY;
1626 }
1627
1628 __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1629 if ( intr_shadow & VMX_INTR_SHADOW_MOV_SS )
1630 {
1631 vmfail_valid(regs, VMX_INSN_VMENTRY_BLOCKED_BY_MOV_SS);
1632 return X86EMUL_OKAY;
1633 }
1634
1635 launched = vvmcs_launched(&nvmx->launched_list,
1636 PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr));
1637 if ( !launched )
1638 {
1639 vmfail_valid(regs, VMX_INSN_VMRESUME_NONLAUNCHED_VMCS);
1640 return X86EMUL_OKAY;
1641 }
1642 return nvmx_vmresume(v,regs);
1643 }
1644
1645 int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
1646 {
1647 bool_t launched;
1648 struct vcpu *v = current;
1649 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1650 unsigned long intr_shadow;
1651 int rc = vmx_inst_check_privilege(regs, 0);
1652
1653 if ( rc != X86EMUL_OKAY )
1654 return rc;
1655
1656 if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1657 {
1658 vmfail_invalid(regs);
1659 return X86EMUL_OKAY;
1660 }
1661
1662 __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1663 if ( intr_shadow & VMX_INTR_SHADOW_MOV_SS )
1664 {
1665 vmfail_valid(regs, VMX_INSN_VMENTRY_BLOCKED_BY_MOV_SS);
1666 return X86EMUL_OKAY;
1667 }
1668
1669 launched = vvmcs_launched(&nvmx->launched_list,
1670 PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr));
1671 if ( launched )
1672 {
1673 vmfail_valid(regs, VMX_INSN_VMLAUNCH_NONCLEAR_VMCS);
1674 return X86EMUL_OKAY;
1675 }
1676 else {
1677 rc = nvmx_vmresume(v,regs);
1678 if ( rc == X86EMUL_OKAY )
1679 {
1680 if ( set_vvmcs_launched(&nvmx->launched_list,
1681 PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr)) < 0 )
1682 return X86EMUL_UNHANDLEABLE;
1683 }
1684 }
1685 return rc;
1686 }
1687
1688 int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
1689 {
1690 struct vcpu *v = current;
1691 struct vmx_inst_decoded decode;
1692 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1693 unsigned long gpa = 0;
1694 int rc;
1695
1696 rc = decode_vmx_inst(regs, &decode, &gpa, 0);
1697 if ( rc != X86EMUL_OKAY )
1698 return rc;
1699
1700 if ( gpa == vcpu_2_nvmx(v).vmxon_region_pa || gpa & 0xfff )
1701 {
1702 vmfail_invalid(regs);
1703 goto out;
1704 }
1705
1706 if ( nvcpu->nv_vvmcxaddr != gpa )
1707 nvmx_purge_vvmcs(v);
1708
1709 if ( nvcpu->nv_vvmcxaddr == INVALID_PADDR )
1710 {
1711 bool_t writable;
1712 void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
1713
1714 if ( vvmcx )
1715 {
1716 if ( writable )
1717 {
1718 struct vmcs_struct *vvmcs = vvmcx;
1719
1720 if ( ((vvmcs->vmcs_revision_id ^ vmx_basic_msr) &
1721 VMX_BASIC_REVISION_MASK) ||
1722 (!cpu_has_vmx_vmcs_shadowing &&
1723 (vvmcs->vmcs_revision_id & ~VMX_BASIC_REVISION_MASK)) )
1724 {
1725 hvm_unmap_guest_frame(vvmcx, 1);
1726 vmfail(regs, VMX_INSN_VMPTRLD_INCORRECT_VMCS_ID);
1727
1728 return X86EMUL_OKAY;
1729 }
1730 nvcpu->nv_vvmcx = vvmcx;
1731 nvcpu->nv_vvmcxaddr = gpa;
1732 v->arch.hvm_vmx.vmcs_shadow_maddr =
1733 pfn_to_paddr(domain_page_map_to_mfn(vvmcx));
1734 }
1735 else
1736 {
1737 hvm_unmap_guest_frame(vvmcx, 1);
1738 vvmcx = NULL;
1739 }
1740 }
1741 if ( !vvmcx ||
1742 !map_io_bitmap_all(v) ||
1743 !_map_msr_bitmap(v) )
1744 {
1745 vmfail_valid(regs, VMX_INSN_VMPTRLD_INVALID_PHYADDR);
1746 goto out;
1747 }
1748 }
1749
1750 if ( cpu_has_vmx_vmcs_shadowing )
1751 nvmx_set_vmcs_pointer(v, nvcpu->nv_vvmcx);
1752
1753 vmsucceed(regs);
1754
1755 out:
1756 return X86EMUL_OKAY;
1757 }
1758
1759 int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
1760 {
1761 struct vcpu *v = current;
1762 struct vmx_inst_decoded decode;
1763 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1764 pagefault_info_t pfinfo;
1765 unsigned long gpa = 0;
1766 int rc;
1767
1768 rc = decode_vmx_inst(regs, &decode, &gpa, 0);
1769 if ( rc != X86EMUL_OKAY )
1770 return rc;
1771
1772 gpa = nvcpu->nv_vvmcxaddr;
1773
1774 rc = hvm_copy_to_guest_linear(decode.mem, &gpa, decode.len, 0, &pfinfo);
1775 if ( rc == HVMTRANS_bad_linear_to_gfn )
1776 hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
1777 if ( rc != HVMTRANS_okay )
1778 return X86EMUL_EXCEPTION;
1779
1780 vmsucceed(regs);
1781 return X86EMUL_OKAY;
1782 }
1783
1784 int nvmx_handle_vmclear(struct cpu_user_regs *regs)
1785 {
1786 struct vcpu *v = current;
1787 struct vmx_inst_decoded decode;
1788 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1789 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1790 unsigned long gpa = 0;
1791 void *vvmcs;
1792 int rc;
1793
1794 rc = decode_vmx_inst(regs, &decode, &gpa, 0);
1795 if ( rc != X86EMUL_OKAY )
1796 return rc;
1797
1798 BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
1799 if ( gpa & 0xfff )
1800 rc = VMFAIL_INVALID;
1801 else if ( gpa == nvcpu->nv_vvmcxaddr )
1802 {
1803 if ( cpu_has_vmx_vmcs_shadowing )
1804 nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
1805 clear_vvmcs_launched(&nvmx->launched_list,
1806 PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr));
1807 nvmx_purge_vvmcs(v);
1808 }
1809 else
1810 {
1811 /* Even if this VMCS isn't the current one, we must clear it. */
1812 bool_t writable;
1813
1814 vvmcs = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 0, &writable);
1815 if ( vvmcs )
1816 {
1817 if ( writable )
1818 clear_vvmcs_launched(&nvmx->launched_list,
1819 domain_page_map_to_mfn(vvmcs));
1820 else
1821 rc = VMFAIL_VALID;
1822 hvm_unmap_guest_frame(vvmcs, 0);
1823 }
1824 }
1825
1826 if ( rc == VMSUCCEED )
1827 vmsucceed(regs);
1828 else if ( rc == VMFAIL_VALID )
1829 vmfail_valid(regs, VMX_INSN_VMCLEAR_INVALID_PHYADDR);
1830 else
1831 vmfail_invalid(regs);
1832
1833 return X86EMUL_OKAY;
1834 }
1835
1836 int nvmx_handle_vmread(struct cpu_user_regs *regs)
1837 {
1838 struct vcpu *v = current;
1839 struct vmx_inst_decoded decode;
1840 pagefault_info_t pfinfo;
1841 u64 value = 0;
1842 int rc;
1843
1844 rc = decode_vmx_inst(regs, &decode, NULL, 0);
1845 if ( rc != X86EMUL_OKAY )
1846 return rc;
1847
1848 if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1849 {
1850 vmfail_invalid(regs);
1851 return X86EMUL_OKAY;
1852 }
1853
1854 rc = get_vvmcs_safe(v, reg_read(regs, decode.reg2), &value);
1855 if ( rc != VMX_INSN_SUCCEED )
1856 {
1857 vmfail(regs, rc);
1858 return X86EMUL_OKAY;
1859 }
1860
1861 switch ( decode.type ) {
1862 case VMX_INST_MEMREG_TYPE_MEMORY:
1863 rc = hvm_copy_to_guest_linear(decode.mem, &value, decode.len, 0, &pfinfo);
1864 if ( rc == HVMTRANS_bad_linear_to_gfn )
1865 hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
1866 if ( rc != HVMTRANS_okay )
1867 return X86EMUL_EXCEPTION;
1868 break;
1869 case VMX_INST_MEMREG_TYPE_REG:
1870 reg_write(regs, decode.reg1, value);
1871 break;
1872 }
1873
1874 vmsucceed(regs);
1875 return X86EMUL_OKAY;
1876 }
1877
1878 int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
1879 {
1880 struct vcpu *v = current;
1881 struct vmx_inst_decoded decode;
1882 unsigned long operand;
1883 u64 vmcs_encoding;
1884 bool_t okay = 1;
1885 enum vmx_insn_errno err;
1886
1887 if ( decode_vmx_inst(regs, &decode, &operand, 0)
1888 != X86EMUL_OKAY )
1889 return X86EMUL_EXCEPTION;
1890
1891 if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1892 {
1893 vmfail_invalid(regs);
1894 return X86EMUL_OKAY;
1895 }
1896
1897 vmcs_encoding = reg_read(regs, decode.reg2);
1898 err = set_vvmcs_safe(v, vmcs_encoding, operand);
1899 if ( err != VMX_INSN_SUCCEED )
1900 {
1901 vmfail(regs, err);
1902 return X86EMUL_OKAY;
1903 }
1904
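/*
 * Mask off the "high" access-type bit of the encoding so that a write to
 * either the full 64-bit field or its high 32-bit half remaps the
 * corresponding bitmap below.
 */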
1905 switch ( vmcs_encoding & ~VMCS_HIGH(0) )
1906 {
1907 case IO_BITMAP_A:
1908 okay = _map_io_bitmap(v, IO_BITMAP_A);
1909 break;
1910 case IO_BITMAP_B:
1911 okay = _map_io_bitmap(v, IO_BITMAP_B);
1912 break;
1913 case MSR_BITMAP:
1914 okay = _map_msr_bitmap(v);
1915 break;
1916 }
1917
1918 if ( okay )
1919 vmsucceed(regs);
1920 else
1921 vmfail_valid(regs, VMX_INSN_UNSUPPORTED_VMCS_COMPONENT);
1922
1923 return X86EMUL_OKAY;
1924 }
1925
1926 int nvmx_handle_invept(struct cpu_user_regs *regs)
1927 {
1928 struct vmx_inst_decoded decode;
1929 unsigned long eptp;
1930 int ret;
1931
1932 if ( (ret = decode_vmx_inst(regs, &decode, &eptp, 0)) != X86EMUL_OKAY )
1933 return ret;
1934
1935 switch ( reg_read(regs, decode.reg2) )
1936 {
1937 case INVEPT_SINGLE_CONTEXT:
1938 {
1939 np2m_flush_base(current, eptp);
1940 break;
1941 }
1942 case INVEPT_ALL_CONTEXT:
1943 p2m_flush_nestedp2m(current->domain);
1944 __invept(INVEPT_ALL_CONTEXT, 0, 0);
1945 break;
1946 default:
1947 vmfail_invalid(regs);
1948 return X86EMUL_OKAY;
1949 }
1950 vmsucceed(regs);
1951 return X86EMUL_OKAY;
1952 }
1953
1954 int nvmx_handle_invvpid(struct cpu_user_regs *regs)
1955 {
1956 struct vmx_inst_decoded decode;
1957 unsigned long vpid;
1958 int ret;
1959
1960 if ( (ret = decode_vmx_inst(regs, &decode, &vpid, 0)) != X86EMUL_OKAY )
1961 return ret;
1962
1963 switch ( reg_read(regs, decode.reg2) )
1964 {
1965 /* Just invalidate all tlb entries for all types! */
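/*
 * Flushing the nested (n2) ASID discards all of L2's combined mappings,
 * which is a safe superset of every INVVPID variant.
 */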
1966 case INVVPID_INDIVIDUAL_ADDR:
1967 case INVVPID_SINGLE_CONTEXT:
1968 case INVVPID_ALL_CONTEXT:
1969 hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(current).nv_n2asid);
1970 break;
1971 default:
1972 vmfail_invalid(regs);
1973 return X86EMUL_OKAY;
1974 }
1975
1976 vmsucceed(regs);
1977 return X86EMUL_OKAY;
1978 }
1979
1980 #define __emul_value(enable1, default1) \
1981 ((enable1 | default1) << 32 | (default1))
1982
1983 #define gen_vmx_msr(enable1, default1, host_value) \
1984 (((__emul_value(enable1, default1) & host_value) & (~0ul << 32)) | \
1985 ((uint32_t)(__emul_value(enable1, default1) | host_value)))
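/*
 * VMX capability MSR layout: bits 31:0 are the allowed 0-settings (a control
 * must be 1 if the corresponding MSR bit is 1), bits 63:32 are the allowed
 * 1-settings (a control may be 1 only if the MSR bit is 1).  gen_vmx_msr()
 * therefore reports, in the high half, the controls we want to expose
 * (enable1 | default1) restricted to what the host supports, and, in the low
 * half, the default1 class bits combined with whatever the host additionally
 * requires to be set.
 */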
1986
1987 /*
1988 * Capability reporting
1989 */
1990 int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
1991 {
1992 struct vcpu *v = current;
1993 struct domain *d = v->domain;
1994 u64 data = 0, host_data = 0;
1995 int r = 1;
1996
1997 /* VMX capability MSRs are available only when the guest supports VMX. */
1998 if ( !nestedhvm_enabled(d) || !d->arch.cpuid->basic.vmx )
1999 return 0;
2000
2001 /*
2002 * These MSRs are only available when flags in other MSRs are set.
2003 * These prerequisites are listed in the Intel 64 and IA-32
2004 * Architectures Software Developer’s Manual, Vol 3, Appendix A.
2005 */
2006 switch ( msr )
2007 {
2008 case MSR_IA32_VMX_PROCBASED_CTLS2:
2009 if ( !cpu_has_vmx_secondary_exec_control )
2010 return 0;
2011 break;
2012
2013 case MSR_IA32_VMX_EPT_VPID_CAP:
2014 if ( !(cpu_has_vmx_ept || cpu_has_vmx_vpid) )
2015 return 0;
2016 break;
2017
2018 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2019 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2020 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2021 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2022 if ( !(vmx_basic_msr & VMX_BASIC_DEFAULT1_ZERO) )
2023 return 0;
2024 break;
2025
2026 case MSR_IA32_VMX_VMFUNC:
2027 if ( !cpu_has_vmx_vmfunc )
2028 return 0;
2029 break;
2030 }
2031
2032 rdmsrl(msr, host_data);
2033
2034 /*
2035 * Remove unsupported features from the n1 guest's capability MSRs.
2036 */
2037 switch ( msr ) {
2038 case MSR_IA32_VMX_BASIC:
2039 {
2040 const struct vmcs_struct *vmcs =
2041 map_domain_page(_mfn(PFN_DOWN(v->arch.hvm_vmx.vmcs_pa)));
2042
2043 data = (host_data & (~0ul << 32)) |
2044 (vmcs->vmcs_revision_id & 0x7fffffff);
2045 unmap_domain_page(vmcs);
2046 break;
2047 }
2048 case MSR_IA32_VMX_PINBASED_CTLS:
2049 case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2050 /* 1-settings */
2051 data = PIN_BASED_EXT_INTR_MASK |
2052 PIN_BASED_NMI_EXITING |
2053 PIN_BASED_PREEMPT_TIMER;
2054 data = gen_vmx_msr(data, VMX_PINBASED_CTLS_DEFAULT1, host_data);
2055 break;
2056 case MSR_IA32_VMX_PROCBASED_CTLS:
2057 case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2058 {
2059 u32 default1_bits = VMX_PROCBASED_CTLS_DEFAULT1;
2060 /* 1-settings */
2061 data = CPU_BASED_HLT_EXITING |
2062 CPU_BASED_VIRTUAL_INTR_PENDING |
2063 CPU_BASED_CR8_LOAD_EXITING |
2064 CPU_BASED_CR8_STORE_EXITING |
2065 CPU_BASED_INVLPG_EXITING |
2066 CPU_BASED_CR3_LOAD_EXITING |
2067 CPU_BASED_CR3_STORE_EXITING |
2068 CPU_BASED_MONITOR_EXITING |
2069 CPU_BASED_MWAIT_EXITING |
2070 CPU_BASED_MOV_DR_EXITING |
2071 CPU_BASED_ACTIVATE_IO_BITMAP |
2072 CPU_BASED_USE_TSC_OFFSETING |
2073 CPU_BASED_UNCOND_IO_EXITING |
2074 CPU_BASED_RDTSC_EXITING |
2075 CPU_BASED_MONITOR_TRAP_FLAG |
2076 CPU_BASED_VIRTUAL_NMI_PENDING |
2077 CPU_BASED_ACTIVATE_MSR_BITMAP |
2078 CPU_BASED_PAUSE_EXITING |
2079 CPU_BASED_RDPMC_EXITING |
2080 CPU_BASED_TPR_SHADOW |
2081 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2082
2083 if ( msr == MSR_IA32_VMX_TRUE_PROCBASED_CTLS )
2084 default1_bits &= ~(CPU_BASED_CR3_LOAD_EXITING |
2085 CPU_BASED_CR3_STORE_EXITING |
2086 CPU_BASED_INVLPG_EXITING);
2087
2088 data = gen_vmx_msr(data, default1_bits, host_data);
2089 break;
2090 }
2091 case MSR_IA32_VMX_PROCBASED_CTLS2:
2092 /* 1-settings */
2093 data = SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING |
2094 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2095 SECONDARY_EXEC_ENABLE_VPID |
2096 SECONDARY_EXEC_UNRESTRICTED_GUEST |
2097 SECONDARY_EXEC_ENABLE_EPT;
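/* Secondary controls have no default-1 class bits, hence default1 is 0. */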
2098 data = gen_vmx_msr(data, 0, host_data);
2099 break;
2100 case MSR_IA32_VMX_EXIT_CTLS:
2101 case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2102 /* 1-settings */
2103 data = VM_EXIT_ACK_INTR_ON_EXIT |
2104 VM_EXIT_IA32E_MODE |
2105 VM_EXIT_SAVE_PREEMPT_TIMER |
2106 VM_EXIT_SAVE_GUEST_PAT |
2107 VM_EXIT_LOAD_HOST_PAT |
2108 VM_EXIT_SAVE_GUEST_EFER |
2109 VM_EXIT_LOAD_HOST_EFER |
2110 VM_EXIT_LOAD_PERF_GLOBAL_CTRL;
2111 data = gen_vmx_msr(data, VMX_EXIT_CTLS_DEFAULT1, host_data);
2112 break;
2113 case MSR_IA32_VMX_ENTRY_CTLS:
2114 case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2115 /* 1-settings */
2116 data = VM_ENTRY_LOAD_GUEST_PAT |
2117 VM_ENTRY_LOAD_GUEST_EFER |
2118 VM_ENTRY_LOAD_PERF_GLOBAL_CTRL |
2119 VM_ENTRY_IA32E_MODE;
2120 data = gen_vmx_msr(data, VMX_ENTRY_CTLS_DEFAULT1, host_data);
2121 break;
2122
2123 case MSR_IA32_VMX_VMCS_ENUM:
2124 /* The max index of VVMCS encoding is 0x1f. */
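/* IA32_VMX_VMCS_ENUM reports the highest index in bits 9:1, hence the shift. */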
2125 data = 0x1f << 1;
2126 break;
2127 case MSR_IA32_VMX_CR0_FIXED0:
2128 /* PG, PE bits must be 1 in VMX operation */
2129 data = X86_CR0_PE | X86_CR0_PG;
2130 break;
2131 case MSR_IA32_VMX_CR0_FIXED1:
2132 /* allow 0-settings for all bits */
2133 data = 0xffffffff;
2134 break;
2135 case MSR_IA32_VMX_CR4_FIXED0:
2136 /* VMXE bit must be 1 in VMX operation */
2137 data = X86_CR4_VMXE;
2138 break;
2139 case MSR_IA32_VMX_CR4_FIXED1:
2140 data = hvm_cr4_guest_valid_bits(v, 0);
2141 break;
2142 case MSR_IA32_VMX_MISC:
2143 /* The CR3-target feature is not supported for now. */
2144 data = host_data & ~VMX_MISC_CR3_TARGET;
2145 break;
2146 case MSR_IA32_VMX_EPT_VPID_CAP:
2147 data = nept_get_ept_vpid_cap();
2148 break;
2149 default:
2150 r = 0;
2151 break;
2152 }
2153
2154 *msr_content = data;
2155 return r;
2156 }
2157
2158 /* This function uses L2_gpa to walk the P2M page table in L1. If the
2159 * walk is successful, the translated address is returned in
2160 * L1_gpa. The return value tells the caller what to do next.
2161 */
2162 int
2163 nvmx_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa,
2164 unsigned int *page_order, uint8_t *p2m_acc,
2165 bool_t access_r, bool_t access_w, bool_t access_x)
2166 {
2167 int rc;
2168 unsigned long gfn;
2169 uint64_t exit_qual;
2170 uint32_t exit_reason = EXIT_REASON_EPT_VIOLATION;
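/* Access rights in EPT-violation order: bit 0 read, bit 1 write, bit 2 execute. */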
2171 uint32_t rwx_rights = (access_x << 2) | (access_w << 1) | access_r;
2172 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2173
2174 vmx_vmcs_enter(v);
2175
2176 __vmread(EXIT_QUALIFICATION, &exit_qual);
2177 rc = nept_translate_l2ga(v, L2_gpa, page_order, rwx_rights, &gfn, p2m_acc,
2178 &exit_qual, &exit_reason);
2179 switch ( rc )
2180 {
2181 case EPT_TRANSLATE_SUCCEED:
2182 *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
2183 rc = NESTEDHVM_PAGEFAULT_DONE;
2184 break;
2185 case EPT_TRANSLATE_VIOLATION:
2186 case EPT_TRANSLATE_MISCONFIG:
2187 rc = NESTEDHVM_PAGEFAULT_INJECT;
2188 nvmx->ept.exit_reason = exit_reason;
2189 nvmx->ept.exit_qual = exit_qual;
2190 break;
2191 case EPT_TRANSLATE_RETRY:
2192 rc = NESTEDHVM_PAGEFAULT_RETRY;
2193 break;
2194 default:
2195 gdprintk(XENLOG_ERR, "GUEST EPT translation error!:%d\n", rc);
2196 BUG();
2197 break;
2198 }
2199
2200 vmx_vmcs_exit(v);
2201
2202 return rc;
2203 }
2204
2205 void nvmx_idtv_handling(void)
2206 {
2207 struct vcpu *v = current;
2208 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2209 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
2210 unsigned long idtv_info, reason;
2211
2212 __vmread(IDT_VECTORING_INFO, &idtv_info);
2213 if ( likely(!(idtv_info & INTR_INFO_VALID_MASK)) )
2214 return;
2215
2216 /*
2217 * If L0 can resolve the fault that caused the IDT vectoring, the event
2218 * should be reinjected here; otherwise, pass it on to L1.
2219 */
2220 __vmread(VM_EXIT_REASON, &reason);
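/*
 * Reinject the vectored event when either the exit was not an EPT violation
 * and there is no event to be passed up to L1 (nvmx->intr is not valid), or
 * it was an EPT violation that L0 will resolve itself (no nested vmexit is
 * pending).  Otherwise the vectoring information is left for L1 to handle.
 */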
2221 if ( reason != EXIT_REASON_EPT_VIOLATION ?
2222 !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) :
2223 !nvcpu->nv_vmexit_pending )
2224 {
2225 __vmwrite(VM_ENTRY_INTR_INFO, idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2226 if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2227 {
2228 __vmread(IDT_VECTORING_ERROR_CODE, &reason);
2229 __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, reason);
2230 }
2231 /*
2232 * SDM 23.2.4, if L1 tries to inject a software interrupt
2233 * and the delivery fails, VM_EXIT_INSTRUCTION_LEN receives
2234 * the value of previous VM_ENTRY_INSTRUCTION_LEN.
2235 *
2236 * This means EXIT_INSTRUCTION_LEN is always valid here, for
2237 * software interrupts both injected by L1, and generated in L2.
2238 */
2239 __vmread(VM_EXIT_INSTRUCTION_LEN, &reason);
2240 __vmwrite(VM_ENTRY_INSTRUCTION_LEN, reason);
2241 }
2242 }
2243
2244 /*
2245 * L2 VMExit handling
2246 * return 1: done, or the normal layer 0 hypervisor processing can be skipped;
2247 * typically the exit requires layer 1 hypervisor processing,
2248 * or it has already been handled here.
2249 * 0: the normal layer 0 processing is required.
2250 */
2251 int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
2252 unsigned int exit_reason)
2253 {
2254 struct vcpu *v = current;
2255 struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
2256 struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2257 u32 ctrl;
2258
2259 nvcpu->nv_vmexit_pending = 0;
2260 nvmx->intr.intr_info = 0;
2261 nvmx->intr.error_code = 0;
2262
2263 switch ( exit_reason ) {
2264 case EXIT_REASON_EXCEPTION_NMI:
2265 {
2266 unsigned long intr_info;
2267 u32 valid_mask = MASK_INSR(X86_EVENTTYPE_HW_EXCEPTION,
2268 INTR_INFO_INTR_TYPE_MASK) |
2269 INTR_INFO_VALID_MASK;
2270 u64 exec_bitmap;
2271 int vector;
2272
2273 __vmread(VM_EXIT_INTR_INFO, &intr_info);
2274 vector = intr_info & INTR_INFO_VECTOR_MASK;
2275 /*
2276 * Decided by the L0 and L1 exception bitmaps: if the vector is set in
2277 * both, L0 has priority for #PF and #NM, and L1 has priority for the others.
2278 */
2279 if ( vector == TRAP_page_fault )
2280 {
2281 if ( paging_mode_hap(v->domain) )
2282 nvcpu->nv_vmexit_pending = 1;
2283 }
2284 else if ( vector == TRAP_no_device )
2285 {
2286 if ( v->fpu_dirtied )
2287 nvcpu->nv_vmexit_pending = 1;
2288 }
2289 else if ( (intr_info & valid_mask) == valid_mask )
2290 {
2291 exec_bitmap = get_vvmcs(v, EXCEPTION_BITMAP);
2292
2293 if ( exec_bitmap & (1 << vector) )
2294 nvcpu->nv_vmexit_pending = 1;
2295 }
2296 break;
2297 }
2298 case EXIT_REASON_WBINVD:
2299 case EXIT_REASON_EPT_VIOLATION:
2300 case EXIT_REASON_EPT_MISCONFIG:
2301 case EXIT_REASON_EXTERNAL_INTERRUPT:
2302 /* pass to L0 handler */
2303 break;
2304 case VMX_EXIT_REASONS_FAILED_VMENTRY:
2305 case EXIT_REASON_TRIPLE_FAULT:
2306 case EXIT_REASON_TASK_SWITCH:
2307 case EXIT_REASON_CPUID:
2308 case EXIT_REASON_VMCALL:
2309 case EXIT_REASON_VMCLEAR:
2310 case EXIT_REASON_VMLAUNCH:
2311 case EXIT_REASON_VMPTRLD:
2312 case EXIT_REASON_VMPTRST:
2313 case EXIT_REASON_VMREAD:
2314 case EXIT_REASON_VMRESUME:
2315 case EXIT_REASON_VMWRITE:
2316 case EXIT_REASON_VMXOFF:
2317 case EXIT_REASON_VMXON:
2318 case EXIT_REASON_INVEPT:
2319 case EXIT_REASON_XSETBV:
2320 /* inject to L1 */
2321 nvcpu->nv_vmexit_pending = 1;
2322 break;
2323
2324 case EXIT_REASON_MSR_READ:
2325 case EXIT_REASON_MSR_WRITE:
2326 ctrl = __n2_exec_control(v);
2327
2328 /* Without ACTIVATE_MSR_BITMAP, all MSRs are intercepted. */
2329 if ( !(ctrl & CPU_BASED_ACTIVATE_MSR_BITMAP) )
2330 nvcpu->nv_vmexit_pending = 1;
2331 else if ( !nvmx->msrbitmap )
2332 /* ACTIVATE_MSR_BITMAP set, but L2 bitmap not mapped??? */
2333 domain_crash(v->domain);
2334 else
2335 nvcpu->nv_vmexit_pending =
2336 vmx_msr_is_intercepted(nvmx->msrbitmap, regs->ecx,
2337 exit_reason == EXIT_REASON_MSR_WRITE);
2338 break;
2339
2340 case EXIT_REASON_IO_INSTRUCTION:
2341 ctrl = __n2_exec_control(v);
2342 if ( ctrl & CPU_BASED_ACTIVATE_IO_BITMAP )
2343 {
2344 unsigned long qual;
2345 u16 port, size;
2346
2347 __vmread(EXIT_QUALIFICATION, &qual);
2348 port = qual >> 16;
2349 size = (qual & 7) + 1;
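/*
 * Walk L1's I/O bitmaps for every byte of the access and exit to L1 as
 * soon as one port's bit is set.  Wrapping past port 0xffff also forces
 * an exit to L1.
 */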
2350 do {
2351 const u8 *bitmap = nvmx->iobitmap[port >> 15];
2352
2353 if ( bitmap[(port & 0x7fff) >> 3] & (1 << (port & 7)) )
2354 nvcpu->nv_vmexit_pending = 1;
2355 if ( !--size )
2356 break;
2357 if ( !++port )
2358 nvcpu->nv_vmexit_pending = 1;
2359 } while ( !nvcpu->nv_vmexit_pending );
2360 if ( !nvcpu->nv_vmexit_pending )
2361 printk(XENLOG_G_WARNING "L0 PIO %04x\n", port);
2362 }
2363 else if ( ctrl & CPU_BASED_UNCOND_IO_EXITING )
2364 nvcpu->nv_vmexit_pending = 1;
2365 break;
2366
2367 case EXIT_REASON_PENDING_VIRT_INTR:
2368 ctrl = __n2_exec_control(v);
2369 if ( ctrl & CPU_BASED_VIRTUAL_INTR_PENDING )
2370 nvcpu->nv_vmexit_pending = 1;
2371 break;
2372 case EXIT_REASON_PENDING_VIRT_NMI:
2373 ctrl = __n2_exec_control(v);
2374 if ( ctrl & CPU_BASED_VIRTUAL_NMI_PENDING )
2375 nvcpu->nv_vmexit_pending = 1;
2376 break;
2377 case EXIT_REASON_MONITOR_TRAP_FLAG:
2378 ctrl = __n2_exec_control(v);
2379 if ( ctrl & CPU_BASED_MONITOR_TRAP_FLAG)
2380 nvcpu->nv_vmexit_pending = 1;
2381 break;
2382 case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
2383 case EXIT_REASON_ACCESS_LDTR_OR_TR:
2384 ctrl = __n2_secondary_exec_control(v);
2385 if ( ctrl & SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING )
2386 nvcpu->nv_vmexit_pending = 1;
2387 break;
2388 case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
2389 ctrl = __n2_pin_exec_control(v);
2390 if ( ctrl & PIN_BASED_PREEMPT_TIMER )
2391 nvcpu->nv_vmexit_pending = 1;
2392 break;
2393 /* L1 has priority in handling several other types of exits. */
2394 case EXIT_REASON_HLT:
2395 ctrl = __n2_exec_control(v);
2396 if ( ctrl & CPU_BASED_HLT_EXITING )
2397 nvcpu->nv_vmexit_pending = 1;
2398 break;
2399 case EXIT_REASON_RDTSC:
2400 ctrl = __n2_exec_control(v);
2401 if ( ctrl & CPU_BASED_RDTSC_EXITING )
2402 nvcpu->nv_vmexit_pending = 1;
2403 else
2404 {
2405 /*
2406 * Special handling is needed if L1 doesn't intercept rdtsc, to avoid
2407 * changing guest_tsc and messing up timekeeping in L1.
2408 */
2409 msr_split(regs, hvm_get_guest_tsc(v) + get_vvmcs(v, TSC_OFFSET));
2410 update_guest_eip();
2411
2412 return 1;
2413 }
2414 break;
2415 case EXIT_REASON_RDPMC:
2416 ctrl = __n2_exec_control(v);
2417 if ( ctrl & CPU_BASED_RDPMC_EXITING )
2418 nvcpu->nv_vmexit_pending = 1;
2419 break;
2420 case EXIT_REASON_MWAIT_INSTRUCTION:
2421 ctrl = __n2_exec_control(v);
2422 if ( ctrl & CPU_BASED_MWAIT_EXITING )
2423 nvcpu->nv_vmexit_pending = 1;
2424 break;
2425 case EXIT_REASON_PAUSE_INSTRUCTION:
2426 ctrl = __n2_exec_control(v);
2427 if ( ctrl & CPU_BASED_PAUSE_EXITING )
2428 nvcpu->nv_vmexit_pending = 1;
2429 break;
2430 case EXIT_REASON_MONITOR_INSTRUCTION:
2431 ctrl = __n2_exec_control(v);
2432 if ( ctrl & CPU_BASED_MONITOR_EXITING )
2433 nvcpu->nv_vmexit_pending = 1;
2434 break;
2435 case EXIT_REASON_DR_ACCESS:
2436 ctrl = __n2_exec_control(v);
2437 if ( (ctrl & CPU_BASED_MOV_DR_EXITING) &&
2438 v->arch.hvm_vcpu.flag_dr_dirty )
2439 nvcpu->nv_vmexit_pending = 1;
2440 break;
2441 case EXIT_REASON_INVLPG:
2442 ctrl = __n2_exec_control(v);
2443 if ( ctrl & CPU_BASED_INVLPG_EXITING )
2444 nvcpu->nv_vmexit_pending = 1;
2445 break;
2446 case EXIT_REASON_CR_ACCESS:
2447 {
2448 unsigned long exit_qualification;
2449 int cr, write;
2450 u32 mask = 0;
2451
2452 __vmread(EXIT_QUALIFICATION, &exit_qualification);
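/*
 * Exit qualification: bits 3:0 give the CR number; access type 0 is
 * MOV to CR (a CR load), access type 1 is MOV from CR (a CR store).
 */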
2453 cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
2454 write = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
2455 /* Also consult the guest's (L1's) exec_control. */
2456 ctrl = __n2_exec_control(v);
2457
2458 if ( cr == 3 )
2459 {
2460 mask = write? CPU_BASED_CR3_STORE_EXITING:
2461 CPU_BASED_CR3_LOAD_EXITING;
2462 if ( ctrl & mask )
2463 nvcpu->nv_vmexit_pending = 1;
2464 }
2465 else if ( cr == 8 )
2466 {
2467 mask = write? CPU_BASED_CR8_STORE_EXITING:
2468 CPU_BASED_CR8_LOAD_EXITING;
2469 if ( ctrl & mask )
2470 nvcpu->nv_vmexit_pending = 1;
2471 }
2472 else /* CR0, CR4, CLTS, LMSW */
2473 {
2474 /*
2475 * On a VM exit for a CR0/CR4 access, check whether the L1 VMM owns the
2476 * changed bits.
2477 * If so, inject the VM exit into the L1 VMM.
2478 * Otherwise, L0 handles it and syncs the value into L1's virtual VMCS.
2479 */
2480 unsigned long old_val, val, changed_bits;
2481 switch ( VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification) )
2482 {
2483 case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
2484 {
2485 unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
2486 unsigned long *reg;
2487
2488 if ( (reg = decode_register(gp, guest_cpu_user_regs(), 0)) == NULL )
2489 {
2490 gdprintk(XENLOG_ERR, "invalid gpr: %lx\n", gp);
2491 break;
2492 }
2493 val = *reg;
2494 if ( cr == 0 )
2495 {
2496 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2497
2498 __vmread(CR0_READ_SHADOW, &old_val);
2499 changed_bits = old_val ^ val;
2500 if ( changed_bits & cr0_gh_mask )
2501 nvcpu->nv_vmexit_pending = 1;
2502 else
2503 {
2504 u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2505
2506 set_vvmcs(v, GUEST_CR0,
2507 (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask));
2508 }
2509 }
2510 else if ( cr == 4 )
2511 {
2512 u64 cr4_gh_mask = get_vvmcs(v, CR4_GUEST_HOST_MASK);
2513
2514 __vmread(CR4_READ_SHADOW, &old_val);
2515 changed_bits = old_val ^ val;
2516 if ( changed_bits & cr4_gh_mask )
2517 nvcpu->nv_vmexit_pending = 1;
2518 else
2519 {
2520 u64 guest_cr4 = get_vvmcs(v, GUEST_CR4);
2521
2522 set_vvmcs(v, GUEST_CR4,
2523 (guest_cr4 & cr4_gh_mask) | (val & ~cr4_gh_mask));
2524 }
2525 }
2526 else
2527 nvcpu->nv_vmexit_pending = 1;
2528 break;
2529 }
2530 case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
2531 {
2532 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2533
2534 if ( cr0_gh_mask & X86_CR0_TS )
2535 nvcpu->nv_vmexit_pending = 1;
2536 else
2537 {
2538 u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2539
2540 set_vvmcs(v, GUEST_CR0, (guest_cr0 & ~X86_CR0_TS));
2541 }
2542 break;
2543 }
2544 case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
2545 {
2546 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2547
2548 __vmread(CR0_READ_SHADOW, &old_val);
2549 old_val &= X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS;
2550 val = VMX_CONTROL_REG_ACCESS_DATA(exit_qualification) &
2551 (X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS);
2552 changed_bits = old_val ^ val;
2553 if ( changed_bits & cr0_gh_mask )
2554 nvcpu->nv_vmexit_pending = 1;
2555 else
2556 {
2557 u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2558
2559 set_vvmcs(v, GUEST_CR0, (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask));
2560 }
2561 break;
2562 }
2563 default:
2564 break;
2565 }
2566 }
2567 break;
2568 }
2569 case EXIT_REASON_APIC_ACCESS:
2570 ctrl = __n2_secondary_exec_control(v);
2571 if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
2572 nvcpu->nv_vmexit_pending = 1;
2573 break;
2574 case EXIT_REASON_TPR_BELOW_THRESHOLD:
2575 ctrl = __n2_exec_control(v);
2576 if ( ctrl & CPU_BASED_TPR_SHADOW )
2577 nvcpu->nv_vmexit_pending = 1;
2578 break;
2579 default:
2580 gprintk(XENLOG_ERR, "Unexpected nested vmexit: reason %u\n",
2581 exit_reason);
2582 }
2583
2584 return ( nvcpu->nv_vmexit_pending == 1 );
2585 }
2586
2587 void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr)
2588 {
2589 unsigned long cr_field, read_shadow_field, mask_field;
2590
2591 switch ( cr )
2592 {
2593 case 0:
2594 cr_field = GUEST_CR0;
2595 read_shadow_field = CR0_READ_SHADOW;
2596 mask_field = CR0_GUEST_HOST_MASK;
2597 break;
2598 case 4:
2599 cr_field = GUEST_CR4;
2600 read_shadow_field = CR4_READ_SHADOW;
2601 mask_field = CR4_GUEST_HOST_MASK;
2602 break;
2603 default:
2604 gdprintk(XENLOG_WARNING, "Set read shadow for CR%d.\n", cr);
2605 return;
2606 }
2607
2608 if ( !nestedhvm_vmswitch_in_progress(v) )
2609 {
2610 unsigned long virtual_cr_mask =
2611 get_vvmcs(v, mask_field);
2612
2613 /*
2614 * We get here when L2 changed cr in a way that did not change
2615 * any of L1's shadowed bits (see nvmx_n2_vmexit_handler),
2616 * but did change L0 shadowed bits. So we first calculate the
2617 * effective cr value that L1 would like to write into the
2618 * hardware. It consists of the L2-owned bits from the new
2619 * value combined with the L1-owned bits from L1's guest cr.
2620 */
2621 v->arch.hvm_vcpu.guest_cr[cr] &= ~virtual_cr_mask;
2622 v->arch.hvm_vcpu.guest_cr[cr] |= virtual_cr_mask &
2623 get_vvmcs(v, cr_field);
2624 }
2625
2626 /* nvcpu.guest_cr holds the value L2 actually wrote to the cr. */
2627 __vmwrite(read_shadow_field, v->arch.hvm_vcpu.nvcpu.guest_cr[cr]);
2628 }
2629
2630 /*
2631 * Local variables:
2632 * mode: C
2633 * c-file-style: "BSD"
2634 * c-basic-offset: 4
2635 * tab-width: 4
2636 * indent-tabs-mode: nil
2637 * End:
2638 */
2639