/*
 * vvmx.c: Support virtual VMX for nested virtualization.
 *
 * Copyright (c) 2010, Intel Corporation.
 * Author: Qing He <qing.he@intel.com>
 *         Eddie Dong <eddie.dong@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include <asm/types.h>
#include <asm/mtrr.h>
#include <asm/p2m.h>
#include <asm/hvm/ioreq.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vvmx.h>
#include <asm/hvm/nestedhvm.h>

static DEFINE_PER_CPU(u64 *, vvmcs_buf);

static void nvmx_purge_vvmcs(struct vcpu *v);

static bool nvmx_vcpu_in_vmx(const struct vcpu *v)
{
    return vcpu_2_nvmx(v).vmxon_region_pa != INVALID_PADDR;
}

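/*
 * Per-pCPU scratch buffer used by the bulk vvmcs<->shadow sync helpers
 * (vvmcs_to_shadow_bulk() and shadow_to_vvmcs_bulk() below).  VMCS_BUF_SIZE
 * only needs to cover the largest field array handed to those helpers
 * (vmcs_gstate_field, at roughly 50 entries), so 100 leaves ample headroom.
 */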
#define VMCS_BUF_SIZE 100

int nvmx_cpu_up_prepare(unsigned int cpu)
{
    if ( per_cpu(vvmcs_buf, cpu) != NULL )
        return 0;

    per_cpu(vvmcs_buf, cpu) = xzalloc_array(u64, VMCS_BUF_SIZE);

    if ( per_cpu(vvmcs_buf, cpu) != NULL )
        return 0;

    return -ENOMEM;
}

void nvmx_cpu_dead(unsigned int cpu)
{
    xfree(per_cpu(vvmcs_buf, cpu));
    per_cpu(vvmcs_buf, cpu) = NULL;
}

int nvmx_vcpu_initialise(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    struct page_info *pg = alloc_domheap_page(NULL, 0);

    if ( !pg )
    {
        gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n");
        return -ENOMEM;
    }
    nvcpu->nv_n2vmcx_pa = page_to_maddr(pg);

    /* non-root VMREAD/VMWRITE bitmap. */
    if ( cpu_has_vmx_vmcs_shadowing )
    {
        struct page_info *vmread_bitmap, *vmwrite_bitmap;
        unsigned long *vw;

        vmread_bitmap = alloc_domheap_page(NULL, 0);
        if ( !vmread_bitmap )
        {
            gdprintk(XENLOG_ERR, "nest: allocation for vmread bitmap failed\n");
            return -ENOMEM;
        }
        v->arch.hvm_vmx.vmread_bitmap = vmread_bitmap;

        clear_domain_page(_mfn(page_to_mfn(vmread_bitmap)));

        vmwrite_bitmap = alloc_domheap_page(NULL, 0);
        if ( !vmwrite_bitmap )
        {
            gdprintk(XENLOG_ERR, "nest: allocation for vmwrite bitmap failed\n");
            return -ENOMEM;
        }
        v->arch.hvm_vmx.vmwrite_bitmap = vmwrite_bitmap;

        vw = __map_domain_page(vmwrite_bitmap);
        clear_page(vw);

        /*
         * The following 6 encodings need to be handled in the VMM.
         * Let them vmexit as usual.
         */
        set_bit(IO_BITMAP_A, vw);
        set_bit(VMCS_HIGH(IO_BITMAP_A), vw);
        set_bit(IO_BITMAP_B, vw);
        set_bit(VMCS_HIGH(IO_BITMAP_B), vw);
        set_bit(MSR_BITMAP, vw);
        set_bit(VMCS_HIGH(MSR_BITMAP), vw);

        unmap_domain_page(vw);
    }

    nvmx->ept.enabled = 0;
    nvmx->guest_vpid = 0;
    nvmx->vmxon_region_pa = INVALID_PADDR;
    nvcpu->nv_vvmcx = NULL;
    nvcpu->nv_vvmcxaddr = INVALID_PADDR;
    nvmx->intr.intr_info = 0;
    nvmx->intr.error_code = 0;
    nvmx->iobitmap[0] = NULL;
    nvmx->iobitmap[1] = NULL;
    nvmx->msrbitmap = NULL;
    INIT_LIST_HEAD(&nvmx->launched_list);
    return 0;
}

void nvmx_vcpu_destroy(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    struct vvmcs_list *item, *n;

    /*
     * When destroying the vcpu, it may be running on behalf of an L2 guest.
     * Therefore we need to switch the VMCS pointer back to the L1 VMCS,
     * in order to avoid a double free of the L2 VMCS and a possible leak
     * of the L1 VMCS page.
     */
    if ( nvcpu->nv_n1vmcx_pa )
        v->arch.hvm_vmx.vmcs_pa = nvcpu->nv_n1vmcx_pa;

    if ( nvcpu->nv_n2vmcx_pa )
    {
        __vmpclear(nvcpu->nv_n2vmcx_pa);
        free_domheap_page(maddr_to_page(nvcpu->nv_n2vmcx_pa));
        nvcpu->nv_n2vmcx_pa = 0;
    }

    /* Must also cope with nvmx_vcpu_initialise() not having got called. */
    if ( nvmx->launched_list.next )
        list_for_each_entry_safe(item, n, &nvmx->launched_list, node)
        {
            list_del(&item->node);
            xfree(item);
        }

    if ( v->arch.hvm_vmx.vmread_bitmap )
    {
        free_domheap_page(v->arch.hvm_vmx.vmread_bitmap);
        v->arch.hvm_vmx.vmread_bitmap = NULL;
    }
    if ( v->arch.hvm_vmx.vmwrite_bitmap )
    {
        free_domheap_page(v->arch.hvm_vmx.vmwrite_bitmap);
        v->arch.hvm_vmx.vmwrite_bitmap = NULL;
    }
}

void nvmx_domain_relinquish_resources(struct domain *d)
{
    struct vcpu *v;

    for_each_vcpu ( d, v )
        nvmx_purge_vvmcs(v);
}

int nvmx_vcpu_reset(struct vcpu *v)
{
    return 0;
}

uint64_t nvmx_vcpu_eptp_base(struct vcpu *v)
{
    return get_vvmcs(v, EPT_POINTER) & PAGE_MASK;
}

bool_t nvmx_ept_enabled(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);

    return !!(nvmx->ept.enabled);
}

struct vmx_inst_decoded {
#define VMX_INST_MEMREG_TYPE_MEMORY 0
#define VMX_INST_MEMREG_TYPE_REG    1
    int type;
    union {
        struct {
            unsigned long mem;
            unsigned int  len;
        };
        enum vmx_regs_enc reg1;
    };

    enum vmx_regs_enc reg2;
};

enum vmx_ops_result {
    VMSUCCEED,
    VMFAIL_VALID,
    VMFAIL_INVALID,
};

#define CASE_SET_REG(REG, reg)      \
    case VMX_REG_ ## REG: regs->reg = value; break
#define CASE_GET_REG(REG, reg)      \
    case VMX_REG_ ## REG: value = regs->reg; break

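/*
 * The virtual VMCS page is treated as a flat array of u64 slots.  Each field
 * is mapped to a slot from the (width, type, index) bits of its architectural
 * encoding: bits 0-4 hold the index, bits 5-6 the type and bits 7-8 the
 * width.  For example, GUEST_CS_SELECTOR (encoding 0x0802: width 0, type 2,
 * index 1) lands in slot 1 | (2 << 5) = 0x41.  Slot 0 would only be used by
 * VPID, and is avoided, presumably because the start of the page holds the
 * VMCS revision identifier, so VPID is relocated to slot 0x3f.
 */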
static int vvmcs_offset(u32 width, u32 type, u32 index)
{
    int offset;

    offset = (index & 0x1f) | type << 5 | width << 7;

    if ( offset == 0 )    /* vpid */
        offset = 0x3f;

    return offset;
}

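/*
 * The accessors below honour the encoding's width and access type: 16-bit
 * and 32-bit fields are masked to their size, and a "high" access
 * (access_type set) on a 64-bit field reads or writes the upper 32 bits of
 * the stored value.
 */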
u64 get_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding)
{
    union vmcs_encoding enc;
    u64 *content = (u64 *) vvmcs;
    int offset;
    u64 res;

    enc.word = vmcs_encoding;
    offset = vvmcs_offset(enc.width, enc.type, enc.index);
    res = content[offset];

    switch ( enc.width ) {
    case VVMCS_WIDTH_16:
        res &= 0xffff;
        break;
    case VVMCS_WIDTH_64:
        if ( enc.access_type )
            res >>= 32;
        break;
    case VVMCS_WIDTH_32:
        res &= 0xffffffff;
        break;
    case VVMCS_WIDTH_NATURAL:
    default:
        break;
    }

    return res;
}

u64 get_vvmcs_real(const struct vcpu *v, u32 encoding)
{
    return virtual_vmcs_vmread(v, encoding);
}

enum vmx_insn_errno get_vvmcs_virtual_safe(void *vvmcs, u32 encoding, u64 *val)
{
    *val = get_vvmcs_virtual(vvmcs, encoding);

    /*
     * TODO: This should not always succeed. Fields and values need to be
     * audited against the features offered to the guest in the VT-x MSRs.
     * This should be fixed when the MSR levelling work is started, at which
     * point there will be a cpuid_policy-like object.
     */
    return VMX_INSN_SUCCEED;
}

enum vmx_insn_errno get_vvmcs_real_safe(const struct vcpu *v, u32 encoding,
                                        u64 *val)
{
    return virtual_vmcs_vmread_safe(v, encoding, val);
}

void set_vvmcs_virtual(void *vvmcs, u32 vmcs_encoding, u64 val)
{
    union vmcs_encoding enc;
    u64 *content = (u64 *) vvmcs;
    int offset;
    u64 res;

    enc.word = vmcs_encoding;
    offset = vvmcs_offset(enc.width, enc.type, enc.index);
    res = content[offset];

    switch ( enc.width ) {
    case VVMCS_WIDTH_16:
        res = val & 0xffff;
        break;
    case VVMCS_WIDTH_64:
        if ( enc.access_type )
        {
            res &= 0xffffffff;
            res |= val << 32;
        }
        else
            res = val;
        break;
    case VVMCS_WIDTH_32:
        res = val & 0xffffffff;
        break;
    case VVMCS_WIDTH_NATURAL:
    default:
        res = val;
        break;
    }

    content[offset] = res;
}

void set_vvmcs_real(const struct vcpu *v, u32 encoding, u64 val)
{
    virtual_vmcs_vmwrite(v, encoding, val);
}

enum vmx_insn_errno set_vvmcs_virtual_safe(void *vvmcs, u32 encoding, u64 val)
{
    set_vvmcs_virtual(vvmcs, encoding, val);

    /*
     * TODO: This should not always succeed. Fields and values need to be
     * audited against the features offered to the guest in the VT-x MSRs.
     * This should be fixed when the MSR levelling work is started, at which
     * point there will be a cpuid_policy-like object.
     */
    return VMX_INSN_SUCCEED;
}

enum vmx_insn_errno set_vvmcs_real_safe(const struct vcpu *v, u32 encoding,
                                        u64 val)
{
    return virtual_vmcs_vmwrite_safe(v, encoding, val);
}

static unsigned long reg_read(struct cpu_user_regs *regs,
                              enum vmx_regs_enc index)
{
    unsigned long *pval = decode_register(index, regs, 0);

    return *pval;
}

static void reg_write(struct cpu_user_regs *regs,
                      enum vmx_regs_enc index,
                      unsigned long value)
{
    unsigned long *pval = decode_register(index, regs, 0);

    *pval = value;
}

static inline u32 __n2_pin_exec_control(struct vcpu *v)
{
    return get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
}

static inline u32 __n2_exec_control(struct vcpu *v)
{
    return get_vvmcs(v, CPU_BASED_VM_EXEC_CONTROL);
}

static inline u32 __n2_secondary_exec_control(struct vcpu *v)
{
    u64 second_ctrl = 0;

    if ( __n2_exec_control(v) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
        second_ctrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);

    return second_ctrl;
}

static int vmx_inst_check_privilege(struct cpu_user_regs *regs, int vmxop_check)
{
    struct vcpu *v = current;

    if ( vmxop_check )
    {
        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) ||
             !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VMXE) )
            goto invalid_op;
    }
    else if ( !nvmx_vcpu_in_vmx(v) )
        goto invalid_op;

    if ( vmx_guest_x86_mode(v) < (hvm_long_mode_active(v) ? 8 : 2) )
        goto invalid_op;
    else if ( nestedhvm_vcpu_in_guestmode(v) )
        goto vmexit;

    if ( vmx_get_cpl() > 0 )
        goto gp_fault;

    return X86EMUL_OKAY;

vmexit:
    gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: vmexit\n");
    vcpu_nestedhvm(v).nv_vmexit_pending = 1;
    return X86EMUL_EXCEPTION;

invalid_op:
    gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: invalid_op\n");
    hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
    return X86EMUL_EXCEPTION;

gp_fault:
    gdprintk(XENLOG_ERR, "vmx_inst_check_privilege: gp_fault\n");
    hvm_inject_hw_exception(TRAP_gp_fault, 0);
    return X86EMUL_EXCEPTION;
}

static int decode_vmx_inst(struct cpu_user_regs *regs,
                           struct vmx_inst_decoded *decode,
                           unsigned long *poperandS, int vmxon_check)
{
    struct vcpu *v = current;
    union vmx_inst_info info;
    struct segment_register seg;
    unsigned long base, index, seg_base, disp, offset;
    int scale, size;

    if ( vmx_inst_check_privilege(regs, vmxon_check) != X86EMUL_OKAY )
        return X86EMUL_EXCEPTION;

    __vmread(VMX_INSTRUCTION_INFO, &offset);
    info.word = offset;

    if ( info.fields.memreg ) {
        decode->type = VMX_INST_MEMREG_TYPE_REG;
        decode->reg1 = info.fields.reg1;
        if ( poperandS != NULL )
            *poperandS = reg_read(regs, decode->reg1);
    }
    else
    {
        bool mode_64bit = (vmx_guest_x86_mode(v) == 8);

        decode->type = VMX_INST_MEMREG_TYPE_MEMORY;

        if ( info.fields.segment > x86_seg_gs )
            goto gp_fault;
        hvm_get_segment_register(v, info.fields.segment, &seg);
        seg_base = seg.base;

        base = info.fields.base_reg_invalid ? 0 :
            reg_read(regs, info.fields.base_reg);

        index = info.fields.index_reg_invalid ? 0 :
            reg_read(regs, info.fields.index_reg);

        scale = 1 << info.fields.scaling;

        __vmread(EXIT_QUALIFICATION, &disp);

        size = 1 << (info.fields.addr_size + 1);

        offset = base + index * scale + disp;
        base = !mode_64bit || info.fields.segment >= x86_seg_fs ?
               seg_base + offset : offset;
        if ( offset + size - 1 < offset ||
             (mode_64bit ?
              !is_canonical_address((long)base < 0 ? base :
                                    base + size - 1) :
              offset + size - 1 > seg.limit) )
            goto gp_fault;

        if ( poperandS != NULL )
        {
            pagefault_info_t pfinfo;
            int rc = hvm_copy_from_guest_linear(poperandS, base, size,
                                                0, &pfinfo);

            if ( rc == HVMTRANS_bad_linear_to_gfn )
                hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
            if ( rc != HVMTRANS_okay )
                return X86EMUL_EXCEPTION;
        }
        decode->mem = base;
        decode->len = size;
    }

    decode->reg2 = info.fields.reg2;

    return X86EMUL_OKAY;

gp_fault:
    hvm_inject_hw_exception(TRAP_gp_fault, 0);
    return X86EMUL_EXCEPTION;
}

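/*
 * VMX instruction status is reported to L1 through RFLAGS, following the
 * architectural VMsucceed/VMfailInvalid/VMfailValid conventions: success
 * clears all the arithmetic flags, VMfailInvalid sets CF, and VMfailValid
 * sets ZF and records the error code in VM_INSTRUCTION_ERROR of the current
 * virtual VMCS.
 */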
static void vmsucceed(struct cpu_user_regs *regs)
{
    regs->eflags &= ~X86_EFLAGS_ARITH_MASK;
}

static void vmfail_valid(struct cpu_user_regs *regs, enum vmx_insn_errno errno)
{
    struct vcpu *v = current;
    unsigned int eflags = regs->eflags;

    regs->eflags = (eflags & ~X86_EFLAGS_ARITH_MASK) | X86_EFLAGS_ZF;
    set_vvmcs(v, VM_INSTRUCTION_ERROR, errno);
}

static void vmfail_invalid(struct cpu_user_regs *regs)
{
    unsigned int eflags = regs->eflags;

    regs->eflags = (eflags & ~X86_EFLAGS_ARITH_MASK) | X86_EFLAGS_CF;
}

static void vmfail(struct cpu_user_regs *regs, enum vmx_insn_errno errno)
{
    if ( errno == VMX_INSN_SUCCEED )
        return;

    if ( vcpu_nestedhvm(current).nv_vvmcxaddr != INVALID_PADDR &&
         errno != VMX_INSN_FAIL_INVALID )
        vmfail_valid(regs, errno);
    else
        vmfail_invalid(regs);
}

bool_t nvmx_intercepts_exception(
    struct vcpu *v, unsigned int vector, int error_code)
{
    u32 exception_bitmap, pfec_match=0, pfec_mask=0;
    int r;

    ASSERT(vector < 32);

    exception_bitmap = get_vvmcs(v, EXCEPTION_BITMAP);
    r = exception_bitmap & (1 << vector) ? 1: 0;

    if ( vector == TRAP_page_fault )
    {
        pfec_match = get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MATCH);
        pfec_mask  = get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MASK);
        if ( (error_code & pfec_mask) != pfec_match )
            r = !r;
    }
    return r;
}

/*
 * Nested VMX uses a "strict" condition: we exit from the L2 guest if
 * either the L1 VMM or the L0 VMM expects to exit.
 */
static inline u32 __shadow_control(struct vcpu *v,
                                 unsigned int field,
                                 u32 host_value)
{
    return get_vvmcs(v, field) | host_value;
}

static void set_shadow_control(struct vcpu *v,
                               unsigned int field,
                               u32 host_value)
{
    __vmwrite(field, __shadow_control(v, field, host_value));
}

unsigned long *_shadow_io_bitmap(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    int port80, portED;
    u8 *bitmap;

    bitmap = nvmx->iobitmap[0];
    port80 = bitmap[0x80 >> 3] & (1 << (0x80 & 0x7)) ? 1 : 0;
    portED = bitmap[0xed >> 3] & (1 << (0xed & 0x7)) ? 1 : 0;

    return nestedhvm_vcpu_iomap_get(port80, portED);
}

void nvmx_update_exec_control(struct vcpu *v, u32 host_cntrl)
{
    u32 pio_cntrl = (CPU_BASED_ACTIVATE_IO_BITMAP
                     | CPU_BASED_UNCOND_IO_EXITING);
    unsigned long *bitmap;
    u32 shadow_cntrl;

    shadow_cntrl = __n2_exec_control(v);
    pio_cntrl &= shadow_cntrl;
    /* Enforce the removed features */
    shadow_cntrl &= ~(CPU_BASED_ACTIVATE_MSR_BITMAP
                      | CPU_BASED_ACTIVATE_IO_BITMAP
                      | CPU_BASED_UNCOND_IO_EXITING);
    shadow_cntrl |= host_cntrl;
    if ( pio_cntrl == CPU_BASED_UNCOND_IO_EXITING ) {
        /* L1 VMM intercepts all I/O instructions */
        shadow_cntrl |= CPU_BASED_UNCOND_IO_EXITING;
        shadow_cntrl &= ~CPU_BASED_ACTIVATE_IO_BITMAP;
    }
    else {
        /* Use IO_BITMAP in shadow */
        if ( pio_cntrl == 0 ) {
            /*
             * The L1 VMM doesn't intercept I/O instructions.
             * Use the host configuration and reset the IO_BITMAP.
             */
            bitmap = hvm_io_bitmap;
        }
        else {
            /* use IO bitmap */
            bitmap = _shadow_io_bitmap(v);
        }
        __vmwrite(IO_BITMAP_A, virt_to_maddr(bitmap));
        __vmwrite(IO_BITMAP_B, virt_to_maddr(bitmap) + PAGE_SIZE);
    }

    /* TODO: change L0 intr window to MTF or NMI window */
    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, shadow_cntrl);
}

void nvmx_update_secondary_exec_control(struct vcpu *v,
                                        unsigned long host_cntrl)
{
    u32 shadow_cntrl;
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    u32 apicv_bit = SECONDARY_EXEC_APIC_REGISTER_VIRT |
                    SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;

    host_cntrl &= ~apicv_bit;
    host_cntrl &= ~SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
    shadow_cntrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);

    /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
    ASSERT(!(shadow_cntrl & apicv_bit));

    nvmx->ept.enabled = !!(shadow_cntrl & SECONDARY_EXEC_ENABLE_EPT);
    shadow_cntrl |= host_cntrl;
    __vmwrite(SECONDARY_VM_EXEC_CONTROL, shadow_cntrl);
}

static void nvmx_update_pin_control(struct vcpu *v, unsigned long host_cntrl)
{
    u32 shadow_cntrl;

    host_cntrl &= ~PIN_BASED_POSTED_INTERRUPT;
    shadow_cntrl = get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);

    /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
    ASSERT(!(shadow_cntrl & PIN_BASED_POSTED_INTERRUPT));

    shadow_cntrl |= host_cntrl;
    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, shadow_cntrl);
}

static void nvmx_update_exit_control(struct vcpu *v, unsigned long host_cntrl)
{
    u32 shadow_cntrl;

    shadow_cntrl = get_vvmcs(v, VM_EXIT_CONTROLS);
    shadow_cntrl &= ~(VM_EXIT_SAVE_DEBUG_CNTRLS
                      | VM_EXIT_LOAD_HOST_PAT
                      | VM_EXIT_LOAD_HOST_EFER
                      | VM_EXIT_LOAD_PERF_GLOBAL_CTRL);
    shadow_cntrl |= host_cntrl;
    __vmwrite(VM_EXIT_CONTROLS, shadow_cntrl);
}

static void nvmx_update_entry_control(struct vcpu *v)
{
    u32 shadow_cntrl;

    shadow_cntrl = get_vvmcs(v, VM_ENTRY_CONTROLS);
    shadow_cntrl &= ~(VM_ENTRY_LOAD_GUEST_PAT
                      | VM_ENTRY_LOAD_GUEST_EFER
                      | VM_ENTRY_LOAD_PERF_GLOBAL_CTRL);
    __vmwrite(VM_ENTRY_CONTROLS, shadow_cntrl);
}

void nvmx_update_exception_bitmap(struct vcpu *v, unsigned long value)
{
    set_shadow_control(v, EXCEPTION_BITMAP, value);
}

static void nvmx_update_apic_access_address(struct vcpu *v)
{
    u32 ctrl;

    ctrl = __n2_secondary_exec_control(v);
    if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
    {
        p2m_type_t p2mt;
        unsigned long apic_gpfn;
        struct page_info *apic_pg;

        apic_gpfn = get_vvmcs(v, APIC_ACCESS_ADDR) >> PAGE_SHIFT;
        apic_pg = get_page_from_gfn(v->domain, apic_gpfn, &p2mt, P2M_ALLOC);
        ASSERT(apic_pg && !p2m_is_paging(p2mt));
        __vmwrite(APIC_ACCESS_ADDR, page_to_maddr(apic_pg));
        put_page(apic_pg);
    }
    else
        __vmwrite(APIC_ACCESS_ADDR, 0);
}

static void nvmx_update_virtual_apic_address(struct vcpu *v)
{
    u32 ctrl;

    ctrl = __n2_exec_control(v);
    if ( ctrl & CPU_BASED_TPR_SHADOW )
    {
        p2m_type_t p2mt;
        unsigned long vapic_gpfn;
        struct page_info *vapic_pg;

        vapic_gpfn = get_vvmcs(v, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
        vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
        ASSERT(vapic_pg && !p2m_is_paging(p2mt));
        __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
        put_page(vapic_pg);
    }
    else
        __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
}

static void nvmx_update_tpr_threshold(struct vcpu *v)
{
    u32 ctrl = __n2_exec_control(v);

    if ( ctrl & CPU_BASED_TPR_SHADOW )
        __vmwrite(TPR_THRESHOLD, get_vvmcs(v, TPR_THRESHOLD));
    else
        __vmwrite(TPR_THRESHOLD, 0);
}

static void nvmx_update_pfec(struct vcpu *v)
{
    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK,
              get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MASK));
    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH,
              get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MATCH));
}

static void __clear_current_vvmcs(struct vcpu *v)
{
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);

    if ( nvcpu->nv_n2vmcx_pa )
        __vmpclear(nvcpu->nv_n2vmcx_pa);
}

/*
 * Refreshes the MSR bitmap mapping for the current nested vcpu.  Returns true
 * for a successful mapping, and returns false for MSR_BITMAP parameter errors
 * or gfn mapping errors.
 */
static bool __must_check _map_msr_bitmap(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    uint64_t gpa;

    if ( nvmx->msrbitmap )
    {
        hvm_unmap_guest_frame(nvmx->msrbitmap, 1);
        nvmx->msrbitmap = NULL;
    }

    gpa = get_vvmcs(v, MSR_BITMAP);

    if ( !IS_ALIGNED(gpa, PAGE_SIZE) )
        return false;

    nvmx->msrbitmap = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);

    return nvmx->msrbitmap != NULL;
}

static bool_t __must_check _map_io_bitmap(struct vcpu *v, u64 vmcs_reg)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    unsigned long gpa;
    int index;

    index = vmcs_reg == IO_BITMAP_A ? 0 : 1;
    if (nvmx->iobitmap[index])
        hvm_unmap_guest_frame(nvmx->iobitmap[index], 1);
    gpa = get_vvmcs(v, vmcs_reg);
    nvmx->iobitmap[index] = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);

    return nvmx->iobitmap[index] != NULL;
}

static inline bool_t __must_check map_io_bitmap_all(struct vcpu *v)
{
   return _map_io_bitmap(v, IO_BITMAP_A) &&
          _map_io_bitmap(v, IO_BITMAP_B);
}

static void nvmx_purge_vvmcs(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    int i;

    __clear_current_vvmcs(v);
    if ( nvcpu->nv_vvmcxaddr != INVALID_PADDR )
        hvm_unmap_guest_frame(nvcpu->nv_vvmcx, 1);
    nvcpu->nv_vvmcx = NULL;
    nvcpu->nv_vvmcxaddr = INVALID_PADDR;
    v->arch.hvm_vmx.vmcs_shadow_maddr = 0;
    for (i=0; i<2; i++) {
        if ( nvmx->iobitmap[i] ) {
            hvm_unmap_guest_frame(nvmx->iobitmap[i], 1);
            nvmx->iobitmap[i] = NULL;
        }
    }
    if ( nvmx->msrbitmap ) {
        hvm_unmap_guest_frame(nvmx->msrbitmap, 1);
        nvmx->msrbitmap = NULL;
    }
}

u64 nvmx_get_tsc_offset(struct vcpu *v)
{
    u64 offset = 0;

    if ( get_vvmcs(v, CPU_BASED_VM_EXEC_CONTROL) &
         CPU_BASED_USE_TSC_OFFSETING )
        offset = get_vvmcs(v, TSC_OFFSET);

    return offset;
}

/*
 * Context synchronized between shadow and virtual VMCS.
 */
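/*
 * These guest-state fields are copied from the virtual VMCS into the shadow
 * VMCS on virtual VM-entry (load_shadow_guest_state()) and copied back on
 * virtual VM-exit (sync_vvmcs_guest_state()).
 */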
static const u16 vmcs_gstate_field[] = {
    /* 16 BITS */
    GUEST_ES_SELECTOR,
    GUEST_CS_SELECTOR,
    GUEST_SS_SELECTOR,
    GUEST_DS_SELECTOR,
    GUEST_FS_SELECTOR,
    GUEST_GS_SELECTOR,
    GUEST_LDTR_SELECTOR,
    GUEST_TR_SELECTOR,
    /* 64 BITS */
    VMCS_LINK_POINTER,
    GUEST_IA32_DEBUGCTL,
    GUEST_PAT,
    GUEST_EFER,
    GUEST_PERF_GLOBAL_CTRL,
    /* 32 BITS */
    GUEST_ES_LIMIT,
    GUEST_CS_LIMIT,
    GUEST_SS_LIMIT,
    GUEST_DS_LIMIT,
    GUEST_FS_LIMIT,
    GUEST_GS_LIMIT,
    GUEST_LDTR_LIMIT,
    GUEST_TR_LIMIT,
    GUEST_GDTR_LIMIT,
    GUEST_IDTR_LIMIT,
    GUEST_ES_AR_BYTES,
    GUEST_CS_AR_BYTES,
    GUEST_SS_AR_BYTES,
    GUEST_DS_AR_BYTES,
    GUEST_FS_AR_BYTES,
    GUEST_GS_AR_BYTES,
    GUEST_LDTR_AR_BYTES,
    GUEST_TR_AR_BYTES,
    GUEST_INTERRUPTIBILITY_INFO,
    GUEST_ACTIVITY_STATE,
    GUEST_SYSENTER_CS,
    GUEST_PREEMPTION_TIMER,
    /* natural */
    GUEST_ES_BASE,
    GUEST_CS_BASE,
    GUEST_SS_BASE,
    GUEST_DS_BASE,
    GUEST_FS_BASE,
    GUEST_GS_BASE,
    GUEST_LDTR_BASE,
    GUEST_TR_BASE,
    GUEST_GDTR_BASE,
    GUEST_IDTR_BASE,
    GUEST_DR7,
    /*
     * The following guest state lives in the local cache (cpu_user_regs):
     GUEST_RSP,
     GUEST_RIP,
     */
    GUEST_RFLAGS,
    GUEST_PENDING_DBG_EXCEPTIONS,
    GUEST_SYSENTER_ESP,
    GUEST_SYSENTER_EIP,
};

static const u16 gpdpte_fields[] = {
    GUEST_PDPTE(0),
    GUEST_PDPTE(1),
    GUEST_PDPTE(2),
    GUEST_PDPTE(3),
};

/*
 * Context: shadow -> virtual VMCS
 */
static const u16 vmcs_ro_field[] = {
    GUEST_PHYSICAL_ADDRESS,
    VM_INSTRUCTION_ERROR,
    VM_EXIT_REASON,
    VM_EXIT_INTR_INFO,
    VM_EXIT_INTR_ERROR_CODE,
    IDT_VECTORING_INFO,
    IDT_VECTORING_ERROR_CODE,
    VM_EXIT_INSTRUCTION_LEN,
    VMX_INSTRUCTION_INFO,
    EXIT_QUALIFICATION,
    GUEST_LINEAR_ADDRESS
};

static struct vmcs_host_to_guest {
    u16 host_field;
    u16 guest_field;
} const vmcs_h2g_field[] = {
    {HOST_ES_SELECTOR, GUEST_ES_SELECTOR},
    {HOST_CS_SELECTOR, GUEST_CS_SELECTOR},
    {HOST_SS_SELECTOR, GUEST_SS_SELECTOR},
    {HOST_DS_SELECTOR, GUEST_DS_SELECTOR},
    {HOST_FS_SELECTOR, GUEST_FS_SELECTOR},
    {HOST_GS_SELECTOR, GUEST_GS_SELECTOR},
    {HOST_TR_SELECTOR, GUEST_TR_SELECTOR},
    {HOST_SYSENTER_CS, GUEST_SYSENTER_CS},
    {HOST_FS_BASE, GUEST_FS_BASE},
    {HOST_GS_BASE, GUEST_GS_BASE},
    {HOST_TR_BASE, GUEST_TR_BASE},
    {HOST_GDTR_BASE, GUEST_GDTR_BASE},
    {HOST_IDTR_BASE, GUEST_IDTR_BASE},
    {HOST_SYSENTER_ESP, GUEST_SYSENTER_ESP},
    {HOST_SYSENTER_EIP, GUEST_SYSENTER_EIP},
};
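/*
 * On virtual VM-exit, load_vvmcs_host_state() copies each L1 "host" field
 * above into the corresponding guest field of the shadow VMCS, emulating the
 * host-state load that a real VM exit performs on behalf of L1.
 */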

static void vvmcs_to_shadow(const struct vcpu *v, unsigned int field)
{
    __vmwrite(field, get_vvmcs(v, field));
}

static void vvmcs_to_shadow_bulk(struct vcpu *v, unsigned int n,
                                 const u16 *field)
{
    u64 *value = this_cpu(vvmcs_buf);
    unsigned int i;

    if ( !cpu_has_vmx_vmcs_shadowing )
        goto fallback;

    if ( !value || n > VMCS_BUF_SIZE )
    {
        gdprintk(XENLOG_DEBUG, "vmcs sync fall back to non-bulk mode, \
                 buffer: %p, buffer size: %d, fields number: %d.\n",
                 value, VMCS_BUF_SIZE, n);
        goto fallback;
    }

    virtual_vmcs_enter(v);
    for ( i = 0; i < n; i++ )
        __vmread(field[i], &value[i]);
    virtual_vmcs_exit(v);

    for ( i = 0; i < n; i++ )
        __vmwrite(field[i], value[i]);

    return;

fallback:
    for ( i = 0; i < n; i++ )
        vvmcs_to_shadow(v, field[i]);
}

static inline void shadow_to_vvmcs(const struct vcpu *v, unsigned int field)
{
    unsigned long value;

    if ( vmread_safe(field, &value) == 0 )
        set_vvmcs(v, field, value);
}

static void shadow_to_vvmcs_bulk(struct vcpu *v, unsigned int n,
                                 const u16 *field)
{
    u64 *value = this_cpu(vvmcs_buf);
    unsigned int i;

    if ( !cpu_has_vmx_vmcs_shadowing )
        goto fallback;

    if ( !value || n > VMCS_BUF_SIZE )
    {
        gdprintk(XENLOG_DEBUG, "vmcs sync fall back to non-bulk mode, \
                 buffer: %p, buffer size: %d, fields number: %d.\n",
                 value, VMCS_BUF_SIZE, n);
        goto fallback;
    }

    for ( i = 0; i < n; i++ )
        __vmread(field[i], &value[i]);

    virtual_vmcs_enter(v);
    for ( i = 0; i < n; i++ )
        __vmwrite(field[i], value[i]);
    virtual_vmcs_exit(v);

    return;

fallback:
    for ( i = 0; i < n; i++ )
        shadow_to_vvmcs(v, field[i]);
}

static void load_shadow_control(struct vcpu *v)
{
    /*
     * Set shadow controls:  PIN_BASED, CPU_BASED, EXIT, ENTRY
     * and EXCEPTION
     * Enforce the removed features
     */
    nvmx_update_pin_control(v, vmx_pin_based_exec_control);
    vmx_update_cpu_exec_control(v);
    vmx_update_secondary_exec_control(v);
    nvmx_update_exit_control(v, vmx_vmexit_control);
    nvmx_update_entry_control(v);
    vmx_update_exception_bitmap(v);
    nvmx_update_apic_access_address(v);
    nvmx_update_virtual_apic_address(v);
    nvmx_update_tpr_threshold(v);
    nvmx_update_pfec(v);
}

static void load_shadow_guest_state(struct vcpu *v)
{
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    u32 control;
    u64 cr_gh_mask, cr_read_shadow;
    int rc;

    static const u16 vmentry_fields[] = {
        VM_ENTRY_INTR_INFO,
        VM_ENTRY_EXCEPTION_ERROR_CODE,
        VM_ENTRY_INSTRUCTION_LEN,
    };

    /* vvmcs.gstate to shadow vmcs.gstate */
    vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmcs_gstate_field),
                         vmcs_gstate_field);

    nvcpu->guest_cr[0] = get_vvmcs(v, CR0_READ_SHADOW);
    nvcpu->guest_cr[4] = get_vvmcs(v, CR4_READ_SHADOW);

    rc = hvm_set_cr0(get_vvmcs(v, GUEST_CR0), 1);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);

    rc = hvm_set_cr4(get_vvmcs(v, GUEST_CR4), 1);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);

    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), 1);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);

    control = get_vvmcs(v, VM_ENTRY_CONTROLS);
    if ( control & VM_ENTRY_LOAD_GUEST_PAT )
        hvm_set_guest_pat(v, get_vvmcs(v, GUEST_PAT));
    if ( control & VM_ENTRY_LOAD_PERF_GLOBAL_CTRL )
    {
        rc = hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
                                     get_vvmcs(v, GUEST_PERF_GLOBAL_CTRL), 0);
        if ( rc == X86EMUL_EXCEPTION )
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
    }

    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);

    vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmentry_fields), vmentry_fields);

    /*
     * While emulating CR0 and CR4 for nested virtualization, set the CR0/CR4
     * guest/host mask to 0xffffffff in the shadow VMCS (following the host
     * L1 VMCS), then calculate the corresponding read shadow separately for
     * CR0 and CR4.
     */
    cr_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
    cr_read_shadow = (get_vvmcs(v, GUEST_CR0) & ~cr_gh_mask) |
                     (get_vvmcs(v, CR0_READ_SHADOW) & cr_gh_mask);
    __vmwrite(CR0_READ_SHADOW, cr_read_shadow);

    cr_gh_mask = get_vvmcs(v, CR4_GUEST_HOST_MASK);
    cr_read_shadow = (get_vvmcs(v, GUEST_CR4) & ~cr_gh_mask) |
                     (get_vvmcs(v, CR4_READ_SHADOW) & cr_gh_mask);
    __vmwrite(CR4_READ_SHADOW, cr_read_shadow);

    /* TODO: CR3 target control */
}

uint64_t get_shadow_eptp(struct vcpu *v)
{
    struct p2m_domain *p2m = p2m_get_nestedp2m(v);
    struct ept_data *ept = &p2m->ept;

    ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
    return ept->eptp;
}

static uint64_t get_host_eptp(struct vcpu *v)
{
    return p2m_get_hostp2m(v->domain)->ept.eptp;
}

static bool_t nvmx_vpid_enabled(const struct vcpu *v)
{
    uint32_t second_cntl;

    second_cntl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
    if ( second_cntl & SECONDARY_EXEC_ENABLE_VPID )
        return 1;
    return 0;
}

static void nvmx_set_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs)
{
    paddr_t vvmcs_maddr = v->arch.hvm_vmx.vmcs_shadow_maddr;

    __vmpclear(vvmcs_maddr);
    vvmcs->vmcs_revision_id |= VMCS_RID_TYPE_MASK;
    __vmwrite(VMCS_LINK_POINTER, vvmcs_maddr);
    __vmwrite(VMREAD_BITMAP, page_to_maddr(v->arch.hvm_vmx.vmread_bitmap));
    __vmwrite(VMWRITE_BITMAP, page_to_maddr(v->arch.hvm_vmx.vmwrite_bitmap));
}

static void nvmx_clear_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs)
{
    paddr_t vvmcs_maddr = v->arch.hvm_vmx.vmcs_shadow_maddr;

    __vmpclear(vvmcs_maddr);
    vvmcs->vmcs_revision_id &= ~VMCS_RID_TYPE_MASK;
    __vmwrite(VMCS_LINK_POINTER, ~0ul);
    __vmwrite(VMREAD_BITMAP, 0);
    __vmwrite(VMWRITE_BITMAP, 0);
}

static void virtual_vmentry(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    unsigned long lm_l1, lm_l2;

    vmx_vmcs_switch(v->arch.hvm_vmx.vmcs_pa, nvcpu->nv_n2vmcx_pa);

    nestedhvm_vcpu_enter_guestmode(v);
    nvcpu->nv_vmentry_pending = 0;
    nvcpu->nv_vmswitch_in_progress = 1;

    /*
     * EFER handling:
     * hvm_set_efer won't work if CR0.PG = 1, so we change the value
     * directly to make hvm_long_mode_active(v) work in L2.
     * An additional update_paging_modes is also needed if there is a
     * 32/64 switch. v->arch.hvm_vcpu.guest_efer doesn't need to be
     * saved, since its value on vmexit is determined by the
     * L1 exit_controls.
     */
    lm_l1 = hvm_long_mode_active(v);
    lm_l2 = !!(get_vvmcs(v, VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);

    if ( lm_l2 )
        v->arch.hvm_vcpu.guest_efer |= EFER_LMA | EFER_LME;
    else
        v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME);

    load_shadow_control(v);
    load_shadow_guest_state(v);

    if ( lm_l1 != lm_l2 )
        paging_update_paging_modes(v);

    if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
         !(v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
        vvmcs_to_shadow_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);

    regs->rip = get_vvmcs(v, GUEST_RIP);
    regs->rsp = get_vvmcs(v, GUEST_RSP);
    regs->rflags = get_vvmcs(v, GUEST_RFLAGS);

    /* updating host cr0 to sync TS bit */
    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);

    /* Set up the virtual EPT pointer for the L2 guest. */
    if ( nestedhvm_paging_mode_hap(v) )
        /* This will setup the initial np2m for the nested vCPU */
        __vmwrite(EPT_POINTER, get_shadow_eptp(v));
    else
        __vmwrite(EPT_POINTER, get_host_eptp(v));

    /* nested VPID support! */
    if ( cpu_has_vmx_vpid && nvmx_vpid_enabled(v) )
    {
        struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
        uint32_t new_vpid = get_vvmcs(v, VIRTUAL_PROCESSOR_ID);

        if ( nvmx->guest_vpid != new_vpid )
        {
            hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid);
            nvmx->guest_vpid = new_vpid;
        }
    }

}

static void sync_vvmcs_guest_state(struct vcpu *v, struct cpu_user_regs *regs)
{
    /* copy shadow vmcs.gstate back to vvmcs.gstate */
    shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_gstate_field),
                         vmcs_gstate_field);
    /* RIP, RSP are in user regs */
    set_vvmcs(v, GUEST_RIP, regs->rip);
    set_vvmcs(v, GUEST_RSP, regs->rsp);

    /* CR3 sync if exec doesn't want cr3 load exiting: i.e. nested EPT */
    if ( !(__n2_exec_control(v) & CPU_BASED_CR3_LOAD_EXITING) )
        shadow_to_vvmcs(v, GUEST_CR3);
}

static void sync_vvmcs_ro(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);

    shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_ro_field), vmcs_ro_field);

    /* Adjust exit_reason/exit_qualification for violation case */
    if ( get_vvmcs(v, VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION )
    {
        set_vvmcs(v, EXIT_QUALIFICATION, nvmx->ept.exit_qual);
        set_vvmcs(v, VM_EXIT_REASON, nvmx->ept.exit_reason);
    }
}

static void load_vvmcs_host_state(struct vcpu *v)
{
    int i, rc;
    u64 r;
    u32 control;

    for ( i = 0; i < ARRAY_SIZE(vmcs_h2g_field); i++ )
    {
        r = get_vvmcs(v, vmcs_h2g_field[i].host_field);
        __vmwrite(vmcs_h2g_field[i].guest_field, r);
    }

    rc = hvm_set_cr0(get_vvmcs(v, HOST_CR0), 1);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);

    rc = hvm_set_cr4(get_vvmcs(v, HOST_CR4), 1);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);

    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), 1);
    if ( rc == X86EMUL_EXCEPTION )
        hvm_inject_hw_exception(TRAP_gp_fault, 0);

    control = get_vvmcs(v, VM_EXIT_CONTROLS);
    if ( control & VM_EXIT_LOAD_HOST_PAT )
        hvm_set_guest_pat(v, get_vvmcs(v, HOST_PAT));
    if ( control & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
    {
        rc = hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
                                     get_vvmcs(v, HOST_PERF_GLOBAL_CTRL), 1);
        if ( rc == X86EMUL_EXCEPTION )
            hvm_inject_hw_exception(TRAP_gp_fault, 0);
    }

    hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0);

    set_vvmcs(v, VM_ENTRY_INTR_INFO, 0);
}

static void sync_exception_state(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);

    if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) )
        return;

    switch ( MASK_EXTR(nvmx->intr.intr_info, INTR_INFO_INTR_TYPE_MASK) )
    {
    case X86_EVENTTYPE_EXT_INTR:
        /* rename exit_reason to EXTERNAL_INTERRUPT */
        set_vvmcs(v, VM_EXIT_REASON, EXIT_REASON_EXTERNAL_INTERRUPT);
        set_vvmcs(v, EXIT_QUALIFICATION, 0);
        set_vvmcs(v, VM_EXIT_INTR_INFO,
                    nvmx->intr.intr_info);
        break;

    case X86_EVENTTYPE_HW_EXCEPTION:
    case X86_EVENTTYPE_SW_INTERRUPT:
    case X86_EVENTTYPE_SW_EXCEPTION:
        /* throw to L1 */
        set_vvmcs(v, VM_EXIT_INTR_INFO, nvmx->intr.intr_info);
        set_vvmcs(v, VM_EXIT_INTR_ERROR_CODE, nvmx->intr.error_code);
        break;
    case X86_EVENTTYPE_NMI:
        set_vvmcs(v, VM_EXIT_REASON, EXIT_REASON_EXCEPTION_NMI);
        set_vvmcs(v, EXIT_QUALIFICATION, 0);
        set_vvmcs(v, VM_EXIT_INTR_INFO, nvmx->intr.intr_info);
        break;
    default:
        gdprintk(XENLOG_ERR, "Exception state %lx not handled\n",
               nvmx->intr.intr_info);
        break;
    }
}

static void nvmx_update_apicv(struct vcpu *v)
{
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    unsigned long reason = get_vvmcs(v, VM_EXIT_REASON);
    uint32_t intr_info = get_vvmcs(v, VM_EXIT_INTR_INFO);

    if ( reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
         nvmx->intr.source == hvm_intsrc_lapic &&
         (intr_info & INTR_INFO_VALID_MASK) )
    {
        uint16_t status;
        uint32_t rvi, ppr;
        uint32_t vector = intr_info & 0xff;
        struct vlapic *vlapic = vcpu_vlapic(v);

        vlapic_ack_pending_irq(v, vector, 1);

        ppr = vlapic_set_ppr(vlapic);
        WARN_ON((ppr & 0xf0) != (vector & 0xf0));

        status = vector << VMX_GUEST_INTR_STATUS_SVI_OFFSET;
        rvi = vlapic_has_pending_irq(v);
        if ( rvi != -1 )
            status |= rvi & VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;

        __vmwrite(GUEST_INTR_STATUS, status);
    }
}

static void virtual_vmexit(struct cpu_user_regs *regs)
{
    struct vcpu *v = current;
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    unsigned long lm_l1, lm_l2;

    sync_vvmcs_ro(v);
    sync_vvmcs_guest_state(v, regs);
    sync_exception_state(v);

    if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
         !(v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
        shadow_to_vvmcs_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);

    /* This will clear current pCPU bit in p2m->dirty_cpumask */
    np2m_schedule(NP2M_SCHEDLE_OUT);

    vmx_vmcs_switch(v->arch.hvm_vmx.vmcs_pa, nvcpu->nv_n1vmcx_pa);

    nestedhvm_vcpu_exit_guestmode(v);
    nvcpu->nv_vmexit_pending = 0;
    nvcpu->nv_vmswitch_in_progress = 1;

    lm_l2 = hvm_long_mode_active(v);
    lm_l1 = !!(get_vvmcs(v, VM_EXIT_CONTROLS) & VM_EXIT_IA32E_MODE);

    if ( lm_l1 )
        v->arch.hvm_vcpu.guest_efer |= EFER_LMA | EFER_LME;
    else
        v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME);

    vmx_update_cpu_exec_control(v);
    vmx_update_secondary_exec_control(v);
    vmx_update_exception_bitmap(v);

    load_vvmcs_host_state(v);

    if ( lm_l1 != lm_l2 )
        paging_update_paging_modes(v);

    regs->rip = get_vvmcs(v, HOST_RIP);
    regs->rsp = get_vvmcs(v, HOST_RSP);
    /* VM exit clears all bits except bit 1 */
    regs->rflags = X86_EFLAGS_MBS;

    /* updating host cr0 to sync TS bit */
    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);

    if ( cpu_has_vmx_virtual_intr_delivery )
        nvmx_update_apicv(v);

    nvcpu->nv_vmswitch_in_progress = 0;
    vmsucceed(regs);
}

static void nvmx_eptp_update(void)
{
    struct vcpu *curr = current;

    if ( !nestedhvm_vcpu_in_guestmode(curr) ||
          vcpu_nestedhvm(curr).nv_vmexit_pending ||
         !vcpu_nestedhvm(curr).stale_np2m ||
         !nestedhvm_paging_mode_hap(curr) )
        return;

    /*
     * Interrupts are enabled here, so we need to clear stale_np2m
     * before we do the vmwrite.  If we do it in the other order and an
     * IPI comes in changing the shadow eptp after the vmwrite, we'll
     * complete the vmenter with a stale eptp value.
     */
    vcpu_nestedhvm(curr).stale_np2m = false;
    __vmwrite(EPT_POINTER, get_shadow_eptp(curr));
}

void nvmx_switch_guest(void)
{
    struct vcpu *v = current;
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    struct cpu_user_regs *regs = guest_cpu_user_regs();

    nvmx_eptp_update();

    /*
     * A pending IO emulation may still not be finished. In this case, no
     * virtual vmswitch is allowed, or else the subsequent IO emulation would
     * be handled in the wrong VCPU context. If there are no IO backends - a
     * PVH guest by itself or a PVH guest with an HVM guest running inside -
     * we don't want to continue, as this setup is neither implemented nor
     * supported as of right now.
     */
    if ( hvm_io_pending(v) )
        return;
    /*
     * A softirq may interrupt us between the handling of a virtual vmentry
     * and the true vmentry. If, during this window, an L1 virtual interrupt
     * causes another virtual vmexit, VM_ENTRY_INTR_INFO would be lost, so we
     * cannot let that happen.
     */
    if ( unlikely(nvcpu->nv_vmswitch_in_progress) )
        return;

    if ( nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmexit_pending )
        virtual_vmexit(regs);
    else if ( !nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmentry_pending )
        virtual_vmentry(regs);
}

/*
 * VMX instructions handling
 */
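/*
 * Each handler below follows the same pattern: validate privilege and decode
 * the instruction via decode_vmx_inst(), act on the virtual VMX state, and
 * report the outcome to L1 with vmsucceed()/vmfail*() while returning an
 * X86EMUL_* status to the caller.
 */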

int nvmx_handle_vmxon(struct cpu_user_regs *regs)
{
    struct vcpu *v=current;
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
    struct vmx_inst_decoded decode;
    unsigned long gpa = 0;
    uint32_t nvmcs_revid;
    int rc;

    rc = decode_vmx_inst(regs, &decode, &gpa, 1);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( nvmx_vcpu_in_vmx(v) )
    {
        vmfail(regs, VMX_INSN_VMXON_IN_VMX_ROOT);
        return X86EMUL_OKAY;
    }

    if ( (gpa & ~PAGE_MASK) || !gfn_valid(v->domain, _gfn(gpa >> PAGE_SHIFT)) )
    {
        vmfail_invalid(regs);
        return X86EMUL_OKAY;
    }

    rc = hvm_copy_from_guest_phys(&nvmcs_revid, gpa, sizeof(nvmcs_revid));
    if ( rc != HVMTRANS_okay ||
         (nvmcs_revid & ~VMX_BASIC_REVISION_MASK) ||
         ((nvmcs_revid ^ vmx_basic_msr) & VMX_BASIC_REVISION_MASK) )
    {
        vmfail_invalid(regs);
        return X86EMUL_OKAY;
    }

    nvmx->vmxon_region_pa = gpa;

    /*
     * `fork' the host vmcs to shadow_vmcs
     * vmcs_lock is not needed since we are on current
     */
    nvcpu->nv_n1vmcx_pa = v->arch.hvm_vmx.vmcs_pa;
    __vmpclear(v->arch.hvm_vmx.vmcs_pa);
    copy_domain_page(_mfn(PFN_DOWN(nvcpu->nv_n2vmcx_pa)),
                     _mfn(PFN_DOWN(v->arch.hvm_vmx.vmcs_pa)));
    __vmptrld(v->arch.hvm_vmx.vmcs_pa);
    v->arch.hvm_vmx.launched = 0;
    vmsucceed(regs);

    return X86EMUL_OKAY;
}

int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
{
    struct vcpu *v=current;
    struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
    int rc;

    rc = vmx_inst_check_privilege(regs, 0);
    if ( rc != X86EMUL_OKAY )
        return rc;

    nvmx_purge_vvmcs(v);
    nvmx->vmxon_region_pa = INVALID_PADDR;

    vmsucceed(regs);
    return X86EMUL_OKAY;
}

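/*
 * launched_list tracks, by MFN, which virtual VMCSs are currently in the
 * "launched" state, so that VMLAUNCH can be refused on an already launched
 * vVMCS and VMRESUME on a not-yet-launched one, mirroring the architectural
 * clear/launched VMCS distinction.
 */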
vvmcs_launched(struct list_head * launched_list,unsigned long vvmcs_mfn)1539 static bool_t vvmcs_launched(struct list_head *launched_list,
1540                              unsigned long vvmcs_mfn)
1541 {
1542     struct vvmcs_list *vvmcs;
1543     struct list_head *pos;
1544     bool_t launched = 0;
1545 
1546     list_for_each(pos, launched_list)
1547     {
1548         vvmcs = list_entry(pos, struct vvmcs_list, node);
1549         if ( vvmcs_mfn == vvmcs->vvmcs_mfn )
1550         {
1551             launched = 1;
1552             break;
1553         }
1554     }
1555 
1556     return launched;
1557 }
1558 
set_vvmcs_launched(struct list_head * launched_list,unsigned long vvmcs_mfn)1559 static int set_vvmcs_launched(struct list_head *launched_list,
1560                               unsigned long vvmcs_mfn)
1561 {
1562     struct vvmcs_list *vvmcs;
1563 
1564     if ( vvmcs_launched(launched_list, vvmcs_mfn) )
1565         return 0;
1566 
1567     vvmcs = xzalloc(struct vvmcs_list);
1568     if ( !vvmcs )
1569         return -ENOMEM;
1570 
1571     vvmcs->vvmcs_mfn = vvmcs_mfn;
1572     list_add(&vvmcs->node, launched_list);
1573 
1574     return 0;
1575 }
1576 
clear_vvmcs_launched(struct list_head * launched_list,paddr_t vvmcs_mfn)1577 static void clear_vvmcs_launched(struct list_head *launched_list,
1578                                  paddr_t vvmcs_mfn)
1579 {
1580     struct vvmcs_list *vvmcs;
1581     struct list_head *pos;
1582 
1583     list_for_each(pos, launched_list)
1584     {
1585         vvmcs = list_entry(pos, struct vvmcs_list, node);
1586         if ( vvmcs_mfn == vvmcs->vvmcs_mfn )
1587         {
1588             list_del(&vvmcs->node);
1589             xfree(vvmcs);
1590             break;
1591         }
1592     }
1593 }
1594 
nvmx_vmresume(struct vcpu * v,struct cpu_user_regs * regs)1595 static int nvmx_vmresume(struct vcpu *v, struct cpu_user_regs *regs)
1596 {
1597     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1598     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1599 
1600     /* check VMCS is valid and IO BITMAP is set */
1601     if ( (nvcpu->nv_vvmcxaddr != INVALID_PADDR) &&
1602             ((nvmx->iobitmap[0] && nvmx->iobitmap[1]) ||
1603             !(__n2_exec_control(v) & CPU_BASED_ACTIVATE_IO_BITMAP) ) )
1604         nvcpu->nv_vmentry_pending = 1;
1605     else
1606         vmfail_invalid(regs);
1607 
1608     return X86EMUL_OKAY;
1609 }
1610 
nvmx_handle_vmresume(struct cpu_user_regs * regs)1611 int nvmx_handle_vmresume(struct cpu_user_regs *regs)
1612 {
1613     bool_t launched;
1614     struct vcpu *v = current;
1615     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1616     unsigned long intr_shadow;
1617     int rc = vmx_inst_check_privilege(regs, 0);
1618 
1619     if ( rc != X86EMUL_OKAY )
1620         return rc;
1621 
1622     if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1623     {
1624         vmfail_invalid(regs);
1625         return X86EMUL_OKAY;
1626     }
1627 
1628     __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1629     if ( intr_shadow & VMX_INTR_SHADOW_MOV_SS )
1630     {
1631         vmfail_valid(regs, VMX_INSN_VMENTRY_BLOCKED_BY_MOV_SS);
1632         return X86EMUL_OKAY;
1633     }
1634 
1635     launched = vvmcs_launched(&nvmx->launched_list,
1636                               PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr));
1637     if ( !launched )
1638     {
1639         vmfail_valid(regs, VMX_INSN_VMRESUME_NONLAUNCHED_VMCS);
1640         return X86EMUL_OKAY;
1641     }
1642     return nvmx_vmresume(v, regs);
1643 }
1644 
1645 int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
1646 {
1647     bool_t launched;
1648     struct vcpu *v = current;
1649     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1650     unsigned long intr_shadow;
1651     int rc = vmx_inst_check_privilege(regs, 0);
1652 
1653     if ( rc != X86EMUL_OKAY )
1654         return rc;
1655 
1656     if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1657     {
1658         vmfail_invalid(regs);
1659         return X86EMUL_OKAY;
1660     }
1661 
1662     __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1663     if ( intr_shadow & VMX_INTR_SHADOW_MOV_SS )
1664     {
1665         vmfail_valid(regs, VMX_INSN_VMENTRY_BLOCKED_BY_MOV_SS);
1666         return X86EMUL_OKAY;
1667     }
1668 
1669     launched = vvmcs_launched(&nvmx->launched_list,
1670                               PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr));
1671     if ( launched )
1672     {
1673         vmfail_valid(regs, VMX_INSN_VMLAUNCH_NONCLEAR_VMCS);
1674         return X86EMUL_OKAY;
1675     }
1676     else {
1677         rc = nvmx_vmresume(v, regs);
1678         if ( rc == X86EMUL_OKAY )
1679         {
1680             if ( set_vvmcs_launched(&nvmx->launched_list,
1681                                     PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr)) < 0 )
1682                 return X86EMUL_UNHANDLEABLE;
1683         }
1684     }
1685     return rc;
1686 }
1687 
1688 int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
1689 {
1690     struct vcpu *v = current;
1691     struct vmx_inst_decoded decode;
1692     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1693     unsigned long gpa = 0;
1694     int rc;
1695 
1696     rc = decode_vmx_inst(regs, &decode, &gpa, 0);
1697     if ( rc != X86EMUL_OKAY )
1698         return rc;
1699 
1700     if ( gpa == vcpu_2_nvmx(v).vmxon_region_pa || gpa & 0xfff )
1701     {
1702         vmfail_invalid(regs);
1703         goto out;
1704     }
1705 
1706     if ( nvcpu->nv_vvmcxaddr != gpa )
1707         nvmx_purge_vvmcs(v);
1708 
1709     if ( nvcpu->nv_vvmcxaddr == INVALID_PADDR )
1710     {
1711         bool_t writable;
1712         void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
1713 
1714         if ( vvmcx )
1715         {
1716             if ( writable )
1717             {
1718                 struct vmcs_struct *vvmcs = vvmcx;
1719 
1720                 if ( ((vvmcs->vmcs_revision_id ^ vmx_basic_msr) &
1721                                          VMX_BASIC_REVISION_MASK) ||
1722                      (!cpu_has_vmx_vmcs_shadowing &&
1723                       (vvmcs->vmcs_revision_id & ~VMX_BASIC_REVISION_MASK)) )
1724                 {
1725                     hvm_unmap_guest_frame(vvmcx, 1);
1726                     vmfail(regs, VMX_INSN_VMPTRLD_INCORRECT_VMCS_ID);
1727 
1728                     return X86EMUL_OKAY;
1729                 }
1730                 nvcpu->nv_vvmcx = vvmcx;
1731                 nvcpu->nv_vvmcxaddr = gpa;
1732                 v->arch.hvm_vmx.vmcs_shadow_maddr =
1733                     pfn_to_paddr(domain_page_map_to_mfn(vvmcx));
1734             }
1735             else
1736             {
1737                 hvm_unmap_guest_frame(vvmcx, 1);
1738                 vvmcx = NULL;
1739             }
1740         }
1741         if ( !vvmcx ||
1742              !map_io_bitmap_all(v) ||
1743              !_map_msr_bitmap(v) )
1744         {
1745             vmfail_valid(regs, VMX_INSN_VMPTRLD_INVALID_PHYADDR);
1746             goto out;
1747         }
1748     }
1749 
1750     if ( cpu_has_vmx_vmcs_shadowing )
1751         nvmx_set_vmcs_pointer(v, nvcpu->nv_vvmcx);
1752 
1753     vmsucceed(regs);
1754 
1755 out:
1756     return X86EMUL_OKAY;
1757 }
1758 
1759 int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
1760 {
1761     struct vcpu *v = current;
1762     struct vmx_inst_decoded decode;
1763     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1764     pagefault_info_t pfinfo;
1765     unsigned long gpa = 0;
1766     int rc;
1767 
1768     rc = decode_vmx_inst(regs, &decode, &gpa, 0);
1769     if ( rc != X86EMUL_OKAY )
1770         return rc;
1771 
1772     gpa = nvcpu->nv_vvmcxaddr;
1773 
1774     rc = hvm_copy_to_guest_linear(decode.mem, &gpa, decode.len, 0, &pfinfo);
1775     if ( rc == HVMTRANS_bad_linear_to_gfn )
1776         hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
1777     if ( rc != HVMTRANS_okay )
1778         return X86EMUL_EXCEPTION;
1779 
1780     vmsucceed(regs);
1781     return X86EMUL_OKAY;
1782 }
1783 
1784 int nvmx_handle_vmclear(struct cpu_user_regs *regs)
1785 {
1786     struct vcpu *v = current;
1787     struct vmx_inst_decoded decode;
1788     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1789     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1790     unsigned long gpa = 0;
1791     void *vvmcs;
1792     int rc;
1793 
1794     rc = decode_vmx_inst(regs, &decode, &gpa, 0);
1795     if ( rc != X86EMUL_OKAY )
1796         return rc;
1797 
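    /*
     * rc still holds X86EMUL_OKAY from the decode above; the BUILD_BUG_ON()
     * below documents that this value doubles as VMSUCCEED, so no explicit
     * assignment is needed before rc is reused as a VMsucceed/VMfail code.
     */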
1798     BUILD_BUG_ON(X86EMUL_OKAY != VMSUCCEED); /* rc = VMSUCCEED; */
1799     if ( gpa & 0xfff )
1800         rc = VMFAIL_INVALID;
1801     else if ( gpa == nvcpu->nv_vvmcxaddr )
1802     {
1803         if ( cpu_has_vmx_vmcs_shadowing )
1804             nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
1805         clear_vvmcs_launched(&nvmx->launched_list,
1806                              PFN_DOWN(v->arch.hvm_vmx.vmcs_shadow_maddr));
1807         nvmx_purge_vvmcs(v);
1808     }
1809     else
1810     {
1811         /* Even if this VMCS isn't the current one, we must clear it. */
1812         bool_t writable;
1813 
1814         vvmcs = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 0, &writable);
1815         if ( vvmcs )
1816         {
1817             if ( writable )
1818                 clear_vvmcs_launched(&nvmx->launched_list,
1819                                      domain_page_map_to_mfn(vvmcs));
1820             else
1821                 rc = VMFAIL_VALID;
1822             hvm_unmap_guest_frame(vvmcs, 0);
1823         }
1824     }
1825 
1826     if ( rc == VMSUCCEED )
1827         vmsucceed(regs);
1828     else if ( rc == VMFAIL_VALID )
1829         vmfail_valid(regs, VMX_INSN_VMCLEAR_INVALID_PHYADDR);
1830     else
1831         vmfail_invalid(regs);
1832 
1833     return X86EMUL_OKAY;
1834 }
1835 
1836 int nvmx_handle_vmread(struct cpu_user_regs *regs)
1837 {
1838     struct vcpu *v = current;
1839     struct vmx_inst_decoded decode;
1840     pagefault_info_t pfinfo;
1841     u64 value = 0;
1842     int rc;
1843 
1844     rc = decode_vmx_inst(regs, &decode, NULL, 0);
1845     if ( rc != X86EMUL_OKAY )
1846         return rc;
1847 
1848     if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1849     {
1850         vmfail_invalid(regs);
1851         return X86EMUL_OKAY;
1852     }
1853 
1854     rc = get_vvmcs_safe(v, reg_read(regs, decode.reg2), &value);
1855     if ( rc != VMX_INSN_SUCCEED )
1856     {
1857         vmfail(regs, rc);
1858         return X86EMUL_OKAY;
1859     }
1860 
1861     switch ( decode.type ) {
1862     case VMX_INST_MEMREG_TYPE_MEMORY:
1863         rc = hvm_copy_to_guest_linear(decode.mem, &value, decode.len, 0, &pfinfo);
1864         if ( rc == HVMTRANS_bad_linear_to_gfn )
1865             hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
1866         if ( rc != HVMTRANS_okay )
1867             return X86EMUL_EXCEPTION;
1868         break;
1869     case VMX_INST_MEMREG_TYPE_REG:
1870         reg_write(regs, decode.reg1, value);
1871         break;
1872     }
1873 
1874     vmsucceed(regs);
1875     return X86EMUL_OKAY;
1876 }
1877 
1878 int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
1879 {
1880     struct vcpu *v = current;
1881     struct vmx_inst_decoded decode;
1882     unsigned long operand;
1883     u64 vmcs_encoding;
1884     bool_t okay = 1;
1885     enum vmx_insn_errno err;
1886 
1887     if ( decode_vmx_inst(regs, &decode, &operand, 0) != X86EMUL_OKAY )
1889         return X86EMUL_EXCEPTION;
1890 
1891     if ( vcpu_nestedhvm(v).nv_vvmcxaddr == INVALID_PADDR )
1892     {
1893         vmfail_invalid(regs);
1894         return X86EMUL_OKAY;
1895     }
1896 
1897     vmcs_encoding = reg_read(regs, decode.reg2);
1898     err = set_vvmcs_safe(v, vmcs_encoding, operand);
1899     if ( err != VMX_INSN_SUCCEED )
1900     {
1901         vmfail(regs, err);
1902         return X86EMUL_OKAY;
1903     }
1904 
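    /*
     * If the write changed one of the bitmap addresses, the referenced
     * guest frame(s) must be (re)mapped before they can be consulted on
     * later nested vmexits.
     */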
1905     switch ( vmcs_encoding & ~VMCS_HIGH(0) )
1906     {
1907     case IO_BITMAP_A:
1908         okay = _map_io_bitmap(v, IO_BITMAP_A);
1909         break;
1910     case IO_BITMAP_B:
1911         okay = _map_io_bitmap(v, IO_BITMAP_B);
1912         break;
1913     case MSR_BITMAP:
1914         okay = _map_msr_bitmap(v);
1915         break;
1916     }
1917 
1918     if ( okay )
1919         vmsucceed(regs);
1920     else
1921         vmfail_valid(regs, VMX_INSN_UNSUPPORTED_VMCS_COMPONENT);
1922 
1923     return X86EMUL_OKAY;
1924 }
1925 
1926 int nvmx_handle_invept(struct cpu_user_regs *regs)
1927 {
1928     struct vmx_inst_decoded decode;
1929     unsigned long eptp;
1930     int ret;
1931 
1932     if ( (ret = decode_vmx_inst(regs, &decode, &eptp, 0)) != X86EMUL_OKAY )
1933         return ret;
1934 
1935     switch ( reg_read(regs, decode.reg2) )
1936     {
1937     case INVEPT_SINGLE_CONTEXT:
1938     {
1939         np2m_flush_base(current, eptp);
1940         break;
1941     }
1942     case INVEPT_ALL_CONTEXT:
1943         p2m_flush_nestedp2m(current->domain);
1944         __invept(INVEPT_ALL_CONTEXT, 0, 0);
1945         break;
1946     default:
1947         vmfail_invalid(regs);
1948         return X86EMUL_OKAY;
1949     }
1950     vmsucceed(regs);
1951     return X86EMUL_OKAY;
1952 }
1953 
1954 int nvmx_handle_invvpid(struct cpu_user_regs *regs)
1955 {
1956     struct vmx_inst_decoded decode;
1957     unsigned long vpid;
1958     int ret;
1959 
1960     if ( (ret = decode_vmx_inst(regs, &decode, &vpid, 0)) != X86EMUL_OKAY )
1961         return ret;
1962 
1963     switch ( reg_read(regs, decode.reg2) )
1964     {
1965     /* Just invalidate all TLB entries for all INVVPID types. */
1966     case INVVPID_INDIVIDUAL_ADDR:
1967     case INVVPID_SINGLE_CONTEXT:
1968     case INVVPID_ALL_CONTEXT:
1969         hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(current).nv_n2asid);
1970         break;
1971     default:
1972         vmfail_invalid(regs);
1973         return X86EMUL_OKAY;
1974     }
1975 
1976     vmsucceed(regs);
1977     return X86EMUL_OKAY;
1978 }
1979 
1980 #define __emul_value(enable1, default1) \
1981     ((enable1 | default1) << 32 | (default1))
1982 
1983 #define gen_vmx_msr(enable1, default1, host_value) \
1984     (((__emul_value(enable1, default1) & host_value) & (~0ul << 32)) | \
1985     ((uint32_t)(__emul_value(enable1, default1) | host_value)))
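/*
 * For a VMX control capability MSR the low 32 bits report the bits that
 * must be 1 (default1) and the high 32 bits report the bits that may be 1
 * (allowed-1).  gen_vmx_msr() therefore intersects the emulated allowed-1
 * set with the host's, and unions the two required-1 sets.
 *
 * Worked example with hypothetical values: enable1 = 0x4, default1 = 0x3
 * gives __emul_value() = 0x0000000700000003; with host_value =
 * 0x0000000600000001 the reported capability is 0x0000000600000003.
 */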
1986 
1987 /*
1988  * Capability reporting
1989  */
1990 int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
1991 {
1992     struct vcpu *v = current;
1993     struct domain *d = v->domain;
1994     u64 data = 0, host_data = 0;
1995     int r = 1;
1996 
1997     /* VMX capability MSRs are available only when the guest supports VMX. */
1998     if ( !nestedhvm_enabled(d) || !d->arch.cpuid->basic.vmx )
1999         return 0;
2000 
2001     /*
2002      * These MSRs are only available when flags in other MSRs are set.
2003      * These prerequisites are listed in the Intel 64 and IA-32
2004      * Architectures Software Developer’s Manual, Vol 3, Appendix A.
2005      */
2006     switch ( msr )
2007     {
2008     case MSR_IA32_VMX_PROCBASED_CTLS2:
2009         if ( !cpu_has_vmx_secondary_exec_control )
2010             return 0;
2011         break;
2012 
2013     case MSR_IA32_VMX_EPT_VPID_CAP:
2014         if ( !(cpu_has_vmx_ept || cpu_has_vmx_vpid) )
2015             return 0;
2016         break;
2017 
2018     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2019     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2020     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2021     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2022         if ( !(vmx_basic_msr & VMX_BASIC_DEFAULT1_ZERO) )
2023             return 0;
2024         break;
2025 
2026     case MSR_IA32_VMX_VMFUNC:
2027         if ( !cpu_has_vmx_vmfunc )
2028             return 0;
2029         break;
2030     }
2031 
2032     rdmsrl(msr, host_data);
2033 
2034     /*
2035      * Remove unsupported features from the L1 guest capability MSRs.
2036      */
2037     switch ( msr ) {
2038     case MSR_IA32_VMX_BASIC:
2039     {
2040         const struct vmcs_struct *vmcs =
2041             map_domain_page(_mfn(PFN_DOWN(v->arch.hvm_vmx.vmcs_pa)));
2042 
2043         data = (host_data & (~0ul << 32)) |
2044                (vmcs->vmcs_revision_id & 0x7fffffff);
2045         unmap_domain_page(vmcs);
2046         break;
2047     }
2048     case MSR_IA32_VMX_PINBASED_CTLS:
2049     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2050         /* 1-settings */
2051         data = PIN_BASED_EXT_INTR_MASK |
2052                PIN_BASED_NMI_EXITING |
2053                PIN_BASED_PREEMPT_TIMER;
2054         data = gen_vmx_msr(data, VMX_PINBASED_CTLS_DEFAULT1, host_data);
2055         break;
2056     case MSR_IA32_VMX_PROCBASED_CTLS:
2057     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2058     {
2059         u32 default1_bits = VMX_PROCBASED_CTLS_DEFAULT1;
2060         /* 1-settings */
2061         data = CPU_BASED_HLT_EXITING |
2062                CPU_BASED_VIRTUAL_INTR_PENDING |
2063                CPU_BASED_CR8_LOAD_EXITING |
2064                CPU_BASED_CR8_STORE_EXITING |
2065                CPU_BASED_INVLPG_EXITING |
2066                CPU_BASED_CR3_LOAD_EXITING |
2067                CPU_BASED_CR3_STORE_EXITING |
2068                CPU_BASED_MONITOR_EXITING |
2069                CPU_BASED_MWAIT_EXITING |
2070                CPU_BASED_MOV_DR_EXITING |
2071                CPU_BASED_ACTIVATE_IO_BITMAP |
2072                CPU_BASED_USE_TSC_OFFSETING |
2073                CPU_BASED_UNCOND_IO_EXITING |
2074                CPU_BASED_RDTSC_EXITING |
2075                CPU_BASED_MONITOR_TRAP_FLAG |
2076                CPU_BASED_VIRTUAL_NMI_PENDING |
2077                CPU_BASED_ACTIVATE_MSR_BITMAP |
2078                CPU_BASED_PAUSE_EXITING |
2079                CPU_BASED_RDPMC_EXITING |
2080                CPU_BASED_TPR_SHADOW |
2081                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2082 
2083         if ( msr == MSR_IA32_VMX_TRUE_PROCBASED_CTLS )
2084             default1_bits &= ~(CPU_BASED_CR3_LOAD_EXITING |
2085                                CPU_BASED_CR3_STORE_EXITING |
2086                                CPU_BASED_INVLPG_EXITING);
2087 
2088         data = gen_vmx_msr(data, default1_bits, host_data);
2089         break;
2090     }
2091     case MSR_IA32_VMX_PROCBASED_CTLS2:
2092         /* 1-settings */
2093         data = SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING |
2094                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2095                SECONDARY_EXEC_ENABLE_VPID |
2096                SECONDARY_EXEC_UNRESTRICTED_GUEST |
2097                SECONDARY_EXEC_ENABLE_EPT;
2098         data = gen_vmx_msr(data, 0, host_data);
2099         break;
2100     case MSR_IA32_VMX_EXIT_CTLS:
2101     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2102         /* 1-settings */
2103         data = VM_EXIT_ACK_INTR_ON_EXIT |
2104                VM_EXIT_IA32E_MODE |
2105                VM_EXIT_SAVE_PREEMPT_TIMER |
2106                VM_EXIT_SAVE_GUEST_PAT |
2107                VM_EXIT_LOAD_HOST_PAT |
2108                VM_EXIT_SAVE_GUEST_EFER |
2109                VM_EXIT_LOAD_HOST_EFER |
2110                VM_EXIT_LOAD_PERF_GLOBAL_CTRL;
2111         data = gen_vmx_msr(data, VMX_EXIT_CTLS_DEFAULT1, host_data);
2112         break;
2113     case MSR_IA32_VMX_ENTRY_CTLS:
2114     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2115         /* 1-settings */
2116         data = VM_ENTRY_LOAD_GUEST_PAT |
2117                VM_ENTRY_LOAD_GUEST_EFER |
2118                VM_ENTRY_LOAD_PERF_GLOBAL_CTRL |
2119                VM_ENTRY_IA32E_MODE;
2120         data = gen_vmx_msr(data, VMX_ENTRY_CTLS_DEFAULT1, host_data);
2121         break;
2122 
2123     case MSR_IA32_VMX_VMCS_ENUM:
2124         /* Bits 9:1 report the highest VVMCS encoding index supported (0x1f). */
2125         data = 0x1f << 1;
2126         break;
2127     case MSR_IA32_VMX_CR0_FIXED0:
2128         /* PG, PE bits must be 1 in VMX operation */
2129         data = X86_CR0_PE | X86_CR0_PG;
2130         break;
2131     case MSR_IA32_VMX_CR0_FIXED1:
2132         /* allow 0-settings for all bits */
2133         data = 0xffffffff;
2134         break;
2135     case MSR_IA32_VMX_CR4_FIXED0:
2136         /* VMXE bit must be 1 in VMX operation */
2137         data = X86_CR4_VMXE;
2138         break;
2139     case MSR_IA32_VMX_CR4_FIXED1:
2140         data = hvm_cr4_guest_valid_bits(v, 0);
2141         break;
2142     case MSR_IA32_VMX_MISC:
2143         /* The CR3-target feature is not supported for now. */
2144         data = host_data & ~VMX_MISC_CR3_TARGET;
2145         break;
2146     case MSR_IA32_VMX_EPT_VPID_CAP:
2147         data = nept_get_ept_vpid_cap();
2148         break;
2149     default:
2150         r = 0;
2151         break;
2152     }
2153 
2154     *msr_content = data;
2155     return r;
2156 }
2157 
2158 /* This function uses L2_gpa to walk the P2M page table in L1. If the
2159  * walk is successful, the translated value is returned in
2160  * L1_gpa. The return value tells the caller what to do next.
2161  */
2162 int
2163 nvmx_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa,
2164                      unsigned int *page_order, uint8_t *p2m_acc,
2165                      bool_t access_r, bool_t access_w, bool_t access_x)
2166 {
2167     int rc;
2168     unsigned long gfn;
2169     uint64_t exit_qual;
2170     uint32_t exit_reason = EXIT_REASON_EPT_VIOLATION;
2171     uint32_t rwx_rights = (access_x << 2) | (access_w << 1) | access_r;
2172     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2173 
2174     vmx_vmcs_enter(v);
2175 
2176     __vmread(EXIT_QUALIFICATION, &exit_qual);
2177     rc = nept_translate_l2ga(v, L2_gpa, page_order, rwx_rights, &gfn, p2m_acc,
2178                              &exit_qual, &exit_reason);
2179     switch ( rc )
2180     {
2181     case EPT_TRANSLATE_SUCCEED:
2182         *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
2183         rc = NESTEDHVM_PAGEFAULT_DONE;
2184         break;
2185     case EPT_TRANSLATE_VIOLATION:
2186     case EPT_TRANSLATE_MISCONFIG:
2187         rc = NESTEDHVM_PAGEFAULT_INJECT;
2188         nvmx->ept.exit_reason = exit_reason;
2189         nvmx->ept.exit_qual = exit_qual;
2190         break;
2191     case EPT_TRANSLATE_RETRY:
2192         rc = NESTEDHVM_PAGEFAULT_RETRY;
2193         break;
2194     default:
2195         gdprintk(XENLOG_ERR, "GUEST EPT translation error: %d\n", rc);
2196         BUG();
2197         break;
2198     }
2199 
2200     vmx_vmcs_exit(v);
2201 
2202     return rc;
2203 }
2204 
2205 void nvmx_idtv_handling(void)
2206 {
2207     struct vcpu *v = current;
2208     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2209     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
2210     unsigned long idtv_info, reason;
2211 
2212     __vmread(IDT_VECTORING_INFO, &idtv_info);
2213     if ( likely(!(idtv_info & INTR_INFO_VALID_MASK)) )
2214         return;
2215 
2216     /*
2217      * If L0 can resolve the fault that caused the IDT vectoring, the
2218      * event should be reinjected here; otherwise, pass it on to L1.
2219      */
2220     __vmread(VM_EXIT_REASON, &reason);
2221     if ( reason != EXIT_REASON_EPT_VIOLATION ?
2222          !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) :
2223          !nvcpu->nv_vmexit_pending )
2224     {
2225         __vmwrite(VM_ENTRY_INTR_INFO, idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2226         if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2227         {
2228             __vmread(IDT_VECTORING_ERROR_CODE, &reason);
2229             __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, reason);
2230         }
2231         /*
2232          * SDM 23.2.4, if L1 tries to inject a software interrupt
2233          * and the delivery fails, VM_EXIT_INSTRUCTION_LEN receives
2234          * the value of previous VM_ENTRY_INSTRUCTION_LEN.
2235          *
2236          * This means EXIT_INSTRUCTION_LEN is always valid here, for
2237          * software interrupts both injected by L1, and generated in L2.
2238          */
2239         __vmread(VM_EXIT_INSTRUCTION_LEN, &reason);
2240         __vmwrite(VM_ENTRY_INSTRUCTION_LEN, reason);
2241     }
2242 }
2243 
2244 /*
2245  * L2 VMExit handling
2246  *    return 1: Done, or skip the normal layer-0 hypervisor processing.
2247  *              Typically the exit requires layer-1 hypervisor processing,
2248  *              or it has already been handled here.
2249  *           0: The normal layer-0 processing is required.
2250  */
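/*
 * A sketch of how the return value is typically consumed from the L0 exit
 * path (illustrative only, not the actual vmx.c call site):
 *
 *     if ( nestedhvm_vcpu_in_guestmode(v) &&
 *          nvmx_n2_vmexit_handler(regs, exit_reason) )
 *         return;
 */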
2251 int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
2252                                unsigned int exit_reason)
2253 {
2254     struct vcpu *v = current;
2255     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
2256     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2257     u32 ctrl;
2258 
2259     nvcpu->nv_vmexit_pending = 0;
2260     nvmx->intr.intr_info = 0;
2261     nvmx->intr.error_code = 0;
2262 
2263     switch ( exit_reason ) {
2264     case EXIT_REASON_EXCEPTION_NMI:
2265     {
2266         unsigned long intr_info;
2267         u32 valid_mask = MASK_INSR(X86_EVENTTYPE_HW_EXCEPTION,
2268                                   INTR_INFO_INTR_TYPE_MASK) |
2269                          INTR_INFO_VALID_MASK;
2270         u64 exec_bitmap;
2271         int vector;
2272 
2273         __vmread(VM_EXIT_INTR_INFO, &intr_info);
2274         vector = intr_info & INTR_INFO_VECTOR_MASK;
2275         /*
2276          * Decided by the L0 and L1 exception bitmaps: if the vector is
2277          * set in both, L0 has priority for #PF and #NM, L1 for the others.
2278          */
2279         if ( vector == TRAP_page_fault )
2280         {
2281             if ( paging_mode_hap(v->domain) )
2282                 nvcpu->nv_vmexit_pending = 1;
2283         }
2284         else if ( vector == TRAP_no_device )
2285         {
2286             if ( v->fpu_dirtied )
2287                 nvcpu->nv_vmexit_pending = 1;
2288         }
2289         else if ( (intr_info & valid_mask) == valid_mask )
2290         {
2291             exec_bitmap = get_vvmcs(v, EXCEPTION_BITMAP);
2292 
2293             if ( exec_bitmap & (1ULL << vector) )
2294                 nvcpu->nv_vmexit_pending = 1;
2295         }
2296         break;
2297     }
2298     case EXIT_REASON_WBINVD:
2299     case EXIT_REASON_EPT_VIOLATION:
2300     case EXIT_REASON_EPT_MISCONFIG:
2301     case EXIT_REASON_EXTERNAL_INTERRUPT:
2302         /* pass to L0 handler */
2303         break;
2304     case VMX_EXIT_REASONS_FAILED_VMENTRY:
2305     case EXIT_REASON_TRIPLE_FAULT:
2306     case EXIT_REASON_TASK_SWITCH:
2307     case EXIT_REASON_CPUID:
2308     case EXIT_REASON_VMCALL:
2309     case EXIT_REASON_VMCLEAR:
2310     case EXIT_REASON_VMLAUNCH:
2311     case EXIT_REASON_VMPTRLD:
2312     case EXIT_REASON_VMPTRST:
2313     case EXIT_REASON_VMREAD:
2314     case EXIT_REASON_VMRESUME:
2315     case EXIT_REASON_VMWRITE:
2316     case EXIT_REASON_VMXOFF:
2317     case EXIT_REASON_VMXON:
2318     case EXIT_REASON_INVEPT:
2319     case EXIT_REASON_XSETBV:
2320         /* inject to L1 */
2321         nvcpu->nv_vmexit_pending = 1;
2322         break;
2323 
2324     case EXIT_REASON_MSR_READ:
2325     case EXIT_REASON_MSR_WRITE:
2326         ctrl = __n2_exec_control(v);
2327 
2328         /* Without ACTIVATE_MSR_BITMAP, all MSRs are intercepted. */
2329         if ( !(ctrl & CPU_BASED_ACTIVATE_MSR_BITMAP) )
2330             nvcpu->nv_vmexit_pending = 1;
2331         else if ( !nvmx->msrbitmap )
2332             /* ACTIVATE_MSR_BITMAP is set, but the L2 MSR bitmap isn't mapped. */
2333             domain_crash(v->domain);
2334         else
2335             nvcpu->nv_vmexit_pending =
2336                 vmx_msr_is_intercepted(nvmx->msrbitmap, regs->ecx,
2337                                        exit_reason == EXIT_REASON_MSR_WRITE);
2338         break;
2339 
2340     case EXIT_REASON_IO_INSTRUCTION:
2341         ctrl = __n2_exec_control(v);
2342         if ( ctrl & CPU_BASED_ACTIVATE_IO_BITMAP )
2343         {
2344             unsigned long qual;
2345             u16 port, size;
2346 
2347             __vmread(EXIT_QUALIFICATION, &qual);
2348             port = qual >> 16;
2349             size = (qual & 7) + 1;
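            /*
             * The I/O bitmaps are two 4K pages: bitmap A covers ports
             * 0x0000-0x7fff, bitmap B covers 0x8000-0xffff, one bit per
             * port.  Check every port touched by the access and exit to L1
             * if any bit is set or if the access wraps past port 0xffff.
             */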
2350             do {
2351                 const u8 *bitmap = nvmx->iobitmap[port >> 15];
2352 
2353                 if ( bitmap[(port & 0x7fff) >> 3] & (1 << (port & 7)) )
2354                     nvcpu->nv_vmexit_pending = 1;
2355                 if ( !--size )
2356                     break;
2357                 if ( !++port )
2358                     nvcpu->nv_vmexit_pending = 1;
2359             } while ( !nvcpu->nv_vmexit_pending );
2360             if ( !nvcpu->nv_vmexit_pending )
2361                 printk(XENLOG_G_WARNING "L0 PIO %04x\n", port);
2362         }
2363         else if ( ctrl & CPU_BASED_UNCOND_IO_EXITING )
2364             nvcpu->nv_vmexit_pending = 1;
2365         break;
2366 
2367     case EXIT_REASON_PENDING_VIRT_INTR:
2368         ctrl = __n2_exec_control(v);
2369         if ( ctrl & CPU_BASED_VIRTUAL_INTR_PENDING )
2370             nvcpu->nv_vmexit_pending = 1;
2371         break;
2372     case EXIT_REASON_PENDING_VIRT_NMI:
2373         ctrl = __n2_exec_control(v);
2374         if ( ctrl & CPU_BASED_VIRTUAL_NMI_PENDING )
2375             nvcpu->nv_vmexit_pending = 1;
2376         break;
2377     case EXIT_REASON_MONITOR_TRAP_FLAG:
2378         ctrl = __n2_exec_control(v);
2379         if ( ctrl & CPU_BASED_MONITOR_TRAP_FLAG )
2380             nvcpu->nv_vmexit_pending = 1;
2381         break;
2382     case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
2383     case EXIT_REASON_ACCESS_LDTR_OR_TR:
2384         ctrl = __n2_secondary_exec_control(v);
2385         if ( ctrl & SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING )
2386             nvcpu->nv_vmexit_pending = 1;
2387         break;
2388     case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
2389         ctrl = __n2_pin_exec_control(v);
2390         if ( ctrl & PIN_BASED_PREEMPT_TIMER )
2391             nvcpu->nv_vmexit_pending = 1;
2392         break;
2393     /* L1 has priority in handling several other types of exits. */
2394     case EXIT_REASON_HLT:
2395         ctrl = __n2_exec_control(v);
2396         if ( ctrl & CPU_BASED_HLT_EXITING )
2397             nvcpu->nv_vmexit_pending = 1;
2398         break;
2399     case EXIT_REASON_RDTSC:
2400         ctrl = __n2_exec_control(v);
2401         if ( ctrl & CPU_BASED_RDTSC_EXITING )
2402             nvcpu->nv_vmexit_pending = 1;
2403         else
2404         {
2405             /*
2406              * Special handling is needed if L1 doesn't intercept RDTSC,
2407              * to avoid changing guest_tsc and disturbing L1's timekeeping.
2408              */
2409             msr_split(regs, hvm_get_guest_tsc(v) + get_vvmcs(v, TSC_OFFSET));
2410             update_guest_eip();
2411 
2412             return 1;
2413         }
2414         break;
2415     case EXIT_REASON_RDPMC:
2416         ctrl = __n2_exec_control(v);
2417         if ( ctrl & CPU_BASED_RDPMC_EXITING )
2418             nvcpu->nv_vmexit_pending = 1;
2419         break;
2420     case EXIT_REASON_MWAIT_INSTRUCTION:
2421         ctrl = __n2_exec_control(v);
2422         if ( ctrl & CPU_BASED_MWAIT_EXITING )
2423             nvcpu->nv_vmexit_pending = 1;
2424         break;
2425     case EXIT_REASON_PAUSE_INSTRUCTION:
2426         ctrl = __n2_exec_control(v);
2427         if ( ctrl & CPU_BASED_PAUSE_EXITING )
2428             nvcpu->nv_vmexit_pending = 1;
2429         break;
2430     case EXIT_REASON_MONITOR_INSTRUCTION:
2431         ctrl = __n2_exec_control(v);
2432         if ( ctrl & CPU_BASED_MONITOR_EXITING )
2433             nvcpu->nv_vmexit_pending = 1;
2434         break;
2435     case EXIT_REASON_DR_ACCESS:
2436         ctrl = __n2_exec_control(v);
2437         if ( (ctrl & CPU_BASED_MOV_DR_EXITING) &&
2438             v->arch.hvm_vcpu.flag_dr_dirty )
2439             nvcpu->nv_vmexit_pending = 1;
2440         break;
2441     case EXIT_REASON_INVLPG:
2442         ctrl = __n2_exec_control(v);
2443         if ( ctrl & CPU_BASED_INVLPG_EXITING )
2444             nvcpu->nv_vmexit_pending = 1;
2445         break;
2446     case EXIT_REASON_CR_ACCESS:
2447     {
2448         unsigned long exit_qualification;
2449         int cr, write;
2450         u32 mask = 0;
2451 
2452         __vmread(EXIT_QUALIFICATION, &exit_qualification);
2453         cr = VMX_CONTROL_REG_ACCESS_NUM(exit_qualification);
2454         write = VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification);
2455         /* Also check against the guest's exec_control. */
2456         ctrl = __n2_exec_control(v);
2457 
2458         if ( cr == 3 )
2459         {
2460             mask = write ? CPU_BASED_CR3_STORE_EXITING :
2461                           CPU_BASED_CR3_LOAD_EXITING;
2462             if ( ctrl & mask )
2463                 nvcpu->nv_vmexit_pending = 1;
2464         }
2465         else if ( cr == 8 )
2466         {
2467             mask = write ? CPU_BASED_CR8_STORE_EXITING :
2468                           CPU_BASED_CR8_LOAD_EXITING;
2469             if ( ctrl & mask )
2470                 nvcpu->nv_vmexit_pending = 1;
2471         }
2472         else  /* CR0, CR4, CLTS, LMSW */
2473         {
2474             /*
2475              * On a VM exit for a CR0/CR4 access, check whether the L1 VMM
2476              * owns the bits being changed.  If so, inject the VM exit into
2477              * the L1 VMM.  Otherwise, L0 handles it and syncs the new value
2478              * into L1's virtual VMCS.
2479              */
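            /*
             * Illustration with hypothetical values: if L1 set
             * CR0_GUEST_HOST_MASK to PG|PE and L2 only toggles CR0.MP,
             * changed_bits & cr0_gh_mask is 0, so L0 absorbs the write and
             * refreshes GUEST_CR0 in the vVMCS with the new L2-owned bits.
             */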
2480             unsigned long old_val, val, changed_bits;
2481             switch ( VMX_CONTROL_REG_ACCESS_TYPE(exit_qualification) )
2482             {
2483             case VMX_CONTROL_REG_ACCESS_TYPE_MOV_TO_CR:
2484             {
2485                 unsigned long gp = VMX_CONTROL_REG_ACCESS_GPR(exit_qualification);
2486                 unsigned long *reg;
2487 
2488                 if ( (reg = decode_register(gp, guest_cpu_user_regs(), 0)) == NULL )
2489                 {
2490                     gdprintk(XENLOG_ERR, "invalid gpr: %lx\n", gp);
2491                     break;
2492                 }
2493                 val = *reg;
2494                 if ( cr == 0 )
2495                 {
2496                     u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2497 
2498                     __vmread(CR0_READ_SHADOW, &old_val);
2499                     changed_bits = old_val ^ val;
2500                     if ( changed_bits & cr0_gh_mask )
2501                         nvcpu->nv_vmexit_pending = 1;
2502                     else
2503                     {
2504                         u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2505 
2506                         set_vvmcs(v, GUEST_CR0,
2507                                   (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask));
2508                     }
2509                 }
2510                 else if ( cr == 4 )
2511                 {
2512                     u64 cr4_gh_mask = get_vvmcs(v, CR4_GUEST_HOST_MASK);
2513 
2514                     __vmread(CR4_READ_SHADOW, &old_val);
2515                     changed_bits = old_val ^ val;
2516                     if ( changed_bits & cr4_gh_mask )
2517                         nvcpu->nv_vmexit_pending = 1;
2518                     else
2519                     {
2520                         u64 guest_cr4 = get_vvmcs(v, GUEST_CR4);
2521 
2522                         set_vvmcs(v, GUEST_CR4,
2523                                   (guest_cr4 & cr4_gh_mask) | (val & ~cr4_gh_mask));
2524                     }
2525                 }
2526                 else
2527                     nvcpu->nv_vmexit_pending = 1;
2528                 break;
2529             }
2530             case VMX_CONTROL_REG_ACCESS_TYPE_CLTS:
2531             {
2532                 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2533 
2534                 if ( cr0_gh_mask & X86_CR0_TS )
2535                     nvcpu->nv_vmexit_pending = 1;
2536                 else
2537                 {
2538                     u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2539 
2540                     set_vvmcs(v, GUEST_CR0, (guest_cr0 & ~X86_CR0_TS));
2541                 }
2542                 break;
2543             }
2544             case VMX_CONTROL_REG_ACCESS_TYPE_LMSW:
2545             {
2546                 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2547 
2548                 __vmread(CR0_READ_SHADOW, &old_val);
2549                 old_val &= X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS;
2550                 val = VMX_CONTROL_REG_ACCESS_DATA(exit_qualification) &
2551                       (X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS);
2552                 changed_bits = old_val ^ val;
2553                 if ( changed_bits & cr0_gh_mask )
2554                     nvcpu->nv_vmexit_pending = 1;
2555                 else
2556                 {
2557                     u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2558 
2559                     set_vvmcs(v, GUEST_CR0, (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask));
2560                 }
2561                 break;
2562             }
2563             default:
2564                 break;
2565             }
2566         }
2567         break;
2568     }
2569     case EXIT_REASON_APIC_ACCESS:
2570         ctrl = __n2_secondary_exec_control(v);
2571         if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
2572             nvcpu->nv_vmexit_pending = 1;
2573         break;
2574     case EXIT_REASON_TPR_BELOW_THRESHOLD:
2575         ctrl = __n2_exec_control(v);
2576         if ( ctrl & CPU_BASED_TPR_SHADOW )
2577             nvcpu->nv_vmexit_pending = 1;
2578         break;
2579     default:
2580         gprintk(XENLOG_ERR, "Unexpected nested vmexit: reason %u\n",
2581                 exit_reason);
2582     }
2583 
2584     return ( nvcpu->nv_vmexit_pending == 1 );
2585 }
2586 
2587 void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr)
2588 {
2589     unsigned long cr_field, read_shadow_field, mask_field;
2590 
2591     switch ( cr )
2592     {
2593     case 0:
2594         cr_field = GUEST_CR0;
2595         read_shadow_field = CR0_READ_SHADOW;
2596         mask_field = CR0_GUEST_HOST_MASK;
2597         break;
2598     case 4:
2599         cr_field = GUEST_CR4;
2600         read_shadow_field = CR4_READ_SHADOW;
2601         mask_field = CR4_GUEST_HOST_MASK;
2602         break;
2603     default:
2604         gdprintk(XENLOG_WARNING, "Set read shadow for unsupported CR%u.\n", cr);
2605         return;
2606     }
2607 
2608     if ( !nestedhvm_vmswitch_in_progress(v) )
2609     {
2610         unsigned long virtual_cr_mask =
2611             get_vvmcs(v, mask_field);
2612 
2613         /*
2614          * We get here when L2 changed cr in a way that did not change
2615          * any of L1's shadowed bits (see nvmx_n2_vmexit_handler),
2616          * but did change L0 shadowed bits. So we first calculate the
2617          * effective cr value that L1 would like to write into the
2618          * hardware. It consists of the L2-owned bits from the new
2619          * value combined with the L1-owned bits from L1's guest cr.
2620          */
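        /*
         * I.e. guest_cr[cr] = (value written by L2 & ~mask) |
         *                     (vVMCS guest_cr & mask).
         */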
2621         v->arch.hvm_vcpu.guest_cr[cr] &= ~virtual_cr_mask;
2622         v->arch.hvm_vcpu.guest_cr[cr] |= virtual_cr_mask &
2623             get_vvmcs(v, cr_field);
2624     }
2625 
2626     /* nvcpu.guest_cr holds what L2 actually wrote to the CR. */
2627     __vmwrite(read_shadow_field, v->arch.hvm_vcpu.nvcpu.guest_cr[cr]);
2628 }
2629 
2630 /*
2631  * Local variables:
2632  * mode: C
2633  * c-file-style: "BSD"
2634  * c-basic-offset: 4
2635  * tab-width: 4
2636  * indent-tabs-mode: nil
2637  * End:
2638  */
2639