1 /*
2  * vvmx.c: Support virtual VMX for nested virtualization.
3  *
4  * Copyright (c) 2010, Intel Corporation.
5  * Author: Qing He <qing.he@intel.com>
6  *         Eddie Dong <eddie.dong@intel.com>
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; If not, see <http://www.gnu.org/licenses/>.
19  *
20  */
21 
22 #include <asm/types.h>
23 #include <asm/mtrr.h>
24 #include <asm/p2m.h>
25 #include <asm/hvm/ioreq.h>
26 #include <asm/hvm/vmx/vmx.h>
27 #include <asm/hvm/vmx/vvmx.h>
28 #include <asm/hvm/nestedhvm.h>
29 
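/*
 * Per-pCPU scratch buffer used by vvmcs_to_shadow_bulk() and
 * shadow_to_vvmcs_bulk() to batch up to VMCS_BUF_SIZE field values around a
 * single virtual_vmcs_enter()/virtual_vmcs_exit() pair.
 */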
30 static DEFINE_PER_CPU(u64 *, vvmcs_buf);
31 
32 static void nvmx_purge_vvmcs(struct vcpu *v);
33 
34 static bool nvmx_vcpu_in_vmx(const struct vcpu *v)
35 {
36     return vcpu_2_nvmx(v).vmxon_region_pa != INVALID_PADDR;
37 }
38 
39 #define VMCS_BUF_SIZE 100
40 
41 int nvmx_cpu_up_prepare(unsigned int cpu)
42 {
43     uint64_t **vvmcs_buf;
44 
45     if ( cpu_has_vmx_vmcs_shadowing &&
46          *(vvmcs_buf = &per_cpu(vvmcs_buf, cpu)) == NULL )
47     {
48         void *ptr = xzalloc_array(uint64_t, VMCS_BUF_SIZE);
49 
50         if ( !ptr )
51             return -ENOMEM;
52 
53         *vvmcs_buf = ptr;
54     }
55 
56     return 0;
57 }
58 
59 void nvmx_cpu_dead(unsigned int cpu)
60 {
61     XFREE(per_cpu(vvmcs_buf, cpu));
62 }
63 
64 int nvmx_vcpu_initialise(struct vcpu *v)
65 {
66     struct domain *d = v->domain;
67     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
68     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
69     struct page_info *pg = alloc_domheap_page(NULL, 0);
70 
71     /*
72      * Gross bodge.  The nested p2m logic can't cope with the CVE-2018-12207
73      * workaround of using NX EPT superpages, and livelocks.  Nested HVM isn't
74      * security supported, so disable the workaround until the nested p2m
75      * logic can be improved.
76      */
77     if ( !d->arch.hvm.vmx.exec_sp )
78     {
79         d->arch.hvm.vmx.exec_sp = true;
80         p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_rw);
81     }
82 
83     if ( !pg )
84     {
85         gdprintk(XENLOG_ERR, "nest: allocation for shadow vmcs failed\n");
86         return -ENOMEM;
87     }
88     nvcpu->nv_n2vmcx_pa = page_to_maddr(pg);
89 
90     /* non-root VMREAD/VMWRITE bitmap. */
91     if ( cpu_has_vmx_vmcs_shadowing )
92     {
93         struct page_info *vmread_bitmap, *vmwrite_bitmap;
94         unsigned long *vw;
95 
96         vmread_bitmap = alloc_domheap_page(NULL, 0);
97         if ( !vmread_bitmap )
98         {
99             gdprintk(XENLOG_ERR, "nest: allocation for vmread bitmap failed\n");
100             return -ENOMEM;
101         }
102         v->arch.hvm.vmx.vmread_bitmap = vmread_bitmap;
103 
104         clear_domain_page(page_to_mfn(vmread_bitmap));
105 
106         vmwrite_bitmap = alloc_domheap_page(NULL, 0);
107         if ( !vmwrite_bitmap )
108         {
109             gdprintk(XENLOG_ERR, "nest: allocation for vmwrite bitmap failed\n");
110             return -ENOMEM;
111         }
112         v->arch.hvm.vmx.vmwrite_bitmap = vmwrite_bitmap;
113 
114         vw = __map_domain_page(vmwrite_bitmap);
115         clear_page(vw);
116 
117         /*
118          * The following 6 encodings need to be handled in the VMM,
119          * so let them vmexit as usual.
120          */
121         set_bit(IO_BITMAP_A, vw);
122         set_bit(VMCS_HIGH(IO_BITMAP_A), vw);
123         set_bit(IO_BITMAP_B, vw);
124         set_bit(VMCS_HIGH(IO_BITMAP_B), vw);
125         set_bit(MSR_BITMAP, vw);
126         set_bit(VMCS_HIGH(MSR_BITMAP), vw);
127 
128         unmap_domain_page(vw);
129     }
130 
131     if ( cpu_has_vmx_msr_bitmap )
132     {
133         nvmx->msr_merged = alloc_xenheap_page();
134         if ( !nvmx->msr_merged )
135             return -ENOMEM;
136     }
137 
138     nvmx->ept.enabled = 0;
139     nvmx->guest_vpid = 0;
140     nvmx->vmxon_region_pa = INVALID_PADDR;
141     nvcpu->nv_vvmcx = NULL;
142     nvcpu->nv_vvmcxaddr = INVALID_PADDR;
143     nvmx->intr.intr_info = 0;
144     nvmx->intr.error_code = 0;
145     nvmx->iobitmap[0] = NULL;
146     nvmx->iobitmap[1] = NULL;
147     nvmx->msrbitmap = NULL;
148     INIT_LIST_HEAD(&nvmx->launched_list);
149     return 0;
150 }
151 
152 void nvmx_vcpu_destroy(struct vcpu *v)
153 {
154     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
155     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
156     struct vvmcs_list *item, *n;
157 
158     /*
159      * When destroying the vcpu, it may be running on behalf of the L2
160      * guest.  Therefore we need to switch the VMCS pointer back to the L1
161      * VMCS, in order to avoid a double free of the L2 VMCS and a possible
162      * memory leak of the L1 VMCS page.
163      */
164     if ( nvcpu->nv_n1vmcx_pa )
165         v->arch.hvm.vmx.vmcs_pa = nvcpu->nv_n1vmcx_pa;
166 
167     if ( nvcpu->nv_n2vmcx_pa )
168     {
169         __vmpclear(nvcpu->nv_n2vmcx_pa);
170         free_domheap_page(maddr_to_page(nvcpu->nv_n2vmcx_pa));
171         nvcpu->nv_n2vmcx_pa = 0;
172     }
173 
174     /* Must also cope with nvmx_vcpu_initialise() not having got called. */
175     if ( nvmx->launched_list.next )
176         list_for_each_entry_safe(item, n, &nvmx->launched_list, node)
177         {
178             list_del(&item->node);
179             xfree(item);
180         }
181 
182     if ( v->arch.hvm.vmx.vmread_bitmap )
183     {
184         free_domheap_page(v->arch.hvm.vmx.vmread_bitmap);
185         v->arch.hvm.vmx.vmread_bitmap = NULL;
186     }
187     if ( v->arch.hvm.vmx.vmwrite_bitmap )
188     {
189         free_domheap_page(v->arch.hvm.vmx.vmwrite_bitmap);
190         v->arch.hvm.vmx.vmwrite_bitmap = NULL;
191     }
192 }
193 
194 static void vcpu_relinquish_resources(struct vcpu *v)
195 {
196     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
197 
198     FREE_XENHEAP_PAGE(nvmx->msr_merged);
199 }
200 
201 void nvmx_domain_relinquish_resources(struct domain *d)
202 {
203     struct vcpu *v;
204 
205     for_each_vcpu ( d, v )
206     {
207         nvmx_purge_vvmcs(v);
208         vcpu_relinquish_resources(v);
209     }
210 }
211 
212 int nvmx_vcpu_reset(struct vcpu *v)
213 {
214     return 0;
215 }
216 
217 uint64_t nvmx_vcpu_eptp_base(struct vcpu *v)
218 {
219     return get_vvmcs(v, EPT_POINTER) & PAGE_MASK;
220 }
221 
222 bool_t nvmx_ept_enabled(struct vcpu *v)
223 {
224     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
225 
226     return !!(nvmx->ept.enabled);
227 }
228 
229 struct vmx_inst_decoded {
230 #define VMX_INST_MEMREG_TYPE_MEMORY 0
231 #define VMX_INST_MEMREG_TYPE_REG    1
232     int type;
233     union {
234         struct {
235             unsigned long mem;
236             unsigned int  len;
237         };
238         unsigned int reg1;
239     };
240 
241     unsigned int reg2;
242 };
243 
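/*
 * Flatten a VMCS field encoding (index/type/width) into a u64 slot index
 * within the virtual VMCS page.  Slot 0 (the VPID encoding) is remapped to
 * the otherwise unused slot 0x3f, presumably to keep it clear of the VMCS
 * revision identifier stored at the start of the page.
 */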
244 static int vvmcs_offset(u32 width, u32 type, u32 index)
245 {
246     int offset;
247 
248     offset = (index & 0x1f) | type << 5 | width << 7;
249 
250     if ( offset == 0 )    /* vpid */
251         offset = 0x3f;
252 
253     return offset;
254 }
255 
256 uint64_t get_vvmcs_virtual(void *vvmcs, uint32_t vmcs_encoding)
257 {
258     union vmcs_encoding enc;
259     uint64_t *content = vvmcs;
260     int offset;
261     uint64_t res;
262 
263     enc.word = vmcs_encoding;
264     offset = vvmcs_offset(enc.width, enc.type, enc.index);
265     res = content[offset];
266 
267     switch ( enc.width ) {
268     case VVMCS_WIDTH_16:
269         res &= 0xffff;
270         break;
271     case VVMCS_WIDTH_64:
272         if ( enc.access_type )
273             res >>= 32;
274         break;
275     case VVMCS_WIDTH_32:
276         res &= 0xffffffff;
277         break;
278     case VVMCS_WIDTH_NATURAL:
279     default:
280         break;
281     }
282 
283     return res;
284 }
285 
286 u64 get_vvmcs_real(const struct vcpu *v, u32 encoding)
287 {
288     return virtual_vmcs_vmread(v, encoding);
289 }
290 
291 enum vmx_insn_errno get_vvmcs_virtual_safe(void *vvmcs, u32 encoding, u64 *val)
292 {
293     *val = get_vvmcs_virtual(vvmcs, encoding);
294 
295     /*
296      * TODO: This should not always succeed. Fields and values need to be
297      * audited against the features offered to the guest in the VT-x MSRs.
298      * This should be fixed when the MSR levelling work is started, at which
299      * point there will be a cpuid_policy-like object.
300      */
301     return VMX_INSN_SUCCEED;
302 }
303 
304 enum vmx_insn_errno get_vvmcs_real_safe(const struct vcpu *v, u32 encoding,
305                                         u64 *val)
306 {
307     return virtual_vmcs_vmread_safe(v, encoding, val);
308 }
309 
310 void set_vvmcs_virtual(void *vvmcs, uint32_t vmcs_encoding, uint64_t val)
311 {
312     union vmcs_encoding enc;
313     uint64_t *content = vvmcs;
314     int offset;
315     uint64_t res;
316 
317     enc.word = vmcs_encoding;
318     offset = vvmcs_offset(enc.width, enc.type, enc.index);
319     res = content[offset];
320 
321     switch ( enc.width ) {
322     case VVMCS_WIDTH_16:
323         res = val & 0xffff;
324         break;
325     case VVMCS_WIDTH_64:
326         if ( enc.access_type )
327         {
328             res &= 0xffffffff;
329             res |= val << 32;
330         }
331         else
332             res = val;
333         break;
334     case VVMCS_WIDTH_32:
335         res = val & 0xffffffff;
336         break;
337     case VVMCS_WIDTH_NATURAL:
338     default:
339         res = val;
340         break;
341     }
342 
343     content[offset] = res;
344 }
345 
346 void set_vvmcs_real(const struct vcpu *v, u32 encoding, u64 val)
347 {
348     virtual_vmcs_vmwrite(v, encoding, val);
349 }
350 
351 enum vmx_insn_errno set_vvmcs_virtual_safe(void *vvmcs, u32 encoding, u64 val)
352 {
353     set_vvmcs_virtual(vvmcs, encoding, val);
354 
355     /*
356      * TODO: This should not always succeed. Fields and values need to be
357      * audited against the features offered to the guest in the VT-x MSRs.
358      * This should be fixed when the MSR levelling work is started, at which
359      * point there will be a cpuid_policy-like object.
360      */
361     return VMX_INSN_SUCCEED;
362 }
363 
364 enum vmx_insn_errno set_vvmcs_real_safe(const struct vcpu *v, u32 encoding,
365                                         u64 val)
366 {
367     return virtual_vmcs_vmwrite_safe(v, encoding, val);
368 }
369 
370 static unsigned long reg_read(struct cpu_user_regs *regs,
371                               unsigned int index)
372 {
373     return *decode_gpr(regs, index);
374 }
375 
376 static void reg_write(struct cpu_user_regs *regs,
377                       unsigned int index,
378                       unsigned long value)
379 {
380     *decode_gpr(regs, index) = value;
381 }
382 
383 static inline u32 __n2_pin_exec_control(struct vcpu *v)
384 {
385     return get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
386 }
387 
388 static inline u32 __n2_exec_control(struct vcpu *v)
389 {
390     return get_vvmcs(v, CPU_BASED_VM_EXEC_CONTROL);
391 }
392 
393 static inline u32 __n2_secondary_exec_control(struct vcpu *v)
394 {
395     u64 second_ctrl = 0;
396 
397     if ( __n2_exec_control(v) & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
398         second_ctrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
399 
400     return second_ctrl;
401 }
402 
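/*
 * Decode the register or memory operand of a VMX instruction from
 * VMX_INSTRUCTION_INFO (with the displacement taken from EXIT_QUALIFICATION),
 * applying segmentation plus limit/canonical checks, and optionally copy the
 * operand value into *poperandS.
 */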
403 static int decode_vmx_inst(struct cpu_user_regs *regs,
404                            struct vmx_inst_decoded *decode,
405                            unsigned long *poperandS)
406 {
407     struct vcpu *v = current;
408     union vmx_inst_info info;
409     struct segment_register seg;
410     unsigned long base, index, seg_base, disp, offset;
411     int scale, size;
412 
413     __vmread(VMX_INSTRUCTION_INFO, &offset);
414     info.word = offset;
415 
416     if ( info.fields.memreg ) {
417         decode->type = VMX_INST_MEMREG_TYPE_REG;
418         decode->reg1 = info.fields.reg1;
419         if ( poperandS != NULL )
420             *poperandS = reg_read(regs, decode->reg1);
421     }
422     else
423     {
424         bool mode_64bit = (vmx_guest_x86_mode(v) == 8);
425 
426         decode->type = VMX_INST_MEMREG_TYPE_MEMORY;
427 
428         if ( info.fields.segment > x86_seg_gs )
429             goto gp_fault;
430         hvm_get_segment_register(v, info.fields.segment, &seg);
431         seg_base = seg.base;
432 
433         base = info.fields.base_reg_invalid ? 0 :
434             reg_read(regs, info.fields.base_reg);
435 
436         index = info.fields.index_reg_invalid ? 0 :
437             reg_read(regs, info.fields.index_reg);
438 
439         scale = 1 << info.fields.scaling;
440 
441         __vmread(EXIT_QUALIFICATION, &disp);
442 
443         size = 1 << (info.fields.addr_size + 1);
444 
445         offset = base + index * scale + disp;
446         base = !mode_64bit || info.fields.segment >= x86_seg_fs ?
447                seg_base + offset : offset;
448         if ( offset + size - 1 < offset ||
449              (mode_64bit ?
450               !is_canonical_address((long)base < 0 ? base :
451                                     base + size - 1) :
452               offset + size - 1 > seg.limit) )
453             goto gp_fault;
454 
455         if ( poperandS != NULL )
456         {
457             pagefault_info_t pfinfo;
458             int rc = hvm_copy_from_guest_linear(poperandS, base, size,
459                                                 0, &pfinfo);
460 
461             if ( rc == HVMTRANS_bad_linear_to_gfn )
462                 hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
463             if ( rc != HVMTRANS_okay )
464                 return X86EMUL_EXCEPTION;
465         }
466         decode->mem = base;
467         decode->len = size;
468     }
469 
470     decode->reg2 = info.fields.reg2;
471 
472     return X86EMUL_OKAY;
473 
474 gp_fault:
475     hvm_inject_hw_exception(TRAP_gp_fault, 0);
476     return X86EMUL_EXCEPTION;
477 }
478 
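/*
 * VMX instruction status reporting: VMsucceed clears all arithmetic flags,
 * VMfailValid sets ZF and stores the error code in the current vvmcs, and
 * VMfailInvalid sets CF when no current vvmcs is loaded.
 */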
479 static void vmsucceed(struct cpu_user_regs *regs)
480 {
481     regs->eflags &= ~X86_EFLAGS_ARITH_MASK;
482 }
483 
484 static void vmfail_valid(struct cpu_user_regs *regs, enum vmx_insn_errno errno)
485 {
486     struct vcpu *v = current;
487     unsigned int eflags = regs->eflags;
488 
489     ASSERT(vvmcx_valid(v));
490 
491     regs->eflags = (eflags & ~X86_EFLAGS_ARITH_MASK) | X86_EFLAGS_ZF;
492     set_vvmcs(v, VM_INSTRUCTION_ERROR, errno);
493 }
494 
495 static void vmfail_invalid(struct cpu_user_regs *regs)
496 {
497     struct vcpu *v = current;
498     unsigned int eflags = regs->eflags;
499 
500     ASSERT(!vvmcx_valid(v));
501 
502     regs->eflags = (eflags & ~X86_EFLAGS_ARITH_MASK) | X86_EFLAGS_CF;
503 }
504 
505 static void vmfail(struct cpu_user_regs *regs, enum vmx_insn_errno errno)
506 {
507     if ( errno == VMX_INSN_SUCCEED )
508         return;
509 
510     if ( vvmcx_valid(current) && errno != VMX_INSN_FAIL_INVALID )
511         vmfail_valid(regs, errno);
512     else
513         vmfail_invalid(regs);
514 }
515 
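/*
 * For #PF, the PAGE_FAULT_ERROR_CODE_MASK/MATCH pair qualifies the exception
 * bitmap bit: if the masked error code does not equal the match value, the
 * sense of the bitmap bit is inverted, per the VMX page-fault filtering rules.
 */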
516 bool_t nvmx_intercepts_exception(
517     struct vcpu *v, unsigned int vector, int error_code)
518 {
519     u32 exception_bitmap, pfec_match=0, pfec_mask=0;
520     int r;
521 
522     ASSERT(vector < 32);
523 
524     exception_bitmap = get_vvmcs(v, EXCEPTION_BITMAP);
525     r = exception_bitmap & (1 << vector) ? 1: 0;
526 
527     if ( vector == TRAP_page_fault )
528     {
529         pfec_match = get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MATCH);
530         pfec_mask  = get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MASK);
531         if ( (error_code & pfec_mask) != pfec_match )
532             r = !r;
533     }
534     return r;
535 }
536 
537 /*
538  * Nested VMX uses a "strict" condition: exit from the
539  * L2 guest if either the L1 VMM or the L0 VMM expects to exit.
540  */
541 static inline u32 __shadow_control(struct vcpu *v,
542                                  unsigned int field,
543                                  u32 host_value)
544 {
545     return get_vvmcs(v, field) | host_value;
546 }
547 
548 static void set_shadow_control(struct vcpu *v,
549                                unsigned int field,
550                                u32 host_value)
551 {
552     __vmwrite(field, __shadow_control(v, field, host_value));
553 }
554 
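/*
 * Choose the shadow I/O bitmap handed to hardware based on whether L1's
 * bitmap intercepts ports 0x80 and 0xED; nestedhvm_vcpu_iomap_get() then
 * returns a bitmap matching that combination.
 */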
555 unsigned long *_shadow_io_bitmap(struct vcpu *v)
556 {
557     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
558     int port80, portED;
559     u8 *bitmap;
560 
561     bitmap = nvmx->iobitmap[0];
562     port80 = bitmap[0x80 >> 3] & (1 << (0x80 & 0x7)) ? 1 : 0;
563     portED = bitmap[0xed >> 3] & (1 << (0xed & 0x7)) ? 1 : 0;
564 
565     return nestedhvm_vcpu_iomap_get(port80, portED);
566 }
567 
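/*
 * Build the merged MSR interception bitmap used while running L2: an MSR
 * access is intercepted unless both L1's bitmap and L0's bitmap permit it
 * (bitwise OR of the two), with the whole x2APIC MSR range forced to trap.
 */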
568 static void update_msrbitmap(struct vcpu *v, uint32_t shadow_ctrl)
569 {
570     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
571     struct vmx_msr_bitmap *msr_bitmap = nvmx->msr_merged;
572 
573     if ( !(shadow_ctrl & CPU_BASED_ACTIVATE_MSR_BITMAP) ||
574          !nvmx->msrbitmap )
575        return;
576 
577     bitmap_or(msr_bitmap->read_low, nvmx->msrbitmap->read_low,
578               v->arch.hvm.vmx.msr_bitmap->read_low,
579               sizeof(msr_bitmap->read_low) * 8);
580     bitmap_or(msr_bitmap->read_high, nvmx->msrbitmap->read_high,
581               v->arch.hvm.vmx.msr_bitmap->read_high,
582               sizeof(msr_bitmap->read_high) * 8);
583     bitmap_or(msr_bitmap->write_low, nvmx->msrbitmap->write_low,
584               v->arch.hvm.vmx.msr_bitmap->write_low,
585               sizeof(msr_bitmap->write_low) * 8);
586     bitmap_or(msr_bitmap->write_high, nvmx->msrbitmap->write_high,
587               v->arch.hvm.vmx.msr_bitmap->write_high,
588               sizeof(msr_bitmap->write_high) * 8);
589 
590     /*
591      * Nested VMX doesn't support any x2APIC hardware virtualization, so
592      * make sure all the x2APIC MSRs are trapped.
593      */
594     bitmap_set(msr_bitmap->read_low, MSR_X2APIC_FIRST, 0x100);
595     bitmap_set(msr_bitmap->write_low, MSR_X2APIC_FIRST, 0x100);
596 
597     __vmwrite(MSR_BITMAP, virt_to_maddr(nvmx->msr_merged));
598 }
599 
600 void nvmx_update_exec_control(struct vcpu *v, u32 host_cntrl)
601 {
602     u32 pio_cntrl = (CPU_BASED_ACTIVATE_IO_BITMAP
603                      | CPU_BASED_UNCOND_IO_EXITING);
604     unsigned long *bitmap;
605     u32 shadow_cntrl;
606 
607     shadow_cntrl = __n2_exec_control(v);
608     pio_cntrl &= shadow_cntrl;
609     /* Enforce the removed features */
610     shadow_cntrl &= ~(CPU_BASED_ACTIVATE_IO_BITMAP
611                       | CPU_BASED_UNCOND_IO_EXITING);
612     /*
613      * Do NOT enforce the MSR bitmap currently used by L1, as certain hardware
614      * virtualization features require specific MSR bitmap settings, but
615      * without the guest also using these same features the bitmap could be
616      * leaking through unwanted MSR accesses.
617      */
618     shadow_cntrl |= host_cntrl & ~CPU_BASED_ACTIVATE_MSR_BITMAP;
619     if ( !(shadow_cntrl & host_cntrl & CPU_BASED_ACTIVATE_MSR_BITMAP) )
620       shadow_cntrl &= ~CPU_BASED_ACTIVATE_MSR_BITMAP;
621     if ( pio_cntrl == CPU_BASED_UNCOND_IO_EXITING ) {
622         /* L1 VMM intercepts all I/O instructions */
623         shadow_cntrl |= CPU_BASED_UNCOND_IO_EXITING;
624         shadow_cntrl &= ~CPU_BASED_ACTIVATE_IO_BITMAP;
625     }
626     else {
627         /* Use IO_BITMAP in shadow */
628         if ( pio_cntrl == 0 ) {
629             /*
630              * The L1 VMM doesn't intercept I/O instructions.
631              * Use the host configuration and reset IO_BITMAP.
632              */
633             bitmap = hvm_io_bitmap;
634         }
635         else {
636             /* use IO bitmap */
637             bitmap = _shadow_io_bitmap(v);
638         }
639         __vmwrite(IO_BITMAP_A, virt_to_maddr(bitmap));
640         __vmwrite(IO_BITMAP_B, virt_to_maddr(bitmap) + PAGE_SIZE);
641     }
642 
643     update_msrbitmap(v, shadow_cntrl);
644 
645     /* TODO: change L0 intr window to MTF or NMI window */
646     __vmwrite(CPU_BASED_VM_EXEC_CONTROL, shadow_cntrl);
647 }
648 
649 void nvmx_update_secondary_exec_control(struct vcpu *v,
650                                         unsigned long host_cntrl)
651 {
652     u32 shadow_cntrl;
653     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
654     u32 apicv_bit = SECONDARY_EXEC_APIC_REGISTER_VIRT |
655                     SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
656                     SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
657 
658     host_cntrl &= ~apicv_bit;
659     host_cntrl &= ~SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
660     shadow_cntrl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
661 
662     /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
663     ASSERT(!(shadow_cntrl & apicv_bit));
664 
665     nvmx->ept.enabled = !!(shadow_cntrl & SECONDARY_EXEC_ENABLE_EPT);
666     shadow_cntrl |= host_cntrl;
667     __vmwrite(SECONDARY_VM_EXEC_CONTROL, shadow_cntrl);
668 }
669 
670 static void nvmx_update_pin_control(struct vcpu *v, unsigned long host_cntrl)
671 {
672     u32 shadow_cntrl;
673 
674     host_cntrl &= ~PIN_BASED_POSTED_INTERRUPT;
675     shadow_cntrl = get_vvmcs(v, PIN_BASED_VM_EXEC_CONTROL);
676 
677     /* No vAPIC-v support, so it shouldn't be set in vmcs12. */
678     ASSERT(!(shadow_cntrl & PIN_BASED_POSTED_INTERRUPT));
679 
680     shadow_cntrl |= host_cntrl;
681     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, shadow_cntrl);
682 }
683 
684 static void nvmx_update_exit_control(struct vcpu *v, unsigned long host_cntrl)
685 {
686     u32 shadow_cntrl;
687 
688     shadow_cntrl = get_vvmcs(v, VM_EXIT_CONTROLS);
689     shadow_cntrl &= ~(VM_EXIT_SAVE_DEBUG_CNTRLS
690                       | VM_EXIT_LOAD_HOST_PAT
691                       | VM_EXIT_LOAD_HOST_EFER
692                       | VM_EXIT_LOAD_PERF_GLOBAL_CTRL);
693     shadow_cntrl |= host_cntrl;
694     __vmwrite(VM_EXIT_CONTROLS, shadow_cntrl);
695 }
696 
697 static void nvmx_update_entry_control(struct vcpu *v)
698 {
699     u32 shadow_cntrl;
700 
701     shadow_cntrl = get_vvmcs(v, VM_ENTRY_CONTROLS);
702     shadow_cntrl &= ~(VM_ENTRY_LOAD_GUEST_PAT
703                       | VM_ENTRY_LOAD_GUEST_EFER
704                       | VM_ENTRY_LOAD_PERF_GLOBAL_CTRL);
705     __vmwrite(VM_ENTRY_CONTROLS, shadow_cntrl);
706 }
707 
708 void nvmx_update_exception_bitmap(struct vcpu *v, unsigned long value)
709 {
710     set_shadow_control(v, EXCEPTION_BITMAP, value);
711 }
712 
713 static void nvmx_update_apic_access_address(struct vcpu *v)
714 {
715     u32 ctrl;
716 
717     ctrl = __n2_secondary_exec_control(v);
718     if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
719     {
720         p2m_type_t p2mt;
721         unsigned long apic_gpfn;
722         struct page_info *apic_pg;
723 
724         apic_gpfn = get_vvmcs(v, APIC_ACCESS_ADDR) >> PAGE_SHIFT;
725         apic_pg = get_page_from_gfn(v->domain, apic_gpfn, &p2mt, P2M_ALLOC);
726         ASSERT(apic_pg && !p2m_is_paging(p2mt));
727         __vmwrite(APIC_ACCESS_ADDR, page_to_maddr(apic_pg));
728         put_page(apic_pg);
729     }
730     else
731         __vmwrite(APIC_ACCESS_ADDR, 0);
732 }
733 
734 static void nvmx_update_virtual_apic_address(struct vcpu *v)
735 {
736     u32 ctrl;
737 
738     ctrl = __n2_exec_control(v);
739     if ( ctrl & CPU_BASED_TPR_SHADOW )
740     {
741         p2m_type_t p2mt;
742         unsigned long vapic_gpfn;
743         struct page_info *vapic_pg;
744 
745         vapic_gpfn = get_vvmcs(v, VIRTUAL_APIC_PAGE_ADDR) >> PAGE_SHIFT;
746         vapic_pg = get_page_from_gfn(v->domain, vapic_gpfn, &p2mt, P2M_ALLOC);
747         ASSERT(vapic_pg && !p2m_is_paging(p2mt));
748         __vmwrite(VIRTUAL_APIC_PAGE_ADDR, page_to_maddr(vapic_pg));
749         put_page(vapic_pg);
750     }
751     else
752         __vmwrite(VIRTUAL_APIC_PAGE_ADDR, 0);
753 }
754 
755 static void nvmx_update_tpr_threshold(struct vcpu *v)
756 {
757     u32 ctrl = __n2_exec_control(v);
758 
759     if ( ctrl & CPU_BASED_TPR_SHADOW )
760         __vmwrite(TPR_THRESHOLD, get_vvmcs(v, TPR_THRESHOLD));
761     else
762         __vmwrite(TPR_THRESHOLD, 0);
763 }
764 
765 static void nvmx_update_pfec(struct vcpu *v)
766 {
767     __vmwrite(PAGE_FAULT_ERROR_CODE_MASK,
768               get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MASK));
769     __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH,
770               get_vvmcs(v, PAGE_FAULT_ERROR_CODE_MATCH));
771 }
772 
773 static void __clear_current_vvmcs(struct vcpu *v)
774 {
775     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
776 
777     if ( nvcpu->nv_n2vmcx_pa )
778         __vmpclear(nvcpu->nv_n2vmcx_pa);
779 }
780 
781 static void unmap_msr_bitmap(struct vcpu *v)
782 {
783     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
784 
785     if ( nvmx->msrbitmap )
786     {
787         hvm_unmap_guest_frame(nvmx->msrbitmap, 1);
788         nvmx->msrbitmap = NULL;
789     }
790 }
791 
792 /*
793  * Refreshes the MSR bitmap mapping for the current nested vcpu.  Returns true
794  * for a successful mapping, and returns false for MSR_BITMAP parameter errors
795  * or gfn mapping errors.
796  */
797 static bool __must_check _map_msr_bitmap(struct vcpu *v)
798 {
799     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
800     uint64_t gpa;
801 
802     unmap_msr_bitmap(v);
803     gpa = get_vvmcs(v, MSR_BITMAP);
804 
805     if ( !IS_ALIGNED(gpa, PAGE_SIZE) )
806         return false;
807 
808     nvmx->msrbitmap = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);
809 
810     return nvmx->msrbitmap != NULL;
811 }
812 
813 static void unmap_io_bitmap(struct vcpu *v, unsigned int idx)
814 {
815     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
816 
817     if ( nvmx->iobitmap[idx] )
818     {
819         hvm_unmap_guest_frame(nvmx->iobitmap[idx], 1);
820         nvmx->iobitmap[idx] = NULL;
821     }
822 }
823 
824 static bool_t __must_check _map_io_bitmap(struct vcpu *v, u64 vmcs_reg)
825 {
826     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
827     unsigned long gpa;
828     int index;
829 
830     index = vmcs_reg == IO_BITMAP_A ? 0 : 1;
831     unmap_io_bitmap(v, index);
832     gpa = get_vvmcs(v, vmcs_reg);
833     nvmx->iobitmap[index] = hvm_map_guest_frame_ro(gpa >> PAGE_SHIFT, 1);
834 
835     return nvmx->iobitmap[index] != NULL;
836 }
837 
838 static inline bool_t __must_check map_io_bitmap_all(struct vcpu *v)
839 {
840     return _map_io_bitmap(v, IO_BITMAP_A) &&
841            _map_io_bitmap(v, IO_BITMAP_B);
842 }
843 
844 static void nvmx_purge_vvmcs(struct vcpu *v)
845 {
846     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
847     int i;
848 
849     __clear_current_vvmcs(v);
850     if ( vvmcx_valid(v) )
851         hvm_unmap_guest_frame(nvcpu->nv_vvmcx, 1);
852     nvcpu->nv_vvmcx = NULL;
853     nvcpu->nv_vvmcxaddr = INVALID_PADDR;
854     v->arch.hvm.vmx.vmcs_shadow_maddr = 0;
855 
856     for ( i = 0; i < 2; i++ )
857         unmap_io_bitmap(v, i);
858 
859     unmap_msr_bitmap(v);
860 }
861 
862 u64 nvmx_get_tsc_offset(struct vcpu *v)
863 {
864     u64 offset = 0;
865 
866     if ( get_vvmcs(v, CPU_BASED_VM_EXEC_CONTROL) &
867          CPU_BASED_USE_TSC_OFFSETING )
868         offset = get_vvmcs(v, TSC_OFFSET);
869 
870     return offset;
871 }
872 
873 /*
874  * Context synchronized between shadow and virtual VMCS.
875  */
876 static const u16 vmcs_gstate_field[] = {
877     /* 16 BITS */
878     GUEST_ES_SELECTOR,
879     GUEST_CS_SELECTOR,
880     GUEST_SS_SELECTOR,
881     GUEST_DS_SELECTOR,
882     GUEST_FS_SELECTOR,
883     GUEST_GS_SELECTOR,
884     GUEST_LDTR_SELECTOR,
885     GUEST_TR_SELECTOR,
886     /* 64 BITS */
887     VMCS_LINK_POINTER,
888     GUEST_IA32_DEBUGCTL,
889     GUEST_PAT,
890     GUEST_EFER,
891     GUEST_PERF_GLOBAL_CTRL,
892     /* 32 BITS */
893     GUEST_ES_LIMIT,
894     GUEST_CS_LIMIT,
895     GUEST_SS_LIMIT,
896     GUEST_DS_LIMIT,
897     GUEST_FS_LIMIT,
898     GUEST_GS_LIMIT,
899     GUEST_LDTR_LIMIT,
900     GUEST_TR_LIMIT,
901     GUEST_GDTR_LIMIT,
902     GUEST_IDTR_LIMIT,
903     GUEST_ES_AR_BYTES,
904     GUEST_CS_AR_BYTES,
905     GUEST_SS_AR_BYTES,
906     GUEST_DS_AR_BYTES,
907     GUEST_FS_AR_BYTES,
908     GUEST_GS_AR_BYTES,
909     GUEST_LDTR_AR_BYTES,
910     GUEST_TR_AR_BYTES,
911     GUEST_INTERRUPTIBILITY_INFO,
912     GUEST_ACTIVITY_STATE,
913     GUEST_SYSENTER_CS,
914     GUEST_PREEMPTION_TIMER,
915     /* natural */
916     GUEST_ES_BASE,
917     GUEST_CS_BASE,
918     GUEST_SS_BASE,
919     GUEST_DS_BASE,
920     GUEST_FS_BASE,
921     GUEST_GS_BASE,
922     GUEST_LDTR_BASE,
923     GUEST_TR_BASE,
924     GUEST_GDTR_BASE,
925     GUEST_IDTR_BASE,
926     GUEST_DR7,
927     /*
928      * Following guest states are in local cache (cpu_user_regs)
929      GUEST_RSP,
930      GUEST_RIP,
931      */
932     GUEST_RFLAGS,
933     GUEST_PENDING_DBG_EXCEPTIONS,
934     GUEST_SYSENTER_ESP,
935     GUEST_SYSENTER_EIP,
936 };
937 
938 static const u16 gpdpte_fields[] = {
939     GUEST_PDPTE(0),
940     GUEST_PDPTE(1),
941     GUEST_PDPTE(2),
942     GUEST_PDPTE(3),
943 };
944 
945 /*
946  * Context: shadow -> virtual VMCS
947  */
948 static const u16 vmcs_ro_field[] = {
949     GUEST_PHYSICAL_ADDRESS,
950     VM_INSTRUCTION_ERROR,
951     VM_EXIT_REASON,
952     VM_EXIT_INTR_INFO,
953     VM_EXIT_INTR_ERROR_CODE,
954     IDT_VECTORING_INFO,
955     IDT_VECTORING_ERROR_CODE,
956     VM_EXIT_INSTRUCTION_LEN,
957     VMX_INSTRUCTION_INFO,
958     EXIT_QUALIFICATION,
959     GUEST_LINEAR_ADDRESS
960 };
961 
962 static struct vmcs_host_to_guest {
963     u16 host_field;
964     u16 guest_field;
965 } const vmcs_h2g_field[] = {
966     {HOST_ES_SELECTOR, GUEST_ES_SELECTOR},
967     {HOST_CS_SELECTOR, GUEST_CS_SELECTOR},
968     {HOST_SS_SELECTOR, GUEST_SS_SELECTOR},
969     {HOST_DS_SELECTOR, GUEST_DS_SELECTOR},
970     {HOST_FS_SELECTOR, GUEST_FS_SELECTOR},
971     {HOST_GS_SELECTOR, GUEST_GS_SELECTOR},
972     {HOST_TR_SELECTOR, GUEST_TR_SELECTOR},
973     {HOST_SYSENTER_CS, GUEST_SYSENTER_CS},
974     {HOST_FS_BASE, GUEST_FS_BASE},
975     {HOST_GS_BASE, GUEST_GS_BASE},
976     {HOST_TR_BASE, GUEST_TR_BASE},
977     {HOST_GDTR_BASE, GUEST_GDTR_BASE},
978     {HOST_IDTR_BASE, GUEST_IDTR_BASE},
979     {HOST_SYSENTER_ESP, GUEST_SYSENTER_ESP},
980     {HOST_SYSENTER_EIP, GUEST_SYSENTER_EIP},
981 };
982 
983 static void vvmcs_to_shadow(const struct vcpu *v, unsigned int field)
984 {
985     __vmwrite(field, get_vvmcs(v, field));
986 }
987 
988 static void vvmcs_to_shadow_bulk(struct vcpu *v, unsigned int n,
989                                  const u16 *field)
990 {
991     u64 *value = this_cpu(vvmcs_buf);
992     unsigned int i;
993 
994     if ( !cpu_has_vmx_vmcs_shadowing )
995         goto fallback;
996 
997     if ( n > VMCS_BUF_SIZE )
998     {
999         if ( IS_ENABLED(CONFIG_DEBUG) )
1000             printk_once(XENLOG_ERR "%pv VMCS sync too many fields %u\n",
1001                         v, n);
1002         goto fallback;
1003     }
1004 
1005     virtual_vmcs_enter(v);
1006     for ( i = 0; i < n; i++ )
1007         __vmread(field[i], &value[i]);
1008     virtual_vmcs_exit(v);
1009 
1010     for ( i = 0; i < n; i++ )
1011         __vmwrite(field[i], value[i]);
1012 
1013     return;
1014 
1015 fallback:
1016     for ( i = 0; i < n; i++ )
1017         vvmcs_to_shadow(v, field[i]);
1018 }
1019 
1020 static inline void shadow_to_vvmcs(const struct vcpu *v, unsigned int field)
1021 {
1022     unsigned long value;
1023 
1024     if ( vmread_safe(field, &value) == 0 )
1025         set_vvmcs(v, field, value);
1026 }
1027 
1028 static void shadow_to_vvmcs_bulk(struct vcpu *v, unsigned int n,
1029                                  const u16 *field)
1030 {
1031     u64 *value = this_cpu(vvmcs_buf);
1032     unsigned int i;
1033 
1034     if ( !cpu_has_vmx_vmcs_shadowing )
1035         goto fallback;
1036 
1037     if ( n > VMCS_BUF_SIZE )
1038     {
1039         if ( IS_ENABLED(CONFIG_DEBUG) )
1040             printk_once(XENLOG_ERR "%pv VMCS sync too many fields %u\n",
1041                         v, n);
1042         goto fallback;
1043     }
1044 
1045     for ( i = 0; i < n; i++ )
1046         __vmread(field[i], &value[i]);
1047 
1048     virtual_vmcs_enter(v);
1049     for ( i = 0; i < n; i++ )
1050         __vmwrite(field[i], value[i]);
1051     virtual_vmcs_exit(v);
1052 
1053     return;
1054 
1055 fallback:
1056     for ( i = 0; i < n; i++ )
1057         shadow_to_vvmcs(v, field[i]);
1058 }
1059 
1060 static void load_shadow_control(struct vcpu *v)
1061 {
1062     /*
1063      * Set shadow controls:  PIN_BASED, CPU_BASED, EXIT, ENTRY
1064      * and EXCEPTION
1065      * Enforce the removed features
1066      */
1067     nvmx_update_pin_control(v, vmx_pin_based_exec_control);
1068     vmx_update_cpu_exec_control(v);
1069     vmx_update_secondary_exec_control(v);
1070     nvmx_update_exit_control(v, vmx_vmexit_control);
1071     nvmx_update_entry_control(v);
1072     vmx_update_exception_bitmap(v);
1073     nvmx_update_apic_access_address(v);
1074     nvmx_update_virtual_apic_address(v);
1075     nvmx_update_tpr_threshold(v);
1076     nvmx_update_pfec(v);
1077 }
1078 
1079 static void load_shadow_guest_state(struct vcpu *v)
1080 {
1081     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1082     u32 control;
1083     u64 cr_gh_mask, cr_read_shadow;
1084     int rc;
1085 
1086     static const u16 vmentry_fields[] = {
1087         VM_ENTRY_INTR_INFO,
1088         VM_ENTRY_EXCEPTION_ERROR_CODE,
1089         VM_ENTRY_INSTRUCTION_LEN,
1090     };
1091 
1092     /* vvmcs.gstate to shadow vmcs.gstate */
1093     vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmcs_gstate_field),
1094                          vmcs_gstate_field);
1095 
1096     nvcpu->guest_cr[0] = get_vvmcs(v, CR0_READ_SHADOW);
1097     nvcpu->guest_cr[4] = get_vvmcs(v, CR4_READ_SHADOW);
1098 
1099     rc = hvm_set_cr4(get_vvmcs(v, GUEST_CR4), true);
1100     if ( rc == X86EMUL_EXCEPTION )
1101         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1102 
1103     rc = hvm_set_cr0(get_vvmcs(v, GUEST_CR0), true);
1104     if ( rc == X86EMUL_EXCEPTION )
1105         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1106 
1107     rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
1108     if ( rc == X86EMUL_EXCEPTION )
1109         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1110 
1111     control = get_vvmcs(v, VM_ENTRY_CONTROLS);
1112     if ( control & VM_ENTRY_LOAD_GUEST_PAT )
1113         hvm_set_guest_pat(v, get_vvmcs(v, GUEST_PAT));
1114     if ( control & VM_ENTRY_LOAD_PERF_GLOBAL_CTRL )
1115     {
1116         rc = hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
1117                                      get_vvmcs(v, GUEST_PERF_GLOBAL_CTRL), false);
1118         if ( rc == X86EMUL_EXCEPTION )
1119             hvm_inject_hw_exception(TRAP_gp_fault, 0);
1120     }
1121 
1122     hvm_set_tsc_offset(v, v->arch.hvm.cache_tsc_offset, 0);
1123 
1124     vvmcs_to_shadow_bulk(v, ARRAY_SIZE(vmentry_fields), vmentry_fields);
1125 
1126     /*
1127      * While emulating CR0 and CR4 for nested virtualization, set the CR0/CR4
1128      * guest/host mask to 0xffffffff in the shadow VMCS (following the host L1
1129      * VMCS), then calculate the corresponding read shadow separately for CR0 and CR4.
1130      */
1131     cr_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
1132     cr_read_shadow = (get_vvmcs(v, GUEST_CR0) & ~cr_gh_mask) |
1133                      (get_vvmcs(v, CR0_READ_SHADOW) & cr_gh_mask);
1134     __vmwrite(CR0_READ_SHADOW, cr_read_shadow);
1135 
1136     cr_gh_mask = get_vvmcs(v, CR4_GUEST_HOST_MASK);
1137     cr_read_shadow = (get_vvmcs(v, GUEST_CR4) & ~cr_gh_mask) |
1138                      (get_vvmcs(v, CR4_READ_SHADOW) & cr_gh_mask);
1139     __vmwrite(CR4_READ_SHADOW, cr_read_shadow);
1140     /* Add the nested host mask to the one set by vmx_update_guest_cr. */
1141     v->arch.hvm.vmx.cr4_host_mask |= cr_gh_mask;
1142     __vmwrite(CR4_GUEST_HOST_MASK, v->arch.hvm.vmx.cr4_host_mask);
1143 
1144     /* TODO: CR3 target control */
1145 }
1146 
1147 static uint64_t get_shadow_eptp(struct vcpu *v)
1148 {
1149     struct p2m_domain *p2m = p2m_get_nestedp2m(v);
1150     struct ept_data *ept = &p2m->ept;
1151 
1152     ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
1153     return ept->eptp;
1154 }
1155 
1156 static uint64_t get_host_eptp(struct vcpu *v)
1157 {
1158     return p2m_get_hostp2m(v->domain)->ept.eptp;
1159 }
1160 
1161 static bool_t nvmx_vpid_enabled(const struct vcpu *v)
1162 {
1163     uint32_t second_cntl;
1164 
1165     second_cntl = get_vvmcs(v, SECONDARY_VM_EXEC_CONTROL);
1166     if ( second_cntl & SECONDARY_EXEC_ENABLE_VPID )
1167         return 1;
1168     return 0;
1169 }
1170 
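/*
 * Enable hardware VMCS shadowing for the currently mapped vvmcs: tag its
 * revision id with the shadow-VMCS type bit, link it via VMCS_LINK_POINTER
 * and install the VMREAD/VMWRITE permission bitmaps.
 * nvmx_clear_vmcs_pointer() undoes all of this.
 */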
1171 static void nvmx_set_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs)
1172 {
1173     paddr_t vvmcs_maddr = v->arch.hvm.vmx.vmcs_shadow_maddr;
1174 
1175     __vmpclear(vvmcs_maddr);
1176     vvmcs->vmcs_revision_id |= VMCS_RID_TYPE_MASK;
1177     v->arch.hvm.vmx.secondary_exec_control |=
1178         SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
1179     __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1180               v->arch.hvm.vmx.secondary_exec_control);
1181     __vmwrite(VMCS_LINK_POINTER, vvmcs_maddr);
1182     __vmwrite(VMREAD_BITMAP, page_to_maddr(v->arch.hvm.vmx.vmread_bitmap));
1183     __vmwrite(VMWRITE_BITMAP, page_to_maddr(v->arch.hvm.vmx.vmwrite_bitmap));
1184 }
1185 
1186 static void nvmx_clear_vmcs_pointer(struct vcpu *v, struct vmcs_struct *vvmcs)
1187 {
1188     paddr_t vvmcs_maddr = v->arch.hvm.vmx.vmcs_shadow_maddr;
1189 
1190     __vmpclear(vvmcs_maddr);
1191     vvmcs->vmcs_revision_id &= ~VMCS_RID_TYPE_MASK;
1192     v->arch.hvm.vmx.secondary_exec_control &=
1193         ~SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
1194     __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1195               v->arch.hvm.vmx.secondary_exec_control);
1196     __vmwrite(VMCS_LINK_POINTER, ~0ul);
1197     __vmwrite(VMREAD_BITMAP, 0);
1198     __vmwrite(VMWRITE_BITMAP, 0);
1199 }
1200 
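/*
 * Emulated VM entry from L1 into L2: switch hardware to the n2 VMCS, merge
 * L1's controls and guest state into it, select the shadow (or host) EPTP
 * and the nested VPID, and load RIP/RSP/RFLAGS from the vvmcs.
 */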
1201 static void virtual_vmentry(struct cpu_user_regs *regs)
1202 {
1203     struct vcpu *v = current;
1204     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1205     unsigned long lm_l1, lm_l2;
1206 
1207     vmx_vmcs_switch(v->arch.hvm.vmx.vmcs_pa, nvcpu->nv_n2vmcx_pa);
1208 
1209     nestedhvm_vcpu_enter_guestmode(v);
1210     nvcpu->nv_vmentry_pending = 0;
1211     nvcpu->nv_vmswitch_in_progress = 1;
1212 
1213     /*
1214      * EFER handling:
1215      * hvm_set_efer won't work if CR0.PG = 1, so we change the value
1216      * directly to make hvm_long_mode_active(v) work in L2.
1217      * An additional update_paging_modes call is also needed if
1218      * there is a 32/64 switch. v->arch.hvm.guest_efer doesn't
1219      * need to be saved, since its value on vmexit is determined by
1220      * L1 exit_controls
1221      */
1222     lm_l1 = hvm_long_mode_active(v);
1223     lm_l2 = !!(get_vvmcs(v, VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE);
1224 
1225     if ( lm_l2 )
1226         v->arch.hvm.guest_efer |= EFER_LMA | EFER_LME;
1227     else
1228         v->arch.hvm.guest_efer &= ~(EFER_LMA | EFER_LME);
1229 
1230     load_shadow_control(v);
1231     load_shadow_guest_state(v);
1232 
1233     if ( lm_l1 != lm_l2 )
1234         paging_update_paging_modes(v);
1235 
1236     if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
1237          !(v->arch.hvm.guest_efer & EFER_LMA) )
1238         vvmcs_to_shadow_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);
1239 
1240     regs->rip = get_vvmcs(v, GUEST_RIP);
1241     regs->rsp = get_vvmcs(v, GUEST_RSP);
1242     regs->rflags = get_vvmcs(v, GUEST_RFLAGS);
1243 
1244     /* updating host cr0 to sync TS bit */
1245     __vmwrite(HOST_CR0, v->arch.hvm.vmx.host_cr0);
1246 
1247     /* Set up virtual EPT for the L2 guest. */
1248     if ( nestedhvm_paging_mode_hap(v) )
1249         /* This will setup the initial np2m for the nested vCPU */
1250         __vmwrite(EPT_POINTER, get_shadow_eptp(v));
1251     else
1252         __vmwrite(EPT_POINTER, get_host_eptp(v));
1253 
1254     /* nested VPID support! */
1255     if ( cpu_has_vmx_vpid && nvmx_vpid_enabled(v) )
1256     {
1257         struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1258         uint32_t new_vpid = get_vvmcs(v, VIRTUAL_PROCESSOR_ID);
1259 
1260         if ( nvmx->guest_vpid != new_vpid )
1261         {
1262             hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(v).nv_n2asid);
1263             nvmx->guest_vpid = new_vpid;
1264         }
1265     }
1266 
1267 }
1268 
1269 static void sync_vvmcs_guest_state(struct vcpu *v, struct cpu_user_regs *regs)
1270 {
1271     /* copy shadow vmcs.gstate back to vvmcs.gstate */
1272     shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_gstate_field),
1273                          vmcs_gstate_field);
1274     /* RIP, RSP are in user regs */
1275     set_vvmcs(v, GUEST_RIP, regs->rip);
1276     set_vvmcs(v, GUEST_RSP, regs->rsp);
1277 
1278     /* CR3 sync if exec doesn't want cr3 load exiting: i.e. nested EPT */
1279     if ( !(__n2_exec_control(v) & CPU_BASED_CR3_LOAD_EXITING) )
1280         shadow_to_vvmcs(v, GUEST_CR3);
1281 
1282     if ( v->arch.hvm.vmx.cr4_host_mask != ~0UL )
1283         /* Only need to update nested GUEST_CR4 if not all bits are trapped. */
1284         set_vvmcs(v, GUEST_CR4, v->arch.hvm.guest_cr[4]);
1285 }
1286 
1287 static void sync_vvmcs_ro(struct vcpu *v)
1288 {
1289     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1290 
1291     shadow_to_vvmcs_bulk(v, ARRAY_SIZE(vmcs_ro_field), vmcs_ro_field);
1292 
1293     /* Adjust exit_reason/exit_qualification for the violation case */
1294     if ( get_vvmcs(v, VM_EXIT_REASON) == EXIT_REASON_EPT_VIOLATION )
1295     {
1296         set_vvmcs(v, EXIT_QUALIFICATION, nvmx->ept.exit_qual);
1297         set_vvmcs(v, VM_EXIT_REASON, nvmx->ept.exit_reason);
1298     }
1299 }
1300 
1301 static void load_vvmcs_host_state(struct vcpu *v)
1302 {
1303     int i, rc;
1304     u64 r;
1305     u32 control;
1306 
1307     for ( i = 0; i < ARRAY_SIZE(vmcs_h2g_field); i++ )
1308     {
1309         r = get_vvmcs(v, vmcs_h2g_field[i].host_field);
1310         __vmwrite(vmcs_h2g_field[i].guest_field, r);
1311     }
1312 
1313     rc = hvm_set_cr4(get_vvmcs(v, HOST_CR4), true);
1314     if ( rc == X86EMUL_EXCEPTION )
1315         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1316 
1317     rc = hvm_set_cr0(get_vvmcs(v, HOST_CR0), true);
1318     if ( rc == X86EMUL_EXCEPTION )
1319         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1320 
1321     rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
1322     if ( rc == X86EMUL_EXCEPTION )
1323         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1324 
1325     control = get_vvmcs(v, VM_EXIT_CONTROLS);
1326     if ( control & VM_EXIT_LOAD_HOST_PAT )
1327         hvm_set_guest_pat(v, get_vvmcs(v, HOST_PAT));
1328     if ( control & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
1329     {
1330         rc = hvm_msr_write_intercept(MSR_CORE_PERF_GLOBAL_CTRL,
1331                                      get_vvmcs(v, HOST_PERF_GLOBAL_CTRL), true);
1332         if ( rc == X86EMUL_EXCEPTION )
1333             hvm_inject_hw_exception(TRAP_gp_fault, 0);
1334     }
1335 
1336     hvm_set_tsc_offset(v, v->arch.hvm.cache_tsc_offset, 0);
1337 
1338     set_vvmcs(v, VM_ENTRY_INTR_INFO, 0);
1339 
1340     if ( v->arch.hvm.vmx.exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP )
1341         __vmwrite(MSR_BITMAP, virt_to_maddr(v->arch.hvm.vmx.msr_bitmap));
1342 }
1343 
1344 static void sync_exception_state(struct vcpu *v)
1345 {
1346     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1347     uint32_t exit_ctrl = get_vvmcs(v, VM_EXIT_CONTROLS);
1348 
1349     if ( !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) )
1350         return;
1351 
1352     switch ( MASK_EXTR(nvmx->intr.intr_info, INTR_INFO_INTR_TYPE_MASK) )
1353     {
1354     case X86_EVENTTYPE_EXT_INTR:
1355         /* rename exit_reason to EXTERNAL_INTERRUPT */
1356         set_vvmcs(v, VM_EXIT_REASON, EXIT_REASON_EXTERNAL_INTERRUPT);
1357         set_vvmcs(v, EXIT_QUALIFICATION, 0);
1358         set_vvmcs(v, VM_EXIT_INTR_INFO,
1359                   (exit_ctrl & VM_EXIT_ACK_INTR_ON_EXIT) ? nvmx->intr.intr_info
1360                                                          : 0);
1361         break;
1362 
1363     case X86_EVENTTYPE_HW_EXCEPTION:
1364     case X86_EVENTTYPE_SW_INTERRUPT:
1365     case X86_EVENTTYPE_SW_EXCEPTION:
1366         /* throw to L1 */
1367         set_vvmcs(v, VM_EXIT_INTR_INFO, nvmx->intr.intr_info);
1368         set_vvmcs(v, VM_EXIT_INTR_ERROR_CODE, nvmx->intr.error_code);
1369         break;
1370     case X86_EVENTTYPE_NMI:
1371         set_vvmcs(v, VM_EXIT_REASON, EXIT_REASON_EXCEPTION_NMI);
1372         set_vvmcs(v, EXIT_QUALIFICATION, 0);
1373         set_vvmcs(v, VM_EXIT_INTR_INFO, nvmx->intr.intr_info);
1374         break;
1375     default:
1376         gdprintk(XENLOG_ERR, "Exception state %lx not handled\n",
1377                nvmx->intr.intr_info);
1378         break;
1379     }
1380 }
1381 
1382 static void nvmx_update_apicv(struct vcpu *v)
1383 {
1384     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1385     unsigned long reason = get_vvmcs(v, VM_EXIT_REASON);
1386     unsigned long intr_info = get_vvmcs(v, VM_EXIT_INTR_INFO);
1387     unsigned long status;
1388     int rvi;
1389 
1390     if ( reason != EXIT_REASON_EXTERNAL_INTERRUPT ||
1391          nvmx->intr.source != hvm_intsrc_lapic )
1392         return;
1393 
1394     if ( intr_info & INTR_INFO_VALID_MASK )
1395     {
1396         uint32_t ppr;
1397         unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
1398         struct vlapic *vlapic = vcpu_vlapic(v);
1399 
1400         /*
1401          * Update SVI to record the current in-service interrupt that is
1402          * signaled in EXIT_INTR_INFO.
1403          */
1404         vlapic_ack_pending_irq(v, vector, 1);
1405 
1406         ppr = vlapic_set_ppr(vlapic);
1407         WARN_ON((ppr & 0xf0) != (vector & 0xf0));
1408 
1409         status = vector << VMX_GUEST_INTR_STATUS_SVI_OFFSET;
1410     }
1411     else
1412        /* Keep previous SVI if there's any. */
1413        __vmread(GUEST_INTR_STATUS, &status);
1414 
1415     rvi = vlapic_has_pending_irq(v);
1416     if ( rvi != -1 )
1417     {
1418         status &= ~VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
1419         status |= rvi & VMX_GUEST_INTR_STATUS_SUBFIELD_BITMASK;
1420     }
1421 
1422     if ( status )
1423     {
1424         __vmwrite(GUEST_INTR_STATUS, status);
1425         vmx_sync_exit_bitmap(v);
1426     }
1427 }
1428 
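/*
 * Emulated VM exit from L2 back to L1: write the L2 guest state and the
 * read-only exit information back into the vvmcs, record any event to be
 * reflected to L1, switch hardware back to the n1 VMCS and load L1's host
 * state from the vvmcs.
 */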
1429 static void virtual_vmexit(struct cpu_user_regs *regs)
1430 {
1431     struct vcpu *v = current;
1432     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1433     unsigned long lm_l1, lm_l2;
1434 
1435     sync_vvmcs_ro(v);
1436     sync_vvmcs_guest_state(v, regs);
1437     sync_exception_state(v);
1438 
1439     if ( nvmx_ept_enabled(v) && hvm_pae_enabled(v) &&
1440          !(v->arch.hvm.guest_efer & EFER_LMA) )
1441         shadow_to_vvmcs_bulk(v, ARRAY_SIZE(gpdpte_fields), gpdpte_fields);
1442 
1443     /* This will clear current pCPU bit in p2m->dirty_cpumask */
1444     np2m_schedule(NP2M_SCHEDLE_OUT);
1445 
1446     vmx_vmcs_switch(v->arch.hvm.vmx.vmcs_pa, nvcpu->nv_n1vmcx_pa);
1447 
1448     nestedhvm_vcpu_exit_guestmode(v);
1449     nvcpu->nv_vmexit_pending = 0;
1450     nvcpu->nv_vmswitch_in_progress = 1;
1451 
1452     lm_l2 = hvm_long_mode_active(v);
1453     lm_l1 = !!(get_vvmcs(v, VM_EXIT_CONTROLS) & VM_EXIT_IA32E_MODE);
1454 
1455     if ( lm_l1 )
1456         v->arch.hvm.guest_efer |= EFER_LMA | EFER_LME;
1457     else
1458         v->arch.hvm.guest_efer &= ~(EFER_LMA | EFER_LME);
1459 
1460     vmx_update_cpu_exec_control(v);
1461     vmx_update_secondary_exec_control(v);
1462     vmx_update_exception_bitmap(v);
1463 
1464     load_vvmcs_host_state(v);
1465 
1466     if ( lm_l1 != lm_l2 )
1467         paging_update_paging_modes(v);
1468 
1469     regs->rip = get_vvmcs(v, HOST_RIP);
1470     regs->rsp = get_vvmcs(v, HOST_RSP);
1471     /* VM exit clears all bits except bit 1 */
1472     regs->rflags = X86_EFLAGS_MBS;
1473 
1474     /* updating host cr0 to sync TS bit */
1475     __vmwrite(HOST_CR0, v->arch.hvm.vmx.host_cr0);
1476 
1477     if ( cpu_has_vmx_virtual_intr_delivery )
1478         nvmx_update_apicv(v);
1479 
1480     nvcpu->nv_vmswitch_in_progress = 0;
1481 }
1482 
1483 static void nvmx_eptp_update(void)
1484 {
1485     struct vcpu *curr = current;
1486 
1487     if ( !nestedhvm_vcpu_in_guestmode(curr) ||
1488           vcpu_nestedhvm(curr).nv_vmexit_pending ||
1489          !vcpu_nestedhvm(curr).stale_np2m ||
1490          !nestedhvm_paging_mode_hap(curr) )
1491         return;
1492 
1493     /*
1494      * Interrupts are enabled here, so we need to clear stale_np2m
1495      * before we do the vmwrite.  If we do it in the other order and
1496      * an IPI comes in changing the shadow eptp after the vmwrite,
1497      * we'll complete the vmenter with a stale eptp value.
1498      */
1499     vcpu_nestedhvm(curr).stale_np2m = false;
1500     __vmwrite(EPT_POINTER, get_shadow_eptp(curr));
1501 }
1502 
1503 void nvmx_switch_guest(void)
1504 {
1505     struct vcpu *v = current;
1506     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1507     struct cpu_user_regs *regs = guest_cpu_user_regs();
1508 
1509     nvmx_eptp_update();
1510 
1511     /*
1512      * A pending IO emulation may still not be finished. In this case, no
1513      * virtual vmswitch is allowed, or else the subsequent IO emulation would
1514      * be handled in the wrong VCPU context. If there are no IO backends - PVH
1515      * guest by itself or a PVH guest with an HVM guest running inside - we
1516      * don't want to continue as this setup is not implemented nor supported
1517      * as of right now.
1518      */
1519     if ( hvm_io_pending(v) )
1520         return;
1521     /*
1522      * A softirq may interrupt us between the point where a virtual vmentry
1523      * has just been handled and the true vmentry. If, during this window,
1524      * an L1 virtual interrupt causes another virtual vmexit, we
1525      * cannot let that happen or VM_ENTRY_INTR_INFO will be lost.
1526      */
1527     if ( unlikely(nvcpu->nv_vmswitch_in_progress) )
1528         return;
1529 
1530     if ( nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmexit_pending )
1531         virtual_vmexit(regs);
1532     else if ( !nestedhvm_vcpu_in_guestmode(v) && nvcpu->nv_vmentry_pending )
1533         virtual_vmentry(regs);
1534 }
1535 
1536 /*
1537  * VMX instructions handling
1538  */
1539 
1540 static int nvmx_handle_vmxon(struct cpu_user_regs *regs)
1541 {
1542     struct vcpu *v=current;
1543     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1544     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1545     struct vmx_inst_decoded decode;
1546     unsigned long gpa = 0;
1547     uint32_t nvmcs_revid;
1548     int rc;
1549 
1550     rc = decode_vmx_inst(regs, &decode, &gpa);
1551     if ( rc != X86EMUL_OKAY )
1552         return rc;
1553 
1554     if ( nvmx_vcpu_in_vmx(v) )
1555     {
1556         vmfail(regs, VMX_INSN_VMXON_IN_VMX_ROOT);
1557         return X86EMUL_OKAY;
1558     }
1559 
1560     if ( (gpa & ~PAGE_MASK) || !gfn_valid(v->domain, _gfn(gpa >> PAGE_SHIFT)) )
1561     {
1562         vmfail_invalid(regs);
1563         return X86EMUL_OKAY;
1564     }
1565 
1566     rc = hvm_copy_from_guest_phys(&nvmcs_revid, gpa, sizeof(nvmcs_revid));
1567     if ( rc != HVMTRANS_okay ||
1568          (nvmcs_revid & ~VMX_BASIC_REVISION_MASK) ||
1569          ((nvmcs_revid ^ vmx_basic_msr) & VMX_BASIC_REVISION_MASK) )
1570     {
1571         vmfail_invalid(regs);
1572         return X86EMUL_OKAY;
1573     }
1574 
1575     nvmx->vmxon_region_pa = gpa;
1576 
1577     /*
1578      * `fork' the host vmcs to shadow_vmcs
1579      * vmcs_lock is not needed since we are on current
1580      */
1581     nvcpu->nv_n1vmcx_pa = v->arch.hvm.vmx.vmcs_pa;
1582     __vmpclear(v->arch.hvm.vmx.vmcs_pa);
1583     copy_domain_page(_mfn(PFN_DOWN(nvcpu->nv_n2vmcx_pa)),
1584                      _mfn(PFN_DOWN(v->arch.hvm.vmx.vmcs_pa)));
1585     __vmptrld(v->arch.hvm.vmx.vmcs_pa);
1586     v->arch.hvm.vmx.launched = 0;
1587     vmsucceed(regs);
1588 
1589     return X86EMUL_OKAY;
1590 }
1591 
1592 static int nvmx_handle_vmxoff(struct cpu_user_regs *regs)
1593 {
1594     struct vcpu *v=current;
1595     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1596 
1597     nvmx_purge_vvmcs(v);
1598     nvmx->vmxon_region_pa = INVALID_PADDR;
1599 
1600     vmsucceed(regs);
1601     return X86EMUL_OKAY;
1602 }
1603 
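/*
 * Track the "launched" state of each vvmcs by the MFN of its backing frame,
 * so that VMLAUNCH on an already launched VMCS and VMRESUME on a
 * not-yet-launched one can be detected and failed appropriately.
 */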
1604 static bool_t vvmcs_launched(struct list_head *launched_list,
1605                              unsigned long vvmcs_mfn)
1606 {
1607     struct vvmcs_list *vvmcs;
1608     struct list_head *pos;
1609     bool_t launched = 0;
1610 
1611     list_for_each(pos, launched_list)
1612     {
1613         vvmcs = list_entry(pos, struct vvmcs_list, node);
1614         if ( vvmcs_mfn == vvmcs->vvmcs_mfn )
1615         {
1616             launched = 1;
1617             break;
1618         }
1619     }
1620 
1621     return launched;
1622 }
1623 
1624 static int set_vvmcs_launched(struct list_head *launched_list,
1625                               unsigned long vvmcs_mfn)
1626 {
1627     struct vvmcs_list *vvmcs;
1628 
1629     if ( vvmcs_launched(launched_list, vvmcs_mfn) )
1630         return 0;
1631 
1632     vvmcs = xzalloc(struct vvmcs_list);
1633     if ( !vvmcs )
1634         return -ENOMEM;
1635 
1636     vvmcs->vvmcs_mfn = vvmcs_mfn;
1637     list_add(&vvmcs->node, launched_list);
1638 
1639     return 0;
1640 }
1641 
1642 static void clear_vvmcs_launched(struct list_head *launched_list,
1643                                  paddr_t vvmcs_mfn)
1644 {
1645     struct vvmcs_list *vvmcs;
1646     struct list_head *pos;
1647 
1648     list_for_each(pos, launched_list)
1649     {
1650         vvmcs = list_entry(pos, struct vvmcs_list, node);
1651         if ( vvmcs_mfn == vvmcs->vvmcs_mfn )
1652         {
1653             list_del(&vvmcs->node);
1654             xfree(vvmcs);
1655             break;
1656         }
1657     }
1658 }
1659 
1660 static enum vmx_insn_errno nvmx_vmresume(struct vcpu *v)
1661 {
1662     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1663     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1664     unsigned int exec_ctrl;
1665 
1666     ASSERT(vvmcx_valid(v));
1667     exec_ctrl = __n2_exec_control(v);
1668 
1669     if ( exec_ctrl & CPU_BASED_ACTIVATE_IO_BITMAP )
1670     {
1671         if ( (nvmx->iobitmap[0] == NULL || nvmx->iobitmap[1] == NULL) &&
1672              !map_io_bitmap_all(v) )
1673             goto invalid_control_state;
1674     }
1675 
1676     if ( exec_ctrl & CPU_BASED_ACTIVATE_MSR_BITMAP )
1677     {
1678         if ( nvmx->msrbitmap == NULL && !_map_msr_bitmap(v) )
1679             goto invalid_control_state;
1680     }
1681 
1682     nvcpu->nv_vmentry_pending = 1;
1683 
1684     return VMX_INSN_SUCCEED;
1685 
1686 invalid_control_state:
1687     return VMX_INSN_INVALID_CONTROL_STATE;
1688 }
1689 
1690 static int nvmx_handle_vmresume(struct cpu_user_regs *regs)
1691 {
1692     bool_t launched;
1693     struct vcpu *v = current;
1694     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1695     unsigned long intr_shadow;
1696     int rc;
1697 
1698     if ( !vvmcx_valid(v) )
1699     {
1700         vmfail_invalid(regs);
1701         return X86EMUL_OKAY;
1702     }
1703 
1704     __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1705     if ( intr_shadow & VMX_INTR_SHADOW_MOV_SS )
1706     {
1707         vmfail_valid(regs, VMX_INSN_VMENTRY_BLOCKED_BY_MOV_SS);
1708         return X86EMUL_OKAY;
1709     }
1710 
1711     launched = vvmcs_launched(&nvmx->launched_list,
1712                               PFN_DOWN(v->arch.hvm.vmx.vmcs_shadow_maddr));
1713     if ( !launched )
1714     {
1715         vmfail_valid(regs, VMX_INSN_VMRESUME_NONLAUNCHED_VMCS);
1716         return X86EMUL_OKAY;
1717     }
1718 
1719     rc = nvmx_vmresume(v);
1720     if ( rc )
1721         vmfail_valid(regs, rc);
1722 
1723     return X86EMUL_OKAY;
1724 }
1725 
1726 static int nvmx_handle_vmlaunch(struct cpu_user_regs *regs)
1727 {
1728     bool_t launched;
1729     struct vcpu *v = current;
1730     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1731     unsigned long intr_shadow;
1732     int rc;
1733 
1734     if ( !vvmcx_valid(v) )
1735     {
1736         vmfail_invalid(regs);
1737         return X86EMUL_OKAY;
1738     }
1739 
1740     __vmread(GUEST_INTERRUPTIBILITY_INFO, &intr_shadow);
1741     if ( intr_shadow & VMX_INTR_SHADOW_MOV_SS )
1742     {
1743         vmfail_valid(regs, VMX_INSN_VMENTRY_BLOCKED_BY_MOV_SS);
1744         return X86EMUL_OKAY;
1745     }
1746 
1747     launched = vvmcs_launched(&nvmx->launched_list,
1748                               PFN_DOWN(v->arch.hvm.vmx.vmcs_shadow_maddr));
1749     if ( launched )
1750     {
1751         vmfail_valid(regs, VMX_INSN_VMLAUNCH_NONCLEAR_VMCS);
1752         return X86EMUL_OKAY;
1753     }
1754     else {
1755         rc = nvmx_vmresume(v);
1756         if ( rc )
1757             vmfail_valid(regs, rc);
1758         else
1759         {
1760             if ( set_vvmcs_launched(&nvmx->launched_list,
1761                                     PFN_DOWN(v->arch.hvm.vmx.vmcs_shadow_maddr)) < 0 )
1762                 return X86EMUL_UNHANDLEABLE;
1763         }
1764         rc = X86EMUL_OKAY;
1765     }
1766     return rc;
1767 }
1768 
1769 static int nvmx_handle_vmptrld(struct cpu_user_regs *regs)
1770 {
1771     struct vcpu *v = current;
1772     struct vmx_inst_decoded decode;
1773     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1774     unsigned long gpa = 0;
1775     int rc;
1776 
1777     rc = decode_vmx_inst(regs, &decode, &gpa);
1778     if ( rc != X86EMUL_OKAY )
1779         return rc;
1780 
1781     if ( (gpa & ~PAGE_MASK) || !gfn_valid(v->domain, gaddr_to_gfn(gpa)) )
1782     {
1783         vmfail(regs, VMX_INSN_VMPTRLD_INVALID_PHYADDR);
1784         goto out;
1785     }
1786 
1787     if ( gpa == vcpu_2_nvmx(v).vmxon_region_pa )
1788     {
1789         vmfail(regs, VMX_INSN_VMPTRLD_WITH_VMXON_PTR);
1790         goto out;
1791     }
1792 
1793     if ( nvcpu->nv_vvmcxaddr != gpa )
1794         nvmx_purge_vvmcs(v);
1795 
1796     if ( !vvmcx_valid(v) )
1797     {
1798         bool_t writable;
1799         void *vvmcx = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 1, &writable);
1800 
1801         if ( vvmcx )
1802         {
1803             if ( writable )
1804             {
1805                 struct vmcs_struct *vvmcs = vvmcx;
1806 
1807                 if ( ((vvmcs->vmcs_revision_id ^ vmx_basic_msr) &
1808                                          VMX_BASIC_REVISION_MASK) ||
1809                      (!cpu_has_vmx_vmcs_shadowing &&
1810                       (vvmcs->vmcs_revision_id & ~VMX_BASIC_REVISION_MASK)) )
1811                 {
1812                     hvm_unmap_guest_frame(vvmcx, 1);
1813                     vmfail(regs, VMX_INSN_VMPTRLD_INCORRECT_VMCS_ID);
1814 
1815                     return X86EMUL_OKAY;
1816                 }
1817                 nvcpu->nv_vvmcx = vvmcx;
1818                 nvcpu->nv_vvmcxaddr = gpa;
1819                 v->arch.hvm.vmx.vmcs_shadow_maddr =
1820                     mfn_to_maddr(domain_page_map_to_mfn(vvmcx));
1821             }
1822             else
1823             {
1824                 hvm_unmap_guest_frame(vvmcx, 1);
1825                 vvmcx = NULL;
1826             }
1827         }
1828         else
1829         {
1830             vmfail(regs, VMX_INSN_VMPTRLD_INVALID_PHYADDR);
1831             goto out;
1832         }
1833     }
1834 
1835     if ( cpu_has_vmx_vmcs_shadowing )
1836         nvmx_set_vmcs_pointer(v, nvcpu->nv_vvmcx);
1837 
1838     vmsucceed(regs);
1839 
1840 out:
1841     return X86EMUL_OKAY;
1842 }
1843 
1844 static int nvmx_handle_vmptrst(struct cpu_user_regs *regs)
1845 {
1846     struct vcpu *v = current;
1847     struct vmx_inst_decoded decode;
1848     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1849     pagefault_info_t pfinfo;
1850     unsigned long gpa = 0;
1851     int rc;
1852 
1853     rc = decode_vmx_inst(regs, &decode, &gpa);
1854     if ( rc != X86EMUL_OKAY )
1855         return rc;
1856 
1857     gpa = nvcpu->nv_vvmcxaddr;
1858 
1859     rc = hvm_copy_to_guest_linear(decode.mem, &gpa, decode.len, 0, &pfinfo);
1860     if ( rc == HVMTRANS_bad_linear_to_gfn )
1861         hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
1862     if ( rc != HVMTRANS_okay )
1863         return X86EMUL_EXCEPTION;
1864 
1865     vmsucceed(regs);
1866     return X86EMUL_OKAY;
1867 }
1868 
1869 static int nvmx_handle_vmclear(struct cpu_user_regs *regs)
1870 {
1871     struct vcpu *v = current;
1872     struct vmx_inst_decoded decode;
1873     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
1874     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
1875     unsigned long gpa = 0;
1876     void *vvmcs;
1877     int rc;
1878 
1879     rc = decode_vmx_inst(regs, &decode, &gpa);
1880     if ( rc != X86EMUL_OKAY )
1881         return rc;
1882 
1883     if ( gpa == vcpu_2_nvmx(v).vmxon_region_pa )
1884     {
1885         vmfail(regs, VMX_INSN_VMCLEAR_WITH_VMXON_PTR);
1886         goto out;
1887     }
1888 
1889     if ( (gpa & ~PAGE_MASK) || !gfn_valid(v->domain, gaddr_to_gfn(gpa)) )
1890     {
1891         vmfail(regs, VMX_INSN_VMCLEAR_INVALID_PHYADDR);
1892         goto out;
1893     }
1894 
1895     if ( gpa == nvcpu->nv_vvmcxaddr )
1896     {
1897         if ( cpu_has_vmx_vmcs_shadowing )
1898             nvmx_clear_vmcs_pointer(v, nvcpu->nv_vvmcx);
1899         clear_vvmcs_launched(&nvmx->launched_list,
1900                              PFN_DOWN(v->arch.hvm.vmx.vmcs_shadow_maddr));
1901         nvmx_purge_vvmcs(v);
1902         vmsucceed(regs);
1903     }
1904     else
1905     {
1906         /* Even if this VMCS isn't the current one, we must clear it. */
1907         bool_t writable;
1908 
1909         vvmcs = hvm_map_guest_frame_rw(paddr_to_pfn(gpa), 0, &writable);
1910 
1911         if ( !vvmcs )
1912         {
1913             vmfail(regs, VMX_INSN_VMCLEAR_INVALID_PHYADDR);
1914             goto out;
1915         }
1916 
1917         if ( writable )
1918         {
1919             clear_vvmcs_launched(&nvmx->launched_list,
1920                                  mfn_x(domain_page_map_to_mfn(vvmcs)));
1921             vmsucceed(regs);
1922         }
1923         else
1924             vmfail(regs, VMX_INSN_VMCLEAR_INVALID_PHYADDR);
1925 
1926         hvm_unmap_guest_frame(vvmcs, 0);
1927     }
1928 
1929 out:
1930     return X86EMUL_OKAY;
1931 }
1932 
1933 static int nvmx_handle_vmread(struct cpu_user_regs *regs)
1934 {
1935     struct vcpu *v = current;
1936     struct vmx_inst_decoded decode;
1937     pagefault_info_t pfinfo;
1938     u64 value = 0;
1939     int rc;
1940 
1941     rc = decode_vmx_inst(regs, &decode, NULL);
1942     if ( rc != X86EMUL_OKAY )
1943         return rc;
1944 
1945     if ( !vvmcx_valid(v) )
1946     {
1947         vmfail_invalid(regs);
1948         return X86EMUL_OKAY;
1949     }
1950 
1951     rc = get_vvmcs_safe(v, reg_read(regs, decode.reg2), &value);
1952     if ( rc != VMX_INSN_SUCCEED )
1953     {
1954         vmfail(regs, rc);
1955         return X86EMUL_OKAY;
1956     }
1957 
1958     switch ( decode.type ) {
1959     case VMX_INST_MEMREG_TYPE_MEMORY:
1960         rc = hvm_copy_to_guest_linear(decode.mem, &value, decode.len, 0, &pfinfo);
1961         if ( rc == HVMTRANS_bad_linear_to_gfn )
1962             hvm_inject_page_fault(pfinfo.ec, pfinfo.linear);
1963         if ( rc != HVMTRANS_okay )
1964             return X86EMUL_EXCEPTION;
1965         break;
1966     case VMX_INST_MEMREG_TYPE_REG:
1967         reg_write(regs, decode.reg1, value);
1968         break;
1969     }
1970 
1971     vmsucceed(regs);
1972     return X86EMUL_OKAY;
1973 }
1974 
1975 static int nvmx_handle_vmwrite(struct cpu_user_regs *regs)
1976 {
1977     struct vcpu *v = current;
1978     struct vmx_inst_decoded decode;
1979     unsigned long operand;
1980     u64 vmcs_encoding;
1981     enum vmx_insn_errno err;
1982     int rc;
1983 
1984     rc = decode_vmx_inst(regs, &decode, &operand);
1985     if ( rc != X86EMUL_OKAY )
1986         return rc;
1987 
1988     if ( !vvmcx_valid(v) )
1989     {
1990         vmfail_invalid(regs);
1991         return X86EMUL_OKAY;
1992     }
1993 
1994     vmcs_encoding = reg_read(regs, decode.reg2);
1995     err = set_vvmcs_safe(v, vmcs_encoding, operand);
1996     if ( err != VMX_INSN_SUCCEED )
1997     {
1998         vmfail(regs, err);
1999         return X86EMUL_OKAY;
2000     }
2001 
2002     switch ( vmcs_encoding & ~VMCS_HIGH(0) )
2003     {
2004     case IO_BITMAP_A:
2005         unmap_io_bitmap(v, 0);
2006         break;
2007     case IO_BITMAP_B:
2008         unmap_io_bitmap(v, 1);
2009         break;
2010     case MSR_BITMAP:
2011         unmap_msr_bitmap(v);
2012         break;
2013     }
2014 
2015     vmsucceed(regs);
2016 
2017     return X86EMUL_OKAY;
2018 }
2019 
2020 static int nvmx_handle_invept(struct cpu_user_regs *regs)
2021 {
2022     struct vmx_inst_decoded decode;
2023     unsigned long eptp;
2024     int ret;
2025 
2026     if ( (ret = decode_vmx_inst(regs, &decode, &eptp)) != X86EMUL_OKAY )
2027         return ret;
2028 
2029     switch ( reg_read(regs, decode.reg2) )
2030     {
2031     case INVEPT_SINGLE_CONTEXT:
2032     {
2033         np2m_flush_base(current, eptp);
2034         break;
2035     }
2036     case INVEPT_ALL_CONTEXT:
2037         p2m_flush_nestedp2m(current->domain);
2038         __invept(INVEPT_ALL_CONTEXT, 0);
2039         break;
2040     default:
2041         vmfail(regs, VMX_INSN_INVEPT_INVVPID_INVALID_OP);
2042         return X86EMUL_OKAY;
2043     }
2044     vmsucceed(regs);
2045     return X86EMUL_OKAY;
2046 }
2047 
2048 static int nvmx_handle_invvpid(struct cpu_user_regs *regs)
2049 {
2050     struct vmx_inst_decoded decode;
2051     unsigned long vpid;
2052     int ret;
2053 
2054     if ( (ret = decode_vmx_inst(regs, &decode, &vpid)) != X86EMUL_OKAY )
2055         return ret;
2056 
2057     switch ( reg_read(regs, decode.reg2) )
2058     {
2059     /* Just invalidate all tlb entries for all types! */
2060     case INVVPID_INDIVIDUAL_ADDR:
2061     case INVVPID_SINGLE_CONTEXT:
2062     case INVVPID_ALL_CONTEXT:
2063         hvm_asid_flush_vcpu_asid(&vcpu_nestedhvm(current).nv_n2asid);
2064         break;
2065     default:
2066         vmfail(regs, VMX_INSN_INVEPT_INVVPID_INVALID_OP);
2067         return X86EMUL_OKAY;
2068     }
2069 
2070     vmsucceed(regs);
2071     return X86EMUL_OKAY;
2072 }
2073 
2074 int nvmx_handle_vmx_insn(struct cpu_user_regs *regs, unsigned int exit_reason)
2075 {
2076     struct vcpu *curr = current;
2077     int ret;
2078 
2079     if ( !(curr->arch.hvm.guest_cr[4] & X86_CR4_VMXE) ||
2080          !nestedhvm_enabled(curr->domain) ||
2081          (vmx_guest_x86_mode(curr) < (hvm_long_mode_active(curr) ? 8 : 2)) ||
2082          (exit_reason != EXIT_REASON_VMXON && !nvmx_vcpu_in_vmx(curr)) )
2083     {
2084         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2085         return X86EMUL_EXCEPTION;
2086     }
2087 
2088     if ( vmx_get_cpl() > 0 )
2089     {
2090         hvm_inject_hw_exception(TRAP_gp_fault, 0);
2091         return X86EMUL_EXCEPTION;
2092     }
2093 
2094     if ( nestedhvm_vcpu_in_guestmode(curr) )
2095     {
2096         /* Should have been handled by nvmx_n2_vmexit_handler()... */
2097         ASSERT_UNREACHABLE();
2098         domain_crash(curr->domain);
2099         return X86EMUL_UNHANDLEABLE;
2100     }
2101 
2102     switch ( exit_reason )
2103     {
2104     case EXIT_REASON_VMXOFF:
2105         ret = nvmx_handle_vmxoff(regs);
2106         break;
2107 
2108     case EXIT_REASON_VMXON:
2109         ret = nvmx_handle_vmxon(regs);
2110         break;
2111 
2112     case EXIT_REASON_VMCLEAR:
2113         ret = nvmx_handle_vmclear(regs);
2114         break;
2115 
2116     case EXIT_REASON_VMPTRLD:
2117         ret = nvmx_handle_vmptrld(regs);
2118         break;
2119 
2120     case EXIT_REASON_VMPTRST:
2121         ret = nvmx_handle_vmptrst(regs);
2122         break;
2123 
2124     case EXIT_REASON_VMREAD:
2125         ret = nvmx_handle_vmread(regs);
2126         break;
2127 
2128     case EXIT_REASON_VMWRITE:
2129         ret = nvmx_handle_vmwrite(regs);
2130         break;
2131 
2132     case EXIT_REASON_VMLAUNCH:
2133         ret = nvmx_handle_vmlaunch(regs);
2134         break;
2135 
2136     case EXIT_REASON_VMRESUME:
2137         ret = nvmx_handle_vmresume(regs);
2138         break;
2139 
2140     case EXIT_REASON_INVEPT:
2141         ret = nvmx_handle_invept(regs);
2142         break;
2143 
2144     case EXIT_REASON_INVVPID:
2145         ret = nvmx_handle_invvpid(regs);
2146         break;
2147 
2148     default:
2149         ASSERT_UNREACHABLE();
2150         domain_crash(curr->domain);
2151         ret = X86EMUL_UNHANDLEABLE;
2152         break;
2153     }
2154 
2155     return ret;
2156 }
2157 
2158 #define __emul_value(enable1, default1) \
2159     ((enable1 | default1) << 32 | (default1))
2160 
2161 #define gen_vmx_msr(enable1, default1, host_value) \
2162     (((__emul_value(enable1, default1) & host_value) & (~0ul << 32)) | \
2163     ((uint32_t)(__emul_value(enable1, default1) | host_value)))
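
/*
 * Worked example with made-up numbers (not taken from any real CPU): in a
 * VMX capability MSR the upper 32 bits are the allowed-1 settings, while a
 * 1 in the lower 32 bits means that control must be 1.
 *
 *   enable1    = 0x00000088      controls we want to let L1 turn on
 *   default1   = 0x00000016      controls that must always be 1
 *   host_value = 0x000000fe0000001eULL
 *
 *   __emul_value(enable1, default1)            == 0x0000009e00000016
 *   gen_vmx_msr(enable1, default1, host_value) == 0x0000009e0000001e
 *
 * i.e. the allowed-1 half is intersected with what the host CPU supports,
 * and the must-be-1 half is the union of what we and the host require.
 */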
2164 
2165 /*
2166  * Capability reporting
2167  */
2168 int nvmx_msr_read_intercept(unsigned int msr, u64 *msr_content)
2169 {
2170     struct vcpu *v = current;
2171     struct domain *d = v->domain;
2172     u64 data = 0, host_data = 0;
2173     int r = 1;
2174 
2175     /* VMX capability MSRs are available only when the guest supports VMX. */
2176     if ( !nestedhvm_enabled(d) || !d->arch.cpuid->basic.vmx )
2177         return 0;
2178 
2179     /*
2180      * These MSRs are only available when flags in other MSRs are set.
2181      * These prerequisites are listed in the Intel 64 and IA-32
2182      * Architectures Software Developer’s Manual, Vol 3, Appendix A.
2183      */
2184     switch ( msr )
2185     {
2186     case MSR_IA32_VMX_PROCBASED_CTLS2:
2187         if ( !cpu_has_vmx_secondary_exec_control )
2188             return 0;
2189         break;
2190 
2191     case MSR_IA32_VMX_EPT_VPID_CAP:
2192         if ( !(cpu_has_vmx_ept || cpu_has_vmx_vpid) )
2193             return 0;
2194         break;
2195 
2196     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2197     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2198     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2199     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2200         if ( !(vmx_basic_msr & VMX_BASIC_DEFAULT1_ZERO) )
2201             return 0;
2202         break;
2203 
2204     case MSR_IA32_VMX_VMFUNC:
2205         if ( !cpu_has_vmx_vmfunc )
2206             return 0;
2207         break;
2208     }
2209 
2210     rdmsrl(msr, host_data);
2211 
2212     /*
2213      * Remove unsupported features from the n1 guest's capability MSR.
2214      */
2215     switch (msr) {
2216     case MSR_IA32_VMX_BASIC:
2217     {
2218         const struct vmcs_struct *vmcs =
2219             map_domain_page(_mfn(PFN_DOWN(v->arch.hvm.vmx.vmcs_pa)));
2220 
2221         data = (host_data & (~0ul << 32)) |
2222                (vmcs->vmcs_revision_id & 0x7fffffff);
2223         unmap_domain_page(vmcs);
2224 
2225         if ( !cpu_has_vmx_vmcs_shadowing )
2226         {
2227             /* Report vmcs_region_size as 4096 */
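            /* (the size field is bits 44:32 of IA32_VMX_BASIC, so bit 44 == 0x1000) */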
2228             data &= ~VMX_BASIC_VMCS_SIZE_MASK;
2229             data |= 1ULL << 44;
2230         }
2231 
2232         break;
2233     }
2234     case MSR_IA32_VMX_PINBASED_CTLS:
2235     case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
2236         /* 1-settings */
2237         data = PIN_BASED_EXT_INTR_MASK |
2238                PIN_BASED_NMI_EXITING |
2239                PIN_BASED_PREEMPT_TIMER;
2240         data = gen_vmx_msr(data, VMX_PINBASED_CTLS_DEFAULT1, host_data);
2241         break;
2242     case MSR_IA32_VMX_PROCBASED_CTLS:
2243     case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
2244     {
2245         u32 default1_bits = VMX_PROCBASED_CTLS_DEFAULT1;
2246         /* 1-settings */
2247         data = CPU_BASED_HLT_EXITING |
2248                CPU_BASED_VIRTUAL_INTR_PENDING |
2249                CPU_BASED_CR8_LOAD_EXITING |
2250                CPU_BASED_CR8_STORE_EXITING |
2251                CPU_BASED_INVLPG_EXITING |
2252                CPU_BASED_CR3_LOAD_EXITING |
2253                CPU_BASED_CR3_STORE_EXITING |
2254                CPU_BASED_MONITOR_EXITING |
2255                CPU_BASED_MWAIT_EXITING |
2256                CPU_BASED_MOV_DR_EXITING |
2257                CPU_BASED_ACTIVATE_IO_BITMAP |
2258                CPU_BASED_USE_TSC_OFFSETING |
2259                CPU_BASED_UNCOND_IO_EXITING |
2260                CPU_BASED_RDTSC_EXITING |
2261                CPU_BASED_MONITOR_TRAP_FLAG |
2262                CPU_BASED_VIRTUAL_NMI_PENDING |
2263                CPU_BASED_ACTIVATE_MSR_BITMAP |
2264                CPU_BASED_PAUSE_EXITING |
2265                CPU_BASED_RDPMC_EXITING |
2266                CPU_BASED_TPR_SHADOW |
2267                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
2268 
2269         if ( msr == MSR_IA32_VMX_TRUE_PROCBASED_CTLS )
2270             default1_bits &= ~(CPU_BASED_CR3_LOAD_EXITING |
2271                                CPU_BASED_CR3_STORE_EXITING |
2272                                CPU_BASED_INVLPG_EXITING);
2273 
2274         data = gen_vmx_msr(data, default1_bits, host_data);
2275         break;
2276     }
2277     case MSR_IA32_VMX_PROCBASED_CTLS2:
2278         /* 1-settings */
2279         data = SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING |
2280                SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2281                SECONDARY_EXEC_ENABLE_VPID |
2282                SECONDARY_EXEC_UNRESTRICTED_GUEST |
2283                SECONDARY_EXEC_ENABLE_EPT;
2284         data = gen_vmx_msr(data, 0, host_data);
2285         break;
2286     case MSR_IA32_VMX_EXIT_CTLS:
2287     case MSR_IA32_VMX_TRUE_EXIT_CTLS:
2288         /* 1-settings */
2289         data = VM_EXIT_ACK_INTR_ON_EXIT |
2290                VM_EXIT_IA32E_MODE |
2291                VM_EXIT_SAVE_PREEMPT_TIMER |
2292                VM_EXIT_SAVE_GUEST_PAT |
2293                VM_EXIT_LOAD_HOST_PAT |
2294                VM_EXIT_SAVE_GUEST_EFER |
2295                VM_EXIT_LOAD_HOST_EFER |
2296                VM_EXIT_LOAD_PERF_GLOBAL_CTRL;
2297         data = gen_vmx_msr(data, VMX_EXIT_CTLS_DEFAULT1, host_data);
2298         break;
2299     case MSR_IA32_VMX_ENTRY_CTLS:
2300     case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
2301         /* 1-settings */
2302         data = VM_ENTRY_LOAD_GUEST_PAT |
2303                VM_ENTRY_LOAD_GUEST_EFER |
2304                VM_ENTRY_LOAD_PERF_GLOBAL_CTRL |
2305                VM_ENTRY_IA32E_MODE;
2306         data = gen_vmx_msr(data, VMX_ENTRY_CTLS_DEFAULT1, host_data);
2307         break;
2308 
2309     case MSR_IA32_VMX_VMCS_ENUM:
2310         /* The max index of VVMCS encoding is 0x1f. */
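        /* IA32_VMX_VMCS_ENUM reports that index in bits 9:1, hence the shift. */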
2311         data = 0x1f << 1;
2312         break;
2313     case MSR_IA32_VMX_CR0_FIXED0:
2314         /* PG, PE bits must be 1 in VMX operation */
2315         data = X86_CR0_PE | X86_CR0_PG;
2316         break;
2317     case MSR_IA32_VMX_CR0_FIXED1:
2318         /* allow 0-settings for all bits */
2319         data = 0xffffffff;
2320         break;
2321     case MSR_IA32_VMX_CR4_FIXED0:
2322         /* VMXE bit must be 1 in VMX operation */
2323         data = X86_CR4_VMXE;
2324         break;
2325     case MSR_IA32_VMX_CR4_FIXED1:
2326         data = hvm_cr4_guest_valid_bits(d, false);
2327         break;
2328     case MSR_IA32_VMX_MISC:
2329         /* The CR3-target feature is not exposed to the guest for now. */
2330         data = host_data & ~VMX_MISC_CR3_TARGET;
2331         break;
2332     case MSR_IA32_VMX_EPT_VPID_CAP:
2333         data = nept_get_ept_vpid_cap();
2334         break;
2335     default:
2336         r = 0;
2337         break;
2338     }
2339 
2340     *msr_content = data;
2341     return r;
2342 }
2343 
2344 /* This function uses L2_gpa to walk the L1 P2M page table. If the
2345  * walk succeeds, the translated address is returned in L1_gpa.
2346  * The return value tells the caller what to do next.
2347  */
2348 int
2349 nvmx_hap_walk_L1_p2m(struct vcpu *v, paddr_t L2_gpa, paddr_t *L1_gpa,
2350                      unsigned int *page_order, uint8_t *p2m_acc,
2351                      bool_t access_r, bool_t access_w, bool_t access_x)
2352 {
2353     int rc;
2354     unsigned long gfn;
2355     uint64_t exit_qual;
2356     uint32_t exit_reason = EXIT_REASON_EPT_VIOLATION;
2357     uint32_t rwx_rights = (access_x << 2) | (access_w << 1) | access_r;
2358     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2359 
2360     vmx_vmcs_enter(v);
2361 
2362     __vmread(EXIT_QUALIFICATION, &exit_qual);
2363     rc = nept_translate_l2ga(v, L2_gpa, page_order, rwx_rights, &gfn, p2m_acc,
2364                              &exit_qual, &exit_reason);
2365     switch ( rc )
2366     {
2367     case EPT_TRANSLATE_SUCCEED:
2368         *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK);
2369         rc = NESTEDHVM_PAGEFAULT_DONE;
2370         break;
2371     case EPT_TRANSLATE_VIOLATION:
2372     case EPT_TRANSLATE_MISCONFIG:
2373         rc = NESTEDHVM_PAGEFAULT_INJECT;
2374         nvmx->ept.exit_reason = exit_reason;
2375         nvmx->ept.exit_qual = exit_qual;
2376         break;
2377     case EPT_TRANSLATE_RETRY:
2378         rc = NESTEDHVM_PAGEFAULT_RETRY;
2379         break;
2380     default:
2381         gdprintk(XENLOG_ERR, "GUEST EPT translation error: %d\n", rc);
2382         BUG();
2383         break;
2384     }
2385 
2386     vmx_vmcs_exit(v);
2387 
2388     return rc;
2389 }
2390 
2391 void nvmx_idtv_handling(void)
2392 {
2393     struct vcpu *v = current;
2394     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2395     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
2396     unsigned long idtv_info, reason;
2397 
2398     __vmread(IDT_VECTORING_INFO, &idtv_info);
2399     if ( likely(!(idtv_info & INTR_INFO_VALID_MASK)) )
2400         return;
2401 
2402     /*
2403      * If L0 can resolve the fault that caused the IDT vectoring, the
2404      * event should be reinjected; otherwise, pass it to L1.
2405      */
2406     __vmread(VM_EXIT_REASON, &reason);
2407     if ( reason != EXIT_REASON_EPT_VIOLATION ?
2408          !(nvmx->intr.intr_info & INTR_INFO_VALID_MASK) :
2409          !nvcpu->nv_vmexit_pending )
2410     {
2411         __vmwrite(VM_ENTRY_INTR_INFO, idtv_info & ~INTR_INFO_RESVD_BITS_MASK);
2412         if ( idtv_info & INTR_INFO_DELIVER_CODE_MASK )
2413         {
2414             __vmread(IDT_VECTORING_ERROR_CODE, &reason);
2415             __vmwrite(VM_ENTRY_EXCEPTION_ERROR_CODE, reason);
2416         }
2417         /*
2418          * Per SDM 23.2.4, if L1 tries to inject a software interrupt
2419          * and the delivery fails, VM_EXIT_INSTRUCTION_LEN receives
2420          * the value of the previous VM_ENTRY_INSTRUCTION_LEN.
2421          *
2422          * This means EXIT_INSTRUCTION_LEN is always valid here, for
2423          * software interrupts whether injected by L1 or generated in L2.
2424          */
2425         __vmread(VM_EXIT_INSTRUCTION_LEN, &reason);
2426         __vmwrite(VM_ENTRY_INSTRUCTION_LEN, reason);
2427     }
2428 }
2429 
2430 /*
2431  * L2 VMExit handling
2432  *    return 1: Done, or skip the normal layer-0 hypervisor processing.
2433  *              Typically the exit requires layer-1 hypervisor handling,
2434  *              or it has already been processed here.
2435  *           0: The normal layer-0 processing is still required.
2436  */
2437 int nvmx_n2_vmexit_handler(struct cpu_user_regs *regs,
2438                            unsigned int exit_reason)
2439 {
2440     struct vcpu *v = current;
2441     struct nestedvcpu *nvcpu = &vcpu_nestedhvm(v);
2442     struct nestedvmx *nvmx = &vcpu_2_nvmx(v);
2443     u32 ctrl;
2444 
2445     nvcpu->nv_vmexit_pending = 0;
2446     nvmx->intr.intr_info = 0;
2447     nvmx->intr.error_code = 0;
2448 
2449     switch (exit_reason) {
2450     case EXIT_REASON_EXCEPTION_NMI:
2451     {
2452         unsigned long intr_info;
2453         u32 valid_mask = MASK_INSR(X86_EVENTTYPE_HW_EXCEPTION,
2454                                   INTR_INFO_INTR_TYPE_MASK) |
2455                          INTR_INFO_VALID_MASK;
2456         u64 exec_bitmap;
2457         int vector;
2458 
2459         __vmread(VM_EXIT_INTR_INFO, &intr_info);
2460         vector = intr_info & INTR_INFO_VECTOR_MASK;
2461         /*
2462          * Decided by the L0 and L1 exception bitmaps: if the vector is
2463          * set in both, L0 has priority for #PF and #NM, L1 for all others.
2464          */
2465         if ( vector == TRAP_page_fault )
2466         {
2467             if ( paging_mode_hap(v->domain) )
2468                 nvcpu->nv_vmexit_pending = 1;
2469         }
2470         else if ( vector == TRAP_no_device )
2471         {
2472             if ( v->fpu_dirtied )
2473                 nvcpu->nv_vmexit_pending = 1;
2474         }
2475         else if ( (intr_info & valid_mask) == valid_mask )
2476         {
2477             exec_bitmap = get_vvmcs(v, EXCEPTION_BITMAP);
2478 
2479             if ( exec_bitmap & (1 << vector) )
2480                 nvcpu->nv_vmexit_pending = 1;
2481         }
2482         break;
2483     }
2484     case EXIT_REASON_WBINVD:
2485     case EXIT_REASON_EPT_VIOLATION:
2486     case EXIT_REASON_EPT_MISCONFIG:
2487     case EXIT_REASON_EXTERNAL_INTERRUPT:
2488         /* pass to L0 handler */
2489         break;
2490     case VMX_EXIT_REASONS_FAILED_VMENTRY:
2491     case EXIT_REASON_TRIPLE_FAULT:
2492     case EXIT_REASON_TASK_SWITCH:
2493     case EXIT_REASON_CPUID:
2494     case EXIT_REASON_GETSEC:
2495     case EXIT_REASON_INVD:
2496     case EXIT_REASON_VMCALL:
2497     case EXIT_REASON_VMCLEAR:
2498     case EXIT_REASON_VMLAUNCH:
2499     case EXIT_REASON_VMPTRLD:
2500     case EXIT_REASON_VMPTRST:
2501     case EXIT_REASON_VMREAD:
2502     case EXIT_REASON_VMRESUME:
2503     case EXIT_REASON_VMWRITE:
2504     case EXIT_REASON_VMXOFF:
2505     case EXIT_REASON_VMXON:
2506     case EXIT_REASON_INVEPT:
2507     case EXIT_REASON_XSETBV:
2508     case EXIT_REASON_INVVPID:
2509         /* inject to L1 */
2510         nvcpu->nv_vmexit_pending = 1;
2511         break;
2512 
2513     case EXIT_REASON_MSR_READ:
2514     case EXIT_REASON_MSR_WRITE:
2515         ctrl = __n2_exec_control(v);
2516 
2517         /* Without ACTIVATE_MSR_BITMAP, all MSRs are intercepted. */
2518         if ( !(ctrl & CPU_BASED_ACTIVATE_MSR_BITMAP) )
2519             nvcpu->nv_vmexit_pending = 1;
2520         else if ( !nvmx->msrbitmap )
2521             /* ACTIVATE_MSR_BITMAP set, but L2 bitmap not mapped??? */
2522             domain_crash(v->domain);
2523         else
2524             nvcpu->nv_vmexit_pending =
2525                 vmx_msr_is_intercepted(nvmx->msrbitmap, regs->ecx,
2526                                        exit_reason == EXIT_REASON_MSR_WRITE);
2527         break;
2528 
2529     case EXIT_REASON_IO_INSTRUCTION:
2530         ctrl = __n2_exec_control(v);
2531         if ( ctrl & CPU_BASED_ACTIVATE_IO_BITMAP )
2532         {
2533             unsigned long qual;
2534             u16 port, size;
2535 
2536             __vmread(EXIT_QUALIFICATION, &qual);
2537             port = qual >> 16;
2538             size = (qual & 7) + 1;
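            /*
             * The two 4K I/O bitmaps hold one bit per port: bitmap A
             * (iobitmap[0]) covers ports 0x0000-0x7fff and bitmap B
             * (iobitmap[1]) covers 0x8000-0xffff.  For example, port 0x3f8
             * lands in bitmap A, byte 0x7f, bit 0.  Multi-byte accesses are
             * checked port by port, and wrapping past port 0xffff always
             * forces a vmexit to L1.
             */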
2539             do {
2540                 const u8 *bitmap = nvmx->iobitmap[port >> 15];
2541 
2542                 if ( bitmap[(port & 0x7fff) >> 3] & (1 << (port & 7)) )
2543                     nvcpu->nv_vmexit_pending = 1;
2544                 if ( !--size )
2545                     break;
2546                 if ( !++port )
2547                     nvcpu->nv_vmexit_pending = 1;
2548             } while ( !nvcpu->nv_vmexit_pending );
2549             if ( !nvcpu->nv_vmexit_pending )
2550                 printk(XENLOG_G_WARNING "L0 PIO %04x\n", port);
2551         }
2552         else if ( ctrl & CPU_BASED_UNCOND_IO_EXITING )
2553             nvcpu->nv_vmexit_pending = 1;
2554         break;
2555 
2556     case EXIT_REASON_PENDING_VIRT_INTR:
2557         ctrl = __n2_exec_control(v);
2558         if ( ctrl & CPU_BASED_VIRTUAL_INTR_PENDING )
2559             nvcpu->nv_vmexit_pending = 1;
2560         break;
2561     case EXIT_REASON_PENDING_VIRT_NMI:
2562         ctrl = __n2_exec_control(v);
2563         if ( ctrl & CPU_BASED_VIRTUAL_NMI_PENDING )
2564             nvcpu->nv_vmexit_pending = 1;
2565         break;
2566     case EXIT_REASON_MONITOR_TRAP_FLAG:
2567         ctrl = __n2_exec_control(v);
2568         if ( ctrl & CPU_BASED_MONITOR_TRAP_FLAG )
2569             nvcpu->nv_vmexit_pending = 1;
2570         break;
2571     case EXIT_REASON_ACCESS_GDTR_OR_IDTR:
2572     case EXIT_REASON_ACCESS_LDTR_OR_TR:
2573         ctrl = __n2_secondary_exec_control(v);
2574         if ( ctrl & SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING )
2575             nvcpu->nv_vmexit_pending = 1;
2576         break;
2577     case EXIT_REASON_VMX_PREEMPTION_TIMER_EXPIRED:
2578         ctrl = __n2_pin_exec_control(v);
2579         if ( ctrl & PIN_BASED_PREEMPT_TIMER )
2580             nvcpu->nv_vmexit_pending = 1;
2581         break;
2582     /* L1 has priority in handling several other types of exits. */
2583     case EXIT_REASON_HLT:
2584         ctrl = __n2_exec_control(v);
2585         if ( ctrl & CPU_BASED_HLT_EXITING )
2586             nvcpu->nv_vmexit_pending = 1;
2587         break;
2588     case EXIT_REASON_RDTSC:
2589     case EXIT_REASON_RDTSCP:
2590         ctrl = __n2_exec_control(v);
2591         if ( ctrl & CPU_BASED_RDTSC_EXITING )
2592             nvcpu->nv_vmexit_pending = 1;
2593         else
2594         {
2595             /*
2596              * Special handling is needed if L1 doesn't intercept rdtsc,
2597              * to avoid changing guest_tsc and messing up L1's timekeeping.
2598              */
2599             msr_split(regs, hvm_get_guest_tsc(v) + get_vvmcs(v, TSC_OFFSET));
2600             if ( exit_reason == EXIT_REASON_RDTSCP )
2601                 regs->rcx = v->arch.msrs->tsc_aux;
2602             update_guest_eip();
2603 
2604             return 1;
2605         }
2606         break;
2607     case EXIT_REASON_RDPMC:
2608         ctrl = __n2_exec_control(v);
2609         if ( ctrl & CPU_BASED_RDPMC_EXITING )
2610             nvcpu->nv_vmexit_pending = 1;
2611         break;
2612     case EXIT_REASON_MWAIT_INSTRUCTION:
2613         ctrl = __n2_exec_control(v);
2614         if ( ctrl & CPU_BASED_MWAIT_EXITING )
2615             nvcpu->nv_vmexit_pending = 1;
2616         break;
2617     case EXIT_REASON_PAUSE_INSTRUCTION:
2618         ctrl = __n2_exec_control(v);
2619         if ( ctrl & CPU_BASED_PAUSE_EXITING )
2620             nvcpu->nv_vmexit_pending = 1;
2621         break;
2622     case EXIT_REASON_MONITOR_INSTRUCTION:
2623         ctrl = __n2_exec_control(v);
2624         if ( ctrl & CPU_BASED_MONITOR_EXITING )
2625             nvcpu->nv_vmexit_pending = 1;
2626         break;
2627     case EXIT_REASON_DR_ACCESS:
2628         ctrl = __n2_exec_control(v);
2629         if ( (ctrl & CPU_BASED_MOV_DR_EXITING) &&
2630             v->arch.hvm.flag_dr_dirty )
2631             nvcpu->nv_vmexit_pending = 1;
2632         break;
2633     case EXIT_REASON_INVLPG:
2634         ctrl = __n2_exec_control(v);
2635         if ( ctrl & CPU_BASED_INVLPG_EXITING )
2636             nvcpu->nv_vmexit_pending = 1;
2637         break;
2638     case EXIT_REASON_CR_ACCESS:
2639     {
2640         cr_access_qual_t qual;
2641         u32 mask = 0;
2642 
2643         __vmread(EXIT_QUALIFICATION, &qual.raw);
2644         /* Also take the L1 guest's exec_control into account. */
2645         ctrl = __n2_exec_control(v);
2646 
2647         /* CLTS/LMSW strictly act on CR0 */
2648         if ( qual.access_type >= VMX_CR_ACCESS_TYPE_CLTS )
2649             ASSERT(qual.cr == 0);
2650 
2651         if ( qual.cr == 3 )
2652         {
2653             mask = qual.access_type ? CPU_BASED_CR3_STORE_EXITING
2654                                     : CPU_BASED_CR3_LOAD_EXITING;
2655             if ( ctrl & mask )
2656                 nvcpu->nv_vmexit_pending = 1;
2657         }
2658         else if ( qual.cr == 8 )
2659         {
2660             mask = qual.access_type ? CPU_BASED_CR8_STORE_EXITING
2661                                     : CPU_BASED_CR8_LOAD_EXITING;
2662             if ( ctrl & mask )
2663                 nvcpu->nv_vmexit_pending = 1;
2664         }
2665         else  /* CR0, CR4, CLTS, LMSW */
2666         {
2667             /*
2668              * While handling the VM exit for a CR0/CR4 access, check
2669              * whether the L1 VMM owns the bits being changed.
2670              * If so, inject the VM exit into the L1 VMM.
2671              * Otherwise, L0 handles it and syncs the value into the L1 virtual VMCS.
2672              */
2673             unsigned long old_val, val, changed_bits;
2674 
2675             switch ( qual.access_type )
2676             {
2677             case VMX_CR_ACCESS_TYPE_MOV_TO_CR:
2678             {
2679                 val = *decode_gpr(guest_cpu_user_regs(), qual.gpr);
2680 
2681                 if ( qual.cr == 0 )
2682                 {
2683                     u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2684 
2685                     __vmread(CR0_READ_SHADOW, &old_val);
2686                     changed_bits = old_val ^ val;
2687                     if ( changed_bits & cr0_gh_mask )
2688                         nvcpu->nv_vmexit_pending = 1;
2689                     else
2690                     {
2691                         u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2692 
2693                         set_vvmcs(v, GUEST_CR0,
2694                                   (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask));
2695                     }
2696                 }
2697                 else if ( qual.cr == 4 )
2698                 {
2699                     u64 cr4_gh_mask = get_vvmcs(v, CR4_GUEST_HOST_MASK);
2700 
2701                     __vmread(CR4_READ_SHADOW, &old_val);
2702                     changed_bits = old_val ^ val;
2703                     if ( changed_bits & cr4_gh_mask )
2704                         nvcpu->nv_vmexit_pending = 1;
2705                     else
2706                     {
2707                         u64 guest_cr4 = get_vvmcs(v, GUEST_CR4);
2708 
2709                         set_vvmcs(v, GUEST_CR4,
2710                                   (guest_cr4 & cr4_gh_mask) | (val & ~cr4_gh_mask));
2711                     }
2712                 }
2713                 else
2714                     nvcpu->nv_vmexit_pending = 1;
2715                 break;
2716             }
2717 
2718             case VMX_CR_ACCESS_TYPE_CLTS:
2719             {
2720                 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2721 
2722                 if ( cr0_gh_mask & X86_CR0_TS )
2723                     nvcpu->nv_vmexit_pending = 1;
2724                 else
2725                 {
2726                     u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2727 
2728                     set_vvmcs(v, GUEST_CR0, (guest_cr0 & ~X86_CR0_TS));
2729                 }
2730                 break;
2731             }
2732 
2733             case VMX_CR_ACCESS_TYPE_LMSW:
2734             {
2735                 u64 cr0_gh_mask = get_vvmcs(v, CR0_GUEST_HOST_MASK);
2736 
2737                 __vmread(CR0_READ_SHADOW, &old_val);
2738                 old_val &= X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS;
2739                 val = qual.lmsw_data &
2740                       (X86_CR0_PE|X86_CR0_MP|X86_CR0_EM|X86_CR0_TS);
2741                 changed_bits = old_val ^ val;
2742                 if ( changed_bits & cr0_gh_mask )
2743                     nvcpu->nv_vmexit_pending = 1;
2744                 else
2745                 {
2746                     u64 guest_cr0 = get_vvmcs(v, GUEST_CR0);
2747 
2748                     set_vvmcs(v, GUEST_CR0, (guest_cr0 & cr0_gh_mask) | (val & ~cr0_gh_mask));
2749                 }
2750                 break;
2751             }
2752 
2753             default:
2754                 ASSERT_UNREACHABLE();
2755                 break;
2756             }
2757         }
2758         break;
2759     }
2760     case EXIT_REASON_APIC_ACCESS:
2761         ctrl = __n2_secondary_exec_control(v);
2762         if ( ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES )
2763             nvcpu->nv_vmexit_pending = 1;
2764         break;
2765     case EXIT_REASON_TPR_BELOW_THRESHOLD:
2766         ctrl = __n2_exec_control(v);
2767         if ( ctrl & CPU_BASED_TPR_SHADOW )
2768             nvcpu->nv_vmexit_pending = 1;
2769         break;
2770     default:
2771         gprintk(XENLOG_ERR, "Unhandled nested vmexit: reason %u\n",
2772                 exit_reason);
2773         domain_crash(v->domain);
2774     }
2775 
2776     return ( nvcpu->nv_vmexit_pending == 1 );
2777 }
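
/*
 * Minimal sketch (illustrative only) of how a caller is expected to use the
 * return value above; the real dispatch lives in the generic VM-exit
 * handler, not in this file:
 *
 *     if ( nestedhvm_vcpu_in_guestmode(v) &&
 *          nvmx_n2_vmexit_handler(regs, exit_reason) )
 *         return;    // exit is owned by L1 or was already completed here
 *     // otherwise fall through to the normal L0 handling of exit_reason
 */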
2778 
2779 void nvmx_set_cr_read_shadow(struct vcpu *v, unsigned int cr)
2780 {
2781     unsigned long cr_field, read_shadow_field, mask_field;
2782 
2783     switch ( cr )
2784     {
2785     case 0:
2786         cr_field = GUEST_CR0;
2787         read_shadow_field = CR0_READ_SHADOW;
2788         mask_field = CR0_GUEST_HOST_MASK;
2789         break;
2790     case 4:
2791         cr_field = GUEST_CR4;
2792         read_shadow_field = CR4_READ_SHADOW;
2793         mask_field = CR4_GUEST_HOST_MASK;
2794         break;
2795     default:
2796         gdprintk(XENLOG_WARNING, "Set read shadow for unsupported CR%u\n", cr);
2797         return;
2798     }
2799 
2800     if ( !nestedhvm_vmswitch_in_progress(v) )
2801     {
2802         unsigned long virtual_cr_mask =
2803             get_vvmcs(v, mask_field);
2804 
2805         /*
2806          * We get here when L2 changed cr in a way that did not change
2807          * any of L1's shadowed bits (see nvmx_n2_vmexit_handler),
2808          * but did change L0 shadowed bits. So we first calculate the
2809          * effective cr value that L1 would like to write into the
2810          * hardware. It consists of the L2-owned bits from the new
2811          * value combined with the L1-owned bits from L1's guest cr.
2812          */
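        /*
         * Illustrative example (made-up masks): if L1's CR0_GUEST_HOST_MASK
         * covers only X86_CR0_TS and L2 just toggled CR0.CD, guest_cr[0]
         * keeps L2's new CD value while TS is refreshed from L1's GUEST_CR0
         * field below.
         */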
2813         v->arch.hvm.guest_cr[cr] &= ~virtual_cr_mask;
2814         v->arch.hvm.guest_cr[cr] |= virtual_cr_mask &
2815             get_vvmcs(v, cr_field);
2816     }
2817 
2818     /* nvcpu.guest_cr holds what L2 actually wrote to the cr. */
2819     __vmwrite(read_shadow_field, v->arch.hvm.nvcpu.guest_cr[cr]);
2820 }
2821 
2822 /*
2823  * Local variables:
2824  * mode: C
2825  * c-file-style: "BSD"
2826  * c-basic-offset: 4
2827  * tab-width: 4
2828  * indent-tabs-mode: nil
2829  * End:
2830  */
2831