#include <xen/init.h>
#include <xen/lib.h>
#include <xen/param.h>
#include <xen/sched.h>
#include <xen/nospec.h>
#include <asm/cpuid.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/svm.h>
#include <asm/hvm/viridian.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/paging.h>
#include <asm/processor.h>
#include <asm/xstate.h>

const uint32_t known_features[] = INIT_KNOWN_FEATURES;
const uint32_t special_features[] = INIT_SPECIAL_FEATURES;

static const uint32_t pv_max_featuremask[] = INIT_PV_MAX_FEATURES;
static const uint32_t hvm_shadow_max_featuremask[] = INIT_HVM_SHADOW_MAX_FEATURES;
static const uint32_t hvm_hap_max_featuremask[] = INIT_HVM_HAP_MAX_FEATURES;
static const uint32_t pv_def_featuremask[] = INIT_PV_DEF_FEATURES;
static const uint32_t hvm_shadow_def_featuremask[] = INIT_HVM_SHADOW_DEF_FEATURES;
static const uint32_t hvm_hap_def_featuremask[] = INIT_HVM_HAP_DEF_FEATURES;
static const uint32_t deep_features[] = INIT_DEEP_FEATURES;

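/*
 * Parse the "cpuid=" command line option.  The argument is a comma separated
 * list of feature names, each optionally negated (e.g. "no-xsave" or
 * "xsave=0"), looked up by binary search in the sorted INIT_FEATURE_NAMES
 * table.
 */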
static int __init parse_xen_cpuid(const char *s)
{
    const char *ss;
    int val, rc = 0;

    do {
        static const struct feature {
            const char *name;
            unsigned int bit;
        } features[] __initconstrel = INIT_FEATURE_NAMES;
        const struct feature *lhs, *rhs, *mid = NULL /* GCC... */;
        const char *feat;

        ss = strchr(s, ',');
        if ( !ss )
            ss = strchr(s, '\0');

        /* Skip the 'no-' prefix for name comparisons. */
        feat = s;
        if ( strncmp(s, "no-", 3) == 0 )
            feat += 3;

        /* (Re)initialise lhs and rhs for binary search. */
        lhs = features;
        rhs = features + ARRAY_SIZE(features);

        while ( lhs < rhs )
        {
            int res;

            mid = lhs + (rhs - lhs) / 2;
            res = cmdline_strcmp(feat, mid->name);

            if ( res < 0 )
            {
                rhs = mid;
                continue;
            }
            if ( res > 0 )
            {
                lhs = mid + 1;
                continue;
            }

            if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
            {
                if ( !val )
                    setup_clear_cpu_cap(mid->bit);
                else if ( mid->bit == X86_FEATURE_RDRAND &&
                          (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
                    setup_force_cpu_cap(X86_FEATURE_RDRAND);
                mid = NULL;
            }

            break;
        }

        /*
         * Mid being NULL means that the name and boolean were successfully
         * identified.  Everything else is an error.
         */
        if ( mid )
            rc = -EINVAL;

        s = ss + 1;
    } while ( *ss );

    return rc;
}
custom_param("cpuid", parse_xen_cpuid);

#define EMPTY_LEAF ((struct cpuid_leaf){})
static void zero_leaves(struct cpuid_leaf *l,
                        unsigned int first, unsigned int last)
{
    memset(&l[first], 0, sizeof(*l) * (last - first + 1));
}

struct cpuid_policy __read_mostly     raw_cpuid_policy,
                    __read_mostly    host_cpuid_policy;
#ifdef CONFIG_PV
struct cpuid_policy __read_mostly  pv_max_cpuid_policy;
struct cpuid_policy __read_mostly  pv_def_cpuid_policy;
#endif
#ifdef CONFIG_HVM
struct cpuid_policy __read_mostly hvm_max_cpuid_policy;
struct cpuid_policy __read_mostly hvm_def_cpuid_policy;
#endif

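/*
 * Clamp a featureset to the bits Xen knows about, and for each disabled
 * feature with deep dependencies, clear all of its dependent features too.
 */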
static void sanitise_featureset(uint32_t *fs)
{
    /* for_each_set_bit() uses unsigned longs.  Extend with zeroes. */
    uint32_t disabled_features[
        ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
    unsigned int i;

    for ( i = 0; i < FSCAPINTS; ++i )
    {
        /* Clamp to known mask. */
        fs[i] &= known_features[i];

        /*
         * Identify which features with deep dependencies have been
         * disabled.
         */
        disabled_features[i] = ~fs[i] & deep_features[i];
    }

    for_each_set_bit(i, (void *)disabled_features,
                     sizeof(disabled_features) * 8)
    {
        const uint32_t *dfs = x86_cpuid_lookup_deep_deps(i);
        unsigned int j;

        ASSERT(dfs); /* deep_features[] should guarantee this. */

        for ( j = 0; j < FSCAPINTS; ++j )
        {
            fs[j] &= ~dfs[j];
            disabled_features[j] &= ~dfs[j];
        }
    }
}

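/*
 * Rebuild the XSTATE (leaf 0xd) information in the policy from the other
 * feature bits (XSAVE, AVX, MPX, AVX512F, PKU).  Only the Da1 subleaf is
 * preserved across the recalculation.
 */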
static void recalculate_xstate(struct cpuid_policy *p)
{
    uint64_t xstates = XSTATE_FP_SSE;
    uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
    unsigned int i, Da1 = p->xstate.Da1;

    /*
     * The Da1 leaf is the only piece of information preserved in the common
     * case.  Everything else is derived from other feature state.
     */
    memset(&p->xstate, 0, sizeof(p->xstate));

    if ( !p->basic.xsave )
        return;

    if ( p->basic.avx )
    {
        xstates |= X86_XCR0_YMM;
        xstate_size = max(xstate_size,
                          xstate_offsets[X86_XCR0_YMM_POS] +
                          xstate_sizes[X86_XCR0_YMM_POS]);
    }

    if ( p->feat.mpx )
    {
        xstates |= X86_XCR0_BNDREGS | X86_XCR0_BNDCSR;
        xstate_size = max(xstate_size,
                          xstate_offsets[X86_XCR0_BNDCSR_POS] +
                          xstate_sizes[X86_XCR0_BNDCSR_POS]);
    }

    if ( p->feat.avx512f )
    {
        xstates |= X86_XCR0_OPMASK | X86_XCR0_ZMM | X86_XCR0_HI_ZMM;
        xstate_size = max(xstate_size,
                          xstate_offsets[X86_XCR0_HI_ZMM_POS] +
                          xstate_sizes[X86_XCR0_HI_ZMM_POS]);
    }

    if ( p->feat.pku )
    {
        xstates |= X86_XCR0_PKRU;
        xstate_size = max(xstate_size,
                          xstate_offsets[X86_XCR0_PKRU_POS] +
                          xstate_sizes[X86_XCR0_PKRU_POS]);
    }

    p->xstate.max_size  =  xstate_size;
    p->xstate.xcr0_low  =  xstates & ~XSTATE_XSAVES_ONLY;
    p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;

    p->xstate.Da1 = Da1;
    if ( p->xstate.xsaves )
    {
        p->xstate.xss_low   =  xstates & XSTATE_XSAVES_ONLY;
        p->xstate.xss_high  = (xstates & XSTATE_XSAVES_ONLY) >> 32;
    }
    else
        xstates &= ~XSTATE_XSAVES_ONLY;

    for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
    {
        uint64_t curr_xstate = 1ul << i;

        if ( !(xstates & curr_xstate) )
            continue;

        p->xstate.comp[i].size   = xstate_sizes[i];
        p->xstate.comp[i].offset = xstate_offsets[i];
        p->xstate.comp[i].xss    = curr_xstate & XSTATE_XSAVES_ONLY;
        p->xstate.comp[i].align  = curr_xstate & xstate_align;
    }
}

/*
 * Misc adjustments to the policy.  Mostly clobbering reserved fields and
 * duplicating shared fields.  Intentionally hidden fields are annotated.
 */
static void recalculate_misc(struct cpuid_policy *p)
{
    p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
    p->basic.apic_id = 0; /* Dynamic. */

    p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
    p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */

    p->basic.raw[0x8] = EMPTY_LEAF;

    /* TODO: Rework topology logic. */
    memset(p->topo.raw, 0, sizeof(p->topo.raw));

    p->basic.raw[0xc] = EMPTY_LEAF;

    p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;

    /* Most of Power/RAS hidden from guests. */
    p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;

    p->extd.raw[0x8].d = 0;

    switch ( p->x86_vendor )
    {
    case X86_VENDOR_INTEL:
        p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
        p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
        p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */

        p->extd.vendor_ebx = 0;
        p->extd.vendor_ecx = 0;
        p->extd.vendor_edx = 0;

        p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;

        p->extd.raw[0x5] = EMPTY_LEAF;
        p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;

        p->extd.raw[0x8].a &= 0x0000ffff;
        p->extd.raw[0x8].c = 0;
        break;

    case X86_VENDOR_AMD:
    case X86_VENDOR_HYGON:
        zero_leaves(p->basic.raw, 0x2, 0x3);
        memset(p->cache.raw, 0, sizeof(p->cache.raw));
        zero_leaves(p->basic.raw, 0x9, 0xa);

        p->extd.vendor_ebx = p->basic.vendor_ebx;
        p->extd.vendor_ecx = p->basic.vendor_ecx;
        p->extd.vendor_edx = p->basic.vendor_edx;

        p->extd.raw_fms = p->basic.raw_fms;
        p->extd.raw[0x1].b &= 0xff00ffff;
        p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;

        p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
        p->extd.raw[0x8].c &= 0x0003f0ff;

        p->extd.raw[0x9] = EMPTY_LEAF;

        zero_leaves(p->extd.raw, 0xb, 0x18);

        p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
        p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. */
        break;
    }
}

static void __init calculate_raw_policy(void)
{
    struct cpuid_policy *p = &raw_cpuid_policy;

    x86_cpuid_policy_fill_native(p);

    /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
    ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
}

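/*
 * The host policy is the raw hardware policy, clamped to the leaves Xen
 * tracks and restricted to the features Xen itself has enabled
 * (boot_cpu_data.x86_capability).
 */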
static void __init calculate_host_policy(void)
{
    struct cpuid_policy *p = &host_cpuid_policy;

    *p = raw_cpuid_policy;

    p->basic.max_leaf =
        min_t(uint32_t, p->basic.max_leaf,   ARRAY_SIZE(p->basic.raw) - 1);
    p->feat.max_subleaf =
        min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
    p->extd.max_leaf = 0x80000000 | min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                          ARRAY_SIZE(p->extd.raw) - 1);

    cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
    recalculate_xstate(p);
    recalculate_misc(p);

    /* When vPMU is disabled, drop it from the host policy. */
    if ( vpmu_mode == XENPMU_MODE_OFF )
        p->basic.raw[0xa] = EMPTY_LEAF;

    if ( p->extd.svm )
    {
        /* Clamp to implemented features which require hardware support. */
        p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
                               (1u << SVM_FEATURE_LBRV) |
                               (1u << SVM_FEATURE_NRIPS) |
                               (1u << SVM_FEATURE_PAUSEFILTER) |
                               (1u << SVM_FEATURE_DECODEASSISTS));
        /* Enable features which are always emulated. */
        p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
                               (1u << SVM_FEATURE_TSCRATEMSR));
    }
}

static void __init guest_common_default_feature_adjustments(uint32_t *fs)
{
    /*
     * IvyBridge client parts suffer from leakage of RDRAND data due to SRBDS
     * (XSA-320 / CVE-2020-0543), and won't be receiving microcode to
     * compensate.
     *
     * Mitigate by hiding RDRAND from guests by default, unless explicitly
     * overridden on the Xen command line (cpuid=rdrand).  Irrespective of the
     * default setting, guests can use RDRAND if explicitly enabled
     * (cpuid="host,rdrand=1") in the VM's config file, and VMs which were
     * previously using RDRAND can migrate in.
     */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
         boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x3a &&
         cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
        __clear_bit(X86_FEATURE_RDRAND, fs);
}

static void __init guest_common_feature_adjustments(uint32_t *fs)
{
    /* Unconditionally claim to be able to set the hypervisor bit. */
    __set_bit(X86_FEATURE_HYPERVISOR, fs);

    /*
     * If IBRS is offered to the guest, unconditionally offer STIBP.  It is a
     * nop on non-HT hardware, and has this behaviour to make heterogeneous
     * setups easier to manage.
     */
    if ( test_bit(X86_FEATURE_IBRSB, fs) )
        __set_bit(X86_FEATURE_STIBP, fs);

    /*
     * On hardware which supports IBRS/IBPB, we can offer IBPB independently
     * of IBRS by using the AMD feature bit.  An administrator may wish for
     * performance reasons to offer IBPB without IBRS.
     */
    if ( host_cpuid_policy.feat.ibrsb )
        __set_bit(X86_FEATURE_IBPB, fs);
}

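/*
 * "Max" policies bound what may be offered to a guest of the given type;
 * "def" (default) policies are the subset offered when no explicit choice
 * is made by the toolstack.
 */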
static void __init calculate_pv_max_policy(void)
{
    struct cpuid_policy *p = &pv_max_cpuid_policy;
    uint32_t pv_featureset[FSCAPINTS];
    unsigned int i;

    *p = host_cpuid_policy;
    cpuid_policy_to_featureset(p, pv_featureset);

    for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
        pv_featureset[i] &= pv_max_featuremask[i];

    /*
     * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests because of
     * administrator choice, hide the feature.
     */
    if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
        __clear_bit(X86_FEATURE_IBRSB, pv_featureset);

    guest_common_feature_adjustments(pv_featureset);

    sanitise_featureset(pv_featureset);
    cpuid_featureset_to_policy(pv_featureset, p);
    recalculate_xstate(p);

    p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
}

static void __init calculate_pv_def_policy(void)
{
    struct cpuid_policy *p = &pv_def_cpuid_policy;
    uint32_t pv_featureset[FSCAPINTS];
    unsigned int i;

    *p = pv_max_cpuid_policy;
    cpuid_policy_to_featureset(p, pv_featureset);

    for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
        pv_featureset[i] &= pv_def_featuremask[i];

    guest_common_feature_adjustments(pv_featureset);
    guest_common_default_feature_adjustments(pv_featureset);

    sanitise_featureset(pv_featureset);
    cpuid_featureset_to_policy(pv_featureset, p);
    recalculate_xstate(p);
}

static void __init calculate_hvm_max_policy(void)
{
    struct cpuid_policy *p = &hvm_max_cpuid_policy;
    uint32_t hvm_featureset[FSCAPINTS];
    unsigned int i;
    const uint32_t *hvm_featuremask;

    *p = host_cpuid_policy;
    cpuid_policy_to_featureset(p, hvm_featureset);

    hvm_featuremask = hvm_hap_supported() ?
        hvm_hap_max_featuremask : hvm_shadow_max_featuremask;

    for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
        hvm_featureset[i] &= hvm_featuremask[i];

    /*
     * Xen can provide an (x2)APIC emulation to HVM guests even if the host's
     * (x2)APIC isn't enabled.
     */
    __set_bit(X86_FEATURE_APIC, hvm_featureset);
    __set_bit(X86_FEATURE_X2APIC, hvm_featureset);

    /*
     * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
     * long mode (and init_amd() has cleared it out of host capabilities), but
     * HVM guests are able if running in protected mode.
     */
    if ( (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
         raw_cpuid_policy.basic.sep )
        __set_bit(X86_FEATURE_SEP, hvm_featureset);

    /*
     * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests because of
     * administrator choice, hide the feature.
     */
    if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
        __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);

    /*
     * With VT-x, some features are only supported by Xen if dedicated
     * hardware support is also available.
     */
    if ( cpu_has_vmx )
    {
        if ( !cpu_has_vmx_mpx )
            __clear_bit(X86_FEATURE_MPX, hvm_featureset);

        if ( !cpu_has_vmx_xsaves )
            __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
    }

    guest_common_feature_adjustments(hvm_featureset);

    sanitise_featureset(hvm_featureset);
    cpuid_featureset_to_policy(hvm_featureset, p);
    recalculate_xstate(p);
}

static void __init calculate_hvm_def_policy(void)
{
    struct cpuid_policy *p = &hvm_def_cpuid_policy;
    uint32_t hvm_featureset[FSCAPINTS];
    unsigned int i;
    const uint32_t *hvm_featuremask;

    *p = hvm_max_cpuid_policy;
    cpuid_policy_to_featureset(p, hvm_featureset);

    hvm_featuremask = hvm_hap_supported() ?
        hvm_hap_def_featuremask : hvm_shadow_def_featuremask;

    for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
        hvm_featureset[i] &= hvm_featuremask[i];

    guest_common_feature_adjustments(hvm_featureset);
    guest_common_default_feature_adjustments(hvm_featureset);

    sanitise_featureset(hvm_featureset);
    cpuid_featureset_to_policy(hvm_featureset, p);
    recalculate_xstate(p);
}

void __init init_guest_cpuid(void)
{
    calculate_raw_policy();
    calculate_host_policy();

    if ( IS_ENABLED(CONFIG_PV) )
    {
        calculate_pv_max_policy();
        calculate_pv_def_policy();
    }

    if ( hvm_enabled )
    {
        calculate_hvm_max_policy();
        calculate_hvm_def_policy();
    }
}

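/*
 * Re-identify a CPU and check that it still offers every feature the boot
 * CPU has.  Returns false, after logging the offending capability words, if
 * any previously-present feature has disappeared.
 */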
bool recheck_cpu_features(unsigned int cpu)
{
    bool okay = true;
    struct cpuinfo_x86 c;
    const struct cpuinfo_x86 *bsp = &boot_cpu_data;
    unsigned int i;

    identify_cpu(&c);

    for ( i = 0; i < NCAPINTS; ++i )
    {
        if ( !(~c.x86_capability[i] & bsp->x86_capability[i]) )
            continue;

        printk(XENLOG_ERR "CPU%u: cap[%2u] is %08x (expected %08x)\n",
               cpu, i, c.x86_capability[i], bsp->x86_capability[i]);
        okay = false;
    }

    return okay;
}

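/*
 * Re-derive a domain's CPUID policy after toolstack changes: clamp leaf
 * counts and feature bits to the appropriate max policy, apply
 * domain-type-specific adjustments, and regenerate the derived leaves.
 */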
void recalculate_cpuid_policy(struct domain *d)
{
    struct cpuid_policy *p = d->arch.cpuid;
    const struct cpuid_policy *max = is_pv_domain(d)
        ? (IS_ENABLED(CONFIG_PV)  ?  &pv_max_cpuid_policy : NULL)
        : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpuid_policy : NULL);
    uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
    unsigned int i;

    if ( !max )
    {
        ASSERT_UNREACHABLE();
        return;
    }

    p->x86_vendor = x86_cpuid_lookup_vendor(
        p->basic.vendor_ebx, p->basic.vendor_ecx, p->basic.vendor_edx);

    p->basic.max_leaf   = min(p->basic.max_leaf,   max->basic.max_leaf);
    p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
    p->extd.max_leaf    = 0x80000000 | min(p->extd.max_leaf & 0xffff,
                                           ((p->x86_vendor & (X86_VENDOR_AMD |
                                                              X86_VENDOR_HYGON))
                                            ? CPUID_GUEST_NR_EXTD_AMD
                                            : CPUID_GUEST_NR_EXTD_INTEL) - 1);

    cpuid_policy_to_featureset(p, fs);
    cpuid_policy_to_featureset(max, max_fs);

    if ( is_hvm_domain(d) )
    {
        /*
         * HVM domains using Shadow paging have further restrictions on their
         * available paging features.
         */
        if ( !hap_enabled(d) )
        {
            for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
                max_fs[i] &= hvm_shadow_max_featuremask[i];
        }

        /* Hide nested-virt if it hasn't been explicitly configured. */
        if ( !nestedhvm_enabled(d) )
        {
            __clear_bit(X86_FEATURE_VMX, max_fs);
            __clear_bit(X86_FEATURE_SVM, max_fs);
        }
    }

    /*
     * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY.  These bits
     * affect how to interpret topology information in other cpuid leaves.
     */
    __set_bit(X86_FEATURE_HTT, max_fs);
    __set_bit(X86_FEATURE_X2APIC, max_fs);
    __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);

    /*
     * 32bit PV domains can't use any Long Mode features, and cannot use
     * SYSCALL on non-AMD hardware.
     */
    if ( is_pv_32bit_domain(d) )
    {
        __clear_bit(X86_FEATURE_LM, max_fs);
        if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
            __clear_bit(X86_FEATURE_SYSCALL, max_fs);
    }

    /*
     * ITSC is masked by default (so domains are safe to migrate), but a
     * toolstack which has configured disable_migrate or vTSC for a domain may
     * safely select it, and needs a way of doing so.
     */
    if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
        __set_bit(X86_FEATURE_ITSC, max_fs);

    /*
     * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
     * TSX and hide the feature bits.  Migrating-in VMs may have been booted
     * pre-mitigation when the TSX features were visible.
     *
     * This situation is compatible (albeit with a perf hit to any TSX code in
     * the guest), so allow the feature bits to remain set.
     */
    if ( cpu_has_tsx_ctrl )
    {
        __set_bit(X86_FEATURE_HLE, max_fs);
        __set_bit(X86_FEATURE_RTM, max_fs);
    }

    /* Clamp the toolstack's choices to reality. */
    for ( i = 0; i < ARRAY_SIZE(fs); i++ )
        fs[i] &= max_fs[i];

    if ( p->basic.max_leaf < XSTATE_CPUID )
        __clear_bit(X86_FEATURE_XSAVE, fs);

    sanitise_featureset(fs);

    /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
    fs[FEATURESET_7b0] &= ~special_features[FEATURESET_7b0];
    fs[FEATURESET_7b0] |= (host_cpuid_policy.feat._7b0 &
                           special_features[FEATURESET_7b0]);

    cpuid_featureset_to_policy(fs, p);

    /* Pass host cacheline size through to guests. */
    p->basic.clflush_size = max->basic.clflush_size;

    p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
    p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
                                paging_max_paddr_bits(d));
    p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
                                (p->basic.pae || p->basic.pse36) ? 36 : 32);

    p->extd.maxlinaddr = p->extd.lm ? 48 : 32;

    recalculate_xstate(p);
    recalculate_misc(p);

    for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
    {
        if ( p->cache.subleaf[i].type >= 1 &&
             p->cache.subleaf[i].type <= 3 )
        {
            /* Subleaf has a valid cache type. Zero reserved fields. */
            p->cache.raw[i].a &= 0xffffc3ffu;
            p->cache.raw[i].d &= 0x00000007u;
        }
        else
        {
            /* Subleaf is not valid.  Zero the rest of the union. */
            zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
            break;
        }
    }

    if ( vpmu_mode == XENPMU_MODE_OFF ||
         ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
        p->basic.raw[0xa] = EMPTY_LEAF;

    if ( !p->extd.svm )
        p->extd.raw[0xa] = EMPTY_LEAF;

    if ( !p->extd.page1gb )
        p->extd.raw[0x19] = EMPTY_LEAF;
}

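/*
 * Allocate a domain's CPUID policy, seeded from the default policy for its
 * type (PV or HVM), then tailor it via recalculate_cpuid_policy().
 */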
int init_domain_cpuid_policy(struct domain *d)
{
    struct cpuid_policy *p = is_pv_domain(d)
        ? (IS_ENABLED(CONFIG_PV)  ?  &pv_def_cpuid_policy : NULL)
        : (IS_ENABLED(CONFIG_HVM) ? &hvm_def_cpuid_policy : NULL);

    if ( !p )
    {
        ASSERT_UNREACHABLE();
        return -EOPNOTSUPP;
    }

    p = xmemdup(p);
    if ( !p )
        return -ENOMEM;

    if ( d->disable_migrate )
        p->extd.itsc = cpu_has_itsc;

    /*
     * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
     * so dom0 can turn off workarounds as appropriate.  Temporary, until the
     * domain policy logic gains a better understanding of MSRs.
     */
    if ( is_hardware_domain(d) && boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
        p->feat.arch_caps = true;

    d->arch.cpuid = p;

    recalculate_cpuid_policy(d);

    return 0;
}

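/*
 * Look up leaf/subleaf in the domain's CPUID policy, then apply the dynamic
 * adjustments which depend on current vcpu state.  Out-of-range leaves read
 * as all zeros.
 */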
void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                 uint32_t subleaf, struct cpuid_leaf *res)
{
    const struct domain *d = v->domain;
    const struct cpuid_policy *p = d->arch.cpuid;

    *res = EMPTY_LEAF;

    /*
     * First pass:
     * - Perform max_leaf/subleaf calculations.  Out-of-range leaves return
     *   all zeros, following the AMD model.
     * - Fill in *res with static data.
     * - Dispatch the virtualised leaves to their respective handlers.
     */
    switch ( leaf )
    {
    case 0 ... CPUID_GUEST_NR_BASIC - 1:
        ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
        if ( leaf > min_t(uint32_t, p->basic.max_leaf,
                          ARRAY_SIZE(p->basic.raw) - 1) )
            return;

        switch ( leaf )
        {
        case 0x4:
            if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
                return;

            *res = array_access_nospec(p->cache.raw, subleaf);
            break;

        case 0x7:
            ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
            if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
                                 ARRAY_SIZE(p->feat.raw) - 1) )
                return;

            *res = array_access_nospec(p->feat.raw, subleaf);
            break;

        case 0xb:
            if ( subleaf >= ARRAY_SIZE(p->topo.raw) )
                return;

            *res = array_access_nospec(p->topo.raw, subleaf);
            break;

        case XSTATE_CPUID:
            if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
                return;

            *res = array_access_nospec(p->xstate.raw, subleaf);
            break;

        default:
            *res = array_access_nospec(p->basic.raw, leaf);
            break;
        }
        break;

    case 0x40000000 ... 0x400000ff:
        if ( is_viridian_domain(d) )
            return cpuid_viridian_leaves(v, leaf, subleaf, res);

        /*
         * Fallthrough.
         *
         * Intel reserve up until 0x4fffffff for hypervisor use.  AMD reserve
         * only until 0x400000ff, but we already use double that.
         */
    case 0x40000100 ... 0x400001ff:
        return cpuid_hypervisor_leaves(v, leaf, subleaf, res);

    case 0x80000000 ... 0x80000000 + CPUID_GUEST_NR_EXTD - 1:
        ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
        if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                     ARRAY_SIZE(p->extd.raw) - 1) )
            return;

        *res = array_access_nospec(p->extd.raw, leaf & 0xffff);
        break;

    default:
        return;
    }

    /*
     * Skip dynamic adjustments if we are in the wrong context.
     *
     * All dynamic adjustments depend on current register state, which will
     * be stale if the vcpu is running elsewhere.  It is simpler, quicker, and
     * more reliable for the caller to do nothing (consistently) than to hand
     * back stale data which it can't use safely.
     */
    if ( v != current )
        return;

    /*
     * Second pass:
     * - Dynamic adjustments
     */
    switch ( leaf )
    {
        const struct cpu_user_regs *regs;

    case 0x1:
        /* TODO: Rework topology logic. */
        res->b &= 0x00ffffffu;
        if ( is_hvm_domain(d) )
            res->b |= (v->vcpu_id * 2) << 24;

        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( vpmu_available(v) &&
             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
        {
            res->d |= cpufeat_mask(X86_FEATURE_DS);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
        }

        if ( is_hvm_domain(d) )
        {
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( v->arch.hvm.guest_cr[4] & X86_CR4_OSXSAVE )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
        }
        else /* PV domain */
        {
            regs = guest_cpu_user_regs();

            /*
             * !!! OSXSAVE handling for PV guests is non-architectural !!!
             *
             * Architecturally, the correct code here is simply:
             *
             *   if ( v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE )
             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
             *
             * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
             * the XSAVE cpuid flag leaked into guests despite the feature not
             * being available for use), buggy workarounds were introduced to
             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
             * that Xen also incorrectly leaked OSXSAVE into the guest.
             *
             * Furthermore, providing architectural OSXSAVE behaviour to
             * many Linux PV guests triggered a further kernel bug when the
             * fpu code observes that XSAVEOPT is available, assumes that
             * xsave state had been set up for the task, and follows a wild
             * pointer.
             *
             * Older Linux PVOPS kernels however do require architectural
             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
             * can already use XSETBV, dying with a #UD because the shadowed
             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
             * observed cases via stable backports of the above changeset.
             *
             * Therefore, the leaking of Xen's OSXSAVE setting has become a
             * defacto part of the PV ABI and can't reasonably be corrected.
             * It can however be restricted to only the enlightened CPUID
             * view, as seen by the guest kernel.
             *
             * The following situations and logic now apply:
             *
             * - Hardware without CPUID faulting support and native CPUID:
             *    There is nothing Xen can do here.  The host's XSAVE flag will
             *    leak through and Xen's OSXSAVE choice will leak through.
             *
             *    In the case that the guest kernel has not set up OSXSAVE, only
             *    SSE will be set in xcr0, and guest userspace can't do too much
             *    damage itself.
             *
             * - Enlightened CPUID or CPUID faulting available:
             *    Xen can fully control what is seen here.  When the guest has
             *    been configured to have XSAVE available, guest kernels need
             *    to see the leaked OSXSAVE via the enlightened path, but
             *    guest userspace and the native view are given architectural
             *    behaviour.
             *
             *    Emulated vs Faulted CPUID is distinguished based on whether a
             *    #UD or #GP is currently being serviced.
             */
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( (v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE) ||
                 (p->basic.xsave &&
                  regs->entry_vector == TRAP_invalid_op &&
                  guest_kernel_mode(v, regs) &&
                  (read_cr4() & X86_CR4_OSXSAVE)) )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);

            /*
             * At the time of writing, a PV domain is the only viable option
             * for Dom0.  Several interactions between dom0 and Xen for real
             * hardware setup have unfortunately been implemented based on
             * state which incorrectly leaked into dom0.
             *
             * These leaks are retained for backwards compatibility, but
             * restricted to the hardware domain's kernel only.
             */
            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
            {
                /*
                 * MONITOR never leaked into PV guests, as PV guests cannot
                 * use the MONITOR/MWAIT instructions.  As such, they require
                 * the feature not to be present in emulated CPUID.
                 *
                 * Modern PVOPS Linux try to be cunning and use native CPUID
                 * to see if the hardware actually supports MONITOR, and by
                 * extension, deep C states.
                 *
                 * If the feature is seen, deep-C state information is
                 * obtained from the DSDT and handed back to Xen via the
                 * XENPF_set_processor_pminfo hypercall.
                 *
                 * This mechanism is incompatible with an HVM-based hardware
                 * domain, and also with CPUID Faulting.
                 *
                 * Luckily, Xen can be just as 'cunning', and distinguish an
                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
                 * fault is currently being serviced.  Yuck...
                 */
                if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);

                /*
                 * While MONITOR never leaked into PV guests, EIST always used
                 * to.
                 *
                 * Modern PVOPS Linux will only parse P state information from
                 * the DSDT and return it to Xen if EIST is seen in the
                 * emulated CPUID information.
                 */
                if ( cpu_has_eist )
                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
            }
        }
        goto common_leaf1_adjustments;

    case 0x5:
        /*
         * Leak the hardware MONITOR leaf under the same conditions that the
         * MONITOR feature flag is leaked.  See above for details.
         */
        regs = guest_cpu_user_regs();
        if ( is_pv_domain(d) && is_hardware_domain(d) &&
             guest_kernel_mode(v, regs) && cpu_has_monitor &&
             regs->entry_vector == TRAP_gp_fault )
            *res = raw_cpuid_policy.basic.raw[5];
        break;

    case 0x7:
        switch ( subleaf )
        {
        case 0:
            /* OSPKE clear in policy.  Fast-forward CR4 back in. */
            if ( (is_pv_domain(d)
                  ? v->arch.pv.ctrlreg[4]
                  : v->arch.hvm.guest_cr[4]) & X86_CR4_PKE )
                res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
            break;
        }
        break;

    case 0xa:
        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
             !vpmu_available(v) )
            *res = EMPTY_LEAF;
        else
        {
            /* Report at most v3 since that's all we currently emulate. */
            if ( (res->a & 0xff) > 3 )
                res->a = (res->a & ~0xff) | 3;
        }
        break;

    case 0xb:
        /*
         * In principle, this leaf is Intel-only.  In practice, it is tightly
         * coupled with x2apic, and we offer an x2apic-capable APIC emulation
         * to guests on AMD hardware as well.
         *
         * TODO: Rework topology logic.
         */
        if ( p->basic.x2apic )
        {
            *(uint8_t *)&res->c = subleaf;

            /* Fix the x2APIC identifier. */
            res->d = v->vcpu_id * 2;
        }
        break;

    case XSTATE_CPUID:
        switch ( subleaf )
        {
        case 1:
            if ( p->xstate.xsaves )
            {
                /*
                 * TODO: Figure out what to do for XSS state.  VT-x manages
                 * host vs guest MSR_XSS automatically, so as soon as we start
                 * supporting any XSS states, the wrong XSS will be in
                 * context.
                 */
                BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);

                /*
                 * Read CPUID[0xD,0/1].EBX from hardware.  They vary with
                 * enabled XSTATE, and appropriate XCR0|XSS are in context.
                 */
        case 0:
                res->b = cpuid_count_ebx(leaf, subleaf);
            }
            break;
        }
        break;

    case 0x80000001:
        /* SYSCALL is hidden outside of long mode on Intel. */
        if ( p->x86_vendor == X86_VENDOR_INTEL &&
             is_hvm_domain(d) && !hvm_long_mode_active(v) )
            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);

    common_leaf1_adjustments:
        if ( is_hvm_domain(d) )
        {
            /* Fast-forward MSR_APIC_BASE.EN. */
            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                res->d &= ~cpufeat_mask(X86_FEATURE_APIC);

            /*
             * PSE36 is not supported in shadow mode.  This bit should be
             * clear in hvm_shadow_max_featuremask[].
             *
             * However, an unspecified version of Hyper-V from 2011 refuses to
             * start as the "cpu does not provide required hw features" if it
             * can't see PSE36.
             *
             * As a workaround, leak the toolstack-provided PSE36 value into a
             * shadow guest if the guest is already using PAE paging (and
             * nobble it, so a 32bit guest doesn't get the impression that it
             * could try to use PSE36 paging.
             */
            if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
                res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
        }
        else /* PV domain */
        {
            /*
             * MTRR used to unconditionally leak into PV guests.  They cannot
             * use the MTRR infrastructure at all, and shouldn't be able to
             * see the feature.
             *
             * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
             * trying to use the associated MSRs.  Xenolinux-based PV dom0's
             * however use the MTRR feature as an indication of the presence
             * of the XENPF_{add,del,read}_memtype hypercalls.
             */
            if ( is_hardware_domain(d) && cpu_has_mtrr &&
                 guest_kernel_mode(v, guest_cpu_user_regs()) )
                res->d |= cpufeat_mask(X86_FEATURE_MTRR);
        }
        break;
    }
}

static void __init __maybe_unused build_assertions(void)
{
    BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(special_features) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(pv_max_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_max_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_max_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);

    /* Find some more clever allocation scheme if this trips. */
    BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);

    BUILD_BUG_ON(sizeof(raw_cpuid_policy.basic) !=
                 sizeof(raw_cpuid_policy.basic.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.feat) !=
                 sizeof(raw_cpuid_policy.feat.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.xstate) !=
                 sizeof(raw_cpuid_policy.xstate.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.extd) !=
                 sizeof(raw_cpuid_policy.extd.raw));
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */