1 #include <xen/init.h>
2 #include <xen/lib.h>
3 #include <xen/param.h>
4 #include <xen/sched.h>
5 #include <xen/nospec.h>
6 #include <asm/cpuid.h>
7 #include <asm/hvm/hvm.h>
8 #include <asm/hvm/nestedhvm.h>
9 #include <asm/hvm/svm/svm.h>
10 #include <asm/hvm/viridian.h>
11 #include <asm/hvm/vmx/vmcs.h>
12 #include <asm/paging.h>
13 #include <asm/processor.h>
14 #include <asm/xstate.h>
15
16 const uint32_t known_features[] = INIT_KNOWN_FEATURES;
17 const uint32_t special_features[] = INIT_SPECIAL_FEATURES;
18
19 static const uint32_t pv_max_featuremask[] = INIT_PV_MAX_FEATURES;
20 static const uint32_t hvm_shadow_max_featuremask[] = INIT_HVM_SHADOW_MAX_FEATURES;
21 static const uint32_t hvm_hap_max_featuremask[] = INIT_HVM_HAP_MAX_FEATURES;
22 static const uint32_t pv_def_featuremask[] = INIT_PV_DEF_FEATURES;
23 static const uint32_t hvm_shadow_def_featuremask[] = INIT_HVM_SHADOW_DEF_FEATURES;
24 static const uint32_t hvm_hap_def_featuremask[] = INIT_HVM_HAP_DEF_FEATURES;
25 static const uint32_t deep_features[] = INIT_DEEP_FEATURES;
26
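/*
 * Parse the "cpuid=" command line option: a comma-separated list of feature
 * names, each optionally prefixed with "no-" or given an explicit boolean.
 * For example, "cpuid=no-rdrand" (or "cpuid=rdrand=0") clears RDRAND, while
 * "cpuid=rdrand" force-enables it on hardware which advertises it.
 * Unrecognised names make the parse report an error.
 */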
27 static int __init parse_xen_cpuid(const char *s)
28 {
29 const char *ss;
30 int val, rc = 0;
31
32 do {
33 static const struct feature {
34 const char *name;
35 unsigned int bit;
36 } features[] __initconstrel = INIT_FEATURE_NAMES;
37 const struct feature *lhs, *rhs, *mid = NULL /* GCC... */;
38 const char *feat;
39
40 ss = strchr(s, ',');
41 if ( !ss )
42 ss = strchr(s, '\0');
43
44 /* Skip the 'no-' prefix for name comparisons. */
45 feat = s;
46 if ( strncmp(s, "no-", 3) == 0 )
47 feat += 3;
48
49         /* (Re)initialise lhs and rhs for binary search. */
50 lhs = features;
51 rhs = features + ARRAY_SIZE(features);
52
53 while ( lhs < rhs )
54 {
55 int res;
56
57 mid = lhs + (rhs - lhs) / 2;
58 res = cmdline_strcmp(feat, mid->name);
59
60 if ( res < 0 )
61 {
62 rhs = mid;
63 continue;
64 }
65 if ( res > 0 )
66 {
67 lhs = mid + 1;
68 continue;
69 }
70
71 if ( (val = parse_boolean(mid->name, s, ss)) >= 0 )
72 {
73 if ( !val )
74 setup_clear_cpu_cap(mid->bit);
75 else if ( mid->bit == X86_FEATURE_RDRAND &&
76 (cpuid_ecx(1) & cpufeat_mask(X86_FEATURE_RDRAND)) )
77 setup_force_cpu_cap(X86_FEATURE_RDRAND);
78 mid = NULL;
79 }
80
81 break;
82 }
83
84 /*
85 * Mid being NULL means that the name and boolean were successfully
86 * identified. Everything else is an error.
87 */
88 if ( mid )
89 rc = -EINVAL;
90
91 s = ss + 1;
92 } while ( *ss );
93
94 return rc;
95 }
96 custom_param("cpuid", parse_xen_cpuid);
97
98 #define EMPTY_LEAF ((struct cpuid_leaf){})
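/* Zero the leaves l[first] through l[last] inclusive. */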
99 static void zero_leaves(struct cpuid_leaf *l,
100 unsigned int first, unsigned int last)
101 {
102 memset(&l[first], 0, sizeof(*l) * (last - first + 1));
103 }
104
105 struct cpuid_policy __read_mostly raw_cpuid_policy,
106 __read_mostly host_cpuid_policy;
107 #ifdef CONFIG_PV
108 struct cpuid_policy __read_mostly pv_max_cpuid_policy;
109 struct cpuid_policy __read_mostly pv_def_cpuid_policy;
110 #endif
111 #ifdef CONFIG_HVM
112 struct cpuid_policy __read_mostly hvm_max_cpuid_policy;
113 struct cpuid_policy __read_mostly hvm_def_cpuid_policy;
114 #endif
115
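/*
 * Clamp a featureset to the bits Xen knows about, then clear every feature
 * which depends on a feature that has been disabled.
 */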
116 static void sanitise_featureset(uint32_t *fs)
117 {
118 /* for_each_set_bit() uses unsigned longs. Extend with zeroes. */
119 uint32_t disabled_features[
120 ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
121 unsigned int i;
122
123 for ( i = 0; i < FSCAPINTS; ++i )
124 {
125 /* Clamp to known mask. */
126 fs[i] &= known_features[i];
127
128 /*
129 * Identify which features with deep dependencies have been
130 * disabled.
131 */
132 disabled_features[i] = ~fs[i] & deep_features[i];
133 }
134
135 for_each_set_bit(i, (void *)disabled_features,
136 sizeof(disabled_features) * 8)
137 {
138 const uint32_t *dfs = x86_cpuid_lookup_deep_deps(i);
139 unsigned int j;
140
141         ASSERT(dfs); /* deep_features[] should guarantee this. */
142
143 for ( j = 0; j < FSCAPINTS; ++j )
144 {
145 fs[j] &= ~dfs[j];
146 disabled_features[j] &= ~dfs[j];
147 }
148 }
149 }
150
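/*
 * Recompute the xstate leaves (XCR0/XSS masks, per-component size/offset
 * information, and the maximum save area size) from the feature bits
 * currently set in the policy.
 */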
151 static void recalculate_xstate(struct cpuid_policy *p)
152 {
153 uint64_t xstates = XSTATE_FP_SSE;
154 uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
155 unsigned int i, Da1 = p->xstate.Da1;
156
157 /*
158 * The Da1 leaf is the only piece of information preserved in the common
159 * case. Everything else is derived from other feature state.
160 */
161 memset(&p->xstate, 0, sizeof(p->xstate));
162
163 if ( !p->basic.xsave )
164 return;
165
166 if ( p->basic.avx )
167 {
168 xstates |= X86_XCR0_YMM;
169 xstate_size = max(xstate_size,
170 xstate_offsets[X86_XCR0_YMM_POS] +
171 xstate_sizes[X86_XCR0_YMM_POS]);
172 }
173
174 if ( p->feat.mpx )
175 {
176 xstates |= X86_XCR0_BNDREGS | X86_XCR0_BNDCSR;
177 xstate_size = max(xstate_size,
178 xstate_offsets[X86_XCR0_BNDCSR_POS] +
179 xstate_sizes[X86_XCR0_BNDCSR_POS]);
180 }
181
182 if ( p->feat.avx512f )
183 {
184 xstates |= X86_XCR0_OPMASK | X86_XCR0_ZMM | X86_XCR0_HI_ZMM;
185 xstate_size = max(xstate_size,
186 xstate_offsets[X86_XCR0_HI_ZMM_POS] +
187 xstate_sizes[X86_XCR0_HI_ZMM_POS]);
188 }
189
190 if ( p->feat.pku )
191 {
192 xstates |= X86_XCR0_PKRU;
193 xstate_size = max(xstate_size,
194 xstate_offsets[X86_XCR0_PKRU_POS] +
195 xstate_sizes[X86_XCR0_PKRU_POS]);
196 }
197
198 p->xstate.max_size = xstate_size;
199 p->xstate.xcr0_low = xstates & ~XSTATE_XSAVES_ONLY;
200 p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;
201
202 p->xstate.Da1 = Da1;
203 if ( p->xstate.xsaves )
204 {
205 p->xstate.xss_low = xstates & XSTATE_XSAVES_ONLY;
206 p->xstate.xss_high = (xstates & XSTATE_XSAVES_ONLY) >> 32;
207 }
208 else
209 xstates &= ~XSTATE_XSAVES_ONLY;
210
211 for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
212 {
213 uint64_t curr_xstate = 1ul << i;
214
215 if ( !(xstates & curr_xstate) )
216 continue;
217
218 p->xstate.comp[i].size = xstate_sizes[i];
219 p->xstate.comp[i].offset = xstate_offsets[i];
220 p->xstate.comp[i].xss = curr_xstate & XSTATE_XSAVES_ONLY;
221 p->xstate.comp[i].align = curr_xstate & xstate_align;
222 }
223 }
224
225 /*
226 * Misc adjustments to the policy. Mostly clobbering reserved fields and
227 * duplicating shared fields. Intentionally hidden fields are annotated.
228 */
229 static void recalculate_misc(struct cpuid_policy *p)
230 {
231 p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
232 p->basic.apic_id = 0; /* Dynamic. */
233
234 p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
235 p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */
236
237 p->basic.raw[0x8] = EMPTY_LEAF;
238
239 /* TODO: Rework topology logic. */
240 memset(p->topo.raw, 0, sizeof(p->topo.raw));
241
242 p->basic.raw[0xc] = EMPTY_LEAF;
243
244 p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;
245
246 /* Most of Power/RAS hidden from guests. */
247 p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
248
249 p->extd.raw[0x8].d = 0;
250
251 switch ( p->x86_vendor )
252 {
253 case X86_VENDOR_INTEL:
254 p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
255 p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
256 p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */
257
258 p->extd.vendor_ebx = 0;
259 p->extd.vendor_ecx = 0;
260 p->extd.vendor_edx = 0;
261
262 p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;
263
264 p->extd.raw[0x5] = EMPTY_LEAF;
265 p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;
266
267 p->extd.raw[0x8].a &= 0x0000ffff;
268 p->extd.raw[0x8].c = 0;
269 break;
270
271 case X86_VENDOR_AMD:
272 case X86_VENDOR_HYGON:
273 zero_leaves(p->basic.raw, 0x2, 0x3);
274 memset(p->cache.raw, 0, sizeof(p->cache.raw));
275 zero_leaves(p->basic.raw, 0x9, 0xa);
276
277 p->extd.vendor_ebx = p->basic.vendor_ebx;
278 p->extd.vendor_ecx = p->basic.vendor_ecx;
279 p->extd.vendor_edx = p->basic.vendor_edx;
280
281 p->extd.raw_fms = p->basic.raw_fms;
282 p->extd.raw[0x1].b &= 0xff00ffff;
283 p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;
284
285 p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
286 p->extd.raw[0x8].c &= 0x0003f0ff;
287
288 p->extd.raw[0x9] = EMPTY_LEAF;
289
290 zero_leaves(p->extd.raw, 0xb, 0x18);
291
292 p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */
293 p->extd.raw[0x1c] = EMPTY_LEAF; /* LWP - not supported. */
294 break;
295 }
296 }
297
298 static void __init calculate_raw_policy(void)
299 {
300 struct cpuid_policy *p = &raw_cpuid_policy;
301
302 x86_cpuid_policy_fill_native(p);
303
304 /* Nothing good will come from Xen and libx86 disagreeing on vendor. */
305 ASSERT(p->x86_vendor == boot_cpu_data.x86_vendor);
306 }
307
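/*
 * The host policy is the raw policy, clamped to the leaf ranges Xen tracks,
 * with the featureset replaced by the capabilities Xen has chosen to use
 * (boot_cpu_data), and with xstate/misc/vPMU/SVM details adjusted to match.
 */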
308 static void __init calculate_host_policy(void)
309 {
310 struct cpuid_policy *p = &host_cpuid_policy;
311
312 *p = raw_cpuid_policy;
313
314 p->basic.max_leaf =
315 min_t(uint32_t, p->basic.max_leaf, ARRAY_SIZE(p->basic.raw) - 1);
316 p->feat.max_subleaf =
317 min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
318 p->extd.max_leaf = 0x80000000 | min_t(uint32_t, p->extd.max_leaf & 0xffff,
319 ARRAY_SIZE(p->extd.raw) - 1);
320
321 cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
322 recalculate_xstate(p);
323 recalculate_misc(p);
324
325 /* When vPMU is disabled, drop it from the host policy. */
326 if ( vpmu_mode == XENPMU_MODE_OFF )
327 p->basic.raw[0xa] = EMPTY_LEAF;
328
329 if ( p->extd.svm )
330 {
331 /* Clamp to implemented features which require hardware support. */
332 p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
333 (1u << SVM_FEATURE_LBRV) |
334 (1u << SVM_FEATURE_NRIPS) |
335 (1u << SVM_FEATURE_PAUSEFILTER) |
336 (1u << SVM_FEATURE_DECODEASSISTS));
337 /* Enable features which are always emulated. */
338 p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
339 (1u << SVM_FEATURE_TSCRATEMSR));
340 }
341 }
342
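/*
 * Adjustments applied to the default policies only.  Features hidden here
 * remain present in the max policies, so they can be explicitly opted back
 * in for individual guests.
 */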
343 static void __init guest_common_default_feature_adjustments(uint32_t *fs)
344 {
345 /*
346 * IvyBridge client parts suffer from leakage of RDRAND data due to SRBDS
347 * (XSA-320 / CVE-2020-0543), and won't be receiving microcode to
348 * compensate.
349 *
350 * Mitigate by hiding RDRAND from guests by default, unless explicitly
351 * overridden on the Xen command line (cpuid=rdrand). Irrespective of the
352 * default setting, guests can use RDRAND if explicitly enabled
353 * (cpuid="host,rdrand=1") in the VM's config file, and VMs which were
354 * previously using RDRAND can migrate in.
355 */
356 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
357 boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x3a &&
358 cpu_has_rdrand && !is_forced_cpu_cap(X86_FEATURE_RDRAND) )
359 __clear_bit(X86_FEATURE_RDRAND, fs);
360 }
361
362 static void __init guest_common_feature_adjustments(uint32_t *fs)
363 {
364 /* Unconditionally claim to be able to set the hypervisor bit. */
365 __set_bit(X86_FEATURE_HYPERVISOR, fs);
366
367 /*
368 * If IBRS is offered to the guest, unconditionally offer STIBP. It is a
369 * nop on non-HT hardware, and has this behaviour to make heterogeneous
370 * setups easier to manage.
371 */
372 if ( test_bit(X86_FEATURE_IBRSB, fs) )
373 __set_bit(X86_FEATURE_STIBP, fs);
374
375 /*
376 * On hardware which supports IBRS/IBPB, we can offer IBPB independently
377 * of IBRS by using the AMD feature bit. An administrator may wish for
378 * performance reasons to offer IBPB without IBRS.
379 */
380 if ( host_cpuid_policy.feat.ibrsb )
381 __set_bit(X86_FEATURE_IBPB, fs);
382 }
383
384 static void __init calculate_pv_max_policy(void)
385 {
386 struct cpuid_policy *p = &pv_max_cpuid_policy;
387 uint32_t pv_featureset[FSCAPINTS];
388 unsigned int i;
389
390 *p = host_cpuid_policy;
391 cpuid_policy_to_featureset(p, pv_featureset);
392
393 for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
394 pv_featureset[i] &= pv_max_featuremask[i];
395
396 /*
397 * If Xen isn't virtualising MSR_SPEC_CTRL for PV guests because of
398 * administrator choice, hide the feature.
399 */
400 if ( !boot_cpu_has(X86_FEATURE_SC_MSR_PV) )
401 __clear_bit(X86_FEATURE_IBRSB, pv_featureset);
402
403 guest_common_feature_adjustments(pv_featureset);
404
405 sanitise_featureset(pv_featureset);
406 cpuid_featureset_to_policy(pv_featureset, p);
407 recalculate_xstate(p);
408
409 p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
410 }
411
412 static void __init calculate_pv_def_policy(void)
413 {
414 struct cpuid_policy *p = &pv_def_cpuid_policy;
415 uint32_t pv_featureset[FSCAPINTS];
416 unsigned int i;
417
418 *p = pv_max_cpuid_policy;
419 cpuid_policy_to_featureset(p, pv_featureset);
420
421 for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
422 pv_featureset[i] &= pv_def_featuremask[i];
423
424 guest_common_feature_adjustments(pv_featureset);
425 guest_common_default_feature_adjustments(pv_featureset);
426
427 sanitise_featureset(pv_featureset);
428 cpuid_featureset_to_policy(pv_featureset, p);
429 recalculate_xstate(p);
430 }
431
432 static void __init calculate_hvm_max_policy(void)
433 {
434 struct cpuid_policy *p = &hvm_max_cpuid_policy;
435 uint32_t hvm_featureset[FSCAPINTS];
436 unsigned int i;
437 const uint32_t *hvm_featuremask;
438
439 *p = host_cpuid_policy;
440 cpuid_policy_to_featureset(p, hvm_featureset);
441
442 hvm_featuremask = hvm_hap_supported() ?
443 hvm_hap_max_featuremask : hvm_shadow_max_featuremask;
444
445 for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
446 hvm_featureset[i] &= hvm_featuremask[i];
447
448 /*
449 * Xen can provide an (x2)APIC emulation to HVM guests even if the host's
450 * (x2)APIC isn't enabled.
451 */
452 __set_bit(X86_FEATURE_APIC, hvm_featureset);
453 __set_bit(X86_FEATURE_X2APIC, hvm_featureset);
454
455 /*
456 * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
457 * long mode (and init_amd() has cleared it out of host capabilities), but
458      * HVM guests can use it when running in protected mode.
459 */
460 if ( (boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
461 raw_cpuid_policy.basic.sep )
462 __set_bit(X86_FEATURE_SEP, hvm_featureset);
463
464 /*
465 * If Xen isn't virtualising MSR_SPEC_CTRL for HVM guests because of
466 * administrator choice, hide the feature.
467 */
468 if ( !boot_cpu_has(X86_FEATURE_SC_MSR_HVM) )
469 __clear_bit(X86_FEATURE_IBRSB, hvm_featureset);
470
471 /*
472 * With VT-x, some features are only supported by Xen if dedicated
473 * hardware support is also available.
474 */
475 if ( cpu_has_vmx )
476 {
477 if ( !cpu_has_vmx_mpx )
478 __clear_bit(X86_FEATURE_MPX, hvm_featureset);
479
480 if ( !cpu_has_vmx_xsaves )
481 __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
482 }
483
484 guest_common_feature_adjustments(hvm_featureset);
485
486 sanitise_featureset(hvm_featureset);
487 cpuid_featureset_to_policy(hvm_featureset, p);
488 recalculate_xstate(p);
489 }
490
491 static void __init calculate_hvm_def_policy(void)
492 {
493 struct cpuid_policy *p = &hvm_def_cpuid_policy;
494 uint32_t hvm_featureset[FSCAPINTS];
495 unsigned int i;
496 const uint32_t *hvm_featuremask;
497
498 *p = hvm_max_cpuid_policy;
499 cpuid_policy_to_featureset(p, hvm_featureset);
500
501 hvm_featuremask = hvm_hap_supported() ?
502 hvm_hap_def_featuremask : hvm_shadow_def_featuremask;
503
504 for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
505 hvm_featureset[i] &= hvm_featuremask[i];
506
507 guest_common_feature_adjustments(hvm_featureset);
508 guest_common_default_feature_adjustments(hvm_featureset);
509
510 sanitise_featureset(hvm_featureset);
511 cpuid_featureset_to_policy(hvm_featureset, p);
512 recalculate_xstate(p);
513 }
514
515 void __init init_guest_cpuid(void)
516 {
517 calculate_raw_policy();
518 calculate_host_policy();
519
520 if ( IS_ENABLED(CONFIG_PV) )
521 {
522 calculate_pv_max_policy();
523 calculate_pv_def_policy();
524 }
525
526 if ( hvm_enabled )
527 {
528 calculate_hvm_max_policy();
529 calculate_hvm_def_policy();
530 }
531 }
532
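/*
 * Check that a CPU's capabilities are a superset of the boot CPU's, logging
 * any capability words which have bits missing.
 */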
533 bool recheck_cpu_features(unsigned int cpu)
534 {
535 bool okay = true;
536 struct cpuinfo_x86 c;
537 const struct cpuinfo_x86 *bsp = &boot_cpu_data;
538 unsigned int i;
539
540 identify_cpu(&c);
541
542 for ( i = 0; i < NCAPINTS; ++i )
543 {
544 if ( !(~c.x86_capability[i] & bsp->x86_capability[i]) )
545 continue;
546
547 printk(XENLOG_ERR "CPU%u: cap[%2u] is %08x (expected %08x)\n",
548 cpu, i, c.x86_capability[i], bsp->x86_capability[i]);
549 okay = false;
550 }
551
552 return okay;
553 }
554
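/*
 * Re-derive a domain's CPUID policy after the toolstack (or Xen) has
 * adjusted it: clamp leaves and features against the appropriate max policy
 * and the domain's configuration, then recompute derived state.
 */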
555 void recalculate_cpuid_policy(struct domain *d)
556 {
557 struct cpuid_policy *p = d->arch.cpuid;
558 const struct cpuid_policy *max = is_pv_domain(d)
559 ? (IS_ENABLED(CONFIG_PV) ? &pv_max_cpuid_policy : NULL)
560 : (IS_ENABLED(CONFIG_HVM) ? &hvm_max_cpuid_policy : NULL);
561 uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
562 unsigned int i;
563
564 if ( !max )
565 {
566 ASSERT_UNREACHABLE();
567 return;
568 }
569
570 p->x86_vendor = x86_cpuid_lookup_vendor(
571 p->basic.vendor_ebx, p->basic.vendor_ecx, p->basic.vendor_edx);
572
573 p->basic.max_leaf = min(p->basic.max_leaf, max->basic.max_leaf);
574 p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
575 p->extd.max_leaf = 0x80000000 | min(p->extd.max_leaf & 0xffff,
576 ((p->x86_vendor & (X86_VENDOR_AMD |
577 X86_VENDOR_HYGON))
578 ? CPUID_GUEST_NR_EXTD_AMD
579 : CPUID_GUEST_NR_EXTD_INTEL) - 1);
580
581 cpuid_policy_to_featureset(p, fs);
582 cpuid_policy_to_featureset(max, max_fs);
583
584 if ( is_hvm_domain(d) )
585 {
586 /*
587 * HVM domains using Shadow paging have further restrictions on their
588 * available paging features.
589 */
590 if ( !hap_enabled(d) )
591 {
592 for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
593 max_fs[i] &= hvm_shadow_max_featuremask[i];
594 }
595
596 /* Hide nested-virt if it hasn't been explicitly configured. */
597 if ( !nestedhvm_enabled(d) )
598 {
599 __clear_bit(X86_FEATURE_VMX, max_fs);
600 __clear_bit(X86_FEATURE_SVM, max_fs);
601 }
602 }
603
604 /*
605 * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY. These bits
606 * affect how to interpret topology information in other cpuid leaves.
607 */
608 __set_bit(X86_FEATURE_HTT, max_fs);
609 __set_bit(X86_FEATURE_X2APIC, max_fs);
610 __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);
611
612 /*
613 * 32bit PV domains can't use any Long Mode features, and cannot use
614 * SYSCALL on non-AMD hardware.
615 */
616 if ( is_pv_32bit_domain(d) )
617 {
618 __clear_bit(X86_FEATURE_LM, max_fs);
619 if ( !(boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON)) )
620 __clear_bit(X86_FEATURE_SYSCALL, max_fs);
621 }
622
623 /*
624 * ITSC is masked by default (so domains are safe to migrate), but a
625 * toolstack which has configured disable_migrate or vTSC for a domain may
626 * safely select it, and needs a way of doing so.
627 */
628 if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
629 __set_bit(X86_FEATURE_ITSC, max_fs);
630
631 /*
632 * On hardware with MSR_TSX_CTRL, the admin may have elected to disable
633 * TSX and hide the feature bits. Migrating-in VMs may have been booted
634      * pre-mitigation when the TSX features were visible.
635 *
636 * This situation is compatible (albeit with a perf hit to any TSX code in
637 * the guest), so allow the feature bits to remain set.
638 */
639 if ( cpu_has_tsx_ctrl )
640 {
641 __set_bit(X86_FEATURE_HLE, max_fs);
642 __set_bit(X86_FEATURE_RTM, max_fs);
643 }
644
645     /* Clamp the toolstack's choices to reality. */
646 for ( i = 0; i < ARRAY_SIZE(fs); i++ )
647 fs[i] &= max_fs[i];
648
649 if ( p->basic.max_leaf < XSTATE_CPUID )
650 __clear_bit(X86_FEATURE_XSAVE, fs);
651
652 sanitise_featureset(fs);
653
654 /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
655 fs[FEATURESET_7b0] &= ~special_features[FEATURESET_7b0];
656 fs[FEATURESET_7b0] |= (host_cpuid_policy.feat._7b0 &
657 special_features[FEATURESET_7b0]);
658
659 cpuid_featureset_to_policy(fs, p);
660
661 /* Pass host cacheline size through to guests. */
662 p->basic.clflush_size = max->basic.clflush_size;
663
664 p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
665 p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
666 paging_max_paddr_bits(d));
667 p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
668 (p->basic.pae || p->basic.pse36) ? 36 : 32);
669
670 p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
671
672 recalculate_xstate(p);
673 recalculate_misc(p);
674
675 for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
676 {
677 if ( p->cache.subleaf[i].type >= 1 &&
678 p->cache.subleaf[i].type <= 3 )
679 {
680 /* Subleaf has a valid cache type. Zero reserved fields. */
681 p->cache.raw[i].a &= 0xffffc3ffu;
682 p->cache.raw[i].d &= 0x00000007u;
683 }
684 else
685 {
686 /* Subleaf is not valid. Zero the rest of the union. */
687 zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
688 break;
689 }
690 }
691
692 if ( vpmu_mode == XENPMU_MODE_OFF ||
693 ((vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(d)) )
694 p->basic.raw[0xa] = EMPTY_LEAF;
695
696 if ( !p->extd.svm )
697 p->extd.raw[0xa] = EMPTY_LEAF;
698
699 if ( !p->extd.page1gb )
700 p->extd.raw[0x19] = EMPTY_LEAF;
701 }
702
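/*
 * Allocate a domain's CPUID policy, seeded from the appropriate default
 * policy, and bring it into sync with the domain's configuration.
 */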
703 int init_domain_cpuid_policy(struct domain *d)
704 {
705 struct cpuid_policy *p = is_pv_domain(d)
706 ? (IS_ENABLED(CONFIG_PV) ? &pv_def_cpuid_policy : NULL)
707 : (IS_ENABLED(CONFIG_HVM) ? &hvm_def_cpuid_policy : NULL);
708
709 if ( !p )
710 {
711 ASSERT_UNREACHABLE();
712 return -EOPNOTSUPP;
713 }
714
715 p = xmemdup(p);
716 if ( !p )
717 return -ENOMEM;
718
719 if ( d->disable_migrate )
720 p->extd.itsc = cpu_has_itsc;
721
722 /*
723 * Expose the "hardware speculation behaviour" bits of ARCH_CAPS to dom0,
724 * so dom0 can turn off workarounds as appropriate. Temporary, until the
725 * domain policy logic gains a better understanding of MSRs.
726 */
727 if ( is_hardware_domain(d) && boot_cpu_has(X86_FEATURE_ARCH_CAPS) )
728 p->feat.arch_caps = true;
729
730 d->arch.cpuid = p;
731
732 recalculate_cpuid_policy(d);
733
734 return 0;
735 }
736
737 void guest_cpuid(const struct vcpu *v, uint32_t leaf,
738 uint32_t subleaf, struct cpuid_leaf *res)
739 {
740 const struct domain *d = v->domain;
741 const struct cpuid_policy *p = d->arch.cpuid;
742
743 *res = EMPTY_LEAF;
744
745 /*
746 * First pass:
747 * - Perform max_leaf/subleaf calculations. Out-of-range leaves return
748 * all zeros, following the AMD model.
749 * - Fill in *res with static data.
750 * - Dispatch the virtualised leaves to their respective handlers.
751 */
752 switch ( leaf )
753 {
754 case 0 ... CPUID_GUEST_NR_BASIC - 1:
755 ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
756 if ( leaf > min_t(uint32_t, p->basic.max_leaf,
757 ARRAY_SIZE(p->basic.raw) - 1) )
758 return;
759
760 switch ( leaf )
761 {
762 case 0x4:
763 if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
764 return;
765
766 *res = array_access_nospec(p->cache.raw, subleaf);
767 break;
768
769 case 0x7:
770 ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
771 if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
772 ARRAY_SIZE(p->feat.raw) - 1) )
773 return;
774
775 *res = array_access_nospec(p->feat.raw, subleaf);
776 break;
777
778 case 0xb:
779 if ( subleaf >= ARRAY_SIZE(p->topo.raw) )
780 return;
781
782 *res = array_access_nospec(p->topo.raw, subleaf);
783 break;
784
785 case XSTATE_CPUID:
786 if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
787 return;
788
789 *res = array_access_nospec(p->xstate.raw, subleaf);
790 break;
791
792 default:
793 *res = array_access_nospec(p->basic.raw, leaf);
794 break;
795 }
796 break;
797
798 case 0x40000000 ... 0x400000ff:
799 if ( is_viridian_domain(d) )
800 return cpuid_viridian_leaves(v, leaf, subleaf, res);
801
802 /*
803 * Fallthrough.
804 *
805 * Intel reserve up until 0x4fffffff for hypervisor use. AMD reserve
806 * only until 0x400000ff, but we already use double that.
807 */
808 case 0x40000100 ... 0x400001ff:
809 return cpuid_hypervisor_leaves(v, leaf, subleaf, res);
810
811 case 0x80000000 ... 0x80000000 + CPUID_GUEST_NR_EXTD - 1:
812 ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
813 if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
814 ARRAY_SIZE(p->extd.raw) - 1) )
815 return;
816
817 *res = array_access_nospec(p->extd.raw, leaf & 0xffff);
818 break;
819
820 default:
821 return;
822 }
823
824 /*
825 * Skip dynamic adjustments if we are in the wrong context.
826 *
827      * All dynamic adjustments depend on current register state, which will
828 * be stale if the vcpu is running elsewhere. It is simpler, quicker, and
829 * more reliable for the caller to do nothing (consistently) than to hand
830 * back stale data which it can't use safely.
831 */
832 if ( v != current )
833 return;
834
835 /*
836 * Second pass:
837 * - Dynamic adjustments
838 */
839 switch ( leaf )
840 {
841 const struct cpu_user_regs *regs;
842
843 case 0x1:
844 /* TODO: Rework topology logic. */
845 res->b &= 0x00ffffffu;
846 if ( is_hvm_domain(d) )
847 res->b |= (v->vcpu_id * 2) << 24;
848
849 /* TODO: Rework vPMU control in terms of toolstack choices. */
850 if ( vpmu_available(v) &&
851 vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
852 {
853 res->d |= cpufeat_mask(X86_FEATURE_DS);
854             if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
855                 res->c |= cpufeat_mask(X86_FEATURE_DTES64);
856             if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
857 res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
858 }
859
860 if ( is_hvm_domain(d) )
861 {
862 /* OSXSAVE clear in policy. Fast-forward CR4 back in. */
863 if ( v->arch.hvm.guest_cr[4] & X86_CR4_OSXSAVE )
864 res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
865 }
866 else /* PV domain */
867 {
868 regs = guest_cpu_user_regs();
869
870 /*
871 * !!! OSXSAVE handling for PV guests is non-architectural !!!
872 *
873 * Architecturally, the correct code here is simply:
874 *
875 * if ( v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE )
876 * c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
877 *
878 * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
879 * the XSAVE cpuid flag leaked into guests despite the feature not
880              * being available for use), buggy workarounds were introduced to
881 * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
882 * that Xen also incorrectly leaked OSXSAVE into the guest.
883 *
884              * Furthermore, providing architectural OSXSAVE behaviour to
885 * many Linux PV guests triggered a further kernel bug when the
886 * fpu code observes that XSAVEOPT is available, assumes that
887 * xsave state had been set up for the task, and follows a wild
888 * pointer.
889 *
890 * Older Linux PVOPS kernels however do require architectural
891 * behaviour. They observe Xen's leaked OSXSAVE and assume they
892 * can already use XSETBV, dying with a #UD because the shadowed
893 * CR4.OSXSAVE is clear. This behaviour has been adjusted in all
894 * observed cases via stable backports of the above changeset.
895 *
896 * Therefore, the leaking of Xen's OSXSAVE setting has become a
897              * de facto part of the PV ABI and can't reasonably be corrected.
898 * It can however be restricted to only the enlightened CPUID
899 * view, as seen by the guest kernel.
900 *
901              * The following situations and logic now apply:
902 *
903 * - Hardware without CPUID faulting support and native CPUID:
904              *    There is nothing Xen can do here. The host's XSAVE flag will
905 * leak through and Xen's OSXSAVE choice will leak through.
906 *
907 * In the case that the guest kernel has not set up OSXSAVE, only
908 * SSE will be set in xcr0, and guest userspace can't do too much
909 * damage itself.
910 *
911 * - Enlightened CPUID or CPUID faulting available:
912 * Xen can fully control what is seen here. When the guest has
913 * been configured to have XSAVE available, guest kernels need
914 * to see the leaked OSXSAVE via the enlightened path, but
915              *    guest userspace and the native CPUID instruction are given
916              *    architectural behaviour.
917 *
918              * Emulated vs Faulted CPUID is distinguished based on whether a
919 * #UD or #GP is currently being serviced.
920 */
921 /* OSXSAVE clear in policy. Fast-forward CR4 back in. */
922 if ( (v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE) ||
923 (p->basic.xsave &&
924 regs->entry_vector == TRAP_invalid_op &&
925 guest_kernel_mode(v, regs) &&
926 (read_cr4() & X86_CR4_OSXSAVE)) )
927 res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
928
929 /*
930 * At the time of writing, a PV domain is the only viable option
931 * for Dom0. Several interactions between dom0 and Xen for real
932 * hardware setup have unfortunately been implemented based on
933 * state which incorrectly leaked into dom0.
934 *
935 * These leaks are retained for backwards compatibility, but
936              * restricted to the hardware domain's kernel only.
937 */
938 if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
939 {
940 /*
941 * MONITOR never leaked into PV guests, as PV guests cannot
942 * use the MONITOR/MWAIT instructions. As such, they require
943                  * the feature not to be present in emulated CPUID.
944 *
945 * Modern PVOPS Linux try to be cunning and use native CPUID
946 * to see if the hardware actually supports MONITOR, and by
947 * extension, deep C states.
948 *
949 * If the feature is seen, deep-C state information is
950 * obtained from the DSDT and handed back to Xen via the
951 * XENPF_set_processor_pminfo hypercall.
952 *
953 * This mechanism is incompatible with an HVM-based hardware
954 * domain, and also with CPUID Faulting.
955 *
956 * Luckily, Xen can be just as 'cunning', and distinguish an
957 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
958 * fault is currently being serviced. Yuck...
959 */
960 if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
961 res->c |= cpufeat_mask(X86_FEATURE_MONITOR);
962
963 /*
964 * While MONITOR never leaked into PV guests, EIST always used
965 * to.
966 *
967 * Modern PVOPS Linux will only parse P state information from
968 * the DSDT and return it to Xen if EIST is seen in the
969 * emulated CPUID information.
970 */
971 if ( cpu_has_eist )
972 res->c |= cpufeat_mask(X86_FEATURE_EIST);
973 }
974 }
975 goto common_leaf1_adjustments;
976
977 case 0x5:
978 /*
979 * Leak the hardware MONITOR leaf under the same conditions that the
980 * MONITOR feature flag is leaked. See above for details.
981 */
982 regs = guest_cpu_user_regs();
983 if ( is_pv_domain(d) && is_hardware_domain(d) &&
984 guest_kernel_mode(v, regs) && cpu_has_monitor &&
985 regs->entry_vector == TRAP_gp_fault )
986 *res = raw_cpuid_policy.basic.raw[5];
987 break;
988
989 case 0x7:
990 switch ( subleaf )
991 {
992 case 0:
993 /* OSPKE clear in policy. Fast-forward CR4 back in. */
994 if ( (is_pv_domain(d)
995 ? v->arch.pv.ctrlreg[4]
996 : v->arch.hvm.guest_cr[4]) & X86_CR4_PKE )
997 res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
998 break;
999 }
1000 break;
1001
1002 case 0xa:
1003 /* TODO: Rework vPMU control in terms of toolstack choices. */
1004 if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
1005 !vpmu_available(v) )
1006 *res = EMPTY_LEAF;
1007 else
1008 {
1009 /* Report at most v3 since that's all we currently emulate. */
1010 if ( (res->a & 0xff) > 3 )
1011 res->a = (res->a & ~0xff) | 3;
1012 }
1013 break;
1014
1015 case 0xb:
1016 /*
1017 * In principle, this leaf is Intel-only. In practice, it is tightly
1018 * coupled with x2apic, and we offer an x2apic-capable APIC emulation
1019 * to guests on AMD hardware as well.
1020 *
1021 * TODO: Rework topology logic.
1022 */
1023 if ( p->basic.x2apic )
1024 {
1025 *(uint8_t *)&res->c = subleaf;
1026
1027 /* Fix the x2APIC identifier. */
1028 res->d = v->vcpu_id * 2;
1029 }
1030 break;
1031
1032 case XSTATE_CPUID:
1033 switch ( subleaf )
1034 {
1035 case 1:
1036 if ( p->xstate.xsaves )
1037 {
1038 /*
1039 * TODO: Figure out what to do for XSS state. VT-x manages
1040 * host vs guest MSR_XSS automatically, so as soon as we start
1041 * supporting any XSS states, the wrong XSS will be in
1042 * context.
1043 */
1044 BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);
1045
1046 /*
1047 * Read CPUID[0xD,0/1].EBX from hardware. They vary with
1048              * enabled XSTATE, and appropriate XCR0|XSS are in context.
1049 */
1050 case 0:
1051 res->b = cpuid_count_ebx(leaf, subleaf);
1052 }
1053 break;
1054 }
1055 break;
1056
1057 case 0x80000001:
1058 /* SYSCALL is hidden outside of long mode on Intel. */
1059 if ( p->x86_vendor == X86_VENDOR_INTEL &&
1060 is_hvm_domain(d) && !hvm_long_mode_active(v) )
1061 res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);
1062
1063 common_leaf1_adjustments:
1064 if ( is_hvm_domain(d) )
1065 {
1066 /* Fast-forward MSR_APIC_BASE.EN. */
1067 if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
1068 res->d &= ~cpufeat_mask(X86_FEATURE_APIC);
1069
1070 /*
1071 * PSE36 is not supported in shadow mode. This bit should be
1072 * clear in hvm_shadow_max_featuremask[].
1073 *
1074 * However, an unspecified version of Hyper-V from 2011 refuses to
1075              * start, citing "cpu does not provide required hw features", if it
1076 * can't see PSE36.
1077 *
1078 * As a workaround, leak the toolstack-provided PSE36 value into a
1079 * shadow guest if the guest is already using PAE paging (and
1080 * won't care about reverting back to PSE paging). Otherwise,
1081 * knoble it, so a 32bit guest doesn't get the impression that it
1082              * nobble it, so a 32bit guest doesn't get the impression that it
1083 */
1084 if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
1085 res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
1086 }
1087 else /* PV domain */
1088 {
1089 /*
1090              * MTRR used to unconditionally leak into PV guests. They cannot use
1091              * the MTRR infrastructure at all, and shouldn't be able to see the
1092 * feature.
1093 *
1094 * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
1095 * trying to use the associated MSRs. Xenolinux-based PV dom0's
1096 * however use the MTRR feature as an indication of the presence
1097 * of the XENPF_{add,del,read}_memtype hypercalls.
1098 */
1099 if ( is_hardware_domain(d) && cpu_has_mtrr &&
1100 guest_kernel_mode(v, guest_cpu_user_regs()) )
1101 res->d |= cpufeat_mask(X86_FEATURE_MTRR);
1102 }
1103 break;
1104 }
1105 }
1106
1107 static void __init __maybe_unused build_assertions(void)
1108 {
1109 BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
1110 BUILD_BUG_ON(ARRAY_SIZE(special_features) != FSCAPINTS);
1111 BUILD_BUG_ON(ARRAY_SIZE(pv_max_featuremask) != FSCAPINTS);
1112 BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_max_featuremask) != FSCAPINTS);
1113 BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_max_featuremask) != FSCAPINTS);
1114 BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);
1115
1116 /* Find some more clever allocation scheme if this trips. */
1117 BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);
1118
1119 BUILD_BUG_ON(sizeof(raw_cpuid_policy.basic) !=
1120 sizeof(raw_cpuid_policy.basic.raw));
1121 BUILD_BUG_ON(sizeof(raw_cpuid_policy.feat) !=
1122 sizeof(raw_cpuid_policy.feat.raw));
1123 BUILD_BUG_ON(sizeof(raw_cpuid_policy.xstate) !=
1124 sizeof(raw_cpuid_policy.xstate.raw));
1125 BUILD_BUG_ON(sizeof(raw_cpuid_policy.extd) !=
1126 sizeof(raw_cpuid_policy.extd.raw));
1127 }
1128
1129 /*
1130 * Local variables:
1131 * mode: C
1132 * c-file-style: "BSD"
1133 * c-basic-offset: 4
1134 * tab-width: 4
1135 * indent-tabs-mode: nil
1136 * End:
1137 */
1138