1 /******************************************************************************
2 * dom0_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
6
7 #include <xen/init.h>
8 #include <xen/iocap.h>
9 #include <xen/libelf.h>
10 #include <xen/param.h>
11 #include <xen/pfn.h>
12 #include <xen/sched.h>
13 #include <xen/softirq.h>
14
15 #include <asm/amd.h>
16 #include <asm/dom0_build.h>
17 #include <asm/guest.h>
18 #include <asm/hpet.h>
19 #include <asm/io_apic.h>
20 #include <asm/p2m.h>
21 #include <asm/setup.h>
22
/*
 * One parsed "dom0_mem" item: an absolute page count plus a percentage of
 * available host memory, optionally to be subtracted from (rather than
 * taken out of) the available amount.
 */
struct memsize {
    long nr_pages;        /* Absolute part, in pages. */
    unsigned int percent; /* Percentage of available memory to add. */
    bool minus;           /* Item was prefixed '-': subtract from avail. */
};
28
/* Exact, minimum and maximum dom0 sizes from the "dom0_mem" option. */
static struct memsize __initdata dom0_size;
static struct memsize __initdata dom0_min_size;
static struct memsize __initdata dom0_max_size = { .nr_pages = LONG_MAX };
/* Set once "dom0_mem" was seen, even when ignored (pv-shim mode). */
static bool __initdata dom0_mem_set;
33
memsize_gt_zero(const struct memsize * sz)34 static bool __init memsize_gt_zero(const struct memsize *sz)
35 {
36 return !sz->minus && sz->nr_pages;
37 }
38
get_memsize(const struct memsize * sz,unsigned long avail)39 static unsigned long __init get_memsize(const struct memsize *sz,
40 unsigned long avail)
41 {
42 unsigned long pages;
43
44 pages = sz->nr_pages + sz->percent * avail / 100;
45 return sz->minus ? avail - pages : pages;
46 }
47
48 /*
49 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
50 *
51 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
52 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
53 * <amt>: The precise amount of memory to allocate for dom0.
54 *
55 * The format of <min_amt>, <max_amt> and <amt> is as follows:
56 * <size> | <frac>% | <size>+<frac>%
57 * <size> is a size value like 1G (1 GByte), <frac> is percentage of host
58 * memory (so 1G+10% means 10 percent of host memory + 1 GByte).
59 *
60 * Notes:
61 * 1. <amt> is clamped from below by <min_amt> and from above by available
62 * memory and <max_amt>
63 * 2. <min_amt> is clamped from above by available memory and <max_amt>
64 * 3. <min_amt> is ignored if it is greater than <max_amt>
65 * 4. If <amt> is not specified, it is calculated as follows:
66 * "All of memory is allocated to domain 0, minus 1/16th which is reserved
67 * for uses such as DMA buffers (the reservation is clamped to 128MB)."
68 *
69 * Each value can be specified as positive or negative:
70 * If +ve: The specified amount is an absolute value.
71 * If -ve: The specified amount is subtracted from total available memory.
72 */
/*
 * Parse one amount item: <size>, <frac>%, or <size>+<frac>% (in either
 * order), optionally preceded by '-'.
 *
 * s:  string to parse.
 * ps: set to the first character after the item (',' or NUL on success).
 * sz: written only on success.
 *
 * Returns 0 on success, -EINVAL on malformed input.
 */
static int __init parse_amt(const char *s, const char **ps, struct memsize *sz)
{
    unsigned long val;
    struct memsize tmp = { };
    unsigned int items = 0;

    /* Leading '-' means "subtract from available memory". */
    tmp.minus = (*s == '-');
    if ( tmp.minus )
        s++;

    do
    {
        if ( !isdigit(*s) )
            return -EINVAL;

        val = parse_size_and_unit(s, ps);
        s = *ps;
        if ( *s == '%' )
        {
            /* Percentages must be below 100. */
            if ( val >= 100 )
                return -EINVAL;
            tmp.percent = val;
            s++;
            items++; /* No other item allowed. */
        }
        else
        {
            /* <size> item must be first one. */
            if ( items )
                return -EINVAL;
            tmp.nr_pages = val >> PAGE_SHIFT;
        }
        items++;
    } while ( *s++ == '+' && items < 2 );

    /* Undo the final post-increment; s now points at the terminator. */
    *ps = --s;
    if ( *s && *s != ',' )
        return -EINVAL;

    *sz = tmp;

    return 0;
}
116
/*
 * Parse the "dom0_mem" command line option (syntax documented in the
 * comment ahead of parse_amt() above).
 *
 * Returns 0 on success, -EINVAL on malformed input.
 */
static int __init parse_dom0_mem(const char *s)
{
    int ret;

    /* Record that the option was given, even when it is ignored below. */
    dom0_mem_set = true;

    /* xen-shim uses shim_mem parameter instead of dom0_mem */
    if ( pv_shim )
    {
        printk("Ignoring dom0_mem param in pv-shim mode\n");
        return 0;
    }

    do {
        if ( !strncmp(s, "min:", 4) )
            ret = parse_amt(s + 4, &s, &dom0_min_size);
        else if ( !strncmp(s, "max:", 4) )
            ret = parse_amt(s + 4, &s, &dom0_max_size);
        else
            ret = parse_amt(s, &s, &dom0_size);
    } while ( *s++ == ',' && !ret );

    /* s[-1] non-zero means parsing stopped before the string's end. */
    return s[-1] ? -EINVAL : ret;
}
custom_param("dom0_mem", parse_dom0_mem);
142
/* Lower/upper vCPU count bounds from the "dom0_max_vcpus" option. */
static unsigned int __initdata opt_dom0_max_vcpus_min = 1;
static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX;
145
parse_dom0_max_vcpus(const char * s)146 static int __init parse_dom0_max_vcpus(const char *s)
147 {
148 if ( *s == '-' ) /* -M */
149 opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0);
150 else /* N, N-, or N-M */
151 {
152 opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0);
153 if ( opt_dom0_max_vcpus_min == 0 )
154 opt_dom0_max_vcpus_min = 1;
155 if ( !*s ) /* N */
156 opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min;
157 else if ( *s++ == '-' && *s ) /* N-M */
158 opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0);
159 }
160
161 return *s ? -EINVAL : 0;
162 }
163 custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
164
/* Number of valid entries in dom0_pxms[]. */
static __initdata unsigned int dom0_nr_pxms;
/* PXM ids given via "dom0_nodes"; unused slots hold ~0. */
static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
    { [0 ... MAX_NUMNODES - 1] = ~0 };
/* "dom0_nodes=relaxed" vs "strict" affinity selection. */
bool __initdata dom0_affinity_relaxed;
169
/*
 * Parse the "dom0_nodes" command line option: a comma separated list of
 * PXM ids (NUMA proximity domains to confine dom0 to) and/or the keywords
 * "relaxed" / "strict" selecting the affinity mode.
 *
 * Returns 0 on success, -E2BIG when too many ids were given, -EINVAL on
 * other malformed input.
 */
static int __init parse_dom0_nodes(const char *s)
{
    do {
        if ( isdigit(*s) )
        {
            if ( dom0_nr_pxms >= ARRAY_SIZE(dom0_pxms) )
                return -E2BIG;
            dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
            /* Only count the entry when it wasn't followed by garbage. */
            if ( !*s || *s == ',' )
                ++dom0_nr_pxms;
        }
        else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
        {
            dom0_affinity_relaxed = true;
            s += 7;
        }
        else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
        {
            dom0_affinity_relaxed = false;
            s += 6;
        }
        else
            return -EINVAL;
    } while ( *s++ == ',' );

    /* s[-1] non-zero means parsing stopped at unexpected input. */
    return s[-1] ? -EINVAL : 0;
}
custom_param("dom0_nodes", parse_dom0_nodes);
198
/* CPUs and NUMA nodes dom0 is to use; filled by dom0_max_vcpus(). */
cpumask_t __initdata dom0_cpus;
static nodemask_t __initdata dom0_nodes;
201
/*
 * Determine the number of vCPUs to give dom0, as a side effect filling
 * dom0_nodes and dom0_cpus with the nodes/CPUs dom0 is going to use.
 */
unsigned int __init dom0_max_vcpus(void)
{
    unsigned int i, max_vcpus, limit;
    nodeid_t node;

    if ( pv_shim )
    {
        nodes_setall(dom0_nodes);

        /*
         * When booting in shim mode APs are not started until the guest brings
         * other vCPUs up.
         */
        cpumask_set_cpu(0, &dom0_cpus);

        /* On PV shim mode allow the guest to have as many CPUs as available. */
        return nr_cpu_ids;
    }

    /* Map the "dom0_nodes" PXM ids to NUMA nodes, dropping unknown ones. */
    for ( i = 0; i < dom0_nr_pxms; ++i )
        if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
            node_set(node, dom0_nodes);
    nodes_and(dom0_nodes, dom0_nodes, node_online_map);
    /* Fall back to all online nodes if nothing usable was specified. */
    if ( nodes_empty(dom0_nodes) )
        dom0_nodes = node_online_map;
    /* Collect the CPUs of those nodes, restricted to cpupool0's CPUs. */
    for_each_node_mask ( node, dom0_nodes )
        cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool_valid_cpus(cpupool0));
    if ( cpumask_empty(&dom0_cpus) )
        cpumask_copy(&dom0_cpus, cpupool_valid_cpus(cpupool0));

    /* Start from the CPU count, then apply the command line bounds... */
    max_vcpus = cpumask_weight(&dom0_cpus);
    if ( opt_dom0_max_vcpus_min > max_vcpus )
        max_vcpus = opt_dom0_max_vcpus_min;
    if ( opt_dom0_max_vcpus_max < max_vcpus )
        max_vcpus = opt_dom0_max_vcpus_max;
    /* ...and the vCPU limit of the selected guest type. */
    limit = opt_dom0_pvh ? HVM_MAX_VCPUS : MAX_VIRT_CPUS;
    if ( max_vcpus > limit )
        max_vcpus = limit;

    return max_vcpus;
}
245
alloc_dom0_vcpu0(struct domain * dom0)246 struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
247 {
248 dom0->node_affinity = dom0_nodes;
249 dom0->auto_node_affinity = !dom0_nr_pxms;
250
251 return vcpu_create(dom0, 0);
252 }
253
#ifdef CONFIG_SHADOW_PAGING
/* "dom0=shadow": use shadow paging for dom0. */
bool __initdata opt_dom0_shadow;
#endif
/* "dom0=pv|pvh": build dom0 as PVH (true) or PV (false). */
bool __initdata opt_dom0_pvh = !IS_ENABLED(CONFIG_PV);
/* "dom0=verbose": verbose dom0 build output. */
bool __initdata opt_dom0_verbose = IS_ENABLED(CONFIG_VERBOSE_DEBUG);
259
/*
 * Parse the "dom0=" command line option: a comma separated list of
 * "pv" / "pvh" (guest type) and the booleans "shadow" (with
 * CONFIG_SHADOW_PAGING), "verbose" and "cpuid-faulting" (PV only).
 *
 * Returns 0 on success, -EINVAL if any element wasn't recognized.
 */
static int __init parse_dom0_param(const char *s)
{
    const char *ss;
    int rc = 0;

    do {
        int val;

        /* Isolate the current list element: [s, ss). */
        ss = strchr(s, ',');
        if ( !ss )
            ss = strchr(s, '\0');

        if ( IS_ENABLED(CONFIG_PV) && !cmdline_strcmp(s, "pv") )
            opt_dom0_pvh = false;
        else if ( IS_ENABLED(CONFIG_HVM) && !cmdline_strcmp(s, "pvh") )
            opt_dom0_pvh = true;
#ifdef CONFIG_SHADOW_PAGING
        else if ( (val = parse_boolean("shadow", s, ss)) >= 0 )
            opt_dom0_shadow = val;
#endif
        else if ( (val = parse_boolean("verbose", s, ss)) >= 0 )
            opt_dom0_verbose = val;
        else if ( IS_ENABLED(CONFIG_PV) &&
                  (val = parse_boolean("cpuid-faulting", s, ss)) >= 0 )
            opt_dom0_cpuid_faulting = val;
        else
            rc = -EINVAL;

        s = ss + 1;
    } while ( *ss );

    return rc;
}
custom_param("dom0", parse_dom0_param);
294
/* "dom0_ioports_disable": hex port ranges to deny dom0 access to. */
static char __initdata opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

/* "ro-hpet": map the HPET read-only for dom0 (default on). */
static bool __initdata ro_hpet = true;
boolean_param("ro-hpet", ro_hpet);

/* Allocation flags used when populating dom0's memory. */
unsigned int __initdata dom0_memflags = MEMF_no_dma|MEMF_exact_node;
302
dom0_paging_pages(const struct domain * d,unsigned long nr_pages)303 unsigned long __init dom0_paging_pages(const struct domain *d,
304 unsigned long nr_pages)
305 {
306 /* Copied from: libxl_get_required_shadow_memory() */
307 unsigned long memkb = nr_pages * (PAGE_SIZE / 1024);
308
309 memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
310
311 return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
312 }
313
/*
 * Compute the number of pages dom0 is to get, based on the "dom0_mem"
 * settings (or CONFIG_DOM0_MEM when unset) and the memory available on
 * the nodes dom0 is confined to.  Also sets d->max_pages.  For PV domains
 * without a XEN_ELFNOTE_INIT_P2M note the result may additionally be
 * clipped to leave virtual space for the initial page tables.
 */
unsigned long __init dom0_compute_nr_pages(
    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
{
    nodeid_t node;
    unsigned long avail = 0, nr_pages, min_pages, max_pages;
    bool need_paging;

    /* The ordering of operands is to work around a clang5 issue. */
    if ( CONFIG_DOM0_MEM[0] && !dom0_mem_set )
        parse_dom0_mem(CONFIG_DOM0_MEM);

    /* Count free heap pages plus those still holding the boot modules. */
    for_each_node_mask ( node, dom0_nodes )
        avail += avail_domheap_pages_region(node, 0, 0) +
                 initial_images_nrpages(node);

    /* Reserve memory for further dom0 vcpu-struct allocations... */
    avail -= (d->max_vcpus - 1UL)
             << get_order_from_bytes(sizeof(struct vcpu));
    /* ...and compat_l4's, if needed. */
    if ( is_pv_32bit_domain(d) )
        avail -= d->max_vcpus - 1;

    /* Reserve memory for iommu_dom0_init() (rough estimate). */
    if ( is_iommu_enabled(d) )
    {
        unsigned int s;

        /* One entry per 512-entry (9-bit) page table level. */
        for ( s = 9; s < BITS_PER_LONG; s += 9 )
            avail -= max_pdx >> s;
    }

    need_paging = is_hvm_domain(d) &&
        (!iommu_use_hap_pt(d) || !paging_mode_hap(d));
    /* Second iteration (if any) re-sizes after deducting the paging pool. */
    for ( ; ; need_paging = false )
    {
        nr_pages = get_memsize(&dom0_size, avail);
        min_pages = get_memsize(&dom0_min_size, avail);
        max_pages = get_memsize(&dom0_max_size, avail);

        /*
         * If allocation isn't specified, reserve 1/16th of available memory
         * for things like DMA buffers. This reservation is clamped to a
         * maximum of 128MB.
         */
        if ( !nr_pages )
        {
            nr_pages = avail - (pv_shim ? pv_shim_mem(avail)
                                : min(avail / 16, 128UL << (20 - PAGE_SHIFT)));
            if ( is_hvm_domain(d) && !need_paging )
                /*
                 * Temporary workaround message until internal (paging) memory
                 * accounting required to build a pvh dom0 is improved.
                 */
                printk("WARNING: PVH dom0 without dom0_mem set is still unstable. "
                       "If you get crashes during boot, try adding a dom0_mem parameter\n");
        }

        /* Clamp according to min/max limits and available memory. */
        nr_pages = max(nr_pages, min_pages);
        nr_pages = min(nr_pages, max_pages);
        nr_pages = min(nr_pages, avail);

        if ( !need_paging )
            break;

        /* Reserve memory for shadow or HAP. */
        avail -= dom0_paging_pages(d, nr_pages);
    }

    if ( is_pv_domain(d) &&
         (parms->p2m_base == UNSET_ADDR) && !memsize_gt_zero(&dom0_size) &&
         (!memsize_gt_zero(&dom0_min_size) || (nr_pages > min_pages)) )
    {
        /*
         * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
         * note) require that there is enough virtual space beyond the initial
         * allocation to set up their initial page tables. This space is
         * roughly the same size as the p2m table, so make sure the initial
         * allocation doesn't consume more than about half the space that's
         * available between params.virt_base and the address space end.
         */
        unsigned long vstart, vend, end;
        size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long);

        vstart = parms->virt_base;
        vend = round_pgup(parms->virt_kend);
        if ( !parms->unmapped_initrd )
            vend += round_pgup(initrd_len);
        /* One p2m entry (sizeof_long bytes) per page of the allocation. */
        end = vend + nr_pages * sizeof_long;

        if ( end > vstart )
            end += end - vstart;
        /* Clip when doubling wrapped or exceeded the guest address width. */
        if ( end <= vstart ||
             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
        {
            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
            nr_pages = (end - vend) / (2 * sizeof_long);
            if ( memsize_gt_zero(&dom0_min_size) && nr_pages < min_pages )
                nr_pages = min_pages;
            printk("Dom0 memory clipped to %lu pages\n", nr_pages);
        }
    }

    d->max_pages = min_t(unsigned long, max_pages, UINT_MAX);

    return nr_pages;
}
422
/*
 * Apply the "dom0_ioports_disable" option: a comma separated list of hex
 * port ranges ("from[-to]") dom0 is to be denied access to.  Malformed
 * entries are reported and skipped; failure to actually deny access is
 * fatal (BUG).
 */
static void __init process_dom0_ioports_disable(struct domain *dom0)
{
    unsigned long io_from, io_to;
    char *t, *s = opt_dom0_ioports_disable;
    const char *u;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from; /* Single-port entry. */
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        /* Reject trailing garbage, inverted ranges, or ports > 0xffff. */
        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
               io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}
460
/*
 * Set up dom0's I/O port, MMIO and IRQ permissions: grant full access
 * first, then deny the ranges Xen itself owns or emulates.
 *
 * Returns 0 on success; otherwise the (OR-accumulated) result of the
 * failing permission operations, i.e. non-zero.
 */
int __init dom0_setup_permissions(struct domain *d)
{
    unsigned long mfn;
    unsigned int i;
    int rc;

    /* The shim's domain gets no special hardware permissions. */
    if ( pv_shim )
        return 0;

    /* The hardware domain is initially permitted full I/O capabilities. */
    rc = ioports_permit_access(d, 0, 0xFFFF);
    rc |= iomem_permit_access(d, 0UL, (1UL << (paddr_bits - PAGE_SHIFT)) - 1);
    rc |= irqs_permit_access(d, 1, nr_irqs_gsi - 1);

    /* Modify I/O port access permissions. */

    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(d, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(d, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(d, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(d, 0x61, 0x61);
    /* ACPI PM Timer. */
    if ( pmtmr_ioport )
        rc |= ioports_deny_access(d, pmtmr_ioport, pmtmr_ioport + 3);
    /* PCI configuration space (NB. 0xcf8 has special treatment). */
    rc |= ioports_deny_access(d, 0xcfc, 0xcff);
#ifdef CONFIG_HVM
    if ( is_hvm_domain(d) )
    {
        /* HVM debug console IO port. */
        rc |= ioports_deny_access(d, XEN_HVM_DEBUGCONS_IOPORT,
                                  XEN_HVM_DEBUGCONS_IOPORT);
        if ( amd_acpi_c1e_quirk )
            rc |= ioports_deny_access(d, acpi_smi_cmd, acpi_smi_cmd);
    }
#endif
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable(d);

    /* Modify I/O memory access permissions. */

    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(d, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        /* Pages in mmio_ro_ranges stay accessible (read-only). */
        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
            rc |= iomem_deny_access(d, mfn, mfn);
    }
    /* MSI range. */
    rc |= iomem_deny_access(d, paddr_to_pfn(MSI_ADDR_BASE_LO),
                            paddr_to_pfn(MSI_ADDR_BASE_LO +
                                         MSI_ADDR_DEST_ID_MASK));
    /* HyperTransport range. */
    if ( boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
        rc |= iomem_deny_access(d, paddr_to_pfn(0xfdULL << 32),
                                paddr_to_pfn((1ULL << 40) - 1));

    /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
    for ( i = 0; i < e820.nr_map; i++ )
    {
        unsigned long sfn, efn;
        sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
        efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
        if ( (e820.map[i].type == E820_UNUSABLE) &&
             (e820.map[i].size != 0) &&
             (sfn <= efn) )
            rc |= iomem_deny_access(d, sfn, efn);
    }

    /* Prevent access to HPET */
    if ( hpet_address )
    {
        u8 prot_flags = hpet_flags & ACPI_HPET_PAGE_PROTECT_MASK;

        mfn = paddr_to_pfn(hpet_address);
        if ( prot_flags == ACPI_HPET_PAGE_PROTECT4 )
            rc |= iomem_deny_access(d, mfn, mfn);
        else if ( prot_flags == ACPI_HPET_PAGE_PROTECT64 )
            rc |= iomem_deny_access(d, mfn, mfn + 15);
        else if ( ro_hpet )
            rc |= rangeset_add_singleton(mmio_ro_ranges, mfn);
    }

    return rc;
}
555
construct_dom0(struct domain * d,const module_t * image,unsigned long image_headroom,module_t * initrd,char * cmdline)556 int __init construct_dom0(struct domain *d, const module_t *image,
557 unsigned long image_headroom, module_t *initrd,
558 char *cmdline)
559 {
560 int rc;
561
562 /* Sanity! */
563 BUG_ON(!pv_shim && d->domain_id != 0);
564 BUG_ON(d->vcpu[0] == NULL);
565 BUG_ON(d->vcpu[0]->is_initialised);
566
567 process_pending_softirqs();
568
569 if ( is_hvm_domain(d) )
570 rc = dom0_construct_pvh(d, image, image_headroom, initrd, cmdline);
571 else if ( is_pv_domain(d) )
572 rc = dom0_construct_pv(d, image, image_headroom, initrd, cmdline);
573 else
574 panic("Cannot construct Dom0. No guest interface available\n");
575
576 if ( rc )
577 return rc;
578
579 /* Sanity! */
580 BUG_ON(!d->vcpu[0]->is_initialised);
581
582 return 0;
583 }
584
585 /*
586 * Local variables:
587 * mode: C
588 * c-file-style: "BSD"
589 * c-basic-offset: 4
590 * tab-width: 4
591 * indent-tabs-mode: nil
592 * End:
593 */
594