/******************************************************************************
 * dom0_build.c
 *
 * Copyright (c) 2002-2005, K A Fraser
 */

#include <xen/init.h>
#include <xen/iocap.h>
#include <xen/libelf.h>
#include <xen/param.h>
#include <xen/pfn.h>
#include <xen/sched.h>
#include <xen/softirq.h>

#include <asm/amd.h>
#include <asm/dom0_build.h>
#include <asm/guest.h>
#include <asm/hpet.h>
#include <asm/io_apic.h>
#include <asm/p2m.h>
#include <asm/setup.h>

struct memsize {
    long nr_pages;
    unsigned int percent;
    bool minus;
};

static struct memsize __initdata dom0_size;
static struct memsize __initdata dom0_min_size;
static struct memsize __initdata dom0_max_size = { .nr_pages = LONG_MAX };
static bool __initdata dom0_mem_set;

static bool __init memsize_gt_zero(const struct memsize *sz)
{
    return !sz->minus && sz->nr_pages;
}

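/*
 * Translate a memsize specification into a page count, given the number of
 * pages available: a fixed amount plus a percentage of @avail, optionally
 * subtracted from @avail when the value was given as negative.
 */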
static unsigned long __init get_memsize(const struct memsize *sz,
                                        unsigned long avail)
{
    unsigned long pages;

    pages = sz->nr_pages + sz->percent * avail / 100;
    return sz->minus ? avail - pages : pages;
}

/*
 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
 *
 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
 * <amt>:     The precise amount of memory to allocate for dom0.
 *
 * The format of <min_amt>, <max_amt> and <amt> is as follows:
 * <size> | <frac>% | <size>+<frac>%
 * <size> is a size value like 1G (1 GByte), <frac> is a percentage of host
 * memory (so 1G+10% means 10 percent of host memory + 1 GByte).
 *
 * Notes:
 *  1. <amt> is clamped from below by <min_amt> and from above by available
 *     memory and <max_amt>
 *  2. <min_amt> is clamped from above by available memory and <max_amt>
 *  3. <min_amt> is ignored if it is greater than <max_amt>
 *  4. If <amt> is not specified, it is calculated as follows:
 *     "All of memory is allocated to domain 0, minus 1/16th which is reserved
 *      for uses such as DMA buffers (the reservation is clamped to 128MB)."
 *
 * Each value can be specified as positive or negative:
 *  If +ve: The specified amount is an absolute value.
 *  If -ve: The specified amount is subtracted from total available memory.
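 *
 * Illustrative example: "dom0_mem=min:512M,max:4G,25%" requests a quarter of
 * host memory for dom0, clamped to at least 512M and at most 4G (and never
 * more than the memory actually available).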
 */
static int __init parse_amt(const char *s, const char **ps, struct memsize *sz)
{
    unsigned long val;
    struct memsize tmp = { };
    unsigned int items = 0;

    tmp.minus = (*s == '-');
    if ( tmp.minus )
        s++;

    do
    {
        if ( !isdigit(*s) )
            return -EINVAL;

        val = parse_size_and_unit(s, ps);
        s = *ps;
        if ( *s == '%' )
        {
            if ( val >= 100 )
                return -EINVAL;
            tmp.percent = val;
            s++;
            items++; /* No other item allowed. */
        }
        else
        {
            /* <size> item must be first one. */
            if ( items )
                return -EINVAL;
            tmp.nr_pages = val >> PAGE_SHIFT;
        }
        items++;
    } while ( *s++ == '+' && items < 2 );

    *ps = --s;
    if ( *s && *s != ',' )
        return -EINVAL;

    *sz = tmp;

    return 0;
}

static int __init parse_dom0_mem(const char *s)
{
    int ret;

    dom0_mem_set = true;

    /* xen-shim uses shim_mem parameter instead of dom0_mem */
    if ( pv_shim )
    {
        printk("Ignoring dom0_mem param in pv-shim mode\n");
        return 0;
    }

    do {
        if ( !strncmp(s, "min:", 4) )
            ret = parse_amt(s + 4, &s, &dom0_min_size);
        else if ( !strncmp(s, "max:", 4) )
            ret = parse_amt(s + 4, &s, &dom0_max_size);
        else
            ret = parse_amt(s, &s, &dom0_size);
    } while ( *s++ == ',' && !ret );

    return s[-1] ? -EINVAL : ret;
}
custom_param("dom0_mem", parse_dom0_mem);

static unsigned int __initdata opt_dom0_max_vcpus_min = 1;
static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX;

static int __init parse_dom0_max_vcpus(const char *s)
{
    if ( *s == '-' )                   /* -M */
        opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0);
    else                               /* N, N-, or N-M */
    {
        opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0);
        if ( opt_dom0_max_vcpus_min == 0 )
            opt_dom0_max_vcpus_min = 1;
        if ( !*s )                    /* N */
            opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min;
        else if ( *s++ == '-' && *s ) /* N-M */
            opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0);
    }

    return *s ? -EINVAL : 0;
}
custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);

static __initdata unsigned int dom0_nr_pxms;
static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
    { [0 ... MAX_NUMNODES - 1] = ~0 };
bool __initdata dom0_affinity_relaxed;

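/*
 * dom0_nodes=[<pxm>|relaxed|strict][,...]
 *
 * Comma-separated list of PXM values naming the NUMA nodes to confine dom0
 * to, optionally mixed with the keywords "relaxed" or "strict", which set or
 * clear dom0_affinity_relaxed respectively.
 */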
static int __init parse_dom0_nodes(const char *s)
{
    do {
        if ( isdigit(*s) )
        {
            if ( dom0_nr_pxms >= ARRAY_SIZE(dom0_pxms) )
                return -E2BIG;
            dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
            if ( !*s || *s == ',' )
                ++dom0_nr_pxms;
        }
        else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
        {
            dom0_affinity_relaxed = true;
            s += 7;
        }
        else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
        {
            dom0_affinity_relaxed = false;
            s += 6;
        }
        else
            return -EINVAL;
    } while ( *s++ == ',' );

    return s[-1] ? -EINVAL : 0;
}
custom_param("dom0_nodes", parse_dom0_nodes);

cpumask_t __initdata dom0_cpus;
static nodemask_t __initdata dom0_nodes;

unsigned int __init dom0_max_vcpus(void)
{
    unsigned int i, max_vcpus, limit;
    nodeid_t node;

    if ( pv_shim )
    {
        nodes_setall(dom0_nodes);

        /*
         * When booting in shim mode, APs are not started until the guest
         * brings other vCPUs up.
         */
        cpumask_set_cpu(0, &dom0_cpus);

        /* In PV shim mode allow the guest to have as many CPUs as available. */
        return nr_cpu_ids;
    }

    for ( i = 0; i < dom0_nr_pxms; ++i )
        if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
            node_set(node, dom0_nodes);
    nodes_and(dom0_nodes, dom0_nodes, node_online_map);
    if ( nodes_empty(dom0_nodes) )
        dom0_nodes = node_online_map;
    for_each_node_mask ( node, dom0_nodes )
        cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
    cpumask_and(&dom0_cpus, &dom0_cpus, cpupool_valid_cpus(cpupool0));
    if ( cpumask_empty(&dom0_cpus) )
        cpumask_copy(&dom0_cpus, cpupool_valid_cpus(cpupool0));

    max_vcpus = cpumask_weight(&dom0_cpus);
    if ( opt_dom0_max_vcpus_min > max_vcpus )
        max_vcpus = opt_dom0_max_vcpus_min;
    if ( opt_dom0_max_vcpus_max < max_vcpus )
        max_vcpus = opt_dom0_max_vcpus_max;
    limit = opt_dom0_pvh ? HVM_MAX_VCPUS : MAX_VIRT_CPUS;
    if ( max_vcpus > limit )
        max_vcpus = limit;

    return max_vcpus;
}

struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
{
    dom0->node_affinity = dom0_nodes;
    dom0->auto_node_affinity = !dom0_nr_pxms;

    return vcpu_create(dom0, 0);
}

#ifdef CONFIG_SHADOW_PAGING
bool __initdata opt_dom0_shadow;
#endif
bool __initdata opt_dom0_pvh = !IS_ENABLED(CONFIG_PV);
bool __initdata opt_dom0_verbose = IS_ENABLED(CONFIG_VERBOSE_DEBUG);

static int __init parse_dom0_param(const char *s)
{
    const char *ss;
    int rc = 0;

    do {
        int val;

        ss = strchr(s, ',');
        if ( !ss )
            ss = strchr(s, '\0');

        if ( IS_ENABLED(CONFIG_PV) && !cmdline_strcmp(s, "pv") )
            opt_dom0_pvh = false;
        else if ( IS_ENABLED(CONFIG_HVM) && !cmdline_strcmp(s, "pvh") )
            opt_dom0_pvh = true;
#ifdef CONFIG_SHADOW_PAGING
        else if ( (val = parse_boolean("shadow", s, ss)) >= 0 )
            opt_dom0_shadow = val;
#endif
        else if ( (val = parse_boolean("verbose", s, ss)) >= 0 )
            opt_dom0_verbose = val;
        else if ( IS_ENABLED(CONFIG_PV) &&
                  (val = parse_boolean("cpuid-faulting", s, ss)) >= 0 )
            opt_dom0_cpuid_faulting = val;
        else
            rc = -EINVAL;

        s = ss + 1;
    } while ( *ss );

    return rc;
}
custom_param("dom0", parse_dom0_param);

static char __initdata opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

static bool __initdata ro_hpet = true;
boolean_param("ro-hpet", ro_hpet);

unsigned int __initdata dom0_memflags = MEMF_no_dma|MEMF_exact_node;

unsigned long __init dom0_paging_pages(const struct domain *d,
                                       unsigned long nr_pages)
{
    /* Copied from: libxl_get_required_shadow_memory() */
    unsigned long memkb = nr_pages * (PAGE_SIZE / 1024);

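    /*
     * This works out to roughly 1MB per vCPU plus 2 pages (8kB) per MB of
     * domain memory, rounded up to a whole MB and converted to a page count.
     */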
    memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));

    return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
}

unsigned long __init dom0_compute_nr_pages(
    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
{
    nodeid_t node;
    unsigned long avail = 0, nr_pages, min_pages, max_pages;
    bool need_paging;

    /* The ordering of operands is to work around a clang5 issue. */
    if ( CONFIG_DOM0_MEM[0] && !dom0_mem_set )
        parse_dom0_mem(CONFIG_DOM0_MEM);

    for_each_node_mask ( node, dom0_nodes )
        avail += avail_domheap_pages_region(node, 0, 0) +
                 initial_images_nrpages(node);

    /* Reserve memory for further dom0 vcpu-struct allocations... */
    avail -= (d->max_vcpus - 1UL)
             << get_order_from_bytes(sizeof(struct vcpu));
    /* ...and compat_l4's, if needed. */
    if ( is_pv_32bit_domain(d) )
        avail -= d->max_vcpus - 1;

    /* Reserve memory for iommu_dom0_init() (rough estimate). */
    if ( is_iommu_enabled(d) )
    {
        unsigned int s;

        for ( s = 9; s < BITS_PER_LONG; s += 9 )
            avail -= max_pdx >> s;
    }

    need_paging = is_hvm_domain(d) &&
        (!iommu_use_hap_pt(d) || !paging_mode_hap(d));
    for ( ; ; need_paging = false )
    {
        nr_pages = get_memsize(&dom0_size, avail);
        min_pages = get_memsize(&dom0_min_size, avail);
        max_pages = get_memsize(&dom0_max_size, avail);

        /*
         * If allocation isn't specified, reserve 1/16th of available memory
         * for things like DMA buffers. This reservation is clamped to a
         * maximum of 128MB.
         */
        if ( !nr_pages )
        {
            nr_pages = avail - (pv_shim ? pv_shim_mem(avail)
                                 : min(avail / 16, 128UL << (20 - PAGE_SHIFT)));
            if ( is_hvm_domain(d) && !need_paging )
                /*
                 * Temporary workaround message until internal (paging) memory
                 * accounting required to build a pvh dom0 is improved.
                 */
                printk("WARNING: PVH dom0 without dom0_mem set is still unstable. "
                       "If you get crashes during boot, try adding a dom0_mem parameter\n");
        }

        /* Clamp according to min/max limits and available memory. */
        nr_pages = max(nr_pages, min_pages);
        nr_pages = min(nr_pages, max_pages);
        nr_pages = min(nr_pages, avail);

        if ( !need_paging )
            break;

        /* Reserve memory for shadow or HAP. */
        avail -= dom0_paging_pages(d, nr_pages);
    }

    if ( is_pv_domain(d) &&
         (parms->p2m_base == UNSET_ADDR) && !memsize_gt_zero(&dom0_size) &&
         (!memsize_gt_zero(&dom0_min_size) || (nr_pages > min_pages)) )
    {
        /*
         * Legacy Linux kernels (i.e. those without a XEN_ELFNOTE_INIT_P2M
         * note) require that there is enough virtual space beyond the initial
         * allocation to set up their initial page tables. This space is
         * roughly the same size as the p2m table, so make sure the initial
         * allocation doesn't consume more than about half the space that's
         * available between params.virt_base and the address space end.
         */
        unsigned long vstart, vend, end;
        size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long);

        vstart = parms->virt_base;
        vend = round_pgup(parms->virt_kend);
        if ( !parms->unmapped_initrd )
            vend += round_pgup(initrd_len);
        end = vend + nr_pages * sizeof_long;

        if ( end > vstart )
            end += end - vstart;
        if ( end <= vstart ||
             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
        {
            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
            nr_pages = (end - vend) / (2 * sizeof_long);
            if ( memsize_gt_zero(&dom0_min_size) && nr_pages < min_pages )
                nr_pages = min_pages;
            printk("Dom0 memory clipped to %lu pages\n", nr_pages);
        }
    }

    d->max_pages = min_t(unsigned long, max_pages, UINT_MAX);

    return nr_pages;
}

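/*
 * dom0_ioports_disable=<hex-port>[-<hex-port>][,...]
 *
 * Comma-separated list of (hexadecimal) I/O port ranges to remove from
 * dom0's default I/O port permissions.
 */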
static void __init process_dom0_ioports_disable(struct domain *dom0)
{
    unsigned long io_from, io_to;
    char *t, *s = opt_dom0_ioports_disable;
    const char *u;

    if ( *s == '\0' )
        return;

    while ( (t = strsep(&s, ",")) != NULL )
    {
        io_from = simple_strtoul(t, &u, 16);
        if ( u == t )
        {
        parse_error:
            printk("Invalid ioport range <%s> "
                   "in dom0_ioports_disable, skipping\n", t);
            continue;
        }

        if ( *u == '\0' )
            io_to = io_from;
        else if ( *u == '-' )
            io_to = simple_strtoul(u + 1, &u, 16);
        else
            goto parse_error;

        if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
            goto parse_error;

        printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
            io_from, io_to);

        if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
            BUG();
    }
}

int __init dom0_setup_permissions(struct domain *d)
{
    unsigned long mfn;
    unsigned int i;
    int rc;

    if ( pv_shim )
        return 0;

    /* The hardware domain is initially permitted full I/O capabilities. */
    rc = ioports_permit_access(d, 0, 0xFFFF);
    rc |= iomem_permit_access(d, 0UL, (1UL << (paddr_bits - PAGE_SHIFT)) - 1);
    rc |= irqs_permit_access(d, 1, nr_irqs_gsi - 1);

    /* Modify I/O port access permissions. */

    /* Master Interrupt Controller (PIC). */
    rc |= ioports_deny_access(d, 0x20, 0x21);
    /* Slave Interrupt Controller (PIC). */
    rc |= ioports_deny_access(d, 0xA0, 0xA1);
    /* Interval Timer (PIT). */
    rc |= ioports_deny_access(d, 0x40, 0x43);
    /* PIT Channel 2 / PC Speaker Control. */
    rc |= ioports_deny_access(d, 0x61, 0x61);
    /* ACPI PM Timer. */
    if ( pmtmr_ioport )
        rc |= ioports_deny_access(d, pmtmr_ioport, pmtmr_ioport + 3);
    /* PCI configuration space (NB. 0xcf8 has special treatment). */
    rc |= ioports_deny_access(d, 0xcfc, 0xcff);
#ifdef CONFIG_HVM
    if ( is_hvm_domain(d) )
    {
        /* HVM debug console IO port. */
        rc |= ioports_deny_access(d, XEN_HVM_DEBUGCONS_IOPORT,
                                  XEN_HVM_DEBUGCONS_IOPORT);
        if ( amd_acpi_c1e_quirk )
            rc |= ioports_deny_access(d, acpi_smi_cmd, acpi_smi_cmd);
    }
#endif
    /* Command-line I/O ranges. */
    process_dom0_ioports_disable(d);

    /* Modify I/O memory access permissions. */

    /* Local APIC. */
    if ( mp_lapic_addr != 0 )
    {
        mfn = paddr_to_pfn(mp_lapic_addr);
        rc |= iomem_deny_access(d, mfn, mfn);
    }
    /* I/O APICs. */
    for ( i = 0; i < nr_ioapics; i++ )
    {
        mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
        if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
            rc |= iomem_deny_access(d, mfn, mfn);
    }
    /* MSI range. */
    rc |= iomem_deny_access(d, paddr_to_pfn(MSI_ADDR_BASE_LO),
                            paddr_to_pfn(MSI_ADDR_BASE_LO +
                                         MSI_ADDR_DEST_ID_MASK));
    /* HyperTransport range. */
    if ( boot_cpu_data.x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
        rc |= iomem_deny_access(d, paddr_to_pfn(0xfdULL << 32),
                                paddr_to_pfn((1ULL << 40) - 1));

    /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
    for ( i = 0; i < e820.nr_map; i++ )
    {
        unsigned long sfn, efn;
        sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
        efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
        if ( (e820.map[i].type == E820_UNUSABLE) &&
             (e820.map[i].size != 0) &&
             (sfn <= efn) )
            rc |= iomem_deny_access(d, sfn, efn);
    }

    /* Prevent access to HPET */
    if ( hpet_address )
    {
        u8 prot_flags = hpet_flags & ACPI_HPET_PAGE_PROTECT_MASK;

        mfn = paddr_to_pfn(hpet_address);
        if ( prot_flags == ACPI_HPET_PAGE_PROTECT4 )
            rc |= iomem_deny_access(d, mfn, mfn);
        else if ( prot_flags == ACPI_HPET_PAGE_PROTECT64 )
            rc |= iomem_deny_access(d, mfn, mfn + 15);
        else if ( ro_hpet )
            rc |= rangeset_add_singleton(mmio_ro_ranges, mfn);
    }

    return rc;
}

int __init construct_dom0(struct domain *d, const module_t *image,
                          unsigned long image_headroom, module_t *initrd,
                          char *cmdline)
{
    int rc;

    /* Sanity! */
    BUG_ON(!pv_shim && d->domain_id != 0);
    BUG_ON(d->vcpu[0] == NULL);
    BUG_ON(d->vcpu[0]->is_initialised);

    process_pending_softirqs();

    if ( is_hvm_domain(d) )
        rc = dom0_construct_pvh(d, image, image_headroom, initrd, cmdline);
    else if ( is_pv_domain(d) )
        rc = dom0_construct_pv(d, image, image_headroom, initrd, cmdline);
    else
        panic("Cannot construct Dom0. No guest interface available\n");

    if ( rc )
        return rc;

    /* Sanity! */
    BUG_ON(!d->vcpu[0]->is_initialised);

    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */