#include "libxl_internal.h"
#include "libxl_arch.h"

#include <xc_dom.h>

int libxl__arch_domain_prepare_config(libxl__gc *gc,
                                      libxl_domain_config *d_config,
                                      xc_domain_configuration_t *xc_config)
{
    switch(d_config->c_info.type) {
    case LIBXL_DOMAIN_TYPE_HVM:
        xc_config->emulation_flags = XEN_X86_EMU_ALL;
        break;
    case LIBXL_DOMAIN_TYPE_PVH:
        if (libxl_defbool_val(d_config->b_info.apic))
            /* PVH guests may want to have LAPIC emulation. */
            xc_config->emulation_flags = XEN_X86_EMU_LAPIC;
        else
            xc_config->emulation_flags = 0;
        break;
    case LIBXL_DOMAIN_TYPE_PV:
        xc_config->emulation_flags = 0;
        break;
    default:
        abort();
    }

    return 0;
}

int libxl__arch_domain_save_config(libxl__gc *gc,
                                   libxl_domain_config *d_config,
                                   const xc_domain_configuration_t *xc_config)
{
    return 0;
}

static const char *e820_names(int type)
{
    switch (type) {
        case E820_RAM: return "RAM";
        case E820_RESERVED: return "Reserved";
        case E820_ACPI: return "ACPI";
        case E820_NVS: return "ACPI NVS";
        case E820_UNUSABLE: return "Unusable";
        default: break;
    }
    return "Unknown";
}

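/*
 * Sanitize the host E820 map for use as a PV guest's memory map.
 *
 * On entry 'src' holds up to '*nr_entries' host E820 entries; on success
 * it holds the rewritten map and '*nr_entries' is updated to match.
 * 'map_limitkb' is the guest's RAM allocation in kB and 'balloon_kb' is
 * extra space to leave at the end of the map for ballooning.
 *
 * Returns 0 on success, ERROR_INVAL or ERROR_NOMEM on bad input.
 */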
static int e820_sanitize(libxl__gc *gc, struct e820entry src[],
                         uint32_t *nr_entries,
                         unsigned long map_limitkb,
                         unsigned long balloon_kb)
{
    uint64_t delta_kb = 0, start = 0, start_kb = 0, last = 0, ram_end;
    uint32_t i, idx = 0, nr;
    struct e820entry e820[E820MAX];

    if (!src || !map_limitkb || !nr_entries)
        return ERROR_INVAL;

    nr = *nr_entries;
    if (!nr)
        return ERROR_INVAL;

    if (nr > E820MAX)
        return ERROR_NOMEM;

    /* Weed out anything under 1MB */
    for (i = 0; i < nr; i++) {
        if (src[i].addr > 0x100000)
            continue;

        src[i].type = 0;
        src[i].size = 0;
        src[i].addr = -1ULL;
    }

    /* Find the lowest and highest entry in E820, skipping over
     * undesired entries. */
    start = -1ULL;
    last = 0;
    for (i = 0; i < nr; i++) {
        if ((src[i].type == E820_RAM) ||
            (src[i].type == E820_UNUSABLE) ||
            (src[i].type == 0))
            continue;

        start = src[i].addr < start ? src[i].addr : start;
        last = src[i].addr + src[i].size > last ?
               src[i].addr + src[i].size : last;
    }
    if (start > 1024)
        start_kb = start >> 10;

    /* Add the memory RAM region for the guest */
    e820[idx].addr = 0;
    e820[idx].size = (uint64_t)map_limitkb << 10;
    e820[idx].type = E820_RAM;

    /* .. and trim if necessary */
    if (start_kb && map_limitkb > start_kb) {
        delta_kb = map_limitkb - start_kb;
        if (delta_kb)
            e820[idx].size -= (uint64_t)(delta_kb << 10);
    }
    /* Note: We don't touch balloon_kb here. Will add it at the end. */
    ram_end = e820[idx].addr + e820[idx].size;
    idx++;

    LOG(DEBUG, "Memory: %"PRIu64"kB End of RAM: " \
        "0x%"PRIx64" (PFN) Delta: %"PRIu64"kB, PCI start: %"PRIu64"kB " \
        "(0x%"PRIx64" PFN), Balloon %"PRIu64"kB\n", (uint64_t)map_limitkb,
        ram_end >> 12, delta_kb, start_kb, start >> 12,
        (uint64_t)balloon_kb);

    /* The whole block below guards against the case where an Intel IGD is
     * passed through to the guest. If no IGD is passed in, this code can be
     * ignored.
     *
     * The reason for this code is that Intel boxes fill their E820 with
     * E820_RAM amongst E820_RESERVED and we can't just ditch those E820_RAM.
     * That is b/c any "gaps" in the E820 are considered PCI I/O space by
     * Linux and would be utilized by the Intel IGD as I/O space while
     * in reality they were RAM regions.
     *
     * What this means is that we have to walk the E820 and any region
     * that is RAM, below 4GB and above ram_end needs to change its type
     * to E820_UNUSABLE. We also need to move some of the E820_RAM regions
     * if they overlap with ram_end. */
    for (i = 0; i < nr; i++) {
        uint64_t end = src[i].addr + src[i].size;

        /* We don't care about E820_UNUSABLE, but we need to
         * change the type to zero b/c the loop after this
         * sticks E820_UNUSABLE on the guest's E820 but ignores
         * the ones with type zero. */
        if ((src[i].type == E820_UNUSABLE) ||
            /* Any region that is within the "RAM region" can
             * be safely ditched. */
            (end < ram_end)) {
                src[i].type = 0;
                continue;
        }

        /* Look only at RAM regions. */
        if (src[i].type != E820_RAM)
            continue;

        /* We only care about RAM regions below 4GB. */
        if (src[i].addr >= (1ULL<<32))
            continue;

        /* E820_RAM overlaps with our RAM region. Move it */
        if (src[i].addr < ram_end) {
            uint64_t delta;

            src[i].type = E820_UNUSABLE;
            delta = ram_end - src[i].addr;
            /* The end < ram_end should weed this out */
            if (src[i].size < delta)
                src[i].type = 0;
            else {
                src[i].size -= delta;
                src[i].addr = ram_end;
            }
            if (src[i].addr + src[i].size != end) {
                /* We messed up somewhere */
                src[i].type = 0;
                LOGE(ERROR, "Computed E820 wrongly. Continuing on.");
            }
        }
        /* Lastly, convert the RAM to UNUSABLE. Look in the Linux kernel
         * at git commit 2f14ddc3a7146ea4cd5a3d1ecd993f85f2e4f948
         * "xen/setup: Inhibit resource API from using System RAM E820
         * gaps as PCI mem gaps" for a full explanation. */
        if (end > ram_end)
            src[i].type = E820_UNUSABLE;
    }

    /* Check if there is a region between ram_end and start. */
    if (start > ram_end) {
        int add_unusable = 1;
        for (i = 0; i < nr && add_unusable; i++) {
            if (src[i].type != E820_UNUSABLE)
                continue;
            if (ram_end != src[i].addr)
                continue;
            if (start != src[i].addr + src[i].size) {
                /* there is one, adjust it */
                src[i].size = start - src[i].addr;
            }
            add_unusable = 0;
        }
        /* .. and if not present, add it in. This is to guard against
           the Linux guest assuming that the gap between the end of
           the RAM region and the start of the E820_[ACPI,NVS,RESERVED]
           is PCI I/O space. Which it certainly is _not_. */
        if (add_unusable) {
            e820[idx].type = E820_UNUSABLE;
            e820[idx].addr = ram_end;
            e820[idx].size = start - ram_end;
            idx++;
        }
    }
    /* Almost done: copy them over, ignoring the undesirable ones */
    for (i = 0; i < nr; i++) {
        if ((src[i].type == E820_RAM) ||
            (src[i].type == 0))
            continue;

        e820[idx].type = src[i].type;
        e820[idx].addr = src[i].addr;
        e820[idx].size = src[i].size;
        idx++;
    }
    /* At this point we have the mapped RAM + E820 entries from src. */
    if (balloon_kb || delta_kb) {
        /* and if we truncated the RAM region, then add it to the end. */
        e820[idx].type = E820_RAM;
        e820[idx].addr = (uint64_t)(1ULL << 32) > last ?
                         (uint64_t)(1ULL << 32) : last;
        /* also add the balloon memory to the end. */
        e820[idx].size = (uint64_t)(delta_kb << 10) +
                         ((uint64_t)balloon_kb << 10);
        idx++;
    }
    nr = idx;

    for (i = 0; i < nr; i++) {
        LOG(DEBUG, ":\t[%"PRIx64" -> %"PRIx64"] %s", e820[i].addr >> 12,
            (e820[i].addr + e820[i].size) >> 12, e820_names(e820[i].type));
    }

    /* Done: copy the sanitized version. */
    *nr_entries = nr;
    memcpy(src, e820, nr * sizeof(struct e820entry));
    return 0;
}

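/*
 * Fetch the host memory map via xc_get_machine_memory_map() and run it
 * through e820_sanitize(), sizing the guest RAM to target_memkb and
 * leaving room for (max_memkb - target_memkb) plus any PV slack memory.
 */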
static int e820_host_sanitize(libxl__gc *gc,
                              libxl_domain_build_info *b_info,
                              struct e820entry map[],
                              uint32_t *nr)
{
    int rc;

    rc = xc_get_machine_memory_map(CTX->xch, map, *nr);
    if (rc < 0)
        return ERROR_FAIL;

    *nr = rc;

    rc = e820_sanitize(gc, map, nr, b_info->target_memkb,
                       (b_info->max_memkb - b_info->target_memkb) +
                       b_info->u.pv.slack_memkb);
    return rc;
}

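/*
 * Build a sanitized host-derived E820 for a PV guest with e820_host set
 * and install it with xc_domain_set_memory_map(). Returns ERROR_INVAL
 * for non-PV domains or when e820_host is not enabled.
 */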
static int libxl__e820_alloc(libxl__gc *gc, uint32_t domid,
                             libxl_domain_config *d_config)
{
    libxl_ctx *ctx = libxl__gc_owner(gc);
    int rc;
    uint32_t nr;
    struct e820entry map[E820MAX];
    libxl_domain_build_info *b_info;

    if (d_config == NULL || d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV)
        return ERROR_INVAL;

    b_info = &d_config->b_info;
    if (!libxl_defbool_val(b_info->u.pv.e820_host))
        return ERROR_INVAL;

    nr = E820MAX;
    rc = e820_host_sanitize(gc, b_info, map, &nr);
    if (rc)
        return ERROR_FAIL;

    rc = xc_domain_set_memory_map(ctx->xch, domid, map, nr);

    if (rc < 0)
        return ERROR_FAIL;

    return 0;
}

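/*
 * x86-specific domain creation: set the memmap limit for PV guests,
 * program the TSC mode, optionally disable migration, apply the RTC
 * time offset (including the localtime adjustment), set the shadow
 * memory allocation for non-PV guests and, for PV guests with
 * e820_host enabled, install the host-derived E820.
 */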
int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                              uint32_t domid)
{
    int ret = 0;
    int tsc_mode;
    uint32_t rtc_timeoffset;
    libxl_ctx *ctx = libxl__gc_owner(gc);

    if (d_config->b_info.type == LIBXL_DOMAIN_TYPE_PV)
        xc_domain_set_memmap_limit(ctx->xch, domid,
                                   (d_config->b_info.max_memkb +
                                    d_config->b_info.u.pv.slack_memkb));

    switch (d_config->b_info.tsc_mode) {
    case LIBXL_TSC_MODE_DEFAULT:
        tsc_mode = 0;
        break;
    case LIBXL_TSC_MODE_ALWAYS_EMULATE:
        tsc_mode = 1;
        break;
    case LIBXL_TSC_MODE_NATIVE:
        tsc_mode = 2;
        break;
    case LIBXL_TSC_MODE_NATIVE_PARAVIRT:
        tsc_mode = 3;
        break;
    default:
        abort();
    }
    xc_domain_set_tsc_info(ctx->xch, domid, tsc_mode, 0, 0, 0);
    if (libxl_defbool_val(d_config->b_info.disable_migrate))
        xc_domain_disable_migrate(ctx->xch, domid);
    rtc_timeoffset = d_config->b_info.rtc_timeoffset;
    if (libxl_defbool_val(d_config->b_info.localtime)) {
        time_t t;
        struct tm *tm, result;

        t = time(NULL);
        tm = localtime_r(&t, &result);

        if (!tm) {
            LOGED(ERROR, domid, "Failed to call localtime_r");
            ret = ERROR_FAIL;
            goto out;
        }

        rtc_timeoffset += tm->tm_gmtoff;
    }

    if (rtc_timeoffset)
        xc_domain_set_time_offset(ctx->xch, domid, rtc_timeoffset);

    if (d_config->b_info.type != LIBXL_DOMAIN_TYPE_PV) {
        unsigned long shadow = DIV_ROUNDUP(d_config->b_info.shadow_memkb,
                                           1024);
        xc_shadow_control(ctx->xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
                          NULL, 0, &shadow, 0, NULL);
    }

    if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV &&
        libxl_defbool_val(d_config->b_info.u.pv.e820_host)) {
        ret = libxl__e820_alloc(gc, domid, d_config);
        if (ret) {
            LOGED(ERROR, domid, "Failed while collecting E820 with: %d (errno:%d)",
                  ret, errno);
        }
    }

out:
    return ret;
}

int libxl__arch_extra_memory(libxl__gc *gc,
                             const libxl_domain_build_info *info,
                             uint64_t *out)
{
    *out = LIBXL_MAXMEM_CONSTANT;

    return 0;
}

int libxl__arch_domain_init_hw_description(libxl__gc *gc,
                                           libxl_domain_build_info *info,
                                           libxl__domain_build_state *state,
                                           struct xc_dom_image *dom)
{
    return 0;
}

int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
                                               libxl_domain_build_info *info,
                                               struct xc_dom_image *dom)
{
    int rc = 0;

    if (info->type == LIBXL_DOMAIN_TYPE_PVH) {
        rc = libxl__dom_load_acpi(gc, info, dom);
        if (rc != 0)
            LOGE(ERROR, "libxl__dom_load_acpi failed");
    }

    return rc;
}

int libxl__arch_build_dom_finish(libxl__gc *gc,
                                 libxl_domain_build_info *info,
                                 struct xc_dom_image *dom,
                                 libxl__domain_build_state *state)
{
    return 0;
}

/* Return 0 on success, ERROR_* on failure. */
int libxl__arch_vnuma_build_vmemrange(libxl__gc *gc,
                                      uint32_t domid,
                                      libxl_domain_build_info *b_info,
                                      libxl__domain_build_state *state)
{
    int nid, nr_vmemrange, rc;
    uint32_t nr_e820, e820_count;
    struct e820entry map[E820MAX];
    xen_vmemrange_t *vmemranges;
    unsigned int array_size;

    /* If e820_host is not set, call the generic function */
    if (!(b_info->type == LIBXL_DOMAIN_TYPE_PV &&
          libxl_defbool_val(b_info->u.pv.e820_host)))
        return libxl__vnuma_build_vmemrange_pv_generic(gc, domid, b_info,
                                                       state);

    assert(state->vmemranges == NULL);

    nr_e820 = E820MAX;
    rc = e820_host_sanitize(gc, b_info, map, &nr_e820);
    if (rc) goto out;

    e820_count = 0;
    nr_vmemrange = 0;
    vmemranges = NULL;
    array_size = 0;
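    /*
     * Walk the sanitized host E820 and carve each vNUMA node's memory
     * out of the host RAM regions in order: a node may span several
     * E820_RAM entries, and a single entry may be split across nodes.
     */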
    for (nid = 0; nid < b_info->num_vnuma_nodes; nid++) {
        libxl_vnode_info *p = &b_info->vnuma_nodes[nid];
        uint64_t remaining_bytes = (p->memkb << 10), bytes;

        while (remaining_bytes > 0) {
            if (e820_count >= nr_e820) {
                rc = ERROR_NOMEM;
                goto out;
            }

            /* Skip non RAM region */
            if (map[e820_count].type != E820_RAM) {
                e820_count++;
                continue;
            }

            if (nr_vmemrange >= array_size) {
                array_size += 32;
                GCREALLOC_ARRAY(vmemranges, array_size);
            }

            bytes = map[e820_count].size >= remaining_bytes ?
                remaining_bytes : map[e820_count].size;

            vmemranges[nr_vmemrange].start = map[e820_count].addr;
            vmemranges[nr_vmemrange].end = map[e820_count].addr + bytes;

            if (map[e820_count].size >= remaining_bytes) {
                map[e820_count].addr += bytes;
                map[e820_count].size -= bytes;
            } else {
                e820_count++;
            }

            remaining_bytes -= bytes;

            vmemranges[nr_vmemrange].flags = 0;
            vmemranges[nr_vmemrange].nid = nid;
            nr_vmemrange++;
        }
    }

    state->vmemranges = vmemranges;
    state->num_vmemranges = nr_vmemrange;

    rc = 0;
out:
    return rc;
}

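/*
 * Map a host IRQ into the domain as a pirq and grant the domain
 * permission to use it.
 */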
int libxl__arch_domain_map_irq(libxl__gc *gc, uint32_t domid, int irq)
{
    int ret;

    ret = xc_physdev_map_pirq(CTX->xch, domid, irq, &irq);
    if (ret)
        return ret;

    ret = xc_domain_irq_permission(CTX->xch, domid, irq, 1);

    return ret;
}

/*
 * Here we're just trying to set these kinds of e820 mappings:
 *
 * #1. Low memory region
 *
 * Low RAM starts at 1MB at the least, to make sure all standard regions
 * of the PC memory map, like BIOS, VGA memory-mapped I/O and vgabios,
 * have enough space.
 * Note: the regions below 1MB are still constructed with multiple
 * e820 entries by hvmloader. At this point we don't change anything.
 *
 * #2. RDM region if it exists
 *
 * #3. High memory region if it exists
 *
 * Note: these regions are not overlapping since we already check
 * to adjust them. Please refer to libxl__domain_device_construct_rdm().
 */
#define GUEST_LOW_MEM_START_DEFAULT 0x100000
int libxl__arch_domain_construct_memmap(libxl__gc *gc,
                                        libxl_domain_config *d_config,
                                        uint32_t domid,
                                        struct xc_dom_image *dom)
{
    int rc = 0;
    unsigned int nr = 0, i;
    /* We always own at least one lowmem entry. */
    unsigned int e820_entries = 1;
    struct e820entry *e820 = NULL;
    uint64_t highmem_size =
                    dom->highmem_end ? dom->highmem_end - (1ull << 32) : 0;
    uint32_t lowmem_start = dom->device_model ? GUEST_LOW_MEM_START_DEFAULT : 0;
    unsigned page_size = XC_DOM_PAGE_SIZE(dom);

    /* Add all rdm entries. */
    for (i = 0; i < d_config->num_rdms; i++)
        if (d_config->rdms[i].policy != LIBXL_RDM_RESERVE_POLICY_INVALID)
            e820_entries++;

    /* If we should have a highmem range. */
    if (highmem_size)
        e820_entries++;

    for (i = 0; i < MAX_ACPI_MODULES; i++)
        if (dom->acpi_modules[i].length)
            e820_entries++;

    if (e820_entries >= E820MAX) {
        LOGD(ERROR, domid, "Ooops! Too many entries in the memory map!");
        rc = ERROR_INVAL;
        goto out;
    }

    e820 = libxl__malloc(gc, sizeof(struct e820entry) * e820_entries);

    /* Low memory */
    e820[nr].addr = lowmem_start;
    e820[nr].size = dom->lowmem_end - lowmem_start;
    e820[nr].type = E820_RAM;
    nr++;

    /* RDM mapping */
    for (i = 0; i < d_config->num_rdms; i++) {
        if (d_config->rdms[i].policy == LIBXL_RDM_RESERVE_POLICY_INVALID)
            continue;

        e820[nr].addr = d_config->rdms[i].start;
        e820[nr].size = d_config->rdms[i].size;
        e820[nr].type = E820_RESERVED;
        nr++;
    }

    for (i = 0; i < MAX_ACPI_MODULES; i++) {
        if (dom->acpi_modules[i].length) {
            /* Round the module start down to a page boundary and grow the
             * size by the in-page offset so the whole module is covered. */
            e820[nr].addr = dom->acpi_modules[i].guest_addr_out & ~(page_size - 1);
            e820[nr].size = dom->acpi_modules[i].length +
                (dom->acpi_modules[i].guest_addr_out & (page_size - 1));
            e820[nr].type = E820_ACPI;
            nr++;
        }
    }

    /* High memory */
    if (highmem_size) {
        e820[nr].addr = ((uint64_t)1 << 32);
        e820[nr].size = highmem_size;
        e820[nr].type = E820_RAM;
    }

    if (xc_domain_set_memory_map(CTX->xch, domid, e820, e820_entries) != 0) {
        rc = ERROR_FAIL;
        goto out;
    }

out:
    return rc;
}

void libxl__arch_domain_build_info_acpi_setdefault(
                                        libxl_domain_build_info *b_info)
{
    libxl_defbool_setdefault(&b_info->acpi, true);
}

/*
 * Local variables:
 * mode: C
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */