#include "libxl_internal.h"
#include "libxl_arch.h"

#include <xc_dom.h>

int libxl__arch_domain_prepare_config(libxl__gc *gc,
                                      libxl_domain_config *d_config,
                                      xc_domain_configuration_t *xc_config)
{
    switch (d_config->c_info.type) {
    case LIBXL_DOMAIN_TYPE_HVM:
        xc_config->emulation_flags = XEN_X86_EMU_ALL;
        break;
    case LIBXL_DOMAIN_TYPE_PVH:
        if (libxl_defbool_val(d_config->b_info.apic))
            /* PVH guests may want to have LAPIC emulation. */
            xc_config->emulation_flags = XEN_X86_EMU_LAPIC;
        else
            xc_config->emulation_flags = 0;
        break;
    case LIBXL_DOMAIN_TYPE_PV:
        xc_config->emulation_flags = 0;
        break;
    default:
        abort();
    }

    return 0;
}
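
/*
 * For reference, the mapping established above is:
 *
 *     HVM            -> XEN_X86_EMU_ALL    (all emulation flags)
 *     PVH, apic=1    -> XEN_X86_EMU_LAPIC  (local APIC only)
 *     PVH, apic=0    -> 0
 *     PV             -> 0
 */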

int libxl__arch_domain_save_config(libxl__gc *gc,
                                   libxl_domain_config *d_config,
                                   const xc_domain_configuration_t *xc_config)
{
    return 0;
}

static const char *e820_names(int type)
{
    switch (type) {
    case E820_RAM: return "RAM";
    case E820_RESERVED: return "Reserved";
    case E820_ACPI: return "ACPI";
    case E820_NVS: return "ACPI NVS";
    case E820_UNUSABLE: return "Unusable";
    default: break;
    }
    return "Unknown";
}

static int e820_sanitize(libxl__gc *gc, struct e820entry src[],
                         uint32_t *nr_entries,
                         unsigned long map_limitkb,
                         unsigned long balloon_kb)
{
    uint64_t delta_kb = 0, start = 0, start_kb = 0, last = 0, ram_end;
    uint32_t i, idx = 0, nr;
    struct e820entry e820[E820MAX];

    if (!src || !map_limitkb || !nr_entries)
        return ERROR_INVAL;

    nr = *nr_entries;
    if (!nr)
        return ERROR_INVAL;

    if (nr > E820MAX)
        return ERROR_NOMEM;

    /* Weed out anything starting at or below 1MB */
    for (i = 0; i < nr; i++) {
        if (src[i].addr > 0x100000)
            continue;

        src[i].type = 0;
        src[i].size = 0;
        src[i].addr = -1ULL;
    }

    /* Find the lowest and highest entry in the E820, skipping over
     * undesired entries. */
    start = -1ULL;
    last = 0;
    for (i = 0; i < nr; i++) {
        if ((src[i].type == E820_RAM) ||
            (src[i].type == E820_UNUSABLE) ||
            (src[i].type == 0))
            continue;

        start = src[i].addr < start ? src[i].addr : start;
        last = src[i].addr + src[i].size > last ?
               src[i].addr + src[i].size : last;
    }
    if (start > 1024)
        start_kb = start >> 10;

    /* Add the memory RAM region for the guest */
    e820[idx].addr = 0;
    e820[idx].size = (uint64_t)map_limitkb << 10;
    e820[idx].type = E820_RAM;

    /* .. and trim if necessary */
    if (start_kb && map_limitkb > start_kb) {
        delta_kb = map_limitkb - start_kb;
        if (delta_kb)
            e820[idx].size -= delta_kb << 10;
    }
    /* Note: We don't touch balloon_kb here. It is added at the end. */
    ram_end = e820[idx].addr + e820[idx].size;
    idx++;

    LOG(DEBUG, "Memory: %"PRIu64"kB End of RAM: "
        "0x%"PRIx64" (PFN) Delta: %"PRIu64"kB, PCI start: %"PRIu64"kB "
        "(0x%"PRIx64" PFN), Balloon %"PRIu64"kB", (uint64_t)map_limitkb,
        ram_end >> 12, delta_kb, start_kb, start >> 12,
        (uint64_t)balloon_kb);

    /* The code below guards against the case where an Intel IGD is passed
     * through to the guest.  If no IGD is passed in, it can be ignored.
     *
     * The reason for it is that Intel boxes fill their E820 with E820_RAM
     * entries amongst the E820_RESERVED ones, and we cannot simply ditch
     * those E820_RAM entries: any "gap" in the E820 is considered PCI I/O
     * space by Linux, and would be used by the Intel IGD as I/O space even
     * though it is in reality a RAM region.
     *
     * This means we have to walk the E820, and any region that is RAM,
     * below 4GB and above ram_end must have its type changed to
     * E820_UNUSABLE.  We also need to move some of the E820_RAM regions
     * if they overlap with ram_end. */
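    /*
     * A hypothetical example of what the loop below fixes up: with
     * ram_end at 0x80000000 and a host E820 of
     *
     *     [0x000200000 -> 0x0a0000000] E820_RAM
     *     [0x0a0000000 -> 0x0b0000000] E820_RESERVED
     *
     * the E820_RAM entry straddles ram_end, so it is clipped to
     * [0x080000000 -> 0x0a0000000] and retyped E820_UNUSABLE, which
     * stops Linux from treating that range as a PCI I/O hole.
     */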
    for (i = 0; i < nr; i++) {
        uint64_t end = src[i].addr + src[i].size;

        /* We don't care about E820_UNUSABLE, but we need to
         * change the type to zero b/c the loop after this
         * sticks E820_UNUSABLE on the guest's E820 but ignores
         * the ones with type zero. */
        if ((src[i].type == E820_UNUSABLE) ||
            /* Any region that is within the "RAM region" can
             * be safely ditched. */
            (end < ram_end)) {
            src[i].type = 0;
            continue;
        }

        /* Look only at RAM regions. */
        if (src[i].type != E820_RAM)
            continue;

        /* We only care about RAM regions below 4GB. */
        if (src[i].addr >= (1ULL<<32))
            continue;

        /* E820_RAM overlaps with our RAM region. Move it. */
        if (src[i].addr < ram_end) {
            uint64_t delta;

            src[i].type = E820_UNUSABLE;
            delta = ram_end - src[i].addr;
            /* The "end < ram_end" check above should weed this out. */
            if (src[i].size < delta)
                src[i].type = 0;
            else {
                src[i].size -= delta;
                src[i].addr = ram_end;
            }
            if (src[i].addr + src[i].size != end) {
                /* We messed up somewhere */
                src[i].type = 0;
                LOGE(ERROR, "Computed E820 wrongly. Continuing on.");
            }
        }
        /* Lastly, convert the RAM to UNUSABLE. Look in the Linux kernel
           at git commit 2f14ddc3a7146ea4cd5a3d1ecd993f85f2e4f948
           "xen/setup: Inhibit resource API from using System RAM E820
           gaps as PCI mem gaps" for the full explanation. */
        if (end > ram_end)
            src[i].type = E820_UNUSABLE;
    }

    /* Check if there is a region between ram_end and start. */
    if (start > ram_end) {
        int add_unusable = 1;
        for (i = 0; i < nr && add_unusable; i++) {
            if (src[i].type != E820_UNUSABLE)
                continue;
            if (ram_end != src[i].addr)
                continue;
            if (start != src[i].addr + src[i].size) {
                /* there is one, adjust it */
                src[i].size = start - src[i].addr;
            }
            add_unusable = 0;
        }
        /* .. and if not present, add it in. This is to guard against
           the Linux guest assuming that the gap between the end of the
           RAM region and the start of the E820_[ACPI,NVS,RESERVED]
           is PCI I/O space. Which it certainly is _not_. */
        if (add_unusable) {
            e820[idx].type = E820_UNUSABLE;
            e820[idx].addr = ram_end;
            e820[idx].size = start - ram_end;
            idx++;
        }
    }
    /* Almost done: copy them over, ignoring the undesirable ones */
    for (i = 0; i < nr; i++) {
        if ((src[i].type == E820_RAM) ||
            (src[i].type == 0))
            continue;

        e820[idx].type = src[i].type;
        e820[idx].addr = src[i].addr;
        e820[idx].size = src[i].size;
        idx++;
    }
    /* At this point we have the mapped RAM + E820 entries from src. */
    if (balloon_kb || delta_kb) {
        /* If we truncated the RAM region, add it back at the end, */
        e820[idx].type = E820_RAM;
        e820[idx].addr = (uint64_t)(1ULL << 32) > last ?
                         (uint64_t)(1ULL << 32) : last;
        /* .. together with the balloon memory. */
        e820[idx].size = (delta_kb << 10) +
                         ((uint64_t)balloon_kb << 10);
        idx++;
    }
    nr = idx;

    for (i = 0; i < nr; i++) {
        LOG(DEBUG, ":\t[%"PRIx64" -> %"PRIx64"] %s", e820[i].addr >> 12,
            (e820[i].addr + e820[i].size) >> 12, e820_names(e820[i].type));
    }

    /* Done: copy the sanitized version. */
    *nr_entries = nr;
    memcpy(src, e820, nr * sizeof(struct e820entry));
    return 0;
}
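
/* Fetch the host E820 map and sanitize it for this guest.  The balloon
 * size handed to e820_sanitize() is (max_memkb - target_memkb) +
 * slack_memkb; e.g. (hypothetical numbers) target_memkb = 4194304 and
 * max_memkb = 6291456 with no slack yield a 2097152kB balloon region
 * appended after the end of RAM. */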
static int e820_host_sanitize(libxl__gc *gc,
                              libxl_domain_build_info *b_info,
                              struct e820entry map[],
                              uint32_t *nr)
{
    int rc;

    rc = xc_get_machine_memory_map(CTX->xch, map, *nr);
    if (rc < 0)
        return ERROR_FAIL;

    *nr = rc;

    rc = e820_sanitize(gc, map, nr, b_info->target_memkb,
                       (b_info->max_memkb - b_info->target_memkb) +
                       b_info->u.pv.slack_memkb);
    return rc;
}

static int libxl__e820_alloc(libxl__gc *gc, uint32_t domid,
                             libxl_domain_config *d_config)
{
    libxl_ctx *ctx = libxl__gc_owner(gc);
    int rc;
    uint32_t nr;
    struct e820entry map[E820MAX];
    libxl_domain_build_info *b_info;

    if (d_config == NULL || d_config->c_info.type != LIBXL_DOMAIN_TYPE_PV)
        return ERROR_INVAL;

    b_info = &d_config->b_info;
    if (!libxl_defbool_val(b_info->u.pv.e820_host))
        return ERROR_INVAL;

    nr = E820MAX;
    rc = e820_host_sanitize(gc, b_info, map, &nr);
    if (rc)
        return ERROR_FAIL;

    rc = xc_domain_set_memory_map(ctx->xch, domid, map, nr);
    if (rc < 0)
        return ERROR_FAIL;

    return 0;
}

int libxl__arch_domain_create(libxl__gc *gc, libxl_domain_config *d_config,
                              uint32_t domid)
{
    int ret = 0;
    int tsc_mode;
    uint32_t rtc_timeoffset;
    libxl_ctx *ctx = libxl__gc_owner(gc);

    if (d_config->b_info.type == LIBXL_DOMAIN_TYPE_PV)
        xc_domain_set_memmap_limit(ctx->xch, domid,
                                   (d_config->b_info.max_memkb +
                                    d_config->b_info.u.pv.slack_memkb));

    switch (d_config->b_info.tsc_mode) {
    case LIBXL_TSC_MODE_DEFAULT:
        tsc_mode = 0;
        break;
    case LIBXL_TSC_MODE_ALWAYS_EMULATE:
        tsc_mode = 1;
        break;
    case LIBXL_TSC_MODE_NATIVE:
        tsc_mode = 2;
        break;
    case LIBXL_TSC_MODE_NATIVE_PARAVIRT:
        tsc_mode = 3;
        break;
    default:
        abort();
    }
    xc_domain_set_tsc_info(ctx->xch, domid, tsc_mode, 0, 0, 0);
    if (libxl_defbool_val(d_config->b_info.disable_migrate))
        xc_domain_disable_migrate(ctx->xch, domid);
    rtc_timeoffset = d_config->b_info.rtc_timeoffset;
    if (libxl_defbool_val(d_config->b_info.localtime)) {
        time_t t;
        struct tm *tm, result;

        t = time(NULL);
        tm = localtime_r(&t, &result);

        if (!tm) {
            LOGED(ERROR, domid, "Failed to call localtime_r");
            ret = ERROR_FAIL;
            goto out;
        }

        rtc_timeoffset += tm->tm_gmtoff;
    }

    if (rtc_timeoffset)
        xc_domain_set_time_offset(ctx->xch, domid, rtc_timeoffset);

    if (d_config->b_info.type != LIBXL_DOMAIN_TYPE_PV) {
        unsigned long shadow = DIV_ROUNDUP(d_config->b_info.shadow_memkb,
                                           1024);
        xc_shadow_control(ctx->xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
                          NULL, 0, &shadow, 0, NULL);
    }

    if (d_config->c_info.type == LIBXL_DOMAIN_TYPE_PV &&
        libxl_defbool_val(d_config->b_info.u.pv.e820_host)) {
        ret = libxl__e820_alloc(gc, domid, d_config);
        if (ret) {
            LOGED(ERROR, domid, "Failed while collecting E820 with: %d (errno: %d)",
                  ret, errno);
        }
    }

 out:
    return ret;
}

int libxl__arch_extra_memory(libxl__gc *gc,
                             const libxl_domain_build_info *info,
                             uint64_t *out)
{
    *out = LIBXL_MAXMEM_CONSTANT;

    return 0;
}

int libxl__arch_domain_init_hw_description(libxl__gc *gc,
                                           libxl_domain_build_info *info,
                                           libxl__domain_build_state *state,
                                           struct xc_dom_image *dom)
{
    return 0;
}

int libxl__arch_domain_finalise_hw_description(libxl__gc *gc,
                                               libxl_domain_build_info *info,
                                               struct xc_dom_image *dom)
{
    int rc = 0;

    if (info->type == LIBXL_DOMAIN_TYPE_PVH) {
        rc = libxl__dom_load_acpi(gc, info, dom);
        if (rc != 0)
            LOGE(ERROR, "libxl__dom_load_acpi failed");
    }

    return rc;
}

int libxl__arch_build_dom_finish(libxl__gc *gc,
                                 libxl_domain_build_info *info,
                                 struct xc_dom_image *dom,
                                 libxl__domain_build_state *state)
{
    return 0;
}

/* Return 0 on success, ERROR_* on failure. */
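/*
 * A sketch of the e820_host case with illustrative numbers: a 4GB vnode
 * laid over a sanitized host map of
 *
 *     [0x000000000 -> 0x0c0000000] E820_RAM
 *     [0x0c0000000 -> 0x100000000] E820_RESERVED
 *     [0x100000000 -> 0x140000000] E820_RAM
 *
 * is split into two vmemranges, [0, 3GB) and [4GB, 5GB), both carrying
 * that vnode's nid.
 */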
int libxl__arch_vnuma_build_vmemrange(libxl__gc *gc,
                                      uint32_t domid,
                                      libxl_domain_build_info *b_info,
                                      libxl__domain_build_state *state)
{
    int nid, nr_vmemrange, rc;
    uint32_t nr_e820, e820_count;
    struct e820entry map[E820MAX];
    xen_vmemrange_t *vmemranges;
    unsigned int array_size;

    /* If e820_host is not set, call the generic function */
    if (!(b_info->type == LIBXL_DOMAIN_TYPE_PV &&
          libxl_defbool_val(b_info->u.pv.e820_host)))
        return libxl__vnuma_build_vmemrange_pv_generic(gc, domid, b_info,
                                                       state);

    assert(state->vmemranges == NULL);

    nr_e820 = E820MAX;
    rc = e820_host_sanitize(gc, b_info, map, &nr_e820);
    if (rc) goto out;

    e820_count = 0;
    nr_vmemrange = 0;
    vmemranges = NULL;
    array_size = 0;
    for (nid = 0; nid < b_info->num_vnuma_nodes; nid++) {
        libxl_vnode_info *p = &b_info->vnuma_nodes[nid];
        uint64_t remaining_bytes = (p->memkb << 10), bytes;

        while (remaining_bytes > 0) {
            if (e820_count >= nr_e820) {
                rc = ERROR_NOMEM;
                goto out;
            }

            /* Skip non-RAM regions */
            if (map[e820_count].type != E820_RAM) {
                e820_count++;
                continue;
            }

            if (nr_vmemrange >= array_size) {
                array_size += 32;
                GCREALLOC_ARRAY(vmemranges, array_size);
            }

            bytes = map[e820_count].size >= remaining_bytes ?
                    remaining_bytes : map[e820_count].size;

            vmemranges[nr_vmemrange].start = map[e820_count].addr;
            vmemranges[nr_vmemrange].end = map[e820_count].addr + bytes;

            if (map[e820_count].size >= remaining_bytes) {
                map[e820_count].addr += bytes;
                map[e820_count].size -= bytes;
            } else {
                e820_count++;
            }

            remaining_bytes -= bytes;

            vmemranges[nr_vmemrange].flags = 0;
            vmemranges[nr_vmemrange].nid = nid;
            nr_vmemrange++;
        }
    }

    state->vmemranges = vmemranges;
    state->num_vmemranges = nr_vmemrange;

    rc = 0;
 out:
    return rc;
}

int libxl__arch_domain_map_irq(libxl__gc *gc, uint32_t domid, int irq)
{
    int ret;

    ret = xc_physdev_map_pirq(CTX->xch, domid, irq, &irq);
    if (ret)
        return ret;

    ret = xc_domain_irq_permission(CTX->xch, domid, irq, 1);

    return ret;
}
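
/*
 * The function above first maps the machine IRQ to a pirq for the domain
 * and then grants the domain permission to use it; callers (for instance,
 * the domain-creation path processing a guest's irq list) are expected to
 * invoke it once per IRQ.
 */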

/*
 * Here we're just trying to set these kinds of e820 mappings:
 *
 * #1. Low memory region
 *
 * Low RAM starts at least at 1MB so that all the standard regions of the
 * PC memory map (BIOS, VGA memory-mapped I/O, vgabios, ...) have enough
 * space.
 * Note: the entries below 1MB are still constructed, as multiple e820
 * entries, by hvmloader; at this point we don't change anything.
 *
 * #2. RDM region if it exists
 *
 * #3. High memory region if it exists
 *
 * Note: these regions do not overlap, since they have already been
 * checked and adjusted; see libxl__domain_device_construct_rdm().
 */
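/*
 * A resulting map might look like this (illustrative addresses only):
 *
 *     [0x000100000 -> 0x0f0000000] E820_RAM       low memory
 *     [0x0f0000000 -> 0x0f0010000] E820_RESERVED  RDM
 *     [0x100000000 -> 0x140000000] E820_RAM       high memory
 */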
#define GUEST_LOW_MEM_START_DEFAULT 0x100000

int libxl__arch_domain_construct_memmap(libxl__gc *gc,
                                        libxl_domain_config *d_config,
                                        uint32_t domid,
                                        struct xc_dom_image *dom)
{
    int rc = 0;
    unsigned int nr = 0, i;
    /* We always own at least one lowmem entry. */
    unsigned int e820_entries = 1;
    struct e820entry *e820 = NULL;
    uint64_t highmem_size =
        dom->highmem_end ? dom->highmem_end - (1ull << 32) : 0;
    uint32_t lowmem_start = dom->device_model ? GUEST_LOW_MEM_START_DEFAULT : 0;
    unsigned page_size = XC_DOM_PAGE_SIZE(dom);

    /* Add all rdm entries. */
    for (i = 0; i < d_config->num_rdms; i++)
        if (d_config->rdms[i].policy != LIBXL_RDM_RESERVE_POLICY_INVALID)
            e820_entries++;

    /* If we should have a highmem range. */
    if (highmem_size)
        e820_entries++;

    for (i = 0; i < MAX_ACPI_MODULES; i++)
        if (dom->acpi_modules[i].length)
            e820_entries++;

    if (e820_entries >= E820MAX) {
        LOGD(ERROR, domid, "Oops! Too many entries in the memory map!");
        rc = ERROR_INVAL;
        goto out;
    }

    e820 = libxl__malloc(gc, sizeof(struct e820entry) * e820_entries);

    /* Low memory */
    e820[nr].addr = lowmem_start;
    e820[nr].size = dom->lowmem_end - lowmem_start;
    e820[nr].type = E820_RAM;
    nr++;

    /* RDM mapping */
    for (i = 0; i < d_config->num_rdms; i++) {
        if (d_config->rdms[i].policy == LIBXL_RDM_RESERVE_POLICY_INVALID)
            continue;

        e820[nr].addr = d_config->rdms[i].start;
        e820[nr].size = d_config->rdms[i].size;
        e820[nr].type = E820_RESERVED;
        nr++;
    }

    /* ACPI modules: address rounded down to a page boundary, size grown
     * by the offset into that page. */
    for (i = 0; i < MAX_ACPI_MODULES; i++) {
        if (dom->acpi_modules[i].length) {
            e820[nr].addr = dom->acpi_modules[i].guest_addr_out & ~(page_size - 1);
            e820[nr].size = dom->acpi_modules[i].length +
                (dom->acpi_modules[i].guest_addr_out & (page_size - 1));
            e820[nr].type = E820_ACPI;
            nr++;
        }
    }

    /* High memory */
    if (highmem_size) {
        e820[nr].addr = ((uint64_t)1 << 32);
        e820[nr].size = highmem_size;
        e820[nr].type = E820_RAM;
    }

    if (xc_domain_set_memory_map(CTX->xch, domid, e820, e820_entries) != 0) {
        rc = ERROR_FAIL;
        goto out;
    }

 out:
    return rc;
}

void libxl__arch_domain_build_info_acpi_setdefault(
                                        libxl_domain_build_info *b_info)
{
    libxl_defbool_setdefault(&b_info->acpi, true);
}

/*
 * Local variables:
 * mode: C
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */