#include <assert.h>

#include "xc_sr_common_x86_pv.h"

static xen_pfn_t pfn_to_mfn(const struct xc_sr_context *ctx, xen_pfn_t pfn)
{
    assert(pfn <= ctx->x86_pv.max_pfn);

    return xc_pfn_to_mfn(pfn, ctx->x86_pv.p2m, ctx->x86_pv.width);
}
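
/*
 * Note: the local p2m is stored in the guest's width (4- or 8-byte
 * entries).  As a rough sketch (the real helper lives in libxc's private
 * headers), xc_pfn_to_mfn() behaves like:
 *
 *     if ( width == sizeof(uint64_t) )
 *         return ((uint64_t *)p2m)[pfn];
 *     else
 *     {
 *         uint32_t mfn = ((uint32_t *)p2m)[pfn];
 *         return mfn == ~0U ? INVALID_MFN : mfn;
 *     }
 *
 * i.e. a 32-bit guest's ~0U sentinel is widened back to INVALID_MFN for a
 * 64-bit toolstack, the inverse of x86_pv_set_gfn() below.
 */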

/*
 * Expand our local tracking information for the p2m table and the domain's
 * maximum size.  Normally this will be called once to expand from 0 to
 * max_pfn, but it is liable to be called multiple times if the domain grows
 * on the sending side after migration has started.
 */
static int expand_p2m(struct xc_sr_context *ctx, unsigned long max_pfn)
{
    xc_interface *xch = ctx->xch;
    unsigned long old_max = ctx->x86_pv.max_pfn, i;
    unsigned int fpp = PAGE_SIZE / ctx->x86_pv.width;
    unsigned long end_frame = (max_pfn / fpp) + 1;
    unsigned long old_end_frame = (old_max / fpp) + 1;
    xen_pfn_t *p2m = NULL, *p2m_pfns = NULL;
    uint32_t *pfn_types = NULL;
    size_t p2msz, p2m_pfnsz, pfn_typesz;
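
    /*
     * Example of the frame arithmetic, assuming a 64-bit guest (width == 8,
     * so fpp == PAGE_SIZE / 8 == 512): max_pfn == 0x9ffff gives
     * end_frame == (0x9ffff / 512) + 1 == 0x500, i.e. 0x500 p2m frames
     * cover pfns 0 through 0x9ffff inclusive.
     */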

    assert(max_pfn > old_max);

    p2msz = (max_pfn + 1) * ctx->x86_pv.width;
    p2m = realloc(ctx->x86_pv.p2m, p2msz);
    if ( !p2m )
    {
        ERROR("Failed to (re)alloc %zu bytes for p2m", p2msz);
        return -1;
    }
    ctx->x86_pv.p2m = p2m;

    pfn_typesz = (max_pfn + 1) * sizeof(*pfn_types);
    pfn_types = realloc(ctx->x86_pv.restore.pfn_types, pfn_typesz);
    if ( !pfn_types )
    {
        ERROR("Failed to (re)alloc %zu bytes for pfn_types", pfn_typesz);
        return -1;
    }
    ctx->x86_pv.restore.pfn_types = pfn_types;

    p2m_pfnsz = (end_frame + 1) * sizeof(*p2m_pfns);
    p2m_pfns = realloc(ctx->x86_pv.p2m_pfns, p2m_pfnsz);
    if ( !p2m_pfns )
    {
        ERROR("Failed to (re)alloc %zu bytes for p2m frame list", p2m_pfnsz);
        return -1;
    }
    ctx->x86_pv.p2m_frames = end_frame;
    ctx->x86_pv.p2m_pfns = p2m_pfns;

    ctx->x86_pv.max_pfn = max_pfn;
    for ( i = (old_max ? old_max + 1 : 0); i <= max_pfn; ++i )
    {
        ctx->restore.ops.set_gfn(ctx, i, INVALID_MFN);
        ctx->restore.ops.set_page_type(ctx, i, 0);
    }

    for ( i = (old_end_frame ? old_end_frame + 1 : 0); i <= end_frame; ++i )
        ctx->x86_pv.p2m_pfns[i] = INVALID_MFN;

    DPRINTF("Changed max_pfn from %#lx to %#lx", old_max, max_pfn);
    return 0;
}

/*
 * Pin all of the pagetables.
 */
static int pin_pagetables(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    unsigned long i, nr_pins;
    struct mmuext_op pin[MAX_PIN_BATCH];
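
    /*
     * Pin operations are accumulated and issued in batches of up to
     * MAX_PIN_BATCH, bounding the size of each xc_mmuext_op() hypercall;
     * any partial batch left over is flushed after the loop.
     */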

    for ( i = nr_pins = 0; i <= ctx->x86_pv.max_pfn; ++i )
    {
        if ( (ctx->x86_pv.restore.pfn_types[i] &
              XEN_DOMCTL_PFINFO_LPINTAB) == 0 )
            continue;

        switch ( (ctx->x86_pv.restore.pfn_types[i] &
                  XEN_DOMCTL_PFINFO_LTABTYPE_MASK) )
        {
        case XEN_DOMCTL_PFINFO_L1TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L1_TABLE;
            break;
        case XEN_DOMCTL_PFINFO_L2TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L2_TABLE;
            break;
        case XEN_DOMCTL_PFINFO_L3TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L3_TABLE;
            break;
        case XEN_DOMCTL_PFINFO_L4TAB:
            pin[nr_pins].cmd = MMUEXT_PIN_L4_TABLE;
            break;
        default:
            continue;
        }

        pin[nr_pins].arg1.mfn = pfn_to_mfn(ctx, i);
        nr_pins++;

        if ( nr_pins == MAX_PIN_BATCH )
        {
            if ( xc_mmuext_op(xch, pin, nr_pins, ctx->domid) != 0 )
            {
                PERROR("Failed to pin batch of pagetables");
                return -1;
            }
            nr_pins = 0;
        }
    }

    if ( (nr_pins > 0) && (xc_mmuext_op(xch, pin, nr_pins, ctx->domid) < 0) )
    {
        PERROR("Failed to pin batch of pagetables");
        return -1;
    }

    return 0;
}

/*
 * Update details in a guest's start_info structure.
 */
static int process_start_info(struct xc_sr_context *ctx,
                              vcpu_guest_context_any_t *vcpu)
{
    xc_interface *xch = ctx->xch;
    xen_pfn_t pfn, mfn;
    start_info_any_t *guest_start_info = NULL;
    int rc = -1;
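
    /*
     * The suspend record stashes the start_info frame in vcpu0's %edx
     * (as a pfn in the stream); translate it to an mfn and write it back.
     */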

    pfn = GET_FIELD(vcpu, user_regs.edx, ctx->x86_pv.width);

    if ( pfn > ctx->x86_pv.max_pfn )
    {
        ERROR("Start Info pfn %#lx out of range", pfn);
        goto err;
    }
    else if ( ctx->x86_pv.restore.pfn_types[pfn] != XEN_DOMCTL_PFINFO_NOTAB )
    {
        ERROR("Start Info pfn %#lx has bad type %u", pfn,
              (ctx->x86_pv.restore.pfn_types[pfn] >>
               XEN_DOMCTL_PFINFO_LTAB_SHIFT));
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("Start Info has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    SET_FIELD(vcpu, user_regs.edx, mfn, ctx->x86_pv.width);
    guest_start_info = xc_map_foreign_range(
        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE, mfn);
    if ( !guest_start_info )
    {
        PERROR("Failed to map Start Info at mfn %#lx", mfn);
        goto err;
    }

    /* Deal with xenstore stuff */
    pfn = GET_FIELD(guest_start_info, store_mfn, ctx->x86_pv.width);
    if ( pfn > ctx->x86_pv.max_pfn )
    {
        ERROR("XenStore pfn %#lx out of range", pfn);
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("XenStore pfn has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    ctx->restore.xenstore_gfn = mfn;
    SET_FIELD(guest_start_info, store_mfn, mfn, ctx->x86_pv.width);
    SET_FIELD(guest_start_info, store_evtchn,
              ctx->restore.xenstore_evtchn, ctx->x86_pv.width);

    /* Deal with console stuff */
    pfn = GET_FIELD(guest_start_info, console.domU.mfn, ctx->x86_pv.width);
    if ( pfn > ctx->x86_pv.max_pfn )
    {
        ERROR("Console pfn %#lx out of range", pfn);
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("Console pfn has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    ctx->restore.console_gfn = mfn;
    SET_FIELD(guest_start_info, console.domU.mfn, mfn, ctx->x86_pv.width);
    SET_FIELD(guest_start_info, console.domU.evtchn,
              ctx->restore.console_evtchn, ctx->x86_pv.width);

    /* Set other information */
    SET_FIELD(guest_start_info, nr_pages,
              ctx->x86_pv.max_pfn + 1, ctx->x86_pv.width);
    SET_FIELD(guest_start_info, shared_info,
              ctx->dominfo.shared_info_frame << PAGE_SHIFT, ctx->x86_pv.width);
    SET_FIELD(guest_start_info, flags, 0, ctx->x86_pv.width);

    rc = 0;

 err:
    if ( guest_start_info )
        munmap(guest_start_info, PAGE_SIZE);

    return rc;
}

/*
 * Process one stashed vcpu's worth of basic state and send to Xen.
 */
static int process_vcpu_basic(struct xc_sr_context *ctx,
                              unsigned int vcpuid)
{
    xc_interface *xch = ctx->xch;
    vcpu_guest_context_any_t vcpu;
    xen_pfn_t pfn, mfn;
    unsigned i, gdt_count;
    int rc = -1;

    memcpy(&vcpu, ctx->x86_pv.restore.vcpus[vcpuid].basic,
           ctx->x86_pv.restore.vcpus[vcpuid].basicsz);

    /* Vcpu 0 is special: Convert the suspend record to an mfn. */
    if ( vcpuid == 0 )
    {
        rc = process_start_info(ctx, &vcpu);
        if ( rc )
            return rc;
        rc = -1;
    }

    SET_FIELD(&vcpu, flags,
              GET_FIELD(&vcpu, flags, ctx->x86_pv.width) | VGCF_online,
              ctx->x86_pv.width);

    gdt_count = GET_FIELD(&vcpu, gdt_ents, ctx->x86_pv.width);
    if ( gdt_count > FIRST_RESERVED_GDT_ENTRY )
    {
        ERROR("GDT entry count (%u) out of range (max %u)",
              gdt_count, FIRST_RESERVED_GDT_ENTRY);
        errno = ERANGE;
        goto err;
    }
    gdt_count = (gdt_count + 511) / 512; /* gdt_count now in units of frames. */
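    /*
     * Each 4k frame holds 512 8-byte descriptors, hence the round-up:
     * e.g. 513 GDT entries need two frames.
     */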

    /* Convert GDT frames to mfns. */
    for ( i = 0; i < gdt_count; ++i )
    {
        pfn = GET_FIELD(&vcpu, gdt_frames[i], ctx->x86_pv.width);
        if ( pfn > ctx->x86_pv.max_pfn )
        {
            ERROR("GDT frame %u (pfn %#lx) out of range", i, pfn);
            goto err;
        }
        else if ( (ctx->x86_pv.restore.pfn_types[pfn] !=
                   XEN_DOMCTL_PFINFO_NOTAB) )
        {
            ERROR("GDT frame %u (pfn %#lx) has bad type %u", i, pfn,
                  (ctx->x86_pv.restore.pfn_types[pfn] >>
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT));
            goto err;
        }

        mfn = pfn_to_mfn(ctx, pfn);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("GDT frame %u has bad mfn", i);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            goto err;
        }

        SET_FIELD(&vcpu, gdt_frames[i], mfn, ctx->x86_pv.width);
    }

    /* Convert CR3 to an mfn. */
    pfn = cr3_to_mfn(ctx, GET_FIELD(&vcpu, ctrlreg[3], ctx->x86_pv.width));
    if ( pfn > ctx->x86_pv.max_pfn )
    {
        ERROR("cr3 (pfn %#lx) out of range", pfn);
        goto err;
    }
    else if ( (ctx->x86_pv.restore.pfn_types[pfn] &
                XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
              (((xen_pfn_t)ctx->x86_pv.levels) <<
               XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
    {
        ERROR("cr3 (pfn %#lx) has bad type %u, expected %u", pfn,
              (ctx->x86_pv.restore.pfn_types[pfn] >>
               XEN_DOMCTL_PFINFO_LTAB_SHIFT),
              ctx->x86_pv.levels);
        goto err;
    }

    mfn = pfn_to_mfn(ctx, pfn);
    if ( !mfn_in_pseudophysmap(ctx, mfn) )
    {
        ERROR("cr3 has bad mfn");
        dump_bad_pseudophysmap_entry(ctx, mfn);
        goto err;
    }

    SET_FIELD(&vcpu, ctrlreg[3], mfn_to_cr3(ctx, mfn), ctx->x86_pv.width);

    /* 64bit guests: Convert CR1 (guest pagetables) to mfn. */
    if ( ctx->x86_pv.levels == 4 && (vcpu.x64.ctrlreg[1] & 1) )
    {
        pfn = vcpu.x64.ctrlreg[1] >> PAGE_SHIFT;

        if ( pfn > ctx->x86_pv.max_pfn )
        {
            ERROR("cr1 (pfn %#lx) out of range", pfn);
            goto err;
        }
        else if ( (ctx->x86_pv.restore.pfn_types[pfn] &
                   XEN_DOMCTL_PFINFO_LTABTYPE_MASK) !=
                  (((xen_pfn_t)ctx->x86_pv.levels) <<
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT) )
        {
            ERROR("cr1 (pfn %#lx) has bad type %u, expected %u", pfn,
                  (ctx->x86_pv.restore.pfn_types[pfn] >>
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT),
                  ctx->x86_pv.levels);
            goto err;
        }

        mfn = pfn_to_mfn(ctx, pfn);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("cr1 has bad mfn");
            dump_bad_pseudophysmap_entry(ctx, mfn);
            goto err;
        }

        vcpu.x64.ctrlreg[1] = (uint64_t)mfn << PAGE_SHIFT;
    }

    if ( xc_vcpu_setcontext(xch, ctx->domid, vcpuid, &vcpu) )
    {
        PERROR("Failed to set vcpu%u's basic info", vcpuid);
        goto err;
    }

    rc = 0;

 err:
    return rc;
}

/*
 * Process one stashed vcpu's worth of extended state and send to Xen.
 */
static int process_vcpu_extended(struct xc_sr_context *ctx,
                                 unsigned int vcpuid)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_x86_pv_restore_vcpu *vcpu =
        &ctx->x86_pv.restore.vcpus[vcpuid];
    DECLARE_DOMCTL;

    domctl.cmd = XEN_DOMCTL_set_ext_vcpucontext;
    domctl.domain = ctx->domid;
    memcpy(&domctl.u.ext_vcpucontext, vcpu->extd, vcpu->extdsz);

    if ( xc_domctl(xch, &domctl) != 0 )
    {
        PERROR("Failed to set vcpu%u's extended info", vcpuid);
        return -1;
    }

    return 0;
}

/*
 * Process one stashed vcpu's worth of xsave state and send to Xen.
 */
static int process_vcpu_xsave(struct xc_sr_context *ctx,
                              unsigned int vcpuid)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_x86_pv_restore_vcpu *vcpu =
        &ctx->x86_pv.restore.vcpus[vcpuid];
    int rc;
    DECLARE_DOMCTL;
    DECLARE_HYPERCALL_BUFFER(void, buffer);
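
    /*
     * Xen reads the blob via a guest handle, so it must sit in
     * hypercall-safe memory: bounce the malloc()d stash through a
     * hypercall buffer rather than handing Xen the pointer directly.
     */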

    buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->xsavesz);
    if ( !buffer )
    {
        ERROR("Unable to allocate %zu bytes for xsave hypercall buffer",
              vcpu->xsavesz);
        return -1;
    }

    domctl.cmd = XEN_DOMCTL_setvcpuextstate;
    domctl.domain = ctx->domid;
    domctl.u.vcpuextstate.vcpu = vcpuid;
    domctl.u.vcpuextstate.size = vcpu->xsavesz;
    set_xen_guest_handle(domctl.u.vcpuextstate.buffer, buffer);

    memcpy(buffer, vcpu->xsave, vcpu->xsavesz);

    rc = xc_domctl(xch, &domctl);
    if ( rc )
        PERROR("Failed to set vcpu%u's xsave info", vcpuid);

    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}

/*
 * Process one stashed vcpu's worth of msr state and send to Xen.
 */
static int process_vcpu_msrs(struct xc_sr_context *ctx,
                             unsigned int vcpuid)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_x86_pv_restore_vcpu *vcpu =
        &ctx->x86_pv.restore.vcpus[vcpuid];
    int rc;
    DECLARE_DOMCTL;
    DECLARE_HYPERCALL_BUFFER(void, buffer);

    buffer = xc_hypercall_buffer_alloc(xch, buffer, vcpu->msrsz);
    if ( !buffer )
    {
        ERROR("Unable to allocate %zu bytes for msr hypercall buffer",
              vcpu->msrsz);
        return -1;
    }

    domctl.cmd = XEN_DOMCTL_set_vcpu_msrs;
    domctl.domain = ctx->domid;
    domctl.u.vcpu_msrs.vcpu = vcpuid;
    domctl.u.vcpu_msrs.msr_count = vcpu->msrsz / sizeof(xen_domctl_vcpu_msr_t);
    set_xen_guest_handle(domctl.u.vcpu_msrs.msrs, buffer);

    memcpy(buffer, vcpu->msr, vcpu->msrsz);

    rc = xc_domctl(xch, &domctl);
    if ( rc )
        PERROR("Failed to set vcpu%u's msrs", vcpuid);

    xc_hypercall_buffer_free(xch, buffer);

    return rc;
}

/*
 * Process all stashed vcpu context and send to Xen.
 */
static int update_vcpu_context(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_x86_pv_restore_vcpu *vcpu;
    unsigned i;
    int rc = 0;

    for ( i = 0; i < ctx->x86_pv.restore.nr_vcpus; ++i )
    {
        vcpu = &ctx->x86_pv.restore.vcpus[i];

        if ( vcpu->basic )
        {
            rc = process_vcpu_basic(ctx, i);
            if ( rc )
                return rc;
        }
        else if ( i == 0 )
        {
            ERROR("Sender didn't send vcpu0's basic state");
            return -1;
        }

        if ( vcpu->extd )
        {
            rc = process_vcpu_extended(ctx, i);
            if ( rc )
                return rc;
        }

        if ( vcpu->xsave )
        {
            rc = process_vcpu_xsave(ctx, i);
            if ( rc )
                return rc;
        }

        if ( vcpu->msr )
        {
            rc = process_vcpu_msrs(ctx, i);
            if ( rc )
                return rc;
        }
    }

    return rc;
}

/*
 * Copy the locally constructed p2m (built up as memory was allocated) over
 * the guest's p2m, so the guest can find its memory again on resume.
 */
static int update_guest_p2m(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    xen_pfn_t mfn, pfn, *guest_p2m = NULL;
    unsigned i;
    int rc = -1;
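
    /*
     * First convert the cached p2m frame list from pfns to mfns (sanity
     * checking each entry), then map those frames as one contiguous region
     * and copy the local p2m over the top of the guest's.
     */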

    for ( i = 0; i < ctx->x86_pv.p2m_frames; ++i )
    {
        pfn = ctx->x86_pv.p2m_pfns[i];

        if ( pfn > ctx->x86_pv.max_pfn )
        {
            ERROR("pfn (%#lx) for p2m_frame_list[%u] out of range",
                  pfn, i);
            goto err;
        }
        else if ( (ctx->x86_pv.restore.pfn_types[pfn] !=
                   XEN_DOMCTL_PFINFO_NOTAB) )
        {
            ERROR("pfn (%#lx) for p2m_frame_list[%u] has bad type %u", pfn, i,
                  (ctx->x86_pv.restore.pfn_types[pfn] >>
                   XEN_DOMCTL_PFINFO_LTAB_SHIFT));
            goto err;
        }

        mfn = pfn_to_mfn(ctx, pfn);
        if ( !mfn_in_pseudophysmap(ctx, mfn) )
        {
            ERROR("p2m_frame_list[%u] has bad mfn", i);
            dump_bad_pseudophysmap_entry(ctx, mfn);
            goto err;
        }

        ctx->x86_pv.p2m_pfns[i] = mfn;
    }

    guest_p2m = xc_map_foreign_pages(xch, ctx->domid, PROT_WRITE,
                                     ctx->x86_pv.p2m_pfns,
                                     ctx->x86_pv.p2m_frames);
    if ( !guest_p2m )
    {
        PERROR("Failed to map p2m frames");
        goto err;
    }

    memcpy(guest_p2m, ctx->x86_pv.p2m,
           (ctx->x86_pv.max_pfn + 1) * ctx->x86_pv.width);
    rc = 0;
 err:
    if ( guest_p2m )
        munmap(guest_p2m, ctx->x86_pv.p2m_frames * PAGE_SIZE);

    return rc;
}

/*
 * Process an X86_PV_INFO record.
 */
static int handle_x86_pv_info(struct xc_sr_context *ctx,
                              struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_info *info = rec->data;

    if ( ctx->x86_pv.restore.seen_pv_info )
    {
        ERROR("Already received X86_PV_INFO record");
        return -1;
    }

    if ( rec->length < sizeof(*info) )
    {
        ERROR("X86_PV_INFO record truncated: length %u, expected %zu",
              rec->length, sizeof(*info));
        return -1;
    }
    else if ( info->guest_width != 4 &&
              info->guest_width != 8 )
    {
        ERROR("Unexpected guest width %u, expected 4 or 8",
              info->guest_width);
        return -1;
    }
    else if ( info->guest_width != ctx->x86_pv.width )
    {
        int rc;
        struct xen_domctl domctl;

        /* Try to set the address size; the domain is always created 64 bit. */
        memset(&domctl, 0, sizeof(domctl));
        domctl.domain = ctx->domid;
        domctl.cmd    = XEN_DOMCTL_set_address_size;
        domctl.u.address_size.size = info->guest_width * 8;
        rc = do_domctl(xch, &domctl);
        if ( rc != 0 )
        {
            ERROR("Width of guest in stream (%u"
                  " bits) differs from existing domain (%u bits)",
                  info->guest_width * 8, ctx->x86_pv.width * 8);
            return -1;
        }

        /* The domain's information has changed; refresh it. */
        rc = x86_pv_domain_info(ctx);
        if ( rc != 0 )
        {
            ERROR("Unable to refresh guest information");
            return -1;
        }
    }
    else if ( info->pt_levels != 3 &&
              info->pt_levels != 4 )
    {
        ERROR("Unexpected guest levels %u, expected 3 or 4",
              info->pt_levels);
        return -1;
    }
    else if ( info->pt_levels != ctx->x86_pv.levels )
    {
        ERROR("Levels of guest in stream (%u"
              ") differs from existing domain (%u)",
              info->pt_levels, ctx->x86_pv.levels);
        return -1;
    }

    ctx->x86_pv.restore.seen_pv_info = true;
    return 0;
}

/*
 * Process an X86_PV_P2M_FRAMES record.  Takes care of expanding the local p2m
 * state if needed.
 */
static int handle_x86_pv_p2m_frames(struct xc_sr_context *ctx,
                                    struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_p2m_frames *data = rec->data;
    unsigned start, end, x, fpp = PAGE_SIZE / ctx->x86_pv.width;
    int rc;

    if ( !ctx->x86_pv.restore.seen_pv_info )
    {
        ERROR("Not yet received X86_PV_INFO record");
        return -1;
    }

    if ( rec->length < sizeof(*data) )
    {
        ERROR("X86_PV_P2M_FRAMES record truncated: length %u, min %zu",
              rec->length, sizeof(*data) + sizeof(uint64_t));
        return -1;
    }
    else if ( data->start_pfn > data->end_pfn )
    {
        ERROR("Start pfn in stream (%#x) exceeds End (%#x)",
              data->start_pfn, data->end_pfn);
        return -1;
    }

    start = data->start_pfn / fpp;
    end = data->end_pfn / fpp + 1;
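
    /*
     * e.g. with fpp == 512 (a 64-bit guest), start_pfn 0x600 and end_pfn
     * 0x800 give start == 3 and end == 5: the record carries p2m frames 3
     * and 4 ('end' is exclusive after the +1 above).
     */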

    if ( rec->length != sizeof(*data) + ((end - start) * sizeof(uint64_t)) )
    {
        ERROR("X86_PV_P2M_FRAMES record wrong size: start_pfn %#x"
              ", end_pfn %#x, length %u, expected %zu + (%u - %u) * %zu",
              data->start_pfn, data->end_pfn, rec->length,
              sizeof(*data), end, start, sizeof(uint64_t));
        return -1;
    }

    if ( data->end_pfn > ctx->x86_pv.max_pfn )
    {
        rc = expand_p2m(ctx, data->end_pfn);
        if ( rc )
            return rc;
    }

    for ( x = 0; x < (end - start); ++x )
        ctx->x86_pv.p2m_pfns[start + x] = data->p2m_pfns[x];

    return 0;
}

/*
 * Processes X86_PV_VCPU_{BASIC,EXTENDED,XSAVE,MSRS} records from the stream.
 * The blobs are all stashed to one side, as they need to be deferred until
 * the very end of the stream rather than being sent to Xen at the point they
 * arrive.  It performs all pre-hypercall size validation.
 */
static int handle_x86_pv_vcpu_blob(struct xc_sr_context *ctx,
                                   struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    struct xc_sr_rec_x86_pv_vcpu_hdr *vhdr = rec->data;
    struct xc_sr_x86_pv_restore_vcpu *vcpu;
    const char *rec_name;
    size_t blobsz;
    void *blob;
    int rc = -1;

    switch ( rec->type )
    {
    case REC_TYPE_X86_PV_VCPU_BASIC:
        rec_name = "X86_PV_VCPU_BASIC";
        break;

    case REC_TYPE_X86_PV_VCPU_EXTENDED:
        rec_name = "X86_PV_VCPU_EXTENDED";
        break;

    case REC_TYPE_X86_PV_VCPU_XSAVE:
        rec_name = "X86_PV_VCPU_XSAVE";
        break;

    case REC_TYPE_X86_PV_VCPU_MSRS:
        rec_name = "X86_PV_VCPU_MSRS";
        break;

    default:
        ERROR("Unrecognised vcpu blob record %s (%u)",
              rec_type_to_str(rec->type), rec->type);
        goto out;
    }

    /* Confirm that there is a complete header. */
    if ( rec->length < sizeof(*vhdr) )
    {
        ERROR("%s record truncated: length %u, header size %zu",
              rec_name, rec->length, sizeof(*vhdr));
        goto out;
    }

    blobsz = rec->length - sizeof(*vhdr);

    /*
     * Tolerate empty records.  Older sending sides used to accidentally
     * generate them.
     */
    if ( blobsz == 0 )
    {
        DBGPRINTF("Skipping empty %s record for vcpu %u\n",
                  rec_type_to_str(rec->type), vhdr->vcpu_id);
        goto out;
    }

    /* Check that the vcpu id is within range. */
    if ( vhdr->vcpu_id >= ctx->x86_pv.restore.nr_vcpus )
    {
        ERROR("%s record vcpu_id (%u) exceeds domain max (%u)",
              rec_name, vhdr->vcpu_id, ctx->x86_pv.restore.nr_vcpus - 1);
        goto out;
    }

    vcpu = &ctx->x86_pv.restore.vcpus[vhdr->vcpu_id];

    /* Further per-record checks, where possible. */
    switch ( rec->type )
    {
    case REC_TYPE_X86_PV_VCPU_BASIC:
    {
        size_t vcpusz = ctx->x86_pv.width == 8 ?
            sizeof(vcpu_guest_context_x86_64_t) :
            sizeof(vcpu_guest_context_x86_32_t);

        if ( blobsz != vcpusz )
        {
            ERROR("%s record wrong size: expected %zu, got %u",
                  rec_name, sizeof(*vhdr) + vcpusz, rec->length);
            goto out;
        }
        break;
    }

    case REC_TYPE_X86_PV_VCPU_EXTENDED:
        if ( blobsz > 128 )
        {
            ERROR("%s record too long: max %zu, got %u",
                  rec_name, sizeof(*vhdr) + 128, rec->length);
            goto out;
        }
        break;

    case REC_TYPE_X86_PV_VCPU_MSRS:
        if ( blobsz % sizeof(xen_domctl_vcpu_msr_t) != 0 )
        {
            ERROR("%s record payload size %zu expected to be a multiple of %zu",
                  rec_name, blobsz, sizeof(xen_domctl_vcpu_msr_t));
            goto out;
        }
        break;
    }

    /* Allocate memory. */
    blob = malloc(blobsz);
    if ( !blob )
    {
        ERROR("Unable to allocate %zu bytes for vcpu%u %s blob",
              blobsz, vhdr->vcpu_id, rec_name);
        goto out;
    }

    memcpy(blob, &vhdr->context, blobsz);

    /* Stash sideways for later. */
    switch ( rec->type )
    {
#define RECSTORE(x, y) case REC_TYPE_X86_PV_ ## x: \
        free(y); (y) = blob; (y ## sz) = blobsz; break
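
        /*
         * e.g. RECSTORE(VCPU_BASIC, vcpu->basic) expands to:
         *
         *     case REC_TYPE_X86_PV_VCPU_BASIC:
         *         free(vcpu->basic);
         *         vcpu->basic = blob;
         *         vcpu->basicsz = blobsz;
         *         break;
         */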

        RECSTORE(VCPU_BASIC,    vcpu->basic);
        RECSTORE(VCPU_EXTENDED, vcpu->extd);
        RECSTORE(VCPU_XSAVE,    vcpu->xsave);
        RECSTORE(VCPU_MSRS,     vcpu->msr);
#undef RECSTORE
    }

    rc = 0;

 out:
    return rc;
}

/*
 * Process a SHARED_INFO record from the stream.
 */
static int handle_shared_info(struct xc_sr_context *ctx,
                              struct xc_sr_record *rec)
{
    xc_interface *xch = ctx->xch;
    unsigned i;
    int rc = -1;
    shared_info_any_t *guest_shinfo = NULL;
    const shared_info_any_t *old_shinfo = rec->data;

    if ( !ctx->x86_pv.restore.seen_pv_info )
    {
        ERROR("Not yet received X86_PV_INFO record");
        return -1;
    }

    if ( rec->length != PAGE_SIZE )
    {
        ERROR("X86_PV_SHARED_INFO record wrong size: length %u"
              ", expected 4096", rec->length);
        goto err;
    }

    guest_shinfo = xc_map_foreign_range(
        xch, ctx->domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
        ctx->dominfo.shared_info_frame);
    if ( !guest_shinfo )
    {
        PERROR("Failed to map Shared Info at mfn %#lx",
               ctx->dominfo.shared_info_frame);
        goto err;
    }

    MEMCPY_FIELD(guest_shinfo, old_shinfo, vcpu_info, ctx->x86_pv.width);
    MEMCPY_FIELD(guest_shinfo, old_shinfo, arch, ctx->x86_pv.width);

    SET_FIELD(guest_shinfo, arch.pfn_to_mfn_frame_list_list,
              0, ctx->x86_pv.width);
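
    /*
     * Reset all event channel state: nothing pending and everything masked.
     * The guest is expected to re-bind and unmask whatever it needs once it
     * resumes.
     */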
    MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_pending, 0, ctx->x86_pv.width);
    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
        SET_FIELD(guest_shinfo, vcpu_info[i].evtchn_pending_sel,
                  0, ctx->x86_pv.width);

    MEMSET_ARRAY_FIELD(guest_shinfo, evtchn_mask, 0xff, ctx->x86_pv.width);

    rc = 0;
 err:

    if ( guest_shinfo )
        munmap(guest_shinfo, PAGE_SIZE);

    return rc;
}

/* restore_ops function. */
static bool x86_pv_pfn_is_valid(const struct xc_sr_context *ctx, xen_pfn_t pfn)
{
    return pfn <= ctx->x86_pv.max_pfn;
}

/* restore_ops function. */
static void x86_pv_set_page_type(struct xc_sr_context *ctx, xen_pfn_t pfn,
                                 unsigned long type)
{
    assert(pfn <= ctx->x86_pv.max_pfn);

    ctx->x86_pv.restore.pfn_types[pfn] = type;
}

/* restore_ops function. */
static void x86_pv_set_gfn(struct xc_sr_context *ctx, xen_pfn_t pfn,
                           xen_pfn_t mfn)
{
    assert(pfn <= ctx->x86_pv.max_pfn);

    if ( ctx->x86_pv.width == sizeof(uint64_t) )
        /* 64 bit guest.  Need to expand INVALID_MFN for 32 bit toolstacks. */
        ((uint64_t *)ctx->x86_pv.p2m)[pfn] = mfn == INVALID_MFN ? ~0ULL : mfn;
    else
        /* 32 bit guest.  Can truncate INVALID_MFN for 64 bit toolstacks. */
        ((uint32_t *)ctx->x86_pv.p2m)[pfn] = mfn;
}

/*
 * restore_ops function.  Convert pfns back to mfns in pagetables.  Possibly
 * needs to populate new frames if a PTE refers to a frame which hasn't yet
 * been seen in a PAGE_DATA record.
 */
static int x86_pv_localise_page(struct xc_sr_context *ctx,
                                uint32_t type, void *page)
{
    xc_interface *xch = ctx->xch;
    uint64_t *table = page;
    uint64_t pte;
    unsigned i, to_populate;
    xen_pfn_t pfns[(PAGE_SIZE / sizeof(uint64_t))];
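
    /*
     * Two passes over the 512 PTEs: the first collects any referenced
     * frames not yet in the p2m and populates them in a single batch; the
     * second rewrites each present PTE with its mfn.
     */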

    type &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;

    /* Only page tables need localisation. */
    if ( type < XEN_DOMCTL_PFINFO_L1TAB || type > XEN_DOMCTL_PFINFO_L4TAB )
        return 0;

    /* Check to see whether we need to populate any new frames. */
    for ( i = 0, to_populate = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
    {
        pte = table[i];

        if ( pte & _PAGE_PRESENT )
        {
            xen_pfn_t pfn = pte_to_frame(pte);

#ifdef __i386__
            if ( pfn == INVALID_MFN )
            {
                ERROR("PTE truncation detected.  L%u[%u] = %016"PRIx64,
                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                errno = E2BIG;
                return -1;
            }
#endif

            if ( pfn_to_mfn(ctx, pfn) == INVALID_MFN )
                pfns[to_populate++] = pfn;
        }
    }

    if ( to_populate && populate_pfns(ctx, to_populate, pfns, NULL) )
        return -1;

    for ( i = 0; i < (PAGE_SIZE / sizeof(uint64_t)); ++i )
    {
        pte = table[i];

        if ( pte & _PAGE_PRESENT )
        {
            xen_pfn_t mfn, pfn;

            pfn = pte_to_frame(pte);
            mfn = pfn_to_mfn(ctx, pfn);

            if ( !mfn_in_pseudophysmap(ctx, mfn) )
            {
                ERROR("Bad mfn for L%u[%u] - pte %"PRIx64,
                      type >> XEN_DOMCTL_PFINFO_LTAB_SHIFT, i, pte);
                dump_bad_pseudophysmap_entry(ctx, mfn);
                errno = ERANGE;
                return -1;
            }

            table[i] = merge_pte(pte, mfn);
        }
    }

    return 0;
}

/*
 * restore_ops function.  Confirm that the incoming stream matches the type of
 * domain we are attempting to restore into.
 */
static int x86_pv_setup(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    int rc;

    if ( ctx->restore.guest_type != DHDR_TYPE_X86_PV )
    {
        ERROR("Unable to restore %s domain into an x86_pv domain",
              dhdr_type_to_str(ctx->restore.guest_type));
        return -1;
    }
    else if ( ctx->restore.guest_page_size != PAGE_SIZE )
    {
        ERROR("Invalid page size %d for x86_pv domains",
              ctx->restore.guest_page_size);
        return -1;
    }

    rc = x86_pv_domain_info(ctx);
    if ( rc )
        return rc;

    ctx->x86_pv.restore.nr_vcpus = ctx->dominfo.max_vcpu_id + 1;
    ctx->x86_pv.restore.vcpus = calloc(sizeof(struct xc_sr_x86_pv_restore_vcpu),
                                       ctx->x86_pv.restore.nr_vcpus);
    if ( !ctx->x86_pv.restore.vcpus )
    {
        errno = ENOMEM;
        return -1;
    }

    rc = x86_pv_map_m2p(ctx);
    if ( rc )
        return rc;

    return rc;
}

/*
 * restore_ops function.
 */
static int x86_pv_process_record(struct xc_sr_context *ctx,
                                 struct xc_sr_record *rec)
{
    switch ( rec->type )
    {
    case REC_TYPE_X86_PV_INFO:
        return handle_x86_pv_info(ctx, rec);

    case REC_TYPE_X86_PV_P2M_FRAMES:
        return handle_x86_pv_p2m_frames(ctx, rec);

    case REC_TYPE_X86_PV_VCPU_BASIC:
    case REC_TYPE_X86_PV_VCPU_EXTENDED:
    case REC_TYPE_X86_PV_VCPU_XSAVE:
    case REC_TYPE_X86_PV_VCPU_MSRS:
        return handle_x86_pv_vcpu_blob(ctx, rec);

    case REC_TYPE_SHARED_INFO:
        return handle_shared_info(ctx, rec);

    case REC_TYPE_TSC_INFO:
        return handle_tsc_info(ctx, rec);

    default:
        return RECORD_NOT_PROCESSED;
    }
}

/*
 * restore_ops function.  Update the vcpu context in Xen, pin the pagetables,
 * rewrite the p2m and seed the grant table.
 */
static int x86_pv_stream_complete(struct xc_sr_context *ctx)
{
    xc_interface *xch = ctx->xch;
    int rc;

    rc = update_vcpu_context(ctx);
    if ( rc )
        return rc;

    rc = pin_pagetables(ctx);
    if ( rc )
        return rc;

    rc = update_guest_p2m(ctx);
    if ( rc )
        return rc;

    rc = xc_dom_gnttab_seed(xch, ctx->domid,
                            ctx->restore.console_gfn,
                            ctx->restore.xenstore_gfn,
                            ctx->restore.console_domid,
                            ctx->restore.xenstore_domid);
    if ( rc )
    {
        PERROR("Failed to seed grant table");
        return rc;
    }

    return rc;
}

/*
 * restore_ops function.
 */
static int x86_pv_cleanup(struct xc_sr_context *ctx)
{
    free(ctx->x86_pv.p2m);
    free(ctx->x86_pv.p2m_pfns);

    if ( ctx->x86_pv.restore.vcpus )
    {
        unsigned i;

        for ( i = 0; i < ctx->x86_pv.restore.nr_vcpus; ++i )
        {
            struct xc_sr_x86_pv_restore_vcpu *vcpu =
                &ctx->x86_pv.restore.vcpus[i];

            free(vcpu->basic);
            free(vcpu->extd);
            free(vcpu->xsave);
            free(vcpu->msr);
        }

        free(ctx->x86_pv.restore.vcpus);
    }

    free(ctx->x86_pv.restore.pfn_types);

    if ( ctx->x86_pv.m2p )
        munmap(ctx->x86_pv.m2p, ctx->x86_pv.nr_m2p_frames * PAGE_SIZE);

    return 0;
}

struct xc_sr_restore_ops restore_ops_x86_pv =
{
    .pfn_is_valid    = x86_pv_pfn_is_valid,
    .pfn_to_gfn      = pfn_to_mfn,
    .set_page_type   = x86_pv_set_page_type,
    .set_gfn         = x86_pv_set_gfn,
    .localise_page   = x86_pv_localise_page,
    .setup           = x86_pv_setup,
    .process_record  = x86_pv_process_record,
    .stream_complete = x86_pv_stream_complete,
    .cleanup         = x86_pv_cleanup,
};

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */