/******************************************************************************
 *
 * Domain paging.
 * Copyright (c) 2009 by Citrix Systems, Inc. (Patrick Colp)
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#define _GNU_SOURCE

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <time.h>
#include <signal.h>
#include <unistd.h>
#include <poll.h>
#include <xenstore.h>
#include <getopt.h>

#include "xc_bitops.h"
#include "file_ops.h"
#include "policy.h"
#include "xenpaging.h"

/* Xenstore node with the target amount of memory the guest should use, in KiB */
#define WATCH_TARGETPAGES "memory/target-tot_pages"
static char *watch_target_tot_pages;
static char *dom_path;
static char watch_token[16];
static char *filename;
static int interrupted;

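/* Delete the pagefile from disk and clear its name so it is unlinked only once */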
static void unlink_pagefile(void)
{
    if ( filename && filename[0] )
    {
        unlink(filename);
        filename[0] = '\0';
    }
}

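/* Signal handler: remember which signal arrived and remove the pagefile */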
static void close_handler(int sig)
{
    interrupted = sig;
    unlink_pagefile();
}

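/*
 * Ask the device model (qemu) of this domain to flush its mapcache.
 * Foreign mappings held by qemu keep pages from being nominated for
 * eviction, so the pager requests a flush when it runs out of victims.
 */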
static void xenpaging_mem_paging_flush_ioemu_cache(struct xenpaging *paging)
{
    struct xs_handle *xsh = paging->xs_handle;
    domid_t domain_id = paging->vm_event.domain_id;
    char path[80];

    sprintf(path, "/local/domain/0/device-model/%u/command", domain_id);

    xs_write(xsh, XBT_NULL, path, "flush-cache", strlen("flush-cache"));
}

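/*
 * Poll the vm_event channel fd and the xenstore watch fd; block for up
 * to 100ms once the page-out target has been reached, return immediately
 * otherwise.  Handles the @releaseDomain watch (guest gone: set the
 * interrupted flag) and changes to the target-tot_pages node.
 * Returns < 0 on error.
 */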
static int xenpaging_wait_for_event_or_timeout(struct xenpaging *paging)
{
    xc_interface *xch = paging->xc_handle;
    xenevtchn_handle *xce = paging->vm_event.xce_handle;
    char **vec, *val;
    unsigned int num;
    struct pollfd fd[2];
    int port;
    int rc;
    int timeout;

    /* Wait for event channel and xenstore */
    fd[0].fd = xenevtchn_fd(xce);
    fd[0].events = POLLIN | POLLERR;
    fd[1].fd = xs_fileno(paging->xs_handle);
    fd[1].events = POLLIN | POLLERR;

    /* No timeout while page-out is still in progress */
    timeout = paging->use_poll_timeout ? 100 : 0;
    rc = poll(fd, 2, timeout);
    if ( rc < 0 )
    {
        if ( errno == EINTR )
            return 0;

        PERROR("Poll exited with an error");
        return -1;
    }

    /* First check for guest shutdown */
    if ( rc && fd[1].revents & POLLIN )
    {
        DPRINTF("Got event from xenstore\n");
        vec = xs_read_watch(paging->xs_handle, &num);
        if ( vec )
        {
            DPRINTF("path '%s' token '%s'\n", vec[XS_WATCH_PATH], vec[XS_WATCH_TOKEN]);
            if ( strcmp(vec[XS_WATCH_TOKEN], watch_token) == 0 )
            {
                /* If our guest disappeared, set interrupt flag and fall through */
                if ( xs_is_domain_introduced(paging->xs_handle, paging->vm_event.domain_id) == false )
                {
                    xs_unwatch(paging->xs_handle, "@releaseDomain", watch_token);
                    interrupted = SIGQUIT;
                    /* No further poll result processing */
                    rc = 0;
                }
            }
            else if ( strcmp(vec[XS_WATCH_PATH], watch_target_tot_pages) == 0 )
            {
                int ret, target_tot_pages;
                val = xs_read(paging->xs_handle, XBT_NULL, vec[XS_WATCH_PATH], NULL);
                if ( val )
                {
                    ret = sscanf(val, "%d", &target_tot_pages);
                    if ( ret > 0 )
                    {
                        /* KiB to pages */
                        target_tot_pages >>= 2;
                        if ( target_tot_pages < 0 || target_tot_pages > paging->max_pages )
                            target_tot_pages = paging->max_pages;
                        paging->target_tot_pages = target_tot_pages;
                        /* Disable poll() delay while new target is not yet reached */
                        paging->use_poll_timeout = 0;
                        DPRINTF("new target_tot_pages %d\n", target_tot_pages);
                    }
                    free(val);
                }
            }
            free(vec);
        }
    }

    if ( rc && fd[0].revents & POLLIN )
    {
        DPRINTF("Got event from evtchn\n");
        port = xenevtchn_pending(xce);
        if ( port == -1 )
        {
            PERROR("Failed to read port from event channel");
            rc = -1;
            goto err;
        }

        rc = xenevtchn_unmask(xce, port);
        if ( rc < 0 )
        {
            PERROR("Failed to unmask event channel port");
        }
    }
err:
    return rc;
}

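/* Return the current number of pages owned by the guest, or -1 on error */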
static int xenpaging_get_tot_pages(struct xenpaging *paging)
{
    xc_interface *xch = paging->xc_handle;
    xc_domaininfo_t domain_info;
    int rc;

    rc = xc_domain_getinfolist(xch, paging->vm_event.domain_id, 1, &domain_info);
    if ( rc != 1 )
    {
        PERROR("Error getting domain info");
        return -1;
    }
    return domain_info.tot_pages;
}

static void *init_page(void)
{
    void *buffer;

    /* Allocate page-sized, page-aligned memory */
    errno = posix_memalign(&buffer, PAGE_SIZE, PAGE_SIZE);
    if ( errno != 0 )
        return NULL;

    /* Lock buffer in memory so it can't be paged out */
    if ( mlock(buffer, PAGE_SIZE) < 0 )
    {
        free(buffer);
        buffer = NULL;
    }

    return buffer;
}

static void usage(void)
{
    printf("usage:\n\n");

    printf("  xenpaging [options] -f <pagefile> -d <domain_id>\n\n");

    printf("options:\n");
    printf(" -d <domid>     --domain=<domid>         numerical domain_id of guest. This option is required.\n");
    printf(" -f <file>      --pagefile=<file>        pagefile to use. This option is required.\n");
    printf(" -m <max_memkb> --max_memkb=<max_memkb>  maximum amount of memory to handle.\n");
    printf(" -r <num>       --mru_size=<num>         number of paged-in pages to keep in memory.\n");
    printf(" -v             --verbose                enable debug output.\n");
    printf(" -h             --help                   this output.\n");
}

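/*
 * Parse command-line options into the xenpaging state.
 * Returns 0 on success, non-zero if usage was printed or a required
 * option (pagefile, domain_id) is missing.
 */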
static int xenpaging_getopts(struct xenpaging *paging, int argc, char *argv[])
{
    int ch;
    static const char sopts[] = "hvd:f:m:r:";
    static const struct option lopts[] = {
        {"help", 0, NULL, 'h'},
        {"verbose", 0, NULL, 'v'},
        {"domain", 1, NULL, 'd'},
        {"pagefile", 1, NULL, 'f'},
        {"max_memkb", 1, NULL, 'm'},
        {"mru_size", 1, NULL, 'r'},
        { }
    };

    while ((ch = getopt_long(argc, argv, sopts, lopts, NULL)) != -1)
    {
        switch(ch) {
        case 'd':
            paging->vm_event.domain_id = atoi(optarg);
            break;
        case 'f':
            free(filename);
            filename = strdup(optarg);
            break;
        case 'm':
            /* KiB to pages */
            paging->max_pages = atoi(optarg) >> 2;
            break;
        case 'r':
            paging->policy_mru_size = atoi(optarg);
            break;
        case 'v':
            paging->debug = 1;
            break;
        case 'h':
        case '?':
            usage();
            return 1;
        }
    }

    argv += optind; argc -= optind;

    /* Path to pagefile is required */
    if ( !filename )
    {
        printf("Filename for pagefile missing!\n");
        usage();
        return 1;
    }

    /* A domain_id is required */
    if ( !paging->vm_event.domain_id )
    {
        printf("Numerical <domain_id> missing!\n");
        return 1;
    }

    return 0;
}

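/*
 * Set up the pager: parse options, connect to libxc and xenstore,
 * register watches, map the paging ring page, enable mem_paging for
 * the domain, bind the event channel, and allocate the slot/gfn
 * tracking structures and the pagefile.
 * Returns a fully initialised state, or NULL on failure.
 */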
static struct xenpaging *xenpaging_init(int argc, char *argv[])
{
    struct xenpaging *paging;
    xc_domaininfo_t domain_info;
    xc_interface *xch = NULL;
    xentoollog_logger *dbg = NULL;
    char *p;
    int rc;
    unsigned long ring_pfn, mmap_pfn;

    /* Allocate memory */
    paging = calloc(1, sizeof(struct xenpaging));
    if ( !paging )
        goto err;

    /* Get cmdline options and domain_id */
    if ( xenpaging_getopts(paging, argc, argv) )
        goto err;

    /* Enable debug output */
    if ( paging->debug )
        dbg = (xentoollog_logger *)xtl_createlogger_stdiostream(stderr, XTL_DEBUG, 0);

    /* Open connection to xen */
    paging->xc_handle = xch = xc_interface_open(dbg, NULL, 0);
    if ( !xch )
        goto err;

    DPRINTF("xenpaging init\n");

    /* Open connection to xenstore */
    paging->xs_handle = xs_open(0);
    if ( paging->xs_handle == NULL )
    {
        PERROR("Error initialising xenstore connection");
        goto err;
    }

    /* write domain ID to watch so we can ignore other domain shutdowns */
    snprintf(watch_token, sizeof(watch_token), "%u", paging->vm_event.domain_id);
    if ( xs_watch(paging->xs_handle, "@releaseDomain", watch_token) == false )
    {
        PERROR("Could not bind to shutdown watch\n");
        goto err;
    }

    /* Watch xenpaging's working target */
    dom_path = xs_get_domain_path(paging->xs_handle, paging->vm_event.domain_id);
    if ( !dom_path )
    {
        PERROR("Could not find domain path\n");
        goto err;
    }
    if ( asprintf(&watch_target_tot_pages, "%s/%s", dom_path, WATCH_TARGETPAGES) < 0 )
    {
        PERROR("Could not alloc watch path\n");
        goto err;
    }
    DPRINTF("watching '%s'\n", watch_target_tot_pages);
    if ( xs_watch(paging->xs_handle, watch_target_tot_pages, "") == false )
    {
        PERROR("Could not bind to xenpaging watch\n");
        goto err;
    }

    /* Map the ring page */
    xc_get_hvm_param(xch, paging->vm_event.domain_id,
                        HVM_PARAM_PAGING_RING_PFN, &ring_pfn);
    mmap_pfn = ring_pfn;
    paging->vm_event.ring_page =
        xc_map_foreign_pages(xch, paging->vm_event.domain_id,
                             PROT_READ | PROT_WRITE, &mmap_pfn, 1);
    if ( !paging->vm_event.ring_page )
    {
        /* Map failed, populate ring page */
        rc = xc_domain_populate_physmap_exact(paging->xc_handle,
                                              paging->vm_event.domain_id,
                                              1, 0, 0, &ring_pfn);
        if ( rc != 0 )
        {
            PERROR("Failed to populate ring gfn\n");
            goto err;
        }

        paging->vm_event.ring_page =
            xc_map_foreign_pages(xch, paging->vm_event.domain_id,
                                 PROT_READ | PROT_WRITE,
                                 &mmap_pfn, 1);
        if ( !paging->vm_event.ring_page )
        {
            PERROR("Could not map the ring page\n");
            goto err;
        }
    }

    /* Initialise Xen */
    rc = xc_mem_paging_enable(xch, paging->vm_event.domain_id,
                             &paging->vm_event.evtchn_port);
    if ( rc != 0 )
    {
        switch ( errno ) {
            case EBUSY:
                ERROR("xenpaging is (or was) active on this domain");
                break;
            case ENODEV:
                ERROR("xenpaging requires Hardware Assisted Paging");
                break;
            case EMLINK:
                ERROR("xenpaging not supported while iommu passthrough is enabled");
                break;
            case EXDEV:
                ERROR("xenpaging not supported in a PoD guest");
                break;
            default:
                PERROR("Error initialising shared page");
                break;
        }
        goto err;
    }

    /* Open event channel */
    paging->vm_event.xce_handle = xenevtchn_open(NULL, 0);
    if ( paging->vm_event.xce_handle == NULL )
    {
        PERROR("Failed to open event channel");
        goto err;
    }

    /* Bind event notification */
    rc = xenevtchn_bind_interdomain(paging->vm_event.xce_handle,
                                    paging->vm_event.domain_id,
                                    paging->vm_event.evtchn_port);
    if ( rc < 0 )
    {
        PERROR("Failed to bind event channel");
        goto err;
    }

    paging->vm_event.port = rc;

    /* Initialise ring */
    SHARED_RING_INIT((vm_event_sring_t *)paging->vm_event.ring_page);
    BACK_RING_INIT(&paging->vm_event.back_ring,
                   (vm_event_sring_t *)paging->vm_event.ring_page,
                   PAGE_SIZE);

    /* Now that the ring is set, remove it from the guest's physmap */
    if ( xc_domain_decrease_reservation_exact(xch,
                    paging->vm_event.domain_id, 1, 0, &ring_pfn) )
        PERROR("Failed to remove ring from guest physmap");

    /* Get max_pages from guest if not provided via cmdline */
    if ( !paging->max_pages )
    {
        rc = xc_domain_getinfolist(xch, paging->vm_event.domain_id, 1,
                                   &domain_info);
        if ( rc != 1 )
        {
            PERROR("Error getting domain info");
            goto err;
        }

        /* Record number of max_pages */
        paging->max_pages = domain_info.max_pages;
    }

    /* Allocate bitmap for tracking pages that have been paged out */
    paging->bitmap = bitmap_alloc(paging->max_pages);
    if ( !paging->bitmap )
    {
        PERROR("Error allocating bitmap");
        goto err;
    }
    DPRINTF("max_pages = %d\n", paging->max_pages);

    /* Allocate indices for pagefile slots */
    paging->slot_to_gfn = calloc(paging->max_pages, sizeof(*paging->slot_to_gfn));
    paging->gfn_to_slot = calloc(paging->max_pages, sizeof(*paging->gfn_to_slot));
    if ( !paging->slot_to_gfn || !paging->gfn_to_slot )
        goto err;

    /* Allocate stack for known free slots in pagefile */
    paging->free_slot_stack = calloc(paging->max_pages, sizeof(*paging->free_slot_stack));
    if ( !paging->free_slot_stack )
        goto err;

    /* Initialise policy */
    rc = policy_init(paging);
    if ( rc != 0 )
    {
        PERROR("Error initialising policy");
        goto err;
    }

    paging->paging_buffer = init_page();
    if ( !paging->paging_buffer )
    {
        PERROR("Creating page aligned load buffer");
        goto err;
    }

    /* Open file */
    paging->fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR);
    if ( paging->fd < 0 )
    {
        PERROR("failed to open file");
        goto err;
    }

    return paging;

 err:
    if ( paging )
    {
        if ( paging->xs_handle )
            xs_close(paging->xs_handle);
        if ( xch )
            xc_interface_close(xch);
        if ( paging->paging_buffer )
        {
            munlock(paging->paging_buffer, PAGE_SIZE);
            free(paging->paging_buffer);
        }

        if ( paging->vm_event.ring_page )
        {
            munmap(paging->vm_event.ring_page, PAGE_SIZE);
        }

        free(dom_path);
        free(watch_target_tot_pages);
        free(paging->free_slot_stack);
        free(paging->slot_to_gfn);
        free(paging->gfn_to_slot);
        free(paging->bitmap);
        free(paging);
    }

    return NULL;
}

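/*
 * Undo xenpaging_init: remove the xenstore watches, unmap the ring,
 * disable mem_paging for the domain, unbind and close the event
 * channel, and close the xenstore and libxc connections.
 */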
static void xenpaging_teardown(struct xenpaging *paging)
{
    int rc;
    xc_interface *xch = paging->xc_handle;

    xs_unwatch(paging->xs_handle, watch_target_tot_pages, "");
    xs_unwatch(paging->xs_handle, "@releaseDomain", watch_token);

    paging->xc_handle = NULL;
    /* Tear down domain paging in Xen */
    munmap(paging->vm_event.ring_page, PAGE_SIZE);
    rc = xc_mem_paging_disable(xch, paging->vm_event.domain_id);
    if ( rc != 0 )
    {
        PERROR("Error tearing down domain paging in xen");
    }

    /* Unbind the event channel port */
    rc = xenevtchn_unbind(paging->vm_event.xce_handle, paging->vm_event.port);
    if ( rc != 0 )
    {
        PERROR("Error unbinding event port");
    }
    paging->vm_event.port = -1;

    /* Close event channel */
    rc = xenevtchn_close(paging->vm_event.xce_handle);
    if ( rc != 0 )
    {
        PERROR("Error closing event channel");
    }
    paging->vm_event.xce_handle = NULL;

    /* Close connection to xenstore */
    xs_close(paging->xs_handle);

    /* Close connection to Xen */
    xc_interface_close(xch);
}

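/* Pull the next request off the vm_event back ring and advance the consumer index */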
static void get_request(struct vm_event *vm_event, vm_event_request_t *req)
{
    vm_event_back_ring_t *back_ring;
    RING_IDX req_cons;

    back_ring = &vm_event->back_ring;
    req_cons = back_ring->req_cons;

    /* Copy request */
    memcpy(req, RING_GET_REQUEST(back_ring, req_cons), sizeof(*req));
    req_cons++;

    /* Update ring */
    back_ring->req_cons = req_cons;
    back_ring->sring->req_event = req_cons + 1;
}

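/* Copy a response onto the vm_event back ring and make it visible to Xen */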
static void put_response(struct vm_event *vm_event, vm_event_response_t *rsp)
{
    vm_event_back_ring_t *back_ring;
    RING_IDX rsp_prod;

    back_ring = &vm_event->back_ring;
    rsp_prod = back_ring->rsp_prod_pvt;

    /* Copy response */
    memcpy(RING_GET_RESPONSE(back_ring, rsp_prod), rsp, sizeof(*rsp));
    rsp_prod++;

    /* Update ring */
    back_ring->rsp_prod_pvt = rsp_prod;
    RING_PUSH_RESPONSES(back_ring);
}

/* Evict a given gfn
 * Returns < 0 on fatal error
 * Returns 0 on successful evict
 * Returns > 0 if gfn can not be evicted
 */
static int xenpaging_evict_page(struct xenpaging *paging, unsigned long gfn, int slot)
{
    xc_interface *xch = paging->xc_handle;
    void *page;
    xen_pfn_t victim = gfn;
    int ret;

    DECLARE_DOMCTL;

    /* Nominate page */
    ret = xc_mem_paging_nominate(xch, paging->vm_event.domain_id, gfn);
    if ( ret < 0 )
    {
        /* unpageable gfn is indicated by EBUSY */
        if ( errno == EBUSY )
            ret = 1;
        else
            PERROR("Error nominating page %lx", gfn);
        goto out;
    }

    /* Map page */
    page = xc_map_foreign_pages(xch, paging->vm_event.domain_id, PROT_READ, &victim, 1);
    if ( page == NULL )
    {
        PERROR("Error mapping page %lx", gfn);
        ret = -1;
        goto out;
    }

    /* Copy page */
    ret = write_page(paging->fd, page, slot);
    if ( ret < 0 )
    {
        PERROR("Error copying page %lx", gfn);
        munmap(page, PAGE_SIZE);
        ret = -1;
        goto out;
    }

    /* Release page */
    munmap(page, PAGE_SIZE);

    /* Tell Xen to evict page */
    ret = xc_mem_paging_evict(xch, paging->vm_event.domain_id, gfn);
    if ( ret < 0 )
    {
        /* A gfn in use is indicated by EBUSY */
        if ( errno == EBUSY )
        {
            ret = 1;
            DPRINTF("Nominated page %lx busy", gfn);
        }
        else
            PERROR("Error evicting page %lx", gfn);
        goto out;
    }

    DPRINTF("evict_page > gfn %lx pageslot %d\n", gfn, slot);
    /* Notify policy of page being paged out */
    policy_notify_paged_out(gfn);

    /* Update index */
    paging->slot_to_gfn[slot] = gfn;
    paging->gfn_to_slot[gfn] = slot;

    /* Record number of evicted pages */
    paging->num_paged_out++;

    ret = 0;

 out:
    return ret;
}

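/*
 * Queue a page-in response on the ring and notify Xen via the event
 * channel.  If notify_policy is set, also update the policy's MRU state
 * and the count of paged-out pages.
 */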
static int xenpaging_resume_page(struct xenpaging *paging, vm_event_response_t *rsp, int notify_policy)
{
    /* Put the page info on the ring */
    put_response(&paging->vm_event, rsp);

    /* Notify policy of page being paged in */
    if ( notify_policy )
    {
        /*
         * Do not add gfn to mru list if the target is lower than mru size.
         * This allows page-out of these gfns if the target grows again.
         */
        if ( paging->num_paged_out > paging->policy_mru_size )
            policy_notify_paged_in(rsp->u.mem_paging.gfn);
        else
            policy_notify_paged_in_nomru(rsp->u.mem_paging.gfn);

        /* Record number of resumed pages */
        paging->num_paged_out--;
    }

    /* Tell Xen page is ready */
    return xenevtchn_notify(paging->vm_event.xce_handle, paging->vm_event.port);
}

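/*
 * Read the page stored in pagefile slot i and hand it back to Xen for
 * gfn, retrying once per second while the hypervisor reports ENOMEM.
 */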
static int xenpaging_populate_page(struct xenpaging *paging, unsigned long gfn, int i)
{
    xc_interface *xch = paging->xc_handle;
    int ret;
    unsigned char oom = 0;

    DPRINTF("populate_page < gfn %lx pageslot %d\n", gfn, i);

    /* Read page */
    ret = read_page(paging->fd, paging->paging_buffer, i);
    if ( ret != 0 )
    {
        PERROR("Error reading page");
        goto out;
    }

    do
    {
        /* Tell Xen to allocate a page for the domain */
        ret = xc_mem_paging_load(xch, paging->vm_event.domain_id, gfn, paging->paging_buffer);
        if ( ret < 0 )
        {
            if ( errno == ENOMEM )
            {
                if ( oom++ == 0 )
                    DPRINTF("ENOMEM while preparing gfn %lx\n", gfn);
                sleep(1);
                continue;
            }
            PERROR("Error loading %lx during page-in", gfn);
            ret = -1;
            break;
        }
    }
    while ( ret && !interrupted );

 out:
    return ret;
}

/* Trigger a page-in for a batch of pages */
static void resume_pages(struct xenpaging *paging, int num_pages)
{
    xc_interface *xch = paging->xc_handle;
    int i, num = 0;

    for ( i = 0; i < paging->max_pages && num < num_pages; i++ )
    {
        if ( test_bit(i, paging->bitmap) )
        {
            paging->pagein_queue[num] = i;
            num++;
            if ( num == XENPAGING_PAGEIN_QUEUE_SIZE )
                break;
        }
    }
    /* num may be less than num_pages, caller has to try again */
    if ( num )
        page_in_trigger();
}

/* Evict one gfn and write it to the given slot
 * Returns < 0 on fatal error
 * Returns 0 on successful evict
 * Returns > 0 if no gfn can be evicted
 */
static int evict_victim(struct xenpaging *paging, int slot)
{
    xc_interface *xch = paging->xc_handle;
    unsigned long gfn;
    static int num_paged_out;
    int ret;

    do
    {
        gfn = policy_choose_victim(paging);
        if ( gfn == INVALID_MFN )
        {
            /* If the number did not change after last flush command then
             * the command did not reach qemu yet, or qemu still processes
             * the command, or qemu has nothing to release.
             * Right now there is no need to issue the command again.
             */
            if ( num_paged_out != paging->num_paged_out )
            {
                DPRINTF("Flushing qemu cache\n");
                xenpaging_mem_paging_flush_ioemu_cache(paging);
                num_paged_out = paging->num_paged_out;
            }
            ret = ENOSPC;
            goto out;
        }

        if ( interrupted )
        {
            ret = EINTR;
            goto out;
        }

        ret = xenpaging_evict_page(paging, gfn, slot);
        if ( ret < 0 )
            goto out;
    }
    while ( ret );

    if ( test_and_set_bit(gfn, paging->bitmap) )
        ERROR("Page %lx has been evicted before", gfn);

    ret = 0;

 out:
    return ret;
}

/* Evict a batch of pages and write them to a free slot in the paging file
 * Returns < 0 on fatal error
 * Returns 0 if no gfn can be evicted
 * Returns > 0 on successful evict
 */
static int evict_pages(struct xenpaging *paging, int num_pages)
{
    xc_interface *xch = paging->xc_handle;
    int rc, slot, num = 0;

    /* Reuse known free slots */
    while ( paging->stack_count > 0 && num < num_pages )
    {
        slot = paging->free_slot_stack[--paging->stack_count];
        rc = evict_victim(paging, slot);
        if ( rc )
        {
            num = rc < 0 ? -1 : num;
            return num;
        }
        num++;
    }

    /* Scan all slots for remainders */
    for ( slot = 0; slot < paging->max_pages && num < num_pages; slot++ )
    {
        /* Skip slots that are already in use */
        if ( paging->slot_to_gfn[slot] )
            continue;

        rc = evict_victim(paging, slot);
        if ( rc )
        {
            num = rc < 0 ? -1 : num;
            break;
        }

        num++;
    }
    return num;
}

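/*
 * Main loop: initialise the pager, install signal handlers, then
 * alternately service page-in requests from the vm_event ring and
 * evict or resume pages until tot_pages matches target_tot_pages.
 * On SIGTERM/SIGINT all paged-out pages are written back before exit.
 */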
int main(int argc, char *argv[])
{
    struct sigaction act;
    struct xenpaging *paging;
    vm_event_request_t req;
    vm_event_response_t rsp;
    int num, prev_num = 0;
    int slot;
    int tot_pages;
    int rc;
    xc_interface *xch;

    /* Initialise domain paging */
    paging = xenpaging_init(argc, argv);
    if ( paging == NULL )
    {
        fprintf(stderr, "Error initialising paging\n");
        return 1;
    }
    xch = paging->xc_handle;

    DPRINTF("starting %s for domain_id %u with pagefile %s\n",
            argv[0], paging->vm_event.domain_id, filename);

    /* ensure that if we get a signal, we'll do cleanup, then exit */
    act.sa_handler = close_handler;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);
    sigaction(SIGHUP,  &act, NULL);
    sigaction(SIGTERM, &act, NULL);
    sigaction(SIGINT,  &act, NULL);
    sigaction(SIGALRM, &act, NULL);

    /* listen for page-in events to stop pager */
    create_page_in_thread(paging);

    /* Swap pages in and out */
    while ( 1 )
    {
        /* Wait for Xen to signal that a page needs to be paged in */
        rc = xenpaging_wait_for_event_or_timeout(paging);
        if ( rc < 0 )
        {
            ERROR("Error getting event");
            goto out;
        }
        else if ( rc != 0 )
        {
            DPRINTF("Got event from Xen\n");
        }

        while ( RING_HAS_UNCONSUMED_REQUESTS(&paging->vm_event.back_ring) )
        {
            /* Indicate possible error */
            rc = 1;

            get_request(&paging->vm_event, &req);

            if ( req.u.mem_paging.gfn > paging->max_pages )
            {
                ERROR("Requested gfn %"PRIx64" higher than max_pages %x\n",
                      req.u.mem_paging.gfn, paging->max_pages);
                goto out;
            }

            /* Check if the page has already been paged in */
            if ( test_and_clear_bit(req.u.mem_paging.gfn, paging->bitmap) )
            {
                /* Find where in the paging file to read from */
                slot = paging->gfn_to_slot[req.u.mem_paging.gfn];

                /* Sanity check */
                if ( paging->slot_to_gfn[slot] != req.u.mem_paging.gfn )
                {
                    ERROR("Expected gfn %"PRIx64" in slot %d, but found gfn %lx\n",
                          req.u.mem_paging.gfn, slot, paging->slot_to_gfn[slot]);
                    goto out;
                }

                if ( req.u.mem_paging.flags & MEM_PAGING_DROP_PAGE )
                {
                    DPRINTF("drop_page ^ gfn %"PRIx64" pageslot %d\n",
                            req.u.mem_paging.gfn, slot);
                    /* Notify policy of page being dropped */
                    policy_notify_dropped(req.u.mem_paging.gfn);
                }
                else
                {
                    /* Populate the page */
                    if ( xenpaging_populate_page(paging, req.u.mem_paging.gfn, slot) < 0 )
                    {
                        ERROR("Error populating page %"PRIx64"", req.u.mem_paging.gfn);
                        goto out;
                    }
                }

                /* Prepare the response */
                rsp.u.mem_paging.gfn = req.u.mem_paging.gfn;
                rsp.vcpu_id = req.vcpu_id;
                rsp.flags = req.flags;

                if ( xenpaging_resume_page(paging, &rsp, 1) < 0 )
                {
                    PERROR("Error resuming page %"PRIx64"", req.u.mem_paging.gfn);
                    goto out;
                }

                /* Clear this pagefile slot */
                paging->slot_to_gfn[slot] = 0;

                /* Record this free slot */
                paging->free_slot_stack[paging->stack_count++] = slot;
            }
            else
            {
                DPRINTF("page %s populated (domain = %d; vcpu = %d;"
                        " gfn = %"PRIx64"; paused = %d; evict_fail = %d)\n",
                        req.u.mem_paging.flags & MEM_PAGING_EVICT_FAIL ? "not" : "already",
                        paging->vm_event.domain_id, req.vcpu_id, req.u.mem_paging.gfn,
                        !!(req.flags & VM_EVENT_FLAG_VCPU_PAUSED),
                        !!(req.u.mem_paging.flags & MEM_PAGING_EVICT_FAIL) );

                /* Tell Xen to resume the vcpu */
                if ( (req.flags & VM_EVENT_FLAG_VCPU_PAUSED) ||
                     (req.u.mem_paging.flags & MEM_PAGING_EVICT_FAIL) )
                {
                    /* Prepare the response */
                    rsp.u.mem_paging.gfn = req.u.mem_paging.gfn;
                    rsp.vcpu_id = req.vcpu_id;
                    rsp.flags = req.flags;

                    if ( xenpaging_resume_page(paging, &rsp, 0) < 0 )
                    {
                        PERROR("Error resuming page %"PRIx64"", req.u.mem_paging.gfn);
                        goto out;
                    }
                }
            }
        }

        /* If interrupted, write all pages back into the guest */
        if ( interrupted == SIGTERM || interrupted == SIGINT )
        {
            /* If no more pages to process, exit loop. */
            if ( !paging->num_paged_out )
                break;

            /* One more round if there are still pages to process. */
            resume_pages(paging, paging->num_paged_out);

            /* Resume main loop */
            continue;
        }

        /* Exit main loop on any other signal */
        if ( interrupted )
            break;

        /* Indicate possible error */
        rc = 1;

        /* Check if the target has been reached already */
        tot_pages = xenpaging_get_tot_pages(paging);
        if ( tot_pages < 0 )
            goto out;

        /* Resume all pages if paging is disabled or no target was set */
        if ( paging->target_tot_pages == 0 )
        {
            if ( paging->num_paged_out )
                resume_pages(paging, paging->num_paged_out);
        }
        /* Evict more pages if target not reached */
        else if ( tot_pages > paging->target_tot_pages )
        {
            num = tot_pages - paging->target_tot_pages;
            if ( num != prev_num )
            {
                DPRINTF("Need to evict %d pages to reach %d target_tot_pages\n", num, paging->target_tot_pages);
                prev_num = num;
            }
            /* Limit the number of evicts to be able to process page-in requests */
            if ( num > 42 )
            {
                paging->use_poll_timeout = 0;
                num = 42;
            }
            if ( evict_pages(paging, num) < 0 )
                goto out;
        }
        /* Resume some pages if target not reached */
        else if ( tot_pages < paging->target_tot_pages && paging->num_paged_out )
        {
            num = paging->target_tot_pages - tot_pages;
            if ( num != prev_num )
            {
                DPRINTF("Need to resume %d pages to reach %d target_tot_pages\n", num, paging->target_tot_pages);
                prev_num = num;
            }
            resume_pages(paging, num);
        }
        /* Now target was reached, enable poll() timeout */
        else
        {
            paging->use_poll_timeout = 1;
        }
    }

    /* No error */
    rc = 0;

    DPRINTF("xenpaging got signal %d\n", interrupted);

 out:
    close(paging->fd);
    unlink_pagefile();

    /* Tear down domain paging */
    xenpaging_teardown(paging);

    return rc ? 2 : 0;
}


/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */