1 /******************************************************************************
2  * tools/xenbaked.c
3  *
4  * Tool for collecting raw trace buffer data from Xen and
5  *  performing some accumulation operations and other processing
6  *  on it.
7  *
8  * Copyright (C) 2004 by Intel Research Cambridge
9  * Copyright (C) 2005 by Hewlett Packard, Palo Alto and Fort Collins
10  * Copyright (C) 2006 by Hewlett Packard Fort Collins
11  *
12  * Authors: Diwaker Gupta, diwaker.gupta@hp.com
13  *          Rob Gardner, rob.gardner@hp.com
 *          Lucy Cherkasova, lucy.cherkasova@hp.com
15  * Much code based on xentrace, authored by Mark Williamson,
16  * mark.a.williamson@intel.com
17  * Date:   November, 2005
18  *
19  *  This program is free software; you can redistribute it and/or modify
20  *  it under the terms of the GNU General Public License as published by
21  *  the Free Software Foundation; under version 2 of the License.
22  *
23  *  This program is distributed in the hope that it will be useful,
24  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26  *  GNU General Public License for more details.
27  *
28  *  You should have received a copy of the GNU General Public License
29  *  along with this program; If not, see <http://www.gnu.org/licenses/>.
30  */
31 
32 #include <time.h>
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <sys/mman.h>
36 #include <fcntl.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <signal.h>
40 #include <xenevtchn.h>
41 #define XC_WANT_COMPAT_MAP_FOREIGN_API
42 #include <xenctrl.h>
43 #include <xen/xen.h>
44 #include <string.h>
45 #include <sys/select.h>
46 #include <getopt.h>
47 
48 #define PERROR(_m, _a...)                                       \
49 do {                                                            \
50     int __saved_errno = errno;                                  \
51     fprintf(stderr, "ERROR: " _m " (%d = %s)\n" , ## _a ,       \
52             __saved_errno, strerror(__saved_errno));            \
53     errno = __saved_errno;                                      \
54 } while (0)
55 
56 typedef struct { int counter; } atomic_t;
57 #define _atomic_read(v)		((v).counter)
58 
59 #include <xen/trace.h>
60 #include "xenbaked.h"
61 
62 
63 /***** Compile time configuration of defaults ********************************/
64 
65 /* when we've got more records than this waiting, we log it to the output */
66 #define NEW_DATA_THRESH 1
67 
68 /* sleep for this long (milliseconds) between checking the trace buffers */
69 #define POLL_SLEEP_MILLIS 100
70 
71 /* Size of time period represented by each sample */
72 #define MS_PER_SAMPLE 100
73 
74 /* CPU Frequency */
75 #define MHZ
76 #define CPU_FREQ 2660 MHZ
77 
78 /***** The code **************************************************************/
79 
80 typedef struct settings_st {
81     struct timespec poll_sleep;
82     unsigned long new_data_thresh;
83     unsigned long ms_per_sample;
84     double cpu_freq;
85 } settings_t;
86 
87 struct t_struct {
88     const struct t_info *t_info; /* Structure with information about individual buffers */
89     struct t_buf **meta;    /* Pointers to trace buffer metadata */
90     unsigned char **data;   /* Pointers to trace buffer data areas */
91 };
92 
settings_t opts;     /* runtime options; defaults set in main(), overridden by parse_args() */

int interrupted = 0; /* gets set if we get a SIGHUP */
                     /* NOTE(review): written from a signal handler but read as a
                      * plain int in the main loop; volatile sig_atomic_t would be
                      * the conforming type - confirm before relying on it. */
int rec_count = 0;   /* total trace records processed (reported by dump_stats) */
int wakeups = 0;     /* number of times the main loop woke up */
time_t start_time;   /* wall-clock start time, used for per-second rates */
int dom0_flips = 0;  /* page-flip counter; reset when dom0 is switched in */

_new_qos_data *new_qos;       /* qos area currently being updated (per-cpu selection) */
_new_qos_data **cpu_qos_data; /* one shared-memory qos area per cpu */

int global_cpu;      /* cpu context for qos updates - presumably set while
                      * processing that cpu's records; TODO confirm in process_record */
uint64_t global_now; /* timestamp context for qos updates - see note above */

// array of currently running domains, indexed by cpu
int *running = NULL;

// number of cpu's on this platform
int NCPU = 0;
112 
113 
114 static void advance_next_datapoint(uint64_t);
115 static void alloc_qos_data(int ncpu);
116 static int process_record(int, struct t_rec *);
117 static void qos_kill_thread(int domid);
118 
119 
init_current(int ncpu)120 static void init_current(int ncpu)
121 {
122     running = calloc(ncpu, sizeof(int));
123     NCPU = ncpu;
124     printf("Initialized with %d %s\n", ncpu, (ncpu == 1) ? "cpu" : "cpu's");
125 }
126 
is_current(int domain,int cpu)127 static int is_current(int domain, int cpu)
128 {
129     //  int i;
130 
131     //  for (i=0; i<NCPU; i++)
132     if (running[cpu] == domain)
133         return 1;
134     return 0;
135 }
136 
137 
138 #if 0 /* unused */
139 // return the domain that's currently running on the given cpu
140 static int current(int cpu)
141 {
142     return running[cpu];
143 }
144 #endif
145 
/* Record that @domain is now the domain executing on @cpu. */
static void set_current(int cpu, int domain)
{
    running[cpu] = domain;
}
150 
151 
152 
/* Signal handler for SIGHUP/SIGTERM/SIGINT: ask the main loop to exit.
 * NOTE(review): 'interrupted' is a plain int; volatile sig_atomic_t is the
 * conforming type for data written from a signal handler - confirm. */
static void close_handler(int signal)
{
    interrupted = 1;
}
157 
158 #if 0
159 void dump_record(int cpu, struct t_rec *x)
160 {
161     printf("record: cpu=%x, tsc=%lx, event=%x, d1=%lx\n",
162            cpu, x->cycles, x->event, x->data[0]);
163 }
164 #endif
165 
166 /**
167  * millis_to_timespec - convert a time in milliseconds to a struct timespec
168  * @millis:             time interval in milliseconds
169  */
millis_to_timespec(unsigned long millis)170 static struct timespec millis_to_timespec(unsigned long millis)
171 {
172     struct timespec spec;
173 
174     spec.tv_sec = millis / 1000;
175     spec.tv_nsec = (millis % 1000) * 1000;
176 
177     return spec;
178 }
179 
180 
/* Table mapping trace event ids to human-readable names plus a running
 * count of how many times each was seen (printed by dump_stats()). */
typedef struct
{
    int event_count;  /* number of records seen with this event_id */
    int event_id;     /* TRC_* event identifier; slot 0 is the catch-all */
    char *text;       /* display label; NULL marks the end of the table */
} stat_map_t;

stat_map_t stat_map[] = {
    { 0,       0, 	    "Other" },
    { 0, TRC_SCHED_DOM_ADD, "Add Domain" },
    { 0, TRC_SCHED_DOM_REM, "Remove Domain" },
    { 0, TRC_SCHED_SLEEP, "Sleep" },
    { 0, TRC_SCHED_WAKE,  "Wake" },
    { 0, TRC_SCHED_BLOCK,  "Block" },
    { 0, TRC_SCHED_SWITCH,  "Switch" },
    { 0, TRC_SCHED_S_TIMER_FN, "Timer Func"},
    { 0, TRC_SCHED_SWITCH_INFPREV,  "Switch Prev" },
    { 0, TRC_SCHED_SWITCH_INFNEXT,  "Switch Next" },
    { 0, TRC_MEM_PAGE_GRANT_MAP,  "Page Map" },
    { 0, TRC_MEM_PAGE_GRANT_UNMAP,  "Page Unmap" },
    { 0, TRC_MEM_PAGE_GRANT_TRANSFER,  "Page Transfer" },
    { 0,      0, 		 0  }
};
204 
205 
/* Debug-only sanity check: would print per-cpu ns_gotten totals as a
 * percentage of one billion ns.  The body is compiled out (#if 0), so
 * calling this is currently a no-op. */
static void check_gotten_sum(void)
{
#if 0
    uint64_t sum, ns;
    extern uint64_t total_ns_gotten(uint64_t*);
    double percent;
    int i;

    for (i=0; i<NCPU; i++) {
        new_qos = cpu_qos_data[i];
        ns = billion;
        sum = total_ns_gotten(&ns);

        printf("[cpu%d] ns_gotten over all domains = %lldns, over %lldns\n",
               i, sum, ns);
        percent = (double) sum;
        percent = (100.0*percent) / (double)ns;
        printf(" ==> ns_gotten = %7.3f%%\n", percent);
    }
#endif
}
227 
228 
229 
dump_stats(void)230 static void dump_stats(void)
231 {
232     stat_map_t *smt = stat_map;
233     time_t end_time, run_time;
234 
235     time(&end_time);
236 
237     run_time = end_time - start_time;
238 
239     printf("Event counts:\n");
240     while (smt->text != NULL) {
241         printf("%08d\t%s\n", smt->event_count, smt->text);
242         smt++;
243     }
244 
245     printf("processed %d total records in %d seconds (%ld per second)\n",
246            rec_count, (int)run_time, (long)(rec_count/run_time));
247 
248     printf("woke up %d times in %d seconds (%ld per second)\n", wakeups,
249 	   (int) run_time, (long)(wakeups/run_time));
250 
251     check_gotten_sum();
252 }
253 
log_event(int event_id)254 static void log_event(int event_id)
255 {
256     stat_map_t *smt = stat_map;
257 
258     //  printf("event_id = 0x%x\n", event_id);
259 
260     while (smt->text != NULL) {
261         if (smt->event_id == event_id) {
262             smt->event_count++;
263             return;
264         }
265         smt++;
266     }
267     if (smt->text == NULL)
268         stat_map[0].event_count++;	// other
269 }
270 
271 int virq_port;
272 xenevtchn_handle *xce_handle = NULL;
273 
274 /* Returns the event channel handle. */
275 /* Stolen from xenstore code */
eventchn_init(void)276 static int eventchn_init(void)
277 {
278     int rc;
279 
280     // to revert to old way:
281     if (0)
282         return -1;
283 
284     xce_handle = xenevtchn_open(NULL, 0);
285 
286     if (xce_handle == NULL)
287         perror("Failed to open evtchn device");
288 
289     if ((rc = xenevtchn_bind_virq(xce_handle, VIRQ_TBUF)) == -1)
290         perror("Failed to bind to domain exception virq port");
291     virq_port = rc;
292 
293     return xce_handle == NULL ? -1 : 0;
294 }
295 
wait_for_event(void)296 static void wait_for_event(void)
297 {
298     int ret;
299     fd_set inset;
300     evtchn_port_t port;
301     struct timeval tv;
302     int evtchn_fd;
303 
304     if (xce_handle == NULL) {
305         nanosleep(&opts.poll_sleep, NULL);
306         return;
307     }
308 
309     evtchn_fd = xenevtchn_fd(xce_handle);
310 
311     FD_ZERO(&inset);
312     FD_SET(evtchn_fd, &inset);
313     tv.tv_sec = 1;
314     tv.tv_usec = 0;
315     // tv = millis_to_timespec(&opts.poll_sleep);
316     ret = select(evtchn_fd+1, &inset, NULL, NULL, &tv);
317 
318     if ( (ret == 1) && FD_ISSET(evtchn_fd, &inset)) {
319         if ((port = xenevtchn_pending(xce_handle)) == -1)
320             perror("Failed to read from event fd");
321 
322         //    if (port == virq_port)
323         //      printf("got the event I was looking for\r\n");
324 
325         if (xenevtchn_unmask(xce_handle, port) == -1)
326             perror("Failed to write to event fd");
327     }
328 }
329 
get_tbufs(unsigned long * mfn,unsigned long * size)330 static void get_tbufs(unsigned long *mfn, unsigned long *size)
331 {
332     xc_interface *xc_handle = xc_interface_open(0,0,0);
333     int ret;
334 
335     if ( !xc_handle )
336     {
337         exit(EXIT_FAILURE);
338     }
339 
340     ret = xc_tbuf_enable(xc_handle, DEFAULT_TBUF_SIZE, mfn, size);
341 
342     if ( ret != 0 )
343     {
344         perror("Couldn't enable trace buffers");
345         exit(1);
346     }
347 
348     xc_interface_close(xc_handle);
349 }
350 
disable_tracing(void)351 static void disable_tracing(void)
352 {
353     xc_interface *xc_handle = xc_interface_open(0,0,0);
354     xc_tbuf_disable(xc_handle);
355     xc_interface_close(xc_handle);
356 }
357 
358 /**
359  * map_tbufs - memory map Xen trace buffers into user space
360  * @tbufs_mfn: mfn of the trace buffers
361  * @num:       number of trace buffers to map
362  * @size:      size of each trace buffer
363  *
364  * Maps the Xen trace buffers them into process address space.
365  */
static struct t_struct *map_tbufs(unsigned long tbufs_mfn, unsigned int num,
                                  unsigned long tinfo_size)
{
    xc_interface *xc_handle;
    static struct t_struct tbufs = { 0 };  /* returned by address; must be static */
    int i;

    xc_handle = xc_interface_open(0,0,0);
    if ( !xc_handle )
    {
        exit(EXIT_FAILURE);
    }

    /* Map t_info metadata structure */
    tbufs.t_info = xc_map_foreign_range(xc_handle, DOMID_XEN, tinfo_size,
                                        PROT_READ, tbufs_mfn);

    if ( tbufs.t_info == 0 )
    {
        PERROR("Failed to mmap trace buffers");
        exit(EXIT_FAILURE);
    }

    if ( tbufs.t_info->tbuf_size == 0 )
    {
        fprintf(stderr, "%s: tbuf_size 0!\n", __func__);
        exit(EXIT_FAILURE);
    }

    /* Map per-cpu buffers */
    tbufs.meta = (struct t_buf **)calloc(num, sizeof(struct t_buf *));
    tbufs.data = (unsigned char **)calloc(num, sizeof(unsigned char *));
    if ( tbufs.meta == NULL || tbufs.data == NULL )
    {
        PERROR( "Failed to allocate memory for buffer pointers\n");
        exit(EXIT_FAILURE);
    }

    for(i=0; i<num; i++)
    {
        /* per-cpu mfn list lives at mfn_offset[i] (in uint32_t units)
         * from the start of the t_info area */
        const uint32_t *mfn_list = (const uint32_t *)tbufs.t_info
                                   + tbufs.t_info->mfn_offset[i];
        int j;
        /* widen the 32-bit mfns to xen_pfn_t for the mapping call */
        xen_pfn_t pfn_list[tbufs.t_info->tbuf_size];

        for ( j=0; j<tbufs.t_info->tbuf_size; j++)
            pfn_list[j] = (xen_pfn_t)mfn_list[j];

        tbufs.meta[i] = xc_map_foreign_pages(xc_handle, DOMID_XEN,
                                             PROT_READ | PROT_WRITE,
                                             pfn_list,
                                             tbufs.t_info->tbuf_size);
        if ( tbufs.meta[i] == NULL )
        {
            PERROR("Failed to map cpu buffer!");
            exit(EXIT_FAILURE);
        }
        /* trace records begin immediately after the t_buf header */
        tbufs.data[i] = (unsigned char *)(tbufs.meta[i]+1);
    }

    xc_interface_close(xc_handle);

    return &tbufs;
}
431 
432 /**
433  * get_num_cpus - get the number of logical CPUs
434  */
get_num_cpus(void)435 static unsigned int get_num_cpus(void)
436 {
437     xc_physinfo_t physinfo = { 0 };
438     xc_interface *xc_handle = xc_interface_open(0,0,0);
439     int ret;
440 
441     ret = xc_physinfo(xc_handle, &physinfo);
442 
443     if ( ret != 0 )
444     {
445         PERROR("Failure to get logical CPU count from Xen");
446         exit(EXIT_FAILURE);
447     }
448 
449     xc_interface_close(xc_handle);
450     opts.cpu_freq = (double)physinfo.cpu_khz/1000.0;
451 
452     return physinfo.nr_cpus;
453 }
454 
455 /**
456  * monitor_tbufs - monitor the contents of tbufs
457  */
monitor_tbufs(void)458 static int monitor_tbufs(void)
459 {
460     int i;
461 
462     struct t_struct *tbufs;      /* Pointer to hypervisor maps */
463     struct t_buf **meta;         /* pointers to the trace buffer metadata    */
464     unsigned char **data;        /* pointers to the trace buffer data areas
465                                   * where they are mapped into user space.   */
466     unsigned long tbufs_mfn;     /* mfn of the tbufs                         */
467     unsigned int  num;           /* number of trace buffers / logical CPUS   */
468     unsigned long tinfo_size;    /* size of t_info metadata map              */
469     unsigned long size;          /* size of a single trace buffer            */
470 
471     unsigned long data_size, rec_size;
472 
473     /* get number of logical CPUs (and therefore number of trace buffers) */
474     num = get_num_cpus();
475 
476     init_current(num);
477     alloc_qos_data(num);
478 
479     printf("CPU Frequency = %7.2f\n", opts.cpu_freq);
480 
481     /* setup access to trace buffers */
482     get_tbufs(&tbufs_mfn, &tinfo_size);
483     tbufs = map_tbufs(tbufs_mfn, num, tinfo_size);
484 
485     size = tbufs->t_info->tbuf_size * XC_PAGE_SIZE;
486 
487     data_size = size - sizeof(struct t_buf);
488 
489     meta = tbufs->meta;
490     data = tbufs->data;
491 
492     if ( eventchn_init() < 0 )
493         fprintf(stderr, "Failed to initialize event channel; "
494                 "Using POLL method\r\n");
495 
496     /* now, scan buffers for events */
497     while ( !interrupted )
498     {
499         for ( i = 0; (i < num) && !interrupted; i++ )
500         {
501             unsigned long start_offset, end_offset, cons, prod;
502 
503             cons = meta[i]->cons;
504             prod = meta[i]->prod;
505             xen_rmb(); /* read prod, then read item. */
506 
507             if ( cons == prod )
508                 continue;
509 
510             start_offset = cons % data_size;
511             end_offset = prod % data_size;
512 
513             if ( start_offset >= end_offset )
514             {
515                 while ( start_offset != data_size )
516                 {
517                     rec_size = process_record(
518                         i, (struct t_rec *)(data[i] + start_offset));
519                     start_offset += rec_size;
520                 }
521                 start_offset = 0;
522             }
523             while ( start_offset != end_offset )
524             {
525                 rec_size = process_record(
526                     i, (struct t_rec *)(data[i] + start_offset));
527                 start_offset += rec_size;
528             }
529             xen_mb(); /* read item, then update cons. */
530             meta[i]->cons = prod;
531         }
532 
533 	wait_for_event();
534 	wakeups++;
535     }
536 
537     /* cleanup */
538     free(meta);
539     free(data);
540     /* don't need to munmap - cleanup is automatic */
541 
542     return 0;
543 }
544 
545 
546 /******************************************************************************
547  * Command line handling
548  *****************************************************************************/
549 
550 const char *program_version     = "xenbaked v1.4";
551 const char *program_bug_address = "<rob.gardner@hp.com>";
552 
553 #define xstr(x) str(x)
554 #define str(x) #x
555 
usage(void)556 static void usage(void)
557 {
558 #define USAGE_STR \
559 "Usage: xenbaked [OPTION...]\n" \
560 "Tool to capture and partially process Xen trace buffer data\n" \
561 "\n" \
562 "  -m, --ms_per_sample=MS     Specify the number of milliseconds per sample\n" \
563 "                             (default " xstr(MS_PER_SAMPLE) ").\n" \
564 "  -s, --poll-sleep=p         Set sleep time, p, in milliseconds between\n" \
565 "                             polling the trace buffer for new data\n" \
566 "                             (default " xstr(POLL_SLEEP_MILLIS) ").\n" \
567 "  -t, --log-thresh=l         Set number, l, of new records required to\n" \
568 "                             trigger a write to output (default " \
569                               xstr(NEW_DATA_THRESH) ").\n" \
570 "  -?, --help                 Show this message\n" \
571 " -V, --version              Print program version\n" \
572 "\n" \
573 "This tool is used to capture trace buffer data from Xen.  The data is\n" \
574 "saved in a shared memory structure to be further processed by xenmon.\n"
575 
576     printf(USAGE_STR);
577     printf("\nReport bugs to %s\n", program_bug_address);
578 
579     exit(EXIT_FAILURE);
580 }
581 
/* convert the argument string pointed to by arg to a long int representation */
/* On any conversion error a diagnostic is printed and usage() is called,
 * which exits the program. */
static long argtol(const char *restrict arg, int base)
{
    char *tail = NULL;
    long result;

    errno = 0;
    result = strtol(arg, &tail, base);

    if (errno != 0) {
        fprintf(stderr, "Invalid option argument: %s\n", arg);
        fprintf(stderr, "Error: %s\n\n", strerror(errno));
        usage();   /* does not return */
    }
    if (tail == arg || *tail != '\0') {
        fprintf(stderr, "Invalid option argument: %s\n\n", arg);
        usage();   /* does not return */
    }

    return result;
}
602 
603 /* parse command line arguments */
parse_args(int argc,char ** argv)604 static void parse_args(int argc, char **argv)
605 {
606     int option;
607     static struct option long_options[] = {
608         { "log-thresh",    required_argument, 0, 't' },
609         { "poll-sleep",    required_argument, 0, 's' },
610         { "ms_per_sample", required_argument, 0, 'm' },
611         { "help",          no_argument,       0, '?' },
612         { "version",       no_argument,       0, 'V' },
613         { 0, 0, 0, 0 }
614     };
615 
616     while ( (option = getopt_long(argc, argv, "m:s:t:?V",
617                     long_options, NULL)) != -1)
618     {
619         switch ( option )
620         {
621             case 't': /* set new records threshold for logging */
622                 opts.new_data_thresh = argtol(optarg, 0);
623                 break;
624 
625             case 's': /* set sleep time (given in milliseconds) */
626                 opts.poll_sleep = millis_to_timespec(argtol(optarg, 0));
627                 break;
628 
629             case 'm': /* set ms_per_sample */
630                 opts.ms_per_sample = argtol(optarg, 0);
631                 break;
632 
633             case 'V': /* print program version */
634                 printf("%s\n", program_version);
635                 exit(EXIT_SUCCESS);
636                 break;
637 
638             default:
639                usage();
640         }
641     }
642 
643     /* all arguments should have been processed */
644     if (optind != argc) {
645         usage();
646     }
647 }
648 
649 #define SHARED_MEM_FILE "/var/run/xenq-shm"
alloc_qos_data(int ncpu)650 static void alloc_qos_data(int ncpu)
651 {
652     int i, n, pgsize, off=0;
653     char *dummy;
654     int qos_fd;
655 
656     cpu_qos_data = (_new_qos_data **) calloc(ncpu, sizeof(_new_qos_data *));
657 
658 
659     qos_fd = open(SHARED_MEM_FILE, O_RDWR|O_CREAT|O_TRUNC, 0777);
660     if (qos_fd < 0) {
661         PERROR(SHARED_MEM_FILE);
662         exit(2);
663     }
664     pgsize = getpagesize();
665     dummy = malloc(pgsize);
666     if (!dummy) {
667         PERROR("malloc");
668         exit(EXIT_FAILURE);
669     }
670     memset(dummy, 0, pgsize);
671 
672     for (n=0; n<ncpu; n++) {
673 
674         for (i=0; i<sizeof(_new_qos_data); i=i+pgsize)
675             if ((write(qos_fd, dummy, pgsize)) != pgsize) {
676                 PERROR(SHARED_MEM_FILE);
677                 exit(2);
678             }
679 
680         new_qos = (_new_qos_data *) mmap(0, sizeof(_new_qos_data), PROT_READ|PROT_WRITE,
681                                          MAP_SHARED, qos_fd, off);
682         off += i;
683         if (new_qos == MAP_FAILED) {
684             PERROR("mmap");
685             exit(3);
686         }
687         //  printf("new_qos = %p\n", new_qos);
688         memset(new_qos, 0, sizeof(_new_qos_data));
689         new_qos->next_datapoint = 0;
690         advance_next_datapoint(0);
691         new_qos->structlen = i;
692         new_qos->ncpu = ncpu;
693         //      printf("structlen = 0x%x\n", i);
694         cpu_qos_data[n] = new_qos;
695     }
696     free(dummy);
697     close(qos_fd);
698     new_qos = NULL;
699 }
700 
701 
main(int argc,char ** argv)702 int main(int argc, char **argv)
703 {
704     int ret;
705     struct sigaction act;
706 
707     time(&start_time);
708     opts.poll_sleep = millis_to_timespec(POLL_SLEEP_MILLIS);
709     opts.new_data_thresh = NEW_DATA_THRESH;
710     opts.ms_per_sample = MS_PER_SAMPLE;
711     opts.cpu_freq = CPU_FREQ;
712 
713     parse_args(argc, argv);
714     fprintf(stderr, "ms_per_sample = %ld\n", opts.ms_per_sample);
715 
716 
717     /* ensure that if we get a signal, we'll do cleanup, then exit */
718     act.sa_handler = close_handler;
719     act.sa_flags = 0;
720     sigemptyset(&act.sa_mask);
721     sigaction(SIGHUP,  &act, NULL);
722     sigaction(SIGTERM, &act, NULL);
723     sigaction(SIGINT,  &act, NULL);
724 
725     ret = monitor_tbufs();
726 
727     dump_stats();
728     msync(new_qos, sizeof(_new_qos_data), MS_SYNC);
729     disable_tracing();
730 
731     return ret;
732 }
733 
qos_init_domain(int domid,int idx)734 static void qos_init_domain(int domid, int idx)
735 {
736     int i;
737 
738     memset(&new_qos->domain_info[idx], 0, sizeof(_domain_info));
739     new_qos->domain_info[idx].last_update_time = global_now;
740     //  runnable_start_time[idx] = 0;
741     new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
742     new_qos->domain_info[idx].in_use = 1;
743     new_qos->domain_info[idx].blocked_start_time = 0;
744     new_qos->domain_info[idx].id = domid;
745     if (domid == IDLE_DOMAIN_ID)
746         snprintf(new_qos->domain_info[idx].name,
747 		sizeof(new_qos->domain_info[idx].name),
748 		"Idle Task%d", global_cpu);
749     else
750         snprintf(new_qos->domain_info[idx].name,
751 		sizeof(new_qos->domain_info[idx].name),
752 		"Domain#%d", domid);
753 
754     for (i=0; i<NSAMPLES; i++) {
755         new_qos->qdata[i].ns_gotten[idx] = 0;
756         new_qos->qdata[i].ns_allocated[idx] = 0;
757         new_qos->qdata[i].ns_waiting[idx] = 0;
758         new_qos->qdata[i].ns_blocked[idx] = 0;
759         new_qos->qdata[i].switchin_count[idx] = 0;
760         new_qos->qdata[i].io_count[idx] = 0;
761     }
762 }
763 
global_init_domain(int domid,int idx)764 static void global_init_domain(int domid, int idx)
765 {
766     int cpu;
767     _new_qos_data *saved_qos;
768 
769     saved_qos = new_qos;
770 
771     for (cpu=0; cpu<NCPU; cpu++) {
772         new_qos = cpu_qos_data[cpu];
773         qos_init_domain(domid, idx);
774     }
775     new_qos = saved_qos;
776 }
777 
778 // give index of this domain in the qos data array
indexof(int domid)779 static int indexof(int domid)
780 {
781     int idx;
782     xc_dominfo_t dominfo[NDOMAINS];
783     xc_interface *xc_handle;
784     int ndomains;
785 
786     if (domid < 0) {	// shouldn't happen
787         printf("bad domain id: %d\r\n", domid);
788         return 0;
789     }
790 
791     for (idx=0; idx<NDOMAINS; idx++)
792         if ( (new_qos->domain_info[idx].id == domid) && new_qos->domain_info[idx].in_use)
793             return idx;
794 
795     // not found, make a new entry
796     for (idx=0; idx<NDOMAINS; idx++)
797         if (new_qos->domain_info[idx].in_use == 0) {
798             global_init_domain(domid, idx);
799             return idx;
800         }
801 
802     // call domaininfo hypercall to try and garbage collect unused entries
803     xc_handle = xc_interface_open(0,0,0);
804     ndomains = xc_domain_getinfo(xc_handle, 0, NDOMAINS, dominfo);
805     xc_interface_close(xc_handle);
806 
807     // for each domain in our data, look for it in the system dominfo structure
808     // and purge the domain's data from our state if it does not exist in the
809     // dominfo structure
810     for (idx=0; idx<NDOMAINS; idx++) {
811         int domid = new_qos->domain_info[idx].id;
812         int jdx;
813 
814         for (jdx=0; jdx<ndomains; jdx++) {
815             if (dominfo[jdx].domid == domid)
816                 break;
817         }
818         if (jdx == ndomains)        // we didn't find domid in the dominfo struct
819             if (domid != IDLE_DOMAIN_ID) // exception for idle domain, which is not
820                 // contained in dominfo
821                 qos_kill_thread(domid);	// purge our stale data
822     }
823 
824     // look again for a free slot
825     for (idx=0; idx<NDOMAINS; idx++)
826         if (new_qos->domain_info[idx].in_use == 0) {
827             global_init_domain(domid, idx);
828             return idx;
829         }
830 
831     // still no space found, so bail
832     fprintf(stderr, "out of space in domain table, increase NDOMAINS\r\n");
833     exit(2);
834 }
835 
/* Return the cached runnable flag for @domid (allocating a tracking slot
 * via indexof() if the domain is not yet known). */
static int domain_runnable(int domid)
{
    return new_qos->domain_info[indexof(domid)].runnable;
}
840 
841 
update_blocked_time(int domid,uint64_t now)842 static void update_blocked_time(int domid, uint64_t now)
843 {
844     uint64_t t_blocked;
845     int id = indexof(domid);
846 
847     if (new_qos->domain_info[id].blocked_start_time != 0) {
848         if (now >= new_qos->domain_info[id].blocked_start_time)
849             t_blocked = now - new_qos->domain_info[id].blocked_start_time;
850         else
851             t_blocked = now + (~0ULL - new_qos->domain_info[id].blocked_start_time);
852         new_qos->qdata[new_qos->next_datapoint].ns_blocked[id] += t_blocked;
853     }
854 
855     if (domain_runnable(domid))
856         new_qos->domain_info[id].blocked_start_time = 0;
857     else
858         new_qos->domain_info[id].blocked_start_time = now;
859 }
860 
861 
862 // advance to next datapoint for all domains
advance_next_datapoint(uint64_t now)863 static void advance_next_datapoint(uint64_t now)
864 {
865     int new, old, didx;
866 
867     old = new_qos->next_datapoint;
868     new = QOS_INCR(old);
869     new_qos->next_datapoint = new;
870     //	memset(&new_qos->qdata[new], 0, sizeof(uint64_t)*(2+5*NDOMAINS));
871     for (didx = 0; didx < NDOMAINS; didx++) {
872         new_qos->qdata[new].ns_gotten[didx] = 0;
873         new_qos->qdata[new].ns_allocated[didx] = 0;
874         new_qos->qdata[new].ns_waiting[didx] = 0;
875         new_qos->qdata[new].ns_blocked[didx] = 0;
876         new_qos->qdata[new].switchin_count[didx] = 0;
877         new_qos->qdata[new].io_count[didx] = 0;
878     }
879     new_qos->qdata[new].ns_passed = 0;
880     new_qos->qdata[new].lost_records = 0;
881     new_qos->qdata[new].flip_free_periods = 0;
882 
883     new_qos->qdata[new].timestamp = now;
884 }
885 
886 
887 
/*
 * qos_update_thread - fold elapsed time for one domain into the current
 * sample.
 * @cpu:   cpu whose trace records triggered this update
 * @domid: domain being updated
 * @now:   timestamp of the triggering record
 *
 * Negative time deltas (cross-cpu timestamp skew or 64-bit counter
 * wraparound) are repaired where possible; genuinely out-of-order
 * records are silently dropped and picked up by a later update.
 */
static void qos_update_thread(int cpu, int domid, uint64_t now)
{
    int n, id;
    uint64_t last_update_time, start;
    int64_t time_since_update, run_time = 0;

    id = indexof(domid);

    n = new_qos->next_datapoint;
    last_update_time = new_qos->domain_info[id].last_update_time;

    time_since_update = now - last_update_time;

    if (time_since_update < 0) {
        // what happened here? either a timestamp wraparound, or more likely,
        // a slight inconsistency among timestamps from various cpu's
        if (-time_since_update < billion) {
            // fairly small difference, let's just adjust 'now' to be a little
            // beyond last_update_time
            time_since_update = -time_since_update;
        }
        else if ( ((~0ULL - last_update_time) < billion) && (now < billion) ) {
            // difference is huge, must be a wraparound
            // last_update time should be "near" ~0ULL,
            // and now should be "near" 0
            time_since_update = now + (~0ULL - last_update_time);
            printf("time wraparound\n");
        }
        else {
            // none of the above, may be an out of order record
            // no good solution, just ignore and update again later
            return;
        }
    }

    new_qos->domain_info[id].last_update_time = now;

    /* credit cpu time only if the domain was runnable at the previous
     * update and is in fact the domain running on this cpu */
    if (new_qos->domain_info[id].runnable_at_last_update && is_current(domid, cpu)) {
        start = new_qos->domain_info[id].start_time;
        if (start > now) {		// wrapped around
            run_time = now + (~0ULL - start);
            // this could happen if there is nothing going on within a cpu;
            // in this case the idle domain would run forever
            //        printf("warning: start > now\n");
        }
        else
            run_time = now - start;
        //	if (run_time < 0)	// should not happen
        //	  printf("warning: run_time < 0; start = %lld now= %lld\n", start, now);
        new_qos->domain_info[id].ns_oncpu_since_boot += run_time;
        new_qos->domain_info[id].start_time = now;
        new_qos->domain_info[id].ns_since_boot += time_since_update;

        new_qos->qdata[n].ns_gotten[id] += run_time;
        //	if (domid == 0 && cpu == 1)
        //	  printf("adding run time for dom0 on cpu1\r\n");

    }

    new_qos->domain_info[id].runnable_at_last_update = domain_runnable(domid);

    update_blocked_time(domid, now);

    // how much time passed since this datapoint was updated?
    if (now >= new_qos->qdata[n].timestamp) {
        // all is right with the world, time is increasing
        new_qos->qdata[n].ns_passed += (now - new_qos->qdata[n].timestamp);
    }
    else {
        // time wrapped around
        //new_qos->qdata[n].ns_passed += (now + (~0LL - new_qos->qdata[n].timestamp));
        //    printf("why timewrap?\r\n");
    }
    new_qos->qdata[n].timestamp = now;
}
963 
964 
965 // called by dump routines to update all structures
qos_update_all(uint64_t now,int cpu)966 static void qos_update_all(uint64_t now, int cpu)
967 {
968     int i;
969 
970     for (i=0; i<NDOMAINS; i++)
971         if (new_qos->domain_info[i].in_use)
972             qos_update_thread(cpu, new_qos->domain_info[i].id, now);
973 }
974 
975 
qos_update_thread_stats(int cpu,int domid,uint64_t now)976 static void qos_update_thread_stats(int cpu, int domid, uint64_t now)
977 {
978     if (new_qos->qdata[new_qos->next_datapoint].ns_passed > (million*opts.ms_per_sample)) {
979         qos_update_all(now, cpu);
980         advance_next_datapoint(now);
981         return;
982     }
983     qos_update_thread(cpu, domid, now);
984 }
985 
986 
987 
988 // called when a new thread gets the cpu
qos_switch_in(int cpu,int domid,uint64_t now,unsigned long ns_alloc,unsigned long ns_waited)989 static void qos_switch_in(int cpu, int domid, uint64_t now, unsigned long ns_alloc, unsigned long ns_waited)
990 {
991     int idx = indexof(domid);
992 
993     new_qos->domain_info[idx].runnable = 1;
994     update_blocked_time(domid, now);
995     new_qos->domain_info[idx].blocked_start_time = 0; // invalidate
996     new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
997     //runnable_start_time[idx] = 0;
998 
999     new_qos->domain_info[idx].start_time = now;
1000     new_qos->qdata[new_qos->next_datapoint].switchin_count[idx]++;
1001     new_qos->qdata[new_qos->next_datapoint].ns_allocated[idx] += ns_alloc;
1002     new_qos->qdata[new_qos->next_datapoint].ns_waiting[idx] += ns_waited;
1003     qos_update_thread_stats(cpu, domid, now);
1004     set_current(cpu, domid);
1005 
1006     // count up page flips for dom0 execution
1007     if (domid == 0)
1008         dom0_flips = 0;
1009 }
1010 
1011 // called when the current thread is taken off the cpu
qos_switch_out(int cpu,int domid,uint64_t now,unsigned long gotten)1012 static void qos_switch_out(int cpu, int domid, uint64_t now, unsigned long gotten)
1013 {
1014     int idx = indexof(domid);
1015     int n;
1016 
1017     if (!is_current(domid, cpu)) {
1018         //    printf("switching out domain %d but it is not current. gotten=%ld\r\n", id, gotten);
1019     }
1020 
1021     if (gotten == 0) {
1022         printf("gotten==0 in qos_switchout(domid=%d)\n", domid);
1023     }
1024 
1025     if (gotten < 100) {
1026         printf("gotten<100ns in qos_switchout(domid=%d)\n", domid);
1027     }
1028 
1029 
1030     n = new_qos->next_datapoint;
1031 #if 0
1032     new_qos->qdata[n].ns_gotten[idx] += gotten;
1033     if (gotten > new_qos->qdata[n].ns_passed)
1034         printf("inconsistency #257, diff = %lld\n",
1035                gotten - new_qos->qdata[n].ns_passed );
1036 #endif
1037     new_qos->domain_info[idx].ns_oncpu_since_boot += gotten;
1038     new_qos->domain_info[idx].runnable_start_time = now;
1039     //  runnable_start_time[id] = now;
1040     qos_update_thread_stats(cpu, domid, now);
1041 
1042     // process dom0 page flips
1043     if (domid == 0)
1044         if (dom0_flips == 0)
1045             new_qos->qdata[n].flip_free_periods++;
1046 }
1047 
1048 // called when domain is put to sleep, may also be called
1049 // when thread is already asleep
qos_state_sleeping(int cpu,int domid,uint64_t now)1050 static void qos_state_sleeping(int cpu, int domid, uint64_t now)
1051 {
1052     int idx;
1053 
1054     if (!domain_runnable(domid))	// double call?
1055         return;
1056 
1057     idx = indexof(domid);
1058     new_qos->domain_info[idx].runnable = 0;
1059     new_qos->domain_info[idx].blocked_start_time = now;
1060     new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
1061     //  runnable_start_time[idx] = 0; // invalidate
1062     qos_update_thread_stats(cpu, domid, now);
1063 }
1064 
1065 
1066 
1067 // domain died, presume it's dead on all cpu's, not just mostly dead
qos_kill_thread(int domid)1068 static void qos_kill_thread(int domid)
1069 {
1070     int cpu;
1071 
1072     for (cpu=0; cpu<NCPU; cpu++) {
1073         cpu_qos_data[cpu]->domain_info[indexof(domid)].in_use = 0;
1074     }
1075 
1076 }
1077 
1078 
1079 // called when thread becomes runnable, may also be called
1080 // when thread is already runnable
qos_state_runnable(int cpu,int domid,uint64_t now)1081 static void qos_state_runnable(int cpu, int domid, uint64_t now)
1082 {
1083     int idx;
1084 
1085 
1086     qos_update_thread_stats(cpu, domid, now);
1087 
1088     if (domain_runnable(domid))	// double call?
1089         return;
1090 
1091     idx = indexof(domid);
1092     new_qos->domain_info[idx].runnable = 1;
1093     update_blocked_time(domid, now);
1094 
1095     new_qos->domain_info[idx].blocked_start_time = 0; /* invalidate */
1096     new_qos->domain_info[idx].runnable_start_time = now;
1097     //  runnable_start_time[id] = now;
1098 }
1099 
1100 
// Account a page-grant transfer against the domain's io_count on every
// cpu where it is active, plus dom0's count (dom0 performs the flip).
// 'now' is currently unused but kept for interface symmetry.
static void qos_count_packets(domid_t domid, uint64_t now)
{
    int cpu;
    int slot = indexof(domid);

    for (cpu = 0; cpu < NCPU; cpu++) {
        _new_qos_data *qos = cpu_qos_data[cpu];

        if (qos->domain_info[slot].in_use)
            qos->qdata[qos->next_datapoint].io_count[slot]++;
    }

    new_qos->qdata[new_qos->next_datapoint].io_count[0]++;
    dom0_flips++;
}
1116 
1117 
process_record(int cpu,struct t_rec * r)1118 static int process_record(int cpu, struct t_rec *r)
1119 {
1120     uint64_t now = 0;
1121     uint32_t *extra_u32 = r->u.nocycles.extra_u32;
1122 
1123     new_qos = cpu_qos_data[cpu];
1124 
1125     rec_count++;
1126 
1127     if ( r->cycles_included )
1128     {
1129         now = ((uint64_t)r->u.cycles.cycles_hi << 32) | r->u.cycles.cycles_lo;
1130         now = ((double)now) / (opts.cpu_freq / 1000.0);
1131         extra_u32 = r->u.cycles.extra_u32;
1132     }
1133 
1134     global_now = now;
1135     global_cpu = cpu;
1136 
1137     log_event(r->event);
1138 
1139     switch (r->event) {
1140 
1141     case TRC_SCHED_SWITCH_INFPREV:
1142         // domain data[0] just switched out and received data[1] ns of cpu time
1143         qos_switch_out(cpu, extra_u32[0], now, extra_u32[1]);
1144         //    printf("ns_gotten %ld\n", extra_u32[1]);
1145         break;
1146 
1147     case TRC_SCHED_SWITCH_INFNEXT:
1148         // domain data[0] just switched in and
1149         // waited data[1] ns, and was allocated data[2] ns of cpu time
1150         qos_switch_in(cpu, extra_u32[0], now, extra_u32[2], extra_u32[1]);
1151         break;
1152 
1153     case TRC_SCHED_DOM_ADD:
1154         (void) indexof(extra_u32[0]);
1155         break;
1156 
1157     case TRC_SCHED_DOM_REM:
1158         qos_kill_thread(extra_u32[0]);
1159         break;
1160 
1161     case TRC_SCHED_SLEEP:
1162         qos_state_sleeping(cpu, extra_u32[0], now);
1163         break;
1164 
1165     case TRC_SCHED_WAKE:
1166         qos_state_runnable(cpu, extra_u32[0], now);
1167         break;
1168 
1169     case TRC_SCHED_BLOCK:
1170         qos_state_sleeping(cpu, extra_u32[0], now);
1171         break;
1172 
1173     case TRC_MEM_PAGE_GRANT_TRANSFER:
1174         qos_count_packets(extra_u32[0], now);
1175         break;
1176 
1177     default:
1178         break;
1179     }
1180 
1181     new_qos = NULL;
1182 
1183     return 4 + (r->cycles_included ? 8 : 0) + (r->extra_u32 * 4);
1184 }
1185 
/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */