1 /******************************************************************************
2  * tools/xenbaked.c
3  *
4  * Tool for collecting raw trace buffer data from Xen and
5  *  performing some accumulation operations and other processing
6  *  on it.
7  *
8  * Copyright (C) 2004 by Intel Research Cambridge
9  * Copyright (C) 2005 by Hewlett Packard, Palo Alto and Fort Collins
10  * Copyright (C) 2006 by Hewlett Packard Fort Collins
11  *
12  * Authors: Diwaker Gupta, diwaker.gupta@hp.com
13  *          Rob Gardner, rob.gardner@hp.com
14  *          Lucy Cherkasova, lucy.cherkasova.hp.com
15  * Much code based on xentrace, authored by Mark Williamson,
16  * mark.a.williamson@intel.com
17  * Date:   November, 2005
18  *
19  *  This program is free software; you can redistribute it and/or modify
20  *  it under the terms of the GNU General Public License as published by
21  *  the Free Software Foundation; under version 2 of the License.
22  *
23  *  This program is distributed in the hope that it will be useful,
24  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26  *  GNU General Public License for more details.
27  *
28  *  You should have received a copy of the GNU General Public License
29  *  along with this program; If not, see <http://www.gnu.org/licenses/>.
30  */
31 
32 #include <time.h>
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <sys/mman.h>
36 #include <fcntl.h>
37 #include <unistd.h>
38 #include <errno.h>
39 #include <signal.h>
40 #include <xenevtchn.h>
41 #define XC_WANT_COMPAT_MAP_FOREIGN_API
42 #include <xenctrl.h>
43 #include <xen/xen.h>
44 #include <string.h>
45 #include <sys/select.h>
46 #include <getopt.h>
47 
/*
 * PERROR(fmt, ...) - log a formatted error to stderr together with the
 * current errno value and its string form.  errno is saved and restored
 * so the caller can still inspect it after logging.
 */
#define PERROR(_m, _a...)                                       \
do {                                                            \
    int __saved_errno = errno;                                  \
    fprintf(stderr, "ERROR: " _m " (%d = %s)\n" , ## _a ,       \
            __saved_errno, strerror(__saved_errno));            \
    errno = __saved_errno;                                      \
} while (0)

/* Minimal atomic counter type/reader matching the layout expected by
 * xen/trace.h when included from userspace. */
typedef struct { int counter; } atomic_t;
#define _atomic_read(v)		((v).counter)

#include <xen/trace.h>
#include "xenbaked.h"


/***** Compile time configuration of defaults ********************************/

/* when we've got more records than this waiting, we log it to the output */
#define NEW_DATA_THRESH 1

/* sleep for this long (milliseconds) between checking the trace buffers */
#define POLL_SLEEP_MILLIS 100

/* Size of time period represented by each sample */
#define MS_PER_SAMPLE 100

/* CPU Frequency: MHZ deliberately expands to nothing, so CPU_FREQ reads
 * as "2660 MHz" in source but evaluates to plain 2660 (i.e. MHz units). */
#define MHZ
#define CPU_FREQ 2660 MHZ
77 
78 /***** The code **************************************************************/
79 
/* Runtime-tunable settings, filled from defaults and the command line. */
typedef struct settings_st {
    struct timespec poll_sleep;    /* interval between trace-buffer polls */
    unsigned long new_data_thresh; /* records required to trigger output */
    unsigned long ms_per_sample;   /* milliseconds covered by one sample */
    double cpu_freq;               /* cpu frequency in MHz (from Xen physinfo) */
} settings_t;

/* Handles onto the hypervisor trace buffers once mapped into this process. */
struct t_struct {
    const struct t_info *t_info; /* Structure with information about individual buffers */
    struct t_buf **meta;    /* Pointers to trace buffer metadata */
    unsigned char **data;   /* Pointers to trace buffer data areas */
};
92 
/* Global runtime options, set up in main() and parse_args(). */
settings_t opts;

/* NOTE(review): written from a signal handler (close_handler) and read in
 * the main loop; should arguably be volatile sig_atomic_t -- confirm. */
int interrupted = 0; /* gets set if we get a SIGHUP */
int rec_count = 0;   /* total trace records processed */
int wakeups = 0;     /* number of main-loop wakeups */
time_t start_time;   /* wall-clock time at program start */
int dom0_flips = 0;  /* page-flip count attributed to dom0 */

/* Qos area currently being updated, and the per-cpu shared-memory areas
 * it is switched between (see alloc_qos_data). */
_new_qos_data *new_qos;
_new_qos_data **cpu_qos_data;

/* Presumably set by process_record() (not visible in this chunk) to the
 * cpu and timestamp of the record being processed -- confirm. */
int global_cpu;
uint64_t global_now;

// array of currently running domains, indexed by cpu
int *running = NULL;

// number of cpu's on this platform
int NCPU = 0;


/* Forward declarations for helpers defined later in this file. */
static void advance_next_datapoint(uint64_t);
static void alloc_qos_data(int ncpu);
static int process_record(int, struct t_rec *);
static void qos_kill_thread(int domid);
118 
119 
init_current(int ncpu)120 static void init_current(int ncpu)
121 {
122     running = calloc(ncpu, sizeof(int));
123     NCPU = ncpu;
124     printf("Initialized with %d %s\n", ncpu, (ncpu == 1) ? "cpu" : "cpu's");
125 }
126 
is_current(int domain,int cpu)127 static int is_current(int domain, int cpu)
128 {
129     //  int i;
130 
131     //  for (i=0; i<NCPU; i++)
132     if (running[cpu] == domain)
133         return 1;
134     return 0;
135 }
136 
137 
138 #if 0 /* unused */
139 // return the domain that's currently running on the given cpu
140 static int current(int cpu)
141 {
142     return running[cpu];
143 }
144 #endif
145 
/* Record @domain as the domain now running on @cpu. */
static void set_current(int cpu, int domain)
{
    running[cpu] = domain;
}
150 
151 
152 
/* Signal handler (SIGHUP/SIGTERM/SIGINT): ask the main loop to exit
 * cleanly by setting the global 'interrupted' flag. */
static void close_handler(int signal)
{
    interrupted = 1;
}
157 
158 #if 0
159 void dump_record(int cpu, struct t_rec *x)
160 {
161     printf("record: cpu=%x, tsc=%lx, event=%x, d1=%lx\n",
162            cpu, x->cycles, x->event, x->data[0]);
163 }
164 #endif
165 
/**
 * millis_to_timespec - convert a time in milliseconds to a struct timespec
 * @millis:             time interval in milliseconds
 *
 * tv_nsec is in NANOseconds, so the sub-second remainder must be scaled
 * by 1,000,000.  The previous code multiplied by 1,000 (microseconds),
 * making every nanosleep()-based poll interval 1000x shorter than asked.
 */
static struct timespec millis_to_timespec(unsigned long millis)
{
    struct timespec spec;

    spec.tv_sec = millis / 1000;
    spec.tv_nsec = (millis % 1000) * 1000000;

    return spec;
}
179 
180 
/* Per-event-type counter: occurrence count, trace event id, and a label. */
typedef struct
{
    int event_count;
    int event_id;
    char *text;
} stat_map_t;

/* Table of trace events we classify for dump_stats(); slot 0 accumulates
 * everything not matched below.  Terminated by a NULL text entry. */
stat_map_t stat_map[] = {
    { 0,       0, 	    "Other" },
    { 0, TRC_SCHED_DOM_ADD, "Add Domain" },
    { 0, TRC_SCHED_DOM_REM, "Remove Domain" },
    { 0, TRC_SCHED_SLEEP, "Sleep" },
    { 0, TRC_SCHED_WAKE,  "Wake" },
    { 0, TRC_SCHED_BLOCK,  "Block" },
    { 0, TRC_SCHED_SWITCH,  "Switch" },
    { 0, TRC_SCHED_S_TIMER_FN, "Timer Func"},
    { 0, TRC_SCHED_SWITCH_INFPREV,  "Switch Prev" },
    { 0, TRC_SCHED_SWITCH_INFNEXT,  "Switch Next" },
    { 0, TRC_MEM_PAGE_GRANT_MAP,  "Page Map" },
    { 0, TRC_MEM_PAGE_GRANT_UNMAP,  "Page Unmap" },
    { 0, TRC_MEM_PAGE_GRANT_TRANSFER,  "Page Transfer" },
    { 0,      0, 		 0  }
};
204 
205 
/*
 * check_gotten_sum - sanity check that per-domain ns_gotten values sum to
 * roughly the elapsed time on each cpu.  Body is currently compiled out
 * (#if 0), so this is a no-op.
 */
static void check_gotten_sum(void)
{
#if 0
    uint64_t sum, ns;
    extern uint64_t total_ns_gotten(uint64_t*);
    double percent;
    int i;

    for (i=0; i<NCPU; i++) {
        new_qos = cpu_qos_data[i];
        ns = billion;
        sum = total_ns_gotten(&ns);

        printf("[cpu%d] ns_gotten over all domains = %lldns, over %lldns\n",
               i, sum, ns);
        percent = (double) sum;
        percent = (100.0*percent) / (double)ns;
        printf(" ==> ns_gotten = %7.3f%%\n", percent);
    }
#endif
}
227 
228 
229 
dump_stats(void)230 static void dump_stats(void)
231 {
232     stat_map_t *smt = stat_map;
233     time_t end_time, run_time;
234 
235     time(&end_time);
236 
237     run_time = end_time - start_time;
238 
239     printf("Event counts:\n");
240     while (smt->text != NULL) {
241         printf("%08d\t%s\n", smt->event_count, smt->text);
242         smt++;
243     }
244 
245     printf("processed %d total records in %d seconds (%ld per second)\n",
246            rec_count, (int)run_time,
247            run_time ? (long)(rec_count/run_time) : 0L);
248 
249     printf("woke up %d times in %d seconds (%ld per second)\n",
250            wakeups, (int) run_time,
251            run_time ? (long)(wakeups/run_time) : 0L);
252 
253     check_gotten_sum();
254 }
255 
log_event(int event_id)256 static void log_event(int event_id)
257 {
258     stat_map_t *smt = stat_map;
259 
260     //  printf("event_id = 0x%x\n", event_id);
261 
262     while (smt->text != NULL) {
263         if (smt->event_id == event_id) {
264             smt->event_count++;
265             return;
266         }
267         smt++;
268     }
269     if (smt->text == NULL)
270         stat_map[0].event_count++;	// other
271 }
272 
273 int virq_port;
274 xenevtchn_handle *xce_handle = NULL;
275 
276 /* Returns the event channel handle. */
277 /* Stolen from xenstore code */
eventchn_init(void)278 static int eventchn_init(void)
279 {
280     int rc;
281 
282     // to revert to old way:
283     if (0)
284         return -1;
285 
286     xce_handle = xenevtchn_open(NULL, 0);
287 
288     if (xce_handle == NULL)
289         perror("Failed to open evtchn device");
290 
291     if ((rc = xenevtchn_bind_virq(xce_handle, VIRQ_TBUF)) == -1)
292         perror("Failed to bind to domain exception virq port");
293     virq_port = rc;
294 
295     return xce_handle == NULL ? -1 : 0;
296 }
297 
wait_for_event(void)298 static void wait_for_event(void)
299 {
300     int ret;
301     fd_set inset;
302     evtchn_port_t port;
303     struct timeval tv;
304     int evtchn_fd;
305 
306     if (xce_handle == NULL) {
307         nanosleep(&opts.poll_sleep, NULL);
308         return;
309     }
310 
311     evtchn_fd = xenevtchn_fd(xce_handle);
312 
313     FD_ZERO(&inset);
314     FD_SET(evtchn_fd, &inset);
315     tv.tv_sec = 1;
316     tv.tv_usec = 0;
317     // tv = millis_to_timespec(&opts.poll_sleep);
318     ret = select(evtchn_fd+1, &inset, NULL, NULL, &tv);
319 
320     if ( (ret == 1) && FD_ISSET(evtchn_fd, &inset)) {
321         if ((port = xenevtchn_pending(xce_handle)) == -1)
322             perror("Failed to read from event fd");
323 
324         //    if (port == virq_port)
325         //      printf("got the event I was looking for\r\n");
326 
327         if (xenevtchn_unmask(xce_handle, port) == -1)
328             perror("Failed to write to event fd");
329     }
330 }
331 
get_tbufs(unsigned long * mfn,unsigned long * size)332 static void get_tbufs(unsigned long *mfn, unsigned long *size)
333 {
334     xc_interface *xc_handle = xc_interface_open(0,0,0);
335     int ret;
336 
337     if ( !xc_handle )
338     {
339         exit(EXIT_FAILURE);
340     }
341 
342     ret = xc_tbuf_enable(xc_handle, DEFAULT_TBUF_SIZE, mfn, size);
343 
344     if ( ret != 0 )
345     {
346         perror("Couldn't enable trace buffers");
347         exit(1);
348     }
349 
350     xc_interface_close(xc_handle);
351 }
352 
disable_tracing(void)353 static void disable_tracing(void)
354 {
355     xc_interface *xc_handle = xc_interface_open(0,0,0);
356     xc_tbuf_disable(xc_handle);
357     xc_interface_close(xc_handle);
358 }
359 
/**
 * map_tbufs - memory map Xen trace buffers into user space
 * @tbufs_mfn:  mfn of the t_info trace buffer metadata area
 * @num:        number of trace buffers (one per logical cpu) to map
 * @tinfo_size: size in bytes of the t_info metadata area
 *
 * Maps the t_info metadata, then each per-cpu trace buffer, into this
 * process's address space.  Returns a pointer to a static t_struct whose
 * mappings remain valid for the life of the process.  Exits on failure.
 */
static struct t_struct *map_tbufs(unsigned long tbufs_mfn, unsigned int num,
                                  unsigned long tinfo_size)
{
    xc_interface *xc_handle;
    static struct t_struct tbufs = { 0 };
    int i;

    xc_handle = xc_interface_open(0,0,0);
    if ( !xc_handle )
    {
        exit(EXIT_FAILURE);
    }

    /* Map t_info metadata structure */
    tbufs.t_info = xc_map_foreign_range(xc_handle, DOMID_XEN, tinfo_size,
                                        PROT_READ, tbufs_mfn);

    if ( tbufs.t_info == 0 )
    {
        PERROR("Failed to mmap trace buffers");
        exit(EXIT_FAILURE);
    }

    if ( tbufs.t_info->tbuf_size == 0 )
    {
        fprintf(stderr, "%s: tbuf_size 0!\n", __func__);
        exit(EXIT_FAILURE);
    }

    /* Map per-cpu buffers */
    tbufs.meta = (struct t_buf **)calloc(num, sizeof(struct t_buf *));
    tbufs.data = (unsigned char **)calloc(num, sizeof(unsigned char *));
    if ( tbufs.meta == NULL || tbufs.data == NULL )
    {
        PERROR( "Failed to allocate memory for buffer pointers\n");
        exit(EXIT_FAILURE);
    }

    for(i=0; i<num; i++)
    {
        /* Each cpu's page list is stored at a 32-bit-word offset inside
         * the t_info area; widen the 32-bit mfns to xen_pfn_t for the
         * mapping call. */
        const uint32_t *mfn_list = (const uint32_t *)tbufs.t_info
                                   + tbufs.t_info->mfn_offset[i];
        int j;
        xen_pfn_t pfn_list[tbufs.t_info->tbuf_size];

        for ( j=0; j<tbufs.t_info->tbuf_size; j++)
            pfn_list[j] = (xen_pfn_t)mfn_list[j];

        tbufs.meta[i] = xc_map_foreign_pages(xc_handle, DOMID_XEN,
                                             PROT_READ | PROT_WRITE,
                                             pfn_list,
                                             tbufs.t_info->tbuf_size);
        if ( tbufs.meta[i] == NULL )
        {
            PERROR("Failed to map cpu buffer!");
            exit(EXIT_FAILURE);
        }
        /* Trace records begin immediately after the t_buf header. */
        tbufs.data[i] = (unsigned char *)(tbufs.meta[i]+1);
    }

    xc_interface_close(xc_handle);

    return &tbufs;
}
433 
434 /**
435  * get_num_cpus - get the number of logical CPUs
436  */
get_num_cpus(void)437 static unsigned int get_num_cpus(void)
438 {
439     xc_physinfo_t physinfo = { 0 };
440     xc_interface *xc_handle = xc_interface_open(0,0,0);
441     int ret;
442 
443     ret = xc_physinfo(xc_handle, &physinfo);
444 
445     if ( ret != 0 )
446     {
447         PERROR("Failure to get logical CPU count from Xen");
448         exit(EXIT_FAILURE);
449     }
450 
451     xc_interface_close(xc_handle);
452     opts.cpu_freq = (double)physinfo.cpu_khz/1000.0;
453 
454     return physinfo.nr_cpus;
455 }
456 
/**
 * monitor_tbufs - monitor the contents of tbufs
 *
 * Main collection loop: maps the trace buffers, then repeatedly drains
 * each cpu's record ring via process_record() until a signal sets
 * 'interrupted'.  Returns 0.
 */
static int monitor_tbufs(void)
{
    int i;

    struct t_struct *tbufs;      /* Pointer to hypervisor maps */
    struct t_buf **meta;         /* pointers to the trace buffer metadata    */
    unsigned char **data;        /* pointers to the trace buffer data areas
                                  * where they are mapped into user space.   */
    unsigned long tbufs_mfn;     /* mfn of the tbufs                         */
    unsigned int  num;           /* number of trace buffers / logical CPUS   */
    unsigned long tinfo_size;    /* size of t_info metadata map              */
    unsigned long size;          /* size of a single trace buffer            */

    unsigned long data_size, rec_size;

    /* get number of logical CPUs (and therefore number of trace buffers) */
    num = get_num_cpus();

    init_current(num);
    alloc_qos_data(num);

    printf("CPU Frequency = %7.2f\n", opts.cpu_freq);

    /* setup access to trace buffers */
    get_tbufs(&tbufs_mfn, &tinfo_size);
    tbufs = map_tbufs(tbufs_mfn, num, tinfo_size);

    size = tbufs->t_info->tbuf_size * XC_PAGE_SIZE;

    /* usable ring size excludes the t_buf header */
    data_size = size - sizeof(struct t_buf);

    meta = tbufs->meta;
    data = tbufs->data;

    if ( eventchn_init() < 0 )
        fprintf(stderr, "Failed to initialize event channel; "
                "Using POLL method\r\n");

    /* now, scan buffers for events */
    while ( !interrupted )
    {
        for ( i = 0; (i < num) && !interrupted; i++ )
        {
            unsigned long start_offset, end_offset, cons, prod;

            cons = meta[i]->cons;
            prod = meta[i]->prod;
            xen_rmb(); /* read prod, then read item. */

            if ( cons == prod )
                continue; /* ring is empty */

            start_offset = cons % data_size;
            end_offset = prod % data_size;

            if ( start_offset >= end_offset )
            {
                /* producer wrapped: consume to the end of the buffer
                 * first, then restart from offset 0 below */
                while ( start_offset != data_size )
                {
                    rec_size = process_record(
                        i, (struct t_rec *)(data[i] + start_offset));
                    start_offset += rec_size;
                }
                start_offset = 0;
            }
            while ( start_offset != end_offset )
            {
                rec_size = process_record(
                    i, (struct t_rec *)(data[i] + start_offset));
                start_offset += rec_size;
            }
            xen_mb(); /* read item, then update cons. */
            meta[i]->cons = prod;
        }

	wait_for_event();
	wakeups++;
    }

    /* cleanup */
    free(meta);
    free(data);
    /* don't need to munmap - cleanup is automatic */

    return 0;
}
546 
547 
548 /******************************************************************************
549  * Command line handling
550  *****************************************************************************/
551 
const char *program_version     = "xenbaked v1.4";
const char *program_bug_address = "<rob.gardner@hp.com>";

/* Stringify helpers used to splice numeric defaults into the usage text. */
#define xstr(x) str(x)
#define str(x) #x
557 
usage(void)558 static void usage(void)
559 {
560 #define USAGE_STR \
561 "Usage: xenbaked [OPTION...]\n" \
562 "Tool to capture and partially process Xen trace buffer data\n" \
563 "\n" \
564 "  -m, --ms_per_sample=MS     Specify the number of milliseconds per sample\n" \
565 "                             (default " xstr(MS_PER_SAMPLE) ").\n" \
566 "  -s, --poll-sleep=p         Set sleep time, p, in milliseconds between\n" \
567 "                             polling the trace buffer for new data\n" \
568 "                             (default " xstr(POLL_SLEEP_MILLIS) ").\n" \
569 "  -t, --log-thresh=l         Set number, l, of new records required to\n" \
570 "                             trigger a write to output (default " \
571                               xstr(NEW_DATA_THRESH) ").\n" \
572 "  -?, --help                 Show this message\n" \
573 " -V, --version              Print program version\n" \
574 "\n" \
575 "This tool is used to capture trace buffer data from Xen.  The data is\n" \
576 "saved in a shared memory structure to be further processed by xenmon.\n"
577 
578     printf(USAGE_STR);
579     printf("\nReport bugs to %s\n", program_bug_address);
580 
581     exit(EXIT_FAILURE);
582 }
583 
/*
 * argtol - convert option string @arg to a long using strtol() in the
 * given @base.  Any parse error (range error, empty string, trailing
 * junk) prints a diagnostic and exits via usage().
 */
static long argtol(const char *restrict arg, int base)
{
    char *tail;
    long value;

    errno = 0;
    value = strtol(arg, &tail, base);

    if (errno != 0) {
        fprintf(stderr, "Invalid option argument: %s\n", arg);
        fprintf(stderr, "Error: %s\n\n", strerror(errno));
        usage();            /* does not return */
    }
    if (tail == arg || *tail != '\0') {
        fprintf(stderr, "Invalid option argument: %s\n\n", arg);
        usage();            /* does not return */
    }

    return value;
}
604 
605 /* parse command line arguments */
parse_args(int argc,char ** argv)606 static void parse_args(int argc, char **argv)
607 {
608     int option;
609     static struct option long_options[] = {
610         { "log-thresh",    required_argument, 0, 't' },
611         { "poll-sleep",    required_argument, 0, 's' },
612         { "ms_per_sample", required_argument, 0, 'm' },
613         { "help",          no_argument,       0, '?' },
614         { "version",       no_argument,       0, 'V' },
615         { 0, 0, 0, 0 }
616     };
617 
618     while ( (option = getopt_long(argc, argv, "m:s:t:?V",
619                     long_options, NULL)) != -1)
620     {
621         switch ( option )
622         {
623             case 't': /* set new records threshold for logging */
624                 opts.new_data_thresh = argtol(optarg, 0);
625                 break;
626 
627             case 's': /* set sleep time (given in milliseconds) */
628                 opts.poll_sleep = millis_to_timespec(argtol(optarg, 0));
629                 break;
630 
631             case 'm': /* set ms_per_sample */
632                 opts.ms_per_sample = argtol(optarg, 0);
633                 break;
634 
635             case 'V': /* print program version */
636                 printf("%s\n", program_version);
637                 exit(EXIT_SUCCESS);
638                 break;
639 
640             default:
641                usage();
642         }
643     }
644 
645     /* all arguments should have been processed */
646     if (optind != argc) {
647         usage();
648     }
649 }
650 
651 #define SHARED_MEM_FILE "/var/run/xenq-shm"
alloc_qos_data(int ncpu)652 static void alloc_qos_data(int ncpu)
653 {
654     int i, n, pgsize, off=0;
655     char *dummy;
656     int qos_fd;
657 
658     cpu_qos_data = (_new_qos_data **) calloc(ncpu, sizeof(_new_qos_data *));
659 
660 
661     qos_fd = open(SHARED_MEM_FILE, O_RDWR|O_CREAT|O_TRUNC, 0777);
662     if (qos_fd < 0) {
663         PERROR(SHARED_MEM_FILE);
664         exit(2);
665     }
666     pgsize = getpagesize();
667     dummy = malloc(pgsize);
668     if (!dummy) {
669         PERROR("malloc");
670         exit(EXIT_FAILURE);
671     }
672     memset(dummy, 0, pgsize);
673 
674     for (n=0; n<ncpu; n++) {
675 
676         for (i=0; i<sizeof(_new_qos_data); i=i+pgsize)
677             if ((write(qos_fd, dummy, pgsize)) != pgsize) {
678                 PERROR(SHARED_MEM_FILE);
679                 exit(2);
680             }
681 
682         new_qos = (_new_qos_data *) mmap(0, sizeof(_new_qos_data), PROT_READ|PROT_WRITE,
683                                          MAP_SHARED, qos_fd, off);
684         off += i;
685         if (new_qos == MAP_FAILED) {
686             PERROR("mmap");
687             exit(3);
688         }
689         //  printf("new_qos = %p\n", new_qos);
690         memset(new_qos, 0, sizeof(_new_qos_data));
691         new_qos->next_datapoint = 0;
692         advance_next_datapoint(0);
693         new_qos->structlen = i;
694         new_qos->ncpu = ncpu;
695         //      printf("structlen = 0x%x\n", i);
696         cpu_qos_data[n] = new_qos;
697     }
698     free(dummy);
699     close(qos_fd);
700     new_qos = NULL;
701 }
702 
703 
/*
 * main - set defaults, parse arguments, install signal handlers, then run
 * the trace-buffer collection loop until interrupted; finally dump stats,
 * flush the shared-memory qos data and disable hypervisor tracing.
 */
int main(int argc, char **argv)
{
    int ret;
    struct sigaction act;

    time(&start_time);
    /* Compile-time defaults; may be overridden by parse_args(). */
    opts.poll_sleep = millis_to_timespec(POLL_SLEEP_MILLIS);
    opts.new_data_thresh = NEW_DATA_THRESH;
    opts.ms_per_sample = MS_PER_SAMPLE;
    opts.cpu_freq = CPU_FREQ;

    parse_args(argc, argv);
    fprintf(stderr, "ms_per_sample = %ld\n", opts.ms_per_sample);


    /* ensure that if we get a signal, we'll do cleanup, then exit */
    act.sa_handler = close_handler;
    act.sa_flags = 0;
    sigemptyset(&act.sa_mask);
    sigaction(SIGHUP,  &act, NULL);
    sigaction(SIGTERM, &act, NULL);
    sigaction(SIGINT,  &act, NULL);

    ret = monitor_tbufs();

    dump_stats();
    /* NOTE(review): new_qos is NULLed at the end of alloc_qos_data() and
     * only reassigned while records are processed; if no record was ever
     * seen, this msync() gets a NULL address -- confirm intended. */
    msync(new_qos, sizeof(_new_qos_data), MS_SYNC);
    disable_tracing();

    return ret;
}
735 
qos_init_domain(int domid,int idx)736 static void qos_init_domain(int domid, int idx)
737 {
738     int i;
739 
740     memset(&new_qos->domain_info[idx], 0, sizeof(_domain_info));
741     new_qos->domain_info[idx].last_update_time = global_now;
742     //  runnable_start_time[idx] = 0;
743     new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
744     new_qos->domain_info[idx].in_use = 1;
745     new_qos->domain_info[idx].blocked_start_time = 0;
746     new_qos->domain_info[idx].id = domid;
747     if (domid == IDLE_DOMAIN_ID)
748         snprintf(new_qos->domain_info[idx].name,
749 		sizeof(new_qos->domain_info[idx].name),
750 		"Idle Task%d", global_cpu);
751     else
752         snprintf(new_qos->domain_info[idx].name,
753 		sizeof(new_qos->domain_info[idx].name),
754 		"Domain#%d", domid);
755 
756     for (i=0; i<NSAMPLES; i++) {
757         new_qos->qdata[i].ns_gotten[idx] = 0;
758         new_qos->qdata[i].ns_allocated[idx] = 0;
759         new_qos->qdata[i].ns_waiting[idx] = 0;
760         new_qos->qdata[i].ns_blocked[idx] = 0;
761         new_qos->qdata[i].switchin_count[idx] = 0;
762         new_qos->qdata[i].io_count[idx] = 0;
763     }
764 }
765 
global_init_domain(int domid,int idx)766 static void global_init_domain(int domid, int idx)
767 {
768     int cpu;
769     _new_qos_data *saved_qos;
770 
771     saved_qos = new_qos;
772 
773     for (cpu=0; cpu<NCPU; cpu++) {
774         new_qos = cpu_qos_data[cpu];
775         qos_init_domain(domid, idx);
776     }
777     new_qos = saved_qos;
778 }
779 
780 // give index of this domain in the qos data array
indexof(int domid)781 static int indexof(int domid)
782 {
783     int idx;
784     xc_dominfo_t dominfo[NDOMAINS];
785     xc_interface *xc_handle;
786     int ndomains;
787 
788     if (domid < 0) {	// shouldn't happen
789         printf("bad domain id: %d\r\n", domid);
790         return 0;
791     }
792 
793     for (idx=0; idx<NDOMAINS; idx++)
794         if ( (new_qos->domain_info[idx].id == domid) && new_qos->domain_info[idx].in_use)
795             return idx;
796 
797     // not found, make a new entry
798     for (idx=0; idx<NDOMAINS; idx++)
799         if (new_qos->domain_info[idx].in_use == 0) {
800             global_init_domain(domid, idx);
801             return idx;
802         }
803 
804     // call domaininfo hypercall to try and garbage collect unused entries
805     xc_handle = xc_interface_open(0,0,0);
806     ndomains = xc_domain_getinfo(xc_handle, 0, NDOMAINS, dominfo);
807     xc_interface_close(xc_handle);
808 
809     // for each domain in our data, look for it in the system dominfo structure
810     // and purge the domain's data from our state if it does not exist in the
811     // dominfo structure
812     for (idx=0; idx<NDOMAINS; idx++) {
813         int domid = new_qos->domain_info[idx].id;
814         int jdx;
815 
816         for (jdx=0; jdx<ndomains; jdx++) {
817             if (dominfo[jdx].domid == domid)
818                 break;
819         }
820         if (jdx == ndomains)        // we didn't find domid in the dominfo struct
821             if (domid != IDLE_DOMAIN_ID) // exception for idle domain, which is not
822                 // contained in dominfo
823                 qos_kill_thread(domid);	// purge our stale data
824     }
825 
826     // look again for a free slot
827     for (idx=0; idx<NDOMAINS; idx++)
828         if (new_qos->domain_info[idx].in_use == 0) {
829             global_init_domain(domid, idx);
830             return idx;
831         }
832 
833     // still no space found, so bail
834     fprintf(stderr, "out of space in domain table, increase NDOMAINS\r\n");
835     exit(2);
836 }
837 
/* Return the cached runnable flag for @domid from the qos domain table. */
static int domain_runnable(int domid)
{
    return new_qos->domain_info[indexof(domid)].runnable;
}
842 
843 
/*
 * update_blocked_time - fold any in-progress blocked interval for @domid
 * (up to @now) into the current sample, then restart or clear the
 * blocked-time clock depending on whether the domain is now runnable.
 */
static void update_blocked_time(int domid, uint64_t now)
{
    uint64_t t_blocked;
    int id = indexof(domid);

    if (new_qos->domain_info[id].blocked_start_time != 0) {
        if (now >= new_qos->domain_info[id].blocked_start_time)
            t_blocked = now - new_qos->domain_info[id].blocked_start_time;
        else
            /* timestamp wrapped around since blocking began */
            t_blocked = now + (~0ULL - new_qos->domain_info[id].blocked_start_time);
        new_qos->qdata[new_qos->next_datapoint].ns_blocked[id] += t_blocked;
    }

    if (domain_runnable(domid))
        new_qos->domain_info[id].blocked_start_time = 0; /* not blocked */
    else
        new_qos->domain_info[id].blocked_start_time = now; /* still blocked */
}
862 
863 
864 // advance to next datapoint for all domains
advance_next_datapoint(uint64_t now)865 static void advance_next_datapoint(uint64_t now)
866 {
867     int new, old, didx;
868 
869     old = new_qos->next_datapoint;
870     new = QOS_INCR(old);
871     new_qos->next_datapoint = new;
872     //	memset(&new_qos->qdata[new], 0, sizeof(uint64_t)*(2+5*NDOMAINS));
873     for (didx = 0; didx < NDOMAINS; didx++) {
874         new_qos->qdata[new].ns_gotten[didx] = 0;
875         new_qos->qdata[new].ns_allocated[didx] = 0;
876         new_qos->qdata[new].ns_waiting[didx] = 0;
877         new_qos->qdata[new].ns_blocked[didx] = 0;
878         new_qos->qdata[new].switchin_count[didx] = 0;
879         new_qos->qdata[new].io_count[didx] = 0;
880     }
881     new_qos->qdata[new].ns_passed = 0;
882     new_qos->qdata[new].lost_records = 0;
883     new_qos->qdata[new].flip_free_periods = 0;
884 
885     new_qos->qdata[new].timestamp = now;
886 }
887 
888 
889 
/*
 * qos_update_thread - account elapsed time for domain @domid on @cpu up
 * to timestamp @now: cpu time received (if it has been running), blocked
 * time, and the wall time covered by the current sample.  Tolerates
 * small cross-cpu timestamp skew and full 64-bit wraparound.
 */
static void qos_update_thread(int cpu, int domid, uint64_t now)
{
    int n, id;
    uint64_t last_update_time, start;
    int64_t time_since_update, run_time = 0;

    id = indexof(domid);

    n = new_qos->next_datapoint;
    last_update_time = new_qos->domain_info[id].last_update_time;

    time_since_update = now - last_update_time;

    if (time_since_update < 0) {
        // what happened here? either a timestamp wraparound, or more likely,
        // a slight inconsistency among timestamps from various cpu's
        if (-time_since_update < billion) {
            // fairly small difference, let's just adjust 'now' to be a little
            // beyond last_update_time
            time_since_update = -time_since_update;
        }
        else if ( ((~0ULL - last_update_time) < billion) && (now < billion) ) {
            // difference is huge, must be a wraparound
            // last_update time should be "near" ~0ULL,
            // and now should be "near" 0
            time_since_update = now + (~0ULL - last_update_time);
            printf("time wraparound\n");
        }
        else {
            // none of the above, may be an out of order record
            // no good solution, just ignore and update again later
            return;
        }
    }

    new_qos->domain_info[id].last_update_time = now;

    if (new_qos->domain_info[id].runnable_at_last_update && is_current(domid, cpu)) {
        /* domain has been on this cpu since the last update: credit it
         * the elapsed cpu time */
        start = new_qos->domain_info[id].start_time;
        if (start > now) {		// wrapped around
            run_time = now + (~0ULL - start);
	    // this could happen if there is nothing going on within a cpu;
	    // in this case the idle domain would run forever
	    //        printf("warning: start > now\n");
        }
        else
            run_time = now - start;
	//	if (run_time < 0)	// should not happen
	//	  printf("warning: run_time < 0; start = %lld now= %lld\n", start, now);
        new_qos->domain_info[id].ns_oncpu_since_boot += run_time;
        new_qos->domain_info[id].start_time = now;
        new_qos->domain_info[id].ns_since_boot += time_since_update;

	new_qos->qdata[n].ns_gotten[id] += run_time;
	//	if (domid == 0 && cpu == 1)
	//	  printf("adding run time for dom0 on cpu1\r\n");

    }

    new_qos->domain_info[id].runnable_at_last_update = domain_runnable(domid);

    update_blocked_time(domid, now);

    // how much time passed since this datapoint was updated?
    if (now >= new_qos->qdata[n].timestamp) {
        // all is right with the world, time is increasing
        new_qos->qdata[n].ns_passed += (now - new_qos->qdata[n].timestamp);
    }
    else {
        // time wrapped around
        //new_qos->qdata[n].ns_passed += (now + (~0LL - new_qos->qdata[n].timestamp));
        //    printf("why timewrap?\r\n");
    }
    new_qos->qdata[n].timestamp = now;
}
965 
966 
967 // called by dump routines to update all structures
qos_update_all(uint64_t now,int cpu)968 static void qos_update_all(uint64_t now, int cpu)
969 {
970     int i;
971 
972     for (i=0; i<NDOMAINS; i++)
973         if (new_qos->domain_info[i].in_use)
974             qos_update_thread(cpu, new_qos->domain_info[i].id, now);
975 }
976 
977 
qos_update_thread_stats(int cpu,int domid,uint64_t now)978 static void qos_update_thread_stats(int cpu, int domid, uint64_t now)
979 {
980     if (new_qos->qdata[new_qos->next_datapoint].ns_passed > (million*opts.ms_per_sample)) {
981         qos_update_all(now, cpu);
982         advance_next_datapoint(now);
983         return;
984     }
985     qos_update_thread(cpu, domid, now);
986 }
987 
988 
989 
990 // called when a new thread gets the cpu
qos_switch_in(int cpu,int domid,uint64_t now,unsigned long ns_alloc,unsigned long ns_waited)991 static void qos_switch_in(int cpu, int domid, uint64_t now, unsigned long ns_alloc, unsigned long ns_waited)
992 {
993     int idx = indexof(domid);
994 
995     new_qos->domain_info[idx].runnable = 1;
996     update_blocked_time(domid, now);
997     new_qos->domain_info[idx].blocked_start_time = 0; // invalidate
998     new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
999     //runnable_start_time[idx] = 0;
1000 
1001     new_qos->domain_info[idx].start_time = now;
1002     new_qos->qdata[new_qos->next_datapoint].switchin_count[idx]++;
1003     new_qos->qdata[new_qos->next_datapoint].ns_allocated[idx] += ns_alloc;
1004     new_qos->qdata[new_qos->next_datapoint].ns_waiting[idx] += ns_waited;
1005     qos_update_thread_stats(cpu, domid, now);
1006     set_current(cpu, domid);
1007 
1008     // count up page flips for dom0 execution
1009     if (domid == 0)
1010         dom0_flips = 0;
1011 }
1012 
1013 // called when the current thread is taken off the cpu
qos_switch_out(int cpu,int domid,uint64_t now,unsigned long gotten)1014 static void qos_switch_out(int cpu, int domid, uint64_t now, unsigned long gotten)
1015 {
1016     int idx = indexof(domid);
1017     int n;
1018 
1019     if (!is_current(domid, cpu)) {
1020         //    printf("switching out domain %d but it is not current. gotten=%ld\r\n", id, gotten);
1021     }
1022 
1023     if (gotten == 0) {
1024         printf("gotten==0 in qos_switchout(domid=%d)\n", domid);
1025     }
1026 
1027     if (gotten < 100) {
1028         printf("gotten<100ns in qos_switchout(domid=%d)\n", domid);
1029     }
1030 
1031 
1032     n = new_qos->next_datapoint;
1033 #if 0
1034     new_qos->qdata[n].ns_gotten[idx] += gotten;
1035     if (gotten > new_qos->qdata[n].ns_passed)
1036         printf("inconsistency #257, diff = %lld\n",
1037                gotten - new_qos->qdata[n].ns_passed );
1038 #endif
1039     new_qos->domain_info[idx].ns_oncpu_since_boot += gotten;
1040     new_qos->domain_info[idx].runnable_start_time = now;
1041     //  runnable_start_time[id] = now;
1042     qos_update_thread_stats(cpu, domid, now);
1043 
1044     // process dom0 page flips
1045     if (domid == 0)
1046         if (dom0_flips == 0)
1047             new_qos->qdata[n].flip_free_periods++;
1048 }
1049 
1050 // called when domain is put to sleep, may also be called
1051 // when thread is already asleep
qos_state_sleeping(int cpu,int domid,uint64_t now)1052 static void qos_state_sleeping(int cpu, int domid, uint64_t now)
1053 {
1054     int idx;
1055 
1056     if (!domain_runnable(domid))	// double call?
1057         return;
1058 
1059     idx = indexof(domid);
1060     new_qos->domain_info[idx].runnable = 0;
1061     new_qos->domain_info[idx].blocked_start_time = now;
1062     new_qos->domain_info[idx].runnable_start_time = 0; // invalidate
1063     //  runnable_start_time[idx] = 0; // invalidate
1064     qos_update_thread_stats(cpu, domid, now);
1065 }
1066 
1067 
1068 
1069 // domain died, presume it's dead on all cpu's, not just mostly dead
qos_kill_thread(int domid)1070 static void qos_kill_thread(int domid)
1071 {
1072     int cpu;
1073 
1074     for (cpu=0; cpu<NCPU; cpu++) {
1075         cpu_qos_data[cpu]->domain_info[indexof(domid)].in_use = 0;
1076     }
1077 
1078 }
1079 
1080 
1081 // called when thread becomes runnable, may also be called
1082 // when thread is already runnable
qos_state_runnable(int cpu,int domid,uint64_t now)1083 static void qos_state_runnable(int cpu, int domid, uint64_t now)
1084 {
1085     int idx;
1086 
1087 
1088     qos_update_thread_stats(cpu, domid, now);
1089 
1090     if (domain_runnable(domid))	// double call?
1091         return;
1092 
1093     idx = indexof(domid);
1094     new_qos->domain_info[idx].runnable = 1;
1095     update_blocked_time(domid, now);
1096 
1097     new_qos->domain_info[idx].blocked_start_time = 0; /* invalidate */
1098     new_qos->domain_info[idx].runnable_start_time = now;
1099     //  runnable_start_time[id] = now;
1100 }
1101 
1102 
// A page-grant transfer involving this domain: bump its io_count on
// every cpu that is tracking it, credit dom0 (slot 0) on the current
// cpu, and note the flip for the current dom0 run.
static void qos_count_packets(domid_t domid, uint64_t now)
{
    int cpu, slot = indexof(domid);

    for (cpu = 0; cpu < NCPU; cpu++) {
        _new_qos_data *qd = cpu_qos_data[cpu];

        if (qd->domain_info[slot].in_use)
            qd->qdata[qd->next_datapoint].io_count[slot]++;
    }

    new_qos->qdata[new_qos->next_datapoint].io_count[0]++;
    dom0_flips++;
}
1118 
1119 
process_record(int cpu,struct t_rec * r)1120 static int process_record(int cpu, struct t_rec *r)
1121 {
1122     uint64_t now = 0;
1123     uint32_t *extra_u32 = r->u.nocycles.extra_u32;
1124 
1125     new_qos = cpu_qos_data[cpu];
1126 
1127     rec_count++;
1128 
1129     if ( r->cycles_included )
1130     {
1131         now = ((uint64_t)r->u.cycles.cycles_hi << 32) | r->u.cycles.cycles_lo;
1132         now = ((double)now) / (opts.cpu_freq / 1000.0);
1133         extra_u32 = r->u.cycles.extra_u32;
1134     }
1135 
1136     global_now = now;
1137     global_cpu = cpu;
1138 
1139     log_event(r->event);
1140 
1141     switch (r->event) {
1142 
1143     case TRC_SCHED_SWITCH_INFPREV:
1144         // domain data[0] just switched out and received data[1] ns of cpu time
1145         qos_switch_out(cpu, extra_u32[0], now, extra_u32[1]);
1146         //    printf("ns_gotten %ld\n", extra_u32[1]);
1147         break;
1148 
1149     case TRC_SCHED_SWITCH_INFNEXT:
1150         // domain data[0] just switched in and
1151         // waited data[1] ns, and was allocated data[2] ns of cpu time
1152         qos_switch_in(cpu, extra_u32[0], now, extra_u32[2], extra_u32[1]);
1153         break;
1154 
1155     case TRC_SCHED_DOM_ADD:
1156         (void) indexof(extra_u32[0]);
1157         break;
1158 
1159     case TRC_SCHED_DOM_REM:
1160         qos_kill_thread(extra_u32[0]);
1161         break;
1162 
1163     case TRC_SCHED_SLEEP:
1164         qos_state_sleeping(cpu, extra_u32[0], now);
1165         break;
1166 
1167     case TRC_SCHED_WAKE:
1168         qos_state_runnable(cpu, extra_u32[0], now);
1169         break;
1170 
1171     case TRC_SCHED_BLOCK:
1172         qos_state_sleeping(cpu, extra_u32[0], now);
1173         break;
1174 
1175     case TRC_MEM_PAGE_GRANT_TRANSFER:
1176         qos_count_packets(extra_u32[0], now);
1177         break;
1178 
1179     default:
1180         break;
1181     }
1182 
1183     new_qos = NULL;
1184 
1185     return 4 + (r->cycles_included ? 8 : 0) + (r->extra_u32 * 4);
1186 }
1187 
1188 /*
1189  * Local variables:
1190  * mode: C
1191  * c-file-style: "BSD"
1192  * c-basic-offset: 4
1193  * tab-width: 4
1194  * indent-tabs-mode: nil
1195  * End:
1196  */
1197