1 /*
2 Simple prototype Xen Store Daemon providing simple tree-like database.
3 Copyright (C) 2005 Rusty Russell IBM Corporation
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <inttypes.h>
20 #include <sys/types.h>
21 #include <sys/stat.h>
22 #include <poll.h>
23 #ifndef NO_SOCKETS
24 #include <sys/socket.h>
25 #include <sys/un.h>
26 #endif
27 #include <sys/time.h>
28 #include <time.h>
29 #include <unistd.h>
30 #include <fcntl.h>
31 #include <stdbool.h>
32 #include <stdio.h>
33 #include <stdarg.h>
34 #include <stdlib.h>
35 #include <syslog.h>
36 #include <string.h>
37 #include <errno.h>
38 #include <dirent.h>
39 #include <getopt.h>
40 #include <signal.h>
41 #include <assert.h>
42 #include <setjmp.h>
43
44 #include <xenevtchn.h>
45
46 #include "utils.h"
47 #include "list.h"
48 #include "talloc.h"
49 #include "xenstore_lib.h"
50 #include "xenstored_core.h"
51 #include "xenstored_watch.h"
52 #include "xenstored_transaction.h"
53 #include "xenstored_domain.h"
54 #include "xenstored_control.h"
55 #include "tdb.h"
56
57 #ifndef NO_SOCKETS
58 #if defined(HAVE_SYSTEMD)
59 #define XEN_SYSTEMD_ENABLED 1
60 #endif
61 #endif
62
63 #if defined(XEN_SYSTEMD_ENABLED)
64 #include <systemd/sd-daemon.h>
65 #endif
66
67 extern xenevtchn_handle *xce_handle; /* in xenstored_domain.c */
68 static int xce_pollfd_idx = -1;
69 static struct pollfd *fds;
70 static unsigned int current_array_size;
71 static unsigned int nr_fds;
72
/* Round _x up to the next multiple of 2^_w; the result is an unsigned long. */
#define ROUNDUP(_x, _w) \
	(((unsigned long)(_x) + (1UL << (_w)) - 1) & ~((1UL << (_w)) - 1))
74
75 static bool verbose = false;
76 LIST_HEAD(connections);
77 int tracefd = -1;
78 static bool recovery = true;
79 static int reopen_log_pipe[2];
80 static int reopen_log_pipe0_pollfd_idx = -1;
81 char *tracefile = NULL;
82 TDB_CONTEXT *tdb_ctx = NULL;
83
84 static const char *sockmsg_string(enum xsd_sockmsg_type type);
85
/*
 * Format a printf-style message and send it to both the trace file and
 * syslog (LOG_ERR).  If the message cannot be allocated, a fixed fallback
 * text is logged instead.
 */
#define log(...)							\
	do {								\
		char *s = talloc_asprintf(NULL, __VA_ARGS__);		\
		if (!s) {						\
			trace("talloc failure during logging\n");	\
			syslog(LOG_ERR, "talloc failure during logging\n"); \
		} else {						\
			trace("%s\n", s);				\
			syslog(LOG_ERR, "%s", s);			\
			talloc_free(s);					\
		}							\
	} while (0)
98
99
/* Quota limits and their default values. */
int quota_nb_entry_per_domain = 1000;	/* compared in construct_node() */
int quota_nb_watch_per_domain = 128;
int quota_max_entry_size = 2048;	/* 2K; compared in write_node_raw() */
int quota_max_transaction = 10;
104
trace(const char * fmt,...)105 void trace(const char *fmt, ...)
106 {
107 va_list arglist;
108 char *str;
109 char sbuf[1024];
110 int ret, dummy;
111
112 if (tracefd < 0)
113 return;
114
115 /* try to use a static buffer */
116 va_start(arglist, fmt);
117 ret = vsnprintf(sbuf, 1024, fmt, arglist);
118 va_end(arglist);
119
120 if (ret <= 1024) {
121 dummy = write(tracefd, sbuf, ret);
122 return;
123 }
124
125 /* fail back to dynamic allocation */
126 va_start(arglist, fmt);
127 str = talloc_vasprintf(NULL, fmt, arglist);
128 va_end(arglist);
129 if (str) {
130 dummy = write(tracefd, str, strlen(str));
131 talloc_free(str);
132 }
133 }
134
trace_io(const struct connection * conn,const struct buffered_data * data,int out)135 static void trace_io(const struct connection *conn,
136 const struct buffered_data *data,
137 int out)
138 {
139 unsigned int i;
140 time_t now;
141 struct tm *tm;
142
143 #ifdef HAVE_DTRACE
144 dtrace_io(conn, data, out);
145 #endif
146
147 if (tracefd < 0)
148 return;
149
150 now = time(NULL);
151 tm = localtime(&now);
152
153 trace("%s %p %04d%02d%02d %02d:%02d:%02d %s (",
154 out ? "OUT" : "IN", conn,
155 tm->tm_year + 1900, tm->tm_mon + 1,
156 tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec,
157 sockmsg_string(data->hdr.msg.type));
158
159 for (i = 0; i < data->hdr.msg.len; i++)
160 trace("%c", (data->buffer[i] != '\0') ? data->buffer[i] : ' ');
161 trace(")\n");
162 }
163
/* Emit a "CREATE <type> <pointer>" line to the trace file. */
void trace_create(const void *data, const char *type)
{
	trace("CREATE %s %p\n", type, data);
}
168
/* Emit a "DESTROY <type> <pointer>" line to the trace file. */
void trace_destroy(const void *data, const char *type)
{
	trace("DESTROY %s %p\n", type, data);
}
173
174 /**
175 * Signal handler for SIGHUP, which requests that the trace log is reopened
176 * (in the main loop). A single byte is written to reopen_log_pipe, to awaken
177 * the poll() in the main loop.
178 */
trigger_reopen_log(int signal)179 static void trigger_reopen_log(int signal __attribute__((unused)))
180 {
181 char c = 'A';
182 int dummy;
183 dummy = write(reopen_log_pipe[1], &c, 1);
184 }
185
close_log(void)186 void close_log(void)
187 {
188 if (tracefd >= 0)
189 close(tracefd);
190 tracefd = -1;
191 }
192
reopen_log(void)193 void reopen_log(void)
194 {
195 if (tracefile) {
196 close_log();
197
198 tracefd = open(tracefile, O_WRONLY|O_CREAT|O_APPEND, 0600);
199
200 if (tracefd < 0)
201 perror("Could not open tracefile");
202 else
203 trace("\n***\n");
204 }
205 }
206
write_messages(struct connection * conn)207 static bool write_messages(struct connection *conn)
208 {
209 int ret;
210 struct buffered_data *out;
211
212 out = list_top(&conn->out_list, struct buffered_data, list);
213 if (out == NULL)
214 return true;
215
216 if (out->inhdr) {
217 if (verbose)
218 xprintf("Writing msg %s (%.*s) out to %p\n",
219 sockmsg_string(out->hdr.msg.type),
220 out->hdr.msg.len,
221 out->buffer, conn);
222 ret = conn->write(conn, out->hdr.raw + out->used,
223 sizeof(out->hdr) - out->used);
224 if (ret < 0)
225 return false;
226
227 out->used += ret;
228 if (out->used < sizeof(out->hdr))
229 return true;
230
231 out->inhdr = false;
232 out->used = 0;
233
234 /* Second write might block if non-zero. */
235 if (out->hdr.msg.len && !conn->domain)
236 return true;
237 }
238
239 ret = conn->write(conn, out->buffer + out->used,
240 out->hdr.msg.len - out->used);
241 if (ret < 0)
242 return false;
243
244 out->used += ret;
245 if (out->used != out->hdr.msg.len)
246 return true;
247
248 trace_io(conn, out, 1);
249
250 list_del(&out->list);
251 talloc_free(out);
252
253 return true;
254 }
255
destroy_conn(void * _conn)256 static int destroy_conn(void *_conn)
257 {
258 struct connection *conn = _conn;
259
260 /* Flush outgoing if possible, but don't block. */
261 if (!conn->domain) {
262 struct pollfd pfd;
263 pfd.fd = conn->fd;
264 pfd.events = POLLOUT;
265
266 while (!list_empty(&conn->out_list)
267 && poll(&pfd, 1, 0) == 1)
268 if (!write_messages(conn))
269 break;
270 close(conn->fd);
271 }
272 if (conn->target)
273 talloc_unlink(conn, conn->target);
274 list_del(&conn->list);
275 trace_destroy(conn, "connection");
276 return 0;
277 }
278
279 /* This function returns index inside the array if succeed, -1 if fail */
set_fd(int fd,short events)280 static int set_fd(int fd, short events)
281 {
282 int ret;
283 if (current_array_size < nr_fds + 1) {
284 struct pollfd *new_fds = NULL;
285 unsigned long newsize;
286
287 /* Round up to 2^8 boundary, in practice this just
288 * make newsize larger than current_array_size.
289 */
290 newsize = ROUNDUP(nr_fds + 1, 8);
291
292 new_fds = realloc(fds, sizeof(struct pollfd)*newsize);
293 if (!new_fds)
294 goto fail;
295 fds = new_fds;
296
297 memset(&fds[0] + current_array_size, 0,
298 sizeof(struct pollfd ) * (newsize-current_array_size));
299 current_array_size = newsize;
300 }
301
302 fds[nr_fds].fd = fd;
303 fds[nr_fds].events = events;
304 ret = nr_fds;
305 nr_fds++;
306
307 return ret;
308 fail:
309 syslog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
310 return -1;
311 }
312
initialize_fds(int sock,int * p_sock_pollfd_idx,int ro_sock,int * p_ro_sock_pollfd_idx,int * ptimeout)313 static void initialize_fds(int sock, int *p_sock_pollfd_idx,
314 int ro_sock, int *p_ro_sock_pollfd_idx,
315 int *ptimeout)
316 {
317 struct connection *conn;
318 struct wrl_timestampt now;
319
320 if (fds)
321 memset(fds, 0, sizeof(struct pollfd) * current_array_size);
322 nr_fds = 0;
323
324 *ptimeout = -1;
325
326 if (sock != -1)
327 *p_sock_pollfd_idx = set_fd(sock, POLLIN|POLLPRI);
328 if (ro_sock != -1)
329 *p_ro_sock_pollfd_idx = set_fd(ro_sock, POLLIN|POLLPRI);
330 if (reopen_log_pipe[0] != -1)
331 reopen_log_pipe0_pollfd_idx =
332 set_fd(reopen_log_pipe[0], POLLIN|POLLPRI);
333
334 if (xce_handle != NULL)
335 xce_pollfd_idx = set_fd(xenevtchn_fd(xce_handle),
336 POLLIN|POLLPRI);
337
338 wrl_gettime_now(&now);
339 wrl_log_periodic(now);
340
341 list_for_each_entry(conn, &connections, list) {
342 if (conn->domain) {
343 wrl_check_timeout(conn->domain, now, ptimeout);
344 if (domain_can_read(conn) ||
345 (domain_can_write(conn) &&
346 !list_empty(&conn->out_list)))
347 *ptimeout = 0;
348 } else {
349 short events = POLLIN|POLLPRI;
350 if (!list_empty(&conn->out_list))
351 events |= POLLOUT;
352 conn->pollfd_idx = set_fd(conn->fd, events);
353 }
354 }
355 }
356
357 /*
358 * If it fails, returns NULL and sets errno.
359 * Temporary memory allocations will be done with ctx.
360 */
read_node(struct connection * conn,const void * ctx,const char * name)361 static struct node *read_node(struct connection *conn, const void *ctx,
362 const char *name)
363 {
364 TDB_DATA key, data;
365 struct xs_tdb_record_hdr *hdr;
366 struct node *node;
367
368 node = talloc(ctx, struct node);
369 if (!node) {
370 errno = ENOMEM;
371 return NULL;
372 }
373 node->name = talloc_strdup(node, name);
374 if (!node->name) {
375 talloc_free(node);
376 errno = ENOMEM;
377 return NULL;
378 }
379
380 if (transaction_prepend(conn, name, &key))
381 return NULL;
382
383 data = tdb_fetch(tdb_ctx, key);
384
385 if (data.dptr == NULL) {
386 if (tdb_error(tdb_ctx) == TDB_ERR_NOEXIST) {
387 node->generation = NO_GENERATION;
388 access_node(conn, node, NODE_ACCESS_READ, NULL);
389 errno = ENOENT;
390 } else {
391 log("TDB error on read: %s", tdb_errorstr(tdb_ctx));
392 errno = EIO;
393 }
394 talloc_free(node);
395 return NULL;
396 }
397
398 node->parent = NULL;
399 talloc_steal(node, data.dptr);
400
401 /* Datalen, childlen, number of permissions */
402 hdr = (void *)data.dptr;
403 node->generation = hdr->generation;
404 node->num_perms = hdr->num_perms;
405 node->datalen = hdr->datalen;
406 node->childlen = hdr->childlen;
407
408 /* Permissions are struct xs_permissions. */
409 node->perms = hdr->perms;
410 /* Data is binary blob (usually ascii, no nul). */
411 node->data = node->perms + node->num_perms;
412 /* Children is strings, nul separated. */
413 node->children = node->data + node->datalen;
414
415 access_node(conn, node, NODE_ACCESS_READ, NULL);
416
417 return node;
418 }
419
/*
 * Serialise node into one database record (header, permissions, data and
 * child list, in that order) and store it under key, replacing any
 * existing record.
 *
 * Returns 0 on success, otherwise an errno value which is also left in
 * errno:
 *   ENOSPC - the connection is unprivileged and the record size reaches
 *            quota_max_entry_size;
 *   ENOMEM - the record buffer could not be allocated;
 *   EIO    - tdb_store() failed.
 *
 * The record buffer is allocated as a talloc child of node.
 */
int write_node_raw(struct connection *conn, TDB_DATA *key, struct node *node)
{
	TDB_DATA data;
	char *p;
	struct xs_tdb_record_hdr *hdr;
	size_t perms_size = node->num_perms * sizeof(node->perms[0]);

	data.dsize = sizeof(*hdr) + perms_size
		+ node->datalen + node->childlen;

	if (domain_is_unprivileged(conn) &&
	    data.dsize >= quota_max_entry_size) {
		errno = ENOSPC;
		return errno;
	}

	data.dptr = talloc_size(node, data.dsize);
	if (!data.dptr) {
		errno = ENOMEM;
		return errno;
	}

	hdr = (void *)data.dptr;
	hdr->generation = node->generation;
	hdr->num_perms = node->num_perms;
	hdr->datalen = node->datalen;
	hdr->childlen = node->childlen;

	/*
	 * Zero-length sections are skipped: their source pointer may be
	 * NULL (construct_node() creates nodes that way) and memcpy() must
	 * not be handed a null pointer.
	 */
	if (perms_size)
		memcpy(hdr->perms, node->perms, perms_size);
	p = (char *)(hdr->perms + node->num_perms);
	if (node->datalen)
		memcpy(p, node->data, node->datalen);
	p += node->datalen;
	if (node->childlen)
		memcpy(p, node->children, node->childlen);

	/* TDB should set errno, but doesn't even set ecode AFAICT. */
	if (tdb_store(tdb_ctx, *key, data, TDB_REPLACE) != 0) {
		corrupt(conn, "Write of %s failed", key->dptr);
		errno = EIO;
		return errno;
	}
	return 0;
}
457
write_node(struct connection * conn,struct node * node)458 static int write_node(struct connection *conn, struct node *node)
459 {
460 TDB_DATA key;
461
462 if (access_node(conn, node, NODE_ACCESS_WRITE, &key))
463 return errno;
464
465 return write_node_raw(conn, &key, node);
466 }
467
perm_for_conn(struct connection * conn,struct xs_permissions * perms,unsigned int num)468 static enum xs_perm_type perm_for_conn(struct connection *conn,
469 struct xs_permissions *perms,
470 unsigned int num)
471 {
472 unsigned int i;
473 enum xs_perm_type mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
474
475 if (!conn->can_write)
476 mask &= ~XS_PERM_WRITE;
477
478 /* Owners and tools get it all... */
479 if (!domain_is_unprivileged(conn) || perms[0].id == conn->id
480 || (conn->target && perms[0].id == conn->target->id))
481 return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
482
483 for (i = 1; i < num; i++)
484 if (perms[i].id == conn->id
485 || (conn->target && perms[i].id == conn->target->id))
486 return perms[i].perms & mask;
487
488 return perms[0].perms & mask;
489 }
490
/*
 * Get name of node parent: everything before the last '/', or "/" when
 * the only slash is the leading one.
 * The result is allocated with ctx; on allocation failure NULL is returned
 * and errno is set to ENOMEM.
 */
static char *get_parent(const void *ctx, const char *node)
{
	/* Start after the first character so a leading '/' is not found. */
	const char *last = strrchr(node + 1, '/');
	char *parent;

	if (last)
		parent = talloc_asprintf(ctx, "%.*s", (int)(last - node), node);
	else
		parent = talloc_strdup(ctx, "/");

	if (!parent)
		errno = ENOMEM;

	return parent;
}
507
508 /*
509 * What do parents say?
510 * Temporary memory allocations are done with ctx.
511 */
ask_parents(struct connection * conn,const void * ctx,const char * name,enum xs_perm_type * perm)512 static int ask_parents(struct connection *conn, const void *ctx,
513 const char *name, enum xs_perm_type *perm)
514 {
515 struct node *node;
516
517 do {
518 name = get_parent(ctx, name);
519 if (!name)
520 return errno;
521 node = read_node(conn, ctx, name);
522 if (node)
523 break;
524 if (errno == ENOMEM)
525 return errno;
526 } while (!streq(name, "/"));
527
528 /* No permission at root? We're in trouble. */
529 if (!node) {
530 corrupt(conn, "No permissions file at root");
531 *perm = XS_PERM_NONE;
532 return 0;
533 }
534
535 *perm = perm_for_conn(conn, node->perms, node->num_perms);
536 return 0;
537 }
538
539 /*
540 * We have a weird permissions system. You can allow someone into a
541 * specific node without allowing it in the parents. If it's going to
542 * fail, however, we don't want the errno to indicate any information
543 * about the node.
544 * Temporary memory allocations are done with ctx.
545 */
errno_from_parents(struct connection * conn,const void * ctx,const char * node,int errnum,enum xs_perm_type perm)546 static int errno_from_parents(struct connection *conn, const void *ctx,
547 const char *node, int errnum,
548 enum xs_perm_type perm)
549 {
550 enum xs_perm_type parent_perm = XS_PERM_NONE;
551
552 /* We always tell them about memory failures. */
553 if (errnum == ENOMEM)
554 return errnum;
555
556 if (ask_parents(conn, ctx, node, &parent_perm))
557 return errno;
558 if (parent_perm & perm)
559 return errnum;
560 return EACCES;
561 }
562
563 /*
564 * If it fails, returns NULL and sets errno.
565 * Temporary memory allocations are done with ctx.
566 */
get_node(struct connection * conn,const void * ctx,const char * name,enum xs_perm_type perm)567 struct node *get_node(struct connection *conn,
568 const void *ctx,
569 const char *name,
570 enum xs_perm_type perm)
571 {
572 struct node *node;
573
574 if (!name || !is_valid_nodename(name)) {
575 errno = EINVAL;
576 return NULL;
577 }
578 node = read_node(conn, ctx, name);
579 /* If we don't have permission, we don't have node. */
580 if (node) {
581 if ((perm_for_conn(conn, node->perms, node->num_perms) & perm)
582 != perm) {
583 errno = EACCES;
584 node = NULL;
585 }
586 }
587 /* Clean up errno if they weren't supposed to know. */
588 if (!node && errno != ENOMEM)
589 errno = errno_from_parents(conn, ctx, name, errno, perm);
590 return node;
591 }
592
new_buffer(void * ctx)593 static struct buffered_data *new_buffer(void *ctx)
594 {
595 struct buffered_data *data;
596
597 data = talloc_zero(ctx, struct buffered_data);
598 if (data == NULL)
599 return NULL;
600
601 data->inhdr = true;
602 return data;
603 }
604
605 /* Return length of string (including nul) at this offset.
606 * If there is no nul, returns 0 for failure.
607 */
get_string(const struct buffered_data * data,unsigned int offset)608 static unsigned int get_string(const struct buffered_data *data,
609 unsigned int offset)
610 {
611 const char *nul;
612
613 if (offset >= data->used)
614 return 0;
615
616 nul = memchr(data->buffer + offset, 0, data->used - offset);
617 if (!nul)
618 return 0;
619
620 return nul - (data->buffer + offset) + 1;
621 }
622
623 /* Break input into vectors, return the number, fill in up to num of them.
624 * Always returns the actual number of nuls in the input. Stores the
625 * positions of the starts of the nul-terminated strings in vec.
626 * Callers who use this and then rely only on vec[] will
627 * ignore any data after the final nul.
628 */
get_strings(struct buffered_data * data,char * vec[],unsigned int num)629 unsigned int get_strings(struct buffered_data *data,
630 char *vec[], unsigned int num)
631 {
632 unsigned int off, i, len;
633
634 off = i = 0;
635 while ((len = get_string(data, off)) != 0) {
636 if (i < num)
637 vec[i] = data->buffer + off;
638 i++;
639 off += len;
640 }
641 return i;
642 }
643
send_error(struct connection * conn,int error)644 static void send_error(struct connection *conn, int error)
645 {
646 unsigned int i;
647
648 for (i = 0; error != xsd_errors[i].errnum; i++) {
649 if (i == ARRAY_SIZE(xsd_errors) - 1) {
650 eprintf("xenstored: error %i untranslatable", error);
651 i = 0; /* EINVAL */
652 break;
653 }
654 }
655 send_reply(conn, XS_ERROR, xsd_errors[i].errstring,
656 strlen(xsd_errors[i].errstring) + 1);
657 }
658
send_reply(struct connection * conn,enum xsd_sockmsg_type type,const void * data,unsigned int len)659 void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
660 const void *data, unsigned int len)
661 {
662 struct buffered_data *bdata;
663
664 if ( len > XENSTORE_PAYLOAD_MAX ) {
665 send_error(conn, E2BIG);
666 return;
667 }
668
669 /* Replies reuse the request buffer, events need a new one. */
670 if (type != XS_WATCH_EVENT) {
671 bdata = conn->in;
672 bdata->inhdr = true;
673 bdata->used = 0;
674 conn->in = NULL;
675 } else {
676 /* Message is a child of the connection for auto-cleanup. */
677 bdata = new_buffer(conn);
678
679 /*
680 * Allocation failure here is unfortunate: we have no way to
681 * tell anybody about it.
682 */
683 if (!bdata)
684 return;
685 }
686 if (len <= DEFAULT_BUFFER_SIZE)
687 bdata->buffer = bdata->default_buffer;
688 else
689 bdata->buffer = talloc_array(bdata, char, len);
690 if (!bdata->buffer) {
691 if (type == XS_WATCH_EVENT) {
692 /* Same as above: no way to tell someone. */
693 talloc_free(bdata);
694 return;
695 }
696 /* re-establish request buffer for sending ENOMEM. */
697 conn->in = bdata;
698 send_error(conn, ENOMEM);
699 return;
700 }
701
702 /* Update relevant header fields and fill in the message body. */
703 bdata->hdr.msg.type = type;
704 bdata->hdr.msg.len = len;
705 memcpy(bdata->buffer, data, len);
706
707 /* Queue for later transmission. */
708 list_add_tail(&bdata->list, &conn->out_list);
709
710 return;
711 }
712
713 /* Some routines (write, mkdir, etc) just need a non-error return */
send_ack(struct connection * conn,enum xsd_sockmsg_type type)714 void send_ack(struct connection *conn, enum xsd_sockmsg_type type)
715 {
716 send_reply(conn, type, "OK", sizeof("OK"));
717 }
718
/*
 * Return true when node consists solely of ASCII letters, digits and the
 * characters '-', '/', '_' and '@'.  The empty string passes.
 */
static bool valid_chars(const char *node)
{
	static const char allowed[] =
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
		"abcdefghijklmnopqrstuvwxyz"
		"0123456789-/_@";

	/* The allowed prefix must run all the way to the terminator. */
	return node[strspn(node, allowed)] == '\0';
}
727
is_valid_nodename(const char * node)728 bool is_valid_nodename(const char *node)
729 {
730 /* Must start in /. */
731 if (!strstarts(node, "/"))
732 return false;
733
734 /* Cannot end in / (unless it's just "/"). */
735 if (strends(node, "/") && !streq(node, "/"))
736 return false;
737
738 /* No double //. */
739 if (strstr(node, "//"))
740 return false;
741
742 if (strlen(node) > XENSTORE_ABS_PATH_MAX)
743 return false;
744
745 return valid_chars(node);
746 }
747
748 /* We expect one arg in the input: return NULL otherwise.
749 * The payload must contain exactly one nul, at the end.
750 */
onearg(struct buffered_data * in)751 const char *onearg(struct buffered_data *in)
752 {
753 if (!in->used || get_string(in, 0) != in->used)
754 return NULL;
755 return in->buffer;
756 }
757
perms_to_strings(const void * ctx,struct xs_permissions * perms,unsigned int num,unsigned int * len)758 static char *perms_to_strings(const void *ctx,
759 struct xs_permissions *perms, unsigned int num,
760 unsigned int *len)
761 {
762 unsigned int i;
763 char *strings = NULL;
764 char buffer[MAX_STRLEN(unsigned int) + 1];
765
766 for (*len = 0, i = 0; i < num; i++) {
767 if (!xs_perm_to_string(&perms[i], buffer, sizeof(buffer)))
768 return NULL;
769
770 strings = talloc_realloc(ctx, strings, char,
771 *len + strlen(buffer) + 1);
772 if (!strings)
773 return NULL;
774 strcpy(strings + *len, buffer);
775 *len += strlen(buffer) + 1;
776 }
777 return strings;
778 }
779
/*
 * Turn a possibly relative node name into an absolute one.
 *
 * NULL, names starting with '/' and names starting with '@' are returned
 * unchanged.  Any other name is prefixed with the connection's implicit
 * path, if it has one, in a new string allocated with ctx (NULL on
 * allocation failure); without an implicit path it is returned unchanged.
 */
char *canonicalize(struct connection *conn, const void *ctx, const char *node)
{
	const char *prefix;

	if (node == NULL || node[0] == '/' || node[0] == '@')
		return (char *)node;

	prefix = get_implicit_path(conn);
	if (!prefix)
		return (char *)node;

	return talloc_asprintf(ctx, "%s/%s", prefix, node);
}
791
/*
 * Canonicalize name and look the node up with get_node().
 * When canonical_name is not NULL, the canonical name is stored there
 * whether or not the lookup succeeds.
 */
static struct node *get_node_canonicalized(struct connection *conn,
					   const void *ctx,
					   const char *name,
					   char **canonical_name,
					   enum xs_perm_type perm)
{
	char *scratch;
	char **out = canonical_name ? canonical_name : &scratch;

	*out = canonicalize(conn, ctx, name);
	return get_node(conn, ctx, *out, perm);
}
805
send_directory(struct connection * conn,struct buffered_data * in)806 static int send_directory(struct connection *conn, struct buffered_data *in)
807 {
808 struct node *node;
809
810 node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ);
811 if (!node)
812 return errno;
813
814 send_reply(conn, XS_DIRECTORY, node->children, node->childlen);
815
816 return 0;
817 }
818
send_directory_part(struct connection * conn,struct buffered_data * in)819 static int send_directory_part(struct connection *conn,
820 struct buffered_data *in)
821 {
822 unsigned int off, len, maxlen, genlen;
823 char *child, *data;
824 struct node *node;
825 char gen[24];
826
827 if (xs_count_strings(in->buffer, in->used) != 2)
828 return EINVAL;
829
830 /* First arg is node name. */
831 node = get_node_canonicalized(conn, in, in->buffer, NULL, XS_PERM_READ);
832 if (!node)
833 return errno;
834
835 /* Second arg is childlist offset. */
836 off = atoi(in->buffer + strlen(in->buffer) + 1);
837
838 genlen = snprintf(gen, sizeof(gen), "%"PRIu64, node->generation) + 1;
839
840 /* Offset behind list: just return a list with an empty string. */
841 if (off >= node->childlen) {
842 gen[genlen] = 0;
843 send_reply(conn, XS_DIRECTORY_PART, gen, genlen + 1);
844 return 0;
845 }
846
847 len = 0;
848 maxlen = XENSTORE_PAYLOAD_MAX - genlen - 1;
849 child = node->children + off;
850
851 while (len + strlen(child) < maxlen) {
852 len += strlen(child) + 1;
853 child += strlen(child) + 1;
854 if (off + len == node->childlen)
855 break;
856 }
857
858 data = talloc_array(in, char, genlen + len + 1);
859 if (!data)
860 return ENOMEM;
861
862 memcpy(data, gen, genlen);
863 memcpy(data + genlen, node->children + off, len);
864 if (off + len == node->childlen) {
865 data[genlen + len] = 0;
866 len++;
867 }
868
869 send_reply(conn, XS_DIRECTORY_PART, data, genlen + len);
870
871 return 0;
872 }
873
do_read(struct connection * conn,struct buffered_data * in)874 static int do_read(struct connection *conn, struct buffered_data *in)
875 {
876 struct node *node;
877
878 node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ);
879 if (!node)
880 return errno;
881
882 send_reply(conn, XS_READ, node->data, node->datalen);
883
884 return 0;
885 }
886
delete_node_single(struct connection * conn,struct node * node)887 static void delete_node_single(struct connection *conn, struct node *node)
888 {
889 TDB_DATA key;
890
891 if (access_node(conn, node, NODE_ACCESS_DELETE, &key))
892 return;
893
894 if (tdb_delete(tdb_ctx, key) != 0) {
895 corrupt(conn, "Could not delete '%s'", node->name);
896 return;
897 }
898
899 domain_entry_dec(conn, node);
900 }
901
/*
 * Return the part of name after its last '/'.
 * name must contain a '/'.  Must not be "/".
 */
static char *basename(const char *name)
{
	char *last_slash = strrchr(name, '/');

	return last_slash + 1;
}
907
construct_node(struct connection * conn,const void * ctx,const char * name)908 static struct node *construct_node(struct connection *conn, const void *ctx,
909 const char *name)
910 {
911 const char *base;
912 unsigned int baselen;
913 struct node *parent, *node;
914 char *children, *parentname = get_parent(ctx, name);
915
916 if (!parentname)
917 return NULL;
918
919 /* If parent doesn't exist, create it. */
920 parent = read_node(conn, parentname, parentname);
921 if (!parent)
922 parent = construct_node(conn, ctx, parentname);
923 if (!parent)
924 return NULL;
925
926 if (domain_entry(conn) >= quota_nb_entry_per_domain) {
927 errno = ENOSPC;
928 return NULL;
929 }
930
931 /* Add child to parent. */
932 base = basename(name);
933 baselen = strlen(base) + 1;
934 children = talloc_array(ctx, char, parent->childlen + baselen);
935 if (!children)
936 goto nomem;
937 memcpy(children, parent->children, parent->childlen);
938 memcpy(children + parent->childlen, base, baselen);
939 parent->children = children;
940 parent->childlen += baselen;
941
942 /* Allocate node */
943 node = talloc(ctx, struct node);
944 if (!node)
945 goto nomem;
946 node->name = talloc_strdup(node, name);
947 if (!node->name)
948 goto nomem;
949
950 /* Inherit permissions, except unprivileged domains own what they create */
951 node->num_perms = parent->num_perms;
952 node->perms = talloc_memdup(node, parent->perms,
953 node->num_perms * sizeof(node->perms[0]));
954 if (!node->perms)
955 goto nomem;
956 if (domain_is_unprivileged(conn))
957 node->perms[0].id = conn->id;
958
959 /* No children, no data */
960 node->children = node->data = NULL;
961 node->childlen = node->datalen = 0;
962 node->parent = parent;
963 domain_entry_inc(conn, node);
964 return node;
965
966 nomem:
967 errno = ENOMEM;
968 return NULL;
969 }
970
destroy_node(void * _node)971 static int destroy_node(void *_node)
972 {
973 struct node *node = _node;
974 TDB_DATA key;
975
976 if (streq(node->name, "/"))
977 corrupt(NULL, "Destroying root node!");
978
979 key.dptr = (void *)node->name;
980 key.dsize = strlen(node->name);
981
982 tdb_delete(tdb_ctx, key);
983 return 0;
984 }
985
create_node(struct connection * conn,const void * ctx,const char * name,void * data,unsigned int datalen)986 static struct node *create_node(struct connection *conn, const void *ctx,
987 const char *name,
988 void *data, unsigned int datalen)
989 {
990 struct node *node, *i;
991
992 node = construct_node(conn, ctx, name);
993 if (!node)
994 return NULL;
995
996 node->data = data;
997 node->datalen = datalen;
998
999 /* We write out the nodes down, setting destructor in case
1000 * something goes wrong. */
1001 for (i = node; i; i = i->parent) {
1002 if (write_node(conn, i)) {
1003 domain_entry_dec(conn, i);
1004 return NULL;
1005 }
1006 talloc_set_destructor(i, destroy_node);
1007 }
1008
1009 /* OK, now remove destructors so they stay around */
1010 for (i = node; i; i = i->parent)
1011 talloc_set_destructor(i, NULL);
1012 return node;
1013 }
1014
1015 /* path, data... */
do_write(struct connection * conn,struct buffered_data * in)1016 static int do_write(struct connection *conn, struct buffered_data *in)
1017 {
1018 unsigned int offset, datalen;
1019 struct node *node;
1020 char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
1021 char *name;
1022
1023 /* Extra "strings" can be created by binary data. */
1024 if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
1025 return EINVAL;
1026
1027 offset = strlen(vec[0]) + 1;
1028 datalen = in->used - offset;
1029
1030 node = get_node_canonicalized(conn, in, vec[0], &name, XS_PERM_WRITE);
1031 if (!node) {
1032 /* No permissions, invalid input? */
1033 if (errno != ENOENT)
1034 return errno;
1035 node = create_node(conn, in, name, in->buffer + offset,
1036 datalen);
1037 if (!node)
1038 return errno;
1039 } else {
1040 node->data = in->buffer + offset;
1041 node->datalen = datalen;
1042 if (write_node(conn, node))
1043 return errno;
1044 }
1045
1046 fire_watches(conn, in, name, false);
1047 send_ack(conn, XS_WRITE);
1048
1049 return 0;
1050 }
1051
do_mkdir(struct connection * conn,struct buffered_data * in)1052 static int do_mkdir(struct connection *conn, struct buffered_data *in)
1053 {
1054 struct node *node;
1055 char *name;
1056
1057 node = get_node_canonicalized(conn, in, onearg(in), &name,
1058 XS_PERM_WRITE);
1059
1060 /* If it already exists, fine. */
1061 if (!node) {
1062 /* No permissions? */
1063 if (errno != ENOENT)
1064 return errno;
1065 node = create_node(conn, in, name, NULL, 0);
1066 if (!node)
1067 return errno;
1068 fire_watches(conn, in, name, false);
1069 }
1070 send_ack(conn, XS_MKDIR);
1071
1072 return 0;
1073 }
1074
delete_node(struct connection * conn,struct node * node)1075 static void delete_node(struct connection *conn, struct node *node)
1076 {
1077 unsigned int i;
1078 char *name;
1079
1080 /* Delete self, then delete children. If we crash, then the worst
1081 that can happen is the children will continue to take up space, but
1082 will otherwise be unreachable. */
1083 delete_node_single(conn, node);
1084
1085 /* Delete children, too. */
1086 for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
1087 struct node *child;
1088
1089 name = talloc_asprintf(node, "%s/%s", node->name,
1090 node->children + i);
1091 child = name ? read_node(conn, node, name) : NULL;
1092 if (child) {
1093 delete_node(conn, child);
1094 }
1095 else {
1096 trace("delete_node: Error deleting child '%s/%s'!\n",
1097 node->name, node->children + i);
1098 /* Skip it, we've already deleted the parent. */
1099 }
1100 talloc_free(name);
1101 }
1102 }
1103
1104
/*
 * Delete len bytes at offset off from a buffer holding total bytes by
 * moving the tail (everything after off + len) down with memmove.
 * The caller must ensure off + len <= total.
 *
 * The pointer is converted to a byte pointer first, because arithmetic on
 * void * is a compiler extension rather than standard C.
 */
static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
{
	unsigned char *bytes = mem;

	memmove(bytes + off, bytes + off + len, total - off - len);
}
1110
1111
remove_child_entry(struct connection * conn,struct node * node,size_t offset)1112 static int remove_child_entry(struct connection *conn, struct node *node,
1113 size_t offset)
1114 {
1115 size_t childlen = strlen(node->children + offset);
1116 memdel(node->children, offset, childlen + 1, node->childlen);
1117 node->childlen -= childlen + 1;
1118 return write_node(conn, node);
1119 }
1120
1121
delete_child(struct connection * conn,struct node * node,const char * childname)1122 static int delete_child(struct connection *conn,
1123 struct node *node, const char *childname)
1124 {
1125 unsigned int i;
1126
1127 for (i = 0; i < node->childlen; i += strlen(node->children+i) + 1) {
1128 if (streq(node->children+i, childname)) {
1129 return remove_child_entry(conn, node, i);
1130 }
1131 }
1132 corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
1133 return ENOENT;
1134 }
1135
1136
/*
 * Remove "node" (whose path is "name") from the store: unlink it from its
 * parent's child list, then delete the node and its subtree.
 * Returns 0 on success, otherwise an errno value.
 */
static int _rm(struct connection *conn, const void *ctx, struct node *node,
	       const char *name)
{
	struct node *parent;
	char *parentname;

	/* Delete from parent first, then if we crash, the worst that can
	   happen is the child will continue to take up space, but will
	   otherwise be unreachable. */
	parentname = get_parent(ctx, name);
	if (!parentname)
		return errno;

	parent = read_node(conn, ctx, parentname);
	if (!parent) {
		/* Only an out-of-memory failure is reported as such. */
		if (errno == ENOMEM)
			return ENOMEM;
		return EINVAL;
	}

	if (delete_child(conn, parent, basename(name)) != 0)
		return EINVAL;

	delete_node(conn, node);
	return 0;
}
1159
1160
do_rm(struct connection * conn,struct buffered_data * in)1161 static int do_rm(struct connection *conn, struct buffered_data *in)
1162 {
1163 struct node *node;
1164 int ret;
1165 char *name;
1166 char *parentname;
1167
1168 node = get_node_canonicalized(conn, in, onearg(in), &name,
1169 XS_PERM_WRITE);
1170 if (!node) {
1171 /* Didn't exist already? Fine, if parent exists. */
1172 if (errno == ENOENT) {
1173 parentname = get_parent(in, name);
1174 if (!parentname)
1175 return errno;
1176 node = read_node(conn, in, parentname);
1177 if (node) {
1178 send_ack(conn, XS_RM);
1179 return 0;
1180 }
1181 /* Restore errno, just in case. */
1182 if (errno != ENOMEM)
1183 errno = ENOENT;
1184 }
1185 return errno;
1186 }
1187
1188 if (streq(name, "/"))
1189 return EINVAL;
1190
1191 ret = _rm(conn, in, node, name);
1192 if (ret)
1193 return ret;
1194
1195 fire_watches(conn, in, name, true);
1196 send_ack(conn, XS_RM);
1197
1198 return 0;
1199 }
1200
1201
do_get_perms(struct connection * conn,struct buffered_data * in)1202 static int do_get_perms(struct connection *conn, struct buffered_data *in)
1203 {
1204 struct node *node;
1205 char *strings;
1206 unsigned int len;
1207
1208 node = get_node_canonicalized(conn, in, onearg(in), NULL, XS_PERM_READ);
1209 if (!node)
1210 return errno;
1211
1212 strings = perms_to_strings(node, node->perms, node->num_perms, &len);
1213 if (!strings)
1214 return errno;
1215
1216 send_reply(conn, XS_GET_PERMS, strings, len);
1217
1218 return 0;
1219 }
1220
do_set_perms(struct connection * conn,struct buffered_data * in)1221 static int do_set_perms(struct connection *conn, struct buffered_data *in)
1222 {
1223 unsigned int num;
1224 struct xs_permissions *perms;
1225 char *name, *permstr;
1226 struct node *node;
1227
1228 num = xs_count_strings(in->buffer, in->used);
1229 if (num < 2)
1230 return EINVAL;
1231
1232 /* First arg is node name. */
1233 /* We must own node to do this (tools can do this too). */
1234 node = get_node_canonicalized(conn, in, in->buffer, &name,
1235 XS_PERM_WRITE | XS_PERM_OWNER);
1236 if (!node)
1237 return errno;
1238
1239 permstr = in->buffer + strlen(in->buffer) + 1;
1240 num--;
1241
1242 perms = talloc_array(node, struct xs_permissions, num);
1243 if (!perms)
1244 return ENOMEM;
1245 if (!xs_strings_to_perms(perms, num, permstr))
1246 return errno;
1247
1248 /* Unprivileged domains may not change the owner. */
1249 if (domain_is_unprivileged(conn) && perms[0].id != node->perms[0].id)
1250 return EPERM;
1251
1252 domain_entry_dec(conn, node);
1253 node->perms = perms;
1254 node->num_perms = num;
1255 domain_entry_inc(conn, node);
1256
1257 if (write_node(conn, node))
1258 return errno;
1259
1260 fire_watches(conn, in, name, false);
1261 send_ack(conn, XS_SET_PERMS);
1262
1263 return 0;
1264 }
1265
/*
 * Dispatch table indexed by wire message type (XS_*).
 *   str  - printable name of the message type, used by sockmsg_string().
 *   func - handler called by process_message(); entries with a NULL handler
 *          (and types with no entry at all) are answered with ENOSYS there.
 */
static struct {
	const char *str;
	int (*func)(struct connection *conn, struct buffered_data *in);
} const wire_funcs[XS_TYPE_COUNT] = {
	[XS_CONTROL]           = { "CONTROL",           do_control },
	[XS_DIRECTORY]         = { "DIRECTORY",         send_directory },
	[XS_READ]              = { "READ",              do_read },
	[XS_GET_PERMS]         = { "GET_PERMS",         do_get_perms },
	[XS_WATCH]             = { "WATCH",             do_watch },
	[XS_UNWATCH]           = { "UNWATCH",           do_unwatch },
	[XS_TRANSACTION_START] = { "TRANSACTION_START", do_transaction_start },
	[XS_TRANSACTION_END]   = { "TRANSACTION_END",   do_transaction_end },
	[XS_INTRODUCE]         = { "INTRODUCE",         do_introduce },
	[XS_RELEASE]           = { "RELEASE",           do_release },
	[XS_GET_DOMAIN_PATH]   = { "GET_DOMAIN_PATH",   do_get_domain_path },
	[XS_WRITE]             = { "WRITE",             do_write },
	[XS_MKDIR]             = { "MKDIR",             do_mkdir },
	[XS_RM]                = { "RM",                do_rm },
	[XS_SET_PERMS]         = { "SET_PERMS",         do_set_perms },
	/* Named for logging only: no handler for these two types. */
	[XS_WATCH_EVENT]       = { "WATCH_EVENT",       NULL },
	[XS_ERROR]             = { "ERROR",             NULL },
	[XS_IS_DOMAIN_INTRODUCED] =
			{ "IS_DOMAIN_INTRODUCED", do_is_domain_introduced },
	[XS_RESUME]            = { "RESUME",            do_resume },
	[XS_SET_TARGET]        = { "SET_TARGET",        do_set_target },
	[XS_RESET_WATCHES]     = { "RESET_WATCHES",     do_reset_watches },
	[XS_DIRECTORY_PART]    = { "DIRECTORY_PART",    send_directory_part },
};
1294
sockmsg_string(enum xsd_sockmsg_type type)1295 static const char *sockmsg_string(enum xsd_sockmsg_type type)
1296 {
1297 if ((unsigned)type < XS_TYPE_COUNT && wire_funcs[type].str)
1298 return wire_funcs[type].str;
1299
1300 return "**UNKNOWN**";
1301 }
1302
1303 /* Process "in" for conn: "in" will vanish after this conversation, so
1304 * we can talloc off it for temporary variables. May free "conn".
1305 */
process_message(struct connection * conn,struct buffered_data * in)1306 static void process_message(struct connection *conn, struct buffered_data *in)
1307 {
1308 struct transaction *trans;
1309 enum xsd_sockmsg_type type = in->hdr.msg.type;
1310 int ret;
1311
1312 trans = transaction_lookup(conn, in->hdr.msg.tx_id);
1313 if (IS_ERR(trans)) {
1314 send_error(conn, -PTR_ERR(trans));
1315 return;
1316 }
1317
1318 assert(conn->transaction == NULL);
1319 conn->transaction = trans;
1320
1321 if ((unsigned)type < XS_TYPE_COUNT && wire_funcs[type].func)
1322 ret = wire_funcs[type].func(conn, in);
1323 else {
1324 eprintf("Client unknown operation %i", type);
1325 ret = ENOSYS;
1326 }
1327 if (ret)
1328 send_error(conn, ret);
1329
1330 conn->transaction = NULL;
1331 }
1332
consider_message(struct connection * conn)1333 static void consider_message(struct connection *conn)
1334 {
1335 if (verbose)
1336 xprintf("Got message %s len %i from %p\n",
1337 sockmsg_string(conn->in->hdr.msg.type),
1338 conn->in->hdr.msg.len, conn);
1339
1340 process_message(conn, conn->in);
1341
1342 assert(conn->in == NULL);
1343 }
1344
/*
 * Pull the next piece of an incoming message off the connection.
 *
 * A message arrives in two stages tracked by in->inhdr: first the
 * fixed-size header, then hdr.msg.len bytes of payload.  A call issues at
 * most one conn->read() for the header and one for the payload and returns
 * as soon as a stage is still incomplete, so assembling a message may take
 * several calls.  Once the payload is complete the message is traced and
 * handed to consider_message().
 *
 * A negative return from conn->read(), or a header announcing more than
 * XENSTORE_PAYLOAD_MAX bytes, means we are out of sync with the client, so
 * the whole connection is freed.  Allocation failures just return so the
 * step is retried on a later call.
 */
static void handle_input(struct connection *conn)
{
	int bytes;
	struct buffered_data *in;

	/* Start a new input buffer if no message is in progress. */
	if (!conn->in) {
		conn->in = new_buffer(conn);
		/* In case of no memory just try it again next time. */
		if (!conn->in)
			return;
	}
	in = conn->in;

	/* Not finished header yet? */
	if (in->inhdr) {
		/*
		 * The header may already be complete here if a previous call
		 * read it but then failed to allocate the payload buffer.
		 */
		if (in->used != sizeof(in->hdr)) {
			bytes = conn->read(conn, in->hdr.raw + in->used,
					   sizeof(in->hdr) - in->used);
			if (bytes < 0)
				goto bad_client;
			in->used += bytes;
			if (in->used != sizeof(in->hdr))
				return;

			/* Refuse oversized payloads before allocating. */
			if (in->hdr.msg.len > XENSTORE_PAYLOAD_MAX) {
				syslog(LOG_ERR, "Client tried to feed us %i",
				       in->hdr.msg.len);
				goto bad_client;
			}
		}

		/* Small payloads use the buffer embedded in "in". */
		if (in->hdr.msg.len <= DEFAULT_BUFFER_SIZE)
			in->buffer = in->default_buffer;
		else
			in->buffer = talloc_array(in, char, in->hdr.msg.len);
		/* In case of no memory just try it again next time. */
		if (!in->buffer)
			return;
		/* Switch to the payload stage; "used" now counts payload. */
		in->used = 0;
		in->inhdr = false;
	}

	bytes = conn->read(conn, in->buffer + in->used,
			   in->hdr.msg.len - in->used);
	if (bytes < 0)
		goto bad_client;

	in->used += bytes;
	if (in->used != in->hdr.msg.len)
		return;

	/* Whole message received. */
	trace_io(conn, in, 0);
	consider_message(conn);
	return;

bad_client:
	/* Kill it. */
	talloc_free(conn);
}
1406
/*
 * Flush queued output for the connection; the connection is freed when
 * write_messages() reports failure.
 */
static void handle_output(struct connection *conn)
{
	if (write_messages(conn))
		return;

	talloc_free(conn);
}
1412
/*
 * Allocate a zeroed connection that uses the given write/read callbacks,
 * append it to the global connection list and register destroy_conn as its
 * talloc destructor.  Returns NULL if the allocation fails.
 */
struct connection *new_connection(connwritefn_t *write, connreadfn_t *read)
{
	struct connection *conn =
		talloc_zero(talloc_autofree_context(), struct connection);

	if (!conn)
		return NULL;

	/* No file descriptor or poll slot assigned yet. */
	conn->fd = -1;
	conn->pollfd_idx = -1;

	conn->write = write;
	conn->read = read;
	conn->can_write = true;
	conn->transaction_started = 0;

	INIT_LIST_HEAD(&conn->out_list);
	INIT_LIST_HEAD(&conn->watches);
	INIT_LIST_HEAD(&conn->transaction_list);

	list_add_tail(&conn->list, &connections);
	talloc_set_destructor(conn, destroy_conn);
	trace_create(conn, "connection");

	return conn;
}
1436
1437 #ifdef NO_SOCKETS
/* NO_SOCKETS build: there is no socket to accept from, so this is a no-op. */
static void accept_connection(int sock, bool canwrite)
{
}
1441 #else
writefd(struct connection * conn,const void * data,unsigned int len)1442 static int writefd(struct connection *conn, const void *data, unsigned int len)
1443 {
1444 int rc;
1445
1446 while ((rc = write(conn->fd, data, len)) < 0) {
1447 if (errno == EAGAIN) {
1448 rc = 0;
1449 break;
1450 }
1451 if (errno != EINTR)
1452 break;
1453 }
1454
1455 return rc;
1456 }
1457
readfd(struct connection * conn,void * data,unsigned int len)1458 static int readfd(struct connection *conn, void *data, unsigned int len)
1459 {
1460 int rc;
1461
1462 while ((rc = read(conn->fd, data, len)) < 0) {
1463 if (errno == EAGAIN) {
1464 rc = 0;
1465 break;
1466 }
1467 if (errno != EINTR)
1468 break;
1469 }
1470
1471 /* Reading zero length means we're done with this connection. */
1472 if ((rc == 0) && (len != 0)) {
1473 errno = EBADF;
1474 rc = -1;
1475 }
1476
1477 return rc;
1478 }
1479
accept_connection(int sock,bool canwrite)1480 static void accept_connection(int sock, bool canwrite)
1481 {
1482 int fd;
1483 struct connection *conn;
1484
1485 fd = accept(sock, NULL, NULL);
1486 if (fd < 0)
1487 return;
1488
1489 conn = new_connection(writefd, readfd);
1490 if (conn) {
1491 conn->fd = fd;
1492 conn->can_write = canwrite;
1493 } else
1494 close(fd);
1495 }
1496 #endif
1497
/*
 * Flags handed to tdb_open_ex() in setup_structure(); main() sets them to
 * TDB_INTERNAL|TDB_NOLOCK for the internal-db option.
 */
static int tdb_flags;
1499
1500 /* We create initial nodes manually. */
manual_node(const char * name,const char * child)1501 static void manual_node(const char *name, const char *child)
1502 {
1503 struct node *node;
1504 struct xs_permissions perms = { .id = 0, .perms = XS_PERM_NONE };
1505
1506 node = talloc_zero(NULL, struct node);
1507 if (!node)
1508 barf_perror("Could not allocate initial node %s", name);
1509
1510 node->name = name;
1511 node->perms = &perms;
1512 node->num_perms = 1;
1513 node->children = (char *)child;
1514 if (child)
1515 node->childlen = strlen(child) + 1;
1516
1517 if (write_node(NULL, node))
1518 barf_perror("Could not create initial node %s", name);
1519 talloc_free(node);
1520 }
1521
/*
 * Log callback given to tdb_open_ex(): formats the message and sends it to
 * the trace file and syslog, and to xprintf() when verbose.  The "tdb" and
 * "level" arguments are not used.
 */
static void tdb_logger(TDB_CONTEXT *tdb, int level, const char * fmt, ...)
{
	va_list args;
	char *msg;

	va_start(args, fmt);
	msg = talloc_vasprintf(NULL, fmt, args);
	va_end(args);

	if (!msg) {
		/* Could not even format the message; say so instead. */
		trace("talloc failure during logging\n");
		syslog(LOG_ERR, "talloc failure during logging\n");
		return;
	}

	trace("TDB: %s\n", msg);
	syslog(LOG_ERR, "TDB: %s", msg);
	if (verbose)
		xprintf("TDB: %s", msg);
	talloc_free(msg);
}
1542
setup_structure(void)1543 static void setup_structure(void)
1544 {
1545 char *tdbname;
1546 tdbname = talloc_strdup(talloc_autofree_context(), xs_daemon_tdb());
1547 if (!tdbname)
1548 barf_perror("Could not create tdbname");
1549
1550 if (!(tdb_flags & TDB_INTERNAL))
1551 unlink(tdbname);
1552
1553 tdb_ctx = tdb_open_ex(tdbname, 7919, tdb_flags, O_RDWR|O_CREAT|O_EXCL,
1554 0640, &tdb_logger, NULL);
1555 if (!tdb_ctx)
1556 barf_perror("Could not create tdb file %s", tdbname);
1557
1558 manual_node("/", "tool");
1559 manual_node("/tool", "xenstored");
1560 manual_node("/tool/xenstored", NULL);
1561
1562 check_store();
1563 }
1564
1565
/*
 * Hash a NUL-terminated string key: start from 5381 and, for every
 * character, multiply the running hash by 33 and add the character.
 */
static unsigned int hash_from_key_fn(void *k)
{
	const char *p;
	unsigned int h = 5381;

	for (p = k; *p != '\0'; p++)
		h = h * 33 + (unsigned int)*p;

	return h;
}
1577
1578
/* Key comparison for the string-keyed hashtables: 1 if equal, else 0. */
static int keys_equal_fn(void *key1, void *key2)
{
	const char *a = key1;
	const char *b = key2;

	return !strcmp(a, b);
}
1583
1584
/*
 * Build the path of child "s2" under parent "s1" as a new talloc string
 * (NULL context).  The root "/" is special-cased so the result does not
 * start with a double slash.
 */
static char *child_name(const char *s1, const char *s2)
{
	if (strcmp(s1, "/") == 0)
		return talloc_asprintf(NULL, "/%s", s2);

	return talloc_asprintf(NULL, "%s/%s", s1, s2);
}
1594
1595
/*
 * Record "str" in "hash" by inserting a malloc'ed copy of it as the key
 * (the value is always (void *)1).
 *
 * Returns 0 if the copy cannot be allocated, otherwise the result of
 * hashtable_insert(); callers in this file treat 0 as out-of-memory.
 */
int remember_string(struct hashtable *hash, const char *str)
{
	size_t size = strlen(str) + 1;
	char *k = malloc(size);
	int inserted;

	if (!k)
		return 0;
	memcpy(k, str, size);

	inserted = hashtable_insert(hash, k, (void *)1);
	/*
	 * A failed insert would otherwise leak the copy.
	 * NOTE(review): assumes the table does not keep or free the key when
	 * hashtable_insert() returns 0 - confirm against hashtable.c.
	 */
	if (!inserted)
		free(k);

	return inserted;
}
1605
1606
1607 /**
1608 * A node has a children field that names the children of the node, separated
1609 * by NULs. We check whether there are entries in there that are duplicated
1610 * (and if so, delete the second one), and whether there are any that do not
1611 * have a corresponding child node (and if so, delete them). Each valid child
1612 * is then recursively checked.
1613 *
1614 * No deleting is performed if the recovery flag is cleared (i.e. -R was
1615 * passed on the command line).
1616 *
1617 * As we go, we record each node in the given reachable hashtable. These
1618 * entries will be used later in clean_store.
1619 */
check_store_(const char * name,struct hashtable * reachable)1620 static int check_store_(const char *name, struct hashtable *reachable)
1621 {
1622 struct node *node = read_node(NULL, name, name);
1623 int ret = 0;
1624
1625 if (node) {
1626 size_t i = 0;
1627
1628 struct hashtable * children =
1629 create_hashtable(16, hash_from_key_fn, keys_equal_fn);
1630
1631 if (!remember_string(reachable, name)) {
1632 hashtable_destroy(children, 0);
1633 log("check_store: ENOMEM");
1634 return ENOMEM;
1635 }
1636
1637 while (i < node->childlen && !ret) {
1638 struct node *childnode;
1639 size_t childlen = strlen(node->children + i);
1640 char * childname = child_name(node->name,
1641 node->children + i);
1642
1643 if (!childname) {
1644 log("check_store: ENOMEM");
1645 ret = ENOMEM;
1646 break;
1647 }
1648 childnode = read_node(NULL, childname, childname);
1649
1650 if (childnode) {
1651 if (hashtable_search(children, childname)) {
1652 log("check_store: '%s' is duplicated!",
1653 childname);
1654
1655 if (recovery) {
1656 remove_child_entry(NULL, node,
1657 i);
1658 i -= childlen + 1;
1659 }
1660 }
1661 else {
1662 if (!remember_string(children,
1663 childname)) {
1664 log("check_store: ENOMEM");
1665 talloc_free(childnode);
1666 talloc_free(childname);
1667 ret = ENOMEM;
1668 break;
1669 }
1670 ret = check_store_(childname,
1671 reachable);
1672 }
1673 } else if (errno != ENOMEM) {
1674 log("check_store: No child '%s' found!\n",
1675 childname);
1676
1677 if (recovery) {
1678 remove_child_entry(NULL, node, i);
1679 i -= childlen + 1;
1680 }
1681 } else {
1682 log("check_store: ENOMEM");
1683 ret = ENOMEM;
1684 }
1685
1686 talloc_free(childnode);
1687 talloc_free(childname);
1688 i += childlen + 1;
1689 }
1690
1691 hashtable_destroy(children, 0 /* Don't free values (they are
1692 all (void *)1) */);
1693 talloc_free(node);
1694 } else if (errno != ENOMEM) {
1695 /* Impossible, because no database should ever be without the
1696 root, and otherwise, we've just checked in our caller
1697 (which made a recursive call to get here). */
1698
1699 log("check_store: No child '%s' found: impossible!", name);
1700 } else {
1701 log("check_store: ENOMEM");
1702 ret = ENOMEM;
1703 }
1704
1705 return ret;
1706 }
1707
1708
1709 /**
1710 * Helper to clean_store below.
1711 */
clean_store_(TDB_CONTEXT * tdb,TDB_DATA key,TDB_DATA val,void * private)1712 static int clean_store_(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA val,
1713 void *private)
1714 {
1715 struct hashtable *reachable = private;
1716 char *slash;
1717 char * name = talloc_strndup(NULL, key.dptr, key.dsize);
1718
1719 if (!name) {
1720 log("clean_store: ENOMEM");
1721 return 1;
1722 }
1723
1724 if (name[0] != '/') {
1725 slash = strchr(name, '/');
1726 if (slash)
1727 *slash = 0;
1728 }
1729 if (!hashtable_search(reachable, name)) {
1730 log("clean_store: '%s' is orphaned!", name);
1731 if (recovery) {
1732 tdb_delete(tdb, key);
1733 }
1734 }
1735
1736 talloc_free(name);
1737
1738 return 0;
1739 }
1740
1741
1742 /**
1743 * Given the list of reachable nodes, iterate over the whole store, and
1744 * remove any that were not reached.
1745 */
clean_store(struct hashtable * reachable)1746 static void clean_store(struct hashtable *reachable)
1747 {
1748 tdb_traverse(tdb_ctx, &clean_store_, reachable);
1749 }
1750
1751
check_store(void)1752 void check_store(void)
1753 {
1754 char * root = talloc_strdup(NULL, "/");
1755 struct hashtable * reachable =
1756 create_hashtable(16, hash_from_key_fn, keys_equal_fn);
1757
1758 if (!reachable) {
1759 log("check_store: ENOMEM");
1760 return;
1761 }
1762
1763 log("Checking store ...");
1764 if (!check_store_(root, reachable) &&
1765 !check_transactions(reachable))
1766 clean_store(reachable);
1767 log("Checking store complete.");
1768
1769 hashtable_destroy(reachable, 0 /* Don't free values (they are all
1770 (void *)1) */);
1771 talloc_free(root);
1772 }
1773
1774
1775 /* Something is horribly wrong: check the store. */
corrupt(struct connection * conn,const char * fmt,...)1776 void corrupt(struct connection *conn, const char *fmt, ...)
1777 {
1778 va_list arglist;
1779 char *str;
1780 int saved_errno = errno;
1781
1782 va_start(arglist, fmt);
1783 str = talloc_vasprintf(NULL, fmt, arglist);
1784 va_end(arglist);
1785
1786 log("corruption detected by connection %i: err %s: %s",
1787 conn ? (int)conn->id : -1, strerror(saved_errno), str);
1788
1789 check_store();
1790 }
1791
1792
1793 #ifdef NO_SOCKETS
/*
 * NO_SOCKETS build: there are no listening sockets, so both outputs are
 * pointed at one shared static int holding -1.
 */
static void init_sockets(int **psock, int **pro_sock)
{
	static int minus_one = -1;

	*pro_sock = &minus_one;
	*psock = &minus_one;
}
1799 #else
/*
 * talloc destructor for a talloc'ed int holding a file descriptor: closes
 * the descriptor and always returns 0.
 */
static int destroy_fd(void *_fd)
{
	close(*(int *)_fd);

	return 0;
}
1806
/*
 * Create the two listening UNIX-domain sockets at the paths given by
 * xs_daemon_socket() and xs_daemon_socket_ro().
 *
 * On return *psock and *pro_sock point at talloc'ed ints (owned by the
 * autofree context) holding the descriptors; destroy_fd closes them when
 * those ints are freed.  Any failure aborts through barf_perror().
 */
static void init_sockets(int **psock, int **pro_sock)
{
	struct sockaddr_un addr;
	int *sock, *ro_sock;
	const char *soc_str = xs_daemon_socket();
	const char *soc_str_ro = xs_daemon_socket_ro();

	/* Create sockets for them to listen to. */
	*psock = sock = talloc(talloc_autofree_context(), int);
	if (!sock)
		barf_perror("No memory when creating sockets");
	*sock = socket(PF_UNIX, SOCK_STREAM, 0);
	if (*sock < 0)
		barf_perror("Could not create socket");
	*pro_sock = ro_sock = talloc(talloc_autofree_context(), int);
	if (!ro_sock)
		barf_perror("No memory when creating sockets");
	*ro_sock = socket(PF_UNIX, SOCK_STREAM, 0);
	if (*ro_sock < 0)
		barf_perror("Could not create socket");
	/* Close the descriptors when their talloc'ed holders are freed. */
	talloc_set_destructor(sock, destroy_fd);
	talloc_set_destructor(ro_sock, destroy_fd);

	/* FIXME: Be more sophisticated, don't mug running daemon. */
	unlink(soc_str);
	unlink(soc_str_ro);

	addr.sun_family = AF_UNIX;

	/* The path must fit sun_path including its terminating NUL. */
	if(strlen(soc_str) >= sizeof(addr.sun_path))
		barf_perror("socket string '%s' too long", soc_str);
	strcpy(addr.sun_path, soc_str);
	if (bind(*sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
		barf_perror("Could not bind socket to %s", soc_str);

	/* Same address structure, reused for the second socket's path. */
	if(strlen(soc_str_ro) >= sizeof(addr.sun_path))
		barf_perror("socket string '%s' too long", soc_str_ro);
	strcpy(addr.sun_path, soc_str_ro);
	if (bind(*ro_sock, (struct sockaddr *)&addr, sizeof(addr)) != 0)
		barf_perror("Could not bind socket to %s", soc_str_ro);

	/* Mode 0600 for the first socket path, 0660 for the second. */
	if (chmod(soc_str, 0600) != 0
	    || chmod(soc_str_ro, 0660) != 0)
		barf_perror("Could not chmod sockets");

	if (listen(*sock, 1) != 0
	    || listen(*ro_sock, 1) != 0)
		barf_perror("Could not listen on sockets");


}
1858 #endif
1859
/* Print the command-line help text to stderr. */
static void usage(void)
{
	static const char help_text[] =
"Usage:\n"
"\n"
" xenstored <options>\n"
"\n"
"where options may include:\n"
"\n"
" -D, --no-domain-init to state that xenstored should not initialise dom0,\n"
" -F, --pid-file <file> giving a file for the daemon's pid to be written,\n"
" -H, --help to output this message,\n"
" -N, --no-fork to request that the daemon does not fork,\n"
" -P, --output-pid to request that the pid of the daemon is output,\n"
" -T, --trace-file <file> giving the file for logging, and\n"
" -E, --entry-nb <nb> limit the number of entries per domain,\n"
" -S, --entry-size <size> limit the size of entry per domain, and\n"
" -W, --watch-nb <nb> limit the number of watches per domain,\n"
" -t, --transaction <nb> limit the number of transaction allowed per domain,\n"
" -R, --no-recovery to request that no recovery should be attempted when\n"
" the store is corrupted (debug only),\n"
" -I, --internal-db store database in memory, not on disk\n"
" -V, --verbose to request verbose execution.\n";

	/* The text holds no conversion specifiers, so emit it verbatim. */
	fputs(help_text, stderr);
}
1884
1885
/*
 * Long options accepted by main() via getopt_long().  The second field is
 * has_arg (0: no argument, 1: argument required); the last field is the
 * value getopt_long() returns, which main() switches on.
 */
static struct option options[] = {
	{ "no-domain-init", 0, NULL, 'D' },
	{ "entry-nb", 1, NULL, 'E' },
	{ "pid-file", 1, NULL, 'F' },
	{ "event", 1, NULL, 'e' },
	{ "master-domid", 1, NULL, 'm' },
	{ "help", 0, NULL, 'H' },
	{ "no-fork", 0, NULL, 'N' },
	{ "priv-domid", 1, NULL, 'p' },
	{ "output-pid", 0, NULL, 'P' },
	{ "entry-size", 1, NULL, 'S' },
	{ "trace-file", 1, NULL, 'T' },
	{ "transaction", 1, NULL, 't' },
	{ "no-recovery", 0, NULL, 'R' },
	{ "internal-db", 0, NULL, 'I' },
	{ "verbose", 0, NULL, 'V' },
	{ "watch-nb", 1, NULL, 'W' },
	/* All-zero entry terminates the table. */
	{ NULL, 0, NULL, 0 } };
1904
/* Defined outside this part of the file. */
extern void dump_conn(struct connection *conn);
/* Set from --master-domid in main(); defaults to 0. */
int dom0_domid = 0;
/* Set from --event in main(); defaults to 0. */
int dom0_event = 0;
/* Set from --priv-domid in main(); defaults to 0. */
int priv_domid = 0;
1909
main(int argc,char * argv[])1910 int main(int argc, char *argv[])
1911 {
1912 int opt, *sock = NULL, *ro_sock = NULL;
1913 int sock_pollfd_idx = -1, ro_sock_pollfd_idx = -1;
1914 bool dofork = true;
1915 bool outputpid = false;
1916 bool no_domain_init = false;
1917 const char *pidfile = NULL;
1918 int timeout;
1919
1920
1921 while ((opt = getopt_long(argc, argv, "DE:F:HNPS:t:T:RVW:", options,
1922 NULL)) != -1) {
1923 switch (opt) {
1924 case 'D':
1925 no_domain_init = true;
1926 break;
1927 case 'E':
1928 quota_nb_entry_per_domain = strtol(optarg, NULL, 10);
1929 break;
1930 case 'F':
1931 pidfile = optarg;
1932 break;
1933 case 'H':
1934 usage();
1935 return 0;
1936 case 'N':
1937 dofork = false;
1938 break;
1939 case 'P':
1940 outputpid = true;
1941 break;
1942 case 'R':
1943 recovery = false;
1944 break;
1945 case 'S':
1946 quota_max_entry_size = strtol(optarg, NULL, 10);
1947 break;
1948 case 't':
1949 quota_max_transaction = strtol(optarg, NULL, 10);
1950 break;
1951 case 'T':
1952 tracefile = optarg;
1953 break;
1954 case 'I':
1955 tdb_flags = TDB_INTERNAL|TDB_NOLOCK;
1956 break;
1957 case 'V':
1958 verbose = true;
1959 break;
1960 case 'W':
1961 quota_nb_watch_per_domain = strtol(optarg, NULL, 10);
1962 break;
1963 case 'e':
1964 dom0_event = strtol(optarg, NULL, 10);
1965 break;
1966 case 'm':
1967 dom0_domid = strtol(optarg, NULL, 10);
1968 break;
1969 case 'p':
1970 priv_domid = strtol(optarg, NULL, 10);
1971 break;
1972 }
1973 }
1974 if (optind != argc)
1975 barf("%s: No arguments desired", argv[0]);
1976
1977 reopen_log();
1978
1979 /* make sure xenstored directories exist */
1980 /* Errors ignored here, will be reported when we open files */
1981 mkdir(xs_daemon_rundir(), 0755);
1982 mkdir(xs_daemon_rootdir(), 0755);
1983
1984 if (dofork) {
1985 openlog("xenstored", 0, LOG_DAEMON);
1986 daemonize();
1987 }
1988 if (pidfile)
1989 write_pidfile(pidfile);
1990
1991 /* Talloc leak reports go to stderr, which is closed if we fork. */
1992 if (!dofork)
1993 talloc_enable_leak_report_full();
1994
1995 /* Don't kill us with SIGPIPE. */
1996 signal(SIGPIPE, SIG_IGN);
1997
1998 talloc_enable_null_tracking();
1999
2000 init_sockets(&sock, &ro_sock);
2001
2002 init_pipe(reopen_log_pipe);
2003
2004 /* Setup the database */
2005 setup_structure();
2006
2007 /* Listen to hypervisor. */
2008 if (!no_domain_init)
2009 domain_init();
2010
2011 /* Restore existing connections. */
2012 restore_existing_connections();
2013
2014 if (outputpid) {
2015 printf("%ld\n", (long)getpid());
2016 fflush(stdout);
2017 }
2018
2019 /* redirect to /dev/null now we're ready to accept connections */
2020 if (dofork)
2021 finish_daemonize();
2022
2023 signal(SIGHUP, trigger_reopen_log);
2024 if (tracefile)
2025 tracefile = talloc_strdup(NULL, tracefile);
2026
2027 /* Get ready to listen to the tools. */
2028 initialize_fds(*sock, &sock_pollfd_idx, *ro_sock, &ro_sock_pollfd_idx,
2029 &timeout);
2030
2031 /* Tell the kernel we're up and running. */
2032 xenbus_notify_running();
2033
2034 #if defined(XEN_SYSTEMD_ENABLED)
2035 sd_notify(1, "READY=1");
2036 fprintf(stderr, SD_NOTICE "xenstored is ready\n");
2037 #endif
2038
2039 /* Main loop. */
2040 for (;;) {
2041 struct connection *conn, *next;
2042
2043 if (poll(fds, nr_fds, timeout) < 0) {
2044 if (errno == EINTR)
2045 continue;
2046 barf_perror("Poll failed");
2047 }
2048
2049 if (reopen_log_pipe0_pollfd_idx != -1) {
2050 if (fds[reopen_log_pipe0_pollfd_idx].revents
2051 & ~POLLIN) {
2052 close(reopen_log_pipe[0]);
2053 close(reopen_log_pipe[1]);
2054 init_pipe(reopen_log_pipe);
2055 } else if (fds[reopen_log_pipe0_pollfd_idx].revents
2056 & POLLIN) {
2057 char c;
2058 if (read(reopen_log_pipe[0], &c, 1) != 1)
2059 barf_perror("read failed");
2060 reopen_log();
2061 }
2062 reopen_log_pipe0_pollfd_idx = -1;
2063 }
2064
2065 if (sock_pollfd_idx != -1) {
2066 if (fds[sock_pollfd_idx].revents & ~POLLIN) {
2067 barf_perror("sock poll failed");
2068 break;
2069 } else if (fds[sock_pollfd_idx].revents & POLLIN) {
2070 accept_connection(*sock, true);
2071 sock_pollfd_idx = -1;
2072 }
2073 }
2074
2075 if (ro_sock_pollfd_idx != -1) {
2076 if (fds[ro_sock_pollfd_idx].revents & ~POLLIN) {
2077 barf_perror("ro sock poll failed");
2078 break;
2079 } else if (fds[ro_sock_pollfd_idx].revents & POLLIN) {
2080 accept_connection(*ro_sock, false);
2081 ro_sock_pollfd_idx = -1;
2082 }
2083 }
2084
2085 if (xce_pollfd_idx != -1) {
2086 if (fds[xce_pollfd_idx].revents & ~POLLIN) {
2087 barf_perror("xce_handle poll failed");
2088 break;
2089 } else if (fds[xce_pollfd_idx].revents & POLLIN) {
2090 handle_event();
2091 xce_pollfd_idx = -1;
2092 }
2093 }
2094
2095 next = list_entry(connections.next, typeof(*conn), list);
2096 if (&next->list != &connections)
2097 talloc_increase_ref_count(next);
2098 while (&next->list != &connections) {
2099 conn = next;
2100
2101 next = list_entry(conn->list.next,
2102 typeof(*conn), list);
2103 if (&next->list != &connections)
2104 talloc_increase_ref_count(next);
2105
2106 if (conn->domain) {
2107 if (domain_can_read(conn))
2108 handle_input(conn);
2109 if (talloc_free(conn) == 0)
2110 continue;
2111
2112 talloc_increase_ref_count(conn);
2113 if (domain_can_write(conn) &&
2114 !list_empty(&conn->out_list))
2115 handle_output(conn);
2116 if (talloc_free(conn) == 0)
2117 continue;
2118 } else {
2119 if (conn->pollfd_idx != -1) {
2120 if (fds[conn->pollfd_idx].revents
2121 & ~(POLLIN|POLLOUT))
2122 talloc_free(conn);
2123 else if (fds[conn->pollfd_idx].revents
2124 & POLLIN)
2125 handle_input(conn);
2126 }
2127 if (talloc_free(conn) == 0)
2128 continue;
2129
2130 talloc_increase_ref_count(conn);
2131
2132 if (conn->pollfd_idx != -1) {
2133 if (fds[conn->pollfd_idx].revents
2134 & ~(POLLIN|POLLOUT))
2135 talloc_free(conn);
2136 else if (fds[conn->pollfd_idx].revents
2137 & POLLOUT)
2138 handle_output(conn);
2139 }
2140 if (talloc_free(conn) == 0)
2141 continue;
2142
2143 conn->pollfd_idx = -1;
2144 }
2145 }
2146
2147 initialize_fds(*sock, &sock_pollfd_idx, *ro_sock,
2148 &ro_sock_pollfd_idx, &timeout);
2149 }
2150 }
2151
2152 /*
2153 * Local variables:
2154 * c-file-style: "linux"
2155 * indent-tabs-mode: t
2156 * c-indent-level: 8
2157 * c-basic-offset: 8
2158 * tab-width: 8
2159 * End:
2160 */
2161