1 /*
2  * Copyright (c) 2008, XenSource Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of XenSource Inc. nor the names of its contributors
13  *       may be used to endorse or promote products derived from this software
14  *       without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  * A note on write transactions:
29  * Writes that require updating the BAT or bitmaps cannot be signaled
30  * as complete until all updates have reached disk.  Transactions are
31  * used to ensure proper ordering in these cases.  The two types of
32  * transactions are as follows:
33  *   - Bitmap updates only: data writes that require updates to the same
34  *     bitmap are grouped in a transaction.  Only after all data writes
35  *     in a transaction complete does the bitmap write commence.  Only
36  *     after the bitmap write finishes are the data writes signalled as
37  *     complete.
38  *   - BAT and bitmap updates: data writes are grouped in transactions
39  *     as above, but a special extra write is included in the transaction,
40  *     which zeros out the newly allocated bitmap on disk.  When the data
41  *     writes and the zero-bitmap write complete, the BAT and bitmap writes
42  *     are started in parallel.  The transaction is completed only after both
43  *     the BAT and bitmap writes successfully return.
44  */
45 
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <unistd.h>
51 #include <sys/stat.h>
52 #include <sys/ioctl.h>
53 #include <string.h>    /* for memset.                                 */
54 #include <libaio.h>
55 #include <sys/mman.h>
56 
57 #include "libvhd.h"
58 #include "tapdisk.h"
59 #include "tapdisk-driver.h"
60 #include "tapdisk-interface.h"
61 #include "tapdisk-disktype.h"
62 
63 unsigned int SPB;
64 
65 #define DEBUGGING   2
66 #define ASSERTING   1
67 #define MICROSOFT_COMPAT
68 
69 #define VHD_BATMAP_MAX_RETRIES 10
70 
71 #define __TRACE(s)							\
72 	do {								\
73 		DBG(TLOG_DBG, "%s: QUEUED: %" PRIu64 ", COMPLETED: %"	\
74 		    PRIu64", RETURNED: %" PRIu64 ", DATA_ALLOCATED: "	\
75 		    "%lu, BBLK: 0x%04x\n",				\
76 		    s->vhd.file, s->queued, s->completed, s->returned,	\
77 		    VHD_REQS_DATA - s->vreq_free_count,			\
78 		    s->bat.pbw_blk);					\
79 	} while(0)
80 
81 #define __ASSERT(_p)							\
82 	if (!(_p)) {							\
83 		DPRINTF("%s:%d: FAILED ASSERTION: '%s'\n",		\
84 			__FILE__, __LINE__, #_p);			\
85 		DBG(TLOG_WARN, "%s:%d: FAILED ASSERTION: '%s'\n",	\
86 		    __FILE__, __LINE__, #_p);				\
87 		tlog_flush();						\
88 		abort();                                                \
89 	}
90 
91 #if (DEBUGGING == 1)
92   #define DBG(level, _f, _a...)      DPRINTF(_f, ##_a)
93   #define ERR(err, _f, _a...)        DPRINTF("ERROR: %d: " _f, err, ##_a)
94   #define TRACE(s)                   ((void)0)
95 #elif (DEBUGGING == 2)
96   #define DBG(level, _f, _a...)      tlog_write(level, _f, ##_a)
97   #define ERR(err, _f, _a...)	     tlog_error(err, _f, ##_a)
98   #define TRACE(s)                   __TRACE(s)
99 #else
100   #define DBG(level, _f, _a...)      ((void)0)
101   #define ERR(err, _f, _a...)        ((void)0)
102   #define TRACE(s)                   ((void)0)
103 #endif
104 
105 #if (ASSERTING == 1)
106   #define ASSERT(_p)                 __ASSERT(_p)
107 #else
108   #define ASSERT(_p)                 ((void)0)
109 #endif
110 
111 /******VHD DEFINES******/
112 #define VHD_CACHE_SIZE               32
113 
114 #define VHD_REQS_DATA                TAPDISK_DATA_REQUESTS
115 #define VHD_REQS_META                (VHD_CACHE_SIZE + 2)
116 #define VHD_REQS_TOTAL               (VHD_REQS_DATA + VHD_REQS_META)
117 
118 #define VHD_OP_BAT_WRITE             0
119 #define VHD_OP_DATA_READ             1
120 #define VHD_OP_DATA_WRITE            2
121 #define VHD_OP_BITMAP_READ           3
122 #define VHD_OP_BITMAP_WRITE          4
123 #define VHD_OP_ZERO_BM_WRITE         5
124 
125 #define VHD_BM_BAT_LOCKED            0
126 #define VHD_BM_BAT_CLEAR             1
127 #define VHD_BM_BIT_CLEAR             2
128 #define VHD_BM_BIT_SET               3
129 #define VHD_BM_NOT_CACHED            4
130 #define VHD_BM_READ_PENDING          5
131 
132 #define VHD_FLAG_OPEN_RDONLY         1
133 #define VHD_FLAG_OPEN_NO_CACHE       2
134 #define VHD_FLAG_OPEN_QUIET          4
135 #define VHD_FLAG_OPEN_STRICT         8
136 #define VHD_FLAG_OPEN_QUERY          16
137 #define VHD_FLAG_OPEN_PREALLOCATE    32
138 
139 #define VHD_FLAG_BAT_LOCKED          1
140 #define VHD_FLAG_BAT_WRITE_STARTED   2
141 
142 #define VHD_FLAG_BM_UPDATE_BAT       1
143 #define VHD_FLAG_BM_WRITE_PENDING    2
144 #define VHD_FLAG_BM_READ_PENDING     4
145 #define VHD_FLAG_BM_LOCKED           8
146 
147 #define VHD_FLAG_REQ_UPDATE_BAT      1
148 #define VHD_FLAG_REQ_UPDATE_BITMAP   2
149 #define VHD_FLAG_REQ_QUEUED          4
150 #define VHD_FLAG_REQ_FINISHED        8
151 
152 #define VHD_FLAG_TX_LIVE             1
153 #define VHD_FLAG_TX_UPDATE_BAT       2
154 
155 typedef uint8_t vhd_flag_t;
156 
157 struct vhd_state;
158 struct vhd_request;
159 
160 struct vhd_req_list {
161 	struct vhd_request       *head;
162 	struct vhd_request       *tail;
163 };
164 
165 struct vhd_transaction {
166 	int                       error;
167 	int                       closed;
168 	int                       started;
169 	int                       finished;
170 	vhd_flag_t                status;
171 	struct vhd_req_list       requests;
172 };
173 
174 struct vhd_request {
175 	int                       error;
176 	uint8_t                   op;
177 	vhd_flag_t                flags;
178 	td_request_t              treq;
179 	struct tiocb              tiocb;
180 	struct vhd_state         *state;
181 	struct vhd_request       *next;
182 	struct vhd_transaction   *tx;
183 };
184 
185 struct vhd_bat_state {
186 	vhd_bat_t                 bat;
187 	vhd_batmap_t              batmap;
188 	vhd_flag_t                status;
189 	uint32_t                  pbw_blk;     /* blk num of pending write */
190 	uint64_t                  pbw_offset;  /* file offset of same */
191 	struct vhd_request        req;         /* for writing bat table */
192 	struct vhd_request        zero_req;    /* for initializing bitmaps */
193 	char                     *bat_buf;
194 };
195 
196 struct vhd_bitmap {
197 	u32                       blk;
198 	u64                       seqno;       /* lru sequence number */
199 	vhd_flag_t                status;
200 
201 	char                     *map;         /* map should only be modified
202 					        * in finish_bitmap_write */
203 	char                     *shadow;      /* in-memory bitmap changes are
204 					        * made to shadow and copied to
205 					        * map only after having been
206 					        * flushed to disk */
207 	struct vhd_transaction    tx;          /* transaction data structure
208 						* encapsulating data, bitmap,
209 						* and bat writes */
210 	struct vhd_req_list       queue;       /* data writes waiting for next
211 						* transaction */
212 	struct vhd_req_list       waiting;     /* pending requests that cannot
213 					        * be serviced until this bitmap
214 					        * is read from disk */
215 	struct vhd_request        req;
216 };
217 
218 struct vhd_state {
219 	vhd_flag_t                flags;
220 
221         /* VHD stuff */
222 	vhd_context_t             vhd;
223 	u32                       spp;         /* sectors per page */
224         u32                       spb;         /* sectors per block */
225         u64                       next_db;     /* pointer to the next
226 						* (unallocated) datablock */
227 
228 	struct vhd_bat_state      bat;
229 
230 	u64                       bm_lru;      /* lru sequence number */
231 	u32                       bm_secs;     /* size of bitmap, in sectors */
232 	struct vhd_bitmap        *bitmap[VHD_CACHE_SIZE];
233 
234 	int                       bm_free_count;
235 	struct vhd_bitmap        *bitmap_free[VHD_CACHE_SIZE];
236 	struct vhd_bitmap         bitmap_list[VHD_CACHE_SIZE];
237 
238 	int                       vreq_free_count;
239 	struct vhd_request       *vreq_free[VHD_REQS_DATA];
240 	struct vhd_request        vreq_list[VHD_REQS_DATA];
241 
242 	td_driver_t              *driver;
243 
244 	uint64_t                  queued;
245 	uint64_t                  completed;
246 	uint64_t                  returned;
247 	uint64_t                  reads;
248 	uint64_t                  read_size;
249 	uint64_t                  writes;
250 	uint64_t                  write_size;
251 };
252 
253 #define test_vhd_flag(word, flag)  ((word) & (flag))
254 #define set_vhd_flag(word, flag)   ((word) |= (flag))
255 #define clear_vhd_flag(word, flag) ((word) &= ~(flag))
256 
257 #define bat_entry(s, blk)          ((s)->bat.bat.bat[(blk)])
258 
259 static void vhd_complete(void *, struct tiocb *, int);
260 static void finish_data_transaction(struct vhd_state *, struct vhd_bitmap *);
261 
262 static struct vhd_state  *_vhd_master;
263 static unsigned long      _vhd_zsize;
264 static char              *_vhd_zeros;
265 
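/*
 * _vhd_zeros is a shared, read-only mapping of zeros used to initialize new
 * block bitmaps (and, when opened with VHD_FLAG_OPEN_PREALLOCATE, to zero
 * entire new blocks).  It is created once, owned by _vhd_master, and torn
 * down in vhd_free().
 */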
266 static int
267 vhd_initialize(struct vhd_state *s)
268 {
269 	if (_vhd_zeros)
270 		return 0;
271 
272 	_vhd_zsize = 2 * getpagesize();
273 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
274 		_vhd_zsize += VHD_BLOCK_SIZE;
275 
276 	_vhd_zeros = mmap(0, _vhd_zsize, PROT_READ,
277 			  MAP_SHARED | MAP_ANON, -1, 0);
278 	if (_vhd_zeros == MAP_FAILED) {
279 		EPRINTF("vhd_initialize failed: %d\n", -errno);
280 		_vhd_zeros = NULL;
281 		_vhd_zsize = 0;
282 		return -errno;
283 	}
284 
285 	_vhd_master = s;
286 	return 0;
287 }
288 
289 static void
290 vhd_free(struct vhd_state *s)
291 {
292 	if (_vhd_master != s || !_vhd_zeros)
293 		return;
294 
295 	munmap(_vhd_zeros, _vhd_zsize);
296 	_vhd_zsize  = 0;
297 	_vhd_zeros  = NULL;
298 	_vhd_master = NULL;
299 }
300 
301 static char *
302 _get_vhd_zeros(const char *func, unsigned long size)
303 {
304 	if (!_vhd_zeros || _vhd_zsize < size) {
305 		EPRINTF("invalid zero request from %s: %lu, %lu, %p\n",
306 			func, size, _vhd_zsize, _vhd_zeros);
307 		ASSERT(0);
308 	}
309 
310 	return _vhd_zeros;
311 }
312 
313 #define vhd_zeros(size)	_get_vhd_zeros(__func__, size)
314 
315 static inline void
316 set_batmap(struct vhd_state *s, uint32_t blk)
317 {
318 	if (s->bat.batmap.map) {
319 		vhd_batmap_set(&s->vhd, &s->bat.batmap, blk);
320 		DBG(TLOG_DBG, "block 0x%x completely full\n", blk);
321 	}
322 }
323 
324 static inline int
325 test_batmap(struct vhd_state *s, uint32_t blk)
326 {
327 	if (!s->bat.batmap.map)
328 		return 0;
329 	return vhd_batmap_test(&s->vhd, &s->bat.batmap, blk);
330 }
331 
332 static int
333 vhd_kill_footer(struct vhd_state *s)
334 {
335 	int err;
336 	off_t end;
337 	char *zeros;
338 
339 	if (s->vhd.footer.type == HD_TYPE_FIXED)
340 		return 0;
341 
342 	err = posix_memalign((void **)&zeros, 512, 512);
343 	if (err)
344 		return -err;
345 
346 	err = 1;
347 	memset(zeros, 0xc7, 512);	/* memset uses only the low byte of its fill value */
348 
349 	if ((end = lseek(s->vhd.fd, 0, SEEK_END)) == -1)
350 		goto fail;
351 
352 	if (lseek(s->vhd.fd, (end - 512), SEEK_SET) == -1)
353 		goto fail;
354 
355 	if (write(s->vhd.fd, zeros, 512) != 512)
356 		goto fail;
357 
358 	err = 0;
359 
360  fail:
361 	free(zeros);
362 	if (err)
363 		return (errno ? -errno : -EIO);
364 	return 0;
365 }
366 
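/*
 * Sets s->next_db to the first sector past the end of the metadata or past
 * the highest allocated block, whichever is larger -- i.e. where the next
 * data block will be written.
 */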
367 static inline int
368 find_next_free_block(struct vhd_state *s)
369 {
370 	int err;
371 	off_t eom;
372 	uint32_t i, entry;
373 
374 	err = vhd_end_of_headers(&s->vhd, &eom);
375 	if (err)
376 		return err;
377 
378 	s->next_db = secs_round_up(eom);
379 
380 	for (i = 0; i < s->bat.bat.entries; i++) {
381 		entry = bat_entry(s, i);
382 		if (entry != DD_BLK_UNUSED && entry >= s->next_db)
383 			s->next_db = entry + s->spb + s->bm_secs;
384 	}
385 
386 	return 0;
387 }
388 
389 static void
390 vhd_free_bat(struct vhd_state *s)
391 {
392 	free(s->bat.bat.bat);
393 	free(s->bat.batmap.map);
394 	free(s->bat.bat_buf);
395 	memset(&s->bat, 0, sizeof(struct vhd_bat_state));
396 }
397 
398 static int
399 vhd_initialize_bat(struct vhd_state *s)
400 {
401 	int err, psize, batmap_required, i;
402 
403 	memset(&s->bat, 0, sizeof(struct vhd_bat_state));
404 
405 	psize = getpagesize();
406 
407 	err = vhd_read_bat(&s->vhd, &s->bat.bat);
408 	if (err) {
409 		EPRINTF("%s: reading bat: %d\n", s->vhd.file, err);
410 		return err;
411 	}
412 
413 	batmap_required = 1;
414 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY)) {
415 		batmap_required = 0;
416 	} else {
417 		err = find_next_free_block(s);
418 		if (err)
419 			goto fail;
420 	}
421 
422 	if (vhd_has_batmap(&s->vhd)) {
423 		for (i = 0; i < VHD_BATMAP_MAX_RETRIES; i++) {
424 			err = vhd_read_batmap(&s->vhd, &s->bat.batmap);
425 			if (err) {
426 				EPRINTF("%s: reading batmap: %d\n",
427 						s->vhd.file, err);
428 				if (batmap_required)
429 					goto fail;
430 			} else {
431 				break;
432 			}
433 		}
434 		if (err)
435 			EPRINTF("%s: ignoring non-critical batmap error\n",
436 					s->vhd.file);
437 	}
438 
439 	err = posix_memalign((void **)&s->bat.bat_buf,
440 			     VHD_SECTOR_SIZE, VHD_SECTOR_SIZE);
441 	if (err) {
442 		s->bat.bat_buf = NULL;
443 		goto fail;
444 	}
445 
446 	return 0;
447 
448 fail:
449 	vhd_free_bat(s);
450 	return err;
451 }
452 
453 static void
454 vhd_free_bitmap_cache(struct vhd_state *s)
455 {
456 	int i;
457 	struct vhd_bitmap *bm;
458 
459 	for (i = 0; i < VHD_CACHE_SIZE; i++) {
460 		bm = s->bitmap_list + i;
461 		free(bm->map);
462 		free(bm->shadow);
463 		s->bitmap_free[i] = NULL;
464 	}
465 
466 	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
467 }
468 
469 static int
470 vhd_initialize_bitmap_cache(struct vhd_state *s)
471 {
472 	int i, err, map_size;
473 	struct vhd_bitmap *bm;
474 
475 	memset(s->bitmap_list, 0, sizeof(struct vhd_bitmap) * VHD_CACHE_SIZE);
476 
477 	s->bm_lru        = 0;
478 	map_size         = vhd_sectors_to_bytes(s->bm_secs);
479 	s->bm_free_count = VHD_CACHE_SIZE;
480 
481 	for (i = 0; i < VHD_CACHE_SIZE; i++) {
482 		bm = s->bitmap_list + i;
483 
484 		err = posix_memalign((void **)&bm->map, 512, map_size);
485 		if (err) {
486 			bm->map = NULL;
487 			goto fail;
488 		}
489 
490 		err = posix_memalign((void **)&bm->shadow, 512, map_size);
491 		if (err) {
492 			bm->shadow = NULL;
493 			goto fail;
494 		}
495 
496 		memset(bm->map, 0, map_size);
497 		memset(bm->shadow, 0, map_size);
498 		s->bitmap_free[i] = bm;
499 	}
500 
501 	return 0;
502 
503 fail:
504 	vhd_free_bitmap_cache(s);
505 	return err;
506 }
507 
508 static int
509 vhd_initialize_dynamic_disk(struct vhd_state *s)
510 {
511 	int err;
512 
513 	err = vhd_get_header(&s->vhd);
514 	if (err) {
515 		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
516 			EPRINTF("Error reading VHD DD header.\n");
517 		return err;
518 	}
519 
520 	if (s->vhd.header.hdr_ver != 0x00010000) {
521 		EPRINTF("unsupported header version! (0x%x)\n",
522 			s->vhd.header.hdr_ver);
523 		return -EINVAL;
524 	}
525 
526 	s->spp     = getpagesize() >> VHD_SECTOR_SHIFT;
527 	s->spb     = s->vhd.header.block_size >> VHD_SECTOR_SHIFT;
528 	s->bm_secs = secs_round_up_no_zero(s->spb >> 3);
529 
530 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_NO_CACHE))
531 		return 0;
532 
533 	err = vhd_initialize_bat(s);
534 	if (err)
535 		return err;
536 
537 	err = vhd_initialize_bitmap_cache(s);
538 	if (err) {
539 		vhd_free_bat(s);
540 		return err;
541 	}
542 
543 	return 0;
544 }
545 
546 static int
547 vhd_check_version(struct vhd_state *s)
548 {
549 	if (strncmp(s->vhd.footer.crtr_app, "tap", 3))
550 		return 0;
551 
552 	if (s->vhd.footer.crtr_ver > VHD_CURRENT_VERSION) {
553 		if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
554 			EPRINTF("WARNING: %s vhd creator version 0x%08x, "
555 				"but only versions up to 0x%08x are "
556 				"supported for IO\n", s->vhd.file,
557 				s->vhd.footer.crtr_ver, VHD_CURRENT_VERSION);
558 
559 		return -EINVAL;
560 	}
561 
562 	return 0;
563 }
564 
565 static void
566 vhd_log_open(struct vhd_state *s)
567 {
568 	char buf[5];
569 	uint32_t i, allocated, full;
570 
571 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
572 		return;
573 
574 	snprintf(buf, sizeof(buf), "%.4s", s->vhd.footer.crtr_app);
575 	if (!vhd_type_dynamic(&s->vhd)) {
576 		DPRINTF("%s version: %s 0x%08x\n",
577 			s->vhd.file, buf, s->vhd.footer.crtr_ver);
578 		return;
579 	}
580 
581 	allocated = 0;
582 	full      = 0;
583 
584 	for (i = 0; i < s->bat.bat.entries; i++) {
585 		if (bat_entry(s, i) != DD_BLK_UNUSED)
586 			allocated++;
587 		if (test_batmap(s, i))
588 			full++;
589 	}
590 
591 	DPRINTF("%s version: %s 0x%08x, b: %u, a: %u, f: %u, n: %"PRIu64"\n",
592 		s->vhd.file, buf, s->vhd.footer.crtr_ver, s->bat.bat.entries,
593 		allocated, full, s->next_db);
594 }
595 
596 static int
597 __vhd_open(td_driver_t *driver, const char *name, vhd_flag_t flags)
598 {
599         int i, o_flags, err;
600 	struct vhd_state *s;
601 
602         DBG(TLOG_INFO, "vhd_open: %s\n", name);
603 	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT))
604 		libvhd_set_log_level(1);
605 
606 	s = (struct vhd_state *)driver->data;
607 	memset(s, 0, sizeof(struct vhd_state));
608 
609 	s->flags  = flags;
610 	s->driver = driver;
611 
612 	err = vhd_initialize(s);
613 	if (err)
614 		return err;
615 
616 	o_flags = ((test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) ?
617 		   VHD_OPEN_RDONLY : VHD_OPEN_RDWR);
618 
619 	err = vhd_open(&s->vhd, name, o_flags);
620 	if (err) {
621 		libvhd_set_log_level(1);
622 		err = vhd_open(&s->vhd, name, o_flags);
623 		if (err) {
624 			EPRINTF("Unable to open [%s] (%d)!\n", name, err);
625 			return err;
626 		}
627 	}
628 
629 	err = vhd_check_version(s);
630 	if (err)
631 		goto fail;
632 
633 	s->spb = s->spp = 1;
634 
635 	if (vhd_type_dynamic(&s->vhd)) {
636 		err = vhd_initialize_dynamic_disk(s);
637 		if (err)
638 			goto fail;
639 	}
640 
641 	vhd_log_open(s);
642 
643 	SPB = s->spb;
644 
645 	s->vreq_free_count = VHD_REQS_DATA;
646 	for (i = 0; i < VHD_REQS_DATA; i++)
647 		s->vreq_free[i] = s->vreq_list + i;
648 
649 	driver->info.size        = s->vhd.footer.curr_size >> VHD_SECTOR_SHIFT;
650 	driver->info.sector_size = VHD_SECTOR_SIZE;
651 	driver->info.info        = 0;
652 
653         DBG(TLOG_INFO, "vhd_open: done (sz:%"PRIu64", sct:%"PRIu64
654             ", inf:%u)\n",
655 	    driver->info.size, driver->info.sector_size, driver->info.info);
656 
657 	if (test_vhd_flag(flags, VHD_FLAG_OPEN_STRICT) &&
658 	    !test_vhd_flag(flags, VHD_FLAG_OPEN_RDONLY)) {
659 		err = vhd_kill_footer(s);
660 		if (err) {
661 			DPRINTF("ERROR killing footer: %d\n", err);
662 			goto fail;
663 		}
664 		s->writes++;
665 	}
666 
667         return 0;
668 
669  fail:
670 	vhd_free_bat(s);
671 	vhd_free_bitmap_cache(s);
672 	vhd_close(&s->vhd);
673 	vhd_free(s);
674 	return err;
675 }
676 
677 static int
678 _vhd_open(td_driver_t *driver, const char *name, td_flag_t flags)
679 {
680 	vhd_flag_t vhd_flags = 0;
681 
682 	if (flags & TD_OPEN_RDONLY)
683 		vhd_flags |= VHD_FLAG_OPEN_RDONLY;
684 	if (flags & TD_OPEN_QUIET)
685 		vhd_flags |= VHD_FLAG_OPEN_QUIET;
686 	if (flags & TD_OPEN_STRICT)
687 		vhd_flags |= VHD_FLAG_OPEN_STRICT;
688 	if (flags & TD_OPEN_QUERY)
689 		vhd_flags |= (VHD_FLAG_OPEN_QUERY  |
690 			      VHD_FLAG_OPEN_QUIET  |
691 			      VHD_FLAG_OPEN_RDONLY |
692 			      VHD_FLAG_OPEN_NO_CACHE);
693 
694 	/* pre-allocate for all but NFS and LVM storage */
695 	if (driver->storage != TAPDISK_STORAGE_TYPE_NFS &&
696 	    driver->storage != TAPDISK_STORAGE_TYPE_LVM)
697 		vhd_flags |= VHD_FLAG_OPEN_PREALLOCATE;
698 
699 	return __vhd_open(driver, name, vhd_flags);
700 }
701 
702 static void
703 vhd_log_close(struct vhd_state *s)
704 {
705 	uint32_t i, allocated, full;
706 
707 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_QUIET))
708 		return;
709 
710 	allocated = 0;
711 	full      = 0;
712 
713 	for (i = 0; i < s->bat.bat.entries; i++) {
714 		if (bat_entry(s, i) != DD_BLK_UNUSED)
715 			allocated++;
716 		if (test_batmap(s, i))
717 			full++;
718 	}
719 
720 	DPRINTF("%s: b: %u, a: %u, f: %u, n: %"PRIu64"\n",
721 		s->vhd.file, s->bat.bat.entries, allocated, full, s->next_db);
722 }
723 
724 static int
725 _vhd_close(td_driver_t *driver)
726 {
727 	int err;
728 	struct vhd_state *s;
729 	struct vhd_bitmap *bm;
730 
731 	DBG(TLOG_WARN, "vhd_close\n");
732 	s = (struct vhd_state *)driver->data;
733 
734 	/* don't write footer if tapdisk is read-only */
735 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_RDONLY))
736 		goto free;
737 
738 	/*
739 	 * write footer if:
740 	 *   - we killed it on open (opened with strict)
741 	 *   - we've written data since opening
742 	 */
743 	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_STRICT) || s->writes) {
744 		memcpy(&s->vhd.bat, &s->bat.bat, sizeof(vhd_bat_t));
745 		err = vhd_write_footer(&s->vhd, &s->vhd.footer);
746 		memset(&s->vhd.bat, 0, sizeof(vhd_bat_t));
747 
748 		if (err)
749 			EPRINTF("writing %s footer: %d\n", s->vhd.file, err);
750 
751 		if (!vhd_has_batmap(&s->vhd))
752 			goto free;
753 
754 		err = vhd_write_batmap(&s->vhd, &s->bat.batmap);
755 		if (err)
756 			EPRINTF("writing %s batmap: %d\n", s->vhd.file, err);
757 	}
758 
759  free:
760 	vhd_log_close(s);
761 	vhd_free_bat(s);
762 	vhd_free_bitmap_cache(s);
763 	vhd_close(&s->vhd);
764 	vhd_free(s);
765 
766 	memset(s, 0, sizeof(struct vhd_state));
767 
768 	return 0;
769 }
770 
771 int
772 vhd_validate_parent(td_driver_t *child_driver,
773 		    td_driver_t *parent_driver, td_flag_t flags)
774 {
775 	uint32_t status;
776 	struct stat stats;
777 	struct vhd_state *child  = (struct vhd_state *)child_driver->data;
778 	struct vhd_state *parent;
779 
780 	if (parent_driver->type != DISK_TYPE_VHD) {
781 		if (child_driver->type != DISK_TYPE_VHD)
782 			return -EINVAL;
783 		if (child->vhd.footer.type != HD_TYPE_DIFF)
784 			return -EINVAL;
785 		if (!vhd_parent_raw(&child->vhd))
786 			return -EINVAL;
787 		return 0;
788 	}
789 
790 	parent = (struct vhd_state *)parent_driver->data;
791 
792 	/*
793 	 * This check removed because of cases like:
794 	 *   - parent VHD marked as 'hidden'
795 	 *   - parent VHD modified during coalesce
796 	 */
797 	/*
798 	if (stat(parent->vhd.file, &stats)) {
799 		DPRINTF("ERROR stating parent file %s\n", parent->vhd.file);
800 		return -errno;
801 	}
802 
803 	if (child->hdr.prt_ts != vhd_time(stats.st_mtime)) {
804 		DPRINTF("ERROR: parent file has been modified since "
805 			"snapshot.  Child image no longer valid.\n");
806 		return -EINVAL;
807 	}
808 	*/
809 
810 	if (vhd_uuid_compare(&child->vhd.header.prt_uuid, &parent->vhd.footer.uuid)) {
811 		DPRINTF("ERROR: %s: %s, %s: parent uuid has changed since "
812 			"snapshot.  Child image no longer valid.\n",
813 			__func__, child->vhd.file, parent->vhd.file);
814 		return -EINVAL;
815 	}
816 
817 	/* TODO: compare sizes */
818 
819 	return 0;
820 }
821 
822 int
823 vhd_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
824 {
825 	int err;
826 	char *parent;
827 	struct vhd_state *s;
828 
829 	DBG(TLOG_DBG, "\n");
830 	memset(id, 0, sizeof(td_disk_id_t));
831 
832 	s = (struct vhd_state *)driver->data;
833 
834 	if (s->vhd.footer.type != HD_TYPE_DIFF)
835 		return TD_NO_PARENT;
836 
837 	err = vhd_parent_locator_get(&s->vhd, &parent);
838 	if (err)
839 		return err;
840 
841 	id->name       = parent;
842 	id->drivertype = DISK_TYPE_VHD;
843 	if (vhd_parent_raw(&s->vhd)) {
844 		DPRINTF("VHD: parent is raw\n");
845 		id->drivertype = DISK_TYPE_AIO;
846 	}
847 	return 0;
848 }
849 
850 static inline void
851 clear_req_list(struct vhd_req_list *list)
852 {
853 	list->head = list->tail = NULL;
854 }
855 
856 static inline void
857 add_to_tail(struct vhd_req_list *list, struct vhd_request *e)
858 {
859 	if (!list->head)
860 		list->head = list->tail = e;
861 	else
862 		list->tail = list->tail->next = e;
863 }
864 
865 static inline int
866 remove_from_req_list(struct vhd_req_list *list, struct vhd_request *e)
867 {
868 	struct vhd_request *i = list->head;
869 
870 	if (list->head == e) {
871 		if (list->tail == e)
872 			clear_req_list(list);
873 		else
874 			list->head = list->head->next;
875 		return 0;
876 	}
877 
878 	while (i->next) {
879 		if (i->next == e) {
880 			if (list->tail == e) {
881 				i->next = NULL;
882 				list->tail = i;
883 			} else
884 				i->next = i->next->next;
885 			return 0;
886 		}
887 		i = i->next;
888 	}
889 
890 	return -EINVAL;
891 }
892 
893 static inline void
894 init_vhd_request(struct vhd_state *s, struct vhd_request *req)
895 {
896 	memset(req, 0, sizeof(struct vhd_request));
897 	req->state = s;
898 }
899 
900 static inline void
901 init_tx(struct vhd_transaction *tx)
902 {
903 	memset(tx, 0, sizeof(struct vhd_transaction));
904 }
905 
906 static inline void
907 add_to_transaction(struct vhd_transaction *tx, struct vhd_request *r)
908 {
909 	ASSERT(!tx->closed);
910 
911 	r->tx = tx;
912 	tx->started++;
913 	add_to_tail(&tx->requests, r);
914 	set_vhd_flag(tx->status, VHD_FLAG_TX_LIVE);
915 
916 	DBG(TLOG_DBG, "blk: 0x%04"PRIx64", lsec: 0x%08"PRIx64", tx: %p, "
917 	    "started: %d, finished: %d, status: %u\n",
918 	    r->treq.sec / SPB, r->treq.sec, tx,
919 	    tx->started, tx->finished, tx->status);
920 }
921 
922 static inline int
923 transaction_completed(struct vhd_transaction *tx)
924 {
925 	return (tx->started == tx->finished);
926 }
927 
928 static inline void
929 init_bat(struct vhd_state *s)
930 {
931 	s->bat.req.tx     = NULL;
932 	s->bat.req.next   = NULL;
933 	s->bat.req.error  = 0;
934 	s->bat.pbw_blk    = 0;
935 	s->bat.pbw_offset = 0;
936 	s->bat.status     = 0;
937 }
938 
939 static inline void
940 lock_bat(struct vhd_state *s)
941 {
942 	set_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
943 }
944 
945 static inline void
946 unlock_bat(struct vhd_state *s)
947 {
948 	clear_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
949 }
950 
951 static inline int
952 bat_locked(struct vhd_state *s)
953 {
954 	return test_vhd_flag(s->bat.status, VHD_FLAG_BAT_LOCKED);
955 }
956 
957 static inline void
958 init_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
959 {
960 	bm->blk    = 0;
961 	bm->seqno  = 0;
962 	bm->status = 0;
963 	init_tx(&bm->tx);
964 	clear_req_list(&bm->queue);
965 	clear_req_list(&bm->waiting);
966 	memset(bm->map, 0, vhd_sectors_to_bytes(s->bm_secs));
967 	memset(bm->shadow, 0, vhd_sectors_to_bytes(s->bm_secs));
968 	init_vhd_request(s, &bm->req);
969 }
970 
971 static inline struct vhd_bitmap *
972 get_bitmap(struct vhd_state *s, uint32_t block)
973 {
974 	int i;
975 	struct vhd_bitmap *bm;
976 
977 	for (i = 0; i < VHD_CACHE_SIZE; i++) {
978 		bm = s->bitmap[i];
979 		if (bm && bm->blk == block)
980 			return bm;
981 	}
982 
983 	return NULL;
984 }
985 
986 static inline void
987 lock_bitmap(struct vhd_bitmap *bm)
988 {
989 	set_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
990 }
991 
992 static inline void
993 unlock_bitmap(struct vhd_bitmap *bm)
994 {
995 	clear_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
996 }
997 
998 static inline int
999 bitmap_locked(struct vhd_bitmap *bm)
1000 {
1001 	return test_vhd_flag(bm->status, VHD_FLAG_BM_LOCKED);
1002 }
1003 
1004 static inline int
1005 bitmap_valid(struct vhd_bitmap *bm)
1006 {
1007 	return !test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
1008 }
1009 
1010 static inline int
1011 bitmap_in_use(struct vhd_bitmap *bm)
1012 {
1013 	return (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING)  ||
1014 		test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING) ||
1015 		test_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT) ||
1016 		bm->waiting.head || bm->tx.requests.head || bm->queue.head);
1017 }
1018 
1019 static inline int
1020 bitmap_full(struct vhd_state *s, struct vhd_bitmap *bm)
1021 {
1022 	int i, n;
1023 
1024 	n = s->spb >> 3;
1025 	for (i = 0; i < n; i++)
1026 		if (bm->map[i] != (char)0xFF)
1027 			return 0;
1028 
1029 	DBG(TLOG_DBG, "bitmap 0x%04x full\n", bm->blk);
1030 	return 1;
1031 }
1032 
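/*
 * Evicts the least-recently-used bitmap that is not locked and returns it,
 * or NULL if every cached bitmap is currently locked.
 */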
1033 static struct vhd_bitmap *
1034 remove_lru_bitmap(struct vhd_state *s)
1035 {
1036 	int i, idx = 0;
1037 	u64 seq = s->bm_lru;
1038 	struct vhd_bitmap *bm, *lru = NULL;
1039 
1040 	for (i = 0; i < VHD_CACHE_SIZE; i++) {
1041 		bm = s->bitmap[i];
1042 		if (bm && bm->seqno < seq && !bitmap_locked(bm)) {
1043 			idx = i;
1044 			lru = bm;
1045 			seq = lru->seqno;
1046 		}
1047 	}
1048 
1049 	if (lru) {
1050 		s->bitmap[idx] = NULL;
1051 		ASSERT(!bitmap_in_use(lru));
1052 	}
1053 
1054 	return  lru;
1055 }
1056 
1057 static int
1058 alloc_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap **bitmap, uint32_t blk)
1059 {
1060 	struct vhd_bitmap *bm;
1061 
1062 	*bitmap = NULL;
1063 
1064 	if (s->bm_free_count > 0) {
1065 		bm = s->bitmap_free[--s->bm_free_count];
1066 	} else {
1067 		bm = remove_lru_bitmap(s);
1068 		if (!bm)
1069 			return -EBUSY;
1070 	}
1071 
1072 	init_vhd_bitmap(s, bm);
1073 	bm->blk = blk;
1074 	*bitmap = bm;
1075 
1076 	return 0;
1077 }
1078 
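/*
 * Returns the next LRU sequence number.  When the counter reaches its cap
 * (0xffffffff), every cached bitmap's seqno is halved so that relative
 * ordering is preserved while the counter restarts.
 */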
1079 static inline uint64_t
1080 __bitmap_lru_seqno(struct vhd_state *s)
1081 {
1082 	int i;
1083 	struct vhd_bitmap *bm;
1084 
1085 	if (s->bm_lru == 0xffffffff) {
1086 		s->bm_lru = 0;
1087 		for (i = 0; i < VHD_CACHE_SIZE; i++) {
1088 			bm = s->bitmap[i];
1089 			if (bm) {
1090 				bm->seqno >>= 1;
1091 				if (bm->seqno > s->bm_lru)
1092 					s->bm_lru = bm->seqno;
1093 			}
1094 		}
1095 	}
1096 
1097 	return ++s->bm_lru;
1098 }
1099 
1100 static inline void
1101 touch_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1102 {
1103 	bm->seqno = __bitmap_lru_seqno(s);
1104 }
1105 
1106 static inline void
1107 install_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1108 {
1109 	int i;
1110 	for (i = 0; i < VHD_CACHE_SIZE; i++) {
1111 		if (!s->bitmap[i]) {
1112 			touch_bitmap(s, bm);
1113 			s->bitmap[i] = bm;
1114 			return;
1115 		}
1116 	}
1117 
1118 	ASSERT(0);
1119 }
1120 
1121 static inline void
1122 free_vhd_bitmap(struct vhd_state *s, struct vhd_bitmap *bm)
1123 {
1124 	int i;
1125 
1126 	for (i = 0; i < VHD_CACHE_SIZE; i++)
1127 		if (s->bitmap[i] == bm)
1128 			break;
1129 
1130 	ASSERT(!bitmap_locked(bm));
1131 	ASSERT(!bitmap_in_use(bm));
1132 	ASSERT(i < VHD_CACHE_SIZE);
1133 
1134 	s->bitmap[i] = NULL;
1135 	s->bitmap_free[s->bm_free_count++] = bm;
1136 }
1137 
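/*
 * Classifies a sector against the BAT, batmap and bitmap cache and returns
 * one of the VHD_BM_* codes above (or -EINVAL if the sector is out of
 * range); the queue_read/queue_write paths dispatch on this result.
 */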
1138 static int
1139 read_bitmap_cache(struct vhd_state *s, uint64_t sector, uint8_t op)
1140 {
1141 	u32 blk, sec;
1142 	struct vhd_bitmap *bm;
1143 
1144 	/* in fixed disks, every block is present */
1145 	if (s->vhd.footer.type == HD_TYPE_FIXED)
1146 		return VHD_BM_BIT_SET;
1147 
1148 	blk = sector / s->spb;
1149 	sec = sector % s->spb;
1150 
1151 	if (blk >= s->vhd.header.max_bat_size) {
1152 		DPRINTF("ERROR: sec %"PRIu64" out of range, op = %d\n",
1153 			sector, op);
1154 		return -EINVAL;
1155 	}
1156 
1157 	if (bat_entry(s, blk) == DD_BLK_UNUSED) {
1158 		if (op == VHD_OP_DATA_WRITE &&
1159 		    s->bat.pbw_blk != blk && bat_locked(s))
1160 			return VHD_BM_BAT_LOCKED;
1161 
1162 		return VHD_BM_BAT_CLEAR;
1163 	}
1164 
1165 	if (test_batmap(s, blk)) {
1166 		DBG(TLOG_DBG, "batmap set for 0x%04x\n", blk);
1167 		return VHD_BM_BIT_SET;
1168 	}
1169 
1170 	bm = get_bitmap(s, blk);
1171 	if (!bm)
1172 		return VHD_BM_NOT_CACHED;
1173 
1174 	/* bump lru count */
1175 	touch_bitmap(s, bm);
1176 
1177 	if (test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING))
1178 		return VHD_BM_READ_PENDING;
1179 
1180 	return ((vhd_bitmap_test(&s->vhd, bm->map, sec)) ?
1181 		VHD_BM_BIT_SET : VHD_BM_BIT_CLEAR);
1182 }
1183 
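/*
 * Counts how many of the next nr_secs sectors, starting at 'sector' and
 * staying within one block, have the given bitmap value, so that contiguous
 * runs can be serviced by a single request.
 */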
1184 static int
1185 read_bitmap_cache_span(struct vhd_state *s,
1186 		       uint64_t sector, int nr_secs, int value)
1187 {
1188 	int ret;
1189 	u32 blk, sec;
1190 	struct vhd_bitmap *bm;
1191 
1192 	/* in fixed disks, every block is present */
1193 	if (s->vhd.footer.type == HD_TYPE_FIXED)
1194 		return nr_secs;
1195 
1196 	sec = sector % s->spb;
1197 	blk = sector / s->spb;
1198 
1199 	if (test_batmap(s, blk))
1200 		return MIN(nr_secs, s->spb - sec);
1201 
1202 	bm  = get_bitmap(s, blk);
1203 
1204 	ASSERT(bm && bitmap_valid(bm));
1205 
1206 	for (ret = 0; sec < s->spb && ret < nr_secs; sec++, ret++)
1207 		if (vhd_bitmap_test(&s->vhd, bm->map, sec) != value)
1208 			break;
1209 
1210 	return ret;
1211 }
1212 
1213 static inline struct vhd_request *
1214 alloc_vhd_request(struct vhd_state *s)
1215 {
1216 	struct vhd_request *req = NULL;
1217 
1218 	if (s->vreq_free_count > 0) {
1219 		req = s->vreq_free[--s->vreq_free_count];
1220 		ASSERT(req->treq.secs == 0);
1221 		init_vhd_request(s, req);
1222 		return req;
1223 	}
1224 
1225 	return NULL;
1226 }
1227 
1228 static inline void
1229 free_vhd_request(struct vhd_state *s, struct vhd_request *req)
1230 {
1231 	memset(req, 0, sizeof(struct vhd_request));
1232 	s->vreq_free[s->vreq_free_count++] = req;
1233 }
1234 
1235 static inline void
1236 aio_read(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
1237 {
1238 	struct tiocb *tiocb = &req->tiocb;
1239 
1240 	td_prep_read(tiocb, s->vhd.fd, req->treq.buf,
1241 		     vhd_sectors_to_bytes(req->treq.secs),
1242 		     offset, vhd_complete, req);
1243 	td_queue_tiocb(s->driver, tiocb);
1244 
1245 	s->queued++;
1246 	s->reads++;
1247 	s->read_size += req->treq.secs;
1248 	TRACE(s);
1249 }
1250 
1251 static inline void
1252 aio_write(struct vhd_state *s, struct vhd_request *req, uint64_t offset)
1253 {
1254 	struct tiocb *tiocb = &req->tiocb;
1255 
1256 	td_prep_write(tiocb, s->vhd.fd, req->treq.buf,
1257 		      vhd_sectors_to_bytes(req->treq.secs),
1258 		      offset, vhd_complete, req);
1259 	td_queue_tiocb(s->driver, tiocb);
1260 
1261 	s->queued++;
1262 	s->writes++;
1263 	s->write_size += req->treq.secs;
1264 	TRACE(s);
1265 }
1266 
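/*
 * Records blk as the pending BAT write and computes its file offset, padded
 * so the data region begins on a page boundary.  Returns the old end of the
 * data area, which the caller passes to schedule_zero_bm_write() as the
 * start of the region to zero.
 */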
1267 static inline uint64_t
1268 reserve_new_block(struct vhd_state *s, uint32_t blk)
1269 {
1270 	int gap = 0;
1271 
1272 	ASSERT(!test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));
1273 
1274 	/* data region of segment should begin on page boundary */
1275 	if ((s->next_db + s->bm_secs) % s->spp)
1276 		gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
1277 
1278 	s->bat.pbw_blk    = blk;
1279 	s->bat.pbw_offset = s->next_db + gap;
1280 
1281 	return s->next_db;
1282 }
1283 
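/*
 * Builds the 512-byte BAT sector containing the pending block's entry,
 * patches in the new offset, converts it to big-endian and queues the
 * write at the matching offset within the on-disk table.
 */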
1284 static int
1285 schedule_bat_write(struct vhd_state *s)
1286 {
1287 	int i;
1288 	u32 blk;
1289 	char *buf;
1290 	u64 offset;
1291 	struct vhd_request *req;
1292 
1293 	ASSERT(bat_locked(s));
1294 
1295 	req = &s->bat.req;
1296 	buf = s->bat.bat_buf;
1297 	blk = s->bat.pbw_blk;
1298 
1299 	init_vhd_request(s, req);
1300 	memcpy(buf, &bat_entry(s, blk - (blk % 128)), 512);
1301 
1302 	((u32 *)buf)[blk % 128] = s->bat.pbw_offset;
1303 
1304 	for (i = 0; i < 128; i++)
1305 		BE32_OUT(&((u32 *)buf)[i]);
1306 
1307 	offset         = s->vhd.header.table_offset + (blk - (blk % 128)) * 4;
1308 	req->treq.secs = 1;
1309 	req->treq.buf  = buf;
1310 	req->op        = VHD_OP_BAT_WRITE;
1311 	req->next      = NULL;
1312 
1313 	aio_write(s, req, offset);
1314 	set_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED);
1315 
1316 	DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64", "
1317 	    "table_offset: 0x%08"PRIx64"\n", blk, s->bat.pbw_offset, offset);
1318 
1319 	return 0;
1320 }
1321 
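/*
 * Queues a write of zeros covering the new block's bitmap (plus any
 * alignment gap before it) as part of the bitmap transaction, so the
 * bitmap is initialized on disk before the BAT is allowed to point at
 * the block.
 */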
1322 static void
1323 schedule_zero_bm_write(struct vhd_state *s,
1324 		       struct vhd_bitmap *bm, uint64_t lb_end)
1325 {
1326 	uint64_t offset;
1327 	struct vhd_request *req = &s->bat.zero_req;
1328 
1329 	init_vhd_request(s, req);
1330 
1331 	offset         = vhd_sectors_to_bytes(lb_end);
1332 	req->op        = VHD_OP_ZERO_BM_WRITE;
1333 	req->treq.sec  = s->bat.pbw_blk * s->spb;
1334 	req->treq.secs = (s->bat.pbw_offset - lb_end) + s->bm_secs;
1335 	req->treq.buf  = vhd_zeros(vhd_sectors_to_bytes(req->treq.secs));
1336 	req->next      = NULL;
1337 
1338 	DBG(TLOG_DBG, "blk: 0x%04x, writing zero bitmap at 0x%08"PRIx64"\n",
1339 	    s->bat.pbw_blk, offset);
1340 
1341 	lock_bitmap(bm);
1342 	add_to_transaction(&bm->tx, req);
1343 	aio_write(s, req, offset);
1344 }
1345 
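/*
 * Block allocation without preallocation: installs an empty bitmap in the
 * cache, locks the BAT and schedules the zero-bitmap write.  The BAT write
 * itself is deferred to finish_zero_bm_write().
 */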
1346 static int
1347 update_bat(struct vhd_state *s, uint32_t blk)
1348 {
1349 	int err;
1350 	uint64_t lb_end;
1351 	struct vhd_bitmap *bm;
1352 
1353 	ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
1354 
1355 	if (bat_locked(s)) {
1356 		ASSERT(s->bat.pbw_blk == blk);
1357 		return 0;
1358 	}
1359 
1360 	/* empty bitmap could already be in
1361 	 * cache if earlier bat update failed */
1362 	bm = get_bitmap(s, blk);
1363 	if (!bm) {
1364 		/* install empty bitmap in cache */
1365 		err = alloc_vhd_bitmap(s, &bm, blk);
1366 		if (err)
1367 			return err;
1368 
1369 		install_bitmap(s, bm);
1370 	}
1371 
1372 	lock_bat(s);
1373 	lb_end = reserve_new_block(s, blk);
1374 	schedule_zero_bm_write(s, bm, lb_end);
1375 	set_vhd_flag(bm->tx.status, VHD_FLAG_TX_UPDATE_BAT);
1376 
1377 	return 0;
1378 }
1379 
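/*
 * Block allocation with preallocation: synchronously writes zeros over the
 * whole new block (bitmap, data and alignment gap), then schedules the BAT
 * update as part of the bitmap transaction.
 */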
1380 static int
1381 allocate_block(struct vhd_state *s, uint32_t blk)
1382 {
1383 	char *zeros;
1384 	int err, gap;
1385 	uint64_t offset, size;
1386 	struct vhd_bitmap *bm;
1387 
1388 	ASSERT(bat_entry(s, blk) == DD_BLK_UNUSED);
1389 
1390 	if (bat_locked(s)) {
1391 		ASSERT(s->bat.pbw_blk == blk);
1392 		if (s->bat.req.error)
1393 			return -EBUSY;
1394 		return 0;
1395 	}
1396 
1397 	gap            = 0;
1398 	s->bat.pbw_blk = blk;
1399 	offset         = vhd_sectors_to_bytes(s->next_db);
1400 
1401 	/* data region of segment should begin on page boundary */
1402 	if ((s->next_db + s->bm_secs) % s->spp) {
1403 		gap = (s->spp - ((s->next_db + s->bm_secs) % s->spp));
1404 		s->next_db += gap;
1405 	}
1406 
1407 	s->bat.pbw_offset = s->next_db;
1408 
1409 	DBG(TLOG_DBG, "blk: 0x%04x, pbwo: 0x%08"PRIx64"\n",
1410 	    blk, s->bat.pbw_offset);
1411 
1412 	if (lseek(s->vhd.fd, offset, SEEK_SET) == (off_t)-1) {
1413 		ERR(errno, "lseek failed\n");
1414 		return -errno;
1415 	}
1416 
1417 	size = vhd_sectors_to_bytes(s->spb + s->bm_secs + gap);
1418 	err  = write(s->vhd.fd, vhd_zeros(size), size);
1419 	if (err != size) {
1420 		err = (err == -1 ? -errno : -EIO);
1421 		ERR(err, "write failed");
1422 		return err;
1423 	}
1424 
1425 	/* empty bitmap could already be in
1426 	 * cache if earlier bat update failed */
1427 	bm = get_bitmap(s, blk);
1428 	if (!bm) {
1429 		/* install empty bitmap in cache */
1430 		err = alloc_vhd_bitmap(s, &bm, blk);
1431 		if (err)
1432 			return err;
1433 
1434 		install_bitmap(s, bm);
1435 	}
1436 
1437 	lock_bat(s);
1438 	lock_bitmap(bm);
1439 	schedule_bat_write(s);
1440 	add_to_transaction(&bm->tx, &s->bat.req);
1441 
1442 	return 0;
1443 }
1444 
1445 static int
1446 schedule_data_read(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
1447 {
1448 	u64 offset;
1449 	u32 blk = 0, sec = 0;
1450 	struct vhd_bitmap  *bm;
1451 	struct vhd_request *req;
1452 
1453 	if (s->vhd.footer.type == HD_TYPE_FIXED) {
1454 		offset = vhd_sectors_to_bytes(treq.sec);
1455 		goto make_request;
1456 	}
1457 
1458 	blk    = treq.sec / s->spb;
1459 	sec    = treq.sec % s->spb;
1460 	bm     = get_bitmap(s, blk);
1461 	offset = bat_entry(s, blk);
1462 
1463 	ASSERT(offset != DD_BLK_UNUSED);
1464 	ASSERT(test_batmap(s, blk) || (bm && bitmap_valid(bm)));
1465 
1466 	offset += s->bm_secs + sec;
1467 	offset  = vhd_sectors_to_bytes(offset);
1468 
1469  make_request:
1470 	req = alloc_vhd_request(s);
1471 	if (!req)
1472 		return -EBUSY;
1473 
1474 	req->treq  = treq;
1475 	req->flags = flags;
1476 	req->op    = VHD_OP_DATA_READ;
1477 	req->next  = NULL;
1478 
1479 	aio_read(s, req, offset);
1480 
1481 	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
1482 	    "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x, buf: %p\n",
1483 	    s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags,
1484 	    treq.buf);
1485 
1486 	return 0;
1487 }
1488 
1489 static int
1490 schedule_data_write(struct vhd_state *s, td_request_t treq, vhd_flag_t flags)
1491 {
1492 	int err;
1493 	u64 offset;
1494 	u32 blk = 0, sec = 0;
1495 	struct vhd_bitmap  *bm = NULL;
1496 	struct vhd_request *req;
1497 
1498 	if (s->vhd.footer.type == HD_TYPE_FIXED) {
1499 		offset = vhd_sectors_to_bytes(treq.sec);
1500 		goto make_request;
1501 	}
1502 
1503 	blk    = treq.sec / s->spb;
1504 	sec    = treq.sec % s->spb;
1505 	offset = bat_entry(s, blk);
1506 
1507 	if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BAT)) {
1508 		if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE))
1509 			err = allocate_block(s, blk);
1510 		else
1511 			err = update_bat(s, blk);
1512 
1513 		if (err)
1514 			return err;
1515 
1516 		offset = s->bat.pbw_offset;
1517 	}
1518 
1519 	offset += s->bm_secs + sec;
1520 	offset  = vhd_sectors_to_bytes(offset);
1521 
1522  make_request:
1523 	req = alloc_vhd_request(s);
1524 	if (!req)
1525 		return -EBUSY;
1526 
1527 	req->treq  = treq;
1528 	req->flags = flags;
1529 	req->op    = VHD_OP_DATA_WRITE;
1530 	req->next  = NULL;
1531 
1532 	if (test_vhd_flag(flags, VHD_FLAG_REQ_UPDATE_BITMAP)) {
1533 		bm = get_bitmap(s, blk);
1534 		ASSERT(bm && bitmap_valid(bm));
1535 		lock_bitmap(bm);
1536 
1537 		if (bm->tx.closed) {
1538 			add_to_tail(&bm->queue, req);
1539 			set_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED);
1540 		} else
1541 			add_to_transaction(&bm->tx, req);
1542 	}
1543 
1544 	aio_write(s, req, offset);
1545 
1546 	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, sec: 0x%04x, "
1547 	    "nr_secs: 0x%04x, offset: 0x%08"PRIx64", flags: 0x%08x\n",
1548 	    s->vhd.file, treq.sec, blk, sec, treq.secs, offset, req->flags);
1549 
1550 	return 0;
1551 }
1552 
1553 static int
1554 schedule_bitmap_read(struct vhd_state *s, uint32_t blk)
1555 {
1556 	int err;
1557 	u64 offset;
1558 	struct vhd_bitmap  *bm;
1559 	struct vhd_request *req = NULL;
1560 
1561 	ASSERT(vhd_type_dynamic(&s->vhd));
1562 
1563 	offset = bat_entry(s, blk);
1564 
1565 	ASSERT(offset != DD_BLK_UNUSED);
1566 	ASSERT(!get_bitmap(s, blk));
1567 
1568 	offset = vhd_sectors_to_bytes(offset);
1569 
1570 	err = alloc_vhd_bitmap(s, &bm, blk);
1571 	if (err)
1572 		return err;
1573 
1574 	req = &bm->req;
1575 	init_vhd_request(s, req);
1576 
1577 	req->treq.sec  = blk * s->spb;
1578 	req->treq.secs = s->bm_secs;
1579 	req->treq.buf  = bm->map;
1580 	req->treq.cb   = NULL;
1581 	req->op        = VHD_OP_BITMAP_READ;
1582 	req->next      = NULL;
1583 
1584 	aio_read(s, req, offset);
1585 	lock_bitmap(bm);
1586 	install_bitmap(s, bm);
1587 	set_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);
1588 
1589 	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x, nr_secs: 0x%04x, "
1590 	    "offset: 0x%08"PRIx64"\n", s->vhd.file, req->treq.sec, blk,
1591 	    req->treq.secs, offset);
1592 
1593 	return 0;
1594 }
1595 
1596 static void
1597 schedule_bitmap_write(struct vhd_state *s, uint32_t blk)
1598 {
1599 	u64 offset;
1600 	struct vhd_bitmap  *bm;
1601 	struct vhd_request *req;
1602 
1603 	bm     = get_bitmap(s, blk);
1604 	offset = bat_entry(s, blk);
1605 
1606 	ASSERT(vhd_type_dynamic(&s->vhd));
1607 	ASSERT(bm && bitmap_valid(bm) &&
1608 	       !test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));
1609 
1610 	if (offset == DD_BLK_UNUSED) {
1611 		ASSERT(bat_locked(s) && s->bat.pbw_blk == blk);
1612 		offset = s->bat.pbw_offset;
1613 	}
1614 
1615 	offset = vhd_sectors_to_bytes(offset);
1616 
1617 	req = &bm->req;
1618 	init_vhd_request(s, req);
1619 
1620 	req->treq.sec  = blk * s->spb;
1621 	req->treq.secs = s->bm_secs;
1622 	req->treq.buf  = bm->shadow;
1623 	req->treq.cb   = NULL;
1624 	req->op        = VHD_OP_BITMAP_WRITE;
1625 	req->next      = NULL;
1626 
1627 	aio_write(s, req, offset);
1628 	lock_bitmap(bm);
1629 	touch_bitmap(s, bm);     /* bump lru count */
1630 	set_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);
1631 
1632 	DBG(TLOG_DBG, "%s: blk: 0x%04x, sec: 0x%08"PRIx64", nr_secs: 0x%04x, "
1633 	    "offset: 0x%"PRIx64"\n", s->vhd.file, blk, req->treq.sec,
1634 	    req->treq.secs, offset);
1635 }
1636 
1637 /*
1638  * queued requests will be submitted once the bitmap
1639  * describing them is read and the requests are validated.
1640  */
1641 static int
1642 __vhd_queue_request(struct vhd_state *s, uint8_t op, td_request_t treq)
1643 {
1644 	u32 blk;
1645 	struct vhd_bitmap  *bm;
1646 	struct vhd_request *req;
1647 
1648 	ASSERT(vhd_type_dynamic(&s->vhd));
1649 
1650 	blk = treq.sec / s->spb;
1651 	bm  = get_bitmap(s, blk);
1652 
1653 	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));
1654 
1655 	req = alloc_vhd_request(s);
1656 	if (!req)
1657 		return -EBUSY;
1658 
1659 	req->treq = treq;
1660 	req->op   = op;
1661 	req->next = NULL;
1662 
1663 	add_to_tail(&bm->waiting, req);
1664 	lock_bitmap(bm);
1665 
1666 	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", blk: 0x%04x nr_secs: 0x%04x, "
1667 	    "op: %u\n", s->vhd.file, treq.sec, blk, treq.secs, op);
1668 
1669 	TRACE(s);
1670 	return 0;
1671 }
1672 
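/*
 * Read entry point: splits the incoming request at block and bitmap-run
 * boundaries and dispatches each piece according to read_bitmap_cache().
 */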
1673 static void
1674 vhd_queue_read(td_driver_t *driver, td_request_t treq)
1675 {
1676 	struct vhd_state *s = (struct vhd_state *)driver->data;
1677 
1678 	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x (seg: %d)\n",
1679 	    s->vhd.file, treq.sec, treq.secs, treq.sidx);
1680 
1681 	while (treq.secs) {
1682 		int err;
1683 		td_request_t clone;
1684 
1685 		err   = 0;
1686 		clone = treq;
1687 
1688 		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_READ)) {
1689 		case -EINVAL:
1690 			err = -EINVAL;
1691 			goto fail;
1692 
1693 		case VHD_BM_BAT_CLEAR:
1694 			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1695 			td_forward_request(clone);
1696 			break;
1697 
1698 		case VHD_BM_BIT_CLEAR:
1699 			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
1700 			td_forward_request(clone);
1701 			break;
1702 
1703 		case VHD_BM_BIT_SET:
1704 			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
1705 			err = schedule_data_read(s, clone, 0);
1706 			if (err)
1707 				goto fail;
1708 			break;
1709 
1710 		case VHD_BM_NOT_CACHED:
1711 			err = schedule_bitmap_read(s, clone.sec / s->spb);
1712 			if (err)
1713 				goto fail;
1714 
1715 			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1716 			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
1717 			if (err)
1718 				goto fail;
1719 			break;
1720 
1721 		case VHD_BM_READ_PENDING:
1722 			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1723 			err = __vhd_queue_request(s, VHD_OP_DATA_READ, clone);
1724 			if (err)
1725 				goto fail;
1726 			break;
1727 
1728 		case VHD_BM_BAT_LOCKED:
1729 		default:
1730 			ASSERT(0);
1731 			break;
1732 		}
1733 
1734 		treq.sec  += clone.secs;
1735 		treq.secs -= clone.secs;
1736 		treq.buf  += vhd_sectors_to_bytes(clone.secs);
1737 		continue;
1738 
1739 	fail:
1740 		clone.secs = treq.secs;
1741 		td_complete_request(clone, err);
1742 		break;
1743 	}
1744 }
1745 
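/*
 * Write entry point: as with reads, the request is split at block
 * boundaries; writes to unallocated blocks trigger BAT and bitmap updates
 * via schedule_data_write().
 */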
1746 static void
1747 vhd_queue_write(td_driver_t *driver, td_request_t treq)
1748 {
1749 	struct vhd_state *s = (struct vhd_state *)driver->data;
1750 
1751 	DBG(TLOG_DBG, "%s: lsec: 0x%08"PRIx64", secs: 0x%04x, (seg: %d)\n",
1752 	    s->vhd.file, treq.sec, treq.secs, treq.sidx);
1753 
1754 	while (treq.secs) {
1755 		int err;
1756 		uint8_t flags;
1757 		td_request_t clone;
1758 
1759 		err   = 0;
1760 		flags = 0;
1761 		clone = treq;
1762 
1763 		switch (read_bitmap_cache(s, clone.sec, VHD_OP_DATA_WRITE)) {
1764 		case -EINVAL:
1765 			err = -EINVAL;
1766 			goto fail;
1767 
1768 		case VHD_BM_BAT_LOCKED:
1769 			err = -EBUSY;
1770 			clone.blocked = 1;
1771 			goto fail;
1772 
1773 		case VHD_BM_BAT_CLEAR:
1774 			flags      = (VHD_FLAG_REQ_UPDATE_BAT |
1775 				      VHD_FLAG_REQ_UPDATE_BITMAP);
1776 			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1777 			err        = schedule_data_write(s, clone, flags);
1778 			if (err)
1779 				goto fail;
1780 			break;
1781 
1782 		case VHD_BM_BIT_CLEAR:
1783 			flags      = VHD_FLAG_REQ_UPDATE_BITMAP;
1784 			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 0);
1785 			err        = schedule_data_write(s, clone, flags);
1786 			if (err)
1787 				goto fail;
1788 			break;
1789 
1790 		case VHD_BM_BIT_SET:
1791 			clone.secs = read_bitmap_cache_span(s, clone.sec, clone.secs, 1);
1792 			err = schedule_data_write(s, clone, 0);
1793 			if (err)
1794 				goto fail;
1795 			break;
1796 
1797 		case VHD_BM_NOT_CACHED:
1798 			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1799 			err = schedule_bitmap_read(s, clone.sec / s->spb);
1800 			if (err)
1801 				goto fail;
1802 
1803 			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
1804 			if (err)
1805 				goto fail;
1806 			break;
1807 
1808 		case VHD_BM_READ_PENDING:
1809 			clone.secs = MIN(clone.secs, s->spb - (clone.sec % s->spb));
1810 			err = __vhd_queue_request(s, VHD_OP_DATA_WRITE, clone);
1811 			if (err)
1812 				goto fail;
1813 			break;
1814 
1815 		default:
1816 			ASSERT(0);
1817 			break;
1818 		}
1819 
1820 		treq.sec  += clone.secs;
1821 		treq.secs -= clone.secs;
1822 		treq.buf  += vhd_sectors_to_bytes(clone.secs);
1823 		continue;
1824 
1825 	fail:
1826 		clone.secs = treq.secs;
1827 		td_complete_request(clone, err);
1828 		break;
1829 	}
1830 }
1831 
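/*
 * Completes a chain of finished requests back to tapdisk and returns them
 * to the free list.
 */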
1832 static inline void
1833 signal_completion(struct vhd_request *list, int error)
1834 {
1835 	struct vhd_state *s;
1836 	struct vhd_request *r, *next;
1837 
1838 	if (!list)
1839 		return;
1840 
1841 	r = list;
1842 	s = list->state;
1843 
1844 	while (r) {
1845 		int err;
1846 
1847 		err  = (error ? error : r->error);
1848 		next = r->next;
1849 		td_complete_request(r->treq, err);
1850 		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
1851 		    "err: %d\n", r->treq.sec, r->treq.sec / s->spb, err);
1852 		free_vhd_request(s, r);
1853 		r    = next;
1854 
1855 		s->returned++;
1856 		TRACE(s);
1857 	}
1858 }
1859 
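/*
 * Moves writes that were queued while the previous transaction was closing
 * into a fresh transaction, applying their bits to the shadow bitmap if
 * they have already finished.
 */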
1860 static void
1861 start_new_bitmap_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
1862 {
1863 	int i, error = 0;
1864 	struct vhd_transaction *tx;
1865 	struct vhd_request *r, *next;
1866 
1867 	if (!bm->queue.head)
1868 		return;
1869 
1870 	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
1871 
1872 	r  = bm->queue.head;
1873 	tx = &bm->tx;
1874 	clear_req_list(&bm->queue);
1875 
1876 	if (r && bat_entry(s, bm->blk) == DD_BLK_UNUSED)
1877 		tx->error = -EIO;
1878 
1879 	while (r) {
1880 		next    = r->next;
1881 		r->next = NULL;
1882 		clear_vhd_flag(r->flags, VHD_FLAG_REQ_QUEUED);
1883 
1884 		add_to_transaction(tx, r);
1885 		if (test_vhd_flag(r->flags, VHD_FLAG_REQ_FINISHED)) {
1886 			tx->finished++;
1887 			if (!r->error) {
1888 				u32 sec = r->treq.sec % s->spb;
1889 				for (i = 0; i < r->treq.secs; i++)
1890 					vhd_bitmap_set(&s->vhd,
1891 						       bm->shadow, sec + i);
1892 			}
1893 		}
1894 		r = next;
1895 	}
1896 
1897 	/* perhaps all the queued writes already completed? */
1898 	if (tx->started && transaction_completed(tx))
1899 		finish_data_transaction(s, bm);
1900 }
1901 
1902 static void
1903 finish_bat_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
1904 {
1905 	struct vhd_transaction *tx = &bm->tx;
1906 
1907 	if (!bat_locked(s))
1908 		return;
1909 
1910 	if (s->bat.pbw_blk != bm->blk)
1911 		return;
1912 
1913 	if (!s->bat.req.error)
1914 		goto release;
1915 
1916 	if (!test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE))
1917 		goto release;
1918 
1919 	tx->closed = 1;
1920 	return;
1921 
1922  release:
1923 	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);
1924 	unlock_bat(s);
1925 	init_bat(s);
1926 }
1927 
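/*
 * Commits the shadow bitmap to the in-memory map on success (or restores
 * the shadow from the map on error), signals the transaction's data writes
 * and starts any queued follow-on transaction.  If a BAT write is still
 * outstanding, completion is deferred until finish_bat_write().
 */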
1928 static void
1929 finish_bitmap_transaction(struct vhd_state *s,
1930 			  struct vhd_bitmap *bm, int error)
1931 {
1932 	int map_size;
1933 	struct vhd_transaction *tx = &bm->tx;
1934 
1935 	DBG(TLOG_DBG, "blk: 0x%04x, err: %d\n", bm->blk, error);
1936 	tx->error = (tx->error ? tx->error : error);
1937 	map_size  = vhd_sectors_to_bytes(s->bm_secs);
1938 
1939 	if (!test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
1940 		if (test_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT)) {
1941 			/* still waiting for bat write */
1942 			ASSERT(bm->blk == s->bat.pbw_blk);
1943 			ASSERT(test_vhd_flag(s->bat.status,
1944 					     VHD_FLAG_BAT_WRITE_STARTED));
1945 			s->bat.req.tx = tx;
1946 			return;
1947 		}
1948 	}
1949 
1950 	if (tx->error) {
1951 		/* undo changes to shadow */
1952 		memcpy(bm->shadow, bm->map, map_size);
1953 	} else {
1954 		/* complete atomic write */
1955 		memcpy(bm->map, bm->shadow, map_size);
1956 		if (!test_batmap(s, bm->blk) && bitmap_full(s, bm))
1957 			set_batmap(s, bm->blk);
1958 	}
1959 
1960 	/* transaction done; signal completions */
1961 	signal_completion(tx->requests.head, tx->error);
1962 	init_tx(tx);
1963 	start_new_bitmap_transaction(s, bm);
1964 
1965 	if (!bitmap_in_use(bm))
1966 		unlock_bitmap(bm);
1967 
1968 	finish_bat_transaction(s, bm);
1969 }
1970 
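/*
 * Called once every request in a transaction has returned: mark the
 * transaction closed and either schedule the bitmap write that makes
 * the new data reachable, or go straight to completing the
 * transaction if it has already failed.
 */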
static void
finish_data_transaction(struct vhd_state *s, struct vhd_bitmap *bm)
{
	struct vhd_transaction *tx = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x\n", bm->blk);

	tx->closed = 1;

	if (!tx->error)
		return schedule_bitmap_write(s, bm->blk);

	return finish_bitmap_transaction(s, bm, 0);
}

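/*
 * Completion handler for a BAT update.  On success the in-memory BAT
 * entry for the newly allocated block is installed and next_db is
 * advanced past the block's bitmap and data sectors.  For
 * preallocated images the BAT write is itself part of the data
 * transaction; otherwise it may complete a bitmap transaction that
 * finish_bitmap_transaction deferred while waiting for it.
 */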
static void
finish_bat_write(struct vhd_request *req)
{
	struct vhd_bitmap *bm;
	struct vhd_transaction *tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	bm = get_bitmap(s, s->bat.pbw_blk);

	DBG(TLOG_DBG, "blk 0x%04x, pbwo: 0x%08"PRIx64", err %d\n",
	    s->bat.pbw_blk, s->bat.pbw_offset, req->error);
	ASSERT(bm && bitmap_valid(bm));
	ASSERT(bat_locked(s) &&
	       test_vhd_flag(s->bat.status, VHD_FLAG_BAT_WRITE_STARTED));

	tx = &bm->tx;
	ASSERT(test_vhd_flag(tx->status, VHD_FLAG_TX_LIVE));

	if (!req->error) {
		bat_entry(s, s->bat.pbw_blk) = s->bat.pbw_offset;
		s->next_db = s->bat.pbw_offset + s->spb + s->bm_secs;
	} else
		tx->error = req->error;

	if (test_vhd_flag(s->flags, VHD_FLAG_OPEN_PREALLOCATE)) {
		tx->finished++;
		remove_from_req_list(&tx->requests, req);
		if (transaction_completed(tx))
			finish_data_transaction(s, bm);
	} else {
		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
		if (s->bat.req.tx)
			finish_bitmap_transaction(s, bm, req->error);
	}

	finish_bat_transaction(s, bm);
}

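/*
 * Completion handler for the write that zeroes a newly allocated
 * block's bitmap on disk.  Only after this succeeds is the BAT write
 * scheduled; on failure the pending allocation is abandoned and the
 * error is recorded against the transaction.
 */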
static void
finish_zero_bm_write(struct vhd_request *req)
{
	u32 blk;
	struct vhd_bitmap *bm;
	struct vhd_transaction *tx = req->tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	blk = req->treq.sec / s->spb;
	bm  = get_bitmap(s, blk);

	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
	ASSERT(bat_locked(s));
	ASSERT(s->bat.pbw_blk == blk);
	ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));

	tx->finished++;
	remove_from_req_list(&tx->requests, req);

	if (req->error) {
		unlock_bat(s);
		init_bat(s);
		tx->error = req->error;
		clear_vhd_flag(tx->status, VHD_FLAG_TX_UPDATE_BAT);
	} else
		schedule_bat_write(s);

	if (transaction_completed(tx))
		finish_data_transaction(s, bm);
}

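/*
 * Completion handler for a bitmap read.  On success the bitmap is
 * mirrored into its shadow copy and every request parked on it is
 * re-queued through the normal read/write paths; on failure the
 * waiters are completed with the error and the cache slot is freed.
 *
 * The owning block is recovered from the request's logical sector as
 * blk = treq.sec / spb; e.g. with 2MB blocks (spb == 4096), logical
 * sector 0x2345 lies in block 2 (illustrative values only).
 */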
static void
finish_bitmap_read(struct vhd_request *req)
{
	u32 blk;
	struct vhd_bitmap  *bm;
	struct vhd_request *r, *next;
	struct vhd_state   *s = req->state;

	s->returned++;
	TRACE(s);

	blk = req->treq.sec / s->spb;
	bm  = get_bitmap(s, blk);

	DBG(TLOG_DBG, "blk: 0x%04x\n", blk);
	ASSERT(bm && test_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING));

	r = bm->waiting.head;
	clear_req_list(&bm->waiting);
	clear_vhd_flag(bm->status, VHD_FLAG_BM_READ_PENDING);

	if (!req->error) {
		memcpy(bm->shadow, bm->map, vhd_sectors_to_bytes(s->bm_secs));

		while (r) {
			struct vhd_request tmp;

			tmp  = *r;
			next =  r->next;
			free_vhd_request(s, r);

			ASSERT(tmp.op == VHD_OP_DATA_READ ||
			       tmp.op == VHD_OP_DATA_WRITE);

			if (tmp.op == VHD_OP_DATA_READ)
				vhd_queue_read(s->driver, tmp.treq);
			else if (tmp.op == VHD_OP_DATA_WRITE)
				vhd_queue_write(s->driver, tmp.treq);

			r = next;
		}
	} else {
		int err = req->error;
		unlock_bitmap(bm);
		free_vhd_bitmap(s, bm);
		return signal_completion(r, err);
	}

	if (!bitmap_in_use(bm))
		unlock_bitmap(bm);
}

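/*
 * Completion handler for a bitmap write.  The transaction is expected
 * to be closed already, so this just clears the write-pending flag
 * and lets finish_bitmap_transaction commit or roll back the shadow.
 */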
static void
finish_bitmap_write(struct vhd_request *req)
{
	u32 blk;
	struct vhd_bitmap  *bm;
	struct vhd_transaction *tx;
	struct vhd_state *s = req->state;

	s->returned++;
	TRACE(s);

	blk = req->treq.sec / s->spb;
	bm  = get_bitmap(s, blk);
	tx  = &bm->tx;

	DBG(TLOG_DBG, "blk: 0x%04x, started: %d, finished: %d\n",
	    blk, tx->started, tx->finished);
	ASSERT(tx->closed);
	ASSERT(bm && bitmap_valid(bm));
	ASSERT(test_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING));

	clear_vhd_flag(bm->status, VHD_FLAG_BM_WRITE_PENDING);

	finish_bitmap_transaction(s, bm, req->error);
}

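/* Data reads complete directly; there is no transaction to settle. */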
static void
finish_data_read(struct vhd_request *req)
{
	struct vhd_state *s = req->state;

	DBG(TLOG_DBG, "lsec 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
	    req->treq.sec, req->treq.sec / s->spb);
	signal_completion(req, 0);
}

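/*
 * Completion handler for a data write.  Writes belonging to a
 * transaction record their sectors in the shadow bitmap and may close
 * the transaction; writes to blocks that were already allocated and
 * marked carry no transaction and complete immediately.  Writes still
 * queued behind another transaction are left for
 * start_new_bitmap_transaction to pick up.
 */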
static void
finish_data_write(struct vhd_request *req)
{
	int i;
	struct vhd_transaction *tx = req->tx;
	struct vhd_state *s = (struct vhd_state *)req->state;

	set_vhd_flag(req->flags, VHD_FLAG_REQ_FINISHED);

	if (tx) {
		u32 blk, sec;
		struct vhd_bitmap *bm;

		blk = req->treq.sec / s->spb;
		sec = req->treq.sec % s->spb;
		bm  = get_bitmap(s, blk);

		ASSERT(bm && bitmap_valid(bm) && bitmap_locked(bm));

		tx->finished++;

		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64", "
		    "tx->started: %d, tx->finished: %d\n", req->treq.sec,
		    req->treq.sec / s->spb, tx->started, tx->finished);

		if (!req->error)
			for (i = 0; i < req->treq.secs; i++)
				vhd_bitmap_set(&s->vhd, bm->shadow, sec + i);

		if (transaction_completed(tx))
			finish_data_transaction(s, bm);

	} else if (!test_vhd_flag(req->flags, VHD_FLAG_REQ_QUEUED)) {
		ASSERT(!req->next);
		DBG(TLOG_DBG, "lsec: 0x%08"PRIx64", blk: 0x%04"PRIx64"\n",
		    req->treq.sec, req->treq.sec / s->spb);
		signal_completion(req, 0);
	}
}

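/*
 * Common completion callback for every VHD tiocb.  The per-request op
 * code selects the handler that settles the request against the
 * transaction machinery above; errors are logged with the offending
 * sector, block and BAT entry before being handed on.
 */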
void
vhd_complete(void *arg, struct tiocb *tiocb, int err)
{
	struct vhd_request *req = (struct vhd_request *)arg;
	struct vhd_state *s = req->state;
	struct iocb *io = &tiocb->iocb;

	s->completed++;
	TRACE(s);

	req->error = err;

	if (req->error)
		ERR(req->error, "%s: op: %u, lsec: %"PRIu64", secs: %u, "
		    "nbytes: %lu, blk: %"PRIu64", blk_offset: %u",
		    s->vhd.file, req->op, req->treq.sec, req->treq.secs,
		    io->u.c.nbytes, req->treq.sec / s->spb,
		    bat_entry(s, req->treq.sec / s->spb));

	switch (req->op) {
	case VHD_OP_DATA_READ:
		finish_data_read(req);
		break;

	case VHD_OP_DATA_WRITE:
		finish_data_write(req);
		break;

	case VHD_OP_BITMAP_READ:
		finish_bitmap_read(req);
		break;

	case VHD_OP_BITMAP_WRITE:
		finish_bitmap_write(req);
		break;

	case VHD_OP_ZERO_BM_WRITE:
		finish_zero_bm_write(req);
		break;

	case VHD_OP_BAT_WRITE:
		finish_bat_write(req);
		break;

	default:
		ASSERT(0);
		break;
	}
}

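/*
 * Dump driver state (queue counters, allocated requests, the bitmap
 * cache and any pending BAT write) to the log for debugging.
 */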
void
vhd_debug(td_driver_t *driver)
{
	int i;
	struct vhd_state *s = (struct vhd_state *)driver->data;

	DBG(TLOG_WARN, "%s: QUEUED: 0x%08"PRIx64", COMPLETED: 0x%08"PRIx64", "
	    "RETURNED: 0x%08"PRIx64"\n", s->vhd.file, s->queued, s->completed,
	    s->returned);
	DBG(TLOG_WARN, "WRITES: 0x%08"PRIx64", AVG_WRITE_SIZE: %f\n",
	    s->writes, (s->writes ? ((float)s->write_size / s->writes) : 0.0));
	DBG(TLOG_WARN, "READS: 0x%08"PRIx64", AVG_READ_SIZE: %f\n",
	    s->reads, (s->reads ? ((float)s->read_size / s->reads) : 0.0));

	DBG(TLOG_WARN, "ALLOCATED REQUESTS: (%lu total)\n", VHD_REQS_DATA);
	for (i = 0; i < VHD_REQS_DATA; i++) {
		struct vhd_request *r = &s->vreq_list[i];
		td_request_t *t       = &r->treq;
		if (t->secs)
			DBG(TLOG_WARN, "%d: id: 0x%04"PRIx64", err: %d, op: %d,"
			    " lsec: 0x%08"PRIx64", flags: %d, this: %p, "
			    "next: %p, tx: %p\n", i, t->id, r->error, r->op,
			    t->sec, r->flags, r, r->next, r->tx);
	}

	DBG(TLOG_WARN, "BITMAP CACHE:\n");
	for (i = 0; i < VHD_CACHE_SIZE; i++) {
		int qnum = 0, wnum = 0, rnum = 0;
		struct vhd_bitmap *bm = s->bitmap[i];
		struct vhd_transaction *tx;
		struct vhd_request *r;

		if (!bm)
			continue;

		tx = &bm->tx;
		r = bm->queue.head;
		while (r) {
			qnum++;
			r = r->next;
		}

		r = bm->waiting.head;
		while (r) {
			wnum++;
			r = r->next;
		}

		r = tx->requests.head;
		while (r) {
			rnum++;
			r = r->next;
		}

		DBG(TLOG_WARN, "%d: blk: 0x%04x, status: 0x%08x, q: %p, qnum: %d, w: %p, "
		    "wnum: %d, locked: %d, in use: %d, tx: %p, tx_error: %d, "
		    "started: %d, finished: %d, status: %u, reqs: %p, nreqs: %d\n",
		    i, bm->blk, bm->status, bm->queue.head, qnum, bm->waiting.head,
		    wnum, bitmap_locked(bm), bitmap_in_use(bm), tx, tx->error,
		    tx->started, tx->finished, tx->status, tx->requests.head, rnum);
	}

	DBG(TLOG_WARN, "BAT: status: 0x%08x, pbw_blk: 0x%04x, "
	    "pbw_off: 0x%08"PRIx64", tx: %p\n", s->bat.status, s->bat.pbw_blk,
	    s->bat.pbw_offset, s->bat.req.tx);

/*
	for (i = 0; i < s->hdr.max_bat_size; i++)
		DPRINTF("%d: %u\n", i, s->bat.bat[i]);
*/
}

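/* VHD driver entry points exported to tapdisk. */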
struct tap_disk tapdisk_vhd = {
	.disk_type          = "tapdisk_vhd",
	.flags              = 0,
	.private_data_size  = sizeof(struct vhd_state),
	.td_open            = _vhd_open,
	.td_close           = _vhd_close,
	.td_queue_read      = vhd_queue_read,
	.td_queue_write     = vhd_queue_write,
	.td_get_parent_id   = vhd_get_parent_id,
	.td_validate_parent = vhd_validate_parent,
	.td_debug           = vhd_debug,
};