1  /*
2    Unix SMB/CIFS implementation.
3 
4    trivial database library
5 
6    Copyright (C) Andrew Tridgell              1999-2004
7    Copyright (C) Paul `Rusty' Russell		   2000
8    Copyright (C) Jeremy Allison			   2000-2003
9 
10      ** NOTE! The following LGPL license applies to the tdb
11      ** library. This does NOT imply that all of Samba is released
12      ** under the LGPL
13 
14    This library is free software; you can redistribute it and/or
15    modify it under the terms of the GNU Lesser General Public
16    License as published by the Free Software Foundation; either
17    version 2 of the License, or (at your option) any later version.
18 
19    This library is distributed in the hope that it will be useful,
20    but WITHOUT ANY WARRANTY; without even the implied warranty of
21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22    Lesser General Public License for more details.
23 
24    You should have received a copy of the GNU Lesser General Public
25    License along with this library; If not, see <http://www.gnu.org/licenses/>.
26 */
27 
28 
29 #ifndef _SAMBA_BUILD_
30 #ifdef HAVE_CONFIG_H
31 #include <config.h>
32 #endif
33 
34 #include <stdlib.h>
35 #include <stdio.h>
36 #include <stdint.h>
37 #include <fcntl.h>
38 #include <unistd.h>
39 #include <string.h>
40 #include <fcntl.h>
41 #include <errno.h>
42 #include <sys/mman.h>
43 #include <sys/stat.h>
44 #include "tdb.h"
45 #include <stdarg.h>
46 #include "talloc.h"
47 #undef HAVE_MMAP
48 #else
49 #include "includes.h"
50 #include "lib/tdb/include/tdb.h"
51 #include "system/time.h"
52 #include "system/shmem.h"
53 #include "system/filesys.h"
54 #endif
55 
56 #define TDB_MAGIC_FOOD "TDB file\n"
57 #define TDB_VERSION (0x26011967 + 7)
58 #define TDB_MAGIC (0x26011999U)
59 #define TDB_FREE_MAGIC (~TDB_MAGIC)
60 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
61 #define TDB_ALIGNMENT 4
62 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
63 #define DEFAULT_HASH_SIZE 131
64 #define TDB_PAGE_SIZE 0x2000
65 #define FREELIST_TOP (sizeof(struct tdb_header))
66 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
67 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
68 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
69 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
70 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
71 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1))
72 
73 
74 /* NB assumes there is a local variable called "tdb" that is the
75  * current context, also takes doubly-parenthesized print-style
76  * argument. */
77 #define TDB_LOG(x) tdb->log_fn x
78 
79 /* lock offsets */
80 #define GLOBAL_LOCK 0
81 #define ACTIVE_LOCK 4
82 
83 #ifndef MAP_FILE
84 #define MAP_FILE 0
85 #endif
86 
87 #ifndef MAP_FAILED
88 #define MAP_FAILED ((void *)-1)
89 #endif
90 
91 #ifndef discard_const_p
92 # if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
93 #  define discard_const(ptr) ((void *)((intptr_t)(ptr)))
94 # else
95 #  define discard_const(ptr) ((void *)(ptr))
96 # endif
97 # define discard_const_p(type, ptr) ((type *)discard_const(ptr))
98 #endif
99 
100 /* free memory if the pointer is valid and zero the pointer */
101 #ifndef SAFE_FREE
102 #define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
103 #endif
104 
105 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
106 static TDB_DATA tdb_null;
107 
108 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
109 static TDB_CONTEXT *tdbs = NULL;
110 
tdb_munmap(TDB_CONTEXT * tdb)111 static int tdb_munmap(TDB_CONTEXT *tdb)
112 {
113 	if (tdb->flags & TDB_INTERNAL)
114 		return 0;
115 
116 #ifdef HAVE_MMAP
117 	if (tdb->map_ptr) {
118 		int ret = munmap(tdb->map_ptr, tdb->map_size);
119 		if (ret != 0)
120 			return ret;
121 	}
122 #endif
123 	tdb->map_ptr = NULL;
124 	return 0;
125 }
126 
tdb_mmap(TDB_CONTEXT * tdb)127 static void tdb_mmap(TDB_CONTEXT *tdb)
128 {
129 	if (tdb->flags & TDB_INTERNAL)
130 		return;
131 
132 #ifdef HAVE_MMAP
133 	if (!(tdb->flags & TDB_NOMMAP)) {
134 		tdb->map_ptr = mmap(NULL, tdb->map_size,
135 				    PROT_READ|(tdb->read_only? 0:PROT_WRITE),
136 				    MAP_SHARED|MAP_FILE, tdb->fd, 0);
137 
138 		/*
139 		 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
140 		 */
141 
142 		if (tdb->map_ptr == MAP_FAILED) {
143 			tdb->map_ptr = NULL;
144 			TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
145 				 tdb->map_size, strerror(errno)));
146 		}
147 	} else {
148 		tdb->map_ptr = NULL;
149 	}
150 #else
151 	tdb->map_ptr = NULL;
152 #endif
153 }
154 
155 /* Endian conversion: we only ever deal with 4 byte quantities */
convert(void * buf,uint32_t size)156 static void *convert(void *buf, uint32_t size)
157 {
158 	uint32_t i, *p = buf;
159 	for (i = 0; i < size / 4; i++)
160 		p[i] = TDB_BYTEREV(p[i]);
161 	return buf;
162 }
163 #define DOCONV() (tdb->flags & TDB_CONVERT)
164 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
165 
166 /* the body of the database is made of one list_struct for the free space
167    plus a separate data list for each hash value */
168 struct list_struct {
169 	tdb_off next; /* offset of the next record in the list */
170 	tdb_len rec_len; /* total byte length of record */
171 	tdb_len key_len; /* byte length of key */
172 	tdb_len data_len; /* byte length of data */
173 	uint32_t full_hash; /* the full 32 bit hash of the key */
174 	uint32_t magic;   /* try to catch errors */
175 	/* the following union is implied:
176 		union {
177 			char record[rec_len];
178 			struct {
179 				char key[key_len];
180 				char data[data_len];
181 			}
182 			uint32_t totalsize; (tailer)
183 		}
184 	*/
185 };
186 
187 /* a byte range locking function - return 0 on success
188    this functions locks/unlocks 1 byte at the specified offset.
189 
190    On error, errno is also set so that errors are passed back properly
191    through tdb_open(). */
tdb_brlock(TDB_CONTEXT * tdb,tdb_off offset,int rw_type,int lck_type,int probe)192 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
193 		      int rw_type, int lck_type, int probe)
194 {
195 	struct flock fl;
196 	int ret;
197 
198 	if (tdb->flags & TDB_NOLOCK)
199 		return 0;
200 	if ((rw_type == F_WRLCK) && (tdb->read_only)) {
201 		errno = EACCES;
202 		return -1;
203 	}
204 
205 	fl.l_type = rw_type;
206 	fl.l_whence = SEEK_SET;
207 	fl.l_start = offset;
208 	fl.l_len = 1;
209 	fl.l_pid = 0;
210 
211 	do {
212 		ret = fcntl(tdb->fd,lck_type,&fl);
213 	} while (ret == -1 && errno == EINTR);
214 
215 	if (ret == -1) {
216 		if (!probe && lck_type != F_SETLK) {
217 			/* Ensure error code is set for log fun to examine. */
218 			tdb->ecode = TDB_ERR_LOCK;
219 			TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
220 				 tdb->fd, offset, rw_type, lck_type));
221 		}
222 		/* Generic lock error. errno set by fcntl.
223 		 * EAGAIN is an expected return from non-blocking
224 		 * locks. */
225 		if (errno != EAGAIN) {
226 		TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
227 				 tdb->fd, offset, rw_type, lck_type,
228 				 strerror(errno)));
229 		}
230 		return TDB_ERRCODE(TDB_ERR_LOCK, -1);
231 	}
232 	return 0;
233 }
234 
235 /* lock a list in the database. list -1 is the alloc list */
tdb_lock(TDB_CONTEXT * tdb,int list,int ltype)236 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
237 {
238 	if (list < -1 || list >= (int)tdb->header.hash_size) {
239 		TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
240 			   list, ltype));
241 		return -1;
242 	}
243 	if (tdb->flags & TDB_NOLOCK)
244 		return 0;
245 
246 	/* Since fcntl locks don't nest, we do a lock for the first one,
247 	   and simply bump the count for future ones */
248 	if (tdb->locked[list+1].count == 0) {
249 		if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
250 			TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
251 					   list, ltype, strerror(errno)));
252 			return -1;
253 		}
254 		tdb->locked[list+1].ltype = ltype;
255 	}
256 	tdb->locked[list+1].count++;
257 	return 0;
258 }
259 
260 /* unlock the database: returns void because it's too late for errors. */
261 	/* changed to return int it may be interesting to know there
262 	   has been an error  --simo */
tdb_unlock(TDB_CONTEXT * tdb,int list,int ltype)263 static int tdb_unlock(TDB_CONTEXT *tdb, int list,
264 		      int ltype __attribute__((unused)))
265 {
266 	int ret = -1;
267 
268 	if (tdb->flags & TDB_NOLOCK)
269 		return 0;
270 
271 	/* Sanity checks */
272 	if (list < -1 || list >= (int)tdb->header.hash_size) {
273 		TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
274 		return ret;
275 	}
276 
277 	if (tdb->locked[list+1].count==0) {
278 		TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
279 		return ret;
280 	}
281 
282 	if (tdb->locked[list+1].count == 1) {
283 		/* Down to last nested lock: unlock underneath */
284 		ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
285 	} else {
286 		ret = 0;
287 	}
288 	tdb->locked[list+1].count--;
289 
290 	if (ret)
291 		TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
292 	return ret;
293 }
294 
295 /* This is based on the hash algorithm from gdbm */
default_tdb_hash(TDB_DATA * key)296 static uint32_t default_tdb_hash(TDB_DATA *key)
297 {
298 	uint32_t value;	/* Used to compute the hash value.  */
299 	uint32_t   i;	/* Used to cycle through random values. */
300 
301 	/* Set the initial value from the key size. */
302 	for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
303 		value = (value + (key->dptr[i] << (i*5 % 24)));
304 
305 	return (1103515243 * value + 12345);
306 }
307 
308 /* check for an out of bounds access - if it is out of bounds then
309    see if the database has been expanded by someone else and expand
310    if necessary
311    note that "len" is the minimum length needed for the db
312 */
tdb_oob(TDB_CONTEXT * tdb,tdb_off len,int probe)313 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
314 {
315 	struct stat st;
316 	if (len <= tdb->map_size)
317 		return 0;
318 	if (tdb->flags & TDB_INTERNAL) {
319 		if (!probe) {
320 			/* Ensure ecode is set for log fn. */
321 			tdb->ecode = TDB_ERR_IO;
322 			TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
323 				 (int)len, (int)tdb->map_size));
324 		}
325 		return TDB_ERRCODE(TDB_ERR_IO, -1);
326 	}
327 
328 	if (fstat(tdb->fd, &st) == -1)
329 		return TDB_ERRCODE(TDB_ERR_IO, -1);
330 
331 	if (st.st_size < (off_t)len) {
332 		if (!probe) {
333 			/* Ensure ecode is set for log fn. */
334 			tdb->ecode = TDB_ERR_IO;
335 			TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
336 				 (int)len, (int)st.st_size));
337 		}
338 		return TDB_ERRCODE(TDB_ERR_IO, -1);
339 	}
340 
341 	/* Unmap, update size, remap */
342 	if (tdb_munmap(tdb) == -1)
343 		return TDB_ERRCODE(TDB_ERR_IO, -1);
344 	tdb->map_size = st.st_size;
345 	tdb_mmap(tdb);
346 	return 0;
347 }
348 
349 /* write a lump of data at a specified offset */
tdb_write(TDB_CONTEXT * tdb,tdb_off off,void * buf,tdb_len len)350 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
351 {
352 	if (tdb_oob(tdb, off + len, 0) != 0)
353 		return -1;
354 
355 	if (tdb->map_ptr)
356 		memcpy(off + (char *)tdb->map_ptr, buf, len);
357 #ifdef HAVE_PWRITE
358 	else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
359 #else
360 	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
361 		 || write(tdb->fd, buf, len) != (off_t)len) {
362 #endif
363 		/* Ensure ecode is set for log fn. */
364 		tdb->ecode = TDB_ERR_IO;
365 		TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
366 			   off, len, strerror(errno)));
367 		return TDB_ERRCODE(TDB_ERR_IO, -1);
368 	}
369 	return 0;
370 }
371 
372 /* read a lump of data at a specified offset, maybe convert */
373 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
374 {
375 	if (tdb_oob(tdb, off + len, 0) != 0)
376 		return -1;
377 
378 	if (tdb->map_ptr)
379 		memcpy(buf, off + (char *)tdb->map_ptr, len);
380 #ifdef HAVE_PREAD
381 	else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
382 #else
383 	else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
384 		 || read(tdb->fd, buf, len) != (off_t)len) {
385 #endif
386 		/* Ensure ecode is set for log fn. */
387 		tdb->ecode = TDB_ERR_IO;
388 		TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
389 			   off, len, strerror(errno)));
390 		return TDB_ERRCODE(TDB_ERR_IO, -1);
391 	}
392 	if (cv)
393 		convert(buf, len);
394 	return 0;
395 }
396 
397 /* don't allocate memory: used in tdb_delete path. */
398 static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
399 {
400 	char buf[64];
401 	uint32_t len;
402 
403 	if (tdb_oob(tdb, off + key.dsize, 0) != 0)
404 		return -1;
405 
406 	if (tdb->map_ptr)
407 		return !memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize);
408 
409 	while (key.dsize) {
410 		len = key.dsize;
411 		if (len > sizeof(buf))
412 			len = sizeof(buf);
413 		if (tdb_read(tdb, off, buf, len, 0) != 0)
414 			return -1;
415 		if (memcmp(buf, key.dptr, len) != 0)
416 			return 0;
417 		key.dptr += len;
418 		key.dsize -= len;
419 		off += len;
420 	}
421 	return 1;
422 }
423 
424 /* read a lump of data, allocating the space for it */
425 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
426 {
427 	char *buf;
428 
429 	if (!(buf = talloc_size(tdb, len))) {
430 		/* Ensure ecode is set for log fn. */
431 		tdb->ecode = TDB_ERR_OOM;
432 		TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
433 			   len, strerror(errno)));
434 		return TDB_ERRCODE(TDB_ERR_OOM, buf);
435 	}
436 	if (tdb_read(tdb, offset, buf, len, 0) == -1) {
437 		SAFE_FREE(buf);
438 		return NULL;
439 	}
440 	return buf;
441 }
442 
443 /* read/write a tdb_off */
444 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
445 {
446 	return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
447 }
448 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
449 {
450 	tdb_off off = *d;
451 	return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
452 }
453 
454 /* read/write a record */
455 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
456 {
457 	if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
458 		return -1;
459 	if (TDB_BAD_MAGIC(rec)) {
460 		/* Ensure ecode is set for log fn. */
461 		tdb->ecode = TDB_ERR_CORRUPT;
462 		TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
463 		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
464 	}
465 	return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
466 }
467 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
468 {
469 	struct list_struct r = *rec;
470 	return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
471 }
472 
473 /* read a freelist record and check for simple errors */
474 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
475 {
476 	if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
477 		return -1;
478 
479 	if (rec->magic == TDB_MAGIC) {
480 		/* this happens when a app is showdown while deleting a record - we should
481 		   not completely fail when this happens */
482 		TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
483 			 rec->magic, off));
484 		rec->magic = TDB_FREE_MAGIC;
485 		if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
486 			return -1;
487 	}
488 
489 	if (rec->magic != TDB_FREE_MAGIC) {
490 		/* Ensure ecode is set for log fn. */
491 		tdb->ecode = TDB_ERR_CORRUPT;
492 		TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
493 			   rec->magic, off));
494 		return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
495 	}
496 	if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
497 		return -1;
498 	return 0;
499 }
500 
501 /* update a record tailer (must hold allocation lock) */
502 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
503 			 const struct list_struct *rec)
504 {
505 	tdb_off totalsize;
506 
507 	/* Offset of tailer from record header */
508 	totalsize = sizeof(*rec) + rec->rec_len;
509 	return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
510 			 &totalsize);
511 }
512 
513 /* Remove an element from the freelist.  Must have alloc lock. */
514 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
515 {
516 	tdb_off last_ptr, i;
517 
518 	/* read in the freelist top */
519 	last_ptr = FREELIST_TOP;
520 	while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
521 		if (i == off) {
522 			/* We've found it! */
523 			return ofs_write(tdb, last_ptr, &next);
524 		}
525 		/* Follow chain (next offset is at start of record) */
526 		last_ptr = i;
527 	}
528 	TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
529 	return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
530 }
531 
532 /* Add an element into the freelist. Merge adjacent records if
533    neccessary. */
534 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
535 {
536 	tdb_off right, left;
537 
538 	/* Allocation and tailer lock */
539 	if (tdb_lock(tdb, -1, F_WRLCK) != 0)
540 		return -1;
541 
542 	/* set an initial tailer, so if we fail we don't leave a bogus record */
543 	if (update_tailer(tdb, offset, rec) != 0) {
544 		TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
545 		goto fail;
546 	}
547 
548 	/* Look right first (I'm an Australian, dammit) */
549 	right = offset + sizeof(*rec) + rec->rec_len;
550 	if (right + sizeof(*rec) <= tdb->map_size) {
551 		struct list_struct r;
552 
553 		if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
554 			TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
555 			goto left;
556 		}
557 
558 		/* If it's free, expand to include it. */
559 		if (r.magic == TDB_FREE_MAGIC) {
560 			if (remove_from_freelist(tdb, right, r.next) == -1) {
561 				TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
562 				goto left;
563 			}
564 			rec->rec_len += sizeof(r) + r.rec_len;
565 		}
566 	}
567 
568 left:
569 	/* Look left */
570 	left = offset - sizeof(tdb_off);
571 	if (left > TDB_DATA_START(tdb->header.hash_size)) {
572 		struct list_struct l;
573 		tdb_off leftsize;
574 
575 		/* Read in tailer and jump back to header */
576 		if (ofs_read(tdb, left, &leftsize) == -1) {
577 			TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
578 			goto update;
579 		}
580 		left = offset - leftsize;
581 
582 		/* Now read in record */
583 		if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
584 			TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
585 			goto update;
586 		}
587 
588 		/* If it's free, expand to include it. */
589 		if (l.magic == TDB_FREE_MAGIC) {
590 			if (remove_from_freelist(tdb, left, l.next) == -1) {
591 				TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
592 				goto update;
593 			} else {
594 				offset = left;
595 				rec->rec_len += leftsize;
596 			}
597 		}
598 	}
599 
600 update:
601 	if (update_tailer(tdb, offset, rec) == -1) {
602 		TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
603 		goto fail;
604 	}
605 
606 	/* Now, prepend to free list */
607 	rec->magic = TDB_FREE_MAGIC;
608 
609 	if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
610 	    rec_write(tdb, offset, rec) == -1 ||
611 	    ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
612 		TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
613 		goto fail;
614 	}
615 
616 	/* And we're done. */
617 	tdb_unlock(tdb, -1, F_WRLCK);
618 	return 0;
619 
620  fail:
621 	tdb_unlock(tdb, -1, F_WRLCK);
622 	return -1;
623 }
624 
625 
626 /* expand a file.  we prefer to use ftruncate, as that is what posix
627   says to use for mmap expansion */
628 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
629 {
630 	char buf[1024];
631 #ifdef HAVE_FTRUNCATE_EXTEND
632 	if (ftruncate(tdb->fd, size+addition) != 0) {
633 		TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
634 			   size+addition, strerror(errno)));
635 		return -1;
636 	}
637 #else
638 	char b = 0;
639 
640 #ifdef HAVE_PWRITE
641 	if (pwrite(tdb->fd,  &b, 1, (size+addition) - 1) != 1) {
642 #else
643 	if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 ||
644 	    write(tdb->fd, &b, 1) != 1) {
645 #endif
646 		TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
647 			   size+addition, strerror(errno)));
648 		return -1;
649 	}
650 #endif
651 
652 	/* now fill the file with something. This ensures that the file isn't sparse, which would be
653 	   very bad if we ran out of disk. This must be done with write, not via mmap */
654 	memset(buf, 0x42, sizeof(buf));
655 	while (addition) {
656 		int n = addition>sizeof(buf)?sizeof(buf):addition;
657 #ifdef HAVE_PWRITE
658 		int ret = pwrite(tdb->fd, buf, n, size);
659 #else
660 		int ret;
661 		if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
662 			return -1;
663 		ret = write(tdb->fd, buf, n);
664 #endif
665 		if (ret != n) {
666 			TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
667 				   n, strerror(errno)));
668 			return -1;
669 		}
670 		addition -= n;
671 		size += n;
672 	}
673 	return 0;
674 }
675 
676 
677 /* expand the database at least size bytes by expanding the underlying
678    file and doing the mmap again if necessary */
679 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
680 {
681 	struct list_struct rec;
682 	tdb_off offset;
683 
684 	if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
685 		TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
686 		return -1;
687 	}
688 
689 	/* must know about any previous expansions by another process */
690 	tdb_oob(tdb, tdb->map_size + 1, 1);
691 
692 	/* always make room for at least 10 more records, and round
693            the database up to a multiple of TDB_PAGE_SIZE */
694 	size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
695 
696 	if (!(tdb->flags & TDB_INTERNAL))
697 		tdb_munmap(tdb);
698 
699 	/*
700 	 * We must ensure the file is unmapped before doing this
701 	 * to ensure consistency with systems like OpenBSD where
702 	 * writes and mmaps are not consistent.
703 	 */
704 
705 	/* expand the file itself */
706 	if (!(tdb->flags & TDB_INTERNAL)) {
707 		if (expand_file(tdb, tdb->map_size, size) != 0)
708 			goto fail;
709 	}
710 
711 	tdb->map_size += size;
712 
713 	if (tdb->flags & TDB_INTERNAL) {
714 		char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
715 							tdb->map_size);
716 		if (!new_map_ptr) {
717 			tdb->map_size -= size;
718 			goto fail;
719 		}
720 		tdb->map_ptr = new_map_ptr;
721 	} else {
722 		/*
723 		 * We must ensure the file is remapped before adding the space
724 		 * to ensure consistency with systems like OpenBSD where
725 		 * writes and mmaps are not consistent.
726 		 */
727 
728 		/* We're ok if the mmap fails as we'll fallback to read/write */
729 		tdb_mmap(tdb);
730 	}
731 
732 	/* form a new freelist record */
733 	memset(&rec,'\0',sizeof(rec));
734 	rec.rec_len = size - sizeof(rec);
735 
736 	/* link it into the free list */
737 	offset = tdb->map_size - size;
738 	if (tdb_free(tdb, offset, &rec) == -1)
739 		goto fail;
740 
741 	tdb_unlock(tdb, -1, F_WRLCK);
742 	return 0;
743  fail:
744 	tdb_unlock(tdb, -1, F_WRLCK);
745 	return -1;
746 }
747 
748 
749 /*
750    the core of tdb_allocate - called when we have decided which
751    free list entry to use
752  */
753 static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
754 				struct list_struct *rec, tdb_off last_ptr)
755 {
756 	struct list_struct newrec;
757 	tdb_off newrec_ptr;
758 
759 	memset(&newrec, '\0', sizeof(newrec));
760 
761 	/* found it - now possibly split it up  */
762 	if (rec->rec_len > length + MIN_REC_SIZE) {
763 		/* Length of left piece */
764 		length = TDB_ALIGN(length, TDB_ALIGNMENT);
765 
766 		/* Right piece to go on free list */
767 		newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
768 		newrec_ptr = rec_ptr + sizeof(*rec) + length;
769 
770 		/* And left record is shortened */
771 		rec->rec_len = length;
772 	} else {
773 		newrec_ptr = 0;
774 	}
775 
776 	/* Remove allocated record from the free list */
777 	if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
778 		return 0;
779 	}
780 
781 	/* Update header: do this before we drop alloc
782 	   lock, otherwise tdb_free() might try to
783 	   merge with us, thinking we're free.
784 	   (Thanks Jeremy Allison). */
785 	rec->magic = TDB_MAGIC;
786 	if (rec_write(tdb, rec_ptr, rec) == -1) {
787 		return 0;
788 	}
789 
790 	/* Did we create new block? */
791 	if (newrec_ptr) {
792 		/* Update allocated record tailer (we
793 		   shortened it). */
794 		if (update_tailer(tdb, rec_ptr, rec) == -1) {
795 			return 0;
796 		}
797 
798 		/* Free new record */
799 		if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
800 			return 0;
801 		}
802 	}
803 
804 	/* all done - return the new record offset */
805 	return rec_ptr;
806 }
807 
808 /* allocate some space from the free list. The offset returned points
809    to a unconnected list_struct within the database with room for at
810    least length bytes of total data
811 
812    0 is returned if the space could not be allocated
813  */
814 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
815 			    struct list_struct *rec)
816 {
817 	tdb_off rec_ptr, last_ptr, newrec_ptr;
818 	struct {
819 		tdb_off rec_ptr, last_ptr;
820 		tdb_len rec_len;
821 	} bestfit = { 0, 0, 0 };
822 
823 	if (tdb_lock(tdb, -1, F_WRLCK) == -1)
824 		return 0;
825 
826 	/* Extra bytes required for tailer */
827 	length += sizeof(tdb_off);
828 
829  again:
830 	last_ptr = FREELIST_TOP;
831 
832 	/* read in the freelist top */
833 	if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
834 		goto fail;
835 
836 	bestfit.rec_ptr = 0;
837 
838 	/*
839 	   this is a best fit allocation strategy. Originally we used
840 	   a first fit strategy, but it suffered from massive fragmentation
841 	   issues when faced with a slowly increasing record size.
842 	 */
843 	while (rec_ptr) {
844 		if (rec_free_read(tdb, rec_ptr, rec) == -1) {
845 			goto fail;
846 		}
847 
848 		if (rec->rec_len >= length) {
849 			if (bestfit.rec_ptr == 0 ||
850 			    rec->rec_len < bestfit.rec_len) {
851 				bestfit.rec_len = rec->rec_len;
852 				bestfit.rec_ptr = rec_ptr;
853 				bestfit.last_ptr = last_ptr;
854 				/* consider a fit to be good enough if we aren't wasting more than half the space */
855 				if (bestfit.rec_len < 2*length) {
856 					break;
857 				}
858 			}
859 		}
860 
861 		/* move to the next record */
862 		last_ptr = rec_ptr;
863 		rec_ptr = rec->next;
864 	}
865 
866 	if (bestfit.rec_ptr != 0) {
867 		if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
868 			goto fail;
869 		}
870 
871 		newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
872 		tdb_unlock(tdb, -1, F_WRLCK);
873 		return newrec_ptr;
874 	}
875 
876 	/* we didn't find enough space. See if we can expand the
877 	   database and if we can then try again */
878 	if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
879 		goto again;
880  fail:
881 	tdb_unlock(tdb, -1, F_WRLCK);
882 	return 0;
883 }
884 
885 /* initialise a new database with a specified hash size */
886 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
887 {
888 	struct tdb_header *newdb;
889 	int size, ret = -1;
890 
891 	/* We make it up in memory, then write it out if not internal */
892 	size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
893 	if (!(newdb = talloc_zero_size(tdb, size)))
894 		return TDB_ERRCODE(TDB_ERR_OOM, -1);
895 
896 	/* Fill in the header */
897 	newdb->version = TDB_VERSION;
898 	newdb->hash_size = hash_size;
899 	if (tdb->flags & TDB_INTERNAL) {
900 		tdb->map_size = size;
901 		tdb->map_ptr = (char *)newdb;
902 		memcpy(&tdb->header, newdb, sizeof(tdb->header));
903 		/* Convert the `ondisk' version if asked. */
904 		CONVERT(*newdb);
905 		return 0;
906 	}
907 	if (lseek(tdb->fd, 0, SEEK_SET) == -1)
908 		goto fail;
909 
910 	if (ftruncate(tdb->fd, 0) == -1)
911 		goto fail;
912 
913 	/* This creates an endian-converted header, as if read from disk */
914 	CONVERT(*newdb);
915 	memcpy(&tdb->header, newdb, sizeof(tdb->header));
916 	/* Don't endian-convert the magic food! */
917 	memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
918 	if (write(tdb->fd, newdb, size) != size)
919 		ret = -1;
920 	else
921 		ret = 0;
922 
923   fail:
924 	SAFE_FREE(newdb);
925 	return ret;
926 }
927 
928 /* Returns 0 on fail.  On success, return offset of record, and fills
929    in rec */
930 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash,
931 			struct list_struct *r)
932 {
933 	tdb_off rec_ptr;
934 
935 	/* read in the hash top */
936 	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
937 		return 0;
938 
939 	/* keep looking until we find the right record */
940 	while (rec_ptr) {
941 		if (rec_read(tdb, rec_ptr, r) == -1)
942 			return 0;
943 
944 		if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
945 			/* a very likely hit - read the key */
946 			int cmp = tdb_key_eq(tdb, rec_ptr + sizeof(*r), key);
947 			if (cmp < 0)
948 				return 0;
949 			else if (cmp > 0)
950 				return rec_ptr;
951 		}
952 		rec_ptr = r->next;
953 	}
954 	return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
955 }
956 
957 /* As tdb_find, but if you succeed, keep the lock */
958 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash, int locktype,
959 			     struct list_struct *rec)
960 {
961 	uint32_t rec_ptr;
962 
963 	if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
964 		return 0;
965 	if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
966 		tdb_unlock(tdb, BUCKET(hash), locktype);
967 	return rec_ptr;
968 }
969 
970 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
971 {
972 	return tdb->ecode;
973 }
974 
975 static struct tdb_errname {
976 	enum TDB_ERROR ecode; const char *estring;
977 } emap[] = { {TDB_SUCCESS, "Success"},
978 	     {TDB_ERR_CORRUPT, "Corrupt database"},
979 	     {TDB_ERR_IO, "IO Error"},
980 	     {TDB_ERR_LOCK, "Locking error"},
981 	     {TDB_ERR_OOM, "Out of memory"},
982 	     {TDB_ERR_EXISTS, "Record exists"},
983 	     {TDB_ERR_NOLOCK, "Lock exists on other keys"},
984 	     {TDB_ERR_NOEXIST, "Record does not exist"} };
985 
986 /* Error string for the last tdb error */
987 const char *tdb_errorstr(TDB_CONTEXT *tdb)
988 {
989 	uint32_t i;
990 	for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
991 		if (tdb->ecode == emap[i].ecode)
992 			return emap[i].estring;
993 	return "Invalid error code";
994 }
995 
996 /* update an entry in place - this only works if the new data size
997    is <= the old data size and the key exists.
998    on failure return -1.
999 */
1000 
1001 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
1002 {
1003 	struct list_struct rec;
1004 	tdb_off rec_ptr;
1005 
1006 	/* find entry */
1007 	if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1008 		return -1;
1009 
1010 	/* must be long enough key, data and tailer */
1011 	if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1012 		tdb->ecode = TDB_SUCCESS; /* Not really an error */
1013 		return -1;
1014 	}
1015 
1016 	if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1017 		      dbuf.dptr, dbuf.dsize) == -1)
1018 		return -1;
1019 
1020 	if (dbuf.dsize != rec.data_len) {
1021 		/* update size */
1022 		rec.data_len = dbuf.dsize;
1023 		return rec_write(tdb, rec_ptr, &rec);
1024 	}
1025 
1026 	return 0;
1027 }
1028 
1029 /* find an entry in the database given a key */
1030 /* If an entry doesn't exist tdb_err will be set to
1031  * TDB_ERR_NOEXIST. If a key has no data attached
1032  * then the TDB_DATA will have zero length but
1033  * a non-zero pointer
1034  */
1035 
1036 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1037 {
1038 	tdb_off rec_ptr;
1039 	struct list_struct rec;
1040 	TDB_DATA ret;
1041 	uint32_t hash;
1042 
1043 	/* find which hash bucket it is in */
1044 	hash = tdb->hash_fn(&key);
1045 	if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1046 		return tdb_null;
1047 
1048 	ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1049 				  rec.data_len);
1050 	ret.dsize = rec.data_len;
1051 	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1052 	return ret;
1053 }
1054 
1055 /* check if an entry in the database exists
1056 
1057    note that 1 is returned if the key is found and 0 is returned if not found
1058    this doesn't match the conventions in the rest of this module, but is
1059    compatible with gdbm
1060 */
1061 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash)
1062 {
1063 	struct list_struct rec;
1064 
1065 	if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1066 		return 0;
1067 	tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1068 	return 1;
1069 }
1070 
1071 /* record lock stops delete underneath */
1072 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1073 {
1074 	return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1075 }
1076 /*
1077   Write locks override our own fcntl readlocks, so check it here.
1078   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1079   an error to fail to get the lock here.
1080 */
1081 
1082 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1083 {
1084 	struct tdb_traverse_lock *i;
1085 	for (i = &tdb->travlocks; i; i = i->next)
1086 		if (i->off == off)
1087 			return -1;
1088 	return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1089 }
1090 
1091 /*
1092   Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1093   an error to fail to get the lock here.
1094 */
1095 
1096 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1097 {
1098 	return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1099 }
1100 /* fcntl locks don't stack: avoid unlocking someone else's */
1101 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1102 {
1103 	struct tdb_traverse_lock *i;
1104 	uint32_t count = 0;
1105 
1106 	if (off == 0)
1107 		return 0;
1108 	for (i = &tdb->travlocks; i; i = i->next)
1109 		if (i->off == off)
1110 			count++;
1111 	return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1112 }
1113 
1114 /* actually delete an entry in the database given the offset */
1115 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1116 {
1117 	tdb_off last_ptr, i;
1118 	struct list_struct lastrec;
1119 
1120 	if (tdb->read_only) return -1;
1121 
1122 	if (write_lock_record(tdb, rec_ptr) == -1) {
1123 		/* Someone traversing here: mark it as dead */
1124 		rec->magic = TDB_DEAD_MAGIC;
1125 		return rec_write(tdb, rec_ptr, rec);
1126 	}
1127 	if (write_unlock_record(tdb, rec_ptr) != 0)
1128 		return -1;
1129 
1130 	/* find previous record in hash chain */
1131 	if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1132 		return -1;
1133 	for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1134 		if (rec_read(tdb, i, &lastrec) == -1)
1135 			return -1;
1136 
1137 	/* unlink it: next ptr is at start of record. */
1138 	if (last_ptr == 0)
1139 		last_ptr = TDB_HASH_TOP(rec->full_hash);
1140 	if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1141 		return -1;
1142 
1143 	/* recover the space */
1144 	if (tdb_free(tdb, rec_ptr, rec) == -1)
1145 		return -1;
1146 	return 0;
1147 }
1148 
1149 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1150 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1151 			 struct list_struct *rec)
1152 {
1153 	int want_next = (tlock->off != 0);
1154 
1155 	/* Lock each chain from the start one. */
1156 	for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1157 
1158 		/* this is an optimisation for the common case where
1159 		   the hash chain is empty, which is particularly
1160 		   common for the use of tdb with ldb, where large
1161 		   hashes are used. In that case we spend most of our
1162 		   time in tdb_brlock(), locking empty hash chains.
1163 
1164 		   To avoid this, we do an unlocked pre-check to see
1165 		   if the hash chain is empty before starting to look
1166 		   inside it. If it is empty then we can avoid that
1167 		   hash chain. If it isn't empty then we can't believe
1168 		   the value we get back, as we read it without a
1169 		   lock, so instead we get the lock and re-fetch the
1170 		   value below.
1171 
1172 		   Notice that not doing this optimisation on the
1173 		   first hash chain is critical. We must guarantee
1174 		   that we have done at least one fcntl lock at the
1175 		   start of a search to guarantee that memory is
1176 		   coherent on SMP systems. If records are added by
1177 		   others during the search then thats OK, and we
1178 		   could possibly miss those with this trick, but we
1179 		   could miss them anyway without this trick, so the
1180 		   semantics don't change.
1181 
1182 		   With a non-indexed ldb search this trick gains us a
1183 		   factor of around 80 in speed on a linux 2.6.x
1184 		   system (testing using ldbtest).
1185 		 */
1186 		if (!tlock->off && tlock->hash != 0) {
1187 			uint32_t off;
1188 			if (tdb->map_ptr) {
1189 				for (;tlock->hash < tdb->header.hash_size;tlock->hash++) {
1190 					if (0 != *(uint32_t *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
1191 						break;
1192 					}
1193 				}
1194 				if (tlock->hash == tdb->header.hash_size) {
1195 					continue;
1196 				}
1197 			} else {
1198 				if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
1199 				    off == 0) {
1200 					continue;
1201 				}
1202 			}
1203 		}
1204 
1205 		if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1206 			return -1;
1207 
1208 		/* No previous record?  Start at top of chain. */
1209 		if (!tlock->off) {
1210 			if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1211 				     &tlock->off) == -1)
1212 				goto fail;
1213 		} else {
1214 			/* Otherwise unlock the previous record. */
1215 			if (unlock_record(tdb, tlock->off) != 0)
1216 				goto fail;
1217 		}
1218 
1219 		if (want_next) {
1220 			/* We have offset of old record: grab next */
1221 			if (rec_read(tdb, tlock->off, rec) == -1)
1222 				goto fail;
1223 			tlock->off = rec->next;
1224 		}
1225 
1226 		/* Iterate through chain */
1227 		while( tlock->off) {
1228 			tdb_off current;
1229 			if (rec_read(tdb, tlock->off, rec) == -1)
1230 				goto fail;
1231 
1232 			/* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
1233 			if (tlock->off == rec->next) {
1234 				TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
1235 				goto fail;
1236 			}
1237 
1238 			if (!TDB_DEAD(rec)) {
1239 				/* Woohoo: we found one! */
1240 				if (lock_record(tdb, tlock->off) != 0)
1241 					goto fail;
1242 				return tlock->off;
1243 			}
1244 
1245 			/* Try to clean dead ones from old traverses */
1246 			current = tlock->off;
1247 			tlock->off = rec->next;
1248 			if (!tdb->read_only &&
1249 			    do_delete(tdb, current, rec) != 0)
1250 				goto fail;
1251 		}
1252 		tdb_unlock(tdb, tlock->hash, F_WRLCK);
1253 		want_next = 0;
1254 	}
1255 	/* We finished iteration without finding anything */
1256 	return TDB_ERRCODE(TDB_SUCCESS, 0);
1257 
1258  fail:
1259 	tlock->off = 0;
1260 	if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1261 		TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1262 	return -1;
1263 }
1264 
1265 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1266    return -1 on error or the record count traversed
1267    if fn is NULL then it is not called
1268    a non-zero return value from fn() indicates that the traversal should stop
1269   */
1270 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1271 {
1272 	TDB_DATA key, dbuf;
1273 	struct list_struct rec;
1274 	struct tdb_traverse_lock tl = { NULL, 0, 0 };
1275 	int ret, count = 0;
1276 
1277 	/* This was in the initializaton, above, but the IRIX compiler
1278 	 * did not like it.  crh
1279 	 */
1280 	tl.next = tdb->travlocks.next;
1281 
1282 	/* fcntl locks don't stack: beware traverse inside traverse */
1283 	tdb->travlocks.next = &tl;
1284 
1285 	/* tdb_next_lock places locks on the record returned, and its chain */
1286 	while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1287 		count++;
1288 		/* now read the full record */
1289 		key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1290 					  rec.key_len + rec.data_len);
1291 		if (!key.dptr) {
1292 			ret = -1;
1293 			if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1294 				goto out;
1295 			if (unlock_record(tdb, tl.off) != 0)
1296 				TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1297 			goto out;
1298 		}
1299 		key.dsize = rec.key_len;
1300 		dbuf.dptr = key.dptr + rec.key_len;
1301 		dbuf.dsize = rec.data_len;
1302 
1303 		/* Drop chain lock, call out */
1304 		if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1305 			ret = -1;
1306 			goto out;
1307 		}
1308 		if (fn && fn(tdb, key, dbuf, private)) {
1309 			/* They want us to terminate traversal */
1310 			ret = count;
1311 			if (unlock_record(tdb, tl.off) != 0) {
1312 				TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));
1313 				ret = -1;
1314 			}
1315 			tdb->travlocks.next = tl.next;
1316 			SAFE_FREE(key.dptr);
1317 			return count;
1318 		}
1319 		SAFE_FREE(key.dptr);
1320 	}
1321 out:
1322 	tdb->travlocks.next = tl.next;
1323 	if (ret < 0)
1324 		return -1;
1325 	else
1326 		return count;
1327 }
1328 
1329 /* find the first entry in the database and return its key */
1330 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1331 {
1332 	TDB_DATA key;
1333 	struct list_struct rec;
1334 
1335 	/* release any old lock */
1336 	if (unlock_record(tdb, tdb->travlocks.off) != 0)
1337 		return tdb_null;
1338 	tdb->travlocks.off = tdb->travlocks.hash = 0;
1339 
1340 	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1341 		return tdb_null;
1342 	/* now read the key */
1343 	key.dsize = rec.key_len;
1344 	key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1345 	if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1346 		TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1347 	return key;
1348 }
1349 
1350 /* find the next entry in the database, returning its key */
1351 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1352 {
1353 	uint32_t oldhash;
1354 	TDB_DATA key = tdb_null;
1355 	struct list_struct rec;
1356 	char *k = NULL;
1357 
1358 	/* Is locked key the old key?  If so, traverse will be reliable. */
1359 	if (tdb->travlocks.off) {
1360 		if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1361 			return tdb_null;
1362 		if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1363 		    || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1364 					    rec.key_len))
1365 		    || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1366 			/* No, it wasn't: unlock it and start from scratch */
1367 			if (unlock_record(tdb, tdb->travlocks.off) != 0)
1368 				return tdb_null;
1369 			if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1370 				return tdb_null;
1371 			tdb->travlocks.off = 0;
1372 		}
1373 
1374 		SAFE_FREE(k);
1375 	}
1376 
1377 	if (!tdb->travlocks.off) {
1378 		/* No previous element: do normal find, and lock record */
1379 		tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1380 		if (!tdb->travlocks.off)
1381 			return tdb_null;
1382 		tdb->travlocks.hash = BUCKET(rec.full_hash);
1383 		if (lock_record(tdb, tdb->travlocks.off) != 0) {
1384 			TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1385 			return tdb_null;
1386 		}
1387 	}
1388 	oldhash = tdb->travlocks.hash;
1389 
1390 	/* Grab next record: locks chain and returned record,
1391 	   unlocks old record */
1392 	if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1393 		key.dsize = rec.key_len;
1394 		key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1395 					  key.dsize);
1396 		/* Unlock the chain of this new record */
1397 		if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1398 			TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1399 	}
1400 	/* Unlock the chain of old record */
1401 	if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1402 		TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1403 	return key;
1404 }
1405 
1406 /* delete an entry in the database given a key */
1407 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash)
1408 {
1409 	tdb_off rec_ptr;
1410 	struct list_struct rec;
1411 	int ret;
1412 
1413 	if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1414 		return -1;
1415 	ret = do_delete(tdb, rec_ptr, &rec);
1416 	if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1417 		TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1418 	return ret;
1419 }
1420 
1421 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1422 {
1423 	uint32_t hash = tdb->hash_fn(&key);
1424 	return tdb_delete_hash(tdb, key, hash);
1425 }
1426 
1427 /* store an element in the database, replacing any existing element
1428    with the same key
1429 
1430    return 0 on success, -1 on failure
1431 */
1432 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1433 {
1434 	struct list_struct rec;
1435 	uint32_t hash;
1436 	tdb_off rec_ptr;
1437 	char *p = NULL;
1438 	int ret = 0;
1439 
1440 	/* find which hash bucket it is in */
1441 	hash = tdb->hash_fn(&key);
1442 	if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1443 		return -1;
1444 
1445 	/* check for it existing, on insert. */
1446 	if (flag == TDB_INSERT) {
1447 		if (tdb_exists_hash(tdb, key, hash)) {
1448 			tdb->ecode = TDB_ERR_EXISTS;
1449 			goto fail;
1450 		}
1451 	} else {
1452 		/* first try in-place update, on modify or replace. */
1453 		if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1454 			goto out;
1455 		if (tdb->ecode == TDB_ERR_NOEXIST &&
1456 		    flag == TDB_MODIFY) {
1457 			/* if the record doesn't exist and we are in TDB_MODIFY mode then
1458 			 we should fail the store */
1459 			goto fail;
1460 		}
1461 	}
1462 	/* reset the error code potentially set by the tdb_update() */
1463 	tdb->ecode = TDB_SUCCESS;
1464 
1465 	/* delete any existing record - if it doesn't exist we don't
1466            care.  Doing this first reduces fragmentation, and avoids
1467            coalescing with `allocated' block before it's updated. */
1468 	if (flag != TDB_INSERT)
1469 		tdb_delete_hash(tdb, key, hash);
1470 
1471 	/* Copy key+value *before* allocating free space in case malloc
1472 	   fails and we are left with a dead spot in the tdb. */
1473 
1474 	if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
1475 		tdb->ecode = TDB_ERR_OOM;
1476 		goto fail;
1477 	}
1478 
1479 	memcpy(p, key.dptr, key.dsize);
1480 	if (dbuf.dsize)
1481 		memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1482 
1483 	/* we have to allocate some space */
1484 	if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1485 		goto fail;
1486 
1487 	/* Read hash top into next ptr */
1488 	if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1489 		goto fail;
1490 
1491 	rec.key_len = key.dsize;
1492 	rec.data_len = dbuf.dsize;
1493 	rec.full_hash = hash;
1494 	rec.magic = TDB_MAGIC;
1495 
1496 	/* write out and point the top of the hash chain at it */
1497 	if (rec_write(tdb, rec_ptr, &rec) == -1
1498 	    || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1499 	    || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1500 		/* Need to tdb_unallocate() here */
1501 		goto fail;
1502 	}
1503  out:
1504 	SAFE_FREE(p);
1505 	tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1506 	return ret;
1507 fail:
1508 	ret = -1;
1509 	goto out;
1510 }
1511 
1512 static int tdb_already_open(dev_t device,
1513 			    ino_t ino)
1514 {
1515 	TDB_CONTEXT *i;
1516 
1517 	for (i = tdbs; i; i = i->next) {
1518 		if (i->device == device && i->inode == ino) {
1519 			return 1;
1520 		}
1521 	}
1522 
1523 	return 0;
1524 }
1525 
1526 /* a default logging function */
1527 static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
1528 			int level __attribute__((unused)),
1529 			const char *fmt __attribute__((unused)), ...)
1530 {
1531 }
1532 
1533 
1534 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1535 			 int open_flags, mode_t mode,
1536 			 tdb_log_func log_fn,
1537 			 tdb_hash_func hash_fn)
1538 {
1539 	TDB_CONTEXT *tdb;
1540 	struct stat st;
1541 	int rev = 0, locked = 0;
1542 	uint8_t *vp;
1543 	uint32_t vertest;
1544 
1545 	if (!(tdb = talloc_zero(name, TDB_CONTEXT))) {
1546 		/* Can't log this */
1547 		errno = ENOMEM;
1548 		goto fail;
1549 	}
1550 	tdb->fd = -1;
1551 	tdb->name = NULL;
1552 	tdb->map_ptr = NULL;
1553 	tdb->flags = tdb_flags;
1554 	tdb->open_flags = open_flags;
1555 	tdb->log_fn = log_fn?log_fn:null_log_fn;
1556 	tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1557 
1558 	if ((open_flags & O_ACCMODE) == O_WRONLY) {
1559 		TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1560 			 name));
1561 		errno = EINVAL;
1562 		goto fail;
1563 	}
1564 
1565 	if (hash_size == 0)
1566 		hash_size = DEFAULT_HASH_SIZE;
1567 	if ((open_flags & O_ACCMODE) == O_RDONLY) {
1568 		tdb->read_only = 1;
1569 		/* read only databases don't do locking or clear if first */
1570 		tdb->flags |= TDB_NOLOCK;
1571 		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1572 	}
1573 
1574 	/* internal databases don't mmap or lock, and start off cleared */
1575 	if (tdb->flags & TDB_INTERNAL) {
1576 		tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1577 		tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1578 		if (tdb_new_database(tdb, hash_size) != 0) {
1579 			TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1580 			goto fail;
1581 		}
1582 		goto internal;
1583 	}
1584 
1585 	if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1586 		TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1587 			 name, strerror(errno)));
1588 		goto fail;	/* errno set by open(2) */
1589 	}
1590 
1591 	/* ensure there is only one process initialising at once */
1592 	if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1593 		TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1594 			 name, strerror(errno)));
1595 		goto fail;	/* errno set by tdb_brlock */
1596 	}
1597 
1598 	/* we need to zero database if we are the only one with it open */
1599 	if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1600 		(locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1601 		open_flags |= O_CREAT;
1602 		if (ftruncate(tdb->fd, 0) == -1) {
1603 			TDB_LOG((tdb, 0, "tdb_open_ex: "
1604 				 "failed to truncate %s: %s\n",
1605 				 name, strerror(errno)));
1606 			goto fail; /* errno set by ftruncate */
1607 		}
1608 	}
1609 
1610 	if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1611 	    || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1612 	    || (tdb->header.version != TDB_VERSION
1613 		&& !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1614 		/* its not a valid database - possibly initialise it */
1615 		if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1616 			errno = EIO; /* ie bad format or something */
1617 			goto fail;
1618 		}
1619 		rev = (tdb->flags & TDB_CONVERT);
1620 	}
1621 	vp = (uint8_t *)&tdb->header.version;
1622 	vertest = (((uint32_t)vp[0]) << 24) | (((uint32_t)vp[1]) << 16) |
1623 		  (((uint32_t)vp[2]) << 8) | (uint32_t)vp[3];
1624 	tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1625 	if (!rev)
1626 		tdb->flags &= ~TDB_CONVERT;
1627 	else {
1628 		tdb->flags |= TDB_CONVERT;
1629 		convert(&tdb->header, sizeof(tdb->header));
1630 	}
1631 	if (fstat(tdb->fd, &st) == -1)
1632 		goto fail;
1633 
1634 	/* Is it already in the open list?  If so, fail. */
1635 	if (tdb_already_open(st.st_dev, st.st_ino)) {
1636 		TDB_LOG((tdb, 2, "tdb_open_ex: "
1637 			 "%s (%d,%d) is already open in this process\n",
1638 			 name, (int)st.st_dev, (int)st.st_ino));
1639 		errno = EBUSY;
1640 		goto fail;
1641 	}
1642 
1643 	if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
1644 		errno = ENOMEM;
1645 		goto fail;
1646 	}
1647 
1648 	tdb->map_size = st.st_size;
1649 	tdb->device = st.st_dev;
1650 	tdb->inode = st.st_ino;
1651 	tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
1652 					tdb->header.hash_size+1);
1653 	if (!tdb->locked) {
1654 		TDB_LOG((tdb, 2, "tdb_open_ex: "
1655 			 "failed to allocate lock structure for %s\n",
1656 			 name));
1657 		errno = ENOMEM;
1658 		goto fail;
1659 	}
1660 	tdb_mmap(tdb);
1661 	if (locked) {
1662 		if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1663 			TDB_LOG((tdb, 0, "tdb_open_ex: "
1664 				 "failed to take ACTIVE_LOCK on %s: %s\n",
1665 				 name, strerror(errno)));
1666 			goto fail;
1667 		}
1668 
1669 	}
1670 
1671 	/* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1672 	   we didn't get the initial exclusive lock as we need to let all other
1673 	   users know we're using it. */
1674 
1675 	if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1676 	/* leave this lock in place to indicate it's in use */
1677 	if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1678 		goto fail;
1679 	}
1680 
1681 
1682  internal:
1683 	/* Internal (memory-only) databases skip all the code above to
1684 	 * do with disk files, and resume here by releasing their
1685 	 * global lock and hooking into the active list. */
1686 	if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1687 		goto fail;
1688 	tdb->next = tdbs;
1689 	tdbs = tdb;
1690 	return tdb;
1691 
1692  fail:
1693 	{ int save_errno = errno;
1694 
1695 	if (!tdb)
1696 		return NULL;
1697 
1698 	if (tdb->map_ptr) {
1699 		if (tdb->flags & TDB_INTERNAL)
1700 			SAFE_FREE(tdb->map_ptr);
1701 		else
1702 			tdb_munmap(tdb);
1703 	}
1704 	SAFE_FREE(tdb->name);
1705 	if (tdb->fd != -1)
1706 		if (close(tdb->fd) != 0)
1707 			TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1708 	SAFE_FREE(tdb->locked);
1709 	SAFE_FREE(tdb);
1710 	errno = save_errno;
1711 	return NULL;
1712 	}
1713 }
1714 
1715 /**
1716  * Close a database.
1717  *
1718  * @returns -1 for error; 0 for success.
1719  **/
1720 int tdb_close(TDB_CONTEXT *tdb)
1721 {
1722 	TDB_CONTEXT **i;
1723 	int ret = 0;
1724 
1725 	if (tdb->map_ptr) {
1726 		if (tdb->flags & TDB_INTERNAL)
1727 			SAFE_FREE(tdb->map_ptr);
1728 		else
1729 			tdb_munmap(tdb);
1730 	}
1731 	SAFE_FREE(tdb->name);
1732 	if (tdb->fd != -1)
1733 		ret = close(tdb->fd);
1734 	SAFE_FREE(tdb->locked);
1735 
1736 	/* Remove from contexts list */
1737 	for (i = &tdbs; *i; i = &(*i)->next) {
1738 		if (*i == tdb) {
1739 			*i = tdb->next;
1740 			break;
1741 		}
1742 	}
1743 
1744 	memset(tdb, 0, sizeof(*tdb));
1745 	SAFE_FREE(tdb);
1746 
1747 	return ret;
1748 }
1749