1 /*
2 Unix SMB/CIFS implementation.
3
4 trivial database library
5
6 Copyright (C) Andrew Tridgell 1999-2004
7 Copyright (C) Paul `Rusty' Russell 2000
8 Copyright (C) Jeremy Allison 2000-2003
9
10 ** NOTE! The following LGPL license applies to the tdb
11 ** library. This does NOT imply that all of Samba is released
12 ** under the LGPL
13
14 This library is free software; you can redistribute it and/or
15 modify it under the terms of the GNU Lesser General Public
16 License as published by the Free Software Foundation; either
17 version 2 of the License, or (at your option) any later version.
18
19 This library is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 Lesser General Public License for more details.
23
24 You should have received a copy of the GNU Lesser General Public
25 License along with this library; If not, see <http://www.gnu.org/licenses/>.
26 */
27
28
29 #ifndef _SAMBA_BUILD_
30 #ifdef HAVE_CONFIG_H
31 #include <config.h>
32 #endif
33
34 #include <stdlib.h>
35 #include <stdio.h>
36 #include <stdint.h>
37 #include <fcntl.h>
38 #include <unistd.h>
39 #include <string.h>
40 #include <fcntl.h>
41 #include <errno.h>
42 #include <sys/mman.h>
43 #include <sys/stat.h>
44 #include "tdb.h"
45 #include <stdarg.h>
46 #include "talloc.h"
47 #undef HAVE_MMAP
48 #else
49 #include "includes.h"
50 #include "lib/tdb/include/tdb.h"
51 #include "system/time.h"
52 #include "system/shmem.h"
53 #include "system/filesys.h"
54 #endif
55
56 #define TDB_MAGIC_FOOD "TDB file\n"
57 #define TDB_VERSION (0x26011967 + 7)
58 #define TDB_MAGIC (0x26011999U)
59 #define TDB_FREE_MAGIC (~TDB_MAGIC)
60 #define TDB_DEAD_MAGIC (0xFEE1DEAD)
61 #define TDB_ALIGNMENT 4
62 #define MIN_REC_SIZE (2*sizeof(struct list_struct) + TDB_ALIGNMENT)
63 #define DEFAULT_HASH_SIZE 131
64 #define TDB_PAGE_SIZE 0x2000
65 #define FREELIST_TOP (sizeof(struct tdb_header))
66 #define TDB_ALIGN(x,a) (((x) + (a)-1) & ~((a)-1))
67 #define TDB_BYTEREV(x) (((((x)&0xff)<<24)|((x)&0xFF00)<<8)|(((x)>>8)&0xFF00)|((x)>>24))
68 #define TDB_DEAD(r) ((r)->magic == TDB_DEAD_MAGIC)
69 #define TDB_BAD_MAGIC(r) ((r)->magic != TDB_MAGIC && !TDB_DEAD(r))
70 #define TDB_HASH_TOP(hash) (FREELIST_TOP + (BUCKET(hash)+1)*sizeof(tdb_off))
71 #define TDB_DATA_START(hash_size) (TDB_HASH_TOP(hash_size-1))
72
73
74 /* NB assumes there is a local variable called "tdb" that is the
75 * current context, also takes doubly-parenthesized print-style
76 * argument. */
77 #define TDB_LOG(x) tdb->log_fn x
78
79 /* lock offsets */
80 #define GLOBAL_LOCK 0
81 #define ACTIVE_LOCK 4
82
83 #ifndef MAP_FILE
84 #define MAP_FILE 0
85 #endif
86
87 #ifndef MAP_FAILED
88 #define MAP_FAILED ((void *)-1)
89 #endif
90
91 #ifndef discard_const_p
92 # if defined(__intptr_t_defined) || defined(HAVE_INTPTR_T)
93 # define discard_const(ptr) ((void *)((intptr_t)(ptr)))
94 # else
95 # define discard_const(ptr) ((void *)(ptr))
96 # endif
97 # define discard_const_p(type, ptr) ((type *)discard_const(ptr))
98 #endif
99
100 /* free memory if the pointer is valid and zero the pointer */
101 #ifndef SAFE_FREE
102 #define SAFE_FREE(x) do { if ((x) != NULL) {talloc_free(discard_const_p(void *, (x))); (x)=NULL;} } while(0)
103 #endif
104
105 #define BUCKET(hash) ((hash) % tdb->header.hash_size)
106 static TDB_DATA tdb_null;
107
108 /* all contexts, to ensure no double-opens (fcntl locks don't nest!) */
109 static TDB_CONTEXT *tdbs = NULL;
110
tdb_munmap(TDB_CONTEXT * tdb)111 static int tdb_munmap(TDB_CONTEXT *tdb)
112 {
113 if (tdb->flags & TDB_INTERNAL)
114 return 0;
115
116 #ifdef HAVE_MMAP
117 if (tdb->map_ptr) {
118 int ret = munmap(tdb->map_ptr, tdb->map_size);
119 if (ret != 0)
120 return ret;
121 }
122 #endif
123 tdb->map_ptr = NULL;
124 return 0;
125 }
126
tdb_mmap(TDB_CONTEXT * tdb)127 static void tdb_mmap(TDB_CONTEXT *tdb)
128 {
129 if (tdb->flags & TDB_INTERNAL)
130 return;
131
132 #ifdef HAVE_MMAP
133 if (!(tdb->flags & TDB_NOMMAP)) {
134 tdb->map_ptr = mmap(NULL, tdb->map_size,
135 PROT_READ|(tdb->read_only? 0:PROT_WRITE),
136 MAP_SHARED|MAP_FILE, tdb->fd, 0);
137
138 /*
139 * NB. When mmap fails it returns MAP_FAILED *NOT* NULL !!!!
140 */
141
142 if (tdb->map_ptr == MAP_FAILED) {
143 tdb->map_ptr = NULL;
144 TDB_LOG((tdb, 2, "tdb_mmap failed for size %d (%s)\n",
145 tdb->map_size, strerror(errno)));
146 }
147 } else {
148 tdb->map_ptr = NULL;
149 }
150 #else
151 tdb->map_ptr = NULL;
152 #endif
153 }
154
155 /* Endian conversion: we only ever deal with 4 byte quantities */
convert(void * buf,uint32_t size)156 static void *convert(void *buf, uint32_t size)
157 {
158 uint32_t i, *p = buf;
159 for (i = 0; i < size / 4; i++)
160 p[i] = TDB_BYTEREV(p[i]);
161 return buf;
162 }
163 #define DOCONV() (tdb->flags & TDB_CONVERT)
164 #define CONVERT(x) (DOCONV() ? convert(&x, sizeof(x)) : &x)
165
166 /* the body of the database is made of one list_struct for the free space
167 plus a separate data list for each hash value */
168 struct list_struct {
169 tdb_off next; /* offset of the next record in the list */
170 tdb_len rec_len; /* total byte length of record */
171 tdb_len key_len; /* byte length of key */
172 tdb_len data_len; /* byte length of data */
173 uint32_t full_hash; /* the full 32 bit hash of the key */
174 uint32_t magic; /* try to catch errors */
175 /* the following union is implied:
176 union {
177 char record[rec_len];
178 struct {
179 char key[key_len];
180 char data[data_len];
181 }
182 uint32_t totalsize; (tailer)
183 }
184 */
185 };
186
187 /* a byte range locking function - return 0 on success
188 this functions locks/unlocks 1 byte at the specified offset.
189
190 On error, errno is also set so that errors are passed back properly
191 through tdb_open(). */
tdb_brlock(TDB_CONTEXT * tdb,tdb_off offset,int rw_type,int lck_type,int probe)192 static int tdb_brlock(TDB_CONTEXT *tdb, tdb_off offset,
193 int rw_type, int lck_type, int probe)
194 {
195 struct flock fl;
196 int ret;
197
198 if (tdb->flags & TDB_NOLOCK)
199 return 0;
200 if ((rw_type == F_WRLCK) && (tdb->read_only)) {
201 errno = EACCES;
202 return -1;
203 }
204
205 fl.l_type = rw_type;
206 fl.l_whence = SEEK_SET;
207 fl.l_start = offset;
208 fl.l_len = 1;
209 fl.l_pid = 0;
210
211 do {
212 ret = fcntl(tdb->fd,lck_type,&fl);
213 } while (ret == -1 && errno == EINTR);
214
215 if (ret == -1) {
216 if (!probe && lck_type != F_SETLK) {
217 /* Ensure error code is set for log fun to examine. */
218 tdb->ecode = TDB_ERR_LOCK;
219 TDB_LOG((tdb, 5,"tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d\n",
220 tdb->fd, offset, rw_type, lck_type));
221 }
222 /* Generic lock error. errno set by fcntl.
223 * EAGAIN is an expected return from non-blocking
224 * locks. */
225 if (errno != EAGAIN) {
226 TDB_LOG((tdb, 5, "tdb_brlock failed (fd=%d) at offset %d rw_type=%d lck_type=%d: %s\n",
227 tdb->fd, offset, rw_type, lck_type,
228 strerror(errno)));
229 }
230 return TDB_ERRCODE(TDB_ERR_LOCK, -1);
231 }
232 return 0;
233 }
234
235 /* lock a list in the database. list -1 is the alloc list */
tdb_lock(TDB_CONTEXT * tdb,int list,int ltype)236 static int tdb_lock(TDB_CONTEXT *tdb, int list, int ltype)
237 {
238 if (list < -1 || list >= (int)tdb->header.hash_size) {
239 TDB_LOG((tdb, 0,"tdb_lock: invalid list %d for ltype=%d\n",
240 list, ltype));
241 return -1;
242 }
243 if (tdb->flags & TDB_NOLOCK)
244 return 0;
245
246 /* Since fcntl locks don't nest, we do a lock for the first one,
247 and simply bump the count for future ones */
248 if (tdb->locked[list+1].count == 0) {
249 if (tdb_brlock(tdb,FREELIST_TOP+4*list,ltype,F_SETLKW, 0)) {
250 TDB_LOG((tdb, 0,"tdb_lock failed on list %d ltype=%d (%s)\n",
251 list, ltype, strerror(errno)));
252 return -1;
253 }
254 tdb->locked[list+1].ltype = ltype;
255 }
256 tdb->locked[list+1].count++;
257 return 0;
258 }
259
260 /* unlock the database: returns void because it's too late for errors. */
261 /* changed to return int it may be interesting to know there
262 has been an error --simo */
tdb_unlock(TDB_CONTEXT * tdb,int list,int ltype)263 static int tdb_unlock(TDB_CONTEXT *tdb, int list,
264 int ltype __attribute__((unused)))
265 {
266 int ret = -1;
267
268 if (tdb->flags & TDB_NOLOCK)
269 return 0;
270
271 /* Sanity checks */
272 if (list < -1 || list >= (int)tdb->header.hash_size) {
273 TDB_LOG((tdb, 0, "tdb_unlock: list %d invalid (%d)\n", list, tdb->header.hash_size));
274 return ret;
275 }
276
277 if (tdb->locked[list+1].count==0) {
278 TDB_LOG((tdb, 0, "tdb_unlock: count is 0\n"));
279 return ret;
280 }
281
282 if (tdb->locked[list+1].count == 1) {
283 /* Down to last nested lock: unlock underneath */
284 ret = tdb_brlock(tdb, FREELIST_TOP+4*list, F_UNLCK, F_SETLKW, 0);
285 } else {
286 ret = 0;
287 }
288 tdb->locked[list+1].count--;
289
290 if (ret)
291 TDB_LOG((tdb, 0,"tdb_unlock: An error occurred unlocking!\n"));
292 return ret;
293 }
294
295 /* This is based on the hash algorithm from gdbm */
default_tdb_hash(TDB_DATA * key)296 static uint32_t default_tdb_hash(TDB_DATA *key)
297 {
298 uint32_t value; /* Used to compute the hash value. */
299 uint32_t i; /* Used to cycle through random values. */
300
301 /* Set the initial value from the key size. */
302 for (value = 0x238F13AF * key->dsize, i=0; i < key->dsize; i++)
303 value = (value + (key->dptr[i] << (i*5 % 24)));
304
305 return (1103515243 * value + 12345);
306 }
307
308 /* check for an out of bounds access - if it is out of bounds then
309 see if the database has been expanded by someone else and expand
310 if necessary
311 note that "len" is the minimum length needed for the db
312 */
tdb_oob(TDB_CONTEXT * tdb,tdb_off len,int probe)313 static int tdb_oob(TDB_CONTEXT *tdb, tdb_off len, int probe)
314 {
315 struct stat st;
316 if (len <= tdb->map_size)
317 return 0;
318 if (tdb->flags & TDB_INTERNAL) {
319 if (!probe) {
320 /* Ensure ecode is set for log fn. */
321 tdb->ecode = TDB_ERR_IO;
322 TDB_LOG((tdb, 0,"tdb_oob len %d beyond internal malloc size %d\n",
323 (int)len, (int)tdb->map_size));
324 }
325 return TDB_ERRCODE(TDB_ERR_IO, -1);
326 }
327
328 if (fstat(tdb->fd, &st) == -1)
329 return TDB_ERRCODE(TDB_ERR_IO, -1);
330
331 if (st.st_size < (off_t)len) {
332 if (!probe) {
333 /* Ensure ecode is set for log fn. */
334 tdb->ecode = TDB_ERR_IO;
335 TDB_LOG((tdb, 0,"tdb_oob len %d beyond eof at %d\n",
336 (int)len, (int)st.st_size));
337 }
338 return TDB_ERRCODE(TDB_ERR_IO, -1);
339 }
340
341 /* Unmap, update size, remap */
342 if (tdb_munmap(tdb) == -1)
343 return TDB_ERRCODE(TDB_ERR_IO, -1);
344 tdb->map_size = st.st_size;
345 tdb_mmap(tdb);
346 return 0;
347 }
348
349 /* write a lump of data at a specified offset */
tdb_write(TDB_CONTEXT * tdb,tdb_off off,void * buf,tdb_len len)350 static int tdb_write(TDB_CONTEXT *tdb, tdb_off off, void *buf, tdb_len len)
351 {
352 if (tdb_oob(tdb, off + len, 0) != 0)
353 return -1;
354
355 if (tdb->map_ptr)
356 memcpy(off + (char *)tdb->map_ptr, buf, len);
357 #ifdef HAVE_PWRITE
358 else if (pwrite(tdb->fd, buf, len, off) != (ssize_t)len) {
359 #else
360 else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
361 || write(tdb->fd, buf, len) != (off_t)len) {
362 #endif
363 /* Ensure ecode is set for log fn. */
364 tdb->ecode = TDB_ERR_IO;
365 TDB_LOG((tdb, 0,"tdb_write failed at %d len=%d (%s)\n",
366 off, len, strerror(errno)));
367 return TDB_ERRCODE(TDB_ERR_IO, -1);
368 }
369 return 0;
370 }
371
372 /* read a lump of data at a specified offset, maybe convert */
373 static int tdb_read(TDB_CONTEXT *tdb,tdb_off off,void *buf,tdb_len len,int cv)
374 {
375 if (tdb_oob(tdb, off + len, 0) != 0)
376 return -1;
377
378 if (tdb->map_ptr)
379 memcpy(buf, off + (char *)tdb->map_ptr, len);
380 #ifdef HAVE_PREAD
381 else if (pread(tdb->fd, buf, len, off) != (off_t)len) {
382 #else
383 else if (lseek(tdb->fd, off, SEEK_SET) != (off_t)off
384 || read(tdb->fd, buf, len) != (off_t)len) {
385 #endif
386 /* Ensure ecode is set for log fn. */
387 tdb->ecode = TDB_ERR_IO;
388 TDB_LOG((tdb, 0,"tdb_read failed at %d len=%d (%s)\n",
389 off, len, strerror(errno)));
390 return TDB_ERRCODE(TDB_ERR_IO, -1);
391 }
392 if (cv)
393 convert(buf, len);
394 return 0;
395 }
396
397 /* don't allocate memory: used in tdb_delete path. */
398 static int tdb_key_eq(TDB_CONTEXT *tdb, tdb_off off, TDB_DATA key)
399 {
400 char buf[64];
401 uint32_t len;
402
403 if (tdb_oob(tdb, off + key.dsize, 0) != 0)
404 return -1;
405
406 if (tdb->map_ptr)
407 return !memcmp(off + (char*)tdb->map_ptr, key.dptr, key.dsize);
408
409 while (key.dsize) {
410 len = key.dsize;
411 if (len > sizeof(buf))
412 len = sizeof(buf);
413 if (tdb_read(tdb, off, buf, len, 0) != 0)
414 return -1;
415 if (memcmp(buf, key.dptr, len) != 0)
416 return 0;
417 key.dptr += len;
418 key.dsize -= len;
419 off += len;
420 }
421 return 1;
422 }
423
424 /* read a lump of data, allocating the space for it */
425 static char *tdb_alloc_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_len len)
426 {
427 char *buf;
428
429 if (!(buf = talloc_size(tdb, len))) {
430 /* Ensure ecode is set for log fn. */
431 tdb->ecode = TDB_ERR_OOM;
432 TDB_LOG((tdb, 0,"tdb_alloc_read malloc failed len=%d (%s)\n",
433 len, strerror(errno)));
434 return TDB_ERRCODE(TDB_ERR_OOM, buf);
435 }
436 if (tdb_read(tdb, offset, buf, len, 0) == -1) {
437 SAFE_FREE(buf);
438 return NULL;
439 }
440 return buf;
441 }
442
443 /* read/write a tdb_off */
444 static int ofs_read(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
445 {
446 return tdb_read(tdb, offset, (char*)d, sizeof(*d), DOCONV());
447 }
448 static int ofs_write(TDB_CONTEXT *tdb, tdb_off offset, tdb_off *d)
449 {
450 tdb_off off = *d;
451 return tdb_write(tdb, offset, CONVERT(off), sizeof(*d));
452 }
453
454 /* read/write a record */
455 static int rec_read(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
456 {
457 if (tdb_read(tdb, offset, rec, sizeof(*rec),DOCONV()) == -1)
458 return -1;
459 if (TDB_BAD_MAGIC(rec)) {
460 /* Ensure ecode is set for log fn. */
461 tdb->ecode = TDB_ERR_CORRUPT;
462 TDB_LOG((tdb, 0,"rec_read bad magic 0x%x at offset=%d\n", rec->magic, offset));
463 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
464 }
465 return tdb_oob(tdb, rec->next+sizeof(*rec), 0);
466 }
467 static int rec_write(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
468 {
469 struct list_struct r = *rec;
470 return tdb_write(tdb, offset, CONVERT(r), sizeof(r));
471 }
472
473 /* read a freelist record and check for simple errors */
474 static int rec_free_read(TDB_CONTEXT *tdb, tdb_off off, struct list_struct *rec)
475 {
476 if (tdb_read(tdb, off, rec, sizeof(*rec),DOCONV()) == -1)
477 return -1;
478
479 if (rec->magic == TDB_MAGIC) {
480 /* this happens when a app is showdown while deleting a record - we should
481 not completely fail when this happens */
482 TDB_LOG((tdb, 0,"rec_free_read non-free magic 0x%x at offset=%d - fixing\n",
483 rec->magic, off));
484 rec->magic = TDB_FREE_MAGIC;
485 if (tdb_write(tdb, off, rec, sizeof(*rec)) == -1)
486 return -1;
487 }
488
489 if (rec->magic != TDB_FREE_MAGIC) {
490 /* Ensure ecode is set for log fn. */
491 tdb->ecode = TDB_ERR_CORRUPT;
492 TDB_LOG((tdb, 0,"rec_free_read bad magic 0x%x at offset=%d\n",
493 rec->magic, off));
494 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
495 }
496 if (tdb_oob(tdb, rec->next+sizeof(*rec), 0) != 0)
497 return -1;
498 return 0;
499 }
500
501 /* update a record tailer (must hold allocation lock) */
502 static int update_tailer(TDB_CONTEXT *tdb, tdb_off offset,
503 const struct list_struct *rec)
504 {
505 tdb_off totalsize;
506
507 /* Offset of tailer from record header */
508 totalsize = sizeof(*rec) + rec->rec_len;
509 return ofs_write(tdb, offset + totalsize - sizeof(tdb_off),
510 &totalsize);
511 }
512
513 /* Remove an element from the freelist. Must have alloc lock. */
514 static int remove_from_freelist(TDB_CONTEXT *tdb, tdb_off off, tdb_off next)
515 {
516 tdb_off last_ptr, i;
517
518 /* read in the freelist top */
519 last_ptr = FREELIST_TOP;
520 while (ofs_read(tdb, last_ptr, &i) != -1 && i != 0) {
521 if (i == off) {
522 /* We've found it! */
523 return ofs_write(tdb, last_ptr, &next);
524 }
525 /* Follow chain (next offset is at start of record) */
526 last_ptr = i;
527 }
528 TDB_LOG((tdb, 0,"remove_from_freelist: not on list at off=%d\n", off));
529 return TDB_ERRCODE(TDB_ERR_CORRUPT, -1);
530 }
531
532 /* Add an element into the freelist. Merge adjacent records if
533 neccessary. */
534 static int tdb_free(TDB_CONTEXT *tdb, tdb_off offset, struct list_struct *rec)
535 {
536 tdb_off right, left;
537
538 /* Allocation and tailer lock */
539 if (tdb_lock(tdb, -1, F_WRLCK) != 0)
540 return -1;
541
542 /* set an initial tailer, so if we fail we don't leave a bogus record */
543 if (update_tailer(tdb, offset, rec) != 0) {
544 TDB_LOG((tdb, 0, "tdb_free: upfate_tailer failed!\n"));
545 goto fail;
546 }
547
548 /* Look right first (I'm an Australian, dammit) */
549 right = offset + sizeof(*rec) + rec->rec_len;
550 if (right + sizeof(*rec) <= tdb->map_size) {
551 struct list_struct r;
552
553 if (tdb_read(tdb, right, &r, sizeof(r), DOCONV()) == -1) {
554 TDB_LOG((tdb, 0, "tdb_free: right read failed at %u\n", right));
555 goto left;
556 }
557
558 /* If it's free, expand to include it. */
559 if (r.magic == TDB_FREE_MAGIC) {
560 if (remove_from_freelist(tdb, right, r.next) == -1) {
561 TDB_LOG((tdb, 0, "tdb_free: right free failed at %u\n", right));
562 goto left;
563 }
564 rec->rec_len += sizeof(r) + r.rec_len;
565 }
566 }
567
568 left:
569 /* Look left */
570 left = offset - sizeof(tdb_off);
571 if (left > TDB_DATA_START(tdb->header.hash_size)) {
572 struct list_struct l;
573 tdb_off leftsize;
574
575 /* Read in tailer and jump back to header */
576 if (ofs_read(tdb, left, &leftsize) == -1) {
577 TDB_LOG((tdb, 0, "tdb_free: left offset read failed at %u\n", left));
578 goto update;
579 }
580 left = offset - leftsize;
581
582 /* Now read in record */
583 if (tdb_read(tdb, left, &l, sizeof(l), DOCONV()) == -1) {
584 TDB_LOG((tdb, 0, "tdb_free: left read failed at %u (%u)\n", left, leftsize));
585 goto update;
586 }
587
588 /* If it's free, expand to include it. */
589 if (l.magic == TDB_FREE_MAGIC) {
590 if (remove_from_freelist(tdb, left, l.next) == -1) {
591 TDB_LOG((tdb, 0, "tdb_free: left free failed at %u\n", left));
592 goto update;
593 } else {
594 offset = left;
595 rec->rec_len += leftsize;
596 }
597 }
598 }
599
600 update:
601 if (update_tailer(tdb, offset, rec) == -1) {
602 TDB_LOG((tdb, 0, "tdb_free: update_tailer failed at %u\n", offset));
603 goto fail;
604 }
605
606 /* Now, prepend to free list */
607 rec->magic = TDB_FREE_MAGIC;
608
609 if (ofs_read(tdb, FREELIST_TOP, &rec->next) == -1 ||
610 rec_write(tdb, offset, rec) == -1 ||
611 ofs_write(tdb, FREELIST_TOP, &offset) == -1) {
612 TDB_LOG((tdb, 0, "tdb_free record write failed at offset=%d\n", offset));
613 goto fail;
614 }
615
616 /* And we're done. */
617 tdb_unlock(tdb, -1, F_WRLCK);
618 return 0;
619
620 fail:
621 tdb_unlock(tdb, -1, F_WRLCK);
622 return -1;
623 }
624
625
626 /* expand a file. we prefer to use ftruncate, as that is what posix
627 says to use for mmap expansion */
628 static int expand_file(TDB_CONTEXT *tdb, tdb_off size, tdb_off addition)
629 {
630 char buf[1024];
631 #ifdef HAVE_FTRUNCATE_EXTEND
632 if (ftruncate(tdb->fd, size+addition) != 0) {
633 TDB_LOG((tdb, 0, "expand_file ftruncate to %d failed (%s)\n",
634 size+addition, strerror(errno)));
635 return -1;
636 }
637 #else
638 char b = 0;
639
640 #ifdef HAVE_PWRITE
641 if (pwrite(tdb->fd, &b, 1, (size+addition) - 1) != 1) {
642 #else
643 if (lseek(tdb->fd, (size+addition) - 1, SEEK_SET) != (off_t)(size+addition) - 1 ||
644 write(tdb->fd, &b, 1) != 1) {
645 #endif
646 TDB_LOG((tdb, 0, "expand_file to %d failed (%s)\n",
647 size+addition, strerror(errno)));
648 return -1;
649 }
650 #endif
651
652 /* now fill the file with something. This ensures that the file isn't sparse, which would be
653 very bad if we ran out of disk. This must be done with write, not via mmap */
654 memset(buf, 0x42, sizeof(buf));
655 while (addition) {
656 int n = addition>sizeof(buf)?sizeof(buf):addition;
657 #ifdef HAVE_PWRITE
658 int ret = pwrite(tdb->fd, buf, n, size);
659 #else
660 int ret;
661 if (lseek(tdb->fd, size, SEEK_SET) != (off_t)size)
662 return -1;
663 ret = write(tdb->fd, buf, n);
664 #endif
665 if (ret != n) {
666 TDB_LOG((tdb, 0, "expand_file write of %d failed (%s)\n",
667 n, strerror(errno)));
668 return -1;
669 }
670 addition -= n;
671 size += n;
672 }
673 return 0;
674 }
675
676
677 /* expand the database at least size bytes by expanding the underlying
678 file and doing the mmap again if necessary */
679 static int tdb_expand(TDB_CONTEXT *tdb, tdb_off size)
680 {
681 struct list_struct rec;
682 tdb_off offset;
683
684 if (tdb_lock(tdb, -1, F_WRLCK) == -1) {
685 TDB_LOG((tdb, 0, "lock failed in tdb_expand\n"));
686 return -1;
687 }
688
689 /* must know about any previous expansions by another process */
690 tdb_oob(tdb, tdb->map_size + 1, 1);
691
692 /* always make room for at least 10 more records, and round
693 the database up to a multiple of TDB_PAGE_SIZE */
694 size = TDB_ALIGN(tdb->map_size + size*10, TDB_PAGE_SIZE) - tdb->map_size;
695
696 if (!(tdb->flags & TDB_INTERNAL))
697 tdb_munmap(tdb);
698
699 /*
700 * We must ensure the file is unmapped before doing this
701 * to ensure consistency with systems like OpenBSD where
702 * writes and mmaps are not consistent.
703 */
704
705 /* expand the file itself */
706 if (!(tdb->flags & TDB_INTERNAL)) {
707 if (expand_file(tdb, tdb->map_size, size) != 0)
708 goto fail;
709 }
710
711 tdb->map_size += size;
712
713 if (tdb->flags & TDB_INTERNAL) {
714 char *new_map_ptr = talloc_realloc_size(tdb, tdb->map_ptr,
715 tdb->map_size);
716 if (!new_map_ptr) {
717 tdb->map_size -= size;
718 goto fail;
719 }
720 tdb->map_ptr = new_map_ptr;
721 } else {
722 /*
723 * We must ensure the file is remapped before adding the space
724 * to ensure consistency with systems like OpenBSD where
725 * writes and mmaps are not consistent.
726 */
727
728 /* We're ok if the mmap fails as we'll fallback to read/write */
729 tdb_mmap(tdb);
730 }
731
732 /* form a new freelist record */
733 memset(&rec,'\0',sizeof(rec));
734 rec.rec_len = size - sizeof(rec);
735
736 /* link it into the free list */
737 offset = tdb->map_size - size;
738 if (tdb_free(tdb, offset, &rec) == -1)
739 goto fail;
740
741 tdb_unlock(tdb, -1, F_WRLCK);
742 return 0;
743 fail:
744 tdb_unlock(tdb, -1, F_WRLCK);
745 return -1;
746 }
747
748
749 /*
750 the core of tdb_allocate - called when we have decided which
751 free list entry to use
752 */
753 static tdb_off tdb_allocate_ofs(TDB_CONTEXT *tdb, tdb_len length, tdb_off rec_ptr,
754 struct list_struct *rec, tdb_off last_ptr)
755 {
756 struct list_struct newrec;
757 tdb_off newrec_ptr;
758
759 memset(&newrec, '\0', sizeof(newrec));
760
761 /* found it - now possibly split it up */
762 if (rec->rec_len > length + MIN_REC_SIZE) {
763 /* Length of left piece */
764 length = TDB_ALIGN(length, TDB_ALIGNMENT);
765
766 /* Right piece to go on free list */
767 newrec.rec_len = rec->rec_len - (sizeof(*rec) + length);
768 newrec_ptr = rec_ptr + sizeof(*rec) + length;
769
770 /* And left record is shortened */
771 rec->rec_len = length;
772 } else {
773 newrec_ptr = 0;
774 }
775
776 /* Remove allocated record from the free list */
777 if (ofs_write(tdb, last_ptr, &rec->next) == -1) {
778 return 0;
779 }
780
781 /* Update header: do this before we drop alloc
782 lock, otherwise tdb_free() might try to
783 merge with us, thinking we're free.
784 (Thanks Jeremy Allison). */
785 rec->magic = TDB_MAGIC;
786 if (rec_write(tdb, rec_ptr, rec) == -1) {
787 return 0;
788 }
789
790 /* Did we create new block? */
791 if (newrec_ptr) {
792 /* Update allocated record tailer (we
793 shortened it). */
794 if (update_tailer(tdb, rec_ptr, rec) == -1) {
795 return 0;
796 }
797
798 /* Free new record */
799 if (tdb_free(tdb, newrec_ptr, &newrec) == -1) {
800 return 0;
801 }
802 }
803
804 /* all done - return the new record offset */
805 return rec_ptr;
806 }
807
808 /* allocate some space from the free list. The offset returned points
809 to a unconnected list_struct within the database with room for at
810 least length bytes of total data
811
812 0 is returned if the space could not be allocated
813 */
814 static tdb_off tdb_allocate(TDB_CONTEXT *tdb, tdb_len length,
815 struct list_struct *rec)
816 {
817 tdb_off rec_ptr, last_ptr, newrec_ptr;
818 struct {
819 tdb_off rec_ptr, last_ptr;
820 tdb_len rec_len;
821 } bestfit = { 0, 0, 0 };
822
823 if (tdb_lock(tdb, -1, F_WRLCK) == -1)
824 return 0;
825
826 /* Extra bytes required for tailer */
827 length += sizeof(tdb_off);
828
829 again:
830 last_ptr = FREELIST_TOP;
831
832 /* read in the freelist top */
833 if (ofs_read(tdb, FREELIST_TOP, &rec_ptr) == -1)
834 goto fail;
835
836 bestfit.rec_ptr = 0;
837
838 /*
839 this is a best fit allocation strategy. Originally we used
840 a first fit strategy, but it suffered from massive fragmentation
841 issues when faced with a slowly increasing record size.
842 */
843 while (rec_ptr) {
844 if (rec_free_read(tdb, rec_ptr, rec) == -1) {
845 goto fail;
846 }
847
848 if (rec->rec_len >= length) {
849 if (bestfit.rec_ptr == 0 ||
850 rec->rec_len < bestfit.rec_len) {
851 bestfit.rec_len = rec->rec_len;
852 bestfit.rec_ptr = rec_ptr;
853 bestfit.last_ptr = last_ptr;
854 /* consider a fit to be good enough if we aren't wasting more than half the space */
855 if (bestfit.rec_len < 2*length) {
856 break;
857 }
858 }
859 }
860
861 /* move to the next record */
862 last_ptr = rec_ptr;
863 rec_ptr = rec->next;
864 }
865
866 if (bestfit.rec_ptr != 0) {
867 if (rec_free_read(tdb, bestfit.rec_ptr, rec) == -1) {
868 goto fail;
869 }
870
871 newrec_ptr = tdb_allocate_ofs(tdb, length, bestfit.rec_ptr, rec, bestfit.last_ptr);
872 tdb_unlock(tdb, -1, F_WRLCK);
873 return newrec_ptr;
874 }
875
876 /* we didn't find enough space. See if we can expand the
877 database and if we can then try again */
878 if (tdb_expand(tdb, length + sizeof(*rec)) == 0)
879 goto again;
880 fail:
881 tdb_unlock(tdb, -1, F_WRLCK);
882 return 0;
883 }
884
885 /* initialise a new database with a specified hash size */
886 static int tdb_new_database(TDB_CONTEXT *tdb, int hash_size)
887 {
888 struct tdb_header *newdb;
889 int size, ret = -1;
890
891 /* We make it up in memory, then write it out if not internal */
892 size = sizeof(struct tdb_header) + (hash_size+1)*sizeof(tdb_off);
893 if (!(newdb = talloc_zero_size(tdb, size)))
894 return TDB_ERRCODE(TDB_ERR_OOM, -1);
895
896 /* Fill in the header */
897 newdb->version = TDB_VERSION;
898 newdb->hash_size = hash_size;
899 if (tdb->flags & TDB_INTERNAL) {
900 tdb->map_size = size;
901 tdb->map_ptr = (char *)newdb;
902 memcpy(&tdb->header, newdb, sizeof(tdb->header));
903 /* Convert the `ondisk' version if asked. */
904 CONVERT(*newdb);
905 return 0;
906 }
907 if (lseek(tdb->fd, 0, SEEK_SET) == -1)
908 goto fail;
909
910 if (ftruncate(tdb->fd, 0) == -1)
911 goto fail;
912
913 /* This creates an endian-converted header, as if read from disk */
914 CONVERT(*newdb);
915 memcpy(&tdb->header, newdb, sizeof(tdb->header));
916 /* Don't endian-convert the magic food! */
917 memcpy(newdb->magic_food, TDB_MAGIC_FOOD, strlen(TDB_MAGIC_FOOD)+1);
918 if (write(tdb->fd, newdb, size) != size)
919 ret = -1;
920 else
921 ret = 0;
922
923 fail:
924 SAFE_FREE(newdb);
925 return ret;
926 }
927
928 /* Returns 0 on fail. On success, return offset of record, and fills
929 in rec */
930 static tdb_off tdb_find(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash,
931 struct list_struct *r)
932 {
933 tdb_off rec_ptr;
934
935 /* read in the hash top */
936 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1)
937 return 0;
938
939 /* keep looking until we find the right record */
940 while (rec_ptr) {
941 if (rec_read(tdb, rec_ptr, r) == -1)
942 return 0;
943
944 if (!TDB_DEAD(r) && hash==r->full_hash && key.dsize==r->key_len) {
945 /* a very likely hit - read the key */
946 int cmp = tdb_key_eq(tdb, rec_ptr + sizeof(*r), key);
947 if (cmp < 0)
948 return 0;
949 else if (cmp > 0)
950 return rec_ptr;
951 }
952 rec_ptr = r->next;
953 }
954 return TDB_ERRCODE(TDB_ERR_NOEXIST, 0);
955 }
956
957 /* As tdb_find, but if you succeed, keep the lock */
958 static tdb_off tdb_find_lock_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash, int locktype,
959 struct list_struct *rec)
960 {
961 uint32_t rec_ptr;
962
963 if (tdb_lock(tdb, BUCKET(hash), locktype) == -1)
964 return 0;
965 if (!(rec_ptr = tdb_find(tdb, key, hash, rec)))
966 tdb_unlock(tdb, BUCKET(hash), locktype);
967 return rec_ptr;
968 }
969
970 enum TDB_ERROR tdb_error(TDB_CONTEXT *tdb)
971 {
972 return tdb->ecode;
973 }
974
975 static struct tdb_errname {
976 enum TDB_ERROR ecode; const char *estring;
977 } emap[] = { {TDB_SUCCESS, "Success"},
978 {TDB_ERR_CORRUPT, "Corrupt database"},
979 {TDB_ERR_IO, "IO Error"},
980 {TDB_ERR_LOCK, "Locking error"},
981 {TDB_ERR_OOM, "Out of memory"},
982 {TDB_ERR_EXISTS, "Record exists"},
983 {TDB_ERR_NOLOCK, "Lock exists on other keys"},
984 {TDB_ERR_NOEXIST, "Record does not exist"} };
985
986 /* Error string for the last tdb error */
987 const char *tdb_errorstr(TDB_CONTEXT *tdb)
988 {
989 uint32_t i;
990 for (i = 0; i < sizeof(emap) / sizeof(struct tdb_errname); i++)
991 if (tdb->ecode == emap[i].ecode)
992 return emap[i].estring;
993 return "Invalid error code";
994 }
995
996 /* update an entry in place - this only works if the new data size
997 is <= the old data size and the key exists.
998 on failure return -1.
999 */
1000
1001 static int tdb_update_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash, TDB_DATA dbuf)
1002 {
1003 struct list_struct rec;
1004 tdb_off rec_ptr;
1005
1006 /* find entry */
1007 if (!(rec_ptr = tdb_find(tdb, key, hash, &rec)))
1008 return -1;
1009
1010 /* must be long enough key, data and tailer */
1011 if (rec.rec_len < key.dsize + dbuf.dsize + sizeof(tdb_off)) {
1012 tdb->ecode = TDB_SUCCESS; /* Not really an error */
1013 return -1;
1014 }
1015
1016 if (tdb_write(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1017 dbuf.dptr, dbuf.dsize) == -1)
1018 return -1;
1019
1020 if (dbuf.dsize != rec.data_len) {
1021 /* update size */
1022 rec.data_len = dbuf.dsize;
1023 return rec_write(tdb, rec_ptr, &rec);
1024 }
1025
1026 return 0;
1027 }
1028
1029 /* find an entry in the database given a key */
1030 /* If an entry doesn't exist tdb_err will be set to
1031 * TDB_ERR_NOEXIST. If a key has no data attached
1032 * then the TDB_DATA will have zero length but
1033 * a non-zero pointer
1034 */
1035
1036 TDB_DATA tdb_fetch(TDB_CONTEXT *tdb, TDB_DATA key)
1037 {
1038 tdb_off rec_ptr;
1039 struct list_struct rec;
1040 TDB_DATA ret;
1041 uint32_t hash;
1042
1043 /* find which hash bucket it is in */
1044 hash = tdb->hash_fn(&key);
1045 if (!(rec_ptr = tdb_find_lock_hash(tdb,key,hash,F_RDLCK,&rec)))
1046 return tdb_null;
1047
1048 ret.dptr = tdb_alloc_read(tdb, rec_ptr + sizeof(rec) + rec.key_len,
1049 rec.data_len);
1050 ret.dsize = rec.data_len;
1051 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1052 return ret;
1053 }
1054
1055 /* check if an entry in the database exists
1056
1057 note that 1 is returned if the key is found and 0 is returned if not found
1058 this doesn't match the conventions in the rest of this module, but is
1059 compatible with gdbm
1060 */
1061 static int tdb_exists_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash)
1062 {
1063 struct list_struct rec;
1064
1065 if (tdb_find_lock_hash(tdb, key, hash, F_RDLCK, &rec) == 0)
1066 return 0;
1067 tdb_unlock(tdb, BUCKET(rec.full_hash), F_RDLCK);
1068 return 1;
1069 }
1070
1071 /* record lock stops delete underneath */
1072 static int lock_record(TDB_CONTEXT *tdb, tdb_off off)
1073 {
1074 return off ? tdb_brlock(tdb, off, F_RDLCK, F_SETLKW, 0) : 0;
1075 }
1076 /*
1077 Write locks override our own fcntl readlocks, so check it here.
1078 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1079 an error to fail to get the lock here.
1080 */
1081
1082 static int write_lock_record(TDB_CONTEXT *tdb, tdb_off off)
1083 {
1084 struct tdb_traverse_lock *i;
1085 for (i = &tdb->travlocks; i; i = i->next)
1086 if (i->off == off)
1087 return -1;
1088 return tdb_brlock(tdb, off, F_WRLCK, F_SETLK, 1);
1089 }
1090
1091 /*
1092 Note this is meant to be F_SETLK, *not* F_SETLKW, as it's not
1093 an error to fail to get the lock here.
1094 */
1095
1096 static int write_unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1097 {
1098 return tdb_brlock(tdb, off, F_UNLCK, F_SETLK, 0);
1099 }
1100 /* fcntl locks don't stack: avoid unlocking someone else's */
1101 static int unlock_record(TDB_CONTEXT *tdb, tdb_off off)
1102 {
1103 struct tdb_traverse_lock *i;
1104 uint32_t count = 0;
1105
1106 if (off == 0)
1107 return 0;
1108 for (i = &tdb->travlocks; i; i = i->next)
1109 if (i->off == off)
1110 count++;
1111 return (count == 1 ? tdb_brlock(tdb, off, F_UNLCK, F_SETLKW, 0) : 0);
1112 }
1113
1114 /* actually delete an entry in the database given the offset */
1115 static int do_delete(TDB_CONTEXT *tdb, tdb_off rec_ptr, struct list_struct*rec)
1116 {
1117 tdb_off last_ptr, i;
1118 struct list_struct lastrec;
1119
1120 if (tdb->read_only) return -1;
1121
1122 if (write_lock_record(tdb, rec_ptr) == -1) {
1123 /* Someone traversing here: mark it as dead */
1124 rec->magic = TDB_DEAD_MAGIC;
1125 return rec_write(tdb, rec_ptr, rec);
1126 }
1127 if (write_unlock_record(tdb, rec_ptr) != 0)
1128 return -1;
1129
1130 /* find previous record in hash chain */
1131 if (ofs_read(tdb, TDB_HASH_TOP(rec->full_hash), &i) == -1)
1132 return -1;
1133 for (last_ptr = 0; i != rec_ptr; last_ptr = i, i = lastrec.next)
1134 if (rec_read(tdb, i, &lastrec) == -1)
1135 return -1;
1136
1137 /* unlink it: next ptr is at start of record. */
1138 if (last_ptr == 0)
1139 last_ptr = TDB_HASH_TOP(rec->full_hash);
1140 if (ofs_write(tdb, last_ptr, &rec->next) == -1)
1141 return -1;
1142
1143 /* recover the space */
1144 if (tdb_free(tdb, rec_ptr, rec) == -1)
1145 return -1;
1146 return 0;
1147 }
1148
1149 /* Uses traverse lock: 0 = finish, -1 = error, other = record offset */
1150 static int tdb_next_lock(TDB_CONTEXT *tdb, struct tdb_traverse_lock *tlock,
1151 struct list_struct *rec)
1152 {
1153 int want_next = (tlock->off != 0);
1154
1155 /* Lock each chain from the start one. */
1156 for (; tlock->hash < tdb->header.hash_size; tlock->hash++) {
1157
1158 /* this is an optimisation for the common case where
1159 the hash chain is empty, which is particularly
1160 common for the use of tdb with ldb, where large
1161 hashes are used. In that case we spend most of our
1162 time in tdb_brlock(), locking empty hash chains.
1163
1164 To avoid this, we do an unlocked pre-check to see
1165 if the hash chain is empty before starting to look
1166 inside it. If it is empty then we can avoid that
1167 hash chain. If it isn't empty then we can't believe
1168 the value we get back, as we read it without a
1169 lock, so instead we get the lock and re-fetch the
1170 value below.
1171
1172 Notice that not doing this optimisation on the
1173 first hash chain is critical. We must guarantee
1174 that we have done at least one fcntl lock at the
1175 start of a search to guarantee that memory is
1176 coherent on SMP systems. If records are added by
1177 others during the search then thats OK, and we
1178 could possibly miss those with this trick, but we
1179 could miss them anyway without this trick, so the
1180 semantics don't change.
1181
1182 With a non-indexed ldb search this trick gains us a
1183 factor of around 80 in speed on a linux 2.6.x
1184 system (testing using ldbtest).
1185 */
1186 if (!tlock->off && tlock->hash != 0) {
1187 uint32_t off;
1188 if (tdb->map_ptr) {
1189 for (;tlock->hash < tdb->header.hash_size;tlock->hash++) {
1190 if (0 != *(uint32_t *)(TDB_HASH_TOP(tlock->hash) + (unsigned char *)tdb->map_ptr)) {
1191 break;
1192 }
1193 }
1194 if (tlock->hash == tdb->header.hash_size) {
1195 continue;
1196 }
1197 } else {
1198 if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash), &off) == 0 &&
1199 off == 0) {
1200 continue;
1201 }
1202 }
1203 }
1204
1205 if (tdb_lock(tdb, tlock->hash, F_WRLCK) == -1)
1206 return -1;
1207
1208 /* No previous record? Start at top of chain. */
1209 if (!tlock->off) {
1210 if (ofs_read(tdb, TDB_HASH_TOP(tlock->hash),
1211 &tlock->off) == -1)
1212 goto fail;
1213 } else {
1214 /* Otherwise unlock the previous record. */
1215 if (unlock_record(tdb, tlock->off) != 0)
1216 goto fail;
1217 }
1218
1219 if (want_next) {
1220 /* We have offset of old record: grab next */
1221 if (rec_read(tdb, tlock->off, rec) == -1)
1222 goto fail;
1223 tlock->off = rec->next;
1224 }
1225
1226 /* Iterate through chain */
1227 while( tlock->off) {
1228 tdb_off current;
1229 if (rec_read(tdb, tlock->off, rec) == -1)
1230 goto fail;
1231
1232 /* Detect infinite loops. From "Shlomi Yaakobovich" <Shlomi@exanet.com>. */
1233 if (tlock->off == rec->next) {
1234 TDB_LOG((tdb, 0, "tdb_next_lock: loop detected.\n"));
1235 goto fail;
1236 }
1237
1238 if (!TDB_DEAD(rec)) {
1239 /* Woohoo: we found one! */
1240 if (lock_record(tdb, tlock->off) != 0)
1241 goto fail;
1242 return tlock->off;
1243 }
1244
1245 /* Try to clean dead ones from old traverses */
1246 current = tlock->off;
1247 tlock->off = rec->next;
1248 if (!tdb->read_only &&
1249 do_delete(tdb, current, rec) != 0)
1250 goto fail;
1251 }
1252 tdb_unlock(tdb, tlock->hash, F_WRLCK);
1253 want_next = 0;
1254 }
1255 /* We finished iteration without finding anything */
1256 return TDB_ERRCODE(TDB_SUCCESS, 0);
1257
1258 fail:
1259 tlock->off = 0;
1260 if (tdb_unlock(tdb, tlock->hash, F_WRLCK) != 0)
1261 TDB_LOG((tdb, 0, "tdb_next_lock: On error unlock failed!\n"));
1262 return -1;
1263 }
1264
1265 /* traverse the entire database - calling fn(tdb, key, data) on each element.
1266 return -1 on error or the record count traversed
1267 if fn is NULL then it is not called
1268 a non-zero return value from fn() indicates that the traversal should stop
1269 */
1270 int tdb_traverse(TDB_CONTEXT *tdb, tdb_traverse_func fn, void *private)
1271 {
1272 TDB_DATA key, dbuf;
1273 struct list_struct rec;
1274 struct tdb_traverse_lock tl = { NULL, 0, 0 };
1275 int ret, count = 0;
1276
1277 /* This was in the initializaton, above, but the IRIX compiler
1278 * did not like it. crh
1279 */
1280 tl.next = tdb->travlocks.next;
1281
1282 /* fcntl locks don't stack: beware traverse inside traverse */
1283 tdb->travlocks.next = &tl;
1284
1285 /* tdb_next_lock places locks on the record returned, and its chain */
1286 while ((ret = tdb_next_lock(tdb, &tl, &rec)) > 0) {
1287 count++;
1288 /* now read the full record */
1289 key.dptr = tdb_alloc_read(tdb, tl.off + sizeof(rec),
1290 rec.key_len + rec.data_len);
1291 if (!key.dptr) {
1292 ret = -1;
1293 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0)
1294 goto out;
1295 if (unlock_record(tdb, tl.off) != 0)
1296 TDB_LOG((tdb, 0, "tdb_traverse: key.dptr == NULL and unlock_record failed!\n"));
1297 goto out;
1298 }
1299 key.dsize = rec.key_len;
1300 dbuf.dptr = key.dptr + rec.key_len;
1301 dbuf.dsize = rec.data_len;
1302
1303 /* Drop chain lock, call out */
1304 if (tdb_unlock(tdb, tl.hash, F_WRLCK) != 0) {
1305 ret = -1;
1306 goto out;
1307 }
1308 if (fn && fn(tdb, key, dbuf, private)) {
1309 /* They want us to terminate traversal */
1310 ret = count;
1311 if (unlock_record(tdb, tl.off) != 0) {
1312 TDB_LOG((tdb, 0, "tdb_traverse: unlock_record failed!\n"));
1313 ret = -1;
1314 }
1315 tdb->travlocks.next = tl.next;
1316 SAFE_FREE(key.dptr);
1317 return count;
1318 }
1319 SAFE_FREE(key.dptr);
1320 }
1321 out:
1322 tdb->travlocks.next = tl.next;
1323 if (ret < 0)
1324 return -1;
1325 else
1326 return count;
1327 }
1328
1329 /* find the first entry in the database and return its key */
1330 TDB_DATA tdb_firstkey(TDB_CONTEXT *tdb)
1331 {
1332 TDB_DATA key;
1333 struct list_struct rec;
1334
1335 /* release any old lock */
1336 if (unlock_record(tdb, tdb->travlocks.off) != 0)
1337 return tdb_null;
1338 tdb->travlocks.off = tdb->travlocks.hash = 0;
1339
1340 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) <= 0)
1341 return tdb_null;
1342 /* now read the key */
1343 key.dsize = rec.key_len;
1344 key.dptr =tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),key.dsize);
1345 if (tdb_unlock(tdb, BUCKET(tdb->travlocks.hash), F_WRLCK) != 0)
1346 TDB_LOG((tdb, 0, "tdb_firstkey: error occurred while tdb_unlocking!\n"));
1347 return key;
1348 }
1349
1350 /* find the next entry in the database, returning its key */
1351 TDB_DATA tdb_nextkey(TDB_CONTEXT *tdb, TDB_DATA oldkey)
1352 {
1353 uint32_t oldhash;
1354 TDB_DATA key = tdb_null;
1355 struct list_struct rec;
1356 char *k = NULL;
1357
1358 /* Is locked key the old key? If so, traverse will be reliable. */
1359 if (tdb->travlocks.off) {
1360 if (tdb_lock(tdb,tdb->travlocks.hash,F_WRLCK))
1361 return tdb_null;
1362 if (rec_read(tdb, tdb->travlocks.off, &rec) == -1
1363 || !(k = tdb_alloc_read(tdb,tdb->travlocks.off+sizeof(rec),
1364 rec.key_len))
1365 || memcmp(k, oldkey.dptr, oldkey.dsize) != 0) {
1366 /* No, it wasn't: unlock it and start from scratch */
1367 if (unlock_record(tdb, tdb->travlocks.off) != 0)
1368 return tdb_null;
1369 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1370 return tdb_null;
1371 tdb->travlocks.off = 0;
1372 }
1373
1374 SAFE_FREE(k);
1375 }
1376
1377 if (!tdb->travlocks.off) {
1378 /* No previous element: do normal find, and lock record */
1379 tdb->travlocks.off = tdb_find_lock_hash(tdb, oldkey, tdb->hash_fn(&oldkey), F_WRLCK, &rec);
1380 if (!tdb->travlocks.off)
1381 return tdb_null;
1382 tdb->travlocks.hash = BUCKET(rec.full_hash);
1383 if (lock_record(tdb, tdb->travlocks.off) != 0) {
1384 TDB_LOG((tdb, 0, "tdb_nextkey: lock_record failed (%s)!\n", strerror(errno)));
1385 return tdb_null;
1386 }
1387 }
1388 oldhash = tdb->travlocks.hash;
1389
1390 /* Grab next record: locks chain and returned record,
1391 unlocks old record */
1392 if (tdb_next_lock(tdb, &tdb->travlocks, &rec) > 0) {
1393 key.dsize = rec.key_len;
1394 key.dptr = tdb_alloc_read(tdb, tdb->travlocks.off+sizeof(rec),
1395 key.dsize);
1396 /* Unlock the chain of this new record */
1397 if (tdb_unlock(tdb, tdb->travlocks.hash, F_WRLCK) != 0)
1398 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1399 }
1400 /* Unlock the chain of old record */
1401 if (tdb_unlock(tdb, BUCKET(oldhash), F_WRLCK) != 0)
1402 TDB_LOG((tdb, 0, "tdb_nextkey: WARNING tdb_unlock failed!\n"));
1403 return key;
1404 }
1405
1406 /* delete an entry in the database given a key */
1407 static int tdb_delete_hash(TDB_CONTEXT *tdb, TDB_DATA key, uint32_t hash)
1408 {
1409 tdb_off rec_ptr;
1410 struct list_struct rec;
1411 int ret;
1412
1413 if (!(rec_ptr = tdb_find_lock_hash(tdb, key, hash, F_WRLCK, &rec)))
1414 return -1;
1415 ret = do_delete(tdb, rec_ptr, &rec);
1416 if (tdb_unlock(tdb, BUCKET(rec.full_hash), F_WRLCK) != 0)
1417 TDB_LOG((tdb, 0, "tdb_delete: WARNING tdb_unlock failed!\n"));
1418 return ret;
1419 }
1420
1421 int tdb_delete(TDB_CONTEXT *tdb, TDB_DATA key)
1422 {
1423 uint32_t hash = tdb->hash_fn(&key);
1424 return tdb_delete_hash(tdb, key, hash);
1425 }
1426
1427 /* store an element in the database, replacing any existing element
1428 with the same key
1429
1430 return 0 on success, -1 on failure
1431 */
1432 int tdb_store(TDB_CONTEXT *tdb, TDB_DATA key, TDB_DATA dbuf, int flag)
1433 {
1434 struct list_struct rec;
1435 uint32_t hash;
1436 tdb_off rec_ptr;
1437 char *p = NULL;
1438 int ret = 0;
1439
1440 /* find which hash bucket it is in */
1441 hash = tdb->hash_fn(&key);
1442 if (tdb_lock(tdb, BUCKET(hash), F_WRLCK) == -1)
1443 return -1;
1444
1445 /* check for it existing, on insert. */
1446 if (flag == TDB_INSERT) {
1447 if (tdb_exists_hash(tdb, key, hash)) {
1448 tdb->ecode = TDB_ERR_EXISTS;
1449 goto fail;
1450 }
1451 } else {
1452 /* first try in-place update, on modify or replace. */
1453 if (tdb_update_hash(tdb, key, hash, dbuf) == 0)
1454 goto out;
1455 if (tdb->ecode == TDB_ERR_NOEXIST &&
1456 flag == TDB_MODIFY) {
1457 /* if the record doesn't exist and we are in TDB_MODIFY mode then
1458 we should fail the store */
1459 goto fail;
1460 }
1461 }
1462 /* reset the error code potentially set by the tdb_update() */
1463 tdb->ecode = TDB_SUCCESS;
1464
1465 /* delete any existing record - if it doesn't exist we don't
1466 care. Doing this first reduces fragmentation, and avoids
1467 coalescing with `allocated' block before it's updated. */
1468 if (flag != TDB_INSERT)
1469 tdb_delete_hash(tdb, key, hash);
1470
1471 /* Copy key+value *before* allocating free space in case malloc
1472 fails and we are left with a dead spot in the tdb. */
1473
1474 if (!(p = (char *)talloc_size(tdb, key.dsize + dbuf.dsize))) {
1475 tdb->ecode = TDB_ERR_OOM;
1476 goto fail;
1477 }
1478
1479 memcpy(p, key.dptr, key.dsize);
1480 if (dbuf.dsize)
1481 memcpy(p+key.dsize, dbuf.dptr, dbuf.dsize);
1482
1483 /* we have to allocate some space */
1484 if (!(rec_ptr = tdb_allocate(tdb, key.dsize + dbuf.dsize, &rec)))
1485 goto fail;
1486
1487 /* Read hash top into next ptr */
1488 if (ofs_read(tdb, TDB_HASH_TOP(hash), &rec.next) == -1)
1489 goto fail;
1490
1491 rec.key_len = key.dsize;
1492 rec.data_len = dbuf.dsize;
1493 rec.full_hash = hash;
1494 rec.magic = TDB_MAGIC;
1495
1496 /* write out and point the top of the hash chain at it */
1497 if (rec_write(tdb, rec_ptr, &rec) == -1
1498 || tdb_write(tdb, rec_ptr+sizeof(rec), p, key.dsize+dbuf.dsize)==-1
1499 || ofs_write(tdb, TDB_HASH_TOP(hash), &rec_ptr) == -1) {
1500 /* Need to tdb_unallocate() here */
1501 goto fail;
1502 }
1503 out:
1504 SAFE_FREE(p);
1505 tdb_unlock(tdb, BUCKET(hash), F_WRLCK);
1506 return ret;
1507 fail:
1508 ret = -1;
1509 goto out;
1510 }
1511
1512 static int tdb_already_open(dev_t device,
1513 ino_t ino)
1514 {
1515 TDB_CONTEXT *i;
1516
1517 for (i = tdbs; i; i = i->next) {
1518 if (i->device == device && i->inode == ino) {
1519 return 1;
1520 }
1521 }
1522
1523 return 0;
1524 }
1525
1526 /* a default logging function */
1527 static void null_log_fn(TDB_CONTEXT *tdb __attribute__((unused)),
1528 int level __attribute__((unused)),
1529 const char *fmt __attribute__((unused)), ...)
1530 {
1531 }
1532
1533
1534 TDB_CONTEXT *tdb_open_ex(const char *name, int hash_size, int tdb_flags,
1535 int open_flags, mode_t mode,
1536 tdb_log_func log_fn,
1537 tdb_hash_func hash_fn)
1538 {
1539 TDB_CONTEXT *tdb;
1540 struct stat st;
1541 int rev = 0, locked = 0;
1542 uint8_t *vp;
1543 uint32_t vertest;
1544
1545 if (!(tdb = talloc_zero(name, TDB_CONTEXT))) {
1546 /* Can't log this */
1547 errno = ENOMEM;
1548 goto fail;
1549 }
1550 tdb->fd = -1;
1551 tdb->name = NULL;
1552 tdb->map_ptr = NULL;
1553 tdb->flags = tdb_flags;
1554 tdb->open_flags = open_flags;
1555 tdb->log_fn = log_fn?log_fn:null_log_fn;
1556 tdb->hash_fn = hash_fn ? hash_fn : default_tdb_hash;
1557
1558 if ((open_flags & O_ACCMODE) == O_WRONLY) {
1559 TDB_LOG((tdb, 0, "tdb_open_ex: can't open tdb %s write-only\n",
1560 name));
1561 errno = EINVAL;
1562 goto fail;
1563 }
1564
1565 if (hash_size == 0)
1566 hash_size = DEFAULT_HASH_SIZE;
1567 if ((open_flags & O_ACCMODE) == O_RDONLY) {
1568 tdb->read_only = 1;
1569 /* read only databases don't do locking or clear if first */
1570 tdb->flags |= TDB_NOLOCK;
1571 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1572 }
1573
1574 /* internal databases don't mmap or lock, and start off cleared */
1575 if (tdb->flags & TDB_INTERNAL) {
1576 tdb->flags |= (TDB_NOLOCK | TDB_NOMMAP);
1577 tdb->flags &= ~TDB_CLEAR_IF_FIRST;
1578 if (tdb_new_database(tdb, hash_size) != 0) {
1579 TDB_LOG((tdb, 0, "tdb_open_ex: tdb_new_database failed!"));
1580 goto fail;
1581 }
1582 goto internal;
1583 }
1584
1585 if ((tdb->fd = open(name, open_flags, mode)) == -1) {
1586 TDB_LOG((tdb, 5, "tdb_open_ex: could not open file %s: %s\n",
1587 name, strerror(errno)));
1588 goto fail; /* errno set by open(2) */
1589 }
1590
1591 /* ensure there is only one process initialising at once */
1592 if (tdb_brlock(tdb, GLOBAL_LOCK, F_WRLCK, F_SETLKW, 0) == -1) {
1593 TDB_LOG((tdb, 0, "tdb_open_ex: failed to get global lock on %s: %s\n",
1594 name, strerror(errno)));
1595 goto fail; /* errno set by tdb_brlock */
1596 }
1597
1598 /* we need to zero database if we are the only one with it open */
1599 if ((tdb_flags & TDB_CLEAR_IF_FIRST) &&
1600 (locked = (tdb_brlock(tdb, ACTIVE_LOCK, F_WRLCK, F_SETLK, 0) == 0))) {
1601 open_flags |= O_CREAT;
1602 if (ftruncate(tdb->fd, 0) == -1) {
1603 TDB_LOG((tdb, 0, "tdb_open_ex: "
1604 "failed to truncate %s: %s\n",
1605 name, strerror(errno)));
1606 goto fail; /* errno set by ftruncate */
1607 }
1608 }
1609
1610 if (read(tdb->fd, &tdb->header, sizeof(tdb->header)) != sizeof(tdb->header)
1611 || strcmp(tdb->header.magic_food, TDB_MAGIC_FOOD) != 0
1612 || (tdb->header.version != TDB_VERSION
1613 && !(rev = (tdb->header.version==TDB_BYTEREV(TDB_VERSION))))) {
1614 /* its not a valid database - possibly initialise it */
1615 if (!(open_flags & O_CREAT) || tdb_new_database(tdb, hash_size) == -1) {
1616 errno = EIO; /* ie bad format or something */
1617 goto fail;
1618 }
1619 rev = (tdb->flags & TDB_CONVERT);
1620 }
1621 vp = (uint8_t *)&tdb->header.version;
1622 vertest = (((uint32_t)vp[0]) << 24) | (((uint32_t)vp[1]) << 16) |
1623 (((uint32_t)vp[2]) << 8) | (uint32_t)vp[3];
1624 tdb->flags |= (vertest==TDB_VERSION) ? TDB_BIGENDIAN : 0;
1625 if (!rev)
1626 tdb->flags &= ~TDB_CONVERT;
1627 else {
1628 tdb->flags |= TDB_CONVERT;
1629 convert(&tdb->header, sizeof(tdb->header));
1630 }
1631 if (fstat(tdb->fd, &st) == -1)
1632 goto fail;
1633
1634 /* Is it already in the open list? If so, fail. */
1635 if (tdb_already_open(st.st_dev, st.st_ino)) {
1636 TDB_LOG((tdb, 2, "tdb_open_ex: "
1637 "%s (%d,%d) is already open in this process\n",
1638 name, (int)st.st_dev, (int)st.st_ino));
1639 errno = EBUSY;
1640 goto fail;
1641 }
1642
1643 if (!(tdb->name = (char *)talloc_strdup(tdb, name))) {
1644 errno = ENOMEM;
1645 goto fail;
1646 }
1647
1648 tdb->map_size = st.st_size;
1649 tdb->device = st.st_dev;
1650 tdb->inode = st.st_ino;
1651 tdb->locked = talloc_zero_array(tdb, struct tdb_lock_type,
1652 tdb->header.hash_size+1);
1653 if (!tdb->locked) {
1654 TDB_LOG((tdb, 2, "tdb_open_ex: "
1655 "failed to allocate lock structure for %s\n",
1656 name));
1657 errno = ENOMEM;
1658 goto fail;
1659 }
1660 tdb_mmap(tdb);
1661 if (locked) {
1662 if (tdb_brlock(tdb, ACTIVE_LOCK, F_UNLCK, F_SETLK, 0) == -1) {
1663 TDB_LOG((tdb, 0, "tdb_open_ex: "
1664 "failed to take ACTIVE_LOCK on %s: %s\n",
1665 name, strerror(errno)));
1666 goto fail;
1667 }
1668
1669 }
1670
1671 /* We always need to do this if the CLEAR_IF_FIRST flag is set, even if
1672 we didn't get the initial exclusive lock as we need to let all other
1673 users know we're using it. */
1674
1675 if (tdb_flags & TDB_CLEAR_IF_FIRST) {
1676 /* leave this lock in place to indicate it's in use */
1677 if (tdb_brlock(tdb, ACTIVE_LOCK, F_RDLCK, F_SETLKW, 0) == -1)
1678 goto fail;
1679 }
1680
1681
1682 internal:
1683 /* Internal (memory-only) databases skip all the code above to
1684 * do with disk files, and resume here by releasing their
1685 * global lock and hooking into the active list. */
1686 if (tdb_brlock(tdb, GLOBAL_LOCK, F_UNLCK, F_SETLKW, 0) == -1)
1687 goto fail;
1688 tdb->next = tdbs;
1689 tdbs = tdb;
1690 return tdb;
1691
1692 fail:
1693 { int save_errno = errno;
1694
1695 if (!tdb)
1696 return NULL;
1697
1698 if (tdb->map_ptr) {
1699 if (tdb->flags & TDB_INTERNAL)
1700 SAFE_FREE(tdb->map_ptr);
1701 else
1702 tdb_munmap(tdb);
1703 }
1704 SAFE_FREE(tdb->name);
1705 if (tdb->fd != -1)
1706 if (close(tdb->fd) != 0)
1707 TDB_LOG((tdb, 5, "tdb_open_ex: failed to close tdb->fd on error!\n"));
1708 SAFE_FREE(tdb->locked);
1709 SAFE_FREE(tdb);
1710 errno = save_errno;
1711 return NULL;
1712 }
1713 }
1714
1715 /**
1716 * Close a database.
1717 *
1718 * @returns -1 for error; 0 for success.
1719 **/
1720 int tdb_close(TDB_CONTEXT *tdb)
1721 {
1722 TDB_CONTEXT **i;
1723 int ret = 0;
1724
1725 if (tdb->map_ptr) {
1726 if (tdb->flags & TDB_INTERNAL)
1727 SAFE_FREE(tdb->map_ptr);
1728 else
1729 tdb_munmap(tdb);
1730 }
1731 SAFE_FREE(tdb->name);
1732 if (tdb->fd != -1)
1733 ret = close(tdb->fd);
1734 SAFE_FREE(tdb->locked);
1735
1736 /* Remove from contexts list */
1737 for (i = &tdbs; *i; i = &(*i)->next) {
1738 if (*i == tdb) {
1739 *i = tdb->next;
1740 break;
1741 }
1742 }
1743
1744 memset(tdb, 0, sizeof(*tdb));
1745 SAFE_FREE(tdb);
1746
1747 return ret;
1748 }
1749