Codebase list lmdb / e50dc15
New upstream version 0.9.10 Ondřej Surý 10 years ago
5 changed file(s) with 220 addition(s) and 125 deletion(s). Raw diff Collapse all Expand all
00 LMDB 0.9 Change Log
1
2 LMDB 0.9.10 Release (2013/11/12)
3 Add MDB_NOMEMINIT option
4 Fix mdb_page_split() again (ITS#7589)
5 Fix MDB_NORDAHEAD definition (ITS#7734)
6 Fix mdb_cursor_del() positioning (ITS#7733)
7 Partial fix for larger page sizes (ITS#7713)
8 Fix Windows64/MSVC build issues
19
210 LMDB 0.9.9 Release (2013/10/24)
311 Add mdb_env_get_fd()
6868 * - There is normally no pure read-only mode, since readers need write
6969 * access to locks and lock file. Exceptions: On read-only filesystems
7070 * or with the #MDB_NOLOCK flag described under #mdb_env_open().
71 *
72 * - By default, in versions before 0.9.10, unused portions of the data
73 * file might receive garbage data from memory freed by other code.
74 * (This does not happen when using the #MDB_WRITEMAP flag.) As of
75 * 0.9.10 the default behavior is to initialize such memory before
76 * writing to the data file. Since there may be a slight performance
77 * cost due to this initialization, applications may disable it using
78 * the #MDB_NOMEMINIT flag. Applications handling sensitive data
79 * which must not be written should not use this flag. This flag is
80 * irrelevant when using #MDB_WRITEMAP.
7181 *
7282 * - A thread can only use one transaction at a time, plus any child
7383 * transactions. Each transaction belongs to one thread. See below.
173183 /** Library minor version */
174184 #define MDB_VERSION_MINOR 9
175185 /** Library patch version */
176 #define MDB_VERSION_PATCH 9
186 #define MDB_VERSION_PATCH 10
177187
178188 /** Combine args a,b,c into a single integer for easy version comparisons */
179189 #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
183193 MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
184194
185195 /** The release date of this library version */
186 #define MDB_VERSION_DATE "October 24, 2013"
196 #define MDB_VERSION_DATE "November 11, 2013"
187197
188198 /** A stringifier for the version info */
189199 #define MDB_VERSTR(a,b,c,d) "MDB " #a "." #b "." #c ": (" d ")"
276286 #define MDB_NOLOCK 0x400000
277287 /** don't do readahead (no effect on Windows) */
278288 #define MDB_NORDAHEAD 0x800000
289 /** don't initialize malloc'd memory before writing to datafile */
290 #define MDB_NOMEMINIT 0x1000000
279291 /** @} */
280292
281293 /** @defgroup mdb_dbi_open Database Flags
545557 * supports it. Turning it off may help random read performance
546558 * when the DB is larger than RAM and system RAM is full.
547559 * The option is not implemented on Windows.
560 * <li>#MDB_NOMEMINIT
561 * Don't initialize malloc'd memory before writing to unused spaces
562 * in the data file. By default, memory for pages written to the data
563 * file is obtained using malloc. While these pages may be reused in
564 * subsequent transactions, freshly malloc'd pages will be initialized
565 * to zeroes before use. This avoids persisting leftover data from other
566 * code (that used the heap and subsequently freed the memory) into the
567 * data file. Note that many other system libraries may allocate
568 * and free memory from the heap for arbitrary uses. E.g., stdio may
569 * use the heap for file I/O buffers. This initialization step has a
570 * modest performance cost so some applications may want to disable
571 * it using this flag. This option can be a problem for applications
572 * which handle sensitive data like passwords, and it makes memory
573 * checkers like Valgrind noisy. This flag is not needed with #MDB_WRITEMAP,
574 * which writes directly to the mmap instead of using malloc for pages. The
575 * initialization is also skipped if #MDB_RESERVE is used; the
576 * caller is expected to overwrite all of the memory that was
577 * reserved in that case.
578 * This flag may be changed at any time using #mdb_env_set_flags().
548579 * </ul>
549580 * @param[in] mode The UNIX permissions to set on created files. This parameter
550581 * is ignored on Windows.
11301161 * reserved space, which the caller can fill in later - before
11311162 * the next update operation or the transaction ends. This saves
11321163 * an extra memcpy if the data is being generated later.
1164 * MDB does nothing else with this memory; the caller is expected
1165 * to modify all of the space requested.
11331166 * <li>#MDB_APPEND - append the given key/data pair to the end of the
11341167 * database. No key comparisons are performed. This option allows
11351168 * fast bulk loading when keys are already known to be in the
3636 #endif
3737 #include <sys/types.h>
3838 #include <sys/stat.h>
39 #include <sys/param.h>
4039 #ifdef _WIN32
4140 #include <windows.h>
41 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
42 * as int64 which is wrong. MSVC doesn't define it at all, so just
43 * don't use it.
44 */
45 #define MDB_PID_T int
46 #ifdef __GNUC__
47 # include <sys/param.h>
4248 #else
49 # define LITTLE_ENDIAN 1234
50 # define BIG_ENDIAN 4321
51 # define BYTE_ORDER LITTLE_ENDIAN
52 # ifndef SSIZE_MAX
53 # define SSIZE_MAX INT_MAX
54 # endif
55 #endif
56 #else
57 #define MDB_PID_T pid_t
58 #include <sys/param.h>
4359 #include <sys/uio.h>
4460 #include <sys/mman.h>
4561 #ifdef HAVE_SYS_FILE_H
323339 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
324340 /** @} */
325341
326 /** A default memory page size.
327 * The actual size is platform-dependent, but we use this for
328 * boot-strapping. We probably should not be using this any more.
329 * The #GET_PAGESIZE() macro is used to get the actual size.
342 /** @brief The maximum size of a database page.
343 *
344 * This is 32k, since it must fit in #MDB_page.#mp_upper.
345 *
346 * LMDB will use database pages smaller than OS pages if needed.
347 * That causes more I/O in write transactions: The OS must
348 * know (read) the whole page before writing a partial page.
330349 *
331350 * Note that we don't currently support Huge pages. On Linux,
332351 * regular data files cannot use Huge pages, and in general
335354 * pressure from other processes is high. So until OSs have
336355 * actual paging support for Huge pages, they're not viable.
337356 */
338 #define MDB_PAGESIZE 4096
357 #define MAX_PAGESIZE 0x8000
339358
340359 /** The minimum number of keys required in a database page.
341360 * Setting this to a larger value will place a smaller bound on the
369388 *
370389 * We require that keys all fit onto a regular page. This limit
371390 * could be raised a bit further if needed; to something just
372 * under #MDB_PAGESIZE / #MDB_MINKEYS.
391 * under (page size / #MDB_MINKEYS / 3).
373392 *
374393 * Note that data items in an #MDB_DUPSORT database are actually keys
375394 * of a subDB, so they're also limited to this size.
493512 */
494513 txnid_t mrb_txnid;
495514 /** The process ID of the process owning this reader txn. */
496 pid_t mrb_pid;
515 MDB_PID_T mrb_pid;
497516 /** The thread ID of the thread owning this txn. */
498517 pthread_t mrb_tid;
499518 } MDB_rxbody;
812831 txnid_t mm_txnid; /**< txnid that committed this page */
813832 } MDB_meta;
814833
815 /** Buffer for a stack-allocated dirty page.
834 /** Buffer for a stack-allocated meta page.
816835 * The members define size and alignment, and silence type
817836 * aliasing warnings. They are not used directly; that could
818837 * mean incorrectly using several union members in parallel.
819838 */
820 typedef union MDB_pagebuf {
821 char mb_raw[MDB_PAGESIZE];
839 typedef union MDB_metabuf {
822840 MDB_page mb_page;
823841 struct {
824842 char mm_pad[PAGEHDRSZ];
825843 MDB_meta mm_meta;
826844 } mb_metabuf;
827 } MDB_pagebuf;
845 } MDB_metabuf;
828846
829847 /** Auxiliary DB info.
830848 * The information here is mostly static/read-only. There is
9931011 /** Have liveness lock in reader table */
9941012 #define MDB_LIVE_READER 0x08000000U
9951013 uint32_t me_flags; /**< @ref mdb_env */
996 unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
1014 unsigned int me_psize; /**< DB page size, inited from me_os_psize */
1015 unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
9971016 unsigned int me_maxreaders; /**< size of the reader table */
9981017 unsigned int me_numreaders; /**< max numreaders set by this env */
9991018 MDB_dbi me_numdbs; /**< number of DBs opened */
10001019 MDB_dbi me_maxdbs; /**< size of the DB table */
1001 pid_t me_pid; /**< process ID of this env */
1020 MDB_PID_T me_pid; /**< process ID of this env */
10021021 char *me_path; /**< path to the DB files */
10031022 char *me_map; /**< the memory map of the data file */
10041023 MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
10051024 MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
1025 void *me_pbuf; /**< scratch area for DUPSORT put() */
10061026 MDB_txn *me_txn; /**< current write transaction */
10071027 size_t me_mapsize; /**< size of the data memory map */
10081028 off_t me_size; /**< current file size */
13171337 {
13181338 MDB_env *env = txn->mt_env;
13191339 MDB_page *ret = env->me_dpages;
1320 size_t sz = env->me_psize;
1340 size_t psize = env->me_psize, sz = psize, off;
1341 /* When #MDB_NOMEMINIT is not set, psize counts how many bytes to init.
1342 * For a single page alloc, we init everything after the page header.
1343 * For multi-page, we init the final page; if the caller needed that
1344 * many pages they will be filling in at least up to the last page.
1345 */
13211346 if (num == 1) {
13221347 if (ret) {
13231348 VGMEMP_ALLOC(env, ret, sz);
13251350 env->me_dpages = ret->mp_next;
13261351 return ret;
13271352 }
1353 psize -= off = PAGEHDRSZ;
13281354 } else {
13291355 sz *= num;
1356 off = sz - psize;
13301357 }
13311358 if ((ret = malloc(sz)) != NULL) {
1359 if (!(env->me_flags & MDB_NOMEMINIT)) {
1360 memset((char *)ret + off, 0, psize);
1361 ret->mp_pad = 0;
1362 }
13321363 VGMEMP_ALLOC(env, ret, sz);
13331364 }
13341365 return ret;
20772108 * lock on the lockfile, set at an offset equal to the pid.
20782109 */
20792110 static int
2080 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid)
2111 mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
20812112 {
20822113 #if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
20832114 int ret = 0;
21422173 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
21432174 return MDB_BAD_RSLOT;
21442175 } else {
2145 pid_t pid = env->me_pid;
2176 MDB_PID_T pid = env->me_pid;
21462177 pthread_t tid = pthread_self();
21472178
21482179 if (!(env->me_flags & MDB_LIVE_READER)) {
24812512 int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
24822513 txnid_t pglast = 0, head_id = 0;
24832514 pgno_t freecnt = 0, *free_pgs, *mop;
2484 ssize_t head_room = 0, total_room = 0, mop_len;
2515 ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
24852516
24862517 mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
24872518
24922523 return rc;
24932524 }
24942525
2526 /* MDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
2527 clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
2528 ? SSIZE_MAX : maxfree_1pg;
2529
24952530 for (;;) {
24962531 /* Come back here after each Put() in case freelist changed */
24972532 MDB_val key, data;
2533 pgno_t *pgs;
2534 ssize_t j;
24982535
24992536 /* If using records from freeDB which we have not yet
25002537 * deleted, delete them and any we reserved for me_pghead.
25782615 rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
25792616 if (rc)
25802617 return rc;
2581 *(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */
2618 /* IDL is initially empty, zero out at least the length */
2619 pgs = (pgno_t *)data.mv_data;
2620 j = head_room > clean_limit ? head_room : 0;
2621 do {
2622 pgs[j] = 0;
2623 } while (--j >= 0);
25822624 total_room += head_room;
25832625 }
25842626
29693011 static int
29703012 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
29713013 {
2972 MDB_pagebuf pbuf;
3014 MDB_metabuf pbuf;
29733015 MDB_page *p;
29743016 MDB_meta *m;
29753017 int i, rc, off;
3018 enum { Size = sizeof(pbuf) };
29763019
29773020 /* We don't know the page size yet, so use a minimum value.
29783021 * Read both meta pages so we can use the latest one.
29843027 OVERLAPPED ov;
29853028 memset(&ov, 0, sizeof(ov));
29863029 ov.Offset = off;
2987 rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1;
3030 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
29883031 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
29893032 rc = 0;
29903033 #else
2991 rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off);
2992 #endif
2993 if (rc != MDB_PAGESIZE) {
3034 rc = pread(env->me_fd, &pbuf, Size, off);
3035 #endif
3036 if (rc != Size) {
29943037 if (rc == 0 && off == 0)
29953038 return ENOENT;
29963039 rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
31213164 mp->mm_last_pg = txn->mt_next_pgno - 1;
31223165 mp->mm_txnid = txn->mt_txnid;
31233166 if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
3167 unsigned meta_size = env->me_psize;
31243168 rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
31253169 ptr = env->me_map;
3126 if (toggle)
3127 ptr += env->me_psize;
3128 if (MDB_MSYNC(ptr, env->me_psize, rc)) {
3170 if (toggle) {
3171 #ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
3172 if (meta_size < env->me_os_psize)
3173 meta_size += meta_size;
3174 else
3175 #endif
3176 ptr += meta_size;
3177 }
3178 if (MDB_MSYNC(ptr, meta_size, rc)) {
31293179 rc = ErrCode();
31303180 goto fail;
31313181 }
32313281 e->me_wmutex = SEM_FAILED;
32323282 #endif
32333283 e->me_pid = getpid();
3284 GET_PAGESIZE(e->me_os_psize);
32343285 VGMEMP_CREATE(e,0,0);
32353286 *env = e;
32363287 return MDB_SUCCESS;
33963447 return i;
33973448 DPUTS("new mdbenv");
33983449 newenv = 1;
3399 GET_PAGESIZE(env->me_psize);
3450 env->me_psize = env->me_os_psize;
3451 if (env->me_psize > MAX_PAGESIZE)
3452 env->me_psize = MAX_PAGESIZE;
34003453 } else {
34013454 env->me_psize = meta.mm_psize;
34023455 }
35073560 #pragma comment(linker, "/INCLUDE:_tls_used")
35083561 #pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
35093562 #pragma const_seg(".CRT$XLB")
3510 extern const PIMAGE_TLS_CALLBACK mdb_tls_callback;
3563 extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
35113564 const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
35123565 #pragma const_seg()
35133566 #else /* WIN32 */
37713824 rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
37723825 if (size < rsize && *excl > 0) {
37733826 #ifdef _WIN32
3774 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
3827 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
37753828 || !SetEndOfFile(env->me_lfd))
37763829 goto fail_errno;
37773830 #else
39273980 * at runtime. Changing other flags requires closing the
39283981 * environment and re-opening it with the new flags.
39293982 */
3930 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
3931 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK)
3983 #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
3984 #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
3985 MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
39323986
39333987 int
39343988 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
40414095 DPRINTF(("opened dbenv %p", (void *) env));
40424096 if (excl > 0) {
40434097 rc = mdb_env_share_locks(env, &excl);
4044 }
4098 if (rc)
4099 goto leave;
4100 }
4101 if (!((flags & MDB_RDONLY) ||
4102 (env->me_pbuf = calloc(1, env->me_psize))))
4103 rc = ENOMEM;
40454104 }
40464105
40474106 leave:
40654124 for (i = env->me_maxdbs; --i > MAIN_DBI; )
40664125 free(env->me_dbxs[i].md_name.mv_data);
40674126
4127 free(env->me_pbuf);
40684128 free(env->me_dbflags);
40694129 free(env->me_dbxs);
40704130 free(env->me_path);
40924152 if (env->me_fd != INVALID_HANDLE_VALUE)
40934153 (void) close(env->me_fd);
40944154 if (env->me_txns) {
4095 pid_t pid = env->me_pid;
4155 MDB_PID_T pid = env->me_pid;
40964156 /* Clearing readers is done in this function because
40974157 * me_txkey with its destructor must be disabled first.
40984158 */
54195479 rc = EINVAL;
54205480 } else {
54215481 MDB_page *mp = mc->mc_pg[mc->mc_top];
5422 if (!NUMKEYS(mp)) {
5423 mc->mc_ki[mc->mc_top] = 0;
5482 int nkeys = NUMKEYS(mp);
5483 if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
5484 mc->mc_ki[mc->mc_top] = nkeys;
54245485 rc = MDB_NOTFOUND;
54255486 break;
54265487 }
56015662 unsigned int flags)
56025663 {
56035664 enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
5665 MDB_env *env = mc->mc_txn->mt_env;
56045666 MDB_node *leaf = NULL;
56055667 MDB_val xdata, *rdata, dkey;
5606 MDB_page *fp;
56075668 MDB_db dummy;
56085669 int do_sub = 0, insert = 0;
56095670 unsigned int mcount = 0, dcount = 0, nospill;
56105671 size_t nsize;
56115672 int rc, rc2;
5612 MDB_pagebuf pbuf;
56135673 char dbuf[MDB_MAXKEYSIZE+1];
56145674 unsigned int nflags;
56155675 DKBUF;
57235783
57245784 /* The key already exists */
57255785 if (rc == MDB_SUCCESS) {
5786 MDB_page *fp, *mp;
5787 MDB_val olddata;
5788
57265789 /* there's only a key anyway, so this is a no-op */
57275790 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
57285791 unsigned int ksize = mc->mc_db->md_pad;
57355798 return MDB_SUCCESS;
57365799 }
57375800
5801 more:
57385802 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
5803 olddata.mv_size = NODEDSZ(leaf);
5804 olddata.mv_data = NODEDATA(leaf);
57395805
57405806 /* DB has dups? */
57415807 if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
5808 mp = fp = xdata.mv_data = env->me_pbuf;
5809 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5810
57425811 /* Was a single item before, must convert now */
5743 more:
57445812 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
57455813 /* Just overwrite the current item */
57465814 if (flags == MDB_CURRENT)
57475815 goto current;
57485816
5749 dkey.mv_size = NODEDSZ(leaf);
5750 dkey.mv_data = NODEDATA(leaf);
5817 dkey = olddata;
57515818 #if UINT_MAX < SIZE_MAX
57525819 if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
57535820 #ifdef MISALIGNED_OK
57705837 /* create a fake page for the dup items */
57715838 memcpy(dbuf, dkey.mv_data, dkey.mv_size);
57725839 dkey.mv_data = dbuf;
5773 fp = (MDB_page *)&pbuf;
5774 fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
57755840 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
57765841 fp->mp_lower = PAGEHDRSZ;
5777 fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
5842 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
57785843 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
57795844 fp->mp_flags |= P_LEAF2;
57805845 fp->mp_pad = data->mv_size;
5781 fp->mp_upper += 2 * data->mv_size; /* leave space for 2 more */
5846 xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
57825847 } else {
5783 fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
5848 xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
57845849 (dkey.mv_size & 1) + (data->mv_size & 1);
57855850 }
5786 mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5787 do_sub = 1;
5788 rdata = &xdata;
5789 xdata.mv_size = fp->mp_upper;
5790 xdata.mv_data = fp;
5791 flags |= F_DUPDATA;
5792 goto new_sub;
5793 }
5794 if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
5851 fp->mp_upper = xdata.mv_size;
5852 } else if (leaf->mn_flags & F_SUBDATA) {
5853 /* Data is on sub-DB, just store it */
5854 flags |= F_DUPDATA|F_SUBDATA;
5855 goto put_sub;
5856 } else {
57955857 /* See if we need to convert from fake page to subDB */
5796 MDB_page *mp;
57975858 unsigned int offset;
57985859 unsigned int i;
57995860 uint16_t fp_flags;
58005861
5801 fp = NODEDATA(leaf);
5802 if (flags == MDB_CURRENT) {
5803 reuse:
5862 fp = olddata.mv_data;
5863 switch (flags) {
5864 default:
5865 if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
5866 offset = NODESIZE + sizeof(indx_t) + data->mv_size;
5867 offset += offset & 1;
5868 break;
5869 }
5870 offset = fp->mp_pad;
5871 if (SIZELEFT(fp) < offset) {
5872 offset *= 4; /* space for 4 more */
5873 break;
5874 }
5875 /* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
5876 case MDB_CURRENT:
58045877 fp->mp_flags |= P_DIRTY;
5805 COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
5878 COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
58065879 mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
58075880 flags |= F_DUPDATA;
58085881 goto put_sub;
58095882 }
5810 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
5811 offset = fp->mp_pad;
5812 if (SIZELEFT(fp) >= offset)
5813 goto reuse;
5814 offset *= 4; /* space for 4 more */
5815 } else {
5816 offset = NODESIZE + sizeof(indx_t) + data->mv_size;
5817 }
5818 offset += offset & 1;
58195883 fp_flags = fp->mp_flags;
5820 if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
5821 offset >= mc->mc_txn->mt_env->me_nodemax) {
5884 xdata.mv_size = olddata.mv_size + offset;
5885 if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
5886 >= env->me_nodemax) {
58225887 /* yes, convert it */
5823 dummy.md_flags = 0;
58245888 if (mc->mc_db->md_flags & MDB_DUPFIXED) {
58255889 dummy.md_pad = fp->mp_pad;
58265890 dummy.md_flags = MDB_DUPFIXED;
58275891 if (mc->mc_db->md_flags & MDB_INTEGERDUP)
58285892 dummy.md_flags |= MDB_INTEGERKEY;
5893 } else {
5894 dummy.md_pad = 0;
5895 dummy.md_flags = 0;
58295896 }
58305897 dummy.md_depth = 1;
58315898 dummy.md_branch_pages = 0;
58325899 dummy.md_leaf_pages = 1;
58335900 dummy.md_overflow_pages = 0;
58345901 dummy.md_entries = NUMKEYS(fp);
5835 rdata = &xdata;
58365902 xdata.mv_size = sizeof(MDB_db);
58375903 xdata.mv_data = &dummy;
58385904 if ((rc = mdb_page_alloc(mc, 1, &mp)))
58395905 return rc;
5840 offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
5906 offset = env->me_psize - olddata.mv_size;
58415907 flags |= F_DUPDATA|F_SUBDATA;
58425908 dummy.md_root = mp->mp_pgno;
58435909 fp_flags &= ~P_SUBP;
5844 } else {
5845 /* no, just grow it */
5846 rdata = &xdata;
5847 xdata.mv_size = NODEDSZ(leaf) + offset;
5848 xdata.mv_data = &pbuf;
5849 mp = (MDB_page *)&pbuf;
5850 mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
5851 flags |= F_DUPDATA;
58525910 }
58535911 mp->mp_flags = fp_flags | P_DIRTY;
58545912 mp->mp_pad = fp->mp_pad;
58575915 if (IS_LEAF2(fp)) {
58585916 memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
58595917 } else {
5860 nsize = NODEDSZ(leaf) - fp->mp_upper;
5861 memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
5918 memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
5919 olddata.mv_size - fp->mp_upper);
58625920 for (i=0; i<NUMKEYS(fp); i++)
58635921 mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
58645922 }
5865 mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5866 do_sub = 1;
5867 goto new_sub;
58685923 }
5869 /* data is on sub-DB, just store it */
5870 flags |= F_DUPDATA|F_SUBDATA;
5871 goto put_sub;
5924
5925 rdata = &xdata;
5926 flags |= F_DUPDATA;
5927 do_sub = 1;
5928 mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
5929 goto new_sub;
58725930 }
58735931 current:
58745932 /* overflow page overwrites need special handling */
58755933 if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
58765934 MDB_page *omp;
58775935 pgno_t pg;
5878 unsigned psize = mc->mc_txn->mt_env->me_psize;
5879 int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
5880
5881 memcpy(&pg, NODEDATA(leaf), sizeof(pg));
5936 int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
5937
5938 memcpy(&pg, olddata.mv_data, sizeof(pg));
58825939 if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
58835940 return rc2;
58845941 ovpages = omp->mp_pages;
58865943 /* Is the ov page large enough? */
58875944 if (ovpages >= dpages) {
58885945 if (!(omp->mp_flags & P_DIRTY) &&
5889 (level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP)))
5946 (level || (env->me_flags & MDB_WRITEMAP)))
58905947 {
58915948 rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
58925949 if (rc)
59015958 */
59025959 if (level > 1) {
59035960 /* It is writable only in a parent txn */
5904 size_t sz = (size_t) psize * ovpages, off;
5961 size_t sz = (size_t) env->me_psize * ovpages, off;
59055962 MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
59065963 MDB_ID2 id2;
59075964 if (!np)
59315988 }
59325989 if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
59335990 return rc2;
5934 } else if (NODEDSZ(leaf) == data->mv_size) {
5991 } else if (data->mv_size == olddata.mv_size) {
59355992 /* same size, just replace it. Note that we could
59365993 * also reuse this node if the new data is smaller,
59375994 * but instead we opt to shrink the node in that case.
59385995 */
59395996 if (F_ISSET(flags, MDB_RESERVE))
5940 data->mv_data = NODEDATA(leaf);
5997 data->mv_data = olddata.mv_data;
59415998 else if (data->mv_size)
5942 memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
5999 memcpy(olddata.mv_data, data->mv_data, data->mv_size);
59436000 else
59446001 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
59456002 goto done;
59556012
59566013 new_sub:
59576014 nflags = flags & NODE_ADD_FLAGS;
5958 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
6015 nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
59596016 if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
59606017 if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
59616018 nflags &= ~MDB_APPEND;
60496106 data[1].mv_size = mcount;
60506107 if (mcount < dcount) {
60516108 data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
6052 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
60536109 goto more;
60546110 }
60556111 }
60686124 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
60696125 {
60706126 MDB_node *leaf;
6127 MDB_page *mp;
60716128 int rc;
60726129
60736130 if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
60756132
60766133 if (!(mc->mc_flags & C_INITIALIZED))
60776134 return EINVAL;
6135
6136 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6137 return MDB_NOTFOUND;
60786138
60796139 if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
60806140 return rc;
60836143 if (rc)
60846144 return rc;
60856145
6086 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6087
6088 if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6146 mp = mc->mc_pg[mc->mc_top];
6147 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6148
6149 if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
60896150 if (!(flags & MDB_NODUPDATA)) {
60906151 if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
60916152 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
61006161 } else {
61016162 MDB_cursor *m2;
61026163 /* shrink fake page */
6103 mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6104 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6164 mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
6165 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
61056166 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
61066167 /* fix other sub-DB cursors pointed at this fake page */
61076168 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
61086169 if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
6109 if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] &&
6170 if (m2->mc_pg[mc->mc_top] == mp &&
61106171 m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
61116172 m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
61126173 }
67846845 flags = 0;
67856846 } else {
67866847 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
6787 assert(!((long)srcnode&1));
6848 assert(!((size_t)srcnode&1));
67886849 srcpg = NODEPGNO(srcnode);
67896850 flags = srcnode->mn_flags;
67906851 if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
72847345
72857346 /* Adjust other cursors pointing to mp */
72867347 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7287 if (m2 == mc)
7348 if (m2 == mc || m2->mc_snum < mc->mc_snum)
72887349 continue;
72897350 if (!(m2->mc_flags & C_INITIALIZED))
72907351 continue;
75357596 psize = 0;
75367597 if (newindx <= split_indx || newindx >= nkeys) {
75377598 i = 0; j = 1;
7538 k = newindx >= nkeys ? nkeys : split_indx+1;
7599 k = newindx >= nkeys ? nkeys : split_indx+2;
75397600 } else {
75407601 i = nkeys; j = -1;
75417602 k = split_indx-1;
75557616 }
75567617 psize += psize & 1;
75577618 }
7558 if (psize > pmax) {
7619 if (psize > pmax || i == k-j) {
75597620 split_indx = i + (j<0);
75607621 break;
75617622 }
75627623 }
7563 /* special case: when the new node was on the last
7564 * slot we may not have tripped the break inside the loop.
7565 * In all other cases we either hit the break condition,
7566 * or the original split_indx was already safe.
7567 */
7568 if (newindx >= nkeys && i == k)
7569 split_indx = nkeys-1;
75707624 }
75717625 if (split_indx == newindx) {
75727626 sepkey.mv_size = newkey->mv_size;
82518305 /** Insert pid into list if not already present.
82528306 * Return -1 if already present.
82538307 */
8254 static int mdb_pid_insert(pid_t *ids, pid_t pid)
8308 static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
82558309 {
82568310 /* binary search of pid in list */
82578311 unsigned base = 0;
82918345 {
82928346 unsigned int i, j, rdrs;
82938347 MDB_reader *mr;
8294 pid_t *pids, pid;
8348 MDB_PID_T *pids, pid;
82958349 int count = 0;
82968350
82978351 if (!env)
83018355 if (!env->me_txns)
83028356 return MDB_SUCCESS;
83038357 rdrs = env->me_txns->mti_numreaders;
8304 pids = malloc((rdrs+1) * sizeof(pid_t));
8358 pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
83058359 if (!pids)
83068360 return ENOMEM;
83078361 pids[0] = 0;
2727 MDB_cursor *cursor, *cur2;
2828 int count;
2929 int *values;
30 char sval[32];
30 char sval[32] = "";
3131
3232 srandom(time(NULL));
3333
3030 MDB_cursor *cursor;
3131 int count;
3232 int *values;
33 char sval[32];
33 char sval[32] = "";
3434
3535 srandom(time(NULL));
3636