36 | 36 |
#endif
|
37 | 37 |
#include <sys/types.h>
|
38 | 38 |
#include <sys/stat.h>
|
39 | |
#include <sys/param.h>
|
40 | 39 |
#ifdef _WIN32
|
41 | 40 |
#include <windows.h>
|
|
41 |
/** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
|
|
42 |
* as int64 which is wrong. MSVC doesn't define it at all, so just
|
|
43 |
* don't use it.
|
|
44 |
*/
|
|
45 |
#define MDB_PID_T int
|
|
46 |
#ifdef __GNUC__
|
|
47 |
# include <sys/param.h>
|
42 | 48 |
#else
|
|
49 |
# define LITTLE_ENDIAN 1234
|
|
50 |
# define BIG_ENDIAN 4321
|
|
51 |
# define BYTE_ORDER LITTLE_ENDIAN
|
|
52 |
# ifndef SSIZE_MAX
|
|
53 |
# define SSIZE_MAX INT_MAX
|
|
54 |
# endif
|
|
55 |
#endif
|
|
56 |
#else
|
|
57 |
#define MDB_PID_T pid_t
|
|
58 |
#include <sys/param.h>
|
43 | 59 |
#include <sys/uio.h>
|
44 | 60 |
#include <sys/mman.h>
|
45 | 61 |
#ifdef HAVE_SYS_FILE_H
|
|
323 | 339 |
(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
|
324 | 340 |
/** @} */
|
325 | 341 |
|
326 | |
/** A default memory page size.
|
327 | |
* The actual size is platform-dependent, but we use this for
|
328 | |
* boot-strapping. We probably should not be using this any more.
|
329 | |
* The #GET_PAGESIZE() macro is used to get the actual size.
|
|
342 |
/** @brief The maximum size of a database page.
|
|
343 |
*
|
|
344 |
* This is 32k, since it must fit in #MDB_page.#mp_upper.
|
|
345 |
*
|
|
346 |
* LMDB will use database pages < OS pages if needed.
|
|
347 |
* That causes more I/O in write transactions: The OS must
|
|
348 |
* know (read) the whole page before writing a partial page.
|
330 | 349 |
*
|
331 | 350 |
* Note that we don't currently support Huge pages. On Linux,
|
332 | 351 |
* regular data files cannot use Huge pages, and in general
|
|
335 | 354 |
* pressure from other processes is high. So until OSs have
|
336 | 355 |
* actual paging support for Huge pages, they're not viable.
|
337 | 356 |
*/
|
338 | |
#define MDB_PAGESIZE 4096
|
|
357 |
#define MAX_PAGESIZE 0x8000
|
339 | 358 |
|
340 | 359 |
/** The minimum number of keys required in a database page.
|
341 | 360 |
* Setting this to a larger value will place a smaller bound on the
|
|
369 | 388 |
*
|
370 | 389 |
* We require that keys all fit onto a regular page. This limit
|
371 | 390 |
* could be raised a bit further if needed; to something just
|
372 | |
* under #MDB_PAGESIZE / #MDB_MINKEYS.
|
|
391 |
* under (page size / #MDB_MINKEYS / 3).
|
373 | 392 |
*
|
374 | 393 |
* Note that data items in an #MDB_DUPSORT database are actually keys
|
375 | 394 |
* of a subDB, so they're also limited to this size.
|
|
493 | 512 |
*/
|
494 | 513 |
txnid_t mrb_txnid;
|
495 | 514 |
/** The process ID of the process owning this reader txn. */
|
496 | |
pid_t mrb_pid;
|
|
515 |
MDB_PID_T mrb_pid;
|
497 | 516 |
/** The thread ID of the thread owning this txn. */
|
498 | 517 |
pthread_t mrb_tid;
|
499 | 518 |
} MDB_rxbody;
|
|
812 | 831 |
txnid_t mm_txnid; /**< txnid that committed this page */
|
813 | 832 |
} MDB_meta;
|
814 | 833 |
|
815 | |
/** Buffer for a stack-allocated dirty page.
|
|
834 |
/** Buffer for a stack-allocated meta page.
|
816 | 835 |
* The members define size and alignment, and silence type
|
817 | 836 |
* aliasing warnings. They are not used directly; that could
|
818 | 837 |
* mean incorrectly using several union members in parallel.
|
819 | 838 |
*/
|
820 | |
typedef union MDB_pagebuf {
|
821 | |
char mb_raw[MDB_PAGESIZE];
|
|
839 |
typedef union MDB_metabuf {
|
822 | 840 |
MDB_page mb_page;
|
823 | 841 |
struct {
|
824 | 842 |
char mm_pad[PAGEHDRSZ];
|
825 | 843 |
MDB_meta mm_meta;
|
826 | 844 |
} mb_metabuf;
|
827 | |
} MDB_pagebuf;
|
|
845 |
} MDB_metabuf;
|
828 | 846 |
|
829 | 847 |
/** Auxiliary DB info.
|
830 | 848 |
* The information here is mostly static/read-only. There is
|
|
993 | 1011 |
/** Have liveness lock in reader table */
|
994 | 1012 |
#define MDB_LIVE_READER 0x08000000U
|
995 | 1013 |
uint32_t me_flags; /**< @ref mdb_env */
|
996 | |
unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
|
|
1014 |
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
|
|
1015 |
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
|
997 | 1016 |
unsigned int me_maxreaders; /**< size of the reader table */
|
998 | 1017 |
unsigned int me_numreaders; /**< max numreaders set by this env */
|
999 | 1018 |
MDB_dbi me_numdbs; /**< number of DBs opened */
|
1000 | 1019 |
MDB_dbi me_maxdbs; /**< size of the DB table */
|
1001 | |
pid_t me_pid; /**< process ID of this env */
|
|
1020 |
MDB_PID_T me_pid; /**< process ID of this env */
|
1002 | 1021 |
char *me_path; /**< path to the DB files */
|
1003 | 1022 |
char *me_map; /**< the memory map of the data file */
|
1004 | 1023 |
MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
|
1005 | 1024 |
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
|
|
1025 |
void *me_pbuf; /**< scratch area for DUPSORT put() */
|
1006 | 1026 |
MDB_txn *me_txn; /**< current write transaction */
|
1007 | 1027 |
size_t me_mapsize; /**< size of the data memory map */
|
1008 | 1028 |
off_t me_size; /**< current file size */
|
|
1317 | 1337 |
{
|
1318 | 1338 |
MDB_env *env = txn->mt_env;
|
1319 | 1339 |
MDB_page *ret = env->me_dpages;
|
1320 | |
size_t sz = env->me_psize;
|
|
1340 |
size_t psize = env->me_psize, sz = psize, off;
|
|
1341 |
/* Unless #MDB_NOMEMINIT is set, psize counts how many bytes to zero.
|
|
1342 |
* For a single page alloc, we init everything after the page header.
|
|
1343 |
* For multi-page, we init the final page; if the caller needed that
|
|
1344 |
* many pages they will be filling in at least up to the last page.
|
|
1345 |
*/
|
1321 | 1346 |
if (num == 1) {
|
1322 | 1347 |
if (ret) {
|
1323 | 1348 |
VGMEMP_ALLOC(env, ret, sz);
|
|
1325 | 1350 |
env->me_dpages = ret->mp_next;
|
1326 | 1351 |
return ret;
|
1327 | 1352 |
}
|
|
1353 |
psize -= off = PAGEHDRSZ;
|
1328 | 1354 |
} else {
|
1329 | 1355 |
sz *= num;
|
|
1356 |
off = sz - psize;
|
1330 | 1357 |
}
|
1331 | 1358 |
if ((ret = malloc(sz)) != NULL) {
|
|
1359 |
if (!(env->me_flags & MDB_NOMEMINIT)) {
|
|
1360 |
memset((char *)ret + off, 0, psize);
|
|
1361 |
ret->mp_pad = 0;
|
|
1362 |
}
|
1332 | 1363 |
VGMEMP_ALLOC(env, ret, sz);
|
1333 | 1364 |
}
|
1334 | 1365 |
return ret;
|
|
2077 | 2108 |
* lock on the lockfile, set at an offset equal to the pid.
|
2078 | 2109 |
*/
|
2079 | 2110 |
static int
|
2080 | |
mdb_reader_pid(MDB_env *env, enum Pidlock_op op, pid_t pid)
|
|
2111 |
mdb_reader_pid(MDB_env *env, enum Pidlock_op op, MDB_PID_T pid)
|
2081 | 2112 |
{
|
2082 | 2113 |
#if !(MDB_PIDLOCK) /* Currently the same as defined(_WIN32) */
|
2083 | 2114 |
int ret = 0;
|
|
2142 | 2173 |
if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
|
2143 | 2174 |
return MDB_BAD_RSLOT;
|
2144 | 2175 |
} else {
|
2145 | |
pid_t pid = env->me_pid;
|
|
2176 |
MDB_PID_T pid = env->me_pid;
|
2146 | 2177 |
pthread_t tid = pthread_self();
|
2147 | 2178 |
|
2148 | 2179 |
if (!(env->me_flags & MDB_LIVE_READER)) {
|
|
2481 | 2512 |
int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
|
2482 | 2513 |
txnid_t pglast = 0, head_id = 0;
|
2483 | 2514 |
pgno_t freecnt = 0, *free_pgs, *mop;
|
2484 | |
ssize_t head_room = 0, total_room = 0, mop_len;
|
|
2515 |
ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
|
2485 | 2516 |
|
2486 | 2517 |
mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
|
2487 | 2518 |
|
|
2492 | 2523 |
return rc;
|
2493 | 2524 |
}
|
2494 | 2525 |
|
|
2526 |
/* MDB_RESERVE cancels the memory init done by the overflow-page malloc (when no WRITEMAP) */
|
|
2527 |
clean_limit = (env->me_flags & (MDB_NOMEMINIT|MDB_WRITEMAP))
|
|
2528 |
? SSIZE_MAX : maxfree_1pg;
|
|
2529 |
|
2495 | 2530 |
for (;;) {
|
2496 | 2531 |
/* Come back here after each Put() in case freelist changed */
|
2497 | 2532 |
MDB_val key, data;
|
|
2533 |
pgno_t *pgs;
|
|
2534 |
ssize_t j;
|
2498 | 2535 |
|
2499 | 2536 |
/* If using records from freeDB which we have not yet
|
2500 | 2537 |
* deleted, delete them and any we reserved for me_pghead.
|
|
2578 | 2615 |
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
|
2579 | 2616 |
if (rc)
|
2580 | 2617 |
return rc;
|
2581 | |
*(MDB_ID *)data.mv_data = 0; /* IDL is initially empty */
|
|
2618 |
/* IDL is initially empty, zero out at least the length */
|
|
2619 |
pgs = (pgno_t *)data.mv_data;
|
|
2620 |
j = head_room > clean_limit ? head_room : 0;
|
|
2621 |
do {
|
|
2622 |
pgs[j] = 0;
|
|
2623 |
} while (--j >= 0);
|
2582 | 2624 |
total_room += head_room;
|
2583 | 2625 |
}
|
2584 | 2626 |
|
|
2969 | 3011 |
static int
|
2970 | 3012 |
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
|
2971 | 3013 |
{
|
2972 | |
MDB_pagebuf pbuf;
|
|
3014 |
MDB_metabuf pbuf;
|
2973 | 3015 |
MDB_page *p;
|
2974 | 3016 |
MDB_meta *m;
|
2975 | 3017 |
int i, rc, off;
|
|
3018 |
enum { Size = sizeof(pbuf) };
|
2976 | 3019 |
|
2977 | 3020 |
/* We don't know the page size yet, so use a minimum value.
|
2978 | 3021 |
* Read both meta pages so we can use the latest one.
|
|
2984 | 3027 |
OVERLAPPED ov;
|
2985 | 3028 |
memset(&ov, 0, sizeof(ov));
|
2986 | 3029 |
ov.Offset = off;
|
2987 | |
rc = ReadFile(env->me_fd,&pbuf,MDB_PAGESIZE,&len,&ov) ? (int)len : -1;
|
|
3030 |
rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
|
2988 | 3031 |
if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
|
2989 | 3032 |
rc = 0;
|
2990 | 3033 |
#else
|
2991 | |
rc = pread(env->me_fd, &pbuf, MDB_PAGESIZE, off);
|
2992 | |
#endif
|
2993 | |
if (rc != MDB_PAGESIZE) {
|
|
3034 |
rc = pread(env->me_fd, &pbuf, Size, off);
|
|
3035 |
#endif
|
|
3036 |
if (rc != Size) {
|
2994 | 3037 |
if (rc == 0 && off == 0)
|
2995 | 3038 |
return ENOENT;
|
2996 | 3039 |
rc = rc < 0 ? (int) ErrCode() : MDB_INVALID;
|
|
3121 | 3164 |
mp->mm_last_pg = txn->mt_next_pgno - 1;
|
3122 | 3165 |
mp->mm_txnid = txn->mt_txnid;
|
3123 | 3166 |
if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
|
|
3167 |
unsigned meta_size = env->me_psize;
|
3124 | 3168 |
rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
|
3125 | 3169 |
ptr = env->me_map;
|
3126 | |
if (toggle)
|
3127 | |
ptr += env->me_psize;
|
3128 | |
if (MDB_MSYNC(ptr, env->me_psize, rc)) {
|
|
3170 |
if (toggle) {
|
|
3171 |
#ifndef _WIN32 /* POSIX msync() requires ptr = start of OS page */
|
|
3172 |
if (meta_size < env->me_os_psize)
|
|
3173 |
meta_size += meta_size;
|
|
3174 |
else
|
|
3175 |
#endif
|
|
3176 |
ptr += meta_size;
|
|
3177 |
}
|
|
3178 |
if (MDB_MSYNC(ptr, meta_size, rc)) {
|
3129 | 3179 |
rc = ErrCode();
|
3130 | 3180 |
goto fail;
|
3131 | 3181 |
}
|
|
3231 | 3281 |
e->me_wmutex = SEM_FAILED;
|
3232 | 3282 |
#endif
|
3233 | 3283 |
e->me_pid = getpid();
|
|
3284 |
GET_PAGESIZE(e->me_os_psize);
|
3234 | 3285 |
VGMEMP_CREATE(e,0,0);
|
3235 | 3286 |
*env = e;
|
3236 | 3287 |
return MDB_SUCCESS;
|
|
3396 | 3447 |
return i;
|
3397 | 3448 |
DPUTS("new mdbenv");
|
3398 | 3449 |
newenv = 1;
|
3399 | |
GET_PAGESIZE(env->me_psize);
|
|
3450 |
env->me_psize = env->me_os_psize;
|
|
3451 |
if (env->me_psize > MAX_PAGESIZE)
|
|
3452 |
env->me_psize = MAX_PAGESIZE;
|
3400 | 3453 |
} else {
|
3401 | 3454 |
env->me_psize = meta.mm_psize;
|
3402 | 3455 |
}
|
|
3507 | 3560 |
#pragma comment(linker, "/INCLUDE:_tls_used")
|
3508 | 3561 |
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
|
3509 | 3562 |
#pragma const_seg(".CRT$XLB")
|
3510 | |
extern const PIMAGE_TLS_CALLBACK mdb_tls_callback;
|
|
3563 |
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
|
3511 | 3564 |
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
|
3512 | 3565 |
#pragma const_seg()
|
3513 | 3566 |
#else /* WIN32 */
|
|
3771 | 3824 |
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
|
3772 | 3825 |
if (size < rsize && *excl > 0) {
|
3773 | 3826 |
#ifdef _WIN32
|
3774 | |
if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != rsize
|
|
3827 |
if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
|
3775 | 3828 |
|| !SetEndOfFile(env->me_lfd))
|
3776 | 3829 |
goto fail_errno;
|
3777 | 3830 |
#else
|
|
3927 | 3980 |
* at runtime. Changing other flags requires closing the
|
3928 | 3981 |
* environment and re-opening it with the new flags.
|
3929 | 3982 |
*/
|
3930 | |
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
|
3931 | |
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK)
|
|
3983 |
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC|MDB_NOMEMINIT)
|
|
3984 |
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP| \
|
|
3985 |
MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD)
|
3932 | 3986 |
|
3933 | 3987 |
int
|
3934 | 3988 |
mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
|
|
4041 | 4095 |
DPRINTF(("opened dbenv %p", (void *) env));
|
4042 | 4096 |
if (excl > 0) {
|
4043 | 4097 |
rc = mdb_env_share_locks(env, &excl);
|
4044 | |
}
|
|
4098 |
if (rc)
|
|
4099 |
goto leave;
|
|
4100 |
}
|
|
4101 |
if (!((flags & MDB_RDONLY) ||
|
|
4102 |
(env->me_pbuf = calloc(1, env->me_psize))))
|
|
4103 |
rc = ENOMEM;
|
4045 | 4104 |
}
|
4046 | 4105 |
|
4047 | 4106 |
leave:
|
|
4065 | 4124 |
for (i = env->me_maxdbs; --i > MAIN_DBI; )
|
4066 | 4125 |
free(env->me_dbxs[i].md_name.mv_data);
|
4067 | 4126 |
|
|
4127 |
free(env->me_pbuf);
|
4068 | 4128 |
free(env->me_dbflags);
|
4069 | 4129 |
free(env->me_dbxs);
|
4070 | 4130 |
free(env->me_path);
|
|
4092 | 4152 |
if (env->me_fd != INVALID_HANDLE_VALUE)
|
4093 | 4153 |
(void) close(env->me_fd);
|
4094 | 4154 |
if (env->me_txns) {
|
4095 | |
pid_t pid = env->me_pid;
|
|
4155 |
MDB_PID_T pid = env->me_pid;
|
4096 | 4156 |
/* Clearing readers is done in this function because
|
4097 | 4157 |
* me_txkey with its destructor must be disabled first.
|
4098 | 4158 |
*/
|
|
5419 | 5479 |
rc = EINVAL;
|
5420 | 5480 |
} else {
|
5421 | 5481 |
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
5422 | |
if (!NUMKEYS(mp)) {
|
5423 | |
mc->mc_ki[mc->mc_top] = 0;
|
|
5482 |
int nkeys = NUMKEYS(mp);
|
|
5483 |
if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
|
|
5484 |
mc->mc_ki[mc->mc_top] = nkeys;
|
5424 | 5485 |
rc = MDB_NOTFOUND;
|
5425 | 5486 |
break;
|
5426 | 5487 |
}
|
|
5601 | 5662 |
unsigned int flags)
|
5602 | 5663 |
{
|
5603 | 5664 |
enum { MDB_NO_ROOT = MDB_LAST_ERRCODE+10 }; /* internal code */
|
|
5665 |
MDB_env *env = mc->mc_txn->mt_env;
|
5604 | 5666 |
MDB_node *leaf = NULL;
|
5605 | 5667 |
MDB_val xdata, *rdata, dkey;
|
5606 | |
MDB_page *fp;
|
5607 | 5668 |
MDB_db dummy;
|
5608 | 5669 |
int do_sub = 0, insert = 0;
|
5609 | 5670 |
unsigned int mcount = 0, dcount = 0, nospill;
|
5610 | 5671 |
size_t nsize;
|
5611 | 5672 |
int rc, rc2;
|
5612 | |
MDB_pagebuf pbuf;
|
5613 | 5673 |
char dbuf[MDB_MAXKEYSIZE+1];
|
5614 | 5674 |
unsigned int nflags;
|
5615 | 5675 |
DKBUF;
|
|
5723 | 5783 |
|
5724 | 5784 |
/* The key already exists */
|
5725 | 5785 |
if (rc == MDB_SUCCESS) {
|
|
5786 |
MDB_page *fp, *mp;
|
|
5787 |
MDB_val olddata;
|
|
5788 |
|
5726 | 5789 |
/* there's only a key anyway, so this is a no-op */
|
5727 | 5790 |
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
|
5728 | 5791 |
unsigned int ksize = mc->mc_db->md_pad;
|
|
5735 | 5798 |
return MDB_SUCCESS;
|
5736 | 5799 |
}
|
5737 | 5800 |
|
|
5801 |
more:
|
5738 | 5802 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5803 |
olddata.mv_size = NODEDSZ(leaf);
|
|
5804 |
olddata.mv_data = NODEDATA(leaf);
|
5739 | 5805 |
|
5740 | 5806 |
/* DB has dups? */
|
5741 | 5807 |
if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
|
|
5808 |
mp = fp = xdata.mv_data = env->me_pbuf;
|
|
5809 |
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
|
5810 |
|
5742 | 5811 |
/* Was a single item before, must convert now */
|
5743 | |
more:
|
5744 | 5812 |
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
5745 | 5813 |
/* Just overwrite the current item */
|
5746 | 5814 |
if (flags == MDB_CURRENT)
|
5747 | 5815 |
goto current;
|
5748 | 5816 |
|
5749 | |
dkey.mv_size = NODEDSZ(leaf);
|
5750 | |
dkey.mv_data = NODEDATA(leaf);
|
|
5817 |
dkey = olddata;
|
5751 | 5818 |
#if UINT_MAX < SIZE_MAX
|
5752 | 5819 |
if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
|
5753 | 5820 |
#ifdef MISALIGNED_OK
|
|
5770 | 5837 |
/* create a fake page for the dup items */
|
5771 | 5838 |
memcpy(dbuf, dkey.mv_data, dkey.mv_size);
|
5772 | 5839 |
dkey.mv_data = dbuf;
|
5773 | |
fp = (MDB_page *)&pbuf;
|
5774 | |
fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5775 | 5840 |
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
|
5776 | 5841 |
fp->mp_lower = PAGEHDRSZ;
|
5777 | |
fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
|
|
5842 |
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
|
5778 | 5843 |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5779 | 5844 |
fp->mp_flags |= P_LEAF2;
|
5780 | 5845 |
fp->mp_pad = data->mv_size;
|
5781 | |
fp->mp_upper += 2 * data->mv_size; /* leave space for 2 more */
|
|
5846 |
xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
|
5782 | 5847 |
} else {
|
5783 | |
fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
|
|
5848 |
xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
|
5784 | 5849 |
(dkey.mv_size & 1) + (data->mv_size & 1);
|
5785 | 5850 |
}
|
5786 | |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
5787 | |
do_sub = 1;
|
5788 | |
rdata = &xdata;
|
5789 | |
xdata.mv_size = fp->mp_upper;
|
5790 | |
xdata.mv_data = fp;
|
5791 | |
flags |= F_DUPDATA;
|
5792 | |
goto new_sub;
|
5793 | |
}
|
5794 | |
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
|
5851 |
fp->mp_upper = xdata.mv_size;
|
|
5852 |
} else if (leaf->mn_flags & F_SUBDATA) {
|
|
5853 |
/* Data is on sub-DB, just store it */
|
|
5854 |
flags |= F_DUPDATA|F_SUBDATA;
|
|
5855 |
goto put_sub;
|
|
5856 |
} else {
|
5795 | 5857 |
/* See if we need to convert from fake page to subDB */
|
5796 | |
MDB_page *mp;
|
5797 | 5858 |
unsigned int offset;
|
5798 | 5859 |
unsigned int i;
|
5799 | 5860 |
uint16_t fp_flags;
|
5800 | 5861 |
|
5801 | |
fp = NODEDATA(leaf);
|
5802 | |
if (flags == MDB_CURRENT) {
|
5803 | |
reuse:
|
|
5862 |
fp = olddata.mv_data;
|
|
5863 |
switch (flags) {
|
|
5864 |
default:
|
|
5865 |
if (!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
|
|
5866 |
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
|
5867 |
offset += offset & 1;
|
|
5868 |
break;
|
|
5869 |
}
|
|
5870 |
offset = fp->mp_pad;
|
|
5871 |
if (SIZELEFT(fp) < offset) {
|
|
5872 |
offset *= 4; /* space for 4 more */
|
|
5873 |
break;
|
|
5874 |
}
|
|
5875 |
/* FALLTHRU: Big enough MDB_DUPFIXED sub-page */
|
|
5876 |
case MDB_CURRENT:
|
5804 | 5877 |
fp->mp_flags |= P_DIRTY;
|
5805 | |
COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
|
|
5878 |
COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
|
5806 | 5879 |
mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
|
5807 | 5880 |
flags |= F_DUPDATA;
|
5808 | 5881 |
goto put_sub;
|
5809 | 5882 |
}
|
5810 | |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5811 | |
offset = fp->mp_pad;
|
5812 | |
if (SIZELEFT(fp) >= offset)
|
5813 | |
goto reuse;
|
5814 | |
offset *= 4; /* space for 4 more */
|
5815 | |
} else {
|
5816 | |
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
5817 | |
}
|
5818 | |
offset += offset & 1;
|
5819 | 5883 |
fp_flags = fp->mp_flags;
|
5820 | |
if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
|
5821 | |
offset >= mc->mc_txn->mt_env->me_nodemax) {
|
|
5884 |
xdata.mv_size = olddata.mv_size + offset;
|
|
5885 |
if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + xdata.mv_size
|
|
5886 |
>= env->me_nodemax) {
|
5822 | 5887 |
/* yes, convert it */
|
5823 | |
dummy.md_flags = 0;
|
5824 | 5888 |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
5825 | 5889 |
dummy.md_pad = fp->mp_pad;
|
5826 | 5890 |
dummy.md_flags = MDB_DUPFIXED;
|
5827 | 5891 |
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
|
5828 | 5892 |
dummy.md_flags |= MDB_INTEGERKEY;
|
|
5893 |
} else {
|
|
5894 |
dummy.md_pad = 0;
|
|
5895 |
dummy.md_flags = 0;
|
5829 | 5896 |
}
|
5830 | 5897 |
dummy.md_depth = 1;
|
5831 | 5898 |
dummy.md_branch_pages = 0;
|
5832 | 5899 |
dummy.md_leaf_pages = 1;
|
5833 | 5900 |
dummy.md_overflow_pages = 0;
|
5834 | 5901 |
dummy.md_entries = NUMKEYS(fp);
|
5835 | |
rdata = &xdata;
|
5836 | 5902 |
xdata.mv_size = sizeof(MDB_db);
|
5837 | 5903 |
xdata.mv_data = &dummy;
|
5838 | 5904 |
if ((rc = mdb_page_alloc(mc, 1, &mp)))
|
5839 | 5905 |
return rc;
|
5840 | |
offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
|
|
5906 |
offset = env->me_psize - olddata.mv_size;
|
5841 | 5907 |
flags |= F_DUPDATA|F_SUBDATA;
|
5842 | 5908 |
dummy.md_root = mp->mp_pgno;
|
5843 | 5909 |
fp_flags &= ~P_SUBP;
|
5844 | |
} else {
|
5845 | |
/* no, just grow it */
|
5846 | |
rdata = &xdata;
|
5847 | |
xdata.mv_size = NODEDSZ(leaf) + offset;
|
5848 | |
xdata.mv_data = &pbuf;
|
5849 | |
mp = (MDB_page *)&pbuf;
|
5850 | |
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
5851 | |
flags |= F_DUPDATA;
|
5852 | 5910 |
}
|
5853 | 5911 |
mp->mp_flags = fp_flags | P_DIRTY;
|
5854 | 5912 |
mp->mp_pad = fp->mp_pad;
|
|
5857 | 5915 |
if (IS_LEAF2(fp)) {
|
5858 | 5916 |
memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
|
5859 | 5917 |
} else {
|
5860 | |
nsize = NODEDSZ(leaf) - fp->mp_upper;
|
5861 | |
memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
|
|
5918 |
memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper,
|
|
5919 |
olddata.mv_size - fp->mp_upper);
|
5862 | 5920 |
for (i=0; i<NUMKEYS(fp); i++)
|
5863 | 5921 |
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
|
5864 | 5922 |
}
|
5865 | |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
5866 | |
do_sub = 1;
|
5867 | |
goto new_sub;
|
5868 | 5923 |
}
|
5869 | |
/* data is on sub-DB, just store it */
|
5870 | |
flags |= F_DUPDATA|F_SUBDATA;
|
5871 | |
goto put_sub;
|
|
5924 |
|
|
5925 |
rdata = &xdata;
|
|
5926 |
flags |= F_DUPDATA;
|
|
5927 |
do_sub = 1;
|
|
5928 |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
|
5929 |
goto new_sub;
|
5872 | 5930 |
}
|
5873 | 5931 |
current:
|
5874 | 5932 |
/* overflow page overwrites need special handling */
|
5875 | 5933 |
if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
|
5876 | 5934 |
MDB_page *omp;
|
5877 | 5935 |
pgno_t pg;
|
5878 | |
unsigned psize = mc->mc_txn->mt_env->me_psize;
|
5879 | |
int level, ovpages, dpages = OVPAGES(data->mv_size, psize);
|
5880 | |
|
5881 | |
memcpy(&pg, NODEDATA(leaf), sizeof(pg));
|
|
5936 |
int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
|
|
5937 |
|
|
5938 |
memcpy(&pg, olddata.mv_data, sizeof(pg));
|
5882 | 5939 |
if ((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)
|
5883 | 5940 |
return rc2;
|
5884 | 5941 |
ovpages = omp->mp_pages;
|
|
5886 | 5943 |
/* Is the ov page large enough? */
|
5887 | 5944 |
if (ovpages >= dpages) {
|
5888 | 5945 |
if (!(omp->mp_flags & P_DIRTY) &&
|
5889 | |
(level || (mc->mc_txn->mt_env->me_flags & MDB_WRITEMAP)))
|
|
5946 |
(level || (env->me_flags & MDB_WRITEMAP)))
|
5890 | 5947 |
{
|
5891 | 5948 |
rc = mdb_page_unspill(mc->mc_txn, omp, &omp);
|
5892 | 5949 |
if (rc)
|
|
5901 | 5958 |
*/
|
5902 | 5959 |
if (level > 1) {
|
5903 | 5960 |
/* It is writable only in a parent txn */
|
5904 | |
size_t sz = (size_t) psize * ovpages, off;
|
|
5961 |
size_t sz = (size_t) env->me_psize * ovpages, off;
|
5905 | 5962 |
MDB_page *np = mdb_page_malloc(mc->mc_txn, ovpages);
|
5906 | 5963 |
MDB_ID2 id2;
|
5907 | 5964 |
if (!np)
|
|
5931 | 5988 |
}
|
5932 | 5989 |
if ((rc2 = mdb_ovpage_free(mc, omp)) != MDB_SUCCESS)
|
5933 | 5990 |
return rc2;
|
5934 | |
} else if (NODEDSZ(leaf) == data->mv_size) {
|
|
5991 |
} else if (data->mv_size == olddata.mv_size) {
|
5935 | 5992 |
/* same size, just replace it. Note that we could
|
5936 | 5993 |
* also reuse this node if the new data is smaller,
|
5937 | 5994 |
* but instead we opt to shrink the node in that case.
|
5938 | 5995 |
*/
|
5939 | 5996 |
if (F_ISSET(flags, MDB_RESERVE))
|
5940 | |
data->mv_data = NODEDATA(leaf);
|
|
5997 |
data->mv_data = olddata.mv_data;
|
5941 | 5998 |
else if (data->mv_size)
|
5942 | |
memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
|
|
5999 |
memcpy(olddata.mv_data, data->mv_data, data->mv_size);
|
5943 | 6000 |
else
|
5944 | 6001 |
memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
|
5945 | 6002 |
goto done;
|
|
5955 | 6012 |
|
5956 | 6013 |
new_sub:
|
5957 | 6014 |
nflags = flags & NODE_ADD_FLAGS;
|
5958 | |
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
|
|
6015 |
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(env, key, rdata);
|
5959 | 6016 |
if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
|
5960 | 6017 |
if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
|
5961 | 6018 |
nflags &= ~MDB_APPEND;
|
|
6049 | 6106 |
data[1].mv_size = mcount;
|
6050 | 6107 |
if (mcount < dcount) {
|
6051 | 6108 |
data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
|
6052 | |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
6053 | 6109 |
goto more;
|
6054 | 6110 |
}
|
6055 | 6111 |
}
|
|
6068 | 6124 |
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
6069 | 6125 |
{
|
6070 | 6126 |
MDB_node *leaf;
|
|
6127 |
MDB_page *mp;
|
6071 | 6128 |
int rc;
|
6072 | 6129 |
|
6073 | 6130 |
if (mc->mc_txn->mt_flags & (MDB_TXN_RDONLY|MDB_TXN_ERROR))
|
|
6075 | 6132 |
|
6076 | 6133 |
if (!(mc->mc_flags & C_INITIALIZED))
|
6077 | 6134 |
return EINVAL;
|
|
6135 |
|
|
6136 |
if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
|
|
6137 |
return MDB_NOTFOUND;
|
6078 | 6138 |
|
6079 | 6139 |
if (!(flags & MDB_NOSPILL) && (rc = mdb_page_spill(mc, NULL, NULL)))
|
6080 | 6140 |
return rc;
|
|
6083 | 6143 |
if (rc)
|
6084 | 6144 |
return rc;
|
6085 | 6145 |
|
6086 | |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
6087 | |
|
6088 | |
if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
6146 |
mp = mc->mc_pg[mc->mc_top];
|
|
6147 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
6148 |
|
|
6149 |
if (!IS_LEAF2(mp) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
6089 | 6150 |
if (!(flags & MDB_NODUPDATA)) {
|
6090 | 6151 |
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
6091 | 6152 |
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
|
6100 | 6161 |
} else {
|
6101 | 6162 |
MDB_cursor *m2;
|
6102 | 6163 |
/* shrink fake page */
|
6103 | |
mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
6104 | |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
6164 |
mdb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
|
|
6165 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
6105 | 6166 |
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
6106 | 6167 |
/* fix other sub-DB cursors pointed at this fake page */
|
6107 | 6168 |
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
|
6108 | 6169 |
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
|
6109 | |
if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top] &&
|
|
6170 |
if (m2->mc_pg[mc->mc_top] == mp &&
|
6110 | 6171 |
m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top])
|
6111 | 6172 |
m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
6112 | 6173 |
}
|
|
6784 | 6845 |
flags = 0;
|
6785 | 6846 |
} else {
|
6786 | 6847 |
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
|
6787 | |
assert(!((long)srcnode&1));
|
|
6848 |
assert(!((size_t)srcnode&1));
|
6788 | 6849 |
srcpg = NODEPGNO(srcnode);
|
6789 | 6850 |
flags = srcnode->mn_flags;
|
6790 | 6851 |
if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
|
|
7284 | 7345 |
|
7285 | 7346 |
/* Adjust other cursors pointing to mp */
|
7286 | 7347 |
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
7287 | |
if (m2 == mc)
|
|
7348 |
if (m2 == mc || m2->mc_snum < mc->mc_snum)
|
7288 | 7349 |
continue;
|
7289 | 7350 |
if (!(m2->mc_flags & C_INITIALIZED))
|
7290 | 7351 |
continue;
|
|
7535 | 7596 |
psize = 0;
|
7536 | 7597 |
if (newindx <= split_indx || newindx >= nkeys) {
|
7537 | 7598 |
i = 0; j = 1;
|
7538 | |
k = newindx >= nkeys ? nkeys : split_indx+1;
|
|
7599 |
k = newindx >= nkeys ? nkeys : split_indx+2;
|
7539 | 7600 |
} else {
|
7540 | 7601 |
i = nkeys; j = -1;
|
7541 | 7602 |
k = split_indx-1;
|
|
7555 | 7616 |
}
|
7556 | 7617 |
psize += psize & 1;
|
7557 | 7618 |
}
|
7558 | |
if (psize > pmax) {
|
|
7619 |
if (psize > pmax || i == k-j) {
|
7559 | 7620 |
split_indx = i + (j<0);
|
7560 | 7621 |
break;
|
7561 | 7622 |
}
|
7562 | 7623 |
}
|
7563 | |
/* special case: when the new node was on the last
|
7564 | |
* slot we may not have tripped the break inside the loop.
|
7565 | |
* In all other cases we either hit the break condition,
|
7566 | |
* or the original split_indx was already safe.
|
7567 | |
*/
|
7568 | |
if (newindx >= nkeys && i == k)
|
7569 | |
split_indx = nkeys-1;
|
7570 | 7624 |
}
|
7571 | 7625 |
if (split_indx == newindx) {
|
7572 | 7626 |
sepkey.mv_size = newkey->mv_size;
|
|
8251 | 8305 |
/** Insert pid into list if not already present.
|
8252 | 8306 |
* Returns -1 if already present.
|
8253 | 8307 |
*/
|
8254 | |
static int mdb_pid_insert(pid_t *ids, pid_t pid)
|
|
8308 |
static int mdb_pid_insert(MDB_PID_T *ids, MDB_PID_T pid)
|
8255 | 8309 |
{
|
8256 | 8310 |
/* binary search of pid in list */
|
8257 | 8311 |
unsigned base = 0;
|
|
8291 | 8345 |
{
|
8292 | 8346 |
unsigned int i, j, rdrs;
|
8293 | 8347 |
MDB_reader *mr;
|
8294 | |
pid_t *pids, pid;
|
|
8348 |
MDB_PID_T *pids, pid;
|
8295 | 8349 |
int count = 0;
|
8296 | 8350 |
|
8297 | 8351 |
if (!env)
|
|
8301 | 8355 |
if (!env->me_txns)
|
8302 | 8356 |
return MDB_SUCCESS;
|
8303 | 8357 |
rdrs = env->me_txns->mti_numreaders;
|
8304 | |
pids = malloc((rdrs+1) * sizeof(pid_t));
|
|
8358 |
pids = malloc((rdrs+1) * sizeof(MDB_PID_T));
|
8305 | 8359 |
if (!pids)
|
8306 | 8360 |
return ENOMEM;
|
8307 | 8361 |
pids[0] = 0;
|