|
0 |
/** @file mdb.c
|
|
1 |
* @brief memory-mapped database library
|
|
2 |
*
|
|
3 |
* A Btree-based database management library modeled loosely on the
|
|
4 |
* BerkeleyDB API, but much simplified.
|
|
5 |
*/
|
|
6 |
/*
|
|
7 |
* Copyright 2011-2013 Howard Chu, Symas Corp.
|
|
8 |
* All rights reserved.
|
|
9 |
*
|
|
10 |
* Redistribution and use in source and binary forms, with or without
|
|
11 |
* modification, are permitted only as authorized by the OpenLDAP
|
|
12 |
* Public License.
|
|
13 |
*
|
|
14 |
* A copy of this license is available in the file LICENSE in the
|
|
15 |
* top-level directory of the distribution or, alternatively, at
|
|
16 |
* <http://www.OpenLDAP.org/license.html>.
|
|
17 |
*
|
|
18 |
* This code is derived from btree.c written by Martin Hedenfalk.
|
|
19 |
*
|
|
20 |
* Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
|
|
21 |
*
|
|
22 |
* Permission to use, copy, modify, and distribute this software for any
|
|
23 |
* purpose with or without fee is hereby granted, provided that the above
|
|
24 |
* copyright notice and this permission notice appear in all copies.
|
|
25 |
*
|
|
26 |
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
27 |
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
28 |
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
29 |
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
30 |
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
31 |
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
32 |
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
33 |
*/
|
|
34 |
#define _GNU_SOURCE 1
|
|
35 |
#include <sys/types.h>
|
|
36 |
#include <sys/stat.h>
|
|
37 |
#include <sys/param.h>
|
|
38 |
#ifdef _WIN32
|
|
39 |
#include <windows.h>
|
|
40 |
#else
|
|
41 |
#include <sys/uio.h>
|
|
42 |
#include <sys/mman.h>
|
|
43 |
#ifdef HAVE_SYS_FILE_H
|
|
44 |
#include <sys/file.h>
|
|
45 |
#endif
|
|
46 |
#include <fcntl.h>
|
|
47 |
#endif
|
|
48 |
|
|
49 |
#include <assert.h>
|
|
50 |
#include <errno.h>
|
|
51 |
#include <limits.h>
|
|
52 |
#include <stddef.h>
|
|
53 |
#include <inttypes.h>
|
|
54 |
#include <stdio.h>
|
|
55 |
#include <stdlib.h>
|
|
56 |
#include <string.h>
|
|
57 |
#include <time.h>
|
|
58 |
#include <unistd.h>
|
|
59 |
|
|
60 |
#if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
|
|
61 |
#include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
|
|
62 |
#endif
|
|
63 |
|
|
64 |
#if defined(__APPLE__) || defined (BSD)
|
|
65 |
# define MDB_USE_POSIX_SEM 1
|
|
66 |
# define MDB_FDATASYNC fsync
|
|
67 |
#elif defined(ANDROID)
|
|
68 |
# define MDB_FDATASYNC fsync
|
|
69 |
#endif
|
|
70 |
|
|
71 |
#ifndef _WIN32
|
|
72 |
#include <pthread.h>
|
|
73 |
#ifdef MDB_USE_POSIX_SEM
|
|
74 |
#include <semaphore.h>
|
|
75 |
#endif
|
|
76 |
#endif
|
|
77 |
|
|
78 |
#ifdef USE_VALGRIND
|
|
79 |
#include <valgrind/memcheck.h>
|
|
80 |
#define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z)
|
|
81 |
#define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
|
|
82 |
#define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
|
|
83 |
#define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
|
|
84 |
#define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
|
|
85 |
#else
|
|
86 |
#define VGMEMP_CREATE(h,r,z)
|
|
87 |
#define VGMEMP_ALLOC(h,a,s)
|
|
88 |
#define VGMEMP_FREE(h,a)
|
|
89 |
#define VGMEMP_DESTROY(h)
|
|
90 |
#define VGMEMP_DEFINED(a,s)
|
|
91 |
#endif
|
|
92 |
|
|
93 |
#ifndef BYTE_ORDER
|
|
94 |
# if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
|
|
95 |
/* Solaris just defines one or the other */
|
|
96 |
# define LITTLE_ENDIAN 1234
|
|
97 |
# define BIG_ENDIAN 4321
|
|
98 |
# ifdef _LITTLE_ENDIAN
|
|
99 |
# define BYTE_ORDER LITTLE_ENDIAN
|
|
100 |
# else
|
|
101 |
# define BYTE_ORDER BIG_ENDIAN
|
|
102 |
# endif
|
|
103 |
# else
|
|
104 |
# define BYTE_ORDER __BYTE_ORDER
|
|
105 |
# endif
|
|
106 |
#endif
|
|
107 |
|
|
108 |
#ifndef LITTLE_ENDIAN
|
|
109 |
#define LITTLE_ENDIAN __LITTLE_ENDIAN
|
|
110 |
#endif
|
|
111 |
#ifndef BIG_ENDIAN
|
|
112 |
#define BIG_ENDIAN __BIG_ENDIAN
|
|
113 |
#endif
|
|
114 |
|
|
115 |
#if defined(__i386) || defined(__x86_64)
|
|
116 |
#define MISALIGNED_OK 1
|
|
117 |
#endif
|
|
118 |
|
|
119 |
#include "lmdb.h"
|
|
120 |
#include "midl.h"
|
|
121 |
|
|
122 |
#if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
|
|
123 |
# error "Unknown or unsupported endianness (BYTE_ORDER)"
|
|
124 |
#elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
|
|
125 |
# error "Two's complement, reasonably sized integer types, please"
|
|
126 |
#endif
|
|
127 |
|
|
128 |
/** @defgroup internal MDB Internals
|
|
129 |
* @{
|
|
130 |
*/
|
|
131 |
/** @defgroup compat Windows Compatibility Macros
|
|
132 |
* A bunch of macros to minimize the amount of platform-specific ifdefs
|
|
133 |
* needed throughout the rest of the code. When the features this library
|
|
134 |
* needs are similar enough to POSIX to be hidden in a one-or-two line
|
|
135 |
* replacement, this macro approach is used.
|
|
136 |
* @{
|
|
137 |
*/
|
|
138 |
#ifdef _WIN32
|
|
139 |
#define pthread_t DWORD
|
|
140 |
#define pthread_mutex_t HANDLE
|
|
141 |
#define pthread_key_t DWORD
|
|
142 |
#define pthread_self() GetCurrentThreadId()
|
|
143 |
#define pthread_key_create(x,y) \
|
|
144 |
((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
|
|
145 |
#define pthread_key_delete(x) TlsFree(x)
|
|
146 |
#define pthread_getspecific(x) TlsGetValue(x)
|
|
147 |
#define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
|
|
148 |
#define pthread_mutex_unlock(x) ReleaseMutex(x)
|
|
149 |
#define pthread_mutex_lock(x) WaitForSingleObject(x, INFINITE)
|
|
150 |
#define LOCK_MUTEX_R(env) pthread_mutex_lock((env)->me_rmutex)
|
|
151 |
#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex)
|
|
152 |
#define LOCK_MUTEX_W(env) pthread_mutex_lock((env)->me_wmutex)
|
|
153 |
#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex)
|
|
154 |
#define getpid() GetCurrentProcessId()
|
|
155 |
#define MDB_FDATASYNC(fd) (!FlushFileBuffers(fd))
|
|
156 |
#define MDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len))
|
|
157 |
#define ErrCode() GetLastError()
|
|
158 |
#define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
|
|
159 |
#define close(fd) CloseHandle(fd)
|
|
160 |
#define munmap(ptr,len) UnmapViewOfFile(ptr)
|
|
161 |
#else
|
|
162 |
|
|
163 |
#ifdef MDB_USE_POSIX_SEM
|
|
164 |
|
|
165 |
#define LOCK_MUTEX_R(env) mdb_sem_wait((env)->me_rmutex)
|
|
166 |
#define UNLOCK_MUTEX_R(env) sem_post((env)->me_rmutex)
|
|
167 |
#define LOCK_MUTEX_W(env) mdb_sem_wait((env)->me_wmutex)
|
|
168 |
#define UNLOCK_MUTEX_W(env) sem_post((env)->me_wmutex)
|
|
169 |
|
|
170 |
/** Wait on a semaphore, restarting the wait whenever it is interrupted.
 * @param[in] sem The semaphore to wait on.
 * @return 0 once the semaphore has been acquired, otherwise the errno
 * value reported by the failing sem_wait() call (never EINTR).
 */
static int
mdb_sem_wait(sem_t *sem)
{
	for (;;) {
		int err;

		if (sem_wait(sem) == 0)
			return 0;
		/* Capture errno right away, then retry only on interruption. */
		err = errno;
		if (err != EINTR)
			return err;
	}
}
|
|
177 |
|
|
178 |
#else
|
|
179 |
/** Lock the reader mutex.
|
|
180 |
*/
|
|
181 |
#define LOCK_MUTEX_R(env) pthread_mutex_lock(&(env)->me_txns->mti_mutex)
|
|
182 |
/** Unlock the reader mutex.
|
|
183 |
*/
|
|
184 |
#define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_txns->mti_mutex)
|
|
185 |
|
|
186 |
/** Lock the writer mutex.
|
|
187 |
* Only a single write transaction is allowed at a time. Other writers
|
|
188 |
* will block waiting for this mutex.
|
|
189 |
*/
|
|
190 |
#define LOCK_MUTEX_W(env) pthread_mutex_lock(&(env)->me_txns->mti_wmutex)
|
|
191 |
/** Unlock the writer mutex.
|
|
192 |
*/
|
|
193 |
#define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_txns->mti_wmutex)
|
|
194 |
#endif /* MDB_USE_POSIX_SEM */
|
|
195 |
|
|
196 |
/** Get the error code for the last failed system function.
|
|
197 |
*/
|
|
198 |
#define ErrCode() errno
|
|
199 |
|
|
200 |
/** An abstraction for a file handle.
|
|
201 |
* On POSIX systems file handles are small integers. On Windows
|
|
202 |
* they're opaque pointers.
|
|
203 |
*/
|
|
204 |
#define HANDLE int
|
|
205 |
|
|
206 |
/** A value for an invalid file handle.
|
|
207 |
* Mainly used to initialize file variables and signify that they are
|
|
208 |
* unused.
|
|
209 |
*/
|
|
210 |
#define INVALID_HANDLE_VALUE (-1)
|
|
211 |
|
|
212 |
/** Get the size of a memory page for the system.
|
|
213 |
* This is the basic size that the platform's memory manager uses, and is
|
|
214 |
* fundamental to the use of memory-mapped files.
|
|
215 |
*/
|
|
216 |
#define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
|
|
217 |
#endif
|
|
218 |
|
|
219 |
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
|
|
220 |
#define MNAME_LEN 32
|
|
221 |
#else
|
|
222 |
#define MNAME_LEN (sizeof(pthread_mutex_t))
|
|
223 |
#endif
|
|
224 |
|
|
225 |
/** @} */
|
|
226 |
|
|
227 |
#ifndef _WIN32
|
|
228 |
/** A flag for opening a file and requesting synchronous data writes.
|
|
229 |
* This is only used when writing a meta page. It's not strictly needed;
|
|
230 |
* we could just do a normal write and then immediately perform a flush.
|
|
231 |
* But if this flag is available it saves us an extra system call.
|
|
232 |
*
|
|
233 |
* @note If O_DSYNC is undefined but exists in /usr/include,
|
|
234 |
* preferably set some compiler flag to get the definition.
|
|
235 |
* Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.
|
|
236 |
*/
|
|
237 |
#ifndef MDB_DSYNC
|
|
238 |
# define MDB_DSYNC O_DSYNC
|
|
239 |
#endif
|
|
240 |
#endif
|
|
241 |
|
|
242 |
/** Function for flushing the data of a file. Define this to fsync
|
|
243 |
* if fdatasync() is not supported.
|
|
244 |
*/
|
|
245 |
#ifndef MDB_FDATASYNC
|
|
246 |
# define MDB_FDATASYNC fdatasync
|
|
247 |
#endif
|
|
248 |
|
|
249 |
#ifndef MDB_MSYNC
|
|
250 |
# define MDB_MSYNC(addr,len,flags) msync(addr,len,flags)
|
|
251 |
#endif
|
|
252 |
|
|
253 |
#ifndef MS_SYNC
|
|
254 |
#define MS_SYNC 1
|
|
255 |
#endif
|
|
256 |
|
|
257 |
#ifndef MS_ASYNC
|
|
258 |
#define MS_ASYNC 0
|
|
259 |
#endif
|
|
260 |
|
|
261 |
/** A page number in the database.
|
|
262 |
* Note that 64 bit page numbers are overkill, since pages themselves
|
|
263 |
* already represent 12-13 bits of addressable memory, and the OS will
|
|
264 |
* always limit applications to a maximum of 63 bits of address space.
|
|
265 |
*
|
|
266 |
* @note In the #MDB_node structure, we only store 48 bits of this value,
|
|
267 |
* which thus limits us to only 60 bits of addressable data.
|
|
268 |
*/
|
|
269 |
typedef MDB_ID pgno_t;
|
|
270 |
|
|
271 |
/** A transaction ID.
|
|
272 |
* See struct MDB_txn.mt_txnid for details.
|
|
273 |
*/
|
|
274 |
typedef MDB_ID txnid_t;
|
|
275 |
|
|
276 |
/** @defgroup debug Debug Macros
|
|
277 |
* @{
|
|
278 |
*/
|
|
279 |
#ifndef MDB_DEBUG
|
|
280 |
/** Enable debug output.
|
|
281 |
* Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
|
|
282 |
* read from and written to the database (used for free space management).
|
|
283 |
*/
|
|
284 |
#define MDB_DEBUG 0
|
|
285 |
#endif
|
|
286 |
|
|
287 |
#if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
|
|
288 |
# undef MDB_DEBUG
|
|
289 |
# define MDB_DEBUG 0
|
|
290 |
# define DPRINTF (void) /* Vararg macros may be unsupported */
|
|
291 |
#elif MDB_DEBUG
|
|
292 |
static int mdb_debug;
|
|
293 |
static txnid_t mdb_debug_start;
|
|
294 |
|
|
295 |
/** Print a debug message with printf formatting. */
|
|
296 |
# define DPRINTF(fmt, ...) /**< Requires 2 or more args */ \
|
|
297 |
((void) ((mdb_debug) && \
|
|
298 |
fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)))
|
|
299 |
#else
|
|
300 |
# define DPRINTF(fmt, ...) ((void) 0)
|
|
301 |
# define MDB_DEBUG_SKIP
|
|
302 |
#endif
|
|
303 |
/** Print a debug string.
|
|
304 |
* The string is printed literally, with no format processing.
|
|
305 |
*/
|
|
306 |
#define DPUTS(arg) DPRINTF("%s", arg)
|
|
307 |
/** @} */
|
|
308 |
|
|
309 |
/** A default memory page size.
|
|
310 |
* The actual size is platform-dependent, but we use this for
|
|
311 |
* boot-strapping. We probably should not be using this any more.
|
|
312 |
* The #GET_PAGESIZE() macro is used to get the actual size.
|
|
313 |
*
|
|
314 |
* Note that we don't currently support Huge pages. On Linux,
|
|
315 |
* regular data files cannot use Huge pages, and in general
|
|
316 |
* Huge pages aren't actually pageable. We rely on the OS
|
|
317 |
* demand-pager to read our data and page it out when memory
|
|
318 |
* pressure from other processes is high. So until OSs have
|
|
319 |
* actual paging support for Huge pages, they're not viable.
|
|
320 |
*/
|
|
321 |
#define MDB_PAGESIZE 4096
|
|
322 |
|
|
323 |
/** The minimum number of keys required in a database page.
|
|
324 |
* Setting this to a larger value will place a smaller bound on the
|
|
325 |
* maximum size of a data item. Data items larger than this size will
|
|
326 |
* be pushed into overflow pages instead of being stored directly in
|
|
327 |
* the B-tree node. This value used to default to 4. With a page size
|
|
328 |
* of 4096 bytes that meant that any item larger than 1024 bytes would
|
|
329 |
* go into an overflow page. That also meant that on average 2-3KB of
|
|
330 |
* each overflow page was wasted space. The value cannot be lower than
|
|
331 |
* 2 because then there would no longer be a tree structure. With this
|
|
332 |
* value, items larger than 2KB will go into overflow pages, and on
|
|
333 |
* average only 1KB will be wasted.
|
|
334 |
*/
|
|
335 |
#define MDB_MINKEYS 2
|
|
336 |
|
|
337 |
/** A stamp that identifies a file as an MDB file.
|
|
338 |
* There's nothing special about this value other than that it is easily
|
|
339 |
* recognizable, and it will reflect any byte order mismatches.
|
|
340 |
*/
|
|
341 |
#define MDB_MAGIC 0xBEEFC0DE
|
|
342 |
|
|
343 |
/** The version number for a database's file format. */
|
|
344 |
#define MDB_VERSION 1
|
|
345 |
|
|
346 |
/** @brief The maximum size of a key in the database.
|
|
347 |
*
|
|
348 |
* We require that keys all fit onto a regular page. This limit
|
|
349 |
* could be raised a bit further if needed; to something just
|
|
350 |
* under #MDB_PAGESIZE / #MDB_MINKEYS.
|
|
351 |
*
|
|
352 |
* Note that data items in an #MDB_DUPSORT database are actually keys
|
|
353 |
* of a subDB, so they're also limited to this size.
|
|
354 |
*/
|
|
355 |
#ifndef MDB_MAXKEYSIZE
|
|
356 |
#define MDB_MAXKEYSIZE 511
|
|
357 |
#endif
|
|
358 |
|
|
359 |
/** @brief The maximum size of a data item.
|
|
360 |
*
|
|
361 |
* We only store a 32 bit value for node sizes.
|
|
362 |
*/
|
|
363 |
#define MAXDATASIZE 0xffffffffUL
|
|
364 |
|
|
365 |
#if MDB_DEBUG
|
|
366 |
/** A key buffer.
|
|
367 |
* @ingroup debug
|
|
368 |
* This is used for printing a hex dump of a key's contents.
|
|
369 |
*/
|
|
370 |
#define DKBUF char kbuf[(MDB_MAXKEYSIZE*2+1)]
|
|
371 |
/** Display a key in hex.
|
|
372 |
* @ingroup debug
|
|
373 |
* Invoke a function to display a key in hex.
|
|
374 |
*/
|
|
375 |
#define DKEY(x) mdb_dkey(x, kbuf)
|
|
376 |
#else
|
|
377 |
#define DKBUF typedef int dummy_kbuf /* so we can put ';' after */
|
|
378 |
#define DKEY(x) 0
|
|
379 |
#endif
|
|
380 |
|
|
381 |
/** An invalid page number.
|
|
382 |
* Mainly used to denote an empty tree.
|
|
383 |
*/
|
|
384 |
#define P_INVALID (~(pgno_t)0)
|
|
385 |
|
|
386 |
/** Test if the flags \b f are set in a flag word \b w. */
|
|
387 |
#define F_ISSET(w, f) (((w) & (f)) == (f))
|
|
388 |
|
|
389 |
/** Used for offsets within a single page.
|
|
390 |
* Since memory pages are typically 4 or 8KB in size, 12-13 bits,
|
|
391 |
* this is plenty.
|
|
392 |
*/
|
|
393 |
typedef uint16_t indx_t;
|
|
394 |
|
|
395 |
/** Default size of memory map.
|
|
396 |
* This is certainly too small for any actual applications. Apps should always set
|
|
397 |
* the size explicitly using #mdb_env_set_mapsize().
|
|
398 |
*/
|
|
399 |
#define DEFAULT_MAPSIZE 1048576
|
|
400 |
|
|
401 |
/** @defgroup readers Reader Lock Table
|
|
402 |
* Readers don't acquire any locks for their data access. Instead, they
|
|
403 |
* simply record their transaction ID in the reader table. The reader
|
|
404 |
* mutex is needed just to find an empty slot in the reader table. The
|
|
405 |
* slot's address is saved in thread-specific data so that subsequent read
|
|
406 |
* transactions started by the same thread need no further locking to proceed.
|
|
407 |
*
|
|
408 |
* No reader table is used if the database is on a read-only filesystem.
|
|
409 |
*
|
|
410 |
* Since the database uses multi-version concurrency control, readers don't
|
|
411 |
* actually need any locking. This table is used to keep track of which
|
|
412 |
* readers are using data from which old transactions, so that we'll know
|
|
413 |
* when a particular old transaction is no longer in use. Old transactions
|
|
414 |
* that have discarded any data pages can then have those pages reclaimed
|
|
415 |
* for use by a later write transaction.
|
|
416 |
*
|
|
417 |
* The lock table is constructed such that reader slots are aligned with the
|
|
418 |
* processor's cache line size. Any slot is only ever used by one thread.
|
|
419 |
* This alignment guarantees that there will be no contention or cache
|
|
420 |
* thrashing as threads update their own slot info, and also eliminates
|
|
421 |
* any need for locking when accessing a slot.
|
|
422 |
*
|
|
423 |
* A writer thread will scan every slot in the table to determine the oldest
|
|
424 |
* outstanding reader transaction. Any freed pages older than this will be
|
|
425 |
* reclaimed by the writer. The writer doesn't use any locks when scanning
|
|
426 |
* this table. This means that there's no guarantee that the writer will
|
|
427 |
* see the most up-to-date reader info, but that's not required for correct
|
|
428 |
* operation - all we need is to know the upper bound on the oldest reader,
|
|
429 |
* we don't care at all about the newest reader. So the only consequence of
|
|
430 |
* reading stale information here is that old pages might hang around a
|
|
431 |
* while longer before being reclaimed. That's actually good anyway, because
|
|
432 |
* the longer we delay reclaiming old pages, the more likely it is that a
|
|
433 |
* string of contiguous pages can be found after coalescing old pages from
|
|
434 |
* many old transactions together.
|
|
435 |
* @{
|
|
436 |
*/
|
|
437 |
/** Number of slots in the reader table.
|
|
438 |
* This value was chosen somewhat arbitrarily. 126 readers plus a
|
|
439 |
* couple mutexes fit exactly into 8KB on my development machine.
|
|
440 |
* Applications should set the table size using #mdb_env_set_maxreaders().
|
|
441 |
*/
|
|
442 |
#define DEFAULT_READERS 126
|
|
443 |
|
|
444 |
/** The size of a CPU cache line in bytes. We want our lock structures
|
|
445 |
* aligned to this size to avoid false cache line sharing in the
|
|
446 |
* lock table.
|
|
447 |
* This value works for most CPUs. For Itanium this should be 128.
|
|
448 |
*/
|
|
449 |
#ifndef CACHELINE
|
|
450 |
#define CACHELINE 64
|
|
451 |
#endif
|
|
452 |
|
|
453 |
/** The information we store in a single slot of the reader table.
|
|
454 |
* In addition to a transaction ID, we also record the process and
|
|
455 |
* thread ID that owns a slot, so that we can detect stale information,
|
|
456 |
* e.g. threads or processes that went away without cleaning up.
|
|
457 |
* @note We currently don't check for stale records. We simply re-init
|
|
458 |
* the table when we know that we're the only process opening the
|
|
459 |
* lock file.
|
|
460 |
*/
|
|
461 |
typedef struct MDB_rxbody {
|
|
462 |
/** Current Transaction ID when this transaction began, or (txnid_t)-1.
|
|
463 |
* Multiple readers that start at the same time will probably have the
|
|
464 |
* same ID here. Again, it's not important to exclude them from
|
|
465 |
* anything; all we need to know is which version of the DB they
|
|
466 |
* started from so we can avoid overwriting any data used in that
|
|
467 |
* particular version.
|
|
468 |
*/
|
|
469 |
txnid_t mrb_txnid;
|
|
470 |
/** The process ID of the process owning this reader txn. */
|
|
471 |
pid_t mrb_pid;
|
|
472 |
/** The thread ID of the thread owning this txn. */
|
|
473 |
pthread_t mrb_tid;
|
|
474 |
} MDB_rxbody;
|
|
475 |
|
|
476 |
/** The actual reader record, with cacheline padding. */
|
|
477 |
typedef struct MDB_reader {
|
|
478 |
union {
|
|
479 |
MDB_rxbody mrx;
|
|
480 |
/** shorthand for mrb_txnid */
|
|
481 |
#define mr_txnid mru.mrx.mrb_txnid
|
|
482 |
#define mr_pid mru.mrx.mrb_pid
|
|
483 |
#define mr_tid mru.mrx.mrb_tid
|
|
484 |
/** cache line alignment */
|
|
485 |
char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
|
|
486 |
} mru;
|
|
487 |
} MDB_reader;
|
|
488 |
|
|
489 |
/** The header for the reader table.
|
|
490 |
* The table resides in a memory-mapped file. (This is a different file
|
|
491 |
* than is used for the main database.)
|
|
492 |
*
|
|
493 |
* For POSIX the actual mutexes reside in the shared memory of this
|
|
494 |
* mapped file. On Windows, mutexes are named objects allocated by the
|
|
495 |
* kernel; we store the mutex names in this mapped file so that other
|
|
496 |
* processes can grab them. This same approach is also used on
|
|
497 |
* MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
|
|
498 |
* process-shared POSIX mutexes. For these cases where a named object
|
|
499 |
* is used, the object name is derived from a 64 bit FNV hash of the
|
|
500 |
* environment pathname. As such, naming collisions are extremely
|
|
501 |
* unlikely. If a collision occurs, the results are unpredictable.
|
|
502 |
*/
|
|
503 |
typedef struct MDB_txbody {
|
|
504 |
/** Stamp identifying this as an MDB file. It must be set
|
|
505 |
* to #MDB_MAGIC. */
|
|
506 |
uint32_t mtb_magic;
|
|
507 |
/** Version number of this lock file. Must be set to #MDB_VERSION. */
|
|
508 |
uint32_t mtb_version;
|
|
509 |
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
|
|
510 |
char mtb_rmname[MNAME_LEN];
|
|
511 |
#else
|
|
512 |
/** Mutex protecting access to this table.
|
|
513 |
* This is the reader lock that #LOCK_MUTEX_R acquires.
|
|
514 |
*/
|
|
515 |
pthread_mutex_t mtb_mutex;
|
|
516 |
#endif
|
|
517 |
/** The ID of the last transaction committed to the database.
|
|
518 |
* This is recorded here only for convenience; the value can always
|
|
519 |
* be determined by reading the main database meta pages.
|
|
520 |
*/
|
|
521 |
txnid_t mtb_txnid;
|
|
522 |
/** The number of slots that have been used in the reader table.
|
|
523 |
* This always records the maximum count, it is not decremented
|
|
524 |
* when readers release their slots.
|
|
525 |
*/
|
|
526 |
unsigned mtb_numreaders;
|
|
527 |
} MDB_txbody;
|
|
528 |
|
|
529 |
/** The actual reader table definition. */
|
|
530 |
typedef struct MDB_txninfo {
|
|
531 |
union {
|
|
532 |
MDB_txbody mtb;
|
|
533 |
#define mti_magic mt1.mtb.mtb_magic
|
|
534 |
#define mti_version mt1.mtb.mtb_version
|
|
535 |
#define mti_mutex mt1.mtb.mtb_mutex
|
|
536 |
#define mti_rmname mt1.mtb.mtb_rmname
|
|
537 |
#define mti_txnid mt1.mtb.mtb_txnid
|
|
538 |
#define mti_numreaders mt1.mtb.mtb_numreaders
|
|
539 |
char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
|
|
540 |
} mt1;
|
|
541 |
union {
|
|
542 |
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
|
|
543 |
char mt2_wmname[MNAME_LEN];
|
|
544 |
#define mti_wmname mt2.mt2_wmname
|
|
545 |
#else
|
|
546 |
pthread_mutex_t mt2_wmutex;
|
|
547 |
#define mti_wmutex mt2.mt2_wmutex
|
|
548 |
#endif
|
|
549 |
char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
|
|
550 |
} mt2;
|
|
551 |
MDB_reader mti_readers[1];
|
|
552 |
} MDB_txninfo;
|
|
553 |
/** @} */
|
|
554 |
|
|
555 |
/** Common header for all page types.
|
|
556 |
* Overflow records occupy a number of contiguous pages with no
|
|
557 |
* headers on any page after the first.
|
|
558 |
*/
|
|
559 |
typedef struct MDB_page {
|
|
560 |
#define mp_pgno mp_p.p_pgno
|
|
561 |
#define mp_next mp_p.p_next
|
|
562 |
union {
|
|
563 |
pgno_t p_pgno; /**< page number */
|
|
564 |
void * p_next; /**< for in-memory list of freed structs */
|
|
565 |
} mp_p;
|
|
566 |
uint16_t mp_pad;
|
|
567 |
/** @defgroup mdb_page Page Flags
|
|
568 |
* @ingroup internal
|
|
569 |
* Flags for the page headers.
|
|
570 |
* @{
|
|
571 |
*/
|
|
572 |
#define P_BRANCH 0x01 /**< branch page */
|
|
573 |
#define P_LEAF 0x02 /**< leaf page */
|
|
574 |
#define P_OVERFLOW 0x04 /**< overflow page */
|
|
575 |
#define P_META 0x08 /**< meta page */
|
|
576 |
#define P_DIRTY 0x10 /**< dirty page */
|
|
577 |
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
|
|
578 |
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
|
|
579 |
/** @} */
|
|
580 |
uint16_t mp_flags; /**< @ref mdb_page */
|
|
581 |
#define mp_lower mp_pb.pb.pb_lower
|
|
582 |
#define mp_upper mp_pb.pb.pb_upper
|
|
583 |
#define mp_pages mp_pb.pb_pages
|
|
584 |
union {
|
|
585 |
struct {
|
|
586 |
indx_t pb_lower; /**< lower bound of free space */
|
|
587 |
indx_t pb_upper; /**< upper bound of free space */
|
|
588 |
} pb;
|
|
589 |
uint32_t pb_pages; /**< number of overflow pages */
|
|
590 |
} mp_pb;
|
|
591 |
indx_t mp_ptrs[1]; /**< dynamic size */
|
|
592 |
} MDB_page;
|
|
593 |
|
|
594 |
/** Size of the page header, excluding dynamic data at the end */
|
|
595 |
#define PAGEHDRSZ ((unsigned) offsetof(MDB_page, mp_ptrs))
|
|
596 |
|
|
597 |
/** Address of first usable data byte in a page, after the header */
|
|
598 |
#define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
|
|
599 |
|
|
600 |
/** Number of nodes on a page */
|
|
601 |
#define NUMKEYS(p) (((p)->mp_lower - PAGEHDRSZ) >> 1)
|
|
602 |
|
|
603 |
/** The amount of space remaining in the page */
|
|
604 |
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
|
|
605 |
|
|
606 |
/** The percentage of space used in the page, in tenths of a percent. */
|
|
607 |
#define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
|
|
608 |
((env)->me_psize - PAGEHDRSZ))
|
|
609 |
/** The minimum page fill factor, in tenths of a percent.
|
|
610 |
* Pages emptier than this are candidates for merging.
|
|
611 |
*/
|
|
612 |
#define FILL_THRESHOLD 250
|
|
613 |
|
|
614 |
/** Test if a page is a leaf page */
|
|
615 |
#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
|
|
616 |
/** Test if a page is a LEAF2 page */
|
|
617 |
#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
|
|
618 |
/** Test if a page is a branch page */
|
|
619 |
#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
|
|
620 |
/** Test if a page is an overflow page */
|
|
621 |
#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
|
|
622 |
/** Test if a page is a sub page */
|
|
623 |
#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)
|
|
624 |
|
|
625 |
/** The number of overflow pages needed to store the given size. */
|
|
626 |
#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
|
|
627 |
|
|
628 |
/** Header for a single key/data pair within a page.
|
|
629 |
* We guarantee 2-byte alignment for nodes.
|
|
630 |
*/
|
|
631 |
typedef struct MDB_node {
	/** The lo and hi halves hold the data size on leaf nodes and the
	 *	child pgno on branch nodes. On 64 bit platforms the flags word
	 *	also carries part of the pgno (branch nodes have no flags).
	 *	Both halves are kept in host byte order, in case that allows
	 *	some accesses to be optimized into a single 32-bit word access.
	 */
#define mn_lo mn_offset[BYTE_ORDER!=LITTLE_ENDIAN]
#define mn_hi mn_offset[BYTE_ORDER==LITTLE_ENDIAN]	/**< part of dsize or pgno */
	unsigned short	mn_offset[2];	/**< storage for #mn_lo and #mn_hi */
/** @defgroup mdb_node Node Flags
 *	@ingroup internal
 *	Flags for node headers.
 *	@{
 */
#define F_BIGDATA	 0x01			/**< data put on overflow page */
#define F_SUBDATA	 0x02			/**< data is a sub-database */
#define F_DUPDATA	 0x04			/**< data has duplicates */

/** valid flags for #mdb_node_add() */
#define	NODE_ADD_FLAGS	(F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)

/** @} */
	unsigned short	mn_flags;		/**< @ref mdb_node */
	unsigned short	mn_ksize;		/**< key size */
	char		mn_data[1];			/**< key and data are appended here */
} MDB_node;
|
|
658 |
|
|
659 |
/** Size of the node header, excluding dynamic data at the end */
|
|
660 |
#define NODESIZE offsetof(MDB_node, mn_data)
|
|
661 |
|
|
662 |
/** Bit position of top word in page number, for shifting mn_flags */
|
|
663 |
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
|
|
664 |
|
|
665 |
/** Size of a node in a branch page with a given key.
|
|
666 |
* This is just the node header plus the key, there is no data.
|
|
667 |
*/
|
|
668 |
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
|
|
669 |
|
|
670 |
/** Size of a node in a leaf page with a given key and data.
|
|
671 |
* This is node header plus key plus data size.
|
|
672 |
*/
|
|
673 |
#define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
|
|
674 |
|
|
675 |
/** Address of node \b i in page \b p */
|
|
676 |
#define NODEPTR(p, i) ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
|
|
677 |
|
|
678 |
/** Address of the key for the node */
|
|
679 |
#define NODEKEY(node) (void *)((node)->mn_data)
|
|
680 |
|
|
681 |
/** Address of the data for a node */
|
|
682 |
#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
|
|
683 |
|
|
684 |
/** Get the page number pointed to by a branch node */
|
|
685 |
#define NODEPGNO(node) \
|
|
686 |
((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
|
|
687 |
(PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
|
|
688 |
/** Set the page number in a branch node */
|
|
689 |
#define SETPGNO(node,pgno) do { \
|
|
690 |
(node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
|
|
691 |
if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
|
|
692 |
|
|
693 |
/** Get the size of the data in a leaf node */
|
|
694 |
#define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
|
|
695 |
/** Set the size of the data for a leaf node */
|
|
696 |
#define SETDSZ(node,size) do { \
|
|
697 |
(node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
|
|
698 |
/** The size of a key in a node */
|
|
699 |
#define NODEKSZ(node) ((node)->mn_ksize)
|
|
700 |
|
|
701 |
/** Copy a page number from src to dst.
 *	Both arguments must be lvalues. When #MISALIGNED_OK is defined this
 *	is a plain assignment. Otherwise the value is copied one
 *	unsigned short at a time: four units when SIZE_MAX exceeds 32 bits,
 *	two units otherwise.
 *	The temporaries carry a reserved-looking suffix so that arguments
 *	named e.g. \b s or \b d are not captured by the macro body.
 */
#ifdef MISALIGNED_OK
#define COPY_PGNO(dst,src)	((dst) = (src))
#else
#if SIZE_MAX > 4294967295UL
#define COPY_PGNO(dst,src)	do { \
	unsigned short *copy_pgno_s_, *copy_pgno_d_;	\
	copy_pgno_s_ = (unsigned short *)&(src);	\
	copy_pgno_d_ = (unsigned short *)&(dst);	\
	*copy_pgno_d_++ = *copy_pgno_s_++;	\
	*copy_pgno_d_++ = *copy_pgno_s_++;	\
	*copy_pgno_d_++ = *copy_pgno_s_++;	\
	*copy_pgno_d_ = *copy_pgno_s_;	\
} while (0)
#else
#define COPY_PGNO(dst,src)	do { \
	unsigned short *copy_pgno_s_, *copy_pgno_d_;	\
	copy_pgno_s_ = (unsigned short *)&(src);	\
	copy_pgno_d_ = (unsigned short *)&(dst);	\
	*copy_pgno_d_++ = *copy_pgno_s_++;	\
	*copy_pgno_d_ = *copy_pgno_s_;	\
} while (0)
#endif
#endif
|
|
725 |
/** The address of a key in a LEAF2 page.
|
|
726 |
* LEAF2 pages are used for #MDB_DUPFIXED sorted-duplicate sub-DBs.
|
|
727 |
* There are no node headers, keys are stored contiguously.
|
|
728 |
*/
|
|
729 |
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
|
|
730 |
|
|
731 |
/** Fill \b key with the size and address of \b node's key.
 *	Does nothing when \b key is NULL. \b key is evaluated more than once.
 */
#define MDB_GET_KEY(node, key)	{ \
	if ((key) != NULL) { \
		(key)->mv_size = NODEKSZ(node); \
		(key)->mv_data = NODEKEY(node); \
	} \
}
|
|
734 |
|
|
735 |
/** Information about a single database in the environment. */
|
|
736 |
typedef struct MDB_db {
|
|
737 |
uint32_t md_pad; /**< also ksize for LEAF2 pages */
|
|
738 |
uint16_t md_flags; /**< @ref mdb_dbi_open */
|
|
739 |
uint16_t md_depth; /**< depth of this tree */
|
|
740 |
pgno_t md_branch_pages; /**< number of internal pages */
|
|
741 |
pgno_t md_leaf_pages; /**< number of leaf pages */
|
|
742 |
pgno_t md_overflow_pages; /**< number of overflow pages */
|
|
743 |
size_t md_entries; /**< number of data items */
|
|
744 |
pgno_t md_root; /**< the root page of this tree */
|
|
745 |
} MDB_db;
|
|
746 |
|
|
747 |
/** mdb_dbi_open flags: mask of the low 15 flag bits */
#define PERSISTENT_FLAGS	0x7fff
/** mdb_dbi_open flags: union of every flag bit listed here */
#define VALID_FLAGS \
	(MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
	 MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
|
|
751 |
|
|
752 |
/** DB handle of the database that tracks free pages. */
#define	FREE_DBI	0
/** DB handle of the default (main) database. */
#define	MAIN_DBI	1
|
|
756 |
|
|
757 |
/** Meta page content. */
|
|
758 |
typedef struct MDB_meta {
|
|
759 |
/** Stamp identifying this as an MDB file. It must be set
|
|
760 |
* to #MDB_MAGIC. */
|
|
761 |
uint32_t mm_magic;
|
|
762 |
/** Version number of this lock file. Must be set to #MDB_VERSION. */
|
|
763 |
uint32_t mm_version;
|
|
764 |
void *mm_address; /**< address for fixed mapping */
|
|
765 |
size_t mm_mapsize; /**< size of mmap region */
|
|
766 |
MDB_db mm_dbs[2]; /**< first is free space, 2nd is main db */
|
|
767 |
/** The size of pages used in this DB */
|
|
768 |
#define mm_psize mm_dbs[0].md_pad
|
|
769 |
/** Any persistent environment flags. @ref mdb_env */
|
|
770 |
#define mm_flags mm_dbs[0].md_flags
|
|
771 |
pgno_t mm_last_pg; /**< last used page in file */
|
|
772 |
txnid_t mm_txnid; /**< txnid that committed this page */
|
|
773 |
} MDB_meta;
|
|
774 |
|
|
775 |
/** Buffer for a stack-allocated dirty page.
|
|
776 |
* The members define size and alignment, and silence type
|
|
777 |
* aliasing warnings. They are not used directly; that could
|
|
778 |
* mean incorrectly using several union members in parallel.
|
|
779 |
*/
|
|
780 |
typedef union MDB_pagebuf {
|
|
781 |
char mb_raw[MDB_PAGESIZE];
|
|
782 |
MDB_page mb_page;
|
|
783 |
struct {
|
|
784 |
char mm_pad[PAGEHDRSZ];
|
|
785 |
MDB_meta mm_meta;
|
|
786 |
} mb_metabuf;
|
|
787 |
} MDB_pagebuf;
|
|
788 |
|
|
789 |
/** Auxiliary DB info.
|
|
790 |
* The information here is mostly static/read-only. There is
|
|
791 |
* only a single copy of this record in the environment.
|
|
792 |
*/
|
|
793 |
typedef struct MDB_dbx {
|
|
794 |
MDB_val md_name; /**< name of the database */
|
|
795 |
MDB_cmp_func *md_cmp; /**< function for comparing keys */
|
|
796 |
MDB_cmp_func *md_dcmp; /**< function for comparing data items */
|
|
797 |
MDB_rel_func *md_rel; /**< user relocate function */
|
|
798 |
void *md_relctx; /**< user-provided context for md_rel */
|
|
799 |
} MDB_dbx;
|
|
800 |
|
|
801 |
/** A database transaction.
|
|
802 |
* Every operation requires a transaction handle.
|
|
803 |
*/
|
|
804 |
struct MDB_txn {
|
|
805 |
MDB_txn *mt_parent; /**< parent of a nested txn */
|
|
806 |
MDB_txn *mt_child; /**< nested txn under this txn */
|
|
807 |
pgno_t mt_next_pgno; /**< next unallocated page */
|
|
808 |
/** The ID of this transaction. IDs are integers incrementing from 1.
|
|
809 |
* Only committed write transactions increment the ID. If a transaction
|
|
810 |
* aborts, the ID may be re-used by the next writer.
|
|
811 |
*/
|
|
812 |
txnid_t mt_txnid;
|
|
813 |
MDB_env *mt_env; /**< the DB environment */
|
|
814 |
/** The list of pages that became unused during this transaction.
|
|
815 |
*/
|
|
816 |
MDB_IDL mt_free_pgs;
|
|
817 |
union {
|
|
818 |
MDB_ID2L dirty_list; /**< for write txns: modified pages */
|
|
819 |
MDB_reader *reader; /**< this thread's reader table slot or NULL */
|
|
820 |
} mt_u;
|
|
821 |
/** Array of records for each DB known in the environment. */
|
|
822 |
MDB_dbx *mt_dbxs;
|
|
823 |
/** Array of MDB_db records for each known DB */
|
|
824 |
MDB_db *mt_dbs;
|
|
825 |
/** @defgroup mt_dbflag Transaction DB Flags
|
|
826 |
* @ingroup internal
|
|
827 |
* @{
|
|
828 |
*/
|
|
829 |
#define DB_DIRTY 0x01 /**< DB was written in this txn */
|
|
830 |
#define DB_STALE 0x02 /**< DB record is older than txnID */
|
|
831 |
#define DB_NEW 0x04 /**< DB handle opened in this txn */
|
|
832 |
#define DB_VALID 0x08 /**< DB handle is valid */
|
|
833 |
#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
|
|
834 |
/** @} */
|
|
835 |
/** In write txns, array of cursors for each DB */
|
|
836 |
MDB_cursor **mt_cursors;
|
|
837 |
/** Array of flags for each DB */
|
|
838 |
unsigned char *mt_dbflags;
|
|
839 |
/** Number of DB records in use. This number only ever increments;
|
|
840 |
* we don't decrement it when individual DB handles are closed.
|
|
841 |
*/
|
|
842 |
MDB_dbi mt_numdbs;
|
|
843 |
|
|
844 |
/** @defgroup mdb_txn Transaction Flags
|
|
845 |
* @ingroup internal
|
|
846 |
* @{
|
|
847 |
*/
|
|
848 |
#define MDB_TXN_RDONLY 0x01 /**< read-only transaction */
|
|
849 |
#define MDB_TXN_ERROR 0x02 /**< an error has occurred */
|
|
850 |
#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */
|
|
851 |
/** @} */
|
|
852 |
unsigned int mt_flags; /**< @ref mdb_txn */
|
|
853 |
/** dirty_list maxsize - #allocated pages including in parent txns */
|
|
854 |
unsigned int mt_dirty_room;
|
|
855 |
/** Tracks which of the two meta pages was used at the start
|
|
856 |
* of this transaction.
|
|
857 |
*/
|
|
858 |
unsigned int mt_toggle;
|
|
859 |
};
|
|
860 |
|
|
861 |
/** Depth of the per-cursor page stack.
 *	With a minimum of 2 keys per node this is enough for 2^32 nodes,
 *	i.e. plenty. At 4 keys per node it covers 2^64 nodes, so there is
 *	probably no need to raise it on a 64 bit machine.
 */
#define CURSOR_STACK	32
|
|
866 |
|
|
867 |
struct MDB_xcursor;
|
|
868 |
|
|
869 |
/** Cursors are used for all DB operations */
|
|
870 |
struct MDB_cursor {
|
|
871 |
/** Next cursor on this DB in this txn */
|
|
872 |
MDB_cursor *mc_next;
|
|
873 |
/** Original cursor if this is a shadow */
|
|
874 |
MDB_cursor *mc_orig;
|
|
875 |
/** Context used for databases with #MDB_DUPSORT, otherwise NULL */
|
|
876 |
struct MDB_xcursor *mc_xcursor;
|
|
877 |
/** The transaction that owns this cursor */
|
|
878 |
MDB_txn *mc_txn;
|
|
879 |
/** The database handle this cursor operates on */
|
|
880 |
MDB_dbi mc_dbi;
|
|
881 |
/** The database record for this cursor */
|
|
882 |
MDB_db *mc_db;
|
|
883 |
/** The database auxiliary record for this cursor */
|
|
884 |
MDB_dbx *mc_dbx;
|
|
885 |
/** The @ref mt_dbflag for this database */
|
|
886 |
unsigned char *mc_dbflag;
|
|
887 |
unsigned short mc_snum; /**< number of pushed pages */
|
|
888 |
unsigned short mc_top; /**< index of top page, normally mc_snum-1 */
|
|
889 |
/** @defgroup mdb_cursor Cursor Flags
|
|
890 |
* @ingroup internal
|
|
891 |
* Cursor state flags.
|
|
892 |
* @{
|
|
893 |
*/
|
|
894 |
#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */
|
|
895 |
#define C_EOF 0x02 /**< No more data */
|
|
896 |
#define C_SUB 0x04 /**< Cursor is a sub-cursor */
|
|
897 |
#define C_SHADOW 0x08 /**< Cursor is a dup from a parent txn */
|
|
898 |
#define C_ALLOCD 0x10 /**< Cursor was malloc'd */
|
|
899 |
#define C_SPLITTING 0x20 /**< Cursor is in page_split */
|
|
900 |
/** @} */
|
|
901 |
unsigned int mc_flags; /**< @ref mdb_cursor */
|
|
902 |
MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
|
|
903 |
indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
|
|
904 |
};
|
|
905 |
|
|
906 |
/** Context for sorted-dup records.
|
|
907 |
* We could have gone to a fully recursive design, with arbitrarily
|
|
908 |
* deep nesting of sub-databases. But for now we only handle these
|
|
909 |
* levels - main DB, optional sub-DB, sorted-duplicate DB.
|
|
910 |
*/
|
|
911 |
typedef struct MDB_xcursor {
|
|
912 |
/** A sub-cursor for traversing the Dup DB */
|
|
913 |
MDB_cursor mx_cursor;
|
|
914 |
/** The database record for this Dup DB */
|
|
915 |
MDB_db mx_db;
|
|
916 |
/** The auxiliary DB record for this Dup DB */
|
|
917 |
MDB_dbx mx_dbx;
|
|
918 |
/** The @ref mt_dbflag for this Dup DB */
|
|
919 |
unsigned char mx_dbflag;
|
|
920 |
} MDB_xcursor;
|
|
921 |
|
|
922 |
/** State of FreeDB old pages, stored in the MDB_env */
|
|
923 |
typedef struct MDB_pgstate {
|
|
924 |
txnid_t mf_pglast; /**< ID of last old page record we used */
|
|
925 |
pgno_t *mf_pghead; /**< old pages reclaimed from freelist */
|
|
926 |
pgno_t *mf_pgfree; /**< memory to free when dropping me_pghead */
|
|
927 |
} MDB_pgstate;
|
|
928 |
|
|
929 |
/** The database environment. */
|
|
930 |
struct MDB_env {
|
|
931 |
HANDLE me_fd; /**< The main data file */
|
|
932 |
HANDLE me_lfd; /**< The lock file */
|
|
933 |
HANDLE me_mfd; /**< just for writing the meta pages */
|
|
934 |
/** Failed to update the meta page. Probably an I/O error. */
|
|
935 |
#define MDB_FATAL_ERROR 0x80000000U
|
|
936 |
/** Read-only Filesystem. Allow read access, no locking. */
|
|
937 |
#define MDB_ROFS 0x40000000U
|
|
938 |
/** Some fields are initialized. */
|
|
939 |
#define MDB_ENV_ACTIVE 0x20000000U
|
|
940 |
uint32_t me_flags; /**< @ref mdb_env */
|
|
941 |
unsigned int me_psize; /**< size of a page, from #GET_PAGESIZE */
|
|
942 |
unsigned int me_maxreaders; /**< size of the reader table */
|
|
943 |
unsigned int me_numreaders; /**< max numreaders set by this env */
|
|
944 |
MDB_dbi me_numdbs; /**< number of DBs opened */
|
|
945 |
MDB_dbi me_maxdbs; /**< size of the DB table */
|
|
946 |
pid_t me_pid; /**< process ID of this env */
|
|
947 |
char *me_path; /**< path to the DB files */
|
|
948 |
char *me_map; /**< the memory map of the data file */
|
|
949 |
MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */
|
|
950 |
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
|
|
951 |
MDB_txn *me_txn; /**< current write transaction */
|
|
952 |
size_t me_mapsize; /**< size of the data memory map */
|
|
953 |
off_t me_size; /**< current file size */
|
|
954 |
pgno_t me_maxpg; /**< me_mapsize / me_psize */
|
|
955 |
MDB_dbx *me_dbxs; /**< array of static DB info */
|
|
956 |
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
|
|
957 |
pthread_key_t me_txkey; /**< thread-key for readers */
|
|
958 |
MDB_pgstate me_pgstate; /**< state of old pages from freeDB */
|
|
959 |
# define me_pglast me_pgstate.mf_pglast
|
|
960 |
# define me_pghead me_pgstate.mf_pghead
|
|
961 |
# define me_pgfree me_pgstate.mf_pgfree
|
|
962 |
MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */
|
|
963 |
/** IDL of pages that became unused in a write txn */
|
|
964 |
MDB_IDL me_free_pgs;
|
|
965 |
/** ID2L of pages that were written during a write txn */
|
|
966 |
MDB_ID2 me_dirty_list[MDB_IDL_UM_SIZE];
|
|
967 |
/** Max number of freelist items that can fit in a single overflow page */
|
|
968 |
unsigned int me_maxfree_1pg;
|
|
969 |
/** Max size of a node on a page */
|
|
970 |
unsigned int me_nodemax;
|
|
971 |
#ifdef _WIN32
|
|
972 |
HANDLE me_rmutex; /* Windows mutexes don't reside in shared mem */
|
|
973 |
HANDLE me_wmutex;
|
|
974 |
#elif defined(MDB_USE_POSIX_SEM)
|
|
975 |
sem_t *me_rmutex; /* Shared mutexes are not supported */
|
|
976 |
sem_t *me_wmutex;
|
|
977 |
#endif
|
|
978 |
};
|
|
979 |
|
|
980 |
/** Nested transaction */
|
|
981 |
typedef struct MDB_ntxn {
|
|
982 |
MDB_txn mnt_txn; /* the transaction */
|
|
983 |
MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */
|
|
984 |
} MDB_ntxn;
|
|
985 |
|
|
986 |
/** Max number of pages to commit in one writev() call.
 *	64, unless the platform's IOV_MAX is known and smaller.
 */
#if defined(IOV_MAX) && IOV_MAX < 64
#define MDB_COMMIT_PAGES	IOV_MAX
#else
#define MDB_COMMIT_PAGES	64
#endif
|
|
992 |
|
|
993 |
static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp);
|
|
994 |
static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp);
|
|
995 |
static int mdb_page_touch(MDB_cursor *mc);
|
|
996 |
|
|
997 |
static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp);
|
|
998 |
static int mdb_page_search_root(MDB_cursor *mc,
|
|
999 |
MDB_val *key, int modify);
|
|
1000 |
#define MDB_PS_MODIFY 1
|
|
1001 |
#define MDB_PS_ROOTONLY 2
|
|
1002 |
static int mdb_page_search(MDB_cursor *mc,
|
|
1003 |
MDB_val *key, int flags);
|
|
1004 |
static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
|
|
1005 |
|
|
1006 |
#define MDB_SPLIT_REPLACE MDB_APPENDDUP /**< newkey is not new */
|
|
1007 |
static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
|
|
1008 |
pgno_t newpgno, unsigned int nflags);
|
|
1009 |
|
|
1010 |
static int mdb_env_read_header(MDB_env *env, MDB_meta *meta);
|
|
1011 |
static int mdb_env_pick_meta(const MDB_env *env);
|
|
1012 |
static int mdb_env_write_meta(MDB_txn *txn);
|
|
1013 |
static void mdb_env_close0(MDB_env *env, int excl);
|
|
1014 |
|
|
1015 |
static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
|
|
1016 |
static int mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
1017 |
MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
|
|
1018 |
static void mdb_node_del(MDB_page *mp, indx_t indx, int ksize);
|
|
1019 |
static void mdb_node_shrink(MDB_page *mp, indx_t indx);
|
|
1020 |
static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
|
|
1021 |
static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
|
|
1022 |
static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
|
|
1023 |
static size_t mdb_branch_size(MDB_env *env, MDB_val *key);
|
|
1024 |
|
|
1025 |
static int mdb_rebalance(MDB_cursor *mc);
|
|
1026 |
static int mdb_update_key(MDB_cursor *mc, MDB_val *key);
|
|
1027 |
|
|
1028 |
static void mdb_cursor_pop(MDB_cursor *mc);
|
|
1029 |
static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
|
|
1030 |
|
|
1031 |
static int mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf);
|
|
1032 |
static int mdb_cursor_sibling(MDB_cursor *mc, int move_right);
|
|
1033 |
static int mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
|
|
1034 |
static int mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
|
|
1035 |
static int mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
|
|
1036 |
int *exactp);
|
|
1037 |
static int mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
|
|
1038 |
static int mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
|
|
1039 |
|
|
1040 |
static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
|
|
1041 |
static void mdb_xcursor_init0(MDB_cursor *mc);
|
|
1042 |
static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
|
|
1043 |
|
|
1044 |
static int mdb_drop0(MDB_cursor *mc, int subs);
|
|
1045 |
static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
|
|
1046 |
|
|
1047 |
/** @cond */
|
|
1048 |
static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
|
|
1049 |
/** @endcond */
|
|
1050 |
|
|
1051 |
#ifdef _WIN32
/* File-scope Win32 security objects.
 * NOTE(review): judging by the names these are a shared security
 * descriptor/attributes pair plus a one-time-init flag - confirm at the
 * place where they are initialized.
 */
static SECURITY_DESCRIPTOR	mdb_null_sd;
static SECURITY_ATTRIBUTES	mdb_all_sa;
static int			mdb_sec_inited;
#endif
|
|
1056 |
|
|
1057 |
/** Return the library version info. */
|
|
1058 |
char *
|
|
1059 |
mdb_version(int *major, int *minor, int *patch)
|
|
1060 |
{
|
|
1061 |
if (major) *major = MDB_VERSION_MAJOR;
|
|
1062 |
if (minor) *minor = MDB_VERSION_MINOR;
|
|
1063 |
if (patch) *patch = MDB_VERSION_PATCH;
|
|
1064 |
return MDB_VERSION_STRING;
|
|
1065 |
}
|
|
1066 |
|
|
1067 |
/** Table of descriptions for MDB @ref errors.
 *	mdb_strerror() indexes this table with <tt>err - MDB_KEYEXIST</tt>,
 *	so the entries must stay in error-code order.
 */
static char *const mdb_errstr[] = {
	/* key/data lookup results */
	"MDB_KEYEXIST: Key/data pair already exists",
	"MDB_NOTFOUND: No matching key/data pair found",
	/* page and file problems */
	"MDB_PAGE_NOTFOUND: Requested page not found",
	"MDB_CORRUPTED: Located page was wrong type",
	"MDB_PANIC: Update of meta page failed",
	"MDB_VERSION_MISMATCH: Database environment version mismatch",
	"MDB_INVALID: File is not an MDB file",
	/* resource limits */
	"MDB_MAP_FULL: Environment mapsize limit reached",
	"MDB_DBS_FULL: Environment maxdbs limit reached",
	"MDB_READERS_FULL: Environment maxreaders limit reached",
	"MDB_TLS_FULL: Thread-local storage keys full - too many environments open",
	"MDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
	"MDB_CURSOR_FULL: Internal error - cursor stack limit reached",
	"MDB_PAGE_FULL: Internal error - page has no more space",
	"MDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
	"MDB_INCOMPATIBLE: Database flags changed or would change",
};
|
|
1086 |
|
|
1087 |
char *
|
|
1088 |
mdb_strerror(int err)
|
|
1089 |
{
|
|
1090 |
int i;
|
|
1091 |
if (!err)
|
|
1092 |
return ("Successful return: 0");
|
|
1093 |
|
|
1094 |
if (err >= MDB_KEYEXIST && err <= MDB_LAST_ERRCODE) {
|
|
1095 |
i = err - MDB_KEYEXIST;
|
|
1096 |
return mdb_errstr[i];
|
|
1097 |
}
|
|
1098 |
|
|
1099 |
return strerror(err);
|
|
1100 |
}
|
|
1101 |
|
|
1102 |
#if MDB_DEBUG
|
|
1103 |
/** Display a key in hexadecimal and return the address of the result.
|
|
1104 |
* @param[in] key the key to display
|
|
1105 |
* @param[in] buf the buffer to write into. Should always be #DKBUF.
|
|
1106 |
* @return The key in hexadecimal form.
|
|
1107 |
*/
|
|
1108 |
char *
|
|
1109 |
mdb_dkey(MDB_val *key, char *buf)
|
|
1110 |
{
|
|
1111 |
char *ptr = buf;
|
|
1112 |
unsigned char *c = key->mv_data;
|
|
1113 |
unsigned int i;
|
|
1114 |
|
|
1115 |
if (!key)
|
|
1116 |
return "";
|
|
1117 |
|
|
1118 |
if (key->mv_size > MDB_MAXKEYSIZE)
|
|
1119 |
return "MDB_MAXKEYSIZE";
|
|
1120 |
/* may want to make this a dynamic check: if the key is mostly
|
|
1121 |
* printable characters, print it as-is instead of converting to hex.
|
|
1122 |
*/
|
|
1123 |
#if 1
|
|
1124 |
buf[0] = '\0';
|
|
1125 |
for (i=0; i<key->mv_size; i++)
|
|
1126 |
ptr += sprintf(ptr, "%02x", *c++);
|
|
1127 |
#else
|
|
1128 |
sprintf(buf, "%.*s", key->mv_size, key->mv_data);
|
|
1129 |
#endif
|
|
1130 |
return buf;
|
|
1131 |
}
|
|
1132 |
|
|
1133 |
/** Display all the keys in the page. */
|
|
1134 |
static void
|
|
1135 |
mdb_page_list(MDB_page *mp)
|
|
1136 |
{
|
|
1137 |
MDB_node *node;
|
|
1138 |
unsigned int i, nkeys, nsize;
|
|
1139 |
MDB_val key;
|
|
1140 |
DKBUF;
|
|
1141 |
|
|
1142 |
nkeys = NUMKEYS(mp);
|
|
1143 |
fprintf(stderr, "Page %zu numkeys %d\n", mp->mp_pgno, nkeys);
|
|
1144 |
for (i=0; i<nkeys; i++) {
|
|
1145 |
node = NODEPTR(mp, i);
|
|
1146 |
key.mv_size = node->mn_ksize;
|
|
1147 |
key.mv_data = node->mn_data;
|
|
1148 |
nsize = NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
|
1149 |
if (IS_BRANCH(mp)) {
|
|
1150 |
fprintf(stderr, "key %d: page %zu, %s\n", i, NODEPGNO(node),
|
|
1151 |
DKEY(&key));
|
|
1152 |
} else {
|
|
1153 |
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
|
1154 |
nsize += sizeof(pgno_t);
|
|
1155 |
else
|
|
1156 |
nsize += NODEDSZ(node);
|
|
1157 |
fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
|
|
1158 |
}
|
|
1159 |
}
|
|
1160 |
}
|
|
1161 |
|
|
1162 |
void
|
|
1163 |
mdb_cursor_chk(MDB_cursor *mc)
|
|
1164 |
{
|
|
1165 |
unsigned int i;
|
|
1166 |
MDB_node *node;
|
|
1167 |
MDB_page *mp;
|
|
1168 |
|
|
1169 |
if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
|
|
1170 |
for (i=0; i<mc->mc_top; i++) {
|
|
1171 |
mp = mc->mc_pg[i];
|
|
1172 |
node = NODEPTR(mp, mc->mc_ki[i]);
|
|
1173 |
if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
|
|
1174 |
printf("oops!\n");
|
|
1175 |
}
|
|
1176 |
if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
|
|
1177 |
printf("ack!\n");
|
|
1178 |
}
|
|
1179 |
#endif
|
|
1180 |
|
|
1181 |
#if MDB_DEBUG > 2
/** Count all the pages in each DB and in the freelist
 *	and make sure it matches the actual number of pages
 *	being used.
 *	The total is the sum of the first ID of every free-DB record, the
 *	branch/leaf/overflow counters of every DB known to the txn (plus
 *	those of #F_SUBDATA sub-DBs found in #MDB_DUPSORT databases) and the
 *	two meta pages. A mismatch against mt_next_pgno is reported on stderr.
 * @param[in] txn the transaction to audit
 */
static void mdb_audit(MDB_txn *txn)
{
	MDB_cursor mc;
	MDB_val key, data;
	MDB_ID freecount, count;
	MDB_dbi i;
	int rc;

	freecount = 0;
	mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
	while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
		freecount += *(MDB_ID *)data.mv_data;

	count = 0;
	for (i = 0; i<txn->mt_numdbs; i++) {
		MDB_xcursor mx, *mxp;
		mxp = (txn->mt_dbs[i].md_flags & MDB_DUPSORT) ? &mx : NULL;
		mdb_cursor_init(&mc, txn, i, mxp);
		if (txn->mt_dbs[i].md_root == P_INVALID)
			continue;
		count += txn->mt_dbs[i].md_branch_pages +
			txn->mt_dbs[i].md_leaf_pages +
			txn->mt_dbs[i].md_overflow_pages;
		if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
			/* Without a successful search the cursor's page stack is
			 * not set up, so the sub-DB scan has nothing to walk.
			 */
			rc = mdb_page_search(&mc, NULL, 0);
			if (rc != MDB_SUCCESS)
				continue;
			do {
				unsigned j;
				MDB_page *mp;
				mp = mc.mc_pg[mc.mc_top];
				for (j=0; j<NUMKEYS(mp); j++) {
					MDB_node *leaf = NODEPTR(mp, j);
					if (leaf->mn_flags & F_SUBDATA) {
						MDB_db db;
						memcpy(&db, NODEDATA(leaf), sizeof(db));
						count += db.md_branch_pages + db.md_leaf_pages +
							db.md_overflow_pages;
					}
				}
			}
			while (mdb_cursor_sibling(&mc, 1) == 0);
		}
	}
	if (freecount + count + 2 /* metapages */ != txn->mt_next_pgno) {
		/* %zu matches how txn IDs and page numbers are printed elsewhere */
		fprintf(stderr, "audit: %zu freecount: %zu count: %zu total: %zu next_pgno: %zu\n",
			txn->mt_txnid, freecount, count+2, freecount+count+2, txn->mt_next_pgno);
	}
}
#endif
|
|
1234 |
|
|
1235 |
int
|
|
1236 |
mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
|
|
1237 |
{
|
|
1238 |
return txn->mt_dbxs[dbi].md_cmp(a, b);
|
|
1239 |
}
|
|
1240 |
|
|
1241 |
int
|
|
1242 |
mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
|
|
1243 |
{
|
|
1244 |
if (txn->mt_dbxs[dbi].md_dcmp)
|
|
1245 |
return txn->mt_dbxs[dbi].md_dcmp(a, b);
|
|
1246 |
else
|
|
1247 |
return EINVAL; /* too bad you can't distinguish this from a valid result */
|
|
1248 |
}
|
|
1249 |
|
|
1250 |
/** Allocate a single page.
|
|
1251 |
* Re-use old malloc'd pages first, otherwise just malloc.
|
|
1252 |
*/
|
|
1253 |
static MDB_page *
|
|
1254 |
mdb_page_malloc(MDB_cursor *mc) {
|
|
1255 |
MDB_page *ret;
|
|
1256 |
size_t sz = mc->mc_txn->mt_env->me_psize;
|
|
1257 |
if ((ret = mc->mc_txn->mt_env->me_dpages) != NULL) {
|
|
1258 |
VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
|
|
1259 |
VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
|
|
1260 |
mc->mc_txn->mt_env->me_dpages = ret->mp_next;
|
|
1261 |
} else if ((ret = malloc(sz)) != NULL) {
|
|
1262 |
VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
|
|
1263 |
}
|
|
1264 |
return ret;
|
|
1265 |
}
|
|
1266 |
|
|
1267 |
static void
|
|
1268 |
mdb_page_free(MDB_env *env, MDB_page *mp)
|
|
1269 |
{
|
|
1270 |
mp->mp_next = env->me_dpages;
|
|
1271 |
VGMEMP_FREE(env, mp);
|
|
1272 |
env->me_dpages = mp;
|
|
1273 |
}
|
|
1274 |
|
|
1275 |
/** Allocate pages for writing.
|
|
1276 |
* If there are free pages available from older transactions, they
|
|
1277 |
* will be re-used first. Otherwise a new page will be allocated.
|
|
1278 |
* @param[in] mc cursor A cursor handle identifying the transaction and
|
|
1279 |
* database for which we are allocating.
|
|
1280 |
* @param[in] num the number of pages to allocate.
|
|
1281 |
* @param[out] mp Address of the allocated page(s). Requests for multiple pages
|
|
1282 |
* will always be satisfied by a single contiguous chunk of memory.
|
|
1283 |
* @return 0 on success, non-zero on failure.
|
|
1284 |
*/
|
|
1285 |
static int
|
|
1286 |
mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp)
|
|
1287 |
{
|
|
1288 |
MDB_txn *txn = mc->mc_txn;
|
|
1289 |
MDB_page *np;
|
|
1290 |
pgno_t pgno = P_INVALID;
|
|
1291 |
MDB_ID2 mid;
|
|
1292 |
txnid_t oldest = 0, last;
|
|
1293 |
int rc;
|
|
1294 |
|
|
1295 |
*mp = NULL;
|
|
1296 |
|
|
1297 |
/* If our dirty list is already full, we can't do anything */
|
|
1298 |
if (txn->mt_dirty_room == 0)
|
|
1299 |
return MDB_TXN_FULL;
|
|
1300 |
|
|
1301 |
/* The free list won't have any content at all until txn 2 has
|
|
1302 |
* committed. The pages freed by txn 2 will be unreferenced
|
|
1303 |
* after txn 3 commits, and so will be safe to re-use in txn 4.
|
|
1304 |
*/
|
|
1305 |
if (txn->mt_txnid > 3) {
|
|
1306 |
if (!txn->mt_env->me_pghead &&
|
|
1307 |
txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
|
|
1308 |
/* See if there's anything in the free DB */
|
|
1309 |
MDB_reader *r;
|
|
1310 |
MDB_cursor m2;
|
|
1311 |
MDB_node *leaf;
|
|
1312 |
MDB_val data;
|
|
1313 |
txnid_t *kptr;
|
|
1314 |
|
|
1315 |
mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
|
|
1316 |
if (!txn->mt_env->me_pglast) {
|
|
1317 |
mdb_page_search(&m2, NULL, 0);
|
|
1318 |
leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0);
|
|
1319 |
kptr = (txnid_t *)NODEKEY(leaf);
|
|
1320 |
last = *kptr;
|
|
1321 |
} else {
|
|
1322 |
MDB_val key;
|
|
1323 |
again:
|
|
1324 |
last = txn->mt_env->me_pglast + 1;
|
|
1325 |
leaf = NULL;
|
|
1326 |
key.mv_data = &last;
|
|
1327 |
key.mv_size = sizeof(last);
|
|
1328 |
rc = mdb_cursor_set(&m2, &key, &data, MDB_SET_RANGE, NULL);
|
|
1329 |
if (rc)
|
|
1330 |
goto none;
|
|
1331 |
last = *(txnid_t *)key.mv_data;
|
|
1332 |
}
|
|
1333 |
|
|
1334 |
{
|
|
1335 |
unsigned int i, nr;
|
|
1336 |
txnid_t mr;
|
|
1337 |
oldest = txn->mt_txnid - 1;
|
|
1338 |
nr = txn->mt_env->me_txns->mti_numreaders;
|
|
1339 |
r = txn->mt_env->me_txns->mti_readers;
|
|
1340 |
for (i=0; i<nr; i++) {
|
|
1341 |
if (!r[i].mr_pid) continue;
|
|
1342 |
mr = r[i].mr_txnid;
|
|
1343 |
if (mr < oldest)
|
|
1344 |
oldest = mr;
|
|
1345 |
}
|
|
1346 |
}
|
|
1347 |
|
|
1348 |
if (oldest > last) {
|
|
1349 |
/* It's usable, grab it.
|
|
1350 |
*/
|
|
1351 |
pgno_t *idl, *mop;
|
|
1352 |
|
|
1353 |
if (!txn->mt_env->me_pglast) {
|
|
1354 |
mdb_node_read(txn, leaf, &data);
|
|
1355 |
}
|
|
1356 |
idl = (MDB_ID *) data.mv_data;
|
|
1357 |
/* We might have a zero-length IDL due to freelist growth
|
|
1358 |
* during a prior commit
|
|
1359 |
*/
|
|
1360 |
if (!idl[0]) {
|
|
1361 |
txn->mt_env->me_pglast = last;
|
|
1362 |
goto again;
|
|
1363 |
}
|
|
1364 |
mop = malloc(MDB_IDL_SIZEOF(idl));
|
|
1365 |
if (!mop)
|
|
1366 |
return ENOMEM;
|
|
1367 |
txn->mt_env->me_pglast = last;
|
|
1368 |
txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop;
|
|
1369 |
memcpy(mop, idl, MDB_IDL_SIZEOF(idl));
|
|
1370 |
|
|
1371 |
#if MDB_DEBUG > 1
|
|
1372 |
{
|
|
1373 |
unsigned int i;
|
|
1374 |
DPRINTF("IDL read txn %zu root %zu num %zu",
|
|
1375 |
last, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
|
|
1376 |
for (i=0; i<idl[0]; i++) {
|
|
1377 |
DPRINTF("IDL %zu", idl[i+1]);
|
|
1378 |
}
|
|
1379 |
}
|
|
1380 |
#endif
|
|
1381 |
}
|
|
1382 |
}
|
|
1383 |
none:
|
|
1384 |
if (txn->mt_env->me_pghead) {
|
|
1385 |
pgno_t *mop = txn->mt_env->me_pghead;
|
|
1386 |
if (num > 1) {
|
|
1387 |
MDB_cursor m2;
|
|
1388 |
int retry = 1, readit = 0, n2 = num-1;
|
|
1389 |
unsigned int i, j, k;
|
|
1390 |
|
|
1391 |
/* If current list is too short, must fetch more and coalesce */
|
|
1392 |
if (mop[0] < (unsigned)num)
|
|
1393 |
readit = 1;
|
|
1394 |
|
|
1395 |
mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
|
|
1396 |
do {
|
|
1397 |
/* If on freelist, don't try to read more. If what we have
|
|
1398 |
* right now isn't enough just use new pages.
|
|
1399 |
* TODO: get all of this working. Many circular dependencies...
|
|
1400 |
*/
|
|
1401 |
if (mc->mc_dbi == FREE_DBI) {
|
|
1402 |
retry = 0;
|
|
1403 |
readit = 0;
|
|
1404 |
}
|
|
1405 |
if (readit) {
|
|
1406 |
MDB_val key, data;
|
|
1407 |
pgno_t *idl, *mop2;
|
|
1408 |
|
|
1409 |
last = txn->mt_env->me_pglast + 1;
|
|
1410 |
|
|
1411 |
/* We haven't hit the readers list yet? */
|
|
1412 |
if (!oldest) {
|
|
1413 |
MDB_reader *r;
|
|
1414 |
unsigned int nr;
|
|
1415 |
txnid_t mr;
|
|
1416 |
|
|
1417 |
oldest = txn->mt_txnid - 1;
|
|
1418 |
nr = txn->mt_env->me_txns->mti_numreaders;
|
|
1419 |
r = txn->mt_env->me_txns->mti_readers;
|
|
1420 |
for (i=0; i<nr; i++) {
|
|
1421 |
if (!r[i].mr_pid) continue;
|
|
1422 |
mr = r[i].mr_txnid;
|
|
1423 |
if (mr < oldest)
|
|
1424 |
oldest = mr;
|
|
1425 |
}
|
|
1426 |
}
|
|
1427 |
|
|
1428 |
/* There's nothing we can use on the freelist */
|
|
1429 |
if (oldest - last < 1)
|
|
1430 |
break;
|
|
1431 |
|
|
1432 |
key.mv_data = &last;
|
|
1433 |
key.mv_size = sizeof(last);
|
|
1434 |
rc = mdb_cursor_set(&m2,&key,&data,MDB_SET_RANGE,NULL);
|
|
1435 |
if (rc) {
|
|
1436 |
if (rc == MDB_NOTFOUND)
|
|
1437 |
break;
|
|
1438 |
return rc;
|
|
1439 |
}
|
|
1440 |
last = *(txnid_t*)key.mv_data;
|
|
1441 |
if (oldest <= last)
|
|
1442 |
break;
|
|
1443 |
idl = (MDB_ID *) data.mv_data;
|
|
1444 |
mop2 = malloc(MDB_IDL_SIZEOF(idl) + MDB_IDL_SIZEOF(mop));
|
|
1445 |
if (!mop2)
|
|
1446 |
return ENOMEM;
|
|
1447 |
/* merge in sorted order */
|
|
1448 |
i = idl[0]; j = mop[0]; mop2[0] = k = i+j;
|
|
1449 |
mop[0] = P_INVALID;
|
|
1450 |
while (i>0 || j>0) {
|
|
1451 |
if (i && idl[i] < mop[j])
|
|
1452 |
mop2[k--] = idl[i--];
|
|
1453 |
else
|
|
1454 |
mop2[k--] = mop[j--];
|
|
1455 |
}
|
|
1456 |
txn->mt_env->me_pglast = last;
|
|
1457 |
free(txn->mt_env->me_pgfree);
|
|
1458 |
txn->mt_env->me_pghead = txn->mt_env->me_pgfree = mop2;
|
|
1459 |
mop = mop2;
|
|
1460 |
/* Keep trying to read until we have enough */
|
|
1461 |
if (mop[0] < (unsigned)num) {
|
|
1462 |
continue;
|
|
1463 |
}
|
|
1464 |
}
|
|
1465 |
|
|
1466 |
/* current list has enough pages, but are they contiguous? */
|
|
1467 |
for (i=mop[0]; i>=(unsigned)num; i--) {
|
|
1468 |
if (mop[i-n2] == mop[i] + n2) {
|
|
1469 |
pgno = mop[i];
|
|
1470 |
i -= n2;
|
|
1471 |
/* move any stragglers down */
|
|
1472 |
for (j=i+num; j<=mop[0]; j++)
|
|
1473 |
mop[i++] = mop[j];
|
|
1474 |
mop[0] -= num;
|
|
1475 |
break;
|
|
1476 |
}
|
|
1477 |
}
|
|
1478 |
|
|
1479 |
/* Stop if we succeeded, or no retries */
|
|
1480 |
if (!retry || pgno != P_INVALID)
|
|
1481 |
break;
|
|
1482 |
readit = 1;
|
|
1483 |
|
|
1484 |
} while (1);
|
|
1485 |
} else {
|
|
1486 |
/* peel pages off tail, so we only have to truncate the list */
|
|
1487 |
pgno = MDB_IDL_LAST(mop);
|
|
1488 |
mop[0]--;
|
|
1489 |
}
|
|
1490 |
if (MDB_IDL_IS_ZERO(mop)) {
|
|
1491 |
free(txn->mt_env->me_pgfree);
|
|
1492 |
txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL;
|
|
1493 |
}
|
|
1494 |
}
|
|
1495 |
}
|
|
1496 |
|
|
1497 |
if (pgno == P_INVALID) {
|
|
1498 |
/* DB size is maxed out */
|
|
1499 |
if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg) {
|
|
1500 |
DPUTS("DB size maxed out");
|
|
1501 |
return MDB_MAP_FULL;
|
|
1502 |
}
|
|
1503 |
}
|
|
1504 |
if (txn->mt_env->me_flags & MDB_WRITEMAP) {
|
|
1505 |
if (pgno == P_INVALID) {
|
|
1506 |
pgno = txn->mt_next_pgno;
|
|
1507 |
txn->mt_next_pgno += num;
|
|
1508 |
}
|
|
1509 |
np = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno);
|
|
1510 |
np->mp_pgno = pgno;
|
|
1511 |
} else {
|
|
1512 |
if (txn->mt_env->me_dpages && num == 1) {
|
|
1513 |
np = txn->mt_env->me_dpages;
|
|
1514 |
VGMEMP_ALLOC(txn->mt_env, np, txn->mt_env->me_psize);
|
|
1515 |
VGMEMP_DEFINED(np, sizeof(np->mp_next));
|
|
1516 |
txn->mt_env->me_dpages = np->mp_next;
|
|
1517 |
} else {
|
|
1518 |
size_t sz = txn->mt_env->me_psize * num;
|
|
1519 |
if ((np = malloc(sz)) == NULL)
|
|
1520 |
return ENOMEM;
|
|
1521 |
VGMEMP_ALLOC(txn->mt_env, np, sz);
|
|
1522 |
}
|
|
1523 |
if (pgno == P_INVALID) {
|
|
1524 |
np->mp_pgno = txn->mt_next_pgno;
|
|
1525 |
txn->mt_next_pgno += num;
|
|
1526 |
} else {
|
|
1527 |
np->mp_pgno = pgno;
|
|
1528 |
}
|
|
1529 |
}
|
|
1530 |
mid.mid = np->mp_pgno;
|
|
1531 |
mid.mptr = np;
|
|
1532 |
if (txn->mt_env->me_flags & MDB_WRITEMAP) {
|
|
1533 |
mdb_mid2l_append(txn->mt_u.dirty_list, &mid);
|
|
1534 |
} else {
|
|
1535 |
mdb_mid2l_insert(txn->mt_u.dirty_list, &mid);
|
|
1536 |
}
|
|
1537 |
txn->mt_dirty_room--;
|
|
1538 |
*mp = np;
|
|
1539 |
|
|
1540 |
return MDB_SUCCESS;
|
|
1541 |
}
|
|
1542 |
|
|
1543 |
/** Copy a page: avoid copying unused portions of the page.
|
|
1544 |
* @param[in] dst page to copy into
|
|
1545 |
* @param[in] src page to copy from
|
|
1546 |
*/
|
|
1547 |
static void
|
|
1548 |
mdb_page_copy(MDB_page *dst, MDB_page *src, unsigned int psize)
|
|
1549 |
{
|
|
1550 |
dst->mp_flags = src->mp_flags | P_DIRTY;
|
|
1551 |
dst->mp_pages = src->mp_pages;
|
|
1552 |
|
|
1553 |
if (IS_LEAF2(src)) {
|
|
1554 |
memcpy(dst->mp_ptrs, src->mp_ptrs, psize - PAGEHDRSZ - SIZELEFT(src));
|
|
1555 |
} else {
|
|
1556 |
unsigned int i, nkeys = NUMKEYS(src);
|
|
1557 |
for (i=0; i<nkeys; i++)
|
|
1558 |
dst->mp_ptrs[i] = src->mp_ptrs[i];
|
|
1559 |
memcpy((char *)dst+src->mp_upper, (char *)src+src->mp_upper,
|
|
1560 |
psize - src->mp_upper);
|
|
1561 |
}
|
|
1562 |
}
|
|
1563 |
|
|
1564 |
/** Touch a page: make it dirty and re-insert into tree with updated pgno.
|
|
1565 |
* @param[in] mc cursor pointing to the page to be touched
|
|
1566 |
* @return 0 on success, non-zero on failure.
|
|
1567 |
*/
|
|
1568 |
static int
|
|
1569 |
mdb_page_touch(MDB_cursor *mc)
|
|
1570 |
{
|
|
1571 |
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
|
1572 |
pgno_t pgno;
|
|
1573 |
int rc;
|
|
1574 |
|
|
1575 |
if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
|
|
1576 |
MDB_page *np;
|
|
1577 |
if ((rc = mdb_page_alloc(mc, 1, &np)))
|
|
1578 |
return rc;
|
|
1579 |
DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi, mp->mp_pgno, np->mp_pgno);
|
|
1580 |
assert(mp->mp_pgno != np->mp_pgno);
|
|
1581 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
|
|
1582 |
if (SIZELEFT(mp)) {
|
|
1583 |
/* If page isn't full, just copy the used portion */
|
|
1584 |
mdb_page_copy(np, mp, mc->mc_txn->mt_env->me_psize);
|
|
1585 |
} else {
|
|
1586 |
pgno = np->mp_pgno;
|
|
1587 |
memcpy(np, mp, mc->mc_txn->mt_env->me_psize);
|
|
1588 |
np->mp_pgno = pgno;
|
|
1589 |
np->mp_flags |= P_DIRTY;
|
|
1590 |
}
|
|
1591 |
mp = np;
|
|
1592 |
|
|
1593 |
finish:
|
|
1594 |
/* Adjust other cursors pointing to mp */
|
|
1595 |
if (mc->mc_flags & C_SUB) {
|
|
1596 |
MDB_cursor *m2, *m3;
|
|
1597 |
MDB_dbi dbi = mc->mc_dbi-1;
|
|
1598 |
|
|
1599 |
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
1600 |
if (m2 == mc) continue;
|
|
1601 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
1602 |
if (m3->mc_snum < mc->mc_snum) continue;
|
|
1603 |
if (m3->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
|
|
1604 |
m3->mc_pg[mc->mc_top] = mp;
|
|
1605 |
}
|
|
1606 |
}
|
|
1607 |
} else {
|
|
1608 |
MDB_cursor *m2;
|
|
1609 |
|
|
1610 |
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
|
|
1611 |
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
|
|
1612 |
if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
|
|
1613 |
m2->mc_pg[mc->mc_top] = mp;
|
|
1614 |
}
|
|
1615 |
}
|
|
1616 |
}
|
|
1617 |
mc->mc_pg[mc->mc_top] = mp;
|
|
1618 |
/** If this page has a parent, update the parent to point to
|
|
1619 |
* this new page.
|
|
1620 |
*/
|
|
1621 |
if (mc->mc_top)
|
|
1622 |
SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno);
|
|
1623 |
else
|
|
1624 |
mc->mc_db->md_root = mp->mp_pgno;
|
|
1625 |
} else if (mc->mc_txn->mt_parent) {
|
|
1626 |
MDB_page *np;
|
|
1627 |
MDB_ID2 mid;
|
|
1628 |
/* If txn has a parent, make sure the page is in our
|
|
1629 |
* dirty list.
|
|
1630 |
*/
|
|
1631 |
if (mc->mc_txn->mt_u.dirty_list[0].mid) {
|
|
1632 |
unsigned x = mdb_mid2l_search(mc->mc_txn->mt_u.dirty_list, mp->mp_pgno);
|
|
1633 |
if (x <= mc->mc_txn->mt_u.dirty_list[0].mid &&
|
|
1634 |
mc->mc_txn->mt_u.dirty_list[x].mid == mp->mp_pgno) {
|
|
1635 |
if (mc->mc_txn->mt_u.dirty_list[x].mptr != mp) {
|
|
1636 |
mp = mc->mc_txn->mt_u.dirty_list[x].mptr;
|
|
1637 |
mc->mc_pg[mc->mc_top] = mp;
|
|
1638 |
}
|
|
1639 |
return 0;
|
|
1640 |
}
|
|
1641 |
}
|
|
1642 |
assert(mc->mc_txn->mt_u.dirty_list[0].mid < MDB_IDL_UM_MAX);
|
|
1643 |
/* No - copy it */
|
|
1644 |
np = mdb_page_malloc(mc);
|
|
1645 |
if (!np)
|
|
1646 |
return ENOMEM;
|
|
1647 |
memcpy(np, mp, mc->mc_txn->mt_env->me_psize);
|
|
1648 |
mid.mid = np->mp_pgno;
|
|
1649 |
mid.mptr = np;
|
|
1650 |
mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &mid);
|
|
1651 |
mp = np;
|
|
1652 |
goto finish;
|
|
1653 |
}
|
|
1654 |
return 0;
|
|
1655 |
}
|
|
1656 |
|
|
1657 |
int
|
|
1658 |
mdb_env_sync(MDB_env *env, int force)
|
|
1659 |
{
|
|
1660 |
int rc = 0;
|
|
1661 |
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
|
|
1662 |
if (env->me_flags & MDB_WRITEMAP) {
|
|
1663 |
int flags = ((env->me_flags & MDB_MAPASYNC) && !force)
|
|
1664 |
? MS_ASYNC : MS_SYNC;
|
|
1665 |
if (MDB_MSYNC(env->me_map, env->me_mapsize, flags))
|
|
1666 |
rc = ErrCode();
|
|
1667 |
#ifdef _WIN32
|
|
1668 |
else if (flags == MS_SYNC && MDB_FDATASYNC(env->me_fd))
|
|
1669 |
rc = ErrCode();
|
|
1670 |
#endif
|
|
1671 |
} else {
|
|
1672 |
if (MDB_FDATASYNC(env->me_fd))
|
|
1673 |
rc = ErrCode();
|
|
1674 |
}
|
|
1675 |
}
|
|
1676 |
return rc;
|
|
1677 |
}
|
|
1678 |
|
|
1679 |
/** Make shadow copies of all of parent txn's cursors */
|
|
1680 |
static int
|
|
1681 |
mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
|
|
1682 |
{
|
|
1683 |
MDB_cursor *mc, *m2;
|
|
1684 |
unsigned int i, j, size;
|
|
1685 |
|
|
1686 |
for (i=0;i<src->mt_numdbs; i++) {
|
|
1687 |
if (src->mt_cursors[i]) {
|
|
1688 |
size = sizeof(MDB_cursor);
|
|
1689 |
if (src->mt_cursors[i]->mc_xcursor)
|
|
1690 |
size += sizeof(MDB_xcursor);
|
|
1691 |
for (m2 = src->mt_cursors[i]; m2; m2=m2->mc_next) {
|
|
1692 |
mc = malloc(size);
|
|
1693 |
if (!mc)
|
|
1694 |
return ENOMEM;
|
|
1695 |
mc->mc_orig = m2;
|
|
1696 |
mc->mc_txn = dst;
|
|
1697 |
mc->mc_dbi = i;
|
|
1698 |
mc->mc_db = &dst->mt_dbs[i];
|
|
1699 |
mc->mc_dbx = m2->mc_dbx;
|
|
1700 |
mc->mc_dbflag = &dst->mt_dbflags[i];
|
|
1701 |
mc->mc_snum = m2->mc_snum;
|
|
1702 |
mc->mc_top = m2->mc_top;
|
|
1703 |
mc->mc_flags = m2->mc_flags | C_SHADOW;
|
|
1704 |
for (j=0; j<mc->mc_snum; j++) {
|
|
1705 |
mc->mc_pg[j] = m2->mc_pg[j];
|
|
1706 |
mc->mc_ki[j] = m2->mc_ki[j];
|
|
1707 |
}
|
|
1708 |
if (m2->mc_xcursor) {
|
|
1709 |
MDB_xcursor *mx, *mx2;
|
|
1710 |
mx = (MDB_xcursor *)(mc+1);
|
|
1711 |
mc->mc_xcursor = mx;
|
|
1712 |
mx2 = m2->mc_xcursor;
|
|
1713 |
mx->mx_db = mx2->mx_db;
|
|
1714 |
mx->mx_dbx = mx2->mx_dbx;
|
|
1715 |
mx->mx_dbflag = mx2->mx_dbflag;
|
|
1716 |
mx->mx_cursor.mc_txn = dst;
|
|
1717 |
mx->mx_cursor.mc_dbi = mx2->mx_cursor.mc_dbi;
|
|
1718 |
mx->mx_cursor.mc_db = &mx->mx_db;
|
|
1719 |
mx->mx_cursor.mc_dbx = &mx->mx_dbx;
|
|
1720 |
mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
|
|
1721 |
mx->mx_cursor.mc_snum = mx2->mx_cursor.mc_snum;
|
|
1722 |
mx->mx_cursor.mc_top = mx2->mx_cursor.mc_top;
|
|
1723 |
mx->mx_cursor.mc_flags = mx2->mx_cursor.mc_flags | C_SHADOW;
|
|
1724 |
for (j=0; j<mx2->mx_cursor.mc_snum; j++) {
|
|
1725 |
mx->mx_cursor.mc_pg[j] = mx2->mx_cursor.mc_pg[j];
|
|
1726 |
mx->mx_cursor.mc_ki[j] = mx2->mx_cursor.mc_ki[j];
|
|
1727 |
}
|
|
1728 |
} else {
|
|
1729 |
mc->mc_xcursor = NULL;
|
|
1730 |
}
|
|
1731 |
mc->mc_next = dst->mt_cursors[i];
|
|
1732 |
dst->mt_cursors[i] = mc;
|
|
1733 |
}
|
|
1734 |
}
|
|
1735 |
}
|
|
1736 |
return MDB_SUCCESS;
|
|
1737 |
}
|
|
1738 |
|
|
1739 |
/** Merge shadow cursors back into parent's */
|
|
1740 |
static void
|
|
1741 |
mdb_cursor_merge(MDB_txn *txn)
|
|
1742 |
{
|
|
1743 |
MDB_dbi i;
|
|
1744 |
for (i=0; i<txn->mt_numdbs; i++) {
|
|
1745 |
if (txn->mt_cursors[i]) {
|
|
1746 |
MDB_cursor *mc;
|
|
1747 |
while ((mc = txn->mt_cursors[i])) {
|
|
1748 |
txn->mt_cursors[i] = mc->mc_next;
|
|
1749 |
if (mc->mc_flags & C_SHADOW) {
|
|
1750 |
MDB_cursor *m2 = mc->mc_orig;
|
|
1751 |
unsigned int j;
|
|
1752 |
m2->mc_snum = mc->mc_snum;
|
|
1753 |
m2->mc_top = mc->mc_top;
|
|
1754 |
for (j=0; j<mc->mc_snum; j++) {
|
|
1755 |
m2->mc_pg[j] = mc->mc_pg[j];
|
|
1756 |
m2->mc_ki[j] = mc->mc_ki[j];
|
|
1757 |
}
|
|
1758 |
}
|
|
1759 |
if (mc->mc_flags & C_ALLOCD)
|
|
1760 |
free(mc);
|
|
1761 |
}
|
|
1762 |
}
|
|
1763 |
}
|
|
1764 |
}
|
|
1765 |
|
|
1766 |
static void
|
|
1767 |
mdb_txn_reset0(MDB_txn *txn);
|
|
1768 |
|
|
1769 |
/** Common code for #mdb_txn_begin() and #mdb_txn_renew().
|
|
1770 |
* @param[in] txn the transaction handle to initialize
|
|
1771 |
* @return 0 on success, non-zero on failure. This can only
|
|
1772 |
* fail for read-only transactions, and then only if the
|
|
1773 |
* reader table is full.
|
|
1774 |
*/
|
|
1775 |
static int
|
|
1776 |
mdb_txn_renew0(MDB_txn *txn)
|
|
1777 |
{
|
|
1778 |
MDB_env *env = txn->mt_env;
|
|
1779 |
unsigned int i;
|
|
1780 |
uint16_t x;
|
|
1781 |
int rc;
|
|
1782 |
|
|
1783 |
/* Setup db info */
|
|
1784 |
txn->mt_numdbs = env->me_numdbs;
|
|
1785 |
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
|
|
1786 |
|
|
1787 |
if (txn->mt_flags & MDB_TXN_RDONLY) {
|
|
1788 |
if (env->me_flags & MDB_ROFS) {
|
|
1789 |
i = mdb_env_pick_meta(env);
|
|
1790 |
txn->mt_txnid = env->me_metas[i]->mm_txnid;
|
|
1791 |
txn->mt_u.reader = NULL;
|
|
1792 |
} else {
|
|
1793 |
MDB_reader *r = pthread_getspecific(env->me_txkey);
|
|
1794 |
if (!r) {
|
|
1795 |
pid_t pid = env->me_pid;
|
|
1796 |
pthread_t tid = pthread_self();
|
|
1797 |
|
|
1798 |
LOCK_MUTEX_R(env);
|
|
1799 |
for (i=0; i<env->me_txns->mti_numreaders; i++)
|
|
1800 |
if (env->me_txns->mti_readers[i].mr_pid == 0)
|
|
1801 |
break;
|
|
1802 |
if (i == env->me_maxreaders) {
|
|
1803 |
UNLOCK_MUTEX_R(env);
|
|
1804 |
return MDB_READERS_FULL;
|
|
1805 |
}
|
|
1806 |
env->me_txns->mti_readers[i].mr_pid = pid;
|
|
1807 |
env->me_txns->mti_readers[i].mr_tid = tid;
|
|
1808 |
if (i >= env->me_txns->mti_numreaders)
|
|
1809 |
env->me_txns->mti_numreaders = i+1;
|
|
1810 |
/* Save numreaders for un-mutexed mdb_env_close() */
|
|
1811 |
env->me_numreaders = env->me_txns->mti_numreaders;
|
|
1812 |
UNLOCK_MUTEX_R(env);
|
|
1813 |
r = &env->me_txns->mti_readers[i];
|
|
1814 |
if ((rc = pthread_setspecific(env->me_txkey, r)) != 0) {
|
|
1815 |
env->me_txns->mti_readers[i].mr_pid = 0;
|
|
1816 |
return rc;
|
|
1817 |
}
|
|
1818 |
}
|
|
1819 |
txn->mt_txnid = r->mr_txnid = env->me_txns->mti_txnid;
|
|
1820 |
txn->mt_u.reader = r;
|
|
1821 |
}
|
|
1822 |
txn->mt_toggle = txn->mt_txnid & 1;
|
|
1823 |
txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
|
|
1824 |
} else {
|
|
1825 |
LOCK_MUTEX_W(env);
|
|
1826 |
|
|
1827 |
txn->mt_txnid = env->me_txns->mti_txnid;
|
|
1828 |
txn->mt_toggle = txn->mt_txnid & 1;
|
|
1829 |
txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
|
|
1830 |
txn->mt_txnid++;
|
|
1831 |
#if MDB_DEBUG
|
|
1832 |
if (txn->mt_txnid == mdb_debug_start)
|
|
1833 |
mdb_debug = 1;
|
|
1834 |
#endif
|
|
1835 |
txn->mt_dirty_room = MDB_IDL_UM_MAX;
|
|
1836 |
txn->mt_u.dirty_list = env->me_dirty_list;
|
|
1837 |
txn->mt_u.dirty_list[0].mid = 0;
|
|
1838 |
txn->mt_free_pgs = env->me_free_pgs;
|
|
1839 |
txn->mt_free_pgs[0] = 0;
|
|
1840 |
env->me_txn = txn;
|
|
1841 |
}
|
|
1842 |
|
|
1843 |
/* Copy the DB info and flags */
|
|
1844 |
memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
|
|
1845 |
for (i=2; i<txn->mt_numdbs; i++) {
|
|
1846 |
x = env->me_dbflags[i];
|
|
1847 |
txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
|
|
1848 |
txn->mt_dbflags[i] = (x & MDB_VALID) ? DB_VALID|DB_STALE : 0;
|
|
1849 |
}
|
|
1850 |
txn->mt_dbflags[0] = txn->mt_dbflags[1] = DB_VALID;
|
|
1851 |
|
|
1852 |
if (env->me_maxpg < txn->mt_next_pgno) {
|
|
1853 |
mdb_txn_reset0(txn);
|
|
1854 |
return MDB_MAP_RESIZED;
|
|
1855 |
}
|
|
1856 |
|
|
1857 |
return MDB_SUCCESS;
|
|
1858 |
}
|
|
1859 |
|
|
1860 |
int
|
|
1861 |
mdb_txn_renew(MDB_txn *txn)
|
|
1862 |
{
|
|
1863 |
int rc;
|
|
1864 |
|
|
1865 |
if (! (txn && (txn->mt_flags & MDB_TXN_RDONLY)))
|
|
1866 |
return EINVAL;
|
|
1867 |
|
|
1868 |
if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
|
|
1869 |
DPUTS("environment had fatal error, must shutdown!");
|
|
1870 |
return MDB_PANIC;
|
|
1871 |
}
|
|
1872 |
|
|
1873 |
rc = mdb_txn_renew0(txn);
|
|
1874 |
if (rc == MDB_SUCCESS) {
|
|
1875 |
DPRINTF("renew txn %zu%c %p on mdbenv %p, root page %zu",
|
|
1876 |
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
|
|
1877 |
(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
|
|
1878 |
}
|
|
1879 |
return rc;
|
|
1880 |
}
|
|
1881 |
|
|
1882 |
int
|
|
1883 |
mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
|
|
1884 |
{
|
|
1885 |
MDB_txn *txn;
|
|
1886 |
MDB_ntxn *ntxn;
|
|
1887 |
int rc, size, tsize = sizeof(MDB_txn);
|
|
1888 |
|
|
1889 |
if (env->me_flags & MDB_FATAL_ERROR) {
|
|
1890 |
DPUTS("environment had fatal error, must shutdown!");
|
|
1891 |
return MDB_PANIC;
|
|
1892 |
}
|
|
1893 |
if ((env->me_flags & MDB_RDONLY) && !(flags & MDB_RDONLY))
|
|
1894 |
return EACCES;
|
|
1895 |
if (parent) {
|
|
1896 |
/* Nested transactions: Max 1 child, write txns only, no writemap */
|
|
1897 |
if (parent->mt_child ||
|
|
1898 |
(flags & MDB_RDONLY) || (parent->mt_flags & MDB_TXN_RDONLY) ||
|
|
1899 |
(env->me_flags & MDB_WRITEMAP))
|
|
1900 |
{
|
|
1901 |
return EINVAL;
|
|
1902 |
}
|
|
1903 |
tsize = sizeof(MDB_ntxn);
|
|
1904 |
}
|
|
1905 |
size = tsize + env->me_maxdbs * (sizeof(MDB_db)+1);
|
|
1906 |
if (!(flags & MDB_RDONLY))
|
|
1907 |
size += env->me_maxdbs * sizeof(MDB_cursor *);
|
|
1908 |
|
|
1909 |
if ((txn = calloc(1, size)) == NULL) {
|
|
1910 |
DPRINTF("calloc: %s", strerror(ErrCode()));
|
|
1911 |
return ENOMEM;
|
|
1912 |
}
|
|
1913 |
txn->mt_dbs = (MDB_db *) ((char *)txn + tsize);
|
|
1914 |
if (flags & MDB_RDONLY) {
|
|
1915 |
txn->mt_flags |= MDB_TXN_RDONLY;
|
|
1916 |
txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
|
|
1917 |
} else {
|
|
1918 |
txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
|
|
1919 |
txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
|
|
1920 |
}
|
|
1921 |
txn->mt_env = env;
|
|
1922 |
|
|
1923 |
if (parent) {
|
|
1924 |
unsigned int i;
|
|
1925 |
txn->mt_free_pgs = mdb_midl_alloc();
|
|
1926 |
if (!txn->mt_free_pgs) {
|
|
1927 |
free(txn);
|
|
1928 |
return ENOMEM;
|
|
1929 |
}
|
|
1930 |
txn->mt_u.dirty_list = malloc(sizeof(MDB_ID2)*MDB_IDL_UM_SIZE);
|
|
1931 |
if (!txn->mt_u.dirty_list) {
|
|
1932 |
free(txn->mt_free_pgs);
|
|
1933 |
free(txn);
|
|
1934 |
return ENOMEM;
|
|
1935 |
}
|
|
1936 |
txn->mt_txnid = parent->mt_txnid;
|
|
1937 |
txn->mt_toggle = parent->mt_toggle;
|
|
1938 |
txn->mt_dirty_room = parent->mt_dirty_room;
|
|
1939 |
txn->mt_u.dirty_list[0].mid = 0;
|
|
1940 |
txn->mt_free_pgs[0] = 0;
|
|
1941 |
txn->mt_next_pgno = parent->mt_next_pgno;
|
|
1942 |
parent->mt_child = txn;
|
|
1943 |
txn->mt_parent = parent;
|
|
1944 |
txn->mt_numdbs = parent->mt_numdbs;
|
|
1945 |
txn->mt_dbxs = parent->mt_dbxs;
|
|
1946 |
memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
|
|
1947 |
/* Copy parent's mt_dbflags, but clear DB_NEW */
|
|
1948 |
for (i=0; i<txn->mt_numdbs; i++)
|
|
1949 |
txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
|
|
1950 |
rc = 0;
|
|
1951 |
ntxn = (MDB_ntxn *)txn;
|
|
1952 |
ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
|
|
1953 |
if (env->me_pghead) {
|
|
1954 |
size = MDB_IDL_SIZEOF(env->me_pghead);
|
|
1955 |
env->me_pghead = malloc(size);
|
|
1956 |
if (env->me_pghead)
|
|
1957 |
memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
|
|
1958 |
else
|
|
1959 |
rc = ENOMEM;
|
|
1960 |
}
|
|
1961 |
env->me_pgfree = env->me_pghead;
|
|
1962 |
if (!rc)
|
|
1963 |
rc = mdb_cursor_shadow(parent, txn);
|
|
1964 |
if (rc)
|
|
1965 |
mdb_txn_reset0(txn);
|
|
1966 |
} else {
|
|
1967 |
rc = mdb_txn_renew0(txn);
|
|
1968 |
}
|
|
1969 |
if (rc)
|
|
1970 |
free(txn);
|
|
1971 |
else {
|
|
1972 |
*ret = txn;
|
|
1973 |
DPRINTF("begin txn %zu%c %p on mdbenv %p, root page %zu",
|
|
1974 |
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
|
|
1975 |
(void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
|
|
1976 |
}
|
|
1977 |
|
|
1978 |
return rc;
|
|
1979 |
}
|
|
1980 |
|
|
1981 |
/** Common code for #mdb_txn_reset() and #mdb_txn_abort().
|
|
1982 |
* @param[in] txn the transaction handle to reset
|
|
1983 |
*/
|
|
1984 |
static void
|
|
1985 |
mdb_txn_reset0(MDB_txn *txn)
|
|
1986 |
{
|
|
1987 |
MDB_env *env = txn->mt_env;
|
|
1988 |
unsigned int i;
|
|
1989 |
|
|
1990 |
/* Close any DBI handles opened in this txn */
|
|
1991 |
for (i=2; i<txn->mt_numdbs; i++) {
|
|
1992 |
if (txn->mt_dbflags[i] & DB_NEW) {
|
|
1993 |
char *ptr = env->me_dbxs[i].md_name.mv_data;
|
|
1994 |
env->me_dbxs[i].md_name.mv_data = NULL;
|
|
1995 |
env->me_dbxs[i].md_name.mv_size = 0;
|
|
1996 |
free(ptr);
|
|
1997 |
}
|
|
1998 |
}
|
|
1999 |
|
|
2000 |
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
|
|
2001 |
if (!(env->me_flags & MDB_ROFS))
|
|
2002 |
txn->mt_u.reader->mr_txnid = (txnid_t)-1;
|
|
2003 |
} else {
|
|
2004 |
MDB_page *dp;
|
|
2005 |
|
|
2006 |
/* close(free) all cursors */
|
|
2007 |
for (i=0; i<txn->mt_numdbs; i++) {
|
|
2008 |
if (txn->mt_cursors[i]) {
|
|
2009 |
MDB_cursor *mc;
|
|
2010 |
while ((mc = txn->mt_cursors[i])) {
|
|
2011 |
txn->mt_cursors[i] = mc->mc_next;
|
|
2012 |
if (mc->mc_flags & C_ALLOCD)
|
|
2013 |
free(mc);
|
|
2014 |
}
|
|
2015 |
}
|
|
2016 |
}
|
|
2017 |
|
|
2018 |
if (!(env->me_flags & MDB_WRITEMAP)) {
|
|
2019 |
/* return all dirty pages to dpage list */
|
|
2020 |
for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
|
|
2021 |
dp = txn->mt_u.dirty_list[i].mptr;
|
|
2022 |
if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
|
|
2023 |
mdb_page_free(txn->mt_env, dp);
|
|
2024 |
} else {
|
|
2025 |
/* large pages just get freed directly */
|
|
2026 |
VGMEMP_FREE(txn->mt_env, dp);
|
|
2027 |
free(dp);
|
|
2028 |
}
|
|
2029 |
}
|
|
2030 |
}
|
|
2031 |
|
|
2032 |
free(env->me_pgfree);
|
|
2033 |
|
|
2034 |
if (txn->mt_parent) {
|
|
2035 |
txn->mt_parent->mt_child = NULL;
|
|
2036 |
env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
|
|
2037 |
mdb_midl_free(txn->mt_free_pgs);
|
|
2038 |
free(txn->mt_u.dirty_list);
|
|
2039 |
return;
|
|
2040 |
} else {
|
|
2041 |
if (mdb_midl_shrink(&txn->mt_free_pgs))
|
|
2042 |
env->me_free_pgs = txn->mt_free_pgs;
|
|
2043 |
}
|
|
2044 |
|
|
2045 |
txn->mt_env->me_pghead = txn->mt_env->me_pgfree = NULL;
|
|
2046 |
txn->mt_env->me_pglast = 0;
|
|
2047 |
|
|
2048 |
env->me_txn = NULL;
|
|
2049 |
/* The writer mutex was locked in mdb_txn_begin. */
|
|
2050 |
UNLOCK_MUTEX_W(env);
|
|
2051 |
}
|
|
2052 |
}
|
|
2053 |
|
|
2054 |
void
|
|
2055 |
mdb_txn_reset(MDB_txn *txn)
|
|
2056 |
{
|
|
2057 |
if (txn == NULL)
|
|
2058 |
return;
|
|
2059 |
|
|
2060 |
DPRINTF("reset txn %zu%c %p on mdbenv %p, root page %zu",
|
|
2061 |
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
|
|
2062 |
(void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
|
|
2063 |
|
|
2064 |
mdb_txn_reset0(txn);
|
|
2065 |
}
|
|
2066 |
|
|
2067 |
void
|
|
2068 |
mdb_txn_abort(MDB_txn *txn)
|
|
2069 |
{
|
|
2070 |
if (txn == NULL)
|
|
2071 |
return;
|
|
2072 |
|
|
2073 |
DPRINTF("abort txn %zu%c %p on mdbenv %p, root page %zu",
|
|
2074 |
txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
|
|
2075 |
(void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
|
|
2076 |
|
|
2077 |
if (txn->mt_child)
|
|
2078 |
mdb_txn_abort(txn->mt_child);
|
|
2079 |
|
|
2080 |
mdb_txn_reset0(txn);
|
|
2081 |
free(txn);
|
|
2082 |
}
|
|
2083 |
|
|
2084 |
int
|
|
2085 |
mdb_txn_commit(MDB_txn *txn)
|
|
2086 |
{
|
|
2087 |
int n, done;
|
|
2088 |
unsigned int i;
|
|
2089 |
ssize_t rc;
|
|
2090 |
off_t size;
|
|
2091 |
MDB_page *dp;
|
|
2092 |
MDB_env *env;
|
|
2093 |
pgno_t next, freecnt;
|
|
2094 |
txnid_t oldpg_txnid, id;
|
|
2095 |
MDB_cursor mc;
|
|
2096 |
|
|
2097 |
assert(txn != NULL);
|
|
2098 |
assert(txn->mt_env != NULL);
|
|
2099 |
|
|
2100 |
if (txn->mt_child) {
|
|
2101 |
mdb_txn_commit(txn->mt_child);
|
|
2102 |
txn->mt_child = NULL;
|
|
2103 |
}
|
|
2104 |
|
|
2105 |
env = txn->mt_env;
|
|
2106 |
|
|
2107 |
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
|
|
2108 |
/* update the DB flags */
|
|
2109 |
for (i = 2; i<txn->mt_numdbs; i++) {
|
|
2110 |
if (txn->mt_dbflags[i] & DB_NEW)
|
|
2111 |
env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
|
|
2112 |
}
|
|
2113 |
if (txn->mt_numdbs > env->me_numdbs)
|
|
2114 |
env->me_numdbs = txn->mt_numdbs;
|
|
2115 |
txn->mt_numdbs = 2; /* so txn_abort() doesn't close any new handles */
|
|
2116 |
mdb_txn_abort(txn);
|
|
2117 |
return MDB_SUCCESS;
|
|
2118 |
}
|
|
2119 |
|
|
2120 |
if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
|
|
2121 |
DPUTS("error flag is set, can't commit");
|
|
2122 |
if (txn->mt_parent)
|
|
2123 |
txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
|
|
2124 |
mdb_txn_abort(txn);
|
|
2125 |
return EINVAL;
|
|
2126 |
}
|
|
2127 |
|
|
2128 |
if (txn->mt_parent) {
|
|
2129 |
MDB_txn *parent = txn->mt_parent;
|
|
2130 |
unsigned x, y, len;
|
|
2131 |
MDB_ID2L dst, src;
|
|
2132 |
|
|
2133 |
/* Append our free list to parent's */
|
|
2134 |
if (mdb_midl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs)) {
|
|
2135 |
mdb_txn_abort(txn);
|
|
2136 |
return ENOMEM;
|
|
2137 |
}
|
|
2138 |
mdb_midl_free(txn->mt_free_pgs);
|
|
2139 |
|
|
2140 |
parent->mt_next_pgno = txn->mt_next_pgno;
|
|
2141 |
parent->mt_flags = txn->mt_flags;
|
|
2142 |
|
|
2143 |
/* Merge (and close) our cursors with parent's */
|
|
2144 |
mdb_cursor_merge(txn);
|
|
2145 |
|
|
2146 |
/* Update parent's DB table. */
|
|
2147 |
memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
|
|
2148 |
txn->mt_parent->mt_numdbs = txn->mt_numdbs;
|
|
2149 |
txn->mt_parent->mt_dbflags[0] = txn->mt_dbflags[0];
|
|
2150 |
txn->mt_parent->mt_dbflags[1] = txn->mt_dbflags[1];
|
|
2151 |
for (i=2; i<txn->mt_numdbs; i++) {
|
|
2152 |
/* preserve parent's DB_NEW status */
|
|
2153 |
x = txn->mt_parent->mt_dbflags[i] & DB_NEW;
|
|
2154 |
txn->mt_parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
|
|
2155 |
}
|
|
2156 |
|
|
2157 |
dst = txn->mt_parent->mt_u.dirty_list;
|
|
2158 |
src = txn->mt_u.dirty_list;
|
|
2159 |
/* Find len = length of merging our dirty list with parent's */
|
|
2160 |
x = dst[0].mid;
|
|
2161 |
dst[0].mid = 0; /* simplify loops */
|
|
2162 |
if (parent->mt_parent) {
|
|
2163 |
len = x + src[0].mid;
|
|
2164 |
y = mdb_mid2l_search(src, dst[x].mid + 1) - 1;
|
|
2165 |
for (i = x; y && i; y--) {
|
|
2166 |
pgno_t yp = src[y].mid;
|
|
2167 |
while (yp < dst[i].mid)
|
|
2168 |
i--;
|
|
2169 |
if (yp == dst[i].mid) {
|
|
2170 |
i--;
|
|
2171 |
len--;
|
|
2172 |
}
|
|
2173 |
}
|
|
2174 |
} else { /* Simplify the above for single-ancestor case */
|
|
2175 |
len = MDB_IDL_UM_MAX - txn->mt_dirty_room;
|
|
2176 |
}
|
|
2177 |
/* Merge our dirty list with parent's */
|
|
2178 |
y = src[0].mid;
|
|
2179 |
for (i = len; y; dst[i--] = src[y--]) {
|
|
2180 |
pgno_t yp = src[y].mid;
|
|
2181 |
while (yp < dst[x].mid)
|
|
2182 |
dst[i--] = dst[x--];
|
|
2183 |
if (yp == dst[x].mid)
|
|
2184 |
free(dst[x--].mptr);
|
|
2185 |
}
|
|
2186 |
assert(i == x);
|
|
2187 |
dst[0].mid = len;
|
|
2188 |
free(txn->mt_u.dirty_list);
|
|
2189 |
parent->mt_dirty_room = txn->mt_dirty_room;
|
|
2190 |
|
|
2191 |
txn->mt_parent->mt_child = NULL;
|
|
2192 |
free(((MDB_ntxn *)txn)->mnt_pgstate.mf_pgfree);
|
|
2193 |
free(txn);
|
|
2194 |
return MDB_SUCCESS;
|
|
2195 |
}
|
|
2196 |
|
|
2197 |
if (txn != env->me_txn) {
|
|
2198 |
DPUTS("attempt to commit unknown transaction");
|
|
2199 |
mdb_txn_abort(txn);
|
|
2200 |
return EINVAL;
|
|
2201 |
}
|
|
2202 |
|
|
2203 |
if (!txn->mt_u.dirty_list[0].mid && !(txn->mt_flags & MDB_TXN_DIRTY))
|
|
2204 |
goto done;
|
|
2205 |
|
|
2206 |
DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu",
|
|
2207 |
txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root);
|
|
2208 |
|
|
2209 |
/* Update DB root pointers */
|
|
2210 |
if (txn->mt_numdbs > 2) {
|
|
2211 |
MDB_dbi i;
|
|
2212 |
MDB_val data;
|
|
2213 |
data.mv_size = sizeof(MDB_db);
|
|
2214 |
|
|
2215 |
mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
|
|
2216 |
for (i = 2; i < txn->mt_numdbs; i++) {
|
|
2217 |
if (txn->mt_dbflags[i] & DB_DIRTY) {
|
|
2218 |
data.mv_data = &txn->mt_dbs[i];
|
|
2219 |
rc = mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
|
|
2220 |
if (rc)
|
|
2221 |
goto fail;
|
|
2222 |
}
|
|
2223 |
}
|
|
2224 |
}
|
|
2225 |
|
|
2226 |
/* Save the freelist as of this transaction to the freeDB. This
|
|
2227 |
* can change the freelist, so keep trying until it stabilizes.
|
|
2228 |
*
|
|
2229 |
* env->me_pglast and the length of txn->mt_free_pgs cannot decrease,
|
|
2230 |
* except the code below can decrease env->me_pglast to split pghead.
|
|
2231 |
* Page numbers cannot disappear from txn->mt_free_pgs. New pages
|
|
2232 |
* can only appear in env->me_pghead when env->me_pglast increases.
|
|
2233 |
* Until then, the me_pghead pointer won't move but can become NULL.
|
|
2234 |
*/
|
|
2235 |
|
|
2236 |
mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
|
|
2237 |
oldpg_txnid = id = 0;
|
|
2238 |
freecnt = 0;
|
|
2239 |
|
|
2240 |
/* should only be one record now */
|
|
2241 |
if (env->me_pghead || env->me_pglast) {
|
|
2242 |
/* make sure first page of freeDB is touched and on freelist */
|
|
2243 |
rc = mdb_page_search(&mc, NULL, MDB_PS_MODIFY);
|
|
2244 |
if (rc && rc != MDB_NOTFOUND) {
|
|
2245 |
fail:
|
|
2246 |
mdb_txn_abort(txn);
|
|
2247 |
return rc;
|
|
2248 |
}
|
|
2249 |
}
|
|
2250 |
|
|
2251 |
/* Delete IDLs we used from the free list */
|
|
2252 |
if (env->me_pglast) {
|
|
2253 |
MDB_val key;
|
|
2254 |
|
|
2255 |
do {
|
|
2256 |
free_pgfirst:
|
|
2257 |
rc = mdb_cursor_first(&mc, &key, NULL);
|
|
2258 |
if (rc)
|
|
2259 |
goto fail;
|
|
2260 |
oldpg_txnid = *(txnid_t *)key.mv_data;
|
|
2261 |
again:
|
|
2262 |
assert(oldpg_txnid <= env->me_pglast);
|
|
2263 |
id = 0;
|
|
2264 |
rc = mdb_cursor_del(&mc, 0);
|
|
2265 |
if (rc)
|
|
2266 |
goto fail;
|
|
2267 |
} while (oldpg_txnid < env->me_pglast);
|
|
2268 |
}
|
|
2269 |
|
|
2270 |
/* Save IDL of pages freed by this txn, to freeDB */
|
|
2271 |
free2:
|
|
2272 |
if (freecnt != txn->mt_free_pgs[0]) {
|
|
2273 |
MDB_val key, data;
|
|
2274 |
|
|
2275 |
/* make sure last page of freeDB is touched and on freelist */
|
|
2276 |
key.mv_size = MDB_MAXKEYSIZE+1;
|
|
2277 |
key.mv_data = NULL;
|
|
2278 |
rc = mdb_page_search(&mc, &key, MDB_PS_MODIFY);
|
|
2279 |
if (rc && rc != MDB_NOTFOUND)
|
|
2280 |
goto fail;
|
|
2281 |
|
|
2282 |
#if MDB_DEBUG > 1
|
|
2283 |
{
|
|
2284 |
unsigned int i;
|
|
2285 |
MDB_IDL idl = txn->mt_free_pgs;
|
|
2286 |
mdb_midl_sort(txn->mt_free_pgs);
|
|
2287 |
DPRINTF("IDL write txn %zu root %zu num %zu",
|
|
2288 |
txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
|
|
2289 |
for (i=1; i<=idl[0]; i++) {
|
|
2290 |
DPRINTF("IDL %zu", idl[i]);
|
|
2291 |
}
|
|
2292 |
}
|
|
2293 |
#endif
|
|
2294 |
/* write to last page of freeDB */
|
|
2295 |
key.mv_size = sizeof(pgno_t);
|
|
2296 |
key.mv_data = &txn->mt_txnid;
|
|
2297 |
/* The free list can still grow during this call,
|
|
2298 |
* despite the pre-emptive touches above. So retry
|
|
2299 |
* until the reserved space remains big enough.
|
|
2300 |
*/
|
|
2301 |
do {
|
|
2302 |
assert(freecnt < txn->mt_free_pgs[0]);
|
|
2303 |
freecnt = txn->mt_free_pgs[0];
|
|
2304 |
data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
|
|
2305 |
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
|
|
2306 |
if (rc)
|
|
2307 |
goto fail;
|
|
2308 |
} while (freecnt != txn->mt_free_pgs[0]);
|
|
2309 |
mdb_midl_sort(txn->mt_free_pgs);
|
|
2310 |
memcpy(data.mv_data, txn->mt_free_pgs, data.mv_size);
|
|
2311 |
if (oldpg_txnid < env->me_pglast || (!env->me_pghead && id))
|
|
2312 |
goto free_pgfirst; /* used up freeDB[oldpg_txnid] */
|
|
2313 |
}
|
|
2314 |
|
|
2315 |
/* Put back page numbers we took from freeDB but did not use */
|
|
2316 |
if (env->me_pghead) {
|
|
2317 |
for (;;) {
|
|
2318 |
MDB_val key, data;
|
|
2319 |
pgno_t orig, *mop;
|
|
2320 |
|
|
2321 |
mop = env->me_pghead;
|
|
2322 |
id = env->me_pglast;
|
|
2323 |
key.mv_size = sizeof(id);
|
|
2324 |
key.mv_data = &id;
|
|
2325 |
/* These steps may grow the freelist again
|
|
2326 |
* due to freed overflow pages...
|
|
2327 |
*/
|
|
2328 |
i = 2;
|
|
2329 |
do {
|
|
2330 |
orig = mop[0];
|
|
2331 |
if (orig > env->me_maxfree_1pg && id > 4)
|
|
2332 |
orig = env->me_maxfree_1pg; /* Do not use more than 1 page */
|
|
2333 |
data.mv_size = (orig + 1) * sizeof(pgno_t);
|
|
2334 |
rc = mdb_cursor_put(&mc, &key, &data, MDB_RESERVE);
|
|
2335 |
if (rc)
|
|
2336 |
goto fail;
|
|
2337 |
assert(!env->me_pghead || env->me_pglast);
|
|
2338 |
/* mop could have been used again here */
|
|
2339 |
if (id != env->me_pglast || env->me_pghead == NULL)
|
|
2340 |
goto again; /* was completely used up */
|
|
2341 |
assert(mop == env->me_pghead);
|
|
2342 |
} while (mop[0] < orig && --i);
|
|
2343 |
memcpy(data.mv_data, mop, data.mv_size);
|
|
2344 |
if (mop[0] <= orig)
|
|
2345 |
break;
|
|
2346 |
*(pgno_t *)data.mv_data = orig;
|
|
2347 |
mop[orig] = mop[0] - orig;
|
|
2348 |
env->me_pghead = mop += orig;
|
|
2349 |
/* Save more oldpages at the previous txnid. */
|
|
2350 |
assert(env->me_pglast == id && id == oldpg_txnid);
|
|
2351 |
env->me_pglast = --oldpg_txnid;
|
|
2352 |
}
|
|
2353 |
}
|
|
2354 |
|
|
2355 |
/* Check for growth of freelist again */
|
|
2356 |
if (freecnt != txn->mt_free_pgs[0])
|
|
2357 |
goto free2;
|
|
2358 |
|
|
2359 |
free(env->me_pgfree);
|
|
2360 |
env->me_pghead = env->me_pgfree = NULL;
|
|
2361 |
|
|
2362 |
if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
|
|
2363 |
if (mdb_midl_shrink(&txn->mt_free_pgs))
|
|
2364 |
env->me_free_pgs = txn->mt_free_pgs;
|
|
2365 |
}
|
|
2366 |
|
|
2367 |
#if MDB_DEBUG > 2
|
|
2368 |
mdb_audit(txn);
|
|
2369 |
#endif
|
|
2370 |
|
|
2371 |
if (env->me_flags & MDB_WRITEMAP) {
|
|
2372 |
for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
|
|
2373 |
dp = txn->mt_u.dirty_list[i].mptr;
|
|
2374 |
/* clear dirty flag */
|
|
2375 |
dp->mp_flags &= ~P_DIRTY;
|
|
2376 |
txn->mt_u.dirty_list[i].mid = 0;
|
|
2377 |
}
|
|
2378 |
txn->mt_u.dirty_list[0].mid = 0;
|
|
2379 |
goto sync;
|
|
2380 |
}
|
|
2381 |
|
|
2382 |
/* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
|
|
2383 |
*/
|
|
2384 |
next = 0;
|
|
2385 |
i = 1;
|
|
2386 |
do {
|
|
2387 |
#ifdef _WIN32
|
|
2388 |
/* Windows actually supports scatter/gather I/O, but only on
|
|
2389 |
* unbuffered file handles. Since we're relying on the OS page
|
|
2390 |
* cache for all our data, that's self-defeating. So we just
|
|
2391 |
* write pages one at a time. We use the ov structure to set
|
|
2392 |
* the write offset, to at least save the overhead of a Seek
|
|
2393 |
* system call.
|
|
2394 |
*/
|
|
2395 |
OVERLAPPED ov;
|
|
2396 |
memset(&ov, 0, sizeof(ov));
|
|
2397 |
for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
|
|
2398 |
size_t wsize;
|
|
2399 |
dp = txn->mt_u.dirty_list[i].mptr;
|
|
2400 |
DPRINTF("committing page %zu", dp->mp_pgno);
|
|
2401 |
size = dp->mp_pgno * env->me_psize;
|
|
2402 |
ov.Offset = size & 0xffffffff;
|
|
2403 |
ov.OffsetHigh = size >> 16;
|
|
2404 |
ov.OffsetHigh >>= 16;
|
|
2405 |
/* clear dirty flag */
|
|
2406 |
dp->mp_flags &= ~P_DIRTY;
|
|
2407 |
wsize = env->me_psize;
|
|
2408 |
if (IS_OVERFLOW(dp)) wsize *= dp->mp_pages;
|
|
2409 |
rc = WriteFile(env->me_fd, dp, wsize, NULL, &ov);
|
|
2410 |
if (!rc) {
|
|
2411 |
n = ErrCode();
|
|
2412 |
DPRINTF("WriteFile: %d", n);
|
|
2413 |
mdb_txn_abort(txn);
|
|
2414 |
return n;
|
|
2415 |
}
|
|
2416 |
}
|
|
2417 |
done = 1;
|
|
2418 |
#else
|
|
2419 |
struct iovec iov[MDB_COMMIT_PAGES];
|
|
2420 |
n = 0;
|
|
2421 |
done = 1;
|
|
2422 |
size = 0;
|
|
2423 |
for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
|
|
2424 |
dp = txn->mt_u.dirty_list[i].mptr;
|
|
2425 |
if (dp->mp_pgno != next) {
|
|
2426 |
if (n) {
|
|
2427 |
rc = writev(env->me_fd, iov, n);
|
|
2428 |
if (rc != size) {
|
|
2429 |
n = ErrCode();
|
|
2430 |
if (rc > 0)
|
|
2431 |
DPUTS("short write, filesystem full?");
|
|
2432 |
else
|
|
2433 |
DPRINTF("writev: %s", strerror(n));
|
|
2434 |
mdb_txn_abort(txn);
|
|
2435 |
return n;
|
|
2436 |
}
|
|
2437 |
n = 0;
|
|
2438 |
size = 0;
|
|
2439 |
}
|
|
2440 |
lseek(env->me_fd, dp->mp_pgno * env->me_psize, SEEK_SET);
|
|
2441 |
next = dp->mp_pgno;
|
|
2442 |
}
|
|
2443 |
DPRINTF("committing page %zu", dp->mp_pgno);
|
|
2444 |
iov[n].iov_len = env->me_psize;
|
|
2445 |
if (IS_OVERFLOW(dp)) iov[n].iov_len *= dp->mp_pages;
|
|
2446 |
iov[n].iov_base = (char *)dp;
|
|
2447 |
size += iov[n].iov_len;
|
|
2448 |
next = dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1);
|
|
2449 |
/* clear dirty flag */
|
|
2450 |
dp->mp_flags &= ~P_DIRTY;
|
|
2451 |
if (++n >= MDB_COMMIT_PAGES) {
|
|
2452 |
done = 0;
|
|
2453 |
i++;
|
|
2454 |
break;
|
|
2455 |
}
|
|
2456 |
}
|
|
2457 |
|
|
2458 |
if (n == 0)
|
|
2459 |
break;
|
|
2460 |
|
|
2461 |
rc = writev(env->me_fd, iov, n);
|
|
2462 |
if (rc != size) {
|
|
2463 |
n = ErrCode();
|
|
2464 |
if (rc > 0)
|
|
2465 |
DPUTS("short write, filesystem full?");
|
|
2466 |
else
|
|
2467 |
DPRINTF("writev: %s", strerror(n));
|
|
2468 |
mdb_txn_abort(txn);
|
|
2469 |
return n;
|
|
2470 |
}
|
|
2471 |
#endif
|
|
2472 |
} while (!done);
|
|
2473 |
|
|
2474 |
/* Drop the dirty pages.
|
|
2475 |
*/
|
|
2476 |
for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
|
|
2477 |
dp = txn->mt_u.dirty_list[i].mptr;
|
|
2478 |
if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
|
|
2479 |
mdb_page_free(txn->mt_env, dp);
|
|
2480 |
} else {
|
|
2481 |
VGMEMP_FREE(txn->mt_env, dp);
|
|
2482 |
free(dp);
|
|
2483 |
}
|
|
2484 |
txn->mt_u.dirty_list[i].mid = 0;
|
|
2485 |
}
|
|
2486 |
txn->mt_u.dirty_list[0].mid = 0;
|
|
2487 |
|
|
2488 |
sync:
|
|
2489 |
if ((n = mdb_env_sync(env, 0)) != 0 ||
|
|
2490 |
(n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
|
|
2491 |
mdb_txn_abort(txn);
|
|
2492 |
return n;
|
|
2493 |
}
|
|
2494 |
|
|
2495 |
done:
|
|
2496 |
env->me_pglast = 0;
|
|
2497 |
env->me_txn = NULL;
|
|
2498 |
/* update the DB flags */
|
|
2499 |
for (i = 2; i<txn->mt_numdbs; i++) {
|
|
2500 |
if (txn->mt_dbflags[i] & DB_NEW)
|
|
2501 |
env->me_dbflags[i] = txn->mt_dbs[i].md_flags | MDB_VALID;
|
|
2502 |
}
|
|
2503 |
if (txn->mt_numdbs > env->me_numdbs)
|
|
2504 |
env->me_numdbs = txn->mt_numdbs;
|
|
2505 |
|
|
2506 |
UNLOCK_MUTEX_W(env);
|
|
2507 |
free(txn);
|
|
2508 |
|
|
2509 |
return MDB_SUCCESS;
|
|
2510 |
}
|
|
2511 |
|
|
2512 |
/** Read the environment parameters of a DB environment before
|
|
2513 |
* mapping it into memory.
|
|
2514 |
* @param[in] env the environment handle
|
|
2515 |
* @param[out] meta address of where to store the meta information
|
|
2516 |
* @return 0 on success, non-zero on failure.
|
|
2517 |
*/
|
|
2518 |
static int
|
|
2519 |
mdb_env_read_header(MDB_env *env, MDB_meta *meta)
|
|
2520 |
{
|
|
2521 |
MDB_pagebuf pbuf;
|
|
2522 |
MDB_page *p;
|
|
2523 |
MDB_meta *m;
|
|
2524 |
int i, rc, err;
|
|
2525 |
|
|
2526 |
/* We don't know the page size yet, so use a minimum value.
|
|
2527 |
* Read both meta pages so we can use the latest one.
|
|
2528 |
*/
|
|
2529 |
|
|
2530 |
for (i=0; i<2; i++) {
|
|
2531 |
#ifdef _WIN32
|
|
2532 |
if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
|
|
2533 |
#else
|
|
2534 |
if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0)
|
|
2535 |
#endif
|
|
2536 |
{
|
|
2537 |
return ENOENT;
|
|
2538 |
}
|
|
2539 |
else if (rc != MDB_PAGESIZE) {
|
|
2540 |
err = ErrCode();
|
|
2541 |
if (rc > 0)
|
|
2542 |
err = MDB_INVALID;
|
|
2543 |
DPRINTF("read: %s", strerror(err));
|
|
2544 |
return err;
|
|
2545 |
}
|
|
2546 |
|
|
2547 |
p = (MDB_page *)&pbuf;
|
|
2548 |
|
|
2549 |
if (!F_ISSET(p->mp_flags, P_META)) {
|
|
2550 |
DPRINTF("page %zu not a meta page", p->mp_pgno);
|
|
2551 |
return MDB_INVALID;
|
|
2552 |
}
|
|
2553 |
|
|
2554 |
m = METADATA(p);
|
|
2555 |
if (m->mm_magic != MDB_MAGIC) {
|
|
2556 |
DPUTS("meta has invalid magic");
|
|
2557 |
return MDB_INVALID;
|
|
2558 |
}
|
|
2559 |
|
|
2560 |
if (m->mm_version != MDB_VERSION) {
|
|
2561 |
DPRINTF("database is version %u, expected version %u",
|
|
2562 |
m->mm_version, MDB_VERSION);
|
|
2563 |
return MDB_VERSION_MISMATCH;
|
|
2564 |
}
|
|
2565 |
|
|
2566 |
if (i) {
|
|
2567 |
if (m->mm_txnid > meta->mm_txnid)
|
|
2568 |
memcpy(meta, m, sizeof(*m));
|
|
2569 |
} else {
|
|
2570 |
memcpy(meta, m, sizeof(*m));
|
|
2571 |
#ifdef _WIN32
|
|
2572 |
if (SetFilePointer(env->me_fd, meta->mm_psize, NULL, FILE_BEGIN) != meta->mm_psize)
|
|
2573 |
#else
|
|
2574 |
if (lseek(env->me_fd, meta->mm_psize, SEEK_SET) != meta->mm_psize)
|
|
2575 |
#endif
|
|
2576 |
return ErrCode();
|
|
2577 |
}
|
|
2578 |
}
|
|
2579 |
return 0;
|
|
2580 |
}
|
|
2581 |
|
|
2582 |
/** Write the environment parameters of a freshly created DB environment.
|
|
2583 |
* @param[in] env the environment handle
|
|
2584 |
* @param[out] meta address of where to store the meta information
|
|
2585 |
* @return 0 on success, non-zero on failure.
|
|
2586 |
*/
|
|
2587 |
static int
|
|
2588 |
mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
|
|
2589 |
{
|
|
2590 |
MDB_page *p, *q;
|
|
2591 |
MDB_meta *m;
|
|
2592 |
int rc;
|
|
2593 |
unsigned int psize;
|
|
2594 |
|
|
2595 |
DPUTS("writing new meta page");
|
|
2596 |
|
|
2597 |
GET_PAGESIZE(psize);
|
|
2598 |
|
|
2599 |
meta->mm_magic = MDB_MAGIC;
|
|
2600 |
meta->mm_version = MDB_VERSION;
|
|
2601 |
meta->mm_mapsize = env->me_mapsize;
|
|
2602 |
meta->mm_psize = psize;
|
|
2603 |
meta->mm_last_pg = 1;
|
|
2604 |
meta->mm_flags = env->me_flags & 0xffff;
|
|
2605 |
meta->mm_flags |= MDB_INTEGERKEY;
|
|
2606 |
meta->mm_dbs[0].md_root = P_INVALID;
|
|
2607 |
meta->mm_dbs[1].md_root = P_INVALID;
|
|
2608 |
|
|
2609 |
p = calloc(2, psize);
|
|
2610 |
p->mp_pgno = 0;
|
|
2611 |
p->mp_flags = P_META;
|
|
2612 |
|
|
2613 |
m = METADATA(p);
|
|
2614 |
memcpy(m, meta, sizeof(*meta));
|
|
2615 |
|
|
2616 |
q = (MDB_page *)((char *)p + psize);
|
|
2617 |
|
|
2618 |
q->mp_pgno = 1;
|
|
2619 |
q->mp_flags = P_META;
|
|
2620 |
|
|
2621 |
m = METADATA(q);
|
|
2622 |
memcpy(m, meta, sizeof(*meta));
|
|
2623 |
|
|
2624 |
#ifdef _WIN32
|
|
2625 |
{
|
|
2626 |
DWORD len;
|
|
2627 |
SetFilePointer(env->me_fd, 0, NULL, FILE_BEGIN);
|
|
2628 |
rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL);
|
|
2629 |
rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode();
|
|
2630 |
}
|
|
2631 |
#else
|
|
2632 |
lseek(env->me_fd, 0, SEEK_SET);
|
|
2633 |
rc = write(env->me_fd, p, psize * 2);
|
|
2634 |
rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode();
|
|
2635 |
#endif
|
|
2636 |
free(p);
|
|
2637 |
return rc;
|
|
2638 |
}
|
|
2639 |
|
|
2640 |
/** Update the environment info to commit a transaction.
|
|
2641 |
* @param[in] txn the transaction that's being committed
|
|
2642 |
* @return 0 on success, non-zero on failure.
|
|
2643 |
*/
|
|
2644 |
static int
|
|
2645 |
mdb_env_write_meta(MDB_txn *txn)
|
|
2646 |
{
|
|
2647 |
MDB_env *env;
|
|
2648 |
MDB_meta meta, metab, *mp;
|
|
2649 |
off_t off;
|
|
2650 |
int rc, len, toggle;
|
|
2651 |
char *ptr;
|
|
2652 |
HANDLE mfd;
|
|
2653 |
#ifdef _WIN32
|
|
2654 |
OVERLAPPED ov;
|
|
2655 |
#endif
|
|
2656 |
|
|
2657 |
assert(txn != NULL);
|
|
2658 |
assert(txn->mt_env != NULL);
|
|
2659 |
|
|
2660 |
toggle = !txn->mt_toggle;
|
|
2661 |
DPRINTF("writing meta page %d for root page %zu",
|
|
2662 |
toggle, txn->mt_dbs[MAIN_DBI].md_root);
|
|
2663 |
|
|
2664 |
env = txn->mt_env;
|
|
2665 |
mp = env->me_metas[toggle];
|
|
2666 |
|
|
2667 |
if (env->me_flags & MDB_WRITEMAP) {
|
|
2668 |
/* Persist any increases of mapsize config */
|
|
2669 |
if (env->me_mapsize > mp->mm_mapsize)
|
|
2670 |
mp->mm_mapsize = env->me_mapsize;
|
|
2671 |
mp->mm_dbs[0] = txn->mt_dbs[0];
|
|
2672 |
mp->mm_dbs[1] = txn->mt_dbs[1];
|
|
2673 |
mp->mm_last_pg = txn->mt_next_pgno - 1;
|
|
2674 |
mp->mm_txnid = txn->mt_txnid;
|
|
2675 |
if (!(env->me_flags & (MDB_NOMETASYNC|MDB_NOSYNC))) {
|
|
2676 |
rc = (env->me_flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
|
|
2677 |
ptr = env->me_map;
|
|
2678 |
if (toggle)
|
|
2679 |
ptr += env->me_psize;
|
|
2680 |
if (MDB_MSYNC(ptr, env->me_psize, rc)) {
|
|
2681 |
rc = ErrCode();
|
|
2682 |
goto fail;
|
|
2683 |
}
|
|
2684 |
}
|
|
2685 |
goto done;
|
|
2686 |
}
|
|
2687 |
metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
|
|
2688 |
metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
|
|
2689 |
|
|
2690 |
ptr = (char *)&meta;
|
|
2691 |
if (env->me_mapsize > mp->mm_mapsize) {
|
|
2692 |
/* Persist any increases of mapsize config */
|
|
2693 |
meta.mm_mapsize = env->me_mapsize;
|
|
2694 |
off = offsetof(MDB_meta, mm_mapsize);
|
|
2695 |
} else {
|
|
2696 |
off = offsetof(MDB_meta, mm_dbs[0].md_depth);
|
|
2697 |
}
|
|
2698 |
len = sizeof(MDB_meta) - off;
|
|
2699 |
|
|
2700 |
ptr += off;
|
|
2701 |
meta.mm_dbs[0] = txn->mt_dbs[0];
|
|
2702 |
meta.mm_dbs[1] = txn->mt_dbs[1];
|
|
2703 |
meta.mm_last_pg = txn->mt_next_pgno - 1;
|
|
2704 |
meta.mm_txnid = txn->mt_txnid;
|
|
2705 |
|
|
2706 |
if (toggle)
|
|
2707 |
off += env->me_psize;
|
|
2708 |
off += PAGEHDRSZ;
|
|
2709 |
|
|
2710 |
/* Write to the SYNC fd */
|
|
2711 |
mfd = env->me_flags & (MDB_NOSYNC|MDB_NOMETASYNC) ?
|
|
2712 |
env->me_fd : env->me_mfd;
|
|
2713 |
#ifdef _WIN32
|
|
2714 |
{
|
|
2715 |
memset(&ov, 0, sizeof(ov));
|
|
2716 |
ov.Offset = off;
|
|
2717 |
WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov);
|
|
2718 |
}
|
|
2719 |
#else
|
|
2720 |
rc = pwrite(mfd, ptr, len, off);
|
|
2721 |
#endif
|
|
2722 |
if (rc != len) {
|
|
2723 |
int r2;
|
|
2724 |
rc = ErrCode();
|
|
2725 |
DPUTS("write failed, disk error?");
|
|
2726 |
/* On a failure, the pagecache still contains the new data.
|
|
2727 |
* Write some old data back, to prevent it from being used.
|
|
2728 |
* Use the non-SYNC fd; we know it will fail anyway.
|
|
2729 |
*/
|
|
2730 |
meta.mm_last_pg = metab.mm_last_pg;
|
|
2731 |
meta.mm_txnid = metab.mm_txnid;
|
|
2732 |
#ifdef _WIN32
|
|
2733 |
WriteFile(env->me_fd, ptr, len, NULL, &ov);
|
|
2734 |
#else
|
|
2735 |
r2 = pwrite(env->me_fd, ptr, len, off);
|
|
2736 |
#endif
|
|
2737 |
fail:
|
|
2738 |
env->me_flags |= MDB_FATAL_ERROR;
|
|
2739 |
return rc;
|
|
2740 |
}
|
|
2741 |
done:
|
|
2742 |
/* Memory ordering issues are irrelevant; since the entire writer
|
|
2743 |
* is wrapped by wmutex, all of these changes will become visible
|
|
2744 |
* after the wmutex is unlocked. Since the DB is multi-version,
|
|
2745 |
* readers will get consistent data regardless of how fresh or
|
|
2746 |
* how stale their view of these values is.
|
|
2747 |
*/
|
|
2748 |
txn->mt_env->me_txns->mti_txnid = txn->mt_txnid;
|
|
2749 |
|
|
2750 |
return MDB_SUCCESS;
|
|
2751 |
}
|
|
2752 |
|
|
2753 |
/** Check both meta pages to see which one is newer.
|
|
2754 |
* @param[in] env the environment handle
|
|
2755 |
* @return meta toggle (0 or 1).
|
|
2756 |
*/
|
|
2757 |
static int
|
|
2758 |
mdb_env_pick_meta(const MDB_env *env)
|
|
2759 |
{
|
|
2760 |
return (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid);
|
|
2761 |
}
|
|
2762 |
|
|
2763 |
int
|
|
2764 |
mdb_env_create(MDB_env **env)
|
|
2765 |
{
|
|
2766 |
MDB_env *e;
|
|
2767 |
|
|
2768 |
e = calloc(1, sizeof(MDB_env));
|
|
2769 |
if (!e)
|
|
2770 |
return ENOMEM;
|
|
2771 |
|
|
2772 |
e->me_free_pgs = mdb_midl_alloc();
|
|
2773 |
if (!e->me_free_pgs) {
|
|
2774 |
free(e);
|
|
2775 |
return ENOMEM;
|
|
2776 |
}
|
|
2777 |
e->me_maxreaders = DEFAULT_READERS;
|
|
2778 |
e->me_maxdbs = 2;
|
|
2779 |
e->me_fd = INVALID_HANDLE_VALUE;
|
|
2780 |
e->me_lfd = INVALID_HANDLE_VALUE;
|
|
2781 |
e->me_mfd = INVALID_HANDLE_VALUE;
|
|
2782 |
#ifdef MDB_USE_POSIX_SEM
|
|
2783 |
e->me_rmutex = SEM_FAILED;
|
|
2784 |
e->me_wmutex = SEM_FAILED;
|
|
2785 |
#endif
|
|
2786 |
e->me_pid = getpid();
|
|
2787 |
VGMEMP_CREATE(e,0,0);
|
|
2788 |
*env = e;
|
|
2789 |
return MDB_SUCCESS;
|
|
2790 |
}
|
|
2791 |
|
|
2792 |
int
|
|
2793 |
mdb_env_set_mapsize(MDB_env *env, size_t size)
|
|
2794 |
{
|
|
2795 |
if (env->me_map)
|
|
2796 |
return EINVAL;
|
|
2797 |
env->me_mapsize = size;
|
|
2798 |
if (env->me_psize)
|
|
2799 |
env->me_maxpg = env->me_mapsize / env->me_psize;
|
|
2800 |
return MDB_SUCCESS;
|
|
2801 |
}
|
|
2802 |
|
|
2803 |
int
|
|
2804 |
mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
|
|
2805 |
{
|
|
2806 |
if (env->me_map)
|
|
2807 |
return EINVAL;
|
|
2808 |
env->me_maxdbs = dbs + 2; /* Named databases + main and free DB */
|
|
2809 |
return MDB_SUCCESS;
|
|
2810 |
}
|
|
2811 |
|
|
2812 |
int
|
|
2813 |
mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
|
|
2814 |
{
|
|
2815 |
if (env->me_map || readers < 1)
|
|
2816 |
return EINVAL;
|
|
2817 |
env->me_maxreaders = readers;
|
|
2818 |
return MDB_SUCCESS;
|
|
2819 |
}
|
|
2820 |
|
|
2821 |
int
|
|
2822 |
mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
|
|
2823 |
{
|
|
2824 |
if (!env || !readers)
|
|
2825 |
return EINVAL;
|
|
2826 |
*readers = env->me_maxreaders;
|
|
2827 |
return MDB_SUCCESS;
|
|
2828 |
}
|
|
2829 |
|
|
2830 |
/** Further setup required for opening an MDB environment
|
|
2831 |
*/
|
|
2832 |
static int
|
|
2833 |
mdb_env_open2(MDB_env *env)
|
|
2834 |
{
|
|
2835 |
unsigned int flags = env->me_flags;
|
|
2836 |
int i, newenv = 0, prot;
|
|
2837 |
MDB_meta meta;
|
|
2838 |
MDB_page *p;
|
|
2839 |
|
|
2840 |
memset(&meta, 0, sizeof(meta));
|
|
2841 |
|
|
2842 |
if ((i = mdb_env_read_header(env, &meta)) != 0) {
|
|
2843 |
if (i != ENOENT)
|
|
2844 |
return i;
|
|
2845 |
DPUTS("new mdbenv");
|
|
2846 |
newenv = 1;
|
|
2847 |
}
|
|
2848 |
|
|
2849 |
/* Was a mapsize configured? */
|
|
2850 |
if (!env->me_mapsize) {
|
|
2851 |
/* If this is a new environment, take the default,
|
|
2852 |
* else use the size recorded in the existing env.
|
|
2853 |
*/
|
|
2854 |
env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
|
|
2855 |
} else if (env->me_mapsize < meta.mm_mapsize) {
|
|
2856 |
/* If the configured size is smaller, make sure it's
|
|
2857 |
* still big enough. Silently round up to minimum if not.
|
|
2858 |
*/
|
|
2859 |
size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
|
|
2860 |
if (env->me_mapsize < minsize)
|
|
2861 |
env->me_mapsize = minsize;
|
|
2862 |
}
|
|
2863 |
|
|
2864 |
#ifdef _WIN32
|
|
2865 |
{
|
|
2866 |
HANDLE mh;
|
|
2867 |
LONG sizelo, sizehi;
|
|
2868 |
sizelo = env->me_mapsize & 0xffffffff;
|
|
2869 |
sizehi = env->me_mapsize >> 16; /* pointless on WIN32, only needed on W64 */
|
|
2870 |
sizehi >>= 16;
|
|
2871 |
/* Windows won't create mappings for zero length files.
|
|
2872 |
* Just allocate the maxsize right now.
|
|
2873 |
*/
|
|
2874 |
if (newenv) {
|
|
2875 |
SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0);
|
|
2876 |
if (!SetEndOfFile(env->me_fd))
|
|
2877 |
return ErrCode();
|
|
2878 |
SetFilePointer(env->me_fd, 0, NULL, 0);
|
|
2879 |
}
|
|
2880 |
mh = CreateFileMapping(env->me_fd, NULL, flags & MDB_WRITEMAP ?
|
|
2881 |
PAGE_READWRITE : PAGE_READONLY,
|
|
2882 |
sizehi, sizelo, NULL);
|
|
2883 |
if (!mh)
|
|
2884 |
return ErrCode();
|
|
2885 |
env->me_map = MapViewOfFileEx(mh, flags & MDB_WRITEMAP ?
|
|
2886 |
FILE_MAP_WRITE : FILE_MAP_READ,
|
|
2887 |
0, 0, env->me_mapsize, meta.mm_address);
|
|
2888 |
CloseHandle(mh);
|
|
2889 |
if (!env->me_map)
|
|
2890 |
return ErrCode();
|
|
2891 |
}
|
|
2892 |
#else
|
|
2893 |
i = MAP_SHARED;
|
|
2894 |
prot = PROT_READ;
|
|
2895 |
if (flags & MDB_WRITEMAP) {
|
|
2896 |
prot |= PROT_WRITE;
|
|
2897 |
if (ftruncate(env->me_fd, env->me_mapsize) < 0)
|
|
2898 |
return ErrCode();
|
|
2899 |
}
|
|
2900 |
env->me_map = mmap(meta.mm_address, env->me_mapsize, prot, i,
|
|
2901 |
env->me_fd, 0);
|
|
2902 |
if (env->me_map == MAP_FAILED) {
|
|
2903 |
env->me_map = NULL;
|
|
2904 |
return ErrCode();
|
|
2905 |
}
|
|
2906 |
/* Turn off readahead. It's harmful when the DB is larger than RAM. */
|
|
2907 |
#ifdef MADV_RANDOM
|
|
2908 |
madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
|
|
2909 |
#else
|
|
2910 |
#ifdef POSIX_MADV_RANDOM
|
|
2911 |
posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
|
|
2912 |
#endif /* POSIX_MADV_RANDOM */
|
|
2913 |
#endif /* MADV_RANDOM */
|
|
2914 |
#endif /* _WIN32 */
|
|
2915 |
|
|
2916 |
if (newenv) {
|
|
2917 |
if (flags & MDB_FIXEDMAP)
|
|
2918 |
meta.mm_address = env->me_map;
|
|
2919 |
i = mdb_env_init_meta(env, &meta);
|
|
2920 |
if (i != MDB_SUCCESS) {
|
|
2921 |
return i;
|
|
2922 |
}
|
|
2923 |
} else if (meta.mm_address && env->me_map != meta.mm_address) {
|
|
2924 |
/* Can happen because the address argument to mmap() is just a
|
|
2925 |
* hint. mmap() can pick another, e.g. if the range is in use.
|
|
2926 |
* The MAP_FIXED flag would prevent that, but then mmap could
|
|
2927 |
* instead unmap existing pages to make room for the new map.
|
|
2928 |
*/
|
|
2929 |
return EBUSY; /* TODO: Make a new MDB_* error code? */
|
|
2930 |
}
|
|
2931 |
env->me_psize = meta.mm_psize;
|
|
2932 |
env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
|
|
2933 |
env->me_nodemax = (env->me_psize - PAGEHDRSZ) / MDB_MINKEYS;
|
|
2934 |
|
|
2935 |
env->me_maxpg = env->me_mapsize / env->me_psize;
|
|
2936 |
|
|
2937 |
p = (MDB_page *)env->me_map;
|
|
2938 |
env->me_metas[0] = METADATA(p);
|
|
2939 |
env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize);
|
|
2940 |
|
|
2941 |
#if MDB_DEBUG
|
|
2942 |
{
|
|
2943 |
int toggle = mdb_env_pick_meta(env);
|
|
2944 |
MDB_db *db = &env->me_metas[toggle]->mm_dbs[MAIN_DBI];
|
|
2945 |
|
|
2946 |
DPRINTF("opened database version %u, pagesize %u",
|
|
2947 |
env->me_metas[0]->mm_version, env->me_psize);
|
|
2948 |
DPRINTF("using meta page %d", toggle);
|
|
2949 |
DPRINTF("depth: %u", db->md_depth);
|
|
2950 |
DPRINTF("entries: %zu", db->md_entries);
|
|
2951 |
DPRINTF("branch pages: %zu", db->md_branch_pages);
|
|
2952 |
DPRINTF("leaf pages: %zu", db->md_leaf_pages);
|
|
2953 |
DPRINTF("overflow pages: %zu", db->md_overflow_pages);
|
|
2954 |
DPRINTF("root: %zu", db->md_root);
|
|
2955 |
}
|
|
2956 |
#endif
|
|
2957 |
|
|
2958 |
return MDB_SUCCESS;
|
|
2959 |
}
|
|
2960 |
|
|
2961 |
|
|
2962 |
/** Release a reader thread's slot in the reader lock table.
|
|
2963 |
* This function is called automatically when a thread exits.
|
|
2964 |
* @param[in] ptr This points to the slot in the reader lock table.
|
|
2965 |
*/
|
|
2966 |
static void
|
|
2967 |
mdb_env_reader_dest(void *ptr)
|
|
2968 |
{
|
|
2969 |
MDB_reader *reader = ptr;
|
|
2970 |
|
|
2971 |
reader->mr_pid = 0;
|
|
2972 |
}
|
|
2973 |
|
|
2974 |
#ifdef _WIN32
/** Junk for arranging thread-specific callbacks on Windows. This is
 *	necessarily platform and compiler-specific. Windows supports up
 *	to 1088 keys. Let's assume nobody opens more than 64 environments
 *	in a single process, for now. They can override this if needed.
 */
#ifndef MAX_TLS_KEYS
#define MAX_TLS_KEYS	64
#endif
static pthread_key_t mdb_tls_keys[MAX_TLS_KEYS];
static int mdb_tls_nkeys;

/** TLS callback: on thread detach, release the reader slot stored
 * under each registered key. All other notifications are ignored.
 * @param[in] module unused
 * @param[in] reason the DLL_* notification code
 * @param[in] ptr unused
 */
static void NTAPI mdb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
{
	int i;
	switch(reason) {
	case DLL_PROCESS_ATTACH: break;
	case DLL_THREAD_ATTACH: break;
	case DLL_THREAD_DETACH:
		for (i=0; i<mdb_tls_nkeys; i++) {
			MDB_reader *r = pthread_getspecific(mdb_tls_keys[i]);
			/* A thread that never stored a value under this key
			 * has nothing to release; do not dereference NULL.
			 */
			if (r) {
				mdb_env_reader_dest(r);
			}
		}
		break;
	case DLL_PROCESS_DETACH: break;
	}
}
#ifdef __GNUC__
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#else
PIMAGE_TLS_CALLBACK mdb_tls_cbp __attribute__((section (".CRT$XLB"))) = mdb_tls_callback;
#endif
#else
#ifdef _WIN64
/* Force some symbol references.
 *	_tls_used forces the linker to create the TLS directory if not already done
 *	mdb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
 */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:mdb_tls_cbp")
#pragma const_seg(".CRT$XLB")
/* Declare the callback pointer itself; mdb_tls_callback is the static
 * function above and must not be redeclared as an object.
 */
extern const PIMAGE_TLS_CALLBACK mdb_tls_cbp;
const PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma const_seg()
#else	/* WIN32 */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_mdb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK mdb_tls_cbp = mdb_tls_callback;
#pragma data_seg()
#endif	/* WIN 32/64 */
#endif	/* !__GNUC__ */
#endif
|
|
3028 |
|
|
3029 |
/** Downgrade the exclusive lock on the region back to shared */
|
|
3030 |
static int
|
|
3031 |
mdb_env_share_locks(MDB_env *env, int *excl)
|
|
3032 |
{
|
|
3033 |
int rc = 0, toggle = mdb_env_pick_meta(env);
|
|
3034 |
|
|
3035 |
env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
|
|
3036 |
|
|
3037 |
#ifdef _WIN32
|
|
3038 |
{
|
|
3039 |
OVERLAPPED ov;
|
|
3040 |
/* First acquire a shared lock. The Unlock will
|
|
3041 |
* then release the existing exclusive lock.
|
|
3042 |
*/
|
|
3043 |
memset(&ov, 0, sizeof(ov));
|
|
3044 |
if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
|
|
3045 |
rc = ErrCode();
|
|
3046 |
} else {
|
|
3047 |
UnlockFile(env->me_lfd, 0, 0, 1, 0);
|
|
3048 |
*excl = 0;
|
|
3049 |
}
|
|
3050 |
}
|
|
3051 |
#else
|
|
3052 |
{
|
|
3053 |
struct flock lock_info;
|
|
3054 |
/* The shared lock replaces the existing lock */
|
|
3055 |
memset((void *)&lock_info, 0, sizeof(lock_info));
|
|
3056 |
lock_info.l_type = F_RDLCK;
|
|
3057 |
lock_info.l_whence = SEEK_SET;
|
|
3058 |
lock_info.l_start = 0;
|
|
3059 |
lock_info.l_len = 1;
|
|
3060 |
while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
|
|
3061 |
(rc = ErrCode()) == EINTR) ;
|
|
3062 |
*excl = rc ? -1 : 0; /* error may mean we lost the lock */
|
|
3063 |
}
|
|
3064 |
#endif
|
|
3065 |
|
|
3066 |
return rc;
|
|
3067 |
}
|
|
3068 |
|
|
3069 |
/** Try to get exlusive lock, otherwise shared.
|
|
3070 |
* Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
|
|
3071 |
*/
|
|
3072 |
static int
|
|
3073 |
mdb_env_excl_lock(MDB_env *env, int *excl)
|
|
3074 |
{
|
|
3075 |
int rc = 0;
|
|
3076 |
#ifdef _WIN32
|
|
3077 |
if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
|
|
3078 |
*excl = 1;
|
|
3079 |
} else {
|
|
3080 |
OVERLAPPED ov;
|
|
3081 |
memset(&ov, 0, sizeof(ov));
|
|
3082 |
if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
|
|
3083 |
*excl = 0;
|
|
3084 |
} else {
|
|
3085 |
rc = ErrCode();
|
|
3086 |
}
|
|
3087 |
}
|
|
3088 |
#else
|
|
3089 |
struct flock lock_info;
|
|
3090 |
memset((void *)&lock_info, 0, sizeof(lock_info));
|
|
3091 |
lock_info.l_type = F_WRLCK;
|
|
3092 |
lock_info.l_whence = SEEK_SET;
|
|
3093 |
lock_info.l_start = 0;
|
|
3094 |
lock_info.l_len = 1;
|
|
3095 |
while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
|
|
3096 |
(rc = ErrCode()) == EINTR) ;
|
|
3097 |
if (!rc) {
|
|
3098 |
*excl = 1;
|
|
3099 |
} else
|
|
3100 |
# ifdef MDB_USE_POSIX_SEM
|
|
3101 |
if (*excl < 0) /* always true when !MDB_USE_POSIX_SEM */
|
|
3102 |
# endif
|
|
3103 |
{
|
|
3104 |
lock_info.l_type = F_RDLCK;
|
|
3105 |
while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
|
|
3106 |
(rc = ErrCode()) == EINTR) ;
|
|
3107 |
if (rc == 0)
|
|
3108 |
*excl = 0;
|
|
3109 |
}
|
|
3110 |
#endif
|
|
3111 |
return rc;
|
|
3112 |
}
|
|
3113 |
|
|
3114 |
#if defined(_WIN32) || defined(MDB_USE_POSIX_SEM)
|
|
3115 |
/*
|
|
3116 |
* hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
|
|
3117 |
*
|
|
3118 |
* @(#) $Revision: 5.1 $
|
|
3119 |
* @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
|
|
3120 |
* @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
|
|
3121 |
*
|
|
3122 |
* http://www.isthe.com/chongo/tech/comp/fnv/index.html
|
|
3123 |
*
|
|
3124 |
***
|
|
3125 |
*
|
|
3126 |
* Please do not copyright this code. This code is in the public domain.
|
|
3127 |
*
|
|
3128 |
* LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
|
|
3129 |
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
|
|
3130 |
* EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
|
|
3131 |
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
|
|
3132 |
* USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
|
|
3133 |
* OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
|
3134 |
* PERFORMANCE OF THIS SOFTWARE.
|
|
3135 |
*
|
|
3136 |
* By:
|
|
3137 |
* chongo <Landon Curt Noll> /\oo/\
|
|
3138 |
* http://www.isthe.com/chongo/
|
|
3139 |
*
|
|
3140 |
* Share and Enjoy! :-)
|
|
3141 |
*/
|
|
3142 |
|
|
3143 |
typedef unsigned long long mdb_hash_t;
|
|
3144 |
#define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
|
|
3145 |
|
|
3146 |
/** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
|
|
3147 |
* @param[in] str string to hash
|
|
3148 |
* @param[in] hval initial value for hash
|
|
3149 |
* @return 64 bit hash
|
|
3150 |
*
|
|
3151 |
* NOTE: To use the recommended 64 bit FNV-1a hash, use MDB_HASH_INIT as the
|
|
3152 |
* hval arg on the first call.
|
|
3153 |
*/
|
|
3154 |
static mdb_hash_t
|
|
3155 |
mdb_hash_val(MDB_val *val, mdb_hash_t hval)
|
|
3156 |
{
|
|
3157 |
unsigned char *s = (unsigned char *)val->mv_data; /* unsigned string */
|
|
3158 |
unsigned char *end = s + val->mv_size;
|
|
3159 |
/*
|
|
3160 |
* FNV-1a hash each octet of the string
|
|
3161 |
*/
|
|
3162 |
while (s < end) {
|
|
3163 |
/* xor the bottom with the current octet */
|
|
3164 |
hval ^= (mdb_hash_t)*s++;
|
|
3165 |
|
|
3166 |
/* multiply by the 64 bit FNV magic prime mod 2^64 */
|
|
3167 |
hval += (hval << 1) + (hval << 4) + (hval << 5) +
|
|
3168 |
(hval << 7) + (hval << 8) + (hval << 40);
|
|
3169 |
}
|
|
3170 |
/* return our new hash value */
|
|
3171 |
return hval;
|
|
3172 |
}
|
|
3173 |
|
|
3174 |
/** Hash the string and output the hash in hex.
|
|
3175 |
* @param[in] str string to hash
|
|
3176 |
* @param[out] hexbuf an array of 17 chars to hold the hash
|
|
3177 |
*/
|
|
3178 |
static void
|
|
3179 |
mdb_hash_hex(MDB_val *val, char *hexbuf)
|
|
3180 |
{
|
|
3181 |
int i;
|
|
3182 |
mdb_hash_t h = mdb_hash_val(val, MDB_HASH_INIT);
|
|
3183 |
for (i=0; i<8; i++) {
|
|
3184 |
hexbuf += sprintf(hexbuf, "%02x", (unsigned int)h & 0xff);
|
|
3185 |
h >>= 8;
|
|
3186 |
}
|
|
3187 |
}
|
|
3188 |
#endif
|
|
3189 |
|
|
3190 |
/** Open and/or initialize the lock region for the environment.
|
|
3191 |
* @param[in] env The MDB environment.
|
|
3192 |
* @param[in] lpath The pathname of the file used for the lock region.
|
|
3193 |
* @param[in] mode The Unix permissions for the file, if we create it.
|
|
3194 |
* @param[out] excl Resulting file lock type: -1 none, 0 shared, 1 exclusive
|
|
3195 |
* @return 0 on success, non-zero on failure.
|
|
3196 |
*/
|
|
3197 |
static int
|
|
3198 |
mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
|
|
3199 |
{
|
|
3200 |
int rc;
|
|
3201 |
off_t size, rsize;
|
|
3202 |
|
|
3203 |
*excl = -1;
|
|
3204 |
|
|
3205 |
#ifdef _WIN32
|
|
3206 |
if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
|
|
3207 |
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
|
|
3208 |
FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
|
|
3209 |
rc = ErrCode();
|
|
3210 |
if (rc == ERROR_WRITE_PROTECT && (env->me_flags & MDB_RDONLY)) {
|
|
3211 |
env->me_flags |= MDB_ROFS;
|
|
3212 |
return MDB_SUCCESS;
|
|
3213 |
}
|
|
3214 |
goto fail_errno;
|
|
3215 |
}
|
|
3216 |
/* Try to get exclusive lock. If we succeed, then
|
|
3217 |
* nobody is using the lock region and we should initialize it.
|
|
3218 |
*/
|
|
3219 |
if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
|
|
3220 |
size = GetFileSize(env->me_lfd, NULL);
|
|
3221 |
|
|
3222 |
#else
|
|
3223 |
#if !(O_CLOEXEC)
|
|
3224 |
{
|
|
3225 |
int fdflags;
|
|
3226 |
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
|
|
3227 |
rc = ErrCode();
|
|
3228 |
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
|
|
3229 |
env->me_flags |= MDB_ROFS;
|
|
3230 |
return MDB_SUCCESS;
|
|
3231 |
}
|
|
3232 |
goto fail_errno;
|
|
3233 |
}
|
|
3234 |
/* Lose record locks when exec*() */
|
|
3235 |
if ((fdflags = fcntl(env->me_lfd, F_GETFD) | FD_CLOEXEC) >= 0)
|
|
3236 |
fcntl(env->me_lfd, F_SETFD, fdflags);
|
|
3237 |
}
|
|
3238 |
#else /* O_CLOEXEC on Linux: Open file and set FD_CLOEXEC atomically */
|
|
3239 |
if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT|O_CLOEXEC, mode)) == -1) {
|
|
3240 |
rc = ErrCode();
|
|
3241 |
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
|
|
3242 |
env->me_flags |= MDB_ROFS;
|
|
3243 |
return MDB_SUCCESS;
|
|
3244 |
}
|
|
3245 |
goto fail_errno;
|
|
3246 |
}
|
|
3247 |
#endif
|
|
3248 |
|
|
3249 |
/* Try to get exclusive lock. If we succeed, then
|
|
3250 |
* nobody is using the lock region and we should initialize it.
|
|
3251 |
*/
|
|
3252 |
if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
|
|
3253 |
|
|
3254 |
size = lseek(env->me_lfd, 0, SEEK_END);
|
|
3255 |
#endif
|
|
3256 |
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
|
|
3257 |
if (size < rsize && *excl > 0) {
|
|
3258 |
#ifdef _WIN32
|
|
3259 |
SetFilePointer(env->me_lfd, rsize, NULL, 0);
|
|
3260 |
if (!SetEndOfFile(env->me_lfd)) goto fail_errno;
|
|
3261 |
#else
|
|
3262 |
if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
|
|
3263 |
#endif
|
|
3264 |
} else {
|
|
3265 |
rsize = size;
|
|
3266 |
size = rsize - sizeof(MDB_txninfo);
|
|
3267 |
env->me_maxreaders = size/sizeof(MDB_reader) + 1;
|
|
3268 |
}
|
|
3269 |
{
|
|
3270 |
#ifdef _WIN32
|
|
3271 |
HANDLE mh;
|
|
3272 |
mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
|
|
3273 |
0, 0, NULL);
|
|
3274 |
if (!mh) goto fail_errno;
|
|
3275 |
env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
|
|
3276 |
CloseHandle(mh);
|
|
3277 |
if (!env->me_txns) goto fail_errno;
|
|
3278 |
#else
|
|
3279 |
void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
|
|
3280 |
env->me_lfd, 0);
|
|
3281 |
if (m == MAP_FAILED) goto fail_errno;
|
|
3282 |
env->me_txns = m;
|
|
3283 |
#endif
|
|
3284 |
}
|
|
3285 |
if (*excl > 0) {
|
|
3286 |
#ifdef _WIN32
|
|
3287 |
BY_HANDLE_FILE_INFORMATION stbuf;
|
|
3288 |
struct {
|
|
3289 |
DWORD volume;
|
|
3290 |
DWORD nhigh;
|
|
3291 |
DWORD nlow;
|
|
3292 |
} idbuf;
|
|
3293 |
MDB_val val;
|
|
3294 |
char hexbuf[17];
|
|
3295 |
|
|
3296 |
if (!mdb_sec_inited) {
|
|
3297 |
InitializeSecurityDescriptor(&mdb_null_sd,
|
|
3298 |
SECURITY_DESCRIPTOR_REVISION);
|
|
3299 |
SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
|
|
3300 |
mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
|
|
3301 |
mdb_all_sa.bInheritHandle = FALSE;
|
|
3302 |
mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
|
|
3303 |
mdb_sec_inited = 1;
|
|
3304 |
}
|
|
3305 |
if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
|
|
3306 |
idbuf.volume = stbuf.dwVolumeSerialNumber;
|
|
3307 |
idbuf.nhigh = stbuf.nFileIndexHigh;
|
|
3308 |
idbuf.nlow = stbuf.nFileIndexLow;
|
|
3309 |
val.mv_data = &idbuf;
|
|
3310 |
val.mv_size = sizeof(idbuf);
|
|
3311 |
mdb_hash_hex(&val, hexbuf);
|
|
3312 |
sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", hexbuf);
|
|
3313 |
sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", hexbuf);
|
|
3314 |
env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
|
|
3315 |
if (!env->me_rmutex) goto fail_errno;
|
|
3316 |
env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
|
|
3317 |
if (!env->me_wmutex) goto fail_errno;
|
|
3318 |
#elif defined(MDB_USE_POSIX_SEM)
|
|
3319 |
struct stat stbuf;
|
|
3320 |
struct {
|
|
3321 |
dev_t dev;
|
|
3322 |
ino_t ino;
|
|
3323 |
} idbuf;
|
|
3324 |
MDB_val val;
|
|
3325 |
char hexbuf[17];
|
|
3326 |
|
|
3327 |
if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
|
|
3328 |
idbuf.dev = stbuf.st_dev;
|
|
3329 |
idbuf.ino = stbuf.st_ino;
|
|
3330 |
val.mv_data = &idbuf;
|
|
3331 |
val.mv_size = sizeof(idbuf);
|
|
3332 |
mdb_hash_hex(&val, hexbuf);
|
|
3333 |
sprintf(env->me_txns->mti_rmname, "/MDBr%s", hexbuf);
|
|
3334 |
sprintf(env->me_txns->mti_wmname, "/MDBw%s", hexbuf);
|
|
3335 |
/* Clean up after a previous run, if needed: Try to
|
|
3336 |
* remove both semaphores before doing anything else.
|
|
3337 |
*/
|
|
3338 |
sem_unlink(env->me_txns->mti_rmname);
|
|
3339 |
sem_unlink(env->me_txns->mti_wmname);
|
|
3340 |
env->me_rmutex = sem_open(env->me_txns->mti_rmname,
|
|
3341 |
O_CREAT|O_EXCL, mode, 1);
|
|
3342 |
if (env->me_rmutex == SEM_FAILED) goto fail_errno;
|
|
3343 |
env->me_wmutex = sem_open(env->me_txns->mti_wmname,
|
|
3344 |
O_CREAT|O_EXCL, mode, 1);
|
|
3345 |
if (env->me_wmutex == SEM_FAILED) goto fail_errno;
|
|
3346 |
#else /* MDB_USE_POSIX_SEM */
|
|
3347 |
pthread_mutexattr_t mattr;
|
|
3348 |
|
|
3349 |
if ((rc = pthread_mutexattr_init(&mattr))
|
|
3350 |
|| (rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED))
|
|
3351 |
|| (rc = pthread_mutex_init(&env->me_txns->mti_mutex, &mattr))
|
|
3352 |
|| (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
|
|
3353 |
goto fail;
|
|
3354 |
pthread_mutexattr_destroy(&mattr);
|
|
3355 |
#endif /* _WIN32 || MDB_USE_POSIX_SEM */
|
|
3356 |
|
|
3357 |
env->me_txns->mti_version = MDB_VERSION;
|
|
3358 |
env->me_txns->mti_magic = MDB_MAGIC;
|
|
3359 |
env->me_txns->mti_txnid = 0;
|
|
3360 |
env->me_txns->mti_numreaders = 0;
|
|
3361 |
|
|
3362 |
} else {
|
|
3363 |
if (env->me_txns->mti_magic != MDB_MAGIC) {
|
|
3364 |
DPUTS("lock region has invalid magic");
|
|
3365 |
rc = MDB_INVALID;
|
|
3366 |
goto fail;
|
|
3367 |
}
|
|
3368 |
if (env->me_txns->mti_version != MDB_VERSION) {
|
|
3369 |
DPRINTF("lock region is version %u, expected version %u",
|
|
3370 |
env->me_txns->mti_version, MDB_VERSION);
|
|
3371 |
rc = MDB_VERSION_MISMATCH;
|
|
3372 |
goto fail;
|
|
3373 |
}
|
|
3374 |
rc = ErrCode();
|
|
3375 |
if (rc != EACCES && rc != EAGAIN) {
|
|
3376 |
goto fail;
|
|
3377 |
}
|
|
3378 |
#ifdef _WIN32
|
|
3379 |
env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
|
|
3380 |
if (!env->me_rmutex) goto fail_errno;
|
|
3381 |
env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
|
|
3382 |
if (!env->me_wmutex) goto fail_errno;
|
|
3383 |
#elif defined(MDB_USE_POSIX_SEM)
|
|
3384 |
env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
|
|
3385 |
if (env->me_rmutex == SEM_FAILED) goto fail_errno;
|
|
3386 |
env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
|
|
3387 |
if (env->me_wmutex == SEM_FAILED) goto fail_errno;
|
|
3388 |
#endif
|
|
3389 |
}
|
|
3390 |
return MDB_SUCCESS;
|
|
3391 |
|
|
3392 |
fail_errno:
|
|
3393 |
rc = ErrCode();
|
|
3394 |
fail:
|
|
3395 |
return rc;
|
|
3396 |
}
|
|
3397 |
|
|
3398 |
/** Name appended to the environment path to form the lock file name */
#define LOCKNAME	"/lock.mdb"
/** Name appended to the environment path to form the data file name */
#define DATANAME	"/data.mdb"
/** Suffix appended to the path for the lock file when #MDB_NOSUBDIR is used */
#define LOCKSUFF	"-lock"
/** The subset of the @ref mdb_env flags that can be changed at runtime.
 *	Changing any other flag requires closing the environment and
 *	re-opening it with the new flags.
 */
#define	CHANGEABLE	(MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC)
/** Flags that mdb_env_open() accepts in addition to #CHANGEABLE;
 *	any flag outside both sets makes mdb_env_open() return EINVAL.
 */
#define	CHANGELESS	(MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY|MDB_WRITEMAP)
|
|
3410 |
|
|
3411 |
int
|
|
3412 |
mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mdb_mode_t mode)
|
|
3413 |
{
|
|
3414 |
int oflags, rc, len, excl;
|
|
3415 |
char *lpath, *dpath;
|
|
3416 |
|
|
3417 |
if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
|
|
3418 |
return EINVAL;
|
|
3419 |
|
|
3420 |
len = strlen(path);
|
|
3421 |
if (flags & MDB_NOSUBDIR) {
|
|
3422 |
rc = len + sizeof(LOCKSUFF) + len + 1;
|
|
3423 |
} else {
|
|
3424 |
rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
|
|
3425 |
}
|
|
3426 |
lpath = malloc(rc);
|
|
3427 |
if (!lpath)
|
|
3428 |
return ENOMEM;
|
|
3429 |
if (flags & MDB_NOSUBDIR) {
|
|
3430 |
dpath = lpath + len + sizeof(LOCKSUFF);
|
|
3431 |
sprintf(lpath, "%s" LOCKSUFF, path);
|
|
3432 |
strcpy(dpath, path);
|
|
3433 |
} else {
|
|
3434 |
dpath = lpath + len + sizeof(LOCKNAME);
|
|
3435 |
sprintf(lpath, "%s" LOCKNAME, path);
|
|
3436 |
sprintf(dpath, "%s" DATANAME, path);
|
|
3437 |
}
|
|
3438 |
|
|
3439 |
flags |= env->me_flags;
|
|
3440 |
/* silently ignore WRITEMAP if we're only getting read access */
|
|
3441 |
if (F_ISSET(flags, MDB_RDONLY|MDB_WRITEMAP))
|
|
3442 |
flags ^= MDB_WRITEMAP;
|
|
3443 |
env->me_flags = flags |= MDB_ENV_ACTIVE;
|
|
3444 |
|
|
3445 |
rc = mdb_env_setup_locks(env, lpath, mode, &excl);
|
|
3446 |
if (rc)
|
|
3447 |
goto leave;
|
|
3448 |
|
|
3449 |
#ifdef _WIN32
|
|
3450 |
if (F_ISSET(flags, MDB_RDONLY)) {
|
|
3451 |
oflags = GENERIC_READ;
|
|
3452 |
len = OPEN_EXISTING;
|
|
3453 |
} else {
|
|
3454 |
oflags = GENERIC_READ|GENERIC_WRITE;
|
|
3455 |
len = OPEN_ALWAYS;
|
|
3456 |
}
|
|
3457 |
mode = FILE_ATTRIBUTE_NORMAL;
|
|
3458 |
env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
|
|
3459 |
NULL, len, mode, NULL);
|
|
3460 |
#else
|
|
3461 |
if (F_ISSET(flags, MDB_RDONLY))
|
|
3462 |
oflags = O_RDONLY;
|
|
3463 |
else
|
|
3464 |
oflags = O_RDWR | O_CREAT;
|
|
3465 |
|
|
3466 |
env->me_fd = open(dpath, oflags, mode);
|
|
3467 |
#endif
|
|
3468 |
if (env->me_fd == INVALID_HANDLE_VALUE) {
|
|
3469 |
rc = ErrCode();
|
|
3470 |
goto leave;
|
|
3471 |
}
|
|
3472 |
|
|
3473 |
if ((rc = mdb_env_open2(env)) == MDB_SUCCESS) {
|
|
3474 |
if (flags & (MDB_RDONLY|MDB_WRITEMAP)) {
|
|
3475 |
env->me_mfd = env->me_fd;
|
|
3476 |
} else {
|
|
3477 |
/* Synchronous fd for meta writes. Needed even with
|
|
3478 |
* MDB_NOSYNC/MDB_NOMETASYNC, in case these get reset.
|
|
3479 |
*/
|
|
3480 |
#ifdef _WIN32
|
|
3481 |
env->me_mfd = CreateFile(dpath, oflags,
|
|
3482 |
FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, len,
|
|
3483 |
mode | FILE_FLAG_WRITE_THROUGH, NULL);
|
|
3484 |
#else
|
|
3485 |
env->me_mfd = open(dpath, oflags | MDB_DSYNC, mode);
|
|
3486 |
#endif
|
|
3487 |
if (env->me_mfd == INVALID_HANDLE_VALUE) {
|
|
3488 |
rc = ErrCode();
|
|
3489 |
goto leave;
|
|
3490 |
}
|
|
3491 |
}
|
|
3492 |
DPRINTF("opened dbenv %p", (void *) env);
|
|
3493 |
rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
|
|
3494 |
if (rc)
|
|
3495 |
goto leave;
|
|
3496 |
env->me_numdbs = 2; /* this notes that me_txkey was set */
|
|
3497 |
#ifdef _WIN32
|
|
3498 |
/* Windows TLS callbacks need help finding their TLS info. */
|
|
3499 |
if (mdb_tls_nkeys < MAX_TLS_KEYS)
|
|
3500 |
mdb_tls_keys[mdb_tls_nkeys++] = env->me_txkey;
|
|
3501 |
else {
|
|
3502 |
rc = MDB_TLS_FULL;
|
|
3503 |
goto leave;
|
|
3504 |
}
|
|
3505 |
#endif
|
|
3506 |
if (excl > 0) {
|
|
3507 |
rc = mdb_env_share_locks(env, &excl);
|
|
3508 |
if (rc)
|
|
3509 |
goto leave;
|
|
3510 |
}
|
|
3511 |
env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
|
|
3512 |
env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
|
|
3513 |
env->me_path = strdup(path);
|
|
3514 |
if (!env->me_dbxs || !env->me_dbflags || !env->me_path)
|
|
3515 |
rc = ENOMEM;
|
|
3516 |
}
|
|
3517 |
|
|
3518 |
leave:
|
|
3519 |
if (rc) {
|
|
3520 |
mdb_env_close0(env, excl);
|
|
3521 |
}
|
|
3522 |
free(lpath);
|
|
3523 |
return rc;
|
|
3524 |
}
|
|
3525 |
|
|
3526 |
/** Destroy resources from mdb_env_open() and clear our readers */
|
|
3527 |
static void
|
|
3528 |
mdb_env_close0(MDB_env *env, int excl)
|
|
3529 |
{
|
|
3530 |
int i;
|
|
3531 |
|
|
3532 |
if (!(env->me_flags & MDB_ENV_ACTIVE))
|
|
3533 |
return;
|
|
3534 |
|
|
3535 |
free(env->me_dbflags);
|
|
3536 |
free(env->me_dbxs);
|
|
3537 |
free(env->me_path);
|
|
3538 |
|
|
3539 |
if (env->me_numdbs) {
|
|
3540 |
pthread_key_delete(env->me_txkey);
|
|
3541 |
#ifdef _WIN32
|
|
3542 |
/* Delete our key from the global list */
|
|
3543 |
for (i=0; i<mdb_tls_nkeys; i++)
|
|
3544 |
if (mdb_tls_keys[i] == env->me_txkey) {
|
|
3545 |
mdb_tls_keys[i] = mdb_tls_keys[mdb_tls_nkeys-1];
|
|
3546 |
mdb_tls_nkeys--;
|
|
3547 |
break;
|
|
3548 |
}
|
|
3549 |
#endif
|
|
3550 |
}
|
|
3551 |
|
|
3552 |
if (env->me_map) {
|
|
3553 |
munmap(env->me_map, env->me_mapsize);
|
|
3554 |
}
|
|
3555 |
if (env->me_mfd != env->me_fd && env->me_mfd != INVALID_HANDLE_VALUE)
|
|
3556 |
close(env->me_mfd);
|
|
3557 |
if (env->me_fd != INVALID_HANDLE_VALUE)
|
|
3558 |
close(env->me_fd);
|
|
3559 |
if (env->me_txns) {
|
|
3560 |
pid_t pid = env->me_pid;
|
|
3561 |
/* Clearing readers is done in this function because
|
|
3562 |
* me_txkey with its destructor must be disabled first.
|
|
3563 |
*/
|
|
3564 |
for (i = env->me_numreaders; --i >= 0; )
|
|
3565 |
if (env->me_txns->mti_readers[i].mr_pid == pid)
|
|
3566 |
env->me_txns->mti_readers[i].mr_pid = 0;
|
|
3567 |
#ifdef _WIN32
|
|
3568 |
if (env->me_rmutex) {
|
|
3569 |
CloseHandle(env->me_rmutex);
|
|
3570 |
if (env->me_wmutex) CloseHandle(env->me_wmutex);
|
|
3571 |
}
|
|
3572 |
/* Windows automatically destroys the mutexes when
|
|
3573 |
* the last handle closes.
|
|
3574 |
*/
|
|
3575 |
#elif defined(MDB_USE_POSIX_SEM)
|
|
3576 |
if (env->me_rmutex != SEM_FAILED) {
|
|
3577 |
sem_close(env->me_rmutex);
|
|
3578 |
if (env->me_wmutex != SEM_FAILED)
|
|
3579 |
sem_close(env->me_wmutex);
|
|
3580 |
/* If we have the filelock: If we are the
|
|
3581 |
* only remaining user, clean up semaphores.
|
|
3582 |
*/
|
|
3583 |
if (excl == 0)
|
|
3584 |
mdb_env_excl_lock(env, &excl);
|
|
3585 |
if (excl > 0) {
|
|
3586 |
sem_unlink(env->me_txns->mti_rmname);
|
|
3587 |
sem_unlink(env->me_txns->mti_wmname);
|
|
3588 |
}
|
|
3589 |
}
|
|
3590 |
#endif
|
|
3591 |
munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
|
|
3592 |
}
|
|
3593 |
if (env->me_lfd != INVALID_HANDLE_VALUE) {
|
|
3594 |
#ifdef _WIN32
|
|
3595 |
if (excl >= 0) {
|
|
3596 |
/* Unlock the lockfile. Windows would have unlocked it
|
|
3597 |
* after closing anyway, but not necessarily at once.
|
|
3598 |
*/
|
|
3599 |
UnlockFile(env->me_lfd, 0, 0, 1, 0);
|
|
3600 |
}
|
|
3601 |
#endif
|
|
3602 |
close(env->me_lfd);
|
|
3603 |
}
|
|
3604 |
|
|
3605 |
env->me_flags &= ~MDB_ENV_ACTIVE;
|
|
3606 |
}
|
|
3607 |
|
|
3608 |
int
|
|
3609 |
mdb_env_copy(MDB_env *env, const char *path)
|
|
3610 |
{
|
|
3611 |
MDB_txn *txn = NULL;
|
|
3612 |
int rc, len;
|
|
3613 |
size_t wsize;
|
|
3614 |
char *lpath, *ptr;
|
|
3615 |
HANDLE newfd = INVALID_HANDLE_VALUE;
|
|
3616 |
|
|
3617 |
if (env->me_flags & MDB_NOSUBDIR) {
|
|
3618 |
lpath = (char *)path;
|
|
3619 |
} else {
|
|
3620 |
len = strlen(path);
|
|
3621 |
len += sizeof(DATANAME);
|
|
3622 |
lpath = malloc(len);
|
|
3623 |
if (!lpath)
|
|
3624 |
return ENOMEM;
|
|
3625 |
sprintf(lpath, "%s" DATANAME, path);
|
|
3626 |
}
|
|
3627 |
|
|
3628 |
/* The destination path must exist, but the destination file must not.
|
|
3629 |
* We don't want the OS to cache the writes, since the source data is
|
|
3630 |
* already in the OS cache.
|
|
3631 |
*/
|
|
3632 |
#ifdef _WIN32
|
|
3633 |
newfd = CreateFile(lpath, GENERIC_WRITE, 0, NULL, CREATE_NEW,
|
|
3634 |
FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH, NULL);
|
|
3635 |
#else
|
|
3636 |
newfd = open(lpath, O_WRONLY|O_CREAT|O_EXCL
|
|
3637 |
#ifdef O_DIRECT
|
|
3638 |
|O_DIRECT
|
|
3639 |
#endif
|
|
3640 |
, 0666);
|
|
3641 |
#endif
|
|
3642 |
if (!(env->me_flags & MDB_NOSUBDIR))
|
|
3643 |
free(lpath);
|
|
3644 |
if (newfd == INVALID_HANDLE_VALUE) {
|
|
3645 |
rc = ErrCode();
|
|
3646 |
goto leave;
|
|
3647 |
}
|
|
3648 |
|
|
3649 |
#ifdef F_NOCACHE /* __APPLE__ */
|
|
3650 |
rc = fcntl(newfd, F_NOCACHE, 1);
|
|
3651 |
if (rc) {
|
|
3652 |
rc = ErrCode();
|
|
3653 |
goto leave;
|
|
3654 |
}
|
|
3655 |
#endif
|
|
3656 |
|
|
3657 |
/* Do the lock/unlock of the reader mutex before starting the
|
|
3658 |
* write txn. Otherwise other read txns could block writers.
|
|
3659 |
*/
|
|
3660 |
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
|
|
3661 |
if (rc)
|
|
3662 |
goto leave;
|
|
3663 |
|
|
3664 |
if (!(env->me_flags & MDB_ROFS)) {
|
|
3665 |
/* We must start the actual read txn after blocking writers */
|
|
3666 |
mdb_txn_reset0(txn);
|
|
3667 |
|
|
3668 |
/* Temporarily block writers until we snapshot the meta pages */
|
|
3669 |
LOCK_MUTEX_W(env);
|
|
3670 |
|
|
3671 |
rc = mdb_txn_renew0(txn);
|
|
3672 |
if (rc) {
|
|
3673 |
UNLOCK_MUTEX_W(env);
|
|
3674 |
goto leave;
|
|
3675 |
}
|
|
3676 |
}
|
|
3677 |
|
|
3678 |
wsize = env->me_psize * 2;
|
|
3679 |
#ifdef _WIN32
|
|
3680 |
{
|
|
3681 |
DWORD len;
|
|
3682 |
rc = WriteFile(newfd, env->me_map, wsize, &len, NULL);
|
|
3683 |
rc = (len == wsize) ? MDB_SUCCESS : ErrCode();
|
|
3684 |
}
|
|
3685 |
#else
|
|
3686 |
rc = write(newfd, env->me_map, wsize);
|
|
3687 |
rc = (rc == (int)wsize) ? MDB_SUCCESS : ErrCode();
|
|
3688 |
#endif
|
|
3689 |
if (! (env->me_flags & MDB_ROFS))
|
|
3690 |
UNLOCK_MUTEX_W(env);
|
|
3691 |
|
|
3692 |
if (rc)
|
|
3693 |
goto leave;
|
|
3694 |
|
|
3695 |
ptr = env->me_map + wsize;
|
|
3696 |
wsize = txn->mt_next_pgno * env->me_psize - wsize;
|
|
3697 |
#define MAX_WRITE 2147483648U
|
|
3698 |
#ifdef _WIN32
|
|
3699 |
while (wsize > 0) {
|
|
3700 |
DWORD len, w2;
|
|
3701 |
if (wsize > MAX_WRITE)
|
|
3702 |
w2 = MAX_WRITE;
|
|
3703 |
else
|
|
3704 |
w2 = wsize;
|
|
3705 |
rc = WriteFile(newfd, ptr, w2, &len, NULL);
|
|
3706 |
rc = (len == w2) ? MDB_SUCCESS : ErrCode();
|
|
3707 |
if (rc) break;
|
|
3708 |
wsize -= w2;
|
|
3709 |
ptr += w2;
|
|
3710 |
}
|
|
3711 |
#else
|
|
3712 |
while (wsize > 0) {
|
|
3713 |
size_t w2;
|
|
3714 |
ssize_t wres;
|
|
3715 |
if (wsize > MAX_WRITE)
|
|
3716 |
w2 = MAX_WRITE;
|
|
3717 |
else
|
|
3718 |
w2 = wsize;
|
|
3719 |
wres = write(newfd, ptr, w2);
|
|
3720 |
rc = (wres > 0) ? MDB_SUCCESS : ErrCode();
|
|
3721 |
if (rc) break;
|
|
3722 |
wsize -= wres;
|
|
3723 |
ptr += wres;
|
|
3724 |
}
|
|
3725 |
#endif
|
|
3726 |
mdb_txn_abort(txn);
|
|
3727 |
|
|
3728 |
leave:
|
|
3729 |
if (newfd != INVALID_HANDLE_VALUE)
|
|
3730 |
close(newfd);
|
|
3731 |
|
|
3732 |
return rc;
|
|
3733 |
}
|
|
3734 |
|
|
3735 |
void
|
|
3736 |
mdb_env_close(MDB_env *env)
|
|
3737 |
{
|
|
3738 |
MDB_page *dp;
|
|
3739 |
int i;
|
|
3740 |
|
|
3741 |
if (env == NULL)
|
|
3742 |
return;
|
|
3743 |
|
|
3744 |
for (i = env->me_numdbs; --i > MAIN_DBI; )
|
|
3745 |
free(env->me_dbxs[i].md_name.mv_data);
|
|
3746 |
|
|
3747 |
VGMEMP_DESTROY(env);
|
|
3748 |
while ((dp = env->me_dpages) != NULL) {
|
|
3749 |
VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
|
|
3750 |
env->me_dpages = dp->mp_next;
|
|
3751 |
free(dp);
|
|
3752 |
}
|
|
3753 |
|
|
3754 |
mdb_env_close0(env, 0);
|
|
3755 |
mdb_midl_free(env->me_free_pgs);
|
|
3756 |
free(env);
|
|
3757 |
}
|
|
3758 |
|
|
3759 |
/** Compare two items pointing at aligned size_t's */
|
|
3760 |
static int
|
|
3761 |
mdb_cmp_long(const MDB_val *a, const MDB_val *b)
|
|
3762 |
{
|
|
3763 |
return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
|
|
3764 |
*(size_t *)a->mv_data > *(size_t *)b->mv_data;
|
|
3765 |
}
|
|
3766 |
|
|
3767 |
/** Compare two items pointing at aligned int's */
|
|
3768 |
static int
|
|
3769 |
mdb_cmp_int(const MDB_val *a, const MDB_val *b)
|
|
3770 |
{
|
|
3771 |
return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
|
|
3772 |
*(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
|
|
3773 |
}
|
|
3774 |
|
|
3775 |
/** Compare two items pointing at ints of unknown alignment.
|
|
3776 |
* Nodes and keys are guaranteed to be 2-byte aligned.
|
|
3777 |
*/
|
|
3778 |
static int
|
|
3779 |
mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
|
|
3780 |
{
|
|
3781 |
#if BYTE_ORDER == LITTLE_ENDIAN
|
|
3782 |
unsigned short *u, *c;
|
|
3783 |
int x;
|
|
3784 |
|
|
3785 |
u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
|
|
3786 |
c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
|
|
3787 |
do {
|
|
3788 |
x = *--u - *--c;
|
|
3789 |
} while(!x && u > (unsigned short *)a->mv_data);
|
|
3790 |
return x;
|
|
3791 |
#else
|
|
3792 |
return memcmp(a->mv_data, b->mv_data, a->mv_size);
|
|
3793 |
#endif
|
|
3794 |
}
|
|
3795 |
|
|
3796 |
/** Compare two items lexically */
|
|
3797 |
static int
|
|
3798 |
mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
|
|
3799 |
{
|
|
3800 |
int diff;
|
|
3801 |
ssize_t len_diff;
|
|
3802 |
unsigned int len;
|
|
3803 |
|
|
3804 |
len = a->mv_size;
|
|
3805 |
len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
|
|
3806 |
if (len_diff > 0) {
|
|
3807 |
len = b->mv_size;
|
|
3808 |
len_diff = 1;
|
|
3809 |
}
|
|
3810 |
|
|
3811 |
diff = memcmp(a->mv_data, b->mv_data, len);
|
|
3812 |
return diff ? diff : len_diff<0 ? -1 : len_diff;
|
|
3813 |
}
|
|
3814 |
|
|
3815 |
/** Compare two items in reverse byte order */
|
|
3816 |
static int
|
|
3817 |
mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
|
|
3818 |
{
|
|
3819 |
const unsigned char *p1, *p2, *p1_lim;
|
|
3820 |
ssize_t len_diff;
|
|
3821 |
int diff;
|
|
3822 |
|
|
3823 |
p1_lim = (const unsigned char *)a->mv_data;
|
|
3824 |
p1 = (const unsigned char *)a->mv_data + a->mv_size;
|
|
3825 |
p2 = (const unsigned char *)b->mv_data + b->mv_size;
|
|
3826 |
|
|
3827 |
len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
|
|
3828 |
if (len_diff > 0) {
|
|
3829 |
p1_lim += len_diff;
|
|
3830 |
len_diff = 1;
|
|
3831 |
}
|
|
3832 |
|
|
3833 |
while (p1 > p1_lim) {
|
|
3834 |
diff = *--p1 - *--p2;
|
|
3835 |
if (diff)
|
|
3836 |
return diff;
|
|
3837 |
}
|
|
3838 |
return len_diff<0 ? -1 : len_diff;
|
|
3839 |
}
|
|
3840 |
|
|
3841 |
/** Search for key within a page, using binary search.
|
|
3842 |
* Returns the smallest entry larger or equal to the key.
|
|
3843 |
* If exactp is non-null, stores whether the found entry was an exact match
|
|
3844 |
* in *exactp (1 or 0).
|
|
3845 |
* Updates the cursor index with the index of the found entry.
|
|
3846 |
* If no entry larger or equal to the key is found, returns NULL.
|
|
3847 |
*/
|
|
3848 |
static MDB_node *
|
|
3849 |
mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
|
|
3850 |
{
|
|
3851 |
unsigned int i = 0, nkeys;
|
|
3852 |
int low, high;
|
|
3853 |
int rc = 0;
|
|
3854 |
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
|
3855 |
MDB_node *node = NULL;
|
|
3856 |
MDB_val nodekey;
|
|
3857 |
MDB_cmp_func *cmp;
|
|
3858 |
DKBUF;
|
|
3859 |
|
|
3860 |
nkeys = NUMKEYS(mp);
|
|
3861 |
|
|
3862 |
#if MDB_DEBUG
|
|
3863 |
{
|
|
3864 |
pgno_t pgno;
|
|
3865 |
COPY_PGNO(pgno, mp->mp_pgno);
|
|
3866 |
DPRINTF("searching %u keys in %s %spage %zu",
|
|
3867 |
nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
|
|
3868 |
pgno);
|
|
3869 |
}
|
|
3870 |
#endif
|
|
3871 |
|
|
3872 |
assert(nkeys > 0);
|
|
3873 |
|
|
3874 |
low = IS_LEAF(mp) ? 0 : 1;
|
|
3875 |
high = nkeys - 1;
|
|
3876 |
cmp = mc->mc_dbx->md_cmp;
|
|
3877 |
|
|
3878 |
/* Branch pages have no data, so if using integer keys,
|
|
3879 |
* alignment is guaranteed. Use faster mdb_cmp_int.
|
|
3880 |
*/
|
|
3881 |
if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
|
|
3882 |
if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
|
|
3883 |
cmp = mdb_cmp_long;
|
|
3884 |
else
|
|
3885 |
cmp = mdb_cmp_int;
|
|
3886 |
}
|
|
3887 |
|
|
3888 |
if (IS_LEAF2(mp)) {
|
|
3889 |
nodekey.mv_size = mc->mc_db->md_pad;
|
|
3890 |
node = NODEPTR(mp, 0); /* fake */
|
|
3891 |
while (low <= high) {
|
|
3892 |
i = (low + high) >> 1;
|
|
3893 |
nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
|
|
3894 |
rc = cmp(key, &nodekey);
|
|
3895 |
DPRINTF("found leaf index %u [%s], rc = %i",
|
|
3896 |
i, DKEY(&nodekey), rc);
|
|
3897 |
if (rc == 0)
|
|
3898 |
break;
|
|
3899 |
if (rc > 0)
|
|
3900 |
low = i + 1;
|
|
3901 |
else
|
|
3902 |
high = i - 1;
|
|
3903 |
}
|
|
3904 |
} else {
|
|
3905 |
while (low <= high) {
|
|
3906 |
i = (low + high) >> 1;
|
|
3907 |
|
|
3908 |
node = NODEPTR(mp, i);
|
|
3909 |
nodekey.mv_size = NODEKSZ(node);
|
|
3910 |
nodekey.mv_data = NODEKEY(node);
|
|
3911 |
|
|
3912 |
rc = cmp(key, &nodekey);
|
|
3913 |
#if MDB_DEBUG
|
|
3914 |
if (IS_LEAF(mp))
|
|
3915 |
DPRINTF("found leaf index %u [%s], rc = %i",
|
|
3916 |
i, DKEY(&nodekey), rc);
|
|
3917 |
else
|
|
3918 |
DPRINTF("found branch index %u [%s -> %zu], rc = %i",
|
|
3919 |
i, DKEY(&nodekey), NODEPGNO(node), rc);
|
|
3920 |
#endif
|
|
3921 |
if (rc == 0)
|
|
3922 |
break;
|
|
3923 |
if (rc > 0)
|
|
3924 |
low = i + 1;
|
|
3925 |
else
|
|
3926 |
high = i - 1;
|
|
3927 |
}
|
|
3928 |
}
|
|
3929 |
|
|
3930 |
if (rc > 0) { /* Found entry is less than the key. */
|
|
3931 |
i++; /* Skip to get the smallest entry larger than key. */
|
|
3932 |
if (!IS_LEAF2(mp))
|
|
3933 |
node = NODEPTR(mp, i);
|
|
3934 |
}
|
|
3935 |
if (exactp)
|
|
3936 |
*exactp = (rc == 0);
|
|
3937 |
/* store the key index */
|
|
3938 |
mc->mc_ki[mc->mc_top] = i;
|
|
3939 |
if (i >= nkeys)
|
|
3940 |
/* There is no entry larger or equal to the key. */
|
|
3941 |
return NULL;
|
|
3942 |
|
|
3943 |
/* nodeptr is fake for LEAF2 */
|
|
3944 |
return node;
|
|
3945 |
}
|
|
3946 |
|
|
3947 |
#if 0
/* Disabled code: call func(mc, m2) for every cursor m2 registered for
 * mc's DBI in mc's transaction whose top page is the same as mc's.
 */
static void
mdb_cursor_adjust(MDB_cursor *mc, func)
{
	MDB_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi];

	while (m2) {
		if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top])
			func(mc, m2);
		m2 = m2->mc_next;
	}
}
#endif
|
|
3960 |
|
|
3961 |
/** Pop a page off the top of the cursor's stack. */
|
|
3962 |
static void
|
|
3963 |
mdb_cursor_pop(MDB_cursor *mc)
|
|
3964 |
{
|
|
3965 |
if (mc->mc_snum) {
|
|
3966 |
#ifndef MDB_DEBUG_SKIP
|
|
3967 |
MDB_page *top = mc->mc_pg[mc->mc_top];
|
|
3968 |
#endif
|
|
3969 |
mc->mc_snum--;
|
|
3970 |
if (mc->mc_snum)
|
|
3971 |
mc->mc_top--;
|
|
3972 |
|
|
3973 |
DPRINTF("popped page %zu off db %u cursor %p", top->mp_pgno,
|
|
3974 |
mc->mc_dbi, (void *) mc);
|
|
3975 |
}
|
|
3976 |
}
|
|
3977 |
|
|
3978 |
/** Push a page onto the top of the cursor's stack. */
|
|
3979 |
static int
|
|
3980 |
mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
|
|
3981 |
{
|
|
3982 |
DPRINTF("pushing page %zu on db %u cursor %p", mp->mp_pgno,
|
|
3983 |
mc->mc_dbi, (void *) mc);
|
|
3984 |
|
|
3985 |
if (mc->mc_snum >= CURSOR_STACK) {
|
|
3986 |
assert(mc->mc_snum < CURSOR_STACK);
|
|
3987 |
return MDB_CURSOR_FULL;
|
|
3988 |
}
|
|
3989 |
|
|
3990 |
mc->mc_top = mc->mc_snum++;
|
|
3991 |
mc->mc_pg[mc->mc_top] = mp;
|
|
3992 |
mc->mc_ki[mc->mc_top] = 0;
|
|
3993 |
|
|
3994 |
return MDB_SUCCESS;
|
|
3995 |
}
|
|
3996 |
|
|
3997 |
/** Find the address of the page corresponding to a given page number.
|
|
3998 |
* @param[in] txn the transaction for this access.
|
|
3999 |
* @param[in] pgno the page number for the page to retrieve.
|
|
4000 |
* @param[out] ret address of a pointer where the page's address will be stored.
|
|
4001 |
* @return 0 on success, non-zero on failure.
|
|
4002 |
*/
|
|
4003 |
static int
|
|
4004 |
mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret)
|
|
4005 |
{
|
|
4006 |
MDB_page *p = NULL;
|
|
4007 |
|
|
4008 |
if (!((txn->mt_flags & MDB_TXN_RDONLY) |
|
|
4009 |
(txn->mt_env->me_flags & MDB_WRITEMAP)))
|
|
4010 |
{
|
|
4011 |
MDB_txn *tx2 = txn;
|
|
4012 |
do {
|
|
4013 |
MDB_ID2L dl = tx2->mt_u.dirty_list;
|
|
4014 |
if (dl[0].mid) {
|
|
4015 |
unsigned x = mdb_mid2l_search(dl, pgno);
|
|
4016 |
if (x <= dl[0].mid && dl[x].mid == pgno) {
|
|
4017 |
p = dl[x].mptr;
|
|
4018 |
goto done;
|
|
4019 |
}
|
|
4020 |
}
|
|
4021 |
} while ((tx2 = tx2->mt_parent) != NULL);
|
|
4022 |
}
|
|
4023 |
|
|
4024 |
if (pgno < txn->mt_next_pgno) {
|
|
4025 |
p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno);
|
|
4026 |
} else {
|
|
4027 |
DPRINTF("page %zu not found", pgno);
|
|
4028 |
assert(p != NULL);
|
|
4029 |
}
|
|
4030 |
|
|
4031 |
done:
|
|
4032 |
*ret = p;
|
|
4033 |
return (p != NULL) ? MDB_SUCCESS : MDB_PAGE_NOTFOUND;
|
|
4034 |
}
|
|
4035 |
|
|
4036 |
/** Search for the page a given key should be in.
|
|
4037 |
* Pushes parent pages on the cursor stack. This function continues a
|
|
4038 |
* search on a cursor that has already been initialized. (Usually by
|
|
4039 |
* #mdb_page_search() but also by #mdb_node_move().)
|
|
4040 |
* @param[in,out] mc the cursor for this operation.
|
|
4041 |
* @param[in] key the key to search for. If NULL, search for the lowest
|
|
4042 |
* page. (This is used by #mdb_cursor_first().)
|
|
4043 |
* @param[in] flags If MDB_PS_MODIFY set, visited pages are updated with new page numbers.
|
|
4044 |
* If MDB_PS_ROOTONLY set, just fetch root node, no further lookups.
|
|
4045 |
* @return 0 on success, non-zero on failure.
|
|
4046 |
*/
|
|
4047 |
static int
|
|
4048 |
mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
|
|
4049 |
{
|
|
4050 |
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
|
4051 |
DKBUF;
|
|
4052 |
int rc;
|
|
4053 |
|
|
4054 |
|
|
4055 |
while (IS_BRANCH(mp)) {
|
|
4056 |
MDB_node *node;
|
|
4057 |
indx_t i;
|
|
4058 |
|
|
4059 |
DPRINTF("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp));
|
|
4060 |
assert(NUMKEYS(mp) > 1);
|
|
4061 |
DPRINTF("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0)));
|
|
4062 |
|
|
4063 |
if (key == NULL) /* Initialize cursor to first page. */
|
|
4064 |
i = 0;
|
|
4065 |
else if (key->mv_size > MDB_MAXKEYSIZE && key->mv_data == NULL) {
|
|
4066 |
/* cursor to last page */
|
|
4067 |
i = NUMKEYS(mp)-1;
|
|
4068 |
} else {
|
|
4069 |
int exact;
|
|
4070 |
node = mdb_node_search(mc, key, &exact);
|
|
4071 |
if (node == NULL)
|
|
4072 |
i = NUMKEYS(mp) - 1;
|
|
4073 |
else {
|
|
4074 |
i = mc->mc_ki[mc->mc_top];
|
|
4075 |
if (!exact) {
|
|
4076 |
assert(i > 0);
|
|
4077 |
i--;
|
|
4078 |
}
|
|
4079 |
}
|
|
4080 |
}
|
|
4081 |
|
|
4082 |
if (key)
|
|
4083 |
DPRINTF("following index %u for key [%s]",
|
|
4084 |
i, DKEY(key));
|
|
4085 |
assert(i < NUMKEYS(mp));
|
|
4086 |
node = NODEPTR(mp, i);
|
|
4087 |
|
|
4088 |
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp)))
|
|
4089 |
return rc;
|
|
4090 |
|
|
4091 |
mc->mc_ki[mc->mc_top] = i;
|
|
4092 |
if ((rc = mdb_cursor_push(mc, mp)))
|
|
4093 |
return rc;
|
|
4094 |
|
|
4095 |
if (modify) {
|
|
4096 |
if ((rc = mdb_page_touch(mc)) != 0)
|
|
4097 |
return rc;
|
|
4098 |
mp = mc->mc_pg[mc->mc_top];
|
|
4099 |
}
|
|
4100 |
}
|
|
4101 |
|
|
4102 |
if (!IS_LEAF(mp)) {
|
|
4103 |
DPRINTF("internal error, index points to a %02X page!?",
|
|
4104 |
mp->mp_flags);
|
|
4105 |
return MDB_CORRUPTED;
|
|
4106 |
}
|
|
4107 |
|
|
4108 |
DPRINTF("found leaf page %zu for key [%s]", mp->mp_pgno,
|
|
4109 |
key ? DKEY(key) : NULL);
|
|
4110 |
|
|
4111 |
return MDB_SUCCESS;
|
|
4112 |
}
|
|
4113 |
|
|
4114 |
/** Search for the page a given key should be in.
|
|
4115 |
* Pushes parent pages on the cursor stack. This function just sets up
|
|
4116 |
* the search; it finds the root page for \b mc's database and sets this
|
|
4117 |
* as the root of the cursor's stack. Then #mdb_page_search_root() is
|
|
4118 |
* called to complete the search.
|
|
4119 |
* @param[in,out] mc the cursor for this operation.
|
|
4120 |
* @param[in] key the key to search for. If NULL, search for the lowest
|
|
4121 |
* page. (This is used by #mdb_cursor_first().)
|
|
4122 |
* @param[in] modify If true, visited pages are updated with new page numbers.
|
|
4123 |
* @return 0 on success, non-zero on failure.
|
|
4124 |
*/
|
|
4125 |
static int
|
|
4126 |
mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags)
|
|
4127 |
{
|
|
4128 |
int rc;
|
|
4129 |
pgno_t root;
|
|
4130 |
|
|
4131 |
/* Make sure the txn is still viable, then find the root from
|
|
4132 |
* the txn's db table.
|
|
4133 |
*/
|
|
4134 |
if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
|
|
4135 |
DPUTS("transaction has failed, must abort");
|
|
4136 |
return EINVAL;
|
|
4137 |
} else {
|
|
4138 |
/* Make sure we're using an up-to-date root */
|
|
4139 |
if (mc->mc_dbi > MAIN_DBI) {
|
|
4140 |
if ((*mc->mc_dbflag & DB_STALE) ||
|
|
4141 |
((flags & MDB_PS_MODIFY) && !(*mc->mc_dbflag & DB_DIRTY))) {
|
|
4142 |
MDB_cursor mc2;
|
|
4143 |
unsigned char dbflag = 0;
|
|
4144 |
mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
|
|
4145 |
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, flags & MDB_PS_MODIFY);
|
|
4146 |
if (rc)
|
|
4147 |
return rc;
|
|
4148 |
if (*mc->mc_dbflag & DB_STALE) {
|
|
4149 |
MDB_val data;
|
|
4150 |
int exact = 0;
|
|
4151 |
uint16_t flags;
|
|
4152 |
MDB_node *leaf = mdb_node_search(&mc2,
|
|
4153 |
&mc->mc_dbx->md_name, &exact);
|
|
4154 |
if (!exact)
|
|
4155 |
return MDB_NOTFOUND;
|
|
4156 |
mdb_node_read(mc->mc_txn, leaf, &data);
|
|
4157 |
memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)),
|
|
4158 |
sizeof(uint16_t));
|
|
4159 |
/* The txn may not know this DBI, or another process may
|
|
4160 |
* have dropped and recreated the DB with other flags.
|
|
4161 |
*/
|
|
4162 |
if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
|
|
4163 |
return MDB_INCOMPATIBLE;
|
|
4164 |
memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
|
|
4165 |
}
|
|
4166 |
if (flags & MDB_PS_MODIFY)
|
|
4167 |
dbflag = DB_DIRTY;
|
|
4168 |
*mc->mc_dbflag &= ~DB_STALE;
|
|
4169 |
*mc->mc_dbflag |= dbflag;
|
|
4170 |
}
|
|
4171 |
}
|
|
4172 |
root = mc->mc_db->md_root;
|
|
4173 |
|
|
4174 |
if (root == P_INVALID) { /* Tree is empty. */
|
|
4175 |
DPUTS("tree is empty");
|
|
4176 |
return MDB_NOTFOUND;
|
|
4177 |
}
|
|
4178 |
}
|
|
4179 |
|
|
4180 |
assert(root > 1);
|
|
4181 |
if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
|
|
4182 |
if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0])))
|
|
4183 |
return rc;
|
|
4184 |
|
|
4185 |
mc->mc_snum = 1;
|
|
4186 |
mc->mc_top = 0;
|
|
4187 |
|
|
4188 |
DPRINTF("db %u root page %zu has flags 0x%X",
|
|
4189 |
mc->mc_dbi, root, mc->mc_pg[0]->mp_flags);
|
|
4190 |
|
|
4191 |
if (flags & MDB_PS_MODIFY) {
|
|
4192 |
if ((rc = mdb_page_touch(mc)))
|
|
4193 |
return rc;
|
|
4194 |
}
|
|
4195 |
|
|
4196 |
if (flags & MDB_PS_ROOTONLY)
|
|
4197 |
return MDB_SUCCESS;
|
|
4198 |
|
|
4199 |
return mdb_page_search_root(mc, key, flags);
|
|
4200 |
}
|
|
4201 |
|
|
4202 |
/** Return the data associated with a given node.
|
|
4203 |
* @param[in] txn The transaction for this operation.
|
|
4204 |
* @param[in] leaf The node being read.
|
|
4205 |
* @param[out] data Updated to point to the node's data.
|
|
4206 |
* @return 0 on success, non-zero on failure.
|
|
4207 |
*/
|
|
4208 |
static int
|
|
4209 |
mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
|
|
4210 |
{
|
|
4211 |
MDB_page *omp; /* overflow page */
|
|
4212 |
pgno_t pgno;
|
|
4213 |
int rc;
|
|
4214 |
|
|
4215 |
if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
|
|
4216 |
data->mv_size = NODEDSZ(leaf);
|
|
4217 |
data->mv_data = NODEDATA(leaf);
|
|
4218 |
return MDB_SUCCESS;
|
|
4219 |
}
|
|
4220 |
|
|
4221 |
/* Read overflow data.
|
|
4222 |
*/
|
|
4223 |
data->mv_size = NODEDSZ(leaf);
|
|
4224 |
memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
|
|
4225 |
if ((rc = mdb_page_get(txn, pgno, &omp))) {
|
|
4226 |
DPRINTF("read overflow page %zu failed", pgno);
|
|
4227 |
return rc;
|
|
4228 |
}
|
|
4229 |
data->mv_data = METADATA(omp);
|
|
4230 |
|
|
4231 |
return MDB_SUCCESS;
|
|
4232 |
}
|
|
4233 |
|
|
4234 |
int
|
|
4235 |
mdb_get(MDB_txn *txn, MDB_dbi dbi,
|
|
4236 |
MDB_val *key, MDB_val *data)
|
|
4237 |
{
|
|
4238 |
MDB_cursor mc;
|
|
4239 |
MDB_xcursor mx;
|
|
4240 |
int exact = 0;
|
|
4241 |
DKBUF;
|
|
4242 |
|
|
4243 |
assert(key);
|
|
4244 |
assert(data);
|
|
4245 |
DPRINTF("===> get db %u key [%s]", dbi, DKEY(key));
|
|
4246 |
|
|
4247 |
if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
|
|
4248 |
return EINVAL;
|
|
4249 |
|
|
4250 |
if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
|
|
4251 |
return EINVAL;
|
|
4252 |
}
|
|
4253 |
|
|
4254 |
mdb_cursor_init(&mc, txn, dbi, &mx);
|
|
4255 |
return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
|
|
4256 |
}
|
|
4257 |
|
|
4258 |
/** Find a sibling for a page.
|
|
4259 |
* Replaces the page at the top of the cursor's stack with the
|
|
4260 |
* specified sibling, if one exists.
|
|
4261 |
* @param[in] mc The cursor for this operation.
|
|
4262 |
* @param[in] move_right Non-zero if the right sibling is requested,
|
|
4263 |
* otherwise the left sibling.
|
|
4264 |
* @return 0 on success, non-zero on failure.
|
|
4265 |
*/
|
|
4266 |
static int
|
|
4267 |
mdb_cursor_sibling(MDB_cursor *mc, int move_right)
|
|
4268 |
{
|
|
4269 |
int rc;
|
|
4270 |
MDB_node *indx;
|
|
4271 |
MDB_page *mp;
|
|
4272 |
|
|
4273 |
if (mc->mc_snum < 2) {
|
|
4274 |
return MDB_NOTFOUND; /* root has no siblings */
|
|
4275 |
}
|
|
4276 |
|
|
4277 |
mdb_cursor_pop(mc);
|
|
4278 |
DPRINTF("parent page is page %zu, index %u",
|
|
4279 |
mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);
|
|
4280 |
|
|
4281 |
if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
|
|
4282 |
: (mc->mc_ki[mc->mc_top] == 0)) {
|
|
4283 |
DPRINTF("no more keys left, moving to %s sibling",
|
|
4284 |
move_right ? "right" : "left");
|
|
4285 |
if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS) {
|
|
4286 |
/* undo cursor_pop before returning */
|
|
4287 |
mc->mc_top++;
|
|
4288 |
mc->mc_snum++;
|
|
4289 |
return rc;
|
|
4290 |
}
|
|
4291 |
} else {
|
|
4292 |
if (move_right)
|
|
4293 |
mc->mc_ki[mc->mc_top]++;
|
|
4294 |
else
|
|
4295 |
mc->mc_ki[mc->mc_top]--;
|
|
4296 |
DPRINTF("just moving to %s index key %u",
|
|
4297 |
move_right ? "right" : "left", mc->mc_ki[mc->mc_top]);
|
|
4298 |
}
|
|
4299 |
assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
|
|
4300 |
|
|
4301 |
indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
4302 |
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp)))
|
|
4303 |
return rc;
|
|
4304 |
|
|
4305 |
mdb_cursor_push(mc, mp);
|
|
4306 |
if (!move_right)
|
|
4307 |
mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
|
|
4308 |
|
|
4309 |
return MDB_SUCCESS;
|
|
4310 |
}
|
|
4311 |
|
|
4312 |
/** Move the cursor to the next data item. */
|
|
4313 |
static int
|
|
4314 |
mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
|
|
4315 |
{
|
|
4316 |
MDB_page *mp;
|
|
4317 |
MDB_node *leaf;
|
|
4318 |
int rc;
|
|
4319 |
|
|
4320 |
if (mc->mc_flags & C_EOF) {
|
|
4321 |
return MDB_NOTFOUND;
|
|
4322 |
}
|
|
4323 |
|
|
4324 |
assert(mc->mc_flags & C_INITIALIZED);
|
|
4325 |
|
|
4326 |
mp = mc->mc_pg[mc->mc_top];
|
|
4327 |
|
|
4328 |
if (mc->mc_db->md_flags & MDB_DUPSORT) {
|
|
4329 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
4330 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4331 |
if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
|
|
4332 |
rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
|
|
4333 |
if (op != MDB_NEXT || rc == MDB_SUCCESS)
|
|
4334 |
return rc;
|
|
4335 |
}
|
|
4336 |
} else {
|
|
4337 |
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
|
|
4338 |
if (op == MDB_NEXT_DUP)
|
|
4339 |
return MDB_NOTFOUND;
|
|
4340 |
}
|
|
4341 |
}
|
|
4342 |
|
|
4343 |
DPRINTF("cursor_next: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
|
|
4344 |
|
|
4345 |
if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
|
|
4346 |
DPUTS("=====> move to next sibling page");
|
|
4347 |
if (mdb_cursor_sibling(mc, 1) != MDB_SUCCESS) {
|
|
4348 |
mc->mc_flags |= C_EOF;
|
|
4349 |
mc->mc_flags &= ~C_INITIALIZED;
|
|
4350 |
return MDB_NOTFOUND;
|
|
4351 |
}
|
|
4352 |
mp = mc->mc_pg[mc->mc_top];
|
|
4353 |
DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
|
|
4354 |
} else
|
|
4355 |
mc->mc_ki[mc->mc_top]++;
|
|
4356 |
|
|
4357 |
DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
|
|
4358 |
mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
|
|
4359 |
|
|
4360 |
if (IS_LEAF2(mp)) {
|
|
4361 |
key->mv_size = mc->mc_db->md_pad;
|
|
4362 |
key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
|
|
4363 |
return MDB_SUCCESS;
|
|
4364 |
}
|
|
4365 |
|
|
4366 |
assert(IS_LEAF(mp));
|
|
4367 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
4368 |
|
|
4369 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4370 |
mdb_xcursor_init1(mc, leaf);
|
|
4371 |
}
|
|
4372 |
if (data) {
|
|
4373 |
if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
|
|
4374 |
return rc;
|
|
4375 |
|
|
4376 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4377 |
rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4378 |
if (rc != MDB_SUCCESS)
|
|
4379 |
return rc;
|
|
4380 |
}
|
|
4381 |
}
|
|
4382 |
|
|
4383 |
MDB_GET_KEY(leaf, key);
|
|
4384 |
return MDB_SUCCESS;
|
|
4385 |
}
|
|
4386 |
|
|
4387 |
/** Move the cursor to the previous data item. */
|
|
4388 |
static int
|
|
4389 |
mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
|
|
4390 |
{
|
|
4391 |
MDB_page *mp;
|
|
4392 |
MDB_node *leaf;
|
|
4393 |
int rc;
|
|
4394 |
|
|
4395 |
assert(mc->mc_flags & C_INITIALIZED);
|
|
4396 |
|
|
4397 |
mp = mc->mc_pg[mc->mc_top];
|
|
4398 |
|
|
4399 |
if (mc->mc_db->md_flags & MDB_DUPSORT) {
|
|
4400 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
4401 |
if (op == MDB_PREV || op == MDB_PREV_DUP) {
|
|
4402 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4403 |
rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
|
|
4404 |
if (op != MDB_PREV || rc == MDB_SUCCESS)
|
|
4405 |
return rc;
|
|
4406 |
} else {
|
|
4407 |
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
|
|
4408 |
if (op == MDB_PREV_DUP)
|
|
4409 |
return MDB_NOTFOUND;
|
|
4410 |
}
|
|
4411 |
}
|
|
4412 |
}
|
|
4413 |
|
|
4414 |
DPRINTF("cursor_prev: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
|
|
4415 |
|
|
4416 |
if (mc->mc_ki[mc->mc_top] == 0) {
|
|
4417 |
DPUTS("=====> move to prev sibling page");
|
|
4418 |
if (mdb_cursor_sibling(mc, 0) != MDB_SUCCESS) {
|
|
4419 |
mc->mc_flags &= ~C_INITIALIZED;
|
|
4420 |
return MDB_NOTFOUND;
|
|
4421 |
}
|
|
4422 |
mp = mc->mc_pg[mc->mc_top];
|
|
4423 |
mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
|
|
4424 |
DPRINTF("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
|
|
4425 |
} else
|
|
4426 |
mc->mc_ki[mc->mc_top]--;
|
|
4427 |
|
|
4428 |
mc->mc_flags &= ~C_EOF;
|
|
4429 |
|
|
4430 |
DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
|
|
4431 |
mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
|
|
4432 |
|
|
4433 |
if (IS_LEAF2(mp)) {
|
|
4434 |
key->mv_size = mc->mc_db->md_pad;
|
|
4435 |
key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
|
|
4436 |
return MDB_SUCCESS;
|
|
4437 |
}
|
|
4438 |
|
|
4439 |
assert(IS_LEAF(mp));
|
|
4440 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
4441 |
|
|
4442 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4443 |
mdb_xcursor_init1(mc, leaf);
|
|
4444 |
}
|
|
4445 |
if (data) {
|
|
4446 |
if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
|
|
4447 |
return rc;
|
|
4448 |
|
|
4449 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4450 |
rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4451 |
if (rc != MDB_SUCCESS)
|
|
4452 |
return rc;
|
|
4453 |
}
|
|
4454 |
}
|
|
4455 |
|
|
4456 |
MDB_GET_KEY(leaf, key);
|
|
4457 |
return MDB_SUCCESS;
|
|
4458 |
}
|
|
4459 |
|
|
4460 |
/** Set the cursor on a specific data item. */
|
|
4461 |
static int
|
|
4462 |
mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
4463 |
MDB_cursor_op op, int *exactp)
|
|
4464 |
{
|
|
4465 |
int rc;
|
|
4466 |
MDB_page *mp;
|
|
4467 |
MDB_node *leaf = NULL;
|
|
4468 |
DKBUF;
|
|
4469 |
|
|
4470 |
assert(mc);
|
|
4471 |
assert(key);
|
|
4472 |
assert(key->mv_size > 0);
|
|
4473 |
|
|
4474 |
/* See if we're already on the right page */
|
|
4475 |
if (mc->mc_flags & C_INITIALIZED) {
|
|
4476 |
MDB_val nodekey;
|
|
4477 |
|
|
4478 |
mp = mc->mc_pg[mc->mc_top];
|
|
4479 |
if (!NUMKEYS(mp)) {
|
|
4480 |
mc->mc_ki[mc->mc_top] = 0;
|
|
4481 |
return MDB_NOTFOUND;
|
|
4482 |
}
|
|
4483 |
if (mp->mp_flags & P_LEAF2) {
|
|
4484 |
nodekey.mv_size = mc->mc_db->md_pad;
|
|
4485 |
nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
|
|
4486 |
} else {
|
|
4487 |
leaf = NODEPTR(mp, 0);
|
|
4488 |
MDB_GET_KEY(leaf, &nodekey);
|
|
4489 |
}
|
|
4490 |
rc = mc->mc_dbx->md_cmp(key, &nodekey);
|
|
4491 |
if (rc == 0) {
|
|
4492 |
/* Probably happens rarely, but first node on the page
|
|
4493 |
* was the one we wanted.
|
|
4494 |
*/
|
|
4495 |
mc->mc_ki[mc->mc_top] = 0;
|
|
4496 |
if (exactp)
|
|
4497 |
*exactp = 1;
|
|
4498 |
goto set1;
|
|
4499 |
}
|
|
4500 |
if (rc > 0) {
|
|
4501 |
unsigned int i;
|
|
4502 |
unsigned int nkeys = NUMKEYS(mp);
|
|
4503 |
if (nkeys > 1) {
|
|
4504 |
if (mp->mp_flags & P_LEAF2) {
|
|
4505 |
nodekey.mv_data = LEAF2KEY(mp,
|
|
4506 |
nkeys-1, nodekey.mv_size);
|
|
4507 |
} else {
|
|
4508 |
leaf = NODEPTR(mp, nkeys-1);
|
|
4509 |
MDB_GET_KEY(leaf, &nodekey);
|
|
4510 |
}
|
|
4511 |
rc = mc->mc_dbx->md_cmp(key, &nodekey);
|
|
4512 |
if (rc == 0) {
|
|
4513 |
/* last node was the one we wanted */
|
|
4514 |
mc->mc_ki[mc->mc_top] = nkeys-1;
|
|
4515 |
if (exactp)
|
|
4516 |
*exactp = 1;
|
|
4517 |
goto set1;
|
|
4518 |
}
|
|
4519 |
if (rc < 0) {
|
|
4520 |
if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
|
|
4521 |
/* This is definitely the right page, skip search_page */
|
|
4522 |
if (mp->mp_flags & P_LEAF2) {
|
|
4523 |
nodekey.mv_data = LEAF2KEY(mp,
|
|
4524 |
mc->mc_ki[mc->mc_top], nodekey.mv_size);
|
|
4525 |
} else {
|
|
4526 |
leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
4527 |
MDB_GET_KEY(leaf, &nodekey);
|
|
4528 |
}
|
|
4529 |
rc = mc->mc_dbx->md_cmp(key, &nodekey);
|
|
4530 |
if (rc == 0) {
|
|
4531 |
/* current node was the one we wanted */
|
|
4532 |
if (exactp)
|
|
4533 |
*exactp = 1;
|
|
4534 |
goto set1;
|
|
4535 |
}
|
|
4536 |
}
|
|
4537 |
rc = 0;
|
|
4538 |
goto set2;
|
|
4539 |
}
|
|
4540 |
}
|
|
4541 |
/* If any parents have right-sibs, search.
|
|
4542 |
* Otherwise, there's nothing further.
|
|
4543 |
*/
|
|
4544 |
for (i=0; i<mc->mc_top; i++)
|
|
4545 |
if (mc->mc_ki[i] <
|
|
4546 |
NUMKEYS(mc->mc_pg[i])-1)
|
|
4547 |
break;
|
|
4548 |
if (i == mc->mc_top) {
|
|
4549 |
/* There are no other pages */
|
|
4550 |
mc->mc_ki[mc->mc_top] = nkeys;
|
|
4551 |
return MDB_NOTFOUND;
|
|
4552 |
}
|
|
4553 |
}
|
|
4554 |
if (!mc->mc_top) {
|
|
4555 |
/* There are no other pages */
|
|
4556 |
mc->mc_ki[mc->mc_top] = 0;
|
|
4557 |
return MDB_NOTFOUND;
|
|
4558 |
}
|
|
4559 |
}
|
|
4560 |
|
|
4561 |
rc = mdb_page_search(mc, key, 0);
|
|
4562 |
if (rc != MDB_SUCCESS)
|
|
4563 |
return rc;
|
|
4564 |
|
|
4565 |
mp = mc->mc_pg[mc->mc_top];
|
|
4566 |
assert(IS_LEAF(mp));
|
|
4567 |
|
|
4568 |
set2:
|
|
4569 |
leaf = mdb_node_search(mc, key, exactp);
|
|
4570 |
if (exactp != NULL && !*exactp) {
|
|
4571 |
/* MDB_SET specified and not an exact match. */
|
|
4572 |
return MDB_NOTFOUND;
|
|
4573 |
}
|
|
4574 |
|
|
4575 |
if (leaf == NULL) {
|
|
4576 |
DPUTS("===> inexact leaf not found, goto sibling");
|
|
4577 |
if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
|
|
4578 |
return rc; /* no entries matched */
|
|
4579 |
mp = mc->mc_pg[mc->mc_top];
|
|
4580 |
assert(IS_LEAF(mp));
|
|
4581 |
leaf = NODEPTR(mp, 0);
|
|
4582 |
}
|
|
4583 |
|
|
4584 |
set1:
|
|
4585 |
mc->mc_flags |= C_INITIALIZED;
|
|
4586 |
mc->mc_flags &= ~C_EOF;
|
|
4587 |
|
|
4588 |
if (IS_LEAF2(mp)) {
|
|
4589 |
key->mv_size = mc->mc_db->md_pad;
|
|
4590 |
key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
|
|
4591 |
return MDB_SUCCESS;
|
|
4592 |
}
|
|
4593 |
|
|
4594 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4595 |
mdb_xcursor_init1(mc, leaf);
|
|
4596 |
}
|
|
4597 |
if (data) {
|
|
4598 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4599 |
if (op == MDB_SET || op == MDB_SET_KEY || op == MDB_SET_RANGE) {
|
|
4600 |
rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4601 |
} else {
|
|
4602 |
int ex2, *ex2p;
|
|
4603 |
if (op == MDB_GET_BOTH) {
|
|
4604 |
ex2p = &ex2;
|
|
4605 |
ex2 = 0;
|
|
4606 |
} else {
|
|
4607 |
ex2p = NULL;
|
|
4608 |
}
|
|
4609 |
rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
|
|
4610 |
if (rc != MDB_SUCCESS)
|
|
4611 |
return rc;
|
|
4612 |
}
|
|
4613 |
} else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
|
|
4614 |
MDB_val d2;
|
|
4615 |
if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
|
|
4616 |
return rc;
|
|
4617 |
rc = mc->mc_dbx->md_dcmp(data, &d2);
|
|
4618 |
if (rc) {
|
|
4619 |
if (op == MDB_GET_BOTH || rc > 0)
|
|
4620 |
return MDB_NOTFOUND;
|
|
4621 |
}
|
|
4622 |
|
|
4623 |
} else {
|
|
4624 |
if (mc->mc_xcursor)
|
|
4625 |
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
|
|
4626 |
if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
|
|
4627 |
return rc;
|
|
4628 |
}
|
|
4629 |
}
|
|
4630 |
|
|
4631 |
/* The key already matches in all other cases */
|
|
4632 |
if (op == MDB_SET_RANGE || op == MDB_SET_KEY)
|
|
4633 |
MDB_GET_KEY(leaf, key);
|
|
4634 |
DPRINTF("==> cursor placed on key [%s]", DKEY(key));
|
|
4635 |
|
|
4636 |
return rc;
|
|
4637 |
}
|
|
4638 |
|
|
4639 |
/** Move the cursor to the first item in the database. */
|
|
4640 |
static int
|
|
4641 |
mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
|
|
4642 |
{
|
|
4643 |
int rc;
|
|
4644 |
MDB_node *leaf;
|
|
4645 |
|
|
4646 |
if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
|
|
4647 |
rc = mdb_page_search(mc, NULL, 0);
|
|
4648 |
if (rc != MDB_SUCCESS)
|
|
4649 |
return rc;
|
|
4650 |
}
|
|
4651 |
assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
|
|
4652 |
|
|
4653 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
|
|
4654 |
mc->mc_flags |= C_INITIALIZED;
|
|
4655 |
mc->mc_flags &= ~C_EOF;
|
|
4656 |
|
|
4657 |
mc->mc_ki[mc->mc_top] = 0;
|
|
4658 |
|
|
4659 |
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
|
|
4660 |
key->mv_size = mc->mc_db->md_pad;
|
|
4661 |
key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
|
|
4662 |
return MDB_SUCCESS;
|
|
4663 |
}
|
|
4664 |
|
|
4665 |
if (data) {
|
|
4666 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4667 |
mdb_xcursor_init1(mc, leaf);
|
|
4668 |
rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4669 |
if (rc)
|
|
4670 |
return rc;
|
|
4671 |
} else {
|
|
4672 |
if (mc->mc_xcursor)
|
|
4673 |
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
|
|
4674 |
if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
|
|
4675 |
return rc;
|
|
4676 |
}
|
|
4677 |
}
|
|
4678 |
MDB_GET_KEY(leaf, key);
|
|
4679 |
return MDB_SUCCESS;
|
|
4680 |
}
|
|
4681 |
|
|
4682 |
/** Move the cursor to the last item in the database. */
|
|
4683 |
static int
|
|
4684 |
mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
|
|
4685 |
{
|
|
4686 |
int rc;
|
|
4687 |
MDB_node *leaf;
|
|
4688 |
|
|
4689 |
if (!(mc->mc_flags & C_EOF)) {
|
|
4690 |
|
|
4691 |
if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
|
|
4692 |
MDB_val lkey;
|
|
4693 |
|
|
4694 |
lkey.mv_size = MDB_MAXKEYSIZE+1;
|
|
4695 |
lkey.mv_data = NULL;
|
|
4696 |
rc = mdb_page_search(mc, &lkey, 0);
|
|
4697 |
if (rc != MDB_SUCCESS)
|
|
4698 |
return rc;
|
|
4699 |
}
|
|
4700 |
assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
|
|
4701 |
|
|
4702 |
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
|
|
4703 |
}
|
|
4704 |
mc->mc_flags |= C_INITIALIZED|C_EOF;
|
|
4705 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
4706 |
|
|
4707 |
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
|
|
4708 |
key->mv_size = mc->mc_db->md_pad;
|
|
4709 |
key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
|
|
4710 |
return MDB_SUCCESS;
|
|
4711 |
}
|
|
4712 |
|
|
4713 |
if (data) {
|
|
4714 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4715 |
mdb_xcursor_init1(mc, leaf);
|
|
4716 |
rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4717 |
if (rc)
|
|
4718 |
return rc;
|
|
4719 |
} else {
|
|
4720 |
if (mc->mc_xcursor)
|
|
4721 |
mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
|
|
4722 |
if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
|
|
4723 |
return rc;
|
|
4724 |
}
|
|
4725 |
}
|
|
4726 |
|
|
4727 |
MDB_GET_KEY(leaf, key);
|
|
4728 |
return MDB_SUCCESS;
|
|
4729 |
}
|
|
4730 |
|
|
4731 |
int
|
|
4732 |
mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
4733 |
MDB_cursor_op op)
|
|
4734 |
{
|
|
4735 |
int rc;
|
|
4736 |
int exact = 0;
|
|
4737 |
|
|
4738 |
assert(mc);
|
|
4739 |
|
|
4740 |
switch (op) {
|
|
4741 |
case MDB_GET_CURRENT:
|
|
4742 |
if (!(mc->mc_flags & C_INITIALIZED)) {
|
|
4743 |
rc = EINVAL;
|
|
4744 |
} else {
|
|
4745 |
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
|
4746 |
if (!NUMKEYS(mp)) {
|
|
4747 |
mc->mc_ki[mc->mc_top] = 0;
|
|
4748 |
rc = MDB_NOTFOUND;
|
|
4749 |
break;
|
|
4750 |
}
|
|
4751 |
rc = MDB_SUCCESS;
|
|
4752 |
if (IS_LEAF2(mp)) {
|
|
4753 |
key->mv_size = mc->mc_db->md_pad;
|
|
4754 |
key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
|
|
4755 |
} else {
|
|
4756 |
MDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
4757 |
MDB_GET_KEY(leaf, key);
|
|
4758 |
if (data) {
|
|
4759 |
if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
4760 |
rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT);
|
|
4761 |
} else {
|
|
4762 |
rc = mdb_node_read(mc->mc_txn, leaf, data);
|
|
4763 |
}
|
|
4764 |
}
|
|
4765 |
}
|
|
4766 |
}
|
|
4767 |
break;
|
|
4768 |
case MDB_GET_BOTH:
|
|
4769 |
case MDB_GET_BOTH_RANGE:
|
|
4770 |
if (data == NULL || mc->mc_xcursor == NULL) {
|
|
4771 |
rc = EINVAL;
|
|
4772 |
break;
|
|
4773 |
}
|
|
4774 |
/* FALLTHRU */
|
|
4775 |
case MDB_SET:
|
|
4776 |
case MDB_SET_KEY:
|
|
4777 |
case MDB_SET_RANGE:
|
|
4778 |
if (key == NULL || key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
|
|
4779 |
rc = EINVAL;
|
|
4780 |
} else if (op == MDB_SET_RANGE)
|
|
4781 |
rc = mdb_cursor_set(mc, key, data, op, NULL);
|
|
4782 |
else
|
|
4783 |
rc = mdb_cursor_set(mc, key, data, op, &exact);
|
|
4784 |
break;
|
|
4785 |
case MDB_GET_MULTIPLE:
|
|
4786 |
if (data == NULL ||
|
|
4787 |
!(mc->mc_db->md_flags & MDB_DUPFIXED) ||
|
|
4788 |
!(mc->mc_flags & C_INITIALIZED)) {
|
|
4789 |
rc = EINVAL;
|
|
4790 |
break;
|
|
4791 |
}
|
|
4792 |
rc = MDB_SUCCESS;
|
|
4793 |
if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
|
|
4794 |
(mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
|
|
4795 |
break;
|
|
4796 |
goto fetchm;
|
|
4797 |
case MDB_NEXT_MULTIPLE:
|
|
4798 |
if (data == NULL ||
|
|
4799 |
!(mc->mc_db->md_flags & MDB_DUPFIXED)) {
|
|
4800 |
rc = EINVAL;
|
|
4801 |
break;
|
|
4802 |
}
|
|
4803 |
if (!(mc->mc_flags & C_INITIALIZED))
|
|
4804 |
rc = mdb_cursor_first(mc, key, data);
|
|
4805 |
else
|
|
4806 |
rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
|
|
4807 |
if (rc == MDB_SUCCESS) {
|
|
4808 |
if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
|
|
4809 |
MDB_cursor *mx;
|
|
4810 |
fetchm:
|
|
4811 |
mx = &mc->mc_xcursor->mx_cursor;
|
|
4812 |
data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
|
|
4813 |
mx->mc_db->md_pad;
|
|
4814 |
data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
|
|
4815 |
mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
|
|
4816 |
} else {
|
|
4817 |
rc = MDB_NOTFOUND;
|
|
4818 |
}
|
|
4819 |
}
|
|
4820 |
break;
|
|
4821 |
case MDB_NEXT:
|
|
4822 |
case MDB_NEXT_DUP:
|
|
4823 |
case MDB_NEXT_NODUP:
|
|
4824 |
if (!(mc->mc_flags & C_INITIALIZED))
|
|
4825 |
rc = mdb_cursor_first(mc, key, data);
|
|
4826 |
else
|
|
4827 |
rc = mdb_cursor_next(mc, key, data, op);
|
|
4828 |
break;
|
|
4829 |
case MDB_PREV:
|
|
4830 |
case MDB_PREV_DUP:
|
|
4831 |
case MDB_PREV_NODUP:
|
|
4832 |
if (!(mc->mc_flags & C_INITIALIZED)) {
|
|
4833 |
rc = mdb_cursor_last(mc, key, data);
|
|
4834 |
mc->mc_flags |= C_INITIALIZED;
|
|
4835 |
mc->mc_ki[mc->mc_top]++;
|
|
4836 |
}
|
|
4837 |
rc = mdb_cursor_prev(mc, key, data, op);
|
|
4838 |
break;
|
|
4839 |
case MDB_FIRST:
|
|
4840 |
rc = mdb_cursor_first(mc, key, data);
|
|
4841 |
break;
|
|
4842 |
case MDB_FIRST_DUP:
|
|
4843 |
if (data == NULL ||
|
|
4844 |
!(mc->mc_db->md_flags & MDB_DUPSORT) ||
|
|
4845 |
!(mc->mc_flags & C_INITIALIZED) ||
|
|
4846 |
!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
|
|
4847 |
rc = EINVAL;
|
|
4848 |
break;
|
|
4849 |
}
|
|
4850 |
rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4851 |
break;
|
|
4852 |
case MDB_LAST:
|
|
4853 |
rc = mdb_cursor_last(mc, key, data);
|
|
4854 |
break;
|
|
4855 |
case MDB_LAST_DUP:
|
|
4856 |
if (data == NULL ||
|
|
4857 |
!(mc->mc_db->md_flags & MDB_DUPSORT) ||
|
|
4858 |
!(mc->mc_flags & C_INITIALIZED) ||
|
|
4859 |
!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
|
|
4860 |
rc = EINVAL;
|
|
4861 |
break;
|
|
4862 |
}
|
|
4863 |
rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
|
|
4864 |
break;
|
|
4865 |
default:
|
|
4866 |
DPRINTF("unhandled/unimplemented cursor operation %u", op);
|
|
4867 |
rc = EINVAL;
|
|
4868 |
break;
|
|
4869 |
}
|
|
4870 |
|
|
4871 |
return rc;
|
|
4872 |
}
|
|
4873 |
|
|
4874 |
/** Touch all the pages in the cursor stack.
|
|
4875 |
* Makes sure all the pages are writable, before attempting a write operation.
|
|
4876 |
* @param[in] mc The cursor to operate on.
|
|
4877 |
*/
|
|
4878 |
static int
|
|
4879 |
mdb_cursor_touch(MDB_cursor *mc)
|
|
4880 |
{
|
|
4881 |
int rc;
|
|
4882 |
|
|
4883 |
if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
|
|
4884 |
MDB_cursor mc2;
|
|
4885 |
MDB_xcursor mcx;
|
|
4886 |
mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI,
|
|
4887 |
mc->mc_txn->mt_dbs[MAIN_DBI].md_flags & MDB_DUPSORT ? &mcx : NULL);
|
|
4888 |
rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, MDB_PS_MODIFY);
|
|
4889 |
if (rc)
|
|
4890 |
return rc;
|
|
4891 |
*mc->mc_dbflag |= DB_DIRTY;
|
|
4892 |
}
|
|
4893 |
for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
|
|
4894 |
rc = mdb_page_touch(mc);
|
|
4895 |
if (rc)
|
|
4896 |
return rc;
|
|
4897 |
}
|
|
4898 |
mc->mc_top = mc->mc_snum-1;
|
|
4899 |
return MDB_SUCCESS;
|
|
4900 |
}
|
|
4901 |
|
|
4902 |
int
|
|
4903 |
mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
|
|
4904 |
unsigned int flags)
|
|
4905 |
{
|
|
4906 |
MDB_node *leaf = NULL;
|
|
4907 |
MDB_val xdata, *rdata, dkey;
|
|
4908 |
MDB_page *fp;
|
|
4909 |
MDB_db dummy;
|
|
4910 |
int do_sub = 0, insert = 0;
|
|
4911 |
unsigned int mcount = 0;
|
|
4912 |
size_t nsize;
|
|
4913 |
int rc, rc2;
|
|
4914 |
MDB_pagebuf pbuf;
|
|
4915 |
char dbuf[MDB_MAXKEYSIZE+1];
|
|
4916 |
unsigned int nflags;
|
|
4917 |
DKBUF;
|
|
4918 |
|
|
4919 |
if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
|
|
4920 |
return EACCES;
|
|
4921 |
|
|
4922 |
if (flags != MDB_CURRENT && (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE))
|
|
4923 |
return EINVAL;
|
|
4924 |
|
|
4925 |
if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT) && data->mv_size > MDB_MAXKEYSIZE)
|
|
4926 |
return EINVAL;
|
|
4927 |
|
|
4928 |
#if SIZE_MAX > MAXDATASIZE
|
|
4929 |
if (data->mv_size > MAXDATASIZE)
|
|
4930 |
return EINVAL;
|
|
4931 |
#endif
|
|
4932 |
|
|
4933 |
DPRINTF("==> put db %u key [%s], size %zu, data size %zu",
|
|
4934 |
mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size);
|
|
4935 |
|
|
4936 |
dkey.mv_size = 0;
|
|
4937 |
|
|
4938 |
if (flags == MDB_CURRENT) {
|
|
4939 |
if (!(mc->mc_flags & C_INITIALIZED))
|
|
4940 |
return EINVAL;
|
|
4941 |
rc = MDB_SUCCESS;
|
|
4942 |
} else if (mc->mc_db->md_root == P_INVALID) {
|
|
4943 |
MDB_page *np;
|
|
4944 |
/* new database, write a root leaf page */
|
|
4945 |
DPUTS("allocating new root leaf page");
|
|
4946 |
if ((rc = mdb_page_new(mc, P_LEAF, 1, &np))) {
|
|
4947 |
return rc;
|
|
4948 |
}
|
|
4949 |
mc->mc_snum = 0;
|
|
4950 |
mdb_cursor_push(mc, np);
|
|
4951 |
mc->mc_db->md_root = np->mp_pgno;
|
|
4952 |
mc->mc_db->md_depth++;
|
|
4953 |
*mc->mc_dbflag |= DB_DIRTY;
|
|
4954 |
if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
|
|
4955 |
== MDB_DUPFIXED)
|
|
4956 |
np->mp_flags |= P_LEAF2;
|
|
4957 |
mc->mc_flags |= C_INITIALIZED;
|
|
4958 |
rc = MDB_NOTFOUND;
|
|
4959 |
goto top;
|
|
4960 |
} else {
|
|
4961 |
int exact = 0;
|
|
4962 |
MDB_val d2;
|
|
4963 |
if (flags & MDB_APPEND) {
|
|
4964 |
MDB_val k2;
|
|
4965 |
rc = mdb_cursor_last(mc, &k2, &d2);
|
|
4966 |
if (rc == 0) {
|
|
4967 |
rc = mc->mc_dbx->md_cmp(key, &k2);
|
|
4968 |
if (rc > 0) {
|
|
4969 |
rc = MDB_NOTFOUND;
|
|
4970 |
mc->mc_ki[mc->mc_top]++;
|
|
4971 |
} else {
|
|
4972 |
/* new key is <= last key */
|
|
4973 |
rc = MDB_KEYEXIST;
|
|
4974 |
}
|
|
4975 |
}
|
|
4976 |
} else {
|
|
4977 |
rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
|
|
4978 |
}
|
|
4979 |
if ((flags & MDB_NOOVERWRITE) && rc == 0) {
|
|
4980 |
DPRINTF("duplicate key [%s]", DKEY(key));
|
|
4981 |
*data = d2;
|
|
4982 |
return MDB_KEYEXIST;
|
|
4983 |
}
|
|
4984 |
if (rc && rc != MDB_NOTFOUND)
|
|
4985 |
return rc;
|
|
4986 |
}
|
|
4987 |
|
|
4988 |
/* Cursor is positioned, now make sure all pages are writable */
|
|
4989 |
rc2 = mdb_cursor_touch(mc);
|
|
4990 |
if (rc2)
|
|
4991 |
return rc2;
|
|
4992 |
|
|
4993 |
top:
|
|
4994 |
/* The key already exists */
|
|
4995 |
if (rc == MDB_SUCCESS) {
|
|
4996 |
/* there's only a key anyway, so this is a no-op */
|
|
4997 |
if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
|
|
4998 |
unsigned int ksize = mc->mc_db->md_pad;
|
|
4999 |
if (key->mv_size != ksize)
|
|
5000 |
return EINVAL;
|
|
5001 |
if (flags == MDB_CURRENT) {
|
|
5002 |
char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
|
|
5003 |
memcpy(ptr, key->mv_data, ksize);
|
|
5004 |
}
|
|
5005 |
return MDB_SUCCESS;
|
|
5006 |
}
|
|
5007 |
|
|
5008 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5009 |
|
|
5010 |
/* DB has dups? */
|
|
5011 |
if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
|
|
5012 |
/* Was a single item before, must convert now */
|
|
5013 |
more:
|
|
5014 |
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
5015 |
/* Just overwrite the current item */
|
|
5016 |
if (flags == MDB_CURRENT)
|
|
5017 |
goto current;
|
|
5018 |
|
|
5019 |
dkey.mv_size = NODEDSZ(leaf);
|
|
5020 |
dkey.mv_data = NODEDATA(leaf);
|
|
5021 |
#if UINT_MAX < SIZE_MAX
|
|
5022 |
if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
|
|
5023 |
#ifdef MISALIGNED_OK
|
|
5024 |
mc->mc_dbx->md_dcmp = mdb_cmp_long;
|
|
5025 |
#else
|
|
5026 |
mc->mc_dbx->md_dcmp = mdb_cmp_cint;
|
|
5027 |
#endif
|
|
5028 |
#endif
|
|
5029 |
/* if data matches, ignore it */
|
|
5030 |
if (!mc->mc_dbx->md_dcmp(data, &dkey))
|
|
5031 |
return (flags == MDB_NODUPDATA) ? MDB_KEYEXIST : MDB_SUCCESS;
|
|
5032 |
|
|
5033 |
/* create a fake page for the dup items */
|
|
5034 |
memcpy(dbuf, dkey.mv_data, dkey.mv_size);
|
|
5035 |
dkey.mv_data = dbuf;
|
|
5036 |
fp = (MDB_page *)&pbuf;
|
|
5037 |
fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
|
5038 |
fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
|
|
5039 |
fp->mp_lower = PAGEHDRSZ;
|
|
5040 |
fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
|
|
5041 |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
|
5042 |
fp->mp_flags |= P_LEAF2;
|
|
5043 |
fp->mp_pad = data->mv_size;
|
|
5044 |
fp->mp_upper += 2 * data->mv_size; /* leave space for 2 more */
|
|
5045 |
} else {
|
|
5046 |
fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
|
|
5047 |
(dkey.mv_size & 1) + (data->mv_size & 1);
|
|
5048 |
}
|
|
5049 |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
|
5050 |
do_sub = 1;
|
|
5051 |
rdata = &xdata;
|
|
5052 |
xdata.mv_size = fp->mp_upper;
|
|
5053 |
xdata.mv_data = fp;
|
|
5054 |
flags |= F_DUPDATA;
|
|
5055 |
goto new_sub;
|
|
5056 |
}
|
|
5057 |
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
|
5058 |
/* See if we need to convert from fake page to subDB */
|
|
5059 |
MDB_page *mp;
|
|
5060 |
unsigned int offset;
|
|
5061 |
unsigned int i;
|
|
5062 |
|
|
5063 |
fp = NODEDATA(leaf);
|
|
5064 |
if (flags == MDB_CURRENT) {
|
|
5065 |
reuse:
|
|
5066 |
fp->mp_flags |= P_DIRTY;
|
|
5067 |
COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
|
|
5068 |
mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
|
|
5069 |
flags |= F_DUPDATA;
|
|
5070 |
goto put_sub;
|
|
5071 |
}
|
|
5072 |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
|
5073 |
offset = fp->mp_pad;
|
|
5074 |
if (SIZELEFT(fp) >= offset)
|
|
5075 |
goto reuse;
|
|
5076 |
offset *= 4; /* space for 4 more */
|
|
5077 |
} else {
|
|
5078 |
offset = NODESIZE + sizeof(indx_t) + data->mv_size;
|
|
5079 |
}
|
|
5080 |
offset += offset & 1;
|
|
5081 |
if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
|
|
5082 |
offset >= mc->mc_txn->mt_env->me_nodemax) {
|
|
5083 |
/* yes, convert it */
|
|
5084 |
dummy.md_flags = 0;
|
|
5085 |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
|
5086 |
dummy.md_pad = fp->mp_pad;
|
|
5087 |
dummy.md_flags = MDB_DUPFIXED;
|
|
5088 |
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
|
|
5089 |
dummy.md_flags |= MDB_INTEGERKEY;
|
|
5090 |
}
|
|
5091 |
dummy.md_depth = 1;
|
|
5092 |
dummy.md_branch_pages = 0;
|
|
5093 |
dummy.md_leaf_pages = 1;
|
|
5094 |
dummy.md_overflow_pages = 0;
|
|
5095 |
dummy.md_entries = NUMKEYS(fp);
|
|
5096 |
rdata = &xdata;
|
|
5097 |
xdata.mv_size = sizeof(MDB_db);
|
|
5098 |
xdata.mv_data = &dummy;
|
|
5099 |
if ((rc = mdb_page_alloc(mc, 1, &mp)))
|
|
5100 |
return rc;
|
|
5101 |
offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
|
|
5102 |
flags |= F_DUPDATA|F_SUBDATA;
|
|
5103 |
dummy.md_root = mp->mp_pgno;
|
|
5104 |
} else {
|
|
5105 |
/* no, just grow it */
|
|
5106 |
rdata = &xdata;
|
|
5107 |
xdata.mv_size = NODEDSZ(leaf) + offset;
|
|
5108 |
xdata.mv_data = &pbuf;
|
|
5109 |
mp = (MDB_page *)&pbuf;
|
|
5110 |
mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
|
|
5111 |
flags |= F_DUPDATA;
|
|
5112 |
}
|
|
5113 |
mp->mp_flags = fp->mp_flags | P_DIRTY;
|
|
5114 |
mp->mp_pad = fp->mp_pad;
|
|
5115 |
mp->mp_lower = fp->mp_lower;
|
|
5116 |
mp->mp_upper = fp->mp_upper + offset;
|
|
5117 |
if (IS_LEAF2(fp)) {
|
|
5118 |
memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
|
|
5119 |
} else {
|
|
5120 |
nsize = NODEDSZ(leaf) - fp->mp_upper;
|
|
5121 |
memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
|
|
5122 |
for (i=0; i<NUMKEYS(fp); i++)
|
|
5123 |
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
|
|
5124 |
}
|
|
5125 |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
|
5126 |
do_sub = 1;
|
|
5127 |
goto new_sub;
|
|
5128 |
}
|
|
5129 |
/* data is on sub-DB, just store it */
|
|
5130 |
flags |= F_DUPDATA|F_SUBDATA;
|
|
5131 |
goto put_sub;
|
|
5132 |
}
|
|
5133 |
current:
|
|
5134 |
/* overflow page overwrites need special handling */
|
|
5135 |
if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
|
|
5136 |
MDB_page *omp;
|
|
5137 |
pgno_t pg;
|
|
5138 |
int ovpages, dpages;
|
|
5139 |
|
|
5140 |
ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
|
|
5141 |
dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
|
|
5142 |
memcpy(&pg, NODEDATA(leaf), sizeof(pg));
|
|
5143 |
mdb_page_get(mc->mc_txn, pg, &omp);
|
|
5144 |
/* Is the ov page writable and large enough? */
|
|
5145 |
if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) {
|
|
5146 |
/* yes, overwrite it. Note in this case we don't
|
|
5147 |
* bother to try shrinking the node if the new data
|
|
5148 |
* is smaller than the overflow threshold.
|
|
5149 |
*/
|
|
5150 |
if (F_ISSET(flags, MDB_RESERVE))
|
|
5151 |
data->mv_data = METADATA(omp);
|
|
5152 |
else
|
|
5153 |
memcpy(METADATA(omp), data->mv_data, data->mv_size);
|
|
5154 |
goto done;
|
|
5155 |
} else {
|
|
5156 |
/* no, free ovpages */
|
|
5157 |
int i;
|
|
5158 |
mc->mc_db->md_overflow_pages -= ovpages;
|
|
5159 |
for (i=0; i<ovpages; i++) {
|
|
5160 |
DPRINTF("freed ov page %zu", pg);
|
|
5161 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
|
|
5162 |
pg++;
|
|
5163 |
}
|
|
5164 |
}
|
|
5165 |
} else if (NODEDSZ(leaf) == data->mv_size) {
|
|
5166 |
/* same size, just replace it. Note that we could
|
|
5167 |
* also reuse this node if the new data is smaller,
|
|
5168 |
* but instead we opt to shrink the node in that case.
|
|
5169 |
*/
|
|
5170 |
if (F_ISSET(flags, MDB_RESERVE))
|
|
5171 |
data->mv_data = NODEDATA(leaf);
|
|
5172 |
else if (data->mv_size)
|
|
5173 |
memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
|
|
5174 |
else
|
|
5175 |
memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
|
|
5176 |
goto done;
|
|
5177 |
}
|
|
5178 |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
|
5179 |
mc->mc_db->md_entries--;
|
|
5180 |
} else {
|
|
5181 |
DPRINTF("inserting key at index %i", mc->mc_ki[mc->mc_top]);
|
|
5182 |
insert = 1;
|
|
5183 |
}
|
|
5184 |
|
|
5185 |
rdata = data;
|
|
5186 |
|
|
5187 |
new_sub:
|
|
5188 |
nflags = flags & NODE_ADD_FLAGS;
|
|
5189 |
nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
|
|
5190 |
if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
|
|
5191 |
if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
|
|
5192 |
nflags &= ~MDB_APPEND;
|
|
5193 |
if (!insert)
|
|
5194 |
nflags |= MDB_SPLIT_REPLACE;
|
|
5195 |
rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
|
|
5196 |
} else {
|
|
5197 |
/* There is room already in this leaf page. */
|
|
5198 |
rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
|
|
5199 |
if (rc == 0 && !do_sub && insert) {
|
|
5200 |
/* Adjust other cursors pointing to mp */
|
|
5201 |
MDB_cursor *m2, *m3;
|
|
5202 |
MDB_dbi dbi = mc->mc_dbi;
|
|
5203 |
unsigned i = mc->mc_top;
|
|
5204 |
MDB_page *mp = mc->mc_pg[i];
|
|
5205 |
|
|
5206 |
if (mc->mc_flags & C_SUB)
|
|
5207 |
dbi--;
|
|
5208 |
|
|
5209 |
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
5210 |
if (mc->mc_flags & C_SUB)
|
|
5211 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
5212 |
else
|
|
5213 |
m3 = m2;
|
|
5214 |
if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
|
|
5215 |
if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
|
|
5216 |
m3->mc_ki[i]++;
|
|
5217 |
}
|
|
5218 |
}
|
|
5219 |
}
|
|
5220 |
}
|
|
5221 |
|
|
5222 |
if (rc != MDB_SUCCESS)
|
|
5223 |
mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
|
|
5224 |
else {
|
|
5225 |
/* Now store the actual data in the child DB. Note that we're
|
|
5226 |
* storing the user data in the keys field, so there are strict
|
|
5227 |
* size limits on dupdata. The actual data fields of the child
|
|
5228 |
* DB are all zero size.
|
|
5229 |
*/
|
|
5230 |
if (do_sub) {
|
|
5231 |
int xflags;
|
|
5232 |
put_sub:
|
|
5233 |
xdata.mv_size = 0;
|
|
5234 |
xdata.mv_data = "";
|
|
5235 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5236 |
if (flags & MDB_CURRENT) {
|
|
5237 |
xflags = MDB_CURRENT;
|
|
5238 |
} else {
|
|
5239 |
mdb_xcursor_init1(mc, leaf);
|
|
5240 |
xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE : 0;
|
|
5241 |
}
|
|
5242 |
/* converted, write the original data first */
|
|
5243 |
if (dkey.mv_size) {
|
|
5244 |
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
|
|
5245 |
if (rc)
|
|
5246 |
return rc;
|
|
5247 |
{
|
|
5248 |
/* Adjust other cursors pointing to mp */
|
|
5249 |
MDB_cursor *m2;
|
|
5250 |
unsigned i = mc->mc_top;
|
|
5251 |
MDB_page *mp = mc->mc_pg[i];
|
|
5252 |
|
|
5253 |
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
|
|
5254 |
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
|
|
5255 |
if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
|
|
5256 |
mdb_xcursor_init1(m2, leaf);
|
|
5257 |
}
|
|
5258 |
}
|
|
5259 |
}
|
|
5260 |
/* we've done our job */
|
|
5261 |
dkey.mv_size = 0;
|
|
5262 |
}
|
|
5263 |
if (flags & MDB_APPENDDUP)
|
|
5264 |
xflags |= MDB_APPEND;
|
|
5265 |
rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
|
|
5266 |
if (flags & F_SUBDATA) {
|
|
5267 |
void *db = NODEDATA(leaf);
|
|
5268 |
memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
|
|
5269 |
}
|
|
5270 |
}
|
|
5271 |
/* sub-writes might have failed so check rc again.
|
|
5272 |
* Don't increment count if we just replaced an existing item.
|
|
5273 |
*/
|
|
5274 |
if (!rc && !(flags & MDB_CURRENT))
|
|
5275 |
mc->mc_db->md_entries++;
|
|
5276 |
if (flags & MDB_MULTIPLE) {
|
|
5277 |
mcount++;
|
|
5278 |
if (mcount < data[1].mv_size) {
|
|
5279 |
data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
|
|
5280 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5281 |
goto more;
|
|
5282 |
}
|
|
5283 |
}
|
|
5284 |
}
|
|
5285 |
done:
|
|
5286 |
/* If we succeeded and the key didn't exist before, make sure
|
|
5287 |
* the cursor is marked valid.
|
|
5288 |
*/
|
|
5289 |
if (!rc && insert)
|
|
5290 |
mc->mc_flags |= C_INITIALIZED;
|
|
5291 |
return rc;
|
|
5292 |
}
|
|
5293 |
|
|
5294 |
int
|
|
5295 |
mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
|
|
5296 |
{
|
|
5297 |
MDB_node *leaf;
|
|
5298 |
int rc;
|
|
5299 |
|
|
5300 |
if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
|
|
5301 |
return EACCES;
|
|
5302 |
|
|
5303 |
if (!(mc->mc_flags & C_INITIALIZED))
|
|
5304 |
return EINVAL;
|
|
5305 |
|
|
5306 |
rc = mdb_cursor_touch(mc);
|
|
5307 |
if (rc)
|
|
5308 |
return rc;
|
|
5309 |
|
|
5310 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5311 |
|
|
5312 |
if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
5313 |
if (flags != MDB_NODUPDATA) {
|
|
5314 |
if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
|
|
5315 |
mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
|
|
5316 |
}
|
|
5317 |
rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, 0);
|
|
5318 |
/* If sub-DB still has entries, we're done */
|
|
5319 |
if (mc->mc_xcursor->mx_db.md_entries) {
|
|
5320 |
if (leaf->mn_flags & F_SUBDATA) {
|
|
5321 |
/* update subDB info */
|
|
5322 |
void *db = NODEDATA(leaf);
|
|
5323 |
memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
|
|
5324 |
} else {
|
|
5325 |
/* shrink fake page */
|
|
5326 |
mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5327 |
}
|
|
5328 |
mc->mc_db->md_entries--;
|
|
5329 |
return rc;
|
|
5330 |
}
|
|
5331 |
/* otherwise fall thru and delete the sub-DB */
|
|
5332 |
}
|
|
5333 |
|
|
5334 |
if (leaf->mn_flags & F_SUBDATA) {
|
|
5335 |
/* add all the child DB's pages to the free list */
|
|
5336 |
rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
|
|
5337 |
if (rc == MDB_SUCCESS) {
|
|
5338 |
mc->mc_db->md_entries -=
|
|
5339 |
mc->mc_xcursor->mx_db.md_entries;
|
|
5340 |
}
|
|
5341 |
}
|
|
5342 |
}
|
|
5343 |
|
|
5344 |
return mdb_cursor_del0(mc, leaf);
|
|
5345 |
}
|
|
5346 |
|
|
5347 |
/** Allocate and initialize new pages for a database.
|
|
5348 |
* @param[in] mc a cursor on the database being added to.
|
|
5349 |
* @param[in] flags flags defining what type of page is being allocated.
|
|
5350 |
* @param[in] num the number of pages to allocate. This is usually 1,
|
|
5351 |
* unless allocating overflow pages for a large record.
|
|
5352 |
* @param[out] mp Address of a page, or NULL on failure.
|
|
5353 |
* @return 0 on success, non-zero on failure.
|
|
5354 |
*/
|
|
5355 |
static int
|
|
5356 |
mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp)
|
|
5357 |
{
|
|
5358 |
MDB_page *np;
|
|
5359 |
int rc;
|
|
5360 |
|
|
5361 |
if ((rc = mdb_page_alloc(mc, num, &np)))
|
|
5362 |
return rc;
|
|
5363 |
DPRINTF("allocated new mpage %zu, page size %u",
|
|
5364 |
np->mp_pgno, mc->mc_txn->mt_env->me_psize);
|
|
5365 |
np->mp_flags = flags | P_DIRTY;
|
|
5366 |
np->mp_lower = PAGEHDRSZ;
|
|
5367 |
np->mp_upper = mc->mc_txn->mt_env->me_psize;
|
|
5368 |
|
|
5369 |
if (IS_BRANCH(np))
|
|
5370 |
mc->mc_db->md_branch_pages++;
|
|
5371 |
else if (IS_LEAF(np))
|
|
5372 |
mc->mc_db->md_leaf_pages++;
|
|
5373 |
else if (IS_OVERFLOW(np)) {
|
|
5374 |
mc->mc_db->md_overflow_pages += num;
|
|
5375 |
np->mp_pages = num;
|
|
5376 |
}
|
|
5377 |
*mp = np;
|
|
5378 |
|
|
5379 |
return 0;
|
|
5380 |
}
|
|
5381 |
|
|
5382 |
/** Calculate the size of a leaf node.
|
|
5383 |
* The size depends on the environment's page size; if a data item
|
|
5384 |
* is too large it will be put onto an overflow page and the node
|
|
5385 |
* size will only include the key and not the data. Sizes are always
|
|
5386 |
* rounded up to an even number of bytes, to guarantee 2-byte alignment
|
|
5387 |
* of the #MDB_node headers.
|
|
5388 |
* @param[in] env The environment handle.
|
|
5389 |
* @param[in] key The key for the node.
|
|
5390 |
* @param[in] data The data for the node.
|
|
5391 |
* @return The number of bytes needed to store the node.
|
|
5392 |
*/
|
|
5393 |
static size_t
|
|
5394 |
mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
|
|
5395 |
{
|
|
5396 |
size_t sz;
|
|
5397 |
|
|
5398 |
sz = LEAFSIZE(key, data);
|
|
5399 |
if (sz >= env->me_nodemax) {
|
|
5400 |
/* put on overflow page */
|
|
5401 |
sz -= data->mv_size - sizeof(pgno_t);
|
|
5402 |
}
|
|
5403 |
sz += sz & 1;
|
|
5404 |
|
|
5405 |
return sz + sizeof(indx_t);
|
|
5406 |
}
|
|
5407 |
|
|
5408 |
/** Calculate the size of a branch node.
|
|
5409 |
* The size should depend on the environment's page size but since
|
|
5410 |
* we currently don't support spilling large keys onto overflow
|
|
5411 |
* pages, it's simply the size of the #MDB_node header plus the
|
|
5412 |
* size of the key. Sizes are always rounded up to an even number
|
|
5413 |
* of bytes, to guarantee 2-byte alignment of the #MDB_node headers.
|
|
5414 |
* @param[in] env The environment handle.
|
|
5415 |
* @param[in] key The key for the node.
|
|
5416 |
* @return The number of bytes needed to store the node.
|
|
5417 |
*/
|
|
5418 |
static size_t
|
|
5419 |
mdb_branch_size(MDB_env *env, MDB_val *key)
|
|
5420 |
{
|
|
5421 |
size_t sz;
|
|
5422 |
|
|
5423 |
sz = INDXSIZE(key);
|
|
5424 |
if (sz >= env->me_nodemax) {
|
|
5425 |
/* put on overflow page */
|
|
5426 |
/* not implemented */
|
|
5427 |
/* sz -= key->size - sizeof(pgno_t); */
|
|
5428 |
}
|
|
5429 |
|
|
5430 |
return sz + sizeof(indx_t);
|
|
5431 |
}
|
|
5432 |
|
|
5433 |
/** Add a node to the page pointed to by the cursor.
|
|
5434 |
* @param[in] mc The cursor for this operation.
|
|
5435 |
* @param[in] indx The index on the page where the new node should be added.
|
|
5436 |
* @param[in] key The key for the new node.
|
|
5437 |
* @param[in] data The data for the new node, if any.
|
|
5438 |
* @param[in] pgno The page number, if adding a branch node.
|
|
5439 |
* @param[in] flags Flags for the node.
|
|
5440 |
* @return 0 on success, non-zero on failure. Possible errors are:
|
|
5441 |
* <ul>
|
|
5442 |
* <li>ENOMEM - failed to allocate overflow pages for the node.
|
|
5443 |
* <li>MDB_PAGE_FULL - there is insufficient room in the page. This error
|
|
5444 |
* should never happen since all callers already calculate the
|
|
5445 |
* page's free space before calling this function.
|
|
5446 |
* </ul>
|
|
5447 |
*/
|
|
5448 |
static int
|
|
5449 |
mdb_node_add(MDB_cursor *mc, indx_t indx,
|
|
5450 |
MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
|
|
5451 |
{
|
|
5452 |
unsigned int i;
|
|
5453 |
size_t node_size = NODESIZE;
|
|
5454 |
indx_t ofs;
|
|
5455 |
MDB_node *node;
|
|
5456 |
MDB_page *mp = mc->mc_pg[mc->mc_top];
|
|
5457 |
MDB_page *ofp = NULL; /* overflow page */
|
|
5458 |
DKBUF;
|
|
5459 |
|
|
5460 |
assert(mp->mp_upper >= mp->mp_lower);
|
|
5461 |
|
|
5462 |
DPRINTF("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
|
|
5463 |
IS_LEAF(mp) ? "leaf" : "branch",
|
|
5464 |
IS_SUBP(mp) ? "sub-" : "",
|
|
5465 |
mp->mp_pgno, indx, data ? data->mv_size : 0,
|
|
5466 |
key ? key->mv_size : 0, key ? DKEY(key) : NULL);
|
|
5467 |
|
|
5468 |
if (IS_LEAF2(mp)) {
|
|
5469 |
/* Move higher keys up one slot. */
|
|
5470 |
int ksize = mc->mc_db->md_pad, dif;
|
|
5471 |
char *ptr = LEAF2KEY(mp, indx, ksize);
|
|
5472 |
dif = NUMKEYS(mp) - indx;
|
|
5473 |
if (dif > 0)
|
|
5474 |
memmove(ptr+ksize, ptr, dif*ksize);
|
|
5475 |
/* insert new key */
|
|
5476 |
memcpy(ptr, key->mv_data, ksize);
|
|
5477 |
|
|
5478 |
/* Just using these for counting */
|
|
5479 |
mp->mp_lower += sizeof(indx_t);
|
|
5480 |
mp->mp_upper -= ksize - sizeof(indx_t);
|
|
5481 |
return MDB_SUCCESS;
|
|
5482 |
}
|
|
5483 |
|
|
5484 |
if (key != NULL)
|
|
5485 |
node_size += key->mv_size;
|
|
5486 |
|
|
5487 |
if (IS_LEAF(mp)) {
|
|
5488 |
assert(data);
|
|
5489 |
if (F_ISSET(flags, F_BIGDATA)) {
|
|
5490 |
/* Data already on overflow page. */
|
|
5491 |
node_size += sizeof(pgno_t);
|
|
5492 |
} else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_nodemax) {
|
|
5493 |
int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
|
|
5494 |
int rc;
|
|
5495 |
/* Put data on overflow page. */
|
|
5496 |
DPRINTF("data size is %zu, node would be %zu, put data on overflow page",
|
|
5497 |
data->mv_size, node_size+data->mv_size);
|
|
5498 |
node_size += sizeof(pgno_t);
|
|
5499 |
if ((rc = mdb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
|
|
5500 |
return rc;
|
|
5501 |
DPRINTF("allocated overflow page %zu", ofp->mp_pgno);
|
|
5502 |
flags |= F_BIGDATA;
|
|
5503 |
} else {
|
|
5504 |
node_size += data->mv_size;
|
|
5505 |
}
|
|
5506 |
}
|
|
5507 |
node_size += node_size & 1;
|
|
5508 |
|
|
5509 |
if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
|
|
5510 |
DPRINTF("not enough room in page %zu, got %u ptrs",
|
|
5511 |
mp->mp_pgno, NUMKEYS(mp));
|
|
5512 |
DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
|
|
5513 |
mp->mp_upper - mp->mp_lower);
|
|
5514 |
DPRINTF("node size = %zu", node_size);
|
|
5515 |
return MDB_PAGE_FULL;
|
|
5516 |
}
|
|
5517 |
|
|
5518 |
/* Move higher pointers up one slot. */
|
|
5519 |
for (i = NUMKEYS(mp); i > indx; i--)
|
|
5520 |
mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
|
|
5521 |
|
|
5522 |
/* Adjust free space offsets. */
|
|
5523 |
ofs = mp->mp_upper - node_size;
|
|
5524 |
assert(ofs >= mp->mp_lower + sizeof(indx_t));
|
|
5525 |
mp->mp_ptrs[indx] = ofs;
|
|
5526 |
mp->mp_upper = ofs;
|
|
5527 |
mp->mp_lower += sizeof(indx_t);
|
|
5528 |
|
|
5529 |
/* Write the node data. */
|
|
5530 |
node = NODEPTR(mp, indx);
|
|
5531 |
node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
|
|
5532 |
node->mn_flags = flags;
|
|
5533 |
if (IS_LEAF(mp))
|
|
5534 |
SETDSZ(node,data->mv_size);
|
|
5535 |
else
|
|
5536 |
SETPGNO(node,pgno);
|
|
5537 |
|
|
5538 |
if (key)
|
|
5539 |
memcpy(NODEKEY(node), key->mv_data, key->mv_size);
|
|
5540 |
|
|
5541 |
if (IS_LEAF(mp)) {
|
|
5542 |
assert(key);
|
|
5543 |
if (ofp == NULL) {
|
|
5544 |
if (F_ISSET(flags, F_BIGDATA))
|
|
5545 |
memcpy(node->mn_data + key->mv_size, data->mv_data,
|
|
5546 |
sizeof(pgno_t));
|
|
5547 |
else if (F_ISSET(flags, MDB_RESERVE))
|
|
5548 |
data->mv_data = node->mn_data + key->mv_size;
|
|
5549 |
else
|
|
5550 |
memcpy(node->mn_data + key->mv_size, data->mv_data,
|
|
5551 |
data->mv_size);
|
|
5552 |
} else {
|
|
5553 |
memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
|
|
5554 |
sizeof(pgno_t));
|
|
5555 |
if (F_ISSET(flags, MDB_RESERVE))
|
|
5556 |
data->mv_data = METADATA(ofp);
|
|
5557 |
else
|
|
5558 |
memcpy(METADATA(ofp), data->mv_data, data->mv_size);
|
|
5559 |
}
|
|
5560 |
}
|
|
5561 |
|
|
5562 |
return MDB_SUCCESS;
|
|
5563 |
}
|
|
5564 |
|
|
5565 |
/** Delete the specified node from a page.
|
|
5566 |
* @param[in] mp The page to operate on.
|
|
5567 |
* @param[in] indx The index of the node to delete.
|
|
5568 |
* @param[in] ksize The size of a node. Only used if the page is
|
|
5569 |
* part of a #MDB_DUPFIXED database.
|
|
5570 |
*/
|
|
5571 |
static void
|
|
5572 |
mdb_node_del(MDB_page *mp, indx_t indx, int ksize)
|
|
5573 |
{
|
|
5574 |
unsigned int sz;
|
|
5575 |
indx_t i, j, numkeys, ptr;
|
|
5576 |
MDB_node *node;
|
|
5577 |
char *base;
|
|
5578 |
|
|
5579 |
#if MDB_DEBUG
|
|
5580 |
{
|
|
5581 |
pgno_t pgno;
|
|
5582 |
COPY_PGNO(pgno, mp->mp_pgno);
|
|
5583 |
DPRINTF("delete node %u on %s page %zu", indx,
|
|
5584 |
IS_LEAF(mp) ? "leaf" : "branch", pgno);
|
|
5585 |
}
|
|
5586 |
#endif
|
|
5587 |
assert(indx < NUMKEYS(mp));
|
|
5588 |
|
|
5589 |
if (IS_LEAF2(mp)) {
|
|
5590 |
int x = NUMKEYS(mp) - 1 - indx;
|
|
5591 |
base = LEAF2KEY(mp, indx, ksize);
|
|
5592 |
if (x)
|
|
5593 |
memmove(base, base + ksize, x * ksize);
|
|
5594 |
mp->mp_lower -= sizeof(indx_t);
|
|
5595 |
mp->mp_upper += ksize - sizeof(indx_t);
|
|
5596 |
return;
|
|
5597 |
}
|
|
5598 |
|
|
5599 |
node = NODEPTR(mp, indx);
|
|
5600 |
sz = NODESIZE + node->mn_ksize;
|
|
5601 |
if (IS_LEAF(mp)) {
|
|
5602 |
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
|
5603 |
sz += sizeof(pgno_t);
|
|
5604 |
else
|
|
5605 |
sz += NODEDSZ(node);
|
|
5606 |
}
|
|
5607 |
sz += sz & 1;
|
|
5608 |
|
|
5609 |
ptr = mp->mp_ptrs[indx];
|
|
5610 |
numkeys = NUMKEYS(mp);
|
|
5611 |
for (i = j = 0; i < numkeys; i++) {
|
|
5612 |
if (i != indx) {
|
|
5613 |
mp->mp_ptrs[j] = mp->mp_ptrs[i];
|
|
5614 |
if (mp->mp_ptrs[i] < ptr)
|
|
5615 |
mp->mp_ptrs[j] += sz;
|
|
5616 |
j++;
|
|
5617 |
}
|
|
5618 |
}
|
|
5619 |
|
|
5620 |
base = (char *)mp + mp->mp_upper;
|
|
5621 |
memmove(base + sz, base, ptr - mp->mp_upper);
|
|
5622 |
|
|
5623 |
mp->mp_lower -= sizeof(indx_t);
|
|
5624 |
mp->mp_upper += sz;
|
|
5625 |
}
|
|
5626 |
|
|
5627 |
/** Compact the main page after deleting a node on a subpage.
|
|
5628 |
* @param[in] mp The main page to operate on.
|
|
5629 |
* @param[in] indx The index of the subpage on the main page.
|
|
5630 |
*/
|
|
5631 |
static void
|
|
5632 |
mdb_node_shrink(MDB_page *mp, indx_t indx)
|
|
5633 |
{
|
|
5634 |
MDB_node *node;
|
|
5635 |
MDB_page *sp, *xp;
|
|
5636 |
char *base;
|
|
5637 |
int osize, nsize;
|
|
5638 |
int delta;
|
|
5639 |
indx_t i, numkeys, ptr;
|
|
5640 |
|
|
5641 |
node = NODEPTR(mp, indx);
|
|
5642 |
sp = (MDB_page *)NODEDATA(node);
|
|
5643 |
osize = NODEDSZ(node);
|
|
5644 |
|
|
5645 |
delta = sp->mp_upper - sp->mp_lower;
|
|
5646 |
SETDSZ(node, osize - delta);
|
|
5647 |
xp = (MDB_page *)((char *)sp + delta);
|
|
5648 |
|
|
5649 |
/* shift subpage upward */
|
|
5650 |
if (IS_LEAF2(sp)) {
|
|
5651 |
nsize = NUMKEYS(sp) * sp->mp_pad;
|
|
5652 |
memmove(METADATA(xp), METADATA(sp), nsize);
|
|
5653 |
} else {
|
|
5654 |
int i;
|
|
5655 |
nsize = osize - sp->mp_upper;
|
|
5656 |
numkeys = NUMKEYS(sp);
|
|
5657 |
for (i=numkeys-1; i>=0; i--)
|
|
5658 |
xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
|
|
5659 |
}
|
|
5660 |
xp->mp_upper = sp->mp_lower;
|
|
5661 |
xp->mp_lower = sp->mp_lower;
|
|
5662 |
xp->mp_flags = sp->mp_flags;
|
|
5663 |
xp->mp_pad = sp->mp_pad;
|
|
5664 |
COPY_PGNO(xp->mp_pgno, mp->mp_pgno);
|
|
5665 |
|
|
5666 |
/* shift lower nodes upward */
|
|
5667 |
ptr = mp->mp_ptrs[indx];
|
|
5668 |
numkeys = NUMKEYS(mp);
|
|
5669 |
for (i = 0; i < numkeys; i++) {
|
|
5670 |
if (mp->mp_ptrs[i] <= ptr)
|
|
5671 |
mp->mp_ptrs[i] += delta;
|
|
5672 |
}
|
|
5673 |
|
|
5674 |
base = (char *)mp + mp->mp_upper;
|
|
5675 |
memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
|
|
5676 |
mp->mp_upper += delta;
|
|
5677 |
}
|
|
5678 |
|
|
5679 |
/** Initial setup of a sorted-dups cursor.
|
|
5680 |
* Sorted duplicates are implemented as a sub-database for the given key.
|
|
5681 |
* The duplicate data items are actually keys of the sub-database.
|
|
5682 |
* Operations on the duplicate data items are performed using a sub-cursor
|
|
5683 |
* initialized when the sub-database is first accessed. This function does
|
|
5684 |
* the preliminary setup of the sub-cursor, filling in the fields that
|
|
5685 |
* depend only on the parent DB.
|
|
5686 |
* @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
|
|
5687 |
*/
|
|
5688 |
static void
|
|
5689 |
mdb_xcursor_init0(MDB_cursor *mc)
|
|
5690 |
{
|
|
5691 |
MDB_xcursor *mx = mc->mc_xcursor;
|
|
5692 |
|
|
5693 |
mx->mx_cursor.mc_xcursor = NULL;
|
|
5694 |
mx->mx_cursor.mc_txn = mc->mc_txn;
|
|
5695 |
mx->mx_cursor.mc_db = &mx->mx_db;
|
|
5696 |
mx->mx_cursor.mc_dbx = &mx->mx_dbx;
|
|
5697 |
mx->mx_cursor.mc_dbi = mc->mc_dbi+1;
|
|
5698 |
mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
|
|
5699 |
mx->mx_cursor.mc_snum = 0;
|
|
5700 |
mx->mx_cursor.mc_top = 0;
|
|
5701 |
mx->mx_cursor.mc_flags = C_SUB;
|
|
5702 |
mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
|
|
5703 |
mx->mx_dbx.md_dcmp = NULL;
|
|
5704 |
mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
|
|
5705 |
}
|
|
5706 |
|
|
5707 |
/** Final setup of a sorted-dups cursor.
|
|
5708 |
* Sets up the fields that depend on the data from the main cursor.
|
|
5709 |
* @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
|
|
5710 |
* @param[in] node The data containing the #MDB_db record for the
|
|
5711 |
* sorted-dup database.
|
|
5712 |
*/
|
|
5713 |
static void
|
|
5714 |
mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
|
|
5715 |
{
|
|
5716 |
MDB_xcursor *mx = mc->mc_xcursor;
|
|
5717 |
|
|
5718 |
if (node->mn_flags & F_SUBDATA) {
|
|
5719 |
memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
|
|
5720 |
mx->mx_cursor.mc_pg[0] = 0;
|
|
5721 |
mx->mx_cursor.mc_snum = 0;
|
|
5722 |
mx->mx_cursor.mc_flags = C_SUB;
|
|
5723 |
} else {
|
|
5724 |
MDB_page *fp = NODEDATA(node);
|
|
5725 |
mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad;
|
|
5726 |
mx->mx_db.md_flags = 0;
|
|
5727 |
mx->mx_db.md_depth = 1;
|
|
5728 |
mx->mx_db.md_branch_pages = 0;
|
|
5729 |
mx->mx_db.md_leaf_pages = 1;
|
|
5730 |
mx->mx_db.md_overflow_pages = 0;
|
|
5731 |
mx->mx_db.md_entries = NUMKEYS(fp);
|
|
5732 |
COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
|
|
5733 |
mx->mx_cursor.mc_snum = 1;
|
|
5734 |
mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
|
|
5735 |
mx->mx_cursor.mc_top = 0;
|
|
5736 |
mx->mx_cursor.mc_pg[0] = fp;
|
|
5737 |
mx->mx_cursor.mc_ki[0] = 0;
|
|
5738 |
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
|
|
5739 |
mx->mx_db.md_flags = MDB_DUPFIXED;
|
|
5740 |
mx->mx_db.md_pad = fp->mp_pad;
|
|
5741 |
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
|
|
5742 |
mx->mx_db.md_flags |= MDB_INTEGERKEY;
|
|
5743 |
}
|
|
5744 |
}
|
|
5745 |
DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi,
|
|
5746 |
mx->mx_db.md_root);
|
|
5747 |
mx->mx_dbflag = DB_VALID | (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY) ?
|
|
5748 |
DB_DIRTY : 0);
|
|
5749 |
mx->mx_dbx.md_name.mv_data = NODEKEY(node);
|
|
5750 |
mx->mx_dbx.md_name.mv_size = node->mn_ksize;
|
|
5751 |
#if UINT_MAX < SIZE_MAX
|
|
5752 |
if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
|
|
5753 |
#ifdef MISALIGNED_OK
|
|
5754 |
mx->mx_dbx.md_cmp = mdb_cmp_long;
|
|
5755 |
#else
|
|
5756 |
mx->mx_dbx.md_cmp = mdb_cmp_cint;
|
|
5757 |
#endif
|
|
5758 |
#endif
|
|
5759 |
}
|
|
5760 |
|
|
5761 |
/** Initialize a cursor for a given transaction and database. */
|
|
5762 |
static void
|
|
5763 |
mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
|
|
5764 |
{
|
|
5765 |
mc->mc_orig = NULL;
|
|
5766 |
mc->mc_dbi = dbi;
|
|
5767 |
mc->mc_txn = txn;
|
|
5768 |
mc->mc_db = &txn->mt_dbs[dbi];
|
|
5769 |
mc->mc_dbx = &txn->mt_dbxs[dbi];
|
|
5770 |
mc->mc_dbflag = &txn->mt_dbflags[dbi];
|
|
5771 |
mc->mc_snum = 0;
|
|
5772 |
mc->mc_top = 0;
|
|
5773 |
mc->mc_pg[0] = 0;
|
|
5774 |
mc->mc_flags = 0;
|
|
5775 |
if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
|
|
5776 |
assert(mx != NULL);
|
|
5777 |
mc->mc_xcursor = mx;
|
|
5778 |
mdb_xcursor_init0(mc);
|
|
5779 |
} else {
|
|
5780 |
mc->mc_xcursor = NULL;
|
|
5781 |
}
|
|
5782 |
if (*mc->mc_dbflag & DB_STALE) {
|
|
5783 |
mdb_page_search(mc, NULL, MDB_PS_ROOTONLY);
|
|
5784 |
}
|
|
5785 |
}
|
|
5786 |
|
|
5787 |
int
|
|
5788 |
mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
|
|
5789 |
{
|
|
5790 |
MDB_cursor *mc;
|
|
5791 |
MDB_xcursor *mx = NULL;
|
|
5792 |
size_t size = sizeof(MDB_cursor);
|
|
5793 |
|
|
5794 |
if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
|
|
5795 |
return EINVAL;
|
|
5796 |
|
|
5797 |
/* Allow read access to the freelist */
|
|
5798 |
if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
|
|
5799 |
return EINVAL;
|
|
5800 |
|
|
5801 |
if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
|
|
5802 |
size += sizeof(MDB_xcursor);
|
|
5803 |
|
|
5804 |
if ((mc = malloc(size)) != NULL) {
|
|
5805 |
if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
|
|
5806 |
mx = (MDB_xcursor *)(mc + 1);
|
|
5807 |
}
|
|
5808 |
mdb_cursor_init(mc, txn, dbi, mx);
|
|
5809 |
if (txn->mt_cursors) {
|
|
5810 |
mc->mc_next = txn->mt_cursors[dbi];
|
|
5811 |
txn->mt_cursors[dbi] = mc;
|
|
5812 |
}
|
|
5813 |
mc->mc_flags |= C_ALLOCD;
|
|
5814 |
} else {
|
|
5815 |
return ENOMEM;
|
|
5816 |
}
|
|
5817 |
|
|
5818 |
*ret = mc;
|
|
5819 |
|
|
5820 |
return MDB_SUCCESS;
|
|
5821 |
}
|
|
5822 |
|
|
5823 |
int
|
|
5824 |
mdb_cursor_renew(MDB_txn *txn, MDB_cursor *mc)
|
|
5825 |
{
|
|
5826 |
if (txn == NULL || mc == NULL || mc->mc_dbi >= txn->mt_numdbs)
|
|
5827 |
return EINVAL;
|
|
5828 |
|
|
5829 |
if (txn->mt_cursors)
|
|
5830 |
return EINVAL;
|
|
5831 |
|
|
5832 |
mdb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
|
|
5833 |
return MDB_SUCCESS;
|
|
5834 |
}
|
|
5835 |
|
|
5836 |
/* Return the count of duplicate data items for the current key */
|
|
5837 |
int
|
|
5838 |
mdb_cursor_count(MDB_cursor *mc, size_t *countp)
|
|
5839 |
{
|
|
5840 |
MDB_node *leaf;
|
|
5841 |
|
|
5842 |
if (mc == NULL || countp == NULL)
|
|
5843 |
return EINVAL;
|
|
5844 |
|
|
5845 |
if (!(mc->mc_db->md_flags & MDB_DUPSORT))
|
|
5846 |
return EINVAL;
|
|
5847 |
|
|
5848 |
leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
|
|
5849 |
if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
|
|
5850 |
*countp = 1;
|
|
5851 |
} else {
|
|
5852 |
if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
|
|
5853 |
return EINVAL;
|
|
5854 |
|
|
5855 |
*countp = mc->mc_xcursor->mx_db.md_entries;
|
|
5856 |
}
|
|
5857 |
return MDB_SUCCESS;
|
|
5858 |
}
|
|
5859 |
|
|
5860 |
void
|
|
5861 |
mdb_cursor_close(MDB_cursor *mc)
|
|
5862 |
{
|
|
5863 |
if (mc != NULL) {
|
|
5864 |
/* remove from txn, if tracked */
|
|
5865 |
if (mc->mc_txn->mt_cursors) {
|
|
5866 |
MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
|
|
5867 |
while (*prev && *prev != mc) prev = &(*prev)->mc_next;
|
|
5868 |
if (*prev == mc)
|
|
5869 |
*prev = mc->mc_next;
|
|
5870 |
}
|
|
5871 |
if (mc->mc_flags & C_ALLOCD)
|
|
5872 |
free(mc);
|
|
5873 |
}
|
|
5874 |
}
|
|
5875 |
|
|
5876 |
MDB_txn *
|
|
5877 |
mdb_cursor_txn(MDB_cursor *mc)
|
|
5878 |
{
|
|
5879 |
if (!mc) return NULL;
|
|
5880 |
return mc->mc_txn;
|
|
5881 |
}
|
|
5882 |
|
|
5883 |
MDB_dbi
|
|
5884 |
mdb_cursor_dbi(MDB_cursor *mc)
|
|
5885 |
{
|
|
5886 |
assert(mc != NULL);
|
|
5887 |
return mc->mc_dbi;
|
|
5888 |
}
|
|
5889 |
|
|
5890 |
/** Replace the key for a node with a new key.
|
|
5891 |
* @param[in] mp The page containing the node to operate on.
|
|
5892 |
* @param[in] indx The index of the node to operate on.
|
|
5893 |
* @param[in] key The new key to use.
|
|
5894 |
* @return 0 on success, non-zero on failure.
|
|
5895 |
*/
|
|
5896 |
static int
|
|
5897 |
mdb_update_key(MDB_cursor *mc, MDB_val *key)
|
|
5898 |
{
|
|
5899 |
MDB_page *mp;
|
|
5900 |
MDB_node *node;
|
|
5901 |
char *base;
|
|
5902 |
size_t len;
|
|
5903 |
int delta, delta0;
|
|
5904 |
indx_t ptr, i, numkeys, indx;
|
|
5905 |
DKBUF;
|
|
5906 |
|
|
5907 |
indx = mc->mc_ki[mc->mc_top];
|
|
5908 |
mp = mc->mc_pg[mc->mc_top];
|
|
5909 |
node = NODEPTR(mp, indx);
|
|
5910 |
ptr = mp->mp_ptrs[indx];
|
|
5911 |
#if MDB_DEBUG
|
|
5912 |
{
|
|
5913 |
MDB_val k2;
|
|
5914 |
char kbuf2[(MDB_MAXKEYSIZE*2+1)];
|
|
5915 |
k2.mv_data = NODEKEY(node);
|
|
5916 |
k2.mv_size = node->mn_ksize;
|
|
5917 |
DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu",
|
|
5918 |
indx, ptr,
|
|
5919 |
mdb_dkey(&k2, kbuf2),
|
|
5920 |
DKEY(key),
|
|
5921 |
mp->mp_pgno);
|
|
5922 |
}
|
|
5923 |
#endif
|
|
5924 |
|
|
5925 |
delta0 = delta = key->mv_size - node->mn_ksize;
|
|
5926 |
|
|
5927 |
/* Must be 2-byte aligned. If new key is
|
|
5928 |
* shorter by 1, the shift will be skipped.
|
|
5929 |
*/
|
|
5930 |
delta += (delta & 1);
|
|
5931 |
if (delta) {
|
|
5932 |
if (delta > 0 && SIZELEFT(mp) < delta) {
|
|
5933 |
pgno_t pgno;
|
|
5934 |
/* not enough space left, do a delete and split */
|
|
5935 |
DPRINTF("Not enough room, delta = %d, splitting...", delta);
|
|
5936 |
pgno = NODEPGNO(node);
|
|
5937 |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
|
|
5938 |
return mdb_page_split(mc, key, NULL, pgno, MDB_SPLIT_REPLACE);
|
|
5939 |
}
|
|
5940 |
|
|
5941 |
numkeys = NUMKEYS(mp);
|
|
5942 |
for (i = 0; i < numkeys; i++) {
|
|
5943 |
if (mp->mp_ptrs[i] <= ptr)
|
|
5944 |
mp->mp_ptrs[i] -= delta;
|
|
5945 |
}
|
|
5946 |
|
|
5947 |
base = (char *)mp + mp->mp_upper;
|
|
5948 |
len = ptr - mp->mp_upper + NODESIZE;
|
|
5949 |
memmove(base - delta, base, len);
|
|
5950 |
mp->mp_upper -= delta;
|
|
5951 |
|
|
5952 |
node = NODEPTR(mp, indx);
|
|
5953 |
}
|
|
5954 |
|
|
5955 |
/* But even if no shift was needed, update ksize */
|
|
5956 |
if (delta0)
|
|
5957 |
node->mn_ksize = key->mv_size;
|
|
5958 |
|
|
5959 |
if (key->mv_size)
|
|
5960 |
memcpy(NODEKEY(node), key->mv_data, key->mv_size);
|
|
5961 |
|
|
5962 |
return MDB_SUCCESS;
|
|
5963 |
}
|
|
5964 |
|
|
5965 |
static void
|
|
5966 |
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
|
|
5967 |
|
|
5968 |
/** Move a node from csrc to cdst.
|
|
5969 |
*/
|
|
5970 |
static int
|
|
5971 |
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
5972 |
{
|
|
5973 |
MDB_node *srcnode;
|
|
5974 |
MDB_val key, data;
|
|
5975 |
pgno_t srcpg;
|
|
5976 |
MDB_cursor mn;
|
|
5977 |
int rc;
|
|
5978 |
unsigned short flags;
|
|
5979 |
|
|
5980 |
DKBUF;
|
|
5981 |
|
|
5982 |
/* Mark src and dst as dirty. */
|
|
5983 |
if ((rc = mdb_page_touch(csrc)) ||
|
|
5984 |
(rc = mdb_page_touch(cdst)))
|
|
5985 |
return rc;
|
|
5986 |
|
|
5987 |
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
|
|
5988 |
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); /* fake */
|
|
5989 |
key.mv_size = csrc->mc_db->md_pad;
|
|
5990 |
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
|
|
5991 |
data.mv_size = 0;
|
|
5992 |
data.mv_data = NULL;
|
|
5993 |
srcpg = 0;
|
|
5994 |
flags = 0;
|
|
5995 |
} else {
|
|
5996 |
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
|
|
5997 |
assert(!((long)srcnode&1));
|
|
5998 |
srcpg = NODEPGNO(srcnode);
|
|
5999 |
flags = srcnode->mn_flags;
|
|
6000 |
if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
|
|
6001 |
unsigned int snum = csrc->mc_snum;
|
|
6002 |
MDB_node *s2;
|
|
6003 |
/* must find the lowest key below src */
|
|
6004 |
mdb_page_search_root(csrc, NULL, 0);
|
|
6005 |
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
|
|
6006 |
key.mv_size = csrc->mc_db->md_pad;
|
|
6007 |
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
|
|
6008 |
} else {
|
|
6009 |
s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
|
|
6010 |
key.mv_size = NODEKSZ(s2);
|
|
6011 |
key.mv_data = NODEKEY(s2);
|
|
6012 |
}
|
|
6013 |
csrc->mc_snum = snum--;
|
|
6014 |
csrc->mc_top = snum;
|
|
6015 |
} else {
|
|
6016 |
key.mv_size = NODEKSZ(srcnode);
|
|
6017 |
key.mv_data = NODEKEY(srcnode);
|
|
6018 |
}
|
|
6019 |
data.mv_size = NODEDSZ(srcnode);
|
|
6020 |
data.mv_data = NODEDATA(srcnode);
|
|
6021 |
}
|
|
6022 |
if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
|
|
6023 |
unsigned int snum = cdst->mc_snum;
|
|
6024 |
MDB_node *s2;
|
|
6025 |
MDB_val bkey;
|
|
6026 |
/* must find the lowest key below dst */
|
|
6027 |
mdb_page_search_root(cdst, NULL, 0);
|
|
6028 |
if (IS_LEAF2(cdst->mc_pg[cdst->mc_top])) {
|
|
6029 |
bkey.mv_size = cdst->mc_db->md_pad;
|
|
6030 |
bkey.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, bkey.mv_size);
|
|
6031 |
} else {
|
|
6032 |
s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
|
|
6033 |
bkey.mv_size = NODEKSZ(s2);
|
|
6034 |
bkey.mv_data = NODEKEY(s2);
|
|
6035 |
}
|
|
6036 |
cdst->mc_snum = snum--;
|
|
6037 |
cdst->mc_top = snum;
|
|
6038 |
mdb_cursor_copy(cdst, &mn);
|
|
6039 |
mn.mc_ki[snum] = 0;
|
|
6040 |
rc = mdb_update_key(&mn, &bkey);
|
|
6041 |
if (rc)
|
|
6042 |
return rc;
|
|
6043 |
}
|
|
6044 |
|
|
6045 |
DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
|
|
6046 |
IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
|
|
6047 |
csrc->mc_ki[csrc->mc_top],
|
|
6048 |
DKEY(&key),
|
|
6049 |
csrc->mc_pg[csrc->mc_top]->mp_pgno,
|
|
6050 |
cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno);
|
|
6051 |
|
|
6052 |
/* Add the node to the destination page.
|
|
6053 |
*/
|
|
6054 |
rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
|
|
6055 |
if (rc != MDB_SUCCESS)
|
|
6056 |
return rc;
|
|
6057 |
|
|
6058 |
/* Delete the node from the source page.
|
|
6059 |
*/
|
|
6060 |
mdb_node_del(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
|
|
6061 |
|
|
6062 |
{
|
|
6063 |
/* Adjust other cursors pointing to mp */
|
|
6064 |
MDB_cursor *m2, *m3;
|
|
6065 |
MDB_dbi dbi = csrc->mc_dbi;
|
|
6066 |
MDB_page *mp = csrc->mc_pg[csrc->mc_top];
|
|
6067 |
|
|
6068 |
if (csrc->mc_flags & C_SUB)
|
|
6069 |
dbi--;
|
|
6070 |
|
|
6071 |
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
6072 |
if (m2 == csrc) continue;
|
|
6073 |
if (csrc->mc_flags & C_SUB)
|
|
6074 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
6075 |
else
|
|
6076 |
m3 = m2;
|
|
6077 |
if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
|
|
6078 |
csrc->mc_ki[csrc->mc_top]) {
|
|
6079 |
m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
|
|
6080 |
m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
|
|
6081 |
}
|
|
6082 |
}
|
|
6083 |
}
|
|
6084 |
|
|
6085 |
/* Update the parent separators.
|
|
6086 |
*/
|
|
6087 |
if (csrc->mc_ki[csrc->mc_top] == 0) {
|
|
6088 |
if (csrc->mc_ki[csrc->mc_top-1] != 0) {
|
|
6089 |
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
|
|
6090 |
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
|
|
6091 |
} else {
|
|
6092 |
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
|
|
6093 |
key.mv_size = NODEKSZ(srcnode);
|
|
6094 |
key.mv_data = NODEKEY(srcnode);
|
|
6095 |
}
|
|
6096 |
DPRINTF("update separator for source page %zu to [%s]",
|
|
6097 |
csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key));
|
|
6098 |
mdb_cursor_copy(csrc, &mn);
|
|
6099 |
mn.mc_snum--;
|
|
6100 |
mn.mc_top--;
|
|
6101 |
if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
|
|
6102 |
return rc;
|
|
6103 |
}
|
|
6104 |
if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
|
|
6105 |
MDB_val nullkey;
|
|
6106 |
indx_t ix = csrc->mc_ki[csrc->mc_top];
|
|
6107 |
nullkey.mv_size = 0;
|
|
6108 |
csrc->mc_ki[csrc->mc_top] = 0;
|
|
6109 |
rc = mdb_update_key(csrc, &nullkey);
|
|
6110 |
csrc->mc_ki[csrc->mc_top] = ix;
|
|
6111 |
assert(rc == MDB_SUCCESS);
|
|
6112 |
}
|
|
6113 |
}
|
|
6114 |
|
|
6115 |
if (cdst->mc_ki[cdst->mc_top] == 0) {
|
|
6116 |
if (cdst->mc_ki[cdst->mc_top-1] != 0) {
|
|
6117 |
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
|
|
6118 |
key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
|
|
6119 |
} else {
|
|
6120 |
srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
|
|
6121 |
key.mv_size = NODEKSZ(srcnode);
|
|
6122 |
key.mv_data = NODEKEY(srcnode);
|
|
6123 |
}
|
|
6124 |
DPRINTF("update separator for destination page %zu to [%s]",
|
|
6125 |
cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key));
|
|
6126 |
mdb_cursor_copy(cdst, &mn);
|
|
6127 |
mn.mc_snum--;
|
|
6128 |
mn.mc_top--;
|
|
6129 |
if ((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)
|
|
6130 |
return rc;
|
|
6131 |
}
|
|
6132 |
if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
|
|
6133 |
MDB_val nullkey;
|
|
6134 |
indx_t ix = cdst->mc_ki[cdst->mc_top];
|
|
6135 |
nullkey.mv_size = 0;
|
|
6136 |
cdst->mc_ki[cdst->mc_top] = 0;
|
|
6137 |
rc = mdb_update_key(cdst, &nullkey);
|
|
6138 |
cdst->mc_ki[cdst->mc_top] = ix;
|
|
6139 |
assert(rc == MDB_SUCCESS);
|
|
6140 |
}
|
|
6141 |
}
|
|
6142 |
|
|
6143 |
return MDB_SUCCESS;
|
|
6144 |
}
|
|
6145 |
|
|
6146 |
/** Merge one page into another.
|
|
6147 |
* The nodes from the page pointed to by \b csrc will
|
|
6148 |
* be copied to the page pointed to by \b cdst and then
|
|
6149 |
* the \b csrc page will be freed.
|
|
6150 |
* @param[in] csrc Cursor pointing to the source page.
|
|
6151 |
* @param[in] cdst Cursor pointing to the destination page.
|
|
6152 |
*/
|
|
6153 |
static int
|
|
6154 |
mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
|
|
6155 |
{
|
|
6156 |
int rc;
|
|
6157 |
indx_t i, j;
|
|
6158 |
MDB_node *srcnode;
|
|
6159 |
MDB_val key, data;
|
|
6160 |
unsigned nkeys;
|
|
6161 |
|
|
6162 |
DPRINTF("merging page %zu into %zu", csrc->mc_pg[csrc->mc_top]->mp_pgno,
|
|
6163 |
cdst->mc_pg[cdst->mc_top]->mp_pgno);
|
|
6164 |
|
|
6165 |
assert(csrc->mc_snum > 1); /* can't merge root page */
|
|
6166 |
assert(cdst->mc_snum > 1);
|
|
6167 |
|
|
6168 |
/* Mark dst as dirty. */
|
|
6169 |
if ((rc = mdb_page_touch(cdst)))
|
|
6170 |
return rc;
|
|
6171 |
|
|
6172 |
/* Move all nodes from src to dst.
|
|
6173 |
*/
|
|
6174 |
j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
|
|
6175 |
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
|
|
6176 |
key.mv_size = csrc->mc_db->md_pad;
|
|
6177 |
key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
|
|
6178 |
for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
|
|
6179 |
rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
|
|
6180 |
if (rc != MDB_SUCCESS)
|
|
6181 |
return rc;
|
|
6182 |
key.mv_data = (char *)key.mv_data + key.mv_size;
|
|
6183 |
}
|
|
6184 |
} else {
|
|
6185 |
for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
|
|
6186 |
srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
|
|
6187 |
if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
|
|
6188 |
unsigned int snum = csrc->mc_snum;
|
|
6189 |
MDB_node *s2;
|
|
6190 |
/* must find the lowest key below src */
|
|
6191 |
mdb_page_search_root(csrc, NULL, 0);
|
|
6192 |
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
|
|
6193 |
key.mv_size = csrc->mc_db->md_pad;
|
|
6194 |
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
|
|
6195 |
} else {
|
|
6196 |
s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
|
|
6197 |
key.mv_size = NODEKSZ(s2);
|
|
6198 |
key.mv_data = NODEKEY(s2);
|
|
6199 |
}
|
|
6200 |
csrc->mc_snum = snum--;
|
|
6201 |
csrc->mc_top = snum;
|
|
6202 |
} else {
|
|
6203 |
key.mv_size = srcnode->mn_ksize;
|
|
6204 |
key.mv_data = NODEKEY(srcnode);
|
|
6205 |
}
|
|
6206 |
|
|
6207 |
data.mv_size = NODEDSZ(srcnode);
|
|
6208 |
data.mv_data = NODEDATA(srcnode);
|
|
6209 |
rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
|
|
6210 |
if (rc != MDB_SUCCESS)
|
|
6211 |
return rc;
|
|
6212 |
}
|
|
6213 |
}
|
|
6214 |
|
|
6215 |
DPRINTF("dst page %zu now has %u keys (%.1f%% filled)",
|
|
6216 |
cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10);
|
|
6217 |
|
|
6218 |
/* Unlink the src page from parent and add to free list.
|
|
6219 |
*/
|
|
6220 |
mdb_node_del(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1], 0);
|
|
6221 |
if (csrc->mc_ki[csrc->mc_top-1] == 0) {
|
|
6222 |
key.mv_size = 0;
|
|
6223 |
csrc->mc_top--;
|
|
6224 |
rc = mdb_update_key(csrc, &key);
|
|
6225 |
csrc->mc_top++;
|
|
6226 |
if (rc)
|
|
6227 |
return rc;
|
|
6228 |
}
|
|
6229 |
|
|
6230 |
mdb_midl_append(&csrc->mc_txn->mt_free_pgs, csrc->mc_pg[csrc->mc_top]->mp_pgno);
|
|
6231 |
if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
|
|
6232 |
csrc->mc_db->md_leaf_pages--;
|
|
6233 |
else
|
|
6234 |
csrc->mc_db->md_branch_pages--;
|
|
6235 |
{
|
|
6236 |
/* Adjust other cursors pointing to mp */
|
|
6237 |
MDB_cursor *m2, *m3;
|
|
6238 |
MDB_dbi dbi = csrc->mc_dbi;
|
|
6239 |
MDB_page *mp = cdst->mc_pg[cdst->mc_top];
|
|
6240 |
|
|
6241 |
if (csrc->mc_flags & C_SUB)
|
|
6242 |
dbi--;
|
|
6243 |
|
|
6244 |
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
6245 |
if (csrc->mc_flags & C_SUB)
|
|
6246 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
6247 |
else
|
|
6248 |
m3 = m2;
|
|
6249 |
if (m3 == csrc) continue;
|
|
6250 |
if (m3->mc_snum < csrc->mc_snum) continue;
|
|
6251 |
if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
|
|
6252 |
m3->mc_pg[csrc->mc_top] = mp;
|
|
6253 |
m3->mc_ki[csrc->mc_top] += nkeys;
|
|
6254 |
}
|
|
6255 |
}
|
|
6256 |
}
|
|
6257 |
mdb_cursor_pop(csrc);
|
|
6258 |
|
|
6259 |
return mdb_rebalance(csrc);
|
|
6260 |
}
|
|
6261 |
|
|
6262 |
/** Copy the contents of a cursor.
|
|
6263 |
* @param[in] csrc The cursor to copy from.
|
|
6264 |
* @param[out] cdst The cursor to copy to.
|
|
6265 |
*/
|
|
6266 |
static void
|
|
6267 |
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
|
|
6268 |
{
|
|
6269 |
unsigned int i;
|
|
6270 |
|
|
6271 |
cdst->mc_txn = csrc->mc_txn;
|
|
6272 |
cdst->mc_dbi = csrc->mc_dbi;
|
|
6273 |
cdst->mc_db = csrc->mc_db;
|
|
6274 |
cdst->mc_dbx = csrc->mc_dbx;
|
|
6275 |
cdst->mc_snum = csrc->mc_snum;
|
|
6276 |
cdst->mc_top = csrc->mc_top;
|
|
6277 |
cdst->mc_flags = csrc->mc_flags;
|
|
6278 |
|
|
6279 |
for (i=0; i<csrc->mc_snum; i++) {
|
|
6280 |
cdst->mc_pg[i] = csrc->mc_pg[i];
|
|
6281 |
cdst->mc_ki[i] = csrc->mc_ki[i];
|
|
6282 |
}
|
|
6283 |
}
|
|
6284 |
|
|
6285 |
/** Rebalance the tree after a delete operation.
|
|
6286 |
* @param[in] mc Cursor pointing to the page where rebalancing
|
|
6287 |
* should begin.
|
|
6288 |
* @return 0 on success, non-zero on failure.
|
|
6289 |
*/
|
|
6290 |
static int
|
|
6291 |
mdb_rebalance(MDB_cursor *mc)
|
|
6292 |
{
|
|
6293 |
MDB_node *node;
|
|
6294 |
int rc;
|
|
6295 |
unsigned int ptop, minkeys;
|
|
6296 |
MDB_cursor mn;
|
|
6297 |
|
|
6298 |
minkeys = 1 + (IS_BRANCH(mc->mc_pg[mc->mc_top]));
|
|
6299 |
#if MDB_DEBUG
|
|
6300 |
{
|
|
6301 |
pgno_t pgno;
|
|
6302 |
COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
|
|
6303 |
DPRINTF("rebalancing %s page %zu (has %u keys, %.1f%% full)",
|
|
6304 |
IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
|
|
6305 |
pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10);
|
|
6306 |
}
|
|
6307 |
#endif
|
|
6308 |
|
|
6309 |
if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD &&
|
|
6310 |
NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
|
|
6311 |
#if MDB_DEBUG
|
|
6312 |
pgno_t pgno;
|
|
6313 |
COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
|
|
6314 |
DPRINTF("no need to rebalance page %zu, above fill threshold",
|
|
6315 |
pgno);
|
|
6316 |
#endif
|
|
6317 |
return MDB_SUCCESS;
|
|
6318 |
}
|
|
6319 |
|
|
6320 |
if (mc->mc_snum < 2) {
|
|
6321 |
MDB_page *mp = mc->mc_pg[0];
|
|
6322 |
if (NUMKEYS(mp) == 0) {
|
|
6323 |
DPUTS("tree is completely empty");
|
|
6324 |
mc->mc_db->md_root = P_INVALID;
|
|
6325 |
mc->mc_db->md_depth = 0;
|
|
6326 |
mc->mc_db->md_leaf_pages = 0;
|
|
6327 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
|
|
6328 |
mc->mc_snum = 0;
|
|
6329 |
mc->mc_top = 0;
|
|
6330 |
{
|
|
6331 |
/* Adjust other cursors pointing to mp */
|
|
6332 |
MDB_cursor *m2, *m3;
|
|
6333 |
MDB_dbi dbi = mc->mc_dbi;
|
|
6334 |
|
|
6335 |
if (mc->mc_flags & C_SUB)
|
|
6336 |
dbi--;
|
|
6337 |
|
|
6338 |
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
6339 |
if (m2 == mc) continue;
|
|
6340 |
if (mc->mc_flags & C_SUB)
|
|
6341 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
6342 |
else
|
|
6343 |
m3 = m2;
|
|
6344 |
if (m3->mc_snum < mc->mc_snum) continue;
|
|
6345 |
if (m3->mc_pg[0] == mp) {
|
|
6346 |
m3->mc_snum = 0;
|
|
6347 |
m3->mc_top = 0;
|
|
6348 |
}
|
|
6349 |
}
|
|
6350 |
}
|
|
6351 |
} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
|
|
6352 |
DPUTS("collapsing root page!");
|
|
6353 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
|
|
6354 |
mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
|
|
6355 |
if ((rc = mdb_page_get(mc->mc_txn, mc->mc_db->md_root,
|
|
6356 |
&mc->mc_pg[0])))
|
|
6357 |
return rc;
|
|
6358 |
mc->mc_db->md_depth--;
|
|
6359 |
mc->mc_db->md_branch_pages--;
|
|
6360 |
{
|
|
6361 |
/* Adjust other cursors pointing to mp */
|
|
6362 |
MDB_cursor *m2, *m3;
|
|
6363 |
MDB_dbi dbi = mc->mc_dbi;
|
|
6364 |
|
|
6365 |
if (mc->mc_flags & C_SUB)
|
|
6366 |
dbi--;
|
|
6367 |
|
|
6368 |
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
6369 |
if (m2 == mc) continue;
|
|
6370 |
if (mc->mc_flags & C_SUB)
|
|
6371 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
6372 |
else
|
|
6373 |
m3 = m2;
|
|
6374 |
if (m3->mc_snum < mc->mc_snum) continue;
|
|
6375 |
if (m3->mc_pg[0] == mp) {
|
|
6376 |
m3->mc_pg[0] = mc->mc_pg[0];
|
|
6377 |
m3->mc_snum = 1;
|
|
6378 |
m3->mc_top = 0;
|
|
6379 |
}
|
|
6380 |
}
|
|
6381 |
}
|
|
6382 |
} else
|
|
6383 |
DPUTS("root page doesn't need rebalancing");
|
|
6384 |
return MDB_SUCCESS;
|
|
6385 |
}
|
|
6386 |
|
|
6387 |
/* The parent (branch page) must have at least 2 pointers,
|
|
6388 |
* otherwise the tree is invalid.
|
|
6389 |
*/
|
|
6390 |
ptop = mc->mc_top-1;
|
|
6391 |
assert(NUMKEYS(mc->mc_pg[ptop]) > 1);
|
|
6392 |
|
|
6393 |
/* Leaf page fill factor is below the threshold.
|
|
6394 |
* Try to move keys from left or right neighbor, or
|
|
6395 |
* merge with a neighbor page.
|
|
6396 |
*/
|
|
6397 |
|
|
6398 |
/* Find neighbors.
|
|
6399 |
*/
|
|
6400 |
mdb_cursor_copy(mc, &mn);
|
|
6401 |
mn.mc_xcursor = NULL;
|
|
6402 |
|
|
6403 |
if (mc->mc_ki[ptop] == 0) {
|
|
6404 |
/* We're the leftmost leaf in our parent.
|
|
6405 |
*/
|
|
6406 |
DPUTS("reading right neighbor");
|
|
6407 |
mn.mc_ki[ptop]++;
|
|
6408 |
node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
|
|
6409 |
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top])))
|
|
6410 |
return rc;
|
|
6411 |
mn.mc_ki[mn.mc_top] = 0;
|
|
6412 |
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
|
|
6413 |
} else {
|
|
6414 |
/* There is at least one neighbor to the left.
|
|
6415 |
*/
|
|
6416 |
DPUTS("reading left neighbor");
|
|
6417 |
mn.mc_ki[ptop]--;
|
|
6418 |
node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
|
|
6419 |
if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top])))
|
|
6420 |
return rc;
|
|
6421 |
mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
|
|
6422 |
mc->mc_ki[mc->mc_top] = 0;
|
|
6423 |
}
|
|
6424 |
|
|
6425 |
DPRINTF("found neighbor page %zu (%u keys, %.1f%% full)",
|
|
6426 |
mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10);
|
|
6427 |
|
|
6428 |
/* If the neighbor page is above threshold and has enough keys,
|
|
6429 |
* move one key from it. Otherwise we should try to merge them.
|
|
6430 |
* (A branch page must never have less than 2 keys.)
|
|
6431 |
*/
|
|
6432 |
minkeys = 1 + (IS_BRANCH(mn.mc_pg[mn.mc_top]));
|
|
6433 |
if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys)
|
|
6434 |
return mdb_node_move(&mn, mc);
|
|
6435 |
else {
|
|
6436 |
if (mc->mc_ki[ptop] == 0)
|
|
6437 |
rc = mdb_page_merge(&mn, mc);
|
|
6438 |
else
|
|
6439 |
rc = mdb_page_merge(mc, &mn);
|
|
6440 |
mc->mc_flags &= ~C_INITIALIZED;
|
|
6441 |
}
|
|
6442 |
return rc;
|
|
6443 |
}
|
|
6444 |
|
|
6445 |
/** Complete a delete operation started by #mdb_cursor_del(). */
|
|
6446 |
static int
|
|
6447 |
mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
|
|
6448 |
{
|
|
6449 |
int rc;
|
|
6450 |
|
|
6451 |
/* add overflow pages to free list */
|
|
6452 |
if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) {
|
|
6453 |
int i, ovpages;
|
|
6454 |
pgno_t pg;
|
|
6455 |
|
|
6456 |
memcpy(&pg, NODEDATA(leaf), sizeof(pg));
|
|
6457 |
ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
|
|
6458 |
mc->mc_db->md_overflow_pages -= ovpages;
|
|
6459 |
for (i=0; i<ovpages; i++) {
|
|
6460 |
DPRINTF("freed ov page %zu", pg);
|
|
6461 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
|
|
6462 |
pg++;
|
|
6463 |
}
|
|
6464 |
}
|
|
6465 |
mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad);
|
|
6466 |
mc->mc_db->md_entries--;
|
|
6467 |
rc = mdb_rebalance(mc);
|
|
6468 |
if (rc != MDB_SUCCESS)
|
|
6469 |
mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
|
|
6470 |
/* if mc points past last node in page, invalidate */
|
|
6471 |
else if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
|
|
6472 |
mc->mc_flags &= ~C_INITIALIZED;
|
|
6473 |
|
|
6474 |
return rc;
|
|
6475 |
}
|
|
6476 |
|
|
6477 |
int
|
|
6478 |
mdb_del(MDB_txn *txn, MDB_dbi dbi,
|
|
6479 |
MDB_val *key, MDB_val *data)
|
|
6480 |
{
|
|
6481 |
MDB_cursor mc;
|
|
6482 |
MDB_xcursor mx;
|
|
6483 |
MDB_cursor_op op;
|
|
6484 |
MDB_val rdata, *xdata;
|
|
6485 |
int rc, exact;
|
|
6486 |
DKBUF;
|
|
6487 |
|
|
6488 |
assert(key != NULL);
|
|
6489 |
|
|
6490 |
DPRINTF("====> delete db %u key [%s]", dbi, DKEY(key));
|
|
6491 |
|
|
6492 |
if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
|
|
6493 |
return EINVAL;
|
|
6494 |
|
|
6495 |
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
|
|
6496 |
return EACCES;
|
|
6497 |
}
|
|
6498 |
|
|
6499 |
if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
|
|
6500 |
return EINVAL;
|
|
6501 |
}
|
|
6502 |
|
|
6503 |
mdb_cursor_init(&mc, txn, dbi, &mx);
|
|
6504 |
|
|
6505 |
exact = 0;
|
|
6506 |
if (data) {
|
|
6507 |
op = MDB_GET_BOTH;
|
|
6508 |
rdata = *data;
|
|
6509 |
xdata = &rdata;
|
|
6510 |
} else {
|
|
6511 |
op = MDB_SET;
|
|
6512 |
xdata = NULL;
|
|
6513 |
}
|
|
6514 |
rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
|
|
6515 |
if (rc == 0) {
|
|
6516 |
/* let mdb_page_split know about this cursor if needed:
|
|
6517 |
* delete will trigger a rebalance; if it needs to move
|
|
6518 |
* a node from one page to another, it will have to
|
|
6519 |
* update the parent's separator key(s). If the new sepkey
|
|
6520 |
* is larger than the current one, the parent page may
|
|
6521 |
* run out of space, triggering a split. We need this
|
|
6522 |
* cursor to be consistent until the end of the rebalance.
|
|
6523 |
*/
|
|
6524 |
mc.mc_next = txn->mt_cursors[dbi];
|
|
6525 |
txn->mt_cursors[dbi] = &mc;
|
|
6526 |
rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA);
|
|
6527 |
txn->mt_cursors[dbi] = mc.mc_next;
|
|
6528 |
}
|
|
6529 |
return rc;
|
|
6530 |
}
|
|
6531 |
|
|
6532 |
/** Split a page and insert a new node.
|
|
6533 |
* @param[in,out] mc Cursor pointing to the page and desired insertion index.
|
|
6534 |
* The cursor will be updated to point to the actual page and index where
|
|
6535 |
* the node got inserted after the split.
|
|
6536 |
* @param[in] newkey The key for the newly inserted node.
|
|
6537 |
* @param[in] newdata The data for the newly inserted node.
|
|
6538 |
* @param[in] newpgno The page number, if the new node is a branch node.
|
|
6539 |
* @param[in] nflags The #NODE_ADD_FLAGS for the new node.
|
|
6540 |
* @return 0 on success, non-zero on failure.
|
|
6541 |
*/
|
|
6542 |
static int
|
|
6543 |
mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
|
|
6544 |
unsigned int nflags)
|
|
6545 |
{
|
|
6546 |
unsigned int flags;
|
|
6547 |
int rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1, did_split = 0;
|
|
6548 |
indx_t newindx;
|
|
6549 |
pgno_t pgno = 0;
|
|
6550 |
unsigned int i, j, split_indx, nkeys, pmax;
|
|
6551 |
MDB_node *node;
|
|
6552 |
MDB_val sepkey, rkey, xdata, *rdata = &xdata;
|
|
6553 |
MDB_page *copy;
|
|
6554 |
MDB_page *mp, *rp, *pp;
|
|
6555 |
unsigned int ptop;
|
|
6556 |
MDB_cursor mn;
|
|
6557 |
DKBUF;
|
|
6558 |
|
|
6559 |
mp = mc->mc_pg[mc->mc_top];
|
|
6560 |
newindx = mc->mc_ki[mc->mc_top];
|
|
6561 |
|
|
6562 |
DPRINTF("-----> splitting %s page %zu and adding [%s] at index %i",
|
|
6563 |
IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
|
|
6564 |
DKEY(newkey), mc->mc_ki[mc->mc_top]);
|
|
6565 |
|
|
6566 |
/* Create a right sibling. */
|
|
6567 |
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
|
|
6568 |
return rc;
|
|
6569 |
DPRINTF("new right sibling: page %zu", rp->mp_pgno);
|
|
6570 |
|
|
6571 |
if (mc->mc_snum < 2) {
|
|
6572 |
if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
|
|
6573 |
return rc;
|
|
6574 |
/* shift current top to make room for new parent */
|
|
6575 |
mc->mc_pg[1] = mc->mc_pg[0];
|
|
6576 |
mc->mc_ki[1] = mc->mc_ki[0];
|
|
6577 |
mc->mc_pg[0] = pp;
|
|
6578 |
mc->mc_ki[0] = 0;
|
|
6579 |
mc->mc_db->md_root = pp->mp_pgno;
|
|
6580 |
DPRINTF("root split! new root = %zu", pp->mp_pgno);
|
|
6581 |
mc->mc_db->md_depth++;
|
|
6582 |
new_root = 1;
|
|
6583 |
|
|
6584 |
/* Add left (implicit) pointer. */
|
|
6585 |
if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
|
|
6586 |
/* undo the pre-push */
|
|
6587 |
mc->mc_pg[0] = mc->mc_pg[1];
|
|
6588 |
mc->mc_ki[0] = mc->mc_ki[1];
|
|
6589 |
mc->mc_db->md_root = mp->mp_pgno;
|
|
6590 |
mc->mc_db->md_depth--;
|
|
6591 |
return rc;
|
|
6592 |
}
|
|
6593 |
mc->mc_snum = 2;
|
|
6594 |
mc->mc_top = 1;
|
|
6595 |
ptop = 0;
|
|
6596 |
} else {
|
|
6597 |
ptop = mc->mc_top-1;
|
|
6598 |
DPRINTF("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno);
|
|
6599 |
}
|
|
6600 |
|
|
6601 |
mc->mc_flags |= C_SPLITTING;
|
|
6602 |
mdb_cursor_copy(mc, &mn);
|
|
6603 |
mn.mc_pg[mn.mc_top] = rp;
|
|
6604 |
mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
|
|
6605 |
|
|
6606 |
if (nflags & MDB_APPEND) {
|
|
6607 |
mn.mc_ki[mn.mc_top] = 0;
|
|
6608 |
sepkey = *newkey;
|
|
6609 |
split_indx = newindx;
|
|
6610 |
nkeys = 0;
|
|
6611 |
goto newsep;
|
|
6612 |
}
|
|
6613 |
|
|
6614 |
nkeys = NUMKEYS(mp);
|
|
6615 |
split_indx = nkeys / 2;
|
|
6616 |
if (newindx < split_indx)
|
|
6617 |
newpos = 0;
|
|
6618 |
|
|
6619 |
if (IS_LEAF2(rp)) {
|
|
6620 |
char *split, *ins;
|
|
6621 |
int x;
|
|
6622 |
unsigned int lsize, rsize, ksize;
|
|
6623 |
/* Move half of the keys to the right sibling */
|
|
6624 |
copy = NULL;
|
|
6625 |
x = mc->mc_ki[mc->mc_top] - split_indx;
|
|
6626 |
ksize = mc->mc_db->md_pad;
|
|
6627 |
split = LEAF2KEY(mp, split_indx, ksize);
|
|
6628 |
rsize = (nkeys - split_indx) * ksize;
|
|
6629 |
lsize = (nkeys - split_indx) * sizeof(indx_t);
|
|
6630 |
mp->mp_lower -= lsize;
|
|
6631 |
rp->mp_lower += lsize;
|
|
6632 |
mp->mp_upper += rsize - lsize;
|
|
6633 |
rp->mp_upper -= rsize - lsize;
|
|
6634 |
sepkey.mv_size = ksize;
|
|
6635 |
if (newindx == split_indx) {
|
|
6636 |
sepkey.mv_data = newkey->mv_data;
|
|
6637 |
} else {
|
|
6638 |
sepkey.mv_data = split;
|
|
6639 |
}
|
|
6640 |
if (x<0) {
|
|
6641 |
ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
|
|
6642 |
memcpy(rp->mp_ptrs, split, rsize);
|
|
6643 |
sepkey.mv_data = rp->mp_ptrs;
|
|
6644 |
memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
|
|
6645 |
memcpy(ins, newkey->mv_data, ksize);
|
|
6646 |
mp->mp_lower += sizeof(indx_t);
|
|
6647 |
mp->mp_upper -= ksize - sizeof(indx_t);
|
|
6648 |
} else {
|
|
6649 |
if (x)
|
|
6650 |
memcpy(rp->mp_ptrs, split, x * ksize);
|
|
6651 |
ins = LEAF2KEY(rp, x, ksize);
|
|
6652 |
memcpy(ins, newkey->mv_data, ksize);
|
|
6653 |
memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
|
|
6654 |
rp->mp_lower += sizeof(indx_t);
|
|
6655 |
rp->mp_upper -= ksize - sizeof(indx_t);
|
|
6656 |
mc->mc_ki[mc->mc_top] = x;
|
|
6657 |
mc->mc_pg[mc->mc_top] = rp;
|
|
6658 |
}
|
|
6659 |
goto newsep;
|
|
6660 |
}
|
|
6661 |
|
|
6662 |
/* For leaf pages, check the split point based on what
|
|
6663 |
* fits where, since otherwise mdb_node_add can fail.
|
|
6664 |
*
|
|
6665 |
* This check is only needed when the data items are
|
|
6666 |
* relatively large, such that being off by one will
|
|
6667 |
* make the difference between success or failure.
|
|
6668 |
*
|
|
6669 |
* It's also relevant if a page happens to be laid out
|
|
6670 |
* such that one half of its nodes are all "small" and
|
|
6671 |
* the other half of its nodes are "large." If the new
|
|
6672 |
* item is also "large" and falls on the half with
|
|
6673 |
* "large" nodes, it also may not fit.
|
|
6674 |
*/
|
|
6675 |
if (IS_LEAF(mp)) {
|
|
6676 |
unsigned int psize, nsize;
|
|
6677 |
/* Maximum free space in an empty page */
|
|
6678 |
pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
|
|
6679 |
nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata);
|
|
6680 |
if ((nkeys < 20) || (nsize > pmax/16)) {
|
|
6681 |
if (newindx <= split_indx) {
|
|
6682 |
psize = nsize;
|
|
6683 |
newpos = 0;
|
|
6684 |
for (i=0; i<split_indx; i++) {
|
|
6685 |
node = NODEPTR(mp, i);
|
|
6686 |
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
|
6687 |
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
|
6688 |
psize += sizeof(pgno_t);
|
|
6689 |
else
|
|
6690 |
psize += NODEDSZ(node);
|
|
6691 |
psize += psize & 1;
|
|
6692 |
if (psize > pmax) {
|
|
6693 |
if (i <= newindx) {
|
|
6694 |
split_indx = newindx;
|
|
6695 |
if (i < newindx)
|
|
6696 |
newpos = 1;
|
|
6697 |
}
|
|
6698 |
else
|
|
6699 |
split_indx = i;
|
|
6700 |
break;
|
|
6701 |
}
|
|
6702 |
}
|
|
6703 |
} else {
|
|
6704 |
psize = nsize;
|
|
6705 |
for (i=nkeys-1; i>=split_indx; i--) {
|
|
6706 |
node = NODEPTR(mp, i);
|
|
6707 |
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
|
6708 |
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
|
6709 |
psize += sizeof(pgno_t);
|
|
6710 |
else
|
|
6711 |
psize += NODEDSZ(node);
|
|
6712 |
psize += psize & 1;
|
|
6713 |
if (psize > pmax) {
|
|
6714 |
if (i >= newindx) {
|
|
6715 |
split_indx = newindx;
|
|
6716 |
newpos = 0;
|
|
6717 |
} else
|
|
6718 |
split_indx = i+1;
|
|
6719 |
break;
|
|
6720 |
}
|
|
6721 |
}
|
|
6722 |
}
|
|
6723 |
}
|
|
6724 |
}
|
|
6725 |
|
|
6726 |
/* First find the separating key between the split pages.
|
|
6727 |
* The case where newindx == split_indx is ambiguous; the
|
|
6728 |
* new item could go to the new page or stay on the original
|
|
6729 |
* page. If newpos == 1 it goes to the new page.
|
|
6730 |
*/
|
|
6731 |
if (newindx == split_indx && newpos) {
|
|
6732 |
sepkey.mv_size = newkey->mv_size;
|
|
6733 |
sepkey.mv_data = newkey->mv_data;
|
|
6734 |
} else {
|
|
6735 |
node = NODEPTR(mp, split_indx);
|
|
6736 |
sepkey.mv_size = node->mn_ksize;
|
|
6737 |
sepkey.mv_data = NODEKEY(node);
|
|
6738 |
}
|
|
6739 |
|
|
6740 |
newsep:
|
|
6741 |
DPRINTF("separator is [%s]", DKEY(&sepkey));
|
|
6742 |
|
|
6743 |
/* Copy separator key to the parent.
|
|
6744 |
*/
|
|
6745 |
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) {
|
|
6746 |
mn.mc_snum--;
|
|
6747 |
mn.mc_top--;
|
|
6748 |
did_split = 1;
|
|
6749 |
rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
|
|
6750 |
|
|
6751 |
/* root split? */
|
|
6752 |
if (mn.mc_snum == mc->mc_snum) {
|
|
6753 |
mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
|
|
6754 |
mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
|
|
6755 |
mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
|
|
6756 |
mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
|
|
6757 |
mc->mc_snum++;
|
|
6758 |
mc->mc_top++;
|
|
6759 |
ptop++;
|
|
6760 |
}
|
|
6761 |
/* Right page might now have changed parent.
|
|
6762 |
* Check if left page also changed parent.
|
|
6763 |
*/
|
|
6764 |
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
|
|
6765 |
mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
|
|
6766 |
for (i=0; i<ptop; i++) {
|
|
6767 |
mc->mc_pg[i] = mn.mc_pg[i];
|
|
6768 |
mc->mc_ki[i] = mn.mc_ki[i];
|
|
6769 |
}
|
|
6770 |
mc->mc_pg[ptop] = mn.mc_pg[ptop];
|
|
6771 |
mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
|
|
6772 |
}
|
|
6773 |
} else {
|
|
6774 |
mn.mc_top--;
|
|
6775 |
rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
|
|
6776 |
mn.mc_top++;
|
|
6777 |
}
|
|
6778 |
mc->mc_flags ^= C_SPLITTING;
|
|
6779 |
if (rc != MDB_SUCCESS) {
|
|
6780 |
return rc;
|
|
6781 |
}
|
|
6782 |
if (nflags & MDB_APPEND) {
|
|
6783 |
mc->mc_pg[mc->mc_top] = rp;
|
|
6784 |
mc->mc_ki[mc->mc_top] = 0;
|
|
6785 |
rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
|
|
6786 |
if (rc)
|
|
6787 |
return rc;
|
|
6788 |
for (i=0; i<mc->mc_top; i++)
|
|
6789 |
mc->mc_ki[i] = mn.mc_ki[i];
|
|
6790 |
goto done;
|
|
6791 |
}
|
|
6792 |
if (IS_LEAF2(rp)) {
|
|
6793 |
goto done;
|
|
6794 |
}
|
|
6795 |
|
|
6796 |
/* Move half of the keys to the right sibling. */
|
|
6797 |
|
|
6798 |
/* grab a page to hold a temporary copy */
|
|
6799 |
copy = mdb_page_malloc(mc);
|
|
6800 |
if (copy == NULL)
|
|
6801 |
return ENOMEM;
|
|
6802 |
|
|
6803 |
copy->mp_pgno = mp->mp_pgno;
|
|
6804 |
copy->mp_flags = mp->mp_flags;
|
|
6805 |
copy->mp_lower = PAGEHDRSZ;
|
|
6806 |
copy->mp_upper = mc->mc_txn->mt_env->me_psize;
|
|
6807 |
mc->mc_pg[mc->mc_top] = copy;
|
|
6808 |
for (i = j = 0; i <= nkeys; j++) {
|
|
6809 |
if (i == split_indx) {
|
|
6810 |
/* Insert in right sibling. */
|
|
6811 |
/* Reset insert index for right sibling. */
|
|
6812 |
if (i != newindx || (newpos ^ ins_new)) {
|
|
6813 |
j = 0;
|
|
6814 |
mc->mc_pg[mc->mc_top] = rp;
|
|
6815 |
}
|
|
6816 |
}
|
|
6817 |
|
|
6818 |
if (i == newindx && !ins_new) {
|
|
6819 |
/* Insert the original entry that caused the split. */
|
|
6820 |
rkey.mv_data = newkey->mv_data;
|
|
6821 |
rkey.mv_size = newkey->mv_size;
|
|
6822 |
if (IS_LEAF(mp)) {
|
|
6823 |
rdata = newdata;
|
|
6824 |
} else
|
|
6825 |
pgno = newpgno;
|
|
6826 |
flags = nflags;
|
|
6827 |
|
|
6828 |
ins_new = 1;
|
|
6829 |
|
|
6830 |
/* Update index for the new key. */
|
|
6831 |
mc->mc_ki[mc->mc_top] = j;
|
|
6832 |
} else if (i == nkeys) {
|
|
6833 |
break;
|
|
6834 |
} else {
|
|
6835 |
node = NODEPTR(mp, i);
|
|
6836 |
rkey.mv_data = NODEKEY(node);
|
|
6837 |
rkey.mv_size = node->mn_ksize;
|
|
6838 |
if (IS_LEAF(mp)) {
|
|
6839 |
xdata.mv_data = NODEDATA(node);
|
|
6840 |
xdata.mv_size = NODEDSZ(node);
|
|
6841 |
rdata = &xdata;
|
|
6842 |
} else
|
|
6843 |
pgno = NODEPGNO(node);
|
|
6844 |
flags = node->mn_flags;
|
|
6845 |
|
|
6846 |
i++;
|
|
6847 |
}
|
|
6848 |
|
|
6849 |
if (!IS_LEAF(mp) && j == 0) {
|
|
6850 |
/* First branch index doesn't need key data. */
|
|
6851 |
rkey.mv_size = 0;
|
|
6852 |
}
|
|
6853 |
|
|
6854 |
rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
|
|
6855 |
if (rc) break;
|
|
6856 |
}
|
|
6857 |
|
|
6858 |
nkeys = NUMKEYS(copy);
|
|
6859 |
for (i=0; i<nkeys; i++)
|
|
6860 |
mp->mp_ptrs[i] = copy->mp_ptrs[i];
|
|
6861 |
mp->mp_lower = copy->mp_lower;
|
|
6862 |
mp->mp_upper = copy->mp_upper;
|
|
6863 |
memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
|
|
6864 |
mc->mc_txn->mt_env->me_psize - copy->mp_upper);
|
|
6865 |
|
|
6866 |
/* reset back to original page */
|
|
6867 |
if (newindx < split_indx || (!newpos && newindx == split_indx)) {
|
|
6868 |
mc->mc_pg[mc->mc_top] = mp;
|
|
6869 |
if (nflags & MDB_RESERVE) {
|
|
6870 |
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
|
|
6871 |
if (!(node->mn_flags & F_BIGDATA))
|
|
6872 |
newdata->mv_data = NODEDATA(node);
|
|
6873 |
}
|
|
6874 |
} else {
|
|
6875 |
mc->mc_ki[ptop]++;
|
|
6876 |
}
|
|
6877 |
|
|
6878 |
/* return tmp page to freelist */
|
|
6879 |
mdb_page_free(mc->mc_txn->mt_env, copy);
|
|
6880 |
done:
|
|
6881 |
{
|
|
6882 |
/* Adjust other cursors pointing to mp */
|
|
6883 |
MDB_cursor *m2, *m3;
|
|
6884 |
MDB_dbi dbi = mc->mc_dbi;
|
|
6885 |
int fixup = NUMKEYS(mp);
|
|
6886 |
|
|
6887 |
if (mc->mc_flags & C_SUB)
|
|
6888 |
dbi--;
|
|
6889 |
|
|
6890 |
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
|
|
6891 |
if (m2 == mc) continue;
|
|
6892 |
if (mc->mc_flags & C_SUB)
|
|
6893 |
m3 = &m2->mc_xcursor->mx_cursor;
|
|
6894 |
else
|
|
6895 |
m3 = m2;
|
|
6896 |
if (!(m3->mc_flags & C_INITIALIZED))
|
|
6897 |
continue;
|
|
6898 |
if (m3->mc_flags & C_SPLITTING)
|
|
6899 |
continue;
|
|
6900 |
if (new_root) {
|
|
6901 |
int k;
|
|
6902 |
/* root split */
|
|
6903 |
for (k=m3->mc_top; k>=0; k--) {
|
|
6904 |
m3->mc_ki[k+1] = m3->mc_ki[k];
|
|
6905 |
m3->mc_pg[k+1] = m3->mc_pg[k];
|
|
6906 |
}
|
|
6907 |
if (m3->mc_ki[0] >= split_indx) {
|
|
6908 |
m3->mc_ki[0] = 1;
|
|
6909 |
} else {
|
|
6910 |
m3->mc_ki[0] = 0;
|
|
6911 |
}
|
|
6912 |
m3->mc_pg[0] = mc->mc_pg[0];
|
|
6913 |
m3->mc_snum++;
|
|
6914 |
m3->mc_top++;
|
|
6915 |
}
|
|
6916 |
if (m3->mc_pg[mc->mc_top] == mp) {
|
|
6917 |
if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
|
|
6918 |
m3->mc_ki[mc->mc_top]++;
|
|
6919 |
if (m3->mc_ki[mc->mc_top] >= fixup) {
|
|
6920 |
m3->mc_pg[mc->mc_top] = rp;
|
|
6921 |
m3->mc_ki[mc->mc_top] -= fixup;
|
|
6922 |
m3->mc_ki[ptop] = mn.mc_ki[ptop];
|
|
6923 |
}
|
|
6924 |
} else if (!did_split && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
|
|
6925 |
m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
|
|
6926 |
m3->mc_ki[ptop]++;
|
|
6927 |
}
|
|
6928 |
}
|
|
6929 |
}
|
|
6930 |
return rc;
|
|
6931 |
}
|
|
6932 |
|
|
6933 |
int
|
|
6934 |
mdb_put(MDB_txn *txn, MDB_dbi dbi,
|
|
6935 |
MDB_val *key, MDB_val *data, unsigned int flags)
|
|
6936 |
{
|
|
6937 |
MDB_cursor mc;
|
|
6938 |
MDB_xcursor mx;
|
|
6939 |
|
|
6940 |
assert(key != NULL);
|
|
6941 |
assert(data != NULL);
|
|
6942 |
|
|
6943 |
if (txn == NULL || !dbi || dbi >= txn->mt_numdbs || !(txn->mt_dbflags[dbi] & DB_VALID))
|
|
6944 |
return EINVAL;
|
|
6945 |
|
|
6946 |
if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
|
|
6947 |
return EACCES;
|
|
6948 |
}
|
|
6949 |
|
|
6950 |
if (key->mv_size == 0 || key->mv_size > MDB_MAXKEYSIZE) {
|
|
6951 |
return EINVAL;
|
|
6952 |
}
|
|
6953 |
|
|
6954 |
if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND|MDB_APPENDDUP)) != flags)
|
|
6955 |
return EINVAL;
|
|
6956 |
|
|
6957 |
mdb_cursor_init(&mc, txn, dbi, &mx);
|
|
6958 |
return mdb_cursor_put(&mc, key, data, flags);
|
|
6959 |
}
|
|
6960 |
|
|
6961 |
int
|
|
6962 |
mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
|
|
6963 |
{
|
|
6964 |
if ((flag & CHANGEABLE) != flag)
|
|
6965 |
return EINVAL;
|
|
6966 |
if (onoff)
|
|
6967 |
env->me_flags |= flag;
|
|
6968 |
else
|
|
6969 |
env->me_flags &= ~flag;
|
|
6970 |
return MDB_SUCCESS;
|
|
6971 |
}
|
|
6972 |
|
|
6973 |
int
|
|
6974 |
mdb_env_get_flags(MDB_env *env, unsigned int *arg)
|
|
6975 |
{
|
|
6976 |
if (!env || !arg)
|
|
6977 |
return EINVAL;
|
|
6978 |
|
|
6979 |
*arg = env->me_flags;
|
|
6980 |
return MDB_SUCCESS;
|
|
6981 |
}
|
|
6982 |
|
|
6983 |
int
|
|
6984 |
mdb_env_get_path(MDB_env *env, const char **arg)
|
|
6985 |
{
|
|
6986 |
if (!env || !arg)
|
|
6987 |
return EINVAL;
|
|
6988 |
|
|
6989 |
*arg = env->me_path;
|
|
6990 |
return MDB_SUCCESS;
|
|
6991 |
}
|
|
6992 |
|
|
6993 |
/** Common code for #mdb_stat() and #mdb_env_stat().
|
|
6994 |
* @param[in] env the environment to operate in.
|
|
6995 |
* @param[in] db the #MDB_db record containing the stats to return.
|
|
6996 |
* @param[out] arg the address of an #MDB_stat structure to receive the stats.
|
|
6997 |
* @return 0, this function always succeeds.
|
|
6998 |
*/
|
|
6999 |
static int
|
|
7000 |
mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
|
|
7001 |
{
|
|
7002 |
arg->ms_psize = env->me_psize;
|
|
7003 |
arg->ms_depth = db->md_depth;
|
|
7004 |
arg->ms_branch_pages = db->md_branch_pages;
|
|
7005 |
arg->ms_leaf_pages = db->md_leaf_pages;
|
|
7006 |
arg->ms_overflow_pages = db->md_overflow_pages;
|
|
7007 |
arg->ms_entries = db->md_entries;
|
|
7008 |
|
|
7009 |
return MDB_SUCCESS;
|
|
7010 |
}
|
|
7011 |
int
|
|
7012 |
mdb_env_stat(MDB_env *env, MDB_stat *arg)
|
|
7013 |
{
|
|
7014 |
int toggle;
|
|
7015 |
|
|
7016 |
if (env == NULL || arg == NULL)
|
|
7017 |
return EINVAL;
|
|
7018 |
|
|
7019 |
toggle = mdb_env_pick_meta(env);
|
|
7020 |
|
|
7021 |
return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
|
|
7022 |
}
|
|
7023 |
|
|
7024 |
int
|
|
7025 |
mdb_env_info(MDB_env *env, MDB_envinfo *arg)
|
|
7026 |
{
|
|
7027 |
int toggle;
|
|
7028 |
|
|
7029 |
if (env == NULL || arg == NULL)
|
|
7030 |
return EINVAL;
|
|
7031 |
|
|
7032 |
toggle = mdb_env_pick_meta(env);
|
|
7033 |
arg->me_mapaddr = (env->me_flags & MDB_FIXEDMAP) ? env->me_map : 0;
|
|
7034 |
arg->me_mapsize = env->me_mapsize;
|
|
7035 |
arg->me_maxreaders = env->me_maxreaders;
|
|
7036 |
arg->me_numreaders = env->me_numreaders;
|
|
7037 |
arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg;
|
|
7038 |
arg->me_last_txnid = env->me_metas[toggle]->mm_txnid;
|
|
7039 |
return MDB_SUCCESS;
|
|
7040 |
}
|
|
7041 |
|
|
7042 |
/** Set the default comparison functions for a database.
|
|
7043 |
* Called immediately after a database is opened to set the defaults.
|
|
7044 |
* The user can then override them with #mdb_set_compare() or
|
|
7045 |
* #mdb_set_dupsort().
|
|
7046 |
* @param[in] txn A transaction handle returned by #mdb_txn_begin()
|
|
7047 |
* @param[in] dbi A database handle returned by #mdb_dbi_open()
|
|
7048 |
*/
|
|
7049 |
static void
|
|
7050 |
mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
|
|
7051 |
{
|
|
7052 |
uint16_t f = txn->mt_dbs[dbi].md_flags;
|
|
7053 |
|
|
7054 |
txn->mt_dbxs[dbi].md_cmp =
|
|
7055 |
(f & MDB_REVERSEKEY) ? mdb_cmp_memnr :
|
|
7056 |
(f & MDB_INTEGERKEY) ? mdb_cmp_cint : mdb_cmp_memn;
|
|
7057 |
|
|
7058 |
txn->mt_dbxs[dbi].md_dcmp =
|
|
7059 |
!(f & MDB_DUPSORT) ? 0 :
|
|
7060 |
((f & MDB_INTEGERDUP)
|
|
7061 |
? ((f & MDB_DUPFIXED) ? mdb_cmp_int : mdb_cmp_cint)
|
|
7062 |
: ((f & MDB_REVERSEDUP) ? mdb_cmp_memnr : mdb_cmp_memn));
|
|
7063 |
}
|
|
7064 |
|
|
7065 |
/** Open (and optionally create) a database and return its handle.
 *
 * With a NULL @p name the main DB handle is returned, after merging any
 * persistent flags into its record. Otherwise the table of already-open
 * handles is searched first; on a miss the name is looked up as a key in
 * the main DB, created there if #MDB_CREATE is given, and then registered
 * in a free slot of the handle table.
 * @param[in] txn   the transaction to operate in.
 * @param[in] name  the database name, or NULL for the main DB.
 * @param[in] flags open flags; must be a subset of VALID_FLAGS.
 * @param[out] dbi  receives the database handle on success.
 * @return #MDB_SUCCESS on success; EINVAL for unsupported flags or when
 * @p name exists in the main DB but is not a sub-database; #MDB_DBS_FULL
 * when no handle slot is available; ENOMEM if the name cannot be copied;
 * otherwise the error from the lookup or the create.
 */
int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
{
	MDB_val key, data;
	MDB_dbi i;
	MDB_cursor mc;
	/* Function scope on purpose: data.mv_data may still point at this
	 * record when it is copied into the handle table further down.
	 */
	MDB_db dummy;
	int rc, dbflag, exact;
	unsigned int unused = 0;
	size_t len;
	char *namedup;

	if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
		mdb_default_cmp(txn, FREE_DBI);
	}

	if ((flags & VALID_FLAGS) != flags)
		return EINVAL;

	/* main DB? */
	if (!name) {
		*dbi = MAIN_DBI;
		if (flags & PERSISTENT_FLAGS) {
			uint16_t f2 = flags & PERSISTENT_FLAGS;
			/* make sure flag changes get committed */
			if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
				txn->mt_dbs[MAIN_DBI].md_flags |= f2;
				txn->mt_flags |= MDB_TXN_DIRTY;
			}
		}
		mdb_default_cmp(txn, MAIN_DBI);
		return MDB_SUCCESS;
	}

	if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
		mdb_default_cmp(txn, MAIN_DBI);
	}

	/* Is the DB already open? */
	len = strlen(name);
	for (i=2; i<txn->mt_numdbs; i++) {
		if (!txn->mt_dbxs[i].md_name.mv_size) {
			/* Remember this free slot */
			if (!unused) unused = i;
			continue;
		}
		if (len == txn->mt_dbxs[i].md_name.mv_size &&
			!strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
			*dbi = i;
			return MDB_SUCCESS;
		}
	}

	/* If no free slot and max hit, fail */
	if (!unused && txn->mt_numdbs >= txn->mt_env->me_maxdbs)
		return MDB_DBS_FULL;

	/* Copy the name before touching the tree, so that running out of
	 * memory cannot leave a freshly created DB without a handle.
	 */
	namedup = strdup(name);
	if (namedup == NULL)
		return ENOMEM;

	/* Find the DB info */
	dbflag = DB_NEW|DB_VALID;
	exact = 0;
	key.mv_size = len;
	key.mv_data = (void *)name;
	mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
	rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
	if (rc == MDB_SUCCESS) {
		/* make sure this is actually a DB */
		MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
		if (!(node->mn_flags & F_SUBDATA)) {
			free(namedup);
			return EINVAL;
		}
	} else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
		/* Create if requested */
		data.mv_size = sizeof(MDB_db);
		data.mv_data = &dummy;
		memset(&dummy, 0, sizeof(dummy));
		dummy.md_root = P_INVALID;
		dummy.md_flags = flags & PERSISTENT_FLAGS;
		rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
		dbflag |= DB_DIRTY;
	}

	/* OK, got info, add to table */
	if (rc == MDB_SUCCESS) {
		unsigned int slot = unused ? unused : txn->mt_numdbs;
		txn->mt_dbxs[slot].md_name.mv_data = namedup;
		txn->mt_dbxs[slot].md_name.mv_size = len;
		txn->mt_dbxs[slot].md_rel = NULL;
		txn->mt_dbflags[slot] = dbflag;
		memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(MDB_db));
		*dbi = slot;
		txn->mt_env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags;
		mdb_default_cmp(txn, slot);
		if (!unused) {
			txn->mt_numdbs++;
		}
	} else {
		/* Lookup or create failed: the copied name is not needed */
		free(namedup);
	}

	return rc;
}
|
|
7161 |
|
|
7162 |
/** Report statistics for one database of a transaction.
 * @param[in]  txn the transaction to operate in.
 * @param[in]  dbi the database handle; must be below the transaction's
 *                 handle count.
 * @param[out] arg the #MDB_stat structure that receives the values.
 * @return EINVAL if an argument is invalid, otherwise the result of
 * #mdb_stat0().
 */
int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
{
	if (!txn || !arg)
		return EINVAL;
	if (dbi >= txn->mt_numdbs)
		return EINVAL;

	return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
}
|
|
7169 |
|
|
7170 |
/** Release the name stored for a database handle, leaving its slot
 * with an empty name.
 * Handles up to and including #MAIN_DBI, and handles at or beyond the
 * environment's maximum, are ignored.
 * @param[in] env the environment that owns the handle table.
 * @param[in] dbi the database handle to close.
 */
void mdb_dbi_close(MDB_env *env, MDB_dbi dbi)
{
	char *oldname;

	if (dbi > MAIN_DBI && dbi < env->me_maxdbs) {
		/* Clear the slot first and free the saved pointer last */
		oldname = env->me_dbxs[dbi].md_name.mv_data;
		env->me_dbxs[dbi].md_name.mv_data = NULL;
		env->me_dbxs[dbi].md_name.mv_size = 0;
		free(oldname);
	}
}
|
|
7180 |
|
|
7181 |
/** Add all the DB's pages to the free list.
|
|
7182 |
* @param[in] mc Cursor on the DB to free.
|
|
7183 |
* @param[in] subs non-Zero to check for sub-DBs in this DB.
|
|
7184 |
* @return 0 on success, non-zero on failure.
|
|
7185 |
*/
|
|
7186 |
static int
|
|
7187 |
mdb_drop0(MDB_cursor *mc, int subs)
|
|
7188 |
{
|
|
7189 |
int rc;
|
|
7190 |
|
|
7191 |
rc = mdb_page_search(mc, NULL, 0);
|
|
7192 |
if (rc == MDB_SUCCESS) {
|
|
7193 |
MDB_node *ni;
|
|
7194 |
MDB_cursor mx;
|
|
7195 |
unsigned int i;
|
|
7196 |
|
|
7197 |
/* LEAF2 pages have no nodes, cannot have sub-DBs */
|
|
7198 |
if (IS_LEAF2(mc->mc_pg[mc->mc_top]))
|
|
7199 |
mdb_cursor_pop(mc);
|
|
7200 |
|
|
7201 |
mdb_cursor_copy(mc, &mx);
|
|
7202 |
while (mc->mc_snum > 0) {
|
|
7203 |
if (IS_LEAF(mc->mc_pg[mc->mc_top])) {
|
|
7204 |
for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
|
|
7205 |
ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
|
|
7206 |
if (ni->mn_flags & F_BIGDATA) {
|
|
7207 |
int j, ovpages = OVPAGES(NODEDSZ(ni), mc->mc_txn->mt_env->me_psize);
|
|
7208 |
pgno_t pg;
|
|
7209 |
memcpy(&pg, NODEDATA(ni), sizeof(pg));
|
|
7210 |
for (j=0; j<ovpages; j++) {
|
|
7211 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
|
|
7212 |
pg++;
|
|
7213 |
}
|
|
7214 |
} else if (subs && (ni->mn_flags & F_SUBDATA)) {
|
|
7215 |
mdb_xcursor_init1(mc, ni);
|
|
7216 |
rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
|
|
7217 |
if (rc)
|
|
7218 |
return rc;
|
|
7219 |
}
|
|
7220 |
}
|
|
7221 |
} else {
|
|
7222 |
for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
|
|
7223 |
pgno_t pg;
|
|
7224 |
ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
|
|
7225 |
pg = NODEPGNO(ni);
|
|
7226 |
/* free it */
|
|
7227 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
|
|
7228 |
}
|
|
7229 |
}
|
|
7230 |
if (!mc->mc_top)
|
|
7231 |
break;
|
|
7232 |
mc->mc_ki[mc->mc_top] = i;
|
|
7233 |
rc = mdb_cursor_sibling(mc, 1);
|
|
7234 |
if (rc) {
|
|
7235 |
/* no more siblings, go back to beginning
|
|
7236 |
* of previous level.
|
|
7237 |
*/
|
|
7238 |
mdb_cursor_pop(mc);
|
|
7239 |
mc->mc_ki[0] = 0;
|
|
7240 |
for (i=1; i<mc->mc_snum; i++) {
|
|
7241 |
mc->mc_ki[i] = 0;
|
|
7242 |
mc->mc_pg[i] = mx.mc_pg[i];
|
|
7243 |
}
|
|
7244 |
}
|
|
7245 |
}
|
|
7246 |
/* free it */
|
|
7247 |
mdb_midl_append(&mc->mc_txn->mt_free_pgs,
|
|
7248 |
mc->mc_db->md_root);
|
|
7249 |
}
|
|
7250 |
return 0;
|
|
7251 |
}
|
|
7252 |
|
|
7253 |
/** Empty a database, or delete it and close its handle.
 * @param[in] txn the transaction to operate in.
 * @param[in] dbi the database handle; handle 0 is rejected.
 * @param[in] del 0 to empty the database, 1 to also delete it from the
 * main DB and close the handle. The main DB itself is only ever emptied.
 * @return EINVAL if an argument is invalid, EACCES if the transaction is
 * read-only, otherwise 0 or the error from the failing step.
 */
int mdb_drop(MDB_txn *txn, MDB_dbi dbi, int del)
{
	MDB_cursor *cursor;
	int rc;

	if (!txn || !dbi || dbi >= txn->mt_numdbs)
		return EINVAL;
	if ((unsigned)del > 1)
		return EINVAL;
	if (!(txn->mt_dbflags[dbi] & DB_VALID))
		return EINVAL;

	if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
		return EACCES;

	rc = mdb_cursor_open(txn, dbi, &cursor);
	if (rc)
		return rc;

	rc = mdb_drop0(cursor, cursor->mc_db->md_flags & MDB_DUPSORT);
	if (!rc) {
		/* Can't delete the main DB */
		if (del && dbi > MAIN_DBI) {
			rc = mdb_del(txn, MAIN_DBI, &cursor->mc_dbx->md_name, NULL);
			if (!rc) {
				txn->mt_dbflags[dbi] = DB_STALE;
				mdb_dbi_close(txn->mt_env, dbi);
			}
		} else {
			/* reset the DB record, mark it dirty */
			MDB_db *rec = &txn->mt_dbs[dbi];

			txn->mt_dbflags[dbi] |= DB_DIRTY;
			rec->md_root = P_INVALID;
			rec->md_depth = 0;
			rec->md_entries = 0;
			rec->md_branch_pages = 0;
			rec->md_leaf_pages = 0;
			rec->md_overflow_pages = 0;

			txn->mt_flags |= MDB_TXN_DIRTY;
		}
	}

	mdb_cursor_close(cursor);
	return rc;
}
|
|
7295 |
|
|
7296 |
/** Replace the key comparison function of a database.
 * @param[in] txn the transaction to operate in.
 * @param[in] dbi the database handle; handle 0 is rejected.
 * @param[in] cmp the comparison function to install.
 * @return EINVAL if the transaction or handle is invalid, otherwise
 * #MDB_SUCCESS.
 */
int mdb_set_compare(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
	if (txn != NULL && dbi && dbi < txn->mt_numdbs && (txn->mt_dbflags[dbi] & DB_VALID)) {
		txn->mt_dbxs[dbi].md_cmp = cmp;
		return MDB_SUCCESS;
	}
	return EINVAL;
}
|
|
7304 |
|
|
7305 |
/** Replace the data (duplicate) comparison function of a database.
 * @param[in] txn the transaction to operate in.
 * @param[in] dbi the database handle; handle 0 is rejected.
 * @param[in] cmp the comparison function to install.
 * @return EINVAL if the transaction or handle is invalid, otherwise
 * #MDB_SUCCESS.
 */
int mdb_set_dupsort(MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
{
	if (txn != NULL && dbi && dbi < txn->mt_numdbs && (txn->mt_dbflags[dbi] & DB_VALID)) {
		txn->mt_dbxs[dbi].md_dcmp = cmp;
		return MDB_SUCCESS;
	}
	return EINVAL;
}
|
|
7313 |
|
|
7314 |
/** Set the relocation function of a database.
 * @param[in] txn the transaction to operate in.
 * @param[in] dbi the database handle; handle 0 is rejected.
 * @param[in] rel the relocation function to install.
 * @return EINVAL if the transaction or handle is invalid, otherwise
 * #MDB_SUCCESS.
 */
int mdb_set_relfunc(MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
{
	if (!txn)
		return EINVAL;
	if (!dbi || dbi >= txn->mt_numdbs)
		return EINVAL;
	if (!(txn->mt_dbflags[dbi] & DB_VALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_rel = rel;
	return MDB_SUCCESS;
}
|
|
7322 |
|
|
7323 |
/** Set the context pointer stored alongside a database's relocation
 * function.
 * @param[in] txn the transaction to operate in.
 * @param[in] dbi the database handle; handle 0 is rejected.
 * @param[in] ctx the context pointer to store.
 * @return EINVAL if the transaction or handle is invalid, otherwise
 * #MDB_SUCCESS.
 */
int mdb_set_relctx(MDB_txn *txn, MDB_dbi dbi, void *ctx)
{
	if (!txn)
		return EINVAL;
	if (!dbi || dbi >= txn->mt_numdbs)
		return EINVAL;
	if (!(txn->mt_dbflags[dbi] & DB_VALID))
		return EINVAL;

	txn->mt_dbxs[dbi].md_relctx = ctx;
	return MDB_SUCCESS;
}
|
|
7331 |
|
|
7332 |
/** @} */
|