Back to index

openldap  2.4.31
mdb.c
Go to the documentation of this file.
00001 
00007 /*
00008  * Copyright 2011-2012 Howard Chu, Symas Corp.
00009  * All rights reserved.
00010  *
00011  * Redistribution and use in source and binary forms, with or without
00012  * modification, are permitted only as authorized by the OpenLDAP
00013  * Public License.
00014  *
00015  * A copy of this license is available in the file LICENSE in the
00016  * top-level directory of the distribution or, alternatively, at
00017  * <http://www.OpenLDAP.org/license.html>.
00018  *
00019  * This code is derived from btree.c written by Martin Hedenfalk.
00020  *
00021  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
00022  *
00023  * Permission to use, copy, modify, and distribute this software for any
00024  * purpose with or without fee is hereby granted, provided that the above
00025  * copyright notice and this permission notice appear in all copies.
00026  *
00027  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
00028  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
00029  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
00030  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
00031  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
00032  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
00033  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
00034  */
00035 #include <sys/types.h>
00036 #include <sys/stat.h>
00037 #include <sys/param.h>
00038 #ifdef _WIN32
00039 #include <windows.h>
00040 #else
00041 #include <sys/uio.h>
00042 #include <sys/mman.h>
00043 #ifdef HAVE_SYS_FILE_H
00044 #include <sys/file.h>
00045 #endif
00046 #include <fcntl.h>
00047 #endif
00048 
00049 #include <assert.h>
00050 #include <errno.h>
00051 #include <limits.h>
00052 #include <stddef.h>
00053 #include <inttypes.h>
00054 #include <stdio.h>
00055 #include <stdlib.h>
00056 #include <string.h>
00057 #include <time.h>
00058 #include <unistd.h>
00059 
00060 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
00061 #include <resolv.h>  /* defines BYTE_ORDER on HPUX and Solaris */
00062 #endif
00063 
00064 #ifndef _WIN32
00065 #include <pthread.h>
00066 #ifdef __APPLE__
00067 #include <semaphore.h>
00068 #endif
00069 #endif
00070 
00071 #ifdef USE_VALGRIND
00072 #include <valgrind/memcheck.h>
00073 #define VGMEMP_CREATE(h,r,z)    VALGRIND_CREATE_MEMPOOL(h,r,z)
00074 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
00075 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
00076 #define VGMEMP_DESTROY(h)   VALGRIND_DESTROY_MEMPOOL(h)
00077 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
00078 #else
00079 #define VGMEMP_CREATE(h,r,z)
00080 #define VGMEMP_ALLOC(h,a,s)
00081 #define VGMEMP_FREE(h,a)
00082 #define VGMEMP_DESTROY(h)
00083 #define VGMEMP_DEFINED(a,s)
00084 #endif
00085 
00086 #ifndef BYTE_ORDER
00087 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
00088 /* Solaris just defines one or the other */
00089 #  define LITTLE_ENDIAN     1234
00090 #  define BIG_ENDIAN 4321
00091 #  ifdef _LITTLE_ENDIAN
00092 #   define BYTE_ORDER  LITTLE_ENDIAN
00093 #  else
00094 #   define BYTE_ORDER  BIG_ENDIAN
00095 #  endif
00096 # else
00097 #  define BYTE_ORDER   __BYTE_ORDER
00098 # endif
00099 #endif
00100 
00101 #ifndef LITTLE_ENDIAN
00102 #define LITTLE_ENDIAN       __LITTLE_ENDIAN
00103 #endif
00104 #ifndef BIG_ENDIAN
00105 #define BIG_ENDIAN   __BIG_ENDIAN
00106 #endif
00107 
00108 #if defined(__i386) || defined(__x86_64)
00109 #define MISALIGNED_OK       1
00110 #endif
00111 
00112 #include "mdb.h"
00113 #include "midl.h"
00114 
00115 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
00116 # error "Unknown or unsupported endianness (BYTE_ORDER)"
00117 #elif (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF
00118 # error "Two's complement, reasonably sized integer types, please"
00119 #endif
00120 
00131 #ifdef _WIN32
00132 #define pthread_t    DWORD
00133 #define pthread_mutex_t     HANDLE
00134 #define pthread_key_t       DWORD
00135 #define pthread_self()      GetCurrentThreadId()
00136 #define pthread_key_create(x,y)    (*(x) = TlsAlloc())
00137 #define pthread_key_delete(x)      TlsFree(x)
00138 #define pthread_getspecific(x)     TlsGetValue(x)
00139 #define pthread_setspecific(x,y)   TlsSetValue(x,y)
00140 #define pthread_mutex_unlock(x)    ReleaseMutex(x)
00141 #define pthread_mutex_lock(x)      WaitForSingleObject(x, INFINITE)
00142 #define LOCK_MUTEX_R(env)   pthread_mutex_lock((env)->me_rmutex)
00143 #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock((env)->me_rmutex)
00144 #define LOCK_MUTEX_W(env)   pthread_mutex_lock((env)->me_wmutex)
00145 #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock((env)->me_wmutex)
00146 #define getpid()     GetCurrentProcessId()
00147 #define       fdatasync(fd) (!FlushFileBuffers(fd))
00148 #define       ErrCode()     GetLastError()
00149 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
00150 #define       close(fd)     CloseHandle(fd)
00151 #define       munmap(ptr,len)      UnmapViewOfFile(ptr)
00152 #else
00153 #ifdef __APPLE__
00154 #define LOCK_MUTEX_R(env)   sem_wait((env)->me_rmutex)
00155 #define UNLOCK_MUTEX_R(env) sem_post((env)->me_rmutex)
00156 #define LOCK_MUTEX_W(env)   sem_wait((env)->me_wmutex)
00157 #define UNLOCK_MUTEX_W(env) sem_post((env)->me_wmutex)
00158 #define fdatasync(fd)       fsync(fd)
00159 #else
00160 #ifdef ANDROID
00161 #define fdatasync(fd)       fsync(fd)
00162 #endif
00163 
00165 #define LOCK_MUTEX_R(env)   pthread_mutex_lock(&(env)->me_txns->mti_mutex)
00166 
00168 #define UNLOCK_MUTEX_R(env) pthread_mutex_unlock(&(env)->me_txns->mti_mutex)
00169 
00174 #define LOCK_MUTEX_W(env)   pthread_mutex_lock(&(env)->me_txns->mti_wmutex)
00175 
00177 #define UNLOCK_MUTEX_W(env) pthread_mutex_unlock(&(env)->me_txns->mti_wmutex)
00178 #endif /* __APPLE__ */
00179 
00182 #define       ErrCode()     errno
00183 
00188 #define       HANDLE int
00189 
00194 #define INVALID_HANDLE_VALUE       (-1)
00195 
00200 #define       GET_PAGESIZE(x)      ((x) = sysconf(_SC_PAGE_SIZE))
00201 #endif
00202 
00203 #if defined(_WIN32) || defined(__APPLE__)
00204 #define MNAME_LEN    32
00205 #else
00206 #define MNAME_LEN    (sizeof(pthread_mutex_t))
00207 #endif
00208 
00211 #ifndef _WIN32
00212 
00221 #ifndef MDB_DSYNC
00222 # define MDB_DSYNC   O_DSYNC
00223 #endif
00224 #endif
00225 
00229 #ifndef MDB_FDATASYNC
00230 # define MDB_FDATASYNC      fdatasync
00231 #endif
00232 
00241 typedef ID    pgno_t;
00242 
00246 typedef ID    txnid_t;
00247 
00251 #ifndef MDB_DEBUG
00252 
00256 #define MDB_DEBUG 0
00257 #endif
00258 
00259 #if !(__STDC_VERSION__ >= 199901L || defined(__GNUC__))
00260 # define DPRINTF     (void) /* Vararg macros may be unsupported */
00261 #elif MDB_DEBUG
00262 static int mdb_debug;
00263 static int mdb_debug_start;
00264 
00266 # define DPRINTF(fmt, ...)   \
00267        if (mdb_debug) fprintf(stderr, "%s:%d " fmt "\n", __func__, __LINE__, __VA_ARGS__)
00268 #else
00269 # define DPRINTF(fmt, ...)  ((void) 0)
00270 #endif
00271 
00274 #define DPUTS(arg)   DPRINTF("%s", arg)
00275 
00289 #define MDB_PAGESIZE  4096
00290 
00303 #define MDB_MINKEYS   2
00304 
00309 #define MDB_MAGIC     0xBEEFC0DE
00310 
00312 #define MDB_VERSION   1
00313 
00319 #define MAXKEYSIZE    511
00320 
00321 #if MDB_DEBUG
00322 
00326 #define DKBUF char kbuf[(MAXKEYSIZE*2+1)]
00327 
00331 #define       DKEY(x)       mdb_dkey(x, kbuf)
00332 #else
00333 #define       DKBUF  typedef int dummy_kbuf      /* so we can put ';' after */
00334 #define DKEY(x)      0
00335 #endif
00336 
00343 #ifndef       LAZY_LOCKS
00344 
00345 #define       LAZY_LOCKS    1
00346 #endif
00347 #if    LAZY_LOCKS
00348 
00349 #define       LAZY_MUTEX_LOCK(x)
00350 
00351 #define       LAZY_MUTEX_UNLOCK(x)
00352 
00353 #define       LAZY_RWLOCK_UNLOCK(x)
00354 
00355 #define       LAZY_RWLOCK_WRLOCK(x)
00356 
00357 #define       LAZY_RWLOCK_RDLOCK(x)
00358 
00359 #define       LAZY_RWLOCK_DEF(x)
00360 
00361 #define       LAZY_RWLOCK_INIT(x,y)
00362 
00363 #define       LAZY_RWLOCK_DESTROY(x)
00364 #else
00365 #define       LAZY_MUTEX_LOCK(x)          pthread_mutex_lock(x)
00366 #define       LAZY_MUTEX_UNLOCK(x) pthread_mutex_unlock(x)
00367 #define       LAZY_RWLOCK_UNLOCK(x)       pthread_rwlock_unlock(x)
00368 #define       LAZY_RWLOCK_WRLOCK(x)       pthread_rwlock_wrlock(x)
00369 #define       LAZY_RWLOCK_RDLOCK(x)       pthread_rwlock_rdlock(x)
00370 #define       LAZY_RWLOCK_DEF(x)          pthread_rwlock_t     x;
00371 #define       LAZY_RWLOCK_INIT(x,y)       pthread_rwlock_init(x,y)
00372 #define       LAZY_RWLOCK_DESTROY(x)      pthread_rwlock_destroy(x)
00373 #endif
00374 
00379 #define P_INVALID     (~0UL)
00380 
00382 #define F_ISSET(w, f)        (((w) & (f)) == (f))
00383 
00388 typedef uint16_t      indx_t;
00389 
00394 #define DEFAULT_MAPSIZE     1048576
00395 
00438 #define DEFAULT_READERS     126
00439 
00445 #ifndef CACHELINE
00446 #define CACHELINE    64
00447 #endif
00448 
00457 typedef struct MDB_rxbody {
00465        txnid_t              mrb_txnid;
00467        pid_t         mrb_pid;
00469        pthread_t     mrb_tid;
00470 } MDB_rxbody;
00471 
00473 typedef struct MDB_reader {
00474        union {
00475               MDB_rxbody mrx;
00477 #define       mr_txnid      mru.mrx.mrb_txnid
00478 #define       mr_pid mru.mrx.mrb_pid
00479 #define       mr_tid mru.mrx.mrb_tid
00480 
00481               char pad[(sizeof(MDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
00482        } mru;
00483 } MDB_reader;
00484 
00499 typedef struct MDB_txbody {
00502        uint32_t      mtb_magic;
00504        uint32_t      mtb_version;
00505 #if defined(_WIN32) || defined(__APPLE__)
00506        char   mtb_rmname[MNAME_LEN];
00507 #else
00508 
00511        pthread_mutex_t      mtb_mutex;
00512 #endif
00513 
00517        txnid_t              mtb_txnid;
00522        unsigned      mtb_numreaders;
00527        uint32_t      mtb_me_toggle;
00528 } MDB_txbody;
00529 
00531 typedef struct MDB_txninfo {
00532        union {
00533               MDB_txbody mtb;
00534 #define mti_magic    mt1.mtb.mtb_magic
00535 #define mti_version  mt1.mtb.mtb_version
00536 #define mti_mutex    mt1.mtb.mtb_mutex
00537 #define mti_rmname   mt1.mtb.mtb_rmname
00538 #define mti_txnid    mt1.mtb.mtb_txnid
00539 #define mti_numreaders      mt1.mtb.mtb_numreaders
00540 #define mti_me_toggle       mt1.mtb.mtb_me_toggle
00541               char pad[(sizeof(MDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
00542        } mt1;
00543        union {
00544 #if defined(_WIN32) || defined(__APPLE__)
00545               char mt2_wmname[MNAME_LEN];
00546 #define       mti_wmname    mt2.mt2_wmname
00547 #else
00548               pthread_mutex_t      mt2_wmutex;
00549 #define mti_wmutex   mt2.mt2_wmutex
00550 #endif
00551               char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
00552        } mt2;
00553        MDB_reader    mti_readers[1];
00554 } MDB_txninfo;
00561 typedef struct MDB_page {
00562 #define       mp_pgno       mp_p.p_pgno
00563 #define       mp_next       mp_p.p_next
00564        union {
00565               pgno_t        p_pgno;       
00566               void *        p_next;       
00567        } mp_p;
00568        uint16_t      mp_pad;
00574 #define       P_BRANCH       0x01         
00575 #define       P_LEAF         0x02         
00576 #define       P_OVERFLOW     0x04         
00577 #define       P_META         0x08         
00578 #define       P_DIRTY               0x10         
00579 #define       P_LEAF2               0x20         
00580 #define       P_SUBP         0x40         
00582        uint16_t      mp_flags;            
00583 #define mp_lower     mp_pb.pb.pb_lower
00584 #define mp_upper     mp_pb.pb.pb_upper
00585 #define mp_pages     mp_pb.pb_pages
00586        union {
00587               struct {
00588                      indx_t        pb_lower;            
00589                      indx_t        pb_upper;            
00590               } pb;
00591               uint32_t      pb_pages;     
00592        } mp_pb;
00593        indx_t        mp_ptrs[1];          
00594 } MDB_page;
00595 
00597 #define PAGEHDRSZ     ((unsigned) offsetof(MDB_page, mp_ptrs))
00598 
00600 #define METADATA(p)   ((void *)((char *)(p) + PAGEHDRSZ))
00601 
00603 #define NUMKEYS(p)    (((p)->mp_lower - PAGEHDRSZ) >> 1)
00604 
00606 #define SIZELEFT(p)   (indx_t)((p)->mp_upper - (p)->mp_lower)
00607 
00609 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
00610                             ((env)->me_psize - PAGEHDRSZ))
00611 
00614 #define FILL_THRESHOLD       250
00615 
00617 #define IS_LEAF(p)    F_ISSET((p)->mp_flags, P_LEAF)
00618 
00619 #define IS_LEAF2(p)   F_ISSET((p)->mp_flags, P_LEAF2)
00620 
00621 #define IS_BRANCH(p)  F_ISSET((p)->mp_flags, P_BRANCH)
00622 
00623 #define IS_OVERFLOW(p)       F_ISSET((p)->mp_flags, P_OVERFLOW)
00624 
00625 #define IS_SUBP(p)    F_ISSET((p)->mp_flags, P_SUBP)
00626 
00628 #define OVPAGES(size, psize)       ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
00629 
00633 typedef struct MDB_node {
00640 #define mn_lo mn_offset[BYTE_ORDER!=LITTLE_ENDIAN]
00641 #define mn_hi mn_offset[BYTE_ORDER==LITTLE_ENDIAN] 
00642        unsigned short       mn_offset[2]; 
00648 #define F_BIGDATA     0x01                
00649 #define F_SUBDATA     0x02                
00650 #define F_DUPDATA     0x04                
00653 #define       NODE_ADD_FLAGS       (F_DUPDATA|F_SUBDATA|MDB_RESERVE|MDB_APPEND)
00654 
00656        unsigned short       mn_flags;            
00657        unsigned short       mn_ksize;            
00658        char          mn_data[1];                 
00659 } MDB_node;
00660 
00662 #define NODESIZE      offsetof(MDB_node, mn_data)
00663 
00665 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
00666 
00670 #define INDXSIZE(k)   (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
00671 
00675 #define LEAFSIZE(k, d)       (NODESIZE + (k)->mv_size + (d)->mv_size)
00676 
00678 #define NODEPTR(p, i)        ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
00679 
00681 #define NODEKEY(node)        (void *)((node)->mn_data)
00682 
00684 #define NODEDATA(node)       (void *)((char *)(node)->mn_data + (node)->mn_ksize)
00685 
00687 #define NODEPGNO(node) \
00688        ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
00689         (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
00690 
00691 #define SETPGNO(node,pgno)  do { \
00692        (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
00693        if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
00694 
00696 #define NODEDSZ(node)        ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
00697 
00698 #define SETDSZ(node,size)   do { \
00699        (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
00700 
00701 #define NODEKSZ(node)        ((node)->mn_ksize)
00702 
00704 #ifdef MISALIGNED_OK
00705 #define COPY_PGNO(dst,src)  dst = src
00706 #else
00707 #if SIZE_MAX > 4294967295UL
00708 #define COPY_PGNO(dst,src)  do { \
00709        unsigned short *s, *d;      \
00710        s = (unsigned short *)&(src);      \
00711        d = (unsigned short *)&(dst);      \
00712        *d++ = *s++;  \
00713        *d++ = *s++;  \
00714        *d++ = *s++;  \
00715        *d = *s;      \
00716 } while (0)
00717 #else
00718 #define COPY_PGNO(dst,src)  do { \
00719        unsigned short *s, *d;      \
00720        s = (unsigned short *)&(src);      \
00721        d = (unsigned short *)&(dst);      \
00722        *d++ = *s++;  \
00723        *d = *s;      \
00724 } while (0)
00725 #endif
00726 #endif
00727 
00731 #define LEAF2KEY(p, i, ks)  ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
00732 
00734 #define MDB_SET_KEY(node, key)     { if ((key) != NULL) { \
00735        (key)->mv_size = NODEKSZ(node); (key)->mv_data = NODEKEY(node); } }
00736 
00738 typedef struct MDB_db {
00739        uint32_t      md_pad;              
00740        uint16_t      md_flags;     
00741        uint16_t      md_depth;     
00742        pgno_t        md_branch_pages;     
00743        pgno_t        md_leaf_pages;              
00744        pgno_t        md_overflow_pages;   
00745        size_t        md_entries;          
00746        pgno_t        md_root;             
00747 } MDB_db;
00748 
00750 #define       FREE_DBI      0
00751 
00752 #define       MAIN_DBI      1
00753 
00755 typedef struct MDB_meta {
00758        uint32_t      mm_magic;
00760        uint32_t      mm_version;
00761        void          *mm_address;         
00762        size_t        mm_mapsize;                 
00763        MDB_db        mm_dbs[2];                  
00765 #define       mm_psize      mm_dbs[0].md_pad
00766 
00767 #define       mm_flags      mm_dbs[0].md_flags
00768        pgno_t        mm_last_pg;                 
00769        txnid_t              mm_txnid;                   
00770 } MDB_meta;
00771 
00777 typedef union MDB_pagebuf {
00778        char          mb_raw[MDB_PAGESIZE];
00779        MDB_page      mb_page;
00780        struct {
00781               char          mm_pad[PAGEHDRSZ];
00782               MDB_meta      mm_meta;
00783        } mb_metabuf;
00784 } MDB_pagebuf;
00785 
00790 typedef struct MDB_dbx {
00791        MDB_val              md_name;             
00792        MDB_cmp_func  *md_cmp;      
00793        MDB_cmp_func  *md_dcmp;     
00794        MDB_rel_func  *md_rel;      
00795        void          *md_relctx;          
00796 } MDB_dbx;
00797 
00801 struct MDB_txn {
00802        MDB_txn              *mt_parent;          
00803        MDB_txn              *mt_child;           
00804        pgno_t        mt_next_pgno; 
00809        txnid_t              mt_txnid;
00810        MDB_env              *mt_env;             
00813        IDL                  mt_free_pgs;
00814        union {
00815               ID2L   dirty_list;   
00816               MDB_reader    *reader;      
00817        } mt_u;
00819        MDB_dbx              *mt_dbxs;
00821        MDB_db        *mt_dbs;
00826 #define DB_DIRTY     0x01          
00827 #define DB_STALE     0x02          
00830        MDB_cursor    **mt_cursors;
00831 
00832        unsigned char *mt_dbflags;
00836        MDB_dbi              mt_numdbs;
00837 
00842 #define MDB_TXN_RDONLY             0x01          
00843 #define MDB_TXN_ERROR              0x02          
00845        unsigned int  mt_flags;            
00849        unsigned int  mt_toggle;
00850 };
00851 
00856 #define CURSOR_STACK         32
00857 
00858 struct MDB_xcursor;
00859 
00861 struct MDB_cursor {
00863        MDB_cursor    *mc_next;
00865        MDB_cursor    *mc_orig;
00867        struct MDB_xcursor   *mc_xcursor;
00869        MDB_txn              *mc_txn;
00871        MDB_dbi              mc_dbi;
00873        MDB_db        *mc_db;
00875        MDB_dbx              *mc_dbx;
00877        unsigned char *mc_dbflag;
00878        unsigned short       mc_snum;      
00879        unsigned short       mc_top;              
00885 #define C_INITIALIZED       0x01   
00886 #define C_EOF 0x02                 
00887 #define C_SUB 0x04                 
00888 #define C_SHADOW     0x08          
00889 #define C_ALLOCD     0x10          
00891        unsigned int  mc_flags;     
00892        MDB_page      *mc_pg[CURSOR_STACK];       
00893        indx_t        mc_ki[CURSOR_STACK]; 
00894 };
00895 
00901 typedef struct MDB_xcursor {
00903        MDB_cursor mx_cursor;
00905        MDB_db mx_db;
00907        MDB_dbx       mx_dbx;
00909        unsigned char mx_dbflag;
00910 } MDB_xcursor;
00911 
00913 typedef struct MDB_oldpages {
00917        struct MDB_oldpages *mo_next;
00919        txnid_t              mo_txnid;
00921        pgno_t        mo_pages[1];  /* dynamic */
00922 } MDB_oldpages;
00923 
00925 struct MDB_env {
00926        HANDLE        me_fd;        
00927        HANDLE        me_lfd;              
00928        HANDLE        me_mfd;                     
00930 #define       MDB_FATAL_ERROR      0x80000000U
00931        uint32_t      me_flags;            
00932        uint32_t      me_extrapad;  
00933        unsigned int  me_maxreaders;       
00934        MDB_dbi              me_numdbs;           
00935        MDB_dbi              me_maxdbs;           
00936        char          *me_path;            
00937        char          *me_map;             
00938        MDB_txninfo   *me_txns;            
00939        MDB_meta      *me_metas[2]; 
00940        MDB_txn              *me_txn;             
00941        size_t        me_mapsize;          
00942        off_t         me_size;             
00943        pgno_t        me_maxpg;            
00944        unsigned int  me_psize;     
00945        unsigned int  me_db_toggle; 
00946        txnid_t              me_wtxnid;           
00947        txnid_t              me_pgfirst;          
00948        txnid_t              me_pglast;           
00949        MDB_dbx              *me_dbxs;            
00950        MDB_db        *me_dbs[2];          
00951        MDB_oldpages *me_pghead;    
00952        pthread_key_t me_txkey;     
00953        MDB_page      *me_dpages;          
00955        IDL                  me_free_pgs;
00957        ID2                  me_dirty_list[MDB_IDL_UM_SIZE];
00959        LAZY_RWLOCK_DEF(me_dblock)
00960 #ifdef _WIN32
00961        HANDLE        me_rmutex;           /* Windows mutexes don't reside in shared mem */
00962        HANDLE        me_wmutex;
00963 #endif
00964 #ifdef __APPLE__
00965        sem_t         *me_rmutex;          /* Apple doesn't support shared mutexes */
00966        sem_t         *me_wmutex;
00967 #endif
00968 };
00970 #define MDB_COMMIT_PAGES     64
00971 #if defined(IOV_MAX) && IOV_MAX < MDB_COMMIT_PAGES
00972 #undef MDB_COMMIT_PAGES
00973 #define MDB_COMMIT_PAGES    IOV_MAX
00974 #endif
00975 
00976 static MDB_page *mdb_page_alloc(MDB_cursor *mc, int num);
00977 static MDB_page *mdb_page_new(MDB_cursor *mc, uint32_t flags, int num);
00978 static int           mdb_page_touch(MDB_cursor *mc);
00979 
00980 static int  mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp);
00981 static int  mdb_page_search_root(MDB_cursor *mc,
00982                          MDB_val *key, int modify);
00983 static int  mdb_page_search(MDB_cursor *mc,
00984                          MDB_val *key, int modify);
00985 static int    mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst);
00986 static int    mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata,
00987                             pgno_t newpgno, unsigned int nflags);
00988 
00989 static int  mdb_env_read_header(MDB_env *env, MDB_meta *meta);
00990 static int  mdb_env_read_meta(MDB_env *env, int *which);
00991 static int  mdb_env_write_meta(MDB_txn *txn);
00992 
00993 static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp);
00994 static int  mdb_node_add(MDB_cursor *mc, indx_t indx,
00995                          MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags);
00996 static void mdb_node_del(MDB_page *mp, indx_t indx, int ksize);
00997 static void mdb_node_shrink(MDB_page *mp, indx_t indx);
00998 static int    mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
00999 static int  mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
01000 static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
01001 static size_t mdb_branch_size(MDB_env *env, MDB_val *key);
01002 
01003 static int    mdb_rebalance(MDB_cursor *mc);
01004 static int    mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key);
01005 
01006 static void   mdb_cursor_pop(MDB_cursor *mc);
01007 static int    mdb_cursor_push(MDB_cursor *mc, MDB_page *mp);
01008 
01009 static int    mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf);
01010 static int    mdb_cursor_sibling(MDB_cursor *mc, int move_right);
01011 static int    mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
01012 static int    mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op);
01013 static int    mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op,
01014                             int *exactp);
01015 static int    mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data);
01016 static int    mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data);
01017 
01018 static void   mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx);
01019 static void   mdb_xcursor_init0(MDB_cursor *mc);
01020 static void   mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node);
01021 
01022 static int    mdb_drop0(MDB_cursor *mc, int subs);
01023 static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi);
01024 
01026 static MDB_cmp_func  mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int, mdb_cmp_cint, mdb_cmp_long;
01029 #ifdef _WIN32
01030 static SECURITY_DESCRIPTOR mdb_null_sd;
01031 static SECURITY_ATTRIBUTES mdb_all_sa;
01032 static int mdb_sec_inited;
01033 #endif
01034 
01036 char *
01037 mdb_version(int *major, int *minor, int *patch)
01038 {
01039        if (major) *major = MDB_VERSION_MAJOR;
01040        if (minor) *minor = MDB_VERSION_MINOR;
01041        if (patch) *patch = MDB_VERSION_PATCH;
01042        return MDB_VERSION_STRING;
01043 }
01044 
01046 static char *const mdb_errstr[] = {
01047        "MDB_KEYEXIST: Key/data pair already exists",
01048        "MDB_NOTFOUND: No matching key/data pair found",
01049        "MDB_PAGE_NOTFOUND: Requested page not found",
01050        "MDB_CORRUPTED: Located page was wrong type",
01051        "MDB_PANIC: Update of meta page failed",
01052        "MDB_VERSION_MISMATCH: Database environment version mismatch"
01053 };
01054 
01055 char *
01056 mdb_strerror(int err)
01057 {
01058        if (!err)
01059               return ("Successful return: 0");
01060 
01061        if (err >= MDB_KEYEXIST && err <= MDB_VERSION_MISMATCH)
01062               return mdb_errstr[err - MDB_KEYEXIST];
01063 
01064        return strerror(err);
01065 }
01066 
01067 #if MDB_DEBUG
01068 
01073 char *
01074 mdb_dkey(MDB_val *key, char *buf)
01075 {
01076        char *ptr = buf;
01077        unsigned char *c = key->mv_data;
01078        unsigned int i;
01079        if (key->mv_size > MAXKEYSIZE)
01080               return "MAXKEYSIZE";
01081        /* may want to make this a dynamic check: if the key is mostly
01082         * printable characters, print it as-is instead of converting to hex.
01083         */
01084 #if 1
01085        buf[0] = '\0';
01086        for (i=0; i<key->mv_size; i++)
01087               ptr += sprintf(ptr, "%02x", *c++);
01088 #else
01089        sprintf(buf, "%.*s", key->mv_size, key->mv_data);
01090 #endif
01091        return buf;
01092 }
01093 
01095 static void
01096 mdb_page_keys(MDB_page *mp)
01097 {
01098        MDB_node *node;
01099        unsigned int i, nkeys;
01100        MDB_val key;
01101        DKBUF;
01102 
01103        nkeys = NUMKEYS(mp);
01104        DPRINTF("numkeys %d", nkeys);
01105        for (i=0; i<nkeys; i++) {
01106               node = NODEPTR(mp, i);
01107               key.mv_size = node->mn_ksize;
01108               key.mv_data = node->mn_data;
01109               DPRINTF("key %d: %s", i, DKEY(&key));
01110        }
01111 }
01112 #endif
01113 
01114 #if MDB_DEBUG > 2
01115 
01119 static void mdb_audit(MDB_txn *txn)
01120 {
01121        MDB_cursor mc;
01122        MDB_val key, data;
01123        int rc, i;
01124        ID freecount, count;
01125 
01126        freecount = 0;
01127        mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
01128        while ((rc = mdb_cursor_get(&mc, &key, &data, MDB_NEXT)) == 0)
01129               freecount += *(ID *)data.mv_data;
01130        freecount += txn->mt_dbs[0].md_branch_pages + txn->mt_dbs[0].md_leaf_pages +
01131               txn->mt_dbs[0].md_overflow_pages;
01132 
01133        count = 0;
01134        for (i = 0; i<txn->mt_numdbs; i++) {
01135               count += txn->mt_dbs[i].md_branch_pages +
01136                      txn->mt_dbs[i].md_leaf_pages +
01137                      txn->mt_dbs[i].md_overflow_pages;
01138               if (txn->mt_dbs[i].md_flags & MDB_DUPSORT) {
01139                      MDB_xcursor mx;
01140                      mdb_cursor_init(&mc, txn, i, &mx);
01141                      mdb_page_search(&mc, NULL, 0);
01142                      do {
01143                             int j;
01144                             MDB_page *mp;
01145                             mp = mc.mc_pg[mc.mc_top];
01146                             for (j=0; j<NUMKEYS(mp); j++) {
01147                                    MDB_node *leaf = NODEPTR(mp, j);
01148                                    if (leaf->mn_flags & F_SUBDATA) {
01149                                           MDB_db db;
01150                                           memcpy(&db, NODEDATA(leaf), sizeof(db));
01151                                           count += db.md_branch_pages + db.md_leaf_pages +
01152                                                  db.md_overflow_pages;
01153                                    }
01154                             }
01155                      }
01156                      while (mdb_cursor_sibling(&mc, 1) == 0);
01157               }
01158        }
01159        assert(freecount + count + 2 >= txn->mt_next_pgno - 1);
01160 }
01161 #endif
01162 
01163 int
01164 mdb_cmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
01165 {
01166        return txn->mt_dbxs[dbi].md_cmp(a, b);
01167 }
01168 
01169 int
01170 mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
01171 {
01172        if (txn->mt_dbxs[dbi].md_dcmp)
01173               return txn->mt_dbxs[dbi].md_dcmp(a, b);
01174        else
01175               return EINVAL;       /* too bad you can't distinguish this from a valid result */
01176 }
01177 
01181 static MDB_page *
01182 mdb_page_malloc(MDB_cursor *mc) {
01183        MDB_page *ret;
01184        size_t sz = mc->mc_txn->mt_env->me_psize;
01185        if ((ret = mc->mc_txn->mt_env->me_dpages) != NULL) {
01186               VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
01187               VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
01188               mc->mc_txn->mt_env->me_dpages = ret->mp_next;
01189        } else if ((ret = malloc(sz)) != NULL) {
01190               VGMEMP_ALLOC(mc->mc_txn->mt_env, ret, sz);
01191        }
01192        return ret;
01193 }
01194 
01204 static MDB_page *
01205 mdb_page_alloc(MDB_cursor *mc, int num)
01206 {
01207        MDB_txn *txn = mc->mc_txn;
01208        MDB_page *np;
01209        pgno_t pgno = P_INVALID;
01210        ID2 mid;
01211 
01212        if (txn->mt_txnid > 2) {
01213 
01214               if (!txn->mt_env->me_pghead &&
01215                      txn->mt_dbs[FREE_DBI].md_root != P_INVALID) {
01216                      /* See if there's anything in the free DB */
01217                      MDB_cursor m2;
01218                      MDB_node *leaf;
01219                      MDB_val data;
01220                      txnid_t *kptr, oldest, last;
01221 
01222                      mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
01223                      if (!txn->mt_env->me_pgfirst) {
01224                             mdb_page_search(&m2, NULL, 0);
01225                             leaf = NODEPTR(m2.mc_pg[m2.mc_top], 0);
01226                             kptr = (txnid_t *)NODEKEY(leaf);
01227                             last = *kptr;
01228                      } else {
01229                             MDB_val key;
01230                             int rc, exact;
01231 again:
01232                             exact = 0;
01233                             last = txn->mt_env->me_pglast + 1;
01234                             leaf = NULL;
01235                             key.mv_data = &last;
01236                             key.mv_size = sizeof(last);
01237                             rc = mdb_cursor_set(&m2, &key, &data, MDB_SET, &exact);
01238                             if (rc)
01239                                    goto none;
01240                             last = *(txnid_t *)key.mv_data;
01241                      }
01242 
01243                      {
01244                             unsigned int i;
01245                             oldest = txn->mt_txnid - 1;
01246                             for (i=0; i<txn->mt_env->me_txns->mti_numreaders; i++) {
01247                                    txnid_t mr = txn->mt_env->me_txns->mti_readers[i].mr_txnid;
01248                                    if (mr && mr < oldest)
01249                                           oldest = mr;
01250                             }
01251                      }
01252 
01253                      if (oldest > last) {
01254                             /* It's usable, grab it.
01255                              */
01256                             MDB_oldpages *mop;
01257                             pgno_t *idl;
01258 
01259                             if (!txn->mt_env->me_pgfirst) {
01260                                    mdb_node_read(txn, leaf, &data);
01261                             }
01262                             txn->mt_env->me_pglast = last;
01263                             if (!txn->mt_env->me_pgfirst)
01264                                    txn->mt_env->me_pgfirst = last;
01265                             idl = (ID *) data.mv_data;
01266                             /* We might have a zero-length IDL due to freelist growth
01267                              * during a prior commit
01268                              */
01269                             if (!idl[0]) goto again;
01270                             mop = malloc(sizeof(MDB_oldpages) + MDB_IDL_SIZEOF(idl) - sizeof(pgno_t));
01271                             mop->mo_next = txn->mt_env->me_pghead;
01272                             mop->mo_txnid = last;
01273                             txn->mt_env->me_pghead = mop;
01274                             memcpy(mop->mo_pages, idl, MDB_IDL_SIZEOF(idl));
01275 
01276 #if MDB_DEBUG > 1
01277                             {
01278                                    unsigned int i;
01279                                    DPRINTF("IDL read txn %zu root %zu num %zu",
01280                                           mop->mo_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
01281                                    for (i=0; i<idl[0]; i++) {
01282                                           DPRINTF("IDL %zu", idl[i+1]);
01283                                    }
01284                             }
01285 #endif
01286                      }
01287               }
01288 none:
01289               if (txn->mt_env->me_pghead) {
01290                      MDB_oldpages *mop = txn->mt_env->me_pghead;
01291                      if (num > 1) {
01292                             /* FIXME: For now, always use fresh pages. We
01293                              * really ought to search the free list for a
01294                              * contiguous range.
01295                              */
01296                             ;
01297                      } else {
01298                             /* peel pages off tail, so we only have to truncate the list */
01299                             pgno = MDB_IDL_LAST(mop->mo_pages);
01300                             if (MDB_IDL_IS_RANGE(mop->mo_pages)) {
01301                                    mop->mo_pages[2]++;
01302                                    if (mop->mo_pages[2] > mop->mo_pages[1])
01303                                           mop->mo_pages[0] = 0;
01304                             } else {
01305                                    mop->mo_pages[0]--;
01306                             }
01307                             if (MDB_IDL_IS_ZERO(mop->mo_pages)) {
01308                                    txn->mt_env->me_pghead = mop->mo_next;
01309                                    free(mop);
01310                             }
01311                      }
01312               }
01313        }
01314 
01315        if (pgno == P_INVALID) {
01316               /* DB size is maxed out */
01317               if (txn->mt_next_pgno + num >= txn->mt_env->me_maxpg) {
01318                      DPUTS("DB size maxed out");
01319                      return NULL;
01320               }
01321        }
01322        if (txn->mt_env->me_dpages && num == 1) {
01323               np = txn->mt_env->me_dpages;
01324               VGMEMP_ALLOC(txn->mt_env, np, txn->mt_env->me_psize);
01325               VGMEMP_DEFINED(np, sizeof(np->mp_next));
01326               txn->mt_env->me_dpages = np->mp_next;
01327        } else {
01328               size_t sz = txn->mt_env->me_psize * num;
01329               if ((np = malloc(sz)) == NULL)
01330                      return NULL;
01331               VGMEMP_ALLOC(txn->mt_env, np, sz);
01332        }
01333        if (pgno == P_INVALID) {
01334               np->mp_pgno = txn->mt_next_pgno;
01335               txn->mt_next_pgno += num;
01336        } else {
01337               np->mp_pgno = pgno;
01338        }
01339        mid.mid = np->mp_pgno;
01340        mid.mptr = np;
01341        mdb_mid2l_insert(txn->mt_u.dirty_list, &mid);
01342 
01343        return np;
01344 }
01345 
01350 static int
01351 mdb_page_touch(MDB_cursor *mc)
01352 {
01353        MDB_page *mp = mc->mc_pg[mc->mc_top];
01354        pgno_t pgno;
01355 
01356        if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
01357               MDB_page *np;
01358               if ((np = mdb_page_alloc(mc, 1)) == NULL)
01359                      return ENOMEM;
01360               DPRINTF("touched db %u page %zu -> %zu", mc->mc_dbi, mp->mp_pgno, np->mp_pgno);
01361               assert(mp->mp_pgno != np->mp_pgno);
01362               mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
01363               pgno = np->mp_pgno;
01364               memcpy(np, mp, mc->mc_txn->mt_env->me_psize);
01365               mp = np;
01366               mp->mp_pgno = pgno;
01367               mp->mp_flags |= P_DIRTY;
01368 
01369 finish:
01370               /* Adjust other cursors pointing to mp */
01371               if (mc->mc_flags & C_SUB) {
01372                      MDB_cursor *m2, *m3;
01373                      MDB_dbi dbi = mc->mc_dbi-1;
01374 
01375                      for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
01376                             if (m2 == mc) continue;
01377                             m3 = &m2->mc_xcursor->mx_cursor;
01378                             if (m3->mc_snum < mc->mc_snum) continue;
01379                             if (m3->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
01380                                    m3->mc_pg[mc->mc_top] = mp;
01381                             }
01382                      }
01383               } else {
01384                      MDB_cursor *m2;
01385 
01386                      for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
01387                             if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
01388                             if (m2->mc_pg[mc->mc_top] == mc->mc_pg[mc->mc_top]) {
01389                                    m2->mc_pg[mc->mc_top] = mp;
01390                             }
01391                      }
01392               }
01393               mc->mc_pg[mc->mc_top] = mp;
01397               if (mc->mc_top)
01398                      SETPGNO(NODEPTR(mc->mc_pg[mc->mc_top-1], mc->mc_ki[mc->mc_top-1]), mp->mp_pgno);
01399               else
01400                      mc->mc_db->md_root = mp->mp_pgno;
01401        } else if (mc->mc_txn->mt_parent) {
01402               MDB_page *np;
01403               ID2 mid;
01404               /* If txn has a parent, make sure the page is in our
01405                * dirty list.
01406                */
01407               if (mc->mc_txn->mt_u.dirty_list[0].mid) {
01408                      unsigned x = mdb_mid2l_search(mc->mc_txn->mt_u.dirty_list, mp->mp_pgno);
01409                      if (x <= mc->mc_txn->mt_u.dirty_list[0].mid &&
01410                             mc->mc_txn->mt_u.dirty_list[x].mid == mp->mp_pgno) {
01411                             if (mc->mc_txn->mt_u.dirty_list[x].mptr != mp) {
01412                                    mp = mc->mc_txn->mt_u.dirty_list[x].mptr;
01413                                    mc->mc_pg[mc->mc_top] = mp;
01414                             }
01415                             return 0;
01416                      }
01417               }
01418               /* No - copy it */
01419               np = mdb_page_malloc(mc);
01420               memcpy(np, mp, mc->mc_txn->mt_env->me_psize);
01421               mid.mid = np->mp_pgno;
01422               mid.mptr = np;
01423               mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &mid);
01424               mp = np;
01425               goto finish;
01426        }
01427        return 0;
01428 }
01429 
01430 int
01431 mdb_env_sync(MDB_env *env, int force)
01432 {
01433        int rc = 0;
01434        if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
01435               if (MDB_FDATASYNC(env->me_fd))
01436                      rc = ErrCode();
01437        }
01438        return rc;
01439 }
01440 
01442 static int
01443 mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
01444 {
01445        MDB_cursor *mc, *m2;
01446        unsigned int i, j, size;
01447 
01448        for (i=0;i<src->mt_numdbs; i++) {
01449               if (src->mt_cursors[i]) {
01450                      size = sizeof(MDB_cursor);
01451                      if (src->mt_cursors[i]->mc_xcursor)
01452                             size += sizeof(MDB_xcursor);
01453                      for (m2 = src->mt_cursors[i]; m2; m2=m2->mc_next) {
01454                             mc = malloc(size);
01455                             if (!mc)
01456                                    return ENOMEM;
01457                             mc->mc_orig = m2;
01458                             mc->mc_txn = dst;
01459                             mc->mc_dbi = i;
01460                             mc->mc_db = &dst->mt_dbs[i];
01461                             mc->mc_dbx = m2->mc_dbx;
01462                             mc->mc_dbflag = &dst->mt_dbflags[i];
01463                             mc->mc_snum = m2->mc_snum;
01464                             mc->mc_top = m2->mc_top;
01465                             mc->mc_flags = m2->mc_flags | C_SHADOW;
01466                             for (j=0; j<mc->mc_snum; j++) {
01467                                    mc->mc_pg[j] = m2->mc_pg[j];
01468                                    mc->mc_ki[j] = m2->mc_ki[j];
01469                             }
01470                             if (m2->mc_xcursor) {
01471                                    MDB_xcursor *mx, *mx2;
01472                                    mx = (MDB_xcursor *)(mc+1);
01473                                    mc->mc_xcursor = mx;
01474                                    mx2 = m2->mc_xcursor;
01475                                    mx->mx_db = mx2->mx_db;
01476                                    mx->mx_dbx = mx2->mx_dbx;
01477                                    mx->mx_dbflag = mx2->mx_dbflag;
01478                                    mx->mx_cursor.mc_txn = dst;
01479                                    mx->mx_cursor.mc_dbi = mx2->mx_cursor.mc_dbi;
01480                                    mx->mx_cursor.mc_db = &mx->mx_db;
01481                                    mx->mx_cursor.mc_dbx = &mx->mx_dbx;
01482                                    mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
01483                                    mx->mx_cursor.mc_snum = mx2->mx_cursor.mc_snum;
01484                                    mx->mx_cursor.mc_top = mx2->mx_cursor.mc_top;
01485                                    mx->mx_cursor.mc_flags = mx2->mx_cursor.mc_flags | C_SHADOW;
01486                                    for (j=0; j<mx2->mx_cursor.mc_snum; j++) {
01487                                           mx->mx_cursor.mc_pg[j] = mx2->mx_cursor.mc_pg[j];
01488                                           mx->mx_cursor.mc_ki[j] = mx2->mx_cursor.mc_ki[j];
01489                                    }
01490                             } else {
01491                                    mc->mc_xcursor = NULL;
01492                             }
01493                             mc->mc_next = dst->mt_cursors[i];
01494                             dst->mt_cursors[i] = mc;
01495                      }
01496               }
01497        }
01498        return MDB_SUCCESS;
01499 }
01500 
01502 static void
01503 mdb_cursor_merge(MDB_txn *txn)
01504 {
01505        MDB_dbi i;
01506        for (i=0; i<txn->mt_numdbs; i++) {
01507               if (txn->mt_cursors[i]) {
01508                      MDB_cursor *mc;
01509                      while ((mc = txn->mt_cursors[i])) {
01510                             txn->mt_cursors[i] = mc->mc_next;
01511                             if (mc->mc_flags & C_SHADOW) {
01512                                    MDB_cursor *m2 = mc->mc_orig;
01513                                    unsigned int j;
01514                                    m2->mc_snum = mc->mc_snum;
01515                                    m2->mc_top = mc->mc_top;
01516                                    for (j=0; j<mc->mc_snum; j++) {
01517                                           m2->mc_pg[j] = mc->mc_pg[j];
01518                                           m2->mc_ki[j] = mc->mc_ki[j];
01519                                    }
01520                             }
01521                             if (mc->mc_flags & C_ALLOCD)
01522                                    free(mc);
01523                      }
01524               }
01525        }
01526 }
01527 
01528 static void
01529 mdb_txn_reset0(MDB_txn *txn);
01530 
01537 static int
01538 mdb_txn_renew0(MDB_txn *txn)
01539 {
01540        MDB_env *env = txn->mt_env;
01541        char mt_dbflag = 0;
01542 
01543        if (txn->mt_flags & MDB_TXN_RDONLY) {
01544               MDB_reader *r = pthread_getspecific(env->me_txkey);
01545               if (!r) {
01546                      unsigned int i;
01547                      pid_t pid = getpid();
01548                      pthread_t tid = pthread_self();
01549 
01550                      LOCK_MUTEX_R(env);
01551                      for (i=0; i<env->me_txns->mti_numreaders; i++)
01552                             if (env->me_txns->mti_readers[i].mr_pid == 0)
01553                                    break;
01554                      if (i == env->me_maxreaders) {
01555                             UNLOCK_MUTEX_R(env);
01556                             return ENOMEM;
01557                      }
01558                      env->me_txns->mti_readers[i].mr_pid = pid;
01559                      env->me_txns->mti_readers[i].mr_tid = tid;
01560                      if (i >= env->me_txns->mti_numreaders)
01561                             env->me_txns->mti_numreaders = i+1;
01562                      UNLOCK_MUTEX_R(env);
01563                      r = &env->me_txns->mti_readers[i];
01564                      pthread_setspecific(env->me_txkey, r);
01565               }
01566               txn->mt_toggle = env->me_txns->mti_me_toggle;
01567               txn->mt_txnid = env->me_txns->mti_txnid;
01568               /* This happens if a different process was the
01569                * last writer to the DB.
01570                */
01571               if (env->me_wtxnid < txn->mt_txnid)
01572                      mt_dbflag = DB_STALE;
01573               r->mr_txnid = txn->mt_txnid;
01574               txn->mt_u.reader = r;
01575        } else {
01576               LOCK_MUTEX_W(env);
01577 
01578               txn->mt_txnid = env->me_txns->mti_txnid;
01579               if (env->me_wtxnid < txn->mt_txnid)
01580                      mt_dbflag = DB_STALE;
01581               txn->mt_txnid++;
01582 #if MDB_DEBUG
01583               if (txn->mt_txnid == mdb_debug_start)
01584                      mdb_debug = 1;
01585 #endif
01586               txn->mt_toggle = env->me_txns->mti_me_toggle;
01587               txn->mt_u.dirty_list = env->me_dirty_list;
01588               txn->mt_u.dirty_list[0].mid = 0;
01589               txn->mt_free_pgs = env->me_free_pgs;
01590               txn->mt_free_pgs[0] = 0;
01591               txn->mt_next_pgno = env->me_metas[txn->mt_toggle]->mm_last_pg+1;
01592               env->me_txn = txn;
01593        }
01594 
01595        /* Copy the DB arrays */
01596        LAZY_RWLOCK_RDLOCK(&env->me_dblock);
01597        txn->mt_numdbs = env->me_numdbs;
01598        txn->mt_dbxs = env->me_dbxs;       /* mostly static anyway */
01599        memcpy(txn->mt_dbs, env->me_metas[txn->mt_toggle]->mm_dbs, 2 * sizeof(MDB_db));
01600        if (txn->mt_numdbs > 2)
01601               memcpy(txn->mt_dbs+2, env->me_dbs[env->me_db_toggle]+2,
01602                      (txn->mt_numdbs - 2) * sizeof(MDB_db));
01603        LAZY_RWLOCK_UNLOCK(&env->me_dblock);
01604 
01605        memset(txn->mt_dbflags, mt_dbflag, env->me_numdbs);
01606 
01607        return MDB_SUCCESS;
01608 }
01609 
01610 int
01611 mdb_txn_renew(MDB_txn *txn)
01612 {
01613        int rc;
01614 
01615        if (!txn)
01616               return EINVAL;
01617 
01618        if (txn->mt_env->me_flags & MDB_FATAL_ERROR) {
01619               DPUTS("environment had fatal error, must shutdown!");
01620               return MDB_PANIC;
01621        }
01622 
01623        rc = mdb_txn_renew0(txn);
01624        if (rc == MDB_SUCCESS) {
01625               DPRINTF("renew txn %zu%c %p on mdbenv %p, root page %zu",
01626                      txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
01627                      (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
01628        }
01629        return rc;
01630 }
01631 
01632 int
01633 mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
01634 {
01635        MDB_txn *txn;
01636        int rc, size;
01637 
01638        if (env->me_flags & MDB_FATAL_ERROR) {
01639               DPUTS("environment had fatal error, must shutdown!");
01640               return MDB_PANIC;
01641        }
01642        if (parent) {
01643               /* parent already has an active child txn */
01644               if (parent->mt_child) {
01645                      return EINVAL;
01646               }
01647        }
01648        size = sizeof(MDB_txn) + env->me_maxdbs * (sizeof(MDB_db)+1);
01649        if (!(flags & MDB_RDONLY))
01650               size += env->me_maxdbs * sizeof(MDB_cursor *);
01651 
01652        if ((txn = calloc(1, size)) == NULL) {
01653               DPRINTF("calloc: %s", strerror(ErrCode()));
01654               return ENOMEM;
01655        }
01656        txn->mt_dbs = (MDB_db *)(txn+1);
01657        if (flags & MDB_RDONLY) {
01658               txn->mt_flags |= MDB_TXN_RDONLY;
01659               txn->mt_dbflags = (unsigned char *)(txn->mt_dbs + env->me_maxdbs);
01660        } else {
01661               txn->mt_cursors = (MDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
01662               txn->mt_dbflags = (unsigned char *)(txn->mt_cursors + env->me_maxdbs);
01663        }
01664        txn->mt_env = env;
01665 
01666        if (parent) {
01667               txn->mt_free_pgs = mdb_midl_alloc();
01668               if (!txn->mt_free_pgs) {
01669                      free(txn);
01670                      return ENOMEM;
01671               }
01672               txn->mt_u.dirty_list = malloc(sizeof(ID2)*MDB_IDL_UM_SIZE);
01673               if (!txn->mt_u.dirty_list) {
01674                      free(txn->mt_free_pgs);
01675                      free(txn);
01676                      return ENOMEM;
01677               }
01678               txn->mt_txnid = parent->mt_txnid;
01679               txn->mt_toggle = parent->mt_toggle;
01680               txn->mt_u.dirty_list[0].mid = 0;
01681               txn->mt_free_pgs[0] = 0;
01682               txn->mt_next_pgno = parent->mt_next_pgno;
01683               parent->mt_child = txn;
01684               txn->mt_parent = parent;
01685               txn->mt_numdbs = parent->mt_numdbs;
01686               txn->mt_dbxs = parent->mt_dbxs;
01687               memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDB_db));
01688               memcpy(txn->mt_dbflags, parent->mt_dbflags, txn->mt_numdbs);
01689               mdb_cursor_shadow(parent, txn);
01690               rc = 0;
01691        } else {
01692               rc = mdb_txn_renew0(txn);
01693        }
01694        if (rc)
01695               free(txn);
01696        else {
01697               *ret = txn;
01698               DPRINTF("begin txn %zu%c %p on mdbenv %p, root page %zu",
01699                      txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
01700                      (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root);
01701        }
01702 
01703        return rc;
01704 }
01705 
01709 static void
01710 mdb_txn_reset0(MDB_txn *txn)
01711 {
01712        MDB_env       *env = txn->mt_env;
01713 
01714        if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
01715               txn->mt_u.reader->mr_txnid = 0;
01716        } else {
01717               MDB_oldpages *mop;
01718               MDB_page *dp;
01719               unsigned int i;
01720 
01721               /* close(free) all cursors */
01722               for (i=0; i<txn->mt_numdbs; i++) {
01723                      if (txn->mt_cursors[i]) {
01724                             MDB_cursor *mc;
01725                             while ((mc = txn->mt_cursors[i])) {
01726                                    txn->mt_cursors[i] = mc->mc_next;
01727                                    if (mc->mc_flags & C_ALLOCD)
01728                                           free(mc);
01729                             }
01730                      }
01731               }
01732 
01733               /* return all dirty pages to dpage list */
01734               for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
01735                      dp = txn->mt_u.dirty_list[i].mptr;
01736                      if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
01737                             dp->mp_next = txn->mt_env->me_dpages;
01738                             VGMEMP_FREE(txn->mt_env, dp);
01739                             txn->mt_env->me_dpages = dp;
01740                      } else {
01741                             /* large pages just get freed directly */
01742                             VGMEMP_FREE(txn->mt_env, dp);
01743                             free(dp);
01744                      }
01745               }
01746 
01747               if (txn->mt_parent) {
01748                      txn->mt_parent->mt_child = NULL;
01749                      free(txn->mt_free_pgs);
01750                      free(txn->mt_u.dirty_list);
01751                      return;
01752               } else {
01753                      if (mdb_midl_shrink(&txn->mt_free_pgs))
01754                             env->me_free_pgs = txn->mt_free_pgs;
01755               }
01756 
01757               while ((mop = txn->mt_env->me_pghead)) {
01758                      txn->mt_env->me_pghead = mop->mo_next;
01759                      free(mop);
01760               }
01761               txn->mt_env->me_pgfirst = 0;
01762               txn->mt_env->me_pglast = 0;
01763 
01764               env->me_txn = NULL;
01765               /* The writer mutex was locked in mdb_txn_begin. */
01766               UNLOCK_MUTEX_W(env);
01767        }
01768 }
01769 
01770 void
01771 mdb_txn_reset(MDB_txn *txn)
01772 {
01773        if (txn == NULL)
01774               return;
01775 
01776        DPRINTF("reset txn %zu%c %p on mdbenv %p, root page %zu",
01777               txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
01778               (void *) txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
01779 
01780        mdb_txn_reset0(txn);
01781 }
01782 
01783 void
01784 mdb_txn_abort(MDB_txn *txn)
01785 {
01786        if (txn == NULL)
01787               return;
01788 
01789        DPRINTF("abort txn %zu%c %p on mdbenv %p, root page %zu",
01790               txn->mt_txnid, (txn->mt_flags & MDB_TXN_RDONLY) ? 'r' : 'w',
01791               (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root);
01792 
01793        if (txn->mt_child)
01794               mdb_txn_abort(txn->mt_child);
01795 
01796        mdb_txn_reset0(txn);
01797        free(txn);
01798 }
01799 
01800 int
01801 mdb_txn_commit(MDB_txn *txn)
01802 {
01803        int            n, done;
01804        unsigned int i;
01805        ssize_t               rc;
01806        off_t          size;
01807        MDB_page      *dp;
01808        MDB_env       *env;
01809        pgno_t next, freecnt;
01810        MDB_cursor mc;
01811 
01812        assert(txn != NULL);
01813        assert(txn->mt_env != NULL);
01814 
01815        if (txn->mt_child) {
01816               mdb_txn_commit(txn->mt_child);
01817               txn->mt_child = NULL;
01818        }
01819 
01820        env = txn->mt_env;
01821 
01822        if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
01823               if (txn->mt_numdbs > env->me_numdbs) {
01824                      /* update the DB tables */
01825                      int toggle = !env->me_db_toggle;
01826                      MDB_db *ip, *jp;
01827                      MDB_dbi i;
01828 
01829                      ip = &env->me_dbs[toggle][env->me_numdbs];
01830                      jp = &txn->mt_dbs[env->me_numdbs];
01831                      LAZY_RWLOCK_WRLOCK(&env->me_dblock);
01832                      for (i = env->me_numdbs; i < txn->mt_numdbs; i++) {
01833                             *ip++ = *jp++;
01834                      }
01835 
01836                      env->me_db_toggle = toggle;
01837                      env->me_numdbs = txn->mt_numdbs;
01838                      LAZY_RWLOCK_UNLOCK(&env->me_dblock);
01839               }
01840               mdb_txn_abort(txn);
01841               return MDB_SUCCESS;
01842        }
01843 
01844        if (F_ISSET(txn->mt_flags, MDB_TXN_ERROR)) {
01845               DPUTS("error flag is set, can't commit");
01846               if (txn->mt_parent)
01847                      txn->mt_parent->mt_flags |= MDB_TXN_ERROR;
01848               mdb_txn_abort(txn);
01849               return EINVAL;
01850        }
01851 
01852        /* Merge (and close) our cursors with parent's */
01853        mdb_cursor_merge(txn);
01854 
01855        if (txn->mt_parent) {
01856               MDB_db *ip, *jp;
01857               MDB_dbi i;
01858               unsigned x, y;
01859               ID2L dst, src;
01860 
01861               /* Update parent's DB table */
01862               ip = &txn->mt_parent->mt_dbs[2];
01863               jp = &txn->mt_dbs[2];
01864               for (i = 2; i < txn->mt_numdbs; i++) {
01865                      if (ip->md_root != jp->md_root)
01866                             *ip = *jp;
01867                      ip++; jp++;
01868               }
01869               txn->mt_parent->mt_numdbs = txn->mt_numdbs;
01870 
01871               /* Append our free list to parent's */
01872               mdb_midl_append_list(&txn->mt_parent->mt_free_pgs,
01873                      txn->mt_free_pgs);
01874               mdb_midl_free(txn->mt_free_pgs);
01875 
01876               /* Merge our dirty list with parent's */
01877               dst = txn->mt_parent->mt_u.dirty_list;
01878               src = txn->mt_u.dirty_list;
01879               x = mdb_mid2l_search(dst, src[1].mid);
01880               for (y=1; y<=src[0].mid; y++) {
01881                      while (x <= dst[0].mid && dst[x].mid != src[y].mid) x++;
01882                      if (x > dst[0].mid)
01883                             break;
01884                      free(dst[x].mptr);
01885                      dst[x].mptr = src[y].mptr;
01886               }
01887               x = dst[0].mid;
01888               for (; y<=src[0].mid; y++) {
01889                      if (++x >= MDB_IDL_UM_MAX) {
01890                             mdb_txn_abort(txn);
01891                             return ENOMEM;
01892                      }
01893                      dst[x] = src[y];
01894               }
01895               dst[0].mid = x;
01896               free(txn->mt_u.dirty_list);
01897               txn->mt_parent->mt_child = NULL;
01898               free(txn);
01899               return MDB_SUCCESS;
01900        }
01901 
01902        if (txn != env->me_txn) {
01903               DPUTS("attempt to commit unknown transaction");
01904               mdb_txn_abort(txn);
01905               return EINVAL;
01906        }
01907 
01908        if (!txn->mt_u.dirty_list[0].mid)
01909               goto done;
01910 
01911        DPRINTF("committing txn %zu %p on mdbenv %p, root page %zu",
01912            txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root);
01913 
01914        mdb_cursor_init(&mc, txn, FREE_DBI, NULL);
01915 
01916        /* should only be one record now */
01917        if (env->me_pghead) {
01918               /* make sure first page of freeDB is touched and on freelist */
01919               mdb_page_search(&mc, NULL, 1);
01920        }
01921 
01922        /* Delete IDLs we used from the free list */
01923        if (env->me_pgfirst) {
01924               txnid_t cur;
01925               MDB_val key;
01926               int exact = 0;
01927 
01928               key.mv_size = sizeof(cur);
01929               for (cur = env->me_pgfirst; cur <= env->me_pglast; cur++) {
01930                      key.mv_data = &cur;
01931 
01932                      mdb_cursor_set(&mc, &key, NULL, MDB_SET, &exact);
01933                      mdb_cursor_del(&mc, 0);
01934               }
01935               env->me_pgfirst = 0;
01936               env->me_pglast = 0;
01937        }
01938 
01939        /* save to free list */
01940 free2:
01941        freecnt = txn->mt_free_pgs[0];
01942        if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
01943               MDB_val key, data;
01944 
01945               /* make sure last page of freeDB is touched and on freelist */
01946               key.mv_size = MAXKEYSIZE+1;
01947               key.mv_data = NULL;
01948               mdb_page_search(&mc, &key, 1);
01949 
01950               mdb_midl_sort(txn->mt_free_pgs);
01951 #if MDB_DEBUG > 1
01952               {
01953                      unsigned int i;
01954                      ID *idl = txn->mt_free_pgs;
01955                      DPRINTF("IDL write txn %zu root %zu num %zu",
01956                             txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, idl[0]);
01957                      for (i=0; i<idl[0]; i++) {
01958                             DPRINTF("IDL %zu", idl[i+1]);
01959                      }
01960               }
01961 #endif
01962               /* write to last page of freeDB */
01963               key.mv_size = sizeof(pgno_t);
01964               key.mv_data = &txn->mt_txnid;
01965               data.mv_data = txn->mt_free_pgs;
01966               /* The free list can still grow during this call,
01967                * despite the pre-emptive touches above. So check
01968                * and make sure the entire thing got written.
01969                */
01970               do {
01971                      freecnt = txn->mt_free_pgs[0];
01972                      data.mv_size = MDB_IDL_SIZEOF(txn->mt_free_pgs);
01973                      rc = mdb_cursor_put(&mc, &key, &data, 0);
01974                      if (rc) {
01975                             mdb_txn_abort(txn);
01976                             return rc;
01977                      }
01978               } while (freecnt != txn->mt_free_pgs[0]);
01979        }
01980        /* should only be one record now */
01981 again:
01982        if (env->me_pghead) {
01983               MDB_val key, data;
01984               MDB_oldpages *mop;
01985               pgno_t orig;
01986               txnid_t id;
01987 
01988               mop = env->me_pghead;
01989               id = mop->mo_txnid;
01990               key.mv_size = sizeof(id);
01991               key.mv_data = &id;
01992               data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
01993               data.mv_data = mop->mo_pages;
01994               orig = mop->mo_pages[0];
01995               /* These steps may grow the freelist again
01996                * due to freed overflow pages...
01997                */
01998               mdb_cursor_put(&mc, &key, &data, 0);
01999               if (mop == env->me_pghead && env->me_pghead->mo_txnid == id) {
02000                      /* could have been used again here */
02001                      if (mop->mo_pages[0] != orig) {
02002                             data.mv_size = MDB_IDL_SIZEOF(mop->mo_pages);
02003                             data.mv_data = mop->mo_pages;
02004                             id = mop->mo_txnid;
02005                             mdb_cursor_put(&mc, &key, &data, 0);
02006                      }
02007                      env->me_pghead = NULL;
02008                      free(mop);
02009               } else {
02010                      /* was completely used up */
02011                      mdb_cursor_del(&mc, 0);
02012                      if (env->me_pghead)
02013                             goto again;
02014               }
02015               env->me_pgfirst = 0;
02016               env->me_pglast = 0;
02017        }
02018        /* Check for growth of freelist again */
02019        if (freecnt != txn->mt_free_pgs[0])
02020               goto free2;
02021 
02022        if (!MDB_IDL_IS_ZERO(txn->mt_free_pgs)) {
02023               if (mdb_midl_shrink(&txn->mt_free_pgs))
02024                      env->me_free_pgs = txn->mt_free_pgs;
02025        }
02026 
02027        /* Update DB root pointers. Their pages have already been
02028         * touched so this is all in-place and cannot fail.
02029         */
02030        {
02031               MDB_dbi i;
02032               MDB_val data;
02033               data.mv_size = sizeof(MDB_db);
02034 
02035               mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
02036               for (i = 2; i < txn->mt_numdbs; i++) {
02037                      if (txn->mt_dbflags[i] & DB_DIRTY) {
02038                             data.mv_data = &txn->mt_dbs[i];
02039                             mdb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data, 0);
02040                      }
02041               }
02042        }
02043 #if MDB_DEBUG > 2
02044        mdb_audit(txn);
02045 #endif
02046 
02047        /* Commit up to MDB_COMMIT_PAGES dirty pages to disk until done.
02048         */
02049        next = 0;
02050        i = 1;
02051        do {
02052 #ifdef _WIN32
02053               /* Windows actually supports scatter/gather I/O, but only on
02054                * unbuffered file handles. Since we're relying on the OS page
02055                * cache for all our data, that's self-defeating. So we just
02056                * write pages one at a time. We use the ov structure to set
02057                * the write offset, to at least save the overhead of a Seek
02058                * system call.
02059                */
02060               OVERLAPPED ov;
02061               memset(&ov, 0, sizeof(ov));
02062               for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
02063                      size_t wsize;
02064                      dp = txn->mt_u.dirty_list[i].mptr;
02065                      DPRINTF("committing page %zu", dp->mp_pgno);
02066                      size = dp->mp_pgno * env->me_psize;
02067                      ov.Offset = size & 0xffffffff;
02068                      ov.OffsetHigh = size >> 16;
02069                      ov.OffsetHigh >>= 16;
02070                      /* clear dirty flag */
02071                      dp->mp_flags &= ~P_DIRTY;
02072                      wsize = env->me_psize;
02073                      if (IS_OVERFLOW(dp)) wsize *= dp->mp_pages;
02074                      rc = WriteFile(env->me_fd, dp, wsize, NULL, &ov);
02075                      if (!rc) {
02076                             n = ErrCode();
02077                             DPRINTF("WriteFile: %d", n);
02078                             mdb_txn_abort(txn);
02079                             return n;
02080                      }
02081               }
02082               done = 1;
02083 #else
02084               struct iovec   iov[MDB_COMMIT_PAGES];
02085               n = 0;
02086               done = 1;
02087               size = 0;
02088               for (; i<=txn->mt_u.dirty_list[0].mid; i++) {
02089                      dp = txn->mt_u.dirty_list[i].mptr;
02090                      if (dp->mp_pgno != next) {
02091                             if (n) {
02092                                    rc = writev(env->me_fd, iov, n);
02093                                    if (rc != size) {
02094                                           n = ErrCode();
02095                                           if (rc > 0)
02096                                                  DPUTS("short write, filesystem full?");
02097                                           else
02098                                                  DPRINTF("writev: %s", strerror(n));
02099                                           mdb_txn_abort(txn);
02100                                           return n;
02101                                    }
02102                                    n = 0;
02103                                    size = 0;
02104                             }
02105                             lseek(env->me_fd, dp->mp_pgno * env->me_psize, SEEK_SET);
02106                             next = dp->mp_pgno;
02107                      }
02108                      DPRINTF("committing page %zu", dp->mp_pgno);
02109                      iov[n].iov_len = env->me_psize;
02110                      if (IS_OVERFLOW(dp)) iov[n].iov_len *= dp->mp_pages;
02111                      iov[n].iov_base = (char *)dp;
02112                      size += iov[n].iov_len;
02113                      next = dp->mp_pgno + (IS_OVERFLOW(dp) ? dp->mp_pages : 1);
02114                      /* clear dirty flag */
02115                      dp->mp_flags &= ~P_DIRTY;
02116                      if (++n >= MDB_COMMIT_PAGES) {
02117                             done = 0;
02118                             i++;
02119                             break;
02120                      }
02121               }
02122 
02123               if (n == 0)
02124                      break;
02125 
02126               rc = writev(env->me_fd, iov, n);
02127               if (rc != size) {
02128                      n = ErrCode();
02129                      if (rc > 0)
02130                             DPUTS("short write, filesystem full?");
02131                      else
02132                             DPRINTF("writev: %s", strerror(n));
02133                      mdb_txn_abort(txn);
02134                      return n;
02135               }
02136 #endif
02137        } while (!done);
02138 
02139        /* Drop the dirty pages.
02140         */
02141        for (i=1; i<=txn->mt_u.dirty_list[0].mid; i++) {
02142               dp = txn->mt_u.dirty_list[i].mptr;
02143               if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
02144                      dp->mp_next = txn->mt_env->me_dpages;
02145                      VGMEMP_FREE(txn->mt_env, dp);
02146                      txn->mt_env->me_dpages = dp;
02147               } else {
02148                      VGMEMP_FREE(txn->mt_env, dp);
02149                      free(dp);
02150               }
02151               txn->mt_u.dirty_list[i].mid = 0;
02152        }
02153        txn->mt_u.dirty_list[0].mid = 0;
02154 
02155        if ((n = mdb_env_sync(env, 0)) != 0 ||
02156            (n = mdb_env_write_meta(txn)) != MDB_SUCCESS) {
02157               mdb_txn_abort(txn);
02158               return n;
02159        }
02160        env->me_wtxnid = txn->mt_txnid;
02161 
02162 done:
02163        env->me_txn = NULL;
02164        /* update the DB tables */
02165        {
02166               int toggle = !env->me_db_toggle;
02167               MDB_db *ip, *jp;
02168               MDB_dbi i;
02169 
02170               ip = &env->me_dbs[toggle][2];
02171               jp = &txn->mt_dbs[2];
02172               LAZY_RWLOCK_WRLOCK(&env->me_dblock);
02173               for (i = 2; i < txn->mt_numdbs; i++) {
02174                      if (ip->md_root != jp->md_root)
02175                             *ip = *jp;
02176                      ip++; jp++;
02177               }
02178 
02179               env->me_db_toggle = toggle;
02180               env->me_numdbs = txn->mt_numdbs;
02181               LAZY_RWLOCK_UNLOCK(&env->me_dblock);
02182        }
02183 
02184        UNLOCK_MUTEX_W(env);
02185        free(txn);
02186 
02187        return MDB_SUCCESS;
02188 }
02189 
02196 static int
02197 mdb_env_read_header(MDB_env *env, MDB_meta *meta)
02198 {
02199        MDB_pagebuf   pbuf;
02200        MDB_page      *p;
02201        MDB_meta      *m;
02202        int            rc, err;
02203 
02204        /* We don't know the page size yet, so use a minimum value.
02205         */
02206 
02207 #ifdef _WIN32
02208        if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
02209 #else
02210        if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0)
02211 #endif
02212        {
02213               return ENOENT;
02214        }
02215        else if (rc != MDB_PAGESIZE) {
02216               err = ErrCode();
02217               if (rc > 0)
02218                      err = EINVAL;
02219               DPRINTF("read: %s", strerror(err));
02220               return err;
02221        }
02222 
02223        p = (MDB_page *)&pbuf;
02224 
02225        if (!F_ISSET(p->mp_flags, P_META)) {
02226               DPRINTF("page %zu not a meta page", p->mp_pgno);
02227               return EINVAL;
02228        }
02229 
02230        m = METADATA(p);
02231        if (m->mm_magic != MDB_MAGIC) {
02232               DPUTS("meta has invalid magic");
02233               return EINVAL;
02234        }
02235 
02236        if (m->mm_version != MDB_VERSION) {
02237               DPRINTF("database is version %u, expected version %u",
02238                   m->mm_version, MDB_VERSION);
02239               return MDB_VERSION_MISMATCH;
02240        }
02241 
02242        memcpy(meta, m, sizeof(*m));
02243        return 0;
02244 }
02245 
02251 static int
02252 mdb_env_init_meta(MDB_env *env, MDB_meta *meta)
02253 {
02254        MDB_page *p, *q;
02255        MDB_meta *m;
02256        int rc;
02257        unsigned int   psize;
02258 
02259        DPUTS("writing new meta page");
02260 
02261        GET_PAGESIZE(psize);
02262 
02263        meta->mm_magic = MDB_MAGIC;
02264        meta->mm_version = MDB_VERSION;
02265        meta->mm_psize = psize;
02266        meta->mm_last_pg = 1;
02267        meta->mm_flags = env->me_flags & 0xffff;
02268        meta->mm_flags |= MDB_INTEGERKEY;
02269        meta->mm_dbs[0].md_root = P_INVALID;
02270        meta->mm_dbs[1].md_root = P_INVALID;
02271 
02272        p = calloc(2, psize);
02273        p->mp_pgno = 0;
02274        p->mp_flags = P_META;
02275 
02276        m = METADATA(p);
02277        memcpy(m, meta, sizeof(*meta));
02278 
02279        q = (MDB_page *)((char *)p + psize);
02280 
02281        q->mp_pgno = 1;
02282        q->mp_flags = P_META;
02283 
02284        m = METADATA(q);
02285        memcpy(m, meta, sizeof(*meta));
02286 
02287 #ifdef _WIN32
02288        {
02289               DWORD len;
02290               rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL);
02291               rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode();
02292        }
02293 #else
02294        rc = write(env->me_fd, p, psize * 2);
02295        rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode();
02296 #endif
02297        free(p);
02298        return rc;
02299 }
02300 
02305 static int
02306 mdb_env_write_meta(MDB_txn *txn)
02307 {
02308        MDB_env *env;
02309        MDB_meta      meta, metab;
02310        off_t off;
02311        int rc, len, toggle;
02312        char *ptr;
02313 #ifdef _WIN32
02314        OVERLAPPED ov;
02315 #endif
02316 
02317        assert(txn != NULL);
02318        assert(txn->mt_env != NULL);
02319 
02320        toggle = !txn->mt_toggle;
02321        DPRINTF("writing meta page %d for root page %zu",
02322               toggle, txn->mt_dbs[MAIN_DBI].md_root);
02323 
02324        env = txn->mt_env;
02325 
02326        metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
02327        metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;
02328 
02329        ptr = (char *)&meta;
02330        off = offsetof(MDB_meta, mm_dbs[0].md_depth);
02331        len = sizeof(MDB_meta) - off;
02332 
02333        ptr += off;
02334        meta.mm_dbs[0] = txn->mt_dbs[0];
02335        meta.mm_dbs[1] = txn->mt_dbs[1];
02336        meta.mm_last_pg = txn->mt_next_pgno - 1;
02337        meta.mm_txnid = txn->mt_txnid;
02338 
02339        if (toggle)
02340               off += env->me_psize;
02341        off += PAGEHDRSZ;
02342 
02343        /* Write to the SYNC fd */
02344 #ifdef _WIN32
02345        {
02346               memset(&ov, 0, sizeof(ov));
02347               ov.Offset = off;
02348               WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov);
02349        }
02350 #else
02351        rc = pwrite(env->me_mfd, ptr, len, off);
02352 #endif
02353        if (rc != len) {
02354               int r2;
02355               rc = ErrCode();
02356               DPUTS("write failed, disk error?");
02357               /* On a failure, the pagecache still contains the new data.
02358                * Write some old data back, to prevent it from being used.
02359                * Use the non-SYNC fd; we know it will fail anyway.
02360                */
02361               meta.mm_last_pg = metab.mm_last_pg;
02362               meta.mm_txnid = metab.mm_txnid;
02363 #ifdef _WIN32
02364               WriteFile(env->me_fd, ptr, len, NULL, &ov);
02365 #else
02366               r2 = pwrite(env->me_fd, ptr, len, off);
02367 #endif
02368               env->me_flags |= MDB_FATAL_ERROR;
02369               return rc;
02370        }
02371        /* Memory ordering issues are irrelevant; since the entire writer
02372         * is wrapped by wmutex, all of these changes will become visible
02373         * after the wmutex is unlocked. Since the DB is multi-version,
02374         * readers will get consistent data regardless of how fresh or
02375         * how stale their view of these values is.
02376         */
02377        LAZY_MUTEX_LOCK(&env->me_txns->mti_mutex);
02378        txn->mt_env->me_txns->mti_me_toggle = toggle;
02379        txn->mt_env->me_txns->mti_txnid = txn->mt_txnid;
02380        LAZY_MUTEX_UNLOCK(&env->me_txns->mti_mutex);
02381 
02382        return MDB_SUCCESS;
02383 }
02384 
02390 static int
02391 mdb_env_read_meta(MDB_env *env, int *which)
02392 {
02393        int toggle = 0;
02394 
02395        assert(env != NULL);
02396 
02397        if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
02398               toggle = 1;
02399 
02400        DPRINTF("Using meta page %d", toggle);
02401        *which = toggle;
02402 
02403        return MDB_SUCCESS;
02404 }
02405 
02406 int
02407 mdb_env_create(MDB_env **env)
02408 {
02409        MDB_env *e;
02410 
02411        e = calloc(1, sizeof(MDB_env));
02412        if (!e)
02413               return ENOMEM;
02414 
02415        e->me_free_pgs = mdb_midl_alloc();
02416        if (!e->me_free_pgs) {
02417               free(e);
02418               return ENOMEM;
02419        }
02420        e->me_maxreaders = DEFAULT_READERS;
02421        e->me_maxdbs = 2;
02422        e->me_fd = INVALID_HANDLE_VALUE;
02423        e->me_lfd = INVALID_HANDLE_VALUE;
02424        e->me_mfd = INVALID_HANDLE_VALUE;
02425        VGMEMP_CREATE(e,0,0);
02426        *env = e;
02427        return MDB_SUCCESS;
02428 }
02429 
02430 int
02431 mdb_env_set_mapsize(MDB_env *env, size_t size)
02432 {
02433        if (env->me_map)
02434               return EINVAL;
02435        env->me_mapsize = size;
02436        if (env->me_psize)
02437               env->me_maxpg = env->me_mapsize / env->me_psize;
02438        return MDB_SUCCESS;
02439 }
02440 
02441 int
02442 mdb_env_set_maxdbs(MDB_env *env, MDB_dbi dbs)
02443 {
02444        if (env->me_map)
02445               return EINVAL;
02446        env->me_maxdbs = dbs;
02447        return MDB_SUCCESS;
02448 }
02449 
02450 int
02451 mdb_env_set_maxreaders(MDB_env *env, unsigned int readers)
02452 {
02453        if (env->me_map || readers < 1)
02454               return EINVAL;
02455        env->me_maxreaders = readers;
02456        return MDB_SUCCESS;
02457 }
02458 
02459 int
02460 mdb_env_get_maxreaders(MDB_env *env, unsigned int *readers)
02461 {
02462        if (!env || !readers)
02463               return EINVAL;
02464        *readers = env->me_maxreaders;
02465        return MDB_SUCCESS;
02466 }
02467 
02470 static int
02471 mdb_env_open2(MDB_env *env, unsigned int flags)
02472 {
02473        int i, newenv = 0, toggle;
02474        MDB_meta meta;
02475        MDB_page *p;
02476 
02477        env->me_flags = flags;
02478 
02479        memset(&meta, 0, sizeof(meta));
02480 
02481        if ((i = mdb_env_read_header(env, &meta)) != 0) {
02482               if (i != ENOENT)
02483                      return i;
02484               DPUTS("new mdbenv");
02485               newenv = 1;
02486        }
02487 
02488        if (!env->me_mapsize) {
02489               env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
02490        }
02491 
02492 #ifdef _WIN32
02493        {
02494               HANDLE mh;
02495               LONG sizelo, sizehi;
02496               sizelo = env->me_mapsize & 0xffffffff;
02497               sizehi = env->me_mapsize >> 16;           /* pointless on WIN32, only needed on W64 */
02498               sizehi >>= 16;
02499               /* Windows won't create mappings for zero length files.
02500                * Just allocate the maxsize right now.
02501                */
02502               if (newenv) {
02503                      SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0);
02504                      if (!SetEndOfFile(env->me_fd))
02505                             return ErrCode();
02506                      SetFilePointer(env->me_fd, 0, NULL, 0);
02507               }
02508               mh = CreateFileMapping(env->me_fd, NULL, PAGE_READONLY,
02509                      sizehi, sizelo, NULL);
02510               if (!mh)
02511                      return ErrCode();
02512               env->me_map = MapViewOfFileEx(mh, FILE_MAP_READ, 0, 0, env->me_mapsize,
02513                      meta.mm_address);
02514               CloseHandle(mh);
02515               if (!env->me_map)
02516                      return ErrCode();
02517        }
02518 #else
02519        i = MAP_SHARED;
02520        if (meta.mm_address && (flags & MDB_FIXEDMAP))
02521               i |= MAP_FIXED;
02522        env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i,
02523               env->me_fd, 0);
02524        if (env->me_map == MAP_FAILED) {
02525               env->me_map = NULL;
02526               return ErrCode();
02527        }
02528 #endif
02529 
02530        if (newenv) {
02531               meta.mm_mapsize = env->me_mapsize;
02532               if (flags & MDB_FIXEDMAP)
02533                      meta.mm_address = env->me_map;
02534               i = mdb_env_init_meta(env, &meta);
02535               if (i != MDB_SUCCESS) {
02536                      munmap(env->me_map, env->me_mapsize);
02537                      return i;
02538               }
02539        }
02540        env->me_psize = meta.mm_psize;
02541 
02542        env->me_maxpg = env->me_mapsize / env->me_psize;
02543 
02544        p = (MDB_page *)env->me_map;
02545        env->me_metas[0] = METADATA(p);
02546        env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize);
02547 
02548        if ((i = mdb_env_read_meta(env, &toggle)) != 0)
02549               return i;
02550 
02551        DPRINTF("opened database version %u, pagesize %u",
02552            env->me_metas[toggle]->mm_version, env->me_psize);
02553        DPRINTF("depth: %u", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_depth);
02554        DPRINTF("entries: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_entries);
02555        DPRINTF("branch pages: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_branch_pages);
02556        DPRINTF("leaf pages: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_leaf_pages);
02557        DPRINTF("overflow pages: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_overflow_pages);
02558        DPRINTF("root: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_root);
02559 
02560        return MDB_SUCCESS;
02561 }
02562 
02563 #ifndef _WIN32
02564 
02570 static void
02571 mdb_env_reader_dest(void *ptr)
02572 {
02573        MDB_reader *reader = ptr;
02574 
02575        reader->mr_txnid = 0;
02576        reader->mr_pid = 0;
02577        reader->mr_tid = 0;
02578 }
02579 #endif
02580 
02582 static void
02583 mdb_env_share_locks(MDB_env *env)
02584 {
02585        int toggle = 0;
02586 
02587        if (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid)
02588               toggle = 1;
02589        env->me_txns->mti_me_toggle = toggle;
02590        env->me_txns->mti_txnid = env->me_metas[toggle]->mm_txnid;
02591 
02592 #ifdef _WIN32
02593        {
02594               OVERLAPPED ov;
02595               /* First acquire a shared lock. The Unlock will
02596                * then release the existing exclusive lock.
02597                */
02598               memset(&ov, 0, sizeof(ov));
02599               LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov);
02600               UnlockFile(env->me_lfd, 0, 0, 1, 0);
02601        }
02602 #else
02603        {
02604               struct flock lock_info;
02605               /* The shared lock replaces the existing lock */
02606               memset((void *)&lock_info, 0, sizeof(lock_info));
02607               lock_info.l_type = F_RDLCK;
02608               lock_info.l_whence = SEEK_SET;
02609               lock_info.l_start = 0;
02610               lock_info.l_len = 1;
02611               fcntl(env->me_lfd, F_SETLK, &lock_info);
02612        }
02613 #endif
02614 }
02615 #if defined(_WIN32) || defined(__APPLE__)
02616 /*
02617  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
02618  *
02619  * @(#) $Revision: 5.1 $
02620  * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
02621  * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
02622  *
02623  *       http://www.isthe.com/chongo/tech/comp/fnv/index.html
02624  *
02625  ***
02626  *
02627  * Please do not copyright this code.  This code is in the public domain.
02628  *
02629  * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
02630  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
02631  * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
02632  * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
02633  * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
02634  * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
02635  * PERFORMANCE OF THIS SOFTWARE.
02636  *
02637  * By:
02638  *     chongo <Landon Curt Noll> /\oo/\
02639  *       http://www.isthe.com/chongo/
02640  *
02641  * Share and Enjoy!  :-)
02642  */
02643 
02644 typedef unsigned long long  mdb_hash_t;
02645 #define MDB_HASH_INIT ((mdb_hash_t)0xcbf29ce484222325ULL)
02646 
02655 static mdb_hash_t
02656 mdb_hash_str(char *str, mdb_hash_t hval)
02657 {
02658        unsigned char *s = (unsigned char *)str;  /* unsigned string */
02659        /*
02660         * FNV-1a hash each octet of the string
02661         */
02662        while (*s) {
02663               /* xor the bottom with the current octet */
02664               hval ^= (mdb_hash_t)*s++;
02665 
02666               /* multiply by the 64 bit FNV magic prime mod 2^64 */
02667               hval += (hval << 1) + (hval << 4) + (hval << 5) +
02668                      (hval << 7) + (hval << 8) + (hval << 40);
02669        }
02670        /* return our new hash value */
02671        return hval;
02672 }
02673 
02678 static void
02679 mdb_hash_hex(char *str, char *hexbuf)
02680 {
02681        int i;
02682        mdb_hash_t h = mdb_hash_str(str, MDB_HASH_INIT);
02683        for (i=0; i<8; i++) {
02684               hexbuf += sprintf(hexbuf, "%02x", (unsigned int)h & 0xff);
02685               h >>= 8;
02686        }
02687 }
02688 #endif
02689 
02697 static int
02698 mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
02699 {
02700        int rc;
02701        off_t size, rsize;
02702 
02703        *excl = 0;
02704 
02705 #ifdef _WIN32
02706        if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
02707               FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
02708               FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
02709               rc = ErrCode();
02710               return rc;
02711        }
02712        /* Try to get exclusive lock. If we succeed, then
02713         * nobody is using the lock region and we should initialize it.
02714         */
02715        {
02716               if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
02717                      *excl = 1;
02718               } else {
02719                      OVERLAPPED ov;
02720                      memset(&ov, 0, sizeof(ov));
02721                      if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
02722                             rc = ErrCode();
02723                             goto fail;
02724                      }
02725               }
02726        }
02727        size = GetFileSize(env->me_lfd, NULL);
02728 #else
02729        if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
02730               rc = ErrCode();
02731               return rc;
02732        }
02733        /* Try to get exclusive lock. If we succeed, then
02734         * nobody is using the lock region and we should initialize it.
02735         */
02736        {
02737               struct flock lock_info;
02738               memset((void *)&lock_info, 0, sizeof(lock_info));
02739               lock_info.l_type = F_WRLCK;
02740               lock_info.l_whence = SEEK_SET;
02741               lock_info.l_start = 0;
02742               lock_info.l_len = 1;
02743               rc = fcntl(env->me_lfd, F_SETLK, &lock_info);
02744               if (rc == 0) {
02745                      *excl = 1;
02746               } else {
02747                      lock_info.l_type = F_RDLCK;
02748                      rc = fcntl(env->me_lfd, F_SETLKW, &lock_info);
02749                      if (rc) {
02750                             rc = ErrCode();
02751                             goto fail;
02752                      }
02753               }
02754        }
02755        size = lseek(env->me_lfd, 0, SEEK_END);
02756 #endif
02757        rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
02758        if (size < rsize && *excl) {
02759 #ifdef _WIN32
02760               SetFilePointer(env->me_lfd, rsize, NULL, 0);
02761               if (!SetEndOfFile(env->me_lfd)) {
02762                      rc = ErrCode();
02763                      goto fail;
02764               }
02765 #else
02766               if (ftruncate(env->me_lfd, rsize) != 0) {
02767                      rc = ErrCode();
02768                      goto fail;
02769               }
02770 #endif
02771        } else {
02772               rsize = size;
02773               size = rsize - sizeof(MDB_txninfo);
02774               env->me_maxreaders = size/sizeof(MDB_reader) + 1;
02775        }
02776        {
02777 #ifdef _WIN32
02778               HANDLE mh;
02779               mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
02780                      0, 0, NULL);
02781               if (!mh) {
02782                      rc = ErrCode();
02783                      goto fail;
02784               }
02785               env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
02786               CloseHandle(mh);
02787               if (!env->me_txns) {
02788                      rc = ErrCode();
02789                      goto fail;
02790               }
02791 #else
02792               void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
02793                      env->me_lfd, 0);
02794               if (m == MAP_FAILED) {
02795                      env->me_txns = NULL;
02796                      rc = ErrCode();
02797                      goto fail;
02798               }
02799               env->me_txns = m;
02800 #endif
02801        }
02802        if (*excl) {
02803 #ifdef _WIN32
02804               char hexbuf[17];
02805               if (!mdb_sec_inited) {
02806                      InitializeSecurityDescriptor(&mdb_null_sd,
02807                             SECURITY_DESCRIPTOR_REVISION);
02808                      SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
02809                      mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
02810                      mdb_all_sa.bInheritHandle = FALSE;
02811                      mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
02812                      mdb_sec_inited = 1;
02813               }
02814               mdb_hash_hex(lpath, hexbuf);
02815               sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", hexbuf);
02816               env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
02817               if (!env->me_rmutex) {
02818                      rc = ErrCode();
02819                      goto fail;
02820               }
02821               sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", hexbuf);
02822               env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
02823               if (!env->me_wmutex) {
02824                      rc = ErrCode();
02825                      goto fail;
02826               }
02827 #else  /* _WIN32 */
02828 #ifdef __APPLE__
02829               char hexbuf[17];
02830               mdb_hash_hex(lpath, hexbuf);
02831               sprintf(env->me_txns->mti_rmname, "MDBr%s", hexbuf);
02832               if (sem_unlink(env->me_txns->mti_rmname)) {
02833                      rc = ErrCode();
02834                      if (rc != ENOENT && rc != EINVAL)
02835                             goto fail;
02836               }
02837               env->me_rmutex = sem_open(env->me_txns->mti_rmname, O_CREAT, mode, 1);
02838               if (!env->me_rmutex) {
02839                      rc = ErrCode();
02840                      goto fail;
02841               }
02842               sprintf(env->me_txns->mti_wmname, "MDBw%s", hexbuf);
02843               if (sem_unlink(env->me_txns->mti_wmname)) {
02844                      rc = ErrCode();
02845                      if (rc != ENOENT && rc != EINVAL)
02846                             goto fail;
02847               }
02848               env->me_wmutex = sem_open(env->me_txns->mti_wmname, O_CREAT, mode, 1);
02849               if (!env->me_wmutex) {
02850                      rc = ErrCode();
02851                      goto fail;
02852               }
02853 #else  /* __APPLE__ */
02854               pthread_mutexattr_t mattr;
02855 
02856               pthread_mutexattr_init(&mattr);
02857               rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
02858               if (rc) {
02859                      goto fail;
02860               }
02861               pthread_mutex_init(&env->me_txns->mti_mutex, &mattr);
02862               pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr);
02863 #endif /* __APPLE__ */
02864 #endif /* _WIN32 */
02865               env->me_txns->mti_version = MDB_VERSION;
02866               env->me_txns->mti_magic = MDB_MAGIC;
02867               env->me_txns->mti_txnid = 0;
02868               env->me_txns->mti_numreaders = 0;
02869               env->me_txns->mti_me_toggle = 0;
02870 
02871        } else {
02872               if (env->me_txns->mti_magic != MDB_MAGIC) {
02873                      DPUTS("lock region has invalid magic");
02874                      rc = EINVAL;
02875                      goto fail;
02876               }
02877               if (env->me_txns->mti_version != MDB_VERSION) {
02878                      DPRINTF("lock region is version %u, expected version %u",
02879                             env->me_txns->mti_version, MDB_VERSION);
02880                      rc = MDB_VERSION_MISMATCH;
02881                      goto fail;
02882               }
02883               rc = ErrCode();
02884               if (rc != EACCES && rc != EAGAIN) {
02885                      goto fail;
02886               }
02887 #ifdef _WIN32
02888               env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
02889               if (!env->me_rmutex) {
02890                      rc = ErrCode();
02891                      goto fail;
02892               }
02893               env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
02894               if (!env->me_wmutex) {
02895                      rc = ErrCode();
02896                      goto fail;
02897               }
02898 #endif
02899 #ifdef __APPLE__
02900               env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
02901               if (!env->me_rmutex) {
02902                      rc = ErrCode();
02903                      goto fail;
02904               }
02905               env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
02906               if (!env->me_wmutex) {
02907                      rc = ErrCode();
02908                      goto fail;
02909               }
02910 #endif
02911        }
02912        return MDB_SUCCESS;
02913 
02914 fail:
02915        close(env->me_lfd);
02916        env->me_lfd = INVALID_HANDLE_VALUE;
02917        return rc;
02918 
02919 }
02920 
02922 #define LOCKNAME     "/lock.mdb"
02923 
02924 #define DATANAME     "/data.mdb"
02925 
02926 #define LOCKSUFF     "-lock"
02927 
02928 int
02929 mdb_env_open(MDB_env *env, const char *path, unsigned int flags, mode_t mode)
02930 {
02931        int           oflags, rc, len, excl;
02932        char *lpath, *dpath;
02933 
02934        len = strlen(path);
02935        if (flags & MDB_NOSUBDIR) {
02936               rc = len + sizeof(LOCKSUFF) + len + 1;
02937        } else {
02938               rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
02939        }
02940        lpath = malloc(rc);
02941        if (!lpath)
02942               return ENOMEM;
02943        if (flags & MDB_NOSUBDIR) {
02944               dpath = lpath + len + sizeof(LOCKSUFF);
02945               sprintf(lpath, "%s" LOCKSUFF, path);
02946               strcpy(dpath, path);
02947        } else {
02948               dpath = lpath + len + sizeof(LOCKNAME);
02949               sprintf(lpath, "%s" LOCKNAME, path);
02950               sprintf(dpath, "%s" DATANAME, path);
02951        }
02952 
02953        rc = mdb_env_setup_locks(env, lpath, mode, &excl);
02954        if (rc)
02955               goto leave;
02956 
02957 #ifdef _WIN32
02958        if (F_ISSET(flags, MDB_RDONLY)) {
02959               oflags = GENERIC_READ;
02960               len = OPEN_EXISTING;
02961        } else {
02962               oflags = GENERIC_READ|GENERIC_WRITE;
02963               len = OPEN_ALWAYS;
02964        }
02965        mode = FILE_ATTRIBUTE_NORMAL;
02966        if ((env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
02967                      NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) {
02968               rc = ErrCode();
02969               goto leave;
02970        }
02971 #else
02972        if (F_ISSET(flags, MDB_RDONLY))
02973               oflags = O_RDONLY;
02974        else
02975               oflags = O_RDWR | O_CREAT;
02976 
02977        if ((env->me_fd = open(dpath, oflags, mode)) == -1) {
02978               rc = ErrCode();
02979               goto leave;
02980        }
02981 #endif
02982 
02983        if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) {
02984               /* synchronous fd for meta writes */
02985 #ifdef _WIN32
02986               if (!(flags & (MDB_RDONLY|MDB_NOSYNC)))
02987                      mode |= FILE_FLAG_WRITE_THROUGH;
02988               if ((env->me_mfd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
02989                      NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) {
02990                      rc = ErrCode();
02991                      goto leave;
02992               }
02993 #else
02994               if (!(flags & (MDB_RDONLY|MDB_NOSYNC)))
02995                      oflags |= MDB_DSYNC;
02996               if ((env->me_mfd = open(dpath, oflags, mode)) == -1) {
02997                      rc = ErrCode();
02998                      goto leave;
02999               }
03000 #endif
03001               env->me_path = strdup(path);
03002               DPRINTF("opened dbenv %p", (void *) env);
03003               pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
03004               LAZY_RWLOCK_INIT(&env->me_dblock, NULL);
03005               if (excl)
03006                      mdb_env_share_locks(env);
03007               env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
03008               env->me_dbs[0] = calloc(env->me_maxdbs, sizeof(MDB_db));
03009               env->me_dbs[1] = calloc(env->me_maxdbs, sizeof(MDB_db));
03010               env->me_numdbs = 2;
03011        }
03012 
03013 leave:
03014        if (rc) {
03015               if (env->me_fd != INVALID_HANDLE_VALUE) {
03016                      close(env->me_fd);
03017                      env->me_fd = INVALID_HANDLE_VALUE;
03018               }
03019               if (env->me_lfd != INVALID_HANDLE_VALUE) {
03020                      close(env->me_lfd);
03021                      env->me_lfd = INVALID_HANDLE_VALUE;
03022               }
03023        }
03024        free(lpath);
03025        return rc;
03026 }
03027 
03028 void
03029 mdb_env_close(MDB_env *env)
03030 {
03031        MDB_page *dp;
03032 
03033        if (env == NULL)
03034               return;
03035 
03036        VGMEMP_DESTROY(env);
03037        while (env->me_dpages) {
03038               dp = env->me_dpages;
03039               VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
03040               env->me_dpages = dp->mp_next;
03041               free(dp);
03042        }
03043 
03044        free(env->me_dbs[1]);
03045        free(env->me_dbs[0]);
03046        free(env->me_dbxs);
03047        free(env->me_path);
03048 
03049        LAZY_RWLOCK_DESTROY(&env->me_dblock);
03050        pthread_key_delete(env->me_txkey);
03051 
03052        if (env->me_map) {
03053               munmap(env->me_map, env->me_mapsize);
03054        }
03055        close(env->me_mfd);
03056        close(env->me_fd);
03057        if (env->me_txns) {
03058               pid_t pid = getpid();
03059               unsigned int i;
03060               for (i=0; i<env->me_txns->mti_numreaders; i++)
03061                      if (env->me_txns->mti_readers[i].mr_pid == pid)
03062                             env->me_txns->mti_readers[i].mr_pid = 0;
03063               munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
03064        }
03065        close(env->me_lfd);
03066        mdb_midl_free(env->me_free_pgs);
03067        free(env);
03068 }
03069 
03071 static int
03072 mdb_cmp_long(const MDB_val *a, const MDB_val *b)
03073 {
03074        return (*(size_t *)a->mv_data < *(size_t *)b->mv_data) ? -1 :
03075               *(size_t *)a->mv_data > *(size_t *)b->mv_data;
03076 }
03077 
03079 static int
03080 mdb_cmp_int(const MDB_val *a, const MDB_val *b)
03081 {
03082        return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
03083               *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
03084 }
03085 
03089 static int
03090 mdb_cmp_cint(const MDB_val *a, const MDB_val *b)
03091 {
03092 #if BYTE_ORDER == LITTLE_ENDIAN
03093        unsigned short *u, *c;
03094        int x;
03095 
03096        u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
03097        c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
03098        do {
03099               x = *--u - *--c;
03100        } while(!x && u > (unsigned short *)a->mv_data);
03101        return x;
03102 #else
03103        return memcmp(a->mv_data, b->mv_data, a->mv_size);
03104 #endif
03105 }
03106 
03108 static int
03109 mdb_cmp_memn(const MDB_val *a, const MDB_val *b)
03110 {
03111        int diff;
03112        ssize_t len_diff;
03113        unsigned int len;
03114 
03115        len = a->mv_size;
03116        len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
03117        if (len_diff > 0) {
03118               len = b->mv_size;
03119               len_diff = 1;
03120        }
03121 
03122        diff = memcmp(a->mv_data, b->mv_data, len);
03123        return diff ? diff : len_diff<0 ? -1 : len_diff;
03124 }
03125 
03127 static int
03128 mdb_cmp_memnr(const MDB_val *a, const MDB_val *b)
03129 {
03130        const unsigned char  *p1, *p2, *p1_lim;
03131        ssize_t len_diff;
03132        int diff;
03133 
03134        p1_lim = (const unsigned char *)a->mv_data;
03135        p1 = (const unsigned char *)a->mv_data + a->mv_size;
03136        p2 = (const unsigned char *)b->mv_data + b->mv_size;
03137 
03138        len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
03139        if (len_diff > 0) {
03140               p1_lim += len_diff;
03141               len_diff = 1;
03142        }
03143 
03144        while (p1 > p1_lim) {
03145               diff = *--p1 - *--p2;
03146               if (diff)
03147                      return diff;
03148        }
03149        return len_diff<0 ? -1 : len_diff;
03150 }
03151 
03159 static MDB_node *
03160 mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp)
03161 {
03162        unsigned int   i = 0, nkeys;
03163        int            low, high;
03164        int            rc = 0;
03165        MDB_page *mp = mc->mc_pg[mc->mc_top];
03166        MDB_node      *node = NULL;
03167        MDB_val        nodekey;
03168        MDB_cmp_func *cmp;
03169        DKBUF;
03170 
03171        nkeys = NUMKEYS(mp);
03172 
03173 #if MDB_DEBUG
03174        {
03175        pgno_t pgno;
03176        COPY_PGNO(pgno, mp->mp_pgno);
03177        DPRINTF("searching %u keys in %s %spage %zu",
03178            nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
03179            pgno);
03180        }
03181 #endif
03182 
03183        assert(nkeys > 0);
03184 
03185        low = IS_LEAF(mp) ? 0 : 1;
03186        high = nkeys - 1;
03187        cmp = mc->mc_dbx->md_cmp;
03188 
03189        /* Branch pages have no data, so if using integer keys,
03190         * alignment is guaranteed. Use faster mdb_cmp_int.
03191         */
03192        if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
03193               if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
03194                      cmp = mdb_cmp_long;
03195               else
03196                      cmp = mdb_cmp_int;
03197        }
03198 
03199        if (IS_LEAF2(mp)) {
03200               nodekey.mv_size = mc->mc_db->md_pad;
03201               node = NODEPTR(mp, 0);      /* fake */
03202               while (low <= high) {
03203                      i = (low + high) >> 1;
03204                      nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
03205                      rc = cmp(key, &nodekey);
03206                      DPRINTF("found leaf index %u [%s], rc = %i",
03207                          i, DKEY(&nodekey), rc);
03208                      if (rc == 0)
03209                             break;
03210                      if (rc > 0)
03211                             low = i + 1;
03212                      else
03213                             high = i - 1;
03214               }
03215        } else {
03216               while (low <= high) {
03217                      i = (low + high) >> 1;
03218 
03219                      node = NODEPTR(mp, i);
03220                      nodekey.mv_size = NODEKSZ(node);
03221                      nodekey.mv_data = NODEKEY(node);
03222 
03223                      rc = cmp(key, &nodekey);
03224 #if MDB_DEBUG
03225                      if (IS_LEAF(mp))
03226                             DPRINTF("found leaf index %u [%s], rc = %i",
03227                                 i, DKEY(&nodekey), rc);
03228                      else
03229                             DPRINTF("found branch index %u [%s -> %zu], rc = %i",
03230                                 i, DKEY(&nodekey), NODEPGNO(node), rc);
03231 #endif
03232                      if (rc == 0)
03233                             break;
03234                      if (rc > 0)
03235                             low = i + 1;
03236                      else
03237                             high = i - 1;
03238               }
03239        }
03240 
03241        if (rc > 0) { /* Found entry is less than the key. */
03242               i++;   /* Skip to get the smallest entry larger than key. */
03243               if (!IS_LEAF2(mp))
03244                      node = NODEPTR(mp, i);
03245        }
03246        if (exactp)
03247               *exactp = (rc == 0);
03248        /* store the key index */
03249        mc->mc_ki[mc->mc_top] = i;
03250        if (i >= nkeys)
03251               /* There is no entry larger or equal to the key. */
03252               return NULL;
03253 
03254        /* nodeptr is fake for LEAF2 */
03255        return node;
03256 }
03257 
03258 #if 0
03259 static void
03260 mdb_cursor_adjust(MDB_cursor *mc, func)
03261 {
03262        MDB_cursor *m2;
03263 
03264        for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
03265               if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
03266                      func(mc, m2);
03267               }
03268        }
03269 }
03270 #endif
03271 
03273 static void
03274 mdb_cursor_pop(MDB_cursor *mc)
03275 {
03276        MDB_page      *top;
03277 
03278        if (mc->mc_snum) {
03279               top = mc->mc_pg[mc->mc_top];
03280               mc->mc_snum--;
03281               if (mc->mc_snum)
03282                      mc->mc_top--;
03283 
03284               DPRINTF("popped page %zu off db %u cursor %p", top->mp_pgno,
03285                      mc->mc_dbi, (void *) mc);
03286        }
03287 }
03288 
03290 static int
03291 mdb_cursor_push(MDB_cursor *mc, MDB_page *mp)
03292 {
03293        DPRINTF("pushing page %zu on db %u cursor %p", mp->mp_pgno,
03294               mc->mc_dbi, (void *) mc);
03295 
03296        if (mc->mc_snum >= CURSOR_STACK) {
03297               assert(mc->mc_snum < CURSOR_STACK);
03298               return ENOMEM;
03299        }
03300 
03301        mc->mc_top = mc->mc_snum++;
03302        mc->mc_pg[mc->mc_top] = mp;
03303        mc->mc_ki[mc->mc_top] = 0;
03304 
03305        return MDB_SUCCESS;
03306 }
03307 
03314 static int
03315 mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret)
03316 {
03317        MDB_page *p = NULL;
03318 
03319        if (!F_ISSET(txn->mt_flags, MDB_TXN_RDONLY) && txn->mt_u.dirty_list[0].mid) {
03320               unsigned x;
03321               x = mdb_mid2l_search(txn->mt_u.dirty_list, pgno);
03322               if (x <= txn->mt_u.dirty_list[0].mid && txn->mt_u.dirty_list[x].mid == pgno) {
03323                      p = txn->mt_u.dirty_list[x].mptr;
03324               }
03325        }
03326        if (!p) {
03327               if (pgno <= txn->mt_env->me_metas[txn->mt_toggle]->mm_last_pg)
03328                      p = (MDB_page *)(txn->mt_env->me_map + txn->mt_env->me_psize * pgno);
03329        }
03330        *ret = p;
03331        if (!p) {
03332               DPRINTF("page %zu not found", pgno);
03333               assert(p != NULL);
03334        }
03335        return (p != NULL) ? MDB_SUCCESS : MDB_PAGE_NOTFOUND;
03336 }
03337 
03348 static int
03349 mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify)
03350 {
03351        MDB_page      *mp = mc->mc_pg[mc->mc_top];
03352        DKBUF;
03353        int rc;
03354 
03355 
03356        while (IS_BRANCH(mp)) {
03357               MDB_node      *node;
03358               indx_t        i;
03359 
03360               DPRINTF("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp));
03361               assert(NUMKEYS(mp) > 1);
03362               DPRINTF("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0)));
03363 
03364               if (key == NULL)     /* Initialize cursor to first page. */
03365                      i = 0;
03366               else if (key->mv_size > MAXKEYSIZE && key->mv_data == NULL) {
03367                                                  /* cursor to last page */
03368                      i = NUMKEYS(mp)-1;
03369               } else {
03370                      int     exact;
03371                      node = mdb_node_search(mc, key, &exact);
03372                      if (node == NULL)
03373                             i = NUMKEYS(mp) - 1;
03374                      else {
03375                             i = mc->mc_ki[mc->mc_top];
03376                             if (!exact) {
03377                                    assert(i > 0);
03378                                    i--;
03379                             }
03380                      }
03381               }
03382 
03383               if (key)
03384                      DPRINTF("following index %u for key [%s]",
03385                          i, DKEY(key));
03386               assert(i < NUMKEYS(mp));
03387               node = NODEPTR(mp, i);
03388 
03389               if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp)))
03390                      return rc;
03391 
03392               mc->mc_ki[mc->mc_top] = i;
03393               if ((rc = mdb_cursor_push(mc, mp)))
03394                      return rc;
03395 
03396               if (modify) {
03397                      if ((rc = mdb_page_touch(mc)) != 0)
03398                             return rc;
03399                      mp = mc->mc_pg[mc->mc_top];
03400               }
03401        }
03402 
03403        if (!IS_LEAF(mp)) {
03404               DPRINTF("internal error, index points to a %02X page!?",
03405                   mp->mp_flags);
03406               return MDB_CORRUPTED;
03407        }
03408 
03409        DPRINTF("found leaf page %zu for key [%s]", mp->mp_pgno,
03410            key ? DKEY(key) : NULL);
03411 
03412        return MDB_SUCCESS;
03413 }
03414 
03426 static int
03427 mdb_page_search(MDB_cursor *mc, MDB_val *key, int modify)
03428 {
03429        int            rc;
03430        pgno_t         root;
03431 
03432        /* Make sure the txn is still viable, then find the root from
03433         * the txn's db table.
03434         */
03435        if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_ERROR)) {
03436               DPUTS("transaction has failed, must abort");
03437               return EINVAL;
03438        } else {
03439               /* Make sure we're using an up-to-date root */
03440               if (mc->mc_dbi > MAIN_DBI) {
03441                      if ((*mc->mc_dbflag & DB_STALE) ||
03442                      (modify && !(*mc->mc_dbflag & DB_DIRTY))) {
03443                             MDB_cursor mc2;
03444                             unsigned char dbflag = 0;
03445                             mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
03446                             rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, modify);
03447                             if (rc)
03448                                    return rc;
03449                             if (*mc->mc_dbflag & DB_STALE) {
03450                                    MDB_val data;
03451                                    int exact = 0;
03452                                    MDB_node *leaf = mdb_node_search(&mc2,
03453                                           &mc->mc_dbx->md_name, &exact);
03454                                    if (!exact)
03455                                           return MDB_NOTFOUND;
03456                                    mdb_node_read(mc->mc_txn, leaf, &data);
03457                                    memcpy(mc->mc_db, data.mv_data, sizeof(MDB_db));
03458                             }
03459                             if (modify)
03460                                    dbflag = DB_DIRTY;
03461                             *mc->mc_dbflag = dbflag;
03462                      }
03463               }
03464               root = mc->mc_db->md_root;
03465 
03466               if (root == P_INVALID) {           /* Tree is empty. */
03467                      DPUTS("tree is empty");
03468                      return MDB_NOTFOUND;
03469               }
03470        }
03471 
03472        assert(root > 1);
03473        if ((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0])))
03474               return rc;
03475 
03476        mc->mc_snum = 1;
03477        mc->mc_top = 0;
03478 
03479        DPRINTF("db %u root page %zu has flags 0x%X",
03480               mc->mc_dbi, root, mc->mc_pg[0]->mp_flags);
03481 
03482        if (modify) {
03483               if ((rc = mdb_page_touch(mc)))
03484                      return rc;
03485        }
03486 
03487        return mdb_page_search_root(mc, key, modify);
03488 }
03489 
03496 static int
03497 mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data)
03498 {
03499        MDB_page      *omp;         /* overflow page */
03500        pgno_t         pgno;
03501        int rc;
03502 
03503        if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
03504               data->mv_size = NODEDSZ(leaf);
03505               data->mv_data = NODEDATA(leaf);
03506               return MDB_SUCCESS;
03507        }
03508 
03509        /* Read overflow data.
03510         */
03511        data->mv_size = NODEDSZ(leaf);
03512        memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
03513        if ((rc = mdb_page_get(txn, pgno, &omp))) {
03514               DPRINTF("read overflow page %zu failed", pgno);
03515               return rc;
03516        }
03517        data->mv_data = METADATA(omp);
03518 
03519        return MDB_SUCCESS;
03520 }
03521 
03522 int
03523 mdb_get(MDB_txn *txn, MDB_dbi dbi,
03524     MDB_val *key, MDB_val *data)
03525 {
03526        MDB_cursor    mc;
03527        MDB_xcursor   mx;
03528        int exact = 0;
03529        DKBUF;
03530 
03531        assert(key);
03532        assert(data);
03533        DPRINTF("===> get db %u key [%s]", dbi, DKEY(key));
03534 
03535        if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
03536               return EINVAL;
03537 
03538        if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
03539               return EINVAL;
03540        }
03541 
03542        mdb_cursor_init(&mc, txn, dbi, &mx);
03543        return mdb_cursor_set(&mc, key, data, MDB_SET, &exact);
03544 }
03545 
03554 static int
03555 mdb_cursor_sibling(MDB_cursor *mc, int move_right)
03556 {
03557        int            rc;
03558        MDB_node      *indx;
03559        MDB_page      *mp;
03560 
03561        if (mc->mc_snum < 2) {
03562               return MDB_NOTFOUND;        /* root has no siblings */
03563        }
03564 
03565        mdb_cursor_pop(mc);
03566        DPRINTF("parent page is page %zu, index %u",
03567               mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);
03568 
03569        if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
03570                      : (mc->mc_ki[mc->mc_top] == 0)) {
03571               DPRINTF("no more keys left, moving to %s sibling",
03572                   move_right ? "right" : "left");
03573               if ((rc = mdb_cursor_sibling(mc, move_right)) != MDB_SUCCESS)
03574                      return rc;
03575        } else {
03576               if (move_right)
03577                      mc->mc_ki[mc->mc_top]++;
03578               else
03579                      mc->mc_ki[mc->mc_top]--;
03580               DPRINTF("just moving to %s index key %u",
03581                   move_right ? "right" : "left", mc->mc_ki[mc->mc_top]);
03582        }
03583        assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));
03584 
03585        indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
03586        if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp)))
03587               return rc;;
03588 
03589        mdb_cursor_push(mc, mp);
03590 
03591        return MDB_SUCCESS;
03592 }
03593 
03595 static int
03596 mdb_cursor_next(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
03597 {
03598        MDB_page      *mp;
03599        MDB_node      *leaf;
03600        int rc;
03601 
03602        if (mc->mc_flags & C_EOF) {
03603               return MDB_NOTFOUND;
03604        }
03605 
03606        assert(mc->mc_flags & C_INITIALIZED);
03607 
03608        mp = mc->mc_pg[mc->mc_top];
03609 
03610        if (mc->mc_db->md_flags & MDB_DUPSORT) {
03611               leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
03612               if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03613                      if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
03614                             rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
03615                             if (op != MDB_NEXT || rc == MDB_SUCCESS)
03616                                    return rc;
03617                      }
03618               } else {
03619                      mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
03620                      if (op == MDB_NEXT_DUP)
03621                             return MDB_NOTFOUND;
03622               }
03623        }
03624 
03625        DPRINTF("cursor_next: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
03626 
03627        if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
03628               DPUTS("=====> move to next sibling page");
03629               if (mdb_cursor_sibling(mc, 1) != MDB_SUCCESS) {
03630                      mc->mc_flags |= C_EOF;
03631                      mc->mc_flags &= ~C_INITIALIZED;
03632                      return MDB_NOTFOUND;
03633               }
03634               mp = mc->mc_pg[mc->mc_top];
03635               DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
03636        } else
03637               mc->mc_ki[mc->mc_top]++;
03638 
03639        DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
03640            mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
03641 
03642        if (IS_LEAF2(mp)) {
03643               key->mv_size = mc->mc_db->md_pad;
03644               key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
03645               return MDB_SUCCESS;
03646        }
03647 
03648        assert(IS_LEAF(mp));
03649        leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
03650 
03651        if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03652               mdb_xcursor_init1(mc, leaf);
03653        }
03654        if (data) {
03655               if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS))
03656                      return rc;
03657 
03658               if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03659                      rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
03660                      if (rc != MDB_SUCCESS)
03661                             return rc;
03662               }
03663        }
03664 
03665        MDB_SET_KEY(leaf, key);
03666        return MDB_SUCCESS;
03667 }
03668 
03670 static int
03671 mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
03672 {
03673        MDB_page      *mp;
03674        MDB_node      *leaf;
03675        int rc;
03676 
03677        assert(mc->mc_flags & C_INITIALIZED);
03678 
03679        mp = mc->mc_pg[mc->mc_top];
03680 
03681        if (mc->mc_db->md_flags & MDB_DUPSORT) {
03682               leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
03683               if (op == MDB_PREV || op == MDB_PREV_DUP) {
03684                      if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03685                             rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
03686                             if (op != MDB_PREV || rc == MDB_SUCCESS)
03687                                    return rc;
03688                      } else {
03689                             mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
03690                             if (op == MDB_PREV_DUP)
03691                                    return MDB_NOTFOUND;
03692                      }
03693               }
03694        }
03695 
03696        DPRINTF("cursor_prev: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);
03697 
03698        if (mc->mc_ki[mc->mc_top] == 0)  {
03699               DPUTS("=====> move to prev sibling page");
03700               if (mdb_cursor_sibling(mc, 0) != MDB_SUCCESS) {
03701                      mc->mc_flags &= ~C_INITIALIZED;
03702                      return MDB_NOTFOUND;
03703               }
03704               mp = mc->mc_pg[mc->mc_top];
03705               mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
03706               DPRINTF("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
03707        } else
03708               mc->mc_ki[mc->mc_top]--;
03709 
03710        mc->mc_flags &= ~C_EOF;
03711 
03712        DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
03713            mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);
03714 
03715        if (IS_LEAF2(mp)) {
03716               key->mv_size = mc->mc_db->md_pad;
03717               key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
03718               return MDB_SUCCESS;
03719        }
03720 
03721        assert(IS_LEAF(mp));
03722        leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
03723 
03724        if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03725               mdb_xcursor_init1(mc, leaf);
03726        }
03727        if (data) {
03728               if ((rc = mdb_node_read(mc->mc_txn, leaf, data) != MDB_SUCCESS))
03729                      return rc;
03730 
03731               if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03732                      rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
03733                      if (rc != MDB_SUCCESS)
03734                             return rc;
03735               }
03736        }
03737 
03738        MDB_SET_KEY(leaf, key);
03739        return MDB_SUCCESS;
03740 }
03741 
03743 static int
03744 mdb_cursor_set(MDB_cursor *mc, MDB_val *key, MDB_val *data,
03745     MDB_cursor_op op, int *exactp)
03746 {
03747        int            rc;
03748        MDB_page      *mp;
03749        MDB_node      *leaf;
03750        DKBUF;
03751 
03752        assert(mc);
03753        assert(key);
03754        assert(key->mv_size > 0);
03755 
03756        /* See if we're already on the right page */
03757        if (mc->mc_flags & C_INITIALIZED) {
03758               MDB_val nodekey;
03759 
03760               mp = mc->mc_pg[mc->mc_top];
03761               if (!NUMKEYS(mp)) {
03762                      mc->mc_ki[mc->mc_top] = 0;
03763                      return MDB_NOTFOUND;
03764               }
03765               if (mp->mp_flags & P_LEAF2) {
03766                      nodekey.mv_size = mc->mc_db->md_pad;
03767                      nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
03768               } else {
03769                      leaf = NODEPTR(mp, 0);
03770                      MDB_SET_KEY(leaf, &nodekey);
03771               }
03772               rc = mc->mc_dbx->md_cmp(key, &nodekey);
03773               if (rc == 0) {
03774                      /* Probably happens rarely, but first node on the page
03775                       * was the one we wanted.
03776                       */
03777                      mc->mc_ki[mc->mc_top] = 0;
03778                      leaf = NODEPTR(mp, 0);
03779                      if (exactp)
03780                             *exactp = 1;
03781                      goto set1;
03782               }
03783               if (rc > 0) {
03784                      unsigned int i;
03785                      unsigned int nkeys = NUMKEYS(mp);
03786                      if (nkeys > 1) {
03787                             if (mp->mp_flags & P_LEAF2) {
03788                                    nodekey.mv_data = LEAF2KEY(mp,
03789                                            nkeys-1, nodekey.mv_size);
03790                             } else {
03791                                    leaf = NODEPTR(mp, nkeys-1);
03792                                    MDB_SET_KEY(leaf, &nodekey);
03793                             }
03794                             rc = mc->mc_dbx->md_cmp(key, &nodekey);
03795                             if (rc == 0) {
03796                                    /* last node was the one we wanted */
03797                                    mc->mc_ki[mc->mc_top] = nkeys-1;
03798                                    leaf = NODEPTR(mp, nkeys-1);
03799                                    if (exactp)
03800                                           *exactp = 1;
03801                                    goto set1;
03802                             }
03803                             if (rc < 0) {
03804                                    /* This is definitely the right page, skip search_page */
03805                                    rc = 0;
03806                                    goto set2;
03807                             }
03808                      }
03809                      /* If any parents have right-sibs, search.
03810                       * Otherwise, there's nothing further.
03811                       */
03812                      for (i=0; i<mc->mc_top; i++)
03813                             if (mc->mc_ki[i] <
03814                                    NUMKEYS(mc->mc_pg[i])-1)
03815                                    break;
03816                      if (i == mc->mc_top) {
03817                             /* There are no other pages */
03818                             mc->mc_ki[mc->mc_top] = nkeys;
03819                             return MDB_NOTFOUND;
03820                      }
03821               }
03822               if (!mc->mc_top) {
03823                      /* There are no other pages */
03824                      mc->mc_ki[mc->mc_top] = 0;
03825                      return MDB_NOTFOUND;
03826               }
03827        }
03828 
03829        rc = mdb_page_search(mc, key, 0);
03830        if (rc != MDB_SUCCESS)
03831               return rc;
03832 
03833        mp = mc->mc_pg[mc->mc_top];
03834        assert(IS_LEAF(mp));
03835 
03836 set2:
03837        leaf = mdb_node_search(mc, key, exactp);
03838        if (exactp != NULL && !*exactp) {
03839               /* MDB_SET specified and not an exact match. */
03840               return MDB_NOTFOUND;
03841        }
03842 
03843        if (leaf == NULL) {
03844               DPUTS("===> inexact leaf not found, goto sibling");
03845               if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
03846                      return rc;           /* no entries matched */
03847               mp = mc->mc_pg[mc->mc_top];
03848               assert(IS_LEAF(mp));
03849               leaf = NODEPTR(mp, 0);
03850        }
03851 
03852 set1:
03853        mc->mc_flags |= C_INITIALIZED;
03854        mc->mc_flags &= ~C_EOF;
03855 
03856        if (IS_LEAF2(mp)) {
03857               key->mv_size = mc->mc_db->md_pad;
03858               key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
03859               return MDB_SUCCESS;
03860        }
03861 
03862        if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03863               mdb_xcursor_init1(mc, leaf);
03864        }
03865        if (data) {
03866               if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03867                      if (op == MDB_SET || op == MDB_SET_RANGE) {
03868                             rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
03869                      } else {
03870                             int ex2, *ex2p;
03871                             if (op == MDB_GET_BOTH) {
03872                                    ex2p = &ex2;
03873                                    ex2 = 0;
03874                             } else {
03875                                    ex2p = NULL;
03876                             }
03877                             rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
03878                             if (rc != MDB_SUCCESS)
03879                                    return rc;
03880                      }
03881               } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
03882                      MDB_val d2;
03883                      if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
03884                             return rc;
03885                      rc = mc->mc_dbx->md_dcmp(data, &d2);
03886                      if (rc) {
03887                             if (op == MDB_GET_BOTH || rc > 0)
03888                                    return MDB_NOTFOUND;
03889                      }
03890 
03891               } else {
03892                      if (mc->mc_xcursor)
03893                             mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
03894                      if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
03895                             return rc;
03896               }
03897        }
03898 
03899        /* The key already matches in all other cases */
03900        if (op == MDB_SET_RANGE)
03901               MDB_SET_KEY(leaf, key);
03902        DPRINTF("==> cursor placed on key [%s]", DKEY(key));
03903 
03904        return rc;
03905 }
03906 
03908 static int
03909 mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data)
03910 {
03911        int            rc;
03912        MDB_node      *leaf;
03913 
03914        if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
03915               rc = mdb_page_search(mc, NULL, 0);
03916               if (rc != MDB_SUCCESS)
03917                      return rc;
03918        }
03919        assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
03920 
03921        leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
03922        mc->mc_flags |= C_INITIALIZED;
03923        mc->mc_flags &= ~C_EOF;
03924 
03925        mc->mc_ki[mc->mc_top] = 0;
03926 
03927        if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
03928               key->mv_size = mc->mc_db->md_pad;
03929               key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
03930               return MDB_SUCCESS;
03931        }
03932 
03933        if (data) {
03934               if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03935                      mdb_xcursor_init1(mc, leaf);
03936                      rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
03937                      if (rc)
03938                             return rc;
03939               } else {
03940                      if (mc->mc_xcursor)
03941                             mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
03942                      if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
03943                             return rc;
03944               }
03945        }
03946        MDB_SET_KEY(leaf, key);
03947        return MDB_SUCCESS;
03948 }
03949 
03951 static int
03952 mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data)
03953 {
03954        int            rc;
03955        MDB_node      *leaf;
03956        MDB_val       lkey;
03957 
03958        lkey.mv_size = MAXKEYSIZE+1;
03959        lkey.mv_data = NULL;
03960 
03961        if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
03962               rc = mdb_page_search(mc, &lkey, 0);
03963               if (rc != MDB_SUCCESS)
03964                      return rc;
03965        }
03966        assert(IS_LEAF(mc->mc_pg[mc->mc_top]));
03967 
03968        leaf = NODEPTR(mc->mc_pg[mc->mc_top], NUMKEYS(mc->mc_pg[mc->mc_top])-1);
03969        mc->mc_flags |= C_INITIALIZED;
03970        mc->mc_flags &= ~C_EOF;
03971 
03972        mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
03973 
03974        if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
03975               key->mv_size = mc->mc_db->md_pad;
03976               key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
03977               return MDB_SUCCESS;
03978        }
03979 
03980        if (data) {
03981               if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
03982                      mdb_xcursor_init1(mc, leaf);
03983                      rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
03984                      if (rc)
03985                             return rc;
03986               } else {
03987                      if (mc->mc_xcursor)
03988                             mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
03989                      if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
03990                             return rc;
03991               }
03992        }
03993 
03994        MDB_SET_KEY(leaf, key);
03995        return MDB_SUCCESS;
03996 }
03997 
03998 int
03999 mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data,
04000     MDB_cursor_op op)
04001 {
04002        int            rc;
04003        int            exact = 0;
04004 
04005        assert(mc);
04006 
04007        switch (op) {
04008        case MDB_GET_BOTH:
04009        case MDB_GET_BOTH_RANGE:
04010               if (data == NULL || mc->mc_xcursor == NULL) {
04011                      rc = EINVAL;
04012                      break;
04013               }
04014               /* FALLTHRU */
04015        case MDB_SET:
04016        case MDB_SET_RANGE:
04017               if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
04018                      rc = EINVAL;
04019               } else if (op == MDB_SET_RANGE)
04020                      rc = mdb_cursor_set(mc, key, data, op, NULL);
04021               else
04022                      rc = mdb_cursor_set(mc, key, data, op, &exact);
04023               break;
04024        case MDB_GET_MULTIPLE:
04025               if (data == NULL ||
04026                      !(mc->mc_db->md_flags & MDB_DUPFIXED) ||
04027                      !(mc->mc_flags & C_INITIALIZED)) {
04028                      rc = EINVAL;
04029                      break;
04030               }
04031               rc = MDB_SUCCESS;
04032               if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
04033                      (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
04034                      break;
04035               goto fetchm;
04036        case MDB_NEXT_MULTIPLE:
04037               if (data == NULL ||
04038                      !(mc->mc_db->md_flags & MDB_DUPFIXED)) {
04039                      rc = EINVAL;
04040                      break;
04041               }
04042               if (!(mc->mc_flags & C_INITIALIZED))
04043                      rc = mdb_cursor_first(mc, key, data);
04044               else
04045                      rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
04046               if (rc == MDB_SUCCESS) {
04047                      if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
04048                             MDB_cursor *mx;
04049 fetchm:
04050                             mx = &mc->mc_xcursor->mx_cursor;
04051                             data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
04052                                    mx->mc_db->md_pad;
04053                             data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
04054                             mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
04055                      } else {
04056                             rc = MDB_NOTFOUND;
04057                      }
04058               }
04059               break;
04060        case MDB_NEXT:
04061        case MDB_NEXT_DUP:
04062        case MDB_NEXT_NODUP:
04063               if (!(mc->mc_flags & C_INITIALIZED))
04064                      rc = mdb_cursor_first(mc, key, data);
04065               else
04066                      rc = mdb_cursor_next(mc, key, data, op);
04067               break;
04068        case MDB_PREV:
04069        case MDB_PREV_DUP:
04070        case MDB_PREV_NODUP:
04071               if (!(mc->mc_flags & C_INITIALIZED) || (mc->mc_flags & C_EOF))
04072                      rc = mdb_cursor_last(mc, key, data);
04073               else
04074                      rc = mdb_cursor_prev(mc, key, data, op);
04075               break;
04076        case MDB_FIRST:
04077               rc = mdb_cursor_first(mc, key, data);
04078               break;
04079        case MDB_FIRST_DUP:
04080               if (data == NULL ||
04081                      !(mc->mc_db->md_flags & MDB_DUPSORT) ||
04082                      !(mc->mc_flags & C_INITIALIZED) ||
04083                      !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
04084                      rc = EINVAL;
04085                      break;
04086               }
04087               rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
04088               break;
04089        case MDB_LAST:
04090               rc = mdb_cursor_last(mc, key, data);
04091               break;
04092        case MDB_LAST_DUP:
04093               if (data == NULL ||
04094                      !(mc->mc_db->md_flags & MDB_DUPSORT) ||
04095                      !(mc->mc_flags & C_INITIALIZED) ||
04096                      !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
04097                      rc = EINVAL;
04098                      break;
04099               }
04100               rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
04101               break;
04102        default:
04103               DPRINTF("unhandled/unimplemented cursor operation %u", op);
04104               rc = EINVAL;
04105               break;
04106        }
04107 
04108        return rc;
04109 }
04110 
04115 static int
04116 mdb_cursor_touch(MDB_cursor *mc)
04117 {
04118        int rc;
04119 
04120        if (mc->mc_dbi > MAIN_DBI && !(*mc->mc_dbflag & DB_DIRTY)) {
04121               MDB_cursor mc2;
04122               mdb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
04123               rc = mdb_page_search(&mc2, &mc->mc_dbx->md_name, 1);
04124               if (rc)
04125                       return rc;
04126               *mc->mc_dbflag = DB_DIRTY;
04127        }
04128        for (mc->mc_top = 0; mc->mc_top < mc->mc_snum; mc->mc_top++) {
04129               rc = mdb_page_touch(mc);
04130               if (rc)
04131                      return rc;
04132        }
04133        mc->mc_top = mc->mc_snum-1;
04134        return MDB_SUCCESS;
04135 }
04136 
04137 int
04138 mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
04139     unsigned int flags)
04140 {
04141        MDB_node      *leaf = NULL;
04142        MDB_val       xdata, *rdata, dkey;
04143        MDB_page      *fp;
04144        MDB_db dummy;
04145        int do_sub = 0;
04146        unsigned int mcount = 0;
04147        size_t nsize;
04148        int rc, rc2;
04149        MDB_pagebuf pbuf;
04150        char dbuf[MAXKEYSIZE+1];
04151        unsigned int nflags;
04152        DKBUF;
04153 
04154        if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
04155               return EACCES;
04156 
04157        DPRINTF("==> put db %u key [%s], size %zu, data size %zu",
04158               mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size);
04159 
04160        dkey.mv_size = 0;
04161 
04162        if (flags == MDB_CURRENT) {
04163               if (!(mc->mc_flags & C_INITIALIZED))
04164                      return EINVAL;
04165               rc = MDB_SUCCESS;
04166        } else if (mc->mc_db->md_root == P_INVALID) {
04167               MDB_page *np;
04168               /* new database, write a root leaf page */
04169               DPUTS("allocating new root leaf page");
04170               if ((np = mdb_page_new(mc, P_LEAF, 1)) == NULL) {
04171                      return ENOMEM;
04172               }
04173               mc->mc_snum = 0;
04174               mdb_cursor_push(mc, np);
04175               mc->mc_db->md_root = np->mp_pgno;
04176               mc->mc_db->md_depth++;
04177               *mc->mc_dbflag = DB_DIRTY;
04178               if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
04179                      == MDB_DUPFIXED)
04180                      np->mp_flags |= P_LEAF2;
04181               mc->mc_flags |= C_INITIALIZED;
04182               rc = MDB_NOTFOUND;
04183               goto top;
04184        } else {
04185               int exact = 0;
04186               MDB_val d2;
04187               rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
04188               if ((flags & MDB_NOOVERWRITE) && rc == 0) {
04189                      DPRINTF("duplicate key [%s]", DKEY(key));
04190                      *data = d2;
04191                      return MDB_KEYEXIST;
04192               }
04193               if (rc && rc != MDB_NOTFOUND)
04194                      return rc;
04195        }
04196 
04197        /* Cursor is positioned, now make sure all pages are writable */
04198        rc2 = mdb_cursor_touch(mc);
04199        if (rc2)
04200               return rc2;
04201 
04202 top:
04203        /* The key already exists */
04204        if (rc == MDB_SUCCESS) {
04205               /* there's only a key anyway, so this is a no-op */
04206               if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
04207                      unsigned int ksize = mc->mc_db->md_pad;
04208                      if (key->mv_size != ksize)
04209                             return EINVAL;
04210                      if (flags == MDB_CURRENT) {
04211                             char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
04212                             memcpy(ptr, key->mv_data, ksize);
04213                      }
04214                      return MDB_SUCCESS;
04215               }
04216 
04217               leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
04218 
04219               /* DB has dups? */
04220               if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
04221                      /* Was a single item before, must convert now */
04222 more:
04223                      if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
04224                             /* Just overwrite the current item */
04225                             if (flags == MDB_CURRENT)
04226                                    goto current;
04227 
04228                             dkey.mv_size = NODEDSZ(leaf);
04229                             dkey.mv_data = NODEDATA(leaf);
04230 #if UINT_MAX < SIZE_MAX
04231                             if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
04232 #ifdef MISALIGNED_OK
04233                                    mc->mc_dbx->md_dcmp = mdb_cmp_long;
04234 #else
04235                                    mc->mc_dbx->md_dcmp = mdb_cmp_cint;
04236 #endif
04237 #endif
04238                             /* if data matches, ignore it */
04239                             if (!mc->mc_dbx->md_dcmp(data, &dkey))
04240                                    return (flags == MDB_NODUPDATA) ? MDB_KEYEXIST : MDB_SUCCESS;
04241 
04242                             /* create a fake page for the dup items */
04243                             memcpy(dbuf, dkey.mv_data, dkey.mv_size);
04244                             dkey.mv_data = dbuf;
04245                             fp = (MDB_page *)&pbuf;
04246                             fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
04247                             fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
04248                             fp->mp_lower = PAGEHDRSZ;
04249                             fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
04250                             if (mc->mc_db->md_flags & MDB_DUPFIXED) {
04251                                    fp->mp_flags |= P_LEAF2;
04252                                    fp->mp_pad = data->mv_size;
04253                             } else {
04254                                    fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
04255                                           (dkey.mv_size & 1) + (data->mv_size & 1);
04256                             }
04257                             mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
04258                             do_sub = 1;
04259                             rdata = &xdata;
04260                             xdata.mv_size = fp->mp_upper;
04261                             xdata.mv_data = fp;
04262                             flags |= F_DUPDATA;
04263                             goto new_sub;
04264                      }
04265                      if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
04266                             /* See if we need to convert from fake page to subDB */
04267                             MDB_page *mp;
04268                             unsigned int offset;
04269                             unsigned int i;
04270 
04271                             fp = NODEDATA(leaf);
04272                             if (flags == MDB_CURRENT) {
04273                                    fp->mp_flags |= P_DIRTY;
04274                                    COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
04275                                    mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
04276                                    flags |= F_DUPDATA;
04277                                    goto put_sub;
04278                             }
04279                             if (mc->mc_db->md_flags & MDB_DUPFIXED) {
04280                                    offset = fp->mp_pad;
04281                             } else {
04282                                    offset = NODESIZE + sizeof(indx_t) + data->mv_size;
04283                             }
04284                             offset += offset & 1;
04285                             if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
04286                                    offset >= (mc->mc_txn->mt_env->me_psize - PAGEHDRSZ) /
04287                                           MDB_MINKEYS) {
04288                                    /* yes, convert it */
04289                                    dummy.md_flags = 0;
04290                                    if (mc->mc_db->md_flags & MDB_DUPFIXED) {
04291                                           dummy.md_pad = fp->mp_pad;
04292                                           dummy.md_flags = MDB_DUPFIXED;
04293                                           if (mc->mc_db->md_flags & MDB_INTEGERDUP)
04294                                                  dummy.md_flags |= MDB_INTEGERKEY;
04295                                    }
04296                                    dummy.md_depth = 1;
04297                                    dummy.md_branch_pages = 0;
04298                                    dummy.md_leaf_pages = 1;
04299                                    dummy.md_overflow_pages = 0;
04300                                    dummy.md_entries = NUMKEYS(fp);
04301                                    rdata = &xdata;
04302                                    xdata.mv_size = sizeof(MDB_db);
04303                                    xdata.mv_data = &dummy;
04304                                    mp = mdb_page_alloc(mc, 1);
04305                                    if (!mp)
04306                                           return ENOMEM;
04307                                    offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
04308                                    flags |= F_DUPDATA|F_SUBDATA;
04309                                    dummy.md_root = mp->mp_pgno;
04310                             } else {
04311                                    /* no, just grow it */
04312                                    rdata = &xdata;
04313                                    xdata.mv_size = NODEDSZ(leaf) + offset;
04314                                    xdata.mv_data = &pbuf;
04315                                    mp = (MDB_page *)&pbuf;
04316                                    mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
04317                                    flags |= F_DUPDATA;
04318                             }
04319                             mp->mp_flags = fp->mp_flags | P_DIRTY;
04320                             mp->mp_pad   = fp->mp_pad;
04321                             mp->mp_lower = fp->mp_lower;
04322                             mp->mp_upper = fp->mp_upper + offset;
04323                             if (IS_LEAF2(fp)) {
04324                                    memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
04325                             } else {
04326                                    nsize = NODEDSZ(leaf) - fp->mp_upper;
04327                                    memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
04328                                    for (i=0; i<NUMKEYS(fp); i++)
04329                                           mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
04330                             }
04331                             mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
04332                             do_sub = 1;
04333                             goto new_sub;
04334                      }
04335                      /* data is on sub-DB, just store it */
04336                      flags |= F_DUPDATA|F_SUBDATA;
04337                      goto put_sub;
04338               }
04339 current:
04340               /* overflow page overwrites need special handling */
04341               if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
04342                      MDB_page *omp;
04343                      pgno_t pg;
04344                      int ovpages, dpages;
04345 
04346                      ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
04347                      dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
04348                      memcpy(&pg, NODEDATA(leaf), sizeof(pg));
04349                      mdb_page_get(mc->mc_txn, pg, &omp);
04350                      /* Is the ov page writable and large enough? */
04351                      if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) {
04352                             /* yes, overwrite it. Note in this case we don't
04353                              * bother to try shrinking the node if the new data
04354                              * is smaller than the overflow threshold.
04355                              */
04356                             if (F_ISSET(flags, MDB_RESERVE))
04357                                    data->mv_data = METADATA(omp);
04358                             else
04359                                    memcpy(METADATA(omp), data->mv_data, data->mv_size);
04360                             goto done;
04361                      } else {
04362                             /* no, free ovpages */
04363                             int i;
04364                             mc->mc_db->md_overflow_pages -= ovpages;
04365                             for (i=0; i<ovpages; i++) {
04366                                    DPRINTF("freed ov page %zu", pg);
04367                                    mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
04368                                    pg++;
04369                             }
04370                      }
04371               } else if (NODEDSZ(leaf) == data->mv_size) {
04372                      /* same size, just replace it. Note that we could
04373                       * also reuse this node if the new data is smaller,
04374                       * but instead we opt to shrink the node in that case.
04375                       */
04376                      if (F_ISSET(flags, MDB_RESERVE))
04377                             data->mv_data = NODEDATA(leaf);
04378                      else
04379                             memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
04380                      goto done;
04381               }
04382               mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
04383               mc->mc_db->md_entries--;
04384        } else {
04385               DPRINTF("inserting key at index %i", mc->mc_ki[mc->mc_top]);
04386        }
04387 
04388        rdata = data;
04389 
04390 new_sub:
04391        nflags = flags & NODE_ADD_FLAGS;
04392        nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
04393        if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
04394               if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
04395                      nflags &= ~MDB_APPEND;
04396               rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
04397        } else {
04398               /* There is room already in this leaf page. */
04399               rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
04400               if (rc == 0 && !do_sub) {
04401                      /* Adjust other cursors pointing to mp */
04402                      MDB_cursor *m2, *m3;
04403                      MDB_dbi dbi = mc->mc_dbi;
04404                      unsigned i = mc->mc_top;
04405                      MDB_page *mp = mc->mc_pg[i];
04406 
04407                      if (mc->mc_flags & C_SUB)
04408                             dbi--;
04409 
04410                      for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
04411                             if (mc->mc_flags & C_SUB)
04412                                    m3 = &m2->mc_xcursor->mx_cursor;
04413                             else
04414                                    m3 = m2;
04415                             if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
04416                             if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
04417                                    m3->mc_ki[i]++;
04418                             }
04419                      }
04420               }
04421        }
04422 
04423        if (rc != MDB_SUCCESS)
04424               mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
04425        else {
04426               /* Now store the actual data in the child DB. Note that we're
04427                * storing the user data in the keys field, so there are strict
04428                * size limits on dupdata. The actual data fields of the child
04429                * DB are all zero size.
04430                */
04431               if (do_sub) {
04432                      int xflags;
04433 put_sub:
04434                      xdata.mv_size = 0;
04435                      xdata.mv_data = "";
04436                      leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
04437                      if (flags & MDB_CURRENT) {
04438                             xflags = MDB_CURRENT;
04439                      } else {
04440                             mdb_xcursor_init1(mc, leaf);
04441                             xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE : 0;
04442                      }
04443                      /* converted, write the original data first */
04444                      if (dkey.mv_size) {
04445                             rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
04446                             if (rc)
04447                                    return rc;
04448                             {
04449                                    /* Adjust other cursors pointing to mp */
04450                                    MDB_cursor *m2;
04451                                    unsigned i = mc->mc_top;
04452                                    MDB_page *mp = mc->mc_pg[i];
04453 
04454                                    for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
04455                                           if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
04456                                           if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
04457                                                  mdb_xcursor_init1(m2, leaf);
04458                                           }
04459                                    }
04460                             }
04461                      }
04462                      xflags |= (flags & MDB_APPEND);
04463                      rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
04464                      if (flags & F_SUBDATA) {
04465                             void *db = NODEDATA(leaf);
04466                             memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
04467                      }
04468               }
04469               /* sub-writes might have failed so check rc again.
04470                * Don't increment count if we just replaced an existing item.
04471                */
04472               if (!rc && !(flags & MDB_CURRENT))
04473                      mc->mc_db->md_entries++;
04474               if (flags & MDB_MULTIPLE) {
04475                      mcount++;
04476                      if (mcount < data[1].mv_size) {
04477                             data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
04478                             leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
04479                             goto more;
04480                      }
04481               }
04482        }
04483 done:
04484        return rc;
04485 }
04486 
04487 int
04488 mdb_cursor_del(MDB_cursor *mc, unsigned int flags)
04489 {
04490        MDB_node      *leaf;
04491        int rc;
04492 
04493        if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
04494               return EACCES;
04495 
04496        if (!mc->mc_flags & C_INITIALIZED)
04497               return EINVAL;
04498 
04499        rc = mdb_cursor_touch(mc);
04500        if (rc)
04501               return rc;
04502 
04503        leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
04504 
04505        if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
04506               if (flags != MDB_NODUPDATA) {
04507                      if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
04508                             mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
04509                      }
04510                      rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, 0);
04511                      /* If sub-DB still has entries, we're done */
04512                      if (mc->mc_xcursor->mx_db.md_entries) {
04513                             if (leaf->mn_flags & F_SUBDATA) {
04514                                    /* update subDB info */
04515                                    void *db = NODEDATA(leaf);
04516                                    memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
04517                             } else {
04518                                    /* shrink fake page */
04519                                    mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
04520                             }
04521                             mc->mc_db->md_entries--;
04522                             return rc;
04523                      }
04524                      /* otherwise fall thru and delete the sub-DB */
04525               }
04526 
04527               if (leaf->mn_flags & F_SUBDATA) {
04528                      /* add all the child DB's pages to the free list */
04529                      rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
04530                      if (rc == MDB_SUCCESS) {
04531                             mc->mc_db->md_entries -=
04532                                    mc->mc_xcursor->mx_db.md_entries;
04533                      }
04534               }
04535        }
04536 
04537        return mdb_cursor_del0(mc, leaf);
04538 }
04539 
04547 static MDB_page *
04548 mdb_page_new(MDB_cursor *mc, uint32_t flags, int num)
04549 {
04550        MDB_page      *np;
04551 
04552        if ((np = mdb_page_alloc(mc, num)) == NULL)
04553               return NULL;
04554        DPRINTF("allocated new mpage %zu, page size %u",
04555            np->mp_pgno, mc->mc_txn->mt_env->me_psize);
04556        np->mp_flags = flags | P_DIRTY;
04557        np->mp_lower = PAGEHDRSZ;
04558        np->mp_upper = mc->mc_txn->mt_env->me_psize;
04559 
04560        if (IS_BRANCH(np))
04561               mc->mc_db->md_branch_pages++;
04562        else if (IS_LEAF(np))
04563               mc->mc_db->md_leaf_pages++;
04564        else if (IS_OVERFLOW(np)) {
04565               mc->mc_db->md_overflow_pages += num;
04566               np->mp_pages = num;
04567        }
04568 
04569        return np;
04570 }
04571 
04583 static size_t
04584 mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data)
04585 {
04586        size_t         sz;
04587 
04588        sz = LEAFSIZE(key, data);
04589        if (sz >= env->me_psize / MDB_MINKEYS) {
04590               /* put on overflow page */
04591               sz -= data->mv_size - sizeof(pgno_t);
04592        }
04593        sz += sz & 1;
04594 
04595        return sz + sizeof(indx_t);
04596 }
04597 
04608 static size_t
04609 mdb_branch_size(MDB_env *env, MDB_val *key)
04610 {
04611        size_t         sz;
04612 
04613        sz = INDXSIZE(key);
04614        if (sz >= env->me_psize / MDB_MINKEYS) {
04615               /* put on overflow page */
04616               /* not implemented */
04617               /* sz -= key->size - sizeof(pgno_t); */
04618        }
04619 
04620        return sz + sizeof(indx_t);
04621 }
04622 
04638 static int
04639 mdb_node_add(MDB_cursor *mc, indx_t indx,
04640     MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
04641 {
04642        unsigned int   i;
04643        size_t         node_size = NODESIZE;
04644        indx_t         ofs;
04645        MDB_node      *node;
04646        MDB_page      *mp = mc->mc_pg[mc->mc_top];
04647        MDB_page      *ofp = NULL;         /* overflow page */
04648        DKBUF;
04649 
04650        assert(mp->mp_upper >= mp->mp_lower);
04651 
04652        DPRINTF("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
04653            IS_LEAF(mp) ? "leaf" : "branch",
04654               IS_SUBP(mp) ? "sub-" : "",
04655            mp->mp_pgno, indx, data ? data->mv_size : 0,
04656               key ? key->mv_size : 0, key ? DKEY(key) : NULL);
04657 
04658        if (IS_LEAF2(mp)) {
04659               /* Move higher keys up one slot. */
04660               int ksize = mc->mc_db->md_pad, dif;
04661               char *ptr = LEAF2KEY(mp, indx, ksize);
04662               dif = NUMKEYS(mp) - indx;
04663               if (dif > 0)
04664                      memmove(ptr+ksize, ptr, dif*ksize);
04665               /* insert new key */
04666               memcpy(ptr, key->mv_data, ksize);
04667 
04668               /* Just using these for counting */
04669               mp->mp_lower += sizeof(indx_t);
04670               mp->mp_upper -= ksize - sizeof(indx_t);
04671               return MDB_SUCCESS;
04672        }
04673 
04674        if (key != NULL)
04675               node_size += key->mv_size;
04676 
04677        if (IS_LEAF(mp)) {
04678               assert(data);
04679               if (F_ISSET(flags, F_BIGDATA)) {
04680                      /* Data already on overflow page. */
04681                      node_size += sizeof(pgno_t);
04682               } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_psize / MDB_MINKEYS) {
04683                      int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
04684                      /* Put data on overflow page. */
04685                      DPRINTF("data size is %zu, node would be %zu, put data on overflow page",
04686                          data->mv_size, node_size+data->mv_size);
04687                      node_size += sizeof(pgno_t);
04688                      if ((ofp = mdb_page_new(mc, P_OVERFLOW, ovpages)) == NULL)
04689                             return ENOMEM;
04690                      DPRINTF("allocated overflow page %zu", ofp->mp_pgno);
04691                      flags |= F_BIGDATA;
04692               } else {
04693                      node_size += data->mv_size;
04694               }
04695        }
04696        node_size += node_size & 1;
04697 
04698        if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
04699               DPRINTF("not enough room in page %zu, got %u ptrs",
04700                   mp->mp_pgno, NUMKEYS(mp));
04701               DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
04702                   mp->mp_upper - mp->mp_lower);
04703               DPRINTF("node size = %zu", node_size);
04704               return ENOSPC;
04705        }
04706 
04707        /* Move higher pointers up one slot. */
04708        for (i = NUMKEYS(mp); i > indx; i--)
04709               mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
04710 
04711        /* Adjust free space offsets. */
04712        ofs = mp->mp_upper - node_size;
04713        assert(ofs >= mp->mp_lower + sizeof(indx_t));
04714        mp->mp_ptrs[indx] = ofs;
04715        mp->mp_upper = ofs;
04716        mp->mp_lower += sizeof(indx_t);
04717 
04718        /* Write the node data. */
04719        node = NODEPTR(mp, indx);
04720        node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
04721        node->mn_flags = flags;
04722        if (IS_LEAF(mp))
04723               SETDSZ(node,data->mv_size);
04724        else
04725               SETPGNO(node,pgno);
04726 
04727        if (key)
04728               memcpy(NODEKEY(node), key->mv_data, key->mv_size);
04729 
04730        if (IS_LEAF(mp)) {
04731               assert(key);
04732               if (ofp == NULL) {
04733                      if (F_ISSET(flags, F_BIGDATA))
04734                             memcpy(node->mn_data + key->mv_size, data->mv_data,
04735                                 sizeof(pgno_t));
04736                      else if (F_ISSET(flags, MDB_RESERVE))
04737                             data->mv_data = node->mn_data + key->mv_size;
04738                      else
04739                             memcpy(node->mn_data + key->mv_size, data->mv_data,
04740                                 data->mv_size);
04741               } else {
04742                      memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
04743                          sizeof(pgno_t));
04744                      if (F_ISSET(flags, MDB_RESERVE))
04745                             data->mv_data = METADATA(ofp);
04746                      else
04747                             memcpy(METADATA(ofp), data->mv_data, data->mv_size);
04748               }
04749        }
04750 
04751        return MDB_SUCCESS;
04752 }
04753 
04760 static void
04761 mdb_node_del(MDB_page *mp, indx_t indx, int ksize)
04762 {
04763        unsigned int   sz;
04764        indx_t         i, j, numkeys, ptr;
04765        MDB_node      *node;
04766        char          *base;
04767 
04768 #if MDB_DEBUG
04769        {
04770        pgno_t pgno;
04771        COPY_PGNO(pgno, mp->mp_pgno);
04772        DPRINTF("delete node %u on %s page %zu", indx,
04773            IS_LEAF(mp) ? "leaf" : "branch", pgno);
04774        }
04775 #endif
04776        assert(indx < NUMKEYS(mp));
04777 
04778        if (IS_LEAF2(mp)) {
04779               int x = NUMKEYS(mp) - 1 - indx;
04780               base = LEAF2KEY(mp, indx, ksize);
04781               if (x)
04782                      memmove(base, base + ksize, x * ksize);
04783               mp->mp_lower -= sizeof(indx_t);
04784               mp->mp_upper += ksize - sizeof(indx_t);
04785               return;
04786        }
04787 
04788        node = NODEPTR(mp, indx);
04789        sz = NODESIZE + node->mn_ksize;
04790        if (IS_LEAF(mp)) {
04791               if (F_ISSET(node->mn_flags, F_BIGDATA))
04792                      sz += sizeof(pgno_t);
04793               else
04794                      sz += NODEDSZ(node);
04795        }
04796        sz += sz & 1;
04797 
04798        ptr = mp->mp_ptrs[indx];
04799        numkeys = NUMKEYS(mp);
04800        for (i = j = 0; i < numkeys; i++) {
04801               if (i != indx) {
04802                      mp->mp_ptrs[j] = mp->mp_ptrs[i];
04803                      if (mp->mp_ptrs[i] < ptr)
04804                             mp->mp_ptrs[j] += sz;
04805                      j++;
04806               }
04807        }
04808 
04809        base = (char *)mp + mp->mp_upper;
04810        memmove(base + sz, base, ptr - mp->mp_upper);
04811 
04812        mp->mp_lower -= sizeof(indx_t);
04813        mp->mp_upper += sz;
04814 }
04815 
04820 static void
04821 mdb_node_shrink(MDB_page *mp, indx_t indx)
04822 {
04823        MDB_node *node;
04824        MDB_page *sp, *xp;
04825        char *base;
04826        int osize, nsize;
04827        int delta;
04828        indx_t         i, numkeys, ptr;
04829 
04830        node = NODEPTR(mp, indx);
04831        sp = (MDB_page *)NODEDATA(node);
04832        osize = NODEDSZ(node);
04833 
04834        delta = sp->mp_upper - sp->mp_lower;
04835        SETDSZ(node, osize - delta);
04836        xp = (MDB_page *)((char *)sp + delta);
04837 
04838        /* shift subpage upward */
04839        if (IS_LEAF2(sp)) {
04840               nsize = NUMKEYS(sp) * sp->mp_pad;
04841               memmove(METADATA(xp), METADATA(sp), nsize);
04842        } else {
04843               int i;
04844               nsize = osize - sp->mp_upper;
04845               numkeys = NUMKEYS(sp);
04846               for (i=numkeys-1; i>=0; i--)
04847                      xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
04848        }
04849        xp->mp_upper = sp->mp_lower;
04850        xp->mp_lower = sp->mp_lower;
04851        xp->mp_flags = sp->mp_flags;
04852        xp->mp_pad = sp->mp_pad;
04853        COPY_PGNO(xp->mp_pgno, mp->mp_pgno);
04854 
04855        /* shift lower nodes upward */
04856        ptr = mp->mp_ptrs[indx];
04857        numkeys = NUMKEYS(mp);
04858        for (i = 0; i < numkeys; i++) {
04859               if (mp->mp_ptrs[i] <= ptr)
04860                      mp->mp_ptrs[i] += delta;
04861        }
04862 
04863        base = (char *)mp + mp->mp_upper;
04864        memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
04865        mp->mp_upper += delta;
04866 }
04867 
04877 static void
04878 mdb_xcursor_init0(MDB_cursor *mc)
04879 {
04880        MDB_xcursor *mx = mc->mc_xcursor;
04881 
04882        mx->mx_cursor.mc_xcursor = NULL;
04883        mx->mx_cursor.mc_txn = mc->mc_txn;
04884        mx->mx_cursor.mc_db = &mx->mx_db;
04885        mx->mx_cursor.mc_dbx = &mx->mx_dbx;
04886        mx->mx_cursor.mc_dbi = mc->mc_dbi+1;
04887        mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
04888        mx->mx_cursor.mc_snum = 0;
04889        mx->mx_cursor.mc_top = 0;
04890        mx->mx_cursor.mc_flags = C_SUB;
04891        mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
04892        mx->mx_dbx.md_dcmp = NULL;
04893        mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
04894 }
04895 
04902 static void
04903 mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
04904 {
04905        MDB_xcursor *mx = mc->mc_xcursor;
04906 
04907        if (node->mn_flags & F_SUBDATA) {
04908               memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDB_db));
04909               mx->mx_cursor.mc_snum = 0;
04910               mx->mx_cursor.mc_flags = C_SUB;
04911        } else {
04912               MDB_page *fp = NODEDATA(node);
04913               mx->mx_db.md_pad = mc->mc_pg[mc->mc_top]->mp_pad;
04914               mx->mx_db.md_flags = 0;
04915               mx->mx_db.md_depth = 1;
04916               mx->mx_db.md_branch_pages = 0;
04917               mx->mx_db.md_leaf_pages = 1;
04918               mx->mx_db.md_overflow_pages = 0;
04919               mx->mx_db.md_entries = NUMKEYS(fp);
04920               COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
04921               mx->mx_cursor.mc_snum = 1;
04922               mx->mx_cursor.mc_flags = C_INITIALIZED|C_SUB;
04923               mx->mx_cursor.mc_top = 0;
04924               mx->mx_cursor.mc_pg[0] = fp;
04925               mx->mx_cursor.mc_ki[0] = 0;
04926               if (mc->mc_db->md_flags & MDB_DUPFIXED) {
04927                      mx->mx_db.md_flags = MDB_DUPFIXED;
04928                      mx->mx_db.md_pad = fp->mp_pad;
04929                      if (mc->mc_db->md_flags & MDB_INTEGERDUP)
04930                             mx->mx_db.md_flags |= MDB_INTEGERKEY;
04931               }
04932        }
04933        DPRINTF("Sub-db %u for db %u root page %zu", mx->mx_cursor.mc_dbi, mc->mc_dbi,
04934               mx->mx_db.md_root);
04935        mx->mx_dbflag = (F_ISSET(mc->mc_pg[mc->mc_top]->mp_flags, P_DIRTY)) ?
04936               DB_DIRTY : 0;
04937        mx->mx_dbx.md_name.mv_data = NODEKEY(node);
04938        mx->mx_dbx.md_name.mv_size = node->mn_ksize;
04939 #if UINT_MAX < SIZE_MAX
04940        if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t))
04941 #ifdef MISALIGNED_OK
04942               mx->mx_dbx.md_cmp = mdb_cmp_long;
04943 #else
04944               mx->mx_dbx.md_cmp = mdb_cmp_cint;
04945 #endif
04946 #endif
04947 }
04948 
04950 static void
04951 mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
04952 {
04953        mc->mc_orig = NULL;
04954        mc->mc_dbi = dbi;
04955        mc->mc_txn = txn;
04956        mc->mc_db = &txn->mt_dbs[dbi];
04957        mc->mc_dbx = &txn->mt_dbxs[dbi];
04958        mc->mc_dbflag = &txn->mt_dbflags[dbi];
04959        mc->mc_snum = 0;
04960        mc->mc_top = 0;
04961        mc->mc_flags = 0;
04962        if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
04963               assert(mx != NULL);
04964               mc->mc_xcursor = mx;
04965               mdb_xcursor_init0(mc);
04966        } else {
04967               mc->mc_xcursor = NULL;
04968        }
04969 }
04970 
04971 int
04972 mdb_cursor_open(MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
04973 {
04974        MDB_cursor    *mc;
04975        MDB_xcursor   *mx = NULL;
04976        size_t size = sizeof(MDB_cursor);
04977 
04978        if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs)
04979               return EINVAL;
04980 
04981        /* Allow read access to the freelist */
04982        if (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
04983               return EINVAL;
04984 
04985        if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT)
04986               size += sizeof(MDB_xcursor);
04987 
04988        if ((mc = malloc(size)) != NULL) {
04989               if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
04990                      mx = (MDB_xcursor *)(mc + 1);
04991               }
04992               mdb_cursor_init(mc, txn, dbi, mx);
04993               if (txn->mt_cursors) {
04994                      mc->mc_next = txn->mt_cursors[dbi];
04995                      txn->mt_cursors[dbi] = mc;
04996               }
04997               mc->mc_flags |= C_ALLOCD;
04998        } else {
04999               return ENOMEM;
05000        }
05001 
05002        *ret = mc;
05003 
05004        return MDB_SUCCESS;
05005 }
05006 
05007 /* Return the count of duplicate data items for the current key */
05008 int
05009 mdb_cursor_count(MDB_cursor *mc, size_t *countp)
05010 {
05011        MDB_node      *leaf;
05012 
05013        if (mc == NULL || countp == NULL)
05014               return EINVAL;
05015 
05016        if (!(mc->mc_db->md_flags & MDB_DUPSORT))
05017               return EINVAL;
05018 
05019        leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
05020        if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
05021               *countp = 1;
05022        } else {
05023               if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
05024                      return EINVAL;
05025 
05026               *countp = mc->mc_xcursor->mx_db.md_entries;
05027        }
05028        return MDB_SUCCESS;
05029 }
05030 
05031 void
05032 mdb_cursor_close(MDB_cursor *mc)
05033 {
05034        if (mc != NULL) {
05035               /* remove from txn, if tracked */
05036               if (mc->mc_txn->mt_cursors) {
05037                      MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
05038                      while (*prev && *prev != mc) prev = &(*prev)->mc_next;
05039                      if (*prev == mc)
05040                             *prev = mc->mc_next;
05041               }
05042               if (mc->mc_flags & C_ALLOCD)
05043                      free(mc);
05044        }
05045 }
05046 
05047 MDB_txn *
05048 mdb_cursor_txn(MDB_cursor *mc)
05049 {
05050        if (!mc) return NULL;
05051        return mc->mc_txn;
05052 }
05053 
05054 MDB_dbi
05055 mdb_cursor_dbi(MDB_cursor *mc)
05056 {
05057        if (!mc) return 0;
05058        return mc->mc_dbi;
05059 }
05060 
05067 static int
05068 mdb_update_key(MDB_page *mp, indx_t indx, MDB_val *key)
05069 {
05070        MDB_node             *node;
05071        char                 *base;
05072        size_t                len;
05073        int                   delta, delta0;
05074        indx_t                ptr, i, numkeys;
05075        DKBUF;
05076 
05077        node = NODEPTR(mp, indx);
05078        ptr = mp->mp_ptrs[indx];
05079 #if MDB_DEBUG
05080        {
05081               MDB_val       k2;
05082               char kbuf2[(MAXKEYSIZE*2+1)];
05083               k2.mv_data = NODEKEY(node);
05084               k2.mv_size = node->mn_ksize;
05085               DPRINTF("update key %u (ofs %u) [%s] to [%s] on page %zu",
05086                      indx, ptr,
05087                      mdb_dkey(&k2, kbuf2),
05088                      DKEY(key),
05089                      mp->mp_pgno);
05090        }
05091 #endif
05092 
05093        delta0 = delta = key->mv_size - node->mn_ksize;
05094 
05095        /* Must be 2-byte aligned. If new key is
05096         * shorter by 1, the shift will be skipped.
05097         */
05098        delta += (delta & 1);
05099        if (delta) {
05100               if (delta > 0 && SIZELEFT(mp) < delta) {
05101                      DPRINTF("OUCH! Not enough room, delta = %d", delta);
05102                      return ENOSPC;
05103               }
05104 
05105               numkeys = NUMKEYS(mp);
05106               for (i = 0; i < numkeys; i++) {
05107                      if (mp->mp_ptrs[i] <= ptr)
05108                             mp->mp_ptrs[i] -= delta;
05109               }
05110 
05111               base = (char *)mp + mp->mp_upper;
05112               len = ptr - mp->mp_upper + NODESIZE;
05113               memmove(base - delta, base, len);
05114               mp->mp_upper -= delta;
05115 
05116               node = NODEPTR(mp, indx);
05117        }
05118 
05119        /* But even if no shift was needed, update ksize */
05120        if (delta0)
05121               node->mn_ksize = key->mv_size;
05122 
05123        if (key->mv_size)
05124               memcpy(NODEKEY(node), key->mv_data, key->mv_size);
05125 
05126        return MDB_SUCCESS;
05127 }
05128 
05131 static int
05132 mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
05133 {
05134        int                   rc;
05135        MDB_node             *srcnode;
05136        MDB_val               key, data;
05137        pgno_t srcpg;
05138        unsigned short flags;
05139 
05140        DKBUF;
05141 
05142        /* Mark src and dst as dirty. */
05143        if ((rc = mdb_page_touch(csrc)) ||
05144            (rc = mdb_page_touch(cdst)))
05145               return rc;
05146 
05147        if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
05148               srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); /* fake */
05149               key.mv_size = csrc->mc_db->md_pad;
05150               key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
05151               data.mv_size = 0;
05152               data.mv_data = NULL;
05153               srcpg = 0;
05154               flags = 0;
05155        } else {
05156               srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
05157               assert(!((long)srcnode&1));
05158               srcpg = NODEPGNO(srcnode);
05159               flags = srcnode->mn_flags;
05160               if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
05161                      unsigned int snum = csrc->mc_snum;
05162                      MDB_node *s2;
05163                      /* must find the lowest key below src */
05164                      mdb_page_search_root(csrc, NULL, 0);
05165                      s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
05166                      key.mv_size = NODEKSZ(s2);
05167                      key.mv_data = NODEKEY(s2);
05168                      csrc->mc_snum = snum--;
05169                      csrc->mc_top = snum;
05170               } else {
05171                      key.mv_size = NODEKSZ(srcnode);
05172                      key.mv_data = NODEKEY(srcnode);
05173               }
05174               data.mv_size = NODEDSZ(srcnode);
05175               data.mv_data = NODEDATA(srcnode);
05176        }
05177        if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
05178               unsigned int snum = cdst->mc_snum;
05179               MDB_node *s2;
05180               MDB_val bkey;
05181               /* must find the lowest key below dst */
05182               mdb_page_search_root(cdst, NULL, 0);
05183               s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
05184               bkey.mv_size = NODEKSZ(s2);
05185               bkey.mv_data = NODEKEY(s2);
05186               cdst->mc_snum = snum--;
05187               cdst->mc_top = snum;
05188               rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &bkey);
05189        }
05190 
05191        DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
05192            IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
05193            csrc->mc_ki[csrc->mc_top],
05194               DKEY(&key),
05195            csrc->mc_pg[csrc->mc_top]->mp_pgno,
05196            cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno);
05197 
05198        /* Add the node to the destination page.
05199         */
05200        rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
05201        if (rc != MDB_SUCCESS)
05202               return rc;
05203 
05204        /* Delete the node from the source page.
05205         */
05206        mdb_node_del(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
05207 
05208        {
05209               /* Adjust other cursors pointing to mp */
05210               MDB_cursor *m2, *m3;
05211               MDB_dbi dbi = csrc->mc_dbi;
05212               MDB_page *mp = csrc->mc_pg[csrc->mc_top];
05213 
05214               if (csrc->mc_flags & C_SUB)
05215                      dbi--;
05216 
05217               for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
05218                      if (m2 == csrc) continue;
05219                      if (csrc->mc_flags & C_SUB)
05220                             m3 = &m2->mc_xcursor->mx_cursor;
05221                      else
05222                             m3 = m2;
05223                      if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
05224                             csrc->mc_ki[csrc->mc_top]) {
05225                             m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
05226                             m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
05227                      }
05228               }
05229        }
05230 
05231        /* Update the parent separators.
05232         */
05233        if (csrc->mc_ki[csrc->mc_top] == 0) {
05234               if (csrc->mc_ki[csrc->mc_top-1] != 0) {
05235                      if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
05236                             key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
05237                      } else {
05238                             srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
05239                             key.mv_size = NODEKSZ(srcnode);
05240                             key.mv_data = NODEKEY(srcnode);
05241                      }
05242                      DPRINTF("update separator for source page %zu to [%s]",
05243                             csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key));
05244                      if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1],
05245                             &key)) != MDB_SUCCESS)
05246                             return rc;
05247               }
05248               if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
05249                      MDB_val        nullkey;
05250                      nullkey.mv_size = 0;
05251                      rc = mdb_update_key(csrc->mc_pg[csrc->mc_top], 0, &nullkey);
05252                      assert(rc == MDB_SUCCESS);
05253               }
05254        }
05255 
05256        if (cdst->mc_ki[cdst->mc_top] == 0) {
05257               if (cdst->mc_ki[cdst->mc_top-1] != 0) {
05258                      if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
05259                             key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
05260                      } else {
05261                             srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
05262                             key.mv_size = NODEKSZ(srcnode);
05263                             key.mv_data = NODEKEY(srcnode);
05264                      }
05265                      DPRINTF("update separator for destination page %zu to [%s]",
05266                             cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key));
05267                      if ((rc = mdb_update_key(cdst->mc_pg[cdst->mc_top-1], cdst->mc_ki[cdst->mc_top-1],
05268                             &key)) != MDB_SUCCESS)
05269                             return rc;
05270               }
05271               if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
05272                      MDB_val        nullkey;
05273                      nullkey.mv_size = 0;
05274                      rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &nullkey);
05275                      assert(rc == MDB_SUCCESS);
05276               }
05277        }
05278 
05279        return MDB_SUCCESS;
05280 }
05281 
05289 static int
05290 mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
05291 {
05292        int                   rc;
05293        indx_t                i, j;
05294        MDB_node             *srcnode;
05295        MDB_val               key, data;
05296        unsigned      nkeys;
05297 
05298        DPRINTF("merging page %zu into %zu", csrc->mc_pg[csrc->mc_top]->mp_pgno,
05299               cdst->mc_pg[cdst->mc_top]->mp_pgno);
05300 
05301        assert(csrc->mc_snum > 1);  /* can't merge root page */
05302        assert(cdst->mc_snum > 1);
05303 
05304        /* Mark dst as dirty. */
05305        if ((rc = mdb_page_touch(cdst)))
05306               return rc;
05307 
05308        /* Move all nodes from src to dst.
05309         */
05310        j = nkeys = NUMKEYS(cdst->mc_pg[cdst->mc_top]);
05311        if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
05312               key.mv_size = csrc->mc_db->md_pad;
05313               key.mv_data = METADATA(csrc->mc_pg[csrc->mc_top]);
05314               for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
05315                      rc = mdb_node_add(cdst, j, &key, NULL, 0, 0);
05316                      if (rc != MDB_SUCCESS)
05317                             return rc;
05318                      key.mv_data = (char *)key.mv_data + key.mv_size;
05319               }
05320        } else {
05321               for (i = 0; i < NUMKEYS(csrc->mc_pg[csrc->mc_top]); i++, j++) {
05322                      srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], i);
05323                      if (i == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
05324                             unsigned int snum = csrc->mc_snum;
05325                             MDB_node *s2;
05326                             /* must find the lowest key below src */
05327                             mdb_page_search_root(csrc, NULL, 0);
05328                             s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
05329                             key.mv_size = NODEKSZ(s2);
05330                             key.mv_data = NODEKEY(s2);
05331                             csrc->mc_snum = snum--;
05332                             csrc->mc_top = snum;
05333                      } else {
05334                             key.mv_size = srcnode->mn_ksize;
05335                             key.mv_data = NODEKEY(srcnode);
05336                      }
05337 
05338                      data.mv_size = NODEDSZ(srcnode);
05339                      data.mv_data = NODEDATA(srcnode);
05340                      rc = mdb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
05341                      if (rc != MDB_SUCCESS)
05342                             return rc;
05343               }
05344        }
05345 
05346        DPRINTF("dst page %zu now has %u keys (%.1f%% filled)",
05347            cdst->mc_pg[cdst->mc_top]->mp_pgno, NUMKEYS(cdst->mc_pg[cdst->mc_top]), (float)PAGEFILL(cdst->mc_txn->mt_env, cdst->mc_pg[cdst->mc_top]) / 10);
05348 
05349        /* Unlink the src page from parent and add to free list.
05350         */
05351        mdb_node_del(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1], 0);
05352        if (csrc->mc_ki[csrc->mc_top-1] == 0) {
05353               key.mv_size = 0;
05354               if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], 0, &key)) != MDB_SUCCESS)
05355                      return rc;
05356        }
05357 
05358        mdb_midl_append(&csrc->mc_txn->mt_free_pgs, csrc->mc_pg[csrc->mc_top]->mp_pgno);
05359        if (IS_LEAF(csrc->mc_pg[csrc->mc_top]))
05360               csrc->mc_db->md_leaf_pages--;
05361        else
05362               csrc->mc_db->md_branch_pages--;
05363        {
05364               /* Adjust other cursors pointing to mp */
05365               MDB_cursor *m2, *m3;
05366               MDB_dbi dbi = csrc->mc_dbi;
05367               MDB_page *mp = cdst->mc_pg[cdst->mc_top];
05368 
05369               if (csrc->mc_flags & C_SUB)
05370                      dbi--;
05371 
05372               for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
05373                      if (csrc->mc_flags & C_SUB)
05374                             m3 = &m2->mc_xcursor->mx_cursor;
05375                      else
05376                             m3 = m2;
05377                      if (m3 == csrc) continue;
05378                      if (m3->mc_snum < csrc->mc_snum) continue;
05379                      if (m3->mc_pg[csrc->mc_top] == csrc->mc_pg[csrc->mc_top]) {
05380                             m3->mc_pg[csrc->mc_top] = mp;
05381                             m3->mc_ki[csrc->mc_top] += nkeys;
05382                      }
05383               }
05384        }
05385        mdb_cursor_pop(csrc);
05386 
05387        return mdb_rebalance(csrc);
05388 }
05389 
05394 static void
05395 mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst)
05396 {
05397        unsigned int i;
05398 
05399        cdst->mc_txn = csrc->mc_txn;
05400        cdst->mc_dbi = csrc->mc_dbi;
05401        cdst->mc_db  = csrc->mc_db;
05402        cdst->mc_dbx = csrc->mc_dbx;
05403        cdst->mc_snum = csrc->mc_snum;
05404        cdst->mc_top = csrc->mc_top;
05405        cdst->mc_flags = csrc->mc_flags;
05406 
05407        for (i=0; i<csrc->mc_snum; i++) {
05408               cdst->mc_pg[i] = csrc->mc_pg[i];
05409               cdst->mc_ki[i] = csrc->mc_ki[i];
05410        }
05411 }
05412 
05418 static int
05419 mdb_rebalance(MDB_cursor *mc)
05420 {
05421        MDB_node      *node;
05422        int rc;
05423        unsigned int ptop;
05424        MDB_cursor    mn;
05425 
05426 #if MDB_DEBUG
05427        {
05428        pgno_t pgno;
05429        COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
05430        DPRINTF("rebalancing %s page %zu (has %u keys, %.1f%% full)",
05431            IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
05432            pgno, NUMKEYS(mc->mc_pg[mc->mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10);
05433        }
05434 #endif
05435 
05436        if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= FILL_THRESHOLD) {
05437 #if MDB_DEBUG
05438               pgno_t pgno;
05439               COPY_PGNO(pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
05440               DPRINTF("no need to rebalance page %zu, above fill threshold",
05441                   pgno);
05442 #endif
05443               return MDB_SUCCESS;
05444        }
05445 
05446        if (mc->mc_snum < 2) {
05447               MDB_page *mp = mc->mc_pg[0];
05448               if (NUMKEYS(mp) == 0) {
05449                      DPUTS("tree is completely empty");
05450                      mc->mc_db->md_root = P_INVALID;
05451                      mc->mc_db->md_depth = 0;
05452                      mc->mc_db->md_leaf_pages = 0;
05453                      mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
05454                      mc->mc_snum = 0;
05455                      mc->mc_top = 0;
05456                      {
05457                             /* Adjust other cursors pointing to mp */
05458                             MDB_cursor *m2, *m3;
05459                             MDB_dbi dbi = mc->mc_dbi;
05460 
05461                             if (mc->mc_flags & C_SUB)
05462                                    dbi--;
05463 
05464                             for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
05465                                    if (m2 == mc) continue;
05466                                    if (mc->mc_flags & C_SUB)
05467                                           m3 = &m2->mc_xcursor->mx_cursor;
05468                                    else
05469                                           m3 = m2;
05470                                    if (m3->mc_snum < mc->mc_snum) continue;
05471                                    if (m3->mc_pg[0] == mp) {
05472                                           m3->mc_snum = 0;
05473                                           m3->mc_top = 0;
05474                                    }
05475                             }
05476                      }
05477               } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
05478                      DPUTS("collapsing root page!");
05479                      mdb_midl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
05480                      mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
05481                      if ((rc = mdb_page_get(mc->mc_txn, mc->mc_db->md_root,
05482                             &mc->mc_pg[0])))
05483                             return rc;
05484                      mc->mc_db->md_depth--;
05485                      mc->mc_db->md_branch_pages--;
05486                      {
05487                             /* Adjust other cursors pointing to mp */
05488                             MDB_cursor *m2, *m3;
05489                             MDB_dbi dbi = mc->mc_dbi;
05490 
05491                             if (mc->mc_flags & C_SUB)
05492                                    dbi--;
05493 
05494                             for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
05495                                    if (m2 == mc) continue;
05496                                    if (mc->mc_flags & C_SUB)
05497                                           m3 = &m2->mc_xcursor->mx_cursor;
05498                                    else
05499                                           m3 = m2;
05500                                    if (m3->mc_snum < mc->mc_snum) continue;
05501                                    if (m3->mc_pg[0] == mp) {
05502                                           m3->mc_pg[0] = mc->mc_pg[0];
05503                                    }
05504                             }
05505                      }
05506               } else
05507                      DPUTS("root page doesn't need rebalancing");
05508               return MDB_SUCCESS;
05509        }
05510 
05511        /* The parent (branch page) must have at least 2 pointers,
05512         * otherwise the tree is invalid.
05513         */
05514        ptop = mc->mc_top-1;
05515        assert(NUMKEYS(mc->mc_pg[ptop]) > 1);
05516 
05517        /* Leaf page fill factor is below the threshold.
05518         * Try to move keys from left or right neighbor, or
05519         * merge with a neighbor page.
05520         */
05521 
05522        /* Find neighbors.
05523         */
05524        mdb_cursor_copy(mc, &mn);
05525        mn.mc_xcursor = NULL;
05526 
05527        if (mc->mc_ki[ptop] == 0) {
05528               /* We're the leftmost leaf in our parent.
05529                */
05530               DPUTS("reading right neighbor");
05531               mn.mc_ki[ptop]++;
05532               node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
05533               if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top])))
05534                      return rc;
05535               mn.mc_ki[mn.mc_top] = 0;
05536               mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
05537        } else {
05538               /* There is at least one neighbor to the left.
05539                */
05540               DPUTS("reading left neighbor");
05541               mn.mc_ki[ptop]--;
05542               node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
05543               if ((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mn.mc_pg[mn.mc_top])))
05544                      return rc;
05545               mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
05546               mc->mc_ki[mc->mc_top] = 0;
05547        }
05548 
05549        DPRINTF("found neighbor page %zu (%u keys, %.1f%% full)",
05550            mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]), (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10);
05551 
05552        /* If the neighbor page is above threshold and has at least two
05553         * keys, move one key from it.
05554         *
05555         * Otherwise we should try to merge them.
05556         */
05557        if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= FILL_THRESHOLD && NUMKEYS(mn.mc_pg[mn.mc_top]) >= 2)
05558               return mdb_node_move(&mn, mc);
05559        else { /* FIXME: if (has_enough_room()) */
05560               mc->mc_flags &= ~C_INITIALIZED;
05561               if (mc->mc_ki[ptop] == 0)
05562                      return mdb_page_merge(&mn, mc);
05563               else
05564                      return mdb_page_merge(mc, &mn);
05565        }
05566 }
05567 
05569 static int
05570 mdb_cursor_del0(MDB_cursor *mc, MDB_node *leaf)
05571 {
05572        int rc;
05573 
05574        /* add overflow pages to free list */
05575        if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) {
05576               int i, ovpages;
05577               pgno_t pg;
05578 
05579               memcpy(&pg, NODEDATA(leaf), sizeof(pg));
05580               ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
05581               mc->mc_db->md_overflow_pages -= ovpages;
05582               for (i=0; i<ovpages; i++) {
05583                      DPRINTF("freed ov page %zu", pg);
05584                      mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
05585                      pg++;
05586               }
05587        }
05588        mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad);
05589        mc->mc_db->md_entries--;
05590        rc = mdb_rebalance(mc);
05591        if (rc != MDB_SUCCESS)
05592               mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
05593 
05594        return rc;
05595 }
05596 
05597 int
05598 mdb_del(MDB_txn *txn, MDB_dbi dbi,
05599     MDB_val *key, MDB_val *data)
05600 {
05601        MDB_cursor mc;
05602        MDB_xcursor mx;
05603        MDB_cursor_op op;
05604        MDB_val rdata, *xdata;
05605        int            rc, exact;
05606        DKBUF;
05607 
05608        assert(key != NULL);
05609 
05610        DPRINTF("====> delete db %u key [%s]", dbi, DKEY(key));
05611 
05612        if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
05613               return EINVAL;
05614 
05615        if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
05616               return EACCES;
05617        }
05618 
05619        if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
05620               return EINVAL;
05621        }
05622 
05623        mdb_cursor_init(&mc, txn, dbi, &mx);
05624 
05625        exact = 0;
05626        if (data) {
05627               op = MDB_GET_BOTH;
05628               rdata = *data;
05629               xdata = &rdata;
05630        } else {
05631               op = MDB_SET;
05632               xdata = NULL;
05633        }
05634        rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
05635        if (rc == 0)
05636               rc = mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA);
05637        return rc;
05638 }
05639 
05649 static int
05650 mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno,
05651        unsigned int nflags)
05652 {
05653        unsigned int flags;
05654        int            rc = MDB_SUCCESS, ins_new = 0, new_root = 0, newpos = 1;
05655        indx_t         newindx;
05656        pgno_t         pgno = 0;
05657        unsigned int   i, j, split_indx, nkeys, pmax;
05658        MDB_node      *node;
05659        MDB_val        sepkey, rkey, xdata, *rdata = &xdata;
05660        MDB_page      *copy;
05661        MDB_page      *mp, *rp, *pp;
05662        unsigned int ptop;
05663        MDB_cursor    mn;
05664        DKBUF;
05665 
05666        mp = mc->mc_pg[mc->mc_top];
05667        newindx = mc->mc_ki[mc->mc_top];
05668 
05669        DPRINTF("-----> splitting %s page %zu and adding [%s] at index %i",
05670            IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
05671            DKEY(newkey), mc->mc_ki[mc->mc_top]);
05672 
05673        /* Create a right sibling. */
05674        if ((rp = mdb_page_new(mc, mp->mp_flags, 1)) == NULL)
05675               return ENOMEM;
05676        DPRINTF("new right sibling: page %zu", rp->mp_pgno);
05677 
05678        if (mc->mc_snum < 2) {
05679               if ((pp = mdb_page_new(mc, P_BRANCH, 1)) == NULL)
05680                      return ENOMEM;
05681               /* shift current top to make room for new parent */
05682               mc->mc_pg[1] = mc->mc_pg[0];
05683               mc->mc_ki[1] = mc->mc_ki[0];
05684               mc->mc_pg[0] = pp;
05685               mc->mc_ki[0] = 0;
05686               mc->mc_db->md_root = pp->mp_pgno;
05687               DPRINTF("root split! new root = %zu", pp->mp_pgno);
05688               mc->mc_db->md_depth++;
05689               new_root = 1;
05690 
05691               /* Add left (implicit) pointer. */
05692               if ((rc = mdb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != MDB_SUCCESS) {
05693                      /* undo the pre-push */
05694                      mc->mc_pg[0] = mc->mc_pg[1];
05695                      mc->mc_ki[0] = mc->mc_ki[1];
05696                      mc->mc_db->md_root = mp->mp_pgno;
05697                      mc->mc_db->md_depth--;
05698                      return rc;
05699               }
05700               mc->mc_snum = 2;
05701               mc->mc_top = 1;
05702               ptop = 0;
05703        } else {
05704               ptop = mc->mc_top-1;
05705               DPRINTF("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno);
05706        }
05707 
05708        mdb_cursor_copy(mc, &mn);
05709        mn.mc_pg[mn.mc_top] = rp;
05710        mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
05711 
05712        if (nflags & MDB_APPEND) {
05713               mn.mc_ki[mn.mc_top] = 0;
05714               sepkey = *newkey;
05715               nkeys = 0;
05716               split_indx = 0;
05717               goto newsep;
05718        }
05719 
05720        nkeys = NUMKEYS(mp);
05721        split_indx = (nkeys + 1) / 2;
05722 
05723        if (IS_LEAF2(rp)) {
05724               char *split, *ins;
05725               int x;
05726               unsigned int lsize, rsize, ksize;
05727               /* Move half of the keys to the right sibling */
05728               copy = NULL;
05729               x = mc->mc_ki[mc->mc_top] - split_indx;
05730               ksize = mc->mc_db->md_pad;
05731               split = LEAF2KEY(mp, split_indx, ksize);
05732               rsize = (nkeys - split_indx) * ksize;
05733               lsize = (nkeys - split_indx) * sizeof(indx_t);
05734               mp->mp_lower -= lsize;
05735               rp->mp_lower += lsize;
05736               mp->mp_upper += rsize - lsize;
05737               rp->mp_upper -= rsize - lsize;
05738               sepkey.mv_size = ksize;
05739               if (newindx == split_indx) {
05740                      sepkey.mv_data = newkey->mv_data;
05741               } else {
05742                      sepkey.mv_data = split;
05743               }
05744               if (x<0) {
05745                      ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
05746                      memcpy(rp->mp_ptrs, split, rsize);
05747                      sepkey.mv_data = rp->mp_ptrs;
05748                      memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
05749                      memcpy(ins, newkey->mv_data, ksize);
05750                      mp->mp_lower += sizeof(indx_t);
05751                      mp->mp_upper -= ksize - sizeof(indx_t);
05752               } else {
05753                      if (x)
05754                             memcpy(rp->mp_ptrs, split, x * ksize);
05755                      ins = LEAF2KEY(rp, x, ksize);
05756                      memcpy(ins, newkey->mv_data, ksize);
05757                      memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
05758                      rp->mp_lower += sizeof(indx_t);
05759                      rp->mp_upper -= ksize - sizeof(indx_t);
05760                      mc->mc_ki[mc->mc_top] = x;
05761                      mc->mc_pg[mc->mc_top] = rp;
05762               }
05763               goto newsep;
05764        }
05765 
05766        /* For leaf pages, check the split point based on what
05767         * fits where, since otherwise mdb_node_add can fail.
05768         *
05769         * This check is only needed when the data items are
05770         * relatively large, such that being off by one will
05771         * make the difference between success or failure.
05772         * When the size of the data items is much smaller than
05773         * one-half of a page, this check is irrelevant.
05774         */
05775        if (IS_LEAF(mp)) {
05776               unsigned int psize, nsize;
05777               /* Maximum free space in an empty page */
05778               pmax = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
05779               nsize = mdb_leaf_size(mc->mc_txn->mt_env, newkey, newdata);
05780               if ((nkeys < 20) || (nsize > pmax/4)) {
05781                      if (newindx <= split_indx) {
05782                             psize = nsize;
05783                             newpos = 0;
05784                             for (i=0; i<split_indx; i++) {
05785                                    node = NODEPTR(mp, i);
05786                                    psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
05787                                    if (F_ISSET(node->mn_flags, F_BIGDATA))
05788                                           psize += sizeof(pgno_t);
05789                                    else
05790                                           psize += NODEDSZ(node);
05791                                    psize += psize & 1;
05792                                    if (psize > pmax) {
05793                                           if (i == split_indx - 1 && newindx == split_indx)
05794                                                  newpos = 1;
05795                                           else
05796                                                  split_indx = i;
05797                                           break;
05798                                    }
05799                             }
05800                      } else {
05801                             psize = nsize;
05802                             for (i=nkeys-1; i>=split_indx; i--) {
05803                                    node = NODEPTR(mp, i);
05804                                    psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
05805                                    if (F_ISSET(node->mn_flags, F_BIGDATA))
05806                                           psize += sizeof(pgno_t);
05807                                    else
05808                                           psize += NODEDSZ(node);
05809                                    psize += psize & 1;
05810                                    if (psize > pmax) {
05811                                           split_indx = i+1;
05812                                           break;
05813                                    }
05814                             }
05815                      }
05816               }
05817        }
05818 
05819        /* First find the separating key between the split pages.
05820         * The case where newindx == split_indx is ambiguous; the
05821         * new item could go to the new page or stay on the original
05822         * page. If newpos == 1 it goes to the new page.
05823         */
05824        if (newindx == split_indx && newpos) {
05825               sepkey.mv_size = newkey->mv_size;
05826               sepkey.mv_data = newkey->mv_data;
05827        } else {
05828               node = NODEPTR(mp, split_indx);
05829               sepkey.mv_size = node->mn_ksize;
05830               sepkey.mv_data = NODEKEY(node);
05831        }
05832 
05833 newsep:
05834        DPRINTF("separator is [%s]", DKEY(&sepkey));
05835 
05836        /* Copy separator key to the parent.
05837         */
05838        if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(mc->mc_txn->mt_env, &sepkey)) {
05839               mn.mc_snum--;
05840               mn.mc_top--;
05841               rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
05842 
05843               /* Right page might now have changed parent.
05844                * Check if left page also changed parent.
05845                */
05846               if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
05847                   mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
05848                      mc->mc_pg[ptop] = mn.mc_pg[ptop];
05849                      mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
05850               }
05851        } else {
05852               mn.mc_top--;
05853               rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
05854               mn.mc_top++;
05855        }
05856        if (rc != MDB_SUCCESS) {
05857               return rc;
05858        }
05859        if (nflags & MDB_APPEND) {
05860               mc->mc_pg[mc->mc_top] = rp;
05861               mc->mc_ki[mc->mc_top] = 0;
05862               rc = mdb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
05863               if (rc)
05864                      return rc;
05865               goto done;
05866        }
05867        if (IS_LEAF2(rp)) {
05868               goto done;
05869        }
05870 
05871        /* Move half of the keys to the right sibling. */
05872 
05873        /* grab a page to hold a temporary copy */
05874        copy = mdb_page_malloc(mc);
05875        if (copy == NULL)
05876               return ENOMEM;
05877 
05878        copy->mp_pgno  = mp->mp_pgno;
05879        copy->mp_flags = mp->mp_flags;
05880        copy->mp_lower = PAGEHDRSZ;
05881        copy->mp_upper = mc->mc_txn->mt_env->me_psize;
05882        mc->mc_pg[mc->mc_top] = copy;
05883        for (i = j = 0; i <= nkeys; j++) {
05884               if (i == split_indx) {
05885               /* Insert in right sibling. */
05886               /* Reset insert index for right sibling. */
05887                      if (i != newindx || (newpos ^ ins_new)) {
05888                             j = 0;
05889                             mc->mc_pg[mc->mc_top] = rp;
05890                      }
05891               }
05892 
05893               if (i == newindx && !ins_new) {
05894                      /* Insert the original entry that caused the split. */
05895                      rkey.mv_data = newkey->mv_data;
05896                      rkey.mv_size = newkey->mv_size;
05897                      if (IS_LEAF(mp)) {
05898                             rdata = newdata;
05899                      } else
05900                             pgno = newpgno;
05901                      flags = nflags;
05902 
05903                      ins_new = 1;
05904 
05905                      /* Update index for the new key. */
05906                      mc->mc_ki[mc->mc_top] = j;
05907               } else if (i == nkeys) {
05908                      break;
05909               } else {
05910                      node = NODEPTR(mp, i);
05911                      rkey.mv_data = NODEKEY(node);
05912                      rkey.mv_size = node->mn_ksize;
05913                      if (IS_LEAF(mp)) {
05914                             xdata.mv_data = NODEDATA(node);
05915                             xdata.mv_size = NODEDSZ(node);
05916                             rdata = &xdata;
05917                      } else
05918                             pgno = NODEPGNO(node);
05919                      flags = node->mn_flags;
05920 
05921                      i++;
05922               }
05923 
05924               if (!IS_LEAF(mp) && j == 0) {
05925                      /* First branch index doesn't need key data. */
05926                      rkey.mv_size = 0;
05927               }
05928 
05929               rc = mdb_node_add(mc, j, &rkey, rdata, pgno, flags);
05930               if (rc) break;
05931        }
05932 
05933        nkeys = NUMKEYS(copy);
05934        for (i=0; i<nkeys; i++)
05935               mp->mp_ptrs[i] = copy->mp_ptrs[i];
05936        mp->mp_lower = copy->mp_lower;
05937        mp->mp_upper = copy->mp_upper;
05938        memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
05939               mc->mc_txn->mt_env->me_psize - copy->mp_upper);
05940 
05941        /* reset back to original page */
05942        if (newindx < split_indx || (!newpos && newindx == split_indx)) {
05943               mc->mc_pg[mc->mc_top] = mp;
05944               if (nflags & MDB_RESERVE) {
05945                      node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
05946                      if (!(node->mn_flags & F_BIGDATA))
05947                             newdata->mv_data = NODEDATA(node);
05948               }
05949        }
05950 
05951        /* return tmp page to freelist */
05952        copy->mp_next = mc->mc_txn->mt_env->me_dpages;
05953        VGMEMP_FREE(mc->mc_txn->mt_env, copy);
05954        mc->mc_txn->mt_env->me_dpages = copy;
05955 done:
05956        {
05957               /* Adjust other cursors pointing to mp */
05958               MDB_cursor *m2, *m3;
05959               MDB_dbi dbi = mc->mc_dbi;
05960 
05961               if (mc->mc_flags & C_SUB)
05962                      dbi--;
05963 
05964               for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
05965                      if (m2 == mc) continue;
05966                      if (mc->mc_flags & C_SUB)
05967                             m3 = &m2->mc_xcursor->mx_cursor;
05968                      else
05969                             m3 = m2;
05970                      if (!(m3->mc_flags & C_INITIALIZED))
05971                             continue;
05972                      if (new_root) {
05973                             int k;
05974                             /* root split */
05975                             for (k=m3->mc_top; k>=0; k--) {
05976                                    m3->mc_ki[k+1] = m3->mc_ki[k];
05977                                    m3->mc_pg[k+1] = m3->mc_pg[k];
05978                             }
05979                             m3->mc_ki[0] = mc->mc_ki[0];
05980                             m3->mc_pg[0] = mc->mc_pg[0];
05981                             m3->mc_snum++;
05982                             m3->mc_top++;
05983                      }
05984                      if (m3->mc_pg[mc->mc_top] == mp) {
05985                             if (m3->mc_ki[m3->mc_top] >= split_indx) {
05986                                    m3->mc_pg[m3->mc_top] = rp;
05987                                    m3->mc_ki[m3->mc_top] -= split_indx;
05988                             }
05989                      }
05990               }
05991        }
05992        return rc;
05993 }
05994 
05995 int
05996 mdb_put(MDB_txn *txn, MDB_dbi dbi,
05997     MDB_val *key, MDB_val *data, unsigned int flags)
05998 {
05999        MDB_cursor mc;
06000        MDB_xcursor mx;
06001 
06002        assert(key != NULL);
06003        assert(data != NULL);
06004 
06005        if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
06006               return EINVAL;
06007 
06008        if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)) {
06009               return EACCES;
06010        }
06011 
06012        if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
06013               return EINVAL;
06014        }
06015 
06016        if ((flags & (MDB_NOOVERWRITE|MDB_NODUPDATA|MDB_RESERVE|MDB_APPEND)) != flags)
06017               return EINVAL;
06018 
06019        mdb_cursor_init(&mc, txn, dbi, &mx);
06020        return mdb_cursor_put(&mc, key, data, flags);
06021 }
06022 
06027 #define       CHANGEABLE    (MDB_NOSYNC)
06028 int
06029 mdb_env_set_flags(MDB_env *env, unsigned int flag, int onoff)
06030 {
06031        if ((flag & CHANGEABLE) != flag)
06032               return EINVAL;
06033        if (onoff)
06034               env->me_flags |= flag;
06035        else
06036               env->me_flags &= ~flag;
06037        return MDB_SUCCESS;
06038 }
06039 
06040 int
06041 mdb_env_get_flags(MDB_env *env, unsigned int *arg)
06042 {
06043        if (!env || !arg)
06044               return EINVAL;
06045 
06046        *arg = env->me_flags;
06047        return MDB_SUCCESS;
06048 }
06049 
06050 int
06051 mdb_env_get_path(MDB_env *env, const char **arg)
06052 {
06053        if (!env || !arg)
06054               return EINVAL;
06055 
06056        *arg = env->me_path;
06057        return MDB_SUCCESS;
06058 }
06059 
06066 static int
06067 mdb_stat0(MDB_env *env, MDB_db *db, MDB_stat *arg)
06068 {
06069        arg->ms_psize = env->me_psize;
06070        arg->ms_depth = db->md_depth;
06071        arg->ms_branch_pages = db->md_branch_pages;
06072        arg->ms_leaf_pages = db->md_leaf_pages;
06073        arg->ms_overflow_pages = db->md_overflow_pages;
06074        arg->ms_entries = db->md_entries;
06075 
06076        return MDB_SUCCESS;
06077 }
06078 int
06079 mdb_env_stat(MDB_env *env, MDB_stat *arg)
06080 {
06081        int toggle;
06082 
06083        if (env == NULL || arg == NULL)
06084               return EINVAL;
06085 
06086        mdb_env_read_meta(env, &toggle);
06087 
06088        return mdb_stat0(env, &env->me_metas[toggle]->mm_dbs[MAIN_DBI], arg);
06089 }
06090 
06098 static void
06099 mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi)
06100 {
06101        if (txn->mt_dbs[dbi].md_flags & MDB_REVERSEKEY)
06102               txn->mt_dbxs[dbi].md_cmp = mdb_cmp_memnr;
06103        else if (txn->mt_dbs[dbi].md_flags & MDB_INTEGERKEY)
06104               txn->mt_dbxs[dbi].md_cmp = mdb_cmp_cint;
06105        else
06106               txn->mt_dbxs[dbi].md_cmp = mdb_cmp_memn;
06107 
06108        if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
06109               if (txn->mt_dbs[dbi].md_flags & MDB_INTEGERDUP) {
06110                      if (txn->mt_dbs[dbi].md_flags & MDB_DUPFIXED)
06111                             txn->mt_dbxs[dbi].md_dcmp = mdb_cmp_int;
06112                      else
06113                             txn->mt_dbxs[dbi].md_dcmp = mdb_cmp_cint;
06114               } else if (txn->mt_dbs[dbi].md_flags & MDB_REVERSEDUP) {
06115                      txn->mt_dbxs[dbi].md_dcmp = mdb_cmp_memnr;
06116               } else {
06117                      txn->mt_dbxs[dbi].md_dcmp = mdb_cmp_memn;
06118               }
06119        } else {
06120               txn->mt_dbxs[dbi].md_dcmp = NULL;
06121        }
06122 }
06123 
06124 int mdb_open(MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
06125 {
06126        MDB_val key, data;
06127        MDB_dbi i;
06128        MDB_cursor mc;
06129        int rc, dbflag, exact;
06130        size_t len;
06131 
06132        if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
06133               mdb_default_cmp(txn, FREE_DBI);
06134        }
06135 
06136        /* main DB? */
06137        if (!name) {
06138               *dbi = MAIN_DBI;
06139               if (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY))
06140                      txn->mt_dbs[MAIN_DBI].md_flags |= (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY));
06141               mdb_default_cmp(txn, MAIN_DBI);
06142               return MDB_SUCCESS;
06143        }
06144 
06145        if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
06146               mdb_default_cmp(txn, MAIN_DBI);
06147        }
06148 
06149        /* Is the DB already open? */
06150        len = strlen(name);
06151        for (i=2; i<txn->mt_numdbs; i++) {
06152               if (len == txn->mt_dbxs[i].md_name.mv_size &&
06153                      !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
06154                      *dbi = i;
06155                      return MDB_SUCCESS;
06156               }
06157        }
06158 
06159        if (txn->mt_numdbs >= txn->mt_env->me_maxdbs - 1)
06160               return ENFILE;
06161 
06162        /* Find the DB info */
06163        dbflag = 0;
06164        exact = 0;
06165        key.mv_size = len;
06166        key.mv_data = (void *)name;
06167        mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
06168        rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
06169        if (rc == MDB_SUCCESS) {
06170               /* make sure this is actually a DB */
06171               MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
06172               if (!(node->mn_flags & F_SUBDATA))
06173                      return EINVAL;
06174        } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
06175               /* Create if requested */
06176               MDB_db dummy;
06177               data.mv_size = sizeof(MDB_db);
06178               data.mv_data = &dummy;
06179               memset(&dummy, 0, sizeof(dummy));
06180               dummy.md_root = P_INVALID;
06181               dummy.md_flags = flags & 0xffff;
06182               rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
06183               dbflag = DB_DIRTY;
06184        }
06185 
06186        /* OK, got info, add to table */
06187        if (rc == MDB_SUCCESS) {
06188               txn->mt_dbxs[txn->mt_numdbs].md_name.mv_data = strdup(name);
06189               txn->mt_dbxs[txn->mt_numdbs].md_name.mv_size = len;
06190               txn->mt_dbxs[txn->mt_numdbs].md_rel = NULL;
06191               txn->mt_dbflags[txn->mt_numdbs] = dbflag;
06192               memcpy(&txn->mt_dbs[txn->mt_numdbs], data.mv_data, sizeof(MDB_db));
06193               *dbi = txn->mt_numdbs;
06194               txn->mt_env->me_dbs[0][txn->mt_numdbs] = txn->mt_dbs[txn->mt_numdbs];
06195               txn->mt_env->me_dbs[1][txn->mt_numdbs] = txn->mt_dbs[txn->mt_numdbs];
06196               mdb_default_cmp(txn, txn->mt_numdbs);
06197               txn->mt_numdbs++;
06198        }
06199 
06200        return rc;
06201 }
06202 
06203 int mdb_stat(MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
06204 {
06205        if (txn == NULL || arg == NULL || dbi >= txn->mt_numdbs)
06206               return EINVAL;
06207 
06208        return mdb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
06209 }
06210 
06211 void mdb_close(MDB_env *env, MDB_dbi dbi)
06212 {
06213        char *ptr;
06214        if (dbi <= MAIN_DBI || dbi >= env->me_numdbs)
06215               return;
06216        ptr = env->me_dbxs[dbi].md_name.mv_data;
06217        env->me_dbxs[dbi].md_name.mv_data = NULL;
06218        env->me_dbxs[dbi].