Back to index

openldap  2.4.31
Modules | Classes | Defines | Typedefs | Functions | Variables
MDB Internals
Collaboration diagram for MDB Internals:

Modules

 Windows Compatibility Macros
 

A bunch of macros to minimize the amount of platform-specific ifdefs needed throughout the rest of the code.


 Debug Macros
 Lazy Locking
 

Macros for locks that aren't actually needed.


 Reader Lock Table
 

Readers don't acquire any locks for their data access.


 Page Flags
 
   Flags for the page headers.

 Node Flags
 
   Flags for node headers.

 Transaction DB Flags
 Transaction Flags
 Cursor Flags
 
   Cursor state flags.

 ID List Management

Classes

struct  MDB_page
 Common header for all page types. More...
union  MDB_page.mp_p
union  MDB_page.mp_pb
struct  MDB_page.mp_pb.pb
struct  MDB_node
 Header for a single key/data pair within a page. More...
struct  MDB_db
 Information about a single database in the environment. More...
struct  MDB_meta
 Meta page content. More...
union  MDB_pagebuf
 Buffer for a stack-allocated dirty page. More...
struct  MDB_pagebuf.mb_metabuf
struct  MDB_dbx
 Auxiliary DB info. More...
struct  MDB_txn
 A database transaction. More...
union  MDB_txn.mt_u
struct  MDB_cursor
 Cursors are used for all DB operations. More...
struct  MDB_xcursor
 Context for sorted-dup records. More...
struct  MDB_oldpages
 A set of pages freed by an earlier transaction. More...
struct  MDB_env
 The database environment. More...

Defines

#define MDB_DSYNC   O_DSYNC
 A flag for opening a file and requesting synchronous data writes.
#define MDB_FDATASYNC   fdatasync
 Function for flushing the data of a file.
#define MDB_PAGESIZE   4096
 A default memory page size.
#define MDB_MINKEYS   2
 The minimum number of keys required in a database page.
#define MDB_MAGIC   0xBEEFC0DE
 A stamp that identifies a file as an MDB file.
#define MDB_VERSION   1
 The version number for a database's file format.
#define MAXKEYSIZE   511
 The maximum size of a key in the database.
#define DKBUF   typedef int dummy_kbuf /* so we can put ';' after */
#define DKEY(x)   0
#define P_INVALID   (~0UL)
 An invalid page number.
#define F_ISSET(w, f)   (((w) & (f)) == (f))
 Test if a flag f is set in a flag word w.
#define DEFAULT_MAPSIZE   1048576
 Default size of memory map.
#define PAGEHDRSZ   ((unsigned) offsetof(MDB_page, mp_ptrs))
 Size of the page header, excluding dynamic data at the end.
#define METADATA(p)   ((void *)((char *)(p) + PAGEHDRSZ))
 Address of first usable data byte in a page, after the header.
#define NUMKEYS(p)   (((p)->mp_lower - PAGEHDRSZ) >> 1)
 Number of nodes on a page.
#define SIZELEFT(p)   (indx_t)((p)->mp_upper - (p)->mp_lower)
 The amount of space remaining in the page.
#define PAGEFILL(env, p)
 The percentage of space used in the page, in tenths of a percent.
#define FILL_THRESHOLD   250
 The minimum page fill factor, in tenths of a percent.
#define IS_LEAF(p)   F_ISSET((p)->mp_flags, P_LEAF)
 Test if a page is a leaf page.
#define IS_LEAF2(p)   F_ISSET((p)->mp_flags, P_LEAF2)
 Test if a page is a LEAF2 page.
#define IS_BRANCH(p)   F_ISSET((p)->mp_flags, P_BRANCH)
 Test if a page is a branch page.
#define IS_OVERFLOW(p)   F_ISSET((p)->mp_flags, P_OVERFLOW)
 Test if a page is an overflow page.
#define IS_SUBP(p)   F_ISSET((p)->mp_flags, P_SUBP)
 Test if a page is a sub page.
#define OVPAGES(size, psize)   ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
 The number of overflow pages needed to store the given size.
#define NODESIZE   offsetof(MDB_node, mn_data)
 Size of the node header, excluding dynamic data at the end.
#define PGNO_TOPWORD   ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
 Bit position of top word in page number, for shifting mn_flags.
#define INDXSIZE(k)   (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
 Size of a node in a branch page with a given key.
#define LEAFSIZE(k, d)   (NODESIZE + (k)->mv_size + (d)->mv_size)
 Size of a node in a leaf page with a given key and data.
#define NODEPTR(p, i)   ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))
 Address of node i in page p.
#define NODEKEY(node)   (void *)((node)->mn_data)
 Address of the key for the node.
#define NODEDATA(node)   (void *)((char *)(node)->mn_data + (node)->mn_ksize)
 Address of the data for a node.
#define NODEPGNO(node)
 Get the page number pointed to by a branch node.
#define SETPGNO(node, pgno)
 Set the page number in a branch node.
#define NODEDSZ(node)   ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
 Get the size of the data in a leaf node.
#define SETDSZ(node, size)
 Set the size of the data for a leaf node.
#define NODEKSZ(node)   ((node)->mn_ksize)
 The size of a key in a node.
#define COPY_PGNO(dst, src)
 Copy a page number from src to dst.
#define LEAF2KEY(p, i, ks)   ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
 The address of a key in a LEAF2 page.
#define MDB_SET_KEY(node, key)
 Set the node's key into key, if requested.
#define FREE_DBI   0
 Handle for the DB used to track free pages.
#define MAIN_DBI   1
 Handle for the default DB.
#define CURSOR_STACK   32
 Enough space for 2^32 nodes with minimum of 2 keys per node.
#define MDB_COMMIT_PAGES   64
 max number of pages to commit in one writev() call
#define LOCKNAME   "/lock.mdb"
 The name of the lock file in the DB environment.
#define DATANAME   "/data.mdb"
 The name of the data file in the DB environment.
#define LOCKSUFF   "-lock"
 The suffix of the lock file when no subdir is used.
#define CHANGEABLE   (MDB_NOSYNC)
 Only a subset of the Environment Flags can be changed at runtime.

Typedefs

typedef ID pgno_t
 A page number in the database.
typedef ID txnid_t
 A transaction ID.
typedef uint16_t indx_t
 Used for offsets within a single page.
typedef struct MDB_page MDB_page
 Common header for all page types.
typedef struct MDB_node MDB_node
 Header for a single key/data pair within a page.
typedef struct MDB_db MDB_db
 Information about a single database in the environment.
typedef struct MDB_meta MDB_meta
 Meta page content.
typedef union MDB_pagebuf MDB_pagebuf
 Buffer for a stack-allocated dirty page.
typedef struct MDB_dbx MDB_dbx
 Auxiliary DB info.
typedef struct MDB_xcursor MDB_xcursor
 Context for sorted-dup records.
typedef struct MDB_oldpages MDB_oldpages
 A set of pages freed by an earlier transaction.

Functions

static MDB_page * mdb_page_alloc (MDB_cursor *mc, int num)
 Allocate pages for writing.
static MDB_page * mdb_page_new (MDB_cursor *mc, uint32_t flags, int num)
 Allocate and initialize new pages for a database.
static int mdb_page_touch (MDB_cursor *mc)
 Touch a page: make it dirty and re-insert into tree with updated pgno.
static int mdb_page_get (MDB_txn *txn, pgno_t pgno, MDB_page **ret)
 Find the address of the page corresponding to a given page number.
static int mdb_page_search_root (MDB_cursor *mc, MDB_val *key, int modify)
 Search for the page a given key should be in.
static int mdb_page_search (MDB_cursor *mc, MDB_val *key, int modify)
 Search for the page a given key should be in.
static int mdb_page_merge (MDB_cursor *csrc, MDB_cursor *cdst)
 Merge one page into another.
static int mdb_page_split (MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned int nflags)
 Split a page and insert a new node.
static int mdb_env_read_header (MDB_env *env, MDB_meta *meta)
 Read the environment parameters of a DB environment before mapping it into memory.
static int mdb_env_read_meta (MDB_env *env, int *which)
 Check both meta pages to see which one is newer.
static int mdb_env_write_meta (MDB_txn *txn)
 Update the environment info to commit a transaction.
static MDB_node * mdb_node_search (MDB_cursor *mc, MDB_val *key, int *exactp)
 Search for key within a page, using binary search.
static int mdb_node_add (MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned int flags)
 Add a node to the page pointed to by the cursor.
static void mdb_node_del (MDB_page *mp, indx_t indx, int ksize)
 Delete the specified node from a page.
static void mdb_node_shrink (MDB_page *mp, indx_t indx)
 Compact the main page after deleting a node on a subpage.
static int mdb_node_move (MDB_cursor *csrc, MDB_cursor *cdst)
 Move a node from csrc to cdst.
static int mdb_node_read (MDB_txn *txn, MDB_node *leaf, MDB_val *data)
 Return the data associated with a given node.
static size_t mdb_leaf_size (MDB_env *env, MDB_val *key, MDB_val *data)
 Calculate the size of a leaf node.
static size_t mdb_branch_size (MDB_env *env, MDB_val *key)
 Calculate the size of a branch node.
static int mdb_rebalance (MDB_cursor *mc)
 Rebalance the tree after a delete operation.
static int mdb_update_key (MDB_page *mp, indx_t indx, MDB_val *key)
 Replace the key for a node with a new key.
static void mdb_cursor_pop (MDB_cursor *mc)
 Pop a page off the top of the cursor's stack.
static int mdb_cursor_push (MDB_cursor *mc, MDB_page *mp)
 Push a page onto the top of the cursor's stack.
static int mdb_cursor_del0 (MDB_cursor *mc, MDB_node *leaf)
 Complete a delete operation started by mdb_cursor_del().
static int mdb_cursor_sibling (MDB_cursor *mc, int move_right)
 Find a sibling for a page.
static int mdb_cursor_next (MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
 Move the cursor to the next data item.
static int mdb_cursor_prev (MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
 Move the cursor to the previous data item.
static int mdb_cursor_set (MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op, int *exactp)
 Set the cursor on a specific data item.
static int mdb_cursor_first (MDB_cursor *mc, MDB_val *key, MDB_val *data)
 Move the cursor to the first item in the database.
static int mdb_cursor_last (MDB_cursor *mc, MDB_val *key, MDB_val *data)
 Move the cursor to the last item in the database.
static void mdb_cursor_init (MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx)
 Initialize a cursor for a given transaction and database.
static void mdb_xcursor_init0 (MDB_cursor *mc)
 Initial setup of a sorted-dups cursor.
static void mdb_xcursor_init1 (MDB_cursor *mc, MDB_node *node)
 Final setup of a sorted-dups cursor.
static int mdb_drop0 (MDB_cursor *mc, int subs)
 Add all the DB's pages to the free list.
static void mdb_default_cmp (MDB_txn *txn, MDB_dbi dbi)
 Set the default comparison functions for a database.
char * mdb_version (int *major, int *minor, int *patch)
 Return the library version info.
char * mdb_strerror (int err)
 Return a string describing a given error code.
int mdb_cmp (MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
 Compare two data items according to a particular database.
int mdb_dcmp (MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b)
 Compare two data items according to a particular database.
static MDB_page * mdb_page_malloc (MDB_cursor *mc)
 Allocate a single page.
int mdb_env_sync (MDB_env *env, int force)
 Flush the data buffers to disk.
static int mdb_cursor_shadow (MDB_txn *src, MDB_txn *dst)
 Make shadow copies of all of parent txn's cursors.
static void mdb_cursor_merge (MDB_txn *txn)
 Merge shadow cursors back into parent's.
static void mdb_txn_reset0 (MDB_txn *txn)
 Common code for mdb_txn_reset() and mdb_txn_abort().
static int mdb_txn_renew0 (MDB_txn *txn)
 Common code for mdb_txn_begin() and mdb_txn_renew().
int mdb_txn_renew (MDB_txn *txn)
 Renew a read-only transaction.
int mdb_txn_begin (MDB_env *env, MDB_txn *parent, unsigned int flags, MDB_txn **ret)
 Create a transaction for use with the environment.
void mdb_txn_reset (MDB_txn *txn)
 Reset a read-only transaction.
void mdb_txn_abort (MDB_txn *txn)
 Abandon all the operations of the transaction instead of saving them.
int mdb_txn_commit (MDB_txn *txn)
 Commit all the operations of a transaction into the database.
static int mdb_env_init_meta (MDB_env *env, MDB_meta *meta)
 Write the environment parameters of a freshly created DB environment.
int mdb_env_create (MDB_env **env)
 Create an MDB environment handle.
int mdb_env_set_mapsize (MDB_env *env, size_t size)
 Set the size of the memory map to use for this environment.
int mdb_env_set_maxdbs (MDB_env *env, MDB_dbi dbs)
 Set the maximum number of databases for the environment.
int mdb_env_set_maxreaders (MDB_env *env, unsigned int readers)
 Set the maximum number of threads for the environment.
int mdb_env_get_maxreaders (MDB_env *env, unsigned int *readers)
 Get the maximum number of threads for the environment.
static int mdb_env_open2 (MDB_env *env, unsigned int flags)
 Further setup required for opening an MDB environment.
static void mdb_env_reader_dest (void *ptr)
 Release a reader thread's slot in the reader lock table.
static void mdb_env_share_locks (MDB_env *env)
 Downgrade the exclusive lock on the region back to shared.
static int mdb_env_setup_locks (MDB_env *env, char *lpath, int mode, int *excl)
 Open and/or initialize the lock region for the environment.
int mdb_env_open (MDB_env *env, const char *path, unsigned int flags, mode_t mode)
 Open an environment handle.
void mdb_env_close (MDB_env *env)
 Close the environment and release the memory map.
static int mdb_cmp_long (const MDB_val *a, const MDB_val *b)
 Compare two items pointing at aligned size_t's.
static int mdb_cmp_int (const MDB_val *a, const MDB_val *b)
 Compare two items pointing at aligned int's.
static int mdb_cmp_cint (const MDB_val *a, const MDB_val *b)
 Compare two items pointing at ints of unknown alignment.
static int mdb_cmp_memn (const MDB_val *a, const MDB_val *b)
 Compare two items lexically.
static int mdb_cmp_memnr (const MDB_val *a, const MDB_val *b)
 Compare two items in reverse byte order.
int mdb_get (MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data)
 Get items from a database.
int mdb_cursor_get (MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op)
 Retrieve by cursor.
static int mdb_cursor_touch (MDB_cursor *mc)
 Touch all the pages in the cursor stack.
int mdb_cursor_put (MDB_cursor *mc, MDB_val *key, MDB_val *data, unsigned int flags)
 Store by cursor.
int mdb_cursor_del (MDB_cursor *mc, unsigned int flags)
 Delete current key/data pair.
int mdb_cursor_open (MDB_txn *txn, MDB_dbi dbi, MDB_cursor **ret)
 Create a cursor handle.
int mdb_cursor_count (MDB_cursor *mc, size_t *countp)
 Return count of duplicates for current key.
void mdb_cursor_close (MDB_cursor *mc)
 Close a cursor handle.
MDB_txn * mdb_cursor_txn (MDB_cursor *mc)
 Return the cursor's transaction handle.
MDB_dbi mdb_cursor_dbi (MDB_cursor *mc)
 Return the cursor's database handle.
static void mdb_cursor_copy (const MDB_cursor *csrc, MDB_cursor *cdst)
 Copy the contents of a cursor.
int mdb_del (MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data)
 Delete items from a database.
int mdb_put (MDB_txn *txn, MDB_dbi dbi, MDB_val *key, MDB_val *data, unsigned int flags)
 Store items into a database.
int mdb_env_set_flags (MDB_env *env, unsigned int flag, int onoff)
 Set environment flags.
int mdb_env_get_flags (MDB_env *env, unsigned int *arg)
 Get environment flags.
int mdb_env_get_path (MDB_env *env, const char **arg)
 Return the path that was used in mdb_env_open().
static int mdb_stat0 (MDB_env *env, MDB_db *db, MDB_stat *arg)
 Common code for mdb_stat() and mdb_env_stat().
int mdb_env_stat (MDB_env *env, MDB_stat *arg)
 Return statistics about the MDB environment.
int mdb_open (MDB_txn *txn, const char *name, unsigned int flags, MDB_dbi *dbi)
 Open a database in the environment.
int mdb_stat (MDB_txn *txn, MDB_dbi dbi, MDB_stat *arg)
 Retrieve statistics for a database.
void mdb_close (MDB_env *env, MDB_dbi dbi)
 Close a database handle.
int mdb_drop (MDB_txn *txn, MDB_dbi dbi, int del)
 Delete a database and/or free all its pages.
int mdb_set_compare (MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
 Set a custom key comparison function for a database.
int mdb_set_dupsort (MDB_txn *txn, MDB_dbi dbi, MDB_cmp_func *cmp)
 Set a custom data comparison function for a MDB_DUPSORT database.
int mdb_set_relfunc (MDB_txn *txn, MDB_dbi dbi, MDB_rel_func *rel)
 Set a relocation function for a MDB_FIXEDMAP database.
int mdb_set_relctx (MDB_txn *txn, MDB_dbi dbi, void *ctx)
 Set a context pointer for a MDB_FIXEDMAP database's relocation function.

Variables

static char *const mdb_errstr []
 Table of descriptions for MDB Return Codes.

Class Documentation

struct MDB_page

Common header for all page types.

Overflow records occupy a number of contiguous pages with no headers on any page after the first.

Definition at line 561 of file mdb.c.

Class Members
uint16_t mp_flags Page Flags
union MDB_page mp_p
uint16_t mp_pad
union MDB_page mp_pb
indx_t mp_ptrs dynamic size
union MDB_page.mp_p

Definition at line 564 of file mdb.c.

Class Members
void * p_next for in-memory list of freed structs
pgno_t p_pgno page number
union MDB_page.mp_pb

Definition at line 586 of file mdb.c.

Class Members
mp_pb pb
uint32_t pb_pages number of overflow pages
struct MDB_page.mp_pb.pb

Definition at line 587 of file mdb.c.

Class Members
indx_t pb_lower lower bound of free space
indx_t pb_upper upper bound of free space
struct MDB_node

Header for a single key/data pair within a page.

We guarantee 2-byte alignment for nodes.

Definition at line 633 of file mdb.c.

Class Members
char mn_data key and data are appended here
unsigned short mn_flags Node Flags
unsigned short mn_ksize key size
unsigned short mn_offset storage for mn_lo and mn_hi
struct MDB_db

Information about a single database in the environment.

Definition at line 738 of file mdb.c.

Class Members
pgno_t md_branch_pages number of internal pages
uint16_t md_depth depth of this tree
size_t md_entries number of data items
uint16_t md_flags Database Flags
pgno_t md_leaf_pages number of leaf pages
pgno_t md_overflow_pages number of overflow pages
uint32_t md_pad also ksize for LEAF2 pages
pgno_t md_root the root page of this tree
struct MDB_meta

Meta page content.

Definition at line 755 of file mdb.c.

Collaboration diagram for MDB_meta:
Class Members
void * mm_address address for fixed mapping
MDB_db mm_dbs first is free space, 2nd is main db
pgno_t mm_last_pg last used page in file
uint32_t mm_magic Stamp identifying this as an MDB file. It must be set to MDB_MAGIC.
size_t mm_mapsize size of mmap region
txnid_t mm_txnid txnid that committed this page
uint32_t mm_version Version number of this database file format. Must be set to MDB_VERSION.
union MDB_pagebuf

Buffer for a stack-allocated dirty page.

The members define size and alignment, and silence type aliasing warnings. They are not used directly; that could mean incorrectly using several union members in parallel.

Definition at line 777 of file mdb.c.

Collaboration diagram for MDB_pagebuf:
Class Members
struct MDB_pagebuf mb_metabuf
MDB_page mb_page
char mb_raw
struct MDB_pagebuf.mb_metabuf

Definition at line 780 of file mdb.c.

Class Members
MDB_meta mm_meta
char mm_pad
struct MDB_dbx

Auxiliary DB info.

The information here is mostly static/read-only. There is only a single copy of this record in the environment.

Definition at line 790 of file mdb.c.

Collaboration diagram for MDB_dbx:
Class Members
MDB_cmp_func * md_cmp function for comparing keys
MDB_cmp_func * md_dcmp function for comparing data items
MDB_val md_name name of the database
MDB_rel_func * md_rel user relocate function
void * md_relctx user-provided context for md_rel
struct MDB_txn

A database transaction.

Every operation requires a transaction handle.

Definition at line 801 of file mdb.c.

Collaboration diagram for MDB_txn:
Class Members
MDB_txn * mt_child nested txn under this txn
MDB_cursor ** mt_cursors Array of cursors for each DB.
unsigned char * mt_dbflags Array of flags for each DB.
MDB_db * mt_dbs Array of MDB_db records for each known DB.
MDB_dbx * mt_dbxs Array of records for each DB known in the environment.
MDB_env * mt_env the DB environment
unsigned int mt_flags Transaction Flags
IDL mt_free_pgs The list of pages that became unused during this transaction.
pgno_t mt_next_pgno next unallocated page
MDB_dbi mt_numdbs Number of DB records in use. This number only ever increments; we don't decrement it when individual DB handles are closed.
MDB_txn * mt_parent parent of a nested txn
unsigned int mt_toggle Tracks which of the two meta pages was used at the start of this transaction.
txnid_t mt_txnid The ID of this transaction. IDs are integers incrementing from 1. Only committed write transactions increment the ID. If a transaction aborts, the ID may be re-used by the next writer.
union MDB_txn mt_u
union MDB_txn.mt_u

Definition at line 814 of file mdb.c.

Class Members
ID2L dirty_list modified pages
MDB_reader * reader this thread's slot in the reader table
struct MDB_cursor

Cursors are used for all DB operations.

Definition at line 861 of file mdb.c.

Collaboration diagram for MDB_cursor:
Class Members
MDB_db * mc_db The database record for this cursor.
unsigned char * mc_dbflag The Transaction DB Flags for this database.
MDB_dbi mc_dbi The database handle this cursor operates on.
MDB_dbx * mc_dbx The database auxiliary record for this cursor.
unsigned int mc_flags Cursor Flags
indx_t mc_ki stack of page indices
MDB_cursor * mc_next Next cursor on this DB in this txn.
MDB_cursor * mc_orig Original cursor if this is a shadow.
MDB_page * mc_pg stack of pushed pages
unsigned short mc_snum number of pushed pages
unsigned short mc_top index of top page, normally mc_snum-1
MDB_txn * mc_txn The transaction that owns this cursor.
struct MDB_xcursor * mc_xcursor Context used for databases with MDB_DUPSORT, otherwise NULL.
struct MDB_xcursor

Context for sorted-dup records.

We could have gone to a fully recursive design, with arbitrarily deep nesting of sub-databases. But for now we only handle these levels - main DB, optional sub-DB, sorted-duplicate DB.

Definition at line 901 of file mdb.c.

Collaboration diagram for MDB_xcursor:
Class Members
MDB_cursor mx_cursor A sub-cursor for traversing the Dup DB.
MDB_db mx_db The database record for this Dup DB.
unsigned char mx_dbflag The Transaction DB Flags for this Dup DB.
MDB_dbx mx_dbx The auxiliary DB record for this Dup DB.
struct MDB_oldpages

A set of pages freed by an earlier transaction.

Definition at line 913 of file mdb.c.

Collaboration diagram for MDB_oldpages:
Class Members
struct MDB_oldpages * mo_next Usually we only read one record from the FREEDB at a time, but in case we read more, this will chain them together.
pgno_t mo_pages An IDL of the pages.
txnid_t mo_txnid The ID of the transaction in which these pages were freed.
struct MDB_env

The database environment.

Definition at line 925 of file mdb.c.

Collaboration diagram for MDB_env:
Class Members
unsigned int me_db_toggle which DB table is current
MDB_db * me_dbs two arrays of MDB_db info
MDB_dbx * me_dbxs array of static DB info
ID2 me_dirty_list ID2L of pages that were written during a write txn.
MDB_page * me_dpages list of malloc'd blocks for re-use
uint32_t me_extrapad unused for now
HANDLE me_fd The main data file.
uint32_t me_flags Environment Flags
IDL me_free_pgs IDL of pages that became unused in a write txn.
HANDLE me_lfd The lock file.
char * me_map the memory map of the data file
size_t me_mapsize size of the data memory map
MDB_dbi me_maxdbs size of the DB table
pgno_t me_maxpg me_mapsize / me_psize
unsigned int me_maxreaders size of the reader table
MDB_meta * me_metas pointers to the two meta pages
HANDLE me_mfd just for writing the meta pages
MDB_dbi me_numdbs number of DBs opened
char * me_path path to the DB files
txnid_t me_pgfirst ID of first old page record we used.
MDB_oldpages * me_pghead list of old page records
txnid_t me_pglast ID of last old page record we used.
unsigned int me_psize size of a page, from GET_PAGESIZE
off_t me_size current file size
pthread_key_t me_txkey thread-key for readers
MDB_txn * me_txn current write transaction
MDB_txninfo * me_txns the memory map of the lock file
txnid_t me_wtxnid ID of last txn we committed.

Define Documentation

#define CHANGEABLE   (MDB_NOSYNC)

Only a subset of the Environment Flags can be changed at runtime.

Changing other flags requires closing the environment and re-opening it with the new flags.

Definition at line 6027 of file mdb.c.

#define COPY_PGNO (   dst,
  src 
)
Value:
do { \
       unsigned short *s, *d;      \
       s = (unsigned short *)&(src);      \
       d = (unsigned short *)&(dst);      \
       *d++ = *s++;  \
       *d = *s;      \
} while (0)

Copy a page number from src to dst.

Definition at line 718 of file mdb.c.

#define CURSOR_STACK   32

Enough space for 2^32 nodes with minimum of 2 keys per node.

I.e., plenty. At 4 keys per node, enough for 2^64 nodes, so there's probably no need to raise this on a 64 bit machine.

Definition at line 856 of file mdb.c.

#define DATANAME   "/data.mdb"

The name of the data file in the DB environment.

Definition at line 2924 of file mdb.c.

#define DEFAULT_MAPSIZE   1048576

Default size of memory map.

This is certainly too small for any actual applications. Apps should always set the size explicitly using mdb_env_set_mapsize().

Definition at line 394 of file mdb.c.

#define DKBUF   typedef int dummy_kbuf /* so we can put ';' after */

Definition at line 333 of file mdb.c.

#define DKEY (   x)    0

Definition at line 334 of file mdb.c.

#define F_ISSET (   w,
  f 
)    (((w) & (f)) == (f))

Test if a flag f is set in a flag word w.

Definition at line 382 of file mdb.c.

#define FILL_THRESHOLD   250

The minimum page fill factor, in tenths of a percent.

Pages emptier than this are candidates for merging.

Definition at line 614 of file mdb.c.

#define FREE_DBI   0

Handle for the DB used to track free pages.

Definition at line 750 of file mdb.c.

#define INDXSIZE (   k)    (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))

Size of a node in a branch page with a given key.

This is just the node header plus the key, there is no data.

Definition at line 670 of file mdb.c.

#define IS_BRANCH (   p)    F_ISSET((p)->mp_flags, P_BRANCH)

Test if a page is a branch page.

Definition at line 621 of file mdb.c.

#define IS_LEAF (   p)    F_ISSET((p)->mp_flags, P_LEAF)

Test if a page is a leaf page.

Definition at line 617 of file mdb.c.

#define IS_LEAF2 (   p)    F_ISSET((p)->mp_flags, P_LEAF2)

Test if a page is a LEAF2 page.

Definition at line 619 of file mdb.c.

#define IS_OVERFLOW (   p)    F_ISSET((p)->mp_flags, P_OVERFLOW)

Test if a page is an overflow page.

Definition at line 623 of file mdb.c.

#define IS_SUBP (   p)    F_ISSET((p)->mp_flags, P_SUBP)

Test if a page is a sub page.

Definition at line 625 of file mdb.c.

#define LEAF2KEY (   p,
  i,
  ks 
)    ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))

The address of a key in a LEAF2 page.

LEAF2 pages are used for MDB_DUPFIXED sorted-duplicate sub-DBs. There are no node headers, keys are stored contiguously.

Definition at line 731 of file mdb.c.

#define LEAFSIZE (   k,
  d 
)    (NODESIZE + (k)->mv_size + (d)->mv_size)

Size of a node in a leaf page with a given key and data.

This is node header plus key plus data size.

Definition at line 675 of file mdb.c.

#define LOCKNAME   "/lock.mdb"

The name of the lock file in the DB environment.

Definition at line 2922 of file mdb.c.

#define LOCKSUFF   "-lock"

The suffix of the lock file when no subdir is used.

Definition at line 2926 of file mdb.c.

#define MAIN_DBI   1

Handle for the default DB.

Definition at line 752 of file mdb.c.

#define MAXKEYSIZE   511

The maximum size of a key in the database.

While data items have essentially unbounded size, we require that keys all fit onto a regular page. This limit could be raised a bit further if needed, to something just under MDB_PAGESIZE / MDB_MINKEYS.

Definition at line 319 of file mdb.c.

#define MDB_COMMIT_PAGES   64

max number of pages to commit in one writev() call

Definition at line 970 of file mdb.c.

#define MDB_DSYNC   O_DSYNC

A flag for opening a file and requesting synchronous data writes.

This is only used when writing a meta page. It's not strictly needed; we could just do a normal write and then immediately perform a flush. But if this flag is available it saves us an extra system call.

Note: If O_DSYNC is undefined but exists in /usr/include, preferably set some compiler flag to get the definition. Otherwise compile with the less efficient -DMDB_DSYNC=O_SYNC.

Definition at line 222 of file mdb.c.

#define MDB_FDATASYNC   fdatasync

Function for flushing the data of a file.

Define this to fsync if fdatasync() is not supported.

Definition at line 230 of file mdb.c.

#define MDB_MAGIC   0xBEEFC0DE

A stamp that identifies a file as an MDB file.

There's nothing special about this value other than that it is easily recognizable, and it will reflect any byte order mismatches.

Definition at line 309 of file mdb.c.

#define MDB_MINKEYS   2

The minimum number of keys required in a database page.

Setting this to a larger value will place a smaller bound on the maximum size of a data item. Data items larger than this size will be pushed into overflow pages instead of being stored directly in the B-tree node. This value used to default to 4. With a page size of 4096 bytes that meant that any item larger than 1024 bytes would go into an overflow page. That also meant that on average 2-3KB of each overflow page was wasted space. The value cannot be lower than 2 because then there would no longer be a tree structure. With this value, items larger than 2KB will go into overflow pages, and on average only 1KB will be wasted.

Definition at line 303 of file mdb.c.

#define MDB_PAGESIZE   4096

A default memory page size.

The actual size is platform-dependent, but we use this for boot-strapping. We probably should not be using this any more. The GET_PAGESIZE() macro is used to get the actual size.

Note that we don't currently support Huge pages. On Linux, regular data files cannot use Huge pages, and in general Huge pages aren't actually pageable. We rely on the OS demand-pager to read our data and page it out when memory pressure from other processes is high. So until OSs have actual paging support for Huge pages, they're not viable.

Definition at line 289 of file mdb.c.

#define MDB_SET_KEY (   node,
  key 
)
Value:
{ if ((key) != NULL) { \
       (key)->mv_size = NODEKSZ(node); (key)->mv_data = NODEKEY(node); } }

Set the node's key into key, if requested.

Definition at line 734 of file mdb.c.

#define MDB_VERSION   1

The version number for a database's file format.

Definition at line 312 of file mdb.c.

#define METADATA (   p)    ((void *)((char *)(p) + PAGEHDRSZ))

Address of first usable data byte in a page, after the header.

Definition at line 600 of file mdb.c.

#define NODEDATA (   node)    (void *)((char *)(node)->mn_data + (node)->mn_ksize)

Address of the data for a node.

Definition at line 684 of file mdb.c.

#define NODEDSZ (   node)    ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))

Get the size of the data in a leaf node.

Definition at line 696 of file mdb.c.

#define NODEKEY (   node)    (void *)((node)->mn_data)

Address of the key for the node.

Definition at line 681 of file mdb.c.

#define NODEKSZ (   node)    ((node)->mn_ksize)

The size of a key in a node.

Definition at line 701 of file mdb.c.

#define NODEPGNO (   node)
Value:
((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
        (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))

Get the page number pointed to by a branch node.

Definition at line 687 of file mdb.c.

#define NODEPTR (   p,
  i 
)    ((MDB_node *)((char *)(p) + (p)->mp_ptrs[i]))

Address of node i in page p.

Definition at line 678 of file mdb.c.

#define NODESIZE   offsetof(MDB_node, mn_data)

Size of the node header, excluding dynamic data at the end.

Definition at line 662 of file mdb.c.

#define NUMKEYS (   p)    (((p)->mp_lower - PAGEHDRSZ) >> 1)

Number of nodes on a page.

Definition at line 603 of file mdb.c.

#define OVPAGES (   size,
  psize 
)    ((PAGEHDRSZ-1 + (size)) / (psize) + 1)

The number of overflow pages needed to store the given size.

Definition at line 628 of file mdb.c.

#define P_INVALID   (~0UL)

An invalid page number.

Mainly used to denote an empty tree.

Definition at line 379 of file mdb.c.

#define PAGEFILL (   env,
  p 
)
Value:
(1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
                            ((env)->me_psize - PAGEHDRSZ))

The percentage of space used in the page, in tenths of a percent.

Definition at line 609 of file mdb.c.

#define PAGEHDRSZ   ((unsigned) offsetof(MDB_page, mp_ptrs))

Size of the page header, excluding dynamic data at the end.

Definition at line 597 of file mdb.c.

#define PGNO_TOPWORD   ((pgno_t)-1 > 0xffffffffu ? 32 : 0)

Bit position of top word in page number, for shifting mn_flags.

Definition at line 665 of file mdb.c.

#define SETDSZ (   node,
  size 
)
Value:
do { \
       (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)

Set the size of the data for a leaf node.

Definition at line 698 of file mdb.c.

#define SETPGNO (   node,
  pgno 
)
Value:
do { \
       (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
       if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)

Set the page number in a branch node.

Definition at line 691 of file mdb.c.

#define SIZELEFT (   p)    (indx_t)((p)->mp_upper - (p)->mp_lower)

The amount of space remaining in the page.

Definition at line 606 of file mdb.c.


Typedef Documentation

typedef uint16_t indx_t

Used for offsets within a single page.

Since memory pages are typically 4 or 8KB in size, 12-13 bits, this is plenty.

Definition at line 388 of file mdb.c.

typedef struct MDB_db MDB_db

Information about a single database in the environment.

typedef struct MDB_dbx MDB_dbx

Auxiliary DB info.

The information here is mostly static/read-only. There is only a single copy of this record in the environment.

typedef struct MDB_meta MDB_meta

Meta page content.

typedef struct MDB_node MDB_node

Header for a single key/data pair within a page.

We guarantee 2-byte alignment for nodes.

typedef struct MDB_oldpages MDB_oldpages

A set of pages freed by an earlier transaction.

typedef struct MDB_page MDB_page

Common header for all page types.

Overflow records occupy a number of contiguous pages with no headers on any page after the first.

typedef union MDB_pagebuf MDB_pagebuf

Buffer for a stack-allocated dirty page.

The members define size and alignment, and silence type aliasing warnings. They are not used directly; that could mean incorrectly using several union members in parallel.

typedef struct MDB_xcursor MDB_xcursor

Context for sorted-dup records.

We could have gone to a fully recursive design, with arbitrarily deep nesting of sub-databases. But for now we only handle these levels - main DB, optional sub-DB, sorted-duplicate DB.

typedef ID pgno_t

A page number in the database.

Note that 64 bit page numbers are overkill, since pages themselves already represent 12-13 bits of addressable memory, and the OS will always limit applications to a maximum of 63 bits of address space.

Note:
In the MDB_node structure, we only store 48 bits of this value, which thus limits us to only 60 bits of addressable data.

Definition at line 241 of file mdb.c.

typedef ID txnid_t

A transaction ID.

See struct MDB_txn.mt_txnid for details.

Definition at line 246 of file mdb.c.


Function Documentation

static size_t mdb_branch_size ( MDB_env env,
MDB_val key 
) [static]

Calculate the size of a branch node.

The size should depend on the environment's page size but since we currently don't support spilling large keys onto overflow pages, it's simply the size of the MDB_node header plus the size of the key. Sizes are always rounded up to an even number of bytes, to guarantee 2-byte alignment of the MDB_node headers.

Parameters:
[in]envThe environment handle.
[in]keyThe key for the node.
Returns:
The number of bytes needed to store the node.

Definition at line 4609 of file mdb.c.

{
       size_t         node_bytes;

       /* INDXSIZE() supplies the base size for this key */
       node_bytes = INDXSIZE(key);

       if (node_bytes >= env->me_psize / MDB_MINKEYS) {
              /* A key this large would belong on an overflow page,
               * but that is not implemented, so the size is left
               * as computed.
               */
              /* sz -= key->size - sizeof(pgno_t); */
       }

       /* Add room for one indx_t on top of the node itself */
       return node_bytes + sizeof(indx_t);
}

Here is the caller graph for this function:

void mdb_close ( MDB_env env,
MDB_dbi  dbi 
)

Close a database handle.

This call is not mutex protected. Handles should only be closed by a single thread, and only if no other threads are going to reference the database handle any further.

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[in]dbiA database handle returned by mdb_open()

Definition at line 6211 of file mdb.c.

{
       char *old_name;

       /* Handles at or below MAIN_DBI, or beyond the number of
        * databases in the environment, are silently ignored.
        */
       if (!(dbi > MAIN_DBI && dbi < env->me_numdbs))
              return;

       /* Detach the name from the slot first, then release it */
       old_name = env->me_dbxs[dbi].md_name.mv_data;
       env->me_dbxs[dbi].md_name.mv_size = 0;
       env->me_dbxs[dbi].md_name.mv_data = NULL;
       free(old_name);
}

Here is the caller graph for this function:

int mdb_cmp ( MDB_txn txn,
MDB_dbi  dbi,
const MDB_val a,
const MDB_val b 
)

Compare two data items according to a particular database.

This returns a comparison as if the two data items were keys in the specified database.

Parameters:
[in]txnA transaction handle returned by mdb_txn_begin()
[in]dbiA database handle returned by mdb_open()
[in]aThe first item to compare
[in]bThe second item to compare
Returns:
< 0 if a < b, 0 if a == b, > 0 if a > b

Definition at line 1164 of file mdb.c.

{
       /* Dispatch to the comparison function stored for this database
        * handle in the transaction's mt_dbxs table; its result is
        * returned unchanged. Neither txn nor dbi is validated here.
        */
       return txn->mt_dbxs[dbi].md_cmp(a, b);
}
static int mdb_cmp_cint ( const MDB_val a,
const MDB_val b 
) [static]

Compare two items pointing at ints of unknown alignment.

Nodes and keys are guaranteed to be 2-byte aligned.

Definition at line 3090 of file mdb.c.

{
#if BYTE_ORDER == LITTLE_ENDIAN
       unsigned short *pa, *pb;
       int diff;

       /* Start one past the end of each item. Note that a's size is
        * used for both items.
        */
       pa = (unsigned short *) ((char *) a->mv_data + a->mv_size);
       pb = (unsigned short *) ((char *) b->mv_data + a->mv_size);

       /* Walk backwards one 16-bit word at a time, so the words at the
        * highest addresses are compared first. At least one word is
        * always read before the bounds test.
        */
       for (;;) {
              diff = *--pa - *--pb;
              if (diff || pa <= (unsigned short *)a->mv_data)
                     break;
       }
       return diff;
#else
       /* Otherwise a plain byte-wise comparison over a's size is used */
       return memcmp(a->mv_data, b->mv_data, a->mv_size);
#endif
}

Here is the caller graph for this function:

static int mdb_cmp_int ( const MDB_val a,
const MDB_val b 
) [static]

Compare two items pointing at aligned int's.

Definition at line 3080 of file mdb.c.

{
       /* Both items are read as one unsigned int each */
       unsigned int lhs = *(unsigned int *)a->mv_data;
       unsigned int rhs = *(unsigned int *)b->mv_data;

       if (lhs < rhs)
              return -1;
       /* 1 when a is larger, 0 when equal */
       return lhs > rhs;
}

Here is the caller graph for this function:

static int mdb_cmp_long ( const MDB_val a,
const MDB_val b 
) [static]

Compare two items pointing at aligned size_t's.

Definition at line 3072 of file mdb.c.

{
       /* Both items are read as one size_t each */
       size_t lhs = *(size_t *)a->mv_data;
       size_t rhs = *(size_t *)b->mv_data;

       if (lhs < rhs)
              return -1;
       /* 1 when a is larger, 0 when equal */
       return lhs > rhs;
}

Here is the caller graph for this function:

static int mdb_cmp_memn ( const MDB_val a,
const MDB_val b 
) [static]

Compare two items lexically.

Definition at line 3109 of file mdb.c.

{
       unsigned int   common;
       ssize_t        delta;
       int            cmp;

       /* Compare over the shorter of the two lengths; delta keeps only
        * the sign of the length difference (clamped to 1 when a is longer).
        */
       delta = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
       if (delta > 0) {
              common = b->mv_size;
              delta = 1;
       } else {
              common = a->mv_size;
       }

       cmp = memcmp(a->mv_data, b->mv_data, common);
       if (cmp)
              return cmp;

       /* Common prefix is identical: the shorter item sorts first */
       return delta < 0 ? -1 : delta;
}

Here is the caller graph for this function:

static int mdb_cmp_memnr ( const MDB_val a,
const MDB_val b 
) [static]

Compare two items in reverse byte order.

Definition at line 3128 of file mdb.c.

{
       const unsigned char  *pa, *pb, *stop;
       ssize_t delta;
       int d;

       /* Both scan pointers start one past the end of their item */
       stop = (const unsigned char *)a->mv_data;
       pa = (const unsigned char *)a->mv_data + a->mv_size;
       pb = (const unsigned char *)b->mv_data + b->mv_size;

       /* When a is longer, stop early so that only as many bytes as b
        * holds are examined; delta is clamped to 1 in that case.
        */
       delta = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
       if (delta > 0) {
              stop += delta;
              delta = 1;
       }

       /* Compare from the last byte towards the first */
       for (; pa > stop; ) {
              pa--;
              pb--;
              d = *pa - *pb;
              if (d != 0)
                     return d;
       }

       /* Shared tail is identical: the shorter item sorts first */
       if (delta < 0)
              return -1;
       return delta;
}

Here is the call graph for this function:

Here is the caller graph for this function:

void mdb_cursor_close ( MDB_cursor * cursor )

Close a cursor handle.

The cursor handle will be freed and must not be used again after this call.

Parameters:
[in]cursorA cursor handle returned by mdb_cursor_open()

Definition at line 5032 of file mdb.c.

{
       MDB_cursor **link;

       /* A NULL cursor is accepted and ignored */
       if (mc == NULL)
              return;

       /* If the transaction tracks cursors, unlink this one from the
        * list kept for its database handle.
        */
       if (mc->mc_txn->mt_cursors) {
              link = &mc->mc_txn->mt_cursors[mc->mc_dbi];
              while (*link != NULL && *link != mc)
                     link = &(*link)->mc_next;
              if (*link == mc)
                     *link = mc->mc_next;
       }

       /* Only cursors flagged as heap-allocated are freed here */
       if (mc->mc_flags & C_ALLOCD)
              free(mc);
}

Here is the caller graph for this function:

static void mdb_cursor_copy ( const MDB_cursor csrc,
MDB_cursor cdst 
) [static]

Copy the contents of a cursor.

Parameters:
[in]csrcThe cursor to copy from.
[out]cdstThe cursor to copy to.

Definition at line 5395 of file mdb.c.

{
       unsigned int lvl;

       /* Scalar state: flags, stack position and the txn/DB references */
       cdst->mc_flags = csrc->mc_flags;
       cdst->mc_top = csrc->mc_top;
       cdst->mc_snum = csrc->mc_snum;
       cdst->mc_dbx = csrc->mc_dbx;
       cdst->mc_db  = csrc->mc_db;
       cdst->mc_dbi = csrc->mc_dbi;
       cdst->mc_txn = csrc->mc_txn;

       /* Only the mc_snum entries in use on the page stack are copied */
       lvl = csrc->mc_snum;
       while (lvl-- > 0) {
              cdst->mc_ki[lvl] = csrc->mc_ki[lvl];
              cdst->mc_pg[lvl] = csrc->mc_pg[lvl];
       }
}

Here is the caller graph for this function:

int mdb_cursor_count ( MDB_cursor cursor,
size_t *  countp 
)

Return count of duplicates for current key.

This call is only valid on databases that support sorted duplicate data items MDB_DUPSORT.

Parameters:
[in]cursorA cursor handle returned by mdb_cursor_open()
[out]countpAddress where the count will be stored
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - cursor is not initialized, or an invalid parameter was specified.

Definition at line 5009 of file mdb.c.

{
       MDB_node      *leaf;

       if (mc == NULL || countp == NULL)
              return EINVAL;

       /* Counting duplicates only makes sense on an MDB_DUPSORT database */
       if (!(mc->mc_db->md_flags & MDB_DUPSORT))
              return EINVAL;

       /* The cursor must be positioned before its page stack is read;
        * the documented contract is EINVAL for an uninitialized cursor.
        */
       if (!(mc->mc_flags & C_INITIALIZED))
              return EINVAL;

       leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
       if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
              /* No duplicate set on this node: exactly one data item */
              *countp = 1;
       } else {
              if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
                     return EINVAL;

              /* The sub-cursor's DB record carries the entry count */
              *countp = mc->mc_xcursor->mx_db.md_entries;
       }
       return MDB_SUCCESS;
}

Here is the caller graph for this function:

MDB_dbi mdb_cursor_dbi ( MDB_cursor * cursor )

Return the cursor's database handle.

Parameters:
[in]cursorA cursor handle returned by mdb_cursor_open()

Definition at line 5055 of file mdb.c.

{
       /* A NULL cursor yields handle 0 */
       return mc ? mc->mc_dbi : 0;
}
int mdb_cursor_del ( MDB_cursor cursor,
unsigned int  flags 
)

Delete current key/data pair.

This function deletes the key/data pair to which the cursor refers.

Parameters:
[in]cursorA cursor handle returned by mdb_cursor_open()
[in]flagsOptions for this operation. This parameter must be set to 0 or one of the values described here.
  • MDB_NODUPDATA - delete all of the data items for the current key. This flag may only be specified if the database was opened with MDB_DUPSORT.
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EACCES - an attempt was made to modify a read-only database.
  • EINVAL - an invalid parameter was specified.

Definition at line 4488 of file mdb.c.

{
       MDB_node      *leaf;
       int rc;

       /* Deleting requires a writable transaction */
       if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
              return EACCES;

       /* The cursor must be positioned. The flag test has to be
        * parenthesized: "!mc->mc_flags & C_INITIALIZED" negates the
        * whole flag word before masking, so it does not test the bit.
        */
       if (!(mc->mc_flags & C_INITIALIZED))
              return EINVAL;

       rc = mdb_cursor_touch(mc);
       if (rc)
              return rc;

       leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);

       if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_DUPDATA)) {
              if (flags != MDB_NODUPDATA) {
                     /* Delete only the current duplicate, via the sub-cursor */
                     if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
                            /* Point the sub-cursor at the node's data area */
                            mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
                     }
                     rc = mdb_cursor_del(&mc->mc_xcursor->mx_cursor, 0);
                     /* If sub-DB still has entries, we're done */
                     if (mc->mc_xcursor->mx_db.md_entries) {
                            if (leaf->mn_flags & F_SUBDATA) {
                                   /* update subDB info */
                                   void *db = NODEDATA(leaf);
                                   memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
                            } else {
                                   /* shrink fake page */
                                   mdb_node_shrink(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
                            }
                            mc->mc_db->md_entries--;
                            return rc;
                     }
                     /* otherwise fall thru and delete the sub-DB */
              }

              if (leaf->mn_flags & F_SUBDATA) {
                     /* add all the child DB's pages to the free list */
                     rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
                     if (rc == MDB_SUCCESS) {
                            mc->mc_db->md_entries -=
                                   mc->mc_xcursor->mx_db.md_entries;
                     }
              }
       }

       /* Remove the node itself and rebalance */
       return mdb_cursor_del0(mc, leaf);
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_del0 ( MDB_cursor mc,
MDB_node leaf 
) [static]

Complete a delete operation started by mdb_cursor_del().

Definition at line 5570 of file mdb.c.

{
       int rc;

       /* add overflow pages to free list */
       if (!IS_LEAF2(mc->mc_pg[mc->mc_top]) && F_ISSET(leaf->mn_flags, F_BIGDATA)) {
              pgno_t ovpg;
              int npages, left;

              /* The node's data area holds the first overflow page number */
              memcpy(&ovpg, NODEDATA(leaf), sizeof(ovpg));
              npages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
              mc->mc_db->md_overflow_pages -= npages;

              /* The run is consecutive: free each page number in turn */
              for (left = npages; left > 0; left--, ovpg++) {
                     DPRINTF("freed ov page %zu", ovpg);
                     mdb_midl_append(&mc->mc_txn->mt_free_pgs, ovpg);
              }
       }

       /* Drop the node from its page and account for the lost entry */
       mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], mc->mc_db->md_pad);
       mc->mc_db->md_entries--;

       /* A failed rebalance marks the transaction as being in error */
       rc = mdb_rebalance(mc);
       if (rc != MDB_SUCCESS)
              mc->mc_txn->mt_flags |= MDB_TXN_ERROR;

       return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_first ( MDB_cursor mc,
MDB_val key,
MDB_val data 
) [static]

Move the cursor to the first item in the database.

Definition at line 3909 of file mdb.c.

{
       MDB_page      *mp;
       MDB_node      *leaf;
       int            rc;

       /* Search again unless the cursor is already initialized with
        * mc_top at 0. A NULL key is passed to the search.
        */
       if (mc->mc_top || !(mc->mc_flags & C_INITIALIZED)) {
              rc = mdb_page_search(mc, NULL, 0);
              if (rc != MDB_SUCCESS)
                     return rc;
       }
       assert(IS_LEAF(mc->mc_pg[mc->mc_top]));

       /* Position on slot 0 of the top page */
       mp = mc->mc_pg[mc->mc_top];
       leaf = NODEPTR(mp, 0);
       mc->mc_ki[mc->mc_top] = 0;
       mc->mc_flags = (mc->mc_flags | C_INITIALIZED) & ~C_EOF;

       /* LEAF2 pages: the key is md_pad bytes wide and no data is set */
       if (IS_LEAF2(mp)) {
              key->mv_size = mc->mc_db->md_pad;
              key->mv_data = LEAF2KEY(mp, 0, key->mv_size);
              return MDB_SUCCESS;
       }

       if (data != NULL) {
              if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                     /* Plain node: any sub-cursor is marked unpositioned */
                     if (mc->mc_xcursor)
                            mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                     rc = mdb_node_read(mc->mc_txn, leaf, data);
                     if (rc != MDB_SUCCESS)
                            return rc;
              } else {
                     /* Duplicates: data comes from the sub-cursor's first item */
                     mdb_xcursor_init1(mc, leaf);
                     rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
                     if (rc)
                            return rc;
              }
       }

       MDB_SET_KEY(leaf, key);
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_cursor_get ( MDB_cursor cursor,
MDB_val key,
MDB_val data,
MDB_cursor_op  op 
)

Retrieve by cursor.

This function retrieves key/data pairs from the database. The address and length of the key are returned in the object to which key refers (except for the case of the MDB_SET option, in which the key object is unchanged), and the address and length of the data are returned in the object to which data refers.

Parameters:
[in]cursorA cursor handle returned by mdb_cursor_open()
[in,out]keyThe key for a retrieved item
[in,out]dataThe data of a retrieved item
[in]opA cursor operation MDB_cursor_op
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • MDB_NOTFOUND - no matching key found.
  • EINVAL - an invalid parameter was specified.

Definition at line 3999 of file mdb.c.

{
       int            rc;
       /* Only handed to mdb_cursor_set(); never read in this function */
       int            exact = 0;

       assert(mc);

       /* Each case validates its own arguments, sets rc and breaks;
        * rc is returned at the bottom.
        */
       switch (op) {
       case MDB_GET_BOTH:
       case MDB_GET_BOTH_RANGE:
              /* These need a data item and a cursor that has a sub-cursor */
              if (data == NULL || mc->mc_xcursor == NULL) {
                     rc = EINVAL;
                     break;
              }
              /* FALLTHRU */
       case MDB_SET:
       case MDB_SET_RANGE:
              /* Keys must be non-empty and no longer than MAXKEYSIZE */
              if (key == NULL || key->mv_size == 0 || key->mv_size > MAXKEYSIZE) {
                     rc = EINVAL;
              } else if (op == MDB_SET_RANGE)
                     /* MDB_SET_RANGE passes NULL instead of &exact */
                     rc = mdb_cursor_set(mc, key, data, op, NULL);
              else
                     rc = mdb_cursor_set(mc, key, data, op, &exact);
              break;
       case MDB_GET_MULTIPLE:
              /* Requires MDB_DUPFIXED and an already positioned cursor */
              if (data == NULL ||
                     !(mc->mc_db->md_flags & MDB_DUPFIXED) ||
                     !(mc->mc_flags & C_INITIALIZED)) {
                     rc = EINVAL;
                     break;
              }
              rc = MDB_SUCCESS;
              /* Success with data untouched when the sub-cursor is
               * unpositioned or at EOF.
               */
              if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
                     (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
                     break;
              /* Jumps into the MDB_NEXT_MULTIPLE case below */
              goto fetchm;
       case MDB_NEXT_MULTIPLE:
              if (data == NULL ||
                     !(mc->mc_db->md_flags & MDB_DUPFIXED)) {
                     rc = EINVAL;
                     break;
              }
              /* Unpositioned cursors start at the first item */
              if (!(mc->mc_flags & C_INITIALIZED))
                     rc = mdb_cursor_first(mc, key, data);
              else
                     rc = mdb_cursor_next(mc, key, data, MDB_NEXT_DUP);
              if (rc == MDB_SUCCESS) {
                     if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
                            MDB_cursor *mx;
fetchm:
                            /* Return the sub-cursor's whole top page at once:
                             * NUMKEYS items of md_pad bytes starting at
                             * METADATA(), and leave the sub-cursor on the
                             * page's last key.
                             */
                            mx = &mc->mc_xcursor->mx_cursor;
                            data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
                                   mx->mc_db->md_pad;
                            data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
                            mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
                     } else {
                            rc = MDB_NOTFOUND;
                     }
              }
              break;
       case MDB_NEXT:
       case MDB_NEXT_DUP:
       case MDB_NEXT_NODUP:
              /* Unpositioned cursors start at the first item */
              if (!(mc->mc_flags & C_INITIALIZED))
                     rc = mdb_cursor_first(mc, key, data);
              else
                     rc = mdb_cursor_next(mc, key, data, op);
              break;
       case MDB_PREV:
       case MDB_PREV_DUP:
       case MDB_PREV_NODUP:
              /* Unpositioned or EOF cursors start from the last item */
              if (!(mc->mc_flags & C_INITIALIZED) || (mc->mc_flags & C_EOF))
                     rc = mdb_cursor_last(mc, key, data);
              else
                     rc = mdb_cursor_prev(mc, key, data, op);
              break;
       case MDB_FIRST:
              rc = mdb_cursor_first(mc, key, data);
              break;
       case MDB_FIRST_DUP:
              /* Needs MDB_DUPSORT plus positioned main and sub-cursors */
              if (data == NULL ||
                     !(mc->mc_db->md_flags & MDB_DUPSORT) ||
                     !(mc->mc_flags & C_INITIALIZED) ||
                     !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
                     rc = EINVAL;
                     break;
              }
              /* Operates on the sub-cursor only; key is not written */
              rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
              break;
       case MDB_LAST:
              rc = mdb_cursor_last(mc, key, data);
              break;
       case MDB_LAST_DUP:
              /* Needs MDB_DUPSORT plus positioned main and sub-cursors */
              if (data == NULL ||
                     !(mc->mc_db->md_flags & MDB_DUPSORT) ||
                     !(mc->mc_flags & C_INITIALIZED) ||
                     !(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
                     rc = EINVAL;
                     break;
              }
              /* Operates on the sub-cursor only; key is not written */
              rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
              break;
       default:
              DPRINTF("unhandled/unimplemented cursor operation %u", op);
              rc = EINVAL;
              break;
       }

       return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static void mdb_cursor_init ( MDB_cursor mc,
MDB_txn txn,
MDB_dbi  dbi,
MDB_xcursor mx 
) [static]

Initialize a cursor for a given transaction and database.

Definition at line 4951 of file mdb.c.

{
       /* Start with an empty page stack and no flags */
       mc->mc_flags = 0;
       mc->mc_top = 0;
       mc->mc_snum = 0;
       mc->mc_orig = NULL;

       /* Bind the cursor to the transaction's per-database records */
       mc->mc_txn = txn;
       mc->mc_dbi = dbi;
       mc->mc_db = &txn->mt_dbs[dbi];
       mc->mc_dbx = &txn->mt_dbxs[dbi];
       mc->mc_dbflag = &txn->mt_dbflags[dbi];

       /* A sub-cursor is attached only for MDB_DUPSORT databases, in
        * which case the caller must have supplied one.
        */
       mc->mc_xcursor = NULL;
       if (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) {
              assert(mx != NULL);
              mc->mc_xcursor = mx;
              mdb_xcursor_init0(mc);
       }
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_last ( MDB_cursor mc,
MDB_val key,
MDB_val data 
) [static]

Move the cursor to the last item in the database.

Definition at line 3952 of file mdb.c.

{
       MDB_page      *mp;
       MDB_node      *leaf;
       MDB_val       lkey;
       unsigned int   last_idx;
       int            rc;

       /* Search key with no data and a size one past MAXKEYSIZE */
       lkey.mv_data = NULL;
       lkey.mv_size = MAXKEYSIZE+1;

       /* Search again unless the cursor is already initialized with
        * mc_top at 0.
        */
       if (mc->mc_top || !(mc->mc_flags & C_INITIALIZED)) {
              rc = mdb_page_search(mc, &lkey, 0);
              if (rc != MDB_SUCCESS)
                     return rc;
       }
       assert(IS_LEAF(mc->mc_pg[mc->mc_top]));

       /* Position on the final slot of the top page */
       mp = mc->mc_pg[mc->mc_top];
       last_idx = NUMKEYS(mp) - 1;
       leaf = NODEPTR(mp, last_idx);
       mc->mc_flags = (mc->mc_flags | C_INITIALIZED) & ~C_EOF;
       mc->mc_ki[mc->mc_top] = last_idx;

       /* LEAF2 pages: the key is md_pad bytes wide and no data is set */
       if (IS_LEAF2(mp)) {
              key->mv_size = mc->mc_db->md_pad;
              key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
              return MDB_SUCCESS;
       }

       if (data != NULL) {
              if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                     /* Plain node: any sub-cursor is marked unpositioned */
                     if (mc->mc_xcursor)
                            mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                     rc = mdb_node_read(mc->mc_txn, leaf, data);
                     if (rc != MDB_SUCCESS)
                            return rc;
              } else {
                     /* Duplicates: data comes from the sub-cursor's last item */
                     mdb_xcursor_init1(mc, leaf);
                     rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
                     if (rc)
                            return rc;
              }
       }

       MDB_SET_KEY(leaf, key);
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static void mdb_cursor_merge ( MDB_txn txn) [static]

Merge shadow cursors back into parent's.

Definition at line 1503 of file mdb.c.

{
       MDB_dbi dbi;

       for (dbi = 0; dbi < txn->mt_numdbs; dbi++) {
              MDB_cursor *cur;

              /* Drain this database's cursor list from the head */
              while ((cur = txn->mt_cursors[dbi]) != NULL) {
                     txn->mt_cursors[dbi] = cur->mc_next;

                     if (cur->mc_flags & C_SHADOW) {
                            /* Copy the shadow's stack position back
                             * into the cursor it was made from.
                             */
                            MDB_cursor *orig = cur->mc_orig;
                            unsigned int lvl;

                            orig->mc_top = cur->mc_top;
                            orig->mc_snum = cur->mc_snum;
                            for (lvl = 0; lvl < cur->mc_snum; lvl++) {
                                   orig->mc_ki[lvl] = cur->mc_ki[lvl];
                                   orig->mc_pg[lvl] = cur->mc_pg[lvl];
                            }
                     }

                     /* Heap-allocated cursors are released once unlinked */
                     if (cur->mc_flags & C_ALLOCD)
                            free(cur);
              }
       }
}

Here is the caller graph for this function:

static int mdb_cursor_next ( MDB_cursor mc,
MDB_val key,
MDB_val data,
MDB_cursor_op  op 
) [static]

Move the cursor to the next data item.

Definition at line 3596 of file mdb.c.

{
       MDB_page      *mp;
       MDB_node      *leaf;
       int rc;

       /* Already past the end: nothing further to return */
       if (mc->mc_flags & C_EOF) {
              return MDB_NOTFOUND;
       }

       assert(mc->mc_flags & C_INITIALIZED);

       mp = mc->mc_pg[mc->mc_top];

       if (mc->mc_db->md_flags & MDB_DUPSORT) {
              leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
              if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                     if (op == MDB_NEXT || op == MDB_NEXT_DUP) {
                            /* Step within the current key's duplicates first */
                            rc = mdb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_NEXT);
                            /* MDB_NEXT_DUP always returns here; MDB_NEXT
                             * only moves on to the next key when the
                             * sub-cursor step failed.
                             */
                            if (op != MDB_NEXT || rc == MDB_SUCCESS)
                                   return rc;
                     }
              } else {
                     /* No duplicates on this node */
                     mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                     if (op == MDB_NEXT_DUP)
                            return MDB_NOTFOUND;
              }
       }

       DPRINTF("cursor_next: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);

       if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
              DPUTS("=====> move to next sibling page");
              if (mdb_cursor_sibling(mc, 1) != MDB_SUCCESS) {
                     /* No further sibling: mark EOF and unpositioned */
                     mc->mc_flags |= C_EOF;
                     mc->mc_flags &= ~C_INITIALIZED;
                     return MDB_NOTFOUND;
              }
              mp = mc->mc_pg[mc->mc_top];
              DPRINTF("next page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
       } else
              mc->mc_ki[mc->mc_top]++;

       DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
           mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);

       /* LEAF2 pages: the key is md_pad bytes wide and no data is set */
       if (IS_LEAF2(mp)) {
              key->mv_size = mc->mc_db->md_pad;
              key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
              return MDB_SUCCESS;
       }

       assert(IS_LEAF(mp));
       leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

       if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
              mdb_xcursor_init1(mc, leaf);
       }
       if (data) {
              /* The assignment is parenthesized so that rc holds the
               * code returned by mdb_node_read(), not the result of
               * the comparison.
               */
              if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
                     return rc;

              if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                     /* New key with duplicates: start at its first item */
                     rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
                     if (rc != MDB_SUCCESS)
                            return rc;
              }
       }

       MDB_SET_KEY(leaf, key);
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_cursor_open ( MDB_txn txn,
MDB_dbi  dbi,
MDB_cursor **  cursor 
)

Create a cursor handle.

Cursors are associated with a specific transaction and database and may not span threads.

Parameters:
[in]txnA transaction handle returned by mdb_txn_begin()
[in]dbiA database handle returned by mdb_open()
[out]cursorAddress where the new MDB_cursor handle will be stored
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified.

Definition at line 4972 of file mdb.c.

{
       MDB_cursor    *mc;
       MDB_xcursor   *mx = NULL;
       size_t size;
       int dupsort;

       /* Reject missing arguments and out-of-range handles. Handle 0
        * is the freelist, which may only be opened from a read-only
        * transaction.
        */
       if (txn == NULL || ret == NULL || dbi >= txn->mt_numdbs ||
              (!dbi && !F_ISSET(txn->mt_flags, MDB_TXN_RDONLY)))
              return EINVAL;

       /* MDB_DUPSORT databases get an MDB_xcursor in the same
        * allocation, placed directly after the MDB_cursor.
        */
       dupsort = (txn->mt_dbs[dbi].md_flags & MDB_DUPSORT) != 0;
       size = sizeof(MDB_cursor);
       if (dupsort)
              size += sizeof(MDB_xcursor);

       mc = malloc(size);
       if (mc == NULL)
              return ENOMEM;

       if (dupsort)
              mx = (MDB_xcursor *)(mc + 1);
       mdb_cursor_init(mc, txn, dbi, mx);

       /* Link at the head of the per-database list, if one is kept */
       if (txn->mt_cursors) {
              mc->mc_next = txn->mt_cursors[dbi];
              txn->mt_cursors[dbi] = mc;
       }
       mc->mc_flags |= C_ALLOCD;

       *ret = mc;
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static void mdb_cursor_pop ( MDB_cursor mc) [static]

Pop a page off the top of the cursor's stack.

Definition at line 3274 of file mdb.c.

{
       MDB_page      *popped;

       /* Nothing to do on an empty stack */
       if (mc->mc_snum == 0)
              return;

       /* Remember the page being removed; it is only used for tracing */
       popped = mc->mc_pg[mc->mc_top];

       /* mc_top is decremented only while entries remain */
       if (--mc->mc_snum != 0)
              mc->mc_top--;

       DPRINTF("popped page %zu off db %u cursor %p", popped->mp_pgno,
              mc->mc_dbi, (void *) mc);
}

Here is the caller graph for this function:

static int mdb_cursor_prev ( MDB_cursor mc,
MDB_val key,
MDB_val data,
MDB_cursor_op  op 
) [static]

Move the cursor to the previous data item.

Definition at line 3671 of file mdb.c.

{
       MDB_page      *mp;
       MDB_node      *leaf;
       int rc;

       assert(mc->mc_flags & C_INITIALIZED);

       mp = mc->mc_pg[mc->mc_top];

       if (mc->mc_db->md_flags & MDB_DUPSORT) {
              leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
              if (op == MDB_PREV || op == MDB_PREV_DUP) {
                     if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                            /* Step within the current key's duplicates first */
                            rc = mdb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_PREV);
                            /* MDB_PREV_DUP always returns here; MDB_PREV
                             * only moves on to the previous key when the
                             * sub-cursor step failed.
                             */
                            if (op != MDB_PREV || rc == MDB_SUCCESS)
                                   return rc;
                     } else {
                            /* No duplicates on this node */
                            mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                            if (op == MDB_PREV_DUP)
                                   return MDB_NOTFOUND;
                     }
              }
       }

       DPRINTF("cursor_prev: top page is %zu in cursor %p", mp->mp_pgno, (void *) mc);

       if (mc->mc_ki[mc->mc_top] == 0)  {
              DPUTS("=====> move to prev sibling page");
              if (mdb_cursor_sibling(mc, 0) != MDB_SUCCESS) {
                     /* No earlier sibling: the cursor becomes unpositioned */
                     mc->mc_flags &= ~C_INITIALIZED;
                     return MDB_NOTFOUND;
              }
              mp = mc->mc_pg[mc->mc_top];
              /* Continue from the last key of the sibling page */
              mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
              DPRINTF("prev page is %zu, key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]);
       } else
              mc->mc_ki[mc->mc_top]--;

       mc->mc_flags &= ~C_EOF;

       DPRINTF("==> cursor points to page %zu with %u keys, key index %u",
           mp->mp_pgno, NUMKEYS(mp), mc->mc_ki[mc->mc_top]);

       /* LEAF2 pages: the key is md_pad bytes wide and no data is set */
       if (IS_LEAF2(mp)) {
              key->mv_size = mc->mc_db->md_pad;
              key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
              return MDB_SUCCESS;
       }

       assert(IS_LEAF(mp));
       leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

       if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
              mdb_xcursor_init1(mc, leaf);
       }
       if (data) {
              /* The assignment is parenthesized so that rc holds the
               * code returned by mdb_node_read(), not the result of
               * the comparison.
               */
              if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
                     return rc;

              if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                     /* New key with duplicates: start at its last item */
                     rc = mdb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
                     if (rc != MDB_SUCCESS)
                            return rc;
              }
       }

       MDB_SET_KEY(leaf, key);
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_push ( MDB_cursor mc,
MDB_page mp 
) [static]

Push a page onto the top of the cursor's stack.

Definition at line 3291 of file mdb.c.

{
       /* Append page mp as the new top entry of the cursor's page stack.
        * The key index for the new level starts at 0.
        * Returns MDB_SUCCESS, or ENOMEM when the stack already holds
        * CURSOR_STACK entries.
        */
       DPRINTF("pushing page %zu on db %u cursor %p", mp->mp_pgno,
              mc->mc_dbi, (void *) mc);

       if (mc->mc_snum < CURSOR_STACK) {
              /* The old element count is the index of the new top slot. */
              mc->mc_top = mc->mc_snum;
              mc->mc_snum++;
              mc->mc_ki[mc->mc_top] = 0;
              mc->mc_pg[mc->mc_top] = mp;
              return MDB_SUCCESS;
       }

       /* Stack is full: trap in builds with assertions enabled,
        * report an error otherwise.
        */
       assert(mc->mc_snum < CURSOR_STACK);
       return ENOMEM;
}

Here is the caller graph for this function:

int mdb_cursor_put ( MDB_cursor cursor,
MDB_val key,
MDB_val data,
unsigned int  flags 
)

Store by cursor.

This function stores key/data pairs into the database. If the function fails for any reason, the state of the cursor will be unchanged. If the function succeeds and an item is inserted into the database, the cursor is always positioned to refer to the newly inserted item.

Parameters:
[in] cursor: A cursor handle returned by mdb_cursor_open()
[in] key: The key operated on.
[in] data: The data operated on.
[in] flags: Options for this operation. This parameter must be set to 0 or one of the values described here.
  • MDB_CURRENT - overwrite the data of the key/data pair to which the cursor refers with the specified data item. The key parameter is ignored.
  • MDB_NODUPDATA - enter the new key/data pair only if it does not already appear in the database. This flag may only be specified if the database was opened with MDB_DUPSORT. The function will return MDB_KEYEXIST if the key/data pair already appears in the database.
  • MDB_NOOVERWRITE - enter the new key/data pair only if the key does not already appear in the database. The function will return MDB_KEYEXIST if the key already appears in the database, even if the database supports duplicates (MDB_DUPSORT).

Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EACCES - an attempt was made to modify a read-only database.
  • EINVAL - an invalid parameter was specified.

Definition at line 4138 of file mdb.c.

{
       /* Store a key/data pair through cursor mc.
        *
        * Flow, as implemented below:
        *  1. Reject read-only transactions (EACCES).
        *  2. Position the cursor: MDB_CURRENT reuses the current position,
        *     an empty tree gets a fresh root leaf, otherwise the key is
        *     looked up with mdb_cursor_set().
        *  3. Make every page on the cursor stack writable.
        *  4. Replace or insert the node. For MDB_DUPSORT databases a single
        *     data item may be converted into an in-node sub-page, and a
        *     sub-page into a separate sub-DB.
        *  5. For duplicates, recurse through the xcursor to store the data
        *     item(s) as keys of the sub-page/sub-DB.
        *
        * Returns MDB_SUCCESS (0) or a non-zero error code. If the node
        * insert/split fails, MDB_TXN_ERROR is set on the transaction.
        *
        * NOTE(review): flags is compared with "== MDB_CURRENT" in some
        * places and tested with "& MDB_CURRENT" in others, so MDB_CURRENT
        * combined with other flags takes a different path than MDB_CURRENT
        * alone - confirm this is intended.
        * NOTE(review): key is dereferenced on several paths that can be
        * reached with MDB_CURRENT (LEAF2 size check, new_sub), so callers
        * apparently must still pass a valid key - confirm against the API
        * documentation.
        */
       MDB_node      *leaf = NULL;
       MDB_val       xdata, *rdata, dkey;
       MDB_page      *fp;
       MDB_db dummy;
       /* Set when the node just written holds a new/grown sub-page or
        * sub-DB record, so the user data still has to be stored in it.
        */
       int do_sub = 0;
       /* Number of items stored so far when MDB_MULTIPLE is used. */
       unsigned int mcount = 0;
       size_t nsize;
       int rc, rc2;
       /* Stack buffer used to build a temporary ("fake") sub-page. */
       MDB_pagebuf pbuf;
       /* Holds a copy of the previous single data item while its node is
        * deleted and re-created as a sub-page.
        */
       char dbuf[MAXKEYSIZE+1];
       unsigned int nflags;
       DKBUF;

       if (F_ISSET(mc->mc_txn->mt_flags, MDB_TXN_RDONLY))
              return EACCES;

       DPRINTF("==> put db %u key [%s], size %zu, data size %zu",
              mc->mc_dbi, DKEY(key), key ? key->mv_size:0, data->mv_size);

       /* A non-zero dkey.mv_size later signals "an old single item was
        * converted and must be written into the sub-page first".
        */
       dkey.mv_size = 0;

       if (flags == MDB_CURRENT) {
              /* Overwrite in place: the cursor must already be positioned. */
              if (!(mc->mc_flags & C_INITIALIZED))
                     return EINVAL;
              rc = MDB_SUCCESS;
       } else if (mc->mc_db->md_root == P_INVALID) {
              MDB_page *np;
              /* new database, write a root leaf page */
              DPUTS("allocating new root leaf page");
              if ((np = mdb_page_new(mc, P_LEAF, 1)) == NULL) {
                     return ENOMEM;
              }
              mc->mc_snum = 0;
              /* NOTE(review): the return value of mdb_cursor_push() is not
               * checked; the stack was just emptied, so a full-stack
               * failure should not be possible here.
               */
              mdb_cursor_push(mc, np);
              mc->mc_db->md_root = np->mp_pgno;
              mc->mc_db->md_depth++;
              *mc->mc_dbflag = DB_DIRTY;
              /* DUPFIXED without DUPSORT: the leaf stores fixed-size keys
               * only (LEAF2 layout).
               */
              if ((mc->mc_db->md_flags & (MDB_DUPSORT|MDB_DUPFIXED))
                     == MDB_DUPFIXED)
                     np->mp_flags |= P_LEAF2;
              mc->mc_flags |= C_INITIALIZED;
              rc = MDB_NOTFOUND;
              /* The new page is already writable, so skip the touch step. */
              goto top;
       } else {
              int exact = 0;
              MDB_val d2;
              rc = mdb_cursor_set(mc, key, &d2, MDB_SET, &exact);
              if ((flags & MDB_NOOVERWRITE) && rc == 0) {
                     DPRINTF("duplicate key [%s]", DKEY(key));
                     /* Hand the existing data back to the caller. */
                     *data = d2;
                     return MDB_KEYEXIST;
              }
              /* MDB_NOTFOUND just means "insert"; anything else is fatal. */
              if (rc && rc != MDB_NOTFOUND)
                     return rc;
       }

       /* Cursor is positioned, now make sure all pages are writable */
       rc2 = mdb_cursor_touch(mc);
       if (rc2)
              return rc2;

top:
       /* The key already exists */
       if (rc == MDB_SUCCESS) {
              /* there's only a key anyway, so this is a no-op */
              if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
                     unsigned int ksize = mc->mc_db->md_pad;
                     if (key->mv_size != ksize)
                            return EINVAL;
                     if (flags == MDB_CURRENT) {
                            /* Overwrite the fixed-size key slot in place. */
                            char *ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
                            memcpy(ptr, key->mv_data, ksize);
                     }
                     return MDB_SUCCESS;
              }

              leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);

              /* DB has dups? */
              if (F_ISSET(mc->mc_db->md_flags, MDB_DUPSORT)) {
                     /* Was a single item before, must convert now */
more:
                     if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                            /* Just overwrite the current item */
                            if (flags == MDB_CURRENT)
                                   goto current;

                            dkey.mv_size = NODEDSZ(leaf);
                            dkey.mv_data = NODEDATA(leaf);
#if UINT_MAX < SIZE_MAX
                            /* Switch the integer comparator when the items
                             * turn out to be size_t-sized rather than
                             * int-sized.
                             */
                            if (mc->mc_dbx->md_dcmp == mdb_cmp_int && dkey.mv_size == sizeof(size_t))
#ifdef MISALIGNED_OK
                                   mc->mc_dbx->md_dcmp = mdb_cmp_long;
#else
                                   mc->mc_dbx->md_dcmp = mdb_cmp_cint;
#endif
#endif
                            /* if data matches, ignore it */
                            if (!mc->mc_dbx->md_dcmp(data, &dkey))
                                   return (flags == MDB_NODUPDATA) ? MDB_KEYEXIST : MDB_SUCCESS;

                            /* create a fake page for the dup items */
                            /* Save the old item first: its node is deleted
                             * below. NOTE(review): assumes dkey.mv_size fits
                             * in dbuf (MAXKEYSIZE+1 bytes); presumably dup
                             * data sizes are limited elsewhere - confirm.
                             */
                            memcpy(dbuf, dkey.mv_data, dkey.mv_size);
                            dkey.mv_data = dbuf;
                            fp = (MDB_page *)&pbuf;
                            fp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
                            fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
                            fp->mp_lower = PAGEHDRSZ;
                            /* Sub-page size: header plus room for the old
                             * and the new item.
                             */
                            fp->mp_upper = PAGEHDRSZ + dkey.mv_size + data->mv_size;
                            if (mc->mc_db->md_flags & MDB_DUPFIXED) {
                                   fp->mp_flags |= P_LEAF2;
                                   fp->mp_pad = data->mv_size;
                            } else {
                                   /* Variable-size items also need two index
                                    * slots, two node headers and padding for
                                    * odd sizes.
                                    */
                                   fp->mp_upper += 2 * sizeof(indx_t) + 2 * NODESIZE +
                                          (dkey.mv_size & 1) + (data->mv_size & 1);
                            }
                            mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
                            do_sub = 1;
                            /* The node's data is now the (still empty)
                             * sub-page image.
                             */
                            rdata = &xdata;
                            xdata.mv_size = fp->mp_upper;
                            xdata.mv_data = fp;
                            flags |= F_DUPDATA;
                            goto new_sub;
                     }
                     if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
                            /* See if we need to convert from fake page to subDB */
                            MDB_page *mp;
                            unsigned int offset;
                            unsigned int i;

                            fp = NODEDATA(leaf);
                            if (flags == MDB_CURRENT) {
                                   /* Overwrite inside the existing sub-page. */
                                   fp->mp_flags |= P_DIRTY;
                                   COPY_PGNO(fp->mp_pgno, mc->mc_pg[mc->mc_top]->mp_pgno);
                                   mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
                                   flags |= F_DUPDATA;
                                   goto put_sub;
                            }
                            /* offset = extra bytes the sub-page needs for
                             * one more item, rounded up to an even number.
                             */
                            if (mc->mc_db->md_flags & MDB_DUPFIXED) {
                                   offset = fp->mp_pad;
                            } else {
                                   offset = NODESIZE + sizeof(indx_t) + data->mv_size;
                            }
                            offset += offset & 1;
                            /* Would the grown node exceed the per-node share
                             * of a page (usable page size / MDB_MINKEYS)?
                             */
                            if (NODESIZE + sizeof(indx_t) + NODEKSZ(leaf) + NODEDSZ(leaf) +
                                   offset >= (mc->mc_txn->mt_env->me_psize - PAGEHDRSZ) /
                                          MDB_MINKEYS) {
                                   /* yes, convert it */
                                   dummy.md_flags = 0;
                                   if (mc->mc_db->md_flags & MDB_DUPFIXED) {
                                          dummy.md_pad = fp->mp_pad;
                                          dummy.md_flags = MDB_DUPFIXED;
                                          if (mc->mc_db->md_flags & MDB_INTEGERDUP)
                                                 dummy.md_flags |= MDB_INTEGERKEY;
                                   }
                                   dummy.md_depth = 1;
                                   dummy.md_branch_pages = 0;
                                   dummy.md_leaf_pages = 1;
                                   dummy.md_overflow_pages = 0;
                                   dummy.md_entries = NUMKEYS(fp);
                                   /* The node will now store an MDB_db record
                                    * describing the sub-DB.
                                    */
                                   rdata = &xdata;
                                   xdata.mv_size = sizeof(MDB_db);
                                   xdata.mv_data = &dummy;
                                   mp = mdb_page_alloc(mc, 1);
                                   if (!mp)
                                          return ENOMEM;
                                   /* Items move to the end of a full-size
                                    * page; shift offsets accordingly.
                                    */
                                   offset = mc->mc_txn->mt_env->me_psize - NODEDSZ(leaf);
                                   flags |= F_DUPDATA|F_SUBDATA;
                                   dummy.md_root = mp->mp_pgno;
                            } else {
                                   /* no, just grow it */
                                   rdata = &xdata;
                                   xdata.mv_size = NODEDSZ(leaf) + offset;
                                   xdata.mv_data = &pbuf;
                                   mp = (MDB_page *)&pbuf;
                                   mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
                                   flags |= F_DUPDATA;
                            }
                            /* Copy the old sub-page contents into mp, moving
                             * the item area up by offset bytes.
                             */
                            mp->mp_flags = fp->mp_flags | P_DIRTY;
                            mp->mp_pad   = fp->mp_pad;
                            mp->mp_lower = fp->mp_lower;
                            mp->mp_upper = fp->mp_upper + offset;
                            if (IS_LEAF2(fp)) {
                                   memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
                            } else {
                                   nsize = NODEDSZ(leaf) - fp->mp_upper;
                                   memcpy((char *)mp + mp->mp_upper, (char *)fp + fp->mp_upper, nsize);
                                   for (i=0; i<NUMKEYS(fp); i++)
                                          mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
                            }
                            mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
                            do_sub = 1;
                            goto new_sub;
                     }
                     /* data is on sub-DB, just store it */
                     flags |= F_DUPDATA|F_SUBDATA;
                     goto put_sub;
              }
current:
              /* overflow page overwrites need special handling */
              if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
                     MDB_page *omp;
                     pgno_t pg;
                     int ovpages, dpages;

                     ovpages = OVPAGES(NODEDSZ(leaf), mc->mc_txn->mt_env->me_psize);
                     dpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
                     /* The node's data area holds the overflow page number. */
                     memcpy(&pg, NODEDATA(leaf), sizeof(pg));
                     /* Do not dereference omp if the page lookup failed;
                      * it would be uninitialized.
                      */
                     rc2 = mdb_page_get(mc->mc_txn, pg, &omp);
                     if (rc2)
                            return rc2;
                     /* Is the ov page writable and large enough? */
                     if ((omp->mp_flags & P_DIRTY) && ovpages >= dpages) {
                            /* yes, overwrite it. Note in this case we don't
                             * bother to try shrinking the node if the new data
                             * is smaller than the overflow threshold.
                             */
                            if (F_ISSET(flags, MDB_RESERVE))
                                   data->mv_data = METADATA(omp);
                            else
                                   memcpy(METADATA(omp), data->mv_data, data->mv_size);
                            goto done;
                     } else {
                            /* no, free ovpages */
                            int i;
                            mc->mc_db->md_overflow_pages -= ovpages;
                            for (i=0; i<ovpages; i++) {
                                   DPRINTF("freed ov page %zu", pg);
                                   mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
                                   pg++;
                            }
                     }
              } else if (NODEDSZ(leaf) == data->mv_size) {
                     /* same size, just replace it. Note that we could
                      * also reuse this node if the new data is smaller,
                      * but instead we opt to shrink the node in that case.
                      */
                     if (F_ISSET(flags, MDB_RESERVE))
                            data->mv_data = NODEDATA(leaf);
                     else
                            memcpy(NODEDATA(leaf), data->mv_data, data->mv_size);
                     goto done;
              }
              /* Different size: drop the old node and re-insert below. The
               * entry count is decremented here and incremented again after
               * a successful insert.
               */
              mdb_node_del(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], 0);
              mc->mc_db->md_entries--;
       } else {
              DPRINTF("inserting key at index %i", mc->mc_ki[mc->mc_top]);
       }

       /* Plain insert: the node stores the caller's data directly. */
       rdata = data;

new_sub:
       /* Only the flags in NODE_ADD_FLAGS are passed on to the node layer. */
       nflags = flags & NODE_ADD_FLAGS;
       nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : mdb_leaf_size(mc->mc_txn->mt_env, key, rdata);
       if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
              /* Not enough room: split the page. MDB_APPEND is dropped when
               * the node being re-added is a sub-page.
               */
              if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
                     nflags &= ~MDB_APPEND;
              rc = mdb_page_split(mc, key, rdata, P_INVALID, nflags);
       } else {
              /* There is room already in this leaf page. */
              rc = mdb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
              if (rc == 0 && !do_sub) {
                     /* Adjust other cursors pointing to mp */
                     MDB_cursor *m2, *m3;
                     MDB_dbi dbi = mc->mc_dbi;
                     unsigned i = mc->mc_top;
                     MDB_page *mp = mc->mc_pg[i];

                     /* A sub-cursor is tracked under the dbi preceding its
                      * own (see the decrement below).
                      */
                     if (mc->mc_flags & C_SUB)
                            dbi--;

                     for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
                            if (mc->mc_flags & C_SUB)
                                   m3 = &m2->mc_xcursor->mx_cursor;
                            else
                                   m3 = m2;
                            if (m3 == mc || m3->mc_snum < mc->mc_snum) continue;
                            /* Cursors at or after the insertion point on
                             * this page shift right by one.
                             */
                            if (m3->mc_pg[i] == mp && m3->mc_ki[i] >= mc->mc_ki[i]) {
                                   m3->mc_ki[i]++;
                            }
                     }
              }
       }

       if (rc != MDB_SUCCESS)
              mc->mc_txn->mt_flags |= MDB_TXN_ERROR;
       else {
              /* Now store the actual data in the child DB. Note that we're
               * storing the user data in the keys field, so there are strict
               * size limits on dupdata. The actual data fields of the child
               * DB are all zero size.
               */
              if (do_sub) {
                     int xflags;
put_sub:
                     xdata.mv_size = 0;
                     xdata.mv_data = "";
                     leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
                     if (flags & MDB_CURRENT) {
                            xflags = MDB_CURRENT;
                     } else {
                            mdb_xcursor_init1(mc, leaf);
                            /* "No duplicate data" maps to "no overwrite" in
                             * the sub-DB, where the data is the key.
                             */
                            xflags = (flags & MDB_NODUPDATA) ? MDB_NOOVERWRITE : 0;
                     }
                     /* converted, write the original data first */
                     if (dkey.mv_size) {
                            rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
                            if (rc)
                                   return rc;
                            {
                                   /* Adjust other cursors pointing to mp */
                                   MDB_cursor *m2;
                                   unsigned i = mc->mc_top;
                                   MDB_page *mp = mc->mc_pg[i];

                                   for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
                                          if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
                                          /* Cursors on this same node get
                                           * their xcursor re-initialized for
                                           * the new sub-page.
                                           */
                                          if (m2->mc_pg[i] == mp && m2->mc_ki[i] == mc->mc_ki[i]) {
                                                 mdb_xcursor_init1(m2, leaf);
                                          }
                                   }
                            }
                     }
                     xflags |= (flags & MDB_APPEND);
                     rc = mdb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
                     if (flags & F_SUBDATA) {
                            /* Write the updated sub-DB record back into the
                             * parent node.
                             */
                            void *db = NODEDATA(leaf);
                            memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDB_db));
                     }
              }
              /* sub-writes might have failed so check rc again.
               * Don't increment count if we just replaced an existing item.
               */
              if (!rc && !(flags & MDB_CURRENT))
                     mc->mc_db->md_entries++;
              if (flags & MDB_MULTIPLE) {
                     /* data[1].mv_size is the number of items to store;
                      * data[0] is advanced by one item size per round.
                      */
                     mcount++;
                     if (mcount < data[1].mv_size) {
                            data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
                            leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
                            goto more;
                     }
              }
       }
done:
       return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_set ( MDB_cursor mc,
MDB_val key,
MDB_val data,
MDB_cursor_op  op,
int exactp 
) [static]

Set the cursor on a specific data item.

Definition at line 3744 of file mdb.c.

{
       /* Position cursor mc on the item identified by key (and, for the
        * MDB_GET_BOTH* ops, data).
        *
        * mc      cursor to position.
        * key     key to look for; must be non-NULL with a non-zero size.
        *         Overwritten with the stored key for LEAF2 pages and for
        *         MDB_SET_RANGE.
        * data    if non-NULL, receives the data of the item found; for
        *         MDB_GET_BOTH / MDB_GET_BOTH_RANGE it also supplies the
        *         data to match.
        * op      MDB_SET, MDB_SET_RANGE, MDB_GET_BOTH or
        *         MDB_GET_BOTH_RANGE (the ops tested below).
        * exactp  if non-NULL, an exact key match is required and *exactp
        *         reports whether one was found; if NULL, the nearest
        *         following key is accepted.
        *
        * Returns MDB_SUCCESS, MDB_NOTFOUND, or an error from the page
        * and node helpers.
        */
       int            rc;
       MDB_page      *mp;
       MDB_node      *leaf;
       DKBUF;

       assert(mc);
       assert(key);
       assert(key->mv_size > 0);

       /* See if we're already on the right page */
       if (mc->mc_flags & C_INITIALIZED) {
              MDB_val nodekey;

              mp = mc->mc_pg[mc->mc_top];
              if (!NUMKEYS(mp)) {
                     mc->mc_ki[mc->mc_top] = 0;
                     return MDB_NOTFOUND;
              }
              /* Compare against the first key of the current page. */
              if (mp->mp_flags & P_LEAF2) {
                     nodekey.mv_size = mc->mc_db->md_pad;
                     nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
              } else {
                     leaf = NODEPTR(mp, 0);
                     MDB_SET_KEY(leaf, &nodekey);
              }
              rc = mc->mc_dbx->md_cmp(key, &nodekey);
              if (rc == 0) {
                     /* Probably happens rarely, but first node on the page
                      * was the one we wanted.
                      */
                     mc->mc_ki[mc->mc_top] = 0;
                     leaf = NODEPTR(mp, 0);
                     if (exactp)
                            *exactp = 1;
                     goto set1;
              }
              if (rc > 0) {
                     /* Key sorts after the first entry: check the last one. */
                     unsigned int i;
                     unsigned int nkeys = NUMKEYS(mp);
                     if (nkeys > 1) {
                            if (mp->mp_flags & P_LEAF2) {
                                   nodekey.mv_data = LEAF2KEY(mp,
                                           nkeys-1, nodekey.mv_size);
                            } else {
                                   leaf = NODEPTR(mp, nkeys-1);
                                   MDB_SET_KEY(leaf, &nodekey);
                            }
                            rc = mc->mc_dbx->md_cmp(key, &nodekey);
                            if (rc == 0) {
                                   /* last node was the one we wanted */
                                   mc->mc_ki[mc->mc_top] = nkeys-1;
                                   leaf = NODEPTR(mp, nkeys-1);
                                   if (exactp)
                                          *exactp = 1;
                                   goto set1;
                            }
                            if (rc < 0) {
                                   /* This is definitely the right page, skip search_page */
                                   rc = 0;
                                   goto set2;
                            }
                     }
                     /* If any parents have right-sibs, search.
                      * Otherwise, there's nothing further.
                      */
                     for (i=0; i<mc->mc_top; i++)
                            if (mc->mc_ki[i] <
                                   NUMKEYS(mc->mc_pg[i])-1)
                                   break;
                     if (i == mc->mc_top) {
                            /* There are no other pages */
                            mc->mc_ki[mc->mc_top] = nkeys;
                            return MDB_NOTFOUND;
                     }
              }
              /* Key sorts before this page, or a right sibling exists:
               * with a single-level tree there is nowhere else to look.
               */
              if (!mc->mc_top) {
                     /* There are no other pages */
                     mc->mc_ki[mc->mc_top] = 0;
                     return MDB_NOTFOUND;
              }
       }

       /* Full descent from the root to the leaf that should hold key. */
       rc = mdb_page_search(mc, key, 0);
       if (rc != MDB_SUCCESS)
              return rc;

       mp = mc->mc_pg[mc->mc_top];
       assert(IS_LEAF(mp));

set2:
       leaf = mdb_node_search(mc, key, exactp);
       if (exactp != NULL && !*exactp) {
              /* MDB_SET specified and not an exact match. */
              return MDB_NOTFOUND;
       }

       if (leaf == NULL) {
              /* Key is greater than everything on this page: the nearest
               * match is the first entry of the right sibling.
               */
              DPUTS("===> inexact leaf not found, goto sibling");
              if ((rc = mdb_cursor_sibling(mc, 1)) != MDB_SUCCESS)
                     return rc;           /* no entries matched */
              mp = mc->mc_pg[mc->mc_top];
              assert(IS_LEAF(mp));
              leaf = NODEPTR(mp, 0);
       }

set1:
       mc->mc_flags |= C_INITIALIZED;
       mc->mc_flags &= ~C_EOF;

       if (IS_LEAF2(mp)) {
              /* LEAF2 pages hold keys only; return the stored key. */
              key->mv_size = mc->mc_db->md_pad;
              key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
              return MDB_SUCCESS;
       }

       if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
              mdb_xcursor_init1(mc, leaf);
       }
       if (data) {
              if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
                     if (op == MDB_SET || op == MDB_SET_RANGE) {
                            /* Return the first duplicate; the result is
                             * returned at the end of the function.
                             */
                            rc = mdb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
                     } else {
                            /* Look the data item up among the duplicates:
                             * exact match for MDB_GET_BOTH, nearest match
                             * otherwise.
                             */
                            int ex2, *ex2p;
                            if (op == MDB_GET_BOTH) {
                                   ex2p = &ex2;
                                   ex2 = 0;
                            } else {
                                   ex2p = NULL;
                            }
                            rc = mdb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_SET_RANGE, ex2p);
                            if (rc != MDB_SUCCESS)
                                   return rc;
                     }
              } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) {
                     /* Single data item: compare it with the requested one. */
                     MDB_val d2;
                     if ((rc = mdb_node_read(mc->mc_txn, leaf, &d2)) != MDB_SUCCESS)
                            return rc;
                     rc = mc->mc_dbx->md_dcmp(data, &d2);
                     if (rc) {
                            if (op == MDB_GET_BOTH || rc > 0)
                                   return MDB_NOTFOUND;
                            /* MDB_GET_BOTH_RANGE with stored data greater
                             * than the requested data is a match. Do not
                             * leak the comparator's negative result as the
                             * status code, and hand back the stored item as
                             * the dup path above does.
                             */
                            rc = MDB_SUCCESS;
                            *data = d2;
                     }

              } else {
                     /* Not a dup node: any xcursor state is stale now. */
                     if (mc->mc_xcursor)
                            mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
                     if ((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)
                            return rc;
              }
       }

       /* The key already matches in all other cases */
       if (op == MDB_SET_RANGE)
              MDB_SET_KEY(leaf, key);
       DPRINTF("==> cursor placed on key [%s]", DKEY(key));

       return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_shadow ( MDB_txn src,
MDB_txn dst 
) [static]

Make shadow copies of all of parent txn's cursors.

Definition at line 1443 of file mdb.c.

{
       /* For every cursor open on a DB of txn src, allocate a copy bound
        * to txn dst, flag it C_SHADOW, remember the original in mc_orig
        * and link it into dst's per-DB cursor list.
        * Returns MDB_SUCCESS, or ENOMEM if an allocation fails (copies
        * made so far remain linked into dst).
        */
       MDB_cursor *shadow, *orig;
       unsigned int dbi, lvl, nbytes;

       for (dbi = 0; dbi < src->mt_numdbs; dbi++) {
              if (!src->mt_cursors[dbi])
                     continue;

              /* The allocation size is decided by the first cursor of the
               * list: with an xcursor, it is stored right after the cursor.
               */
              nbytes = sizeof(MDB_cursor);
              if (src->mt_cursors[dbi]->mc_xcursor)
                     nbytes += sizeof(MDB_xcursor);

              orig = src->mt_cursors[dbi];
              while (orig) {
                     shadow = malloc(nbytes);
                     if (!shadow)
                            return ENOMEM;

                     /* Binding: same DB slot, but dst's txn and DB records. */
                     shadow->mc_orig = orig;
                     shadow->mc_txn = dst;
                     shadow->mc_dbi = dbi;
                     shadow->mc_db = &dst->mt_dbs[dbi];
                     shadow->mc_dbx = orig->mc_dbx;
                     shadow->mc_dbflag = &dst->mt_dbflags[dbi];

                     /* Position: copy the page stack. */
                     shadow->mc_snum = orig->mc_snum;
                     shadow->mc_top = orig->mc_top;
                     shadow->mc_flags = orig->mc_flags | C_SHADOW;
                     for (lvl = 0; lvl < shadow->mc_snum; lvl++) {
                            shadow->mc_pg[lvl] = orig->mc_pg[lvl];
                            shadow->mc_ki[lvl] = orig->mc_ki[lvl];
                     }

                     if (!orig->mc_xcursor) {
                            shadow->mc_xcursor = NULL;
                     } else {
                            MDB_xcursor *sx = (MDB_xcursor *)(shadow+1);
                            MDB_xcursor *ox = orig->mc_xcursor;

                            shadow->mc_xcursor = sx;
                            sx->mx_db = ox->mx_db;
                            sx->mx_dbx = ox->mx_dbx;
                            sx->mx_dbflag = ox->mx_dbflag;

                            /* The embedded cursor points at the copy's own
                             * embedded DB records, not the original's.
                             */
                            sx->mx_cursor.mc_txn = dst;
                            sx->mx_cursor.mc_dbi = ox->mx_cursor.mc_dbi;
                            sx->mx_cursor.mc_db = &sx->mx_db;
                            sx->mx_cursor.mc_dbx = &sx->mx_dbx;
                            sx->mx_cursor.mc_dbflag = &sx->mx_dbflag;
                            sx->mx_cursor.mc_snum = ox->mx_cursor.mc_snum;
                            sx->mx_cursor.mc_top = ox->mx_cursor.mc_top;
                            sx->mx_cursor.mc_flags = ox->mx_cursor.mc_flags | C_SHADOW;
                            for (lvl = 0; lvl < ox->mx_cursor.mc_snum; lvl++) {
                                   sx->mx_cursor.mc_pg[lvl] = ox->mx_cursor.mc_pg[lvl];
                                   sx->mx_cursor.mc_ki[lvl] = ox->mx_cursor.mc_ki[lvl];
                            }
                     }

                     /* Push onto the head of dst's list for this DB. */
                     shadow->mc_next = dst->mt_cursors[dbi];
                     dst->mt_cursors[dbi] = shadow;

                     orig = orig->mc_next;
              }
       }
       return MDB_SUCCESS;
}

Here is the caller graph for this function:

static int mdb_cursor_sibling ( MDB_cursor mc,
int  move_right 
) [static]

Find a sibling for a page.

Replaces the page at the top of the cursor's stack with the specified sibling, if one exists.

Parameters:
[in] mc: The cursor for this operation.
[in] move_right: Non-zero if the right sibling is requested, otherwise the left sibling.
Returns:
0 on success, non-zero on failure.

Definition at line 3555 of file mdb.c.

{
       /* Replace the page on top of the cursor stack with its right
        * (move_right != 0) or left sibling. Pops to the parent, steps the
        * parent's key index (recursing upward when the parent is at its
        * edge), then pushes the child found there.
        * Returns MDB_SUCCESS, MDB_NOTFOUND when only the root is on the
        * stack, or an error from the recursion or mdb_page_get().
        */
       MDB_page      *sib;
       MDB_node      *pnode;
       int            err;
       int            at_edge;

       /* root has no siblings */
       if (mc->mc_snum < 2)
              return MDB_NOTFOUND;

       mdb_cursor_pop(mc);
       DPRINTF("parent page is page %zu, index %u",
              mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]);

       /* Is the parent already on its last (or first) key? */
       if (move_right)
              at_edge = (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]));
       else
              at_edge = (mc->mc_ki[mc->mc_top] == 0);

       if (!at_edge) {
              if (move_right)
                     mc->mc_ki[mc->mc_top]++;
              else
                     mc->mc_ki[mc->mc_top]--;
              DPRINTF("just moving to %s index key %u",
                  move_right ? "right" : "left", mc->mc_ki[mc->mc_top]);
       } else {
              DPRINTF("no more keys left, moving to %s sibling",
                  move_right ? "right" : "left");
              /* The parent itself must move to its sibling first. */
              err = mdb_cursor_sibling(mc, move_right);
              if (err != MDB_SUCCESS)
                     return err;
       }
       assert(IS_BRANCH(mc->mc_pg[mc->mc_top]));

       /* Descend into the child the parent's key index now refers to. */
       pnode = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
       err = mdb_page_get(mc->mc_txn, NODEPGNO(pnode), &sib);
       if (err)
              return err;

       mdb_cursor_push(mc, sib);

       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_cursor_touch ( MDB_cursor mc) [static]

Touch all the pages in the cursor stack.

  Makes sure all the pages are writable, before attempting a write operation.
Parameters:
[in] mc: The cursor to operate on.

Definition at line 4116 of file mdb.c.

{
       /* Call mdb_page_touch() for every level of the cursor stack, from
        * index 0 up to the top. For a DB other than the main one that is
        * not yet marked dirty, first searches the main DB for this DB's
        * name (modify flag set) and marks the DB dirty.
        * Returns MDB_SUCCESS or the first error encountered.
        */
       int err;

       if (mc->mc_dbi > MAIN_DBI) {
              if (!(*mc->mc_dbflag & DB_DIRTY)) {
                     MDB_cursor main_cur;

                     mdb_cursor_init(&main_cur, mc->mc_txn, MAIN_DBI, NULL);
                     err = mdb_page_search(&main_cur, &mc->mc_dbx->md_name, 1);
                     if (err)
                            return err;
                     *mc->mc_dbflag = DB_DIRTY;
              }
       }

       /* mc_top is used as the loop variable; mdb_page_touch() is called
        * once per value.
        */
       mc->mc_top = 0;
       while (mc->mc_top < mc->mc_snum) {
              err = mdb_page_touch(mc);
              if (err)
                     return err;
              mc->mc_top++;
       }

       /* Restore mc_top to the last stack entry. */
       mc->mc_top = mc->mc_snum-1;
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Return the cursor's transaction handle.

Parameters:
[in] cursor: A cursor handle returned by mdb_cursor_open()

Definition at line 5048 of file mdb.c.

{
       /* Return the transaction the cursor is bound to, or NULL when no
        * cursor is given.
        */
       return mc ? mc->mc_txn : NULL;
}
int mdb_dcmp ( MDB_txn txn,
MDB_dbi  dbi,
const MDB_val a,
const MDB_val b 
)

Compare two data items according to a particular database.

This returns a comparison as if the two items were data items of a sorted duplicates MDB_DUPSORT database.

Parameters:
[in] txn: A transaction handle returned by mdb_txn_begin()
[in] dbi: A database handle returned by mdb_open()
[in] a: The first item to compare
[in] b: The second item to compare
Returns:
< 0 if a < b, 0 if a == b, > 0 if a > b

Definition at line 1170 of file mdb.c.

{
       /* Compare a and b with the database's data comparator. When the
        * database has none, EINVAL is returned in place of a comparison
        * result.
        */
       if (!txn->mt_dbxs[dbi].md_dcmp)
              return EINVAL;       /* too bad you can't distinguish this from a valid result */

       return txn->mt_dbxs[dbi].md_dcmp(a, b);
}
static void mdb_default_cmp ( MDB_txn txn,
MDB_dbi  dbi 
) [static]

Set the default comparison functions for a database.

Called immediately after a database is opened to set the defaults. The user can then override them with mdb_set_compare() or mdb_set_dupsort().

Parameters:
[in]txnA transaction handle returned by mdb_txn_begin()
[in]dbiA database handle returned by mdb_open()

Definition at line 6099 of file mdb.c.

{
       MDB_dbx *dbx = &txn->mt_dbxs[dbi];
       unsigned int dbflags = txn->mt_dbs[dbi].md_flags;

       /* Key comparator: MDB_REVERSEKEY is tested before MDB_INTEGERKEY. */
       if (dbflags & MDB_REVERSEKEY) {
              dbx->md_cmp = mdb_cmp_memnr;
       } else if (dbflags & MDB_INTEGERKEY) {
              dbx->md_cmp = mdb_cmp_cint;
       } else {
              dbx->md_cmp = mdb_cmp_memn;
       }

       /* Without MDB_DUPSORT there is no data comparator at all. */
       if (!(dbflags & MDB_DUPSORT)) {
              dbx->md_dcmp = NULL;
              return;
       }

       /* Data comparator for sorted-duplicate databases. */
       if (dbflags & MDB_INTEGERDUP) {
              if (dbflags & MDB_DUPFIXED) {
                     dbx->md_dcmp = mdb_cmp_int;
              } else {
                     dbx->md_dcmp = mdb_cmp_cint;
              }
       } else if (dbflags & MDB_REVERSEDUP) {
              dbx->md_dcmp = mdb_cmp_memnr;
       } else {
              dbx->md_dcmp = mdb_cmp_memn;
       }
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_del ( MDB_txn txn,
MDB_dbi  dbi,
MDB_val key,
MDB_val data 
)

Delete items from a database.

This function removes key/data pairs from the database. If the database does not support sorted duplicate data items (MDB_DUPSORT) the data parameter is ignored. If the database supports sorted duplicates and the data parameter is NULL, all of the duplicate data items for the key will be deleted. Otherwise, if the data parameter is non-NULL only the matching data item will be deleted. This function will return MDB_NOTFOUND if the specified key/data pair is not in the database.

Parameters:
[in]txnA transaction handle returned by mdb_txn_begin()
[in]dbiA database handle returned by mdb_open()
[in]keyThe key to delete from the database
[in]dataThe data to delete
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EACCES - an attempt was made to write in a read-only transaction.
  • EINVAL - an invalid parameter was specified.

Definition at line 5598 of file mdb.c.

{
       MDB_cursor mc;
       MDB_xcursor mx;
       MDB_val rdata;
       MDB_val *xdata = NULL;          /* stays NULL when no data item was given */
       MDB_cursor_op op = MDB_SET;     /* key-only positioning by default */
       int rc;
       int exact = 0;
       DKBUF;

       assert(key != NULL);

       DPRINTF("====> delete db %u key [%s]", dbi, DKEY(key));

       /* Argument validation, in the same order as the error codes are documented. */
       if (txn == NULL || !dbi || dbi >= txn->mt_numdbs)
              return EINVAL;
       if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
              return EACCES;
       if (key->mv_size == 0 || key->mv_size > MAXKEYSIZE)
              return EINVAL;

       mdb_cursor_init(&mc, txn, dbi, &mx);

       /* With a data item, position on the exact key/data pair; the lookup
        * works on a private copy of the caller's MDB_val.
        */
       if (data) {
              rdata = *data;
              xdata = &rdata;
              op = MDB_GET_BOTH;
       }

       rc = mdb_cursor_set(&mc, key, xdata, op, &exact);
       if (rc != 0)
              return rc;

       /* No data item given: pass MDB_NODUPDATA to the cursor delete. */
       return mdb_cursor_del(&mc, data ? 0 : MDB_NODUPDATA);
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_drop ( MDB_txn txn,
MDB_dbi  dbi,
int  del 
)

Delete a database and/or free all its pages.

If the del parameter is non-zero the DB handle will be closed and the DB will be deleted.

Parameters:
[in]txnA transaction handle returned by mdb_txn_begin()
[in]dbiA database handle returned by mdb_open()
[in]delnon-zero to delete the DB from the environment, otherwise just free its pages.
Returns:
A non-zero error value on failure and 0 on success.

Definition at line 6282 of file mdb.c.

{
       MDB_cursor *mc;
       int rc;

       if (!txn || !dbi || dbi >= txn->mt_numdbs)
              return EINVAL;
       if (F_ISSET(txn->mt_flags, MDB_TXN_RDONLY))
              return EACCES;

       rc = mdb_cursor_open(txn, dbi, &mc);
       if (rc != 0)
              return rc;

       /* Put the DB's pages on the free list; sub-DBs are only looked for
        * when the DB has MDB_DUPSORT set.
        */
       rc = mdb_drop0(mc, mc->mc_db->md_flags & MDB_DUPSORT);
       if (rc == 0) {
              /* Can't delete the main DB */
              if (del && dbi > MAIN_DBI) {
                     /* Remove the DB's record from the main DB, then close the handle. */
                     rc = mdb_del(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL);
                     if (rc == 0)
                            mdb_close(txn->mt_env, dbi);
              } else {
                     /* Keep the DB but reset its record to the empty state. */
                     MDB_db *db = &txn->mt_dbs[dbi];

                     txn->mt_dbflags[dbi] |= DB_DIRTY;
                     db->md_depth = 0;
                     db->md_branch_pages = 0;
                     db->md_leaf_pages = 0;
                     db->md_overflow_pages = 0;
                     db->md_entries = 0;
                     db->md_root = P_INVALID;
              }
       }

       /* The cursor is released on success and on failure alike. */
       mdb_cursor_close(mc);
       return rc;
}

Here is the call graph for this function:

static int mdb_drop0 ( MDB_cursor mc,
int  subs 
) [static]

Add all the DB's pages to the free list.

Parameters:
[in]mcCursor on the DB to free.
[in]subsnon-Zero to check for sub-DBs in this DB.
Returns:
0 on success, non-zero on failure.

Definition at line 6228 of file mdb.c.

{
       /* Walks the tree under cursor mc level by level and appends page
        * numbers to the transaction's free-page list (mt_free_pgs).
        * NOTE(review): the function returns 0 when the initial
        * mdb_page_search() fails, and the results of mdb_midl_append()
        * are not checked here; only a failing recursive call is propagated.
        */
       int rc;

       /* Position the cursor with a NULL key (presumably the leftmost leaf --
        * confirm against mdb_page_search).
        */
       rc = mdb_page_search(mc, NULL, 0);
       if (rc == MDB_SUCCESS) {
              MDB_node *ni;
              MDB_cursor mx;
              unsigned int i;

              /* LEAF2 pages have no nodes, cannot have sub-DBs */
              /* When leaves need no inspection, drop the leaf level from the
               * stack so the walk below starts one level up.
               */
              if (!subs || IS_LEAF2(mc->mc_pg[mc->mc_top]))
                     mdb_cursor_pop(mc);

              /* Snapshot of the cursor; its page stack is used further down to
               * restore entries after a level has been exhausted.
               */
              mdb_cursor_copy(mc, &mx);
              while (mc->mc_snum > 0) {
                     if (IS_LEAF(mc->mc_pg[mc->mc_top])) {
                            /* Leaf page: recurse into every node flagged as a sub-DB. */
                            for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
                                   ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
                                   if (ni->mn_flags & F_SUBDATA) {
                                          mdb_xcursor_init1(mc, ni);
                                          /* subs == 0: no sub-DB scan inside the nested DB */
                                          rc = mdb_drop0(&mc->mc_xcursor->mx_cursor, 0);
                                          if (rc)
                                                 return rc;
                                   }
                            }
                     } else {
                            /* Non-leaf page: every node refers to a child page. */
                            for (i=0; i<NUMKEYS(mc->mc_pg[mc->mc_top]); i++) {
                                   pgno_t pg;
                                   ni = NODEPTR(mc->mc_pg[mc->mc_top], i);
                                   pg = NODEPGNO(ni);
                                   /* free it */
                                   mdb_midl_append(&mc->mc_txn->mt_free_pgs, pg);
                            }
                     }
                     /* mc_top == 0: the page just processed was the bottom of the stack. */
                     if (!mc->mc_top)
                            break;
                     /* Advance to the next sibling page (second argument 1). */
                     rc = mdb_cursor_sibling(mc, 1);
                     if (rc) {
                            /* no more siblings, go back to beginning
                             * of previous level. (stack was already popped
                             * by mdb_cursor_sibling)
                             */
                            for (i=1; i<mc->mc_top; i++)
                                   mc->mc_pg[i] = mx.mc_pg[i];
                     }
              }
              /* free it */
              /* Finally the root page itself goes onto the free list. */
              mdb_midl_append(&mc->mc_txn->mt_free_pgs,
                     mc->mc_db->md_root);
       }
       return 0;
}

Here is the call graph for this function:

Here is the caller graph for this function:

Close the environment and release the memory map.

Only a single thread may call this function. All transactions, databases, and cursors must already be closed before calling this function. Attempts to use any such handles after calling this function will cause a SIGSEGV. The environment handle will be freed and must not be used again after this call.

Parameters:
[in]envAn environment handle returned by mdb_env_create()

Definition at line 3029 of file mdb.c.

{
       MDB_page *page;

       if (env == NULL)
              return;

       VGMEMP_DESTROY(env);

       /* Release the chain of pages hanging off me_dpages, one link at a time. */
       for (page = env->me_dpages; page != NULL; page = env->me_dpages) {
              VGMEMP_DEFINED(&page->mp_next, sizeof(page->mp_next));
              env->me_dpages = page->mp_next;
              free(page);
       }

       /* Per-environment tables and the saved path. */
       free(env->me_dbs[1]);
       free(env->me_dbs[0]);
       free(env->me_dbxs);
       free(env->me_path);

       LAZY_RWLOCK_DESTROY(&env->me_dblock);
       pthread_key_delete(env->me_txkey);

       /* Data map and the two data-file descriptors. */
       if (env->me_map) {
              munmap(env->me_map, env->me_mapsize);
       }
       close(env->me_mfd);
       close(env->me_fd);

       /* Lock region: clear every reader slot registered under this
        * process id before unmapping it.
        */
       if (env->me_txns) {
              pid_t self = getpid();
              unsigned int slot;

              for (slot = 0; slot < env->me_txns->mti_numreaders; slot++) {
                     if (env->me_txns->mti_readers[slot].mr_pid == self)
                            env->me_txns->mti_readers[slot].mr_pid = 0;
              }
              munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
       }
       close(env->me_lfd);

       mdb_midl_free(env->me_free_pgs);
       free(env);
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_env_create ( MDB_env **  env)

Create an MDB environment handle.

This function allocates memory for a MDB_env structure. To release the allocated memory and discard the handle, call mdb_env_close(). Before the handle may be used, it must be opened using mdb_env_open(). Various other options may also need to be set before opening the handle, e.g. mdb_env_set_mapsize(), mdb_env_set_maxreaders(), mdb_env_set_maxdbs(), depending on usage requirements.

Parameters:
[out]envThe address where the new handle will be stored
Returns:
A non-zero error value on failure and 0 on success.

Definition at line 2407 of file mdb.c.

{
       MDB_env *handle = calloc(1, sizeof(MDB_env));

       if (handle == NULL)
              return ENOMEM;

       /* The free-page ID list is allocated together with the handle. */
       handle->me_free_pgs = mdb_midl_alloc();
       if (handle->me_free_pgs == NULL) {
              free(handle);
              return ENOMEM;
       }

       /* No file is open yet. */
       handle->me_fd = INVALID_HANDLE_VALUE;
       handle->me_lfd = INVALID_HANDLE_VALUE;
       handle->me_mfd = INVALID_HANDLE_VALUE;

       /* Defaults; adjustable through the mdb_env_set_*() calls before opening. */
       handle->me_maxreaders = DEFAULT_READERS;
       handle->me_maxdbs = 2;

       VGMEMP_CREATE(handle,0,0);
       *env = handle;
       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_env_get_flags ( MDB_env env,
unsigned int flags 
)

Get environment flags.

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[out]flagsThe address of an integer to store the flags
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified.

Definition at line 6041 of file mdb.c.

{
       /* Both the handle and the output location are required. */
       if (env && arg) {
              *arg = env->me_flags;
              return MDB_SUCCESS;
       }
       return EINVAL;
}
int mdb_env_get_maxreaders ( MDB_env env,
unsigned int readers 
)

Get the maximum number of threads for the environment.

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[out]readersAddress of an integer to store the number of readers
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified.

Definition at line 2460 of file mdb.c.

{
       /* Both the handle and the output location are required. */
       if (env && readers) {
              *readers = env->me_maxreaders;
              return MDB_SUCCESS;
       }
       return EINVAL;
}
int mdb_env_get_path ( MDB_env env,
const char **  path 
)

Return the path that was used in mdb_env_open().

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[out]pathAddress of a string pointer to contain the path. This is the actual string in the environment, not a copy. It should not be altered in any way.
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified.

Definition at line 6051 of file mdb.c.

{
       /* Both the handle and the output location are required. */
       if (env && arg) {
              /* Hands out the environment's own string, not a copy. */
              *arg = env->me_path;
              return MDB_SUCCESS;
       }
       return EINVAL;
}
static int mdb_env_init_meta ( MDB_env env,
MDB_meta meta 
) [static]

Write the environment parameters of a freshly created DB environment.

Parameters:
[in]envthe environment handle
[out]metaaddress of where to store the meta information
Returns:
0 on success, non-zero on failure.

Definition at line 2252 of file mdb.c.

{
       MDB_page *p, *q;
       MDB_meta *m;
       int rc;
       unsigned int   psize;

       DPUTS("writing new meta page");

       GET_PAGESIZE(psize);

       /* Fill in the caller's meta structure; it is copied into both
        * on-disk meta pages below.
        */
       meta->mm_magic = MDB_MAGIC;
       meta->mm_version = MDB_VERSION;
       meta->mm_psize = psize;
       meta->mm_last_pg = 1;
       meta->mm_flags = env->me_flags & 0xffff;
       meta->mm_flags |= MDB_INTEGERKEY;
       meta->mm_dbs[0].md_root = P_INVALID;
       meta->mm_dbs[1].md_root = P_INVALID;

       /* One zeroed buffer holds both meta pages (page 0 and page 1).
        * The allocation can fail; report that instead of dereferencing NULL.
        */
       p = calloc(2, psize);
       if (p == NULL)
              return ENOMEM;

       p->mp_pgno = 0;
       p->mp_flags = P_META;

       m = METADATA(p);
       memcpy(m, meta, sizeof(*meta));

       /* Second meta page starts exactly one page size after the first. */
       q = (MDB_page *)((char *)p + psize);

       q->mp_pgno = 1;
       q->mp_flags = P_META;

       m = METADATA(q);
       memcpy(m, meta, sizeof(*meta));

       /* Write both pages in one call; anything but a full write is an error. */
#ifdef _WIN32
       {
              DWORD len;
              rc = WriteFile(env->me_fd, p, psize * 2, &len, NULL);
              rc = (len == psize * 2) ? MDB_SUCCESS : ErrCode();
       }
#else
       rc = write(env->me_fd, p, psize * 2);
       rc = (rc == (int)psize * 2) ? MDB_SUCCESS : ErrCode();
#endif
       free(p);
       return rc;
}

Here is the caller graph for this function:

int mdb_env_open ( MDB_env env,
const char *  path,
unsigned int  flags,
mode_t  mode 
)

Open an environment handle.

If this function fails, mdb_env_close() must be called to discard the MDB_env handle.

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[in]pathThe directory in which the database files reside. This directory must already exist and be writable.
[in]flagsSpecial options for this environment. This parameter must be set to 0 or by bitwise OR'ing together one or more of the values described here.
  • MDB_FIXEDMAP use a fixed address for the mmap region. This flag must be specified when creating the environment, and is stored persistently in the environment. If successful, the memory map will always reside at the same virtual address and pointers used to reference data items in the database will be constant across multiple invocations. This option may not always work, depending on how the operating system has allocated memory to shared libraries and other uses. The feature is highly experimental.
  • MDB_NOSUBDIR By default, MDB creates its environment in a directory whose pathname is given in path, and creates its data and lock files under that directory. With this option, path is used as-is for the database main data file. The database lock file is the path with "-lock" appended.
  • MDB_NOSYNC Don't perform a synchronous flush after committing a transaction. This means transactions will exhibit the ACI (atomicity, consistency, and isolation) properties, but not D (durability); that is database integrity will be maintained but it is possible some number of the most recently committed transactions may be undone after a system crash. The number of transactions at risk is governed by how often the system flushes dirty buffers to disk and how often mdb_env_sync() is called. This flag may be changed at any time using mdb_env_set_flags().
  • MDB_RDONLY Open the environment in read-only mode. No write operations will be allowed.
[in]modeThe UNIX permissions to set on created files. This parameter is ignored on Windows.
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • MDB_VERSION_MISMATCH - the version of the MDB library doesn't match the version that created the database environment.
  • EINVAL - the environment file headers are corrupted.
  • ENOENT - the directory specified by the path parameter doesn't exist.
  • EACCES - the user didn't have permission to access the environment files.
  • EAGAIN - the environment was locked by another process.

Definition at line 2929 of file mdb.c.

{
       int           oflags, rc, len, excl;
       char *lpath, *dpath;

       /* A single allocation holds two strings back to back: the lock-file
        * path first, the data-file path (dpath) right after it.
        */
       len = strlen(path);
       if (flags & MDB_NOSUBDIR) {
              rc = len + sizeof(LOCKSUFF) + len + 1;
       } else {
              rc = len + sizeof(LOCKNAME) + len + sizeof(DATANAME);
       }
       lpath = malloc(rc);
       if (!lpath)
              return ENOMEM;
       if (flags & MDB_NOSUBDIR) {
              /* path names the data file itself; the lock file gets a suffix */
              dpath = lpath + len + sizeof(LOCKSUFF);
              sprintf(lpath, "%s" LOCKSUFF, path);
              strcpy(dpath, path);
       } else {
              /* path names a directory containing both files */
              dpath = lpath + len + sizeof(LOCKNAME);
              sprintf(lpath, "%s" LOCKNAME, path);
              sprintf(dpath, "%s" DATANAME, path);
       }

       rc = mdb_env_setup_locks(env, lpath, mode, &excl);
       if (rc)
              goto leave;

       /* Open the main data file descriptor. */
#ifdef _WIN32
       if (F_ISSET(flags, MDB_RDONLY)) {
              oflags = GENERIC_READ;
              len = OPEN_EXISTING;
       } else {
              oflags = GENERIC_READ|GENERIC_WRITE;
              len = OPEN_ALWAYS;
       }
       mode = FILE_ATTRIBUTE_NORMAL;
       if ((env->me_fd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
                     NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) {
              rc = ErrCode();
              goto leave;
       }
#else
       if (F_ISSET(flags, MDB_RDONLY))
              oflags = O_RDONLY;
       else
              oflags = O_RDWR | O_CREAT;

       if ((env->me_fd = open(dpath, oflags, mode)) == -1) {
              rc = ErrCode();
              goto leave;
       }
#endif

       if ((rc = mdb_env_open2(env, flags)) == MDB_SUCCESS) {
              /* synchronous fd for meta writes */
#ifdef _WIN32
              if (!(flags & (MDB_RDONLY|MDB_NOSYNC)))
                     mode |= FILE_FLAG_WRITE_THROUGH;
              if ((env->me_mfd = CreateFile(dpath, oflags, FILE_SHARE_READ|FILE_SHARE_WRITE,
                     NULL, len, mode, NULL)) == INVALID_HANDLE_VALUE) {
                     rc = ErrCode();
                     goto leave;
              }
#else
              if (!(flags & (MDB_RDONLY|MDB_NOSYNC)))
                     oflags |= MDB_DSYNC;
              if ((env->me_mfd = open(dpath, oflags, mode)) == -1) {
                     rc = ErrCode();
                     goto leave;
              }
#endif
              env->me_path = strdup(path);
              DPRINTF("opened dbenv %p", (void *) env);
              pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
              LAZY_RWLOCK_INIT(&env->me_dblock, NULL);
              if (excl)
                     mdb_env_share_locks(env);
              env->me_dbxs = calloc(env->me_maxdbs, sizeof(MDB_dbx));
              env->me_dbs[0] = calloc(env->me_maxdbs, sizeof(MDB_db));
              env->me_dbs[1] = calloc(env->me_maxdbs, sizeof(MDB_db));
              env->me_numdbs = 2;

              /* Any of the four allocations above may have failed.  Report
               * ENOMEM rather than returning success with NULL tables; the
               * partial allocations are released by mdb_env_close(), which
               * the caller must invoke after a failed open.
               */
              if (!env->me_path || !env->me_dbxs ||
                  !env->me_dbs[0] || !env->me_dbs[1])
                     rc = ENOMEM;
       }

leave:
       /* On failure close the data and lock descriptors opened here and mark
        * them invalid in the handle.
        */
       if (rc) {
              if (env->me_fd != INVALID_HANDLE_VALUE) {
                     close(env->me_fd);
                     env->me_fd = INVALID_HANDLE_VALUE;
              }
              if (env->me_lfd != INVALID_HANDLE_VALUE) {
                     close(env->me_lfd);
                     env->me_lfd = INVALID_HANDLE_VALUE;
              }
       }
       free(lpath);
       return rc;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_env_open2 ( MDB_env env,
unsigned int  flags 
) [static]

Further setup required for opening an MDB environment.

Definition at line 2471 of file mdb.c.

{
       int i, newenv = 0, toggle;
       MDB_meta meta;
       MDB_page *p;

       env->me_flags = flags;

       memset(&meta, 0, sizeof(meta));

       /* ENOENT from the header read means the data file is empty, i.e. a
        * brand-new environment; any other error is fatal.
        */
       if ((i = mdb_env_read_header(env, &meta)) != 0) {
              if (i != ENOENT)
                     return i;
              DPUTS("new mdbenv");
              newenv = 1;
       }

       /* A map size set by the caller wins; otherwise use the default for a
        * new environment or the size recorded in the existing header.
        */
       if (!env->me_mapsize) {
              env->me_mapsize = newenv ? DEFAULT_MAPSIZE : meta.mm_mapsize;
       }

#ifdef _WIN32
       {
              HANDLE mh;
              LONG sizelo, sizehi;
              sizelo = env->me_mapsize & 0xffffffff;
              sizehi = env->me_mapsize >> 16;           /* pointless on WIN32, only needed on W64 */
              sizehi >>= 16;
              /* Windows won't create mappings for zero length files.
               * Just allocate the maxsize right now.
               */
              if (newenv) {
                     SetFilePointer(env->me_fd, sizelo, sizehi ? &sizehi : NULL, 0);
                     if (!SetEndOfFile(env->me_fd))
                            return ErrCode();
                     SetFilePointer(env->me_fd, 0, NULL, 0);
              }
              mh = CreateFileMapping(env->me_fd, NULL, PAGE_READONLY,
                     sizehi, sizelo, NULL);
              if (!mh)
                     return ErrCode();
              env->me_map = MapViewOfFileEx(mh, FILE_MAP_READ, 0, 0, env->me_mapsize,
                     meta.mm_address);
              CloseHandle(mh);
              if (!env->me_map)
                     return ErrCode();
       }
#else
       /* Map the data file read-only; force the recorded address only when
        * one exists and MDB_FIXEDMAP was requested.
        */
       i = MAP_SHARED;
       if (meta.mm_address && (flags & MDB_FIXEDMAP))
              i |= MAP_FIXED;
       env->me_map = mmap(meta.mm_address, env->me_mapsize, PROT_READ, i,
              env->me_fd, 0);
       if (env->me_map == MAP_FAILED) {
              env->me_map = NULL;
              return ErrCode();
       }
#endif

       if (newenv) {
              meta.mm_mapsize = env->me_mapsize;
              if (flags & MDB_FIXEDMAP)
                     meta.mm_address = env->me_map;
              i = mdb_env_init_meta(env, &meta);
              if (i != MDB_SUCCESS) {
                     munmap(env->me_map, env->me_mapsize);
                     /* Forget the mapping: mdb_env_close() unmaps me_map when
                      * it is non-NULL, and the setters treat a non-NULL me_map
                      * as "already open".
                      */
                     env->me_map = NULL;
                     return i;
              }
       }
       env->me_psize = meta.mm_psize;

       env->me_maxpg = env->me_mapsize / env->me_psize;

       /* The two meta pages sit at the start of the map, one page apart. */
       p = (MDB_page *)env->me_map;
       env->me_metas[0] = METADATA(p);
       env->me_metas[1] = (MDB_meta *)((char *)env->me_metas[0] + meta.mm_psize);

       if ((i = mdb_env_read_meta(env, &toggle)) != 0)
              return i;

       DPRINTF("opened database version %u, pagesize %u",
           env->me_metas[toggle]->mm_version, env->me_psize);
       DPRINTF("depth: %u", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_depth);
       DPRINTF("entries: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_entries);
       DPRINTF("branch pages: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_branch_pages);
       DPRINTF("leaf pages: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_leaf_pages);
       DPRINTF("overflow pages: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_overflow_pages);
       DPRINTF("root: %zu", env->me_metas[toggle]->mm_dbs[MAIN_DBI].md_root);

       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_env_read_header ( MDB_env env,
MDB_meta meta 
) [static]

Read the environment parameters of a DB environment before mapping it into memory.

Parameters:
[in]envthe environment handle
[out]metaaddress of where to store the meta information
Returns:
0 on success, non-zero on failure.

Definition at line 2197 of file mdb.c.

{
       /* Reads the first MDB_PAGESIZE bytes of the data file, checks that
        * they form a meta page with the expected magic and version, and
        * copies the meta content to the caller.
        */
       MDB_pagebuf   pbuf;
       MDB_page      *p;
       MDB_meta      *m;
       int            rc, err;

       /* We don't know the page size yet, so use a minimum value.
        */

       /* A read of zero bytes is reported as ENOENT; on Windows a failed
        * ReadFile() call takes the same branch.
        */
#ifdef _WIN32
       if (!ReadFile(env->me_fd, &pbuf, MDB_PAGESIZE, (DWORD *)&rc, NULL) || rc == 0)
#else
       if ((rc = read(env->me_fd, &pbuf, MDB_PAGESIZE)) == 0)
#endif
       {
              return ENOENT;
       }
       else if (rc != MDB_PAGESIZE) {
              /* Short read (rc > 0) is reported as EINVAL; otherwise the
               * error code obtained from ErrCode() is returned.
               */
              err = ErrCode();
              if (rc > 0)
                     err = EINVAL;
              DPRINTF("read: %s", strerror(err));
              return err;
       }

       p = (MDB_page *)&pbuf;

       /* The page must carry the meta-page flag. */
       if (!F_ISSET(p->mp_flags, P_META)) {
              DPRINTF("page %zu not a meta page", p->mp_pgno);
              return EINVAL;
       }

       m = METADATA(p);
       if (m->mm_magic != MDB_MAGIC) {
              DPUTS("meta has invalid magic");
              return EINVAL;
       }

       if (m->mm_version != MDB_VERSION) {
              DPRINTF("database is version %u, expected version %u",
                  m->mm_version, MDB_VERSION);
              return MDB_VERSION_MISMATCH;
       }

       /* Header is valid: hand a copy of the meta content to the caller. */
       memcpy(meta, m, sizeof(*m));
       return 0;
}

Here is the caller graph for this function:

static int mdb_env_read_meta ( MDB_env env,
int which 
) [static]

Check both meta pages to see which one is newer.

Parameters:
[in]envthe environment handle
[out]whichaddress of where to store the meta toggle ID
Returns:
0 on success, non-zero on failure.

Definition at line 2391 of file mdb.c.

{
       int newest;

       assert(env != NULL);

       /* Meta page 1 is chosen only when its transaction ID is strictly
        * greater than that of meta page 0.
        */
       newest = (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) ? 1 : 0;

       DPRINTF("Using meta page %d", newest);
       *which = newest;

       return MDB_SUCCESS;
}

Here is the caller graph for this function:

static void mdb_env_reader_dest ( void ptr) [static]

Release a reader thread's slot in the reader lock table.

  This function is called automatically when a thread exits.
  Windows doesn't support destructor callbacks for thread-specific storage,
  so this function is not compiled there.
Parameters:
[in]ptrThis points to the slot in the reader lock table.

Definition at line 2571 of file mdb.c.

{
       /* ptr is the reader-table slot; clear its transaction, process and
        * thread IDs, in that order.
        */
       MDB_reader *slot = ptr;

       slot->mr_txnid = 0;
       slot->mr_pid = 0;
       slot->mr_tid = 0;
}

Here is the caller graph for this function:

int mdb_env_set_flags ( MDB_env env,
unsigned int  flags,
int  onoff 
)

Set environment flags.

This may be used to set some flags that weren't already set during mdb_env_open(), or to unset these flags. Currently only the MDB_NOSYNC flag setting may be changed with this function.

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[in]flagsThe flags to change, bitwise OR'ed together
[in]onoffA non-zero value sets the flags, zero clears them.
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified.

Definition at line 6029 of file mdb.c.

{
       /* Every requested bit must belong to the CHANGEABLE set. */
       if ((flag & CHANGEABLE) != flag)
              return EINVAL;

       /* Non-zero onoff sets the bits, zero clears them. */
       if (!onoff) {
              env->me_flags &= ~flag;
       } else {
              env->me_flags |= flag;
       }
       return MDB_SUCCESS;
}

Here is the caller graph for this function:

int mdb_env_set_mapsize ( MDB_env env,
size_t  size 
)

Set the size of the memory map to use for this environment.

The size should be a multiple of the OS page size. The default is 10485760 bytes. The size of the memory map is also the maximum size of the database. The value should be chosen as large as possible, to accommodate future growth of the database. This function may only be called after mdb_env_create() and before mdb_env_open().

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[in]sizeThe size in bytes
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified, or the environment is already open.

Definition at line 2431 of file mdb.c.

{
       /* An existing map means the environment is already open. */
       if (env->me_map != NULL) {
              return EINVAL;
       }

       env->me_mapsize = size;

       /* Recompute the page limit once a page size is known. */
       if (env->me_psize != 0) {
              env->me_maxpg = env->me_mapsize / env->me_psize;
       }
       return MDB_SUCCESS;
}

Here is the caller graph for this function:

int mdb_env_set_maxdbs ( MDB_env env,
MDB_dbi  dbs 
)

Set the maximum number of databases for the environment.

This function is only needed if multiple databases will be used in the environment. Simpler applications that only use a single database can ignore this option. This function may only be called after mdb_env_create() and before mdb_env_open().

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[in]dbsThe maximum number of databases
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified, or the environment is already open.

Definition at line 2442 of file mdb.c.

{
       /* An existing map means the environment is already open. */
       if (env->me_map != NULL) {
              return EINVAL;
       }

       env->me_maxdbs = dbs;
       return MDB_SUCCESS;
}

Here is the caller graph for this function:

int mdb_env_set_maxreaders ( MDB_env env,
unsigned int  readers 
)

Set the maximum number of threads for the environment.

This defines the number of slots in the lock table that is used to track readers in the environment. The default is 126. This function may only be called after mdb_env_create() and before mdb_env_open().

Parameters:
[in]envAn environment handle returned by mdb_env_create()
[in]readersThe maximum number of threads
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified, or the environment is already open.

Definition at line 2451 of file mdb.c.

{
       /* Refuse once the environment is mapped, and require at least one slot. */
       if (env->me_map != NULL) {
              return EINVAL;
       }
       if (readers < 1) {
              return EINVAL;
       }

       env->me_maxreaders = readers;
       return MDB_SUCCESS;
}

Here is the caller graph for this function:

static int mdb_env_setup_locks ( MDB_env env,
char *  lpath,
int  mode,
int excl 
) [static]

Open and/or initialize the lock region for the environment.

Parameters:
[in]envThe MDB environment.
[in]lpathThe pathname of the file used for the lock region.
[in]modeThe Unix permissions for the file, if we create it.
[out]exclSet to true if we got an exclusive lock on the region.
Returns:
0 on success, non-zero on failure.

Definition at line 2698 of file mdb.c.

{
       int rc;
       off_t size, rsize;

       *excl = 0;

#ifdef _WIN32
       if ((env->me_lfd = CreateFile(lpath, GENERIC_READ|GENERIC_WRITE,
              FILE_SHARE_READ|FILE_SHARE_WRITE, NULL, OPEN_ALWAYS,
              FILE_ATTRIBUTE_NORMAL, NULL)) == INVALID_HANDLE_VALUE) {
              rc = ErrCode();
              return rc;
       }
       /* Try to get exclusive lock. If we succeed, then
        * nobody is using the lock region and we should initialize it.
        */
       {
              if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
                     *excl = 1;
              } else {
                     OVERLAPPED ov;
                     memset(&ov, 0, sizeof(ov));
                     if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
                            rc = ErrCode();
                            goto fail;
                     }
              }
       }
       size = GetFileSize(env->me_lfd, NULL);
#else
       if ((env->me_lfd = open(lpath, O_RDWR|O_CREAT, mode)) == -1) {
              rc = ErrCode();
              return rc;
       }
       /* Try to get exclusive lock. If we succeed, then
        * nobody is using the lock region and we should initialize it.
        */
       {
              struct flock lock_info;
              memset((void *)&lock_info, 0, sizeof(lock_info));
              lock_info.l_type = F_WRLCK;
              lock_info.l_whence = SEEK_SET;
              lock_info.l_start = 0;
              lock_info.l_len = 1;
              rc = fcntl(env->me_lfd, F_SETLK, &lock_info);
              if (rc == 0) {
                     *excl = 1;
              } else {
                     lock_info.l_type = F_RDLCK;
                     rc = fcntl(env->me_lfd, F_SETLKW, &lock_info);
                     if (rc) {
                            rc = ErrCode();
                            goto fail;
                     }
              }
       }
       size = lseek(env->me_lfd, 0, SEEK_END);
#endif
       rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
       if (size < rsize && *excl) {
#ifdef _WIN32
              SetFilePointer(env->me_lfd, rsize, NULL, 0);
              if (!SetEndOfFile(env->me_lfd)) {
                     rc = ErrCode();
                     goto fail;
              }
#else
              if (ftruncate(env->me_lfd, rsize) != 0) {
                     rc = ErrCode();
                     goto fail;
              }
#endif
       } else {
              rsize = size;
              size = rsize - sizeof(MDB_txninfo);
              env->me_maxreaders = size/sizeof(MDB_reader) + 1;
       }
       {
#ifdef _WIN32
              HANDLE mh;
              mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
                     0, 0, NULL);
              if (!mh) {
                     rc = ErrCode();
                     goto fail;
              }
              env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
              CloseHandle(mh);
              if (!env->me_txns) {
                     rc = ErrCode();
                     goto fail;
              }
#else
              void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
                     env->me_lfd, 0);
              if (m == MAP_FAILED) {
                     env->me_txns = NULL;
                     rc = ErrCode();
                     goto fail;
              }
              env->me_txns = m;
#endif
       }
       if (*excl) {
#ifdef _WIN32
              char hexbuf[17];
              if (!mdb_sec_inited) {
                     InitializeSecurityDescriptor(&mdb_null_sd,
                            SECURITY_DESCRIPTOR_REVISION);
                     SetSecurityDescriptorDacl(&mdb_null_sd, TRUE, 0, FALSE);
                     mdb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
                     mdb_all_sa.bInheritHandle = FALSE;
                     mdb_all_sa.lpSecurityDescriptor = &mdb_null_sd;
                     mdb_sec_inited = 1;
              }
              mdb_hash_hex(lpath, hexbuf);
              sprintf(env->me_txns->mti_rmname, "Global\\MDBr%s", hexbuf);
              env->me_rmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_rmname);
              if (!env->me_rmutex) {
                     rc = ErrCode();
                     goto fail;
              }
              sprintf(env->me_txns->mti_wmname, "Global\\MDBw%s", hexbuf);
              env->me_wmutex = CreateMutex(&mdb_all_sa, FALSE, env->me_txns->mti_wmname);
              if (!env->me_wmutex) {
                     rc = ErrCode();
                     goto fail;
              }
#else  /* _WIN32 */
#ifdef __APPLE__
              char hexbuf[17];
              mdb_hash_hex(lpath, hexbuf);
              sprintf(env->me_txns->mti_rmname, "MDBr%s", hexbuf);
              if (sem_unlink(env->me_txns->mti_rmname)) {
                     rc = ErrCode();
                     if (rc != ENOENT && rc != EINVAL)
                            goto fail;
              }
              env->me_rmutex = sem_open(env->me_txns->mti_rmname, O_CREAT, mode, 1);
              if (!env->me_rmutex) {
                     rc = ErrCode();
                     goto fail;
              }
              sprintf(env->me_txns->mti_wmname, "MDBw%s", hexbuf);
              if (sem_unlink(env->me_txns->mti_wmname)) {
                     rc = ErrCode();
                     if (rc != ENOENT && rc != EINVAL)
                            goto fail;
              }
              env->me_wmutex = sem_open(env->me_txns->mti_wmname, O_CREAT, mode, 1);
              if (!env->me_wmutex) {
                     rc = ErrCode();
                     goto fail;
              }
#else  /* __APPLE__ */
              pthread_mutexattr_t mattr;

              pthread_mutexattr_init(&mattr);
              rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
              if (rc) {
                     goto fail;
              }
              pthread_mutex_init(&env->me_txns->mti_mutex, &mattr);
              pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr);
#endif /* __APPLE__ */
#endif /* _WIN32 */
              env->me_txns->mti_version = MDB_VERSION;
              env->me_txns->mti_magic = MDB_MAGIC;
              env->me_txns->mti_txnid = 0;
              env->me_txns->mti_numreaders = 0;
              env->me_txns->mti_me_toggle = 0;

       } else {
              if (env->me_txns->mti_magic != MDB_MAGIC) {
                     DPUTS("lock region has invalid magic");
                     rc = EINVAL;
                     goto fail;
              }
              if (env->me_txns->mti_version != MDB_VERSION) {
                     DPRINTF("lock region is version %u, expected version %u",
                            env->me_txns->mti_version, MDB_VERSION);
                     rc = MDB_VERSION_MISMATCH;
                     goto fail;
              }
              rc = ErrCode();
              if (rc != EACCES && rc != EAGAIN) {
                     goto fail;
              }
#ifdef _WIN32
              env->me_rmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_rmname);
              if (!env->me_rmutex) {
                     rc = ErrCode();
                     goto fail;
              }
              env->me_wmutex = OpenMutex(SYNCHRONIZE, FALSE, env->me_txns->mti_wmname);
              if (!env->me_wmutex) {
                     rc = ErrCode();
                     goto fail;
              }
#endif
#ifdef __APPLE__
              env->me_rmutex = sem_open(env->me_txns->mti_rmname, 0);
              if (!env->me_rmutex) {
                     rc = ErrCode();
                     goto fail;
              }
              env->me_wmutex = sem_open(env->me_txns->mti_wmname, 0);
              if (!env->me_wmutex) {
                     rc = ErrCode();
                     goto fail;
              }
#endif
       }
       return MDB_SUCCESS;

fail:
       close(env->me_lfd);
       env->me_lfd = INVALID_HANDLE_VALUE;
       return rc;

}

Here is the caller graph for this function:

static void mdb_env_share_locks ( MDB_env *env) [static]

Downgrade the exclusive lock on the region back to shared.

Definition at line 2583 of file mdb.c.

{
       /* Select the meta page that carries the larger transaction ID. */
       int newest = (env->me_metas[0]->mm_txnid < env->me_metas[1]->mm_txnid) ? 1 : 0;

       /* Record that choice, and its transaction ID, in the lock region. */
       env->me_txns->mti_me_toggle = newest;
       env->me_txns->mti_txnid = env->me_metas[newest]->mm_txnid;

#ifdef _WIN32
       {
              /* Take the shared lock first; the UnlockFile call that
               * follows then releases the exclusive lock held until now.
               */
              OVERLAPPED ovl;
              memset(&ovl, 0, sizeof(ovl));
              LockFileEx(env->me_lfd, 0, 0, 1, 0, &ovl);
              UnlockFile(env->me_lfd, 0, 0, 1, 0);
       }
#else
       {
              /* Asking for a read lock on the same byte replaces the
               * lock that is already held on it.
               */
              struct flock shared;
              memset((void *)&shared, 0, sizeof(shared));
              shared.l_start = 0;
              shared.l_len = 1;
              shared.l_whence = SEEK_SET;
              shared.l_type = F_RDLCK;
              fcntl(env->me_lfd, F_SETLK, &shared);
       }
#endif
}

Here is the caller graph for this function:

int mdb_env_stat ( MDB_env *env,
MDB_stat *stat
)

Return statistics about the MDB environment.

Parameters:
[in] env: An environment handle returned by mdb_env_create()
[out] stat: The address of an MDB_stat structure where the statistics will be copied

Definition at line 6079 of file mdb.c.

{
       int which;

       /* Both the environment and the output structure are mandatory. */
       if (!env || !arg)
              return EINVAL;

       /* mdb_env_read_meta() fills in the index of the meta page whose
        * main-database record is handed to mdb_stat0() below.
        */
       mdb_env_read_meta(env, &which);
       return mdb_stat0(env, &env->me_metas[which]->mm_dbs[MAIN_DBI], arg);
}

Here is the call graph for this function:

Here is the caller graph for this function:

int mdb_env_sync ( MDB_env *env,
int force
)

Flush the data buffers to disk.

Data is always written to disk when mdb_txn_commit() is called, but the operating system may keep it buffered. MDB always flushes the OS buffers upon commit as well, unless the environment was opened with MDB_NOSYNC.

Parameters:
[in] env: An environment handle returned by mdb_env_create()
[in] force: If non-zero, force the flush to occur. Otherwise, if the environment has the MDB_NOSYNC flag set, the flush will be omitted.
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • EINVAL - an invalid parameter was specified.
  • EIO - an error occurred during synchronization.

Definition at line 1431 of file mdb.c.

{
       /* Nothing to do when the caller did not force the flush and the
        * environment has MDB_NOSYNC set.
        */
       if (!force && F_ISSET(env->me_flags, MDB_NOSYNC))
              return 0;

       /* Flush the data file; a non-zero result is reported via ErrCode(). */
       if (MDB_FDATASYNC(env->me_fd))
              return ErrCode();

       return 0;
}

Here is the caller graph for this function:

static int mdb_env_write_meta ( MDB_txn *txn) [static]

Update the environment info to commit a transaction.

Parameters:
[in] txn: The transaction that's being committed
Returns:
0 on success, non-zero on failure.

Definition at line 2306 of file mdb.c.

{
       /* Writes the committing transaction's DB records, last page number
        * and transaction ID into one of the two meta pages, then publishes
        * the new toggle and transaction ID in the shared lock region.
        * Returns MDB_SUCCESS, or the ErrCode() value if the write is short
        * or fails (in which case MDB_FATAL_ERROR is set on the environment).
        */
       MDB_env *env;
       MDB_meta      meta, metab;
       off_t off;
       int rc, len, toggle;
       char *ptr;
#ifdef _WIN32
       OVERLAPPED ov;
#endif

       assert(txn != NULL);
       assert(txn->mt_env != NULL);

       /* Target the meta page opposite to the transaction's own toggle. */
       toggle = !txn->mt_toggle;
       DPRINTF("writing meta page %d for root page %zu",
              toggle, txn->mt_dbs[MAIN_DBI].md_root);

       env = txn->mt_env;

       /* Remember the target page's current values so they can be written
        * back if the new write fails.
        */
       metab.mm_txnid = env->me_metas[toggle]->mm_txnid;
       metab.mm_last_pg = env->me_metas[toggle]->mm_last_pg;

       /* Only the tail of MDB_meta is written: from mm_dbs[0].md_depth
        * through the end of the structure.
        */
       ptr = (char *)&meta;
       off = offsetof(MDB_meta, mm_dbs[0].md_depth);
       len = sizeof(MDB_meta) - off;

       ptr += off;
       meta.mm_dbs[0] = txn->mt_dbs[0];
       meta.mm_dbs[1] = txn->mt_dbs[1];
       meta.mm_last_pg = txn->mt_next_pgno - 1;
       meta.mm_txnid = txn->mt_txnid;

       /* Turn the in-struct offset into a file offset: advance by one page
        * size when writing the second meta page, then by PAGEHDRSZ
        * (presumably the page header that precedes the meta content).
        */
       if (toggle)
              off += env->me_psize;
       off += PAGEHDRSZ;

       /* Write to the SYNC fd */
#ifdef _WIN32
       {
              memset(&ov, 0, sizeof(ov));
              ov.Offset = off;
              /* rc receives the byte count through the DWORD cast. */
              WriteFile(env->me_mfd, ptr, len, (DWORD *)&rc, &ov);
       }
#else
       rc = pwrite(env->me_mfd, ptr, len, off);
#endif
       if (rc != len) {
              int r2;
              rc = ErrCode();
              DPUTS("write failed, disk error?");
              /* On a failure, the pagecache still contains the new data.
               * Write some old data back, to prevent it from being used.
               * Use the non-SYNC fd; we know it will fail anyway.
               */
              meta.mm_last_pg = metab.mm_last_pg;
              meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
              /* ov still holds the offset set for the first write. */
              WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
              /* The result of this best-effort write is not examined. */
              r2 = pwrite(env->me_fd, ptr, len, off);
#endif
              env->me_flags |= MDB_FATAL_ERROR;
              return rc;
       }
       /* Memory ordering issues are irrelevant; since the entire writer
        * is wrapped by wmutex, all of these changes will become visible
        * after the wmutex is unlocked. Since the DB is multi-version,
        * readers will get consistent data regardless of how fresh or
        * how stale their view of these values is.
        */
       LAZY_MUTEX_LOCK(&env->me_txns->mti_mutex);
       txn->mt_env->me_txns->mti_me_toggle = toggle;
       txn->mt_env->me_txns->mti_txnid = txn->mt_txnid;
       LAZY_MUTEX_UNLOCK(&env->me_txns->mti_mutex);

       return MDB_SUCCESS;
}

Here is the caller graph for this function:

int mdb_get ( MDB_txn *txn,
MDB_dbi dbi,
MDB_val *key,
MDB_val *data
)

Get items from a database.

This function retrieves key/data pairs from the database. The address and length of the data associated with the specified key are returned in the structure to which data refers. If the database supports duplicate keys (MDB_DUPSORT) then the first data item for the key will be returned. Retrieval of other items requires the use of mdb_cursor_get().

Note:
The memory pointed to by the returned values is owned by the database. The caller need not dispose of the memory, and may not modify it in any way. For values returned in a read-only transaction any modification attempts will cause a SIGSEGV.
Parameters:
[in] txn: A transaction handle returned by mdb_txn_begin()
[in] dbi: A database handle returned by mdb_open()
[in] key: The key to search for in the database
[out] data: The data corresponding to the key
Returns:
A non-zero error value on failure and 0 on success. Some possible errors are:
  • MDB_NOTFOUND - the key was not in the database.
  • EINVAL - an invalid parameter was specified.

Definition at line 3523 of file mdb.c.

{
       MDB_cursor    cur;
       MDB_xcursor   subcur;
       int           is_exact = 0;
       DKBUF;

       assert(key);
       assert(data);
       DPRINTF("===> get db %u key [%s]", dbi, DKEY(key));

       /* Reject a missing transaction, DBI 0, a DBI at or beyond the
        * transaction's mt_numdbs, and keys that are empty or longer
        * than MAXKEYSIZE.
        */
       if (txn == NULL || dbi == 0 || dbi >= txn->mt_numdbs ||
           key->mv_size == 0 || key->mv_size > MAXKEYSIZE)
              return EINVAL;

       /* Initialize a stack-allocated cursor and position it with MDB_SET;
        * the result of that call is the result of the lookup.
        */
       mdb_cursor_init(&cur, txn, dbi, &subcur);
       return mdb_cursor_set(&cur, key, data, MDB_SET, &is_exact);
}

Here is the call graph for this function:

static size_t mdb_leaf_size ( MDB_env *env,
MDB_val *key,
MDB_val *data
) [static]

Calculate the size of a leaf node.

The size depends on the environment's page size; if a data item is too large it will be put onto an overflow page and the node size will only include the key and not the data. Sizes are always rounded up to an even number of bytes, to guarantee 2-byte alignment of the MDB_node headers.

Parameters:
[in] env: The environment handle.
[in] key: The key for the node.
[in] data: The data for the node.
Returns:
The number of bytes needed to store the node.

Definition at line 4584 of file mdb.c.

{
       size_t         need = LEAFSIZE(key, data);

       if (need >= env->me_psize / MDB_MINKEYS) {
              /* Data goes to an overflow page: the node holds a page
               * number in place of the data bytes.
               */
              need -= data->mv_size;
              need += sizeof(pgno_t);
       }

       /* Round up to an even number of bytes. */
       if (need & 1)
              need++;

       /* Account for the node's slot in the page's pointer array. */
       return need + sizeof(indx_t);
}

Here is the caller graph for this function:

static int mdb_node_add ( MDB_cursor *mc,
indx_t indx,
MDB_val *key,
MDB_val *data,
pgno_t pgno,
unsigned int flags
) [static]

Add a node to the page pointed to by the cursor.

Parameters:
[in] mc: The cursor for this operation.
[in] indx: The index on the page where the new node should be added.
[in] key: The key for the new node.
[in] data: The data for the new node, if any.
[in] pgno: The page number, if adding a branch node.
[in] flags: Flags for the node.
Returns:
0 on success, non-zero on failure. Possible errors are:
  • ENOMEM - failed to allocate overflow pages for the node.
  • ENOSPC - there is insufficient room in the page. This error should never happen since all callers already calculate the page's free space before calling this function.

Definition at line 4639 of file mdb.c.

{
       /* Inserts a node at position indx of the page on top of the cursor
        * stack. Returns MDB_SUCCESS, ENOMEM if an overflow page could not
        * be allocated, or ENOSPC if the node does not fit in the page.
        */
       unsigned int   i;
       size_t         node_size = NODESIZE;
       indx_t         ofs;
       MDB_node      *node;
       MDB_page      *mp = mc->mc_pg[mc->mc_top];
       MDB_page      *ofp = NULL;         /* overflow page */
       DKBUF;

       assert(mp->mp_upper >= mp->mp_lower);

       DPRINTF("add to %s %spage %zu index %i, data size %zu key size %zu [%s]",
           IS_LEAF(mp) ? "leaf" : "branch",
              IS_SUBP(mp) ? "sub-" : "",
           mp->mp_pgno, indx, data ? data->mv_size : 0,
              key ? key->mv_size : 0, key ? DKEY(key) : NULL);

       /* LEAF2 pages hold fixed-size keys (md_pad bytes each) packed back
        * to back, with no node headers and no pointer array.
        */
       if (IS_LEAF2(mp)) {
              /* Move higher keys up one slot. */
              int ksize = mc->mc_db->md_pad, dif;
              char *ptr = LEAF2KEY(mp, indx, ksize);
              dif = NUMKEYS(mp) - indx;
              if (dif > 0)
                     memmove(ptr+ksize, ptr, dif*ksize);
              /* insert new key */
              memcpy(ptr, key->mv_data, ksize);

              /* Just using these for counting */
              mp->mp_lower += sizeof(indx_t);
              mp->mp_upper -= ksize - sizeof(indx_t);
              return MDB_SUCCESS;
       }

       /* Work out how many bytes the node will occupy in the page. */
       if (key != NULL)
              node_size += key->mv_size;

       if (IS_LEAF(mp)) {
              assert(data);
              if (F_ISSET(flags, F_BIGDATA)) {
                     /* Data already on overflow page. */
                     node_size += sizeof(pgno_t);
              } else if (node_size + data->mv_size >= mc->mc_txn->mt_env->me_psize / MDB_MINKEYS) {
                     int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
                     /* Put data on overflow page. */
                     DPRINTF("data size is %zu, node would be %zu, put data on overflow page",
                         data->mv_size, node_size+data->mv_size);
                     node_size += sizeof(pgno_t);
                     if ((ofp = mdb_page_new(mc, P_OVERFLOW, ovpages)) == NULL)
                            return ENOMEM;
                     DPRINTF("allocated overflow page %zu", ofp->mp_pgno);
                     flags |= F_BIGDATA;
              } else {
                     node_size += data->mv_size;
              }
       }
       /* Round the node size up to an even number of bytes. */
       node_size += node_size & 1;

       /* The node plus its pointer-array slot must fit in the free space. */
       if (node_size + sizeof(indx_t) > SIZELEFT(mp)) {
              DPRINTF("not enough room in page %zu, got %u ptrs",
                  mp->mp_pgno, NUMKEYS(mp));
              DPRINTF("upper - lower = %u - %u = %u", mp->mp_upper, mp->mp_lower,
                  mp->mp_upper - mp->mp_lower);
              DPRINTF("node size = %zu", node_size);
              return ENOSPC;
       }

       /* Move higher pointers up one slot. */
       for (i = NUMKEYS(mp); i > indx; i--)
              mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

       /* Adjust free space offsets. */
       /* The node is placed just below the current mp_upper; the pointer
        * array grows by one slot from mp_lower.
        */
       ofs = mp->mp_upper - node_size;
       assert(ofs >= mp->mp_lower + sizeof(indx_t));
       mp->mp_ptrs[indx] = ofs;
       mp->mp_upper = ofs;
       mp->mp_lower += sizeof(indx_t);

       /* Write the node data. */
       node = NODEPTR(mp, indx);
       node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
       node->mn_flags = flags;
       /* Leaf nodes record the data size; other nodes record pgno. */
       if (IS_LEAF(mp))
              SETDSZ(node,data->mv_size);
       else
              SETPGNO(node,pgno);

       if (key)
              memcpy(NODEKEY(node), key->mv_data, key->mv_size);

       if (IS_LEAF(mp)) {
              assert(key);
              if (ofp == NULL) {
                     /* No overflow page was allocated by this call. */
                     if (F_ISSET(flags, F_BIGDATA))
                            /* data->mv_data supplies the overflow page number. */
                            memcpy(node->mn_data + key->mv_size, data->mv_data,
                                sizeof(pgno_t));
                     else if (F_ISSET(flags, MDB_RESERVE))
                            /* Hand back the in-page location instead of copying. */
                            data->mv_data = node->mn_data + key->mv_size;
                     else
                            memcpy(node->mn_data + key->mv_size, data->mv_data,
                                data->mv_size);
              } else {
                     /* Store the new overflow page's number in the node, then
                      * either hand back or fill the overflow page's data area.
                      */
                     memcpy(node->mn_data + key->mv_size, &ofp->mp_pgno,
                         sizeof(pgno_t));
                     if (F_ISSET(flags, MDB_RESERVE))
                            data->mv_data = METADATA(ofp);
                     else
                            memcpy(METADATA(ofp), data->mv_data, data->mv_size);
              }
       }

       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static void mdb_node_del ( MDB_page *mp,
indx_t indx,
int ksize
) [static]

Delete the specified node from a page.

Parameters:
[in] mp: The page to operate on.
[in] indx: The index of the node to delete.
[in] ksize: The size of a node. Only used if the page is part of a MDB_DUPFIXED database.

Definition at line 4761 of file mdb.c.

{
       /* Removes node indx from page mp, closing the gap it leaves both in
        * the pointer array and in the node storage area.
        */
       unsigned int   sz;
       indx_t         i, j, numkeys, ptr;
       MDB_node      *node;
       char          *base;

#if MDB_DEBUG
       {
       pgno_t pgno;
       COPY_PGNO(pgno, mp->mp_pgno);
       DPRINTF("delete node %u on %s page %zu", indx,
           IS_LEAF(mp) ? "leaf" : "branch", pgno);
       }
#endif
       assert(indx < NUMKEYS(mp));

       /* LEAF2 pages hold ksize-byte keys packed back to back: slide the
        * following keys down over the deleted one and fix the counters.
        */
       if (IS_LEAF2(mp)) {
              int x = NUMKEYS(mp) - 1 - indx;
              base = LEAF2KEY(mp, indx, ksize);
              if (x)
                     memmove(base, base + ksize, x * ksize);
              mp->mp_lower -= sizeof(indx_t);
              mp->mp_upper += ksize - sizeof(indx_t);
              return;
       }

       /* Compute the number of bytes the node occupies, rounded up to an
        * even size.
        */
       node = NODEPTR(mp, indx);
       sz = NODESIZE + node->mn_ksize;
       if (IS_LEAF(mp)) {
              if (F_ISSET(node->mn_flags, F_BIGDATA))
                     sz += sizeof(pgno_t);
              else
                     sz += NODEDSZ(node);
       }
       sz += sz & 1;

       /* Rebuild the pointer array without entry indx. Nodes stored at a
        * lower offset than the deleted one are about to move up by sz
        * bytes, so their offsets are adjusted here.
        */
       ptr = mp->mp_ptrs[indx];
       numkeys = NUMKEYS(mp);
       for (i = j = 0; i < numkeys; i++) {
              if (i != indx) {
                     mp->mp_ptrs[j] = mp->mp_ptrs[i];
                     if (mp->mp_ptrs[i] < ptr)
                            mp->mp_ptrs[j] += sz;
                     j++;
              }
       }

       /* Shift everything between mp_upper and the deleted node up by sz. */
       base = (char *)mp + mp->mp_upper;
       memmove(base + sz, base, ptr - mp->mp_upper);

       mp->mp_lower -= sizeof(indx_t);
       mp->mp_upper += sz;
}

Here is the caller graph for this function:

static int mdb_node_move ( MDB_cursor *csrc,
MDB_cursor *cdst
) [static]

Move a node from csrc to cdst.

Definition at line 5132 of file mdb.c.

{
       /* Moves the node at csrc's current position to cdst's current
        * position, redirects other cursors that pointed at the moved node,
        * and refreshes the parent separator keys of both pages.
        * Returns MDB_SUCCESS, or the first error reported by
        * mdb_page_touch(), mdb_update_key() or mdb_node_add().
        */
       int                   rc;
       MDB_node             *srcnode;
       MDB_val               key, data;
       pgno_t srcpg;
       unsigned short flags;

       DKBUF;

       /* Mark src and dst as dirty. */
       if ((rc = mdb_page_touch(csrc)) ||
           (rc = mdb_page_touch(cdst)))
              return rc;

       if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
              /* Fixed-size keys only: there is no node header and no data. */
              srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0); /* fake */
              key.mv_size = csrc->mc_db->md_pad;
              key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
              data.mv_size = 0;
              data.mv_data = NULL;
              srcpg = 0;
              flags = 0;
       } else {
              srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
              assert(!((long)srcnode&1));
              srcpg = NODEPGNO(srcnode);
              flags = srcnode->mn_flags;
              if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
                     unsigned int snum = csrc->mc_snum;
                     MDB_node *s2;
                     /* must find the lowest key below src */
                     mdb_page_search_root(csrc, NULL, 0);
                     s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
                     key.mv_size = NODEKSZ(s2);
                     key.mv_data = NODEKEY(s2);
                     /* Restore the cursor depth saved before the descent:
                      * mc_snum gets the saved value, mc_top one less.
                      */
                     csrc->mc_snum = snum--;
                     csrc->mc_top = snum;
              } else {
                     key.mv_size = NODEKSZ(srcnode);
                     key.mv_data = NODEKEY(srcnode);
              }
              data.mv_size = NODEDSZ(srcnode);
              data.mv_data = NODEDATA(srcnode);
       }
       if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
              unsigned int snum = cdst->mc_snum;
              MDB_node *s2;
              MDB_val bkey;
              /* must find the lowest key below dst */
              mdb_page_search_root(cdst, NULL, 0);
              s2 = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
              bkey.mv_size = NODEKSZ(s2);
              bkey.mv_data = NODEKEY(s2);
              cdst->mc_snum = snum--;
              cdst->mc_top = snum;
              /* Give dst's current first node that key before a new node is
               * inserted in front of it. A failure here must stop the move:
               * previously rc was overwritten by mdb_node_add() below and
               * the error was lost.
               */
              rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &bkey);
              if (rc != MDB_SUCCESS)
                     return rc;
       }

       DPRINTF("moving %s node %u [%s] on page %zu to node %u on page %zu",
           IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
           csrc->mc_ki[csrc->mc_top],
              DKEY(&key),
           csrc->mc_pg[csrc->mc_top]->mp_pgno,
           cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno);

       /* Add the node to the destination page.
        */
       rc = mdb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
       if (rc != MDB_SUCCESS)
              return rc;

       /* Delete the node from the source page.
        */
       mdb_node_del(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);

       {
              /* Adjust other cursors pointing to mp */
              MDB_cursor *m2, *m3;
              MDB_dbi dbi = csrc->mc_dbi;
              MDB_page *mp = csrc->mc_pg[csrc->mc_top];

              /* For a sub-cursor the tracked-cursor list is indexed by
               * mc_dbi - 1, and the cursor to compare is each tracked
               * cursor's embedded mx_cursor.
               */
              if (csrc->mc_flags & C_SUB)
                     dbi--;

              for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
                     if (m2 == csrc) continue;
                     if (csrc->mc_flags & C_SUB)
                            m3 = &m2->mc_xcursor->mx_cursor;
                     else
                            m3 = m2;
                     /* A cursor sitting on the moved node follows it to dst. */
                     if (m3->mc_pg[csrc->mc_top] == mp && m3->mc_ki[csrc->mc_top] ==
                            csrc->mc_ki[csrc->mc_top]) {
                            m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
                            m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
                     }
              }
       }

       /* Update the parent separators.
        */
       if (csrc->mc_ki[csrc->mc_top] == 0) {
              /* The source page's first node changed. */
              if (csrc->mc_ki[csrc->mc_top-1] != 0) {
                     if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
                            key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
                     } else {
                            srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
                            key.mv_size = NODEKSZ(srcnode);
                            key.mv_data = NODEKEY(srcnode);
                     }
                     DPRINTF("update separator for source page %zu to [%s]",
                            csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key));
                     if ((rc = mdb_update_key(csrc->mc_pg[csrc->mc_top-1], csrc->mc_ki[csrc->mc_top-1],
                            &key)) != MDB_SUCCESS)
                            return rc;
              }
              if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
                     /* Node 0 of a branch page is given a zero-length key. */
                     MDB_val        nullkey;
                     nullkey.mv_size = 0;
                     rc = mdb_update_key(csrc->mc_pg[csrc->mc_top], 0, &nullkey);
                     assert(rc == MDB_SUCCESS);
              }
       }

       if (cdst->mc_ki[cdst->mc_top] == 0) {
              /* The destination page's first node changed. */
              if (cdst->mc_ki[cdst->mc_top-1] != 0) {
                     if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
                            key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
                     } else {
                            srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
                            key.mv_size = NODEKSZ(srcnode);
                            key.mv_data = NODEKEY(srcnode);
                     }
                     DPRINTF("update separator for destination page %zu to [%s]",
                            cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key));
                     if ((rc = mdb_update_key(cdst->mc_pg[cdst->mc_top-1], cdst->mc_ki[cdst->mc_top-1],
                            &key)) != MDB_SUCCESS)
                            return rc;
              }
              if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
                     /* Node 0 of a branch page is given a zero-length key. */
                     MDB_val        nullkey;
                     nullkey.mv_size = 0;
                     rc = mdb_update_key(cdst->mc_pg[cdst->mc_top], 0, &nullkey);
                     assert(rc == MDB_SUCCESS);
              }
       }

       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static int mdb_node_read ( MDB_txn *txn,
MDB_node *leaf,
MDB_val *data
) [static]

Return the data associated with a given node.

Parameters:
[in] txn: The transaction for this operation.
[in] leaf: The node being read.
[out] data: Updated to point to the node's data.
Returns:
0 on success, non-zero on failure.

Definition at line 3497 of file mdb.c.

{
       MDB_page      *ovf;         /* overflow page */
       pgno_t         ovf_pgno;
       int            err;

       /* The node records the data size whether or not the data is inline. */
       data->mv_size = NODEDSZ(leaf);

       if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
              /* The node's data area holds the overflow page number;
               * fetch that page and point at its data area.
               */
              memcpy(&ovf_pgno, NODEDATA(leaf), sizeof(ovf_pgno));
              err = mdb_page_get(txn, ovf_pgno, &ovf);
              if (err) {
                     DPRINTF("read overflow page %zu failed", ovf_pgno);
                     return err;
              }
              data->mv_data = METADATA(ovf);
       } else {
              /* Inline data: point straight into the node. */
              data->mv_data = NODEDATA(leaf);
       }

       return MDB_SUCCESS;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static MDB_node * mdb_node_search ( MDB_cursor *mc,
MDB_val *key,
int *exactp
) [static]

Search for key within a page, using binary search.

Returns the smallest entry larger or equal to the key. If exactp is non-null, stores whether the found entry was an exact match in *exactp (1 or 0). Updates the cursor index with the index of the found entry. If no entry larger or equal to the key is found, returns NULL.

Definition at line 3160 of file mdb.c.

{
       /* Binary-searches the page on top of the cursor stack for key.
        * Stores the resulting index in mc->mc_ki[mc->mc_top] and, when
        * exactp is non-NULL, whether the match was exact. Returns the node
        * at that index, or NULL when the index is past the last key.
        */
       unsigned int   i = 0, nkeys;
       int            low, high;
       int            rc = 0;
       MDB_page *mp = mc->mc_pg[mc->mc_top];
       MDB_node      *node = NULL;
       MDB_val        nodekey;
       MDB_cmp_func *cmp;
       DKBUF;

       nkeys = NUMKEYS(mp);

#if MDB_DEBUG
       {
       pgno_t pgno;
       COPY_PGNO(pgno, mp->mp_pgno);
       DPRINTF("searching %u keys in %s %spage %zu",
           nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
           pgno);
       }
#endif

       assert(nkeys > 0);

       /* On non-leaf pages the search starts at index 1, skipping node 0. */
       low = IS_LEAF(mp) ? 0 : 1;
       high = nkeys - 1;
       cmp = mc->mc_dbx->md_cmp;

       /* Branch pages have no data, so if using integer keys,
        * alignment is guaranteed. Use faster mdb_cmp_int.
        */
       if (cmp == mdb_cmp_cint && IS_BRANCH(mp)) {
              /* The key size of node 1 selects the comparison function. */
              if (NODEPTR(mp, 1)->mn_ksize == sizeof(size_t))
                     cmp = mdb_cmp_long;
              else
                     cmp = mdb_cmp_int;
       }

       if (IS_LEAF2(mp)) {
              /* Fixed-size keys of md_pad bytes, addressed with LEAF2KEY. */
              nodekey.mv_size = mc->mc_db->md_pad;
              node = NODEPTR(mp, 0);      /* fake */
              while (low <= high) {
                     i = (low + high) >> 1;
                     nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
                     rc = cmp(key, &nodekey);
                     DPRINTF("found leaf index %u [%s], rc = %i",
                         i, DKEY(&nodekey), rc);
                     if (rc == 0)
                            break;
                     if (rc > 0)
                            low = i + 1;
                     else
                            high = i - 1;
              }
       } else {
              while (low <= high) {
                     i = (low + high) >> 1;

                     node = NODEPTR(mp, i);
                     nodekey.mv_size = NODEKSZ(node);
                     nodekey.mv_data = NODEKEY(node);

                     rc = cmp(key, &nodekey);
#if MDB_DEBUG
                     if (IS_LEAF(mp))
                            DPRINTF("found leaf index %u [%s], rc = %i",
                                i, DKEY(&nodekey), rc);
                     else
                            DPRINTF("found branch index %u [%s -> %zu], rc = %i",
                                i, DKEY(&nodekey), NODEPGNO(node), rc);
#endif
                     if (rc == 0)
                            break;
                     if (rc > 0)
                            low = i + 1;
                     else
                            high = i - 1;
              }
       }

       /* rc and i describe the last entry that was compared. */
       if (rc > 0) { /* Found entry is less than the key. */
              i++;   /* Skip to get the smallest entry larger than key. */
              if (!IS_LEAF2(mp))
                     node = NODEPTR(mp, i);
       }
       if (exactp)
              *exactp = (rc == 0);
       /* store the key index */
       mc->mc_ki[mc->mc_top] = i;
       if (i >= nkeys)
              /* There is no entry larger or equal to the key. */
              return NULL;

       /* nodeptr is fake for LEAF2 */
       return node;
}

Here is the call graph for this function:

Here is the caller graph for this function:

static void mdb_node_shrink ( MDB_page *  mp,
indx_t  indx 
) [static]

Compact the main page after deleting a node on a subpage.

Parameters:
[in] mp: The main page to operate on.
[in] indx: The index of the subpage on the main page.

Definition at line 4821 of file mdb.c.

{
       /*
        * Compact the main page after deleting a node on a subpage.
        *
        * mp   - the main page to operate on.
        * indx - index, on mp, of the node whose data is the sub-page.
        *
        * The gap between the sub-page's mp_lower and mp_upper is removed:
        * the sub-page is rebuilt `delta` bytes higher inside the node, the
        * node's data size is reduced by `delta`, and everything on the main
        * page from mp_upper up to the end of this node's key is slid up by
        * the same amount.
        */
       MDB_node *node;
       MDB_page *sp, *xp;
       char *base;
       int osize, nsize;
       int delta;
       indx_t         i, numkeys, ptr;

       /* Locate the node and view its data area as a page (the sub-page). */
       node = NODEPTR(mp, indx);
       sp = (MDB_page *)NODEDATA(node);
       osize = NODEDSZ(node);

       /* delta = bytes between the sub-page's lower and upper marks
        * (presumably its unused space); the node's data shrinks by that much.
        */
       delta = sp->mp_upper - sp->mp_lower;
       SETDSZ(node, osize - delta);
       /* xp is where the sub-page will live afterwards: delta bytes above sp. */
       xp = (MDB_page *)((char *)sp + delta);

       /* shift subpage upward */
       if (IS_LEAF2(sp)) {
              /* LEAF2: NUMKEYS(sp) items of mp_pad bytes each, moved as one
               * run; memmove is used because source and destination may overlap.
               */
              nsize = NUMKEYS(sp) * sp->mp_pad;
              memmove(METADATA(xp), METADATA(sp), nsize);
       } else {
              /* This signed `i` shadows the outer indx_t `i` so the countdown
               * loop can reach -1 and stop (presumably indx_t is unsigned -
               * TODO confirm against its typedef).
               */
              int i;
              /* NOTE(review): nsize is assigned here but not read again in
               * this function.
               */
              nsize = osize - sp->mp_upper;
              numkeys = NUMKEYS(sp);
              /* Rewrite each node offset, reduced by delta, into the new
               * header.  The walk goes from the last slot down to the first;
               * xp's array sits delta bytes above sp's, so the two may overlap.
               */
              for (i=numkeys-1; i>=0; i--)
                     xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
       }
       /* Rebuild the header at the new location.  upper is set equal to
        * lower, i.e. the gap is now zero.
        * NOTE(review): sp and xp are only delta bytes apart, so these two
        * headers may overlap - confirm before reordering these assignments.
        */
       xp->mp_upper = sp->mp_lower;
       xp->mp_lower = sp->mp_lower;
       xp->mp_flags = sp->mp_flags;
       xp->mp_pad = sp->mp_pad;
       COPY_PGNO(xp->mp_pgno, mp->mp_pgno);

       /* shift lower nodes upward */
       /* ptr keeps this node's offset as it was before the loop below
        * adjusts the table; every entry at or below that offset (this node
        * included) is bumped by delta.
        */
       ptr = mp->mp_ptrs[indx];
       numkeys = NUMKEYS(mp);
       for (i = 0; i < numkeys; i++) {
              if (mp->mp_ptrs[i] <= ptr)
                     mp->mp_ptrs[i] += delta;
       }

       /* Slide the bytes from mp_upper through the end of this node's key
        * up by delta (NODESIZE is presumably the node header size - confirm),
        * then advance mp_upper by delta to match.
        */
       base = (char *)mp + mp->mp_upper;
       memmove(base + delta, base, ptr - mp->mp_upper + NODESIZE + NODEKSZ(node));
       mp->mp_upper += delta;
}

Here is the caller graph for this function:

int mdb_open ( MDB_txn *  txn,
const char *  name,
unsigned int  flags,
MDB_dbi *  dbi 
)

Open a database in the environment.

The database handle may be discarded by calling mdb_close(). Only one thread at a time may call this function; it is not mutex-protected in a read-only transaction.

Parameters:
[in] txn: A transaction handle returned by mdb_txn_begin().
[in] name: The name of the database to open. If only a single database is needed in the environment, this value may be NULL.
[in] flags: Special options for this database. This parameter must be set to 0 or formed by bitwise OR'ing together one or more of the values described here.
  • MDB_REVERSEKEY Keys are strings to be compared in reverse order, from the end of the strings to the beginning. By default, keys are treated as strings and compared from beginning to end.
  • MDB_DUPSORT Duplicate keys may be used in the database. (Or, from another perspective, keys may have multiple data items, stored in sorted order.) By default keys must be unique and may have only a single data item.
  • MDB_INTEGERKEY Keys are binary integers in native byte order. Setting this option requires all keys to be the same size, typically sizeof(int) or sizeof(size_t).
  • MDB_DUPFIXED This flag may only be used in combination with MDB_DUPSORT. This option tells the library that the data items for this database are all the same size, which allows further optimizations in storage and retrieval. When all data items are the same size, the MDB_GET_MULTIPLE and MDB_NEXT_MULTIPLE cursor operations may be used to retrieve multiple items at once.
  • MDB_INTEGERDUP This option specifies that duplicate data items are also integers, and should be sorted as such.
  • MDB_REVERSEDUP This option specifies that duplicate data items should be compared as strings in reverse order.
  • MDB_CREATE Create the named database if it doesn't exist. This option is not allowed in a read-only transaction or a read-only environment.
[out] dbi: Address where the new MDB_dbi handle will be stored.
Returns:
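A non-zero error value on failure and 0 on success. Some possible errors are:
  • MDB_NOTFOUND - the named database does not exist in the environment and MDB_CREATE was not specified.
  • ENFILE - too many databases have been opened in this environment.
  • EINVAL - the name exists in the main database but does not refer to a database.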

Definition at line 6124 of file mdb.c.

{
       /*
        * Open the database called `name` in `txn` (or create it when
        * MDB_CREATE is set) and store its handle in *dbi.  A NULL name
        * selects the main database.
        *
        * Returns MDB_SUCCESS on success, otherwise:
        *   ENFILE - the handle table has no free slot;
        *   EINVAL - `name` exists in the main DB but is not a sub-database;
        *   ENOMEM - the name could not be duplicated for the handle table;
        *   any error reported by the cursor lookup or insert, e.g.
        *   MDB_NOTFOUND when the DB is missing and MDB_CREATE was not given.
        */
       MDB_val key, data;
       MDB_dbi i;
       MDB_cursor mc;
       /* Declared at function scope: after a create, data.mv_data may still
        * point at this record when it is copied into the handle table below,
        * so it must outlive the branch that fills it in.
        */
       MDB_db dummy;
       int rc, dbflag, exact;
       size_t len;
       char *namedup;

       /* Make sure the free-list DB has a comparison function installed. */
       if (txn->mt_dbxs[FREE_DBI].md_cmp == NULL) {
              mdb_default_cmp(txn, FREE_DBI);
       }

       /* main DB? */
       if (!name) {
              *dbi = MAIN_DBI;
              /* Only these key-ordering flags are applied to the main DB. */
              if (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY))
                     txn->mt_dbs[MAIN_DBI].md_flags |= (flags & (MDB_DUPSORT|MDB_REVERSEKEY|MDB_INTEGERKEY));
              mdb_default_cmp(txn, MAIN_DBI);
              return MDB_SUCCESS;
       }

       /* The lookup below searches the main DB, so it needs a comparator. */
       if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
              mdb_default_cmp(txn, MAIN_DBI);
       }

       /* Is the DB already open?  Slots 0 and 1 are skipped by this scan. */
       len = strlen(name);
       for (i=2; i<txn->mt_numdbs; i++) {
              if (len == txn->mt_dbxs[i].md_name.mv_size &&
                     !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
                     *dbi = i;
                     return MDB_SUCCESS;
              }
       }

       if (txn->mt_numdbs >= txn->mt_env->me_maxdbs - 1)
              return ENFILE;

       /* Find the DB info: named databases are records in the main DB. */
       dbflag = 0;
       exact = 0;
       key.mv_size = len;
       key.mv_data = (void *)name;
       mdb_cursor_init(&mc, txn, MAIN_DBI, NULL);
       rc = mdb_cursor_set(&mc, &key, &data, MDB_SET, &exact);
       if (rc == MDB_SUCCESS) {
              /* make sure this is actually a DB */
              MDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
              if (!(node->mn_flags & F_SUBDATA))
                     return EINVAL;
       } else if (rc == MDB_NOTFOUND && (flags & MDB_CREATE)) {
              /* Create if requested: insert an empty DB record under `name`. */
              data.mv_size = sizeof(MDB_db);
              data.mv_data = &dummy;
              memset(&dummy, 0, sizeof(dummy));
              dummy.md_root = P_INVALID;
              dummy.md_flags = flags & 0xffff;
              rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA);
              dbflag = DB_DIRTY;
       }

       /* Lookup or creation failed: nothing to add to the table. */
       if (rc != MDB_SUCCESS)
              return rc;

       /* Never register a handle with a NULL name; the "already open" scan
        * above would pass it to strncmp.  If the record was just created it
        * stays in the transaction; the caller decides whether to abort.
        */
       namedup = strdup(name);
       if (namedup == NULL)
              return ENOMEM;

       /* OK, got info, add to table */
       txn->mt_dbxs[txn->mt_numdbs].md_name.mv_data = namedup;
       txn->mt_dbxs[txn->mt_numdbs].md_name.mv_size = len;
       txn->mt_dbxs[txn->mt_numdbs].md_rel = NULL;
       txn->mt_dbflags[txn->mt_numdbs] = dbflag;
       memcpy(&txn->mt_dbs[txn->mt_numdbs], data.mv_data, sizeof(MDB_db));
       *dbi = txn->mt_numdbs;
       /* Mirror the record into both of the environment's DB tables. */
       txn->mt_env->me_dbs[0][txn->mt_numdbs] = txn->mt_dbs[txn->mt_numdbs];
       txn->mt_env->me_dbs[1][txn->mt_numdbs] = txn->mt_dbs[txn->mt_numdbs];
       mdb_default_cmp(txn, txn->mt_numdbs);
       txn->mt_numdbs++;

       return MDB_SUCCESS;
}

Here is the call graph for this function: