SQLite4
Documentation
Not logged in
/*
** 2011-08-18
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains the implementation of an in-memory tree structure.
**
** Technically the tree is a B-tree of order 4 (in the Knuth sense - each 
** node may have up to 4 children). Keys are stored within B-tree nodes by
** reference. This may be slightly slower than a conventional red-black
** tree, but it is simpler. It is also an easier structure to modify to 
** create a version that supports nested transaction rollback.
**
** This tree does not currently support a delete operation. One is not 
** required. When LSM deletes a key from a database, it inserts a DELETE
** marker into the data structure. As a result, although the value associated
** with a key stored in the in-memory tree structure may be modified, no
** keys are ever removed. 
*/

/*
** MVCC NOTES
**
**   The in-memory tree structure supports SQLite-style MVCC. This means
**   that while one client is writing to the tree structure, other clients
**   may still be querying an older snapshot of the tree.
**
**   One way to implement this is to use an append-only b-tree. In this 
**   case instead of modifying nodes in-place, a copy of the node is made
**   and the required modifications made to the copy. The parent of the
**   node is then modified (to update the pointer so that it points to
**   the new copy), which causes a copy of the parent to be made, and so on.
**   This means that each time the tree is written to a new root node is
**   created. A snapshot is identified by the root node that it uses.
**
**   The problem with the above is that each time the tree is written to,
**   a copy of the node structure modified and all of its ancestor nodes
**   is made. This may prove excessive with large tree structures.
**
**   To reduce this overhead, the data structure used for a tree node is
**   designed so that it may be edited in place exactly once without 
**   affecting existing users. In other words, the node structure is capable
**   of storing two separate versions of the node at the same time.
**   When a node is to be edited, if the node structure already contains 
**   two versions, a copy is made as in the append-only approach. Or, if
**   it only contains a single version, it is edited in place.
**
**   This reduces the overhead so that, roughly, one new node structure
**   must be allocated for each write (on top of those allocations that 
**   would have been required by a non-MVCC tree). Logic: Assume that at 
**   any time, 50% of nodes in the tree already contain 2 versions. When
**   a new entry is written to a node, there is a 50% chance that a copy
**   of the node will be required. And a 25% chance that a copy of its 
**   parent is required. And so on.
**
** ROLLBACK
**
**   The in-memory tree also supports transaction and sub-transaction 
**   rollback. In order to rollback to point in time X, the following is
**   necessary:
**
**     1. All memory allocated since X must be freed, and 
**     2. All "v2" data adding to nodes that existed at X should be zeroed.
**     3. The root node must be restored to its X value.
**
**   The Mempool object used to allocate memory for the tree supports 
**   operation (1) - see the lsmPoolMark() and lsmPoolRevert() functions.
**
**   To support (2), all nodes that have v2 data are part of a singly linked 
**   list, sorted by the age of the v2 data (nodes that have had data added 
**   most recently are at the end of the list). So to zero all v2 data added
**   since X, the linked list is traversed from the first node added following
**   X onwards.
**
*/

#ifndef _LSM_INT_H
# include "lsmInt.h"
#endif

#include <string.h>

#define MAX_DEPTH 32

typedef struct TreeKey TreeKey;
typedef struct TreeNode TreeNode;
typedef struct TreeLeaf TreeLeaf;
typedef struct NodeVersion NodeVersion;

struct TreeOld {
  u32 iShmid;                     /* Last shared-memory chunk in use by old */
  u32 iRoot;                      /* Offset of root node in shm file */
  u32 nHeight;                    /* Height of tree structure */
};

#ifndef NDEBUG
/*
** assert() that a TreeKey.flags value is sane. Usage:
**
**   assert( lsmAssertFlagsOk(pTreeKey->flags) );
*/
static int lsmAssertFlagsOk(u8 keyflags){
  /* At least one flag must be set. Otherwise, what is this key doing? */
  assert( keyflags!=0 );

  /* The POINT_DELETE and INSERT flags cannot both be set. */
  assert( (keyflags & LSM_POINT_DELETE)==0 || (keyflags & LSM_INSERT)==0 );

  /* If both the START_DELETE and END_DELETE flags are set, then the INSERT
  ** flag must also be set. In other words - the three DELETE flags cannot
  ** all be set */
  assert( (keyflags & LSM_END_DELETE)==0 
       || (keyflags & LSM_START_DELETE)==0 
       || (keyflags & LSM_POINT_DELETE)==0 
  );

  return 1;
}

static int assert_delete_ranges_match(lsm_db *);
static int treeCountEntries(lsm_db *db);
#else
# define lsmAssertFlagsOk(x)
#endif

/*
** Container for a key-value pair. Within the *-shm file, each key/value
** pair is stored in a single allocation (which may not actually be 
** contiguous in memory). Layout is the TreeKey structure, followed by
** the nKey bytes of key blob, followed by the nValue bytes of value blob
** (if nValue is non-negative).
*/
struct TreeKey {
  int nKey;                       /* Size of pKey in bytes */
  int nValue;                     /* Size of pValue. Or negative. */
  u8 flags;                       /* Various LSM_XXX flags */
};

#define TKV_KEY(p) ((void *)&(p)[1])
#define TKV_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey))

/*
** A single tree node. A node structure may contain up to 3 key/value
** pairs. Internal (non-leaf) nodes have up to 4 children.
**
** TODO: Update the format of this to be more compact. Get it working
** first though...
*/
struct TreeNode {
  u32 aiKeyPtr[3];                /* Array of pointers to TreeKey objects */

  /* The following fields are present for interior nodes only, not leaves. */
  u32 aiChildPtr[4];              /* Array of pointers to child nodes */

  /* The extra child pointer slot. */
  u32 iV2;                        /* Transaction number of v2 */
  u8 iV2Child;                    /* apChild[] entry replaced by pV2Ptr */
  u32 iV2Ptr;                     /* Substitute pointer */
};

struct TreeLeaf {
  u32 aiKeyPtr[3];                /* Array of pointers to TreeKey objects */
};

typedef struct TreeBlob TreeBlob;
struct TreeBlob {
  int n;
  u8 *a;
};

/*
** Cursor for searching a tree structure.
**
** If a cursor does not point to any element (a.k.a. EOF), then the
** TreeCursor.iNode variable is set to a negative value. Otherwise, the
** cursor currently points to key aiCell[iNode] on node apTreeNode[iNode].
**
** Entries in the apTreeNode[] and aiCell[] arrays contain the node and
** index of the TreeNode.apChild[] pointer followed to descend to the 
** current element. Hence apTreeNode[0] always contains the root node of
** the tree.
*/
struct TreeCursor {
  lsm_db *pDb;                    /* Database handle for this cursor */
  TreeRoot *pRoot;                /* Root node and height of tree to access */
  int iNode;                      /* Cursor points at apTreeNode[iNode] */
  TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */
  u8 aiCell[MAX_DEPTH];           /* Current position in tree */
  TreeKey *pSave;                 /* Saved key */
  TreeBlob blob;                  /* Dynamic storage for a key */
};

/*
** A value guaranteed to be larger than the largest possible transaction
** id (TreeHeader.iTransId).
*/
#define WORKING_VERSION (1<<30)

static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){
  if( n>p->n ){
    lsmFree(pDb->pEnv, p->a);
    p->a = lsmMallocRc(pDb->pEnv, n, pRc);
    p->n = n;
  }
  return (p->a==0);
}
static void tblobFree(lsm_db *pDb, TreeBlob *p){
  lsmFree(pDb->pEnv, p->a);
}


/***********************************************************************
** Start of IntArray methods.  */
/*
** Append value iVal to the contents of IntArray *p. Return LSM_OK if 
** successful, or LSM_NOMEM if an OOM condition is encountered.
*/
static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){
  assert( p->nArray<=p->nAlloc );
  if( p->nArray>=p->nAlloc ){
    u32 *aNew;
    int nNew = p->nArray ? p->nArray*2 : 128;
    aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32));
    if( !aNew ) return LSM_NOMEM_BKPT;
    p->aArray = aNew;
    p->nAlloc = nNew;
  }

  p->aArray[p->nArray++] = iVal;
  return LSM_OK;
}

/*
** Zero the IntArray object.
*/
static void intArrayFree(lsm_env *pEnv, IntArray *p){
  p->nArray = 0;
}

/*
** Return the number of entries currently in the int-array object.
*/
static int intArraySize(IntArray *p){
  return p->nArray;
}

/*
** Return a copy of the iIdx'th entry in the int-array.
*/
static u32 intArrayEntry(IntArray *p, int iIdx){
  return p->aArray[iIdx];
}

/*
** Truncate the int-array so that all but the first nVal values are 
** discarded.
*/
static void intArrayTruncate(IntArray *p, int nVal){
  p->nArray = nVal;
}
/* End of IntArray methods.
***********************************************************************/

static int treeKeycmp(void *p1, int n1, void *p2, int n2){
  int res;
  res = memcmp(p1, p2, LSM_MIN(n1, n2));
  if( res==0 ) res = (n1-n2);
  return res;
}

/*
** The pointer passed as the first argument points to an interior node,
** not a leaf. This function returns the offset of the iCell'th child
** sub-tree of the node.
*/
static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){
  assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) );
  if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Child ) return p->iV2Ptr;
  return p->aiChildPtr[iCell];
}

/*
** Given an offset within the *-shm file, return the associated chunk number.
*/
static int treeOffsetToChunk(u32 iOff){
  assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
  return (int)(iOff>>15);
}

#define treeShmptrUnsafe(pDb, iPtr) \
(&((u8*)((pDb)->apShm[(iPtr)>>15]))[(iPtr) & (LSM_SHM_CHUNK_SIZE-1)])

/*
** Return a pointer to the mapped memory location associated with *-shm 
** file offset iPtr.
*/
static void *treeShmptr(lsm_db *pDb, u32 iPtr){

  assert( (iPtr>>15)<pDb->nShm );
  assert( pDb->apShm[iPtr>>15] );

  return iPtr ? treeShmptrUnsafe(pDb, iPtr) : 0;
}

static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){
  return (ShmChunk *)(pDb->apShm[iChunk]);
}

static ShmChunk * treeShmChunkRc(lsm_db *pDb, int iChunk, int *pRc){
  assert( *pRc==LSM_OK );
  if( iChunk<pDb->nShm || LSM_OK==(*pRc = lsmShmCacheChunks(pDb, iChunk+1)) ){
    return (ShmChunk *)(pDb->apShm[iChunk]);
  }
  return 0;
}


#ifndef NDEBUG
static void assertIsWorkingChild(
  lsm_db *db, 
  TreeNode *pNode, 
  TreeNode *pParent, 
  int iCell
){
  TreeNode *p;
  u32 iPtr = getChildPtr(pParent, WORKING_VERSION, iCell);
  p = treeShmptr(db, iPtr);
  assert( p==pNode );
}
#else
# define assertIsWorkingChild(w,x,y,z)
#endif

/* Values for the third argument to treeShmkey(). */
#define TKV_LOADKEY  1
#define TKV_LOADVAL  2

static TreeKey *treeShmkey(
  lsm_db *pDb,                    /* Database handle */
  u32 iPtr,                       /* Shmptr to TreeKey struct */
  int eLoad,                      /* Either zero or a TREEKEY_LOADXXX value */
  TreeBlob *pBlob,                /* Used if dynamic memory is required */
  int *pRc                        /* IN/OUT: Error code */
){
  TreeKey *pRet;

  assert( eLoad==TKV_LOADKEY || eLoad==TKV_LOADVAL );
  pRet = (TreeKey *)treeShmptr(pDb, iPtr);
  if( pRet ){
    int nReq;                     /* Bytes of space required at pRet */
    int nAvail;                   /* Bytes of space available at pRet */

    nReq = sizeof(TreeKey) + pRet->nKey;
    if( eLoad==TKV_LOADVAL && pRet->nValue>0 ){
      nReq += pRet->nValue;
    }
    assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
    nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1));

    if( nAvail<nReq ){
      if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){
        int nLoad = 0;
        while( *pRc==LSM_OK ){
          ShmChunk *pChunk;
          void *p = treeShmptr(pDb, iPtr);
          int n = LSM_MIN(nAvail, nReq-nLoad);

          memcpy(&pBlob->a[nLoad], p, n);
          nLoad += n;
          if( nLoad==nReq ) break;

          pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr));
          assert( pChunk );
          iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR;
          nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR;
        }
      }
      pRet = (TreeKey *)(pBlob->a);
    }
  }

  return pRet;
}

#if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT)
void assert_leaf_looks_ok(TreeNode *pNode){
  assert( pNode->apKey[1] );
}

void assert_node_looks_ok(TreeNode *pNode, int nHeight){
  if( pNode ){
    assert( pNode->apKey[1] );
    if( nHeight>1 ){
      int i;
      assert( getChildPtr(pNode, WORKING_VERSION, 1) );
      assert( getChildPtr(pNode, WORKING_VERSION, 2) );
      for(i=0; i<4; i++){
        assert_node_looks_ok(getChildPtr(pNode, WORKING_VERSION, i), nHeight-1);
      }
    }
  }
}

/*
** Run various assert() statements to check that the working-version of the
** tree is correct in the following respects:
**
**   * todo...
*/
void assert_tree_looks_ok(int rc, Tree *pTree){
}
#else
# define assert_tree_looks_ok(x,y)
#endif

void lsmFlagsToString(int flags, char *zFlags){

  zFlags[0] = (flags & LSM_END_DELETE)   ? ']' : '.';

  /* Only one of LSM_POINT_DELETE, LSM_INSERT and LSM_SEPARATOR should ever
  ** be set. If this is not true, write a '?' to the output.  */
  switch( flags & (LSM_POINT_DELETE|LSM_INSERT|LSM_SEPARATOR) ){
    case 0:                zFlags[1] = '.'; break;
    case LSM_POINT_DELETE: zFlags[1] = '-'; break;
    case LSM_INSERT:       zFlags[1] = '+'; break;
    case LSM_SEPARATOR:    zFlags[1] = '^'; break;
    default:               zFlags[1] = '?'; break;
  }

  zFlags[2] = (flags & LSM_SYSTEMKEY)    ? '*' : '.';
  zFlags[3] = (flags & LSM_START_DELETE) ? '[' : '.';
  zFlags[4] = '\0';
}

#ifdef LSM_DEBUG

/*
** Pointer pBlob points to a buffer containing a blob of binary data
** nBlob bytes long. Append the contents of this blob to *pStr, with
** each octet represented by a 2-digit hexadecimal number. For example,
** if the input blob is three bytes in size and contains {0x01, 0x44, 0xFF},
** then "0144ff" is appended to *pStr.
*/
static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){
  int i;
  lsmStringExtend(pStr, nBlob*2);
  if( pStr->nAlloc==0 ) return;
  for(i=0; i<nBlob; i++){
    u8 c = ((u8*)pBlob)[i];
    if( c>='a' && c<='z' ){
      pStr->z[pStr->n++] = c;
    }else if( c!=0 || nBlob==1 || i!=(nBlob-1) ){
      pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf];
      pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf];
    }
  }
  pStr->z[pStr->n] = 0;
}

/*
** Append nIndent space (0x20) characters to string *pStr.
*/
static void lsmAppendIndent(LsmString *pStr, int nIndent){
  int i;
  lsmStringExtend(pStr, nIndent);
  for(i=0; i<nIndent; i++) lsmStringAppend(pStr, " ", 1);
}

static void strAppendFlags(LsmString *pStr, u8 flags){
  char zFlags[8];

  lsmFlagsToString(flags, zFlags);
  zFlags[4] = ':';

  lsmStringAppend(pStr, zFlags, 5);
}

void dump_node_contents(
  lsm_db *pDb,
  u32 iNode,                      /* Print out the contents of this node */
  char *zPath,                    /* Path from root to this node */
  int nPath,                      /* Number of bytes in zPath */
  int nHeight                     /* Height: (0==leaf) (1==parent-of-leaf) */
){
  const char *zSpace = "                                           ";
  int i;
  int rc = LSM_OK;
  LsmString s;
  TreeNode *pNode;
  TreeBlob b = {0, 0};

  pNode = (TreeNode *)treeShmptr(pDb, iNode);

  if( nHeight==0 ){
    /* Append the nIndent bytes of space to string s. */
    lsmStringInit(&s, pDb->pEnv);

    /* Append each key to string s. */
    for(i=0; i<3; i++){
      u32 iPtr = pNode->aiKeyPtr[i];
      if( iPtr ){
        TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i],TKV_LOADKEY, &b,&rc);
        strAppendFlags(&s, pKey->flags);
        lsmAppendStrBlob(&s, TKV_KEY(pKey), pKey->nKey);
        lsmStringAppend(&s, "     ", -1);
      }
    }

    printf("% 6d %.*sleaf%.*s: %s\n", 
        iNode, nPath, zPath, 20-nPath-4, zSpace, s.z
    );
    lsmStringClear(&s);
  }else{
    for(i=0; i<4 && nHeight>0; i++){
      u32 iPtr = getChildPtr(pNode, pDb->treehdr.root.iTransId, i);
      zPath[nPath] = i+'0';
      zPath[nPath+1] = '/';

      if( iPtr ){
        dump_node_contents(pDb, iPtr, zPath, nPath+2, nHeight-1);
      }
      if( i!=3 && pNode->aiKeyPtr[i] ){
        TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TKV_LOADKEY,&b,&rc);
        lsmStringInit(&s, pDb->pEnv);
        strAppendFlags(&s, pKey->flags);
        lsmAppendStrBlob(&s, TKV_KEY(pKey), pKey->nKey);
        printf("% 6d %.*s%.*s: %s\n", 
            iNode, nPath+1, zPath, 20-nPath-1, zSpace, s.z);
        lsmStringClear(&s);
      }
    }
  }

  tblobFree(pDb, &b);
}

void dump_tree_contents(lsm_db *pDb, const char *zCaption){
  char zPath[64];
  TreeRoot *p = &pDb->treehdr.root;
  printf("\n%s\n", zCaption);
  zPath[0] = '/';
  if( p->iRoot ){
    dump_node_contents(pDb, p->iRoot, zPath, 1, p->nHeight-1);
  }
  fflush(stdout);
}

#endif

/*
** Initialize a cursor object, the space for which has already been
** allocated.
*/
static void treeCursorInit(lsm_db *pDb, int bOld, TreeCursor *pCsr){
  memset(pCsr, 0, sizeof(TreeCursor));
  pCsr->pDb = pDb;
  if( bOld ){
    pCsr->pRoot = &pDb->treehdr.oldroot;
  }else{
    pCsr->pRoot = &pDb->treehdr.root;
  }
  pCsr->iNode = -1;
}

/*
** Return a pointer to the mapping of the TreeKey object that the cursor
** is pointing to. 
*/
static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){
  TreeKey *pRet;
  lsm_db *pDb = pCsr->pDb;
  u32 iPtr = pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]];

  assert( iPtr );
  pRet = (TreeKey*)treeShmptrUnsafe(pDb, iPtr);
  if( !(pRet->flags & LSM_CONTIGUOUS) ){
    pRet = treeShmkey(pDb, iPtr, TKV_LOADVAL, pBlob, pRc);
  }

  return pRet;
}

/*
** Save the current position of tree cursor pCsr.
*/
int lsmTreeCursorSave(TreeCursor *pCsr){
  int rc = LSM_OK;
  if( pCsr && pCsr->pSave==0 ){
    int iNode = pCsr->iNode;
    if( iNode>=0 ){
      pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc);
    }
    pCsr->iNode = -1;
  }
  return rc;
}

/*
** Restore the position of a saved tree cursor.
*/
static int treeCursorRestore(TreeCursor *pCsr, int *pRes){
  int rc = LSM_OK;
  if( pCsr->pSave ){
    TreeKey *pKey = pCsr->pSave;
    pCsr->pSave = 0;
    if( pRes ){
      rc = lsmTreeCursorSeek(pCsr, TKV_KEY(pKey), pKey->nKey, pRes);
    }
  }
  return rc;
}

/*
** Allocate nByte bytes of space within the *-shm file. If successful, 
** return LSM_OK and set *piPtr to the offset within the file at which
** the allocated space is located.
*/
static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){
  u32 iRet = 0;
  if( *pRc==LSM_OK ){
    const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE;
    const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR;
    u32 iWrite;                   /* Current write offset */
    u32 iEof;                     /* End of current chunk */
    int iChunk;                   /* Current chunk */

    assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) );

    /* Check if there is enough space on the current chunk to fit the
    ** new allocation. If not, link in a new chunk and put the new
    ** allocation at the start of it.  */
    iWrite = pDb->treehdr.iWrite;
    if( bAlign ){
      iWrite = (iWrite + 3) & ~0x0003;
      assert( (iWrite % 4)==0 );
    }

    assert( iWrite );
    iChunk = treeOffsetToChunk(iWrite-1);
    iEof = (iChunk+1) * CHUNK_SIZE;
    assert( iEof>=iWrite && (iEof-iWrite)<CHUNK_SIZE );
    if( (iWrite+nByte)>iEof ){
      ShmChunk *pHdr;           /* Header of chunk just finished (iChunk) */
      ShmChunk *pFirst;         /* Header of chunk treehdr.iFirst */
      ShmChunk *pNext;          /* Header of new chunk */
      int iNext = 0;            /* Next chunk */
      int rc = LSM_OK;

      pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst);

      assert( shm_sequence_ge(pDb->treehdr.iUsedShmid, pFirst->iShmid) );
      assert( (pDb->treehdr.iNextShmid+1-pDb->treehdr.nChunk)==pFirst->iShmid );

      /* Check if the chunk at the start of the linked list is still in
      ** use. If not, reuse it. If so, allocate a new chunk by appending
      ** to the *-shm file.  */
      if( pDb->treehdr.iUsedShmid!=pFirst->iShmid ){
        int bInUse;
        rc = lsmTreeInUse(pDb, pFirst->iShmid, &bInUse);
        if( rc!=LSM_OK ){
          *pRc = rc;
          return 0;
        }
        if( bInUse==0 ){
          iNext = pDb->treehdr.iFirst;
          pDb->treehdr.iFirst = pFirst->iNext;
          assert( pDb->treehdr.iFirst );
        }
      }
      if( iNext==0 ) iNext = pDb->treehdr.nChunk++;

      /* Set the header values for the new chunk */
      pNext = treeShmChunkRc(pDb, iNext, &rc);
      if( pNext ){
        pNext->iNext = 0;
        pNext->iShmid = (pDb->treehdr.iNextShmid++);
      }else{
        *pRc = rc;
        return 0;
      }

      /* Set the header values for the chunk just finished */
      pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE);
      pHdr->iNext = iNext;

      /* Advance to the next chunk */
      iWrite = iNext * CHUNK_SIZE + CHUNK_HDR;
    }

    /* Allocate space at iWrite. */
    iRet = iWrite;
    pDb->treehdr.iWrite = iWrite + nByte;
    pDb->treehdr.root.nByte += nByte;
  }
  return iRet;
}

/*
** Allocate and zero nByte bytes of space within the *-shm file.
*/
static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){
  u32 iPtr;
  void *p;
  iPtr = treeShmalloc(pDb, 1, nByte, pRc);
  p = treeShmptr(pDb, iPtr);
  if( p ){
    assert( *pRc==LSM_OK );
    memset(p, 0, nByte);
    *piPtr = iPtr;
  }
  return p;
}

static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){
  return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc);
}

static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){
  return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc);
}

static TreeKey *newTreeKey(
  lsm_db *pDb, 
  u32 *piPtr, 
  void *pKey, int nKey,           /* Key data */
  void *pVal, int nVal,           /* Value data (or nVal<0 for delete) */
  int *pRc
){
  TreeKey *p;
  u32 iPtr;
  u32 iEnd;
  int nRem;
  u8 *a;
  int n;

  /* Allocate space for the TreeKey structure itself */
  *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc);
  p = treeShmptr(pDb, iPtr);
  if( *pRc ) return 0;
  p->nKey = nKey;
  p->nValue = nVal;

  /* Allocate and populate the space required for the key and value. */
  n = nRem = nKey;
  a = (u8 *)pKey;
  while( a ){
    while( nRem>0 ){
      u8 *aAlloc;
      int nAlloc;
      u32 iWrite;

      iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1));
      iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR);
      nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), nRem);

      aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc));
      if( aAlloc==0 ) break;
      memcpy(aAlloc, &a[n-nRem], nAlloc);
      nRem -= nAlloc;
    }
    a = pVal;
    n = nRem = nVal;
    pVal = 0;
  }

  iEnd = iPtr + sizeof(TreeKey) + nKey + LSM_MAX(0, nVal);
  if( (iPtr & ~(LSM_SHM_CHUNK_SIZE-1))!=(iEnd & ~(LSM_SHM_CHUNK_SIZE-1)) ){
    p->flags = 0;
  }else{
    p->flags = LSM_CONTIGUOUS;
  }

  if( *pRc ) return 0;
#if 0
  printf("store: %d %s\n", (int)iPtr, (char *)pKey);
#endif
  return p;
}

static TreeNode *copyTreeNode(
  lsm_db *pDb, 
  TreeNode *pOld, 
  u32 *piNew, 
  int *pRc
){
  TreeNode *pNew;

  pNew = newTreeNode(pDb, piNew, pRc);
  if( pNew ){
    memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr));
    memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr));
    if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr;
  }
  return pNew;
}

static TreeNode *copyTreeLeaf(
  lsm_db *pDb, 
  TreeLeaf *pOld, 
  u32 *piNew, 
  int *pRc
){
  TreeLeaf *pNew;
  pNew = newTreeLeaf(pDb, piNew, pRc);
  if( pNew ){
    memcpy(pNew, pOld, sizeof(TreeLeaf));
  }
  return (TreeNode *)pNew;
}

/*
** The tree cursor passed as the second argument currently points to an 
** internal node (not a leaf). Specifically, to a sub-tree pointer. This
** function replaces the sub-tree that the cursor currently points to
** with sub-tree pNew.
**
** The sub-tree may be replaced either by writing the "v2 data" on the
** internal node, or by allocating a new TreeNode structure and then 
** calling this function on the parent of the internal node.
*/
static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){
  int rc = LSM_OK;
  if( pCsr->iNode<0 ){
    /* iNew is the new root node */
    pDb->treehdr.root.iRoot = iNew;
  }else{
    /* If this node already has version 2 content, allocate a copy and
    ** update the copy with the new pointer value. Otherwise, store the
    ** new pointer as v2 data within the current node structure.  */

    TreeNode *p;                  /* The node to be modified */
    int iChildPtr;                /* apChild[] entry to modify */

    p = pCsr->apTreeNode[pCsr->iNode];
    iChildPtr = pCsr->aiCell[pCsr->iNode];

    if( p->iV2 ){
      /* The "allocate new TreeNode" option */
      u32 iCopy;
      TreeNode *pCopy;
      pCopy = copyTreeNode(pDb, p, &iCopy, &rc);
      if( pCopy ){
        assert( rc==LSM_OK );
        pCopy->aiChildPtr[iChildPtr] = iNew;
        pCsr->iNode--;
        rc = treeUpdatePtr(pDb, pCsr, iCopy);
      }
    }else{
      /* The "v2 data" option */
      u32 iPtr;
      assert( pDb->treehdr.root.iTransId>0 );

      if( pCsr->iNode ){
        iPtr = getChildPtr(
            pCsr->apTreeNode[pCsr->iNode-1], 
            pDb->treehdr.root.iTransId, pCsr->aiCell[pCsr->iNode-1]
        );
      }else{
        iPtr = pDb->treehdr.root.iRoot;
      }
      rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr);

      if( rc==LSM_OK ){
        p->iV2 = pDb->treehdr.root.iTransId;
        p->iV2Child = (u8)iChildPtr;
        p->iV2Ptr = iNew;
      }
    }
  }

  return rc;
}

/*
** Cursor pCsr points at a node that is part of pTree. This function
** inserts a new key and optionally child node pointer into that node.
**
** The position into which the new key and pointer are inserted is
** determined by the iSlot parameter. The new key will be inserted to
** the left of the key currently stored in apKey[iSlot]. Or, if iSlot is
** greater than the index of the rightmost key in the node.
**
** Pointer pLeftPtr points to a child tree that contains keys that are
** smaller than pTreeKey.
*/
static int treeInsert(
  lsm_db *pDb,                    /* Database handle */
  TreeCursor *pCsr,               /* Cursor indicating path to insert at */
  u32 iLeftPtr,                   /* Left child pointer */
  u32 iTreeKey,                   /* Location of key to insert */
  u32 iRightPtr,                  /* Right child pointer */
  int iSlot                       /* Position to insert key into */
){
  int rc = LSM_OK;
  TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];

  /* Check if the node is currently full. If so, split pNode in two and
  ** call this function recursively to add a key to the parent. Otherwise, 
  ** insert the new key directly into pNode.  */
  assert( pNode->aiKeyPtr[1] );
  if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){
    u32 iLeft; TreeNode *pLeft;   /* New left-hand sibling node */
    u32 iRight; TreeNode *pRight; /* New right-hand sibling node */

    pLeft = newTreeNode(pDb, &iLeft, &rc);
    pRight = newTreeNode(pDb, &iRight, &rc);
    if( rc ) return rc;

    pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0);
    pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0];
    pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1);

    pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2);
    pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2];
    pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3);

    if( pCsr->iNode==0 ){
      /* pNode is the root of the tree. Grow the tree by one level. */
      u32 iRoot; TreeNode *pRoot; /* New root node */

      pRoot = newTreeNode(pDb, &iRoot, &rc);
      pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1];
      pRoot->aiChildPtr[1] = iLeft;
      pRoot->aiChildPtr[2] = iRight;

      pDb->treehdr.root.iRoot = iRoot;
      pDb->treehdr.root.nHeight++;
    }else{

      pCsr->iNode--;
      rc = treeInsert(pDb, pCsr, 
          iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode]
      );
    }

    assert( pLeft->iV2==0 );
    assert( pRight->iV2==0 );
    switch( iSlot ){
      case 0:
        pLeft->aiKeyPtr[0] = iTreeKey;
        pLeft->aiChildPtr[0] = iLeftPtr;
        if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr;
        break;
      case 1:
        pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]);
        pLeft->aiKeyPtr[2] = iTreeKey;
        pLeft->aiChildPtr[2] = iLeftPtr;
        break;
      case 2:
        pRight->aiKeyPtr[0] = iTreeKey;
        pRight->aiChildPtr[0] = iLeftPtr;
        if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr;
        break;
      case 3:
        pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]);
        pRight->aiKeyPtr[2] = iTreeKey;
        pRight->aiChildPtr[2] = iLeftPtr;
        break;
    }

  }else{
    TreeNode *pNew;
    u32 *piKey;
    u32 *piChild;
    u32 iStore = 0;
    u32 iNew = 0;
    int i;

    /* Allocate a new version of node pNode. */
    pNew = newTreeNode(pDb, &iNew, &rc);
    if( rc ) return rc;

    piKey = pNew->aiKeyPtr;
    piChild = pNew->aiChildPtr;

    for(i=0; i<iSlot; i++){
      if( pNode->aiKeyPtr[i] ){
        *(piKey++) = pNode->aiKeyPtr[i];
        *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i);
      }
    }

    *piKey++ = iTreeKey;
    *piChild++ = iLeftPtr;

    iStore = iRightPtr;
    for(i=iSlot; i<3; i++){
      if( pNode->aiKeyPtr[i] ){
        *(piKey++) = pNode->aiKeyPtr[i];
        *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i);
        iStore = 0;
      }
    }

    if( iStore ){
      *piChild = iStore;
    }else{
      *piChild = getChildPtr(pNode, WORKING_VERSION, 
          (pNode->aiKeyPtr[2] ? 3 : 2)
      );
    }
    pCsr->iNode--;
    rc = treeUpdatePtr(pDb, pCsr, iNew);
  }

  return rc;
}

static int treeInsertLeaf(
  lsm_db *pDb,                    /* Database handle */
  TreeCursor *pCsr,               /* Cursor structure */
  u32 iTreeKey,                   /* Key pointer to insert */
  int iSlot                       /* Insert key to the left of this */
){
  int rc = LSM_OK;                /* Return code */
  TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode];
  TreeLeaf *pNew;
  u32 iNew;

  assert( iSlot>=0 && iSlot<=4 );
  assert( pCsr->iNode>0 );
  assert( pLeaf->aiKeyPtr[1] );

  pCsr->iNode--;

  pNew = newTreeLeaf(pDb, &iNew, &rc);
  if( pNew ){
    if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){
      /* The leaf is full. Split it in two. */
      TreeLeaf *pRight;
      u32 iRight;
      pRight = newTreeLeaf(pDb, &iRight, &rc);
      if( pRight ){
        assert( rc==LSM_OK );
        pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0];
        pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2];
        switch( iSlot ){
          case 0: pNew->aiKeyPtr[0] = iTreeKey; break;
          case 1: pNew->aiKeyPtr[2] = iTreeKey; break;
          case 2: pRight->aiKeyPtr[0] = iTreeKey; break;
          case 3: pRight->aiKeyPtr[2] = iTreeKey; break;
        }

        rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, 
            pCsr->aiCell[pCsr->iNode]
        );
      }
    }else{
      int iOut = 0;
      int i;
      for(i=0; i<4; i++){
        if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey;
        if( i<3 && pLeaf->aiKeyPtr[i] ){
          pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i];
        }
      }
      rc = treeUpdatePtr(pDb, pCsr, iNew);
    }
  }

  return rc;
}

void lsmTreeMakeOld(lsm_db *pDb){

  /* A write transaction must be open. Otherwise the code below that
  ** assumes (pDb->pClient->iLogOff) is current may malfunction. 
  **
  ** Update: currently this assert fails due to lsm_flush(), which does
  ** not set nTransOpen.
  */
  assert( /* pDb->nTransOpen>0 && */ pDb->iReader>=0 );

  if( pDb->treehdr.iOldShmid==0 ){
    pDb->treehdr.iOldLog = (pDb->treehdr.log.aRegion[2].iEnd << 1);
    pDb->treehdr.iOldLog |= (~(pDb->pClient->iLogOff) & (i64)0x0001);

    pDb->treehdr.oldcksum0 = pDb->treehdr.log.cksum0;
    pDb->treehdr.oldcksum1 = pDb->treehdr.log.cksum1;
    pDb->treehdr.iOldShmid = pDb->treehdr.iNextShmid-1;
    memcpy(&pDb->treehdr.oldroot, &pDb->treehdr.root, sizeof(TreeRoot));

    pDb->treehdr.root.iTransId = 1;
    pDb->treehdr.root.iRoot = 0;
    pDb->treehdr.root.nHeight = 0;
    pDb->treehdr.root.nByte = 0;
  }
}

void lsmTreeDiscardOld(lsm_db *pDb){
  assert( lsmShmAssertLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL) 
       || lsmShmAssertLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL) 
  );
  pDb->treehdr.iUsedShmid = pDb->treehdr.iOldShmid;
  pDb->treehdr.iOldShmid = 0;
}

int lsmTreeHasOld(lsm_db *pDb){
  return pDb->treehdr.iOldShmid!=0;
}

/*
** This function is called during recovery to initialize the 
** tree header. Only the database connections private copy of the tree-header
** is initialized here - it will be copied into shared memory if log file
** recovery is successful.
*/
int lsmTreeInit(lsm_db *pDb){
  ShmChunk *pOne;
  int rc = LSM_OK;

  memset(&pDb->treehdr, 0, sizeof(TreeHeader));
  pDb->treehdr.root.iTransId = 1;
  pDb->treehdr.iFirst = 1;
  pDb->treehdr.nChunk = 2;
  pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR;
  pDb->treehdr.iNextShmid = 2;
  pDb->treehdr.iUsedShmid = 1;

  pOne = treeShmChunkRc(pDb, 1, &rc);
  if( pOne ){
    pOne->iNext = 0;
    pOne->iShmid = 1;
  }
  return rc;
}

static void treeHeaderChecksum(
  TreeHeader *pHdr, 
  u32 *aCksum
){
  u32 cksum1 = 0x12345678;
  u32 cksum2 = 0x9ABCDEF0;
  u32 *a = (u32 *)pHdr;
  int i;

  assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) );
  assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 );

  for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){
    cksum1 += a[i];
    cksum2 += (cksum1 + a[i+1]);
  }
  aCksum[0] = cksum1;
  aCksum[1] = cksum2;
}

/*
** Return true if the checksum stored in TreeHeader object *pHdr is 
** consistent with the contents of its other fields.
*/
static int treeHeaderChecksumOk(TreeHeader *pHdr){
  u32 aCksum[2];
  treeHeaderChecksum(pHdr, aCksum);
  return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum)));
}

/*
** This type is used by functions lsmTreeRepair() and treeSortByShmid() to
** make relinking the linked list of shared-memory chunks easier.
*/
typedef struct ShmChunkLoc ShmChunkLoc;
struct ShmChunkLoc {
  ShmChunk *pShm;
  u32 iLoc;
};

/*
** This function checks that the linked list of shared memory chunks 
** that starts at chunk db->treehdr.iFirst:
**
**   1) Includes all chunks in the shared-memory region, and
**   2) Links them together in order of ascending shm-id.
**
** If no error occurs and the conditions above are met, LSM_OK is returned.
**
** If either of the conditions are untrue, LSM_CORRUPT is returned. Or, if
** an error is encountered before the checks are completed, another LSM error
** code (i.e. LSM_IOERR or LSM_NOMEM) may be returned.
*/
static int treeCheckLinkedList(lsm_db *db){
  int rc = LSM_OK;
  int nVisit = 0;
  ShmChunk *p;

  p = treeShmChunkRc(db, db->treehdr.iFirst, &rc);
  while( rc==LSM_OK && p ){
    if( p->iNext ){
      if( p->iNext>=db->treehdr.nChunk ){
        rc = LSM_CORRUPT_BKPT;
      }else{
        ShmChunk *pNext = treeShmChunkRc(db, p->iNext, &rc);
        if( rc==LSM_OK ){
          if( pNext->iShmid!=p->iShmid+1 ){
            rc = LSM_CORRUPT_BKPT;
          }
          p = pNext;
        }
      }
    }else{
      p = 0;
    }
    nVisit++;
  }

  if( rc==LSM_OK && nVisit!=db->treehdr.nChunk-1 ){
    rc = LSM_CORRUPT_BKPT;
  }
  return rc;
}

/*
** Iterate through the current in-memory tree. If there are any v2-pointers
** with transaction ids larger than db->treehdr.iTransId, zero them.
*/
static int treeRepairPtrs(lsm_db *db){
  int rc = LSM_OK;

  if( db->treehdr.root.nHeight>1 ){
    TreeCursor csr;               /* Cursor used to iterate through tree */
    u32 iTransId = db->treehdr.root.iTransId;

    /* Initialize the cursor structure. Also decrement the nHeight variable
    ** in the tree-header. This will prevent the cursor from visiting any
    ** leaf nodes.  */
    db->treehdr.root.nHeight--;
    treeCursorInit(db, 0, &csr);

    rc = lsmTreeCursorEnd(&csr, 0);
    while( rc==LSM_OK && lsmTreeCursorValid(&csr) ){
      TreeNode *pNode = csr.apTreeNode[csr.iNode];
      if( pNode->iV2>iTransId ){
        pNode->iV2Child = 0;
        pNode->iV2Ptr = 0;
        pNode->iV2 = 0;
      }
      rc = lsmTreeCursorNext(&csr);
    }
    tblobFree(csr.pDb, &csr.blob);

    db->treehdr.root.nHeight++;
  }

  return rc;
}

static int treeRepairList(lsm_db *db){
  int rc = LSM_OK;
  int i;
  ShmChunk *p;
  ShmChunk *pMin = 0;
  u32 iMin = 0;

  /* Iterate through all shm chunks. Find the smallest shm-id present in
  ** the shared-memory region. */
  for(i=1; rc==LSM_OK && i<db->treehdr.nChunk; i++){
    p = treeShmChunkRc(db, i, &rc);
    if( p && (pMin==0 || shm_sequence_ge(pMin->iShmid, p->iShmid)) ){
      pMin = p;
      iMin = i;
    }
  }

  /* Fix the shm-id values on any chunks with a shm-id greater than or 
  ** equal to treehdr.iNextShmid. Then do a merge-sort of all chunks to 
  ** fix the ShmChunk.iNext pointers.
  */
  if( rc==LSM_OK ){
    int nSort;
    int nByte;
    u32 iPrevShmid;
    ShmChunkLoc *aSort;

    /* Allocate space for a merge sort. */
    nSort = 1;
    while( nSort < (db->treehdr.nChunk-1) ) nSort = nSort * 2;
    nByte = sizeof(ShmChunkLoc) * nSort * 2;
    aSort = lsmMallocZeroRc(db->pEnv, nByte, &rc);
    iPrevShmid = pMin->iShmid;

    /* Fix all shm-ids, if required. */
    if( rc==LSM_OK ){
      iPrevShmid = pMin->iShmid-1;
      for(i=1; i<db->treehdr.nChunk; i++){
        p = treeShmChunk(db, i);
        aSort[i-1].pShm = p;
        aSort[i-1].iLoc = i;
        if( i!=db->treehdr.iFirst ){
          if( shm_sequence_ge(p->iShmid, db->treehdr.iNextShmid) ){
            p->iShmid = iPrevShmid--;
          }
        }
      }
      if( iMin!=db->treehdr.iFirst ){
        p = treeShmChunk(db, db->treehdr.iFirst);
        p->iShmid = iPrevShmid;
      }
    }

    if( rc==LSM_OK ){
      ShmChunkLoc *aSpace = &aSort[nSort];
      for(i=0; i<nSort; i++){
        if( aSort[i].pShm ){
          assert( shm_sequence_ge(aSort[i].pShm->iShmid, iPrevShmid) );
          assert( aSpace[aSort[i].pShm->iShmid - iPrevShmid].pShm==0 );
          aSpace[aSort[i].pShm->iShmid - iPrevShmid] = aSort[i];
        }
      }

      if( aSpace[nSort-1].pShm ) aSpace[nSort-1].pShm->iNext = 0;
      for(i=0; i<nSort-1; i++){
        if( aSpace[i].pShm ){
          aSpace[i].pShm->iNext = aSpace[i+1].iLoc;
        }
      }

      rc = treeCheckLinkedList(db);
      lsmFree(db->pEnv, aSort);
    }
  }

  return rc;
}

/*
** This function is called as part of opening a write-transaction if the
** writer-flag is already set - indicating that the previous writer 
** failed before ending its transaction.
*/
int lsmTreeRepair(lsm_db *db){
  int rc = LSM_OK;
  TreeHeader hdr;
  ShmHeader *pHdr = db->pShmhdr;

  /* Ensure that the two tree-headers are consistent. Copy one over the other
  ** if necessary. Prefer the data from a tree-header for which the checksum
  ** computes. Or, if they both compute, prefer tree-header-1.  */
  if( memcmp(&pHdr->hdr1, &pHdr->hdr2, sizeof(TreeHeader)) ){
    if( treeHeaderChecksumOk(&pHdr->hdr1) ){
      memcpy(&pHdr->hdr2, &pHdr->hdr1, sizeof(TreeHeader));
    }else{
      memcpy(&pHdr->hdr1, &pHdr->hdr2, sizeof(TreeHeader));
    }
  }

  /* Save the connections current copy of the tree-header. It will be 
  ** restored before returning.  */
  memcpy(&hdr, &db->treehdr, sizeof(TreeHeader));

  /* Walk the tree. Zero any v2 pointers with a transaction-id greater than
  ** the transaction-id currently in the tree-headers.  */
  rc = treeRepairPtrs(db);

  /* Repair the linked list of shared-memory chunks. */
  if( rc==LSM_OK ){
    rc = treeRepairList(db);
  }

  memcpy(&db->treehdr, &hdr, sizeof(TreeHeader));
  return rc;
}

static void treeOverwriteKey(lsm_db *db, TreeCursor *pCsr, u32 iKey, int *pRc){
  if( *pRc==LSM_OK ){
    TreeRoot *p = &db->treehdr.root;
    TreeNode *pNew;
    u32 iNew;
    TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];
    int iCell = pCsr->aiCell[pCsr->iNode];

    /* Create a copy of this node */
    if( (pCsr->iNode>0 && pCsr->iNode==(p->nHeight-1)) ){
      pNew = copyTreeLeaf(db, (TreeLeaf *)pNode, &iNew, pRc);
    }else{
      pNew = copyTreeNode(db, pNode, &iNew, pRc);
    }

    if( pNew ){
      /* Modify the value in the new version */
      pNew->aiKeyPtr[iCell] = iKey;

      /* Change the pointer in the parent (if any) to point at the new 
       ** TreeNode */
      pCsr->iNode--;
      treeUpdatePtr(db, pCsr, iNew);
    }
  }
}

static int treeNextIsEndDelete(lsm_db *db, TreeCursor *pCsr){
  int iNode = pCsr->iNode;
  int iCell = pCsr->aiCell[iNode]+1;

  /* Cursor currently points to a leaf node. */
  assert( pCsr->iNode==(db->treehdr.root.nHeight-1) );

  while( iNode>=0 ){
    TreeNode *pNode = pCsr->apTreeNode[iNode];
    if( iCell<3 && pNode->aiKeyPtr[iCell] ){
      int rc = LSM_OK;
      TreeKey *pKey = treeShmptr(db, pNode->aiKeyPtr[iCell]);
      assert( rc==LSM_OK );
      return ((pKey->flags & LSM_END_DELETE) ? 1 : 0);
    }
    iNode--;
    iCell = pCsr->aiCell[iNode];
  }

  return 0;
}

static int treePrevIsStartDelete(lsm_db *db, TreeCursor *pCsr){
  int iNode = pCsr->iNode;

  /* Cursor currently points to a leaf node. */
  assert( pCsr->iNode==(db->treehdr.root.nHeight-1) );

  while( iNode>=0 ){
    TreeNode *pNode = pCsr->apTreeNode[iNode];
    int iCell = pCsr->aiCell[iNode]-1;
    if( iCell>=0 && pNode->aiKeyPtr[iCell] ){
      int rc = LSM_OK;
      TreeKey *pKey = treeShmptr(db, pNode->aiKeyPtr[iCell]);
      assert( rc==LSM_OK );
      return ((pKey->flags & LSM_START_DELETE) ? 1 : 0);
    }
    iNode--;
  }

  return 0;
}


static int treeInsertEntry(
  lsm_db *pDb,                    /* Database handle */
  int flags,                      /* Flags associated with entry */
  void *pKey,                     /* Pointer to key data */
  int nKey,                       /* Size of key data in bytes */
  void *pVal,                     /* Pointer to value data (or NULL) */
  int nVal                        /* Bytes in value data (or -ve for delete) */
){
  int rc = LSM_OK;                /* Return Code */
  TreeKey *pTreeKey;              /* New key-value being inserted */
  u32 iTreeKey;
  TreeRoot *p = &pDb->treehdr.root;
  TreeCursor csr;                 /* Cursor to seek to pKey/nKey */
  int res;                        /* Result of seek operation on csr */

  assert( nVal>=0 || pVal==0 );
  assert_tree_looks_ok(LSM_OK, pTree);
  assert( flags==LSM_INSERT       || flags==LSM_POINT_DELETE 
       || flags==LSM_START_DELETE || flags==LSM_END_DELETE 
  );
  assert( (flags & LSM_CONTIGUOUS)==0 );
#if 0
  dump_tree_contents(pDb, "before");
#endif

  if( p->iRoot ){
    TreeKey *pRes;                /* Key at end of seek operation */
    treeCursorInit(pDb, 0, &csr);

    /* Seek to the leaf (or internal node) that the new key belongs on */
    rc = lsmTreeCursorSeek(&csr, pKey, nKey, &res);
    pRes = csrGetKey(&csr, &csr.blob, &rc);
    if( rc!=LSM_OK ) return rc;

    if( flags==LSM_START_DELETE ){
      /* When inserting a start-delete-range entry, if the key that
      ** occurs immediately before the new entry is already a START_DELETE,
      ** then the new entry is not required.  */
      if( (res<=0 && (pRes->flags & LSM_START_DELETE))
       || (res>0  && treePrevIsStartDelete(pDb, &csr))
      ){ 
        goto insert_entry_out;
      }
    }else if( flags==LSM_END_DELETE ){
      /* When inserting an start-delete-range entry, if the key that
      ** occurs immediately after the new entry is already an END_DELETE,
      ** then the new entry is not required.  */
      if( (res<0  && treeNextIsEndDelete(pDb, &csr))
       || (res>=0 && (pRes->flags & LSM_END_DELETE))
      ){
        goto insert_entry_out;
      }
    }

    if( res==0 && (flags & (LSM_END_DELETE|LSM_START_DELETE)) ){
      if( pRes->flags & LSM_INSERT ){
        nVal = pRes->nValue;
        pVal = TKV_VAL(pRes);
      }
      flags = flags | pRes->flags;
    }

    if( flags & (LSM_INSERT|LSM_POINT_DELETE) ){
      if( (res<0 && (pRes->flags & LSM_START_DELETE))
       || (res>0 && (pRes->flags & LSM_END_DELETE)) 
      ){
        flags = flags | (LSM_END_DELETE|LSM_START_DELETE);
      }else if( res==0 ){
        flags = flags | (pRes->flags & (LSM_END_DELETE|LSM_START_DELETE));
      }
    }
  }else{
    memset(&csr, 0, sizeof(TreeCursor));
  }

  /* Allocate and populate a new key-value pair structure */
  pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc);
  if( rc!=LSM_OK ) return rc;
  assert( pTreeKey->flags==0 || pTreeKey->flags==LSM_CONTIGUOUS );
  pTreeKey->flags |= flags;

  if( p->iRoot==0 ){
    /* The tree is completely empty. Add a new root node and install
    ** (pKey/nKey) as the middle entry. Even though it is a leaf at the
    ** moment, use newTreeNode() to allocate the node (i.e. allocate enough
    ** space for the fields used by interior nodes). This is because the
    ** treeInsert() routine may convert this node to an interior node. */
    TreeNode *pRoot = newTreeNode(pDb, &p->iRoot, &rc);
    if( rc==LSM_OK ){
      assert( p->nHeight==0 );
      pRoot->aiKeyPtr[1] = iTreeKey;
      p->nHeight = 1;
    }
  }else{
    if( res==0 ){
      /* The search found a match within the tree. */
      treeOverwriteKey(pDb, &csr, iTreeKey, &rc);
    }else{
      /* The cursor now points to the leaf node into which the new entry should
      ** be inserted. There may or may not be a free slot within the leaf for
      ** the new key-value pair. 
      **
      ** iSlot is set to the index of the key within pLeaf that the new key
      ** should be inserted to the left of (or to a value 1 greater than the
      ** index of the rightmost key if the new key is larger than all keys
      ** currently stored in the node).
      */
      int iSlot = csr.aiCell[csr.iNode] + (res<0);
      if( csr.iNode==0 ){
        rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot);
      }else{
        rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot);
      }
    }
  }

#if 0
  dump_tree_contents(pDb, "after");
#endif
 insert_entry_out:
  tblobFree(pDb, &csr.blob);
  assert_tree_looks_ok(rc, pTree);
  return rc;
}

/*
** Insert a new entry into the in-memory tree.
**
** If the value of the 5th parameter, nVal, is negative, then a delete-marker
** is inserted into the tree. In this case the value pointer, pVal, must be
** NULL.
*/
int lsmTreeInsert(
  lsm_db *pDb,                    /* Database handle */
  void *pKey,                     /* Pointer to key data */
  int nKey,                       /* Size of key data in bytes */
  void *pVal,                     /* Pointer to value data (or NULL) */
  int nVal                        /* Bytes in value data (or -ve for delete) */
){
  int flags;
  if( nVal<0 ){
    flags = LSM_POINT_DELETE;
  }else{
    flags = LSM_INSERT;
  }

  return treeInsertEntry(pDb, flags, pKey, nKey, pVal, nVal);
}

static int treeDeleteEntry(lsm_db *db, TreeCursor *pCsr, u32 iNewptr){
  TreeRoot *p = &db->treehdr.root;
  TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];
  int iSlot = pCsr->aiCell[pCsr->iNode];
  int bLeaf;
  int rc = LSM_OK;

  assert( pNode->aiKeyPtr[1] );
  assert( pNode->aiKeyPtr[iSlot] );
  assert( iSlot==0 || iSlot==1 || iSlot==2 );
  assert( (pCsr->iNode==(db->treehdr.root.nHeight-1))==(iNewptr==0) );

  bLeaf = (pCsr->iNode==(p->nHeight-1) && p->nHeight>1);
  
  if( pNode->aiKeyPtr[0] || pNode->aiKeyPtr[2] ){
    /* There are currently at least 2 keys on this node. So just create
    ** a new copy of the node with one of the keys removed. If the node
    ** happens to be the root node of the tree, allocate an entire 
    ** TreeNode structure instead of just a TreeLeaf.  */
    TreeNode *pNew;
    u32 iNew;

    if( bLeaf ){
      pNew = (TreeNode *)newTreeLeaf(db, &iNew, &rc);
    }else{
      pNew = newTreeNode(db, &iNew, &rc);
    }
    if( pNew ){
      int i;
      int iOut = 1;
      for(i=0; i<4; i++){
        if( i==iSlot ){
          i++;
          if( bLeaf==0 ) pNew->aiChildPtr[iOut] = iNewptr;
          if( i<3 ) pNew->aiKeyPtr[iOut] = pNode->aiKeyPtr[i];
          iOut++;
        }else if( bLeaf || p->nHeight==1 ){
          if( i<3 && pNode->aiKeyPtr[i] ){
            pNew->aiKeyPtr[iOut++] = pNode->aiKeyPtr[i];
          }
        }else{
          if( getChildPtr(pNode, WORKING_VERSION, i) ){
            pNew->aiChildPtr[iOut] = getChildPtr(pNode, WORKING_VERSION, i);
            if( i<3 ) pNew->aiKeyPtr[iOut] = pNode->aiKeyPtr[i];
            iOut++;
          }
        }
      }
      assert( iOut<=4 );
      assert( bLeaf || pNew->aiChildPtr[0]==0 );
      pCsr->iNode--;
      rc = treeUpdatePtr(db, pCsr, iNew);
    }

  }else if( pCsr->iNode==0 ){
    /* Removing the only key in the root node. iNewptr is the new root. */
    assert( iSlot==1 );
    db->treehdr.root.iRoot = iNewptr;
    db->treehdr.root.nHeight--;

  }else{
    /* There is only one key on this node and the node is not the root
    ** node. Find a peer for this node. Then redistribute the contents of
    ** the peer and the parent cell between the parent and either one or
    ** two new nodes.  */
    TreeNode *pParent;            /* Parent tree node */
    int iPSlot;
    u32 iPeer;                    /* Pointer to peer leaf node */
    int iDir;
    TreeNode *pPeer;              /* The peer leaf node */
    TreeNode *pNew1; u32 iNew1;   /* First new leaf node */

    assert( iSlot==1 );

    pParent = pCsr->apTreeNode[pCsr->iNode-1];
    iPSlot = pCsr->aiCell[pCsr->iNode-1];

    if( iPSlot>0 && getChildPtr(pParent, WORKING_VERSION, iPSlot-1) ){
      iDir = -1;
    }else{
      iDir = +1;
    }
    iPeer = getChildPtr(pParent, WORKING_VERSION, iPSlot+iDir);
    pPeer = (TreeNode *)treeShmptr(db, iPeer);
    assertIsWorkingChild(db, pNode, pParent, iPSlot);

    /* Allocate the first new leaf node. This is always required. */
    if( bLeaf ){
      pNew1 = (TreeNode *)newTreeLeaf(db, &iNew1, &rc);
    }else{
      pNew1 = (TreeNode *)newTreeNode(db, &iNew1, &rc);
    }

    if( pPeer->aiKeyPtr[0] && pPeer->aiKeyPtr[2] ){
      /* Peer node is completely full. This means that two new leaf nodes
      ** and a new parent node are required. */

      TreeNode *pNew2; u32 iNew2; /* Second new leaf node */
      TreeNode *pNewP; u32 iNewP; /* New parent node */

      if( bLeaf ){
        pNew2 = (TreeNode *)newTreeLeaf(db, &iNew2, &rc);
      }else{
        pNew2 = (TreeNode *)newTreeNode(db, &iNew2, &rc);
      }
      pNewP = copyTreeNode(db, pParent, &iNewP, &rc);

      if( iDir==-1 ){
        pNew1->aiKeyPtr[1] = pPeer->aiKeyPtr[0];
        if( bLeaf==0 ){
          pNew1->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 0);
          pNew1->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 1);
        }

        pNewP->aiChildPtr[iPSlot-1] = iNew1;
        pNewP->aiKeyPtr[iPSlot-1] = pPeer->aiKeyPtr[1];
        pNewP->aiChildPtr[iPSlot] = iNew2;

        pNew2->aiKeyPtr[0] = pPeer->aiKeyPtr[2];
        pNew2->aiKeyPtr[1] = pParent->aiKeyPtr[iPSlot-1];
        if( bLeaf==0 ){
          pNew2->aiChildPtr[0] = getChildPtr(pPeer, WORKING_VERSION, 2);
          pNew2->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 3);
          pNew2->aiChildPtr[2] = iNewptr;
        }
      }else{
        pNew1->aiKeyPtr[1] = pParent->aiKeyPtr[iPSlot];
        if( bLeaf==0 ){
          pNew1->aiChildPtr[1] = iNewptr;
          pNew1->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 0);
        }

        pNewP->aiChildPtr[iPSlot] = iNew1;
        pNewP->aiKeyPtr[iPSlot] = pPeer->aiKeyPtr[0];
        pNewP->aiChildPtr[iPSlot+1] = iNew2;

        pNew2->aiKeyPtr[0] = pPeer->aiKeyPtr[1];
        pNew2->aiKeyPtr[1] = pPeer->aiKeyPtr[2];
        if( bLeaf==0 ){
          pNew2->aiChildPtr[0] = getChildPtr(pPeer, WORKING_VERSION, 1);
          pNew2->aiChildPtr[1] = getChildPtr(pPeer, WORKING_VERSION, 2);
          pNew2->aiChildPtr[2] = getChildPtr(pPeer, WORKING_VERSION, 3);
        }
      }
      assert( pCsr->iNode>=1 );
      pCsr->iNode -= 2;
      if( rc==LSM_OK ){
        assert( pNew1->aiKeyPtr[1] && pNew2->aiKeyPtr[1] );
        rc = treeUpdatePtr(db, pCsr, iNewP);
      }
    }else{
      int iKOut = 0;
      int iPOut = 0;
      int i;

      pCsr->iNode--;

      if( iDir==1 ){
        pNew1->aiKeyPtr[iKOut++] = pParent->aiKeyPtr[iPSlot];
        if( bLeaf==0 ) pNew1->aiChildPtr[iPOut++] = iNewptr;
      }
      for(i=0; i<3; i++){
        if( pPeer->aiKeyPtr[i] ){
          pNew1->aiKeyPtr[iKOut++] = pPeer->aiKeyPtr[i];
        }
      }
      if( bLeaf==0 ){
        for(i=0; i<4; i++){
          if( getChildPtr(pPeer, WORKING_VERSION, i) ){
            pNew1->aiChildPtr[iPOut++] = getChildPtr(pPeer, WORKING_VERSION, i);
          }
        }
      }
      if( iDir==-1 ){
        iPSlot--;
        pNew1->aiKeyPtr[iKOut++] = pParent->aiKeyPtr[iPSlot];
        if( bLeaf==0 ) pNew1->aiChildPtr[iPOut++] = iNewptr;
        pCsr->aiCell[pCsr->iNode] = iPSlot;
      }

      rc = treeDeleteEntry(db, pCsr, iNew1);
    }
  }

  return rc;
}

/*
** Delete a range of keys from the tree structure (i.e. the lsm_delete_range()
** function, not lsm_delete()).
**
** This is a two step process: 
**
**     1) Remove all entries currently stored in the tree that have keys
**        that fall into the deleted range.
**
**        TODO: There are surely good ways to optimize this step - removing 
**        a range of keys from a b-tree. But for now, this function removes
**        them one at a time using the usual approach.
**
**     2) Unless the largest key smaller than or equal to (pKey1/nKey1) is
**        already marked as START_DELETE, insert a START_DELETE key. 
**        Similarly, unless the smallest key greater than or equal to
**        (pKey2/nKey2) is already START_END, insert a START_END key.
*/
int lsmTreeDelete(
  lsm_db *db,
  void *pKey1, int nKey1,         /* Start of range */
  void *pKey2, int nKey2          /* End of range */
){
  int rc = LSM_OK;
  int bDone = 0;
  TreeRoot *p = &db->treehdr.root;
  TreeBlob blob = {0, 0};

  /* The range must be sensible - that (key1 < key2). */
  assert( treeKeycmp(pKey1, nKey1, pKey2, nKey2)<0 );
  assert( assert_delete_ranges_match(db) );

#if 0
  static int nCall = 0;
  printf("\n");
  nCall++;
  printf("%d delete %s .. %s\n", nCall, (char *)pKey1, (char *)pKey2);
  dump_tree_contents(db, "before delete");
#endif

  /* Step 1. This loop runs until the tree contains no keys within the
  ** range being deleted. Or until an error occurs. */
  while( bDone==0 && rc==LSM_OK ){
    int res;
    TreeCursor csr;               /* Cursor to seek to first key in range */
    void *pDel; int nDel;         /* Key to (possibly) delete this iteration */
#ifndef NDEBUG
    int nEntry = treeCountEntries(db);
#endif

    /* Seek the cursor to the first entry in the tree greater than pKey1. */
    treeCursorInit(db, 0, &csr);
    lsmTreeCursorSeek(&csr, pKey1, nKey1, &res);
    if( res<=0 && lsmTreeCursorValid(&csr) ) lsmTreeCursorNext(&csr);

    /* If there is no such entry, or if it is greater than pKey2, then the
    ** tree now contains no keys in the range being deleted. In this case
    ** break out of the loop.  */
    bDone = 1;
    if( lsmTreeCursorValid(&csr) ){
      lsmTreeCursorKey(&csr, 0, &pDel, &nDel);
      if( treeKeycmp(pDel, nDel, pKey2, nKey2)<0 ) bDone = 0;
    }

    if( bDone==0 ){
      if( csr.iNode==(p->nHeight-1) ){
        /* The element to delete already lies on a leaf node */
        rc = treeDeleteEntry(db, &csr, 0);
      }else{
        /* 1. Overwrite the current key with a copy of the next key in the 
        **    tree (key N).
        **
        ** 2. Seek to key N (cursor will stop at the internal node copy of
        **    N). Move to the next key (original copy of N). Delete
        **    this entry. 
        */
        u32 iKey;
        TreeKey *pKey;
        int iNode = csr.iNode;
        lsmTreeCursorNext(&csr);
        assert( csr.iNode==(p->nHeight-1) );

        iKey = csr.apTreeNode[csr.iNode]->aiKeyPtr[csr.aiCell[csr.iNode]];
        lsmTreeCursorPrev(&csr);

        treeOverwriteKey(db, &csr, iKey, &rc);
        pKey = treeShmkey(db, iKey, TKV_LOADKEY, &blob, &rc);
        if( pKey ){
          rc = lsmTreeCursorSeek(&csr, TKV_KEY(pKey), pKey->nKey, &res);
        }
        if( rc==LSM_OK ){
          assert( res==0 && csr.iNode==iNode );
          rc = lsmTreeCursorNext(&csr);
          if( rc==LSM_OK ){
            rc = treeDeleteEntry(db, &csr, 0);
          }
        }
      }
    }

    /* Clean up any memory allocated by the cursor. */
    tblobFree(db, &csr.blob);
#if 0
    dump_tree_contents(db, "ddd delete");
#endif
    assert( bDone || treeCountEntries(db)==(nEntry-1) );
  }

#if 0
  dump_tree_contents(db, "during delete");
#endif

  /* Now insert the START_DELETE and END_DELETE keys. */
  if( rc==LSM_OK ){
    rc = treeInsertEntry(db, LSM_START_DELETE, pKey1, nKey1, 0, -1);
  }
#if 0
  dump_tree_contents(db, "during delete 2");
#endif
  if( rc==LSM_OK ){
    rc = treeInsertEntry(db, LSM_END_DELETE, pKey2, nKey2, 0, -1);
  }

#if 0
  dump_tree_contents(db, "after delete");
#endif

  tblobFree(db, &blob);
  assert( assert_delete_ranges_match(db) );
  return rc;
}

/*
** Return, in bytes, the amount of memory currently used by the tree 
** structure.
*/
int lsmTreeSize(lsm_db *pDb){
  return pDb->treehdr.root.nByte;
}

/*
** Open a cursor on the in-memory tree pTree.
*/
int lsmTreeCursorNew(lsm_db *pDb, int bOld, TreeCursor **ppCsr){
  TreeCursor *pCsr;
  *ppCsr = pCsr = lsmMalloc(pDb->pEnv, sizeof(TreeCursor));
  if( pCsr ){
    treeCursorInit(pDb, bOld, pCsr);
    return LSM_OK;
  }
  return LSM_NOMEM_BKPT;
}

/*
** Close an in-memory tree cursor.
*/
void lsmTreeCursorDestroy(TreeCursor *pCsr){
  if( pCsr ){
    tblobFree(pCsr->pDb, &pCsr->blob);
    lsmFree(pCsr->pDb->pEnv, pCsr);
  }
}

void lsmTreeCursorReset(TreeCursor *pCsr){
  if( pCsr ){
    pCsr->iNode = -1;
    pCsr->pSave = 0;
  }
}

#ifndef NDEBUG
static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){
  TreeKey *p;
  int cmp = 0;
  int rc = LSM_OK;
  assert( pCsr->iNode>=0 );
  p = csrGetKey(pCsr, &pCsr->blob, &rc);
  if( p ){
    cmp = treeKeycmp(TKV_KEY(p), p->nKey, pKey, nKey);
  }
  return cmp;
}
#endif


/*
** Attempt to seek the cursor passed as the first argument to key (pKey/nKey)
** in the tree structure. If an exact match for the key is found, leave the
** cursor pointing to it and set *pRes to zero before returning. If an
** exact match cannot be found, do one of the following:
**
**   * Leave the cursor pointing to the smallest element in the tree that 
**     is larger than the key and set *pRes to +1, or
**
**   * Leave the cursor pointing to the largest element in the tree that 
**     is smaller than the key and set *pRes to -1, or
**
**   * If the tree is empty, leave the cursor at EOF and set *pRes to -1.
*/
int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){
  int rc = LSM_OK;                /* Return code */
  lsm_db *pDb = pCsr->pDb;
  TreeRoot *pRoot = pCsr->pRoot;
  u32 iNodePtr;                   /* Location of current node in search */

  /* Discard any saved position data */
  treeCursorRestore(pCsr, 0);

  iNodePtr = pRoot->iRoot;
  if( iNodePtr==0 ){
    /* Either an error occurred or the tree is completely empty. */
    assert( rc!=LSM_OK || pRoot->iRoot==0 );
    *pRes = -1;
    pCsr->iNode = -1;
  }else{
    TreeBlob b = {0, 0};
    int res = 0;                  /* Result of comparison function */
    int iNode = -1;
    while( iNodePtr ){
      TreeNode *pNode;            /* Node at location iNodePtr */
      int iTest;                  /* Index of second key to test (0 or 2) */
      u32 iTreeKey;
      TreeKey *pTreeKey;          /* Key to compare against */

      pNode = (TreeNode *)treeShmptrUnsafe(pDb, iNodePtr);
      iNode++;
      pCsr->apTreeNode[iNode] = pNode;

      /* Compare (pKey/nKey) with the key in the middle slot of B-tree node
      ** pNode. The middle slot is never empty. If the comparison is a match,
      ** then the search is finished. Break out of the loop. */
      pTreeKey = (TreeKey*)treeShmptrUnsafe(pDb, pNode->aiKeyPtr[1]);
      if( !(pTreeKey->flags & LSM_CONTIGUOUS) ){
        pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TKV_LOADKEY, &b, &rc);
        if( rc!=LSM_OK ) break;
      }
      res = treeKeycmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
      if( res==0 ){
        pCsr->aiCell[iNode] = 1;
        break;
      }

      /* Based on the results of the previous comparison, compare (pKey/nKey)
      ** to either the left or right key of the B-tree node, if such a key
      ** exists. */
      iTest = (res>0 ? 0 : 2);
      iTreeKey = pNode->aiKeyPtr[iTest];
      if( iTreeKey ){
        pTreeKey = (TreeKey*)treeShmptrUnsafe(pDb, iTreeKey);
        if( !(pTreeKey->flags & LSM_CONTIGUOUS) ){
          pTreeKey = treeShmkey(pDb, iTreeKey, TKV_LOADKEY, &b, &rc);
          if( rc ) break;
        }
        res = treeKeycmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
        if( res==0 ){
          pCsr->aiCell[iNode] = iTest;
          break;
        }
      }else{
        iTest = 1;
      }

      if( iNode<(pRoot->nHeight-1) ){
        iNodePtr = getChildPtr(pNode, pRoot->iTransId, iTest + (res<0));
      }else{
        iNodePtr = 0;
      }
      pCsr->aiCell[iNode] = iTest + (iNodePtr && (res<0));
    }

    *pRes = res;
    pCsr->iNode = iNode;
    tblobFree(pDb, &b);
  }

  /* assert() that *pRes has been set properly */
#ifndef NDEBUG
  if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){
    int cmp = treeCsrCompare(pCsr, pKey, nKey);
    assert( *pRes==cmp || (*pRes ^ cmp)>0 );
  }
#endif

  return rc;
}

int lsmTreeCursorNext(TreeCursor *pCsr){
#ifndef NDEBUG
  TreeKey *pK1;
  TreeBlob key1 = {0, 0};
#endif
  lsm_db *pDb = pCsr->pDb;
  TreeRoot *pRoot = pCsr->pRoot;
  const int iLeaf = pRoot->nHeight-1;
  int iCell; 
  int rc = LSM_OK; 
  TreeNode *pNode; 

  /* Restore the cursor position, if required */
  int iRestore = 0;
  treeCursorRestore(pCsr, &iRestore);
  if( iRestore>0 ) return LSM_OK;

  /* Save a pointer to the current key. This is used in an assert() at the
  ** end of this function - to check that the 'next' key really is larger
  ** than the current key. */
#ifndef NDEBUG
  pK1 = csrGetKey(pCsr, &key1, &rc);
  if( rc!=LSM_OK ) return rc;
#endif

  assert( lsmTreeCursorValid(pCsr) );
  assert( pCsr->aiCell[pCsr->iNode]<3 );

  pNode = pCsr->apTreeNode[pCsr->iNode];
  iCell = ++pCsr->aiCell[pCsr->iNode];

  /* If the current node is not a leaf, and the current cell has sub-tree
  ** associated with it, descend to the left-most key on the left-most
  ** leaf of the sub-tree.  */
  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pRoot->iTransId, iCell) ){
    do {
      u32 iNodePtr;
      pCsr->iNode++;
      iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell);
      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr);
      pCsr->apTreeNode[pCsr->iNode] = pNode;
      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0);
    }while( pCsr->iNode < iLeaf );
  }

  /* Otherwise, the next key is found by following pointer up the tree 
  ** until there is a key immediately to the right of the pointer followed 
  ** to reach the sub-tree containing the current key. */
  else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){
    while( (--pCsr->iNode)>=0 ){
      iCell = pCsr->aiCell[pCsr->iNode];
      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
    }
  }

#ifndef NDEBUG
  if( pCsr->iNode>=0 ){
    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
    assert( rc||treeKeycmp(TKV_KEY(pK2),pK2->nKey,TKV_KEY(pK1),pK1->nKey)>=0 );
  }
  tblobFree(pDb, &key1);
#endif

  return rc;
}

int lsmTreeCursorPrev(TreeCursor *pCsr){
#ifndef NDEBUG
  TreeKey *pK1;
  TreeBlob key1 = {0, 0};
#endif
  lsm_db *pDb = pCsr->pDb;
  TreeRoot *pRoot = pCsr->pRoot;
  const int iLeaf = pRoot->nHeight-1;
  int iCell; 
  int rc = LSM_OK; 
  TreeNode *pNode; 

  /* Restore the cursor position, if required */
  int iRestore = 0;
  treeCursorRestore(pCsr, &iRestore);
  if( iRestore<0 ) return LSM_OK;

  /* Save a pointer to the current key. This is used in an assert() at the
  ** end of this function - to check that the 'next' key really is smaller
  ** than the current key. */
#ifndef NDEBUG
  pK1 = csrGetKey(pCsr, &key1, &rc);
  if( rc!=LSM_OK ) return rc;
#endif

  assert( lsmTreeCursorValid(pCsr) );
  pNode = pCsr->apTreeNode[pCsr->iNode];
  iCell = pCsr->aiCell[pCsr->iNode];
  assert( iCell>=0 && iCell<3 );

  /* If the current node is not a leaf, and the current cell has sub-tree
  ** associated with it, descend to the right-most key on the right-most
  ** leaf of the sub-tree.  */
  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pRoot->iTransId, iCell) ){
    do {
      u32 iNodePtr;
      pCsr->iNode++;
      iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell);
      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr);
      if( rc!=LSM_OK ) break;
      pCsr->apTreeNode[pCsr->iNode] = pNode;
      iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf);
      pCsr->aiCell[pCsr->iNode] = iCell;
    }while( pCsr->iNode < iLeaf );
  }

  /* Otherwise, the next key is found by following pointer up the tree until
  ** there is a key immediately to the left of the pointer followed to reach
  ** the sub-tree containing the current key. */
  else{
    do {
      iCell = pCsr->aiCell[pCsr->iNode]-1;
      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
    }while( (--pCsr->iNode)>=0 );
    pCsr->aiCell[pCsr->iNode] = iCell;
  }

#ifndef NDEBUG
  if( pCsr->iNode>=0 ){
    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
    assert( rc || treeKeycmp(TKV_KEY(pK2),pK2->nKey,TKV_KEY(pK1),pK1->nKey)<0 );
  }
  tblobFree(pDb, &key1);
#endif

  return rc;
}

/*
** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the
** in-memory tree.
*/
int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){
  lsm_db *pDb = pCsr->pDb;
  TreeRoot *pRoot = pCsr->pRoot;
  int rc = LSM_OK;

  u32 iNodePtr;
  pCsr->iNode = -1;

  /* Discard any saved position data */
  treeCursorRestore(pCsr, 0);

  iNodePtr = pRoot->iRoot;
  while( iNodePtr ){
    int iCell;
    TreeNode *pNode;

    pNode = (TreeNode *)treeShmptr(pDb, iNodePtr);
    if( rc ) break;

    if( bLast ){
      iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3);
    }else{
      iCell = ((pNode->aiKeyPtr[0]==0) ? 1 : 0);
    }
    pCsr->iNode++;
    pCsr->apTreeNode[pCsr->iNode] = pNode;

    if( pCsr->iNode<pRoot->nHeight-1 ){
      iNodePtr = getChildPtr(pNode, pRoot->iTransId, iCell);
    }else{
      iNodePtr = 0;
    }
    pCsr->aiCell[pCsr->iNode] = iCell - (iNodePtr==0 && bLast);
  }

  return rc;
}

int lsmTreeCursorFlags(TreeCursor *pCsr){
  int flags = 0;
  if( pCsr && pCsr->iNode>=0 ){
    int rc = LSM_OK;
    TreeKey *pKey = (TreeKey *)treeShmptrUnsafe(pCsr->pDb,
        pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]]
    );
    assert( rc==LSM_OK );
    flags = (pKey->flags & ~LSM_CONTIGUOUS);
  }
  return flags;
}

int lsmTreeCursorKey(TreeCursor *pCsr, int *pFlags, void **ppKey, int *pnKey){
  TreeKey *pTreeKey;
  int rc = LSM_OK;

  assert( lsmTreeCursorValid(pCsr) );

  pTreeKey = pCsr->pSave;
  if( !pTreeKey ){
    pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
  }
  if( rc==LSM_OK ){
    *pnKey = pTreeKey->nKey;
    if( pFlags ) *pFlags = pTreeKey->flags;
    *ppKey = (void *)&pTreeKey[1];
  }

  return rc;
}

int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){
  int res = 0;
  int rc;

  rc = treeCursorRestore(pCsr, &res);
  if( res==0 ){
    TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
    if( rc==LSM_OK ){
      if( pTreeKey->flags & LSM_INSERT ){
        *pnVal = pTreeKey->nValue;
        *ppVal = TKV_VAL(pTreeKey);
      }else{
        *ppVal = 0;
        *pnVal = -1;
      }
    }
  }else{
    *ppVal = 0;
    *pnVal = 0;
  }

  return rc;
}

/*
** Return true if the cursor currently points to a valid entry. 
*/
int lsmTreeCursorValid(TreeCursor *pCsr){
  return (pCsr && (pCsr->pSave || pCsr->iNode>=0));
}

/*
** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a
** pointer to the same TreeMark structure may be used to roll the tree
** contents back to their current state.
*/
void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){
  pMark->iRoot = pDb->treehdr.root.iRoot;
  pMark->nHeight = pDb->treehdr.root.nHeight;
  pMark->iWrite = pDb->treehdr.iWrite;
  pMark->nChunk = pDb->treehdr.nChunk;
  pMark->iNextShmid = pDb->treehdr.iNextShmid;
  pMark->iRollback = intArraySize(&pDb->rollback);
}

/*
** Roll back to mark pMark. Structure *pMark should have been previously
** populated by a call to lsmTreeMark().
*/
void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
  int iIdx;
  int nIdx;
  u32 iNext;
  ShmChunk *pChunk;
  u32 iChunk;
  u32 iShmid;

  /* Revert all required v2 pointers. */
  nIdx = intArraySize(&pDb->rollback);
  for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){
    TreeNode *pNode;
    pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx));
    assert( pNode );
    pNode->iV2 = 0;
    pNode->iV2Child = 0;
    pNode->iV2Ptr = 0;
  }
  intArrayTruncate(&pDb->rollback, pMark->iRollback);

  /* Restore the free-chunk list. */
  assert( pMark->iWrite!=0 );
  iChunk = treeOffsetToChunk(pMark->iWrite-1);
  pChunk = treeShmChunk(pDb, iChunk);
  iNext = pChunk->iNext;
  pChunk->iNext = 0;

  pChunk = treeShmChunk(pDb, pDb->treehdr.iFirst);
  iShmid = pChunk->iShmid-1;

  while( iNext ){
    u32 iFree = iNext;            /* Current chunk being rollback-freed */
    ShmChunk *pFree;              /* Pointer to chunk iFree */

    pFree = treeShmChunk(pDb, iFree);
    iNext = pFree->iNext;

    if( iFree<pMark->nChunk ){
      pFree->iNext = pDb->treehdr.iFirst;
      pFree->iShmid = iShmid--;
      pDb->treehdr.iFirst = iFree;
    }
  }

  /* Restore the tree-header fields */
  pDb->treehdr.root.iRoot = pMark->iRoot;
  pDb->treehdr.root.nHeight = pMark->nHeight;
  pDb->treehdr.iWrite = pMark->iWrite;
  pDb->treehdr.nChunk = pMark->nChunk;
  pDb->treehdr.iNextShmid = pMark->iNextShmid;
}

/*
** Load the in-memory tree header from shared-memory into pDb->treehdr.
** If the header cannot be loaded, return LSM_PROTOCOL.
**
** If the header is successfully loaded and parameter piRead is not NULL,
** is is set to 1 if the header was loaded from ShmHeader.hdr1, or 2 if
** the header was loaded from ShmHeader.hdr2.
*/
int lsmTreeLoadHeader(lsm_db *pDb, int *piRead){
  int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL;
  while( (nRem--)>0 ){
    ShmHeader *pShm = pDb->pShmhdr;

    memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
    if( treeHeaderChecksumOk(&pDb->treehdr) ){
      if( piRead ) *piRead = 1;
      return LSM_OK;
    }
    memcpy(&pDb->treehdr, &pShm->hdr2, sizeof(TreeHeader));
    if( treeHeaderChecksumOk(&pDb->treehdr) ){
      if( piRead ) *piRead = 2;
      return LSM_OK;
    }

    lsmShmBarrier(pDb);
  }
  return LSM_PROTOCOL_BKPT;
}

int lsmTreeLoadHeaderOk(lsm_db *pDb, int iRead){
  TreeHeader *p = (iRead==1) ? &pDb->pShmhdr->hdr1 : &pDb->pShmhdr->hdr2;
  assert( iRead==1 || iRead==2 );
  return (0==memcmp(pDb->treehdr.aCksum, p->aCksum, sizeof(u32)*2));
}

/*
** This function is called to conclude a transaction. If argument bCommit
** is true, the transaction is committed. Otherwise it is rolled back.
*/
int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){
  ShmHeader *pShm = pDb->pShmhdr;

  treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum);
  memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader));
  lsmShmBarrier(pDb);
  memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader));
  pShm->bWriter = 0;
  intArrayFree(pDb->pEnv, &pDb->rollback);

  return LSM_OK;
}

#ifndef NDEBUG
static int assert_delete_ranges_match(lsm_db *db){
  int prev = 0;
  TreeBlob blob = {0, 0};
  TreeCursor csr;               /* Cursor used to iterate through tree */
  int rc;

  treeCursorInit(db, 0, &csr);
  for( rc = lsmTreeCursorEnd(&csr, 0);
       rc==LSM_OK && lsmTreeCursorValid(&csr);
       rc = lsmTreeCursorNext(&csr)
  ){
    TreeKey *pKey = csrGetKey(&csr, &blob, &rc);
    if( rc!=LSM_OK ) break;
    assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) );
    prev = pKey->flags;
  }

  tblobFree(csr.pDb, &csr.blob);
  tblobFree(csr.pDb, &blob);

  return 1;
}

static int treeCountEntries(lsm_db *db){
  TreeCursor csr;               /* Cursor used to iterate through tree */
  int rc;
  int nEntry = 0;

  treeCursorInit(db, 0, &csr);
  for( rc = lsmTreeCursorEnd(&csr, 0);
       rc==LSM_OK && lsmTreeCursorValid(&csr);
       rc = lsmTreeCursorNext(&csr)
  ){
    nEntry++;
  }

  tblobFree(csr.pDb, &csr.blob);

  return nEntry;
}
#endif