Index: lsm-test/lsmtest1.c
==================================================================
--- lsm-test/lsmtest1.c
+++ lsm-test/lsmtest1.c
@@ -58,13 +58,26 @@
 
 static int testControlDb(TestDb **ppDb){
 #ifdef HAVE_KYOTOCABINET
   return tdb_open("kyotocabinet", "tmp.db", 1, ppDb);
 #else
-  return tdb_open("sqlite3", "tmp.db", 1, ppDb);
+  return tdb_open("sqlite3", ":memory:", 1, ppDb);
 #endif
 }
+
+void testDatasourceFetch(
+  TestDb *pDb,                    /* Database handle */
+  Datasource *pData,              /* Data source the entry is read from */
+  int iKey,                       /* Entry selector for testDatasourceEntry() */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  void *pKey; int nKey;           /* Database key to query for */
+  void *pVal; int nVal;           /* Expected result of query */
+
+  testDatasourceEntry(pData, iKey, &pKey, &nKey, &pVal, &nVal);
+  testFetch(pDb, pKey, nKey, pVal, nVal, pRc);  /* query pDb for pKey */
+}
 
 /*
 ** This function is called to test that the contents of database pDb
 ** are as expected. In this case, expected is defined as containing
 ** key-value pairs iFirst through iLast, inclusive, from data source 

Index: lsm-test/lsmtest5.c
==================================================================
--- lsm-test/lsmtest5.c
+++ lsm-test/lsmtest5.c
@@ -524,11 +524,13 @@
   /* Open a new database connection. Initialize the pseudo-random number
   ** argument based on the thread number.  */
   iPrng = testPrngValue(iThread);
   pDb = testOpen(p->zSystem, 0, &rc);
 
-  tdb_lsm_config_work_hook(pDb, xMt1Work, 0);
+  if( rc==0 ){
+    tdb_lsm_config_work_hook(pDb, xMt1Work, 0);
+  }
 
   /* Loop until either an error occurs or some other thread sets the
   ** halt flag.  */
   while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){
     int iKey;

Index: lsm-test/lsmtest_main.c
==================================================================
--- lsm-test/lsmtest_main.c
+++ lsm-test/lsmtest_main.c
@@ -171,11 +171,11 @@
     res = nKey1 - nKey2;
   }
   return res;
 }
 
-static int test_scan_debug = 0;
+int test_scan_debug = 0;
 
 static void scanCompareCb(
   void *pCtx, 
   void *pKey, int nKey,
   void *pVal, int nVal
@@ -183,11 +183,14 @@
   ScanResult *p = (ScanResult *)pCtx;
   u8 *aKey = (u8 *)pKey;
   u8 *aVal = (u8 *)pVal;
   int i;
 
-  if( test_scan_debug ) printf("%.20s\n", (char *)pKey);
+  if( test_scan_debug ) printf("%.*s\n", nKey, (char *)pKey);
+#if 0
+  if( test_scan_debug ) printf("%.20s\n", (char *)pVal);
+#endif
 
 #if 0
   /* Check tdb_fetch() matches */
   int rc = 0;
   testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc);
@@ -458,11 +461,11 @@
 
 static lsm_db *configure_lsm_db(TestDb *pDb){
   lsm_db *pLsm;
   pLsm = tdb_lsm(pDb);
   if( pLsm ){
-    tdb_lsm_config_str(pDb, "mmap=0 autowork=1 nmerge=4 worker_nmerge=4");
+    tdb_lsm_config_str(pDb, "mmap=1 autowork=1 nmerge=4 worker_nmerge=4");
   }
   return pLsm;
 }
 
 

Index: lsm-test/lsmtest_tdb3.c
==================================================================
--- lsm-test/lsmtest_tdb3.c
+++ lsm-test/lsmtest_tdb3.c
@@ -310,10 +310,31 @@
 static int testEnvUnlink(lsm_env *pEnv, const char *zFile){
   lsm_env *pRealEnv = tdb_lsm_env();
   unused_parameter(pEnv);
   return pRealEnv->xUnlink(pRealEnv, zFile);
 }
+
+static int testEnvLock(lsm_file *pFile, int iLock, int eType){
+  LsmFile *p = (LsmFile *)pFile;  /* wrapper; p->pReal is passed to real env */
+  lsm_env *pRealEnv = tdb_lsm_env();
+  return pRealEnv->xLock(p->pReal, iLock, eType);  /* forward to real xLock */
+}
+
+static int testEnvShmMap(lsm_file *pFile, int iRegion, int sz, void **pp){
+  LsmFile *p = (LsmFile *)pFile;  /* wrapper; p->pReal is passed to real env */
+  lsm_env *pRealEnv = tdb_lsm_env();
+  return pRealEnv->xShmMap(p->pReal, iRegion, sz, pp);  /* forward unchanged */
+}
+
+static void testEnvShmBarrier(void){  /* no-op: real xShmBarrier not called */
+}
+
+static int testEnvShmUnmap(lsm_file *pFile, int bDel){
+  LsmFile *p = (LsmFile *)pFile;  /* wrapper; p->pReal is passed to real env */
+  lsm_env *pRealEnv = tdb_lsm_env();
+  return pRealEnv->xShmUnmap(p->pReal, bDel);  /* forward to real xShmUnmap */
+}
 
 static void doSystemCrash(LsmDb *pDb){
   lsm_env *pEnv = tdb_lsm_env();
   int iFile;
   int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector;
@@ -574,10 +595,12 @@
     { "autowork",       0, LSM_CONFIG_AUTOWORK },
     { "log_size",       0, LSM_CONFIG_LOG_SIZE },
     { "mmap",           0, LSM_CONFIG_MMAP },
     { "use_log",        0, LSM_CONFIG_USE_LOG },
     { "nmerge",         0, LSM_CONFIG_NMERGE },
+    { "max_freelist",   0, LSM_CONFIG_MAX_FREELIST },
+    { "multi_proc",     0, LSM_CONFIG_MULTIPLE_PROCESSES },
     { "worker_nmerge",  1, LSM_CONFIG_NMERGE },
     { 0, 0 }
   };
   const char *z = zStr;
 
@@ -693,10 +716,14 @@
   pDb->env.xSectorSize = testEnvSectorSize;
   pDb->env.xRemap = testEnvRemap;
   pDb->env.xFileid = testEnvFileid;
   pDb->env.xClose = testEnvClose;
   pDb->env.xUnlink = testEnvUnlink;
+  pDb->env.xLock = testEnvLock;
+  pDb->env.xShmBarrier = testEnvShmBarrier;
+  pDb->env.xShmMap = testEnvShmMap;
+  pDb->env.xShmUnmap = testEnvShmUnmap;
 
   rc = lsm_new(&pDb->env, &pDb->db);
   if( rc==LSM_OK ){
     lsm_config_log(pDb->db, xLog, 0);
     lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb);
@@ -728,11 +755,12 @@
 int test_lsm_lomem_open(
   const char *zFilename, 
   int bClear, 
   TestDb **ppDb
 ){
-  const char *zCfg = "page_size=256 block_size=65536 write_buffer=16384";
+  const char *zCfg = 
+    "page_size=256 block_size=65536 write_buffer=16384 max_freelist=4";
   return testLsmOpen(zCfg, zFilename, bClear, ppDb);
 }
 
 lsm_db *tdb_lsm(TestDb *pDb){
   if( pDb->pMethods->xClose==test_lsm_close ){

Index: src/build.c
==================================================================
--- src/build.c
+++ src/build.c
@@ -1396,27 +1396,30 @@
     pIndex->pTable = pTab;
     pIndex->nColumn = nCol;
     pIndex->onError = (u8)onError;
     pIndex->pSchema = pTab->pSchema;
 
-    if( db->init.busy ){
-      Hash *pIdxHash = &pIndex->pSchema->idxHash;
-      Index *p;
-
-      p = sqlite4HashInsert(pIdxHash, pIndex->zName, nName, pIndex);
-      if( p ){
-        assert( p==pIndex );
-        db->mallocFailed = 1;
-        sqlite4DbFree(db, pIndex);
-        pIndex = 0;
-      }
-    }
   }
 
   *pzExtra = zExtra;
   return pIndex;
 }
+
+static int addIndexToHash(sqlite4 *db, Index *pIdx){ /* SQLITE4_OK or _NOMEM */
+  if( pIdx && db->init.busy ){  /* NULL pIdx or !init.busy: nothing to add */
+    Hash *pIdxHash = &pIdx->pSchema->idxHash;
+    int nName = sqlite4Strlen30(pIdx->zName);
+    Index *p;
+    p = sqlite4HashInsert(pIdxHash, pIdx->zName, nName, pIdx);  /* key: zName */
+    if( p ){                    /* non-NULL result is handled as OOM */
+      assert( p==pIdx );
+      db->mallocFailed = 1;     /* caller keeps ownership of pIdx and frees it */
+      return SQLITE4_NOMEM;
+    }
+  }
+  return SQLITE4_OK;
+}
 
 
 /*
 ** Allocate and populate an Index structure representing an implicit 
 ** primary key. In implicit primary key behaves similarly to the built-in
@@ -1425,19 +1428,22 @@
 static void addImplicitPrimaryKey(
   Parse *pParse,                  /* Parse context */
   Table *pTab,                    /* Table to add implicit PRIMARY KEY to */
   int iDb
 ){
+  sqlite4 *db = pParse->db;
   Index *pIndex;                  /* New index */
   char *zExtra;
 
   assert( !pTab->pIndex || pTab->pIndex->eIndexType!=SQLITE4_INDEX_PRIMARYKEY );
   assert( sqlite4Strlen30("binary")==6 );
   pIndex = newIndex(pParse, pTab, pTab->zName, 1, OE_Abort, 1+6, &zExtra);
+  if( addIndexToHash(db, pIndex) ){
+    sqlite4DbFree(db, pIndex);
+    pIndex = 0;
+  }
   if( pIndex ){
-    sqlite4 *db = pParse->db;
-
     pIndex->aiColumn[0] = -1;
     pIndex->azColl[0] = zExtra;
     memcpy(zExtra, "binary", 7);
     pIndex->eIndexType = SQLITE4_INDEX_PRIMARYKEY;
     pIndex->pNext = pTab->pIndex;
@@ -2665,10 +2671,11 @@
   if( db->init.busy ){
     db->flags |= SQLITE4_InternChanges;
     if( pTblName!=0 || bPrimaryKey ){
       pIndex->tnum = db->init.newTnum;
     }
+    if( addIndexToHash(db, pIndex) ) goto exit_create_index;
   }
 
   /* If the db->init.busy is 0 then create the index on disk.  This
   ** involves writing the index into the master table and filling in the
   ** index with the current table contents.

Index: src/kvlsm.c
==================================================================
--- src/kvlsm.c
+++ src/kvlsm.c
@@ -440,16 +440,31 @@
 
   pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm));
   if( pNew==0 ){
     rc = SQLITE4_NOMEM;
   }else{
+    struct Config {
+      const char *zParam;
+      int eParam;
+    } aConfig[] = {
+      { "lsm_block_size", LSM_CONFIG_BLOCK_SIZE }
+    };
+
     memset(pNew, 0, sizeof(KVLsm));
     pNew->base.pStoreVfunc = &kvlsmMethods;
     pNew->base.pEnv = pEnv;
-
     rc = lsm_new(0, &pNew->pDb);
     if( rc==SQLITE4_OK ){
+      int i;
+      for(i=0; i<ArraySize(aConfig); i++){
+        const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam);
+        if( zVal ){
+          int nVal = sqlite4Atoi(zVal);
+          lsm_config(pNew->pDb, aConfig[i].eParam, &nVal);
+        }
+      }
+
       rc = lsm_open(pNew->pDb, zName);
     }
 
     if( rc!=SQLITE4_OK ){
       lsm_close(pNew->pDb);

Index: src/lsm.h
==================================================================
--- src/lsm.h
+++ src/lsm.h
@@ -32,10 +32,15 @@
 typedef long long int lsm_i64;              /* 64-bit signed integer type */
 
 /* Forward reference */
 typedef struct lsm_env lsm_env;             /* Runtime environment */
 
+/* Candidate values for the 3rd argument to lsm_env.xLock() */
+#define LSM_LOCK_UNLOCK 0
+#define LSM_LOCK_SHARED 1
+#define LSM_LOCK_EXCL   2
+
 /*
 ** Run-time environment used by LSM
 */
 struct lsm_env {
   int nByte;                 /* Size of this structure in bytes */
@@ -51,18 +56,20 @@
   int (*xSectorSize)(lsm_file *);
   int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*);
   int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf);
   int (*xClose)(lsm_file *);
   int (*xUnlink)(lsm_env*, const char *);
+  int (*xLock)(lsm_file*, int, int);
+  int (*xShmMap)(lsm_file*, int, int, void **);
+  void (*xShmBarrier)(void);
+  int (*xShmUnmap)(lsm_file*, int);
   /****** memory allocation ****************************************/
   void *pMemCtx;
   void *(*xMalloc)(lsm_env*, int);            /* malloc(3) function */
   void *(*xRealloc)(lsm_env*, void *, int);   /* realloc(3) function */
   void (*xFree)(lsm_env*, void *);            /* free(3) function */
-#if 1
   sqlite4_size_t (*xSize)(lsm_env*, void *);  /* xSize function */
-#endif
   /****** mutexes ****************************************************/
   void *pMutexCtx;
   int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */
   int (*xMutexNew)(lsm_env*, lsm_mutex**);       /* Get a new dynamic mutex */
   void (*xMutexDel)(lsm_mutex *);           /* Delete an allocated mutex */
@@ -165,20 +172,33 @@
 **     file normally. False otherwise.
 **
 **   LSM_CONFIG_NMERGE
 **     A read/write integer parameter. The minimum number of segments to
 **     merge together at a time. Default value 4.
+**
+**   LSM_CONFIG_MAX_FREELIST
+**     A read/write integer parameter. The maximum number of free-list 
+**     entries that are stored in a database checkpoint (the others are
+**     stored elsewhere in the database).
+**
+**     There is no reason for an application to configure or query this
+**     parameter. It is only present because configuring a small value
+**     makes certain parts of the lsm code easier to test.
+**
+**   LSM_CONFIG_MULTIPLE_PROCESSES
 */
-#define LSM_CONFIG_WRITE_BUFFER  1
-#define LSM_CONFIG_PAGE_SIZE     2
-#define LSM_CONFIG_SAFETY        3
-#define LSM_CONFIG_BLOCK_SIZE    4
-#define LSM_CONFIG_AUTOWORK      5
-#define LSM_CONFIG_LOG_SIZE      6
-#define LSM_CONFIG_MMAP          7
-#define LSM_CONFIG_USE_LOG       8
-#define LSM_CONFIG_NMERGE        9
+#define LSM_CONFIG_WRITE_BUFFER        1
+#define LSM_CONFIG_PAGE_SIZE           2
+#define LSM_CONFIG_SAFETY              3
+#define LSM_CONFIG_BLOCK_SIZE          4
+#define LSM_CONFIG_AUTOWORK            5
+#define LSM_CONFIG_LOG_SIZE            6
+#define LSM_CONFIG_MMAP                7
+#define LSM_CONFIG_USE_LOG             8
+#define LSM_CONFIG_NMERGE              9
+#define LSM_CONFIG_MAX_FREELIST       10
+#define LSM_CONFIG_MULTIPLE_PROCESSES 11
 
 #define LSM_SAFETY_OFF    0
 #define LSM_SAFETY_NORMAL 1
 #define LSM_SAFETY_FULL   2
 

Index: src/lsmInt.h
==================================================================
--- src/lsmInt.h
+++ src/lsmInt.h
@@ -43,11 +43,10 @@
 ** overridden by calls to lsm_config().
 */
 #define LSM_PAGE_SIZE   4096
 #define LSM_BLOCK_SIZE  (2 * 1024 * 1024)
 #define LSM_TREE_BYTES  (2 * 1024 * 1024)
-#define LSM_ECOLA       4
 
 #define LSM_DEFAULT_LOG_SIZE (128*1024)
 #define LSM_DEFAULT_NMERGE   4
 
 /* Places where a NULL needs to be changed to a real lsm_env pointer
@@ -56,17 +55,27 @@
 
 /* Initial values for log file checksums. These are only used if the 
 ** database file does not contain a valid checkpoint.  */
 #define LSM_CKSUM0_INIT 42
 #define LSM_CKSUM1_INIT 42
+
+#define LSM_META_PAGE_SIZE 4096
 
 /* "mmap" mode is currently only used in environments with 64-bit address 
 ** spaces. The following macro is used to test for this.  */
 #define LSM_IS_64_BIT (sizeof(void*)==8)
 
 #define LSM_AUTOWORK_QUANT 32
 
+/* Minimum number of free-list entries to store in the checkpoint, assuming
+** the free-list contains this many entries. i.e. if overflow is required,
+** the first LSM_CKPT_MIN_FREELIST entries are stored in the checkpoint and
+** the remainder in an LSM system entry.  */
+#define LSM_CKPT_MIN_FREELIST     6
+#define LSM_CKPT_MAX_REFREE       2
+#define LSM_CKPT_MIN_NONLSM       (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE)
+
 typedef struct Database Database;
 typedef struct DbLog DbLog;
 typedef struct FileSystem FileSystem;
 typedef struct Level Level;
 typedef struct LogMark LogMark;
@@ -86,10 +95,15 @@
 typedef struct TreeVersion TreeVersion;
 typedef struct TreeCursor TreeCursor;
 typedef struct Merge Merge;
 typedef struct MergeInput MergeInput;
 
+typedef struct TreeHeader TreeHeader;
+typedef struct ShmHeader ShmHeader;
+typedef struct ShmChunk ShmChunk;
+typedef struct ShmReader ShmReader;
+
 typedef unsigned char u8;
 typedef unsigned short int u16;
 typedef unsigned int u32;
 typedef lsm_i64 i64;
 typedef unsigned long long int u64;
@@ -109,10 +123,35 @@
 #define LSM_MISUSE_BKPT  lsmErrorBkpt(LSM_MISUSE)
 
 #define unused_parameter(x) (void)(x)
 #define array_size(x) (sizeof(x)/sizeof(x[0]))
 
+
+/* The size of each shared-memory chunk */
+#define LSM_SHM_CHUNK_SIZE (32*1024)
+
+/* The number of bytes reserved at the start of each shm chunk for MM. */
+#define LSM_SHM_CHUNK_HDR  (3 * 4)
+
+/* The number of available read locks. */
+#define LSM_LOCK_NREADER   6
+
+/* Lock definitions */
+#define LSM_LOCK_DMS1         1
+#define LSM_LOCK_DMS2         2
+#define LSM_LOCK_WRITER       3
+#define LSM_LOCK_WORKER       4
+#define LSM_LOCK_CHECKPOINTER 5
+#define LSM_LOCK_READER(i)    ((i) + LSM_LOCK_CHECKPOINTER + 1)
+
+/*
+** Hard limit on the number of free-list entries that may be stored in 
+** a checkpoint (the remainder are stored as a system record in the LSM).
+** See also LSM_CONFIG_MAX_FREELIST.
+*/
+#define LSM_MAX_FREELIST_ENTRIES 100
+
 /*
 ** A string that can grow by appending.
 */
 struct LsmString {
   lsm_env *pEnv;              /* Run-time environment */
@@ -119,22 +158,45 @@
   int n;                      /* Size of string.  -1 indicates error */
   int nAlloc;                 /* Space allocated for z[] */
   char *z;                    /* The string content */
 };
 
+typedef struct LsmFile LsmFile;
+struct LsmFile {
+  lsm_file *pFile;
+  LsmFile *pNext;
+};
+
+/*
+** An instance of the following type is used to store an ordered list of
+** u32 values. 
+**
+** Note: This is a place-holder implementation. It should be replaced by
+** a version that avoids making a single large allocation when the array
+** contains a large number of values. For this reason, the internals of 
+** this object should only be manipulated by the intArrayXXX() functions in 
+** lsm_tree.c.
+*/
+typedef struct IntArray IntArray;
+struct IntArray {
+  int nAlloc;
+  int nArray;
+  u32 *aArray;
+};
+
 /*
 ** An instance of this structure represents a point in the history of the
-** tree structure to roll back to. Refer to comments in tree.c for details.
-**
-** Pointers pRollback and pRoot both point to structures of type TreeNode.
+** tree structure to roll back to. Refer to comments in lsm_tree.c for 
+** details.
 */
 struct TreeMark {
-  void *pMpChunk;                 /* Mempool chunk to roll back to */
-  int iMpOff;                     /* Mempool chunk offset to roll back to */
-  void *pRollback;                /* Zero v2 information starting here */
-  void *pRoot;                    /* Root node to restore */
-  int nHeight;                    /* Height of tree at pRoot */
+  u32 iRoot;                      /* Offset of root node in shm file */
+  u32 nHeight;                    /* Current height of tree structure */
+  u32 iWrite;                     /* Write offset in shm file */
+  u32 nChunk;                     /* Number of chunks in shared-memory file */
+  u32 iFirst;                     /* First chunk in linked list */
+  int iRollback;                  /* Index in lsm->rollback to revert to */
 };
 
 /*
 ** An instance of this structure represents a point in the database log.
 */
@@ -165,40 +227,77 @@
   u32 cksum0;                     /* Checksum 0 at offset iOff */
   u32 cksum1;                     /* Checksum 1 at offset iOff */
   LogRegion aRegion[3];           /* Log file regions (see docs in lsm_log.c) */
 };
 
+/*
+** Tree header structure. 
+*/
+struct TreeHeader {
+  u32 iTreeId;                    /* Current tree id */
+  u32 iTransId;                   /* Current transaction id */
+  u32 iRoot;                      /* Offset of root node in shm file */
+  u32 nHeight;                    /* Current height of tree structure */
+  u32 iWrite;                     /* Write offset in shm file */
+  u32 nChunk;                     /* Number of chunks in shared-memory file */
+  u32 iFirst;                     /* First chunk in linked list */
+  u32 nByte;                      /* Size of current tree structure in bytes */
+  DbLog log;                      /* Current layout of log file */ 
+  i64 iCkpt;                      /* Id of ckpt log space is reclaimed for */
+  u32 aCksum[2];                  /* Checksums 1 and 2. */
+};
+
 /*
 ** Database handle structure.
+**
+** mLock:
+**   A bitmask representing the locks currently held by the connection.
+**   An LSM database supports N distinct locks, where N is some number less
+**   than or equal to 16. Locks are numbered starting from 1 (see the 
+**   definitions for LSM_LOCK_WRITER and co.).
+**
+**   The least significant 16-bits in mLock represent EXCLUSIVE locks. The
+**   most significant are SHARED locks. So, if a connection holds a SHARED
+**   lock on lock region iLock, then the following is true:
+**
+**       (mLock & (1 << (iLock+16-1)))
+**
+**   Or for an EXCLUSIVE lock:
+**
+**       (mLock & (1 << (iLock-1)))
 */
 struct lsm_db {
 
   /* Database handle configuration */
   lsm_env *pEnv;                            /* runtime environment */
   int (*xCmp)(void *, int, void *, int);    /* Compare function */
-  int nTreeLimit;                 /* Maximum size of in-memory tree in bytes */
-  int bAutowork;                  /* True to do auto-work after writing */
+
+  /* Values configured by calls to lsm_config */
   int eSafety;                    /* LSM_SAFETY_OFF, NORMAL or FULL */
-
+  int bAutowork;                  /* Configured by LSM_CONFIG_AUTOWORK */
+  int nTreeLimit;                 /* Configured by LSM_CONFIG_WRITE_BUFFER */
   int nMerge;                     /* Configured by LSM_CONFIG_NMERGE */
   int nLogSz;                     /* Configured by LSM_CONFIG_LOG_SIZE */
   int bUseLog;                    /* Configured by LSM_CONFIG_USE_LOG */
   int nDfltPgsz;                  /* Configured by LSM_CONFIG_PAGE_SIZE */
   int nDfltBlksz;                 /* Configured by LSM_CONFIG_BLOCK_SIZE */
+  int nMaxFreelist;               /* Configured by LSM_CONFIG_MAX_FREELIST */
+  int bMultiProc;                 /* Configured by L_C_MULTIPLE_PROCESSES */
 
   /* Sub-system handles */
   FileSystem *pFS;                /* On-disk portion of database */
   Database *pDatabase;            /* Database shared data */
 
   /* Client transaction context */
-  TreeVersion *pTV;               /* In-memory tree snapshot (non-NULL in rt) */
   Snapshot *pClient;              /* Client snapshot (non-NULL in read trans) */
+  int iReader;                    /* Read lock held (-1 == unlocked) */
   MultiCursor *pCsr;              /* List of all open cursors */
-  LogWriter *pLogWriter;
+  LogWriter *pLogWriter;          /* Context for writing to the log file */
   int nTransOpen;                 /* Number of opened write transactions */
   int nTransAlloc;                /* Allocated size of aTrans[] array */
   TransMark *aTrans;              /* Array of marks for transaction rollback */
+  IntArray rollback;              /* List of tree-nodes to roll back */
 
   /* Worker context */
   Snapshot *pWorker;              /* Worker snapshot (or NULL) */
 
   /* Debugging message callback */
@@ -206,10 +305,19 @@
   void *pLogCtx;
 
   /* Work done notification callback */
   void (*xWork)(lsm_db *, void *);
   void *pWorkCtx;
+
+  u32 mLock;                      /* Mask of current locks. See lsmShmLock(). */
+  lsm_db *pNext;                  /* Next connection to same database */
+
+  int nShm;                       /* Size of apShm[] array */
+  void **apShm;                   /* Shared memory chunks */
+  ShmHeader *pShmhdr;             /* Live shared-memory header */
+  TreeHeader treehdr;             /* Local copy of tree-header */
+  u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)];
 };
 
 struct Segment {
   int iFirst;                     /* First page of this run */
   int iLast;                      /* Last page of this run */
@@ -225,11 +333,11 @@
 struct Level {
   Segment lhs;                    /* Left-hand (main) segment */
   int iAge;                       /* Number of times data has been written */
   int nRight;                     /* Size of apRight[] array */
   Segment *aRhs;                  /* Old segments being merged into this */
-  int iSplitTopic;
+  int iSplitTopic;                /* Split key topic (if nRight>0) */
   void *pSplitKey;                /* Pointer to split-key (if nRight>0) */
   int nSplitKey;                  /* Number of bytes in split-key */
   Merge *pMerge;                  /* Merge operation currently underway */
   Level *pNext;                   /* Next level in tree */
 };
@@ -268,38 +376,142 @@
 ** array is valid.
 */
 #define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0)
 
 /*
-** Number of integers in the free-list delta.
+** The values that accompany the lock held by a database reader.
+*/
+struct ShmReader {
+  i64 iTreeId;
+  i64 iLsmId;
+};
+
+/*
+** An instance of this structure is stored in the first shared-memory
+** page. The shared-memory header.
+**
+** bWriter:
+**   Immediately after opening a write transaction and taking the WRITER lock,
+**   each writer client sets this flag. It is cleared right before the 
+**   WRITER lock is relinquished. If a subsequent writer finds that this
+**   flag is already set when a write transaction is opened, this indicates
+**   that a previous writer failed mid-transaction.
+**
+** iMetaPage:
+**   If the database file does not contain a valid, synced, checkpoint, this
+**   value is set to 0. Otherwise, it is set to the meta-page number that
+**   contains the most recently written checkpoint (either 1 or 2).
+**
+** hdr1, hdr2:
+**   The two copies of the in-memory tree header. Two copies are required
+**   in case a writer fails while updating one of them.
+*/
+struct ShmHeader {
+  u32 aClient[LSM_META_PAGE_SIZE / 4];
+  u32 aWorker[LSM_META_PAGE_SIZE / 4];
+  u32 bWriter;
+  u32 iMetaPage;
+  TreeHeader hdr1;
+  TreeHeader hdr2;
+  ShmReader aReader[LSM_LOCK_NREADER];
+};
+
+/*
+** An instance of this structure is stored at the start of each shared-memory
+** chunk except the first (which is the header chunk - see above).
+*/
+struct ShmChunk {
+  u32 iFirstTree;
+  u32 iLastTree;
+  u32 iNext;
+};
+
+#define LSM_APPLIST_SZ 4
+
+typedef struct Freelist Freelist;
+typedef struct FreelistEntry FreelistEntry;
+
+/*
+** An instance of the following structure stores the current database free
+** block list. The free list is a list of blocks that are not currently
+** used by the worker snapshot. Associated with each block in the list is the
+** snapshot id of the most recent snapshot that did actually use the block.
+*/
+struct Freelist {
+  FreelistEntry *aEntry;          /* Free list entries */
+  int nEntry;                     /* Number of valid slots in aEntry[] */
+  int nAlloc;                     /* Allocated size of aEntry[] */
+};
+struct FreelistEntry {
+  u32 iBlk;                       /* Block number */
+  i64 iId;                        /* Largest snapshot id to use this block */
+};
+
+/*
+** A snapshot of a database. A snapshot contains all the information required
+** to read or write a database file on disk. See the description of struct
+** Database below for further details.
 */
-#define LSM_FREELIST_DELTA_SIZE 3
+struct Snapshot {
+  Database *pDatabase;            /* Database this snapshot belongs to */
+  Level *pLevel;                  /* Pointer to level 0 of snapshot (or NULL) */
+  i64 iId;                        /* Snapshot id */
 
-/* 
+  /* Used by worker snapshots only */
+  int nBlock;                     /* Number of blocks in database file */
+  u32 aiAppend[LSM_APPLIST_SZ];   /* Append point list */
+  Freelist freelist;              /* Free block list */
+  int nFreelistOvfl;              /* Number of extra free-list entries in LSM */
+};
+#define LSM_INITIAL_SNAPSHOT_ID 11
+
+/*
 ** Functions from file "lsm_ckpt.c".
 */
-int lsmCheckpointRead(lsm_db *, int *, int *);
 int lsmCheckpointWrite(lsm_db *);
-int lsmCheckpointExport(lsm_db *, int, int, i64, int, void **, int *);
-void lsmChecksumBytes(const u8 *, int, const u32 *, u32 *);
-lsm_i64 lsmCheckpointLogOffset(void *pExport);
 int lsmCheckpointLevels(lsm_db *, int, void **, int *);
 int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal);
-int lsmCheckpointOverflow(lsm_db *pDb, int *pnLsmLevel);
+
+int lsmCheckpointOverflow(lsm_db *pDb, void **, int *, int *);
+int lsmCheckpointOverflowRequired(lsm_db *pDb);
+int lsmCheckpointOverflowLoad(lsm_db *pDb, Freelist *);
+
+int lsmCheckpointRecover(lsm_db *);
+int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **);
+
+int lsmCheckpointLoad(lsm_db *pDb);
+int lsmCheckpointLoadWorker(lsm_db *pDb);
+int lsmCheckpointStore(lsm_db *pDb, int);
+
+i64 lsmCheckpointId(u32 *, int);
+i64 lsmCheckpointLogOffset(u32 *);
+int lsmCheckpointPgsz(u32 *);
+int lsmCheckpointBlksz(u32 *);
+void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog);
+void lsmCheckpointZeroLogoffset(lsm_db *);
+
+int lsmCheckpointSaveWorker(lsm_db *pDb, int, int);
+int lsmDatabaseFull(lsm_db *pDb);
+int lsmCheckpointSynced(lsm_db *pDb, i64 *piId);
+
 
 /* 
 ** Functions from file "lsm_tree.c".
 */
 int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree);
 void lsmTreeRelease(lsm_env *, Tree *);
+void lsmTreeClear(lsm_db *);
+void lsmTreeInit(lsm_db *);
 
-int lsmTreeSize(TreeVersion *pTV);
-int lsmTreeIsEmpty(Tree *pTree);
+int lsmTreeSize(lsm_db *);
+int lsmTreeEndTransaction(lsm_db *pDb, int bCommit);
+int lsmTreeBeginTransaction(lsm_db *pDb);
+int lsmTreeLoadHeader(lsm_db *pDb);
 
 int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal);
 void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark);
-void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark);
+void lsmTreeMark(lsm_db *pDb, TreeMark *pMark);
 
 int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **);
 void lsmTreeCursorDestroy(TreeCursor *);
 
 int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes);
@@ -308,19 +520,11 @@
 int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast);
 void lsmTreeCursorReset(TreeCursor *pCsr);
 int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey);
 int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal);
 int lsmTreeCursorValid(TreeCursor *pCsr);
-void lsmTreeCursorSave(TreeCursor *pCsr);
-
-TreeVersion *lsmTreeReadVersion(Tree *);
-int lsmTreeWriteVersion(lsm_env *pEnv, Tree *, TreeVersion **);
-TreeVersion *lsmTreeRecoverVersion(Tree *);
-int lsmTreeIsWriteVersion(TreeVersion *);
-int lsmTreeReleaseWriteVersion(lsm_env *, TreeVersion *, int, TreeVersion **);
-void lsmTreeReleaseReadVersion(lsm_env *, TreeVersion *);
-
+int lsmTreeCursorSave(TreeCursor *pCsr);
 
 /* 
 ** Functions from file "mem.c".
 */
 int lsmPoolNew(lsm_env *pEnv, Mempool **ppPool);
@@ -386,11 +590,10 @@
 FileSystem *lsmPageFS(Page *);
 
 int lsmFsSectorSize(FileSystem *);
 
 void lsmSortedSplitkey(lsm_db *, Level *, int *);
-int lsmFsSetupAppendList(lsm_db *db);
 
 /* Reading sorted run content. */
 int lsmFsDbPageGet(FileSystem *, Pgno, Page **);
 int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **);
 
@@ -406,14 +609,12 @@
 
 int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **);
 int lsmFsMetaPageRelease(MetaPage *);
 u8 *lsmFsMetaPageData(MetaPage *, int *);
 
-#ifdef LSM_EXPENSIVE_DEBUG
+#ifdef LSM_DEBUG
 int lsmFsIntegrityCheck(lsm_db *);
-#else
-# define lsmFsIntegrityCheck(pDb) 1
 #endif
 
 int lsmFsPageWritable(Page *);
 
 /* Functions to read, write and sync the log file. */
@@ -428,19 +629,27 @@
 
 /* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */
 int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut);
 int lsmConfigMmap(lsm_db *pDb, int *piParam);
 
+int lsmEnvOpen(lsm_env *, const char *, lsm_file **);
+int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile);
+int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock);
+
+int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **); 
+void lsmEnvShmBarrier(lsm_env *);
+void lsmEnvShmUnmap(lsm_env *, lsm_file *, int);
+
 /*
 ** End of functions from "lsm_file.c".
 **************************************************************************/
 
 /* 
 ** Functions from file "lsm_sorted.c".
 */
 int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
-int lsmSortedFlushTree(lsm_db *, int, int);
+int lsmSortedFlushTree(lsm_db *, int *);
 void lsmSortedCleanup(lsm_db *);
 int lsmSortedAutoWork(lsm_db *, int nUnit);
 
 void lsmSortedRemap(lsm_db *pDb);
 
@@ -448,12 +657,11 @@
 
 int lsmSortedFlushDb(lsm_db *);
 int lsmSortedAdvanceAll(lsm_db *pDb);
 
 int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *);
-
-int lsmSortedLoadSystem(lsm_db *pDb);
+int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *);
 
 void *lsmSortedSplitKey(Level *pLevel, int *pnByte);
 
 void lsmSortedSaveTreeCursors(lsm_db *);
 
@@ -498,52 +706,46 @@
 int lsmFlushToDisk(lsm_db *);
 
 /*
 ** Functions from file "lsm_log.c".
 */
-int lsmLogBegin(lsm_db *pDb, DbLog *pLog);
+int lsmLogBegin(lsm_db *pDb);
 int lsmLogWrite(lsm_db *, void *, int, void *, int);
 int lsmLogCommit(lsm_db *);
-void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit);
+void lsmLogEnd(lsm_db *pDb, int bCommit);
 void lsmLogTell(lsm_db *, LogMark *);
 void lsmLogSeek(lsm_db *, LogMark *);
 
 int lsmLogRecover(lsm_db *);
-void lsmLogCheckpoint(lsm_db *, DbLog *pLog, lsm_i64);
+void lsmLogCheckpoint(lsm_db *, lsm_i64);
 int lsmLogStructure(lsm_db *pDb, char **pzVal);
 
 
 /**************************************************************************
 ** Functions from file "lsm_shared.c".
 */
-int lsmDbDatabaseFind(lsm_db*, const char *);
+
+int lsmDbDatabaseConnect(lsm_db*, const char *);
 void lsmDbDatabaseRelease(lsm_db *);
 
-int lsmBeginRecovery(lsm_db *);
 int lsmBeginReadTrans(lsm_db *);
 int lsmBeginWriteTrans(lsm_db *);
 int lsmBeginFlush(lsm_db *);
 
+int lsmBeginWork(lsm_db *);
+void lsmFinishWork(lsm_db *, int, int, int *);
+
 int lsmFinishRecovery(lsm_db *);
 void lsmFinishReadTrans(lsm_db *);
 int lsmFinishWriteTrans(lsm_db *, int);
 int lsmFinishFlush(lsm_db *, int);
 
-int lsmDbUpdateClient(lsm_db *, int, int);
-
-int lsmSnapshotFreelist(lsm_db *, int **, int *);
 int lsmSnapshotSetFreelist(lsm_db *, int *, int);
 
-void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz);
-
 Snapshot *lsmDbSnapshotClient(lsm_db *);
 Snapshot *lsmDbSnapshotWorker(lsm_db *);
-Snapshot *lsmDbSnapshotRecover(lsm_db *);
-void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *);
 
-void lsmSnapshotSetNBlock(Snapshot *, int);
-int lsmSnapshotGetNBlock(Snapshot *);
 void lsmSnapshotSetCkptid(Snapshot *, i64);
 
 Level *lsmDbSnapshotLevel(Snapshot *);
 void lsmDbSnapshotSetLevel(Snapshot *, Level *);
 
@@ -553,28 +755,48 @@
 int lsmBlockFree(lsm_db *, int);
 int lsmBlockRefree(lsm_db *, int);
 
 void lsmFreelistDeltaBegin(lsm_db *);
 void lsmFreelistDeltaEnd(lsm_db *);
-void lsmFreelistDelta(lsm_db *, u32 *);
-u32 *lsmFreelistDeltaPtr(lsm_db *pDb);
-
-void lsmDatabaseDirty(lsm_db *pDb);
-int lsmDatabaseIsDirty(lsm_db *pDb);
+int lsmFreelistDelta(lsm_db *pDb);
 
 DbLog *lsmDatabaseLog(lsm_db *pDb);
 
-Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp);
-int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg);
-void lsmSharedAppendListRemove(lsm_db *db, int iIdx);
-
-int lsmDbTreeSize(lsm_db *pDb);
-
 #ifdef LSM_DEBUG
   int lsmHoldingClientMutex(lsm_db *pDb);
+  int lsmShmAssertLock(lsm_db *db, int iLock, int eOp);
+  int lsmShmAssertWorker(lsm_db *db);
+#endif
+
+void lsmFreeSnapshot(lsm_env *, Snapshot *);
+
+
+/* Candidate values for the 3rd argument to lsmShmLock() */
+#define LSM_LOCK_UNLOCK 0
+#define LSM_LOCK_SHARED 1
+#define LSM_LOCK_EXCL   2
+
+int lsmShmChunk(lsm_db *db, int iChunk, void **ppData);
+int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock);
+void lsmShmBarrier(lsm_db *db);
+
+#ifdef LSM_DEBUG
+void lsmShmHasLock(lsm_db *db, int iLock, int eOp);
+#else
+# define lsmShmHasLock(x,y,z)
 #endif
 
+int lsmReadlock(lsm_db *, i64 iLsm, i64 iTree);
+int lsmReleaseReadlock(lsm_db *);
+
+int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse);
+int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse);
+int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId);
+
+int lsmDbMultiProc(lsm_db *);
+void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *);
+
 
 /**************************************************************************
 ** functions in lsm_str.c
 */
 void lsmStringInit(LsmString*, lsm_env *pEnv);

Index: src/lsm_ckpt.c
==================================================================
--- src/lsm_ckpt.c
+++ src/lsm_ckpt.c
@@ -34,16 +34,20 @@
 **        the two checksum values.
 **     4. The total number of blocks in the database.
 **     5. The block size.
 **     6. The number of levels.
 **     7. The nominal database page size.
-**     8. Flag indicating if overflow records are used. If true, the top-level
-**        segment contains LEVELS and FREELIST entries. 
+**     8. Number of free-list entries stored in the FREELIST record in the LSM.
 **
 **   Log pointer:
 **
-**     4 integers. See ckptExportLog() and ckptImportLog().
+**     4 integers (2 for a 64-bit offset and 2 for a 64-bit checksum). See 
+**     ckptExportLog() and ckptImportLog().
+**
+**   Append points:
+**
+**     4 integers. See ckptExportAppendlist().
 **
 **   For each level in the database, a level record. Formatted as follows:
 **
 **     0. Age of the level.
 **     1. The number of right-hand segments (nRight, possibly 0),
@@ -55,28 +59,22 @@
 **        5a. Page number of next cell to read during merge
 **        5b. Cell number of next cell to read during merge
 **     7. Page containing current split-key.
 **     8. Cell within page containing current split-key.
 **
-**   The freelist. If the checkpoint header indicates that the top level
-**   segment contains LEVELS and FREELIST records, then three integers are
-**   stored here:
-**
-**     1. The size to truncate the free list to after it is loaded.
-**     2. First refree block (or 0),
-**     3. Second refree block (or 0),
-**
-**   In this case, the free list is loaded from the top level segment, 
-**   then truncated so that it contains the nTruncate newest entries only, 
-**   where nTruncate is the first integer in the block of three above. If 
-**   either or both of the "refree block" integers are non-zero, then they 
-**   are appended to the free-list.
-**
-**   Or, if the checkpoint header flag is clear, then the entire free-list
-**   is stored in the checkpoint. The format is the number of entries in
-**   the free-list, followed by the entries themselves (i.e. N+1 integers
-**   for an N entry free-list).
+**   The freelist. 
+**
+**     1. Number of free-list entries stored in checkpoint header.
+**     2. For each entry:
+**        2a. Block number of free block.
+**        2b. MSW of associated checkpoint id.
+**        2c. LSW of associated checkpoint id.
+**
+**   If the overflow flag is set, then extra free-list entries may be stored
+**   in the FREELIST record. The FREELIST record contains 3 32-bit integers
+**   per entry, in the same format as above (without the "number of entries"
+**   field).
 **
 **   The checksum:
 **
 **     1. Checksum value 1.
 **     2. Checksum value 2.
@@ -88,29 +86,58 @@
 **     3. Root page of array (or 0),
 **     4. Size of array in pages,
 */
 
 /*
-** OVERSIZED CHECKPOINT BLOBS:
-**
-** There are two slots allocated for checkpoints at the start of each
-** database file. Each are 4096 bytes in size, so may accommodate
-** checkpoints that consist of up to 1024 32-bit integers. Normally,
-** this is enough.
-**
-** However, if a database contains a sufficiently large number of levels,
-** a checkpoint may exceed 1024 integers in size. In most circumstances this 
-** is an undesirable scenario, as a database with so many levels will be 
-** slow to query. If this does happen, then only the uppermost (more recent)
-** levels are stored in the checkpoint blob itself. The remainder are stored
-** in an LSM record with the system key "LEVELS". The payload of the entry
-** is a series of 32-bit big-endian integers, as follows:
-**
-**    1. Number of levels (store in the LEVELS record, not total).
-**    2. For each level, a "level record" (as desribed above).
-**
-** There is no checksum in the LEVELS record.
+** LARGE NUMBERS OF LEVEL RECORDS:
+**
+** LSM_MAX_RHS_SEGMENTS is a limit on the number of rhs segments that may be
+** present in the database file. Defining this limit ensures that all level
+** records fit within the 4096 byte limit for checkpoint blobs.
+**
+** The number of right-hand-side segments in a database is counted as 
+** follows:
+**
+**   * For each level in the database not undergoing a merge, add 1.
+**
+**   * For each level in the database that is undergoing a merge, add 
+**     the number of segments on the rhs of the level.
+**
+** A level record not undergoing a merge is 6 integers. A level record 
+** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the 
+** separators from the next level) is (6*nRhs+12) integers. The maximum
+** per right-hand-side level is therefore 12 integers. So the maximum
+** size of all level records in a checkpoint is 12*40=480 integers.
+*/
+#define LSM_MAX_RHS_SEGMENTS 40
+
+/*
+** LARGE NUMBERS OF FREELIST ENTRIES:
+**
+** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h)
+** on the number of free-list entries stored in a checkpoint. Since each 
+** free-list entry consists of 3 integers, the maximum free-list size is 
+** 3*100=300 integers. Combined with the limit on rhs segments defined
+** above, this ensures that a checkpoint always fits within a 4096 byte
+** meta page.
+**
+** If the database contains more than 100 free blocks, the "overflow" flag
+** in the checkpoint header is set and the remainder are stored in the
+** system FREELIST entry in the LSM (along with user data). The value
+** accompanying the FREELIST key in the LSM is, like a checkpoint, an array
+** of 32-bit big-endian integers. As follows:
+**
+**     For each entry:
+**       a. Block number of free block.
+**       b. MSW of associated checkpoint id.
+**       c. LSW of associated checkpoint id.
+**
+** The number of entries is not required - it is implied by the size of the
+** value blob containing the integer array.
+**
+** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit.
+** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST.
 */
 
 /*
 ** The argument to this macro must be of type u32. On a little-endian
 ** architecture, it returns the u32 value that results from interpreting
@@ -124,15 +151,16 @@
 )
 
 static const int one = 1;
 #define LSM_LITTLE_ENDIAN (*(u8 *)(&one))
 
-/* Total number of 32-bit integers in the checkpoint header. */
-#define CKPT_HDR_SIZE       8
-#define CKPT_LOGPTR_SIZE    4
-#define CKPT_SEGMENT_SIZE   4
-#define CKPT_CKSUM_SIZE     2
+/* Sizes, in integers, of various parts of the checkpoint. */
+#define CKPT_HDR_SIZE         8
+#define CKPT_LOGPTR_SIZE      4
+#define CKPT_SEGMENT_SIZE     4
+#define CKPT_CKSUM_SIZE       2
+#define CKPT_APPENDLIST_SIZE  LSM_APPLIST_SZ
 
 /* A #define to describe each integer in the checkpoint header. */
 #define CKPT_HDR_ID_MSW   0
 #define CKPT_HDR_ID_LSW   1
 #define CKPT_HDR_NCKPT    2
@@ -140,67 +168,57 @@
 #define CKPT_HDR_BLKSZ    4
 #define CKPT_HDR_NLEVEL   5
 #define CKPT_HDR_PGSZ     6
 #define CKPT_HDR_OVFL     7
 
-/*
-** Generate or extend an 8 byte checksum based on the data in array aByte[]
-** and the initial values of aIn[0] and aIn[1] (or initial values of 0 and 
-** 0 if aIn==NULL).
-**
-** The checksum is written back into aOut[] before returning.
-*/
-void lsmChecksumBytes(
-  const u8 *a,     /* Content to be checksummed */
-  int nByte,       /* Bytes of content in a[] */
-  const u32 *aIn,  /* Initial checksum value input */
-  u32 *aOut        /* OUT: Final checksum value output */
-){
-  u32 s1, s2;
-  u32 *aData = (u32 *)a;
-  u32 *aEnd = (u32 *)&a[nByte & ~0x00000007];
-
-  u32 aExtra[2] = {0, 0};
-  memcpy(aExtra, &a[nByte & ~0x00000007], nByte & 0x00000007);
-
-  if( aIn ){
-    s1 = aIn[0];
-    s2 = aIn[1];
-  }else{
-    s1 = s2 = 0;
-  }
-
-  if( LSM_LITTLE_ENDIAN ){
-    /* little-endian */
-    s1 += aExtra[0] + s2;
-    s2 += aExtra[1] + s1;
-    while( aData<aEnd ){
-      s1 += *aData++ + s2;
-      s2 += *aData++ + s1;
-    }
-  }else{
-    /* big-endian */
-    s1 += BYTESWAP32(aExtra[0]) + s2;
-    s2 += BYTESWAP32(aExtra[1]) + s1;
-    while( aData<aEnd ){
-      s1 += BYTESWAP32(aData[0]) + s2;
-      s2 += BYTESWAP32(aData[1]) + s1;
-      aData += 2;
-    }
-  }
-
-  aOut[0] = s1;
-  aOut[1] = s2;
-}
+#define CKPT_HDR_LO_MSW     8
+#define CKPT_HDR_LO_LSW     9
+#define CKPT_HDR_LO_CKSUM1 10
+#define CKPT_HDR_LO_CKSUM2 11
 
 typedef struct CkptBuffer CkptBuffer;
+
+/*
+** Dynamic buffer used to accumulate data for a checkpoint.
+*/
 struct CkptBuffer {
   lsm_env *pEnv;
   int nAlloc;
   u32 *aCkpt;
 };
 
+/*
+** Calculate the checksum of the checkpoint specified by arguments aCkpt and
+** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning.
+**
+** The value of the nCkpt parameter includes the two checksum values at
+** the end of the checkpoint. They are not used as inputs to the checksum 
+** calculation. The checksum is based on the array of (nCkpt-2) integers
+** at aCkpt[].
+*/
+static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){
+  int i;
+  u32 cksum1 = 1;
+  u32 cksum2 = 2;
+
+  if( nCkpt % 2 ){
+    cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF;
+    cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000;
+  }
+
+  for(i=0; (i+3)<nCkpt; i+=2){
+    cksum1 += cksum2 + aCkpt[i];
+    cksum2 += cksum1 + aCkpt[i+1];
+  }
+
+  *piCksum1 = cksum1;
+  *piCksum2 = cksum2;
+}
+
+/*
+** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal.
+*/
 static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
   if( *pRc ) return;
   if( iIdx>=p->nAlloc ){
     int nNew = LSM_MAX(8, iIdx*2);
     p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
@@ -211,23 +229,30 @@
     p->nAlloc = nNew;
   }
   p->aCkpt[iIdx] = iVal;
 }
 
-static void ckptChangeEndianness(u32 *a, int n){
+/*
+** Argument aInt points to an array nInt elements in size. Switch the 
+** endian-ness of each element of the array.
+*/
+static void ckptChangeEndianness(u32 *aInt, int nInt){
   if( LSM_LITTLE_ENDIAN ){
     int i;
-    for(i=0; i<n; i++) a[i] = BYTESWAP32(a[i]);
+    for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
   }
 }
 
+/*
+** Object *p contains a checkpoint in native byte-order. The checkpoint is
+** nCkpt integers in size, not including any checksum. This function sets
+** the two checksum elements of the checkpoint accordingly.
+*/
 static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
   if( *pRc==LSM_OK ){
     u32 aCksum[2] = {0, 0};
-    ckptChangeEndianness(p->aCkpt, nCkpt);
-    lsmChecksumBytes((u8 *)p->aCkpt, sizeof(u32)*nCkpt, 0, aCksum);
-    ckptChangeEndianness(aCksum, 2);
+    ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]);
     ckptSetValue(p, nCkpt, aCksum[0], pRc);
     ckptSetValue(p, nCkpt+1, aCksum[1], pRc);
   }
 }
 
@@ -250,14 +275,14 @@
 
   *piOut = iOut;
 }
 
 static void ckptExportLevel(
-  Level *pLevel,
-  CkptBuffer *p,
-  int *piOut,
-  int *pRc
+  Level *pLevel,                  /* Level object to serialize */
+  CkptBuffer *p,                  /* Append new level record to this ckpt */
+  int *piOut,                     /* IN/OUT: Size of checkpoint so far */
+  int *pRc                        /* IN/OUT: Error code */
 ){
   int iOut = *piOut;
   Merge *pMerge;
 
   pMerge = pLevel->pMerge;
@@ -286,121 +311,124 @@
 
   *piOut = iOut;
 }
 
 /*
-** Write the current log offset into the checkpoint buffer. 4 values.
+** Populate the log offset fields of the checkpoint buffer. 4 values.
 */
-static void ckptExportLog(DbLog *pLog, CkptBuffer *p, int *piOut, int *pRc){
+static void ckptExportLog(
+  lsm_db *pDb, 
+  int bFlush,
+  CkptBuffer *p, 
+  int *piOut, 
+  int *pRc
+){
+  int iOut = *piOut;
+
+  assert( iOut==CKPT_HDR_LO_MSW );
+
+  if( bFlush ){
+    DbLog *pLog = &pDb->treehdr.log;
+    i64 iOff = pLog->aRegion[2].iEnd;
+    ckptSetValue(p, iOut++, (iOff >> 32) & 0xFFFFFFFF, pRc);
+    ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc);
+    ckptSetValue(p, iOut++, pLog->cksum0, pRc);
+    ckptSetValue(p, iOut++, pLog->cksum1, pRc);
+  }else{
+    for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){
+      ckptSetValue(p, iOut, pDb->pShmhdr->aWorker[iOut], pRc);
+    }
+  }
+
+  *piOut = iOut;
+}
+
+static void ckptExportAppendlist(
+  lsm_db *db,                     /* Database connection */
+  CkptBuffer *p,                  /* Checkpoint buffer to write to */
+  int *piOut,                     /* IN/OUT: Offset within checkpoint buffer */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int i;
   int iOut = *piOut;
-  i64 iOff = pLog->aRegion[2].iEnd;
+  u32 *aiAppend = db->pWorker->aiAppend;
 
-  ckptSetValue(p, iOut++, (iOff >> 32) & 0xFFFFFFFF, pRc);
-  ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc);
-  ckptSetValue(p, iOut++, pLog->cksum0, pRc);
-  ckptSetValue(p, iOut++, pLog->cksum1, pRc);
-
+  for(i=0; i<CKPT_APPENDLIST_SIZE; i++){
+    ckptSetValue(p, iOut++, aiAppend[i], pRc);
+  }
   *piOut = iOut;
-}
-
-/*
-** Import a log offset.
-*/
-static void ckptImportLog(u32 *aIn, int *piIn, DbLog *pLog){
-  int iIn = *piIn;
-
-  /* TODO: Look at this again after updating lsmLogRecover() */
-  pLog->aRegion[2].iStart = (((i64)aIn[iIn]) << 32) + (i64)aIn[iIn+1];
-  pLog->cksum0 = aIn[iIn+2];
-  pLog->cksum1 = aIn[iIn+3];
-
-  *piIn = iIn+4;
-}
-
-lsm_i64 lsmCheckpointLogOffset(void *pExport){
-  u8 *aIn = (u8 *)pExport;
-  u32 i1;
-  u32 i2;
-  i1 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4]);
-  i2 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4+4]);
-  return (((i64)i1) << 32) + (i64)i2;
-}
-
-
-int lsmCheckpointExport( 
+};
+
+static int ckptExportSnapshot( 
   lsm_db *pDb,                    /* Connection handle */
-  int nLsmLevel,                  /* Number of levels to store in LSM */
-  int bOvfl,                      /* True if free list is stored in LSM */
+  int nOvfl,                      /* Number of free-list entries in LSM */
+  int bLog,                       /* True to update log-offset fields */
   i64 iId,                        /* Checkpoint id */
   int bCksum,                     /* If true, include checksums */
   void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
   int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
 ){
   int rc = LSM_OK;                /* Return Code */
   FileSystem *pFS = pDb->pFS;     /* File system object */
   Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
-  int nAll = 0;                   /* Number of levels in db */
-  int nHdrLevel = 0;              /* Number of levels in checkpoint */
-  int iLevel;                     /* Used to count out nHdrLevel levels */
+  int nLevel = 0;                 /* Number of levels in checkpoint */
+  int iLevel;                     /* Used to count out nLevel levels */
   int iOut = 0;                   /* Current offset in aCkpt[] */
   Level *pLevel;                  /* Level iterator */
   int i;                          /* Iterator used while serializing freelist */
-  u32 aDelta[LSM_FREELIST_DELTA_SIZE];
   CkptBuffer ckpt;
+  int nFree;
+ 
+  nFree = pSnap->freelist.nEntry;
+  if( nOvfl>=0 ){
+    nFree -=  nOvfl;
+  }else{
+    nOvfl = pDb->pShmhdr->aWorker[CKPT_HDR_OVFL];
+  }
 
-  assert( bOvfl || nLsmLevel==0 );
-  
   /* Initialize the output buffer */
   memset(&ckpt, 0, sizeof(CkptBuffer));
   ckpt.pEnv = pDb->pEnv;
   iOut = CKPT_HDR_SIZE;
 
-  /* Write the current log offset */
-  ckptExportLog(lsmDatabaseLog(pDb), &ckpt, &iOut, &rc);
+  /* Write the log offset into the checkpoint. */
+  ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc);
+
+  /* Write the append-point list */
+  ckptExportAppendlist(pDb, &ckpt, &iOut, &rc);
 
   /* Figure out how many levels will be written to the checkpoint. */
-  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nAll++;
-  nHdrLevel = nAll - nLsmLevel;
-  assert( nHdrLevel>0 );
+  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++;
 
-  /* Serialize nHdrLevel levels. */
+  /* Serialize nLevel levels. */
   iLevel = 0;
-  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nHdrLevel; pLevel=pLevel->pNext){
+  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
     ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
     iLevel++;
   }
 
-  /* Write the freelist delta (if bOvfl is true) or else the entire free-list
-  ** (if bOvfl is false).  */
+  /* Write the freelist */
   if( rc==LSM_OK ){
-    if( bOvfl ){
-      lsmFreelistDelta(pDb, aDelta);
-      for(i=0; i<LSM_FREELIST_DELTA_SIZE; i++){
-        ckptSetValue(&ckpt, iOut++, aDelta[i], &rc);
-      }
-    }else{
-      int *aVal;
-      int nVal;
-      rc = lsmSnapshotFreelist(pDb, &aVal, &nVal);
-      ckptSetValue(&ckpt, iOut++, nVal, &rc);
-      for(i=0; i<nVal && rc==LSM_OK; i++){
-        ckptSetValue(&ckpt, iOut++, aVal[i], &rc);
-      }
-      lsmFree(pDb->pEnv, aVal);
+    ckptSetValue(&ckpt, iOut++, nFree, &rc);
+    for(i=0; i<nFree; i++){
+      FreelistEntry *p = &pSnap->freelist.aEntry[i];
+      ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
+      ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
+      ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
     }
   }
 
   /* Write the checkpoint header */
   assert( iId>=0 );
   ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
   ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
   ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
-  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, lsmSnapshotGetNBlock(pSnap), &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
   ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
-  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nHdrLevel, &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
   ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
-  ckptSetValue(&ckpt, CKPT_HDR_OVFL, bOvfl, &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_OVFL, nOvfl, &rc);
 
   if( bCksum ){
     ckptAddChecksum(&ckpt, iOut, &rc);
   }else{
     ckptSetValue(&ckpt, iOut, 0, &rc);
@@ -407,10 +435,16 @@
     ckptSetValue(&ckpt, iOut+1, 0, &rc);
   }
   iOut += 2;
   assert( iOut<=1024 );
 
+#if 0
+  lsmLogMessage(pDb, rc, 
+      "ckptExportSnapshot(): id=%d freelist: %d/%d", (int)iId, nFree, nOvfl
+  );
+#endif
+
   *ppCkpt = (void *)ckpt.aCkpt;
   if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
   return rc;
 }
 
@@ -525,78 +559,10 @@
   *ppLevel = pRet;
   *piIn = iIn;
   return rc;
 }
 
-static int ckptImport(
-  lsm_db *pDb, 
-  void *pCkpt, 
-  int nInt, 
-  int *pbOvfl, 
-  int *pRc
-){
-  int rc = *pRc;
-  int ret = 0;
-  if( rc==LSM_OK ){
-    Snapshot *pSnap = pDb->pWorker;
-    u32 cksum[2] = {0, 0};
-    u32 *aInt = (u32 *)pCkpt;
-
-    lsmChecksumBytes((u8 *)aInt, sizeof(u32)*(nInt-2), 0, cksum);
-    if( LSM_LITTLE_ENDIAN ){
-      int i;
-      for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
-    }
-
-    if( aInt[nInt-2]==cksum[0] && aInt[nInt-1]==cksum[1] ){
-      int i;
-      int nLevel;
-      int iIn = CKPT_HDR_SIZE;
-      int bOvfl;
-      i64 iId;
-      u32 *aDelta;
-
-      Level *pTopLevel = 0;
-
-      /* Read header fields */
-      iId = ((i64)aInt[CKPT_HDR_ID_MSW] << 32) + (i64)aInt[CKPT_HDR_ID_LSW];
-      lsmSnapshotSetCkptid(pSnap, iId);
-      nLevel = (int)aInt[CKPT_HDR_NLEVEL];
-      lsmSnapshotSetNBlock(pSnap, (int)aInt[CKPT_HDR_NBLOCK]);
-      lsmDbSetPagesize(pDb,(int)aInt[CKPT_HDR_PGSZ],(int)aInt[CKPT_HDR_BLKSZ]);
-      *pbOvfl = bOvfl = aInt[CKPT_HDR_OVFL];
-
-      /* Import log offset */
-      ckptImportLog(aInt, &iIn, lsmDatabaseLog(pDb));
-
-      /* Import all levels stored in the checkpoint. */
-      rc = ckptLoadLevels(pDb, aInt, &iIn, nLevel, &pTopLevel);
-      lsmDbSnapshotSetLevel(pSnap, pTopLevel);
-
-      /* Import the freelist delta */
-      if( rc==LSM_OK ){
-        if( bOvfl ){
-          aDelta = lsmFreelistDeltaPtr(pDb);
-          for(i=0; i<LSM_FREELIST_DELTA_SIZE; i++){
-            aDelta[i] = aInt[iIn++];
-          }
-        }else{
-          int nFree = aInt[iIn++];
-          rc = lsmSnapshotSetFreelist(pDb, (int *)&aInt[iIn], nFree);
-          iIn += nFree;
-        }
-      }
-
-      ret = 1;
-    }
-
-    assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
-    *pRc = rc;
-  }
-  return ret;
-}
-
 
 int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
   int rc = LSM_OK;
   if( nVal>0 ){
     u32 *aIn;
@@ -627,102 +593,10 @@
   }
 
   return rc;
 }
 
-
-/*
-** If *pRc is not LSM_OK when this function is called, it is a no-op. 
-** 
-** Otherwise, it attempts to read the id and size of the checkpoint stored in
-** slot iSlot of the database header. If an error occurs during processing, 
-** *pRc is set to an error code before returning. The returned value is 
-** always zero in this case.
-**
-** Or, if no error occurs, set *pnInt to the total number of integer values
-** in the checkpoint and return the checkpoint id.
-*/
-static i64 ckptReadId(
-  lsm_db *pDb,                    /* Connection handle */
-  int iSlot,                      /* Slot to read from (1 or 2) */
-  int *pnInt,                     /* OUT: Size of slot checkpoint in ints */
-  int *pRc                        /* IN/OUT: Error code */
-){
-  i64 iId = 0;                    /* Checkpoint id (return value) */
-
-  assert( iSlot==1 || iSlot==2 );
-  if( *pRc==LSM_OK ){
-    MetaPage *pPg;                    /* Meta page for slot iSlot */
-    *pRc = lsmFsMetaPageGet(pDb->pFS, 0, iSlot, &pPg);
-    if( *pRc==LSM_OK ){
-      u8 *aData = lsmFsMetaPageData(pPg, 0);
-
-      iId = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]) << 32;
-      iId += (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]);
-      *pnInt = (int)lsmGetU32(&aData[CKPT_HDR_NCKPT*4]);
-
-      lsmFsMetaPageRelease(pPg);
-    }
-  }
-  return iId;
-}
-
-/*
-** Attempt to load the checkpoint from slot iSlot. Return true if the
-** attempt is successful.
-*/
-static int ckptTryRead(
-  lsm_db *pDb, 
-  int iSlot, 
-  int nCkpt, 
-  int *pbOvfl,
-  int *pRc
-){
-  int ret = 0;
-  assert( iSlot==1 || iSlot==2 );
-  if( *pRc==LSM_OK 
-   && nCkpt>=CKPT_HDR_SIZE
-   && nCkpt<65536 
-  ){
-    u32 *aCkpt;
-    aCkpt = (u32 *)lsmMallocZeroRc(pDb->pEnv, sizeof(u32)*nCkpt, pRc);
-    if( aCkpt ){
-      int rc = LSM_OK;
-      int iPg;
-      int nRem;
-      u8 *aRem;
-
-      /* Read the checkpoint data. */
-      nRem = sizeof(u32) * nCkpt;
-      aRem = (u8 *)aCkpt;
-      iPg = iSlot;
-      while( rc==LSM_OK && nRem ){
-        MetaPage *pPg;
-        rc = lsmFsMetaPageGet(pDb->pFS, 0, iPg, &pPg);
-        if( rc==LSM_OK ){
-          int nCopy;
-          int nData;
-          u8 *aData = lsmFsMetaPageData(pPg, &nData);
-
-          nCopy = LSM_MIN(nRem, nData);
-          memcpy(aRem, aData, nCopy);
-          aRem += nCopy;
-          nRem -= nCopy;
-          lsmFsMetaPageRelease(pPg);
-        }
-        iPg += 2;
-      }
-
-      ret = ckptImport(pDb, aCkpt, nCkpt, pbOvfl, &rc);
-      lsmFree(pDb->pEnv, aCkpt);
-      *pRc = rc;
-    }
-  }
-
-  return ret;
-}
-
 /*
 ** Return the data for the LEVELS record.
 **
 ** The size of the checkpoint that can be stored in the database header
 ** must not exceed 1024 32-bit integers. Normally, it does not. However,
@@ -771,119 +645,570 @@
 
   return rc;
 }
 
 /*
-** The function is used to determine if the FREELIST and LEVELS overflow
-** records may be required if a new top level segment is written and a
-** serialized checkpoint blob created. 
-**
-** If the checkpoint will definitely fit in a single meta page, 0 is 
-** returned and *pnLsmLevel is set to 0. In this case the caller need not
-** bother creating FREELIST and LEVELS records. 
-**
-** Or, if it is likely that the overflow records will be required, non-zero
-** is returned.
+** The worker lock must be held to call this function.
+**
+** The function serializes and returns the data that should be stored as
+** the FREELIST system record.
 */
 int lsmCheckpointOverflow(
   lsm_db *pDb,                    /* Database handle (must hold worker lock) */
-  int *pnLsmLevel                 /* OUT: Number of levels to store in LSM */
-){
-  Level *p;                       /* Used to iterate through levels */
-  int nFree;                      /* Free integers remaining in db header */
-  int nList;                      /* Size of freelist in integers */
-  int nLevel = 0;                 /* Number of levels stored in LEVELS */
- 
-  /* Number of free integers - 1024 less those used by the checkpoint header,
-  ** less the 4 used for the log-pointer, less the 3 used for the free-list 
-  ** delta and the 2 used for the checkpoint checksum. Value nFree is 
-  ** therefore the total number of integers available to store the database 
-  ** levels and freelist.  */
-  nFree = 1024 - CKPT_HDR_SIZE - CKPT_LOGPTR_SIZE - CKPT_CKSUM_SIZE;
-
-  /* Allow space for the free-list delta */
-  nFree -= 3;
-
-  /* Allow space for the new level that may be created */
-  nFree -= (2 + CKPT_SEGMENT_SIZE);
-
-  /* Each level record not currently undergoing a merge consumes 2 + 4
-  ** integers. Each level that is undergoing a merge consumes 2 + 4 +
-  ** (nRhs * 4) + 1 + 1 + (nMerge * 2) + 2, where nRhs is the number of levels
-  ** used as input to the merge and nMerge is the total number of segments
-  ** (same as the number of levels, possibly plus 1 separators array). 
-  **
-  ** The calculation in the following block may overestimate the number
-  ** of integers required by a single level by 2 (as it assumes 
-  ** that nMerge==nRhs+1).  */
-  for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext){
-    int nThis;                    /* Number of integers required by level p */
-    if( p->pMerge ){
-      nThis = 2 + (1 + p->nRight) * (2 + CKPT_SEGMENT_SIZE) + 1 + 1 + 2;
-    }else{
-      nThis = 2 + CKPT_SEGMENT_SIZE;
-    }
-    if( nFree<nThis ) break;
-    nFree -= nThis;
-  }
-
-  /* Count the levels that will not fit in the checkpoint record. */
-  while( p ){
-    nLevel++;
-    p = p->pNext;
-  }
-  *pnLsmLevel = nLevel;
-
-  /* Set nList to the number of values required to store the free-list */
-  lsmSnapshotFreelist(pDb, 0, &nList);
-  nList++;
-
-  return (nLevel>0 || nList>nFree);
+  void **ppVal,                   /* OUT: lsmMalloc'd buffer */
+  int *pnVal,                     /* OUT: Size of *ppVal in bytes */
+  int *pnOvfl                     /* OUT: Number of freelist entries in buf */
+){
+  int rc = LSM_OK;
+  int nRet;
+  Snapshot *p = pDb->pWorker;
+
+  assert( lsmShmAssertWorker(pDb) );
+  assert( pnOvfl && ppVal && pnVal );
+  assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES );
+
+  if( p->nFreelistOvfl ){
+    rc = lsmCheckpointOverflowLoad(pDb, &p->freelist);
+    if( rc!=LSM_OK ) return rc;
+    p->nFreelistOvfl = 0;
+  }
+
+  if( p->freelist.nEntry<=pDb->nMaxFreelist ){
+    nRet = 0;
+    *pnVal = 0;
+    *ppVal = 0;
+  }else{
+    int i;                        /* Iterator variable */
+    int iOut = 0;                 /* Current size of blob in ckpt */
+    CkptBuffer ckpt;              /* Used to build FREELIST blob */
+
+    nRet = (p->freelist.nEntry - pDb->nMaxFreelist);
+
+    memset(&ckpt, 0, sizeof(CkptBuffer));
+    ckpt.pEnv = pDb->pEnv;
+    for(i=p->freelist.nEntry-nRet; rc==LSM_OK && i<p->freelist.nEntry; i++){
+      FreelistEntry *pEntry = &p->freelist.aEntry[i];
+      ckptSetValue(&ckpt, iOut++, pEntry->iBlk, &rc);
+      ckptSetValue(&ckpt, iOut++, (pEntry->iId >> 32) & 0xFFFFFFFF, &rc);
+      ckptSetValue(&ckpt, iOut++, pEntry->iId & 0xFFFFFFFF, &rc);
+    }
+    ckptChangeEndianness(ckpt.aCkpt, iOut);
+
+    *ppVal = ckpt.aCkpt;
+    *pnVal = iOut*sizeof(u32);
+  }
+
+  *pnOvfl = nRet;
+  return rc;
+}
+
+/*
+** The connection must be the worker in order to call this function.
+**
+** True is returned if there are currently too many free-list entries
+** in-memory to store in a checkpoint. Before calling lsmCheckpointSaveWorker()
+** to save the current worker snapshot, a new top-level LSM segment must
+** be created so that some of them can be written to the LSM. 
+*/
+int lsmCheckpointOverflowRequired(lsm_db *pDb){
+  assert( lsmShmAssertWorker(pDb) );     /* Caller must be the worker */
+  return (pDb->nMaxFreelist < pDb->pWorker->freelist.nEntry);
+}
+
+/*
+** Connection pDb must be the worker to call this function.
+**
+** Load the FREELIST record from the database. Decode it and append the
+** results to list pFreelist.
+*/
+int lsmCheckpointOverflowLoad(
+  lsm_db *pDb,                    /* Database handle (must be the worker) */
+  Freelist *pFreelist             /* IN/OUT: List to merge the record into */
+){
+  int rc;                         /* Return code */
+  int nVal = 0;                   /* Size of pVal in bytes */
+  void *pVal = 0;                 /* Blob loaded from the LSM (freed below) */
+  assert( lsmShmAssertWorker(pDb) );
+
+  /* Load the blob of data from the LSM. If that is successful (and the
+  ** blob is greater than zero bytes in size), decode the contents and
+  ** merge them into the current contents of *pFreelist.  */
+  rc = lsmSortedLoadFreelist(pDb, &pVal, &nVal);
+  if( pVal ){
+    u32 *aFree = (u32 *)pVal;
+    int nFree = nVal / sizeof(u32);    /* Blob is an array of u32, not int */
+    ckptChangeEndianness(aFree, nFree);
+    if( (nFree % 3) ){            /* Each entry is 3 integers: blk, id-hi, id-lo */
+      rc = LSM_CORRUPT_BKPT;
+    }else{
+      int iNew = 0;               /* Offset of next element in aFree[] */
+      int iOld = 0;               /* Next element in freelist fl */
+      Freelist fl = *pFreelist;   /* Original contents of *pFreelist */
+
+      memset(pFreelist, 0, sizeof(Freelist));
+      while( rc==LSM_OK && (iNew<nFree || iOld<fl.nEntry) ){
+        int iBlk;
+        i64 iId;
+        /* Take the next entry from whichever input has the smaller id */
+        if( iOld>=fl.nEntry ){
+          iBlk = aFree[iNew];
+          iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2];
+          iNew += 3;
+        }else if( iNew>=nFree ){
+          iBlk = fl.aEntry[iOld].iBlk;
+          iId = fl.aEntry[iOld].iId;
+          iOld += 1;
+        }else{
+          iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2];
+          if( iId<fl.aEntry[iOld].iId ){
+            iBlk = aFree[iNew];
+            iNew += 3;
+          }else{
+            iBlk = fl.aEntry[iOld].iBlk;
+            iId = fl.aEntry[iOld].iId;
+            iOld += 1;
+          }
+        }
+
+        rc = lsmFreelistAppend(pDb->pEnv, pFreelist, iBlk, iId);
+      }
+      lsmFree(pDb->pEnv, fl.aEntry);   /* Old array has been superseded */
+
+#ifdef LSM_DEBUG
+      if( rc==LSM_OK ){
+        int i;
+        for(i=1; rc==LSM_OK && i<pFreelist->nEntry; i++){
+          assert( pFreelist->aEntry[i].iId >= pFreelist->aEntry[i-1].iId );
+        }
+        assert( pFreelist->nEntry==(fl.nEntry + nFree/3) );
+      }
+#endif
+    }
+
+    lsmFree(pDb->pEnv, pVal);
+  }
+
+  return rc;
+}
+
+/*
+** Read the checkpoint id from meta-page pPg.
+*/
+static i64 ckptLoadId(MetaPage *pPg){
+  i64 iMsw;                       /* Upper 32 bits of the checkpoint id */
+  int nData;                      /* Size of meta-page data (not used) */
+  u8 *aData;                      /* Raw contents of the meta-page */
+
+  if( pPg==0 ) return 0;          /* No page supplied: the id is 0 */
+  aData = lsmFsMetaPageData(pPg, &nData);
+  iMsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]);
+  return (iMsw << 32) + ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
+}
+
+/*
+** Return true if the buffer passed as an argument contains a valid
+** checkpoint.
+*/
+static int ckptChecksumOk(u32 *aCkpt){
+  u32 nCkpt = aCkpt[CKPT_HDR_NCKPT];   /* Size field taken from the header */
+  u32 cksum1;                     /* First checksum value computed below */
+  u32 cksum2;                     /* Second checksum value computed below */
+  /* Valid only if the size is in range and the last two integers match */
+  if( nCkpt<CKPT_HDR_NCKPT || nCkpt>(LSM_META_PAGE_SIZE)/sizeof(u32) ) return 0;
+  ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2);
+  return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]);
+}
+
+/*
+** Attempt to load a checkpoint from meta page pPg, which is page iMeta.
+**
+** This function is a no-op if *pRc is set to any value other than LSM_OK
+** when it is called. If an error occurs, *pRc is set to an LSM error code
+** before returning.
+**
+** If the checkpoint is successfully loaded, copy it to ShmHeader.aClient[],
+** ShmHeader.aWorker[] and pDb->aSnapshot[], set ShmHeader.iMetaPage to
+** iMeta and return 1. Or, if the checkpoint cannot be loaded (its size
+** field is out of range or the checksum does not compute), return 0.
+*/
+static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){
+  int bLoaded = 0;                /* Return value */
+  if( *pRc==LSM_OK ){
+    int rc = LSM_OK;              /* Error code */
+    u32 *aCkpt = 0;               /* Pointer to buffer containing checkpoint */
+    u32 nCkpt;                    /* Number of elements in aCkpt[] */
+    int nData;                    /* Bytes of data in aData[] */
+    u8 *aData;                    /* Meta page data */
+   
+    aData = lsmFsMetaPageData(pPg, &nData);
+    nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
+    if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){
+      aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc);
+    }
+    if( aCkpt ){
+      memcpy(aCkpt, aData, nCkpt*sizeof(u32));
+      ckptChangeEndianness(aCkpt, nCkpt);   /* Convert the private copy */
+      if( ckptChecksumOk(aCkpt) ){
+        ShmHeader *pShm = pDb->pShmhdr;
+        memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32));
+        memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32));
+        memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
+        pShm->iMetaPage = iMeta;
+        bLoaded = 1;
+      }
+    }
+
+    lsmFree(pDb->pEnv, aCkpt);    /* aCkpt is 0 here if nothing was allocated */
+    *pRc = rc;
+  }
+  return bLoaded;
+}
+
+/*
+** Initialize the shared-memory header with an empty snapshot. This function
+** is called when no valid snapshot can be found in the database header.
+*/
+static void ckptLoadEmpty(lsm_db *pDb){
+  u32 aCkpt[] = {
+    0,                  /* CKPT_HDR_ID_MSW */
+    10,                 /* CKPT_HDR_ID_LSW */
+    0,                  /* CKPT_HDR_NCKPT */
+    0,                  /* CKPT_HDR_NBLOCK */
+    0,                  /* CKPT_HDR_BLKSZ */
+    0,                  /* CKPT_HDR_NLEVEL */
+    0,                  /* CKPT_HDR_PGSZ */
+    0,                  /* CKPT_HDR_OVFL */
+    0, 0, 1234, 5678,   /* The log pointer and initial checksum */
+    0, 0, 0, 0,         /* The append list */
+    0,                  /* The free block list */
+    0, 0                /* Space for checksum values */
+  };
+  u32 nCkpt = array_size(aCkpt);
+  ShmHeader *pShm = pDb->pShmhdr;
+  /* Fill in the size and default block/page sizes, then the checksum */
+  aCkpt[CKPT_HDR_NCKPT] = nCkpt;
+  aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
+  aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;
+  ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]);
+  /* Install it as the client, worker and connection-local snapshot */
+  memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32));
+  memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32));
+  memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
+}
+
+/*
+** This function is called as part of database recovery to initialize the
+** ShmHeader.aClient[] and ShmHeader.aWorker[] snapshots.
+*/
+int lsmCheckpointRecover(lsm_db *pDb){
+  int rc = LSM_OK;                /* Return Code */
+  i64 iId1;                       /* Id of checkpoint on meta-page 1 */
+  i64 iId2;                       /* Id of checkpoint on meta-page 2 */
+  int bLoaded = 0;                /* True once checkpoint has been loaded */
+  int cmp;                        /* True if (iId2>iId1) */
+  MetaPage *apPg[2] = {0, 0};     /* Meta-pages 1 and 2 */
+  /* Fetch meta-pages 1 and 2; page 2 is not fetched if page 1 fails */
+  rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
+  if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
+  /* Try the page with the larger id first; on a tie meta-page 1 is first */
+  iId1 = ckptLoadId(apPg[0]);
+  iId2 = ckptLoadId(apPg[1]);
+  cmp = (iId2 > iId1);
+  bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc);
+  if( bLoaded==0 ){
+    bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc);
+  }
+
+  /* No checkpoint was loaded (none is valid, or an error occurred).
+  ** Initialize the shared memory header with an empty checkpoint.  */
+  if( bLoaded==0 ){
+    ckptLoadEmpty(pDb);
+  }
+
+  lsmFsMetaPageRelease(apPg[0]);
+  lsmFsMetaPageRelease(apPg[1]);
+
+  return rc;
+}
+
+/* 
+** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta.
+*/
+int lsmCheckpointStore(lsm_db *pDb, int iMeta){
+  MetaPage *pPg = 0;              /* Meta-page iMeta */
+  int rc;                         /* Return code */
+
+  assert( iMeta==1 || iMeta==2 );
+  rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg);
+  if( rc==LSM_OK ){
+    u8 *aData;                    /* Data buffer of the meta-page */
+    int nData;                    /* Size of aData[] (not otherwise used) */
+    int nCkpt;                    /* Size of the snapshot in 32-bit integers */
+    /* Copy the snapshot into the page, then convert the page copy in place */
+    nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT];
+    aData = lsmFsMetaPageData(pPg, &nData);
+    memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32));
+    ckptChangeEndianness((u32 *)aData, nCkpt);
+    rc = lsmFsMetaPageRelease(pPg);   /* Result becomes the return value */
+  }
+      
+  return rc;
+}
+
+/*
+** Copy the current client snapshot from shared-memory to pDb->aSnapshot[].
+*/
+int lsmCheckpointLoad(lsm_db *pDb){
+  while( 1 ){
+    int rc;
+    int nInt;                     /* Snapshot size in 32-bit integers */
+    ShmHeader *pShm = pDb->pShmhdr;
+    /* First try a plain copy of aClient, made without taking any lock */
+    nInt = pShm->aClient[CKPT_HDR_NCKPT];
+    memcpy(pDb->aSnapshot, pShm->aClient, nInt*sizeof(u32));
+    if( ckptChecksumOk(pDb->aSnapshot) ) return LSM_OK;
+    /* Checksum failed: retry while holding the WORKER lock exclusively */
+    rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      usleep(50);                 /* Lock is busy: pause, then loop again */
+    }else{
+      if( rc==LSM_OK ){
+        if( ckptChecksumOk(pShm->aClient)==0 ){
+          nInt = pShm->aWorker[CKPT_HDR_NCKPT];   /* Repair aClient from aWorker */
+          memcpy(pShm->aClient, pShm->aWorker, nInt*sizeof(u32));
+        }
+        nInt = pShm->aClient[CKPT_HDR_NCKPT];
+        memcpy(pDb->aSnapshot, &pShm->aClient, nInt*sizeof(u32));
+        lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
+        /* Still invalid after the repair attempt: report corruption */
+        if( ckptChecksumOk(pDb->aSnapshot)==0 ){
+          rc = LSM_CORRUPT_BKPT;
+        }
+      }
+      return rc;
+    }
+  }
+}
+
+int lsmCheckpointLoadWorker(lsm_db *pDb){
+  int rc;                         /* Return code */
+  ShmHeader *pShm = pDb->pShmhdr;
+
+  /* Must be holding the WORKER lock to do this */
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
+  /* If aWorker fails its checksum, overwrite it with a copy of aClient */
+  if( ckptChecksumOk(pShm->aWorker)==0 ){
+    int nInt = (int)pShm->aClient[CKPT_HDR_NCKPT];
+    memcpy(pShm->aWorker, pShm->aClient, nInt*sizeof(u32));
+    if( ckptChecksumOk(pShm->aWorker)==0 ) return LSM_CORRUPT_BKPT;
+  }
+  /* Deserialize aWorker, free-list included, into pDb->pWorker */
+  rc = lsmCheckpointDeserialize(pDb, 1, pShm->aWorker, &pDb->pWorker);
+  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
+  return rc;
+}
+
+int lsmCheckpointDeserialize(
+  lsm_db *pDb, 
+  int bInclFreelist,              /* If true, deserialize free-list */
+  u32 *aCkpt, 
+  Snapshot **ppSnap
+){
+  int rc = LSM_OK;
+  Snapshot *pNew;                 /* New snapshot; 0 is returned on error */
+
+  pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc);
+  if( rc==LSM_OK ){
+    int nFree;                    /* Number of free-list entries in aCkpt[] */
+    int nCopy;                    /* Size of the append-list in bytes */
+    int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL];
+    int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE;
+    /* Id and block count come from the header; level data starts at iIn */
+    pNew->iId = lsmCheckpointId(aCkpt, 0);
+    pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK];
+    rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel);
+
+    /* Make a copy of the append-list */
+    nCopy = sizeof(u32) * LSM_APPLIST_SZ;
+    memcpy(pNew->aiAppend, &aCkpt[CKPT_HDR_SIZE+CKPT_LOGPTR_SIZE], nCopy);
+
+    /* Copy the free-list */
+    if( bInclFreelist ){
+      pNew->nFreelistOvfl = aCkpt[CKPT_HDR_OVFL];
+      nFree = aCkpt[iIn++];
+      if( nFree ){
+        pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc(
+            pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc
+        );
+        if( rc==LSM_OK ){
+          int i;
+          for(i=0; i<nFree; i++){   /* 3 integers per entry: blk, id-hi, id-lo */
+            FreelistEntry *p = &pNew->freelist.aEntry[i];
+            p->iBlk = aCkpt[iIn++];
+            p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1];
+            iIn += 2;
+          }
+          pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree;
+        }
+      }
+    }
+  }
+  /* On any error discard the partially built snapshot and return 0 */
+  if( rc!=LSM_OK ){
+    lsmFreeSnapshot(pDb->pEnv, pNew);
+    pNew = 0;
+  }
+
+  *ppSnap = pNew;
+  return rc;
+}
+
+/*
+** Connection pDb must be the worker connection in order to call this
+** function. It returns true if the database already contains the maximum
+** number of levels or false otherwise.
+**
+** This is used when flushing the in-memory tree to disk. If the database
+** is already full, then the caller should invoke lsm_work() or similar
+** until it is not full before creating a new level by flushing the in-memory
+** tree to disk. Limiting the number of levels in the database ensures that
+** the records describing them always fit within the checkpoint blob.
+*/
+int lsmDatabaseFull(lsm_db *pDb){
+  int nSeg = 0;                   /* Running total over all levels */
+  Level *pLvl;                    /* Level currently being counted */
+
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
+  assert( pDb->pWorker );
+  pLvl = pDb->pWorker->pLevel;
+  while( pLvl ){
+    nSeg += (pLvl->nRight==0) ? 1 : pLvl->nRight;   /* At least 1 per level */
+    pLvl = pLvl->pNext;
+  }
+  return !(nSeg < LSM_MAX_RHS_SEGMENTS);
+}
+
+/*
+** The connection passed as the only argument is currently the worker
+** connection. Some work has been performed on the database by the connection,
+** but no new snapshot has been written into shared memory.
+**
+** This function updates the shared-memory worker and client snapshots with
+** the new snapshot produced by the work performed by pDb.
+**
+** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM
+** error code is returned.
+*/
+int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){
+  Snapshot *pSnap = pDb->pWorker;
+  ShmHeader *pShm = pDb->pShmhdr;
+  void *p = 0;                    /* Serialized snapshot (freed below) */
+  int n = 0;                      /* Size of p in bytes */
+  int rc;
+  /* Serialize the worker snapshot; pSnap->iId+1 is passed as the id */
+  rc = ckptExportSnapshot(pDb, nOvfl, bFlush, pSnap->iId+1, 1, &p, &n);
+  if( rc!=LSM_OK ) return rc;
+  assert( ckptChecksumOk((u32 *)p) );
+  /* Write aWorker first, then a barrier, then aClient */
+  assert( n<=LSM_META_PAGE_SIZE );
+  memcpy(pShm->aWorker, p, n);
+  lsmShmBarrier(pDb);
+  memcpy(pShm->aClient, p, n);
+  lsmFree(pDb->pEnv, p);
+
+  return LSM_OK;
+}
+
+/* Read the meta-page named by ShmHeader.iMetaPage; if it holds a valid
+** checkpoint, store its id in *piId (otherwise *piId is not written).
+** Returns LSM_BUSY if iMetaPage kept changing, else LSM_OK or an error. */
+int lsmCheckpointSynced(lsm_db *pDb, i64 *piId){
+  int rc = LSM_OK;
+  const int nAttempt = 3;         /* Max re-reads if iMetaPage changes */
+  int i;
+  for(i=0; i<nAttempt; i++){
+    MetaPage *pPg;
+    u32 iMeta = pDb->pShmhdr->iMetaPage;
+    rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
+    if( rc==LSM_OK ){
+      int nCkpt, nData;
+      u8 *aData = lsmFsMetaPageData(pPg, &nData);
+
+      assert( nData==LSM_META_PAGE_SIZE );
+      nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
+      /* ckptChecksumOk() reads aCopy[CKPT_HDR_NCKPT]: copy must cover it */
+      if( nCkpt>CKPT_HDR_NCKPT
+       && nCkpt<=(int)(LSM_META_PAGE_SIZE/sizeof(u32)) ){
+        u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
+        if( aCopy ){
+          memcpy(aCopy, aData, nCkpt*sizeof(u32));
+          ckptChangeEndianness(aCopy, nCkpt);
+          if( ckptChecksumOk(aCopy) ){
+            *piId = lsmCheckpointId(aCopy, 0);
+          }
+          lsmFree(pDb->pEnv, aCopy);
+        }
+      }
+      lsmFsMetaPageRelease(pPg);
+    }
+    if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage==iMeta ) break;
+  }
+  /* LSM_BUSY only if every attempt raced; otherwise propagate rc as is */
+  return (rc==LSM_OK && i==nAttempt) ? LSM_BUSY : rc;
 }
 
 /*
-** Attempt to read a checkpoint from the database header. If an error
-** occurs, return an error code. Otherwise, return LSM_OK and, if 
-** a checkpoint is successfully loaded, populate the shared database 
-** structure.
-**
-** If a checkpoint is loaded, set *piSlot to the page number of the 
-** meta-page from which it is read (either 1 or 2). Or, if a checkpoint
-** cannot be loaded, set *piSlot to 0. 
-**
-** If a checkpoint is loaded and it indicates that the LEVELS and FREELIST 
-** records are present in the top-level segment *pbOvfl is set to true 
-** before returning. Otherwise, it is set to false.
+** Return the checkpoint-id of the checkpoint array passed as the first
+** argument to this function. If the second argument is true, then assume
+** that the checkpoint is made up of 32-bit big-endian integers. If it
+** is false, assume that the integers are in machine byte order.
 */
-int lsmCheckpointRead(lsm_db *pDb, int *piSlot, int *pbOvfl){
-  int rc = LSM_OK;                /* Return Code */
-  i64 iId1;
-  i64 iId2;
-  int nInt1;
-  int nInt2;
-  int bLoaded = 0;
-  int iSlot = 0;
-
-  iId1 = ckptReadId(pDb, 1, &nInt1, &rc);
-  iId2 = ckptReadId(pDb, 2, &nInt2, &rc);
-
-  *pbOvfl = 0;
-  if( iId1>=iId2 ){
-    bLoaded = ckptTryRead(pDb, 1, nInt1, pbOvfl, &rc);
-    if( bLoaded ) iSlot = 1;
-    if( bLoaded==0 ){
-      bLoaded = ckptTryRead(pDb, 2, nInt2, pbOvfl, &rc);
-      if( bLoaded ) iSlot = 2;
-    }
+i64 lsmCheckpointId(u32 *aCkpt, int bDisk){
+  i64 iId;
+  if( bDisk ){
+    u8 *aData = (u8 *)aCkpt;
+    iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32);
+    iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]));
   }else{
-    bLoaded = ckptTryRead(pDb, 2, nInt2, pbOvfl, &rc);
-    if( bLoaded ) iSlot = 2;
-    if( bLoaded==0 ){
-      bLoaded = ckptTryRead(pDb, 1, nInt1, pbOvfl, &rc);
-      if( bLoaded ) iSlot = 1;
-    }
-  }
-
-  *piSlot = iSlot;
-  return rc;
+    iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW];
+  }
+  return iId;
+}
+
+i64 lsmCheckpointLogOffset(u32 *aCkpt){   /* 64-bit value from two header words */
+  return (i64)aCkpt[CKPT_HDR_LO_LSW] + (((i64)aCkpt[CKPT_HDR_LO_MSW]) << 32);
+}
+/* Return the CKPT_HDR_PGSZ field of checkpoint aCkpt[], cast to int. */
+int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; }
+/* Return the CKPT_HDR_BLKSZ field of checkpoint aCkpt[], cast to int. */
+int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; }
+
+/* Copy the log offset and the two log checksum values held in checkpoint
+** aCkpt[] into *pLog (fields aRegion[2].iStart, cksum0 and cksum1). */
+void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog){
+  i64 iHigh = (i64)aCkpt[CKPT_HDR_LO_MSW];    /* Upper 32 bits of offset */
+  i64 iLow = (i64)aCkpt[CKPT_HDR_LO_LSW];     /* Lower 32 bits of offset */
+
+  pLog->aRegion[2].iStart = (iHigh << 32) + iLow;
+  pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1];
+  pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2];
+}
+
+void lsmCheckpointZeroLogoffset(lsm_db *pDb){
+  u32 nCkpt;                      /* Size of pDb->aSnapshot[] in integers */
+
+  nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT];
+  assert( nCkpt>CKPT_HDR_NCKPT );
+  assert( nCkpt==pDb->pShmhdr->aClient[CKPT_HDR_NCKPT] );
+  assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aClient, nCkpt*sizeof(u32)) );
+  assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aWorker, nCkpt*sizeof(u32)) );
+  /* Zero the log offset, then recompute the two trailing checksum words */
+  pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0;
+  pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0;
+  ckptChecksum(pDb->aSnapshot, nCkpt, 
+      &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1]
+  );
+  /* Copy the updated snapshot over both shared-memory snapshots */
+  memcpy(pDb->pShmhdr->aClient, pDb->aSnapshot, nCkpt*sizeof(u32));
+  memcpy(pDb->pShmhdr->aWorker, pDb->aSnapshot, nCkpt*sizeof(u32));
 }
 

Index: src/lsm_file.c
==================================================================
--- src/lsm_file.c
+++ src/lsm_file.c
@@ -31,21 +31,22 @@
 **   is page 33.
 **
 **   It is assumed that the first two meta pages and the data that follows
 **   them are located on different disk sectors. So that if a power failure 
 **   while writing to a meta page there is no risk of damage to the other
-**   meta page or any other part of the database file.
+**   meta page or any other part of the database file. TODO: This may need
+**   to be revisited.
 **
 ** Blocks:
 **
 **   The database file is also divided into blocks. The default block size is
 **   2MB. When writing to the database file, an attempt is made to write data
 **   in contiguous block-sized chunks.
 **
 **   The first and last page on each block are special in that they are 4 
 **   bytes smaller than all other pages. This is because the last four bytes 
-**   of space on the first and last pages of each block are reserved for a 
+**   of space on the first and last pages of each block are reserved for
 **   pointers to other blocks (i.e. a 32-bit block number).
 **
 ** Runs:
 **
 **   A run is a sequence of pages that the upper layer uses to store a 
@@ -75,10 +76,11 @@
 ** This file opens and closes the log file. But it does not contain any
 ** logic related to the log file format. Instead, it exports the following
 ** functions that are used by the code in lsm_log.c to read and write the
 ** log file:
 **
+**     lsmFsOpenLog
 **     lsmFsWriteLog
 **     lsmFsSyncLog
 **     lsmFsReadLog
 **     lsmFsTruncateLog
 **     lsmFsCloseAndDeleteLog
@@ -111,15 +113,17 @@
 */
 struct FileSystem {
   lsm_db *pDb;                    /* Database handle that owns this object */
   lsm_env *pEnv;                  /* Environment pointer */
   char *zDb;                      /* Database file name */
+  char *zLog;                     /* Log file name */
   int nMetasize;                  /* Size of meta pages in bytes */
   int nPagesize;                  /* Database page-size in bytes */
   int nBlocksize;                 /* Database block-size in bytes */
 
   /* r/w file descriptors for both files. */
+  LsmFile *pLsmFile;
   lsm_file *fdDb;                 /* Database file */
   lsm_file *fdLog;                /* Log file */
 
   /* mmap() mode things */
   int bUseMmap;                   /* True to use mmap() to access db file */
@@ -191,11 +195,11 @@
 **     lsmEnvClose()
 **     lsmEnvTruncate()
 **     lsmEnvUnlink()
 **     lsmEnvRemap()
 */
-static int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){
+int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){
   return pEnv->xOpen(pEnv, zFile, ppNew);
 }
 static int lsmEnvRead(
   lsm_env *pEnv, 
   lsm_file *pFile, 
@@ -218,11 +222,11 @@
   return pEnv->xSync(pFile);
 }
 static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
   return pEnv->xSectorSize(pFile);
 }
-static int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
+int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
   return pEnv->xClose(pFile);
 }
 static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
   return pEnv->xTruncate(pFile, nByte);
 }
@@ -236,32 +240,59 @@
   void **ppMap,
   i64 *pszMap
 ){
   return pEnv->xRemap(pFile, szMin, ppMap, pszMap);
 }
+
+int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
+  /* Without a file handle xLock() is not called; LSM_OK is returned */
+  return pFile ? pEnv->xLock(pFile, iLock, eLock) : LSM_OK;
+}
+
+/* Thin wrapper: invoke pEnv->xShmMap() and pass its result back. */
+int lsmEnvShmMap(
+  lsm_env *pEnv, lsm_file *pFile,     /* Environment and file handle */
+  int iChunk, int sz,                 /* Forwarded to xShmMap() as-is */
+  void **ppOut                        /* Forwarded to xShmMap() as-is */
+){
+  int rc = pEnv->xShmMap(pFile, iChunk, sz, ppOut);
+  return rc;
+}
+
+void lsmEnvShmBarrier(lsm_env *pEnv){   /* Invoke the xShmBarrier() method */
+  pEnv->xShmBarrier();            /* void function: no "return <expr>" */
+}
+
+void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
+  pEnv->xShmUnmap(pFile, bDel);   /* void function: no "return <expr>" */
+}
+
 
 /*
 ** Write the contents of string buffer pStr into the log file, starting at
 ** offset iOff.
 */
 int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
+  assert( pFS->fdLog );
   return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n);
 }
 
 /*
 ** fsync() the log file.
 */
 int lsmFsSyncLog(FileSystem *pFS){
+  assert( pFS->fdLog );
   return lsmEnvSync(pFS->pEnv, pFS->fdLog);
 }
 
 /*
-** Read nRead bytes of data starting at offset iOff of the log file. Store
-** the results in string buffer pStr.
+** Read nRead bytes of data starting at offset iOff of the log file. Append
+** the results to string buffer pStr.
 */
 int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
   int rc;                         /* Return code */
+  assert( pFS->fdLog );
   rc = lsmStringExtend(pStr, nRead);
   if( rc==LSM_OK ){
     rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
     pStr->n += nRead;
   }
@@ -310,54 +341,70 @@
   int bLog,                       /* True for log, false for db */
   int *pRc                        /* IN/OUT: Error code */
 ){
   lsm_file *pFile = 0;
   if( *pRc==LSM_OK ){
-    char *zName;
-    zName = lsmMallocPrintf(pFS->pEnv, "%s%s", pFS->zDb, (bLog ? "-log" : ""));
-    if( !zName ){
-      *pRc = LSM_NOMEM;
-    }else{
-      *pRc = lsmEnvOpen(pFS->pEnv, zName, &pFile);
-    }
-    lsmFree(pFS->pEnv, zName);
+    *pRc = lsmEnvOpen(pFS->pEnv, (bLog ? pFS->zLog : pFS->zDb), &pFile);
   }
   return pFile;
 }
+
+/*
+** If it is not already open, this function opens the log file. It returns
+** LSM_OK if successful (or if the log file was already open) or an LSM
+** error code otherwise.
+**
+** The log file must be opened before any of the following may be called:
+**
+**     lsmFsWriteLog
+**     lsmFsSyncLog
+**     lsmFsReadLog
+*/
+int lsmFsOpenLog(FileSystem *pFS){
+  int rc = LSM_OK;                /* Left as LSM_OK if fdLog is already set */
+  if( pFS->fdLog==0 ) pFS->fdLog = fsOpenFile(pFS, 1, &rc);   /* bLog=1 */
+  return rc;
+}
 
 /*
 ** Open a connection to a database stored within the file-system (the
 ** "system of files").
 */
 int lsmFsOpen(lsm_db *pDb, const char *zDb){
   FileSystem *pFS;
   int rc = LSM_OK;
+  int nDb = strlen(zDb);
+  int nByte;
 
   assert( pDb->pFS==0 );
   assert( pDb->pWorker==0 && pDb->pClient==0 );
 
-  pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, sizeof(FileSystem), &rc);
+  nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
+  pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
   if( pFS ){
+    pFS->zDb = (char *)&pFS[1];
+    pFS->zLog = &pFS->zDb[nDb+1];
     pFS->nPagesize = LSM_PAGE_SIZE;
     pFS->nBlocksize = LSM_BLOCK_SIZE;
     pFS->nMetasize = 4 * 1024;
     pFS->pDb = pDb;
     pFS->pEnv = pDb->pEnv;
 
-    /* Make a copy of the database name. */
-    pFS->zDb = lsmMallocStrdup(pDb->pEnv, zDb);
-    if( pFS->zDb==0 ) rc = LSM_NOMEM;
+    /* Make a copy of the database and log file names. */
+    memcpy(pFS->zDb, zDb, nDb+1);
+    memcpy(pFS->zLog, zDb, nDb);
+    memcpy(&pFS->zLog[nDb], "-log", 5);
 
     /* Allocate the hash-table here. At some point, it should be changed
     ** so that it can grow dynamicly. */
     pFS->nCacheMax = 2048;
     pFS->nHash = 4096;
     pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
+    pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
 
-    /* Open the files */
+    /* Open the database file */
     pFS->fdDb = fsOpenFile(pFS, 0, &rc);
-    pFS->fdLog = fsOpenFile(pFS, 1, &rc);
 
     if( rc!=LSM_OK ){
       lsmFsClose(pFS);
       pFS = 0;
     }
@@ -383,13 +430,20 @@
       lsmFree(pEnv, pPg);
       pPg = pNext;
     }
 
     if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
-    if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
+    if( pFS->fdLog ){
+      if( lsmDbMultiProc(pFS->pDb) ){
+        lsmDbDeferredClose(pFS->pDb, pFS->fdLog, pFS->pLsmFile);
+        pFS->pLsmFile = 0;
+      }else{
+        lsmEnvClose(pFS->pEnv, pFS->fdLog );
+      }
+    }
+    lsmFree(pEnv, pFS->pLsmFile);
 
-    lsmFree(pEnv, pFS->zDb);
     lsmFree(pEnv, pFS->apHash);
     lsmFree(pEnv, pFS);
   }
 }
 
@@ -625,17 +679,17 @@
   int *pRc
 ){
   if( *pRc==LSM_OK && iSz>pFS->nMap ){
     Page *pFix;
     int rc;
+    u8 *aOld = pFS->pMap;
     rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
     if( rc==LSM_OK ){
       u8 *aData = (u8 *)pFS->pMap;
       for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){
         pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)];
       }
-
       lsmSortedRemap(pFS->pDb);
     }
     *pRc = rc;
   }
 }
@@ -781,15 +835,15 @@
   int iBlk
 ){
   int rc = LSM_OK;                /* Return code */
   int iFirst;                     /* First page on block iBlk */
   int iLast;                      /* Last page on block iBlk */
-  int i;                          /* Used to iterate through append points */
   Level *pLevel;                  /* Used to iterate through levels */
 
-  Pgno *aAppend;
-  int nAppend;
+  int iIn;                        /* Used to iterate through append points */
+  int iOut = 0;                   /* Used to output append points */
+  u32 *aApp = pSnapshot->aiAppend;
 
   iFirst = fsFirstPageOnBlock(pFS, iBlk);
   iLast = fsLastPageOnBlock(pFS, iBlk);
 
   /* Check if any other run in the snapshot has a start or end page 
@@ -798,17 +852,16 @@
     if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
       return LSM_OK;
     }
   }
 
-  aAppend = lsmSharedAppendList(pFS->pDb, &nAppend);
-  for(i=0; i<nAppend; i++){
-    if( aAppend[i]>=iFirst && aAppend[i]<=iLast ){
-      lsmSharedAppendListRemove(pFS->pDb, i);
-      break;
+  for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
+    if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
+      aApp[iOut++] = aApp[iIn];
     }
   }
+  while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
 
   if( rc==LSM_OK ){
     rc = lsmBlockFree(pFS->pDb, iBlk);
   }
   return rc;
@@ -933,116 +986,19 @@
   }
 
   return fsPageGet(pFS, iPg, 0, ppNext);
 }
 
-static Pgno findAppendPoint(FileSystem *pFS, int nMin){
-  Pgno ret = 0;
-  Pgno *aAppend;
-  int nAppend;
-  int i;
-
-  aAppend = lsmSharedAppendList(pFS->pDb, &nAppend);
-#if 1
-  for(i=nAppend-1; i>=0; i--){
-#else
-  for(i=0; i<nAppend; i++){
-#endif
-    Pgno iLastOnBlock;
-    iLastOnBlock = fsLastPageOnBlock(pFS, fsPageToBlock(pFS, aAppend[i]));
-    if( (iLastOnBlock - aAppend[i])>=nMin ){
-      ret = aAppend[i];
-      lsmSharedAppendListRemove(pFS->pDb, i);
-      break;
-    }
-  }
-
-  return ret;
-}
-
-static void addAppendPoint(
-  lsm_db *db, 
-  Pgno iLast,
-  int *pRc                        /* IN/OUT: Error code */
-){
-  if( *pRc==LSM_OK && iLast>0 ){
-    FileSystem *pFS = db->pFS;
-
-    Pgno *aPoint;
-    int nPoint;
-    int i;
-    int iBlk;
-    int bLast;
-
-    iBlk = fsPageToBlock(pFS, iLast);
-    bLast = (iLast==fsLastPageOnBlock(pFS, iBlk));
-
-    aPoint = lsmSharedAppendList(db, &nPoint);
-    for(i=0; i<nPoint; i++){
-      if( iBlk==fsPageToBlock(pFS, aPoint[i]) ){
-        if( bLast ){
-          lsmSharedAppendListRemove(db, i);
-        }else if( iLast>=aPoint[i] ){
-          aPoint[i] = iLast+1;
-        }
-        return;
-      }
-    }
-
-    if( bLast==0 ){
-      *pRc = lsmSharedAppendListAdd(db, iLast+1);
-    }
-  }
-}
-
-static void subAppendPoint(lsm_db *db, Pgno iFirst){
-  if( iFirst>0 ){
-    FileSystem *pFS = db->pFS;
-    Pgno *aPoint;
-    int nPoint;
-    int i;
-    int iBlk;
-
-    iBlk = fsPageToBlock(pFS, iFirst);
-    aPoint = lsmSharedAppendList(db, &nPoint);
-    for(i=0; i<nPoint; i++){
-      if( iBlk==fsPageToBlock(pFS, aPoint[i]) ){
-        if( iFirst>=aPoint[i] ) lsmSharedAppendListRemove(db, i);
-        return;
-      }
-    }
-  }
-}
-
-int lsmFsSetupAppendList(lsm_db *db){
-  int rc = LSM_OK;
-  Level *pLvl;
-
-  assert( db->pWorker );
-  for(pLvl=lsmDbSnapshotLevel(db->pWorker); 
-      rc==LSM_OK && pLvl; 
-      pLvl=pLvl->pNext
-  ){
-    if( pLvl->nRight==0 ){
-      addAppendPoint(db, pLvl->lhs.iLast, &rc);
-    }else{
-      int i;
-      for(i=0; i<pLvl->nRight; i++){
-        addAppendPoint(db, pLvl->aRhs[i].iLast, &rc);
-      }
-    }
-  }
-
-  for(pLvl=lsmDbSnapshotLevel(db->pWorker); pLvl; pLvl=pLvl->pNext){
-    int i;
-    subAppendPoint(db, pLvl->lhs.iFirst);
-    for(i=0; i<pLvl->nRight; i++){
-      subAppendPoint(db, pLvl->aRhs[i].iFirst);
-    }
-  }
-
-  return rc;
+static Pgno findAppendPoint(FileSystem *pFS){
+  int i;
+  u32 *aiAppend = pFS->pDb->pWorker->aiAppend;
+  u32 iRet = 0;
+  /* Claim the highest-indexed non-zero append point, zeroing its slot. */
+  for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
+    if( (iRet = aiAppend[i]) ) aiAppend[i] = 0;
+  }
+  return iRet;                    /* 0 if every slot was empty */
 }
 
 /*
 ** Append a page to file iFile. Return a reference to it. lsmFsPageWrite()
 ** has already been called on the returned reference.
@@ -1059,11 +1015,11 @@
   int iApp = 0;
   int iNext = 0;
   int iPrev = p->iLast;
 
   if( iPrev==0 ){
-    iApp = findAppendPoint(pFS, 0);
+    iApp = findAppendPoint(pFS);
   }else if( fsIsLast(pFS, iPrev) ){
     Page *pLast = 0;
     rc = fsPageGet(pFS, iPrev, 0, &pLast);
     if( rc!=LSM_OK ) return rc;
     iApp = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
@@ -1133,11 +1089,18 @@
         int iBlk = fsPageToBlock(pFS, iPg);
         lsmBlockRefree(pFS->pDb, iBlk);
         lsmFsPageRelease(pLast);
       }
     }else{
-      rc = lsmSharedAppendListAdd(pFS->pDb, p->iLast+1);
+      int i;
+      u32 *aiAppend = pFS->pDb->pWorker->aiAppend;
+      for(i=0; i<LSM_APPLIST_SZ; i++){
+        if( aiAppend[i]==0 ){
+          aiAppend[i] = p->iLast+1;
+          break;
+        }
+      }
     }
   }
   return rc;
 }
 
@@ -1401,21 +1364,24 @@
 ** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
 */
 int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){
   int rc = LSM_OK;
   Snapshot *pWorker;              /* Worker snapshot */
-  Snapshot *pRelease = 0;         /* Snapshot to release */
   Segment *pArray = 0;            /* Array to report on */
   Level *pLvl;                    /* Used to iterate through db levels */
+  int bUnlock = 0;
 
   *pzOut = 0;
   if( iFirst==0 ) return LSM_ERROR;
 
   /* Obtain the worker snapshot */
   pWorker = pDb->pWorker;
   if( !pWorker ){
-    pRelease = pWorker = lsmDbSnapshotWorker(pDb);
+    rc = lsmBeginWork(pDb);
+    if( rc!=LSM_OK ) return rc;
+    pWorker = pDb->pWorker;
+    bUnlock = 1;
   }
 
   /* Search for the array that starts on page iFirst */
   for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pArray==0; pLvl=pLvl->pNext){
     if( 0==(pArray = startsWith(&pLvl->lhs, iFirst)) ){
@@ -1449,52 +1415,50 @@
     lsmStringAppendf(&str, " %d", pArray->iLast);
 
     *pzOut = str.z;
   }
 
-  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
+  if( bUnlock ){
+    int rcwork = LSM_BUSY;
+    lsmFinishWork(pDb, 0, 0, &rcwork);
+  }
   return rc;
 }
 
-#ifdef LSM_EXPENSIVE_DEBUG
 /*
 ** Helper function for lsmFsIntegrityCheck()
 */
 static void checkBlocks(
   FileSystem *pFS, 
-  Segment *pSeg, 
-  int bExtra,
+  Segment *pSeg,
+  int bExtra,                     /* If true, count the "next" block if any */
+  int nUsed,                      /* Number of entries in aUsed[] */
   u8 *aUsed
 ){
   if( pSeg ){
-    int i;
-    for(i=0; i<2; i++){
-      Segment *p = (i ? pSeg->pRun : pSeg->pSep);
-
-      if( p && p->nSize>0 ){
-        const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
-
-        int iBlk;
-        int iLastBlk;
-        iBlk = fsPageToBlock(pFS, p->iFirst);
-        iLastBlk = fsPageToBlock(pFS, p->iLast);
-
-        while( iBlk ){
-          assert( iBlk<=pFS->nBlock );
-          /* assert( aUsed[iBlk-1]==0 ); */
-          aUsed[iBlk-1] = 1;
-          if( iBlk!=iLastBlk ){
-            fsBlockNext(pFS, iBlk, &iBlk);
-          }else{
-            iBlk = 0;
-          }
-        }
-
-        if( bExtra && (p->iLast % nPagePerBlock)==0 ){
-          fsBlockNext(pFS, iLastBlk, &iBlk);
-          aUsed[iBlk-1] = 1;
-        }
+    if( pSeg->nSize>0 ){            /* pSeg is known non-NULL here */
+      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+
+      int iBlk;
+      int iLastBlk;
+      iBlk = fsPageToBlock(pFS, pSeg->iFirst);
+      iLastBlk = fsPageToBlock(pFS, pSeg->iLast);
+
+      while( iBlk ){
+        assert( iBlk<=nUsed );
+        /* assert( aUsed[iBlk-1]==0 ); */
+        aUsed[iBlk-1] = 1;
+        if( iBlk!=iLastBlk ){
+          fsBlockNext(pFS, iBlk, &iBlk);
+        }else{
+          iBlk = 0;                 /* Last block of the segment: stop */
+        }
+      }
+
+      if( bExtra && (pSeg->iLast % nPagePerBlock)==0 ){
+        fsBlockNext(pFS, iLastBlk, &iBlk);  /* iBlk is 0 on entry to this call */
+        if( iBlk>0 && iBlk<=nUsed ) aUsed[iBlk-1] = 1;  /* skip if no next block */
       }
     }
   }
 }
 
@@ -1501,51 +1465,65 @@
 /*
 ** This function checks that all blocks in the database file are accounted
 ** for. For each block, exactly one of the following must be true:
 **
 **   + the block is part of a sorted run, or
-**   + the block is on the lPending list, or
-**   + the block is on the lFree list
+**   + the block is on the free-block list
 **
 ** This function also checks that there are no references to blocks with
 ** out-of-range block numbers.
 **
 ** If no errors are found, non-zero is returned. If an error is found, an
 ** assert() fails.
 */
 int lsmFsIntegrityCheck(lsm_db *pDb){
-  int nBlock;
-  int i;
-  FileSystem *pFS = pDb->pFS;
-  u8 *aUsed;
-  Level *pLevel;
-
-  nBlock = pFS->nBlock;
-  aUsed = lsmMallocZero(pDb->pEnv, nBlock);
-  assert( aUsed );
-
-  for(pLevel=pDb->pLevel; pLevel; pLevel=pLevel->pNext){
-    int i;
-    checkBlocks(pFS, &pLevel->lhs, (pLevel->pSMerger!=0), aUsed);
-
-    for(i=0; i<pLevel->nRight; i++){
-      checkBlocks(pFS, &pLevel->aRhs[i], 0, aUsed);
-    }
-  }
-
-  for(i=0; i<pFS->lFree.n; i++){
-    int iBlk = pFS->lFree.a[i];
-    assert( aUsed[iBlk-1]==0 );
-    aUsed[iBlk-1] = 1;
-  }
-  for(i=0; i<pFS->lPending.n; i++){
-    int iBlk = pFS->lPending.a[i];
-    assert( aUsed[iBlk-1]==0 );
-    aUsed[iBlk-1] = 1;
+  int i;
+  int j;
+  Freelist freelist = {0, 0, 0};  /* Overflow free-list entries, if any */
+  FileSystem *pFS = pDb->pFS;
+  u8 *aUsed;                      /* One flag per block; set once accounted for */
+  Level *pLevel;
+  Snapshot *pWorker = pDb->pWorker;
+  int nBlock = pWorker->nBlock;
+
+  aUsed = lsmMallocZero(pDb->pEnv, nBlock);
+  if( aUsed==0 ){
+    /* Malloc has failed. Since this function is only called within debug
+    ** builds, this probably means the user is running an OOM injection test.
+    ** Regardless, it will not be possible to run the integrity-check at this
+    ** time, so assume the database is Ok and return non-zero. */
+    return 1;
+  }
+
+  for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
+    int i;
+    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
+    for(i=0; i<pLevel->nRight; i++){
+      checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed);
+    }
+  }
+
+  if( pWorker->nFreelistOvfl ){
+    int rc = lsmCheckpointOverflowLoad(pDb, &freelist);
+    assert( rc==LSM_OK || rc==LSM_NOMEM );
+    if( rc!=LSM_OK ){ lsmFree(pDb->pEnv, aUsed); return 1; }
+  }
+
+  for(j=0; j<2; j++){
+    /* Pass 0 walks the worker snapshot's free-list; pass 1 walks the
+    ** overflow entries loaded above (empty if there were none).  */
+    Freelist *pFreelist = (j==0) ? &pWorker->freelist : &freelist;
+
+    for(i=0; i<pFreelist->nEntry; i++){
+      u32 iBlk = pFreelist->aEntry[i].iBlk;
+      assert( iBlk>0 && iBlk<=(u32)nBlock );
+      assert( aUsed[iBlk-1]==0 );
+      aUsed[iBlk-1] = 1;
+    }
   }
 
   for(i=0; i<nBlock; i++) assert( aUsed[i]==1 );
 
   lsmFree(pDb->pEnv, aUsed);
+  lsmFree(pDb->pEnv, freelist.aEntry);
   return 1;
 }
-#endif

Index: src/lsm_log.c
==================================================================
--- src/lsm_log.c
+++ src/lsm_log.c
@@ -302,18 +302,17 @@
 **
 ** Before returning, this function allocates the LogWriter object that
 ** will be used to write to the log file during the write transaction.
 ** LSM_OK is returned if no error occurs, otherwise an LSM error code.
 */
-int lsmLogBegin(lsm_db *pDb, DbLog *pLog){
+int lsmLogBegin(lsm_db *pDb){
   int rc = LSM_OK;
   LogWriter *pNew;
   LogRegion *aReg;
 
-  assert( lsmHoldingClientMutex(pDb) );
   if( pDb->bUseLog==0 ) return LSM_OK;
-
+  rc = lsmFsOpenLog(pDb->pFS);
   pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
   if( pNew ){
     lsmStringInit(&pNew->buf, pDb->pEnv);
     rc = lsmStringExtend(&pNew->buf, 2);
   }
@@ -344,17 +343,17 @@
   **      file than region 0. In this case, append data to region 2, but
   **      remember to jump over region 1 if required.
   **
   **   3) Region 2 is the last in the file. Append to it.
   */
-  aReg = &pLog->aRegion[0];
+  aReg = &pDb->treehdr.log.aRegion[0];
 
   assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
   assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );
 
-  pNew->cksum0 = pLog->cksum0;
-  pNew->cksum1 = pLog->cksum1;
+  pNew->cksum0 = pDb->treehdr.log.cksum0;
+  pNew->cksum1 = pDb->treehdr.log.cksum1;
 
   if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=pDb->nLogSz ){
     /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP 
     ** into the log file in this case. Pad it out to 8 bytes using a PAD2
     ** record so that the checksums can be updated immediately.  */
@@ -401,16 +400,17 @@
 **
 ** A call to this function deletes the LogWriter object allocated by
 ** lsmLogBegin(). If the transaction is being committed, the shared state
 ** in *pLog is updated before returning.
 */
-void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit){
+void lsmLogEnd(lsm_db *pDb, int bCommit){
+  DbLog *pLog;
   LogWriter *p;
-  assert( lsmHoldingClientMutex(pDb) );
 
   if( pDb->bUseLog==0 ) return;
   p = pDb->pLogWriter;
+  pLog = &pDb->treehdr.log;
 
   if( bCommit ){
     pLog->aRegion[2].iEnd = p->iOff;
     pLog->cksum0 = p->cksum0;
     pLog->cksum1 = p->cksum1;
@@ -434,13 +434,13 @@
 ** file. The checkpoint specifies that the log starts at offset iOff.
 ** The shared state in *pLog is updated to reflect the fact that space
 ** in the log file that occurs logically before offset iOff may now
 ** be reused.
 */ 
-void lsmLogCheckpoint(lsm_db *pDb, DbLog *pLog, lsm_i64 iOff){
+void lsmLogCheckpoint(lsm_db *pDb, lsm_i64 iOff){
+  DbLog *pLog = &pDb->treehdr.log;
   int iRegion;
-  assert( lsmHoldingClientMutex(pDb) );
 
   for(iRegion=0; iRegion<3; iRegion++){
     LogRegion *p = &pLog->aRegion[iRegion];
     if( iOff>=p->iStart && iOff<=p->iEnd ) break;
     p->iStart = 0;
@@ -725,11 +725,11 @@
 
 /*
 ** TODO: Thread safety of this function?
 */
 int lsmLogStructure(lsm_db *pDb, char **pzVal){
-  DbLog *pLog = lsmDatabaseLog(pDb);
+  DbLog *pLog = &pDb->treehdr.log;
   *pzVal = lsmMallocPrintf(pDb->pEnv, 
       "%d %d %d %d %d %d", 
       (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd,
       (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd,
       (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd
@@ -887,20 +887,23 @@
 */
 int lsmLogRecover(lsm_db *pDb){
   LsmString buf1;                 /* Key buffer */
   LsmString buf2;                 /* Value buffer */
   LogReader reader;               /* Log reader object */
-  int rc;                         /* Return code */
+  int rc = LSM_OK;                /* Return code */
   int nCommit = 0;                /* Number of transactions to recover */
   int iPass;
   int nJump = 0;                  /* Number of LSM_LOG_JUMP records in pass 0 */
   DbLog *pLog;
 
-  rc = lsmBeginRecovery(pDb);
+  rc = lsmFsOpenLog(pDb->pFS);
   if( rc!=LSM_OK ) return rc;
 
-  pLog = lsmDatabaseLog(pDb);
+  lsmTreeInit(pDb);
+  pLog = &pDb->treehdr.log;
+  lsmCheckpointLogoffset(pDb->pShmhdr->aWorker, pLog);
+
   logReaderInit(pDb, pLog, 1, &reader);
   lsmStringInit(&buf1, pDb->pEnv);
   lsmStringInit(&buf2, pDb->pEnv);
 
   /* The outer for() loop runs at most twice. The first iteration is to 
@@ -1014,10 +1017,11 @@
         if( pLog->aRegion[2].iStart==0 ){
           iPass = 1;
         }else{
           pLog->aRegion[2].iStart = 0;
           iPass = -1;
+          lsmCheckpointZeroLogoffset(pDb);
         }
       }
       logReaderInit(pDb, pLog, 0, &reader);
       nCommit = nCommit * -1;
     }

Index: src/lsm_main.c
==================================================================
--- src/lsm_main.c
+++ src/lsm_main.c
@@ -39,14 +39,11 @@
   ** handle must be holding a pointer to a client snapshot. And the reverse 
   ** - if there are no open cursors and no write transactions then there must 
   ** not be a client snapshot.  */
   assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) );
 
-  /* If there is a write transaction open according to pDb->nTransOpen, then
-  ** the connection must be holding the read/write TreeVersion.  */
   assert( pDb->nTransOpen>=0 );
-  assert( pDb->nTransOpen==0 || lsmTreeIsWriteVersion(pDb->pTV) );
 }
 #else
 # define assert_db_state(x) 
 #endif
 
@@ -82,31 +79,22 @@
   pDb->xCmp = xCmp;
   pDb->nLogSz = LSM_DEFAULT_LOG_SIZE;
   pDb->nDfltPgsz = LSM_PAGE_SIZE;
   pDb->nDfltBlksz = LSM_BLOCK_SIZE;
   pDb->nMerge = LSM_DEFAULT_NMERGE;
+  pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES;
   pDb->bUseLog = 1;
-
+  pDb->iReader = -1;
+  pDb->bMultiProc = 1;
   return LSM_OK;
 }
 
 lsm_env *lsm_get_env(lsm_db *pDb){
   assert( pDb->pEnv );
   return pDb->pEnv;
 }
 
-/*
-** Release snapshot handle *ppSnap. Then set *ppSnap to zero. This
-** is useful for doing (say):
-**
-**   dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
-*/
-static void dbReleaseSnapshot(lsm_env *pEnv, Snapshot **ppSnap){
-  lsmDbSnapshotRelease(pEnv, *ppSnap);
-  *ppSnap = 0;
-}
-
 /*
 ** If database handle pDb is currently holding a client snapshot, but does
 ** not have any open cursors or write transactions, release it.
 */
 static void dbReleaseClientSnapshot(lsm_db *pDb){
@@ -113,83 +101,30 @@
   if( pDb->nTransOpen==0 && pDb->pCsr==0 ){
     lsmFinishReadTrans(pDb);
   }
 }
 
-static void dbWorkerStart(lsm_db *pDb){
-  assert( pDb->pWorker==0 );
-  pDb->pWorker = lsmDbSnapshotWorker(pDb);
-}
-
-static void dbWorkerDone(lsm_db *pDb){
-  assert( pDb->pWorker );
-  dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
-}
-
 static int dbAutoWork(lsm_db *pDb, int nUnit){
   int rc = LSM_OK;                /* Return code */
 
   assert( pDb->pWorker==0 );
   assert( pDb->bAutowork );
   assert( nUnit>0 );
 
   /* If one is required, run a checkpoint. */
+#if 0   /* The checkpoint call below is compiled out */
   rc = lsmCheckpointWrite(pDb);
-
-  dbWorkerStart(pDb);
-  rc = lsmSortedAutoWork(pDb, nUnit);
-  dbWorkerDone(pDb);
-
-  return rc;
-}
-
-/*
-** If required, run the recovery procedure to initialize the database.
-** Return LSM_OK if successful or an error code otherwise.
-*/
-static int dbRecoverIfRequired(lsm_db *pDb){
-  int rc = LSM_OK;
-
-  assert( pDb->pWorker==0 && pDb->pClient==0 );
-
-  /* The following call returns NULL if recovery is not required. */
-  pDb->pWorker = lsmDbSnapshotRecover(pDb);
-  if( pDb->pWorker ){
-    int bOvfl;
-    int iSlot;
-
-    /* Read the database structure */
-    rc = lsmCheckpointRead(pDb, &iSlot, &bOvfl);
-
-    /* Read the free block list and any level records stored in the LSM. */
-    if( rc==LSM_OK && bOvfl ){
-      rc = lsmSortedLoadSystem(pDb);
-    }
-
-    /* Set up the initial append list */
-    if( rc==LSM_OK ){
-      rc = lsmFsSetupAppendList(pDb);
-    }
-
-    /* Populate the in-memory tree by reading the log file. */
-    if( rc==LSM_OK ){
-      rc = lsmLogRecover(pDb);
-    }
-
-    /* Set the "recovery done" flag */
-    if( rc==LSM_OK ){
-      lsmDbRecoveryComplete(pDb, iSlot);
-    }
-
-    /* Set up the initial client snapshot. */
-    if( rc==LSM_OK ){
-      rc = lsmDbUpdateClient(pDb, 0, 0);
-    }
-
-    dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker);
-  }
-
+#endif
+  /* Run lsmSortedAutoWork() bracketed by lsmBeginWork()/lsmFinishWork(). */
+  rc = lsmBeginWork(pDb);
+  if( rc==LSM_OK ) rc = lsmSortedAutoWork(pDb, nUnit);
+  if( pDb->pWorker && pDb->pWorker->pLevel ){
+    lsmFinishWork(pDb, 0, -1, &rc);
+  }else{
+    int rcdummy = LSM_BUSY;       /* Scratch status; never read back */
+    lsmFinishWork(pDb, 0, 0, &rcdummy);
+  }
   return rc;
 }
 
 static int getFullpathname(
   lsm_env *pEnv, 
@@ -236,22 +171,27 @@
     ** path is required to ensure that the correct files are operated
     ** on even if the application changes the cwd.  */
     rc = getFullpathname(pDb->pEnv, zFilename, &zFull);
     assert( rc==LSM_OK || zFull==0 );
 
-    /* Open the database file */
+    /* Open the database file. */
     if( rc==LSM_OK ){
       rc = lsmFsOpen(pDb, zFull);
     }
 
-    /* Open the shared data handle. */
+    /* Connect to the database */
     if( rc==LSM_OK ){
-      rc = lsmDbDatabaseFind(pDb, zFilename);
+      rc = lsmDbDatabaseConnect(pDb, zFilename);
     }
 
-    if( rc==LSM_OK ){
-      rc = dbRecoverIfRequired(pDb);
+    /* Configure the file-system connection with the page-size and block-size
+    ** of this database. Even if the database file is zero bytes in size
+    ** on disk, these values have been set in shared-memory by now, and so are
+    ** guaranteed not to change during the lifetime of this connection.  */
+    if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb)) ){
+      lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot));
+      lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot));
     }
 
     lsmFree(pDb->pEnv, zFull);
   }
 
@@ -262,52 +202,54 @@
 ** This function flushes the contents of the in-memory tree to disk. It
 ** returns LSM_OK if successful, or an error code otherwise.
 */
 int lsmFlushToDisk(lsm_db *pDb){
   int rc = LSM_OK;                /* Return code */
-  int nLsmLevel;
-  int bOvfl;
+  int nOvfl = 0;                  /* Number of free-list entries in LSM */
 
   /* Must not hold the worker snapshot when this is called. */
   assert( pDb->pWorker==0 );
-  dbWorkerStart(pDb);
+  rc = lsmBeginWork(pDb);
 
   /* Save the position of each open cursor belonging to pDb. */
-  rc = lsmSaveCursors(pDb);
+  if( rc==LSM_OK ){
+    rc = lsmSaveCursors(pDb);
+  }
 
-  bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
   if( rc==LSM_OK && pDb->bAutowork ){
     rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
-    bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
+  }
+  while( rc==LSM_OK && lsmDatabaseFull(pDb) ){  /* Repeat while db reports full */
+    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
   }
 
   /* Write the contents of the in-memory tree into the database file and 
   ** update the worker snapshot accordingly. Then flush the contents of 
   ** the db file to disk too. No calls to fsync() are made here - just 
   ** write().  */
-  if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl);
-#if 0
-  if( rc==LSM_OK && bAutowork ){
-    assert( bOvfl==0 && nLsmLevel==0 );
-    rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT);
-    bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel);
-    if( bOvfl && rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl);
-  }
-#endif
-  if( rc==LSM_OK ) rc = lsmSortedFlushDb(pDb);
-
-  /* Create a new client snapshot - one that uses the new runs created above. */
-  if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsmLevel, bOvfl);
+  if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, &nOvfl);
+  if( rc==LSM_OK ) lsmTreeClear(pDb);
+  /* Called even when rc is already an error; rc is passed by reference. */
+  lsmFinishWork(pDb, 1, nOvfl, &rc);
 
   /* Restore the position of any open cursors */
-  if( rc==LSM_OK ) rc = lsmRestoreCursors(pDb);
+  if( rc==LSM_OK && pDb->pCsr ){
+    lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
+    pDb->pClient = 0;             /* Rebuilt from the checkpoint just below */
+    rc = lsmCheckpointLoad(pDb);
+    if( rc==LSM_OK ){
+      rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
+    }
+    if( rc==LSM_OK ){
+      rc = lsmRestoreCursors(pDb);
+    }
+  }
 
 #if 0
   if( rc==LSM_OK ) lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "flush");
 #endif
 
-  dbWorkerDone(pDb);
   return rc;
 }
 
 int lsm_close(lsm_db *pDb){
   int rc = LSM_OK;
@@ -314,11 +256,10 @@
   if( pDb ){
     assert_db_state(pDb);
     if( pDb->pCsr || pDb->nTransOpen ){
       rc = LSM_MISUSE_BKPT;
     }else{
-      assert( pDb->pWorker==0 && pDb->pTV==0 );
       lsmDbDatabaseRelease(pDb);
       lsmFsClose(pDb->pFS);
       lsmFree(pDb->pEnv, pDb->aTrans);
       lsmFree(pDb->pEnv, pDb);
     }
@@ -421,10 +362,32 @@
       int *piVal = va_arg(ap, int *);
       if( *piVal>1 ) pDb->nMerge = *piVal;
       *piVal = pDb->nMerge;
       break;
     }
+
+    case LSM_CONFIG_MAX_FREELIST: {
+      int *piVal = va_arg(ap, int *);
+      if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){
+        pDb->nMaxFreelist = *piVal;
+      }
+      *piVal = pDb->nMaxFreelist;
+      break;
+    }
+
+    case LSM_CONFIG_MULTIPLE_PROCESSES: {
+      int *piVal = va_arg(ap, int *);
+      if( pDb->pDatabase ){
+        /* If lsm_open() has been called, this is a read-only parameter. 
+        ** Set the output variable to true if this connection is currently
+        ** in multi-process mode.  */
+        *piVal = lsmDbMultiProc(pDb);
+      }else{
+        pDb->bMultiProc = *piVal = (*piVal!=0);
+      }
+      break;
+    }
 
     default:
       rc = LSM_MISUSE;
       break;
   }
@@ -446,16 +409,19 @@
   Level *pTopLevel = 0;           /* Top level of snapshot to report on */
   int rc = LSM_OK;
   Level *p;
   LsmString s;
   Snapshot *pWorker;              /* Worker snapshot */
-  Snapshot *pRelease = 0;         /* Snapshot to release */
+  int bUnlock = 0;
 
   /* Obtain the worker snapshot */
   pWorker = pDb->pWorker;
   if( !pWorker ){
-    pRelease = pWorker = lsmDbSnapshotWorker(pDb);
+    rc = lsmBeginWork(pDb);
+    if( rc!=LSM_OK ) return rc;
+    pWorker = pDb->pWorker;
+    bUnlock = 1;
   }
 
   /* Format the contents of the snapshot as text */
   pTopLevel = lsmDbSnapshotLevel(pWorker);
   lsmStringInit(&s, pDb->pEnv);
@@ -469,11 +435,14 @@
     lsmStringAppend(&s, "}", 1);
   }
   rc = s.n>=0 ? LSM_OK : LSM_NOMEM;
 
   /* Release the snapshot and return */
-  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
+  if( bUnlock ){
+    int rcdummy = LSM_BUSY;
+    lsmFinishWork(pDb, 0, 0, &rcdummy);
+  }
   *pzOut = s.z;
   return rc;
 }
 
 int lsm_info(lsm_db *pDb, int eParam, ...){
@@ -545,11 +514,10 @@
     bCommit = 1;
     rc = lsm_begin(pDb, 1);
   }
 
   if( rc==LSM_OK ){
-    assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) );
     rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
   }
 
   lsmSortedSaveTreeCursors(pDb);
 
@@ -562,14 +530,13 @@
 
     if( nQuant>pDb->nTreeLimit ){
       nQuant = pDb->nTreeLimit;
     }
 
-    nBefore = lsmTreeSize(pDb->pTV);
+    nBefore = lsmTreeSize(pDb);
     rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
-    nAfter = lsmTreeSize(pDb->pTV);
-
+    nAfter = lsmTreeSize(pDb);
     nDiff = (nAfter/nQuant) - (nBefore/nQuant);
     if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){
       rc = dbAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT);
     }
   }
@@ -739,11 +706,11 @@
       rc = lsmBeginWriteTrans(pDb);
     }
 
     if( rc==LSM_OK ){
       for(i=pDb->nTransOpen; i<iLevel; i++){
-        lsmTreeMark(pDb->pTV, &pDb->aTrans[i].tree);
+        lsmTreeMark(pDb, &pDb->aTrans[i].tree);
         lsmLogTell(pDb, &pDb->aTrans[i].log);
       }
       pDb->nTransOpen = iLevel;
     }
   }
@@ -750,21 +717,25 @@
 
   return rc;
 }
 
 int lsm_commit(lsm_db *pDb, int iLevel){
+  int bFlush = 0;
   int rc = LSM_OK;
 
   assert_db_state( pDb );
 
   /* A value less than zero means close the innermost nested transaction. */
   if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
 
   if( iLevel<pDb->nTransOpen ){
     if( iLevel==0 ){
+
       /* Commit the transaction to disk. */
-      if( pDb->pTV && lsmTreeSize(pDb->pTV)>pDb->nTreeLimit ){
+      if( lsmTreeSize(pDb)>pDb->nTreeLimit ){
+        lsmTreeEndTransaction(pDb, 1);
+        bFlush = 1;
         rc = lsmFlushToDisk(pDb);
       }
       if( rc==LSM_OK ) rc = lsmLogCommit(pDb);
       if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){
         rc = lsmFsSyncLog(pDb->pFS);
@@ -772,11 +743,15 @@
 
       lsmFinishWriteTrans(pDb, (rc==LSM_OK));
     }
     pDb->nTransOpen = iLevel;
   }
+
   dbReleaseClientSnapshot(pDb);
+  if( pDb->bAutowork && bFlush && rc==LSM_OK ){
+    rc = lsmCheckpointWrite(pDb);
+  }
   return rc;
 }
 
 int lsm_rollback(lsm_db *pDb, int iLevel){
   int rc = LSM_OK;

Index: src/lsm_mem.c
==================================================================
--- src/lsm_mem.c
+++ src/lsm_mem.c
@@ -107,11 +107,10 @@
     pRet = lsmReallocOrFree(pEnv, p, N);
     if( !pRet ) *pRc = LSM_NOMEM_BKPT;
   }
   return pRet;
 }
-
 
 char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){
   int nByte;
   char *zRet;
   nByte = strlen(zIn);

Index: src/lsm_shared.c
==================================================================
--- src/lsm_shared.c
+++ src/lsm_shared.c
@@ -13,37 +13,10 @@
 ** Utilities used to help multiple LSM clients to coexist within the
 ** same process space.
 */
 #include "lsmInt.h"
 
-typedef struct Freelist Freelist;
-typedef struct AppendList AppendList;
-typedef struct FreelistEntry FreelistEntry;
-
-/*
-** TODO: Find homes for these miscellaneous notes. 
-**
-** FREE-LIST DELTA FORMAT
-**
-**   The free-list delta consists of three integers:
-**
-**     1. The number of elements to remove from the start of the free-list.
-**     2. If non-zero, a refreed block to append to the free-list.
-**     3. Same as (2).
-**
-** SNAPSHOT ID MANIPULATIONS
-**
-**   When the database is initialized the worker snapshot id is set to the
-**   value read from the checkpoint. Or, if there is no valid checkpoint,
-**   to a non-zero default value (e.g. 1).
-**
-**   The client snapshot is then initialized as a copy of the worker. The
-**   client snapshot id is a copy of the worker snapshot id (as read from
-**   the checkpoint). The worker snapshot id is then incremented.
-**
-*/
-
 /*
 ** Global data. All global variables used by code in this file are grouped
 ** into the following structure instance.
 **
 ** pDatabase:
@@ -53,159 +26,37 @@
 */
 static struct SharedData {
   Database *pDatabase;            /* Linked list of all Database objects */
 } gShared;
 
-/*
-** An instance of the following structure stores the current database free
-** block list. The free list is a list of blocks that are not currently
-** used by the worker snapshot. Assocated with each block in the list is the
-** snapshot id of the most recent snapshot that did actually use the block.
-*/
-struct Freelist {
-  FreelistEntry *aEntry;          /* Free list entries */
-  int nEntry;                     /* Number of valid slots in aEntry[] */
-  int nAlloc;                     /* Allocated size of aEntry[] */
-};
-struct FreelistEntry {
-  int iBlk;                       /* Block number */
-  i64 iId;                        /* Largest snapshot id to use this block */
-};
-
-struct AppendList {
-  Pgno *aPoint;
-  int nPoint;
-  int nAlloc;
-};
-
-/*
-** A snapshot of a database. A snapshot contains all the information required
-** to read or write a database file on disk. See the description of struct
-** Database below for futher details.
-**
-** pExport/nExport:
-**   pExport points to a buffer containing the serialized (checkpoint) 
-**   image of the snapshot. The serialized image is nExport bytes in size. 
-*/
-struct Snapshot {
-  Database *pDatabase;            /* Database this snapshot belongs to */
-  Level *pLevel;                  /* Pointer to level 0 of snapshot (or NULL) */
-  i64 iId;                        /* Snapshot id */
-
-  /* Used by client snapshots only */
-  void *pExport;                  /* Serialized snapshot image */
-  int nExport;                    /* Size of pExport in bytes */
-  int nRef;                       /* Number of references to this structure */
-  Snapshot *pSnapshotNext;        /* Next snapshot on this database */
-};
-#define LSM_INITIAL_SNAPSHOT_ID 11
-
 /*
 ** Database structure. There is one such structure for each distinct 
 ** database accessed by this process. They are stored in the singly linked 
 ** list starting at global variable gShared.pDatabase. Database objects are 
 ** reference counted. Once the number of connections to the associated
 ** database drops to zero, they are removed from the linked list and deleted.
-**
-** The primary purpose of the Database structure is to manage Snapshots. A
-** snapshot contains the information required to read a database - exactly
-** where each array is stored, and where new arrays can be written. A 
-** database has one worker snapshot and any number of client snapshots.
-**
-** WORKER SNAPSHOT
-**
-**   When a connection is first made to a database and the Database object
-**   created, the worker snapshot is initialized to the most recently 
-**   checkpointed database state (based on the values in the db header).
-**   Any time the database file is written to, either to flush the contents
-**   of an in-memory tree or to merge existing segments, the worker snapshot
-**   is updated to reflect the modifications.
-**
-**   The worker snapshot is protected by the worker mutex. The worker mutex
-**   must be obtained before a connection begins to modify the database
-**   file. After the db file is written, the worker snapshot is updated and
-**   the worker mutex released.
-**
-** CLIENT SNAPSHOTS
-**
-**   Client snapshots are used by database clients (readers). When a 
-**   transaction is opened, the client requests a pointer to a read-only 
-**   client snapshot. It is relinquished when the transaction ends. Client 
-**   snapshots are reference counted objects.
-**
-**   When a database is first loaded, the client snapshot is a copy of
-**   the worker snapshot. Each time the worker snapshot is checkpointed,
-**   the client snapshot is updated with the new checkpointed contents.
-**
-** THE FREE-BLOCK LIST
-**
-**   Each Database structure maintains a list of free blocks - the "free-list".
-**   There is an entry in the free-list for each block in the database file 
-**   that is not used in any way by the worker snapshot.
-**
-**   Associated with each free block in the free-list is a snapshot id.
-**   This is the id of the earliest snapshot that does not require the
-**   contents of the block. The block may therefore be reused only after:
-**
-**     (a) a snapshot with an id equal to or greater than the id associated
-**         with the block has been checkpointed into the db header, and
-**
-**     (b) all existing database clients are using a snapshot with an id
-**         equal to or greater than the id stored in the free-list entry.
-**
-** MULTI-THREADING ISSUES
-**
-**   Each Database structure carries with it two mutexes - the client 
-**   mutex and the worker mutex. In a multi-process version of LSM, these 
-**   will be replaced by some other robust locking mechanism. 
-**
-**   TODO - this description.
 */
 struct Database {
+  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
   char *zName;                    /* Canonical path to database file */
   void *pId;                      /* Database id (file inode) */
   int nId;                        /* Size of pId in bytes */
-
-  Tree *pTree;                    /* Current in-memory tree structure */
-  DbLog log;                      /* Database log state object */
-  int nPgsz;                      /* Nominal database page size */
-  int nBlksz;                     /* Database block size */
-
-  Snapshot *pClient;              /* Client (reader) snapshot */
-  Snapshot worker;                /* Worker (writer) snapshot */
-  AppendList append;              /* List of appendable points */
-
-  int nBlock;                     /* Number of blocks tracked by this ss */
-  Freelist freelist;              /* Database free-list */
-
-  u32 aDelta[LSM_FREELIST_DELTA_SIZE];
-  int bRecordDelta;               /* True when recording freelist delta */
-
-  lsm_mutex *pWorkerMutex;        /* Protects the worker snapshot */
-  lsm_mutex *pClientMutex;        /* Protects pClient */
-  int bDirty;                     /* True if worker has been modified */
-  int bRecovered;                 /* True if db does not require recovery */
-
-  int bCheckpointer;              /* True if there exists a checkpointer */
-  int bWriter;                    /* True if there exists a writer */
-  i64 iCheckpointId;              /* Largest snapshot id stored in db file */
-  int iSlot;                      /* Meta page containing iCheckpointId */
-
-  /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
   int nDbRef;                     /* Number of associated lsm_db handles */
   Database *pDbNext;              /* Next Database structure in global list */
+
+  /* Protected by the local mutex (pClientMutex) */
+  lsm_file *pFile;                /* Used for locks/shm in multi-proc mode */
+  LsmFile *pLsmFile;              /* List of deferred closes */
+  lsm_mutex *pClientMutex;        /* Protects the apShmChunk[] and pConn */
+  int nShmChunk;                  /* Number of entries in apShmChunk[] array */
+  void **apShmChunk;              /* Array of "shared" memory regions */
+  lsm_db *pConn;                  /* List of connections to this db. */
 };
 
-/*
-** Macro that evaluates to true if the snapshot passed as the only argument
-** is a worker snapshot. 
-*/
-#define isWorker(pSnap) ((pSnap)==(&(pSnap)->pDatabase->worker))
-
 /*
 ** Functions to enter and leave the global mutex. This mutex is used
-** to protect the global linked-list headed at 
+** to protect the global linked-list headed at gShared.pDatabase.
 */
 static int enterGlobalMutex(lsm_env *pEnv){
   lsm_mutex *p;
   int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p);
   if( rc==LSM_OK ) lsmMutexEnter(pEnv, p);
@@ -227,72 +78,18 @@
   int i; 
   for(i=0; i<p->nEntry; i++){
     assert( p->aEntry[i].iBlk!=iBlk );
   }
 }
-static void assertMustbeWorker(lsm_db *pDb){
-  assert( pDb->pWorker );
-  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
-}
-static void assertSnapshotListOk(Database *p){
-  Snapshot *pIter;
-  i64 iPrev = 0;
-
-  for(pIter=p->pClient; pIter; pIter=pIter->pSnapshotNext){
-    assert( pIter==p->pClient || pIter->iId<iPrev );
-    iPrev = pIter->iId;
-  }
-}
 #else
 # define assertNotInFreelist(x,y)
-# define assertMustbeWorker(x)
-# define assertSnapshotListOk(x)
 #endif
 
-
-Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp){
-  Database *p = db->pDatabase;
-  assert( db->pWorker );
-  *pnApp = p->append.nPoint;
-  return p->append.aPoint;
-}
-
-int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg){
-  AppendList *pList;
-  assert( db->pWorker );
-  pList = &db->pDatabase->append;
-
-  assert( pList->nAlloc>=pList->nPoint );
-  if( pList->nAlloc<=pList->nPoint ){
-    int nNew = pList->nAlloc+8;
-    Pgno *aNew = (Pgno *)lsmRealloc(db->pEnv, pList->aPoint, sizeof(Pgno)*nNew);
-    if( aNew==0 ) return LSM_NOMEM_BKPT;
-    pList->aPoint = aNew;
-    pList->nAlloc = nNew;
-  }
-
-  pList->aPoint[pList->nPoint++] = iPg;
-  return LSM_OK;
-}
-
-void lsmSharedAppendListRemove(lsm_db *db, int iIdx){
-  AppendList *pList;
-  int i;
-  assert( db->pWorker );
-  pList = &db->pDatabase->append;
-
-  assert( pList->nPoint>iIdx );
-  for(i=iIdx+1; i<pList->nPoint;i++){
-    pList->aPoint[i-1] = pList->aPoint[i];
-  }
-  pList->nPoint--;
-}
-
 /*
 ** Append an entry to the free-list.
 */
-static int flAppendEntry(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){
+int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){
 
   /* Assert that this is not an attempt to insert a duplicate block number */
   assertNotInFreelist(p, iBlk);
 
   /* Extend the space allocated for the freelist, if required */
@@ -314,10 +111,22 @@
   p->aEntry[p->nEntry].iId = iId;
   p->nEntry++;
 
   return LSM_OK;
 }
+
+static int flInsertEntry(lsm_env *pEnv, Freelist *p, int iBlk){
+  int rc;
+
+  rc = lsmFreelistAppend(pEnv, p, iBlk, 1);
+  if( rc==LSM_OK ){
+    memmove(&p->aEntry[1], &p->aEntry[0], sizeof(FreelistEntry)*(p->nEntry-1));
+    p->aEntry[0].iBlk = iBlk;
+    p->aEntry[0].iId = 1;
+  }
+  return rc;
+}
 
 /*
 ** Remove the first entry of the free-list.
 */
 static void flRemoveEntry0(Freelist *p){
@@ -326,23 +135,115 @@
   memmove(&p->aEntry[0], &p->aEntry[1], sizeof(FreelistEntry) * nNew);
   p->nEntry = nNew;
 }
 
 /*
-** This function frees all resources held by the Database structure passed
+** This function frees all resources held by the Database structure passed
 ** as the only argument.
 */
 static void freeDatabase(lsm_env *pEnv, Database *p){
+  assert( holdingGlobalMutex(pEnv) );
   if( p ){
     /* Free the mutexes */
     lsmMutexDel(pEnv, p->pClientMutex);
-    lsmMutexDel(pEnv, p->pWorkerMutex);
+
+    if( p->pFile ){
+      lsmEnvClose(pEnv, p->pFile);
+    }
 
     /* Free the memory allocated for the Database struct itself */
     lsmFree(pEnv, p);
   }
 }
+
+static void doDbDisconnect(lsm_db *pDb){
+  int rc;
+
+  /* Block for an exclusive lock on DMS1. This lock serializes all calls
+  ** to doDbConnect() and doDbDisconnect() across all processes.  */
+  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+  if( rc==LSM_OK ){
+
+    /* Try an exclusive lock on DMS2. If successful, this is the last
+    ** connection to the database. In this case flush the contents of the
+    ** in-memory tree to disk and write a checkpoint.  */
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
+    if( rc==LSM_OK ){
+      /* Flush the in-memory tree, if required. If there is data to flush,
+      ** this will create a new client snapshot in Database.pClient. The
+      ** checkpoint (serialization) of this snapshot may be written to disk
+      ** by the following block.  */
+      rc = lsmTreeLoadHeader(pDb);
+      if( rc==LSM_OK && lsmTreeSize(pDb)>0 ){
+        rc = lsmFlushToDisk(pDb);
+      }
+
+      /* Write a checkpoint to disk. */
+      if( rc==LSM_OK ){
+        rc = lsmCheckpointWrite(pDb);
+      }
+
+      /* If the checkpoint was written successfully, delete the log file */
+      if( rc==LSM_OK && pDb->pFS ){
+        Database *p = pDb->pDatabase;
+        lsmFsCloseAndDeleteLog(pDb->pFS);
+        if( p->pFile ) lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
+      }
+    }
+  }
+
+  lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
+  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+  pDb->pShmhdr = 0;
+}
+
+static int doDbConnect(lsm_db *pDb){
+  int rc;
+
+  /* Obtain a pointer to the shared-memory header */
+  assert( pDb->pShmhdr==0 );
+  rc = lsmShmChunk(pDb, 0, (void **)&pDb->pShmhdr);
+  if( rc!=LSM_OK ) return rc;
+
+  /* Block for an exclusive lock on DMS1. This lock serializes all calls
+  ** to doDbConnect() and doDbDisconnect() across all processes.  */
+  rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+  if( rc!=LSM_OK ){
+    pDb->pShmhdr = 0;
+    return rc;
+  }
+
+  /* Try an exclusive lock on DMS2. If successful, this is the first and 
+  ** only connection to the database. In this case initialize the 
+  ** shared-memory and run log file recovery.  */
+  rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0);
+  if( rc==LSM_OK ){
+    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
+    rc = lsmCheckpointRecover(pDb);
+    if( rc==LSM_OK ){
+      rc = lsmLogRecover(pDb);
+    }
+  }else if( rc==LSM_BUSY ){
+    rc = LSM_OK;
+  }
+
+  /* Take a shared lock on DMS2. This lock "cannot" fail, as connections 
+  ** may only hold an exclusive lock on DMS2 if they first hold an exclusive
+  ** lock on DMS1. And this connection is currently holding the exclusive
+  ** lock on DMS1.  */
+  if( rc==LSM_OK ){
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
+  }
+
+  /* If anything went wrong, unlock DMS2. Unlock DMS1 in any case. */
+  if( rc!=LSM_OK ){
+    lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
+    pDb->pShmhdr = 0;
+  }
+  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+  return rc;
+}
 
 /*
 ** Return a reference to the shared Database handle for the database 
 ** identified by canonical path zName. If this is the first connection to
 ** the named database, a new Database object is allocated. Otherwise, a
@@ -353,11 +254,11 @@
 ** and and LSM error code returned.
 **
 ** Each successful call to this function should be (eventually) matched
 ** by a call to lsmDbDatabaseRelease().
 */
-int lsmDbDatabaseFind(
+int lsmDbDatabaseConnect(
   lsm_db *pDb,                    /* Database handle */
   const char *zName               /* Path to db file */
 ){
   lsm_env *pEnv = pDb->pEnv;
   int rc;                         /* Return code */
@@ -382,19 +283,13 @@
     /* If no suitable Database object was found, allocate a new one. */
     if( p==0 ){
       int nName = strlen(zName);
       p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc);
 
-      /* Initialize the log handle */
-      if( rc==LSM_OK ){
-        p->log.cksum0 = LSM_CKSUM0_INIT;
-        p->log.cksum1 = LSM_CKSUM1_INIT;
-      }
-
-      /* Allocate the two mutexes */
-      if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pWorkerMutex);
+      /* Allocate the mutex */
       if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pClientMutex);
+
 
       /* If no error has occurred, fill in other fields and link the new 
       ** Database structure into the global list starting at 
       ** gShared.pDatabase. Otherwise, if an error has occurred, free any
       ** resources allocated and return without linking anything new into
@@ -403,95 +298,91 @@
         p->zName = (char *)&p[1];
         memcpy((void *)p->zName, zName, nName+1);
         p->pId = (void *)&p->zName[nName+1];
         memcpy(p->pId, pId, nId);
         p->nId = nId;
-        p->worker.pDatabase = p;
         p->pDbNext = gShared.pDatabase;
         gShared.pDatabase = p;
 
-        p->worker.iId = LSM_INITIAL_SNAPSHOT_ID;
-        p->nPgsz = pDb->nDfltPgsz;
-        p->nBlksz = pDb->nDfltBlksz;
-      }else{
+      }
+
+      /* If running in multi-process mode, open the shared fd */
+      if( rc==LSM_OK && pDb->bMultiProc ){
+        rc = lsmEnvOpen(pDb->pEnv, p->zName, &p->pFile);
+      }
+
+      if( rc!=LSM_OK ){
         freeDatabase(pEnv, p);
         p = 0;
       }
     }
 
     if( p ) p->nDbRef++;
     leaveGlobalMutex(pEnv);
+
+    if( p ){
+      lsmMutexEnter(pDb->pEnv, p->pClientMutex);
+      pDb->pNext = p->pConn;
+      p->pConn = pDb;
+      lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+    }
   }
 
   lsmFree(pEnv, pId);
   pDb->pDatabase = p;
+
+  if( rc==LSM_OK ){
+    rc = doDbConnect(pDb);
+  }
+
   return rc;
 }
-
-static void freeClientSnapshot(lsm_env *pEnv, Snapshot *p){
-  Level *pLevel;
-  
-  assert( p->nRef==0 );
-  for(pLevel=p->pLevel; pLevel; pLevel=pLevel->pNext){
-    lsmFree(pEnv, pLevel->pSplitKey);
-  }
-  lsmFree(pEnv, p->pExport);
-  lsmFree(pEnv, p);
-}
-
 
 /*
-** Release a reference to a Database object obtained from lsmDbDatabaseFind().
-** There should be exactly one call to this function for each successful
-** call to Find().
+** Release a reference to a Database object obtained from 
+** lsmDbDatabaseConnect(). There should be exactly one call to this function 
+** for each successful call to lsmDbDatabaseConnect().
 */
 void lsmDbDatabaseRelease(lsm_db *pDb){
   Database *p = pDb->pDatabase;
   if( p ){
+    lsm_db **ppDb;
+
+    if( pDb->pShmhdr ){
+      doDbDisconnect(pDb);
+    }
+
+    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
+    for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext));
+    *ppDb = pDb->pNext;
+    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+
     enterGlobalMutex(pDb->pEnv);
     p->nDbRef--;
     if( p->nDbRef==0 ){
-      int rc = LSM_OK;
       Database **pp;
 
       /* Remove the Database structure from the linked list. */
       for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext));
       *pp = p->pDbNext;
 
-      /* Flush the in-memory tree, if required. If there is data to flush,
-      ** this will create a new client snapshot in Database.pClient. The
-      ** checkpoint (serialization) of this snapshot may be written to disk
-      ** by the following block.  */
-      if( p->bDirty || 0==lsmTreeIsEmpty(p->pTree) ){
-        rc = lsmFlushToDisk(pDb);
-      }
-
-      /* Write a checkpoint, also if required */
-      if( rc==LSM_OK && p->pClient ){
-        rc = lsmCheckpointWrite(pDb);
-      }
-
-      /* If the checkpoint was written successfully, delete the log file */
-      if( rc==LSM_OK && pDb->pFS ){
-        lsmFsCloseAndDeleteLog(pDb->pFS);
-      }
-
-      /* Free the in-memory tree object */
-      lsmTreeRelease(pDb->pEnv, p->pTree);
-
-      /* Free the contents of the worker snapshot */
-      lsmSortedFreeLevel(pDb->pEnv, p->worker.pLevel);
-      lsmFree(pDb->pEnv, p->freelist.aEntry);
-      lsmFree(pDb->pEnv, p->append.aPoint);
-      
-      /* Free the client snapshot */
-      if( p->pClient ){
-        assert( p->pClient->nRef==1 );
-        p->pClient->nRef = 0;
-        freeClientSnapshot(pDb->pEnv, p->pClient);
-      }
-
+      /* Free the Database object and shared memory buffers. */
+      if( p->pFile==0 ){
+        int i;
+        for(i=0; i<p->nShmChunk; i++){
+          lsmFree(pDb->pEnv, p->apShmChunk[i]);
+        }
+      }else{
+        LsmFile *pIter;
+        LsmFile *pNext;
+        for(pIter=p->pLsmFile; pIter; pIter=pNext){
+          pNext = pIter->pNext;
+          lsmEnvClose(pDb->pEnv, pIter->pFile);
+          lsmFree(pDb->pEnv, pIter);
+        }
+      }
+      lsmFree(pDb->pEnv, p->apShmChunk);
       freeDatabase(pDb->pEnv, p);
     }
     leaveGlobalMutex(pDb->pEnv);
   }
 }
@@ -499,256 +390,13 @@
 Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
   return pSnapshot->pLevel;
 }
 
 void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
-  assert( isWorker(pSnap) );
   pSnap->pLevel = pLevel;
 }
 
-void lsmDatabaseDirty(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
-  if( p->bDirty==0 ){
-    p->worker.iId++;
-    p->bDirty = 1;
-  }
-}
-
-int lsmDatabaseIsDirty(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
-  return p->bDirty;
-}
-
-/*
-** Get/set methods for the snapshot block-count. These should only be
-** used with worker snapshots.
-*/
-void lsmSnapshotSetNBlock(Snapshot *pSnap, int nNew){
-  assert( isWorker(pSnap) );
-  pSnap->pDatabase->nBlock = nNew;
-}
-int lsmSnapshotGetNBlock(Snapshot *pSnap){
-  assert( isWorker(pSnap) );
-  return pSnap->pDatabase->nBlock;
-}
-
-void lsmSnapshotSetCkptid(Snapshot *pSnap, i64 iNew){
-  assert( isWorker(pSnap) );
-  pSnap->iId = iNew;
-}
-
-/*
-** Return a pointer to the client snapshot object. Each successful call 
-** to lsmDbSnapshotClient() must be matched by an lsmDbSnapshotRelease() 
-** call.
-*/
-#if 0
-Snapshot *lsmDbSnapshotClient(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  Snapshot *pRet;
-  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-  pRet = p->pClient;
-  pRet->nRef++;
-  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-  return pRet;
-}
-#endif
-
-/*
-** Return a pointer to the worker snapshot. This call grabs the worker 
-** mutex. It is released when the pointer to the worker snapshot is passed 
-** to lsmDbSnapshotRelease().
-*/
-Snapshot *lsmDbSnapshotWorker(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  lsmMutexEnter(pDb->pEnv, p->pWorkerMutex);
-  return &p->worker;
-}
-
-Snapshot *lsmDbSnapshotRecover(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  Snapshot *pRet = 0;
-  lsmMutexEnter(pDb->pEnv, p->pWorkerMutex);
-  if( p->bRecovered ){
-    lsmFsSetPageSize(pDb->pFS, p->nPgsz);
-    lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
-    lsmMutexLeave(pDb->pEnv, p->pWorkerMutex);
-  }else{
-    pRet = &p->worker;
-  }
-  return pRet;
-}
-
-/*
-** Set (bVal==1) or clear (bVal==0) the "recovery done" flag.
-**
-** TODO: Should this be combined with BeginRecovery()/FinishRecovery()?
-*/
-void lsmDbRecoveryComplete(lsm_db *pDb, int iSlot){
-  Database *p = pDb->pDatabase;
-
-  assert( iSlot==0 || iSlot==1 || iSlot==2 );
-  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) );
-  assert( p->pTree );
-
-  p->bRecovered = 1;
-  p->iCheckpointId = p->worker.iId;
-  p->iSlot = iSlot;
-  lsmFsSetPageSize(pDb->pFS, p->nPgsz);
-  lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
-}
-
-void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz){
-  Database *p = pDb->pDatabase;
-  assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) && p->bRecovered==0 );
-  p->nPgsz = nPgsz;
-  p->nBlksz = nBlksz;
-  lsmFsSetPageSize(pDb->pFS, p->nPgsz);
-  lsmFsSetBlockSize(pDb->pFS, p->nBlksz);
-}
-
-static void snapshotDecrRefcnt(lsm_env *pEnv, Snapshot *pSnap){
-  Database *p = pSnap->pDatabase;
-
-  assertSnapshotListOk(p);
-  pSnap->nRef--;
-  assert( pSnap->nRef>=0 );
-  if( pSnap->nRef==0 ){
-    Snapshot *pIter = p->pClient;
-    assert( pSnap!=pIter );
-    while( pIter->pSnapshotNext!=pSnap ) pIter = pIter->pSnapshotNext;
-    pIter->pSnapshotNext = pSnap->pSnapshotNext;
-    freeClientSnapshot(pEnv, pSnap);
-    assertSnapshotListOk(p);
-  }
-}
-
-/*
-** Release a snapshot reference obtained by calling lsmDbSnapshotWorker()
-** or lsmDbSnapshotClient().
-*/
-void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *pSnap){
-  if( pSnap ){
-    Database *p = pSnap->pDatabase;
-
-    /* If this call is to release a pointer to the worker snapshot, relinquish
-    ** the worker mutex.  
-    **
-    ** If pSnap is a client snapshot, decrement the reference count. When the
-    ** reference count reaches zero, free the snapshot object. The decrement
-    ** and (nRef==0) test are protected by the database client mutex.
-    */
-    if( isWorker(pSnap) ){
-      lsmMutexLeave(pEnv, p->pWorkerMutex);
-    }else{
-      lsmMutexEnter(pEnv, p->pClientMutex);
-      snapshotDecrRefcnt(pEnv, pSnap);
-      lsmMutexLeave(pEnv, p->pClientMutex);
-    }
-  }
-}
-
-/*
-** Create a new client snapshot based on the current contents of the worker 
-** snapshot. The connection must be the worker to call this function.
-*/
-int lsmDbUpdateClient(lsm_db *pDb, int nLsmLevel, int bOvfl){
-  Database *p = pDb->pDatabase;   /* Database handle */
-  Snapshot *pOld;                 /* Old client snapshot object */
-  Snapshot *pNew;                 /* New client snapshot object */
-  int nByte;                      /* Memory required for new client snapshot */
-  int rc = LSM_OK;                /* Memory required for new client snapshot */
-  int nLevel = 0;                 /* Number of levels in worker snapshot */
-  int nRight = 0;                 /* Total number of rhs in worker */
-  int nKeySpace = 0;              /* Total size of split keys */
-  Level *pLevel;                  /* Used to iterate through worker levels */
-  Level **ppLink;                 /* Used to link levels together */
-  u8 *pAvail;                     /* Used to divide up allocation */
-
-  /* Must be the worker to call this. */
-  assertMustbeWorker(pDb);
-
-  /* Allocate space for the client snapshot and all levels. */
-  for(pLevel=p->worker.pLevel; pLevel; pLevel=pLevel->pNext){
-    nLevel++;
-    nRight += pLevel->nRight;
-  }
-  nByte = sizeof(Snapshot) 
-        + nLevel * sizeof(Level)
-        + nRight * sizeof(Segment)
-        + nKeySpace;
-  pNew = (Snapshot *)lsmMallocZero(pDb->pEnv, nByte);
-  if( !pNew ) return LSM_NOMEM_BKPT;
-  pNew->pDatabase = p;
-  pNew->iId = p->worker.iId;
-
-  /* Copy the linked-list of Level structures */
-  pAvail = (u8 *)&pNew[1];
-  ppLink = &pNew->pLevel;
-  for(pLevel=p->worker.pLevel; pLevel && rc==LSM_OK; pLevel=pLevel->pNext){
-    Level *pNew;
-
-    pNew = (Level *)pAvail;
-    memcpy(pNew, pLevel, sizeof(Level));
-    pAvail += sizeof(Level);
-
-    if( pNew->nRight ){
-      pNew->aRhs = (Segment *)pAvail;
-      memcpy(pNew->aRhs, pLevel->aRhs, sizeof(Segment) * pNew->nRight);
-      pAvail += (sizeof(Segment) * pNew->nRight);
-      lsmSortedSplitkey(pDb, pNew, &rc);
-    }
-
-    /* This needs to come after any call to lsmSortedSplitkey(). Splitkey()
-    ** uses data within the Merge object to set pNew->pSplitKey and co.  */
-    pNew->pMerge = 0;
-
-    *ppLink = pNew;
-    ppLink = &pNew->pNext;
-  }
-
-  /* Create the serialized version of the new client snapshot. */
-  if( p->bDirty && rc==LSM_OK ){
-    assert( nLevel>nLsmLevel || p->worker.pLevel==0 );
-    rc = lsmCheckpointExport(
-        pDb, nLsmLevel, bOvfl, pNew->iId, 1, &pNew->pExport, &pNew->nExport
-    );
-  }
-
-  if( rc==LSM_OK ){
-    /* Initialize the new snapshot ref-count to 1 */
-    pNew->nRef = 1;
-
-    lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient);
-
-    /* Install the new client snapshot and release the old. */
-    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-    assertSnapshotListOk(p);
-    pOld = p->pClient;
-    pNew->pSnapshotNext = pOld;
-    p->pClient = pNew;
-    assertSnapshotListOk(p);
-    if( pDb->pClient ){
-      pDb->pClient = pNew;
-      pNew->nRef++;
-    }
-    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-
-    lsmDbSnapshotRelease(pDb->pEnv, pOld);
-    p->bDirty = 0;
-
-    /* Upgrade the user connection to the new client snapshot */
-
-  }else{
-    /* An error has occurred. Delete the allocated object. */
-    freeClientSnapshot(pDb->pEnv, pNew);
-  }
-
-  return rc;
-}
 
 /*
 ** Allocate a new database file block to write data to, either by extending
 ** the database file or by recycling a free-list entry. The worker snapshot 
 ** must be held in order to call this function.
@@ -755,61 +403,48 @@
 **
 ** If successful, *piBlk is set to the block number allocated and LSM_OK is
 ** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
 */
 int lsmBlockAllocate(lsm_db *pDb, int *piBlk){
-  Database *p = pDb->pDatabase;
+  Snapshot *p = pDb->pWorker;
   Freelist *pFree;                /* Database free list */
   int iRet = 0;                   /* Block number of allocated block */
+  int rc = LSM_OK;                /* Return code */
+
+  assert( pDb->pWorker );
+ 
   pFree = &p->freelist;
-
   if( pFree->nEntry>0 ){
     /* The first block on the free list was freed as part of the work done
     ** to create the snapshot with id iFree. So, we can reuse this block if
     ** snapshot iFree or later has been checkpointed and all currently 
-    ** active clients are reading from snapshot iFree or later.
-    */
-    Snapshot *pIter;
+    ** active clients are reading from snapshot iFree or later.  */
     i64 iFree = pFree->aEntry[0].iId;
-    i64 iInUse;
-
-    /* Both Database.iCheckpointId and the Database.pClient list are 
-    ** protected by the client mutex. So grab it here before determining
-    ** the id of the oldest snapshot still potentially in use.  */
-    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-    assertSnapshotListOk(p);
-    for(pIter=p->pClient; pIter->pSnapshotNext; pIter=pIter->pSnapshotNext);
-    iInUse = LSM_MIN(pIter->iId, p->iCheckpointId);
-    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-
-    if( 0 ){
-      int i;
-      printf("choose from freelist: ");
-      for(i=0; i<pFree->nEntry && pFree->aEntry[i].iId<=iInUse; i++){
-        printf("%d ", pFree->aEntry[i].iBlk);
-      }
-      printf("\n");
-      fflush(stdout);
-    }
-
-
-    if( iFree<=iInUse ){
+    int bInUse = 0;
+
+    /* The "is in use" bit: lsmLsmInUse() sets bInUse for snapshot id iFree */
+    rc = lsmLsmInUse(pDb, iFree, &bInUse);
+
+    /* The "has been checkpointed" bit: in use unless synced id >= iFree */
+    if( rc==LSM_OK && bInUse==0 ){
+      i64 iId = 0;
+      rc = lsmCheckpointSynced(pDb, &iId);
+      if( rc!=LSM_OK || iId<iFree ) bInUse = 1;
+      if( rc==LSM_BUSY ) rc = LSM_OK;   /* Busy: fall back to a new block */
+    }
+
+    if( rc==LSM_OK && bInUse==0 ){
       iRet = pFree->aEntry[0].iBlk;
       flRemoveEntry0(pFree);
       assert( iRet!=0 );
-      if( p->bRecordDelta ){
-        p->aDelta[0]++;
-      }
     }
   }
 
   /* If no block was allocated from the free-list, allocate one at the
   ** end of the file. */
-  if( iRet==0 ){
-    p->nBlock++;
-    iRet = p->nBlock;
+  if( rc==LSM_OK && iRet==0 ){
+    iRet = ++pDb->pWorker->nBlock;
   }
 
   *piBlk = iRet;
-  return LSM_OK;
+  return rc;
 }
@@ -820,20 +455,16 @@
 **
 ** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. 
 ** LSM_NOMEM).
 */
 int lsmBlockFree(lsm_db *pDb, int iBlk){
-  Database *p = pDb->pDatabase;
-  Snapshot *pWorker = pDb->pWorker;
-  int rc = LSM_OK;
-
-  assertMustbeWorker(pDb);
-  assert( p->bRecordDelta==0 );
-  assert( pDb->pDatabase->bDirty );
-
-  rc = flAppendEntry(pDb->pEnv, &p->freelist, iBlk, pWorker->iId);
-  return rc;
+  Snapshot *p = pDb->pWorker;
+
+  assert( lsmShmAssertWorker(pDb) );
+  /* TODO: Should assert() that lsmCheckpointOverflow() has not been called */
+
+  return lsmFreelistAppend(pDb->pEnv, &p->freelist, iBlk, p->iId);
 }
 
 /*
 ** Refree a database block. The worker snapshot must be held in order to call 
 ** this function.
@@ -844,278 +475,201 @@
 ** block may be reused immediately. Whereas a freed block can not be reused 
 ** until (at least) after the next checkpoint.
 */
 int lsmBlockRefree(lsm_db *pDb, int iBlk){
   int rc = LSM_OK;                /* Return code */
-  Database *p = pDb->pDatabase;
+  Snapshot *p = pDb->pWorker;
 
   if( iBlk==p->nBlock ){
     p->nBlock--;
-  }else if( p->bRecordDelta ){
-    assert( p->aDelta[2]==0 );
-    p->aDelta[1 + (p->aDelta[1]!=0)] = iBlk;
   }else{
-    rc = flAppendEntry(pDb->pEnv, &p->freelist, iBlk, 0);
-  }
-
-  return rc;
-}
-
-void lsmFreelistDeltaBegin(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  assertMustbeWorker(pDb);
-  assert( p->bRecordDelta==0 );
-  memset(p->aDelta, 0, sizeof(p->aDelta));
-  p->bRecordDelta = 1;
-}
-
-void lsmFreelistDeltaEnd(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  assertMustbeWorker(pDb);
-  p->bRecordDelta = 0;
-}
-
-void lsmFreelistDelta(
-  lsm_db *pDb,                    /* Database handle */
-  u32 *aDeltaOut                  /* OUT: Copy free-list delta here */
-){
-  Database *p = pDb->pDatabase;
-  assertMustbeWorker(pDb);
-  assert( sizeof(p->aDelta)==(sizeof(u32)*LSM_FREELIST_DELTA_SIZE) );
-  memcpy(aDeltaOut, p->aDelta, sizeof(p->aDelta));
-}
-
-u32 *lsmFreelistDeltaPtr(lsm_db *pDb){
-  return pDb->pDatabase->aDelta;
-}
-
-/*
-** Return the current contents of the free-list as a list of integers.
-*/
-int lsmSnapshotFreelist(lsm_db *pDb, int **paFree, int *pnFree){
-  int rc = LSM_OK;                /* Return Code */
-  int *aFree = 0;                 /* Integer array to return via *paFree */
-  int nFree;                      /* Value to return via *pnFree */
-  Freelist *p;                    /* Database free list object */
-
-  assert( pDb->pWorker );
-  p = &pDb->pDatabase->freelist;
-  nFree = p->nEntry;
-  if( nFree && paFree ){
-    aFree = lsmMallocRc(pDb->pEnv, sizeof(int) * nFree, &rc);
-    if( aFree ){
-      int i;
-      for(i=0; i<nFree; i++){
-        aFree[i] = p->aEntry[i].iBlk;
-      }
-    }
-  }
-
-  *pnFree = nFree;
-  if( paFree ) *paFree = aFree;
-  return rc;
-}
-
-
-int lsmSnapshotSetFreelist(lsm_db *pDb, int *aElem, int nElem){
-  Database *p = pDb->pDatabase;
-  lsm_env *pEnv = pDb->pEnv;
-  int rc = LSM_OK;                /* Return code */
-  int i;                          /* Iterator variable */
-  int nIgnore;                    /* Number of entries to ignore */
-  int iRefree1;                   /* A refreed block (or 0) */
-  int iRefree2;                   /* A refreed block (or 0) */
-  Freelist *pFree;                /* Database free-list */
-
-  nIgnore = p->aDelta[0];
-  iRefree1 = p->aDelta[1];
-  iRefree2 = p->aDelta[2];
-
-  pFree = &p->freelist;
-  for(i=nIgnore; rc==LSM_OK && i<nElem; i++){
-    rc = flAppendEntry(pEnv, pFree, aElem[i], 0);
-  }
-
-  if( rc==LSM_OK && iRefree1!=0 ) rc = flAppendEntry(pEnv, pFree, iRefree1, 0);
-  if( rc==LSM_OK && iRefree2!=0 ) rc = flAppendEntry(pEnv, pFree, iRefree2, 0);
-
-  return rc;
-}
-
-/*
-** If required, store a new database checkpoint.
-**
-** The worker mutex must not be held when this is called. This is because
-** this function may indirectly call fsync(). And the worker mutex should
+    rc = flInsertEntry(pDb->pEnv, &p->freelist, iBlk);
+  }
+
+  return rc;
+}
+
+/*
+** If required, copy a database checkpoint from shared memory into the
+** database itself.
+**
+** The WORKER lock must not be held when this is called. This is because
+** this function may indirectly call fsync(). And the WORKER lock should
 ** not be held that long (in case it is required by a client flushing an
 ** in-memory tree to disk).
 */
 int lsmCheckpointWrite(lsm_db *pDb){
-  Snapshot *pSnap;                /* Snapshot to checkpoint */
-  Database *p = pDb->pDatabase;
-  int rc = LSM_OK;                /* Return Code */
+  int rc;                         /* Return Code */
 
   assert( pDb->pWorker==0 );
-
-  /* Try to obtain the checkpointer lock, then check if the a checkpoint
-  ** is actually required. If successful, and one is, set stack variable
-  ** pSnap to point to the client snapshot to checkpoint.  
-  */
-  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-  pSnap = p->pClient;
-  if( pSnap->pExport && p->bCheckpointer==0 && pSnap->iId>p->iCheckpointId ){
-    p->bCheckpointer = 1;
-    pSnap->nRef++;
-  }else{
-    pSnap = 0;
-  }
-  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-
-  /* Attempt to grab the checkpoint mutex. If the attempt fails, this 
-  ** function becomes a no-op. Some other thread is already running
-  ** a checkpoint (or at least checking if one is required).  */
-  if( pSnap ){
-    FileSystem *pFS = pDb->pFS;   /* File system object */
-    int iPg = 1+(p->iSlot%2);     /* Meta page to write to */
-    MetaPage *pPg = 0;            /* Page to write to */
-    int doSync;                   /* True to sync the db */
-
-    /* If the safety mode is "off", omit calls to xSync(). */
-    doSync = (pDb->eSafety!=LSM_SAFETY_OFF);
-
-    /* Sync the db. To make sure all runs referred to by the checkpoint
-    ** are safely on disk. If we do not do this and a power failure occurs 
-    ** just after the checkpoint is written into the db header, the
-    ** database could be corrupted following recovery.  */
-    if( doSync ) rc = lsmFsSyncDb(pFS);
-
-    /* Fetch a reference to the meta-page to write the checkpoint to. */
-    if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pFS, 1, iPg, &pPg);
-
-    /* Unless an error has occurred, copy the checkpoint blob into the
-    ** meta-page, then release the reference to it (which will flush the
-    ** checkpoint into the file).  */
-    if( rc!=LSM_OK ){
-      lsmFsMetaPageRelease(pPg);
-    }else{
-      u8 *aData;                  /* Page buffer */
-      int nData;                  /* Size of buffer aData[] */
-      aData = lsmFsMetaPageData(pPg, &nData);
-      assert( pSnap->nExport<=nData );
-      memcpy(aData, pSnap->pExport, pSnap->nExport);
-      rc = lsmFsMetaPageRelease(pPg);
-      pPg = 0;
-    }
-
-    /* Sync the db file again. To make sure that the checkpoint just 
-    ** written is on the disk.  */
-    if( rc==LSM_OK && doSync ) rc = lsmFsSyncDb(pFS);
-
-    /* This is where space on disk is reclaimed. Now that the checkpoint 
-    ** has been written to the database and synced, part of the database
-    ** log (the part containing the data just synced to disk) is no longer
-    ** required and so the space that it was taking up on disk can be 
-    ** reused.
-    **
-    ** It is also possible that database file blocks may be made available
-    ** for reuse here. A database file block is free if it is not used by
-    ** the most recently checkpointed snapshot, or by a snapshot that is 
-    ** in use by any existing database client. And "the most recently
-    ** checkpointed snapshot" has just changed.
-    */
-    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-    if( rc==LSM_OK ){
-      lsmLogCheckpoint(pDb, &p->log, lsmCheckpointLogOffset(pSnap->pExport));
-      p->iCheckpointId = pSnap->iId;
-      p->iSlot = iPg;
-    }
-    p->bCheckpointer = 0;
-    snapshotDecrRefcnt(pDb->pEnv, pSnap);
-    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-  }
-
-  return rc;
-}
-
-/*
-** This function is called when a connection is about to run log file
-** recovery (read the contents of the log file from disk and create a new
-** in memory tree from it). This happens when the very first connection
-** starts up and connects to the database.
-**
-** This sets the connections tree-version handle to one suitable to insert
-** the read data into.
-**
-** Once recovery is complete (regardless of whether or not it is successful),
-** lsmFinishRecovery() must be called to release resources locked by
-** this function.
-*/
-int lsmBeginRecovery(lsm_db *pDb){
-  int rc;                         /* Return code */
-  Database *p = pDb->pDatabase;   /* Shared data handle */
-
-  assert( p && p->pTree==0 );
-  assert( pDb->pWorker );
-  assert( pDb->pClient==0 );
-  assert( pDb->pTV==0 );
-  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
-
-  rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
-  if( rc==LSM_OK ){
-    assert( pDb->pTV==0 );
-    rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
-  }
-  return rc;
-}
+  assert( 1 || pDb->pClient==0 );
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );
+
+  rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
+  if( rc!=LSM_OK ) return rc;
+
+  rc = lsmCheckpointLoad(pDb);
+  if( rc==LSM_OK ){
+    ShmHeader *pShm = pDb->pShmhdr;
+    int bDone = 0;                /* True if checkpoint is already stored */
+
+    /* Check if this checkpoint has already been written to the database
+    ** file. If so, set variable bDone to true.  */
+    if( pShm->iMetaPage ){
+      MetaPage *pPg;              /* Meta page */
+      u8 *aData;                  /* Meta-page data buffer */
+      int nData;                  /* Size of aData[] in bytes */
+      i64 iCkpt;                  /* Id of checkpoint just loaded */
+      i64 iDisk;                  /* Id of checkpoint already stored in db */
+      iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);
+      rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg);
+      if( rc==LSM_OK ){
+        aData = lsmFsMetaPageData(pPg, &nData);
+        iDisk = lsmCheckpointId((u32 *)aData, 1);
+        lsmFsMetaPageRelease(pPg);
+      }
+      bDone = (iDisk>=iCkpt);
+    }
+
+    if( rc==LSM_OK && bDone==0 ){
+      int iMeta = (pShm->iMetaPage % 2) + 1;
+      rc = lsmFsSyncDb(pDb->pFS);
+      if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta);
+      if( rc==LSM_OK ) rc = lsmFsSyncDb(pDb->pFS);
+      if( rc==LSM_OK ) pShm->iMetaPage = iMeta;
+    }
+  }
+
+  /* If no error has occurred, then the snapshot currently in pDb->aSnapshot
+  ** has been synced to disk. This means it may be possible to wrap the
+  ** log file. Obtain the WRITER lock and update the relevant tree-header
+  ** fields to reflect this. 
+  */
+  if( rc==LSM_OK ){
+    u64 iLogoff = lsmCheckpointLogOffset(pDb->aSnapshot);
+    if( pDb->nTransOpen==0 ){
+      rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
+    }
+    if( rc==LSM_OK ){
+      rc = lsmTreeLoadHeader(pDb);
+      if( rc==LSM_OK ) lsmLogCheckpoint(pDb, iLogoff);
+      if( rc==LSM_OK ) lsmTreeEndTransaction(pDb, 1);
+      if( rc==LSM_BUSY ) rc = LSM_OK;
+      if( pDb->nTransOpen==0 ){
+        rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+      }
+    }
+    if( rc==LSM_BUSY ) rc = LSM_OK;
+  }
+
+  lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
+  return rc;
+}
+
+int lsmBeginWork(lsm_db *pDb){
+  int rc;
+
+  /* Attempt to take the WORKER lock */
+  rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
+
+  /* Deserialize the current worker snapshot */
+  if( rc==LSM_OK ){
+    rc = lsmCheckpointLoadWorker(pDb);
+    if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase;
+  }
+  return rc;
+}
+
+void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
+  if( p ){
+    lsmSortedFreeLevel(pEnv, p->pLevel);
+    lsmFree(pEnv, p->freelist.aEntry);
+    lsmFree(pEnv, p);
+  }
+}
+
+/*
+** Argument bFlush is true if the contents of the in-memory tree has just
+** been flushed to disk. The significance of this is that once the snapshot
+** created to hold the updated state of the database is synced to disk, log
+** file space can be recycled.
+*/
+void lsmFinishWork(lsm_db *pDb, int bFlush, int nOvfl, int *pRc){
+  /* If no error has occurred, serialize the worker snapshot and write
+  ** it to shared memory.  */
+  if( *pRc==LSM_OK ){
+    *pRc = lsmCheckpointSaveWorker(pDb, bFlush, nOvfl);
+  }
+
+  if( pDb->pWorker ){
+    lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
+    pDb->pWorker = 0;
+  }
+
+  lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
+}
+
 
 /*
 ** Called when recovery is finished.
 */
 int lsmFinishRecovery(lsm_db *pDb){
-  int rc;
-  assert( pDb->pWorker );
-  assert( pDb->pClient==0 );
-  assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) );
-  rc = lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0);
-  pDb->pTV = 0;
-  return rc;
+  lsmTreeEndTransaction(pDb, 1);
+  return LSM_OK;
 }
 
 /*
 ** Begin a read transaction. This function is a no-op if the connection
 ** passed as the only argument already has an open read transaction.
 */
 int lsmBeginReadTrans(lsm_db *pDb){
+  const int MAX_READLOCK_ATTEMPTS = 5;
   int rc = LSM_OK;                /* Return code */
+  int iAttempt = 0;
 
-  /* No reason a worker connection should be opening a read-transaction. */
   assert( pDb->pWorker==0 );
+  assert( (pDb->pClient!=0)==(pDb->iReader>=0) );
 
-  if( pDb->pClient==0 ){
-    Database *p = pDb->pDatabase;
-    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-
+  while( rc==LSM_OK && pDb->pClient==0 && (iAttempt++)<MAX_READLOCK_ATTEMPTS ){
     assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
 
-    /* If there is no in-memory tree structure, allocate one now */
-    if( p->pTree==0 ){
-      rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
+    /* Load the in-memory tree header. */
+    rc = lsmTreeLoadHeader(pDb);
+
+    /* Load the database snapshot */
+    if( rc==LSM_OK ){
+      rc = lsmCheckpointLoad(pDb);
     }
 
+    /* Take a read-lock on the tree and snapshot just loaded. Then check
+    ** that the shared-memory still contains the same values. If so, proceed.
+    ** Otherwise, relinquish the read-lock and retry the whole procedure
+    ** (starting with loading the in-memory tree header).  */
     if( rc==LSM_OK ){
-      /* Set the connections client database file snapshot */
-      p->pClient->nRef++;
-      pDb->pClient = p->pClient;
-
-      /* Set the connections tree-version handle */
-      assert( pDb->pTV==0 );
-      pDb->pTV = lsmTreeReadVersion(p->pTree);
-      assert( pDb->pTV!=0 );
-    }
-
-    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-  }
+      ShmHeader *pShm = pDb->pShmhdr;
+      i64 iTree = pDb->treehdr.iTreeId;
+      i64 iSnap = lsmCheckpointId(pDb->aSnapshot, 0);
+      rc = lsmReadlock(pDb, iSnap, iTree);
+      if( rc==LSM_OK ){
+        if( (i64)pShm->hdr1.iTreeId==iTree 
+         && pShm->hdr1.iTransId==pDb->treehdr.iTransId
+         && lsmCheckpointId(pShm->aClient, 0)==iSnap
+        ){
+          /* Read lock has been successfully obtained. Deserialize the 
+          ** checkpoint just loaded. TODO: This will be removed after 
+          ** lsm_sorted.c is changed to work directly from the serialized
+          ** version of the snapshot.  */
+          rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient);
+          assert( (rc==LSM_OK)==(pDb->pClient!=0) );
+        }else{
+          rc = lsmReleaseReadlock(pDb);
+        }
+      }
+      if( rc==LSM_BUSY ) rc = LSM_OK;
+    }
+  }
+  if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;
 
   return rc;
 }
 
 /*
@@ -1124,79 +678,67 @@
 void lsmFinishReadTrans(lsm_db *pDb){
   Snapshot *pClient = pDb->pClient;
 
   /* Worker connections should not be closing read transactions. And
   ** read transactions should only be closed after all cursors and write
-  ** transactions have been closed.  */
+  ** transactions have been closed. Finally, pClient should be non-NULL
+  ** if and only if pDb->iReader>=0.  */
   assert( pDb->pWorker==0 );
   assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
 
   if( pClient ){
-    Database *p = pDb->pDatabase;
-
-    lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient);
+    lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
     pDb->pClient = 0;
-
-    /* Release the in-memory tree version */
-    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-    lsmTreeReleaseReadVersion(pDb->pEnv, pDb->pTV);
-    pDb->pTV = 0;
-    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
   }
+  if( pDb->iReader>=0 ) lsmReleaseReadlock(pDb);
+  assert( (pDb->pClient!=0)==(pDb->iReader>=0) );
 }
 
 /*
 ** Open a write transaction.
 */
 int lsmBeginWriteTrans(lsm_db *pDb){
-  int rc = LSM_OK;                /* Return code */
-  Database *p = pDb->pDatabase;   /* Shared database object */
-
-  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-  assert( p->pTree );
-  assert( (pDb->pTV==0)==(pDb->pClient==0) );
-
-  /* There are two reasons the attempt to open a write transaction may fail:
-  **
-  **   1. There is already a writer.
-  **   2. Connection pDb already has an open read transaction, and the read
-  **      snapshot is not the most recent version of the database.
-  **
-  ** If condition 1 is true, then the Database.bWriter flag is set. If the
-  ** second is true, then the call to lsmTreeWriteVersion() returns NULL.
-  */
-  if( p->bWriter ){
+  int rc;                         /* Return code */
+  ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */
+
+  assert( pDb->nTransOpen==0 );
+
+  /* If there is no read-transaction open, open one now. */
+  rc = lsmBeginReadTrans(pDb);
+
+  /* Attempt to take the WRITER lock */
+  if( rc==LSM_OK ){
+    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
+  }
+
+  /* If the previous writer failed mid-transaction, run emergency rollback. */
+  if( rc==LSM_OK && pShm->bWriter ){
+    /* TODO: This! */
+    assert( 0 );
+    rc = LSM_CORRUPT_BKPT;
+  }
+
+  /* Check that this connection is currently reading from the most recent
+  ** version of the database. If not, return LSM_BUSY.  */
+  if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
     rc = LSM_BUSY;
-  }else{
-    rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
-  }
-
-  if( rc==LSM_OK ){
-    rc = lsmLogBegin(pDb, &p->log);
-
-    if( rc!=LSM_OK ){
-      /* If the call to lsmLogBegin() failed, relinquish the read/write
-      ** TreeVersion handle obtained above. The attempt to open a transaction
-      ** has failed.  */
-      TreeVersion *pWrite = pDb->pTV;
-      TreeVersion **ppRestore = (pDb->pClient ? &pDb->pTV : 0);
-      pDb->pTV = 0;
-      lsmTreeReleaseWriteVersion(pDb->pEnv, pWrite, 0, ppRestore);
-    }else if( pDb->pClient==0 ){
-      /* Otherwise, if the lsmLogBegin() attempt was successful and the 
-      ** client did not have a read transaction open when this function
-      ** was called, lsm_db.pClient will still be NULL. In this case, grab 
-      ** a reference to the lastest checkpointed snapshot now.  */
-      p->pClient->nRef++;
-      pDb->pClient = p->pClient;
-    }
-  }
-
-  if( rc==LSM_OK ){
-    p->bWriter = 1;
-  }
-  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+  }
+
+  if( rc==LSM_OK ){
+    rc = lsmLogBegin(pDb);
+  }
+
+  /* If everything was successful, set the "transaction-in-progress" flag
+  ** and return LSM_OK. Otherwise, if some error occurred, relinquish the 
+  ** WRITER lock and return an error code.  */
+  if( rc==LSM_OK ){
+    pShm->bWriter = 1;
+    pDb->treehdr.iTransId++;
+  }else{
+    lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+    if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
+  }
   return rc;
 }
 
 /*
 ** End the current write transaction. The connection is left with an open
@@ -1210,111 +752,394 @@
 ** merely releases locks and other resources held by the write-transaction.
 **
 ** LSM_OK is returned if successful, or an LSM error code otherwise.
 */
 int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
-  Database *p = pDb->pDatabase;
-  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-
-  assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) );
-  assert( p->bWriter );
-  p->bWriter = 0;
-  lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, bCommit, &pDb->pTV);
-
-  lsmLogEnd(pDb, &p->log, bCommit);
-  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-  return LSM_OK;
-}
-
-
-/*
-** This function is called at the beginning of a flush operation (i.e. when
-** flushing the contents of the in-memory tree to a segment on disk).
-**
-** The caller must already be the worker connection.
-**
-** Also, the caller must have an open write transaction or be in the process
-** of shutting down the (shared) database connection. This means we don't
-** have to worry about any other connection modifying the in-memory tree
-** structure while it is being flushed (although some other clients may be
-** reading from it).
-*/
-int lsmBeginFlush(lsm_db *pDb){
-
-  assert( pDb->pWorker );
-  assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pDb->pTV))
-       || (pDb->pTV==0 && holdingGlobalMutex(pDb->pEnv))
-  );
-
-  if( pDb->pTV==0 ){
-    pDb->pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree);
-  }
-  return LSM_OK;
-}
-
-int lsmDbTreeSize(lsm_db *pDb){
-  TreeVersion *pTV = pDb->pTV;
-
-  assert( pDb->pWorker );
-  assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pTV))
-       || (pTV==0 && holdingGlobalMutex(pDb->pEnv))
-  );
-  if( pTV==0 ) pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree);
-
-  return lsmTreeSize(pTV);
-}
-
-/*
-** This is called to indicate that a "flush-tree" operation has finished.
-** If the second argument is true, a new in-memory tree is allocated to
-** hold subsequent writes.
-*/
-int lsmFinishFlush(lsm_db *pDb, int bEmpty){
-  Database *p = pDb->pDatabase;
-  int rc = LSM_OK;
-
-  assert( pDb->pWorker );
-  assert( pDb->pTV && (p->nDbRef==0 || lsmTreeIsWriteVersion(pDb->pTV)) );
-  lsmMutexEnter(pDb->pEnv, p->pClientMutex);
-
-  if( bEmpty ){
-    if( p->bWriter ){
-      lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0);
-    }
-    pDb->pTV = 0;
-    lsmTreeRelease(pDb->pEnv, p->pTree);
-
-    if( p->nDbRef>0 ){
-      rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree);
-    }else{
-      /* This is the case if the Database object is being deleted */
-      p->pTree = 0;
-    }
-  }
-
-  if( p->bWriter ){
-    assert( pDb->pClient );
-    if( 0==pDb->pTV ) rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV);
-  }else{
-    pDb->pTV = 0;
-  }
-  lsmMutexLeave(pDb->pEnv, p->pClientMutex);
-  return rc;
-}
-
-/*
-** Return a pointer to the DbLog object associated with connection pDb.
-** Allocate and initialize it if necessary.
-*/
-DbLog *lsmDatabaseLog(lsm_db *pDb){
-  Database *p = pDb->pDatabase;
-  return &p->log;
-}
+  lsmLogEnd(pDb, bCommit);
+  lsmTreeEndTransaction(pDb, bCommit);
+  lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+  return LSM_OK;
+}
+
 
 /*
 ** Return non-zero if the caller is holding the client mutex.
 */
 #ifdef LSM_DEBUG
 int lsmHoldingClientMutex(lsm_db *pDb){
   return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
 }
 #endif
+
+/*
+** Obtain a read-lock on database version identified by the combination
+** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or
+** an LSM error code otherwise.
+*/
+int lsmReadlock(lsm_db *db, i64 iLsm, i64 iTree){
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+  int rc = LSM_OK;
+
+  assert( db->iReader<0 );
+
+  /* Search for an exact match. */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId==iLsm && p->iTreeId==iTree ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iTree ){
+        db->iReader = i;
+      }else if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+  }
+
+  /* Try to obtain a write-lock on each slot, in order. If successful, set
+  ** the slot values to iLsm/iTree and downgrade to a read-lock.  */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      rc = LSM_OK;
+    }else if( rc==LSM_OK ){       /* Never write the slot without the lock */
+      ShmReader *p = &pShm->aReader[i];
+      p->iLsmId = iLsm;
+      p->iTreeId = iTree;
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK ) db->iReader = i;
+    }
+  }
+
+  /* Search for any usable slot */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK ){
+        if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){
+          db->iReader = i;
+        }
+      }else if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+  }
+  /* No slot obtained: report LSM_BUSY, not success without a read-lock. */
+  return (rc==LSM_OK && db->iReader<0) ? LSM_BUSY : rc;
+}
+
+static int isInUse(lsm_db *db, i64 iLsm, i64 iTree, int *pbInUse){
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+  int rc = LSM_OK;
+
+  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId && p->iTreeId && (p->iTreeId<=iTree || p->iLsmId<=iLsm) ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+      if( rc==LSM_OK ){
+        p->iTreeId = p->iLsmId = 0;
+        lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+      }
+    }
+  }
+
+  if( rc==LSM_BUSY ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  *pbInUse = 0;
+  return rc;
+}
+
+int lsmTreeInUse(lsm_db *db, u32 iTreeId, int *pbInUse){
+  if( db->treehdr.iTreeId==iTreeId ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  return isInUse(db, 0, (i64)iTreeId, pbInUse);
+}
+
+int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
+  if( db->pClient && db->pClient->iId<=iLsmId ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  return isInUse(db, iLsmId, 0, pbInUse);
+}
+
+/*
+** Release the read-lock currently held by connection db.
+*/
+int lsmReleaseReadlock(lsm_db *db){
+  int rc = LSM_OK;
+  if( db->iReader>=0 ){
+    rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0);
+    db->iReader = -1;
+  }
+  return rc;
+}
+
+/*
+** This function may only be called after a successful call to
+** lsmDbDatabaseConnect(). It returns true if the connection is in
+** multi-process mode, or false otherwise.
+*/
+int lsmDbMultiProc(lsm_db *pDb){
+  return pDb->pDatabase && (pDb->pDatabase->pFile!=0);
+}
+
+void lsmDbDeferredClose(lsm_db *pDb, lsm_file *pFile, LsmFile *pLsmFile){
+  Database *p = pDb->pDatabase;
+  lsm_env *pEnv = pDb->pEnv;
+
+  lsmMutexEnter(pEnv, p->pClientMutex);
+  pLsmFile->pFile = pFile;
+  pLsmFile->pNext = p->pLsmFile;
+  p->pLsmFile = pLsmFile;
+  lsmMutexLeave(pEnv, p->pClientMutex);
+}
+
+
+/*************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+*************************************************************************/
+
+/*
+** Retrieve a pointer to shared-memory chunk iChunk. Chunks are numbered
+** starting from 0 (i.e. the header chunk is chunk 0).
+*/
+int lsmShmChunk(lsm_db *db, int iChunk, void **ppData){
+  int rc = LSM_OK;
+  void *pRet = 0;
+  Database *p = db->pDatabase;
+  lsm_env *pEnv = db->pEnv;
+
+  /* Enter the client mutex */
+  assert( iChunk>=0 );
+  lsmMutexEnter(pEnv, p->pClientMutex);
+
+  if( iChunk>=p->nShmChunk ){
+    int nNew = iChunk+1;
+    void **apNew;
+    apNew = (void **)lsmRealloc(pEnv, p->apShmChunk, sizeof(void*) * nNew);
+    if( apNew==0 ){
+      rc = LSM_NOMEM_BKPT;
+    }else{
+      memset(&apNew[p->nShmChunk], 0, sizeof(void*) * (nNew-p->nShmChunk));
+      p->apShmChunk = apNew;
+      p->nShmChunk = nNew;
+    }
+  }
+
+  if( rc==LSM_OK && p->apShmChunk[iChunk]==0 ){
+    void *pChunk = 0;
+    if( p->pFile==0 ){
+      /* Single process mode */
+      pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+    }else{
+      /* Multi-process mode */
+      rc = lsmEnvShmMap(pEnv, p->pFile, iChunk, LSM_SHM_CHUNK_SIZE, &pChunk);
+    }
+    p->apShmChunk[iChunk] = pChunk;
+  }
+
+  if( rc==LSM_OK ){
+    pRet = p->apShmChunk[iChunk];
+  }
+
+  /* Release the client mutex */
+  lsmMutexLeave(pEnv, p->pClientMutex);
+
+  *ppData = pRet; 
+  return rc;
+}
+
+/*
+** Attempt to obtain (or release) lock iLock in the mode given by eOp. Return
+** LSM_OK if successful, LSM_BUSY if a conflicting lock exists, or some other
+** LSM error code if an error occurs. If a SHARED or EXCL request fails, the
+** lock state recorded in db->mLock is left unchanged.
+**
+** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
+** or else a value returned by the LSM_LOCK_READER macro.
+*/
+int lsmShmLock(
+  lsm_db *db,
+  int iLock,
+  int eOp,                        /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
+  int bBlock                      /* True for a blocking lock (unused here) */
+){
+  lsm_db *pIter;
+  const u32 me = ((u32)1 << (iLock-1));     /* db->mLock bit: EXCL on iLock */
+  const u32 ms = ((u32)1 << (iLock+16-1));  /* db->mLock bit: SHARED on iLock */
+  int rc = LSM_OK;
+  Database *p = db->pDatabase;
+
+  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
+  assert( iLock<=16 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  /* Check for a no-op. Proceed only if this is not one of those. */
+  if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
+   || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
+   || (eOp==LSM_LOCK_EXCL   && (db->mLock & me)==0)
+  ){
+    int nExcl = 0;                /* Number of connections holding EXCLUSIVE */
+    int nShared = 0;              /* Number of connections holding SHARED */
+    lsmMutexEnter(db->pEnv, p->pClientMutex);
+
+    /* Figure out the locks currently held by this process on iLock, not
+    ** including any held by connection db.  */
+    for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+      assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
+      if( pIter!=db ){
+        if( pIter->mLock & me ){
+          nExcl++;
+        }else if( pIter->mLock & ms ){
+          nShared++;
+        }
+      }
+    }
+    assert( nExcl==0 || nExcl==1 );
+    assert( nExcl==0 || nShared==0 );
+    assert( nExcl==0 || (db->mLock & (me|ms))==0 );
+
+    switch( eOp ){
+      case LSM_LOCK_UNLOCK:
+        if( nShared==0 ){
+          lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_UNLOCK);
+        }
+        db->mLock &= ~(me|ms);
+        break;
+
+      case LSM_LOCK_SHARED:
+        if( nExcl ){
+          rc = LSM_BUSY;
+        }else{
+          if( nShared==0 ){
+            rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_SHARED);
+          }
+          /* Record the (possibly downgraded) lock only on success */
+          if( rc==LSM_OK ) db->mLock = (db->mLock | ms) & ~me;
+        }
+        break;
+
+      default:
+        assert( eOp==LSM_LOCK_EXCL );
+        if( nExcl || nShared ){
+          rc = LSM_BUSY;
+        }else{
+          rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_EXCL);
+          if( rc==LSM_OK ) db->mLock |= (me|ms);
+        }
+        break;
+    }
+
+    lsmMutexLeave(db->pEnv, p->pClientMutex);
+  }
+
+  return rc;
+}
+
+#ifdef LSM_DEBUG
+
+int shmLockType(lsm_db *db, int iLock){
+  const u32 me = (1 << (iLock-1));
+  const u32 ms = (1 << (iLock+16-1));
+
+  if( db->mLock & me ) return LSM_LOCK_EXCL;
+  if( db->mLock & ms ) return LSM_LOCK_SHARED;
+  return LSM_LOCK_UNLOCK;
+}
+
+/*
+** The arguments passed to this function are similar to those passed to
+** the lsmShmLock() function. However, instead of obtaining a new lock 
+** this function returns true if the specified connection already holds 
+** (or does not hold) such a lock, depending on the value of eOp. As
+** follows:
+**
+**   (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
+**   (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
+**   (eOp==LSM_LOCK_EXCL)   -> true if db has an EXCLUSIVE lock on iLock.
+*/
+int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
+  int ret = 0;                    /* Return value (false for a bad eOp) */
+  int eHave;                      /* Lock type db currently holds on iLock */
+
+  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
+  assert( iLock<=16 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  eHave = shmLockType(db, iLock);
+
+  switch( eOp ){
+    case LSM_LOCK_UNLOCK:
+      ret = (eHave==LSM_LOCK_UNLOCK);
+      break;
+    case LSM_LOCK_SHARED:
+      ret = (eHave!=LSM_LOCK_UNLOCK);
+      break;
+    case LSM_LOCK_EXCL:
+      ret = (eHave==LSM_LOCK_EXCL);
+      break;
+    default:
+      assert( !"bad eOp value passed to lsmShmAssertLock()" );
+      break;
+  }
+
+  return ret;
+}
+
+int lsmShmAssertWorker(lsm_db *db){
+  return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker;
+}
+
+/*
+** This function does not contribute to library functionality, and is not
+** included in release builds. It is intended to be called from within
+** an interactive debugger.
+**
+** When called, this function prints a single line of human readable output
+** to stdout describing the locks currently held by the connection. For 
+** example:
+**
+**     (gdb) call print_db_locks(pDb)
+**     (shared on dms2) (exclusive on writer) 
+*/
+void print_db_locks(lsm_db *db){
+  int iLock;
+  int bOne = 0;                   /* True once the first lock is printed */
+  const char *azLock[] = {0, "shared", "exclusive"};
+  const char *azName[] = {        /* Indexed by lock id; ids start at 1 */
+    0, "dms1", "dms2", "writer", "worker", "checkpointer",
+    "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
+  };
+  for(iLock=1; iLock<(int)(sizeof(azName)/sizeof(azName[0])); iLock++){
+    int eHave = shmLockType(db, iLock);
+    if( azLock[eHave] ){
+      printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
+      bOne = 1;
+    }
+  }
+  printf("\n");
+}
+void print_all_db_locks(lsm_db *db){
+  lsm_db *p;
+  for(p=db->pDatabase->pConn; p; p=p->pNext){
+    printf("%s connection %p ", ((p==db)?"*":""), p);
+    print_db_locks(p);
+  }
+}
+#endif
+
+void lsmShmBarrier(lsm_db *db){
+  lsmEnvShmBarrier(db->pEnv);
+}
+
+
+

Index: src/lsm_sorted.c
==================================================================
--- src/lsm_sorted.c
+++ src/lsm_sorted.c
@@ -262,11 +262,11 @@
   BtreeCursor *pBtCsr;
 
   Snapshot *pSnap;
 
   /* Used by cursors flushing the in-memory tree only */
-  int nLsmLevel;                  /* Number of levels to store in LSM */
+  int *pnOvfl;                    /* Number of free-list entries to store */
   void *pSystemVal;               /* Pointer to buffer to free */
 };
 
 #define CURSOR_DATA_TREE      0
 #define CURSOR_DATA_SYSTEM    1
@@ -286,14 +286,10 @@
 **
 ** CURSOR_AT_FREELIST
 **   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
 **   pointing at a free list.
 **
-** CURSOR_AT_LEVELS
-**   This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually
-**   pointing at a free list.
-**
 ** CURSOR_IGNORE_SYSTEM
 **   If set, this cursor ignores system keys.
 **
 ** CURSOR_NEXT_OK
 **   Set if it is Ok to call lsm_csr_next().
@@ -302,11 +298,10 @@
 **   Set if it is Ok to call lsm_csr_prev().
 */
 #define CURSOR_IGNORE_DELETE    0x00000001
 #define CURSOR_NEW_SYSTEM       0x00000002
 #define CURSOR_AT_FREELIST      0x00000004
-#define CURSOR_AT_LEVELS        0x00000008
 #define CURSOR_IGNORE_SYSTEM    0x00000010
 #define CURSOR_NEXT_OK          0x00000020
 #define CURSOR_PREV_OK          0x00000040
 
 typedef struct MergeWorker MergeWorker;
@@ -485,10 +480,19 @@
 }
 
 static u8 *pageGetCell(u8 *aData, int nData, int iCell){
   return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])];
 }
+
+/*
+** Return the number of cells on page pPg.
+*/
+static int pageObjGetNRec(Page *pPg){
+  int nData;
+  u8 *aData = lsmFsPageData(pPg, &nData);
+  return pageGetNRec(aData, nData);
+}
 
 /*
 ** Return the decoded (possibly relative) pointer value stored in cell 
 ** iCell from page aData/nData.
 */
@@ -567,10 +571,11 @@
   u8 *aCell;
   int eType;
 
   aData = fsPageData(pPg, &nData);
   assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) );
+  assert( iKey>=0 && iKey<pageGetNRec(aData, nData) );
 
   aCell = pageGetCell(aData, nData, iKey);
   eType = *aCell++;
   aCell += lsmVarintGet32(aCell, piPtr);
 
@@ -599,16 +604,25 @@
   if( pCsr->iPg<0 ){
     pCsr->pKey = 0;
     pCsr->nKey = 0;
     pCsr->eType = 0;
   }else{
-    int dummy;
-    rc = pageGetBtreeKey(
-        pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell,
-        &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
-    );
-    pCsr->eType |= SORTED_SEPARATOR;
+    int iPg;
+    for(iPg=pCsr->iPg; iPg>=0; iPg--){
+      int iCell = pCsr->aPg[pCsr->iPg].iCell;
+      if( iCell>=0 ){
+        int dummy;
+        rc = pageGetBtreeKey(
+            pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell,
+            &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob
+        );
+        pCsr->eType |= SORTED_SEPARATOR;
+        break;
+      }
+    }
+
+    if( iPg<0 ) rc = LSM_CORRUPT_BKPT;
   }
 
   return rc;
 }
 
@@ -824,17 +838,27 @@
       Blob blob = {0,0,0};
       void *pSeek;
       int nSeek;
       int iTopicSeek;
       int dummy;
-
       int iPg = 0;
       int iLoad = pCsr->pSeg->iRoot;
-
-      rc = pageGetBtreeKey(pCsr->aPg[nDepth-1].pPage, 
-          0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
-      );
+      Page *pPg = pCsr->aPg[nDepth-1].pPage;
+ 
+      if( pageObjGetNRec(pPg)==0 ){
+        /* This can happen when pPg is the right-most leaf in the b-tree.
+        ** In this case, set the iTopicSeek/pSeek/nSeek key to a value
+        ** greater than any real key.  */
+        assert( iCell==-1 );
+        iTopicSeek = 1000;
+        pSeek = 0;
+        nSeek = 0;
+      }else{
+        rc = pageGetBtreeKey(pPg,
+            0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob
+        );
+      }
 
       do {
         Page *pPg;
         rc = lsmFsDbPageGet(pCsr->pFS, iLoad, &pPg);
         assert( rc==LSM_OK || pPg==0 );
@@ -1099,10 +1123,14 @@
 
     for(i=0; i<pLevel->nRight; i++){
       pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i];
     }
   }
+
+  if( nPtr>1 && pLevel->pSplitKey==0 ){
+    lsmSortedSplitkey(pDb, pLevel, &rc);
+  }
 
   return rc;
 }
 
 static int levelCursorInitRun(
@@ -1541,12 +1569,14 @@
   if( pLeft->pPg==0 ){
     iRet = 1;
   }else if( pRight->pPg==0 ){
     iRet = 0;
   }else{
-    int res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey);
-
+    int res = rtTopic(pLeft->eType) - rtTopic(pRight->eType);
+    if( res==0 ){
+      res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey);
+    }
     if( res==0 || (res<0 && bLargest==0) || (res>0 && bLargest) ){
       iRet = 0;
     }else{
       iRet = 1;
     }
@@ -1973,11 +2003,10 @@
     }
   }
 
   if( rc==LSM_OK ){
     if( useTree ){
-      assert( pDb->pTV );
       rc = lsmTreeCursorNew(pDb, &pCsr->pTreeCsr);
     }
     pCsr->pDb = pDb;
     pCsr->pSnap = pSnap;
     pCsr->xCmp = pDb->xCmp;
@@ -2026,12 +2055,13 @@
 /*
 ** If the free-block list is not empty, then have this cursor visit a key
 ** with (a) the system bit set, and (b) the key "F" and (c) a value blob
 ** containing the entire serialized free-block list.
 */
-static void multiCursorVisitFreelist(MultiCursor *pCsr){
+static void multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){
   assert( pCsr );
+  pCsr->pnOvfl = pnOvfl;
   pCsr->flags |= CURSOR_NEW_SYSTEM;
 }
 
 /*
 ** Allocate a new cursor to read the database (the in-memory tree and all
@@ -2116,15 +2146,10 @@
       if( pCsr->flags & CURSOR_AT_FREELIST ){
         pKey = (void *)"FREELIST";
         nKey = 8;
         eType = SORTED_SYSTEM_WRITE;
       }
-      else if( pCsr->flags & CURSOR_AT_LEVELS ){
-        pKey = (void *)"LEVELS";
-        nKey = 6;
-        eType = SORTED_SYSTEM_WRITE;
-      }
       break;
 
     default: {
       int iSeg = iKey - CURSOR_DATA_SEGMENT;
       if( iSeg==pCsr->nSegCsr && pCsr->pBtCsr ){
@@ -2158,21 +2183,17 @@
       *ppVal = 0;
       *pnVal = 0;
     }
   }else if( iVal==CURSOR_DATA_SYSTEM ){
     if( pCsr->flags & CURSOR_AT_FREELIST ){
-      int *aVal;
+      void *aVal;
       int nVal;
+
       assert( pCsr->pSystemVal==0 );
-      rc = lsmSnapshotFreelist(pCsr->pDb, &aVal, &nVal);
-      pCsr->pSystemVal = *ppVal = (void *)aVal;
-      *pnVal = sizeof(int) * nVal;
-      lsmFreelistDeltaBegin(pCsr->pDb);
-    }else if( (pCsr->flags & CURSOR_AT_LEVELS) && pCsr->nLsmLevel>0 ){
-      lsmFree(pCsr->pDb->pEnv, pCsr->pSystemVal);
-      lsmCheckpointLevels(pCsr->pDb, pCsr->nLsmLevel, ppVal, pnVal);
-      pCsr->pSystemVal = *ppVal;
+      rc = lsmCheckpointOverflow(pCsr->pDb, &aVal, &nVal, pCsr->pnOvfl);
+      *ppVal = pCsr->pSystemVal = aVal;
+      *pnVal = nVal;
     }else{
       *ppVal = 0;
       *pnVal = 0;
     }
   }else if( iVal-CURSOR_DATA_SEGMENT<pCsr->nSegCsr 
@@ -2185,48 +2206,43 @@
   }
   assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) );
   return rc;
 }
 
-int lsmSortedLoadSystem(lsm_db *pDb){
+int lsmSortedLoadFreelist(
+  lsm_db *pDb,                    /* Database handle (must be worker) */
+  void **ppVal,                   /* OUT: Blob containing LSM free-list */
+  int *pnVal                      /* OUT: Size of *ppVal blob in bytes */
+){
   MultiCursor *pCsr = 0;          /* Cursor used to retreive free-list */
   int rc;                         /* Return Code */
 
   assert( pDb->pWorker );
+  assert( *ppVal==0 && *pnVal==0 );
+
   rc = multiCursorAllocate(pDb, 1, &pCsr);
   if( rc==LSM_OK ){
-    void *pVal; int nVal;         /* Value read from database */
-
     rc = lsmMCursorLast(pCsr);
-    if( rc==LSM_OK 
-     && pCsr->eType==SORTED_SYSTEM_WRITE 
-     && pCsr->key.nData==6 
-     && 0==memcmp(pCsr->key.pData, "LEVELS", 6)
-    ){
-      rc = lsmMCursorValue(pCsr, &pVal, &nVal);
-      if( rc==LSM_OK ){
-        rc = lsmCheckpointLoadLevels(pDb, pVal, nVal);
-      }
-      if( rc==LSM_OK ){
-        rc = lsmMCursorPrev(pCsr);
-      }
-    }
-
     if( rc==LSM_OK 
      && pCsr->eType==SORTED_SYSTEM_WRITE 
      && pCsr->key.nData==8 
      && 0==memcmp(pCsr->key.pData, "FREELIST", 8)
     ){
+      void *pVal; int nVal;         /* Value read from database */
       rc = lsmMCursorValue(pCsr, &pVal, &nVal);
       if( rc==LSM_OK ){
-        int n32 = nVal / sizeof(u32);
-        rc = lsmSnapshotSetFreelist(pDb, (int *)pVal, n32);
+        *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc);
+        if( *ppVal ){
+          memcpy(*ppVal, pVal, nVal);
+          *pnVal = nVal;
+        }
       }
     }
 
     lsmMCursorClose(pCsr);
   }
+
   return rc;
 }
 
 static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){
   int i1;
@@ -2425,11 +2441,10 @@
   if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE;
   assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE );
 
   assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 );
   assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 );
-  assert( (pCsr->flags & CURSOR_AT_LEVELS)==0 );
 
   pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK);
   lsmTreeCursorSeek(pCsr->pTreeCsr, pKey, nKey, &res);
   switch( eESeek ){
     case LSM_SEEK_EQ:
@@ -2554,20 +2569,14 @@
           rc = lsmTreeCursorPrev(pCsr->pTreeCsr);
         }else{
           rc = lsmTreeCursorNext(pCsr->pTreeCsr);
         }
       }else if( iKey==CURSOR_DATA_SYSTEM ){
-        assert( pCsr->flags & (CURSOR_AT_FREELIST | CURSOR_AT_LEVELS) );
+        assert( pCsr->flags & CURSOR_AT_FREELIST );
         assert( pCsr->flags & CURSOR_NEW_SYSTEM );
         assert( bReverse==0 );
-
-        if( pCsr->flags & CURSOR_AT_FREELIST ){
-          pCsr->flags &= ~CURSOR_AT_FREELIST;
-          pCsr->flags |= CURSOR_AT_LEVELS;
-        }else{
-          pCsr->flags &= ~CURSOR_AT_LEVELS;
-        }
+        pCsr->flags &= ~CURSOR_AT_FREELIST;
       }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nSegCsr) ){
         assert( bReverse==0 && pCsr->pBtCsr );
         rc = btreeCursorNext(pCsr->pBtCsr);
       }else{
         LevelCursor *pLevel = &pCsr->aSegCsr[iKey-CURSOR_DATA_SEGMENT];
@@ -3441,10 +3450,11 @@
   return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr);
 }
 
 static void sortedFreeLevel(lsm_env *pEnv, Level *p){
   if( p ){
+    lsmFree(pEnv, p->pSplitKey);
     lsmFree(pEnv, p->pMerge);
     lsmFree(pEnv, p->aRhs);
     lsmFree(pEnv, p);
   }
 }
@@ -3453,35 +3463,37 @@
   if( pDb->xWork ){
     pDb->xWork(pDb, pDb->pWorkCtx);
   }
 }
 
-int lsmSortedNewToplevel(
+static int sortedNewToplevel(
   lsm_db *pDb,                    /* Connection handle */
-  int nLevel,                     /* Number of levels store in LSM (often 0) */
-  int bFreelist                   /* True to store the freelist in the LSM */
+  int bTree,                      /* True to store contents of in-memory tree */
+  int *pnOvfl                     /* OUT: Number of free-list entries stored */
 ){
   int rc = LSM_OK;                /* Return Code */
   MultiCursor *pCsr = 0;
   Level *pNext = 0;               /* The current top level */
   Level *pNew;                    /* The new level itself */
   Segment *pDel = 0;              /* Delete separators from this segment */
   int iLeftPtr = 0;
+
+  assert( pnOvfl );
 
   /* Allocate the new level structure to write to. */
   pNext = lsmDbSnapshotLevel(pDb->pWorker);
   pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
 
   /* Create a cursor to gather the data required by the new segment. The new
   ** segment contains everything in the tree and pointers to the next segment
   ** in the database (if any).  */
   if( rc==LSM_OK ){
-
-    pNew->pNext = pNext;
-    lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
-
-    rc = multiCursorNew(pDb, pDb->pWorker, (pDb->pTV!=0), 0, &pCsr);
+    rc = multiCursorNew(pDb, pDb->pWorker, bTree, 0, &pCsr);
+    if( rc==LSM_OK ){
+      pNew->pNext = pNext;
+      lsmDbSnapshotSetLevel(pDb->pWorker, pNew);
+    }
     if( rc==LSM_OK ){
       if( pNext ){
         assert( pNext->pMerge==0 || pNext->nRight>0 );
         if( pNext->pMerge==0 ){
           if( pNext->lhs.iRoot ){
@@ -3498,16 +3510,12 @@
         multiCursorIgnoreDelete(pCsr);
       }
     }
 
     if( rc==LSM_OK ){
-      assert( bFreelist || nLevel==0 );
-      if( bFreelist ){
-        multiCursorVisitFreelist(pCsr);
-      }
+      multiCursorVisitFreelist(pCsr, pnOvfl);
       multiCursorReadSeparators(pCsr);
-      pCsr->nLsmLevel = nLevel;
     }
   }
 
   if( rc!=LSM_OK ){
     lsmMCursorClose(pCsr);
@@ -3536,11 +3544,10 @@
     }
 
     mergeWorkerShutdown(&mergeworker, &rc);
     pNew->pMerge = 0;
   }
-  lsmFreelistDeltaEnd(pDb);
 
   /* Link the new level into the top of the tree. */
   if( rc==LSM_OK ){
     if( pDel ){
       pDel->iRoot = 0;
@@ -3571,39 +3578,28 @@
 ** the first, the connection also holds the in-memory tree write-version.
 ** In the second, no in-memory tree version reference is held at all.
 */
 int lsmSortedFlushTree(
   lsm_db *pDb,                    /* Connection handle */
-  int nLevel,
-  int bFreelist
+  int *pnOvfl                     /* OUT: Number of free-list entries written */
 ){
   int rc;
 
   assert( pDb->pWorker );
-  assert( pDb->pTV==0 || lsmTreeIsWriteVersion(pDb->pTV) );
-
-  rc = lsmBeginFlush(pDb);
 
   /* If there is nothing to do, return early. */
-  if( lsmTreeSize(pDb->pTV)==0 && bFreelist==0 ){
-    lsmFinishFlush(pDb, 0);
+  if( lsmTreeSize(pDb)==0 && lsmCheckpointOverflowRequired(pDb)==0 ){
+    *pnOvfl = 0;
     return LSM_OK;
   }
 
-  lsmDatabaseDirty(pDb);
-
-  if( rc==LSM_OK ){
-    rc = lsmSortedNewToplevel(pDb, nLevel, bFreelist);
-  }
+  rc = sortedNewToplevel(pDb, 1, pnOvfl);
+  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
 
 #if 0
-  lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "tree flush");
+  lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "tree flush");
 #endif
-
-  assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
-
-  lsmFinishFlush(pDb, rc==LSM_OK);
   return rc;
 }
 
 /*
 ** The nMerge levels in the LSM beginning with pLevel consist of a
@@ -3643,11 +3639,11 @@
     pNew->nRight = nMerge;
     pNew->iAge = pLevel->iAge+1;
     for(i=0; i<nMerge; i++){
       pNext = p->pNext;
       pNew->aRhs[i] = p->lhs;
-      lsmFree(pDb->pEnv, p);
+      sortedFreeLevel(pDb->pEnv, p);
       p = pNext;
     }
 
     /* Replace the old levels with the new. */
     pTopLevel = lsmDbSnapshotLevel(pDb->pWorker);
@@ -3798,11 +3794,10 @@
 
   assert( lsmFsIntegrityCheck(pDb) );
   assert( pWorker );
 
   if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK;
-  lsmDatabaseDirty(pDb);
 
   while( nRemaining>0 ){
     Level *pLevel;
     Level *pTopLevel = lsmDbSnapshotLevel(pWorker);
 
@@ -3939,11 +3934,11 @@
       ** the database structure has changed. */
       mergeWorkerShutdown(&mergeworker, &rc);
       if( rc==LSM_OK ) sortedInvokeWorkHook(pDb);
 
 #if 0
-      lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "work");
+      lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work");
 #endif
 
     }
   }
 
@@ -4036,42 +4031,50 @@
 */
 int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){
   int rc = LSM_OK;                /* Return code */
 
   /* This function may not be called if pDb has an open read or write
-  ** transaction. Return LSM_MISUSE if an application attempts this.  
-  */
+  ** transaction. Return LSM_MISUSE if an application attempts this.  */
   if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT;
-  assert( pDb->pTV==0 );
 
+  /* If the FLUSH flag is set, try to flush the contents of the in-memory
+  ** tree to disk.  */
   if( (flags & LSM_WORK_FLUSH) ){
     rc = lsmBeginWriteTrans(pDb);
     if( rc==LSM_OK ){
       rc = lsmFlushToDisk(pDb);
-      lsmFinishWriteTrans(pDb, 0);
+      lsmFinishWriteTrans(pDb, 1);
       lsmFinishReadTrans(pDb);
     }
   }
 
   if( rc==LSM_OK && nPage>0 ){
     int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0);
     int nWrite = 0;
-    pDb->pWorker = lsmDbSnapshotWorker(pDb);
-    rc = sortedWork(pDb, nPage, bOptimize, &nWrite);
-
-    if( rc==LSM_OK && nWrite && (flags & LSM_WORK_CHECKPOINT) ){
-      int bOvfl;
-      int nLsm;
-
-      bOvfl = lsmCheckpointOverflow(pDb, &nLsm);
+    int nOvfl = -1;
+
+    assert( pDb->pWorker==0 );
+    rc = lsmBeginWork(pDb);
+    if( rc==LSM_OK ){
+      rc = sortedWork(pDb, nPage, bOptimize, &nWrite);
+    }
+
+    if( rc==LSM_OK && nWrite ){
       rc = lsmSortedFlushDb(pDb);
-      if( rc==LSM_OK && bOvfl ) rc = lsmSortedNewToplevel(pDb, nLsm, bOvfl);
-      if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsm, bOvfl);
+      if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){
+        rc = sortedNewToplevel(pDb, 0, &nOvfl);
+      }
+    }
+
+    if( nWrite ){
+      lsmFinishWork(pDb, 0, nOvfl, &rc);
+    }else{
+      int rcdummy = LSM_BUSY;
+      lsmFinishWork(pDb, 0, 0, &rcdummy);
     }
 
-    lsmDbSnapshotRelease(pDb->pEnv, pDb->pWorker);
-    pDb->pWorker = 0;
+    assert( pDb->pWorker==0 );
     if( pnWrite ) *pnWrite = nWrite;
   }else if( pnWrite ){
     *pnWrite = 0;
   }
 
@@ -4274,19 +4277,25 @@
   Snapshot *pWorker;              /* Worker snapshot */
   Snapshot *pRelease = 0;         /* Snapshot to release */
   Page *pPg = 0;                  /* Handle for page iPg */
   int i, j;                       /* Loop counters */
   const int perLine = 16;         /* Bytes per line in the raw hex dump */
+  int bEndWork = 0;
 
   *pzOut = 0;
   if( iPg==0 ) return LSM_ERROR;
 
   /* Obtain the worker snapshot */
+#if 0
   pWorker = pDb->pWorker;
   if( !pWorker ){
-    pRelease = pWorker = lsmDbSnapshotWorker(pDb);
+    rc = lsmBeginWork(pDb);
+    if( rc!=LSM_OK ) return rc;
+    pWorker = pDb->pWorker;
+    bEndWork = 1;
   }
+#endif
 
   rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg);
   if( rc==LSM_OK ){
     Blob blob = {0, 0, 0, 0};
     int nKeyWidth = 0;
@@ -4371,11 +4380,10 @@
     *pzOut = str.z;
     sortedBlobFree(&blob);
     lsmFsPageRelease(pPg);
   }
 
-  lsmDbSnapshotRelease(pDb->pEnv, pRelease);
   return rc;
 }
 
 void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){
   assert( pDb->xLog );
@@ -4410,15 +4418,11 @@
   const char *zWhy                /* Caption to print near top of dump */
 ){
   Snapshot *pDump = pSnap;
   Level *pTopLevel;
 
-  if( pDump==0 ){
-    assert( pDb->pWorker==0 );
-    pDump = lsmDbSnapshotWorker(pDb);
-  }
-
+  assert( pSnap );
   pTopLevel = lsmDbSnapshotLevel(pDump);
   if( pDb->xLog && pTopLevel ){
     Level *pLevel;
     int iLevel = 0;
 
@@ -4481,14 +4485,10 @@
           sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals);
         }
       }
     }
   }
-
-  if( pSnap==0 ){
-    lsmDbSnapshotRelease(pDb->pEnv, pDump);
-  }
 }
 
 void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){
   Level *pNext;
   Level *p;

Index: src/lsm_tree.c
==================================================================
--- src/lsm_tree.c
+++ src/lsm_tree.c
@@ -48,11 +48,11 @@
 **   designed so that it may be edited in place exactly once without 
 **   affecting existing users. In other words, the node structure is capable
 **   of storing two separate versions of the node at the same time.
 **   When a node is to be edited, if the node structure already contains 
 **   two versions, a copy is made as in the append-only approach. Or, if
-**   it only contains a single version, it may be edited in place.
+**   it only contains a single version, it is edited in place.
 **
 **   This reduces the overhead so that, roughly, one new node structure
 **   must be allocated for each write (on top of those allocations that 
 **   would have been required by a non-MVCC tree). Logic: Assume that at 
 **   any time, 50% of nodes in the tree already contain 2 versions. When
@@ -93,101 +93,52 @@
 typedef struct TreeNode TreeNode;
 typedef struct TreeLeaf TreeLeaf;
 typedef struct NodeVersion NodeVersion;
 
 /*
-** Container for a key-value pair.
+** Container for a key-value pair. Within the *-shm file, each key/value
+** pair is stored in a single allocation (which may not actually be 
+** contiguous in memory). Layout is the TreeKey structure, followed by
+** the nKey bytes of key blob, followed by the nValue bytes of value blob
+** (if nValue is non-negative).
 */
 struct TreeKey {
-  void *pKey;                     /* Pointer to key */
-  void *pValue;                   /* Pointer to value. May be NULL. */
   int nKey;                       /* Size of pKey in bytes */
   int nValue;                     /* Size of pValue. Or negative. */
 };
 
+#define TK_KEY(p) ((void *)&(p)[1])
+#define TK_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey))
+
 /*
 ** A single tree node. A node structure may contain up to 3 key/value
 ** pairs. Internal (non-leaf) nodes have up to 4 children.
 **
 ** TODO: Update the format of this to be more compact. Get it working
 ** first though...
 */
 struct TreeNode {
-  TreeKey *apKey[3];              /* Array of pointers to key-value pairs */
+  u32 aiKeyPtr[3];                /* Array of pointers to TreeKey objects */
 
   /* The following fields are present for interior nodes only, not leaves. */
-  TreeNode *apChild[4];           /* Array of pointers to child nodes */
+  u32 aiChildPtr[4];              /* Array of pointers to child nodes */
 
-  int iV2;                        /* Version number of v2 */
-  u8 iV2Ptr;                      /* apChild[] entry replaced by pV2Ptr */
-  TreeNode *pV2Ptr;               /* Substitute pointer */
-  TreeNode *pNext;                /* Next in interior node rollback list */
+  /* The extra child pointer slot. */
+  u32 iV2;                        /* Transaction number of v2 */
+  u8 iV2Child;                    /* aiChildPtr[] entry replaced by iV2Ptr */
+  u32 iV2Ptr;                     /* Substitute pointer */
 };
 
 struct TreeLeaf {
-  TreeKey *apKey[3];              /* Array of pointers to key-value pairs */
-};
-
-/*
-** A handle used by a client to access a Tree structure.
-*/
-struct TreeVersion {
-  Tree *pTree;                    /* The tree structure to which this belongs */
-  int nRef;                       /* Number of pointers to this */
-  TreeNode *pRoot;                /* Pointer to root of tree structure */
-  int nHeight;                    /* Current height of tree pRoot */
-  int iVersion;                   /* Current version */
-};
-
-#define WORKING_VERSION (1<<30)
-
-/*
-** A tree structure.
-**
-** iVersion:
-**   When the tree is first created, this is set to 1. Thereafter it is
-**   incremented each time lsmTreeMark() is called. The tree must be 
-**   destroyed (i.e. flushed to disk) before it wraps around (todo!).
-**
-**   When v2 data is written to a tree-node, the iV2 field of the node
-**   is set to the current value of Tree.iVersion.
-**
-** nRef:
-**   Number of references to this tree structure. When it is first created,
-**   (in lsmTreeNew()) nRef is set to 1. There after the ref-count may be
-**   incremented and decremented using treeIncrRefcount() and 
-**   DecrRefcount(). When the ref-count of a tree structure reaches zero
-**   it is freed.
-**
-** xCmp:
-**   Pointer to the compare function. This is a copy of some pDb->xCmp.
-**
-*/
-struct Tree {
-  int nTreeRef;                   /* Current number of pointers to this */
-  Mempool *pPool;                 /* Memory pool to allocate from */
-  int (*xCmp)(void *, int, void *, int);         /* Compare function */
-  TreeVersion *pCommit;           /* Committed version of tree (for readers) */
-
-  TreeVersion *pWorking;          /* Working verson (for writers) */
-#if 0
-  TreeVersion tvWorking;          /* Working verson (for writers) */
-#endif
-
-  TreeNode *pRbFirst;
-  TreeNode *pRbLast;
-};
-
-/*
-** The pointer passed as the first argument points to an interior node,
-** not a leaf. This function returns the value of the iCell'th child
-** sub-tree of the node.
-*/
-static TreeNode *getChildPtr(TreeNode *p, int iVersion, int iCell){
-  if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Ptr ) return p->pV2Ptr;
-  return p->apChild[iCell];
-}
+  u32 aiKeyPtr[3];                /* Array of pointers to TreeKey objects */
+};
+
+typedef struct TreeBlob TreeBlob;
+struct TreeBlob {
+  int n;
+  u8 *a;
+};
 
 /*
 ** Cursor for searching a tree structure.
 **
 ** If a cursor does not point to any element (a.k.a. EOF), then the
@@ -203,11 +154,179 @@
   lsm_db *pDb;                    /* Database handle for this cursor */
   int iNode;                      /* Cursor points at apTreeNode[iNode] */
   TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */
   u8 aiCell[MAX_DEPTH];           /* Current position in tree */
   TreeKey *pSave;                 /* Saved key */
+  TreeBlob blob;                  /* Dynamic storage for a key */
 };
+
+/*
+** A value guaranteed to be larger than the largest possible transaction
+** id (TreeHeader.iTransId).
+*/
+#define WORKING_VERSION (1<<30)
+
+static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){
+  if( n>p->n ){
+    lsmFree(pDb->pEnv, p->a);
+    p->a = lsmMallocRc(pDb->pEnv, n, pRc);
+    p->n = n;
+  }
+  return (p->a==0);
+}
+static void tblobFree(lsm_db *pDb, TreeBlob *p){
+  lsmFree(pDb->pEnv, p->a);
+}
+
+
+/***********************************************************************
+** Start of IntArray methods.  */
+/*
+** Append value iVal to the contents of IntArray *p. Return LSM_OK if 
+** successful, or LSM_NOMEM if an OOM condition is encountered.
+*/
+static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){
+  assert( p->nArray<=p->nAlloc );
+  if( p->nArray>=p->nAlloc ){
+    u32 *aNew;
+    int nNew = p->nArray ? p->nArray*2 : 128;
+    aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32));
+    if( !aNew ) return LSM_NOMEM_BKPT;
+    p->aArray = aNew;
+    p->nAlloc = nNew;
+  }
+
+  p->aArray[p->nArray++] = iVal;
+  return LSM_OK;
+}
+
+/*
+** Free the memory held by IntArray object *p, then zero the object.
+*/
+static void intArrayFree(lsm_env *pEnv, IntArray *p){
+  lsmFree(pEnv, p->aArray);
+  memset(p, 0, sizeof(IntArray));
+}
+
+/*
+** Return the number of entries currently in the int-array object.
+*/
+static int intArraySize(IntArray *p){
+  return p->nArray;
+}
+
+/*
+** Return a copy of the iIdx'th entry in the int-array.
+*/
+static u32 intArrayEntry(IntArray *p, int iIdx){
+  return p->aArray[iIdx];
+}
+
+/*
+** Truncate the int-array so that all but the first nVal values are 
+** discarded.
+*/
+static void intArrayTruncate(IntArray *p, int nVal){
+  p->nArray = nVal;
+}
+/* End of IntArray methods.
+***********************************************************************/
+
+/*
+** The pointer passed as the first argument points to an interior node,
+** not a leaf. This function returns the offset of the iCell'th child
+** sub-tree of the node.
+*/
+static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){
+  assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) );
+  if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Child ) return p->iV2Ptr;
+  return p->aiChildPtr[iCell];
+}
+
+/*
+** Given an offset within the *-shm file, return the associated chunk number.
+*/
+static int treeOffsetToChunk(u32 iOff){
+  assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
+  return (int)(iOff>>15);
+}
+
+/*
+** Return a pointer to the mapped memory location associated with *-shm 
+** file offset iPtr.
+*/
+static void *treeShmptr(lsm_db *pDb, u32 iPtr, int *pRc){
+  /* TODO: This will likely be way too slow. If it is, chunks should be
+  ** cached as part of the db handle.  */
+  if( iPtr && *pRc==0 ){
+    int rc;
+    void *pChunk;
+
+    rc = lsmShmChunk(pDb, treeOffsetToChunk(iPtr), &pChunk);
+    if( rc==LSM_OK ){
+      return &((u8 *)pChunk)[iPtr & (LSM_SHM_CHUNK_SIZE-1)];
+    }
+    *pRc = rc;
+  }
+  return 0;
+}
+
+static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){
+  int rcdummy = LSM_OK;
+  return (ShmChunk *)treeShmptr(pDb, iChunk*LSM_SHM_CHUNK_SIZE, &rcdummy);
+}
+
+/* Values for the third argument to treeShmkey(). */
+#define TK_LOADKEY  1
+#define TK_LOADVAL  2
+
+static TreeKey *treeShmkey(
+  lsm_db *pDb,                    /* Database handle */
+  u32 iPtr,                       /* Shmptr to TreeKey struct */
+  int eLoad,                      /* Either TK_LOADKEY or TK_LOADVAL */
+  TreeBlob *pBlob,                /* Used if dynamic memory is required */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  TreeKey *pRet;
+
+  assert( eLoad==TK_LOADKEY || eLoad==TK_LOADVAL );
+  pRet = (TreeKey *)treeShmptr(pDb, iPtr, pRc);
+  if( pRet ){
+    int nReq;                     /* Bytes of space required at pRet */
+    int nAvail;                   /* Bytes of space available at pRet */
+
+    nReq = sizeof(TreeKey) + pRet->nKey;
+    if( eLoad==TK_LOADVAL && pRet->nValue>0 ){
+      nReq += pRet->nValue;
+    }
+    assert( LSM_SHM_CHUNK_SIZE==(1<<15) );
+    nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1));
+
+    if( nAvail<nReq ){
+      if( tblobGrow(pDb, pBlob, nReq, pRc)==0 ){
+        int nLoad = 0;
+        while( *pRc==LSM_OK ){
+          ShmChunk *pChunk;
+          void *p = treeShmptr(pDb, iPtr, pRc);
+          int n = LSM_MIN(nAvail, nReq-nLoad);
+
+          memcpy(&pBlob->a[nLoad], p, n);
+          nLoad += n;
+          if( nLoad==nReq ) break;
+
+          pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr));
+          assert( pChunk );
+          iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR;
+          nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR;
+        }
+      }
+      pRet = (TreeKey *)(pBlob->a);
+    }
+  }
+
+  return pRet;
+}
 
 #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT)
 
 void assert_leaf_looks_ok(TreeNode *pNode){
   assert( pNode->apKey[1] );
@@ -245,176 +364,94 @@
 #else
 # define assert_tree_looks_ok(x,y)
 #endif
 
 #ifdef LSM_DEBUG
+
+/*
+** Pointer pBlob points to a buffer containing a blob of binary data
+** nBlob bytes long. Append the contents of this blob to *pStr. Bytes in
+** the range 'a'..'z' are appended as is; every other octet is appended
+** as a 2-digit hexadecimal number. For example, if the input blob
+** contains {0x01, 0x44, 0xFF}, then "0144ff" is appended to *pStr.
+*/
 static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){
   int i;
-  lsmStringExtend(pStr, nBlob);
+  lsmStringExtend(pStr, nBlob*2);
   if( pStr->nAlloc==0 ) return;
   for(i=0; i<nBlob; i++){
     u8 c = ((u8*)pBlob)[i];
-    pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf];
-    pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf];
+    if( c>='a' && c<='z' ){
+      pStr->z[pStr->n++] = c;
+    }else{
+      pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf];
+      pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf];
+    }
   }
   pStr->z[pStr->n] = 0;
 }
 
+/*
+** Append nIndent space (0x20) characters to string *pStr.
+*/
 static void lsmAppendIndent(LsmString *pStr, int nIndent){
   int i;
   lsmStringExtend(pStr, nIndent);
   for(i=0; i<nIndent; i++) lsmStringAppend(pStr, " ", 1);
 }
 
-static void lsmAppendKeyValue(LsmString *pStr, TreeKey *pKey){
-  int i;
-
-  for(i=0; i<pKey->nKey; i++){
-    lsmStringAppendf(pStr, "%2X ", ((u8 *)(pKey->pKey))[i]);
-  }
-  lsmStringAppend(pStr, "      ", -1);
-
-  if( pKey->nValue<0 ){
-    lsmStringAppend(pStr, "<deleted>", -1);
-  }else{
-    lsmAppendStrBlob(pStr, pKey->pValue, pKey->nValue);
-  }
-}
-
-void dump_node(TreeNode *pNode, int nIndent, int isNode){
-  if( pNode ){
-    LsmString s;
-    int i;
-
-    lsmStringInit(&s, NEED_ENV);
-    lsmAppendIndent(&s, nIndent);
-    lsmStringAppendf(&s, "0x%p", (void*)pNode);
-    printf("%s\n", s.z);
-    lsmStringClear(&s);
-
-    for(i=0; i<4; i++){
-
-      if( isNode ){
-        if( pNode->iV2 && i==pNode->iV2Ptr ){
-          lsmAppendIndent(&s, nIndent+2);
-          lsmStringAppendf(&s, "if( version>=%d )", pNode->iV2);
-          printf("%s\n", s.z);
-          lsmStringClear(&s);
-          dump_node(pNode->pV2Ptr, nIndent + 4, isNode-1);
-          if( pNode->apChild[i] ){
-            lsmAppendIndent(&s, nIndent+2);
-            lsmStringAppendf(&s, "else");
-            printf("%s\n", s.z);
-            lsmStringClear(&s);
-          }
-        }
-
-        dump_node(pNode->apChild[i], nIndent + 4, isNode-1);
-      }
-
-      if( i<3 && pNode->apKey[i] ){
-        lsmAppendIndent(&s, nIndent);
-        lsmStringAppendf(&s, "k%d: ", i);
-        lsmAppendKeyValue(&s, pNode->apKey[i]);
-        printf("%s\n", s.z);
-        lsmStringClear(&s);
-      }
-
-    }
-  }
-}
-
-void dump_node_contents(TreeNode *pNode, int iVersion, int nIndent, int isNode){
-  int i;
-  LsmString s;
-
-  lsmStringInit(&s, NEED_ENV);
-  lsmAppendIndent(&s, nIndent);
-  for(i=0; i<3; i++){
-    if( pNode->apKey[i] ){
-      TreeKey *pKey = pNode->apKey[i];
-      lsmAppendStrBlob(&s, pKey->pKey, pKey->nKey);
+void dump_node_contents(
+  lsm_db *pDb,
+  u32 iNode,                      /* Print out the contents of this node */
+  int nIndent,                    /* Number of spaces indentation */
+  int nHeight                     /* Height: (0==leaf) (1==parent-of-leaf) */
+){
+  int i;
+  int rc = LSM_OK;
+  LsmString s;
+  TreeNode *pNode;
+  TreeBlob b = {0, 0};
+
+  /* Append the nIndent bytes of space to string s. */
+  lsmStringInit(&s, pDb->pEnv);
+  if( nIndent ) lsmAppendIndent(&s, nIndent);
+
+  pNode = (TreeNode *)treeShmptr(pDb, iNode, &rc);
+
+  /* Append each key to string s. */
+  for(i=0; i<3; i++){
+    u32 iPtr = pNode->aiKeyPtr[i];
+    if( iPtr ){
+      TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TK_LOADKEY, &b, &rc);
+      lsmAppendStrBlob(&s, TK_KEY(pKey), pKey->nKey);
       lsmStringAppend(&s, "     ", -1);
     }
   }
 
   printf("%s\n", s.z);
   lsmStringClear(&s);
 
-  for(i=0; i<4 && isNode>0; i++){
-    TreeNode *pChild = getChildPtr(pNode, iVersion, i);
-    if( pChild ){
-      dump_node_contents(pChild, iVersion, nIndent + 2, isNode-1);
-    }
-  }
-}
-
-void dump_tree_contents(Tree *pTree, const char *zCaption){
-  TreeVersion *p = pTree->pWorking ? pTree->pWorking : pTree->pCommit;
-  printf("\n%s\n", zCaption);
-  if( p->pRoot ){
-    dump_node_contents(p->pRoot, WORKING_VERSION, 0, p->nHeight-1);
-  }
-  fflush(stdout);
-}
-
-void dump_tv_contents(TreeVersion *pTV, const char *zCaption){
-  printf("\n%s\n", zCaption);
-  if( pTV->pRoot ){
-    dump_node(pTV->pRoot, 2, pTV->nHeight-1);
+  for(i=0; i<4 && nHeight>0; i++){
+    u32 iPtr = getChildPtr(pNode, pDb->treehdr.iTransId, i);
+    if( iPtr ){
+      dump_node_contents(pDb, iPtr, nIndent + 2, nHeight-1);
+    }
+  }
+
+  tblobFree(pDb, &b);
+}
+
+void dump_tree_contents(lsm_db *pDb, const char *zCaption){
+  printf("\n%s\n", zCaption);
+  if( pDb->treehdr.iRoot ){
+    dump_node_contents(pDb, pDb->treehdr.iRoot, 0, pDb->treehdr.nHeight-1);
   }
   fflush(stdout);
 }
 
 #endif
 
-/*
-** Allocate a new tree structure.
-*/
-int lsmTreeNew(
-  lsm_env *pEnv,                            /* Environment handle */
-  int (*xCmp)(void *, int, void *, int),    /* Compare function */
-  Tree **ppTree                             /* OUT: New tree object */
-){
-  int rc;
-  Tree *pTree = 0;
-  Mempool *pPool;                 /* Memory pool used by the new tree */
-  TreeVersion *pClient = 0;       /* Initial client access handle */
-
-  rc = lsmPoolNew(pEnv, &pPool);
-  pClient = (TreeVersion *)lsmMallocZeroRc(pEnv, sizeof(TreeVersion), &rc);
-
-  if( rc==LSM_OK ){
-    pTree = (Tree *)lsmPoolMallocZero(pEnv, pPool, sizeof(Tree));
-    assert( pTree );
-    pTree->pPool = pPool;
-    pTree->xCmp = xCmp;
-    pTree->nTreeRef = 1;
-
-    pClient->iVersion = 1;
-    pClient->pTree = pTree;
-    pClient->nRef = 1;
-    pTree->pCommit = pClient;
-  }else{
-    assert( pClient==0 );
-    lsmPoolDestroy(pEnv, pPool);
-  }
-
-  *ppTree = pTree;
-  return rc;
-}
-
-/*
-** Destroy a tree structure allocated by lsmTreeNew().
-*/
-static void treeDestroy(lsm_env *pEnv, Tree *pTree){
-  if( pTree ){
-    assert( pTree->pWorking==0 );
-    lsmPoolDestroy(pEnv, pTree->pPool);
-  }
-}
-
 /*
 ** Initialize a cursor object, the space for which has already been
 ** allocated.
 */
 static void treeCursorInit(lsm_db *pDb, TreeCursor *pCsr){
@@ -421,47 +458,34 @@
   memset(pCsr, 0, sizeof(TreeCursor));
   pCsr->pDb = pDb;
   pCsr->iNode = -1;
 }
 
-static TreeNode *newTreeLeaf(lsm_env *pEnv, Tree *pTree){
-  return (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeLeaf));
-}
-
-static TreeNode *newTreeNode(lsm_env *pEnv, Tree *pTree){
-  return (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode));
-}
-
-static TreeNode *copyTreeNode(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){
-  TreeNode *pNew;
-  pNew = (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode));
-
-  memcpy(pNew->apKey, pOld->apKey, sizeof(pNew->apKey));
-  memcpy(pNew->apChild, pOld->apChild, sizeof(pNew->apChild));
-  if( pOld->iV2 ) pNew->apChild[pOld->iV2Ptr] = pOld->pV2Ptr;
-
-  return pNew;
-}
-
-static TreeNode *copyTreeLeaf(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){
-  TreeNode *pNew;
-  pNew = newTreeLeaf(pEnv, pTree);
-  memcpy(pNew, pOld, sizeof(TreeLeaf));
-  return pNew;
+/*
+** Return a pointer to the mapping of the TreeKey object that the cursor
+** is pointing to. 
+*/
+static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){
+  return (TreeKey *)treeShmkey(pCsr->pDb,
+      pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]], 
+      TK_LOADVAL, pBlob, pRc
+  );
 }
 
 /*
 ** Save the current position of tree cursor pCsr.
 */
-void lsmTreeCursorSave(TreeCursor *pCsr){
+int lsmTreeCursorSave(TreeCursor *pCsr){
+  int rc = LSM_OK;
   if( pCsr->pSave==0 ){
     int iNode = pCsr->iNode;
     if( iNode>=0 ){
-      pCsr->pSave = pCsr->apTreeNode[iNode]->apKey[pCsr->aiCell[iNode]];
+      pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc);
     }
     pCsr->iNode = -1;
   }
+  return rc;
 }
 
 /*
 ** Restore the position of a saved tree cursor.
 */
@@ -469,15 +493,206 @@
   int rc = LSM_OK;
   if( pCsr->pSave ){
     TreeKey *pKey = pCsr->pSave;
     pCsr->pSave = 0;
     if( pRes ){
-      rc = lsmTreeCursorSeek(pCsr, pKey->pKey, pKey->nKey, pRes);
+      rc = lsmTreeCursorSeek(pCsr, TK_KEY(pKey), pKey->nKey, pRes);
     }
   }
   return rc;
 }
+
+/*
+** Allocate nByte bytes of space within the *-shm file and return the
+** offset within the file at which the allocated space is located. If
+** *pRc is not LSM_OK on entry, nothing is allocated and 0 is returned.
+*/
+static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){
+  u32 iRet = 0;
+  if( *pRc==LSM_OK ){
+    const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE;
+    const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR;
+    u32 iWrite;                   /* Current write offset */
+    u32 iEof;                     /* End of current chunk */
+    int iChunk;                   /* Current chunk */
+
+    assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) );
+
+    /* Check if there is enough space on the current chunk to fit the
+    ** new allocation. If not, link in a new chunk and put the new
+    ** allocation at the start of it.  */
+    iWrite = pDb->treehdr.iWrite;
+    if( bAlign ){
+      iWrite = (iWrite + 3) & ~0x0003;
+      assert( (iWrite % 4)==0 );
+    }
+
+    assert( iWrite );
+    iChunk = treeOffsetToChunk(iWrite-1);
+    iEof = (iChunk+1) * CHUNK_SIZE;
+    assert( iEof>=iWrite && (iEof-iWrite)<CHUNK_SIZE );
+    if( (iWrite+nByte)>iEof ){
+      ShmChunk *pHdr;           /* Header of chunk just finished (iChunk) */
+      ShmChunk *pFirst;         /* Header of chunk treehdr.iFirst */
+      int iNext = 0;            /* Next chunk */
+      int rc;
+
+      /* Check if the chunk at the start of the linked list is still in
+      ** use. If not, reuse it. If so, allocate a new chunk by appending
+      ** to the *-shm file.  */
+      if( pDb->treehdr.iFirst!=iChunk ){
+        int bInUse;
+        pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst);
+        rc = lsmTreeInUse(pDb, pFirst->iLastTree, &bInUse);
+        if( rc!=LSM_OK ){
+          *pRc = rc;
+          return 0;
+        }
+        if( bInUse==0 ){
+          iNext = pDb->treehdr.iFirst;
+          pDb->treehdr.iFirst = pFirst->iNext;
+          pFirst->iNext = 0;
+          pFirst->iLastTree = 0;
+          assert( pDb->treehdr.iFirst );
+          assert( pFirst->iLastTree<pDb->treehdr.iTreeId );
+        }
+      }
+      if( iNext==0 ) iNext = pDb->treehdr.nChunk++;
+
+      /* Set the header values for the chunk just finished */
+      pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE, pRc);
+      pHdr->iLastTree = pDb->treehdr.iTreeId;
+      pHdr->iNext = iNext;
+
+      /* Advance to the next chunk */
+      iWrite = iNext * CHUNK_SIZE + CHUNK_HDR;
+    }
+
+    /* Allocate space at iWrite. */
+    iRet = iWrite;
+    pDb->treehdr.iWrite = iWrite + nByte;
+    pDb->treehdr.nByte += nByte;
+  }
+  return iRet;
+}
+
+/*
+** Allocate and zero nByte bytes of space within the *-shm file.
+*/
+static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){
+  u32 iPtr;
+  void *p;
+  iPtr = treeShmalloc(pDb, 1, nByte, pRc);
+  p = treeShmptr(pDb, iPtr, pRc);
+  if( p ){
+    assert( *pRc==LSM_OK );
+    memset(p, 0, nByte);
+    *piPtr = iPtr;
+  }
+  return p;
+}
+
+static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){
+  return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc);
+}
+
+static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){
+  return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc);
+}
+
+static TreeKey *newTreeKey(
+  lsm_db *pDb, 
+  u32 *piPtr, 
+  void *pKey, int nKey,           /* Key data */
+  void *pVal, int nVal,           /* Value data (or nVal<0 for delete) */
+  int *pRc
+){
+  TreeKey *p;
+  u32 iPtr;
+  int nRem;
+  u8 *a;
+  int n;
+
+#if 0
+  nRem = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0);
+  *piPtr = iPtr = treeShmalloc(pDb, 1, nRem, pRc);
+  p = treeShmptr(pDb, iPtr, pRc);
+  if( *pRc ) return 0;
+  p->nKey = nKey;
+  p->nValue = nVal;
+  memcpy(&p[1], pKey, nKey);
+  if( nVal>0 ) memcpy(((u8 *)&p[1]) + nKey, pVal, nVal);
+  return p;
+#endif
+
+  /* Allocate space for the TreeKey structure itself */
+  *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc);
+  p = treeShmptr(pDb, iPtr, pRc);
+  if( *pRc ) return 0;
+  p->nKey = nKey;
+  p->nValue = nVal;
+
+  /* Allocate and populate the space required for the key and value. */
+  n = nRem = nKey;
+  a = (u8 *)pKey;
+  while( a ){
+    while( nRem>0 ){
+      u8 *aAlloc;
+      int nAlloc;
+      u32 iWrite;
+
+      iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1));
+      iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR);
+      nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), nRem);
+
+      aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc), pRc);
+      if( aAlloc==0 ) break;
+      memcpy(aAlloc, &a[n-nRem], nAlloc);
+      nRem -= nAlloc;
+    }
+    a = pVal;
+    n = nRem = nVal;
+    pVal = 0;
+  }
+
+  if( *pRc ) return 0;
+#if 0
+  printf("store: %d %s\n", (int)iPtr, (char *)pKey);
+#endif
+  return p;
+}
+
+static TreeNode *copyTreeNode(
+  lsm_db *pDb, 
+  TreeNode *pOld, 
+  u32 *piNew, 
+  int *pRc
+){
+  TreeNode *pNew;
+
+  pNew = newTreeNode(pDb, piNew, pRc);
+  if( pNew ){
+    memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr));
+    memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr));
+    if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr;
+  }
+  return pNew;
+}
+
+static TreeNode *copyTreeLeaf(
+  lsm_db *pDb, 
+  TreeLeaf *pOld, 
+  u32 *piNew, 
+  int *pRc
+){
+  TreeLeaf *pNew;
+  pNew = newTreeLeaf(pDb, piNew, pRc);
+  if( pNew ){
+    memcpy(pNew, pOld, sizeof(TreeLeaf));
+  }
+  return (TreeNode *)pNew;
+}
 
 /*
 ** The tree cursor passed as the second argument currently points to an 
 ** internal node (not a leaf). Specifically, to a sub-tree pointer. This
 ** function replaces the sub-tree that the cursor currently points to
@@ -485,15 +700,15 @@
 **
 ** The sub-tree may be replaced either by writing the "v2 data" on the
 ** internal node, or by allocating a new TreeNode structure and then 
 ** calling this function on the parent of the internal node.
 */
-static int treeUpdatePtr(Tree *pTree, TreeCursor *pCsr, TreeNode *pNew){
+static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){
   int rc = LSM_OK;
   if( pCsr->iNode<0 ){
-    /* pNew is the new root node */
-    pTree->pWorking->pRoot = pNew;
+    /* iNew is the new root node */
+    pDb->treehdr.iRoot = iNew;
   }else{
     /* If this node already has version 2 content, allocate a copy and
     ** update the copy with the new pointer value. Otherwise, store the
     ** new pointer as v2 data within the current node structure.  */
 
@@ -503,30 +718,39 @@
     p = pCsr->apTreeNode[pCsr->iNode];
     iChildPtr = pCsr->aiCell[pCsr->iNode];
 
     if( p->iV2 ){
       /* The "allocate new TreeNode" option */
-      TreeNode *pCopy = copyTreeNode(pCsr->pDb->pEnv, pTree, p);
+      u32 iCopy;
+      TreeNode *pCopy;
+      pCopy = copyTreeNode(pDb, p, &iCopy, &rc);
       if( pCopy ){
-        pCopy->apChild[iChildPtr] = pNew;
+        assert( rc==LSM_OK );
+        pCopy->aiChildPtr[iChildPtr] = iNew;
         pCsr->iNode--;
-        rc = treeUpdatePtr(pTree, pCsr, pCopy);
-      }else{
-        rc = LSM_NOMEM_BKPT;
+        rc = treeUpdatePtr(pDb, pCsr, iCopy);
       }
     }else{
       /* The "v2 data" option */
-      p->iV2 = pTree->pWorking->iVersion;
-      p->iV2Ptr = (u8)iChildPtr;
-      p->pV2Ptr = (void *)pNew;
-      if( pTree->pRbLast ){
-        pTree->pRbLast->pNext = p;
+      u32 iPtr;
+      assert( pDb->treehdr.iTransId>0 );
+
+      if( pCsr->iNode ){
+        iPtr = getChildPtr(
+            pCsr->apTreeNode[pCsr->iNode-1], 
+            pDb->treehdr.iTransId, pCsr->aiCell[pCsr->iNode-1]
+        );
       }else{
-        pTree->pRbFirst = p;
+        iPtr = pDb->treehdr.iRoot;
       }
-      pTree->pRbLast = p;
-      assert( pTree->pRbLast->pNext==0 );
+      rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr);
+
+      if( rc==LSM_OK ){
+        p->iV2 = pDb->treehdr.iTransId;
+        p->iV2Child = (u8)iChildPtr;
+        p->iV2Ptr = iNew;
+      }
     }
   }
 
   return rc;
 }
@@ -542,188 +766,210 @@
 **
 ** Pointer pLeftPtr points to a child tree that contains keys that are
 ** smaller than pTreeKey.
 */
 static int treeInsert(
-  lsm_env *pEnv,
-  Tree *pTree, 
+  lsm_db *pDb,                    /* Database handle */
   TreeCursor *pCsr,               /* Cursor indicating path to insert at */
-  TreeNode *pLeftPtr,             /* New child pointer (or NULL for leaves) */
-  TreeKey *pTreeKey,              /* New key to insert */
-  TreeNode *pRightPtr,            /* New child pointer (or NULL for leaves) */
+  u32 iLeftPtr,                   /* Left child pointer */
+  u32 iTreeKey,                   /* Location of key to insert */
+  u32 iRightPtr,                  /* Right child pointer */
   int iSlot                       /* Position to insert key into */
 ){
   int rc = LSM_OK;
   TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode];
 
-  /* Check if the leaf is currently full. If so, allocate a sibling node. */
-  if( pNode->apKey[0] && pNode->apKey[2] ){
-    TreeNode *pLeft;              /* New sibling node. */
-    TreeNode *pRight;             /* Sibling of pLeft (either new or pNode) */
+  /* Check if the node is currently full. If so, split pNode in two and
+  ** call this function recursively to add a key to the parent. Otherwise, 
+  ** insert the new key directly into pNode.  */
+  assert( pNode->aiKeyPtr[1] );
+  if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){
+    u32 iLeft; TreeNode *pLeft;   /* New left-hand sibling node */
+    u32 iRight; TreeNode *pRight; /* New right-hand sibling node */
 
-    pLeft = newTreeNode(pEnv, pTree);
-    pRight = newTreeNode(pEnv, pTree);
+    pLeft = newTreeNode(pDb, &iLeft, &rc);
+    pRight = newTreeNode(pDb, &iRight, &rc);
+    if( rc ) return rc;
+
+    pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0);
+    pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0];
+    pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1);
+
+    pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2);
+    pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2];
+    pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3);
 
     if( pCsr->iNode==0 ){
       /* pNode is the root of the tree. Grow the tree by one level. */
-      TreeNode *pRoot;            /* New root node */
-
-      pRoot = newTreeNode(pEnv, pTree);
-
-      pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0);
-      pLeft->apKey[1] = pNode->apKey[0];
-      pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1);
-
-      pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2);
-      pRight->apKey[1] = pNode->apKey[2];
-      pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3);
-
-      pRoot->apKey[1] = pNode->apKey[1];
-      pRoot->apChild[1] = pLeft;
-      pRoot->apChild[2] = pRight;
-
-      pTree->pWorking->pRoot = pRoot;
-      pTree->pWorking->nHeight++;
-    }else{
-      TreeKey *pParentKey;        /* Key to insert into parent node */
-      pParentKey = pNode->apKey[1];
-
-      pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0);
-      pLeft->apKey[1] = pNode->apKey[0];
-      pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1);
-
-      pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2);
-      pRight->apKey[1] = pNode->apKey[2];
-      pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3);
+      u32 iRoot; TreeNode *pRoot; /* New root node */
+
+      pRoot = newTreeNode(pDb, &iRoot, &rc);
+      pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1];
+      pRoot->aiChildPtr[1] = iLeft;
+      pRoot->aiChildPtr[2] = iRight;
+
+      pDb->treehdr.iRoot = iRoot;
+      pDb->treehdr.nHeight++;
+    }else{
 
       pCsr->iNode--;
-      treeInsert(pEnv, 
-          pTree, pCsr, pLeft, pParentKey, pRight, pCsr->aiCell[pCsr->iNode]
+      rc = treeInsert(pDb, pCsr, 
+          iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode]
       );
     }
 
     assert( pLeft->iV2==0 );
     assert( pRight->iV2==0 );
     switch( iSlot ){
       case 0:
-        pLeft->apKey[0] = pTreeKey;
-        pLeft->apChild[0] = pLeftPtr;
-        if( pRightPtr ) pLeft->apChild[1] = pRightPtr;
+        pLeft->aiKeyPtr[0] = iTreeKey;
+        pLeft->aiChildPtr[0] = iLeftPtr;
+        if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr;
         break;
       case 1:
-        pLeft->apChild[3] = (pRightPtr ? pRightPtr : pLeft->apChild[2]);
-        pLeft->apKey[2] = pTreeKey;
-        pLeft->apChild[2] = pLeftPtr;
+        pLeft->aiChildPtr[3] = (iRightPtr ? iRightPtr : pLeft->aiChildPtr[2]);
+        pLeft->aiKeyPtr[2] = iTreeKey;
+        pLeft->aiChildPtr[2] = iLeftPtr;
         break;
       case 2:
-        pRight->apKey[0] = pTreeKey;
-        pRight->apChild[0] = pLeftPtr;
-        if( pRightPtr ) pRight->apChild[1] = pRightPtr;
+        pRight->aiKeyPtr[0] = iTreeKey;
+        pRight->aiChildPtr[0] = iLeftPtr;
+        if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr;
         break;
       case 3:
-        pRight->apChild[3] = (pRightPtr ? pRightPtr : pRight->apChild[2]);
-        pRight->apKey[2] = pTreeKey;
-        pRight->apChild[2] = pLeftPtr;
+        pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]);
+        pRight->aiKeyPtr[2] = iTreeKey;
+        pRight->aiChildPtr[2] = iLeftPtr;
         break;
     }
 
   }else{
     TreeNode *pNew;
-    TreeKey **pOut;
-    TreeNode **pPtr;
+    u32 *piKey;
+    u32 *piChild;
+    u32 iStore = 0;
+    u32 iNew = 0;
     int i;
 
-    pNew = newTreeNode(pEnv, pTree);
-    if( pNew ){
-      TreeNode *pStore = 0;
-      pOut = pNew->apKey;
-      pPtr = pNew->apChild;
-
-      for(i=0; i<iSlot; i++){
-        if( pNode->apKey[i] ){
-          *(pOut++) = pNode->apKey[i];
-          *(pPtr++) = getChildPtr(pNode, WORKING_VERSION, i);
-        }
-      }
-
-      *pOut++ = pTreeKey;
-      *pPtr++ = pLeftPtr;
-
-      pStore = pRightPtr;
-      for(i=iSlot; i<3; i++){
-        if( pNode->apKey[i] ){
-          *(pOut++) = pNode->apKey[i];
-          *(pPtr++) = pStore ? pStore : getChildPtr(pNode, WORKING_VERSION, i);
-          pStore = 0;
-        }
-      }
-      if( pStore ){
-        *pPtr = pStore;
-      }else{
-        *pPtr = getChildPtr(pNode, WORKING_VERSION, (pNode->apKey[2] ? 3 : 2));
-      }
-
-      pCsr->iNode--;
-      rc = treeUpdatePtr(pTree, pCsr, pNew);
-    }else{
-      rc = LSM_NOMEM_BKPT;
-    }
+    /* Allocate a new version of node pNode. */
+    pNew = newTreeNode(pDb, &iNew, &rc);
+    if( rc ) return rc;
+
+    piKey = pNew->aiKeyPtr;
+    piChild = pNew->aiChildPtr;
+
+    for(i=0; i<iSlot; i++){
+      if( pNode->aiKeyPtr[i] ){
+        *(piKey++) = pNode->aiKeyPtr[i];
+        *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i);
+      }
+    }
+
+    *piKey++ = iTreeKey;
+    *piChild++ = iLeftPtr;
+
+    iStore = iRightPtr;
+    for(i=iSlot; i<3; i++){
+      if( pNode->aiKeyPtr[i] ){
+        *(piKey++) = pNode->aiKeyPtr[i];
+        *(piChild++) = iStore ? iStore : getChildPtr(pNode, WORKING_VERSION, i);
+        iStore = 0;
+      }
+    }
+
+    if( iStore ){
+      *piChild = iStore;
+    }else{
+      *piChild = getChildPtr(pNode, WORKING_VERSION, 
+          (pNode->aiKeyPtr[2] ? 3 : 2)
+      );
+    }
+    pCsr->iNode--;
+    rc = treeUpdatePtr(pDb, pCsr, iNew);
   }
 
   return rc;
 }
 
 static int treeInsertLeaf(
-  lsm_env *pEnv,
-  Tree *pTree,                    /* Tree structure */
+  lsm_db *pDb,                    /* Database handle */
   TreeCursor *pCsr,               /* Cursor structure */
-  TreeKey *pTreeKey,              /* Key to insert */
+  u32 iTreeKey,                   /* Key pointer to insert */
   int iSlot                       /* Insert key to the left of this */
 ){
-  int rc;                         /* Return code */
+  int rc = LSM_OK;                /* Return code */
   TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode];
-  TreeNode *pNew;
+  TreeLeaf *pNew;
+  u32 iNew;
 
   assert( iSlot>=0 && iSlot<=4 );
   assert( pCsr->iNode>0 );
-  assert( pLeaf->apKey[1] );
+  assert( pLeaf->aiKeyPtr[1] );
 
   pCsr->iNode--;
 
-  pNew = newTreeLeaf(pEnv, pTree);
-  if( !pNew ){
-    rc = LSM_NOMEM_BKPT;
-  }else if( pLeaf->apKey[0] && pLeaf->apKey[2] ){
-    TreeNode *pRight;
-
-    pRight = newTreeLeaf(pEnv, pTree);
-    if( pRight==0 ){
-      rc = LSM_NOMEM_BKPT;
-    }else{
-      pNew->apKey[1] = pLeaf->apKey[0];
-      pRight->apKey[1] = pLeaf->apKey[2];
-      switch( iSlot ){
-        case 0: pNew->apKey[0] = pTreeKey; break;
-        case 1: pNew->apKey[2] = pTreeKey; break;
-        case 2: pRight->apKey[0] = pTreeKey; break;
-        case 3: pRight->apKey[2] = pTreeKey; break;
-      }
-      rc = treeInsert(pEnv, pTree, pCsr, pNew, pLeaf->apKey[1], pRight, 
-          pCsr->aiCell[pCsr->iNode]
-      );
-    }
-  }else{
-    int iOut = 0;
-    int i;
-    for(i=0; i<4; i++){
-      if( i==iSlot ) pNew->apKey[iOut++] = pTreeKey;
-      if( i<3 && pLeaf->apKey[i] ) pNew->apKey[iOut++] = pLeaf->apKey[i];
-    }
-    rc = treeUpdatePtr(pTree, pCsr, pNew);
-  }
-
-  return rc;
+  pNew = newTreeLeaf(pDb, &iNew, &rc);
+  if( pNew ){
+    if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){
+      /* The leaf is full. Split it in two. */
+      TreeLeaf *pRight;
+      u32 iRight;
+      pRight = newTreeLeaf(pDb, &iRight, &rc);
+      if( pRight ){
+        assert( rc==LSM_OK );
+        pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0];
+        pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2];
+        switch( iSlot ){
+          case 0: pNew->aiKeyPtr[0] = iTreeKey; break;
+          case 1: pNew->aiKeyPtr[2] = iTreeKey; break;
+          case 2: pRight->aiKeyPtr[0] = iTreeKey; break;
+          case 3: pRight->aiKeyPtr[2] = iTreeKey; break;
+        }
+
+        rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, 
+            pCsr->aiCell[pCsr->iNode]
+        );
+      }
+    }else{
+      int iOut = 0;
+      int i;
+      for(i=0; i<4; i++){
+        if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey;
+        if( i<3 && pLeaf->aiKeyPtr[i] ){
+          pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i];
+        }
+      }
+      rc = treeUpdatePtr(pDb, pCsr, iNew);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Empty the contents of the in-memory tree.
+*/
+void lsmTreeClear(lsm_db *pDb){
+  pDb->treehdr.iTreeId++;
+  pDb->treehdr.iTransId = 1;
+  pDb->treehdr.iRoot = 0;
+  pDb->treehdr.nHeight = 0;
+  pDb->treehdr.nByte = 0;
+}
+
+/*
+** This function is called during recovery to initialize the 
+** tree header. Only the database connection's private copy of the tree-header
+** is initialized here - it will be copied into shared memory if log file
+** recovery is successful.
+*/
+void lsmTreeInit(lsm_db *pDb){
+  pDb->treehdr.iTransId = 1;
+  pDb->treehdr.iFirst = 1;
+  pDb->treehdr.nChunk = 2;
+  pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR;
+  pDb->treehdr.iTreeId = 1;
 }
 
 /*
 ** Insert a new entry into the in-memory tree.
 **
@@ -736,53 +982,38 @@
   void *pKey,                     /* Pointer to key data */
   int nKey,                       /* Size of key data in bytes */
   void *pVal,                     /* Pointer to value data (or NULL) */
   int nVal                        /* Bytes in value data (or -ve for delete) */
 ){
-  lsm_env *pEnv = pDb->pEnv;
-  TreeVersion *pTV = pDb->pTV;
-  Tree *pTree = pTV->pTree;
   int rc = LSM_OK;                /* Return Code */
   TreeKey *pTreeKey;              /* New key-value being inserted */
   int nTreeKey;                   /* Number of bytes allocated at pTreeKey */
+  u32 iTreeKey;
+  u8 *a;
+  TreeHeader *pHdr = &pDb->treehdr;
 
   assert( nVal>=0 || pVal==0 );
-  assert( pTV==pTree->pWorking );
   assert_tree_looks_ok(LSM_OK, pTree);
-  /* dump_tree_contents(pTree, "before"); */
+#if 0
+  dump_tree_contents(pDb, "before");
+#endif
 
   /* Allocate and populate a new key-value pair structure */
-  nTreeKey = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0);
-  pTreeKey = (TreeKey *)lsmPoolMalloc(pDb->pEnv, pTree->pPool, nTreeKey);
-  if( !pTreeKey ) return LSM_NOMEM_BKPT;
-  pTreeKey->pKey = (void *)&pTreeKey[1];
-  memcpy(pTreeKey->pKey, pKey, nKey);
-  if( nVal>0 ){
-    pTreeKey->pValue = (void *)&((u8 *)(pTreeKey->pKey))[nKey];
-    memcpy(pTreeKey->pValue, pVal, nVal);
-  }else{
-    pTreeKey->pValue = 0;
-  }
-  pTreeKey->nValue = nVal;
-  pTreeKey->nKey = nKey;
-
-  if( pTree->pWorking->pRoot==0 ){
+  pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc);
+  if( rc!=LSM_OK ) return rc;
+
+  if( pHdr->iRoot==0 ){
     /* The tree is completely empty. Add a new root node and install
     ** (pKey/nKey) as the middle entry. Even though it is a leaf at the
     ** moment, use newTreeNode() to allocate the node (i.e. allocate enough
     ** space for the fields used by interior nodes). This is because the
-    ** treeInsert() routine may convert this node to an interior node.  
-    */
-    TreeNode *pRoot;              /* New tree root node */
-    pRoot = newTreeNode(pEnv, pTree);
-    if( !pRoot ){
-      rc = LSM_NOMEM_BKPT;
-    }else{
-      pRoot->apKey[1] = pTreeKey;
-      pTree->pWorking->pRoot = pRoot;
-      assert( pTree->pWorking->nHeight==0 );
-      pTree->pWorking->nHeight = 1;
+    ** treeInsert() routine may convert this node to an interior node. */
+    TreeNode *pRoot = newTreeNode(pDb, &pHdr->iRoot, &rc);
+    if( rc==LSM_OK ){
+      assert( pHdr->nHeight==0 );
+      pRoot->aiKeyPtr[1] = iTreeKey;
+      pHdr->nHeight = 1;
     }
   }else{
     TreeCursor csr;
     int res;
 
@@ -791,27 +1022,30 @@
     lsmTreeCursorSeek(&csr, pKey, nKey, &res);
 
     if( res==0 ){
       /* The search found a match within the tree. */
       TreeNode *pNew;
+      u32 iNew;
       TreeNode *pNode = csr.apTreeNode[csr.iNode];
       int iCell = csr.aiCell[csr.iNode];
 
       /* Create a copy of this node */
-      if( (csr.iNode>0 && csr.iNode==(pTree->pWorking->nHeight-1)) ){
-        pNew = copyTreeLeaf(pEnv, pTree, pNode);
+      if( (csr.iNode>0 && csr.iNode==(pHdr->nHeight-1)) ){
+        pNew = copyTreeLeaf(pDb, (TreeLeaf *)pNode, &iNew, &rc);
       }else{
-        pNew = copyTreeNode(pEnv, pTree, pNode);
+        pNew = copyTreeNode(pDb, pNode, &iNew, &rc);
       }
 
-      /* Modify the value in the new version */
-      pNew->apKey[iCell] = pTreeKey;
+      if( rc==LSM_OK ){
+        /* Modify the value in the new version */
+        pNew->aiKeyPtr[iCell] = iTreeKey;
 
-      /* Change the pointer in the parent (if any) to point at the new 
-      ** TreeNode */
-      csr.iNode--;
-      treeUpdatePtr(pTree, &csr, pNew);
+        /* Change the pointer in the parent (if any) to point at the new 
+        ** TreeNode */
+        csr.iNode--;
+        treeUpdatePtr(pDb, &csr, iNew);
+      }
     }else{
       /* The cursor now points to the leaf node into which the new entry should
       ** be inserted. There may or may not be a free slot within the leaf for
       ** the new key-value pair. 
       **
@@ -820,39 +1054,31 @@
       ** index of the rightmost key if the new key is larger than all keys
       ** currently stored in the node).
       */
       int iSlot = csr.aiCell[csr.iNode] + (res<0);
       if( csr.iNode==0 ){
-        rc = treeInsert(pEnv, pTree, &csr, 0, pTreeKey, 0, iSlot);
+        rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot);
       }else{
-        rc = treeInsertLeaf(pEnv, pTree, &csr, pTreeKey, iSlot);
+        rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot);
       }
     }
+    tblobFree(pDb, &csr.blob);
   }
 
-  /* dump_tree_contents(pTree, "after"); */
+#if 0
+  dump_tree_contents(pDb, "after");
+#endif
   assert_tree_looks_ok(rc, pTree);
   return rc;
 }
 
 /*
 ** Return, in bytes, the amount of memory currently used by the tree 
 ** structure.
 */
-int lsmTreeSize(TreeVersion *pTV){
-  return (lsmPoolUsed(pTV->pTree->pPool) - ROUND8(sizeof(Tree)));
-}
-
-/*
-** Return true if the tree is empty. Otherwise false.
-**
-** The caller is responsible for ensuring that it has exclusive access
-** to the Tree structure for this call.
-*/
-int lsmTreeIsEmpty(Tree *pTree){
-  assert( pTree==0 || pTree->pWorking==0 );
-  return (pTree==0 || pTree->pCommit->pRoot==0);
+int lsmTreeSize(lsm_db *pDb){
+  return pDb->treehdr.nByte;
 }
 
 /*
 ** Open a cursor on the in-memory tree pTree.
 */
@@ -869,10 +1095,11 @@
 /*
 ** Close an in-memory tree cursor.
 */
 void lsmTreeCursorDestroy(TreeCursor *pCsr){
   if( pCsr ){
+    tblobFree(pCsr->pDb, &pCsr->blob);
     lsmFree(pCsr->pDb->pEnv, pCsr);
   }
 }
 
 void lsmTreeCursorReset(TreeCursor *pCsr){
@@ -881,21 +1108,20 @@
 }
 
 #ifndef NDEBUG
 static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){
   TreeKey *p;
-  int cmp;
+  int cmp = 0;
+  int rc = LSM_OK;
   assert( pCsr->iNode>=0 );
-  p = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
-  cmp = memcmp(p->pKey, pKey, LSM_MIN(p->nKey, nKey));
-  if( cmp==0 ){
-    cmp = p->nKey - nKey;
+  p = csrGetKey(pCsr, &pCsr->blob, &rc);
+  if( p ){
+    cmp = pCsr->pDb->xCmp(TK_KEY(p), p->nKey, pKey, nKey);
   }
   return cmp;
 }
 #endif
-
 
 
 /*
 ** Attempt to seek the cursor passed as the first argument to key (pKey/nKey)
 ** in the tree structure. If an exact match for the key is found, leave the
@@ -909,87 +1135,99 @@
 **     is smaller than the key and set *pRes to -1, or
 **
 **   * If the tree is empty, leave the cursor at EOF and set *pRes to -1.
 */
 int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){
-  TreeVersion *p = pCsr->pDb->pTV;
-  int (*xCmp)(void *, int, void *, int) = p->pTree->xCmp;
-  TreeNode *pNode = p->pRoot;     /* Current node in search */
+  int rc = LSM_OK;                /* Return code */
+  lsm_db *pDb = pCsr->pDb;
+  TreeHeader *pHdr = &pCsr->pDb->treehdr;
+  int (*xCmp)(void *, int, void *, int) = pDb->xCmp;
+
+  u32 iNodePtr;                   /* Location of current node in search */
 
   /* Discard any saved position data */
   treeCursorRestore(pCsr, 0);
 
-  if( pNode==0 ){
-    /* A special case - the tree is completely empty. */
+  iNodePtr = pDb->treehdr.iRoot;
+  if( iNodePtr==0 ){
+    /* Either an error occurred or the tree is completely empty. */
+    assert( rc!=LSM_OK || pDb->treehdr.iRoot==0 );
     *pRes = -1;
     pCsr->iNode = -1;
   }else{
+    TreeBlob b = {0, 0};
     int res = 0;                  /* Result of comparison function */
     int iNode = -1;
-    while( pNode ){
+    while( iNodePtr ){
+      TreeNode *pNode;            /* Node at location iNodePtr */
       int iTest;                  /* Index of second key to test (0 or 2) */
       TreeKey *pTreeKey;          /* Key to compare against */
 
+      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
       iNode++;
       pCsr->apTreeNode[iNode] = pNode;
 
       /* Compare (pKey/nKey) with the key in the middle slot of B-tree node
       ** pNode. The middle slot is never empty. If the comparison is a match,
       ** then the search is finished. Break out of the loop. */
-      pTreeKey = pNode->apKey[1];
-      res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey);
+      pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TK_LOADKEY, &b, &rc);
+      if( rc!=LSM_OK ) break;
+      res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
       if( res==0 ){
         pCsr->aiCell[iNode] = 1;
         break;
       }
 
       /* Based on the results of the previous comparison, compare (pKey/nKey)
       ** to either the left or right key of the B-tree node, if such a key
       ** exists. */
       iTest = (res>0 ? 0 : 2);
-      pTreeKey = pNode->apKey[iTest];
+      pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[iTest], TK_LOADKEY, &b, &rc);
+      if( rc ) break;
       if( pTreeKey==0 ){
         iTest = 1;
       }else{
-        res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey);
+        res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey);
         if( res==0 ){
           pCsr->aiCell[iNode] = iTest;
           break;
         }
       }
 
-      if( iNode<(p->nHeight-1) ){
-        pNode = getChildPtr(pNode, p->iVersion, iTest + (res<0));
+      if( iNode<(pHdr->nHeight-1) ){
+        iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iTest + (res<0));
       }else{
-        pNode = 0;
+        iNodePtr = 0;
       }
-      pCsr->aiCell[iNode] = iTest + (pNode && (res<0));
+      pCsr->aiCell[iNode] = iTest + (iNodePtr && (res<0));
     }
 
     *pRes = res;
     pCsr->iNode = iNode;
+    tblobFree(pDb, &b);
   }
 
   /* assert() that *pRes has been set properly */
 #ifndef NDEBUG
-  if( lsmTreeCursorValid(pCsr) ){
+  if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){
     int cmp = treeCsrCompare(pCsr, pKey, nKey);
     assert( *pRes==cmp || (*pRes ^ cmp)>0 );
   }
 #endif
 
-  return LSM_OK;
+  return rc;
 }
 
 int lsmTreeCursorNext(TreeCursor *pCsr){
 #ifndef NDEBUG
   TreeKey *pK1;
+  TreeBlob key1 = {0, 0};
 #endif
-
-  TreeVersion *p = pCsr->pDb->pTV;
-  const int iLeaf = p->nHeight-1;
+  lsm_db *pDb = pCsr->pDb;
+  const int iLeaf = pDb->treehdr.nHeight-1;
   int iCell; 
+  int rc = LSM_OK; 
   TreeNode *pNode; 
 
   /* Restore the cursor position, if required */
   int iRestore = 0;
   treeCursorRestore(pCsr, &iRestore);
@@ -997,11 +1235,12 @@
 
   /* Save a pointer to the current key. This is used in an assert() at the
   ** end of this function - to check that the 'next' key really is larger
   ** than the current key. */
 #ifndef NDEBUG
-  pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
+  pK1 = csrGetKey(pCsr, &key1, &rc);
+  if( rc!=LSM_OK ) return rc;
 #endif
 
   assert( lsmTreeCursorValid(pCsr) );
   assert( pCsr->aiCell[pCsr->iNode]<3 );
 
@@ -1009,49 +1248,51 @@
   iCell = ++pCsr->aiCell[pCsr->iNode];
 
   /* If the current node is not a leaf, and the current cell has sub-tree
   ** associated with it, descend to the left-most key on the left-most
   ** leaf of the sub-tree.  */
-  if( pCsr->iNode<iLeaf && getChildPtr(pNode, p->iVersion, iCell) ){
+  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){
     do {
+      u32 iNodePtr;
       pCsr->iNode++;
-      pNode = getChildPtr(pNode, p->iVersion, iCell);
+      iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell);
+      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
       pCsr->apTreeNode[pCsr->iNode] = pNode;
-      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->apKey[0]==0);
+      iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0);
     }while( pCsr->iNode < iLeaf );
   }
 
   /* Otherwise, the next key is found by following pointer up the tree 
   ** until there is a key immediately to the right of the pointer followed 
   ** to reach the sub-tree containing the current key. */
-  else if( iCell>=3 || pNode->apKey[iCell]==0 ){
+  else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){
     while( (--pCsr->iNode)>=0 ){
       iCell = pCsr->aiCell[pCsr->iNode];
-      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break;
+      if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
     }
   }
 
 #ifndef NDEBUG
   if( pCsr->iNode>=0 ){
-    TreeKey *pK2;
-    int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
-    pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
-    assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)>0 );
+    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
+    assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)>0 );
   }
+  tblobFree(pDb, &key1);
 #endif
 
-  return LSM_OK;
+  return rc;
 }
 
 int lsmTreeCursorPrev(TreeCursor *pCsr){
 #ifndef NDEBUG
   TreeKey *pK1;
+  TreeBlob key1 = {0, 0};
 #endif
-
-  TreeVersion *p = pCsr->pDb->pTV;
-  const int iLeaf = p->nHeight-1;
+  lsm_db *pDb = pCsr->pDb;
+  const int iLeaf = pDb->treehdr.nHeight-1;
   int iCell; 
+  int rc = LSM_OK; 
   TreeNode *pNode; 
 
   /* Restore the cursor position, if required */
   int iRestore = 0;
   treeCursorRestore(pCsr, &iRestore);
@@ -1059,11 +1300,12 @@
 
   /* Save a pointer to the current key. This is used in an assert() at the
   ** end of this function - to check that the 'next' key really is smaller
   ** than the current key. */
 #ifndef NDEBUG
-  pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
+  pK1 = csrGetKey(pCsr, &key1, &rc);
+  if( rc!=LSM_OK ) return rc;
 #endif
 
   assert( lsmTreeCursorValid(pCsr) );
   pNode = pCsr->apTreeNode[pCsr->iNode];
   iCell = pCsr->aiCell[pCsr->iNode];
@@ -1070,16 +1312,19 @@
   assert( iCell>=0 && iCell<3 );
 
   /* If the current node is not a leaf, and the current cell has sub-tree
   ** associated with it, descend to the right-most key on the right-most
   ** leaf of the sub-tree.  */
-  if( pCsr->iNode<iLeaf && getChildPtr(pNode, p->iVersion, iCell) ){
+  if( pCsr->iNode<iLeaf && getChildPtr(pNode, pDb->treehdr.iTransId, iCell) ){
     do {
+      u32 iNodePtr;
       pCsr->iNode++;
-      pNode = getChildPtr(pNode, p->iVersion, iCell);
+      iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell);
+      pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
+      if( rc!=LSM_OK ) break;
       pCsr->apTreeNode[pCsr->iNode] = pNode;
-      iCell = 1 + (pNode->apKey[2]!=0) + (pCsr->iNode < iLeaf);
+      iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf);
       pCsr->aiCell[pCsr->iNode] = iCell;
     }while( pCsr->iNode < iLeaf );
   }
 
   /* Otherwise, the next key is found by following pointer up the tree until
@@ -1086,89 +1331,107 @@
   ** there is a key immediately to the left of the pointer followed to reach
   ** the sub-tree containing the current key. */
   else{
     do {
       iCell = pCsr->aiCell[pCsr->iNode]-1;
-      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break;
+      if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break;
     }while( (--pCsr->iNode)>=0 );
     pCsr->aiCell[pCsr->iNode] = iCell;
   }
 
 #ifndef NDEBUG
   if( pCsr->iNode>=0 ){
-    TreeKey *pK2;
-    int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp;
-    pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
-    assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)<0 );
+    TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc);
+    assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)<0 );
   }
+  tblobFree(pDb, &key1);
 #endif
 
-  return LSM_OK;
+  return rc;
 }
 
 /*
 ** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the
 ** in-memory tree.
 */
 int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){
-  TreeVersion *p = pCsr->pDb->pTV;
-  TreeNode *pNode = p->pRoot;
+  lsm_db *pDb = pCsr->pDb;
+  TreeHeader *pHdr = &pDb->treehdr;
+  int rc = LSM_OK;
+
+  u32 iNodePtr;
   pCsr->iNode = -1;
 
   /* Discard any saved position data */
   treeCursorRestore(pCsr, 0);
 
-  while( pNode ){
+  iNodePtr = pHdr->iRoot;
+  while( iNodePtr ){
     int iCell;
+    TreeNode *pNode;
+
+    pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc);
+    if( rc ) break;
+
     if( bLast ){
-      iCell = ((pNode->apKey[2]==0) ? 2 : 3);
+      iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3);
     }else{
-      iCell = ((pNode->apKey[0]==0) ? 1 : 0);
+      iCell = ((pNode->aiKeyPtr[0]==0) ? 1 : 0);
     }
-
     pCsr->iNode++;
     pCsr->apTreeNode[pCsr->iNode] = pNode;
 
-    if( pCsr->iNode<p->nHeight-1 ){
-      pNode = getChildPtr(pNode, p->iVersion, iCell);
-    }else{
-      pNode = 0;
-    }
-    pCsr->aiCell[pCsr->iNode] = iCell - (pNode==0 && bLast);
-  }
-  return LSM_OK;
+    if( pCsr->iNode<pHdr->nHeight-1 ){
+      iNodePtr = getChildPtr(pNode, pHdr->iTransId, iCell);
+    }else{
+      iNodePtr = 0;
+    }
+    pCsr->aiCell[pCsr->iNode] = iCell - (iNodePtr==0 && bLast);
+  }
+
+  return rc;
 }
 
 int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey){
   TreeKey *pTreeKey;
+  int rc = LSM_OK;
+
   assert( lsmTreeCursorValid(pCsr) );
 
   pTreeKey = pCsr->pSave;
   if( !pTreeKey ){
-    pTreeKey = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
+    pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
+  }
+  if( rc==LSM_OK ){
+    *pnKey = pTreeKey->nKey;
+    *ppKey = (void *)&pTreeKey[1];
   }
-  *ppKey = pTreeKey->pKey;
-  *pnKey = pTreeKey->nKey;
 
-  return LSM_OK;
+  return rc;
 }
 
 int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){
-  TreeKey *pTreeKey;
   int res = 0;
+  int rc;
 
-  treeCursorRestore(pCsr, &res);
+  rc = treeCursorRestore(pCsr, &res);
   if( res==0 ){
-    pTreeKey = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]];
-    *ppVal = pTreeKey->pValue;
-    *pnVal = pTreeKey->nValue;
+    TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc);
+    if( rc==LSM_OK ){
+      *pnVal = pTreeKey->nValue;
+      if( pTreeKey->nValue>=0 ){
+        *ppVal = TK_VAL(pTreeKey);
+      }else{
+        *ppVal = 0;
+      }
+    }
   }else{
     *ppVal = 0;
     *pnVal = 0;
   }
 
-  return LSM_OK;
+  return rc;
 }
 
 /*
 ** Return true if the cursor currently points to a valid entry. 
 */
@@ -1175,194 +1438,159 @@
 int lsmTreeCursorValid(TreeCursor *pCsr){
   return (pCsr && (pCsr->pSave || pCsr->iNode>=0));
 }
 
 /*
-** Roll back to mark pMark. Structure *pMark should have been previously
-** populated by a call to lsmTreeMark().
-*/
-void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
-  TreeVersion *pWorking = pDb->pTV;
-  Tree *pTree = pWorking->pTree;
-  TreeNode *p;
-
-  assert( lsmTreeIsWriteVersion(pWorking) );
-
-  pWorking->pRoot = (TreeNode *)pMark->pRoot;
-  pWorking->nHeight = pMark->nHeight;
-
-  if( pMark->pRollback ){
-    p = ((TreeNode *)pMark->pRollback)->pNext;
-  }else{
-    p = pTree->pRbFirst;
-  }
-
-  while( p ){
-    TreeNode *pNext = p->pNext;
-    assert( p->iV2!=0 );
-    assert( pNext || p==pTree->pRbLast );
-    p->iV2 = 0;
-    p->iV2Ptr = 0;
-    p->pV2Ptr = 0;
-    p->pNext = 0;
-    p = pNext;
-  }
-
-  pTree->pRbLast = (TreeNode *)pMark->pRollback;
-  if( pTree->pRbLast ){
-    pTree->pRbLast->pNext = 0;
-  }else{
-    pTree->pRbFirst = 0;
-  }
-
-  lsmPoolRollback(pDb->pEnv, pTree->pPool, pMark->pMpChunk, pMark->iMpOff);
-}
-
-/*
-** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a
-** pointer to the same TreeMark structure may be used to roll the tree
-** contents back to their current state.
-*/
-void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark){
-  Tree *pTree = pTV->pTree;
-  memset(pMark, 0, sizeof(TreeMark));
-  pMark->pRoot = (void *)pTV->pRoot;
-  pMark->nHeight = pTV->nHeight;
-  pMark->pRollback = (void *)pTree->pRbLast;
-  lsmPoolMark(pTree->pPool, &pMark->pMpChunk, &pMark->iMpOff);
-
-  assert( lsmTreeIsWriteVersion(pTV) );
-  pTV->iVersion++;
-}
-
-/*
-** This is called when a client wishes to upgrade from a read to a write
-** transaction. If the read-version passed as the second version is the
-** most recent one, decrement its ref-count and return a pointer to
-** the write-version object. Otherwise return null. So we can do:
-**
-**     // Open read-transaction
-**     pReadVersion = lsmTreeReadVersion(pTree);
-**
-**     // Later on, attempt to upgrade to write transaction
-**     if( pWriteVersion = lsmTreeWriteVersion(pTree, pReadVersion) ){
-**       // Have upgraded to a write transaction!
-**     }else{
-**       // Reading an out-of-date snapshot. Upgrade fails.
-**     }
-**
-** The caller must take care of rejecting a clients attempt to upgrade to
-** a write transaction *while* another client has a write transaction 
-** underway. This mechanism merely prevents writing to an out-of-date
-** snapshot.
-*/
-int lsmTreeWriteVersion(
-  lsm_env *pEnv,
-  Tree *pTree, 
-  TreeVersion **ppVersion
-){
-  TreeVersion *pRead = *ppVersion;
-  TreeVersion *pRet;
-
-  /* The caller must ensure that no other write transaction is underway. */
-  assert( pTree->pWorking==0 );
-  
-  if( pRead && pTree->pCommit!=pRead ) return LSM_BUSY;
-  pRet = lsmMallocZero(pEnv, sizeof(TreeVersion));
-  if( pRet==0 ) return LSM_NOMEM_BKPT;
-  pTree->pWorking = pRet;
-
-  memcpy(pRet, pTree->pCommit, sizeof(TreeVersion));
-  pRet->nRef = 1;
-  if( pRead ) pRead->nRef--;
-  *ppVersion = pRet;
-  assert( pRet->pTree==pTree );
-  return LSM_OK;
-}
-
-static void treeIncrRefcount(Tree *pTree){
-  pTree->nTreeRef++;
-}
-
-static void treeDecrRefcount(lsm_env *pEnv, Tree *pTree){
-  assert( pTree->nTreeRef>0 );
-  pTree->nTreeRef--;
-  if( pTree->nTreeRef==0 ){
-    assert( pTree->pWorking==0 );
-    treeDestroy(pEnv, pTree);
-  }
-}
-
-/*
-** Release a reference to the write-version.
-*/
-int lsmTreeReleaseWriteVersion(
-  lsm_env *pEnv,
-  TreeVersion *pWorking,          /* Write-version reference */
-  int bCommit,                    /* True for a commit */
-  TreeVersion **ppReadVersion     /* OUT: Read-version reference */
-){
-  Tree *pTree = pWorking->pTree;
-
-  assert( lsmTreeIsWriteVersion(pWorking) );
-  assert( pWorking->nRef==1 );
-
-  if( bCommit ){
-    treeIncrRefcount(pTree);
-    lsmTreeReleaseReadVersion(pEnv, pTree->pCommit);
-    pTree->pCommit = pWorking;
-  }else{
-    lsmFree(pEnv, pWorking);
-  }
-
-  pTree->pWorking = 0;
-  if( ppReadVersion ){
-    *ppReadVersion = lsmTreeReadVersion(pTree);
-  }
-  return LSM_OK;
-}
-
-
-TreeVersion *lsmTreeRecoverVersion(Tree *pTree){
-  return pTree->pCommit;
-}
-
-/*
-** Return a reference to a TreeVersion structure that may be used to read
-** the database. The reference should be released at some point in the future
-** by calling lsmTreeReleaseReadVersion().
-*/
-TreeVersion *lsmTreeReadVersion(Tree *pTree){
-  TreeVersion *pRet = pTree->pCommit;
-  assert( pRet->nRef>0 );
-  pRet->nRef++;
-  return pRet;
-}
-
-/*
-** Release a reference to a read-version.
-*/
-void lsmTreeReleaseReadVersion(lsm_env *pEnv, TreeVersion *pTreeVersion){
-  if( pTreeVersion ){
-    assert( pTreeVersion->nRef>0 );
-    pTreeVersion->nRef--;
-    if( pTreeVersion->nRef==0 ){
-      Tree *pTree = pTreeVersion->pTree;
-      lsmFree(pEnv, pTreeVersion);
-      treeDecrRefcount(pEnv, pTree);
-    }
-  }
-}
-
-/*
-** Return true if the tree-version passed as the first argument is writable. 
-*/
-int lsmTreeIsWriteVersion(TreeVersion *pTV){
-  return (pTV==pTV->pTree->pWorking);
-}
-
-void lsmTreeRelease(lsm_env *pEnv, Tree *pTree){
-  if( pTree ){
-    assert( pTree->nTreeRef>0 && pTree->pCommit );
-    lsmTreeReleaseReadVersion(pEnv, pTree->pCommit);
-  }
-}
+** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a
+** pointer to the same TreeMark structure may be used to roll the tree
+** contents back to their current state.
+*/
+void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){
+  pMark->iRoot = pDb->treehdr.iRoot;
+  pMark->nHeight = pDb->treehdr.nHeight;
+  pMark->iWrite = pDb->treehdr.iWrite;
+  pMark->nChunk = pDb->treehdr.nChunk;
+  pMark->iFirst = pDb->treehdr.iFirst;
+  pMark->iRollback = intArraySize(&pDb->rollback);
+}
+
+/*
+** Roll back to mark pMark. Structure *pMark should have been previously
+** populated by a call to lsmTreeMark().
+*/
+void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){
+  int rcdummy = LSM_OK;
+  int iIdx;
+  int nIdx;
+  u32 iNext;
+  ShmChunk *pChunk;
+  u32 iChunk;
+
+  /* Revert all required v2 pointers. */
+  nIdx = intArraySize(&pDb->rollback);
+  for(iIdx = pMark->iRollback; iIdx<nIdx; iIdx++){
+    TreeNode *pNode;
+    pNode = treeShmptr(pDb, intArrayEntry(&pDb->rollback, iIdx), &rcdummy);
+    assert( pNode && rcdummy==LSM_OK );
+    pNode->iV2 = 0;
+    pNode->iV2Child = 0;
+    pNode->iV2Ptr = 0;
+  }
+  intArrayTruncate(&pDb->rollback, pMark->iRollback);
+
+  /* Restore the free-chunk list */
+  assert( pMark->iWrite!=0 );
+  iChunk = treeOffsetToChunk(pMark->iWrite-1);
+  pChunk = treeShmChunk(pDb, iChunk);
+  iNext = pChunk->iNext;
+  pChunk->iNext = 0;
+  assert( iNext==0 
+       || pDb->treehdr.iFirst==pMark->iFirst 
+       || iNext==pMark->iFirst 
+  );
+  pDb->treehdr.iFirst = pMark->iFirst;
+  while( iNext ){
+    iChunk = iNext;
+    pChunk = treeShmChunk(pDb, iChunk);
+    iNext = pChunk->iNext;
+    if( iChunk<pMark->nChunk ){
+      pChunk->iNext = pDb->treehdr.iFirst;
+      pChunk->iLastTree = 0;
+    }
+  }
+
+  /* Restore the tree-header fields */
+  pDb->treehdr.iRoot = pMark->iRoot;
+  pDb->treehdr.nHeight = pMark->nHeight;
+  pDb->treehdr.iWrite = pMark->iWrite;
+  pDb->treehdr.nChunk = pMark->nChunk;
+}
+
+static void treeHeaderChecksum(
+  TreeHeader *pHdr, 
+  u32 *aCksum
+){
+  u32 cksum1 = 0x12345678;
+  u32 cksum2 = 0x9ABCDEF0;
+  u32 *a = (u32 *)pHdr;
+  int i;
+
+  assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) );
+  assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 );
+
+  for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){
+    cksum1 += a[i];
+    cksum2 += (cksum1 + a[i+1]);
+  }
+  aCksum[0] = cksum1;
+  aCksum[1] = cksum2;
+}
+
+/*
+** Return true if the checksum stored in TreeHeader object *pHdr is 
+** consistent with the contents of its other fields.
+*/
+static int treeHeaderChecksumOk(TreeHeader *pHdr){
+  u32 aCksum[2];
+  treeHeaderChecksum(pHdr, aCksum);
+  return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum)));
+}
+
+/*
+** Load the in-memory tree header from shared-memory into pDb->treehdr.
+** If the header cannot be loaded, return LSM_BUSY.
+*/
+int lsmTreeLoadHeader(lsm_db *pDb){
+  while( 1 ){
+    int rc;
+    ShmHeader *pShm = pDb->pShmhdr;
+
+    memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
+    if( treeHeaderChecksumOk(&pDb->treehdr) ) return LSM_OK;
+
+    rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      usleep(50);
+    }else{
+      if( rc==LSM_OK ){
+        if( treeHeaderChecksumOk(&pShm->hdr1)==0 ){
+          memcpy(&pShm->hdr1, &pShm->hdr2, sizeof(TreeHeader));
+        }
+        memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader));
+        lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+
+        if( treeHeaderChecksumOk(&pDb->treehdr)==0 ){
+          rc = LSM_CORRUPT_BKPT;
+        }
+      }
+      return rc;
+    }
+  }
+}
+
+/*
+** This function is called to conclude a transaction. If argument bCommit
+** is true, the transaction is committed. Otherwise it is rolled back.
+*/
+int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){
+  ShmHeader *pShm = pDb->pShmhdr;
+
+  if( bCommit ){
+    treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum);
+    memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader));
+    lsmShmBarrier(pDb);
+    memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader));
+  }
+  pShm->bWriter = 0;
+  intArrayFree(pDb->pEnv, &pDb->rollback);
+
+  return LSM_OK;
+}
+
+/*
+** Begin a new transaction.
+*/
+int lsmTreeBeginTransaction(lsm_db *pDb){
+  pDb->treehdr.iTransId++;
+  return LSM_OK;
+}
+

Index: src/lsm_unix.c
==================================================================
--- src/lsm_unix.c
+++ src/lsm_unix.c
@@ -34,25 +34,39 @@
 
 #include <unistd.h>
 #include <errno.h>
 
 #include <sys/mman.h>
-
 #include "lsmInt.h"
 
 /*
 ** An open file is an instance of the following object
 */
 typedef struct PosixFile PosixFile;
 struct PosixFile {
-  lsm_env *pEnv;     /* The run-time environment */
-  int fd;            /* The open file descriptor */
-  void *pMap;
-  off_t nMap;
+  lsm_env *pEnv;                  /* The run-time environment */
+  const char *zName;              /* Full path to file */
+  int fd;                         /* The open file descriptor */
+  int shmfd;                      /* Shared memory file-descriptor */
+  void *pMap;                     /* Pointer to mapping of file fd */
+  off_t nMap;                     /* Size of mapping at pMap in bytes */
+  int nShm;                       /* Number of entries in array apShm[] */
+  void **apShm;                   /* Array of 32K shared memory segments */
 };
 
 static int lsm_ioerr(void){ return LSM_IOERR; }
+
+static char *posixShmFile(PosixFile *p){
+  char *zShm;
+  int nName = strlen(p->zName);
+  zShm = (char *)lsmMalloc(p->pEnv, nName+4+1);
+  if( zShm ){
+    memcpy(zShm, p->zName, nName);
+    memcpy(&zShm[nName], "-shm", 5);
+  }
+  return zShm;
+}
 
 static int lsmPosixOsOpen(
   lsm_env *pEnv,
   const char *zFile, 
   lsm_file **ppFile
@@ -63,10 +77,11 @@
   p = lsm_malloc(pEnv, sizeof(PosixFile));
   if( p==0 ){
     rc = LSM_NOMEM;
   }else{
     memset(p, 0, sizeof(PosixFile));
+    p->zName = zFile;
     p->pEnv = pEnv;
     p->fd = open(zFile, O_RDWR|O_CREAT, 0644);
     if( p->fd<0 ){
       lsm_free(pEnv, p);
       p = 0;
@@ -262,24 +277,133 @@
 
   memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev));
   memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino));
   return LSM_OK;
 }
+
+static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){
+  int prc = unlink(zFile);
+  return prc ? LSM_IOERR_BKPT : LSM_OK;
+}
+
+/* Set lock iLock (1..16) on the database file to eType, without blocking.
+** Returns LSM_OK, LSM_BUSY (conflicting lock held) or LSM_IOERR. */
+int lsmPosixOsLock(lsm_file *pFile, int iLock, int eType){
+  int rc = LSM_OK;
+  PosixFile *p = (PosixFile *)pFile;
+  static const short aType[3] = { F_UNLCK, F_RDLCK, F_WRLCK };
+  struct flock lock;
+
+  assert( aType[LSM_LOCK_UNLOCK]==F_UNLCK );
+  assert( aType[LSM_LOCK_SHARED]==F_RDLCK );
+  assert( aType[LSM_LOCK_EXCL]==F_WRLCK );
+  assert( eType>=0 && eType<array_size(aType) );
+  assert( iLock>0 && iLock<=16 );
+  /* Lock iLock is represented by the single byte at offset (4096-iLock). */
+  memset(&lock, 0, sizeof(lock));
+  lock.l_whence = SEEK_SET;
+  lock.l_len = 1;
+  lock.l_type = aType[eType];
+  lock.l_start = (4096-iLock);
+  /* F_SETLK fails with EACCES/EAGAIN instead of waiting on a conflict. */
+  if( fcntl(p->fd, F_SETLK, &lock) ){
+    if( errno==EACCES || errno==EAGAIN ){
+      rc = LSM_BUSY;
+    }else{
+      rc = LSM_IOERR;
+    }
+  }
+  return rc;
+}
+
+/* Map chunk iChunk (each LSM_SHM_CHUNK_SIZE bytes) of the "<db>-shm" file,
+** opening and extending the file as required. On success *ppShm points at
+** the mapping; on error *ppShm is 0 and an LSM error code is returned. */
+int lsmPosixOsShmMap(lsm_file *pFile, int iChunk, int sz, void **ppShm){
+  PosixFile *p = (PosixFile *)pFile;
+  *ppShm = 0;
+  assert( sz==LSM_SHM_CHUNK_SIZE );
+  if( iChunk>=p->nShm ){
+    int i;
+    void **apNew;
+    int nNew = iChunk+1;
+    off_t nReq = (off_t)nNew * LSM_SHM_CHUNK_SIZE;
+    struct stat sStat;
+
+    /* If the shared-memory file has not been opened, open it now. */
+    if( p->shmfd<=0 ){
+      char *zShm = posixShmFile(p);
+      if( !zShm ) return LSM_NOMEM_BKPT;
+      p->shmfd = open(zShm, O_RDWR|O_CREAT, 0644);
+      lsmFree(p->pEnv, zShm);
+      if( p->shmfd<0 ) return LSM_IOERR_BKPT;
+    }
+
+    /* If the shared-memory file is not large enough to contain the 
+    ** requested chunk, cause it to grow.  */
+    if( fstat(p->shmfd, &sStat) ){
+      return LSM_IOERR_BKPT;
+    }
+    if( sStat.st_size<nReq ){
+      if( ftruncate(p->shmfd, nReq) ){
+        return LSM_IOERR_BKPT;
+      }
+    }
+    /* Grow the apShm[] array, marking each new slot as not yet mapped. */
+    apNew = (void **)lsmRealloc(p->pEnv, p->apShm, sizeof(void *) * nNew);
+    if( !apNew ) return LSM_NOMEM_BKPT;
+    for(i=p->nShm; i<nNew; i++){
+      apNew[i] = 0;
+    }
+    p->apShm = apNew;
+    p->nShm = nNew;
+  }
+  /* Map on first use. mmap() reports failure as MAP_FAILED, never as 0. */
+  if( p->apShm[iChunk]==0 ){
+    void *pNew = mmap(0, LSM_SHM_CHUNK_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED,
+        p->shmfd, (off_t)iChunk*LSM_SHM_CHUNK_SIZE);
+    if( pNew==MAP_FAILED ) return LSM_IOERR;
+    p->apShm[iChunk] = pNew;
+  }
+
+  *ppShm = p->apShm[iChunk];
+  return LSM_OK;
+}
+
+void lsmPosixOsShmBarrier(void){
+}
+
+/* Unmap every mapped chunk and close the *-shm file. If bDelete is true,
+** also unlink the *-shm file from the file-system. Always returns LSM_OK. */
+int lsmPosixOsShmUnmap(lsm_file *pFile, int bDelete){
+  PosixFile *p = (PosixFile *)pFile;
+  if( p->shmfd>0 ){
+    int i;
+    for(i=0; i<p->nShm; i++){
+      if( p->apShm[i] ) munmap(p->apShm[i], LSM_SHM_CHUNK_SIZE);
+      p->apShm[i] = 0;
+    }
+    close(p->shmfd);
+    p->shmfd = 0;
+    if( bDelete ){
+      char *zShm = posixShmFile(p);
+      if( zShm ){ unlink(zShm); lsmFree(p->pEnv, zShm); }
+    }
+  }
+  return LSM_OK;
+}
+
 
 static int lsmPosixOsClose(lsm_file *pFile){
    PosixFile *p = (PosixFile *)pFile;
+   lsmPosixOsShmUnmap(pFile, 0);
    if( p->pMap ) munmap(p->pMap, p->nMap);
    close(p->fd);
    lsm_free(p->pEnv, p);
    return LSM_OK;
 }
 
-static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){
-  int prc = unlink(zFile);
-  return prc ? LSM_IOERR_BKPT : LSM_OK;
-}
-
 /****************************************************************************
 ** Memory allocation routines.
 */
 #define ROUND8(x) (((x)+7)&~7)
 #define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) )
@@ -530,10 +654,14 @@
     lsmPosixOsSectorSize,    /* xSectorSize */
     lsmPosixOsRemap,         /* xRemap */
     lsmPosixOsFileid,        /* xFileid */
     lsmPosixOsClose,         /* xClose */
     lsmPosixOsUnlink,        /* xUnlink */
+    lsmPosixOsLock,          /* xLock */
+    lsmPosixOsShmMap,        /* xShmMap */
+    lsmPosixOsShmBarrier,    /* xShmBarrier */
+    lsmPosixOsShmUnmap,      /* xShmUnmap */
     /***** memory allocation *********/
     0,                       /* pMemCtx */
     lsmPosixOsMalloc,        /* xMalloc */
     lsmPosixOsRealloc,       /* xRealloc */
     lsmPosixOsFree,          /* xFree */

Index: test/attach.test
==================================================================
--- test/attach.test
+++ test/attach.test
@@ -22,12 +22,11 @@
   finish_test
   return
 }
 
 for {set i 2} {$i<=15} {incr i} {
-  forcedelete test$i.db
-  forcedelete test$i.db-journal
+  db_delete test$i.db
 }
 
 do_test attach-1.1 {
   execsql {
     CREATE TABLE t1(a,b);

ADDED   test/ckpt1.test
Index: test/ckpt1.test
==================================================================
--- /dev/null
+++ test/ckpt1.test
@@ -0,0 +1,94 @@
+# 2012 August 29
+#
+# The author disclaims copyright to this source code.  In place of
+# a legal notice, here is a blessing:
+#
+#    May you do good and not evil.
+#    May you find forgiveness for yourself and forgive others.
+#    May you share freely, never taking more than you give.
+#
+#***********************************************************************
+# The tests in this file focus on testing that very large checkpoints
+# (those that occur when the database contains an unusually large number 
+# of levels or free blocks) are handled correctly.
+#
+
+set testdir [file dirname $argv0]
+source $testdir/tester.tcl
+set testprefix ckpt1
+
+# Check that lsm_config(AUTOWORK) seems to be connected to something.
+#
+do_test 1.1 { sqlite4_lsm_config db main autowork  0  } 0
+do_test 1.2 { sqlite4_lsm_config db main autowork  1  } 1
+do_test 1.3 { sqlite4_lsm_config db main autowork -1  } 1
+do_test 1.4 { sqlite4_lsm_config db main autowork  0  } 0
+do_test 1.5 { sqlite4_lsm_config db main autowork -1  } 0
+
+
+set nLevel 200
+do_execsql_test 2.0 { CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER UNIQUE) }
+do_test 2.1 {
+  for {set i 1} {$i <= $nLevel} {incr i} {
+    db close
+    sqlite4 db test.db
+    sqlite4_lsm_config db main autowork 0
+    db eval { INSERT INTO t1 VALUES($i, $i || $i) }
+  }
+  db eval { 
+    SELECT count(*) FROM t1;
+    PRAGMA integrity_check;
+  }
+} [list $nLevel ok]
+
+
+#-------------------------------------------------------------------------
+# The point of this test is to add a large number of blocks to the 
+# free-block list and check that this doesn't seem to cause any
+# obvious problems.
+#
+do_test 3.0 {
+  db close
+  forcedelete test.db
+  sqlite4 db file:test.db?lsm_block_size=65536
+  execsql { 
+    CREATE TABLE t1(a PRIMARY KEY, b);
+    CREATE INDEX i1 ON t1(b);
+  }
+} {}
+do_execsql_test 3.1 {
+  INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100));
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   2
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   4
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --   8
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  16
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  32
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  64
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 128
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 256
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 512
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  1K
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  2K
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  4K
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   --  8K
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 16K
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 32K
+  INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1;   -- 64K
+}
+do_test 3.2 {
+  sqlite4_lsm_work db main -optimize 1000000
+  execsql { SELECT count(*) FROM t1 }
+} {65536}
+do_test 3.3 {
+  db close
+  sqlite4 db test.db
+  execsql { SELECT count(*) FROM t1 }
+} {65536}
+do_test 3.4 {
+  execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) }
+  sqlite4_lsm_work db main -optimize 1000000
+  execsql { SELECT count(*) FROM t1 }
+} {65537}
+
+finish_test
+

Index: test/manydb.test
==================================================================
--- test/manydb.test
+++ test/manydb.test
@@ -17,28 +17,18 @@
 
 set testdir [file dirname $argv0]
 source $testdir/tester.tcl
 
 set N 300
-# if we're using proxy locks, we use 5 filedescriptors for a db
-# that is open and in the middle of writing changes, normally
-# sqlite uses 3 (proxy locking adds the conch and the local lock)
-set using_proxy 0
-foreach {name value} [array get env SQLITE4_FORCE_PROXY_LOCKING] {
-  set using_proxy value
-}
-set num_fd_per_openwrite_db 3
-if {$using_proxy>0} {
-  set num_fd_per_openwrite_db 5
-} 
+set num_fd_per_openwrite_db 4
 
 # First test how many file descriptors are available for use. To open a
 # database for writing SQLite requires 3 file descriptors (the database, the
 # journal and the directory).
 set filehandles {}
 catch {
-  for {set i 0} {$i<($N * 3)} {incr i} {
+  for {set i 0} {$i<($N * $num_fd_per_openwrite_db)} {incr i} {
     lappend filehandles [open testfile.1 w]
   }
 }
 foreach fd $filehandles {
   close $fd

Index: test/permutations.test
==================================================================
--- test/permutations.test
+++ test/permutations.test
@@ -131,11 +131,14 @@
 #
 lappend ::testsuitelist xxx
 
 test_suite "src4" -prefix "" -description {
 } -files {
-  simple.test log1.test log2.test log3.test csr1.test
+  simple.test 
+  log1.test log2.test log3.test 
+  csr1.test
+  ckpt1.test
 
   aggerror.test
   attach.test
   autoindex1.test
   badutf.test

Index: test/test_lsm.c
==================================================================
--- test/test_lsm.c
+++ test/test_lsm.c
@@ -35,10 +35,11 @@
     { "log-size",       LSM_CONFIG_LOG_SIZE }, 
     { "safety",         LSM_CONFIG_SAFETY }, 
     { "write-buffer",   LSM_CONFIG_WRITE_BUFFER }, 
     { "mmap",           LSM_CONFIG_MMAP }, 
     { "page-size",      LSM_CONFIG_PAGE_SIZE }, 
+    { "autowork",       LSM_CONFIG_AUTOWORK }, 
     { 0, 0 }
   };
 
   const char *zDb;                /* objv[1] as a string */
   const char *zName;              /* objv[2] as a string */

Index: test/tester.tcl
==================================================================
--- test/tester.tcl
+++ test/tester.tcl
@@ -19,10 +19,11 @@
 #
 # Commands to manipulate the db and the file-system at a high level:
 #
 #      copy_file              FROM TO
 #      delete_file            FILENAME
+#      db_delete              DBNAME
 #      drop_all_tables        ?DB?
 #      forcecopy              FROM TO
 #      forcedelete            FILENAME
 #
 # Test the capability of the SQLite version built into the interpreter to
@@ -357,17 +358,26 @@
   #
   if {$cmdlinearg(binarylog)} {
     vfslog new binarylog {} vfslog.bin
   }
 }
+
+# Delete all files associated with LSM database $file. That is:
+#
+#     ${file}
+#     ${file}-log
+#     ${file}-shm
+#
+proc db_delete {file} {
+  forcedelete $file $file-shm $file-log
+}
 
 # Create a test database
 #
 proc reset_db {} {
   catch {db close}
-  forcedelete test.db
-  forcedelete test.db-log
+  db_delete test.db
   sqlite4 db ./test.db
   set ::DB [sqlite4_connection_pointer db]
   if {[info exists ::SETUP_SQL]} {
     db eval $::SETUP_SQL
   }
@@ -1034,14 +1044,12 @@
     # SQL (in that order) to prepare for the test case.
     do_test $testname.$n.1 {
       set ::sqlite_io_error_pending 0
       catch {db close}
       catch {db2 close}
-      catch {forcedelete test.db}
-      catch {forcedelete test.db-journal}
-      catch {forcedelete test2.db}
-      catch {forcedelete test2.db-journal}
+      catch {db_delete test.db}
+      catch {db_delete test2.db}
       set ::DB [sqlite4 db test.db; sqlite4_connection_pointer db]
       sqlite4_extended_result_codes $::DB $::ioerropts(-erc)
       if {[info exists ::ioerropts(-tclprep)]} {
         eval $::ioerropts(-tclprep)
       }
@@ -1466,11 +1474,11 @@
   hexio_write test.db 92 $B
   return ""
 }
 
 proc db_save {} {
-  foreach f [glob -nocomplain sv_test.db*] { forcedelete $f }
+  db_delete sv_test.db
   foreach f [glob -nocomplain test.db*] {
     set f2 "sv_$f"
     forcecopy $f $f2
   }
 }
@@ -1478,11 +1486,11 @@
   db_save
   catch { db close }
   return ""
 }
 proc db_restore {} {
-  foreach f [glob -nocomplain test.db*] { forcedelete $f }
+  db_delete test.db
   foreach f2 [glob -nocomplain sv_test.db*] {
     set f [string range $f2 3 end]
     forcecopy $f2 $f
   }
 }
@@ -1491,11 +1499,11 @@
   db_restore
   sqlite4 db $dbfile
 }
 proc db_delete_and_reopen {{file test.db}} {
   catch { db close }
-  foreach f [glob -nocomplain test.db*] { forcedelete $f }
+  db_delete $file
   sqlite4 db $file
 }
 
 # Do an SQL statement.  Append the search count to the end of the result.
 #

Index: tool/lsmview.tcl
==================================================================
--- tool/lsmview.tcl
+++ tool/lsmview.tcl
@@ -142,11 +142,14 @@
   $C bind $tid <Leave> [list segment_info $C {}]
 }
 
 proc segment_info {C segment} {
   set w $C
-  while {[winfo class $w]!="Frame"} {set w [winfo parent $w]}
+  while {[winfo class $w]!="Frame"} {
+    set w [winfo parent $w]
+    if {$w==""} return
+  }
   set w $w.info
   if {$segment==""} {
     $w config -text ""
   } else {
     foreach {iFirst iLast iRoot nSize} $segment break

ADDED   www/shm.wiki
Index: www/shm.wiki
==================================================================
--- /dev/null
+++ www/shm.wiki
@@ -0,0 +1,329 @@
+
+<title>Multi-process LSM Notes</title>
+<nowiki>
+
+<p>
+Notes on the changes required for LSM to allow connections from 
+multiple processes. In other words, notes to do with the contents
+of the *-shm file and the way they are accessed and manipulated.
+
+
+<h2>Contents of shared memory</h2>
+
+<p>
+Like SQLite 3 WAL mode, LSM uses a *-shm file. It uses the same
+"dead man switch" mechanism to ensure it is always initialized to 
+zero when the first client connects.
+
+<p>
+The *-shm file contains:
+
+<ol>
+  <li> A flag indicating whether or not the *-shm has been initialized
+       (log file recovered into in-memory tree, header fields loaded etc.)
+  <li> The meta-page number to which a checkpoint was last successfully
+       written.
+  <li> The client snapshot.
+  <li> The worker snapshot.
+  <li> The in-memory tree. This takes up most of the space in the file.
+</ol>
+
+<p>
+The client and worker snapshots are in the same format as those stored
+in the header of the database file itself.
+
+<p>
+Sometimes data from the meta-page identified by the header field is
+required. For example it is necessary to know the id of the last
+checkpointed snapshot in order to determine which free blocks are safe
+to reuse. The associated log file offset is also required to determine
+when the log file may be wrapped. These quantities are read directly
+from the meta-page in the database itself as required.
+
+<h2>File locks</h2>
+
+<p>
+LSM uses the same ideas as SQLite in WAL mode. Both SHARED and EXCLUSIVE 
+locks are required. There are three exclusive locks:
+
+<ul>
+  <li> WRITER: Required to write to in-memory tree and its log file.
+  <li> WORKER: Required to write to body of database file.
+  <li> CHECKPOINTER: Required to write to database file header.
+</ul>
+
+<p>
+Only one client may hold each of these locks at one time. In other words,
+each of the above is implemented as an exclusive lock on a range of bytes in the file.
+
+<p>
+There are also N separate locks held by readers. These locks also 
+work like WAL locks in that they are a combination of a lock and a
+value. In WAL mode the value is a 32-bit integer. For LSM, it will
+be two 64-bit integers - an in-memory tree id and a snapshot id.
+
+<h2>Memory allocation</h2>
+
+<p>
+Within the *-shm file, memory is allocated in 32KB chunks.
+
+<p>
+The first chunk of the file is the header chunk. It contains:
+
+<ol>
+  <li> The client snapshot (4KB)
+  <li> The worker snapshot (4KB)
+  <li> The "initialized" flag (4 bytes)
+  <li> The meta-page number containing the last checkpoint written (4
+       bytes)
+  <li> The in-memory tree headers (see below).
+</ol>
+
+<p>
+The second and subsequent chunks are used to store the in-memory tree
+data.
+
+<p>
+The in-memory tree structure is essentially an append-only rb-tree
+with some modifications to reduce the amount of data written.
+Multiple trees will sometimes be present in the file, to cope with
+circumstances like the following:
+
+<ul>
+  <li> Writer builds tree A.
+  <li> Reader takes a read lock on tree A.
+  <li> Tree A is flushed to the db.
+  <li> Writer begins building tree B.
+  <li> Reader continues reading from tree A.
+</ul>
+
+<p>
+In this case, the chunks used by tree A may not be reused until after
+the active read transaction has concluded.
+
+<p>
+Each chunk begins with three 32-bit integer fields:
+<ul>
+  <li> Id of first tree for which data is stored on the chunk,
+  <li> Id of last tree for which data is stored on the chunk,
+  <li> Chunk number of chunk written after this one (or zero, if this
+       is the most recently written chunk).
+</ul>
+
+<p>
+The third field described above links all tree chunks in the file,
+in-use or otherwise, into a single list. To allocate a new chunk,
+a writer first checks if the chunk at the head of the list can be
+recycled. If so, it moves it to the end of the list and begins
+writing to it. Otherwise, it allocates a new chunk at the end of
+the file, appends that to the list and continues writing.
+
+<p><b>Crash recovery: But, what happens if a writer crashes while
+writing a transaction to the database?</b>
+
+<p>If a writer crashes during a write transaction, readers can 
+often continue as normal. However, the next writer must roll 
+back any changes made to the db before it can commence a new
+transaction. Or, if a writer fails when updating the in-memory 
+tree header, it may not be possible for readers to continue. 
+This is resolved by having one reader become a writer, restore 
+the db, then "commit" the empty transaction.
+
+<p>
+The pattern used by a writer is:
+<ol>
+  <li> Obtain WRITER lock. This is a barrier operation (on Linux, an
+  fcntl(F_SETLK)).  
+  <li> Update shared memory region.
+  <li> Release WRITER lock. Another barrier (on Linux, another F_SETLK).
+</ol>
+
+<p> Or, if a failure occurs during step 2, the unlock operation is done
+automatically by the OS. Either way, assume that the unlock is also a
+barrier (see Documentation/memory-barriers.txt in kernel source tree). It
+can therefore be assumed that from the point of view of the subsequent
+writer, all writes to the shared memory region completed by the failed
+writer appear to have been performed in order - there is no need to
+worry that the hardware has reordered the writes made by the failed
+writer. The compiler may reorder them, of course, but this should be
+easy enough to avoid.
+
+<p>
+Also assumed is that 32-bit writes are atomic, in the sense that it
+is not possible for a failure in a writer process to result in some
+bits of a 32-bit word being updated and some remaining in their 
+original state.
+
+<p>
+Crashes are then managed by the following:
+
+<ul>
+  <li>When a write transaction is opened, a flag is set in the in-memory
+  tree header. This indicates that a transaction is underway. The same
+  flag is cleared right before the WRITER lock is released to commit or
+  roll back the transaction. 
+
+  <li>When a recyclable chunk is moved from the start of the linked list
+  to the end, the first thing done is that the "first tree" field is
+  updated. Then the "last tree". Then the header pointer is set to point
+  to the next element in the list.
+
+  <li>If the header flag is already set when the writer grabs the WRITER
+  lock, then a crash must have occurred. In this case the free-list must
+  be recovered.
+
+  <li>Recovering the free list involves two steps: First a linear scan
+  of the current tree to identify those chunks in use (and also for
+  another reason, see below). Second, a scan of the remainder of the
+  file checking the "first tree" field of all chunks that either belong
+  to an earlier tree or appear to belong to the current tree but are not
+  linked in anywhere. Based on this, the new writer can rebuild the
+  free-list.
+
+</ul>
+
+
+<h2>In-memory tree format</h2>
+
+<p>
+Header fields:
+
+<ul>
+  <li> 32-bits: Tree id (incremented for each new tree).
+  <li> 32-bits: Transaction id (incremented for each new transaction).
+  <li> 32-bits: Pointer to head of tree (an offset within the *-shm
+       file).
+  <li> 32-bits: Height of tree.
+  <li> 64-bits: Last checkpoint id for which log file space has already
+                been reclaimed.
+  <li> DbLog structure (see lsmInt.h).
+  <li> 32-bits: Header checksum 1.
+  <li> 32-bits: Header checksum 2.
+</ul>
+
+<p>
+There are two copies of the in-memory tree header, copy 1 and copy 2.
+Both are stored on the *-shm header chunk.
+
+<p>
+To commit a transaction, a writer does the following:
+
+<ol>
+  <li> Updates copy 2 of the header,
+  <li> Invokes a memory barrier,
+  <li> Updates copy 1 of the header,
+  <li> Clears the "transaction in progress" flag,
+  <li> Drops the WRITER lock.
+</ol>
+
+<p>
+To open a read transaction, the reader:
+
+<ol>
+  <li> Reads copy 1 of the header.
+
+  <li> If the checksum fails, attempt to obtain the WRITER lock. If
+       successful, do the equivalent of opening and committing an
+       empty transaction (see below). Either way, return to 1 and
+       attempt to reread the in-memory tree header. If copy 1 cannot be
+       read within some reasonable amount of time...?
+
+  <li> Read the client snapshot from shared memory. If the checksum
+       fails, attempt to obtain the WORKER lock. If successful, copy
+       the worker snapshot over the client snapshot and drop the WORKER
+       lock. Successful or otherwise, attempt to reread the snapshot.
+       If this cannot be completed within some reasonable amount of
+       time...?
+
+  <li> Grab a read-lock corresponding to the tree id and snapshot ids
+       just read (note: assume that this is a memory barrier).
+
+  <li> Check that the shared memory tree header and client snapshot
+       still contain the ids for which the lock was obtained. If not, 
+       drop the lock and go back to step 1.
+</ol>
+
+<p>To open a write transaction, the writer:
+
+<ol>
+  <li> Opens a read transaction, if one is not already open.
+
+  <li> Obtain the WRITER lock.
+
+  <li> Check the "transaction in progress" flag. If it is set,
+       perform the emergency rollback and freelist recovery, then
+       clear the flag.
+
+  <li> Check that copy 1 of the header still matches the copy read
+       when the read transaction was opened. If not, drop the lock
+       and return LSM_BUSY.
+
+  <li> Set the "transaction in progress" flag.
+</ol>
+
+<p>
+Emergency rollback and recovery:
+<ol>
+  <li> If the checksum of copy 1 of the header fails, replace it with
+       the contents of copy 2.
+
+  <li> Iterate through the entire tree, rolling back any nodes with
+       transaction ids that indicate they require it. Record the blocks
+       occupied by the current tree.
+
+  <li> Scan through the entire *-shm memory file, inspecting the "first
+       tree" fields of each chunk.
+</ol>
+
+<p>
+    Large values or keys may overflow chunks.
+
+<h2>Client and worker snapshots</h2>
+
+<p>
+The client and worker snapshots stored in the *-shm file use the
+same format as the checkpoint written to the database file. Except,
+they are always in native byte order. Each is stored in a dedicated
+4KB slot, as in the database file. A client must hold the WORKER
+lock to modify either of the two snapshots.
+
+<p>
+To work on the database file, a worker performs the following:
+<ol>
+  <li> Obtain the WORKER lock.
+
+  <li> Copies the worker snapshot from the shared-memory region into
+       heap memory and verifies that the checksum computes.
+
+  <li> If the checksum of the worker snapshot does not compute, copy
+       the client snapshot over the top of the worker and reload it.
+       If the checksum still does not compute, return LSM_CORRUPT.
+
+  <li> Perform some merging work on the database. Generate a new
+       worker snapshot. Write it over the top of the old.
+
+  <li> Optionally, copy the new worker snapshot over the top of the
+       client snapshot. TODO: Copying the worker snapshot into the
+       client slot makes the worker read-only.... Currently, LSM
+       distinguishes between read-only and read-write worker snapshots.
+       But that would mean an extra flag in shared-memory. Perhaps it's
+       better to consider all worker snapshots to be read-only. Or,
+       change the format slightly to include a "read-write" flag that
+       can be set for those snapshots not copied into the client slot. 
+       UPDATE: Current code already treats all worker snapshots as read-only.
+
+  <li> Release the WORKER lock.
+</ol>
+
+<p>
+To checkpoint a snapshot:
+<ol>
+    <li> Obtain the CHECKPOINTER lock.
+    <li> Read the client snapshot.
+    <li> Sync the database file.
+    <li> Write the client snapshot into the appropriate meta-page (based
+         on the "last checkpoint slot" field in the *-shm header).
+    <li> Sync the database file.
+    <li> Update the "last checkpoint slot" field.
+    <li> Drop the CHECKPOINTER lock.
+</ol>