Index: lsm-test/lsmtest1.c ================================================================== --- lsm-test/lsmtest1.c +++ lsm-test/lsmtest1.c @@ -58,13 +58,26 @@ static int testControlDb(TestDb **ppDb){ #ifdef HAVE_KYOTOCABINET return tdb_open("kyotocabinet", "tmp.db", 1, ppDb); #else - return tdb_open("sqlite3", "tmp.db", 1, ppDb); + return tdb_open("sqlite3", ":memory:", 1, ppDb); #endif } + +void testDatasourceFetch( + TestDb *pDb, /* Database handle */ + Datasource *pData, + int iKey, + int *pRc /* IN/OUT: Error code */ +){ + void *pKey; int nKey; /* Database key to query for */ + void *pVal; int nVal; /* Expected result of query */ + + testDatasourceEntry(pData, iKey, &pKey, &nKey, &pVal, &nVal); + testFetch(pDb, pKey, nKey, pVal, nVal, pRc); +} /* ** This function is called to test that the contents of database pDb ** are as expected. In this case, expected is defined as containing ** key-value pairs iFirst through iLast, inclusive, from data source Index: lsm-test/lsmtest5.c ================================================================== --- lsm-test/lsmtest5.c +++ lsm-test/lsmtest5.c @@ -524,11 +524,13 @@ /* Open a new database connection. Initialize the pseudo-random number ** argument based on the thread number. */ iPrng = testPrngValue(iThread); pDb = testOpen(p->zSystem, 0, &rc); - tdb_lsm_config_work_hook(pDb, xMt1Work, 0); + if( rc==0 ){ + tdb_lsm_config_work_hook(pDb, xMt1Work, 0); + } /* Loop until either an error occurs or some other thread sets the ** halt flag. 
*/ while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){ int iKey; Index: lsm-test/lsmtest_main.c ================================================================== --- lsm-test/lsmtest_main.c +++ lsm-test/lsmtest_main.c @@ -171,11 +171,11 @@ res = nKey1 - nKey2; } return res; } -static int test_scan_debug = 0; +int test_scan_debug = 0; static void scanCompareCb( void *pCtx, void *pKey, int nKey, void *pVal, int nVal @@ -183,11 +183,14 @@ ScanResult *p = (ScanResult *)pCtx; u8 *aKey = (u8 *)pKey; u8 *aVal = (u8 *)pVal; int i; - if( test_scan_debug ) printf("%.20s\n", (char *)pKey); + if( test_scan_debug ) printf("%.*s\n", nKey, (char *)pKey); +#if 0 + if( test_scan_debug ) printf("%.20s\n", (char *)pVal); +#endif #if 0 /* Check tdb_fetch() matches */ int rc = 0; testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc); @@ -458,11 +461,11 @@ static lsm_db *configure_lsm_db(TestDb *pDb){ lsm_db *pLsm; pLsm = tdb_lsm(pDb); if( pLsm ){ - tdb_lsm_config_str(pDb, "mmap=0 autowork=1 nmerge=4 worker_nmerge=4"); + tdb_lsm_config_str(pDb, "mmap=1 autowork=1 nmerge=4 worker_nmerge=4"); } return pLsm; } Index: lsm-test/lsmtest_tdb3.c ================================================================== --- lsm-test/lsmtest_tdb3.c +++ lsm-test/lsmtest_tdb3.c @@ -310,10 +310,31 @@ static int testEnvUnlink(lsm_env *pEnv, const char *zFile){ lsm_env *pRealEnv = tdb_lsm_env(); unused_parameter(pEnv); return pRealEnv->xUnlink(pRealEnv, zFile); } + +static int testEnvLock(lsm_file *pFile, int iLock, int eType){ + LsmFile *p = (LsmFile *)pFile; + lsm_env *pRealEnv = tdb_lsm_env(); + return pRealEnv->xLock(p->pReal, iLock, eType); +} + +static int testEnvShmMap(lsm_file *pFile, int iRegion, int sz, void **pp){ + LsmFile *p = (LsmFile *)pFile; + lsm_env *pRealEnv = tdb_lsm_env(); + return pRealEnv->xShmMap(p->pReal, iRegion, sz, pp); +} + +static void testEnvShmBarrier(void){ +} + +static int testEnvShmUnmap(lsm_file *pFile, int bDel){ + LsmFile *p = (LsmFile *)pFile; + lsm_env *pRealEnv = 
tdb_lsm_env(); + return pRealEnv->xShmUnmap(p->pReal, bDel); +} static void doSystemCrash(LsmDb *pDb){ lsm_env *pEnv = tdb_lsm_env(); int iFile; int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector; @@ -574,10 +595,12 @@ { "autowork", 0, LSM_CONFIG_AUTOWORK }, { "log_size", 0, LSM_CONFIG_LOG_SIZE }, { "mmap", 0, LSM_CONFIG_MMAP }, { "use_log", 0, LSM_CONFIG_USE_LOG }, { "nmerge", 0, LSM_CONFIG_NMERGE }, + { "max_freelist", 0, LSM_CONFIG_MAX_FREELIST }, + { "multi_proc", 0, LSM_CONFIG_MULTIPLE_PROCESSES }, { "worker_nmerge", 1, LSM_CONFIG_NMERGE }, { 0, 0 } }; const char *z = zStr; @@ -693,10 +716,14 @@ pDb->env.xSectorSize = testEnvSectorSize; pDb->env.xRemap = testEnvRemap; pDb->env.xFileid = testEnvFileid; pDb->env.xClose = testEnvClose; pDb->env.xUnlink = testEnvUnlink; + pDb->env.xLock = testEnvLock; + pDb->env.xShmBarrier = testEnvShmBarrier; + pDb->env.xShmMap = testEnvShmMap; + pDb->env.xShmUnmap = testEnvShmUnmap; rc = lsm_new(&pDb->env, &pDb->db); if( rc==LSM_OK ){ lsm_config_log(pDb->db, xLog, 0); lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb); @@ -728,11 +755,12 @@ int test_lsm_lomem_open( const char *zFilename, int bClear, TestDb **ppDb ){ - const char *zCfg = "page_size=256 block_size=65536 write_buffer=16384"; + const char *zCfg = + "page_size=256 block_size=65536 write_buffer=16384 max_freelist=4"; return testLsmOpen(zCfg, zFilename, bClear, ppDb); } lsm_db *tdb_lsm(TestDb *pDb){ if( pDb->pMethods->xClose==test_lsm_close ){ Index: src/build.c ================================================================== --- src/build.c +++ src/build.c @@ -1396,27 +1396,30 @@ pIndex->pTable = pTab; pIndex->nColumn = nCol; pIndex->onError = (u8)onError; pIndex->pSchema = pTab->pSchema; - if( db->init.busy ){ - Hash *pIdxHash = &pIndex->pSchema->idxHash; - Index *p; - - p = sqlite4HashInsert(pIdxHash, pIndex->zName, nName, pIndex); - if( p ){ - assert( p==pIndex ); - db->mallocFailed = 1; - sqlite4DbFree(db, pIndex); - pIndex = 0; - } - } } *pzExtra = 
zExtra; return pIndex; } + +static int addIndexToHash(sqlite4 *db, Index *pIdx){ + if( db->init.busy ){ + Hash *pIdxHash = &pIdx->pSchema->idxHash; + int nName = sqlite4Strlen30(pIdx->zName); + Index *p; + p = sqlite4HashInsert(pIdxHash, pIdx->zName, nName, pIdx); + if( p ){ + assert( p==pIdx ); + db->mallocFailed = 1; + return SQLITE4_NOMEM; + } + } + return SQLITE4_OK; +} /* ** Allocate and populate an Index structure representing an implicit ** primary key. In implicit primary key behaves similarly to the built-in @@ -1425,19 +1428,22 @@ static void addImplicitPrimaryKey( Parse *pParse, /* Parse context */ Table *pTab, /* Table to add implicit PRIMARY KEY to */ int iDb ){ + sqlite4 *db = pParse->db; Index *pIndex; /* New index */ char *zExtra; assert( !pTab->pIndex || pTab->pIndex->eIndexType!=SQLITE4_INDEX_PRIMARYKEY ); assert( sqlite4Strlen30("binary")==6 ); pIndex = newIndex(pParse, pTab, pTab->zName, 1, OE_Abort, 1+6, &zExtra); + if( addIndexToHash(db, pIndex) ){ + sqlite4DbFree(db, pIndex); + pIndex = 0; + } if( pIndex ){ - sqlite4 *db = pParse->db; - pIndex->aiColumn[0] = -1; pIndex->azColl[0] = zExtra; memcpy(zExtra, "binary", 7); pIndex->eIndexType = SQLITE4_INDEX_PRIMARYKEY; pIndex->pNext = pTab->pIndex; @@ -2665,10 +2671,11 @@ if( db->init.busy ){ db->flags |= SQLITE4_InternChanges; if( pTblName!=0 || bPrimaryKey ){ pIndex->tnum = db->init.newTnum; } + if( addIndexToHash(db, pIndex) ) goto exit_create_index; } /* If the db->init.busy is 0 then create the index on disk. This ** involves writing the index into the master table and filling in the ** index with the current table contents. 
Index: src/kvlsm.c ================================================================== --- src/kvlsm.c +++ src/kvlsm.c @@ -440,16 +440,31 @@ pNew = (KVLsm *)sqlite4_malloc(pEnv, sizeof(KVLsm)); if( pNew==0 ){ rc = SQLITE4_NOMEM; }else{ + struct Config { + const char *zParam; + int eParam; + } aConfig[] = { + { "lsm_block_size", LSM_CONFIG_BLOCK_SIZE } + }; + memset(pNew, 0, sizeof(KVLsm)); pNew->base.pStoreVfunc = &kvlsmMethods; pNew->base.pEnv = pEnv; - rc = lsm_new(0, &pNew->pDb); if( rc==SQLITE4_OK ){ + int i; + for(i=0; ipDb, aConfig[i].eParam, &nVal); + } + } + rc = lsm_open(pNew->pDb, zName); } if( rc!=SQLITE4_OK ){ lsm_close(pNew->pDb); Index: src/lsm.h ================================================================== --- src/lsm.h +++ src/lsm.h @@ -32,10 +32,15 @@ typedef long long int lsm_i64; /* 64-bit signed integer type */ /* Forward reference */ typedef struct lsm_env lsm_env; /* Runtime environment */ +/* Candidate values for the 3rd argument to lsm_env.xLock() */ +#define LSM_LOCK_UNLOCK 0 +#define LSM_LOCK_SHARED 1 +#define LSM_LOCK_EXCL 2 + /* ** Run-time environment used by LSM */ struct lsm_env { int nByte; /* Size of this structure in bytes */ @@ -51,18 +56,20 @@ int (*xSectorSize)(lsm_file *); int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*); int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf); int (*xClose)(lsm_file *); int (*xUnlink)(lsm_env*, const char *); + int (*xLock)(lsm_file*, int, int); + int (*xShmMap)(lsm_file*, int, int, void **); + void (*xShmBarrier)(void); + int (*xShmUnmap)(lsm_file*, int); /****** memory allocation ****************************************/ void *pMemCtx; void *(*xMalloc)(lsm_env*, int); /* malloc(3) function */ void *(*xRealloc)(lsm_env*, void *, int); /* realloc(3) function */ void (*xFree)(lsm_env*, void *); /* free(3) function */ -#if 1 sqlite4_size_t (*xSize)(lsm_env*, void *); /* xSize function */ -#endif /****** mutexes ****************************************************/ void *pMutexCtx; int 
(*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex */ int (*xMutexNew)(lsm_env*, lsm_mutex**); /* Get a new dynamic mutex */ void (*xMutexDel)(lsm_mutex *); /* Delete an allocated mutex */ @@ -165,20 +172,33 @@ ** file normally. False otherwise. ** ** LSM_CONFIG_NMERGE ** A read/write integer parameter. The minimum number of segments to ** merge together at a time. Default value 4. +** +** LSM_CONFIG_MAX_FREELIST +** A read/write integer parameter. The maximum number of free-list +** entries that are stored in a database checkpoint (the others are +** stored elsewhere in the database). +** +** There is no reason for an application to configure or query this +** parameter. It is only present because configuring a small value +** makes certain parts of the lsm code easier to test. +** +** LSM_CONFIG_MULTIPLE_PROCESSES */ -#define LSM_CONFIG_WRITE_BUFFER 1 -#define LSM_CONFIG_PAGE_SIZE 2 -#define LSM_CONFIG_SAFETY 3 -#define LSM_CONFIG_BLOCK_SIZE 4 -#define LSM_CONFIG_AUTOWORK 5 -#define LSM_CONFIG_LOG_SIZE 6 -#define LSM_CONFIG_MMAP 7 -#define LSM_CONFIG_USE_LOG 8 -#define LSM_CONFIG_NMERGE 9 +#define LSM_CONFIG_WRITE_BUFFER 1 +#define LSM_CONFIG_PAGE_SIZE 2 +#define LSM_CONFIG_SAFETY 3 +#define LSM_CONFIG_BLOCK_SIZE 4 +#define LSM_CONFIG_AUTOWORK 5 +#define LSM_CONFIG_LOG_SIZE 6 +#define LSM_CONFIG_MMAP 7 +#define LSM_CONFIG_USE_LOG 8 +#define LSM_CONFIG_NMERGE 9 +#define LSM_CONFIG_MAX_FREELIST 10 +#define LSM_CONFIG_MULTIPLE_PROCESSES 11 #define LSM_SAFETY_OFF 0 #define LSM_SAFETY_NORMAL 1 #define LSM_SAFETY_FULL 2 Index: src/lsmInt.h ================================================================== --- src/lsmInt.h +++ src/lsmInt.h @@ -43,11 +43,10 @@ ** overridden by calls to lsm_config(). 
*/ #define LSM_PAGE_SIZE 4096 #define LSM_BLOCK_SIZE (2 * 1024 * 1024) #define LSM_TREE_BYTES (2 * 1024 * 1024) -#define LSM_ECOLA 4 #define LSM_DEFAULT_LOG_SIZE (128*1024) #define LSM_DEFAULT_NMERGE 4 /* Places where a NULL needs to be changed to a real lsm_env pointer @@ -56,17 +55,27 @@ /* Initial values for log file checksums. These are only used if the ** database file does not contain a valid checkpoint. */ #define LSM_CKSUM0_INIT 42 #define LSM_CKSUM1_INIT 42 + +#define LSM_META_PAGE_SIZE 4096 /* "mmap" mode is currently only used in environments with 64-bit address ** spaces. The following macro is used to test for this. */ #define LSM_IS_64_BIT (sizeof(void*)==8) #define LSM_AUTOWORK_QUANT 32 +/* Minimum number of free-list entries to store in the checkpoint, assuming +** the free-list contains this many entries. i.e. if overflow is required, +** the first LSM_CKPT_MIN_FREELIST entries are stored in the checkpoint and +** the remainder in an LSM system entry. */ +#define LSM_CKPT_MIN_FREELIST 6 +#define LSM_CKPT_MAX_REFREE 2 +#define LSM_CKPT_MIN_NONLSM (LSM_CKPT_MIN_FREELIST - LSM_CKPT_MAX_REFREE) + typedef struct Database Database; typedef struct DbLog DbLog; typedef struct FileSystem FileSystem; typedef struct Level Level; typedef struct LogMark LogMark; @@ -86,10 +95,15 @@ typedef struct TreeVersion TreeVersion; typedef struct TreeCursor TreeCursor; typedef struct Merge Merge; typedef struct MergeInput MergeInput; +typedef struct TreeHeader TreeHeader; +typedef struct ShmHeader ShmHeader; +typedef struct ShmChunk ShmChunk; +typedef struct ShmReader ShmReader; + typedef unsigned char u8; typedef unsigned short int u16; typedef unsigned int u32; typedef lsm_i64 i64; typedef unsigned long long int u64; @@ -109,10 +123,35 @@ #define LSM_MISUSE_BKPT lsmErrorBkpt(LSM_MISUSE) #define unused_parameter(x) (void)(x) #define array_size(x) (sizeof(x)/sizeof(x[0])) + +/* The size of each shared-memory chunk */ +#define LSM_SHM_CHUNK_SIZE (32*1024) + +/* The number 
of bytes reserved at the start of each shm chunk for MM. */ +#define LSM_SHM_CHUNK_HDR (3 * 4) + +/* The number of available read locks. */ +#define LSM_LOCK_NREADER 6 + +/* Lock definitions */ +#define LSM_LOCK_DMS1 1 +#define LSM_LOCK_DMS2 2 +#define LSM_LOCK_WRITER 3 +#define LSM_LOCK_WORKER 4 +#define LSM_LOCK_CHECKPOINTER 5 +#define LSM_LOCK_READER(i) ((i) + LSM_LOCK_CHECKPOINTER + 1) + +/* +** Hard limit on the number of free-list entries that may be stored in +** a checkpoint (the remainder are stored as a system record in the LSM). +** See also LSM_CONFIG_MAX_FREELIST. +*/ +#define LSM_MAX_FREELIST_ENTRIES 100 + /* ** A string that can grow by appending. */ struct LsmString { lsm_env *pEnv; /* Run-time environment */ @@ -119,22 +158,45 @@ int n; /* Size of string. -1 indicates error */ int nAlloc; /* Space allocated for z[] */ char *z; /* The string content */ }; +typedef struct LsmFile LsmFile; +struct LsmFile { + lsm_file *pFile; + LsmFile *pNext; +}; + +/* +** An instance of the following type is used to store an ordered list of +** u32 values. +** +** Note: This is a place-holder implementation. It should be replaced by +** a version that avoids making a single large allocation when the array +** contains a large number of values. For this reason, the internals of +** this object should only be manipulated by the intArrayXXX() functions in +** lsm_tree.c. +*/ +typedef struct IntArray IntArray; +struct IntArray { + int nAlloc; + int nArray; + u32 *aArray; +}; + /* ** An instance of this structure represents a point in the history of the -** tree structure to roll back to. Refer to comments in tree.c for details. -** -** Pointers pRollback and pRoot both point to structures of type TreeNode. +** tree structure to roll back to. Refer to comments in lsm_tree.c for +** details.
*/ struct TreeMark { - void *pMpChunk; /* Mempool chunk to roll back to */ - int iMpOff; /* Mempool chunk offset to roll back to */ - void *pRollback; /* Zero v2 information starting here */ - void *pRoot; /* Root node to restore */ - int nHeight; /* Height of tree at pRoot */ + u32 iRoot; /* Offset of root node in shm file */ + u32 nHeight; /* Current height of tree structure */ + u32 iWrite; /* Write offset in shm file */ + u32 nChunk; /* Number of chunks in shared-memory file */ + u32 iFirst; /* First chunk in linked list */ + int iRollback; /* Index in lsm->rollback to revert to */ }; /* ** An instance of this structure represents a point in the database log. */ @@ -165,40 +227,77 @@ u32 cksum0; /* Checksum 0 at offset iOff */ u32 cksum1; /* Checksum 1 at offset iOff */ LogRegion aRegion[3]; /* Log file regions (see docs in lsm_log.c) */ }; +/* +** Tree header structure. +*/ +struct TreeHeader { + u32 iTreeId; /* Current tree id */ + u32 iTransId; /* Current transaction id */ + u32 iRoot; /* Offset of root node in shm file */ + u32 nHeight; /* Current height of tree structure */ + u32 iWrite; /* Write offset in shm file */ + u32 nChunk; /* Number of chunks in shared-memory file */ + u32 iFirst; /* First chunk in linked list */ + u32 nByte; /* Size of current tree structure in bytes */ + DbLog log; /* Current layout of log file */ + i64 iCkpt; /* Id of ckpt log space is reclaimed for */ + u32 aCksum[2]; /* Checksums 1 and 2. */ +}; + /* ** Database handle structure. +** +** mLock: +** A bitmask representing the locks currently held by the connection. +** An LSM database supports N distinct locks, where N is some number less +** than or equal to 16. Locks are numbered starting from 1 (see the +** definitions for LSM_LOCK_WRITER and co.). +** +** The least significant 16-bits in mLock represent EXCLUSIVE locks. The +** most significant are SHARED locks. 
So, if a connection holds a SHARED +** lock on lock region iLock, then the following is true: +** +** (mLock & (1 << (iLock+16-1))) +** +** Or for an EXCLUSIVE lock: +** +** (mLock & (1 << (iLock-1))) */ struct lsm_db { /* Database handle configuration */ lsm_env *pEnv; /* runtime environment */ int (*xCmp)(void *, int, void *, int); /* Compare function */ - int nTreeLimit; /* Maximum size of in-memory tree in bytes */ - int bAutowork; /* True to do auto-work after writing */ + + /* Values configured by calls to lsm_config */ int eSafety; /* LSM_SAFETY_OFF, NORMAL or FULL */ - + int bAutowork; /* Configured by LSM_CONFIG_AUTOWORK */ + int nTreeLimit; /* Configured by LSM_CONFIG_WRITE_BUFFER */ int nMerge; /* Configured by LSM_CONFIG_NMERGE */ int nLogSz; /* Configured by LSM_CONFIG_LOG_SIZE */ int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */ int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */ int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */ + int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */ + int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */ /* Sub-system handles */ FileSystem *pFS; /* On-disk portion of database */ Database *pDatabase; /* Database shared data */ /* Client transaction context */ - TreeVersion *pTV; /* In-memory tree snapshot (non-NULL in rt) */ Snapshot *pClient; /* Client snapshot (non-NULL in read trans) */ + int iReader; /* Read lock held (-1 == unlocked) */ MultiCursor *pCsr; /* List of all open cursors */ - LogWriter *pLogWriter; + LogWriter *pLogWriter; /* Context for writing to the log file */ int nTransOpen; /* Number of opened write transactions */ int nTransAlloc; /* Allocated size of aTrans[] array */ TransMark *aTrans; /* Array of marks for transaction rollback */ + IntArray rollback; /* List of tree-nodes to roll back */ /* Worker context */ Snapshot *pWorker; /* Worker snapshot (or NULL) */ /* Debugging message callback */ @@ -206,10 +305,19 @@ void *pLogCtx; /* Work done notification callback */
void (*xWork)(lsm_db *, void *); void *pWorkCtx; + + u32 mLock; /* Mask of current locks. See lsmShmLock(). */ + lsm_db *pNext; /* Next connection to same database */ + + int nShm; /* Size of apShm[] array */ + void **apShm; /* Shared memory chunks */ + ShmHeader *pShmhdr; /* Live shared-memory header */ + TreeHeader treehdr; /* Local copy of tree-header */ + u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)]; }; struct Segment { int iFirst; /* First page of this run */ int iLast; /* Last page of this run */ @@ -225,11 +333,11 @@ struct Level { Segment lhs; /* Left-hand (main) segment */ int iAge; /* Number of times data has been written */ int nRight; /* Size of apRight[] array */ Segment *aRhs; /* Old segments being merged into this */ - int iSplitTopic; + int iSplitTopic; /* Split key topic (if nRight>0) */ void *pSplitKey; /* Pointer to split-key (if nRight>0) */ int nSplitKey; /* Number of bytes in split-key */ Merge *pMerge; /* Merge operation currently underway */ Level *pNext; /* Next level in tree */ }; @@ -268,38 +376,142 @@ ** array is valid. */ #define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0) /* -** Number of integers in the free-list delta. +** The values that accompany the lock held by a database reader. +*/ +struct ShmReader { + i64 iTreeId; + i64 iLsmId; +}; + +/* +** An instance of this structure is stored in the first shared-memory +** page. The shared-memory header. +** +** bWriter: +** Immediately after opening a write transaction taking the WRITER lock, +** each writer client sets this flag. It is cleared right before the +** WRITER lock is relinquished. If a subsequent writer finds that this +** flag is already set when a write transaction is opened, this indicates +** that a previous writer failed mid-transaction. +** +** iMetaPage: +** If the database file does not contain a valid, synced, checkpoint, this +** value is set to 0. 
Otherwise, it is set to the meta-page number that +** contains the most recently written checkpoint (either 1 or 2). +** +** hdr1, hdr2: +** The two copies of the in-memory tree header. Two copies are required +** in case a writer fails while updating one of them. +*/ +struct ShmHeader { + u32 aClient[LSM_META_PAGE_SIZE / 4]; + u32 aWorker[LSM_META_PAGE_SIZE / 4]; + u32 bWriter; + u32 iMetaPage; + TreeHeader hdr1; + TreeHeader hdr2; + ShmReader aReader[LSM_LOCK_NREADER]; +}; + +/* +** An instance of this structure is stored at the start of each shared-memory +** chunk except the first (which is the header chunk - see above). +*/ +struct ShmChunk { + u32 iFirstTree; + u32 iLastTree; + u32 iNext; +}; + +#define LSM_APPLIST_SZ 4 + +typedef struct Freelist Freelist; +typedef struct FreelistEntry FreelistEntry; + +/* +** An instance of the following structure stores the current database free +** block list. The free list is a list of blocks that are not currently +** used by the worker snapshot. Associated with each block in the list is the +** snapshot id of the most recent snapshot that did actually use the block. +*/ +struct Freelist { + FreelistEntry *aEntry; /* Free list entries */ + int nEntry; /* Number of valid slots in aEntry[] */ + int nAlloc; /* Allocated size of aEntry[] */ +}; +struct FreelistEntry { + u32 iBlk; /* Block number */ + i64 iId; /* Largest snapshot id to use this block */ +}; + +/* +** A snapshot of a database. A snapshot contains all the information required +** to read or write a database file on disk. See the description of struct +** Database below for further details.
*/ -#define LSM_FREELIST_DELTA_SIZE 3 +struct Snapshot { + Database *pDatabase; /* Database this snapshot belongs to */ + Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */ + i64 iId; /* Snapshot id */ -/* + /* Used by worker snapshots only */ + int nBlock; /* Number of blocks in database file */ + u32 aiAppend[LSM_APPLIST_SZ]; /* Append point list */ + Freelist freelist; /* Free block list */ + int nFreelistOvfl; /* Number of extra free-list entries in LSM */ +}; +#define LSM_INITIAL_SNAPSHOT_ID 11 + +/* ** Functions from file "lsm_ckpt.c". */ -int lsmCheckpointRead(lsm_db *, int *, int *); int lsmCheckpointWrite(lsm_db *); -int lsmCheckpointExport(lsm_db *, int, int, i64, int, void **, int *); -void lsmChecksumBytes(const u8 *, int, const u32 *, u32 *); -lsm_i64 lsmCheckpointLogOffset(void *pExport); int lsmCheckpointLevels(lsm_db *, int, void **, int *); int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal); -int lsmCheckpointOverflow(lsm_db *pDb, int *pnLsmLevel); + +int lsmCheckpointOverflow(lsm_db *pDb, void **, int *, int *); +int lsmCheckpointOverflowRequired(lsm_db *pDb); +int lsmCheckpointOverflowLoad(lsm_db *pDb, Freelist *); + +int lsmCheckpointRecover(lsm_db *); +int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **); + +int lsmCheckpointLoad(lsm_db *pDb); +int lsmCheckpointLoadWorker(lsm_db *pDb); +int lsmCheckpointStore(lsm_db *pDb, int); + +i64 lsmCheckpointId(u32 *, int); +i64 lsmCheckpointLogOffset(u32 *); +int lsmCheckpointPgsz(u32 *); +int lsmCheckpointBlksz(u32 *); +void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog); +void lsmCheckpointZeroLogoffset(lsm_db *); + +int lsmCheckpointSaveWorker(lsm_db *pDb, int, int); +int lsmDatabaseFull(lsm_db *pDb); +int lsmCheckpointSynced(lsm_db *pDb, i64 *piId); + /* ** Functions from file "lsm_tree.c". 
*/ int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree); void lsmTreeRelease(lsm_env *, Tree *); +void lsmTreeClear(lsm_db *); +void lsmTreeInit(lsm_db *); -int lsmTreeSize(TreeVersion *pTV); -int lsmTreeIsEmpty(Tree *pTree); +int lsmTreeSize(lsm_db *); +int lsmTreeEndTransaction(lsm_db *pDb, int bCommit); +int lsmTreeBeginTransaction(lsm_db *pDb); +int lsmTreeLoadHeader(lsm_db *pDb); int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal); void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark); -void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark); +void lsmTreeMark(lsm_db *pDb, TreeMark *pMark); int lsmTreeCursorNew(lsm_db *pDb, TreeCursor **); void lsmTreeCursorDestroy(TreeCursor *); int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes); @@ -308,19 +520,11 @@ int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast); void lsmTreeCursorReset(TreeCursor *pCsr); int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey); int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal); int lsmTreeCursorValid(TreeCursor *pCsr); -void lsmTreeCursorSave(TreeCursor *pCsr); - -TreeVersion *lsmTreeReadVersion(Tree *); -int lsmTreeWriteVersion(lsm_env *pEnv, Tree *, TreeVersion **); -TreeVersion *lsmTreeRecoverVersion(Tree *); -int lsmTreeIsWriteVersion(TreeVersion *); -int lsmTreeReleaseWriteVersion(lsm_env *, TreeVersion *, int, TreeVersion **); -void lsmTreeReleaseReadVersion(lsm_env *, TreeVersion *); - +int lsmTreeCursorSave(TreeCursor *pCsr); /* ** Functions from file "mem.c". */ int lsmPoolNew(lsm_env *pEnv, Mempool **ppPool); @@ -386,11 +590,10 @@ FileSystem *lsmPageFS(Page *); int lsmFsSectorSize(FileSystem *); void lsmSortedSplitkey(lsm_db *, Level *, int *); -int lsmFsSetupAppendList(lsm_db *db); /* Reading sorted run content. 
*/ int lsmFsDbPageGet(FileSystem *, Pgno, Page **); int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **); @@ -406,14 +609,12 @@ int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **); int lsmFsMetaPageRelease(MetaPage *); u8 *lsmFsMetaPageData(MetaPage *, int *); -#ifdef LSM_EXPENSIVE_DEBUG +#ifdef LSM_DEBUG int lsmFsIntegrityCheck(lsm_db *); -#else -# define lsmFsIntegrityCheck(pDb) 1 #endif int lsmFsPageWritable(Page *); /* Functions to read, write and sync the log file. */ @@ -428,19 +629,27 @@ /* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */ int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut); int lsmConfigMmap(lsm_db *pDb, int *piParam); +int lsmEnvOpen(lsm_env *, const char *, lsm_file **); +int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile); +int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock); + +int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **); +void lsmEnvShmBarrier(lsm_env *); +void lsmEnvShmUnmap(lsm_env *, lsm_file *, int); + /* ** End of functions from "lsm_file.c". **************************************************************************/ /* ** Functions from file "lsm_sorted.c". */ int lsmInfoPageDump(lsm_db *, Pgno, int, char **); -int lsmSortedFlushTree(lsm_db *, int, int); +int lsmSortedFlushTree(lsm_db *, int *); void lsmSortedCleanup(lsm_db *); int lsmSortedAutoWork(lsm_db *, int nUnit); void lsmSortedRemap(lsm_db *pDb); @@ -448,12 +657,11 @@ int lsmSortedFlushDb(lsm_db *); int lsmSortedAdvanceAll(lsm_db *pDb); int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *); - -int lsmSortedLoadSystem(lsm_db *pDb); +int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *); void *lsmSortedSplitKey(Level *pLevel, int *pnByte); void lsmSortedSaveTreeCursors(lsm_db *); @@ -498,52 +706,46 @@ int lsmFlushToDisk(lsm_db *); /* ** Functions from file "lsm_log.c". 
*/ -int lsmLogBegin(lsm_db *pDb, DbLog *pLog); +int lsmLogBegin(lsm_db *pDb); int lsmLogWrite(lsm_db *, void *, int, void *, int); int lsmLogCommit(lsm_db *); -void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit); +void lsmLogEnd(lsm_db *pDb, int bCommit); void lsmLogTell(lsm_db *, LogMark *); void lsmLogSeek(lsm_db *, LogMark *); int lsmLogRecover(lsm_db *); -void lsmLogCheckpoint(lsm_db *, DbLog *pLog, lsm_i64); +void lsmLogCheckpoint(lsm_db *, lsm_i64); int lsmLogStructure(lsm_db *pDb, char **pzVal); /************************************************************************** ** Functions from file "lsm_shared.c". */ -int lsmDbDatabaseFind(lsm_db*, const char *); + +int lsmDbDatabaseConnect(lsm_db*, const char *); void lsmDbDatabaseRelease(lsm_db *); -int lsmBeginRecovery(lsm_db *); int lsmBeginReadTrans(lsm_db *); int lsmBeginWriteTrans(lsm_db *); int lsmBeginFlush(lsm_db *); +int lsmBeginWork(lsm_db *); +void lsmFinishWork(lsm_db *, int, int, int *); + int lsmFinishRecovery(lsm_db *); void lsmFinishReadTrans(lsm_db *); int lsmFinishWriteTrans(lsm_db *, int); int lsmFinishFlush(lsm_db *, int); -int lsmDbUpdateClient(lsm_db *, int, int); - -int lsmSnapshotFreelist(lsm_db *, int **, int *); int lsmSnapshotSetFreelist(lsm_db *, int *, int); -void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz); - Snapshot *lsmDbSnapshotClient(lsm_db *); Snapshot *lsmDbSnapshotWorker(lsm_db *); -Snapshot *lsmDbSnapshotRecover(lsm_db *); -void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *); -void lsmSnapshotSetNBlock(Snapshot *, int); -int lsmSnapshotGetNBlock(Snapshot *); void lsmSnapshotSetCkptid(Snapshot *, i64); Level *lsmDbSnapshotLevel(Snapshot *); void lsmDbSnapshotSetLevel(Snapshot *, Level *); @@ -553,28 +755,48 @@ int lsmBlockFree(lsm_db *, int); int lsmBlockRefree(lsm_db *, int); void lsmFreelistDeltaBegin(lsm_db *); void lsmFreelistDeltaEnd(lsm_db *); -void lsmFreelistDelta(lsm_db *, u32 *); -u32 *lsmFreelistDeltaPtr(lsm_db *pDb); - -void lsmDatabaseDirty(lsm_db 
*pDb); -int lsmDatabaseIsDirty(lsm_db *pDb); +int lsmFreelistDelta(lsm_db *pDb); DbLog *lsmDatabaseLog(lsm_db *pDb); -Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp); -int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg); -void lsmSharedAppendListRemove(lsm_db *db, int iIdx); - -int lsmDbTreeSize(lsm_db *pDb); - #ifdef LSM_DEBUG int lsmHoldingClientMutex(lsm_db *pDb); + int lsmShmAssertLock(lsm_db *db, int iLock, int eOp); + int lsmShmAssertWorker(lsm_db *db); +#endif + +void lsmFreeSnapshot(lsm_env *, Snapshot *); + + +/* Candidate values for the 3rd argument to lsmShmLock() */ +#define LSM_LOCK_UNLOCK 0 +#define LSM_LOCK_SHARED 1 +#define LSM_LOCK_EXCL 2 + +int lsmShmChunk(lsm_db *db, int iChunk, void **ppData); +int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock); +void lsmShmBarrier(lsm_db *db); + +#ifdef LSM_DEBUG +void lsmShmHasLock(lsm_db *db, int iLock, int eOp); +#else +# define lsmShmHasLock(x,y,z) #endif +int lsmReadlock(lsm_db *, i64 iLsm, i64 iTree); +int lsmReleaseReadlock(lsm_db *); + +int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse); +int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse); +int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId); + +int lsmDbMultiProc(lsm_db *); +void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *); + /************************************************************************** ** functions in lsm_str.c */ void lsmStringInit(LsmString*, lsm_env *pEnv); Index: src/lsm_ckpt.c ================================================================== --- src/lsm_ckpt.c +++ src/lsm_ckpt.c @@ -34,16 +34,20 @@ ** the two checksum values. ** 4. The total number of blocks in the database. ** 5. The block size. ** 6. The number of levels. ** 7. The nominal database page size. -** 8. Flag indicating if overflow records are used. If true, the top-level -** segment contains LEVELS and FREELIST entries. +** 8. Flag indicating if there exists a FREELIST record in the database. 
** ** Log pointer: ** -** 4 integers. See ckptExportLog() and ckptImportLog(). +** 4 integers (2 for a 64-bit offset and 2 for a 64-bit checksum). See +** ckptExportLog() and ckptImportLog(). +** +** Append points: +** +** 4 integers. See ckptExportAppendlist(). ** ** For each level in the database, a level record. Formatted as follows: ** ** 0. Age of the level. ** 1. The number of right-hand segments (nRight, possibly 0), @@ -55,28 +59,22 @@ ** 5a. Page number of next cell to read during merge ** 5b. Cell number of next cell to read during merge ** 7. Page containing current split-key. ** 8. Cell within page containing current split-key. ** -** The freelist. If the checkpoint header indicates that the top level -** segment contains LEVELS and FREELIST records, then three integers are -** stored here: -** -** 1. The size to truncate the free list to after it is loaded. -** 2. First refree block (or 0), -** 3. Second refree block (or 0), -** -** In this case, the free list is loaded from the top level segment, -** then truncated so that it contains the nTruncate newest entries only, -** where nTruncate is the first integer in the block of three above. If -** either or both of the "refree block" integers are non-zero, then they -** are appended to the free-list. -** -** Or, if the checkpoint header flag is clear, then the entire free-list -** is stored in the checkpoint. The format is the number of entries in -** the free-list, followed by the entries themselves (i.e. N+1 integers -** for an N entry free-list). +** The freelist. +** +** 1. Number of free-list entries stored in checkpoint header. +** 2. For each entry: +** 2a. Block number of free block. +** 2b. MSW of associated checkpoint id. +** 2c. LSW of associated checkpoint id. +** +** If the overflow flag is set, then extra free-list entries may be stored +** in the FREELIST record. 
The FREELIST record contains 3 32-bit integers +** per entry, in the same format as above (without the "number of entries" +** field). ** ** The checksum: ** ** 1. Checksum value 1. ** 2. Checksum value 2. @@ -88,29 +86,58 @@ ** 3. Root page of array (or 0), ** 4. Size of array in pages, */ /* -** OVERSIZED CHECKPOINT BLOBS: -** -** There are two slots allocated for checkpoints at the start of each -** database file. Each are 4096 bytes in size, so may accommodate -** checkpoints that consist of up to 1024 32-bit integers. Normally, -** this is enough. -** -** However, if a database contains a sufficiently large number of levels, -** a checkpoint may exceed 1024 integers in size. In most circumstances this -** is an undesirable scenario, as a database with so many levels will be -** slow to query. If this does happen, then only the uppermost (more recent) -** levels are stored in the checkpoint blob itself. The remainder are stored -** in an LSM record with the system key "LEVELS". The payload of the entry -** is a series of 32-bit big-endian integers, as follows: -** -** 1. Number of levels (store in the LEVELS record, not total). -** 2. For each level, a "level record" (as desribed above). -** -** There is no checksum in the LEVELS record. +** LARGE NUMBERS OF LEVEL RECORDS: +** +** A limit on the number of rhs segments that may be present in the database +** file. Defining this limit ensures that all level records fit within +** the 4096 byte limit for checkpoint blobs. +** +** The number of right-hand-side segments in a database is counted as +** follows: +** +** * For each level in the database not undergoing a merge, add 1. +** +** * For each level in the database that is undergoing a merge, add +** the number of segments on the rhs of the level. +** +** A level record not undergoing a merge is 6 integers. A level record +** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the +** separators from the next level) is (6*nRhs+12) integers. 
The maximum +** per right-hand-side level is therefore 12 integers. So the maximum +** size of all level records in a checkpoint is 12*40=480 integers. +*/ +#define LSM_MAX_RHS_SEGMENTS 40 + +/* +** LARGE NUMBERS OF FREELIST ENTRIES: +** +** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h) +** on the number of free-list entries stored in a checkpoint. Since each +** free-list entry consists of 3 integers, the maximum free-list size is +** 3*100=300 integers. Combined with the limit on rhs segments defined +** above, this ensures that a checkpoint always fits within a 4096 byte +** meta page. +** +** If the database contains more than 100 free blocks, the "overflow" flag +** in the checkpoint header is set and the remainder are stored in the +** system FREELIST entry in the LSM (along with user data). The value +** accompanying the FREELIST key in the LSM is, like a checkpoint, an array +** of 32-bit big-endian integers. As follows: +** +** For each entry: +** a. Block number of free block. +** b. MSW of associated checkpoint id. +** c. LSW of associated checkpoint id. +** +** The number of entries is not required - it is implied by the size of the +** value blob containing the integer array. +** +** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit. +** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST. */ /* ** The argument to this macro must be of type u32. On a little-endian ** architecture, it returns the u32 value that results from interpreting @@ -124,15 +151,16 @@ ) static const int one = 1; #define LSM_LITTLE_ENDIAN (*(u8 *)(&one)) -/* Total number of 32-bit integers in the checkpoint header. */ -#define CKPT_HDR_SIZE 8 -#define CKPT_LOGPTR_SIZE 4 -#define CKPT_SEGMENT_SIZE 4 -#define CKPT_CKSUM_SIZE 2 +/* Sizes, in integers, of various parts of the checkpoint. 
*/ +#define CKPT_HDR_SIZE 8 +#define CKPT_LOGPTR_SIZE 4 +#define CKPT_SEGMENT_SIZE 4 +#define CKPT_CKSUM_SIZE 2 +#define CKPT_APPENDLIST_SIZE LSM_APPLIST_SZ /* A #define to describe each integer in the checkpoint header. */ #define CKPT_HDR_ID_MSW 0 #define CKPT_HDR_ID_LSW 1 #define CKPT_HDR_NCKPT 2 @@ -140,67 +168,57 @@ #define CKPT_HDR_BLKSZ 4 #define CKPT_HDR_NLEVEL 5 #define CKPT_HDR_PGSZ 6 #define CKPT_HDR_OVFL 7 -/* -** Generate or extend an 8 byte checksum based on the data in array aByte[] -** and the initial values of aIn[0] and aIn[1] (or initial values of 0 and -** 0 if aIn==NULL). -** -** The checksum is written back into aOut[] before returning. -*/ -void lsmChecksumBytes( - const u8 *a, /* Content to be checksummed */ - int nByte, /* Bytes of content in a[] */ - const u32 *aIn, /* Initial checksum value input */ - u32 *aOut /* OUT: Final checksum value output */ -){ - u32 s1, s2; - u32 *aData = (u32 *)a; - u32 *aEnd = (u32 *)&a[nByte & ~0x00000007]; - - u32 aExtra[2] = {0, 0}; - memcpy(aExtra, &a[nByte & ~0x00000007], nByte & 0x00000007); - - if( aIn ){ - s1 = aIn[0]; - s2 = aIn[1]; - }else{ - s1 = s2 = 0; - } - - if( LSM_LITTLE_ENDIAN ){ - /* little-endian */ - s1 += aExtra[0] + s2; - s2 += aExtra[1] + s1; - while( aData=p->nAlloc ){ int nNew = LSM_MAX(8, iIdx*2); p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32)); @@ -211,23 +229,30 @@ p->nAlloc = nNew; } p->aCkpt[iIdx] = iVal; } -static void ckptChangeEndianness(u32 *a, int n){ +/* +** Argument aInt points to an array nInt elements in size. Switch the +** endian-ness of each element of the array. 
+*/ +static void ckptChangeEndianness(u32 *aInt, int nInt){ if( LSM_LITTLE_ENDIAN ){ int i; - for(i=0; iaCkpt, nCkpt); - lsmChecksumBytes((u8 *)p->aCkpt, sizeof(u32)*nCkpt, 0, aCksum); - ckptChangeEndianness(aCksum, 2); + ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]); ckptSetValue(p, nCkpt, aCksum[0], pRc); ckptSetValue(p, nCkpt+1, aCksum[1], pRc); } } @@ -250,14 +275,14 @@ *piOut = iOut; } static void ckptExportLevel( - Level *pLevel, - CkptBuffer *p, - int *piOut, - int *pRc + Level *pLevel, /* Level object to serialize */ + CkptBuffer *p, /* Append new level record to this ckpt */ + int *piOut, /* IN/OUT: Size of checkpoint so far */ + int *pRc /* IN/OUT: Error code */ ){ int iOut = *piOut; Merge *pMerge; pMerge = pLevel->pMerge; @@ -286,121 +311,124 @@ *piOut = iOut; } /* -** Write the current log offset into the checkpoint buffer. 4 values. +** Populate the log offset fields of the checkpoint buffer. 4 values. */ -static void ckptExportLog(DbLog *pLog, CkptBuffer *p, int *piOut, int *pRc){ +static void ckptExportLog( + lsm_db *pDb, + int bFlush, + CkptBuffer *p, + int *piOut, + int *pRc +){ + int iOut = *piOut; + + assert( iOut==CKPT_HDR_LO_MSW ); + + if( bFlush ){ + DbLog *pLog = &pDb->treehdr.log; + i64 iOff = pLog->aRegion[2].iEnd; + ckptSetValue(p, iOut++, (iOff >> 32) & 0xFFFFFFFF, pRc); + ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc); + ckptSetValue(p, iOut++, pLog->cksum0, pRc); + ckptSetValue(p, iOut++, pLog->cksum1, pRc); + }else{ + for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){ + ckptSetValue(p, iOut, pDb->pShmhdr->aWorker[iOut], pRc); + } + } + + *piOut = iOut; +} + +static void ckptExportAppendlist( + lsm_db *db, /* Database connection */ + CkptBuffer *p, /* Checkpoint buffer to write to */ + int *piOut, /* IN/OUT: Offset within checkpoint buffer */ + int *pRc /* IN/OUT: Error code */ +){ + int i; int iOut = *piOut; - i64 iOff = pLog->aRegion[2].iEnd; + u32 *aiAppend = db->pWorker->aiAppend; - ckptSetValue(p, iOut++, (iOff >> 32) & 
0xFFFFFFFF, pRc); - ckptSetValue(p, iOut++, (iOff & 0xFFFFFFFF), pRc); - ckptSetValue(p, iOut++, pLog->cksum0, pRc); - ckptSetValue(p, iOut++, pLog->cksum1, pRc); - + for(i=0; iaRegion[2].iStart = (((i64)aIn[iIn]) << 32) + (i64)aIn[iIn+1]; - pLog->cksum0 = aIn[iIn+2]; - pLog->cksum1 = aIn[iIn+3]; - - *piIn = iIn+4; -} - -lsm_i64 lsmCheckpointLogOffset(void *pExport){ - u8 *aIn = (u8 *)pExport; - u32 i1; - u32 i2; - i1 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4]); - i2 = lsmGetU32(&aIn[CKPT_HDR_SIZE*4+4]); - return (((i64)i1) << 32) + (i64)i2; -} - - -int lsmCheckpointExport( +}; + +static int ckptExportSnapshot( lsm_db *pDb, /* Connection handle */ - int nLsmLevel, /* Number of levels to store in LSM */ - int bOvfl, /* True if free list is stored in LSM */ + int nOvfl, /* Number of free-list entries in LSM */ + int bLog, /* True to update log-offset fields */ i64 iId, /* Checkpoint id */ int bCksum, /* If true, include checksums */ void **ppCkpt, /* OUT: Buffer containing checkpoint */ int *pnCkpt /* OUT: Size of checkpoint in bytes */ ){ int rc = LSM_OK; /* Return Code */ FileSystem *pFS = pDb->pFS; /* File system object */ Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */ - int nAll = 0; /* Number of levels in db */ - int nHdrLevel = 0; /* Number of levels in checkpoint */ - int iLevel; /* Used to count out nHdrLevel levels */ + int nLevel = 0; /* Number of levels in checkpoint */ + int iLevel; /* Used to count out nLevel levels */ int iOut = 0; /* Current offset in aCkpt[] */ Level *pLevel; /* Level iterator */ int i; /* Iterator used while serializing freelist */ - u32 aDelta[LSM_FREELIST_DELTA_SIZE]; CkptBuffer ckpt; + int nFree; + + nFree = pSnap->freelist.nEntry; + if( nOvfl>=0 ){ + nFree -= nOvfl; + }else{ + nOvfl = pDb->pShmhdr->aWorker[CKPT_HDR_OVFL]; + } - assert( bOvfl || nLsmLevel==0 ); - /* Initialize the output buffer */ memset(&ckpt, 0, sizeof(CkptBuffer)); ckpt.pEnv = pDb->pEnv; iOut = CKPT_HDR_SIZE; - /* Write the current log offset */ - 
ckptExportLog(lsmDatabaseLog(pDb), &ckpt, &iOut, &rc); + /* Write the log offset into the checkpoint. */ + ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc); + + /* Write the append-point list */ + ckptExportAppendlist(pDb, &ckpt, &iOut, &rc); /* Figure out how many levels will be written to the checkpoint. */ - for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nAll++; - nHdrLevel = nAll - nLsmLevel; - assert( nHdrLevel>0 ); + for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++; - /* Serialize nHdrLevel levels. */ + /* Serialize nLevel levels. */ iLevel = 0; - for(pLevel=lsmDbSnapshotLevel(pSnap); iLevelpNext){ + for(pLevel=lsmDbSnapshotLevel(pSnap); iLevelpNext){ ckptExportLevel(pLevel, &ckpt, &iOut, &rc); iLevel++; } - /* Write the freelist delta (if bOvfl is true) or else the entire free-list - ** (if bOvfl is false). */ + /* Write the freelist */ if( rc==LSM_OK ){ - if( bOvfl ){ - lsmFreelistDelta(pDb, aDelta); - for(i=0; ipEnv, aVal); + ckptSetValue(&ckpt, iOut++, nFree, &rc); + for(i=0; ifreelist.aEntry[i]; + ckptSetValue(&ckpt, iOut++, p->iBlk, &rc); + ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc); + ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc); } } /* Write the checkpoint header */ assert( iId>=0 ); ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc); ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc); ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc); - ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, lsmSnapshotGetNBlock(pSnap), &rc); + ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc); ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc); - ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nHdrLevel, &rc); + ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc); ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc); - ckptSetValue(&ckpt, CKPT_HDR_OVFL, bOvfl, &rc); + ckptSetValue(&ckpt, CKPT_HDR_OVFL, nOvfl, &rc); if( bCksum ){ ckptAddChecksum(&ckpt, iOut, &rc); }else{ 
ckptSetValue(&ckpt, iOut, 0, &rc); @@ -407,10 +435,16 @@ ckptSetValue(&ckpt, iOut+1, 0, &rc); } iOut += 2; assert( iOut<=1024 ); +#if 0 + lsmLogMessage(pDb, rc, + "ckptExportSnapshot(): id=%d freelist: %d/%d", (int)iId, nFree, nOvfl + ); +#endif + *ppCkpt = (void *)ckpt.aCkpt; if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut; return rc; } @@ -525,78 +559,10 @@ *ppLevel = pRet; *piIn = iIn; return rc; } -static int ckptImport( - lsm_db *pDb, - void *pCkpt, - int nInt, - int *pbOvfl, - int *pRc -){ - int rc = *pRc; - int ret = 0; - if( rc==LSM_OK ){ - Snapshot *pSnap = pDb->pWorker; - u32 cksum[2] = {0, 0}; - u32 *aInt = (u32 *)pCkpt; - - lsmChecksumBytes((u8 *)aInt, sizeof(u32)*(nInt-2), 0, cksum); - if( LSM_LITTLE_ENDIAN ){ - int i; - for(i=0; i0 ){ u32 *aIn; @@ -627,102 +593,10 @@ } return rc; } - -/* -** If *pRc is not LSM_OK when this function is called, it is a no-op. -** -** Otherwise, it attempts to read the id and size of the checkpoint stored in -** slot iSlot of the database header. If an error occurs during processing, -** *pRc is set to an error code before returning. The returned value is -** always zero in this case. -** -** Or, if no error occurs, set *pnInt to the total number of integer values -** in the checkpoint and return the checkpoint id. 
-*/ -static i64 ckptReadId( - lsm_db *pDb, /* Connection handle */ - int iSlot, /* Slot to read from (1 or 2) */ - int *pnInt, /* OUT: Size of slot checkpoint in ints */ - int *pRc /* IN/OUT: Error code */ -){ - i64 iId = 0; /* Checkpoint id (return value) */ - - assert( iSlot==1 || iSlot==2 ); - if( *pRc==LSM_OK ){ - MetaPage *pPg; /* Meta page for slot iSlot */ - *pRc = lsmFsMetaPageGet(pDb->pFS, 0, iSlot, &pPg); - if( *pRc==LSM_OK ){ - u8 *aData = lsmFsMetaPageData(pPg, 0); - - iId = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]) << 32; - iId += (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]); - *pnInt = (int)lsmGetU32(&aData[CKPT_HDR_NCKPT*4]); - - lsmFsMetaPageRelease(pPg); - } - } - return iId; -} - -/* -** Attempt to load the checkpoint from slot iSlot. Return true if the -** attempt is successful. -*/ -static int ckptTryRead( - lsm_db *pDb, - int iSlot, - int nCkpt, - int *pbOvfl, - int *pRc -){ - int ret = 0; - assert( iSlot==1 || iSlot==2 ); - if( *pRc==LSM_OK - && nCkpt>=CKPT_HDR_SIZE - && nCkpt<65536 - ){ - u32 *aCkpt; - aCkpt = (u32 *)lsmMallocZeroRc(pDb->pEnv, sizeof(u32)*nCkpt, pRc); - if( aCkpt ){ - int rc = LSM_OK; - int iPg; - int nRem; - u8 *aRem; - - /* Read the checkpoint data. */ - nRem = sizeof(u32) * nCkpt; - aRem = (u8 *)aCkpt; - iPg = iSlot; - while( rc==LSM_OK && nRem ){ - MetaPage *pPg; - rc = lsmFsMetaPageGet(pDb->pFS, 0, iPg, &pPg); - if( rc==LSM_OK ){ - int nCopy; - int nData; - u8 *aData = lsmFsMetaPageData(pPg, &nData); - - nCopy = LSM_MIN(nRem, nData); - memcpy(aRem, aData, nCopy); - aRem += nCopy; - nRem -= nCopy; - lsmFsMetaPageRelease(pPg); - } - iPg += 2; - } - - ret = ckptImport(pDb, aCkpt, nCkpt, pbOvfl, &rc); - lsmFree(pDb->pEnv, aCkpt); - *pRc = rc; - } - } - - return ret; -} - /* ** Return the data for the LEVELS record. ** ** The size of the checkpoint that can be stored in the database header ** must not exceed 1024 32-bit integers. Normally, it does not. 
However, @@ -771,119 +645,570 @@ return rc; } /* -** The function is used to determine if the FREELIST and LEVELS overflow -** records may be required if a new top level segment is written and a -** serialized checkpoint blob created. -** -** If the checkpoint will definitely fit in a single meta page, 0 is -** returned and *pnLsmLevel is set to 0. In this case the caller need not -** bother creating FREELIST and LEVELS records. -** -** Or, if it is likely that the overflow records will be required, non-zero -** is returned. +** The worker lock must be held to call this function. +** +** The function serializes and returns the data that should be stored as +** the FREELIST system record. */ int lsmCheckpointOverflow( lsm_db *pDb, /* Database handle (must hold worker lock) */ - int *pnLsmLevel /* OUT: Number of levels to store in LSM */ -){ - Level *p; /* Used to iterate through levels */ - int nFree; /* Free integers remaining in db header */ - int nList; /* Size of freelist in integers */ - int nLevel = 0; /* Number of levels stored in LEVELS */ - - /* Number of free integers - 1024 less those used by the checkpoint header, - ** less the 4 used for the log-pointer, less the 3 used for the free-list - ** delta and the 2 used for the checkpoint checksum. Value nFree is - ** therefore the total number of integers available to store the database - ** levels and freelist. */ - nFree = 1024 - CKPT_HDR_SIZE - CKPT_LOGPTR_SIZE - CKPT_CKSUM_SIZE; - - /* Allow space for the free-list delta */ - nFree -= 3; - - /* Allow space for the new level that may be created */ - nFree -= (2 + CKPT_SEGMENT_SIZE); - - /* Each level record not currently undergoing a merge consumes 2 + 4 - ** integers. Each level that is undergoing a merge consumes 2 + 4 + - ** (nRhs * 4) + 1 + 1 + (nMerge * 2) + 2, where nRhs is the number of levels - ** used as input to the merge and nMerge is the total number of segments - ** (same as the number of levels, possibly plus 1 separators array). 
- ** - ** The calculation in the following block may overestimate the number - ** of integers required by a single level by 2 (as it assumes - ** that nMerge==nRhs+1). */ - for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext){ - int nThis; /* Number of integers required by level p */ - if( p->pMerge ){ - nThis = 2 + (1 + p->nRight) * (2 + CKPT_SEGMENT_SIZE) + 1 + 1 + 2; - }else{ - nThis = 2 + CKPT_SEGMENT_SIZE; - } - if( nFreepNext; - } - *pnLsmLevel = nLevel; - - /* Set nList to the number of values required to store the free-list */ - lsmSnapshotFreelist(pDb, 0, &nList); - nList++; - - return (nLevel>0 || nList>nFree); + void **ppVal, /* OUT: lsmMalloc'd buffer */ + int *pnVal, /* OUT: Size of *ppVal in bytes */ + int *pnOvfl /* OUT: Number of freelist entries in buf */ +){ + int rc = LSM_OK; + int nRet; + Snapshot *p = pDb->pWorker; + + assert( lsmShmAssertWorker(pDb) ); + assert( pnOvfl && ppVal && pnVal ); + assert( pDb->nMaxFreelist>=2 && pDb->nMaxFreelist<=LSM_MAX_FREELIST_ENTRIES ); + + if( p->nFreelistOvfl ){ + rc = lsmCheckpointOverflowLoad(pDb, &p->freelist); + if( rc!=LSM_OK ) return rc; + p->nFreelistOvfl = 0; + } + + if( p->freelist.nEntry<=pDb->nMaxFreelist ){ + nRet = 0; + *pnVal = 0; + *ppVal = 0; + }else{ + int i; /* Iterator variable */ + int iOut = 0; /* Current size of blob in ckpt */ + CkptBuffer ckpt; /* Used to build FREELIST blob */ + + nRet = (p->freelist.nEntry - pDb->nMaxFreelist); + + memset(&ckpt, 0, sizeof(CkptBuffer)); + ckpt.pEnv = pDb->pEnv; + for(i=p->freelist.nEntry-nRet; rc==LSM_OK && ifreelist.nEntry; i++){ + FreelistEntry *pEntry = &p->freelist.aEntry[i]; + ckptSetValue(&ckpt, iOut++, pEntry->iBlk, &rc); + ckptSetValue(&ckpt, iOut++, (pEntry->iId >> 32) & 0xFFFFFFFF, &rc); + ckptSetValue(&ckpt, iOut++, pEntry->iId & 0xFFFFFFFF, &rc); + } + ckptChangeEndianness(ckpt.aCkpt, iOut); + + *ppVal = ckpt.aCkpt; + *pnVal = iOut*sizeof(u32); + } + + *pnOvfl = nRet; + return rc; +} + +/* +** The connection must be the worker in order 
to call this function. +** +** True is returned if there are currently too many free-list entries +** in-memory to store in a checkpoint. Before calling lsmCheckpointSaveWorker() +** to save the current worker snapshot, a new top-level LSM segment must +** be created so that some of them can be written to the LSM. +*/ +int lsmCheckpointOverflowRequired(lsm_db *pDb){ + assert( lsmShmAssertWorker(pDb) ); + return (pDb->pWorker->freelist.nEntry > pDb->nMaxFreelist); +} + +/* +** Connection pDb must be the worker to call this function. +** +** Load the FREELIST record from the database. Decode it and append the +** results to list pFreelist. +*/ +int lsmCheckpointOverflowLoad( + lsm_db *pDb, + Freelist *pFreelist +){ + int rc; + int nVal = 0; + void *pVal = 0; + assert( lsmShmAssertWorker(pDb) ); + + /* Load the blob of data from the LSM. If that is successful (and the + ** blob is greater than zero bytes in size), decode the contents and + ** merge them into the current contents of *pFreelist. 
*/ + rc = lsmSortedLoadFreelist(pDb, &pVal, &nVal); + if( pVal ){ + u32 *aFree = (u32 *)pVal; + int nFree = nVal / sizeof(int); + ckptChangeEndianness(aFree, nFree); + if( (nFree % 3) ){ + rc = LSM_CORRUPT_BKPT; + }else{ + int iNew = 0; /* Offset of next element in aFree[] */ + int iOld = 0; /* Next element in freelist fl */ + Freelist fl = *pFreelist; /* Original contents of *pFreelist */ + + memset(pFreelist, 0, sizeof(Freelist)); + while( rc==LSM_OK && (iNew=fl.nEntry ){ + iBlk = aFree[iNew]; + iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2]; + iNew += 3; + }else if( iNew>=nFree ){ + iBlk = fl.aEntry[iOld].iBlk; + iId = fl.aEntry[iOld].iId; + iOld += 1; + }else{ + iId = ((i64)(aFree[iNew+1])<<32) + (i64)aFree[iNew+2]; + if( iIdpEnv, pFreelist, iBlk, iId); + } + lsmFree(pDb->pEnv, fl.aEntry); + +#ifdef LSM_DEBUG + if( rc==LSM_OK ){ + int i; + for(i=1; rc==LSM_OK && inEntry; i++){ + assert( pFreelist->aEntry[i].iId >= pFreelist->aEntry[i-1].iId ); + } + assert( pFreelist->nEntry==(fl.nEntry + nFree/3) ); + } +#endif + } + + lsmFree(pDb->pEnv, pVal); + } + + return rc; +} + +/* +** Read the checkpoint id from meta-page pPg. +*/ +static i64 ckptLoadId(MetaPage *pPg){ + i64 ret = 0; + if( pPg ){ + int nData; + u8 *aData = lsmFsMetaPageData(pPg, &nData); + ret = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32) + + ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4])); + } + return ret; +} + +/* +** Return true if the buffer passed as an argument contains a valid +** checkpoint. +*/ +static int ckptChecksumOk(u32 *aCkpt){ + u32 nCkpt = aCkpt[CKPT_HDR_NCKPT]; + u32 cksum1; + u32 cksum2; + + if( nCkpt(LSM_META_PAGE_SIZE)/sizeof(u32) ) return 0; + ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2); + return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]); +} + +/* +** Attempt to load a checkpoint from meta page iMeta. +** +** This function is a no-op if *pRc is set to any value other than LSM_OK +** when it is called. 
If an error occurs, *pRc is set to an LSM error code +** before returning. +** +** If no error occurs and the checkpoint is successfully loaded, copy it to +** ShmHeader.aClient[] and ShmHeader.aWorker[], and set ShmHeader.iMetaPage +** to indicate its origin. In this case return 1. Or, if the checkpoint +** cannot be loaded (because the checksum does not compute), return 0. +*/ +static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){ + int bLoaded = 0; /* Return value */ + if( *pRc==LSM_OK ){ + int rc = LSM_OK; /* Error code */ + u32 *aCkpt = 0; /* Pointer to buffer containing checkpoint */ + u32 nCkpt; /* Number of elements in aCkpt[] */ + int nData; /* Bytes of data in aData[] */ + u8 *aData; /* Meta page data */ + + aData = lsmFsMetaPageData(pPg, &nData); + nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]); + if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){ + aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc); + } + if( aCkpt ){ + memcpy(aCkpt, aData, nCkpt*sizeof(u32)); + ckptChangeEndianness(aCkpt, nCkpt); + if( ckptChecksumOk(aCkpt) ){ + ShmHeader *pShm = pDb->pShmhdr; + memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32)); + memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32)); + memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32)); + pShm->iMetaPage = iMeta; + bLoaded = 1; + } + } + + lsmFree(pDb->pEnv, aCkpt); + *pRc = rc; + } + return bLoaded; +} + +/* +** Initialize the shared-memory header with an empty snapshot. This function +** is called when no valid snapshot can be found in the database header. 
+*/ +static void ckptLoadEmpty(lsm_db *pDb){ + u32 aCkpt[] = { + 0, /* CKPT_HDR_ID_MSW */ + 10, /* CKPT_HDR_ID_LSW */ + 0, /* CKPT_HDR_NCKPT */ + 0, /* CKPT_HDR_NBLOCK */ + 0, /* CKPT_HDR_BLKSZ */ + 0, /* CKPT_HDR_NLEVEL */ + 0, /* CKPT_HDR_PGSZ */ + 0, /* CKPT_HDR_OVFL */ + 0, 0, 1234, 5678, /* The log pointer and initial checksum */ + 0, 0, 0, 0, /* The append list */ + 0, /* The free block list */ + 0, 0 /* Space for checksum values */ + }; + u32 nCkpt = array_size(aCkpt); + ShmHeader *pShm = pDb->pShmhdr; + + aCkpt[CKPT_HDR_NCKPT] = nCkpt; + aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz; + aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz; + ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]); + + memcpy(pShm->aClient, aCkpt, nCkpt*sizeof(u32)); + memcpy(pShm->aWorker, aCkpt, nCkpt*sizeof(u32)); + memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32)); +} + +/* +** This function is called as part of database recovery to initialize the +** ShmHeader.aClient[] and ShmHeader.aWorker[] snapshots. +*/ +int lsmCheckpointRecover(lsm_db *pDb){ + int rc = LSM_OK; /* Return Code */ + i64 iId1; /* Id of checkpoint on meta-page 1 */ + i64 iId2; /* Id of checkpoint on meta-page 2 */ + int bLoaded = 0; /* True once checkpoint has been loaded */ + int cmp; /* True if (iId2>iId1) */ + MetaPage *apPg[2] = {0, 0}; /* Meta-pages 1 and 2 */ + + rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]); + if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]); + + iId1 = ckptLoadId(apPg[0]); + iId2 = ckptLoadId(apPg[1]); + cmp = (iId2 > iId1); + bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc); + if( bLoaded==0 ){ + bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc); + } + + /* The database does not contain a valid checkpoint. Initialize the shared + ** memory header with an empty checkpoint. 
*/ + if( bLoaded==0 ){ + ckptLoadEmpty(pDb); + } + + lsmFsMetaPageRelease(apPg[0]); + lsmFsMetaPageRelease(apPg[1]); + + return rc; +} + +/* +** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta. +*/ +int lsmCheckpointStore(lsm_db *pDb, int iMeta){ + MetaPage *pPg = 0; + int rc; + + assert( iMeta==1 || iMeta==2 ); + rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg); + if( rc==LSM_OK ){ + u8 *aData; + int nData; + int nCkpt; + + nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT]; + aData = lsmFsMetaPageData(pPg, &nData); + memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32)); + ckptChangeEndianness((u32 *)aData, nCkpt); + rc = lsmFsMetaPageRelease(pPg); + } + + return rc; +} + +/* +** Copy the current client snapshot from shared-memory to pDb->aSnapshot[]. +*/ +int lsmCheckpointLoad(lsm_db *pDb){ + while( 1 ){ + int rc; + int nInt; + ShmHeader *pShm = pDb->pShmhdr; + + nInt = pShm->aClient[CKPT_HDR_NCKPT]; + memcpy(pDb->aSnapshot, pShm->aClient, nInt*sizeof(u32)); + if( ckptChecksumOk(pDb->aSnapshot) ) return LSM_OK; + + rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0); + if( rc==LSM_BUSY ){ + usleep(50); + }else{ + if( rc==LSM_OK ){ + if( ckptChecksumOk(pShm->aClient)==0 ){ + nInt = pShm->aWorker[CKPT_HDR_NCKPT]; + memcpy(pShm->aClient, pShm->aWorker, nInt*sizeof(u32)); + } + nInt = pShm->aClient[CKPT_HDR_NCKPT]; + memcpy(pDb->aSnapshot, &pShm->aClient, nInt*sizeof(u32)); + lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0); + + if( ckptChecksumOk(pDb->aSnapshot)==0 ){ + rc = LSM_CORRUPT_BKPT; + } + } + return rc; + } + } +} + +int lsmCheckpointLoadWorker(lsm_db *pDb){ + int rc; + ShmHeader *pShm = pDb->pShmhdr; + + /* Must be holding the WORKER lock to do this */ + assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ); + + if( ckptChecksumOk(pShm->aWorker)==0 ){ + int nInt = (int)pShm->aClient[CKPT_HDR_NCKPT]; + memcpy(pShm->aWorker, pShm->aClient, nInt*sizeof(u32)); + if( ckptChecksumOk(pShm->aWorker)==0 ) return LSM_CORRUPT_BKPT; + } + + rc = 
lsmCheckpointDeserialize(pDb, 1, pShm->aWorker, &pDb->pWorker); + assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) ); + return rc; +} + +int lsmCheckpointDeserialize( + lsm_db *pDb, + int bInclFreelist, /* If true, deserialize free-list */ + u32 *aCkpt, + Snapshot **ppSnap +){ + int rc = LSM_OK; + Snapshot *pNew; + + pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc); + if( rc==LSM_OK ){ + int nFree; + int nCopy; + int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL]; + int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE; + + pNew->iId = lsmCheckpointId(aCkpt, 0); + pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK]; + rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel); + + /* Make a copy of the append-list */ + nCopy = sizeof(u32) * LSM_APPLIST_SZ; + memcpy(pNew->aiAppend, &aCkpt[CKPT_HDR_SIZE+CKPT_LOGPTR_SIZE], nCopy); + + /* Copy the free-list */ + if( bInclFreelist ){ + pNew->nFreelistOvfl = aCkpt[CKPT_HDR_OVFL]; + nFree = aCkpt[iIn++]; + if( nFree ){ + pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc( + pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc + ); + if( rc==LSM_OK ){ + int i; + for(i=0; ifreelist.aEntry[i]; + p->iBlk = aCkpt[iIn++]; + p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1]; + iIn += 2; + } + pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree; + } + } + } + } + + if( rc!=LSM_OK ){ + lsmFreeSnapshot(pDb->pEnv, pNew); + pNew = 0; + } + + *ppSnap = pNew; + return rc; +} + +/* +** Connection pDb must be the worker connection in order to call this +** function. It returns true if the database already contains the maximum +** number of levels or false otherwise. +** +** This is used when flushing the in-memory tree to disk. If the database +** is already full, then the caller should invoke lsm_work() or similar +** until it is not full before creating a new level by flushing the in-memory +** tree to disk. 
Limiting the number of levels in the database ensures that +** the records describing them always fit within the checkpoint blob. +*/ +int lsmDatabaseFull(lsm_db *pDb){ + Level *p; + int nRhs = 0; + + assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) ); + assert( pDb->pWorker ); + + for(p=pDb->pWorker->pLevel; p; p=p->pNext){ + nRhs += (p->nRight ? p->nRight : 1); + } + + return (nRhs >= LSM_MAX_RHS_SEGMENTS); +} + +/* +** The connection passed as the only argument is currently the worker +** connection. Some work has been performed on the database by the connection, +** but no new snapshot has been written into shared memory. +** +** This function updates the shared-memory worker and client snapshots with +** the new snapshot produced by the work performed by pDb. +** +** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM +** error code is returned. +*/ +int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush, int nOvfl){ + Snapshot *pSnap = pDb->pWorker; + ShmHeader *pShm = pDb->pShmhdr; + void *p = 0; + int n = 0; + int rc; + + rc = ckptExportSnapshot(pDb, nOvfl, bFlush, pSnap->iId+1, 1, &p, &n); + if( rc!=LSM_OK ) return rc; + assert( ckptChecksumOk((u32 *)p) ); + + assert( n<=LSM_META_PAGE_SIZE ); + memcpy(pShm->aWorker, p, n); + lsmShmBarrier(pDb); + memcpy(pShm->aClient, p, n); + lsmFree(pDb->pEnv, p); + + return LSM_OK; +} + +int lsmCheckpointSynced(lsm_db *pDb, i64 *piId){ + int rc = LSM_OK; + const int nAttempt = 3; + int i; + for(i=0; ipShmhdr->iMetaPage; + rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg); + if( rc==LSM_OK ){ + int nCkpt; + int nData; + u8 *aData; + + aData = lsmFsMetaPageData(pPg, &nData); + assert( nData==LSM_META_PAGE_SIZE ); + nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]); + + if( nCkpt<(LSM_META_PAGE_SIZE/sizeof(u32)) ){ + u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc); + if( aCopy ){ + memcpy(aCopy, aData, nCkpt*sizeof(u32)); + ckptChangeEndianness(aCopy, nCkpt); + if( 
ckptChecksumOk(aCopy) ){ + *piId = lsmCheckpointId(aCopy, 0); + } + lsmFree(pDb->pEnv, aCopy); + } + } + lsmFsMetaPageRelease(pPg); + } + if( rc!=LSM_OK || pDb->pShmhdr->iMetaPage==iMeta ) break; + } + + return (rc==LSM_OK && i==3) ? LSM_BUSY : LSM_OK; } /* -** Attempt to read a checkpoint from the database header. If an error -** occurs, return an error code. Otherwise, return LSM_OK and, if -** a checkpoint is successfully loaded, populate the shared database -** structure. -** -** If a checkpoint is loaded, set *piSlot to the page number of the -** meta-page from which it is read (either 1 or 2). Or, if a checkpoint -** cannot be loaded, set *piSlot to 0. -** -** If a checkpoint is loaded and it indicates that the LEVELS and FREELIST -** records are present in the top-level segment *pbOvfl is set to true -** before returning. Otherwise, it is set to false. +** Return the checkpoint-id of the checkpoint array passed as the first +** argument to this function. If the second argument is true, then assume +** that the checkpoint is made up of 32-bit big-endian integers. If it +** is false, assume that the integers are in machine byte order. 
*/ -int lsmCheckpointRead(lsm_db *pDb, int *piSlot, int *pbOvfl){ - int rc = LSM_OK; /* Return Code */ - i64 iId1; - i64 iId2; - int nInt1; - int nInt2; - int bLoaded = 0; - int iSlot = 0; - - iId1 = ckptReadId(pDb, 1, &nInt1, &rc); - iId2 = ckptReadId(pDb, 2, &nInt2, &rc); - - *pbOvfl = 0; - if( iId1>=iId2 ){ - bLoaded = ckptTryRead(pDb, 1, nInt1, pbOvfl, &rc); - if( bLoaded ) iSlot = 1; - if( bLoaded==0 ){ - bLoaded = ckptTryRead(pDb, 2, nInt2, pbOvfl, &rc); - if( bLoaded ) iSlot = 2; - } +i64 lsmCheckpointId(u32 *aCkpt, int bDisk){ + i64 iId; + if( bDisk ){ + u8 *aData = (u8 *)aCkpt; + iId = (((i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4])) << 32); + iId += ((i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4])); }else{ - bLoaded = ckptTryRead(pDb, 2, nInt2, pbOvfl, &rc); - if( bLoaded ) iSlot = 2; - if( bLoaded==0 ){ - bLoaded = ckptTryRead(pDb, 1, nInt1, pbOvfl, &rc); - if( bLoaded ) iSlot = 1; - } - } - - *piSlot = iSlot; - return rc; + iId = ((i64)aCkpt[CKPT_HDR_ID_MSW] << 32) + (i64)aCkpt[CKPT_HDR_ID_LSW]; + } + return iId; +} + +i64 lsmCheckpointLogOffset(u32 *aCkpt){ + return ((i64)aCkpt[CKPT_HDR_LO_MSW] << 32) + (i64)aCkpt[CKPT_HDR_LO_LSW]; +} + +int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; } + +int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; } + +void lsmCheckpointLogoffset( + u32 *aCkpt, + DbLog *pLog +){ + u32 iOffMSB = aCkpt[CKPT_HDR_LO_MSW]; + u32 iOffLSB = aCkpt[CKPT_HDR_LO_LSW]; + pLog->aRegion[2].iStart = (((i64)iOffMSB) << 32) + ((i64)iOffLSB); + pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1]; + pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2]; +} + +void lsmCheckpointZeroLogoffset(lsm_db *pDb){ + u32 nCkpt; + + nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT]; + assert( nCkpt>CKPT_HDR_NCKPT ); + assert( nCkpt==pDb->pShmhdr->aClient[CKPT_HDR_NCKPT] ); + assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aClient, nCkpt*sizeof(u32)) ); + assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aWorker, nCkpt*sizeof(u32)) ); + + 
pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0; + pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0; + ckptChecksum(pDb->aSnapshot, nCkpt, + &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1] + ); + + memcpy(pDb->pShmhdr->aClient, pDb->aSnapshot, nCkpt*sizeof(u32)); + memcpy(pDb->pShmhdr->aWorker, pDb->aSnapshot, nCkpt*sizeof(u32)); } Index: src/lsm_file.c ================================================================== --- src/lsm_file.c +++ src/lsm_file.c @@ -31,21 +31,22 @@ ** is page 33. ** ** It is assumed that the first two meta pages and the data that follows ** them are located on different disk sectors. So that if a power failure ** while writing to a meta page there is no risk of damage to the other -** meta page or any other part of the database file. +** meta page or any other part of the database file. TODO: This may need +** to be revisited. ** ** Blocks: ** ** The database file is also divided into blocks. The default block size is ** 2MB. When writing to the database file, an attempt is made to write data ** in contiguous block-sized chunks. ** ** The first and last page on each block are special in that they are 4 ** bytes smaller than all other pages. This is because the last four bytes -** of space on the first and last pages of each block are reserved for a +** of space on the first and last pages of each block are reserved for ** pointers to other blocks (i.e. a 32-bit block number). ** ** Runs: ** ** A run is a sequence of pages that the upper layer uses to store a @@ -75,10 +76,11 @@ ** This file opens and closes the log file. But it does not contain any ** logic related to the log file format. 
Instead, it exports the following ** functions that are used by the code in lsm_log.c to read and write the ** log file: ** +** lsmFsOpenLog ** lsmFsWriteLog ** lsmFsSyncLog ** lsmFsReadLog ** lsmFsTruncateLog ** lsmFsCloseAndDeleteLog @@ -111,15 +113,17 @@ */ struct FileSystem { lsm_db *pDb; /* Database handle that owns this object */ lsm_env *pEnv; /* Environment pointer */ char *zDb; /* Database file name */ + char *zLog; /* Database file name */ int nMetasize; /* Size of meta pages in bytes */ int nPagesize; /* Database page-size in bytes */ int nBlocksize; /* Database block-size in bytes */ /* r/w file descriptors for both files. */ + LsmFile *pLsmFile; lsm_file *fdDb; /* Database file */ lsm_file *fdLog; /* Log file */ /* mmap() mode things */ int bUseMmap; /* True to use mmap() to access db file */ @@ -191,11 +195,11 @@ ** lsmEnvClose() ** lsmEnvTruncate() ** lsmEnvUnlink() ** lsmEnvRemap() */ -static int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){ +int lsmEnvOpen(lsm_env *pEnv, const char *zFile, lsm_file **ppNew){ return pEnv->xOpen(pEnv, zFile, ppNew); } static int lsmEnvRead( lsm_env *pEnv, lsm_file *pFile, @@ -218,11 +222,11 @@ return pEnv->xSync(pFile); } static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSectorSize(pFile); } -static int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){ +int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xClose(pFile); } static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){ return pEnv->xTruncate(pFile, nByte); } @@ -236,32 +240,59 @@ void **ppMap, i64 *pszMap ){ return pEnv->xRemap(pFile, szMin, ppMap, pszMap); } + +int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){ + if( pFile==0 ) return LSM_OK; + return pEnv->xLock(pFile, iLock, eLock); +} + +int lsmEnvShmMap( + lsm_env *pEnv, + lsm_file *pFile, + int iChunk, + int sz, + void **ppOut +){ + return pEnv->xShmMap(pFile, iChunk, sz, ppOut); +} + +void lsmEnvShmBarrier(lsm_env 
*pEnv){ + return pEnv->xShmBarrier(); +} + +void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){ + return pEnv->xShmUnmap(pFile, bDel); +} + /* ** Write the contents of string buffer pStr into the log file, starting at ** offset iOff. */ int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){ + assert( pFS->fdLog ); return lsmEnvWrite(pFS->pEnv, pFS->fdLog, iOff, pStr->z, pStr->n); } /* ** fsync() the log file. */ int lsmFsSyncLog(FileSystem *pFS){ + assert( pFS->fdLog ); return lsmEnvSync(pFS->pEnv, pFS->fdLog); } /* -** Read nRead bytes of data starting at offset iOff of the log file. Store -** the results in string buffer pStr. +** Read nRead bytes of data starting at offset iOff of the log file. Append +** the results to string buffer pStr. */ int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){ int rc; /* Return code */ + assert( pFS->fdLog ); rc = lsmStringExtend(pStr, nRead); if( rc==LSM_OK ){ rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead); pStr->n += nRead; } @@ -310,54 +341,70 @@ int bLog, /* True for log, false for db */ int *pRc /* IN/OUT: Error code */ ){ lsm_file *pFile = 0; if( *pRc==LSM_OK ){ - char *zName; - zName = lsmMallocPrintf(pFS->pEnv, "%s%s", pFS->zDb, (bLog ? "-log" : "")); - if( !zName ){ - *pRc = LSM_NOMEM; - }else{ - *pRc = lsmEnvOpen(pFS->pEnv, zName, &pFile); - } - lsmFree(pFS->pEnv, zName); + *pRc = lsmEnvOpen(pFS->pEnv, (bLog ? pFS->zLog : pFS->zDb), &pFile); } return pFile; } + +/* +** If it is not already open, this function opens the log file. It returns +** LSM_OK if successful (or if the log file was already open) or an LSM +** error code otherwise. 
+** +** The log file must be opened before any of the following may be called: +** +** lsmFsWriteLog +** lsmFsSyncLog +** lsmFsReadLog +*/ +int lsmFsOpenLog(FileSystem *pFS){ + int rc = LSM_OK; + if( 0==pFS->fdLog ){ pFS->fdLog = fsOpenFile(pFS, 1, &rc); } + return rc; +} /* ** Open a connection to a database stored within the file-system (the ** "system of files"). */ int lsmFsOpen(lsm_db *pDb, const char *zDb){ FileSystem *pFS; int rc = LSM_OK; + int nDb = strlen(zDb); + int nByte; assert( pDb->pFS==0 ); assert( pDb->pWorker==0 && pDb->pClient==0 ); - pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, sizeof(FileSystem), &rc); + nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1; + pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc); if( pFS ){ + pFS->zDb = (char *)&pFS[1]; + pFS->zLog = &pFS->zDb[nDb+1]; pFS->nPagesize = LSM_PAGE_SIZE; pFS->nBlocksize = LSM_BLOCK_SIZE; pFS->nMetasize = 4 * 1024; pFS->pDb = pDb; pFS->pEnv = pDb->pEnv; - /* Make a copy of the database name. */ - pFS->zDb = lsmMallocStrdup(pDb->pEnv, zDb); - if( pFS->zDb==0 ) rc = LSM_NOMEM; + /* Make a copy of the database and log file names. */ + memcpy(pFS->zDb, zDb, nDb+1); + memcpy(pFS->zLog, zDb, nDb); + memcpy(&pFS->zLog[nDb], "-log", 5); /* Allocate the hash-table here. At some point, it should be changed ** so that it can grow dynamicly. 
*/ pFS->nCacheMax = 2048; pFS->nHash = 4096; pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc); + pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc); - /* Open the files */ + /* Open the database file */ pFS->fdDb = fsOpenFile(pFS, 0, &rc); - pFS->fdLog = fsOpenFile(pFS, 1, &rc); if( rc!=LSM_OK ){ lsmFsClose(pFS); pFS = 0; } @@ -383,13 +430,20 @@ lsmFree(pEnv, pPg); pPg = pNext; } if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb ); - if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog ); + if( pFS->fdLog ){ + if( lsmDbMultiProc(pFS->pDb) ){ + lsmDbDeferredClose(pFS->pDb, pFS->fdLog, pFS->pLsmFile); + pFS->pLsmFile = 0; + }else{ + lsmEnvClose(pFS->pEnv, pFS->fdLog ); + } + } + lsmFree(pEnv, pFS->pLsmFile); - lsmFree(pEnv, pFS->zDb); lsmFree(pEnv, pFS->apHash); lsmFree(pEnv, pFS); } } @@ -625,17 +679,17 @@ int *pRc ){ if( *pRc==LSM_OK && iSz>pFS->nMap ){ Page *pFix; int rc; + u8 *aOld = pFS->pMap; rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap); if( rc==LSM_OK ){ u8 *aData = (u8 *)pFS->pMap; for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){ pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)]; } - lsmSortedRemap(pFS->pDb); } *pRc = rc; } } @@ -781,15 +835,15 @@ int iBlk ){ int rc = LSM_OK; /* Return code */ int iFirst; /* First page on block iBlk */ int iLast; /* Last page on block iBlk */ - int i; /* Used to iterate through append points */ Level *pLevel; /* Used to iterate through levels */ - Pgno *aAppend; - int nAppend; + int iIn; /* Used to iterate through append points */ + int iOut = 0; /* Used to output append points */ + u32 *aApp = pSnapshot->aiAppend; iFirst = fsFirstPageOnBlock(pFS, iBlk); iLast = fsLastPageOnBlock(pFS, iBlk); /* Check if any other run in the snapshot has a start or end page @@ -798,17 +852,16 @@ if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){ return LSM_OK; } } - aAppend = lsmSharedAppendList(pFS->pDb, &nAppend); - for(i=0; i=iFirst && aAppend[i]<=iLast ){ - 
lsmSharedAppendListRemove(pFS->pDb, i); - break; + for(iIn=0; iIniLast ){ + aApp[iOut++] = aApp[iIn]; } } + while( iOutpDb, iBlk); } return rc; @@ -933,116 +986,19 @@ } return fsPageGet(pFS, iPg, 0, ppNext); } -static Pgno findAppendPoint(FileSystem *pFS, int nMin){ - Pgno ret = 0; - Pgno *aAppend; - int nAppend; - int i; - - aAppend = lsmSharedAppendList(pFS->pDb, &nAppend); -#if 1 - for(i=nAppend-1; i>=0; i--){ -#else - for(i=0; i=nMin ){ - ret = aAppend[i]; - lsmSharedAppendListRemove(pFS->pDb, i); - break; - } - } - - return ret; -} - -static void addAppendPoint( - lsm_db *db, - Pgno iLast, - int *pRc /* IN/OUT: Error code */ -){ - if( *pRc==LSM_OK && iLast>0 ){ - FileSystem *pFS = db->pFS; - - Pgno *aPoint; - int nPoint; - int i; - int iBlk; - int bLast; - - iBlk = fsPageToBlock(pFS, iLast); - bLast = (iLast==fsLastPageOnBlock(pFS, iBlk)); - - aPoint = lsmSharedAppendList(db, &nPoint); - for(i=0; i=aPoint[i] ){ - aPoint[i] = iLast+1; - } - return; - } - } - - if( bLast==0 ){ - *pRc = lsmSharedAppendListAdd(db, iLast+1); - } - } -} - -static void subAppendPoint(lsm_db *db, Pgno iFirst){ - if( iFirst>0 ){ - FileSystem *pFS = db->pFS; - Pgno *aPoint; - int nPoint; - int i; - int iBlk; - - iBlk = fsPageToBlock(pFS, iFirst); - aPoint = lsmSharedAppendList(db, &nPoint); - for(i=0; i=aPoint[i] ) lsmSharedAppendListRemove(db, i); - return; - } - } - } -} - -int lsmFsSetupAppendList(lsm_db *db){ - int rc = LSM_OK; - Level *pLvl; - - assert( db->pWorker ); - for(pLvl=lsmDbSnapshotLevel(db->pWorker); - rc==LSM_OK && pLvl; - pLvl=pLvl->pNext - ){ - if( pLvl->nRight==0 ){ - addAppendPoint(db, pLvl->lhs.iLast, &rc); - }else{ - int i; - for(i=0; inRight; i++){ - addAppendPoint(db, pLvl->aRhs[i].iLast, &rc); - } - } - } - - for(pLvl=lsmDbSnapshotLevel(db->pWorker); pLvl; pLvl=pLvl->pNext){ - int i; - subAppendPoint(db, pLvl->lhs.iFirst); - for(i=0; inRight; i++){ - subAppendPoint(db, pLvl->aRhs[i].iFirst); - } - } - - return rc; +static Pgno findAppendPoint(FileSystem *pFS){ 
+ int i; + u32 *aiAppend = pFS->pDb->pWorker->aiAppend; + u32 iRet = 0; + + for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){ + if( (iRet = aiAppend[i]) ) aiAppend[i] = 0; + } + return iRet; } /* ** Append a page to file iFile. Return a reference to it. lsmFsPageWrite() ** has already been called on the returned reference. @@ -1059,11 +1015,11 @@ int iApp = 0; int iNext = 0; int iPrev = p->iLast; if( iPrev==0 ){ - iApp = findAppendPoint(pFS, 0); + iApp = findAppendPoint(pFS); }else if( fsIsLast(pFS, iPrev) ){ Page *pLast = 0; rc = fsPageGet(pFS, iPrev, 0, &pLast); if( rc!=LSM_OK ) return rc; iApp = lsmGetU32(&pLast->aData[pFS->nPagesize-4]); @@ -1133,11 +1089,18 @@ int iBlk = fsPageToBlock(pFS, iPg); lsmBlockRefree(pFS->pDb, iBlk); lsmFsPageRelease(pLast); } }else{ - rc = lsmSharedAppendListAdd(pFS->pDb, p->iLast+1); + int i; + u32 *aiAppend = pFS->pDb->pWorker->aiAppend; + for(i=0; iiLast+1; + break; + } + } } } return rc; } @@ -1401,21 +1364,24 @@ ** If an error occurs, *pzOut is set to NULL and an LSM error code returned. 
*/ int lsmInfoArrayStructure(lsm_db *pDb, Pgno iFirst, char **pzOut){ int rc = LSM_OK; Snapshot *pWorker; /* Worker snapshot */ - Snapshot *pRelease = 0; /* Snapshot to release */ Segment *pArray = 0; /* Array to report on */ Level *pLvl; /* Used to iterate through db levels */ + int bUnlock = 0; *pzOut = 0; if( iFirst==0 ) return LSM_ERROR; /* Obtain the worker snapshot */ pWorker = pDb->pWorker; if( !pWorker ){ - pRelease = pWorker = lsmDbSnapshotWorker(pDb); + rc = lsmBeginWork(pDb); + if( rc!=LSM_OK ) return rc; + pWorker = pDb->pWorker; + bUnlock = 1; } /* Search for the array that starts on page iFirst */ for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pArray==0; pLvl=pLvl->pNext){ if( 0==(pArray = startsWith(&pLvl->lhs, iFirst)) ){ @@ -1449,52 +1415,50 @@ lsmStringAppendf(&str, " %d", pArray->iLast); *pzOut = str.z; } - lsmDbSnapshotRelease(pDb->pEnv, pRelease); + if( bUnlock ){ + int rcwork = LSM_BUSY; + lsmFinishWork(pDb, 0, 0, &rcwork); + } return rc; } -#ifdef LSM_EXPENSIVE_DEBUG /* ** Helper function for lsmFsIntegrityCheck() */ static void checkBlocks( FileSystem *pFS, - Segment *pSeg, - int bExtra, + Segment *pSeg, + int bExtra, /* If true, count the "next" block if any */ + int nUsed, u8 *aUsed ){ if( pSeg ){ - int i; - for(i=0; i<2; i++){ - Segment *p = (i ? 
pSeg->pRun : pSeg->pSep); - - if( p && p->nSize>0 ){ - const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); - - int iBlk; - int iLastBlk; - iBlk = fsPageToBlock(pFS, p->iFirst); - iLastBlk = fsPageToBlock(pFS, p->iLast); - - while( iBlk ){ - assert( iBlk<=pFS->nBlock ); - /* assert( aUsed[iBlk-1]==0 ); */ - aUsed[iBlk-1] = 1; - if( iBlk!=iLastBlk ){ - fsBlockNext(pFS, iBlk, &iBlk); - }else{ - iBlk = 0; - } - } - - if( bExtra && (p->iLast % nPagePerBlock)==0 ){ - fsBlockNext(pFS, iLastBlk, &iBlk); - aUsed[iBlk-1] = 1; - } + if( pSeg && pSeg->nSize>0 ){ + const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); + + int iBlk; + int iLastBlk; + iBlk = fsPageToBlock(pFS, pSeg->iFirst); + iLastBlk = fsPageToBlock(pFS, pSeg->iLast); + + while( iBlk ){ + assert( iBlk<=nUsed ); + /* assert( aUsed[iBlk-1]==0 ); */ + aUsed[iBlk-1] = 1; + if( iBlk!=iLastBlk ){ + fsBlockNext(pFS, iBlk, &iBlk); + }else{ + iBlk = 0; + } + } + + if( bExtra && (pSeg->iLast % nPagePerBlock)==0 ){ + fsBlockNext(pFS, iLastBlk, &iBlk); + aUsed[iBlk-1] = 1; } } } } @@ -1501,51 +1465,65 @@ /* ** This function checks that all blocks in the database file are accounted ** for. For each block, exactly one of the following must be true: ** ** + the block is part of a sorted run, or -** + the block is on the lPending list, or -** + the block is on the lFree list +** + the block is on the free-block list ** ** This function also checks that there are no references to blocks with ** out-of-range block numbers. ** ** If no errors are found, non-zero is returned. If an error is found, an ** assert() fails. 
*/ int lsmFsIntegrityCheck(lsm_db *pDb){ - int nBlock; - int i; - FileSystem *pFS = pDb->pFS; - u8 *aUsed; - Level *pLevel; - - nBlock = pFS->nBlock; - aUsed = lsmMallocZero(pDb->pEnv, nBlock); - assert( aUsed ); - - for(pLevel=pDb->pLevel; pLevel; pLevel=pLevel->pNext){ - int i; - checkBlocks(pFS, &pLevel->lhs, (pLevel->pSMerger!=0), aUsed); - - for(i=0; inRight; i++){ - checkBlocks(pFS, &pLevel->aRhs[i], 0, aUsed); - } - } - - for(i=0; ilFree.n; i++){ - int iBlk = pFS->lFree.a[i]; - assert( aUsed[iBlk-1]==0 ); - aUsed[iBlk-1] = 1; - } - for(i=0; ilPending.n; i++){ - int iBlk = pFS->lPending.a[i]; - assert( aUsed[iBlk-1]==0 ); - aUsed[iBlk-1] = 1; + int i; + int j; + Freelist freelist = {0, 0, 0}; + FileSystem *pFS = pDb->pFS; + u8 *aUsed; + Level *pLevel; + Snapshot *pWorker = pDb->pWorker; + int nBlock = pWorker->nBlock; + + aUsed = lsmMallocZero(pDb->pEnv, nBlock); + if( aUsed==0 ){ + /* Malloc has failed. Since this function is only called within debug + ** builds, this probably means the user is running an OOM injection test. + ** Regardless, it will not be possible to run the integrity-check at this + ** time, so assume the database is Ok and return non-zero. 
*/ + return 1; + } + + for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){ + int i; + checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed); + for(i=0; inRight; i++){ + checkBlocks(pFS, &pLevel->aRhs[i], 0, nBlock, aUsed); + } + } + + if( pWorker->nFreelistOvfl ){ + int rc = lsmCheckpointOverflowLoad(pDb, &freelist); + assert( rc==LSM_OK || rc==LSM_NOMEM ); + if( rc!=LSM_OK ) return 1; + } + + for(j=0; j<2; j++){ + Freelist *pFreelist; + if( j==0 ) pFreelist = &pWorker->freelist; + if( j==1 ) pFreelist = &freelist; + + for(i=0; inEntry; i++){ + u32 iBlk = pFreelist->aEntry[i].iBlk; + assert( iBlk<=nBlock ); + assert( aUsed[iBlk-1]==0 ); + aUsed[iBlk-1] = 1; + } } for(i=0; ipEnv, aUsed); + lsmFree(pDb->pEnv, freelist.aEntry); return 1; } -#endif Index: src/lsm_log.c ================================================================== --- src/lsm_log.c +++ src/lsm_log.c @@ -302,18 +302,17 @@ ** ** Before returning, this function allocates the LogWriter object that ** will be used to write to the log file during the write transaction. ** LSM_OK is returned if no error occurs, otherwise an LSM error code. */ -int lsmLogBegin(lsm_db *pDb, DbLog *pLog){ +int lsmLogBegin(lsm_db *pDb){ int rc = LSM_OK; LogWriter *pNew; LogRegion *aReg; - assert( lsmHoldingClientMutex(pDb) ); if( pDb->bUseLog==0 ) return LSM_OK; - + rc = lsmFsOpenLog(pDb->pFS); pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc); if( pNew ){ lsmStringInit(&pNew->buf, pDb->pEnv); rc = lsmStringExtend(&pNew->buf, 2); } @@ -344,17 +343,17 @@ ** file than region 0. In this case, append data to region 2, but ** remember to jump over region 1 if required. ** ** 3) Region 2 is the last in the file. Append to it. 
*/ - aReg = &pLog->aRegion[0]; + aReg = &pDb->treehdr.log.aRegion[0]; assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart ); assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart ); - pNew->cksum0 = pLog->cksum0; - pNew->cksum1 = pLog->cksum1; + pNew->cksum0 = pDb->treehdr.log.cksum0; + pNew->cksum1 = pDb->treehdr.log.cksum1; if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=pDb->nLogSz ){ /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP ** into the log file in this case. Pad it out to 8 bytes using a PAD2 ** record so that the checksums can be updated immediately. */ @@ -401,16 +400,17 @@ ** ** A call to this function deletes the LogWriter object allocated by ** lsmLogBegin(). If the transaction is being committed, the shared state ** in *pLog is updated before returning. */ -void lsmLogEnd(lsm_db *pDb, DbLog *pLog, int bCommit){ +void lsmLogEnd(lsm_db *pDb, int bCommit){ + DbLog *pLog; LogWriter *p; - assert( lsmHoldingClientMutex(pDb) ); if( pDb->bUseLog==0 ) return; p = pDb->pLogWriter; + pLog = &pDb->treehdr.log; if( bCommit ){ pLog->aRegion[2].iEnd = p->iOff; pLog->cksum0 = p->cksum0; pLog->cksum1 = p->cksum1; @@ -434,13 +434,13 @@ ** file. The checkpoint specifies that the log starts at offset iOff. ** The shared state in *pLog is updated to reflect the fact that space ** in the log file that occurs logically before offset iOff may now ** be reused. */ -void lsmLogCheckpoint(lsm_db *pDb, DbLog *pLog, lsm_i64 iOff){ +void lsmLogCheckpoint(lsm_db *pDb, lsm_i64 iOff){ + DbLog *pLog = &pDb->treehdr.log; int iRegion; - assert( lsmHoldingClientMutex(pDb) ); for(iRegion=0; iRegion<3; iRegion++){ LogRegion *p = &pLog->aRegion[iRegion]; if( iOff>=p->iStart && iOff<=p->iEnd ) break; p->iStart = 0; @@ -725,11 +725,11 @@ /* ** TODO: Thread safety of this function? 
*/ int lsmLogStructure(lsm_db *pDb, char **pzVal){ - DbLog *pLog = lsmDatabaseLog(pDb); + DbLog *pLog = &pDb->treehdr.log; *pzVal = lsmMallocPrintf(pDb->pEnv, "%d %d %d %d %d %d", (int)pLog->aRegion[0].iStart, (int)pLog->aRegion[0].iEnd, (int)pLog->aRegion[1].iStart, (int)pLog->aRegion[1].iEnd, (int)pLog->aRegion[2].iStart, (int)pLog->aRegion[2].iEnd @@ -887,20 +887,23 @@ */ int lsmLogRecover(lsm_db *pDb){ LsmString buf1; /* Key buffer */ LsmString buf2; /* Value buffer */ LogReader reader; /* Log reader object */ - int rc; /* Return code */ + int rc = LSM_OK; /* Return code */ int nCommit = 0; /* Number of transactions to recover */ int iPass; int nJump = 0; /* Number of LSM_LOG_JUMP records in pass 0 */ DbLog *pLog; - rc = lsmBeginRecovery(pDb); + rc = lsmFsOpenLog(pDb->pFS); if( rc!=LSM_OK ) return rc; - pLog = lsmDatabaseLog(pDb); + lsmTreeInit(pDb); + pLog = &pDb->treehdr.log; + lsmCheckpointLogoffset(pDb->pShmhdr->aWorker, pLog); + logReaderInit(pDb, pLog, 1, &reader); lsmStringInit(&buf1, pDb->pEnv); lsmStringInit(&buf2, pDb->pEnv); /* The outer for() loop runs at most twice. The first iteration is to @@ -1014,10 +1017,11 @@ if( pLog->aRegion[2].iStart==0 ){ iPass = 1; }else{ pLog->aRegion[2].iStart = 0; iPass = -1; + lsmCheckpointZeroLogoffset(pDb); } } logReaderInit(pDb, pLog, 0, &reader); nCommit = nCommit * -1; } Index: src/lsm_main.c ================================================================== --- src/lsm_main.c +++ src/lsm_main.c @@ -39,14 +39,11 @@ ** handle must be holding a pointer to a client snapshot. And the reverse ** - if there are no open cursors and no write transactions then there must ** not be a client snapshot. */ assert( (pDb->pCsr!=0 || pDb->nTransOpen>0)==(pDb->pClient!=0) ); - /* If there is a write transaction open according to pDb->nTransOpen, then - ** the connection must be holding the read/write TreeVersion. 
*/ assert( pDb->nTransOpen>=0 ); - assert( pDb->nTransOpen==0 || lsmTreeIsWriteVersion(pDb->pTV) ); } #else # define assert_db_state(x) #endif @@ -82,31 +79,22 @@ pDb->xCmp = xCmp; pDb->nLogSz = LSM_DEFAULT_LOG_SIZE; pDb->nDfltPgsz = LSM_PAGE_SIZE; pDb->nDfltBlksz = LSM_BLOCK_SIZE; pDb->nMerge = LSM_DEFAULT_NMERGE; + pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES; pDb->bUseLog = 1; - + pDb->iReader = -1; + pDb->bMultiProc = 1; return LSM_OK; } lsm_env *lsm_get_env(lsm_db *pDb){ assert( pDb->pEnv ); return pDb->pEnv; } -/* -** Release snapshot handle *ppSnap. Then set *ppSnap to zero. This -** is useful for doing (say): -** -** dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker); -*/ -static void dbReleaseSnapshot(lsm_env *pEnv, Snapshot **ppSnap){ - lsmDbSnapshotRelease(pEnv, *ppSnap); - *ppSnap = 0; -} - /* ** If database handle pDb is currently holding a client snapshot, but does ** not have any open cursors or write transactions, release it. */ static void dbReleaseClientSnapshot(lsm_db *pDb){ @@ -113,83 +101,30 @@ if( pDb->nTransOpen==0 && pDb->pCsr==0 ){ lsmFinishReadTrans(pDb); } } -static void dbWorkerStart(lsm_db *pDb){ - assert( pDb->pWorker==0 ); - pDb->pWorker = lsmDbSnapshotWorker(pDb); -} - -static void dbWorkerDone(lsm_db *pDb){ - assert( pDb->pWorker ); - dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker); -} - static int dbAutoWork(lsm_db *pDb, int nUnit){ int rc = LSM_OK; /* Return code */ assert( pDb->pWorker==0 ); assert( pDb->bAutowork ); assert( nUnit>0 ); /* If one is required, run a checkpoint. */ +#if 0 rc = lsmCheckpointWrite(pDb); - - dbWorkerStart(pDb); - rc = lsmSortedAutoWork(pDb, nUnit); - dbWorkerDone(pDb); - - return rc; -} - -/* -** If required, run the recovery procedure to initialize the database. -** Return LSM_OK if successful or an error code otherwise. -*/ -static int dbRecoverIfRequired(lsm_db *pDb){ - int rc = LSM_OK; - - assert( pDb->pWorker==0 && pDb->pClient==0 ); - - /* The following call returns NULL if recovery is not required. 
*/ - pDb->pWorker = lsmDbSnapshotRecover(pDb); - if( pDb->pWorker ){ - int bOvfl; - int iSlot; - - /* Read the database structure */ - rc = lsmCheckpointRead(pDb, &iSlot, &bOvfl); - - /* Read the free block list and any level records stored in the LSM. */ - if( rc==LSM_OK && bOvfl ){ - rc = lsmSortedLoadSystem(pDb); - } - - /* Set up the initial append list */ - if( rc==LSM_OK ){ - rc = lsmFsSetupAppendList(pDb); - } - - /* Populate the in-memory tree by reading the log file. */ - if( rc==LSM_OK ){ - rc = lsmLogRecover(pDb); - } - - /* Set the "recovery done" flag */ - if( rc==LSM_OK ){ - lsmDbRecoveryComplete(pDb, iSlot); - } - - /* Set up the initial client snapshot. */ - if( rc==LSM_OK ){ - rc = lsmDbUpdateClient(pDb, 0, 0); - } - - dbReleaseSnapshot(pDb->pEnv, &pDb->pWorker); - } - +#endif + + rc = lsmBeginWork(pDb); + if( rc==LSM_OK ) rc = lsmSortedAutoWork(pDb, nUnit); + if( pDb->pWorker && pDb->pWorker->pLevel ){ + lsmFinishWork(pDb, 0, -1, &rc); + }else{ + int rcdummy = LSM_BUSY; + lsmFinishWork(pDb, 0, 0, &rcdummy); + } return rc; } static int getFullpathname( lsm_env *pEnv, @@ -236,22 +171,27 @@ ** path is required to ensure that the correct files are operated ** on even if the application changes the cwd. */ rc = getFullpathname(pDb->pEnv, zFilename, &zFull); assert( rc==LSM_OK || zFull==0 ); - /* Open the database file */ + /* Open the database file. */ if( rc==LSM_OK ){ rc = lsmFsOpen(pDb, zFull); } - /* Open the shared data handle. */ + /* Connect to the database */ if( rc==LSM_OK ){ - rc = lsmDbDatabaseFind(pDb, zFilename); + rc = lsmDbDatabaseConnect(pDb, zFilename); } - if( rc==LSM_OK ){ - rc = dbRecoverIfRequired(pDb); + /* Configure the file-system connection with the page-size and block-size + ** of this database. Even if the database file is zero bytes in size + ** on disk, these values have been set in shared-memory by now, and so are + ** guaranteed not to change during the lifetime of this connection. 
*/ + if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb)) ){ + lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot)); + lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot)); } lsmFree(pDb->pEnv, zFull); } @@ -262,52 +202,54 @@ ** This function flushes the contents of the in-memory tree to disk. It ** returns LSM_OK if successful, or an error code otherwise. */ int lsmFlushToDisk(lsm_db *pDb){ int rc = LSM_OK; /* Return code */ - int nLsmLevel; - int bOvfl; + int nOvfl = 0; /* Number of free-list entries in LSM */ /* Must not hold the worker snapshot when this is called. */ assert( pDb->pWorker==0 ); - dbWorkerStart(pDb); + rc = lsmBeginWork(pDb); /* Save the position of each open cursor belonging to pDb. */ - rc = lsmSaveCursors(pDb); + if( rc==LSM_OK ){ + rc = lsmSaveCursors(pDb); + } - bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel); if( rc==LSM_OK && pDb->bAutowork ){ rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT); - bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel); + } + while( rc==LSM_OK && lsmDatabaseFull(pDb) ){ + rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT); } /* Write the contents of the in-memory tree into the database file and ** update the worker snapshot accordingly. Then flush the contents of ** the db file to disk too. No calls to fsync() are made here - just ** write(). */ - if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl); -#if 0 - if( rc==LSM_OK && bAutowork ){ - assert( bOvfl==0 && nLsmLevel==0 ); - rc = lsmSortedAutoWork(pDb, LSM_AUTOWORK_QUANT); - bOvfl = lsmCheckpointOverflow(pDb, &nLsmLevel); - if( bOvfl && rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, nLsmLevel, bOvfl); - } -#endif - if( rc==LSM_OK ) rc = lsmSortedFlushDb(pDb); - - /* Create a new client snapshot - one that uses the new runs created above. 
*/ - if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsmLevel, bOvfl); + if( rc==LSM_OK ) rc = lsmSortedFlushTree(pDb, &nOvfl); + if( rc==LSM_OK ) lsmTreeClear(pDb); + + lsmFinishWork(pDb, 1, nOvfl, &rc); /* Restore the position of any open cursors */ - if( rc==LSM_OK ) rc = lsmRestoreCursors(pDb); + if( rc==LSM_OK && pDb->pCsr ){ + lsmFreeSnapshot(pDb->pEnv, pDb->pClient); + pDb->pClient = 0; + rc = lsmCheckpointLoad(pDb); + if( rc==LSM_OK ){ + rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient); + } + if( rc==LSM_OK ){ + rc = lsmRestoreCursors(pDb); + } + } #if 0 if( rc==LSM_OK ) lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "flush"); #endif - dbWorkerDone(pDb); return rc; } int lsm_close(lsm_db *pDb){ int rc = LSM_OK; @@ -314,11 +256,10 @@ if( pDb ){ assert_db_state(pDb); if( pDb->pCsr || pDb->nTransOpen ){ rc = LSM_MISUSE_BKPT; }else{ - assert( pDb->pWorker==0 && pDb->pTV==0 ); lsmDbDatabaseRelease(pDb); lsmFsClose(pDb->pFS); lsmFree(pDb->pEnv, pDb->aTrans); lsmFree(pDb->pEnv, pDb); } @@ -421,10 +362,32 @@ int *piVal = va_arg(ap, int *); if( *piVal>1 ) pDb->nMerge = *piVal; *piVal = pDb->nMerge; break; } + + case LSM_CONFIG_MAX_FREELIST: { + int *piVal = va_arg(ap, int *); + if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){ + pDb->nMaxFreelist = *piVal; + } + *piVal = pDb->nMaxFreelist; + break; + } + + case LSM_CONFIG_MULTIPLE_PROCESSES: { + int *piVal = va_arg(ap, int *); + if( pDb->pDatabase ){ + /* If lsm_open() has been called, this is a read-only parameter. + ** Set the output variable to true if this connection is currently + ** in multi-process mode. 
*/ + *piVal = lsmDbMultiProc(pDb); + }else{ + pDb->bMultiProc = *piVal = (*piVal!=0); + } + break; + } default: rc = LSM_MISUSE; break; } @@ -446,16 +409,19 @@ Level *pTopLevel = 0; /* Top level of snapshot to report on */ int rc = LSM_OK; Level *p; LsmString s; Snapshot *pWorker; /* Worker snapshot */ - Snapshot *pRelease = 0; /* Snapshot to release */ + int bUnlock = 0; /* Obtain the worker snapshot */ pWorker = pDb->pWorker; if( !pWorker ){ - pRelease = pWorker = lsmDbSnapshotWorker(pDb); + rc = lsmBeginWork(pDb); + if( rc!=LSM_OK ) return rc; + pWorker = pDb->pWorker; + bUnlock = 1; } /* Format the contents of the snapshot as text */ pTopLevel = lsmDbSnapshotLevel(pWorker); lsmStringInit(&s, pDb->pEnv); @@ -469,11 +435,14 @@ lsmStringAppend(&s, "}", 1); } rc = s.n>=0 ? LSM_OK : LSM_NOMEM; /* Release the snapshot and return */ - lsmDbSnapshotRelease(pDb->pEnv, pRelease); + if( bUnlock ){ + int rcdummy = LSM_BUSY; + lsmFinishWork(pDb, 0, 0, &rcdummy); + } *pzOut = s.z; return rc; } int lsm_info(lsm_db *pDb, int eParam, ...){ @@ -545,11 +514,10 @@ bCommit = 1; rc = lsm_begin(pDb, 1); } if( rc==LSM_OK ){ - assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) ); rc = lsmLogWrite(pDb, (void *)pKey, nKey, (void *)pVal, nVal); } lsmSortedSaveTreeCursors(pDb); @@ -562,14 +530,13 @@ if( nQuant>pDb->nTreeLimit ){ nQuant = pDb->nTreeLimit; } - nBefore = lsmTreeSize(pDb->pTV); + nBefore = lsmTreeSize(pDb); rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal); - nAfter = lsmTreeSize(pDb->pTV); - + nAfter = lsmTreeSize(pDb); nDiff = (nAfter/nQuant) - (nBefore/nQuant); if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){ rc = dbAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT); } } @@ -739,11 +706,11 @@ rc = lsmBeginWriteTrans(pDb); } if( rc==LSM_OK ){ for(i=pDb->nTransOpen; ipTV, &pDb->aTrans[i].tree); + lsmTreeMark(pDb, &pDb->aTrans[i].tree); lsmLogTell(pDb, &pDb->aTrans[i].log); } pDb->nTransOpen = iLevel; } } @@ -750,21 +717,25 @@ return rc; } int lsm_commit(lsm_db *pDb, int 
iLevel){ + int bFlush = 0; int rc = LSM_OK; assert_db_state( pDb ); /* A value less than zero means close the innermost nested transaction. */ if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1); if( iLevelnTransOpen ){ if( iLevel==0 ){ + /* Commit the transaction to disk. */ - if( pDb->pTV && lsmTreeSize(pDb->pTV)>pDb->nTreeLimit ){ + if( lsmTreeSize(pDb)>pDb->nTreeLimit ){ + lsmTreeEndTransaction(pDb, 1); + bFlush = 1; rc = lsmFlushToDisk(pDb); } if( rc==LSM_OK ) rc = lsmLogCommit(pDb); if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){ rc = lsmFsSyncLog(pDb->pFS); @@ -772,11 +743,15 @@ lsmFinishWriteTrans(pDb, (rc==LSM_OK)); } pDb->nTransOpen = iLevel; } + dbReleaseClientSnapshot(pDb); + if( pDb->bAutowork && bFlush && rc==LSM_OK ){ + rc = lsmCheckpointWrite(pDb); + } return rc; } int lsm_rollback(lsm_db *pDb, int iLevel){ int rc = LSM_OK; Index: src/lsm_mem.c ================================================================== --- src/lsm_mem.c +++ src/lsm_mem.c @@ -107,11 +107,10 @@ pRet = lsmReallocOrFree(pEnv, p, N); if( !pRet ) *pRc = LSM_NOMEM_BKPT; } return pRet; } - char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){ int nByte; char *zRet; nByte = strlen(zIn); Index: src/lsm_shared.c ================================================================== --- src/lsm_shared.c +++ src/lsm_shared.c @@ -13,37 +13,10 @@ ** Utilities used to help multiple LSM clients to coexist within the ** same process space. */ #include "lsmInt.h" -typedef struct Freelist Freelist; -typedef struct AppendList AppendList; -typedef struct FreelistEntry FreelistEntry; - -/* -** TODO: Find homes for these miscellaneous notes. -** -** FREE-LIST DELTA FORMAT -** -** The free-list delta consists of three integers: -** -** 1. The number of elements to remove from the start of the free-list. -** 2. If non-zero, a refreed block to append to the free-list. -** 3. Same as (2). 
-** -** SNAPSHOT ID MANIPULATIONS -** -** When the database is initialized the worker snapshot id is set to the -** value read from the checkpoint. Or, if there is no valid checkpoint, -** to a non-zero default value (e.g. 1). -** -** The client snapshot is then initialized as a copy of the worker. The -** client snapshot id is a copy of the worker snapshot id (as read from -** the checkpoint). The worker snapshot id is then incremented. -** -*/ - /* ** Global data. All global variables used by code in this file are grouped ** into the following structure instance. ** ** pDatabase: @@ -53,159 +26,37 @@ */ static struct SharedData { Database *pDatabase; /* Linked list of all Database objects */ } gShared; -/* -** An instance of the following structure stores the current database free -** block list. The free list is a list of blocks that are not currently -** used by the worker snapshot. Assocated with each block in the list is the -** snapshot id of the most recent snapshot that did actually use the block. -*/ -struct Freelist { - FreelistEntry *aEntry; /* Free list entries */ - int nEntry; /* Number of valid slots in aEntry[] */ - int nAlloc; /* Allocated size of aEntry[] */ -}; -struct FreelistEntry { - int iBlk; /* Block number */ - i64 iId; /* Largest snapshot id to use this block */ -}; - -struct AppendList { - Pgno *aPoint; - int nPoint; - int nAlloc; -}; - -/* -** A snapshot of a database. A snapshot contains all the information required -** to read or write a database file on disk. See the description of struct -** Database below for futher details. -** -** pExport/nExport: -** pExport points to a buffer containing the serialized (checkpoint) -** image of the snapshot. The serialized image is nExport bytes in size. 
-*/ -struct Snapshot { - Database *pDatabase; /* Database this snapshot belongs to */ - Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */ - i64 iId; /* Snapshot id */ - - /* Used by client snapshots only */ - void *pExport; /* Serialized snapshot image */ - int nExport; /* Size of pExport in bytes */ - int nRef; /* Number of references to this structure */ - Snapshot *pSnapshotNext; /* Next snapshot on this database */ -}; -#define LSM_INITIAL_SNAPSHOT_ID 11 - /* ** Database structure. There is one such structure for each distinct ** database accessed by this process. They are stored in the singly linked ** list starting at global variable gShared.pDatabase. Database objects are ** reference counted. Once the number of connections to the associated ** database drops to zero, they are removed from the linked list and deleted. -** -** The primary purpose of the Database structure is to manage Snapshots. A -** snapshot contains the information required to read a database - exactly -** where each array is stored, and where new arrays can be written. A -** database has one worker snapshot and any number of client snapshots. -** -** WORKER SNAPSHOT -** -** When a connection is first made to a database and the Database object -** created, the worker snapshot is initialized to the most recently -** checkpointed database state (based on the values in the db header). -** Any time the database file is written to, either to flush the contents -** of an in-memory tree or to merge existing segments, the worker snapshot -** is updated to reflect the modifications. -** -** The worker snapshot is protected by the worker mutex. The worker mutex -** must be obtained before a connection begins to modify the database -** file. After the db file is written, the worker snapshot is updated and -** the worker mutex released. -** -** CLIENT SNAPSHOTS -** -** Client snapshots are used by database clients (readers). 
When a -** transaction is opened, the client requests a pointer to a read-only -** client snapshot. It is relinquished when the transaction ends. Client -** snapshots are reference counted objects. -** -** When a database is first loaded, the client snapshot is a copy of -** the worker snapshot. Each time the worker snapshot is checkpointed, -** the client snapshot is updated with the new checkpointed contents. -** -** THE FREE-BLOCK LIST -** -** Each Database structure maintains a list of free blocks - the "free-list". -** There is an entry in the free-list for each block in the database file -** that is not used in any way by the worker snapshot. -** -** Associated with each free block in the free-list is a snapshot id. -** This is the id of the earliest snapshot that does not require the -** contents of the block. The block may therefore be reused only after: -** -** (a) a snapshot with an id equal to or greater than the id associated -** with the block has been checkpointed into the db header, and -** -** (b) all existing database clients are using a snapshot with an id -** equal to or greater than the id stored in the free-list entry. -** -** MULTI-THREADING ISSUES -** -** Each Database structure carries with it two mutexes - the client -** mutex and the worker mutex. In a multi-process version of LSM, these -** will be replaced by some other robust locking mechanism. -** -** TODO - this description. 
*/ struct Database { + /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */ char *zName; /* Canonical path to database file */ void *pId; /* Database id (file inode) */ int nId; /* Size of pId in bytes */ - - Tree *pTree; /* Current in-memory tree structure */ - DbLog log; /* Database log state object */ - int nPgsz; /* Nominal database page size */ - int nBlksz; /* Database block size */ - - Snapshot *pClient; /* Client (reader) snapshot */ - Snapshot worker; /* Worker (writer) snapshot */ - AppendList append; /* List of appendable points */ - - int nBlock; /* Number of blocks tracked by this ss */ - Freelist freelist; /* Database free-list */ - - u32 aDelta[LSM_FREELIST_DELTA_SIZE]; - int bRecordDelta; /* True when recording freelist delta */ - - lsm_mutex *pWorkerMutex; /* Protects the worker snapshot */ - lsm_mutex *pClientMutex; /* Protects pClient */ - int bDirty; /* True if worker has been modified */ - int bRecovered; /* True if db does not require recovery */ - - int bCheckpointer; /* True if there exists a checkpointer */ - int bWriter; /* True if there exists a writer */ - i64 iCheckpointId; /* Largest snapshot id stored in db file */ - int iSlot; /* Meta page containing iCheckpointId */ - - /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */ int nDbRef; /* Number of associated lsm_db handles */ Database *pDbNext; /* Next Database structure in global list */ + + /* Protected by the local mutex (pClientMutex) */ + lsm_file *pFile; /* Used for locks/shm in multi-proc mode */ + LsmFile *pLsmFile; /* List of deferred closes */ + lsm_mutex *pClientMutex; /* Protects the apShmChunk[] and pConn */ + int nShmChunk; /* Number of entries in apShmChunk[] array */ + void **apShmChunk; /* Array of "shared" memory regions */ + lsm_db *pConn; /* List of connections to this db. */ }; -/* -** Macro that evaluates to true if the snapshot passed as the only argument -** is a worker snapshot. 
-*/ -#define isWorker(pSnap) ((pSnap)==(&(pSnap)->pDatabase->worker)) - /* ** Functions to enter and leave the global mutex. This mutex is used -** to protect the global linked-list headed at +** to protect the global linked-list headed at gShared.pDatabase. */ static int enterGlobalMutex(lsm_env *pEnv){ lsm_mutex *p; int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p); if( rc==LSM_OK ) lsmMutexEnter(pEnv, p); @@ -227,72 +78,18 @@ int i; for(i=0; inEntry; i++){ assert( p->aEntry[i].iBlk!=iBlk ); } } -static void assertMustbeWorker(lsm_db *pDb){ - assert( pDb->pWorker ); - assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) ); -} -static void assertSnapshotListOk(Database *p){ - Snapshot *pIter; - i64 iPrev = 0; - - for(pIter=p->pClient; pIter; pIter=pIter->pSnapshotNext){ - assert( pIter==p->pClient || pIter->iIdiId; - } -} #else # define assertNotInFreelist(x,y) -# define assertMustbeWorker(x) -# define assertSnapshotListOk(x) #endif - -Pgno *lsmSharedAppendList(lsm_db *db, int *pnApp){ - Database *p = db->pDatabase; - assert( db->pWorker ); - *pnApp = p->append.nPoint; - return p->append.aPoint; -} - -int lsmSharedAppendListAdd(lsm_db *db, Pgno iPg){ - AppendList *pList; - assert( db->pWorker ); - pList = &db->pDatabase->append; - - assert( pList->nAlloc>=pList->nPoint ); - if( pList->nAlloc<=pList->nPoint ){ - int nNew = pList->nAlloc+8; - Pgno *aNew = (Pgno *)lsmRealloc(db->pEnv, pList->aPoint, sizeof(Pgno)*nNew); - if( aNew==0 ) return LSM_NOMEM_BKPT; - pList->aPoint = aNew; - pList->nAlloc = nNew; - } - - pList->aPoint[pList->nPoint++] = iPg; - return LSM_OK; -} - -void lsmSharedAppendListRemove(lsm_db *db, int iIdx){ - AppendList *pList; - int i; - assert( db->pWorker ); - pList = &db->pDatabase->append; - - assert( pList->nPoint>iIdx ); - for(i=iIdx+1; inPoint;i++){ - pList->aPoint[i-1] = pList->aPoint[i]; - } - pList->nPoint--; -} - /* ** Append an entry to the free-list. 
*/ -static int flAppendEntry(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){ +int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId){ /* Assert that this is not an attempt to insert a duplicate block number */ assertNotInFreelist(p, iBlk); /* Extend the space allocated for the freelist, if required */ @@ -314,10 +111,22 @@ p->aEntry[p->nEntry].iId = iId; p->nEntry++; return LSM_OK; } + +static int flInsertEntry(lsm_env *pEnv, Freelist *p, int iBlk){ + int rc; + + rc = lsmFreelistAppend(pEnv, p, iBlk, 1); + if( rc==LSM_OK ){ + memmove(&p->aEntry[1], &p->aEntry[0], sizeof(FreelistEntry)*(p->nEntry-1)); + p->aEntry[0].iBlk = iBlk; + p->aEntry[0].iId = 1; + } + return rc; +} /* ** Remove the first entry of the free-list. */ static void flRemoveEntry0(Freelist *p){ @@ -326,23 +135,115 @@ memmove(&p->aEntry[0], &p->aEntry[1], sizeof(FreelistEntry) * nNew); p->nEntry = nNew; } /* -** This function frees all resources held by the Database structure passed +** tHIS Function frees all resources held by the Database structure passed ** as the only argument. */ static void freeDatabase(lsm_env *pEnv, Database *p){ + assert( holdingGlobalMutex(pEnv) ); if( p ){ /* Free the mutexes */ lsmMutexDel(pEnv, p->pClientMutex); - lsmMutexDel(pEnv, p->pWorkerMutex); + + if( p->pFile ){ + lsmEnvClose(pEnv, p->pFile); + } /* Free the memory allocated for the Database struct itself */ lsmFree(pEnv, p); } } + +static void doDbDisconnect(lsm_db *pDb){ + int rc; + + /* Block for an exclusive lock on DMS1. This lock serializes all calls + ** to doDbConnect() and doDbDisconnect() across all processes. */ + rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1); + if( rc==LSM_OK ){ + + /* Try an exclusive lock on DMS2. If successful, this is the last + ** connection to the database. In this case flush the contents of the + ** in-memory tree to disk and write a checkpoint. 
*/ + rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0); + if( rc==LSM_OK ){ + /* Flush the in-memory tree, if required. If there is data to flush, + ** this will create a new client snapshot in Database.pClient. The + ** checkpoint (serialization) of this snapshot may be written to disk + ** by the following block. */ + rc = lsmTreeLoadHeader(pDb); + if( rc==LSM_OK && lsmTreeSize(pDb)>0 ){ + rc = lsmFlushToDisk(pDb); + } + + /* Write a checkpoint to disk. */ + if( rc==LSM_OK ){ + rc = lsmCheckpointWrite(pDb); + } + + /* If the checkpoint was written successfully, delete the log file */ + if( rc==LSM_OK && pDb->pFS ){ + Database *p = pDb->pDatabase; + lsmFsCloseAndDeleteLog(pDb->pFS); + if( p->pFile ) lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1); + } + } + } + + lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0); + lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); + pDb->pShmhdr = 0; +} + +static int doDbConnect(lsm_db *pDb){ + int rc; + + /* Obtain a pointer to the shared-memory header */ + assert( pDb->pShmhdr==0 ); + rc = lsmShmChunk(pDb, 0, (void **)&pDb->pShmhdr); + if( rc!=LSM_OK ) return rc; + + /* Block for an exclusive lock on DMS1. This lock serializes all calls + ** to doDbConnect() and doDbDisconnect() across all processes. */ + rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1); + if( rc!=LSM_OK ){ + pDb->pShmhdr = 0; + return rc; + } + + /* Try an exclusive lock on DMS2. If successful, this is the first and + ** only connection to the database. In this case initialize the + ** shared-memory and run log file recovery. */ + rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_EXCL, 0); + if( rc==LSM_OK ){ + memset(pDb->pShmhdr, 0, sizeof(ShmHeader)); + rc = lsmCheckpointRecover(pDb); + if( rc==LSM_OK ){ + rc = lsmLogRecover(pDb); + } + }else if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + + /* Take a shared lock on DMS2. This lock "cannot" fail, as connections + ** may only hold an exclusive lock on DMS2 if they first hold an exclusive + ** lock on DMS1. 
And this connection is currently holding the exclusive + ** lock on DSM1. */ + if( rc==LSM_OK ){ + rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0); + } + + /* If anything went wrong, unlock DMS2. Unlock DMS1 in any case. */ + if( rc!=LSM_OK ){ + lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0); + pDb->pShmhdr = 0; + } + lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); + return rc; +} /* ** Return a reference to the shared Database handle for the database ** identified by canonical path zName. If this is the first connection to ** the named database, a new Database object is allocated. Otherwise, a @@ -353,11 +254,11 @@ ** and and LSM error code returned. ** ** Each successful call to this function should be (eventually) matched ** by a call to lsmDbDatabaseRelease(). */ -int lsmDbDatabaseFind( +int lsmDbDatabaseConnect( lsm_db *pDb, /* Database handle */ const char *zName /* Path to db file */ ){ lsm_env *pEnv = pDb->pEnv; int rc; /* Return code */ @@ -382,19 +283,13 @@ /* If no suitable Database object was found, allocate a new one. */ if( p==0 ){ int nName = strlen(zName); p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nId+nName+1, &rc); - /* Initialize the log handle */ - if( rc==LSM_OK ){ - p->log.cksum0 = LSM_CKSUM0_INIT; - p->log.cksum1 = LSM_CKSUM1_INIT; - } - - /* Allocate the two mutexes */ - if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pWorkerMutex); + /* Allocate the mutex */ if( rc==LSM_OK ) rc = lsmMutexNew(pEnv, &p->pClientMutex); + /* If no error has occurred, fill in other fields and link the new ** Database structure into the global list starting at ** gShared.pDatabase. 
Otherwise, if an error has occurred, free any ** resources allocated and return without linking anything new into @@ -403,95 +298,91 @@ p->zName = (char *)&p[1]; memcpy((void *)p->zName, zName, nName+1); p->pId = (void *)&p->zName[nName+1]; memcpy(p->pId, pId, nId); p->nId = nId; - p->worker.pDatabase = p; p->pDbNext = gShared.pDatabase; gShared.pDatabase = p; - p->worker.iId = LSM_INITIAL_SNAPSHOT_ID; - p->nPgsz = pDb->nDfltPgsz; - p->nBlksz = pDb->nDfltBlksz; - }else{ + } + + /* If running in multi-process mode, open the shared fd */ + if( rc==LSM_OK && pDb->bMultiProc ){ + rc = lsmEnvOpen(pDb->pEnv, p->zName, &p->pFile); + } + + if( rc!=LSM_OK ){ freeDatabase(pEnv, p); p = 0; } } if( p ) p->nDbRef++; leaveGlobalMutex(pEnv); + + if( p ){ + lsmMutexEnter(pDb->pEnv, p->pClientMutex); + pDb->pNext = p->pConn; + p->pConn = pDb; + lsmMutexLeave(pDb->pEnv, p->pClientMutex); + } } lsmFree(pEnv, pId); pDb->pDatabase = p; + + if( rc==LSM_OK ){ + rc = doDbConnect(pDb); + } + return rc; } - -static void freeClientSnapshot(lsm_env *pEnv, Snapshot *p){ - Level *pLevel; - - assert( p->nRef==0 ); - for(pLevel=p->pLevel; pLevel; pLevel=pLevel->pNext){ - lsmFree(pEnv, pLevel->pSplitKey); - } - lsmFree(pEnv, p->pExport); - lsmFree(pEnv, p); -} - /* -** Release a reference to a Database object obtained from lsmDbDatabaseFind(). -** There should be exactly one call to this function for each successful -** call to Find(). +** Release a reference to a Database object obtained from +** lsmDbDatabaseConnect(). There should be exactly one call to this function +** for each successful call to Find(). 
*/ void lsmDbDatabaseRelease(lsm_db *pDb){ Database *p = pDb->pDatabase; if( p ){ + lsm_db **ppDb; + + if( pDb->pShmhdr ){ + doDbDisconnect(pDb); + } + + lsmMutexEnter(pDb->pEnv, p->pClientMutex); + for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext)); + *ppDb = pDb->pNext; + lsmMutexLeave(pDb->pEnv, p->pClientMutex); + enterGlobalMutex(pDb->pEnv); p->nDbRef--; if( p->nDbRef==0 ){ - int rc = LSM_OK; Database **pp; /* Remove the Database structure from the linked list. */ for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext)); *pp = p->pDbNext; - /* Flush the in-memory tree, if required. If there is data to flush, - ** this will create a new client snapshot in Database.pClient. The - ** checkpoint (serialization) of this snapshot may be written to disk - ** by the following block. */ - if( p->bDirty || 0==lsmTreeIsEmpty(p->pTree) ){ - rc = lsmFlushToDisk(pDb); - } - - /* Write a checkpoint, also if required */ - if( rc==LSM_OK && p->pClient ){ - rc = lsmCheckpointWrite(pDb); - } - - /* If the checkpoint was written successfully, delete the log file */ - if( rc==LSM_OK && pDb->pFS ){ - lsmFsCloseAndDeleteLog(pDb->pFS); - } - - /* Free the in-memory tree object */ - lsmTreeRelease(pDb->pEnv, p->pTree); - - /* Free the contents of the worker snapshot */ - lsmSortedFreeLevel(pDb->pEnv, p->worker.pLevel); - lsmFree(pDb->pEnv, p->freelist.aEntry); - lsmFree(pDb->pEnv, p->append.aPoint); - - /* Free the client snapshot */ - if( p->pClient ){ - assert( p->pClient->nRef==1 ); - p->pClient->nRef = 0; - freeClientSnapshot(pDb->pEnv, p->pClient); - } - + /* Free the Database object and shared memory buffers. 
+ */ + if( p->pFile==0 ){ + int i; + for(i=0; i<p->nShmChunk; i++){ + lsmFree(pDb->pEnv, p->apShmChunk[i]); + } + }else{ + LsmFile *pIter; + LsmFile *pNext; + for(pIter=p->pLsmFile; pIter; pIter=pNext){ + pNext = pIter->pNext; + lsmEnvClose(pDb->pEnv, pIter->pFile); + lsmFree(pDb->pEnv, pIter); + } + } + lsmFree(pDb->pEnv, p->apShmChunk); freeDatabase(pDb->pEnv, p); } leaveGlobalMutex(pDb->pEnv); } } @@ -499,256 +390,13 @@ Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){ return pSnapshot->pLevel; } void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){ - assert( isWorker(pSnap) ); pSnap->pLevel = pLevel; } -void lsmDatabaseDirty(lsm_db *pDb){ - Database *p = pDb->pDatabase; - assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) ); - if( p->bDirty==0 ){ - p->worker.iId++; - p->bDirty = 1; - } -} - -int lsmDatabaseIsDirty(lsm_db *pDb){ - Database *p = pDb->pDatabase; - assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) ); - return p->bDirty; -} - -/* -** Get/set methods for the snapshot block-count. These should only be -** used with worker snapshots. -*/ -void lsmSnapshotSetNBlock(Snapshot *pSnap, int nNew){ - assert( isWorker(pSnap) ); - pSnap->pDatabase->nBlock = nNew; -} -int lsmSnapshotGetNBlock(Snapshot *pSnap){ - assert( isWorker(pSnap) ); - return pSnap->pDatabase->nBlock; -} - -void lsmSnapshotSetCkptid(Snapshot *pSnap, i64 iNew){ - assert( isWorker(pSnap) ); - pSnap->iId = iNew; -} - -/* -** Return a pointer to the client snapshot object. Each successful call -** to lsmDbSnapshotClient() must be matched by an lsmDbSnapshotRelease() -** call. -*/ -#if 0 -Snapshot *lsmDbSnapshotClient(lsm_db *pDb){ - Database *p = pDb->pDatabase; - Snapshot *pRet; - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - pRet = p->pClient; - pRet->nRef++; - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - return pRet; -} -#endif - -/* -** Return a pointer to the worker snapshot. This call grabs the worker -** mutex.
It is released when the pointer to the worker snapshot is passed -** to lsmDbSnapshotRelease(). -*/ -Snapshot *lsmDbSnapshotWorker(lsm_db *pDb){ - Database *p = pDb->pDatabase; - lsmMutexEnter(pDb->pEnv, p->pWorkerMutex); - return &p->worker; -} - -Snapshot *lsmDbSnapshotRecover(lsm_db *pDb){ - Database *p = pDb->pDatabase; - Snapshot *pRet = 0; - lsmMutexEnter(pDb->pEnv, p->pWorkerMutex); - if( p->bRecovered ){ - lsmFsSetPageSize(pDb->pFS, p->nPgsz); - lsmFsSetBlockSize(pDb->pFS, p->nBlksz); - lsmMutexLeave(pDb->pEnv, p->pWorkerMutex); - }else{ - pRet = &p->worker; - } - return pRet; -} - -/* -** Set (bVal==1) or clear (bVal==0) the "recovery done" flag. -** -** TODO: Should this be combined with BeginRecovery()/FinishRecovery()? -*/ -void lsmDbRecoveryComplete(lsm_db *pDb, int iSlot){ - Database *p = pDb->pDatabase; - - assert( iSlot==0 || iSlot==1 || iSlot==2 ); - assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) ); - assert( p->pTree ); - - p->bRecovered = 1; - p->iCheckpointId = p->worker.iId; - p->iSlot = iSlot; - lsmFsSetPageSize(pDb->pFS, p->nPgsz); - lsmFsSetBlockSize(pDb->pFS, p->nBlksz); -} - -void lsmDbSetPagesize(lsm_db *pDb, int nPgsz, int nBlksz){ - Database *p = pDb->pDatabase; - assert( lsmMutexHeld(pDb->pEnv, p->pWorkerMutex) && p->bRecovered==0 ); - p->nPgsz = nPgsz; - p->nBlksz = nBlksz; - lsmFsSetPageSize(pDb->pFS, p->nPgsz); - lsmFsSetBlockSize(pDb->pFS, p->nBlksz); -} - -static void snapshotDecrRefcnt(lsm_env *pEnv, Snapshot *pSnap){ - Database *p = pSnap->pDatabase; - - assertSnapshotListOk(p); - pSnap->nRef--; - assert( pSnap->nRef>=0 ); - if( pSnap->nRef==0 ){ - Snapshot *pIter = p->pClient; - assert( pSnap!=pIter ); - while( pIter->pSnapshotNext!=pSnap ) pIter = pIter->pSnapshotNext; - pIter->pSnapshotNext = pSnap->pSnapshotNext; - freeClientSnapshot(pEnv, pSnap); - assertSnapshotListOk(p); - } -} - -/* -** Release a snapshot reference obtained by calling lsmDbSnapshotWorker() -** or lsmDbSnapshotClient(). 
-*/ -void lsmDbSnapshotRelease(lsm_env *pEnv, Snapshot *pSnap){ - if( pSnap ){ - Database *p = pSnap->pDatabase; - - /* If this call is to release a pointer to the worker snapshot, relinquish - ** the worker mutex. - ** - ** If pSnap is a client snapshot, decrement the reference count. When the - ** reference count reaches zero, free the snapshot object. The decrement - ** and (nRef==0) test are protected by the database client mutex. - */ - if( isWorker(pSnap) ){ - lsmMutexLeave(pEnv, p->pWorkerMutex); - }else{ - lsmMutexEnter(pEnv, p->pClientMutex); - snapshotDecrRefcnt(pEnv, pSnap); - lsmMutexLeave(pEnv, p->pClientMutex); - } - } -} - -/* -** Create a new client snapshot based on the current contents of the worker -** snapshot. The connection must be the worker to call this function. -*/ -int lsmDbUpdateClient(lsm_db *pDb, int nLsmLevel, int bOvfl){ - Database *p = pDb->pDatabase; /* Database handle */ - Snapshot *pOld; /* Old client snapshot object */ - Snapshot *pNew; /* New client snapshot object */ - int nByte; /* Memory required for new client snapshot */ - int rc = LSM_OK; /* Memory required for new client snapshot */ - int nLevel = 0; /* Number of levels in worker snapshot */ - int nRight = 0; /* Total number of rhs in worker */ - int nKeySpace = 0; /* Total size of split keys */ - Level *pLevel; /* Used to iterate through worker levels */ - Level **ppLink; /* Used to link levels together */ - u8 *pAvail; /* Used to divide up allocation */ - - /* Must be the worker to call this. */ - assertMustbeWorker(pDb); - - /* Allocate space for the client snapshot and all levels. 
*/ - for(pLevel=p->worker.pLevel; pLevel; pLevel=pLevel->pNext){ - nLevel++; - nRight += pLevel->nRight; - } - nByte = sizeof(Snapshot) - + nLevel * sizeof(Level) - + nRight * sizeof(Segment) - + nKeySpace; - pNew = (Snapshot *)lsmMallocZero(pDb->pEnv, nByte); - if( !pNew ) return LSM_NOMEM_BKPT; - pNew->pDatabase = p; - pNew->iId = p->worker.iId; - - /* Copy the linked-list of Level structures */ - pAvail = (u8 *)&pNew[1]; - ppLink = &pNew->pLevel; - for(pLevel=p->worker.pLevel; pLevel && rc==LSM_OK; pLevel=pLevel->pNext){ - Level *pNew; - - pNew = (Level *)pAvail; - memcpy(pNew, pLevel, sizeof(Level)); - pAvail += sizeof(Level); - - if( pNew->nRight ){ - pNew->aRhs = (Segment *)pAvail; - memcpy(pNew->aRhs, pLevel->aRhs, sizeof(Segment) * pNew->nRight); - pAvail += (sizeof(Segment) * pNew->nRight); - lsmSortedSplitkey(pDb, pNew, &rc); - } - - /* This needs to come after any call to lsmSortedSplitkey(). Splitkey() - ** uses data within the Merge object to set pNew->pSplitKey and co. */ - pNew->pMerge = 0; - - *ppLink = pNew; - ppLink = &pNew->pNext; - } - - /* Create the serialized version of the new client snapshot. */ - if( p->bDirty && rc==LSM_OK ){ - assert( nLevel>nLsmLevel || p->worker.pLevel==0 ); - rc = lsmCheckpointExport( - pDb, nLsmLevel, bOvfl, pNew->iId, 1, &pNew->pExport, &pNew->nExport - ); - } - - if( rc==LSM_OK ){ - /* Initialize the new snapshot ref-count to 1 */ - pNew->nRef = 1; - - lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient); - - /* Install the new client snapshot and release the old. */ - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - assertSnapshotListOk(p); - pOld = p->pClient; - pNew->pSnapshotNext = pOld; - p->pClient = pNew; - assertSnapshotListOk(p); - if( pDb->pClient ){ - pDb->pClient = pNew; - pNew->nRef++; - } - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - - lsmDbSnapshotRelease(pDb->pEnv, pOld); - p->bDirty = 0; - - /* Upgrade the user connection to the new client snapshot */ - - }else{ - /* An error has occurred. 
Delete the allocated object. */ - freeClientSnapshot(pDb->pEnv, pNew); - } - - return rc; -} /* ** Allocate a new database file block to write data to, either by extending ** the database file or by recycling a free-list entry. The worker snapshot ** must be held in order to call this function. @@ -755,61 +403,48 @@ ** ** If successful, *piBlk is set to the block number allocated and LSM_OK is ** returned. Otherwise, *piBlk is zeroed and an lsm error code returned. */ int lsmBlockAllocate(lsm_db *pDb, int *piBlk){ - Database *p = pDb->pDatabase; + Snapshot *p = pDb->pWorker; Freelist *pFree; /* Database free list */ int iRet = 0; /* Block number of allocated block */ + int rc = LSM_OK; + + assert( pDb->pWorker ); pFree = &p->freelist; - if( pFree->nEntry>0 ){ /* The first block on the free list was freed as part of the work done ** to create the snapshot with id iFree. So, we can reuse this block if ** snapshot iFree or later has been checkpointed and all currently - ** active clients are reading from snapshot iFree or later. - */ - Snapshot *pIter; + ** active clients are reading from snapshot iFree or later. */ i64 iFree = pFree->aEntry[0].iId; - i64 iInUse; - - /* Both Database.iCheckpointId and the Database.pClient list are - ** protected by the client mutex. So grab it here before determining - ** the id of the oldest snapshot still potentially in use. 
*/ - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - assertSnapshotListOk(p); - for(pIter=p->pClient; pIter->pSnapshotNext; pIter=pIter->pSnapshotNext); - iInUse = LSM_MIN(pIter->iId, p->iCheckpointId); - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - - if( 0 ){ - int i; - printf("choose from freelist: "); - for(i=0; inEntry && pFree->aEntry[i].iId<=iInUse; i++){ - printf("%d ", pFree->aEntry[i].iBlk); - } - printf("\n"); - fflush(stdout); - } - - - if( iFree<=iInUse ){ + int bInUse = 0; + + /* The "is in use" bit */ + rc = lsmLsmInUse(pDb, iFree, &bInUse); + + /* The "has been checkpointed" bit */ + if( rc==LSM_OK && bInUse==0 ){ + i64 iId = 0; + rc = lsmCheckpointSynced(pDb, &iId); + if( rc!=LSM_OK || iIdaEntry[0].iBlk; flRemoveEntry0(pFree); assert( iRet!=0 ); - if( p->bRecordDelta ){ - p->aDelta[0]++; - } } } /* If no block was allocated from the free-list, allocate one at the ** end of the file. */ - if( iRet==0 ){ - p->nBlock++; - iRet = p->nBlock; + if( rc==LSM_OK && iRet==0 ){ + iRet = ++pDb->pWorker->nBlock; } *piBlk = iRet; return LSM_OK; } @@ -820,20 +455,16 @@ ** ** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. ** LSM_NOMEM). */ int lsmBlockFree(lsm_db *pDb, int iBlk){ - Database *p = pDb->pDatabase; - Snapshot *pWorker = pDb->pWorker; - int rc = LSM_OK; - - assertMustbeWorker(pDb); - assert( p->bRecordDelta==0 ); - assert( pDb->pDatabase->bDirty ); - - rc = flAppendEntry(pDb->pEnv, &p->freelist, iBlk, pWorker->iId); - return rc; + Snapshot *p = pDb->pWorker; + + assert( lsmShmAssertWorker(pDb) ); + /* TODO: Should assert() that lsmCheckpointOverflow() has not been called */ + + return lsmFreelistAppend(pDb->pEnv, &p->freelist, iBlk, p->iId); } /* ** Refree a database block. The worker snapshot must be held in order to call ** this function. @@ -844,278 +475,201 @@ ** block may be reused immediately. Whereas a freed block can not be reused ** until (at least) after the next checkpoint. 
*/ int lsmBlockRefree(lsm_db *pDb, int iBlk){ int rc = LSM_OK; /* Return code */ - Database *p = pDb->pDatabase; + Snapshot *p = pDb->pWorker; if( iBlk==p->nBlock ){ p->nBlock--; - }else if( p->bRecordDelta ){ - assert( p->aDelta[2]==0 ); - p->aDelta[1 + (p->aDelta[1]!=0)] = iBlk; }else{ - rc = flAppendEntry(pDb->pEnv, &p->freelist, iBlk, 0); - } - - return rc; -} - -void lsmFreelistDeltaBegin(lsm_db *pDb){ - Database *p = pDb->pDatabase; - assertMustbeWorker(pDb); - assert( p->bRecordDelta==0 ); - memset(p->aDelta, 0, sizeof(p->aDelta)); - p->bRecordDelta = 1; -} - -void lsmFreelistDeltaEnd(lsm_db *pDb){ - Database *p = pDb->pDatabase; - assertMustbeWorker(pDb); - p->bRecordDelta = 0; -} - -void lsmFreelistDelta( - lsm_db *pDb, /* Database handle */ - u32 *aDeltaOut /* OUT: Copy free-list delta here */ -){ - Database *p = pDb->pDatabase; - assertMustbeWorker(pDb); - assert( sizeof(p->aDelta)==(sizeof(u32)*LSM_FREELIST_DELTA_SIZE) ); - memcpy(aDeltaOut, p->aDelta, sizeof(p->aDelta)); -} - -u32 *lsmFreelistDeltaPtr(lsm_db *pDb){ - return pDb->pDatabase->aDelta; -} - -/* -** Return the current contents of the free-list as a list of integers. 
-*/ -int lsmSnapshotFreelist(lsm_db *pDb, int **paFree, int *pnFree){ - int rc = LSM_OK; /* Return Code */ - int *aFree = 0; /* Integer array to return via *paFree */ - int nFree; /* Value to return via *pnFree */ - Freelist *p; /* Database free list object */ - - assert( pDb->pWorker ); - p = &pDb->pDatabase->freelist; - nFree = p->nEntry; - if( nFree && paFree ){ - aFree = lsmMallocRc(pDb->pEnv, sizeof(int) * nFree, &rc); - if( aFree ){ - int i; - for(i=0; iaEntry[i].iBlk; - } - } - } - - *pnFree = nFree; - if( paFree ) *paFree = aFree; - return rc; -} - - -int lsmSnapshotSetFreelist(lsm_db *pDb, int *aElem, int nElem){ - Database *p = pDb->pDatabase; - lsm_env *pEnv = pDb->pEnv; - int rc = LSM_OK; /* Return code */ - int i; /* Iterator variable */ - int nIgnore; /* Number of entries to ignore */ - int iRefree1; /* A refreed block (or 0) */ - int iRefree2; /* A refreed block (or 0) */ - Freelist *pFree; /* Database free-list */ - - nIgnore = p->aDelta[0]; - iRefree1 = p->aDelta[1]; - iRefree2 = p->aDelta[2]; - - pFree = &p->freelist; - for(i=nIgnore; rc==LSM_OK && ipEnv, &p->freelist, iBlk); + } + + return rc; +} + +/* +** If required, copy a database checkpoint from shared memory into the +** database itself. +** +** The WORKER lock must not be held when this is called. This is because +** this function may indirectly call fsync(). And the WORKER lock should ** not be held that long (in case it is required by a client flushing an ** in-memory tree to disk). */ int lsmCheckpointWrite(lsm_db *pDb){ - Snapshot *pSnap; /* Snapshot to checkpoint */ - Database *p = pDb->pDatabase; - int rc = LSM_OK; /* Return Code */ + int rc; /* Return Code */ assert( pDb->pWorker==0 ); - - /* Try to obtain the checkpointer lock, then check if the a checkpoint - ** is actually required. If successful, and one is, set stack variable - ** pSnap to point to the client snapshot to checkpoint. 
- */ - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - pSnap = p->pClient; - if( pSnap->pExport && p->bCheckpointer==0 && pSnap->iId>p->iCheckpointId ){ - p->bCheckpointer = 1; - pSnap->nRef++; - }else{ - pSnap = 0; - } - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - - /* Attempt to grab the checkpoint mutex. If the attempt fails, this - ** function becomes a no-op. Some other thread is already running - ** a checkpoint (or at least checking if one is required). */ - if( pSnap ){ - FileSystem *pFS = pDb->pFS; /* File system object */ - int iPg = 1+(p->iSlot%2); /* Meta page to write to */ - MetaPage *pPg = 0; /* Page to write to */ - int doSync; /* True to sync the db */ - - /* If the safety mode is "off", omit calls to xSync(). */ - doSync = (pDb->eSafety!=LSM_SAFETY_OFF); - - /* Sync the db. To make sure all runs referred to by the checkpoint - ** are safely on disk. If we do not do this and a power failure occurs - ** just after the checkpoint is written into the db header, the - ** database could be corrupted following recovery. */ - if( doSync ) rc = lsmFsSyncDb(pFS); - - /* Fetch a reference to the meta-page to write the checkpoint to. */ - if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pFS, 1, iPg, &pPg); - - /* Unless an error has occurred, copy the checkpoint blob into the - ** meta-page, then release the reference to it (which will flush the - ** checkpoint into the file). */ - if( rc!=LSM_OK ){ - lsmFsMetaPageRelease(pPg); - }else{ - u8 *aData; /* Page buffer */ - int nData; /* Size of buffer aData[] */ - aData = lsmFsMetaPageData(pPg, &nData); - assert( pSnap->nExport<=nData ); - memcpy(aData, pSnap->pExport, pSnap->nExport); - rc = lsmFsMetaPageRelease(pPg); - pPg = 0; - } - - /* Sync the db file again. To make sure that the checkpoint just - ** written is on the disk. */ - if( rc==LSM_OK && doSync ) rc = lsmFsSyncDb(pFS); - - /* This is where space on disk is reclaimed. 
Now that the checkpoint - ** has been written to the database and synced, part of the database - ** log (the part containing the data just synced to disk) is no longer - ** required and so the space that it was taking up on disk can be - ** reused. - ** - ** It is also possible that database file blocks may be made available - ** for reuse here. A database file block is free if it is not used by - ** the most recently checkpointed snapshot, or by a snapshot that is - ** in use by any existing database client. And "the most recently - ** checkpointed snapshot" has just changed. - */ - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - if( rc==LSM_OK ){ - lsmLogCheckpoint(pDb, &p->log, lsmCheckpointLogOffset(pSnap->pExport)); - p->iCheckpointId = pSnap->iId; - p->iSlot = iPg; - } - p->bCheckpointer = 0; - snapshotDecrRefcnt(pDb->pEnv, pSnap); - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - } - - return rc; -} - -/* -** This function is called when a connection is about to run log file -** recovery (read the contents of the log file from disk and create a new -** in memory tree from it). This happens when the very first connection -** starts up and connects to the database. -** -** This sets the connections tree-version handle to one suitable to insert -** the read data into. -** -** Once recovery is complete (regardless of whether or not it is successful), -** lsmFinishRecovery() must be called to release resources locked by -** this function. 
-*/ -int lsmBeginRecovery(lsm_db *pDb){ - int rc; /* Return code */ - Database *p = pDb->pDatabase; /* Shared data handle */ - - assert( p && p->pTree==0 ); - assert( pDb->pWorker ); - assert( pDb->pClient==0 ); - assert( pDb->pTV==0 ); - assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) ); - - rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree); - if( rc==LSM_OK ){ - assert( pDb->pTV==0 ); - rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV); - } - return rc; -} + assert( 1 || pDb->pClient==0 ); + assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) ); + + rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0); + if( rc!=LSM_OK ) return rc; + + rc = lsmCheckpointLoad(pDb); + if( rc==LSM_OK ){ + ShmHeader *pShm = pDb->pShmhdr; + int bDone = 0; /* True if checkpoint is already stored */ + + /* Check if this checkpoint has already been written to the database + ** file. If so, set variable bDone to true. */ + if( pShm->iMetaPage ){ + MetaPage *pPg; /* Meta page */ + u8 *aData; /* Meta-page data buffer */ + int nData; /* Size of aData[] in bytes */ + i64 iCkpt; /* Id of checkpoint just loaded */ + i64 iDisk; /* Id of checkpoint already stored in db */ + iCkpt = lsmCheckpointId(pDb->aSnapshot, 0); + rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg); + if( rc==LSM_OK ){ + aData = lsmFsMetaPageData(pPg, &nData); + iDisk = lsmCheckpointId((u32 *)aData, 1); + lsmFsMetaPageRelease(pPg); + } + bDone = (iDisk>=iCkpt); + } + + if( rc==LSM_OK && bDone==0 ){ + int iMeta = (pShm->iMetaPage % 2) + 1; + rc = lsmFsSyncDb(pDb->pFS); + if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta); + if( rc==LSM_OK ) rc = lsmFsSyncDb(pDb->pFS); + if( rc==LSM_OK ) pShm->iMetaPage = iMeta; + } + } + + /* If no error has occurred, then the snapshot currently in pDb->aSnapshot + ** has been synced to disk. This means it may be possible to wrap the + ** log file. Obtain the WRITER lock and update the relevant tree-header + ** fields to reflect this.
+ */ + if( rc==LSM_OK ){ + u64 iLogoff = lsmCheckpointLogOffset(pDb->aSnapshot); + if( pDb->nTransOpen==0 ){ + rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); + } + if( rc==LSM_OK ){ + rc = lsmTreeLoadHeader(pDb); + if( rc==LSM_OK ) lsmLogCheckpoint(pDb, iLogoff); + if( rc==LSM_OK ) lsmTreeEndTransaction(pDb, 1); + if( rc==LSM_BUSY ) rc = LSM_OK; + if( pDb->nTransOpen==0 ){ + rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); + } + } + if( rc==LSM_BUSY ) rc = LSM_OK; + } + + lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0); + return rc; +} + +int lsmBeginWork(lsm_db *pDb){ + int rc; + + /* Attempt to take the WORKER lock */ + rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0); + + /* Deserialize the current worker snapshot */ + if( rc==LSM_OK ){ + rc = lsmCheckpointLoadWorker(pDb); + if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase; + } + return rc; +} + +void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){ + if( p ){ + lsmSortedFreeLevel(pEnv, p->pLevel); + lsmFree(pEnv, p->freelist.aEntry); + lsmFree(pEnv, p); + } +} + +/* +** Argument bFlush is true if the contents of the in-memory tree has just +** been flushed to disk. The significance of this is that once the snapshot +** created to hold the updated state of the database is synced to disk, log +** file space can be recycled. +*/ +void lsmFinishWork(lsm_db *pDb, int bFlush, int nOvfl, int *pRc){ + /* If no error has occurred, serialize the worker snapshot and write + ** it to shared memory. */ + if( *pRc==LSM_OK ){ + *pRc = lsmCheckpointSaveWorker(pDb, bFlush, nOvfl); + } + + if( pDb->pWorker ){ + lsmFreeSnapshot(pDb->pEnv, pDb->pWorker); + pDb->pWorker = 0; + } + + lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0); +} + /* ** Called when recovery is finished. 
*/ int lsmFinishRecovery(lsm_db *pDb){ - int rc; - assert( pDb->pWorker ); - assert( pDb->pClient==0 ); - assert( lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pWorkerMutex) ); - rc = lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0); - pDb->pTV = 0; - return rc; + lsmTreeEndTransaction(pDb, 1); + return LSM_OK; } /* ** Begin a read transaction. This function is a no-op if the connection ** passed as the only argument already has an open read transaction. */ int lsmBeginReadTrans(lsm_db *pDb){ + const int MAX_READLOCK_ATTEMPTS = 5; int rc = LSM_OK; /* Return code */ + int iAttempt = 0; - /* No reason a worker connection should be opening a read-transaction. */ assert( pDb->pWorker==0 ); + assert( (pDb->pClient!=0)==(pDb->iReader>=0) ); - if( pDb->pClient==0 ){ - Database *p = pDb->pDatabase; - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - + while( rc==LSM_OK && pDb->pClient==0 && (iAttempt++)pCsr==0 && pDb->nTransOpen==0 ); - /* If there is no in-memory tree structure, allocate one now */ - if( p->pTree==0 ){ - rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree); + /* Load the in-memory tree header. */ + rc = lsmTreeLoadHeader(pDb); + + /* Load the database snapshot */ + if( rc==LSM_OK ){ + rc = lsmCheckpointLoad(pDb); } + /* Take a read-lock on the tree and snapshot just loaded. Then check + ** that the shared-memory still contains the same values. If so, proceed. + ** Otherwise, relinquish the read-lock and retry the whole procedure + ** (starting with loading the in-memory tree header). 
*/ if( rc==LSM_OK ){ - /* Set the connections client database file snapshot */ - p->pClient->nRef++; - pDb->pClient = p->pClient; - - /* Set the connections tree-version handle */ - assert( pDb->pTV==0 ); - pDb->pTV = lsmTreeReadVersion(p->pTree); - assert( pDb->pTV!=0 ); - } - - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - } + ShmHeader *pShm = pDb->pShmhdr; + i64 iTree = pDb->treehdr.iTreeId; + i64 iSnap = lsmCheckpointId(pDb->aSnapshot, 0); + rc = lsmReadlock(pDb, iSnap, iTree); + if( rc==LSM_OK ){ + if( (i64)pShm->hdr1.iTreeId==iTree + && pShm->hdr1.iTransId==pDb->treehdr.iTransId + && lsmCheckpointId(pShm->aClient, 0)==iSnap + ){ + /* Read lock has been successfully obtained. Deserialize the + ** checkpoint just loaded. TODO: This will be removed after + ** lsm_sorted.c is changed to work directly from the serialized + ** version of the snapshot. */ + rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot, &pDb->pClient); + assert( (rc==LSM_OK)==(pDb->pClient!=0) ); + }else{ + rc = lsmReleaseReadlock(pDb); + } + } + if( rc==LSM_BUSY ) rc = LSM_OK; + } + } + if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY; return rc; } /* @@ -1124,79 +678,67 @@ void lsmFinishReadTrans(lsm_db *pDb){ Snapshot *pClient = pDb->pClient; /* Worker connections should not be closing read transactions. And ** read transactions should only be closed after all cursors and write - ** transactions have been closed. */ + ** transactions have been closed. Finally pClient should be non-NULL + ** only iff pDb->iReader>=0. 
*/ assert( pDb->pWorker==0 ); assert( pDb->pCsr==0 && pDb->nTransOpen==0 ); if( pClient ){ - Database *p = pDb->pDatabase; - - lsmDbSnapshotRelease(pDb->pEnv, pDb->pClient); + lsmFreeSnapshot(pDb->pEnv, pDb->pClient); pDb->pClient = 0; - - /* Release the in-memory tree version */ - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - lsmTreeReleaseReadVersion(pDb->pEnv, pDb->pTV); - pDb->pTV = 0; - lsmMutexLeave(pDb->pEnv, p->pClientMutex); } + if( pDb->iReader>=0 ) lsmReleaseReadlock(pDb); + assert( (pDb->pClient!=0)==(pDb->iReader>=0) ); } /* ** Open a write transaction. */ int lsmBeginWriteTrans(lsm_db *pDb){ - int rc = LSM_OK; /* Return code */ - Database *p = pDb->pDatabase; /* Shared database object */ - - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - assert( p->pTree ); - assert( (pDb->pTV==0)==(pDb->pClient==0) ); - - /* There are two reasons the attempt to open a write transaction may fail: - ** - ** 1. There is already a writer. - ** 2. Connection pDb already has an open read transaction, and the read - ** snapshot is not the most recent version of the database. - ** - ** If condition 1 is true, then the Database.bWriter flag is set. If the - ** second is true, then the call to lsmTreeWriteVersion() returns NULL. - */ - if( p->bWriter ){ + int rc; /* Return code */ + ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */ + + assert( pDb->nTransOpen==0 ); + + /* If there is no read-transaction open, open one now. */ + rc = lsmBeginReadTrans(pDb); + + /* Attempt to take the WRITER lock */ + if( rc==LSM_OK ){ + rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); + } + + /* If the previous writer failed mid-transaction, run emergency rollback. */ + if( rc==LSM_OK && pShm->bWriter ){ + /* TODO: This! */ + assert( 0 ); + rc = LSM_CORRUPT_BKPT; + } + + /* Check that this connection is currently reading from the most recent + ** version of the database. If not, return LSM_BUSY. 
*/ + if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){ rc = LSM_BUSY; - }else{ - rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV); - } - - if( rc==LSM_OK ){ - rc = lsmLogBegin(pDb, &p->log); - - if( rc!=LSM_OK ){ - /* If the call to lsmLogBegin() failed, relinquish the read/write - ** TreeVersion handle obtained above. The attempt to open a transaction - ** has failed. */ - TreeVersion *pWrite = pDb->pTV; - TreeVersion **ppRestore = (pDb->pClient ? &pDb->pTV : 0); - pDb->pTV = 0; - lsmTreeReleaseWriteVersion(pDb->pEnv, pWrite, 0, ppRestore); - }else if( pDb->pClient==0 ){ - /* Otherwise, if the lsmLogBegin() attempt was successful and the - ** client did not have a read transaction open when this function - ** was called, lsm_db.pClient will still be NULL. In this case, grab - ** a reference to the lastest checkpointed snapshot now. */ - p->pClient->nRef++; - pDb->pClient = p->pClient; - } - } - - if( rc==LSM_OK ){ - p->bWriter = 1; - } - lsmMutexLeave(pDb->pEnv, p->pClientMutex); + } + + if( rc==LSM_OK ){ + rc = lsmLogBegin(pDb); + } + + /* If everything was successful, set the "transaction-in-progress" flag + ** and return LSM_OK. Otherwise, if some error occurred, relinquish the + ** WRITER lock and return an error code. */ + if( rc==LSM_OK ){ + pShm->bWriter = 1; + pDb->treehdr.iTransId++; + }else{ + lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); + if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb); + } return rc; } /* ** End the current write transaction. The connection is left with an open @@ -1210,111 +752,394 @@ ** merely releases locks and other resources held by the write-transaction. ** ** LSM_OK is returned if successful, or an LSM error code otherwise. 
*/ int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){ - Database *p = pDb->pDatabase; - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - - assert( pDb->pTV && lsmTreeIsWriteVersion(pDb->pTV) ); - assert( p->bWriter ); - p->bWriter = 0; - lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, bCommit, &pDb->pTV); - - lsmLogEnd(pDb, &p->log, bCommit); - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - return LSM_OK; -} - - -/* -** This function is called at the beginning of a flush operation (i.e. when -** flushing the contents of the in-memory tree to a segment on disk). -** -** The caller must already be the worker connection. -** -** Also, the caller must have an open write transaction or be in the process -** of shutting down the (shared) database connection. This means we don't -** have to worry about any other connection modifying the in-memory tree -** structure while it is being flushed (although some other clients may be -** reading from it). -*/ -int lsmBeginFlush(lsm_db *pDb){ - - assert( pDb->pWorker ); - assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pDb->pTV)) - || (pDb->pTV==0 && holdingGlobalMutex(pDb->pEnv)) - ); - - if( pDb->pTV==0 ){ - pDb->pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree); - } - return LSM_OK; -} - -int lsmDbTreeSize(lsm_db *pDb){ - TreeVersion *pTV = pDb->pTV; - - assert( pDb->pWorker ); - assert( (pDb->pDatabase->bWriter && lsmTreeIsWriteVersion(pTV)) - || (pTV==0 && holdingGlobalMutex(pDb->pEnv)) - ); - if( pTV==0 ) pTV = lsmTreeRecoverVersion(pDb->pDatabase->pTree); - - return lsmTreeSize(pTV); -} - -/* -** This is called to indicate that a "flush-tree" operation has finished. -** If the second argument is true, a new in-memory tree is allocated to -** hold subsequent writes. 
-*/ -int lsmFinishFlush(lsm_db *pDb, int bEmpty){ - Database *p = pDb->pDatabase; - int rc = LSM_OK; - - assert( pDb->pWorker ); - assert( pDb->pTV && (p->nDbRef==0 || lsmTreeIsWriteVersion(pDb->pTV)) ); - lsmMutexEnter(pDb->pEnv, p->pClientMutex); - - if( bEmpty ){ - if( p->bWriter ){ - lsmTreeReleaseWriteVersion(pDb->pEnv, pDb->pTV, 1, 0); - } - pDb->pTV = 0; - lsmTreeRelease(pDb->pEnv, p->pTree); - - if( p->nDbRef>0 ){ - rc = lsmTreeNew(pDb->pEnv, pDb->xCmp, &p->pTree); - }else{ - /* This is the case if the Database object is being deleted */ - p->pTree = 0; - } - } - - if( p->bWriter ){ - assert( pDb->pClient ); - if( 0==pDb->pTV ) rc = lsmTreeWriteVersion(pDb->pEnv, p->pTree, &pDb->pTV); - }else{ - pDb->pTV = 0; - } - lsmMutexLeave(pDb->pEnv, p->pClientMutex); - return rc; -} - -/* -** Return a pointer to the DbLog object associated with connection pDb. -** Allocate and initialize it if necessary. -*/ -DbLog *lsmDatabaseLog(lsm_db *pDb){ - Database *p = pDb->pDatabase; - return &p->log; -} + lsmLogEnd(pDb, bCommit); + lsmTreeEndTransaction(pDb, bCommit); + lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); + return LSM_OK; +} + /* ** Return non-zero if the caller is holding the client mutex. */ #ifdef LSM_DEBUG int lsmHoldingClientMutex(lsm_db *pDb){ return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex); } #endif + +/* +** Obtain a read-lock on database version identified by the combination +** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or +** an LSM error code otherwise. +*/ +int lsmReadlock(lsm_db *db, i64 iLsm, i64 iTree){ + ShmHeader *pShm = db->pShmhdr; + int i; + int rc = LSM_OK; + + assert( db->iReader<0 ); + + /* Search for an exact match. 
*/ + for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; + if( p->iLsmId==iLsm && p->iTreeId==iTree ){ + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); + if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iTree ){ + db->iReader = i; + }else if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + } + } + + /* Try to obtain a write-lock on each slot, in order. If successful, set + ** the slot values to iLsm/iTree. */ + for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; + p->iLsmId = iLsm; + p->iTreeId = iTree; + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); + if( rc==LSM_OK ) db->iReader = i; + } + } + + /* Search for any usable slot */ + for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; + if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){ + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); + if( rc==LSM_OK ){ + if( p->iLsmId && p->iTreeId && p->iLsmId<=iLsm && p->iTreeId<=iTree ){ + db->iReader = i; + } + }else if( rc==LSM_BUSY ){ + rc = LSM_OK; + } + } + } + + return rc; +} + +static int isInUse(lsm_db *db, i64 iLsm, i64 iTree, int *pbInUse){ + ShmHeader *pShm = db->pShmhdr; + int i; + int rc = LSM_OK; + + for(i=0; rc==LSM_OK && iaReader[i]; + if( p->iLsmId && p->iTreeId && (p->iTreeId<=iTree || p->iLsmId<=iLsm) ){ + rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0); + if( rc==LSM_OK ){ + p->iTreeId = p->iLsmId = 0; + lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0); + } + } + } + + if( rc==LSM_BUSY ){ + *pbInUse = 1; + return LSM_OK; + } + *pbInUse = 0; + return rc; +} + +int lsmTreeInUse(lsm_db *db, u32 iTreeId, int *pbInUse){ + if( db->treehdr.iTreeId==iTreeId ){ + *pbInUse = 1; + return LSM_OK; + } + return isInUse(db, 0, (i64)iTreeId, pbInUse); +} + +int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){ + if( db->pClient && db->pClient->iId<=iLsmId ){ + *pbInUse = 1; + return LSM_OK; + } + return isInUse(db, iLsmId, 0, pbInUse); +} + +/* +** Release the read-lock currently held by connection db. 
+*/ +int lsmReleaseReadlock(lsm_db *db){ + int rc = LSM_OK; + if( db->iReader>=0 ){ + rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0); + db->iReader = -1; + } + return rc; +} + +/* +** This function may only be called after a successful call to +** lsmDbDatabaseConnect(). It returns true if the connection is in +** multi-process mode, or false otherwise. +*/ +int lsmDbMultiProc(lsm_db *pDb){ + return pDb->pDatabase && (pDb->pDatabase->pFile!=0); +} + +void lsmDbDeferredClose(lsm_db *pDb, lsm_file *pFile, LsmFile *pLsmFile){ + Database *p = pDb->pDatabase; + lsm_env *pEnv = pDb->pEnv; + + lsmMutexEnter(pEnv, p->pClientMutex); + pLsmFile->pFile = pFile; + pLsmFile->pNext = p->pLsmFile; + p->pLsmFile = pLsmFile; + lsmMutexLeave(pEnv, p->pClientMutex); +} + + +/************************************************************************* +************************************************************************** +************************************************************************** +************************************************************************** +************************************************************************** +*************************************************************************/ + +/* +** Retrieve a pointer to shared-memory chunk iChunk. Chunks are numbered +** starting from 0 (i.e. the header chunk is chunk 0). 
+*/ +int lsmShmChunk(lsm_db *db, int iChunk, void **ppData){ + int rc = LSM_OK; + void *pRet = 0; + Database *p = db->pDatabase; + lsm_env *pEnv = db->pEnv; + + /* Enter the client mutex */ + assert( iChunk>=0 ); + lsmMutexEnter(pEnv, p->pClientMutex); + + if( iChunk>=p->nShmChunk ){ + int nNew = iChunk+1; + void **apNew; + apNew = (void **)lsmRealloc(pEnv, p->apShmChunk, sizeof(void*) * nNew); + if( apNew==0 ){ + rc = LSM_NOMEM_BKPT; + }else{ + memset(&apNew[p->nShmChunk], 0, sizeof(void*) * (nNew-p->nShmChunk)); + p->apShmChunk = apNew; + p->nShmChunk = nNew; + } + } + + if( rc==LSM_OK && p->apShmChunk[iChunk]==0 ){ + void *pChunk = 0; + if( p->pFile==0 ){ + /* Single process mode */ + pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc); + }else{ + /* Multi-process mode */ + rc = lsmEnvShmMap(pEnv, p->pFile, iChunk, LSM_SHM_CHUNK_SIZE, &pChunk); + } + p->apShmChunk[iChunk] = pChunk; + } + + if( rc==LSM_OK ){ + pRet = p->apShmChunk[iChunk]; + } + + /* Release the client mutex */ + lsmMutexLeave(pEnv, p->pClientMutex); + + *ppData = pRet; + return rc; +} + +/* +** Attempt to obtain the lock identified by the iLock and bExcl parameters. +** If successful, return LSM_OK. If the lock cannot be obtained because +** there exists some other conflicting lock, return LSM_BUSY. If some other +** error occurs, return an LSM error code. +** +** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER, +** or else a value returned by the LSM_LOCK_READER macro. +*/ +int lsmShmLock( + lsm_db *db, + int iLock, + int eOp, /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */ + int bBlock /* True for a blocking lock */ +){ + lsm_db *pIter; + const u32 me = (1 << (iLock-1)); + const u32 ms = (1 << (iLock+16-1)); + int rc = LSM_OK; + Database *p = db->pDatabase; + + assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) ); + assert( iLock<=16 ); + assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL ); + + /* Check for a no-op. 
Proceed only if this is not one of those. */ + if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0) + || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms) + || (eOp==LSM_LOCK_EXCL && (db->mLock & me)==0) + ){ + int nExcl = 0; /* Number of connections holding EXCLUSIVE */ + int nShared = 0; /* Number of connections holding SHARED */ + lsmMutexEnter(db->pEnv, p->pClientMutex); + + /* Figure out the locks currently held by this process on iLock, not + ** including any held by connection db. */ + for(pIter=p->pConn; pIter; pIter=pIter->pNext){ + assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 ); + if( pIter!=db ){ + if( pIter->mLock & me ){ + nExcl++; + }else if( pIter->mLock & ms ){ + nShared++; + } + } + } + assert( nExcl==0 || nExcl==1 ); + assert( nExcl==0 || nShared==0 ); + assert( nExcl==0 || (db->mLock & (me|ms))==0 ); + + switch( eOp ){ + case LSM_LOCK_UNLOCK: + if( nShared==0 ){ + lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_UNLOCK); + } + db->mLock &= ~(me|ms); + break; + + case LSM_LOCK_SHARED: + if( nExcl ){ + rc = LSM_BUSY; + }else{ + if( nShared==0 ){ + rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_SHARED); + } + db->mLock |= ms; + db->mLock &= ~me; + } + break; + + default: + assert( eOp==LSM_LOCK_EXCL ); + if( nExcl || nShared ){ + rc = LSM_BUSY; + }else{ + rc = lsmEnvLock(db->pEnv, p->pFile, iLock, LSM_LOCK_EXCL); + db->mLock |= (me|ms); + } + break; + } + + lsmMutexLeave(db->pEnv, p->pClientMutex); + } + + return rc; +} + +#ifdef LSM_DEBUG + +int shmLockType(lsm_db *db, int iLock){ + const u32 me = (1 << (iLock-1)); + const u32 ms = (1 << (iLock+16-1)); + + if( db->mLock & me ) return LSM_LOCK_EXCL; + if( db->mLock & ms ) return LSM_LOCK_SHARED; + return LSM_LOCK_UNLOCK; +} + +/* +** The arguments passed to this function are similar to those passed to +** the lsmShmLock() function. 
However, instead of obtaining a new lock +** this function returns true if the specified connection already holds +** (or does not hold) such a lock, depending on the value of eOp. As +** follows: +** +** (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock +** (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock. +** (eOp==LSM_LOCK_EXCL) -> true if db has an EXCLUSIVE lock on iLock. +*/ +int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){ + int ret; + int eHave; + + assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) ); + assert( iLock<=16 ); + assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL ); + + eHave = shmLockType(db, iLock); + + switch( eOp ){ + case LSM_LOCK_UNLOCK: + ret = (eHave==LSM_LOCK_UNLOCK); + break; + case LSM_LOCK_SHARED: + ret = (eHave!=LSM_LOCK_UNLOCK); + break; + case LSM_LOCK_EXCL: + ret = (eHave==LSM_LOCK_EXCL); + break; + default: + assert( !"bad eOp value passed to lsmShmAssertLock()" ); + break; + } + + return ret; +} + +int lsmShmAssertWorker(lsm_db *db){ + return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker; +} + +/* +** This function does not contribute to library functionality, and is not +** included in release builds. It is intended to be called from within +** an interactive debugger. +** +** When called, this function prints a single line of human readable output +** to stdout describing the locks currently held by the connection. For +** example: +** +** (gdb) call print_db_locks(pDb) +** (shared on dms2) (exclusive on writer) +*/ +void print_db_locks(lsm_db *db){ + int iLock; + for(iLock=0; iLock<16; iLock++){ + int bOne = 0; + const char *azLock[] = {0, "shared", "exclusive"}; + const char *azName[] = { + 0, "dms1", "dms2", "writer", "worker", "checkpointer", + "reader0", "reader1", "reader2", "reader3", "reader4", "reader5" + }; + int eHave = shmLockType(db, iLock); + if( azLock[eHave] ){ + printf("%s(%s on %s)", (bOne?" 
":""), azLock[eHave], azName[iLock]); + bOne = 1; + } + } + printf("\n"); +} +void print_all_db_locks(lsm_db *db){ + lsm_db *p; + for(p=db->pDatabase->pConn; p; p=p->pNext){ + printf("%s connection %p ", ((p==db)?"*":""), p); + print_db_locks(p); + } +} +#endif + +void lsmShmBarrier(lsm_db *db){ + lsmEnvShmBarrier(db->pEnv); +} + + + Index: src/lsm_sorted.c ================================================================== --- src/lsm_sorted.c +++ src/lsm_sorted.c @@ -262,11 +262,11 @@ BtreeCursor *pBtCsr; Snapshot *pSnap; /* Used by cursors flushing the in-memory tree only */ - int nLsmLevel; /* Number of levels to store in LSM */ + int *pnOvfl; /* Number of free-list entries to store */ void *pSystemVal; /* Pointer to buffer to free */ }; #define CURSOR_DATA_TREE 0 #define CURSOR_DATA_SYSTEM 1 @@ -286,14 +286,10 @@ ** ** CURSOR_AT_FREELIST ** This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually ** pointing at a free list. ** -** CURSOR_AT_LEVELS -** This flag is set when sub-cursor CURSOR_DATA_SYSTEM is actually -** pointing at a free list. -** ** CURSOR_IGNORE_SYSTEM ** If set, this cursor ignores system keys. ** ** CURSOR_NEXT_OK ** Set if it is Ok to call lsm_csr_next(). @@ -302,11 +298,10 @@ ** Set if it is Ok to call lsm_csr_prev(). */ #define CURSOR_IGNORE_DELETE 0x00000001 #define CURSOR_NEW_SYSTEM 0x00000002 #define CURSOR_AT_FREELIST 0x00000004 -#define CURSOR_AT_LEVELS 0x00000008 #define CURSOR_IGNORE_SYSTEM 0x00000010 #define CURSOR_NEXT_OK 0x00000020 #define CURSOR_PREV_OK 0x00000040 typedef struct MergeWorker MergeWorker; @@ -485,10 +480,19 @@ } static u8 *pageGetCell(u8 *aData, int nData, int iCell){ return &aData[lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, iCell)])]; } + +/* +** Return the number of cells on page pPg. 
+*/ +static int pageObjGetNRec(Page *pPg){ + int nData; + u8 *aData = lsmFsPageData(pPg, &nData); + return pageGetNRec(aData, nData); +} /* ** Return the decoded (possibly relative) pointer value stored in cell ** iCell from page aData/nData. */ @@ -567,10 +571,11 @@ u8 *aCell; int eType; aData = fsPageData(pPg, &nData); assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) ); + assert( iKey>=0 && iKeyiPg<0 ){ pCsr->pKey = 0; pCsr->nKey = 0; pCsr->eType = 0; }else{ - int dummy; - rc = pageGetBtreeKey( - pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell, - &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob - ); - pCsr->eType |= SORTED_SEPARATOR; + int iPg; + for(iPg=pCsr->iPg; iPg>=0; iPg--){ + int iCell = pCsr->aPg[pCsr->iPg].iCell; + if( iCell>=0 ){ + int dummy; + rc = pageGetBtreeKey( + pCsr->aPg[pCsr->iPg].pPage, pCsr->aPg[pCsr->iPg].iCell, + &dummy, &pCsr->eType, &pCsr->pKey, &pCsr->nKey, &pCsr->blob + ); + pCsr->eType |= SORTED_SEPARATOR; + break; + } + } + + if( iPg<0 ) rc = LSM_CORRUPT_BKPT; } return rc; } @@ -824,17 +838,27 @@ Blob blob = {0,0,0}; void *pSeek; int nSeek; int iTopicSeek; int dummy; - int iPg = 0; int iLoad = pCsr->pSeg->iRoot; - - rc = pageGetBtreeKey(pCsr->aPg[nDepth-1].pPage, - 0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob - ); + Page *pPg = pCsr->aPg[nDepth-1].pPage; + + if( pageObjGetNRec(pPg)==0 ){ + /* This can happen when pPg is the right-most leaf in the b-tree. + ** In this case, set the iTopicSeek/pSeek/nSeek key to a value + ** greater than any real key. 
*/ + assert( iCell==-1 ); + iTopicSeek = 1000; + pSeek = 0; + nSeek = 0; + }else{ + rc = pageGetBtreeKey(pPg, + 0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob + ); + } do { Page *pPg; rc = lsmFsDbPageGet(pCsr->pFS, iLoad, &pPg); assert( rc==LSM_OK || pPg==0 ); @@ -1099,10 +1123,14 @@ for(i=0; inRight; i++){ pCsr->aPtr[i+1].pSeg = &pLevel->aRhs[i]; } } + + if( nPtr>1 && pLevel->pSplitKey==0 ){ + lsmSortedSplitkey(pDb, pLevel, &rc); + } return rc; } static int levelCursorInitRun( @@ -1541,12 +1569,14 @@ if( pLeft->pPg==0 ){ iRet = 1; }else if( pRight->pPg==0 ){ iRet = 0; }else{ - int res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey); - + int res = rtTopic(pLeft->eType) - rtTopic(pRight->eType); + if( res==0 ){ + res = pCsr->xCmp(pLeft->pKey, pLeft->nKey, pRight->pKey, pRight->nKey); + } if( res==0 || (res<0 && bLargest==0) || (res>0 && bLargest) ){ iRet = 0; }else{ iRet = 1; } @@ -1973,11 +2003,10 @@ } } if( rc==LSM_OK ){ if( useTree ){ - assert( pDb->pTV ); rc = lsmTreeCursorNew(pDb, &pCsr->pTreeCsr); } pCsr->pDb = pDb; pCsr->pSnap = pSnap; pCsr->xCmp = pDb->xCmp; @@ -2026,12 +2055,13 @@ /* ** If the free-block list is not empty, then have this cursor visit a key ** with (a) the system bit set, and (b) the key "F" and (c) a value blob ** containing the entire serialized free-block list. 
*/ -static void multiCursorVisitFreelist(MultiCursor *pCsr){ +static void multiCursorVisitFreelist(MultiCursor *pCsr, int *pnOvfl){ assert( pCsr ); + pCsr->pnOvfl = pnOvfl; pCsr->flags |= CURSOR_NEW_SYSTEM; } /* ** Allocate a new cursor to read the database (the in-memory tree and all @@ -2116,15 +2146,10 @@ if( pCsr->flags & CURSOR_AT_FREELIST ){ pKey = (void *)"FREELIST"; nKey = 8; eType = SORTED_SYSTEM_WRITE; } - else if( pCsr->flags & CURSOR_AT_LEVELS ){ - pKey = (void *)"LEVELS"; - nKey = 6; - eType = SORTED_SYSTEM_WRITE; - } break; default: { int iSeg = iKey - CURSOR_DATA_SEGMENT; if( iSeg==pCsr->nSegCsr && pCsr->pBtCsr ){ @@ -2158,21 +2183,17 @@ *ppVal = 0; *pnVal = 0; } }else if( iVal==CURSOR_DATA_SYSTEM ){ if( pCsr->flags & CURSOR_AT_FREELIST ){ - int *aVal; + void *aVal; int nVal; + assert( pCsr->pSystemVal==0 ); - rc = lsmSnapshotFreelist(pCsr->pDb, &aVal, &nVal); - pCsr->pSystemVal = *ppVal = (void *)aVal; - *pnVal = sizeof(int) * nVal; - lsmFreelistDeltaBegin(pCsr->pDb); - }else if( (pCsr->flags & CURSOR_AT_LEVELS) && pCsr->nLsmLevel>0 ){ - lsmFree(pCsr->pDb->pEnv, pCsr->pSystemVal); - lsmCheckpointLevels(pCsr->pDb, pCsr->nLsmLevel, ppVal, pnVal); - pCsr->pSystemVal = *ppVal; + rc = lsmCheckpointOverflow(pCsr->pDb, &aVal, &nVal, pCsr->pnOvfl); + *ppVal = pCsr->pSystemVal = aVal; + *pnVal = nVal; }else{ *ppVal = 0; *pnVal = 0; } }else if( iVal-CURSOR_DATA_SEGMENTnSegCsr @@ -2185,48 +2206,43 @@ } assert( rc==LSM_OK || (*ppVal==0 && *pnVal==0) ); return rc; } -int lsmSortedLoadSystem(lsm_db *pDb){ +int lsmSortedLoadFreelist( + lsm_db *pDb, /* Database handle (must be worker) */ + void **ppVal, /* OUT: Blob containing LSM free-list */ + int *pnVal /* OUT: Size of *ppVal blob in bytes */ +){ MultiCursor *pCsr = 0; /* Cursor used to retreive free-list */ int rc; /* Return Code */ assert( pDb->pWorker ); + assert( *ppVal==0 && *pnVal==0 ); + rc = multiCursorAllocate(pDb, 1, &pCsr); if( rc==LSM_OK ){ - void *pVal; int nVal; /* Value read from database */ - rc 
= lsmMCursorLast(pCsr); - if( rc==LSM_OK - && pCsr->eType==SORTED_SYSTEM_WRITE - && pCsr->key.nData==6 - && 0==memcmp(pCsr->key.pData, "LEVELS", 6) - ){ - rc = lsmMCursorValue(pCsr, &pVal, &nVal); - if( rc==LSM_OK ){ - rc = lsmCheckpointLoadLevels(pDb, pVal, nVal); - } - if( rc==LSM_OK ){ - rc = lsmMCursorPrev(pCsr); - } - } - if( rc==LSM_OK && pCsr->eType==SORTED_SYSTEM_WRITE && pCsr->key.nData==8 && 0==memcmp(pCsr->key.pData, "FREELIST", 8) ){ + void *pVal; int nVal; /* Value read from database */ rc = lsmMCursorValue(pCsr, &pVal, &nVal); if( rc==LSM_OK ){ - int n32 = nVal / sizeof(u32); - rc = lsmSnapshotSetFreelist(pDb, (int *)pVal, n32); + *ppVal = lsmMallocRc(pDb->pEnv, nVal, &rc); + if( *ppVal ){ + memcpy(*ppVal, pVal, nVal); + *pnVal = nVal; + } } } lsmMCursorClose(pCsr); } + return rc; } static void multiCursorDoCompare(MultiCursor *pCsr, int iOut, int bReverse){ int i1; @@ -2425,11 +2441,10 @@ if( eESeek==LSM_SEEK_LEFAST ) eESeek = LSM_SEEK_LE; assert( eESeek==LSM_SEEK_EQ || eESeek==LSM_SEEK_LE || eESeek==LSM_SEEK_GE ); assert( (pCsr->flags & CURSOR_NEW_SYSTEM)==0 ); assert( (pCsr->flags & CURSOR_AT_FREELIST)==0 ); - assert( (pCsr->flags & CURSOR_AT_LEVELS)==0 ); pCsr->flags &= ~(CURSOR_NEXT_OK | CURSOR_PREV_OK); lsmTreeCursorSeek(pCsr->pTreeCsr, pKey, nKey, &res); switch( eESeek ){ case LSM_SEEK_EQ: @@ -2554,20 +2569,14 @@ rc = lsmTreeCursorPrev(pCsr->pTreeCsr); }else{ rc = lsmTreeCursorNext(pCsr->pTreeCsr); } }else if( iKey==CURSOR_DATA_SYSTEM ){ - assert( pCsr->flags & (CURSOR_AT_FREELIST | CURSOR_AT_LEVELS) ); + assert( pCsr->flags & CURSOR_AT_FREELIST ); assert( pCsr->flags & CURSOR_NEW_SYSTEM ); assert( bReverse==0 ); - - if( pCsr->flags & CURSOR_AT_FREELIST ){ - pCsr->flags &= ~CURSOR_AT_FREELIST; - pCsr->flags |= CURSOR_AT_LEVELS; - }else{ - pCsr->flags &= ~CURSOR_AT_LEVELS; - } + pCsr->flags &= ~CURSOR_AT_FREELIST; }else if( iKey==(CURSOR_DATA_SEGMENT+pCsr->nSegCsr) ){ assert( bReverse==0 && pCsr->pBtCsr ); rc = btreeCursorNext(pCsr->pBtCsr); 
}else{ LevelCursor *pLevel = &pCsr->aSegCsr[iKey-CURSOR_DATA_SEGMENT]; @@ -3441,10 +3450,11 @@ return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr); } static void sortedFreeLevel(lsm_env *pEnv, Level *p){ if( p ){ + lsmFree(pEnv, p->pSplitKey); lsmFree(pEnv, p->pMerge); lsmFree(pEnv, p->aRhs); lsmFree(pEnv, p); } } @@ -3453,35 +3463,37 @@ if( pDb->xWork ){ pDb->xWork(pDb, pDb->pWorkCtx); } } -int lsmSortedNewToplevel( +static int sortedNewToplevel( lsm_db *pDb, /* Connection handle */ - int nLevel, /* Number of levels store in LSM (often 0) */ - int bFreelist /* True to store the freelist in the LSM */ + int bTree, /* True to store contents of in-memory tree */ + int *pnOvfl /* OUT: Number of free-list entries stored */ ){ int rc = LSM_OK; /* Return Code */ MultiCursor *pCsr = 0; Level *pNext = 0; /* The current top level */ Level *pNew; /* The new level itself */ Segment *pDel = 0; /* Delete separators from this segment */ int iLeftPtr = 0; + + assert( pnOvfl ); /* Allocate the new level structure to write to. */ pNext = lsmDbSnapshotLevel(pDb->pWorker); pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc); /* Create a cursor to gather the data required by the new segment. The new ** segment contains everything in the tree and pointers to the next segment ** in the database (if any). 
*/ if( rc==LSM_OK ){ - - pNew->pNext = pNext; - lsmDbSnapshotSetLevel(pDb->pWorker, pNew); - - rc = multiCursorNew(pDb, pDb->pWorker, (pDb->pTV!=0), 0, &pCsr); + rc = multiCursorNew(pDb, pDb->pWorker, bTree, 0, &pCsr); + if( rc==LSM_OK ){ + pNew->pNext = pNext; + lsmDbSnapshotSetLevel(pDb->pWorker, pNew); + } if( rc==LSM_OK ){ if( pNext ){ assert( pNext->pMerge==0 || pNext->nRight>0 ); if( pNext->pMerge==0 ){ if( pNext->lhs.iRoot ){ @@ -3498,16 +3510,12 @@ multiCursorIgnoreDelete(pCsr); } } if( rc==LSM_OK ){ - assert( bFreelist || nLevel==0 ); - if( bFreelist ){ - multiCursorVisitFreelist(pCsr); - } + multiCursorVisitFreelist(pCsr, pnOvfl); multiCursorReadSeparators(pCsr); - pCsr->nLsmLevel = nLevel; } } if( rc!=LSM_OK ){ lsmMCursorClose(pCsr); @@ -3536,11 +3544,10 @@ } mergeWorkerShutdown(&mergeworker, &rc); pNew->pMerge = 0; } - lsmFreelistDeltaEnd(pDb); /* Link the new level into the top of the tree. */ if( rc==LSM_OK ){ if( pDel ){ pDel->iRoot = 0; @@ -3571,39 +3578,28 @@ ** the first, the connection also holds the in-memory tree write-version. ** In the second, no in-memory tree version reference is held at all. */ int lsmSortedFlushTree( lsm_db *pDb, /* Connection handle */ - int nLevel, - int bFreelist + int *pnOvfl /* OUT: Number of free-list entries written */ ){ int rc; assert( pDb->pWorker ); - assert( pDb->pTV==0 || lsmTreeIsWriteVersion(pDb->pTV) ); - - rc = lsmBeginFlush(pDb); /* If there is nothing to do, return early. 
*/ - if( lsmTreeSize(pDb->pTV)==0 && bFreelist==0 ){ - lsmFinishFlush(pDb, 0); + if( lsmTreeSize(pDb)==0 && lsmCheckpointOverflowRequired(pDb)==0 ){ + *pnOvfl = 0; return LSM_OK; } - lsmDatabaseDirty(pDb); - - if( rc==LSM_OK ){ - rc = lsmSortedNewToplevel(pDb, nLevel, bFreelist); - } + rc = sortedNewToplevel(pDb, 1, pnOvfl); + assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) ); #if 0 - lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "tree flush"); + lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "tree flush"); #endif - - assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) ); - - lsmFinishFlush(pDb, rc==LSM_OK); return rc; } /* ** The nMerge levels in the LSM beginning with pLevel consist of a @@ -3643,11 +3639,11 @@ pNew->nRight = nMerge; pNew->iAge = pLevel->iAge+1; for(i=0; ipNext; pNew->aRhs[i] = p->lhs; - lsmFree(pDb->pEnv, p); + sortedFreeLevel(pDb->pEnv, p); p = pNext; } /* Replace the old levels with the new. */ pTopLevel = lsmDbSnapshotLevel(pDb->pWorker); @@ -3798,11 +3794,10 @@ assert( lsmFsIntegrityCheck(pDb) ); assert( pWorker ); if( lsmDbSnapshotLevel(pWorker)==0 ) return LSM_OK; - lsmDatabaseDirty(pDb); while( nRemaining>0 ){ Level *pLevel; Level *pTopLevel = lsmDbSnapshotLevel(pWorker); @@ -3939,11 +3934,11 @@ ** the database structure has changed. */ mergeWorkerShutdown(&mergeworker, &rc); if( rc==LSM_OK ) sortedInvokeWorkHook(pDb); #if 0 - lsmSortedDumpStructure(pDb, pDb->pWorker, 0, 0, "work"); + lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work"); #endif } } @@ -4036,42 +4031,50 @@ */ int lsm_work(lsm_db *pDb, int flags, int nPage, int *pnWrite){ int rc = LSM_OK; /* Return code */ /* This function may not be called if pDb has an open read or write - ** transaction. Return LSM_MISUSE if an application attempts this. - */ + ** transaction. Return LSM_MISUSE if an application attempts this. 
*/ if( pDb->nTransOpen || pDb->pCsr ) return LSM_MISUSE_BKPT; - assert( pDb->pTV==0 ); + /* If the FLUSH flag is set, try to flush the contents of the in-memory + ** tree to disk. */ if( (flags & LSM_WORK_FLUSH) ){ rc = lsmBeginWriteTrans(pDb); if( rc==LSM_OK ){ rc = lsmFlushToDisk(pDb); - lsmFinishWriteTrans(pDb, 0); + lsmFinishWriteTrans(pDb, 1); lsmFinishReadTrans(pDb); } } if( rc==LSM_OK && nPage>0 ){ int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0); int nWrite = 0; - pDb->pWorker = lsmDbSnapshotWorker(pDb); - rc = sortedWork(pDb, nPage, bOptimize, &nWrite); - - if( rc==LSM_OK && nWrite && (flags & LSM_WORK_CHECKPOINT) ){ - int bOvfl; - int nLsm; - - bOvfl = lsmCheckpointOverflow(pDb, &nLsm); + int nOvfl = -1; + + assert( pDb->pWorker==0 ); + rc = lsmBeginWork(pDb); + if( rc==LSM_OK ){ + rc = sortedWork(pDb, nPage, bOptimize, &nWrite); + } + + if( rc==LSM_OK && nWrite ){ rc = lsmSortedFlushDb(pDb); - if( rc==LSM_OK && bOvfl ) rc = lsmSortedNewToplevel(pDb, nLsm, bOvfl); - if( rc==LSM_OK ) rc = lsmDbUpdateClient(pDb, nLsm, bOvfl); + if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){ + rc = sortedNewToplevel(pDb, 0, &nOvfl); + } + } + + if( nWrite ){ + lsmFinishWork(pDb, 0, nOvfl, &rc); + }else{ + int rcdummy = LSM_BUSY; + lsmFinishWork(pDb, 0, 0, &rcdummy); } - lsmDbSnapshotRelease(pDb->pEnv, pDb->pWorker); - pDb->pWorker = 0; + assert( pDb->pWorker==0 ); if( pnWrite ) *pnWrite = nWrite; }else if( pnWrite ){ *pnWrite = 0; } @@ -4274,19 +4277,25 @@ Snapshot *pWorker; /* Worker snapshot */ Snapshot *pRelease = 0; /* Snapshot to release */ Page *pPg = 0; /* Handle for page iPg */ int i, j; /* Loop counters */ const int perLine = 16; /* Bytes per line in the raw hex dump */ + int bEndWork = 0; *pzOut = 0; if( iPg==0 ) return LSM_ERROR; /* Obtain the worker snapshot */ +#if 0 pWorker = pDb->pWorker; if( !pWorker ){ - pRelease = pWorker = lsmDbSnapshotWorker(pDb); + rc = lsmBeginWork(pDb); + if( rc!=LSM_OK ) return rc; + pWorker = pDb->pWorker; + bEndWork = 
1; } +#endif rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg); if( rc==LSM_OK ){ Blob blob = {0, 0, 0, 0}; int nKeyWidth = 0; @@ -4371,11 +4380,10 @@ *pzOut = str.z; sortedBlobFree(&blob); lsmFsPageRelease(pPg); } - lsmDbSnapshotRelease(pDb->pEnv, pRelease); return rc; } void sortedDumpSegment(lsm_db *pDb, Segment *pRun, int bVals){ assert( pDb->xLog ); @@ -4410,15 +4418,11 @@ const char *zWhy /* Caption to print near top of dump */ ){ Snapshot *pDump = pSnap; Level *pTopLevel; - if( pDump==0 ){ - assert( pDb->pWorker==0 ); - pDump = lsmDbSnapshotWorker(pDb); - } - + assert( pSnap ); pTopLevel = lsmDbSnapshotLevel(pDump); if( pDb->xLog && pTopLevel ){ Level *pLevel; int iLevel = 0; @@ -4481,14 +4485,10 @@ sortedDumpSegment(pDb, &pLevel->aRhs[i], bVals); } } } } - - if( pSnap==0 ){ - lsmDbSnapshotRelease(pDb->pEnv, pDump); - } } void lsmSortedFreeLevel(lsm_env *pEnv, Level *pLevel){ Level *pNext; Level *p; Index: src/lsm_tree.c ================================================================== --- src/lsm_tree.c +++ src/lsm_tree.c @@ -48,11 +48,11 @@ ** designed so that it may be edited in place exactly once without ** affecting existing users. In other words, the node structure is capable ** of storing two separate versions of the node at the same time. ** When a node is to be edited, if the node structure already contains ** two versions, a copy is made as in the append-only approach. Or, if -** it only contains a single version, it may be edited in place. +** it only contains a single version, it is edited in place. ** ** This reduces the overhead so that, roughly, one new node structure ** must be allocated for each write (on top of those allocations that ** would have been required by a non-MVCC tree). Logic: Assume that at ** any time, 50% of nodes in the tree already contain 2 versions. When @@ -93,101 +93,52 @@ typedef struct TreeNode TreeNode; typedef struct TreeLeaf TreeLeaf; typedef struct NodeVersion NodeVersion; /* -** Container for a key-value pair. 
+** Container for a key-value pair. Within the *-shm file, each key/value +** pair is stored in a single allocation (which may not actually be +** contiguous in memory). Layout is the TreeKey structure, followed by +** the nKey bytes of key blob, followed by the nValue bytes of value blob +** (if nValue is non-negative). */ struct TreeKey { - void *pKey; /* Pointer to key */ - void *pValue; /* Pointer to value. May be NULL. */ int nKey; /* Size of pKey in bytes */ int nValue; /* Size of pValue. Or negative. */ }; +#define TK_KEY(p) ((void *)&(p)[1]) +#define TK_VAL(p) ((void *)(((u8 *)&(p)[1]) + (p)->nKey)) + /* ** A single tree node. A node structure may contain up to 3 key/value ** pairs. Internal (non-leaf) nodes have up to 4 children. ** ** TODO: Update the format of this to be more compact. Get it working ** first though... */ struct TreeNode { - TreeKey *apKey[3]; /* Array of pointers to key-value pairs */ + u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ /* The following fields are present for interior nodes only, not leaves. */ - TreeNode *apChild[4]; /* Array of pointers to child nodes */ + u32 aiChildPtr[4]; /* Array of pointers to child nodes */ - int iV2; /* Version number of v2 */ - u8 iV2Ptr; /* apChild[] entry replaced by pV2Ptr */ - TreeNode *pV2Ptr; /* Substitute pointer */ - TreeNode *pNext; /* Next in interior node rollback list */ + /* The extra child pointer slot. */ + u32 iV2; /* Transaction number of v2 */ + u8 iV2Child; /* apChild[] entry replaced by pV2Ptr */ + u32 iV2Ptr; /* Substitute pointer */ }; struct TreeLeaf { - TreeKey *apKey[3]; /* Array of pointers to key-value pairs */ -}; - -/* -** A handle used by a client to access a Tree structure. 
-*/ -struct TreeVersion { - Tree *pTree; /* The tree structure to which this belongs */ - int nRef; /* Number of pointers to this */ - TreeNode *pRoot; /* Pointer to root of tree structure */ - int nHeight; /* Current height of tree pRoot */ - int iVersion; /* Current version */ -}; - -#define WORKING_VERSION (1<<30) - -/* -** A tree structure. -** -** iVersion: -** When the tree is first created, this is set to 1. Thereafter it is -** incremented each time lsmTreeMark() is called. The tree must be -** destroyed (i.e. flushed to disk) before it wraps around (todo!). -** -** When v2 data is written to a tree-node, the iV2 field of the node -** is set to the current value of Tree.iVersion. -** -** nRef: -** Number of references to this tree structure. When it is first created, -** (in lsmTreeNew()) nRef is set to 1. There after the ref-count may be -** incremented and decremented using treeIncrRefcount() and -** DecrRefcount(). When the ref-count of a tree structure reaches zero -** it is freed. -** -** xCmp: -** Pointer to the compare function. This is a copy of some pDb->xCmp. -** -*/ -struct Tree { - int nTreeRef; /* Current number of pointers to this */ - Mempool *pPool; /* Memory pool to allocate from */ - int (*xCmp)(void *, int, void *, int); /* Compare function */ - TreeVersion *pCommit; /* Committed version of tree (for readers) */ - - TreeVersion *pWorking; /* Working verson (for writers) */ -#if 0 - TreeVersion tvWorking; /* Working verson (for writers) */ -#endif - - TreeNode *pRbFirst; - TreeNode *pRbLast; -}; - -/* -** The pointer passed as the first argument points to an interior node, -** not a leaf. This function returns the value of the iCell'th child -** sub-tree of the node. 
-*/ -static TreeNode *getChildPtr(TreeNode *p, int iVersion, int iCell){ - if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Ptr ) return p->pV2Ptr; - return p->apChild[iCell]; -} + u32 aiKeyPtr[3]; /* Array of pointers to TreeKey objects */ +}; + +typedef struct TreeBlob TreeBlob; +struct TreeBlob { + int n; + u8 *a; +}; /* ** Cursor for searching a tree structure. ** ** If a cursor does not point to any element (a.k.a. EOF), then the @@ -203,11 +154,179 @@ lsm_db *pDb; /* Database handle for this cursor */ int iNode; /* Cursor points at apTreeNode[iNode] */ TreeNode *apTreeNode[MAX_DEPTH];/* Current position in tree */ u8 aiCell[MAX_DEPTH]; /* Current position in tree */ TreeKey *pSave; /* Saved key */ + TreeBlob blob; /* Dynamic storage for a key */ }; + +/* +** A value guaranteed to be larger than the largest possible transaction +** id (TreeHeader.iTransId). +*/ +#define WORKING_VERSION (1<<30) + +static int tblobGrow(lsm_db *pDb, TreeBlob *p, int n, int *pRc){ + if( n>p->n ){ + lsmFree(pDb->pEnv, p->a); + p->a = lsmMallocRc(pDb->pEnv, n, pRc); + p->n = n; + } + return (p->a==0); +} +static void tblobFree(lsm_db *pDb, TreeBlob *p){ + lsmFree(pDb->pEnv, p->a); +} + + +/*********************************************************************** +** Start of IntArray methods. */ +/* +** Append value iVal to the contents of IntArray *p. Return LSM_OK if +** successful, or LSM_NOMEM if an OOM condition is encountered. +*/ +static int intArrayAppend(lsm_env *pEnv, IntArray *p, u32 iVal){ + assert( p->nArray<=p->nAlloc ); + if( p->nArray>=p->nAlloc ){ + u32 *aNew; + int nNew = p->nArray ? p->nArray*2 : 128; + aNew = lsmRealloc(pEnv, p->aArray, nNew*sizeof(u32)); + if( !aNew ) return LSM_NOMEM_BKPT; + p->aArray = aNew; + p->nAlloc = nNew; + } + + p->aArray[p->nArray++] = iVal; + return LSM_OK; +} + +/* +** Zero the IntArray object. 
+*/ +static void intArrayFree(lsm_env *pEnv, IntArray *p){ + lsmFree(pEnv, p->aArray); + memset(p, 0, sizeof(IntArray)); +} + +/* +** Return the number of entries currently in the int-array object. +*/ +static int intArraySize(IntArray *p){ + return p->nArray; +} + +/* +** Return a copy of the iIdx'th entry in the int-array. +*/ +static u32 intArrayEntry(IntArray *p, int iIdx){ + return p->aArray[iIdx]; +} + +/* +** Truncate the int-array so that all but the first nVal values are +** discarded. +*/ +static void intArrayTruncate(IntArray *p, int nVal){ + p->nArray = nVal; +} +/* End of IntArray methods. +***********************************************************************/ + +/* +** The pointer passed as the first argument points to an interior node, +** not a leaf. This function returns the offset of the iCell'th child +** sub-tree of the node. +*/ +static u32 getChildPtr(TreeNode *p, int iVersion, int iCell){ + assert( iCell>=0 && iCell<=array_size(p->aiChildPtr) ); + if( p->iV2 && p->iV2<=iVersion && iCell==p->iV2Child ) return p->iV2Ptr; + return p->aiChildPtr[iCell]; +} + +/* +** Given an offset within the *-shm file, return the associated chunk number. +*/ +static int treeOffsetToChunk(u32 iOff){ + assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); + return (int)(iOff>>15); +} + +/* +** Return a pointer to the mapped memory location associated with *-shm +** file offset iPtr. +*/ +static void *treeShmptr(lsm_db *pDb, u32 iPtr, int *pRc){ + /* TODO: This will likely be way too slow. If it is, chunks should be + ** cached as part of the db handle. 
*/ + if( iPtr && *pRc==0 ){ + int rc; + void *pChunk; + + rc = lsmShmChunk(pDb, treeOffsetToChunk(iPtr), &pChunk); + if( rc==LSM_OK ){ + return &((u8 *)pChunk)[iPtr & (LSM_SHM_CHUNK_SIZE-1)]; + } + *pRc = rc; + } + return 0; +} + +static ShmChunk * treeShmChunk(lsm_db *pDb, int iChunk){ + int rcdummy = LSM_OK; + return (ShmChunk *)treeShmptr(pDb, iChunk*LSM_SHM_CHUNK_SIZE, &rcdummy); +} + +/* Values for the third argument to treeShmkey(). */ +#define TK_LOADKEY 1 +#define TK_LOADVAL 2 + +static TreeKey *treeShmkey( + lsm_db *pDb, /* Database handle */ + u32 iPtr, /* Shmptr to TreeKey struct */ + int eLoad, /* Either zero or a TREEKEY_LOADXXX value */ + TreeBlob *pBlob, /* Used if dynamic memory is required */ + int *pRc /* IN/OUT: Error code */ +){ + TreeKey *pRet; + + assert( eLoad==TK_LOADKEY || eLoad==TK_LOADVAL ); + pRet = (TreeKey *)treeShmptr(pDb, iPtr, pRc); + if( pRet ){ + int nReq; /* Bytes of space required at pRet */ + int nAvail; /* Bytes of space available at pRet */ + + nReq = sizeof(TreeKey) + pRet->nKey; + if( eLoad==TK_LOADVAL && pRet->nValue>0 ){ + nReq += pRet->nValue; + } + assert( LSM_SHM_CHUNK_SIZE==(1<<15) ); + nAvail = LSM_SHM_CHUNK_SIZE - (iPtr & (LSM_SHM_CHUNK_SIZE-1)); + + if( nAvaila[nLoad], p, n); + nLoad += n; + if( nLoad==nReq ) break; + + pChunk = treeShmChunk(pDb, treeOffsetToChunk(iPtr)); + assert( pChunk ); + iPtr = (pChunk->iNext * LSM_SHM_CHUNK_SIZE) + LSM_SHM_CHUNK_HDR; + nAvail = LSM_SHM_CHUNK_SIZE - LSM_SHM_CHUNK_HDR; + } + } + pRet = (TreeKey *)(pBlob->a); + } + } + + return pRet; +} #if defined(LSM_DEBUG) && defined(LSM_EXPENSIVE_ASSERT) void assert_leaf_looks_ok(TreeNode *pNode){ assert( pNode->apKey[1] ); @@ -245,176 +364,94 @@ #else # define assert_tree_looks_ok(x,y) #endif #ifdef LSM_DEBUG + +/* +** Pointer pBlob points to a buffer containing a blob of binary data +** nBlob bytes long. Append the contents of this blob to *pStr, with +** each octet represented by a 2-digit hexadecimal number. 
For example, +** if the input blob is three bytes in size and contains {0x01, 0x44, 0xFF}, +** then "0144ff" is appended to *pStr. +*/ static void lsmAppendStrBlob(LsmString *pStr, void *pBlob, int nBlob){ int i; - lsmStringExtend(pStr, nBlob); + lsmStringExtend(pStr, nBlob*2); if( pStr->nAlloc==0 ) return; for(i=0; iz[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf]; - pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf]; + if( c>='a' && c<='z' ){ + pStr->z[pStr->n++] = c; + }else{ + pStr->z[pStr->n++] = "0123456789abcdef"[(c>>4)&0xf]; + pStr->z[pStr->n++] = "0123456789abcdef"[c&0xf]; + } } pStr->z[pStr->n] = 0; } +/* +** Append nIndent space (0x20) characters to string *pStr. +*/ static void lsmAppendIndent(LsmString *pStr, int nIndent){ int i; lsmStringExtend(pStr, nIndent); for(i=0; inKey; i++){ - lsmStringAppendf(pStr, "%2X ", ((u8 *)(pKey->pKey))[i]); - } - lsmStringAppend(pStr, " ", -1); - - if( pKey->nValue<0 ){ - lsmStringAppend(pStr, "", -1); - }else{ - lsmAppendStrBlob(pStr, pKey->pValue, pKey->nValue); - } -} - -void dump_node(TreeNode *pNode, int nIndent, int isNode){ - if( pNode ){ - LsmString s; - int i; - - lsmStringInit(&s, NEED_ENV); - lsmAppendIndent(&s, nIndent); - lsmStringAppendf(&s, "0x%p", (void*)pNode); - printf("%s\n", s.z); - lsmStringClear(&s); - - for(i=0; i<4; i++){ - - if( isNode ){ - if( pNode->iV2 && i==pNode->iV2Ptr ){ - lsmAppendIndent(&s, nIndent+2); - lsmStringAppendf(&s, "if( version>=%d )", pNode->iV2); - printf("%s\n", s.z); - lsmStringClear(&s); - dump_node(pNode->pV2Ptr, nIndent + 4, isNode-1); - if( pNode->apChild[i] ){ - lsmAppendIndent(&s, nIndent+2); - lsmStringAppendf(&s, "else"); - printf("%s\n", s.z); - lsmStringClear(&s); - } - } - - dump_node(pNode->apChild[i], nIndent + 4, isNode-1); - } - - if( i<3 && pNode->apKey[i] ){ - lsmAppendIndent(&s, nIndent); - lsmStringAppendf(&s, "k%d: ", i); - lsmAppendKeyValue(&s, pNode->apKey[i]); - printf("%s\n", s.z); - lsmStringClear(&s); - } - - } - } -} - -void dump_node_contents(TreeNode 
*pNode, int iVersion, int nIndent, int isNode){ - int i; - LsmString s; - - lsmStringInit(&s, NEED_ENV); - lsmAppendIndent(&s, nIndent); - for(i=0; i<3; i++){ - if( pNode->apKey[i] ){ - TreeKey *pKey = pNode->apKey[i]; - lsmAppendStrBlob(&s, pKey->pKey, pKey->nKey); +void dump_node_contents( + lsm_db *pDb, + u32 iNode, /* Print out hte contents of this node */ + int nIndent, /* Number of spaces indentation */ + int nHeight /* Height: (0==leaf) (1==parent-of-leaf) */ +){ + int i; + int rc = LSM_OK; + LsmString s; + TreeNode *pNode; + TreeBlob b = {0, 0}; + + /* Append the nIndent bytes of space to string s. */ + lsmStringInit(&s, pDb->pEnv); + if( nIndent ) lsmAppendIndent(&s, nIndent); + + pNode = (TreeNode *)treeShmptr(pDb, iNode, &rc); + + /* Append each key to string s. */ + for(i=0; i<3; i++){ + u32 iPtr = pNode->aiKeyPtr[i]; + if( iPtr ){ + TreeKey *pKey = treeShmkey(pDb, pNode->aiKeyPtr[i], TK_LOADKEY, &b, &rc); + lsmAppendStrBlob(&s, TK_KEY(pKey), pKey->nKey); lsmStringAppend(&s, " ", -1); } } printf("%s\n", s.z); lsmStringClear(&s); - for(i=0; i<4 && isNode>0; i++){ - TreeNode *pChild = getChildPtr(pNode, iVersion, i); - if( pChild ){ - dump_node_contents(pChild, iVersion, nIndent + 2, isNode-1); - } - } -} - -void dump_tree_contents(Tree *pTree, const char *zCaption){ - TreeVersion *p = pTree->pWorking ? 
pTree->pWorking : pTree->pCommit; - printf("\n%s\n", zCaption); - if( p->pRoot ){ - dump_node_contents(p->pRoot, WORKING_VERSION, 0, p->nHeight-1); - } - fflush(stdout); -} - -void dump_tv_contents(TreeVersion *pTV, const char *zCaption){ - printf("\n%s\n", zCaption); - if( pTV->pRoot ){ - dump_node(pTV->pRoot, 2, pTV->nHeight-1); + for(i=0; i<4 && nHeight>0; i++){ + u32 iPtr = getChildPtr(pNode, pDb->treehdr.iTransId, i); + if( iPtr ){ + dump_node_contents(pDb, iPtr, nIndent + 2, nHeight-1); + } + } + + tblobFree(pDb, &b); +} + +void dump_tree_contents(lsm_db *pDb, const char *zCaption){ + printf("\n%s\n", zCaption); + if( pDb->treehdr.iRoot ){ + dump_node_contents(pDb, pDb->treehdr.iRoot, 0, pDb->treehdr.nHeight-1); } fflush(stdout); } #endif -/* -** Allocate a new tree structure. -*/ -int lsmTreeNew( - lsm_env *pEnv, /* Environment handle */ - int (*xCmp)(void *, int, void *, int), /* Compare function */ - Tree **ppTree /* OUT: New tree object */ -){ - int rc; - Tree *pTree = 0; - Mempool *pPool; /* Memory pool used by the new tree */ - TreeVersion *pClient = 0; /* Initial client access handle */ - - rc = lsmPoolNew(pEnv, &pPool); - pClient = (TreeVersion *)lsmMallocZeroRc(pEnv, sizeof(TreeVersion), &rc); - - if( rc==LSM_OK ){ - pTree = (Tree *)lsmPoolMallocZero(pEnv, pPool, sizeof(Tree)); - assert( pTree ); - pTree->pPool = pPool; - pTree->xCmp = xCmp; - pTree->nTreeRef = 1; - - pClient->iVersion = 1; - pClient->pTree = pTree; - pClient->nRef = 1; - pTree->pCommit = pClient; - }else{ - assert( pClient==0 ); - lsmPoolDestroy(pEnv, pPool); - } - - *ppTree = pTree; - return rc; -} - -/* -** Destroy a tree structure allocated by lsmTreeNew(). -*/ -static void treeDestroy(lsm_env *pEnv, Tree *pTree){ - if( pTree ){ - assert( pTree->pWorking==0 ); - lsmPoolDestroy(pEnv, pTree->pPool); - } -} - /* ** Initialize a cursor object, the space for which has already been ** allocated. 
*/ static void treeCursorInit(lsm_db *pDb, TreeCursor *pCsr){ @@ -421,47 +458,34 @@ memset(pCsr, 0, sizeof(TreeCursor)); pCsr->pDb = pDb; pCsr->iNode = -1; } -static TreeNode *newTreeLeaf(lsm_env *pEnv, Tree *pTree){ - return (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeLeaf)); -} - -static TreeNode *newTreeNode(lsm_env *pEnv, Tree *pTree){ - return (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode)); -} - -static TreeNode *copyTreeNode(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){ - TreeNode *pNew; - pNew = (TreeNode *)lsmPoolMallocZero(pEnv, pTree->pPool, sizeof(TreeNode)); - - memcpy(pNew->apKey, pOld->apKey, sizeof(pNew->apKey)); - memcpy(pNew->apChild, pOld->apChild, sizeof(pNew->apChild)); - if( pOld->iV2 ) pNew->apChild[pOld->iV2Ptr] = pOld->pV2Ptr; - - return pNew; -} - -static TreeNode *copyTreeLeaf(lsm_env *pEnv, Tree *pTree, TreeNode *pOld){ - TreeNode *pNew; - pNew = newTreeLeaf(pEnv, pTree); - memcpy(pNew, pOld, sizeof(TreeLeaf)); - return pNew; +/* +** Return a pointer to the mapping of the TreeKey object that the cursor +** is pointing to. +*/ +static TreeKey *csrGetKey(TreeCursor *pCsr, TreeBlob *pBlob, int *pRc){ + return (TreeKey *)treeShmkey(pCsr->pDb, + pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[pCsr->aiCell[pCsr->iNode]], + TK_LOADVAL, pBlob, pRc + ); } /* ** Save the current position of tree cursor pCsr. */ -void lsmTreeCursorSave(TreeCursor *pCsr){ +int lsmTreeCursorSave(TreeCursor *pCsr){ + int rc = LSM_OK; if( pCsr->pSave==0 ){ int iNode = pCsr->iNode; if( iNode>=0 ){ - pCsr->pSave = pCsr->apTreeNode[iNode]->apKey[pCsr->aiCell[iNode]]; + pCsr->pSave = csrGetKey(pCsr, &pCsr->blob, &rc); } pCsr->iNode = -1; } + return rc; } /* ** Restore the position of a saved tree cursor. 
*/ @@ -469,15 +493,206 @@ int rc = LSM_OK; if( pCsr->pSave ){ TreeKey *pKey = pCsr->pSave; pCsr->pSave = 0; if( pRes ){ - rc = lsmTreeCursorSeek(pCsr, pKey->pKey, pKey->nKey, pRes); + rc = lsmTreeCursorSeek(pCsr, TK_KEY(pKey), pKey->nKey, pRes); } } return rc; } + +/* +** Allocate nByte bytes of space within the *-shm file. If successful, +** return LSM_OK and set *piPtr to the offset within the file at which +** the allocated space is located. +*/ +static u32 treeShmalloc(lsm_db *pDb, int bAlign, int nByte, int *pRc){ + u32 iRet = 0; + if( *pRc==LSM_OK ){ + const static int CHUNK_SIZE = LSM_SHM_CHUNK_SIZE; + const static int CHUNK_HDR = LSM_SHM_CHUNK_HDR; + u32 iWrite; /* Current write offset */ + u32 iEof; /* End of current chunk */ + int iChunk; /* Current chunk */ + + assert( nByte <= (CHUNK_SIZE-CHUNK_HDR) ); + + /* Check if there is enough space on the current chunk to fit the + ** new allocation. If not, link in a new chunk and put the new + ** allocation at the start of it. */ + iWrite = pDb->treehdr.iWrite; + if( bAlign ){ + iWrite = (iWrite + 3) & ~0x0003; + assert( (iWrite % 4)==0 ); + } + + assert( iWrite ); + iChunk = treeOffsetToChunk(iWrite-1); + iEof = (iChunk+1) * CHUNK_SIZE; + assert( iEof>=iWrite && (iEof-iWrite)iEof ){ + ShmChunk *pHdr; /* Header of chunk just finished (iChunk) */ + ShmChunk *pFirst; /* Header of chunk treehdr.iFirst */ + int iNext = 0; /* Next chunk */ + int rc; + + /* Check if the chunk at the start of the linked list is still in + ** use. If not, reuse it. If so, allocate a new chunk by appending + ** to the *-shm file. 
*/ + if( pDb->treehdr.iFirst!=iChunk ){ + int bInUse; + pFirst = treeShmChunk(pDb, pDb->treehdr.iFirst); + rc = lsmTreeInUse(pDb, pFirst->iLastTree, &bInUse); + if( rc!=LSM_OK ){ + *pRc = rc; + return 0; + } + if( bInUse==0 ){ + iNext = pDb->treehdr.iFirst; + pDb->treehdr.iFirst = pFirst->iNext; + pFirst->iNext = 0; + pFirst->iLastTree = 0; + assert( pDb->treehdr.iFirst ); + assert( pFirst->iLastTreetreehdr.iTreeId ); + } + } + if( iNext==0 ) iNext = pDb->treehdr.nChunk++; + + /* Set the header values for the chunk just finished */ + pHdr = (ShmChunk *)treeShmptr(pDb, iChunk*CHUNK_SIZE, pRc); + pHdr->iLastTree = pDb->treehdr.iTreeId; + pHdr->iNext = iNext; + + /* Advance to the next chunk */ + iWrite = iNext * CHUNK_SIZE + CHUNK_HDR; + } + + /* Allocate space at iWrite. */ + iRet = iWrite; + pDb->treehdr.iWrite = iWrite + nByte; + pDb->treehdr.nByte += nByte; + } + return iRet; +} + +/* +** Allocate and zero nByte bytes of space within the *-shm file. +*/ +static void *treeShmallocZero(lsm_db *pDb, int nByte, u32 *piPtr, int *pRc){ + u32 iPtr; + void *p; + iPtr = treeShmalloc(pDb, 1, nByte, pRc); + p = treeShmptr(pDb, iPtr, pRc); + if( p ){ + assert( *pRc==LSM_OK ); + memset(p, 0, nByte); + *piPtr = iPtr; + } + return p; +} + +static TreeNode *newTreeNode(lsm_db *pDb, u32 *piPtr, int *pRc){ + return treeShmallocZero(pDb, sizeof(TreeNode), piPtr, pRc); +} + +static TreeLeaf *newTreeLeaf(lsm_db *pDb, u32 *piPtr, int *pRc){ + return treeShmallocZero(pDb, sizeof(TreeLeaf), piPtr, pRc); +} + +static TreeKey *newTreeKey( + lsm_db *pDb, + u32 *piPtr, + void *pKey, int nKey, /* Key data */ + void *pVal, int nVal, /* Value data (or nVal<0 for delete) */ + int *pRc +){ + TreeKey *p; + u32 iPtr; + int nRem; + u8 *a; + int n; + +#if 0 + nRem = sizeof(TreeKey) + nKey + (nVal>0 ? 
nVal : 0); + *piPtr = iPtr = treeShmalloc(pDb, 1, nRem, pRc); + p = treeShmptr(pDb, iPtr, pRc); + if( *pRc ) return 0; + p->nKey = nKey; + p->nValue = nVal; + memcpy(&p[1], pKey, nKey); + if( nVal>0 ) memcpy(((u8 *)&p[1]) + nKey, pVal, nVal); + return p; +#endif + + /* Allocate space for the TreeKey structure itself */ + *piPtr = iPtr = treeShmalloc(pDb, 1, sizeof(TreeKey), pRc); + p = treeShmptr(pDb, iPtr, pRc); + if( *pRc ) return 0; + p->nKey = nKey; + p->nValue = nVal; + + /* Allocate and populate the space required for the key and value. */ + n = nRem = nKey; + a = (u8 *)pKey; + while( a ){ + while( nRem>0 ){ + u8 *aAlloc; + int nAlloc; + u32 iWrite; + + iWrite = (pDb->treehdr.iWrite & (LSM_SHM_CHUNK_SIZE-1)); + iWrite = LSM_MAX(iWrite, LSM_SHM_CHUNK_HDR); + nAlloc = LSM_MIN((LSM_SHM_CHUNK_SIZE-iWrite), nRem); + + aAlloc = treeShmptr(pDb, treeShmalloc(pDb, 0, nAlloc, pRc), pRc); + if( aAlloc==0 ) break; + memcpy(aAlloc, &a[n-nRem], nAlloc); + nRem -= nAlloc; + } + a = pVal; + n = nRem = nVal; + pVal = 0; + } + + if( *pRc ) return 0; +#if 0 + printf("store: %d %s\n", (int)iPtr, (char *)pKey); +#endif + return p; +} + +static TreeNode *copyTreeNode( + lsm_db *pDb, + TreeNode *pOld, + u32 *piNew, + int *pRc +){ + TreeNode *pNew; + + pNew = newTreeNode(pDb, piNew, pRc); + if( pNew ){ + memcpy(pNew->aiKeyPtr, pOld->aiKeyPtr, sizeof(pNew->aiKeyPtr)); + memcpy(pNew->aiChildPtr, pOld->aiChildPtr, sizeof(pNew->aiChildPtr)); + if( pOld->iV2 ) pNew->aiChildPtr[pOld->iV2Child] = pOld->iV2Ptr; + } + return pNew; +} + +static TreeNode *copyTreeLeaf( + lsm_db *pDb, + TreeLeaf *pOld, + u32 *piNew, + int *pRc +){ + TreeLeaf *pNew; + pNew = newTreeLeaf(pDb, piNew, pRc); + if( pNew ){ + memcpy(pNew, pOld, sizeof(TreeLeaf)); + } + return (TreeNode *)pNew; +} /* ** The tree cursor passed as the second argument currently points to an ** internal node (not a leaf). Specifically, to a sub-tree pointer. 
This ** function replaces the sub-tree that the cursor currently points to @@ -485,15 +700,15 @@ ** ** The sub-tree may be replaced either by writing the "v2 data" on the ** internal node, or by allocating a new TreeNode structure and then ** calling this function on the parent of the internal node. */ -static int treeUpdatePtr(Tree *pTree, TreeCursor *pCsr, TreeNode *pNew){ +static int treeUpdatePtr(lsm_db *pDb, TreeCursor *pCsr, u32 iNew){ int rc = LSM_OK; if( pCsr->iNode<0 ){ - /* pNew is the new root node */ - pTree->pWorking->pRoot = pNew; + /* iNew is the new root node */ + pDb->treehdr.iRoot = iNew; }else{ /* If this node already has version 2 content, allocate a copy and ** update the copy with the new pointer value. Otherwise, store the ** new pointer as v2 data within the current node structure. */ @@ -503,30 +718,39 @@ p = pCsr->apTreeNode[pCsr->iNode]; iChildPtr = pCsr->aiCell[pCsr->iNode]; if( p->iV2 ){ /* The "allocate new TreeNode" option */ - TreeNode *pCopy = copyTreeNode(pCsr->pDb->pEnv, pTree, p); + u32 iCopy; + TreeNode *pCopy; + pCopy = copyTreeNode(pDb, p, &iCopy, &rc); if( pCopy ){ - pCopy->apChild[iChildPtr] = pNew; + assert( rc==LSM_OK ); + pCopy->aiChildPtr[iChildPtr] = iNew; pCsr->iNode--; - rc = treeUpdatePtr(pTree, pCsr, pCopy); - }else{ - rc = LSM_NOMEM_BKPT; + rc = treeUpdatePtr(pDb, pCsr, iCopy); } }else{ /* The "v2 data" option */ - p->iV2 = pTree->pWorking->iVersion; - p->iV2Ptr = (u8)iChildPtr; - p->pV2Ptr = (void *)pNew; - if( pTree->pRbLast ){ - pTree->pRbLast->pNext = p; + u32 iPtr; + assert( pDb->treehdr.iTransId>0 ); + + if( pCsr->iNode ){ + iPtr = getChildPtr( + pCsr->apTreeNode[pCsr->iNode-1], + pDb->treehdr.iTransId, pCsr->aiCell[pCsr->iNode-1] + ); }else{ - pTree->pRbFirst = p; + iPtr = pDb->treehdr.iRoot; } - pTree->pRbLast = p; - assert( pTree->pRbLast->pNext==0 ); + rc = intArrayAppend(pDb->pEnv, &pDb->rollback, iPtr); + + if( rc==LSM_OK ){ + p->iV2 = pDb->treehdr.iTransId; + p->iV2Child = (u8)iChildPtr; + p->iV2Ptr = 
iNew; + } } } return rc; } @@ -542,188 +766,210 @@ ** ** Pointer pLeftPtr points to a child tree that contains keys that are ** smaller than pTreeKey. */ static int treeInsert( - lsm_env *pEnv, - Tree *pTree, + lsm_db *pDb, /* Database handle */ TreeCursor *pCsr, /* Cursor indicating path to insert at */ - TreeNode *pLeftPtr, /* New child pointer (or NULL for leaves) */ - TreeKey *pTreeKey, /* New key to insert */ - TreeNode *pRightPtr, /* New child pointer (or NULL for leaves) */ + u32 iLeftPtr, /* Left child pointer */ + u32 iTreeKey, /* Location of key to insert */ + u32 iRightPtr, /* Right child pointer */ int iSlot /* Position to insert key into */ ){ int rc = LSM_OK; TreeNode *pNode = pCsr->apTreeNode[pCsr->iNode]; - /* Check if the leaf is currently full. If so, allocate a sibling node. */ - if( pNode->apKey[0] && pNode->apKey[2] ){ - TreeNode *pLeft; /* New sibling node. */ - TreeNode *pRight; /* Sibling of pLeft (either new or pNode) */ + /* Check if the node is currently full. If so, split pNode in two and + ** call this function recursively to add a key to the parent. Otherwise, + ** insert the new key directly into pNode. */ + assert( pNode->aiKeyPtr[1] ); + if( pNode->aiKeyPtr[0] && pNode->aiKeyPtr[2] ){ + u32 iLeft; TreeNode *pLeft; /* New left-hand sibling node */ + u32 iRight; TreeNode *pRight; /* New right-hand sibling node */ - pLeft = newTreeNode(pEnv, pTree); - pRight = newTreeNode(pEnv, pTree); + pLeft = newTreeNode(pDb, &iLeft, &rc); + pRight = newTreeNode(pDb, &iRight, &rc); + if( rc ) return rc; + + pLeft->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 0); + pLeft->aiKeyPtr[1] = pNode->aiKeyPtr[0]; + pLeft->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 1); + + pRight->aiChildPtr[1] = getChildPtr(pNode, WORKING_VERSION, 2); + pRight->aiKeyPtr[1] = pNode->aiKeyPtr[2]; + pRight->aiChildPtr[2] = getChildPtr(pNode, WORKING_VERSION, 3); if( pCsr->iNode==0 ){ /* pNode is the root of the tree. Grow the tree by one level. 
*/ - TreeNode *pRoot; /* New root node */ - - pRoot = newTreeNode(pEnv, pTree); - - pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0); - pLeft->apKey[1] = pNode->apKey[0]; - pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1); - - pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2); - pRight->apKey[1] = pNode->apKey[2]; - pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3); - - pRoot->apKey[1] = pNode->apKey[1]; - pRoot->apChild[1] = pLeft; - pRoot->apChild[2] = pRight; - - pTree->pWorking->pRoot = pRoot; - pTree->pWorking->nHeight++; - }else{ - TreeKey *pParentKey; /* Key to insert into parent node */ - pParentKey = pNode->apKey[1]; - - pLeft->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 0); - pLeft->apKey[1] = pNode->apKey[0]; - pLeft->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 1); - - pRight->apChild[1] = getChildPtr(pNode, WORKING_VERSION, 2); - pRight->apKey[1] = pNode->apKey[2]; - pRight->apChild[2] = getChildPtr(pNode, WORKING_VERSION, 3); + u32 iRoot; TreeNode *pRoot; /* New root node */ + + pRoot = newTreeNode(pDb, &iRoot, &rc); + pRoot->aiKeyPtr[1] = pNode->aiKeyPtr[1]; + pRoot->aiChildPtr[1] = iLeft; + pRoot->aiChildPtr[2] = iRight; + + pDb->treehdr.iRoot = iRoot; + pDb->treehdr.nHeight++; + }else{ pCsr->iNode--; - treeInsert(pEnv, - pTree, pCsr, pLeft, pParentKey, pRight, pCsr->aiCell[pCsr->iNode] + rc = treeInsert(pDb, pCsr, + iLeft, pNode->aiKeyPtr[1], iRight, pCsr->aiCell[pCsr->iNode] ); } assert( pLeft->iV2==0 ); assert( pRight->iV2==0 ); switch( iSlot ){ case 0: - pLeft->apKey[0] = pTreeKey; - pLeft->apChild[0] = pLeftPtr; - if( pRightPtr ) pLeft->apChild[1] = pRightPtr; + pLeft->aiKeyPtr[0] = iTreeKey; + pLeft->aiChildPtr[0] = iLeftPtr; + if( iRightPtr ) pLeft->aiChildPtr[1] = iRightPtr; break; case 1: - pLeft->apChild[3] = (pRightPtr ? pRightPtr : pLeft->apChild[2]); - pLeft->apKey[2] = pTreeKey; - pLeft->apChild[2] = pLeftPtr; + pLeft->aiChildPtr[3] = (iRightPtr ? 
iRightPtr : pLeft->aiChildPtr[2]); + pLeft->aiKeyPtr[2] = iTreeKey; + pLeft->aiChildPtr[2] = iLeftPtr; break; case 2: - pRight->apKey[0] = pTreeKey; - pRight->apChild[0] = pLeftPtr; - if( pRightPtr ) pRight->apChild[1] = pRightPtr; + pRight->aiKeyPtr[0] = iTreeKey; + pRight->aiChildPtr[0] = iLeftPtr; + if( iRightPtr ) pRight->aiChildPtr[1] = iRightPtr; break; case 3: - pRight->apChild[3] = (pRightPtr ? pRightPtr : pRight->apChild[2]); - pRight->apKey[2] = pTreeKey; - pRight->apChild[2] = pLeftPtr; + pRight->aiChildPtr[3] = (iRightPtr ? iRightPtr : pRight->aiChildPtr[2]); + pRight->aiKeyPtr[2] = iTreeKey; + pRight->aiChildPtr[2] = iLeftPtr; break; } }else{ TreeNode *pNew; - TreeKey **pOut; - TreeNode **pPtr; + u32 *piKey; + u32 *piChild; + u32 iStore = 0; + u32 iNew = 0; int i; - pNew = newTreeNode(pEnv, pTree); - if( pNew ){ - TreeNode *pStore = 0; - pOut = pNew->apKey; - pPtr = pNew->apChild; - - for(i=0; iapKey[i] ){ - *(pOut++) = pNode->apKey[i]; - *(pPtr++) = getChildPtr(pNode, WORKING_VERSION, i); - } - } - - *pOut++ = pTreeKey; - *pPtr++ = pLeftPtr; - - pStore = pRightPtr; - for(i=iSlot; i<3; i++){ - if( pNode->apKey[i] ){ - *(pOut++) = pNode->apKey[i]; - *(pPtr++) = pStore ? pStore : getChildPtr(pNode, WORKING_VERSION, i); - pStore = 0; - } - } - if( pStore ){ - *pPtr = pStore; - }else{ - *pPtr = getChildPtr(pNode, WORKING_VERSION, (pNode->apKey[2] ? 3 : 2)); - } - - pCsr->iNode--; - rc = treeUpdatePtr(pTree, pCsr, pNew); - }else{ - rc = LSM_NOMEM_BKPT; - } + /* Allocate a new version of node pNode. */ + pNew = newTreeNode(pDb, &iNew, &rc); + if( rc ) return rc; + + piKey = pNew->aiKeyPtr; + piChild = pNew->aiChildPtr; + + for(i=0; iaiKeyPtr[i] ){ + *(piKey++) = pNode->aiKeyPtr[i]; + *(piChild++) = getChildPtr(pNode, WORKING_VERSION, i); + } + } + + *piKey++ = iTreeKey; + *piChild++ = iLeftPtr; + + iStore = iRightPtr; + for(i=iSlot; i<3; i++){ + if( pNode->aiKeyPtr[i] ){ + *(piKey++) = pNode->aiKeyPtr[i]; + *(piChild++) = iStore ? 
iStore : getChildPtr(pNode, WORKING_VERSION, i); + iStore = 0; + } + } + + if( iStore ){ + *piChild = iStore; + }else{ + *piChild = getChildPtr(pNode, WORKING_VERSION, + (pNode->aiKeyPtr[2] ? 3 : 2) + ); + } + pCsr->iNode--; + rc = treeUpdatePtr(pDb, pCsr, iNew); } return rc; } static int treeInsertLeaf( - lsm_env *pEnv, - Tree *pTree, /* Tree structure */ + lsm_db *pDb, /* Database handle */ TreeCursor *pCsr, /* Cursor structure */ - TreeKey *pTreeKey, /* Key to insert */ + u32 iTreeKey, /* Key pointer to insert */ int iSlot /* Insert key to the left of this */ ){ - int rc; /* Return code */ + int rc = LSM_OK; /* Return code */ TreeNode *pLeaf = pCsr->apTreeNode[pCsr->iNode]; - TreeNode *pNew; + TreeLeaf *pNew; + u32 iNew; assert( iSlot>=0 && iSlot<=4 ); assert( pCsr->iNode>0 ); - assert( pLeaf->apKey[1] ); + assert( pLeaf->aiKeyPtr[1] ); pCsr->iNode--; - pNew = newTreeLeaf(pEnv, pTree); - if( !pNew ){ - rc = LSM_NOMEM_BKPT; - }else if( pLeaf->apKey[0] && pLeaf->apKey[2] ){ - TreeNode *pRight; - - pRight = newTreeLeaf(pEnv, pTree); - if( pRight==0 ){ - rc = LSM_NOMEM_BKPT; - }else{ - pNew->apKey[1] = pLeaf->apKey[0]; - pRight->apKey[1] = pLeaf->apKey[2]; - switch( iSlot ){ - case 0: pNew->apKey[0] = pTreeKey; break; - case 1: pNew->apKey[2] = pTreeKey; break; - case 2: pRight->apKey[0] = pTreeKey; break; - case 3: pRight->apKey[2] = pTreeKey; break; - } - rc = treeInsert(pEnv, pTree, pCsr, pNew, pLeaf->apKey[1], pRight, - pCsr->aiCell[pCsr->iNode] - ); - } - }else{ - int iOut = 0; - int i; - for(i=0; i<4; i++){ - if( i==iSlot ) pNew->apKey[iOut++] = pTreeKey; - if( i<3 && pLeaf->apKey[i] ) pNew->apKey[iOut++] = pLeaf->apKey[i]; - } - rc = treeUpdatePtr(pTree, pCsr, pNew); - } - - return rc; + pNew = newTreeLeaf(pDb, &iNew, &rc); + if( pNew ){ + if( pLeaf->aiKeyPtr[0] && pLeaf->aiKeyPtr[2] ){ + /* The leaf is full. Split it in two. 
*/ + TreeLeaf *pRight; + u32 iRight; + pRight = newTreeLeaf(pDb, &iRight, &rc); + if( pRight ){ + assert( rc==LSM_OK ); + pNew->aiKeyPtr[1] = pLeaf->aiKeyPtr[0]; + pRight->aiKeyPtr[1] = pLeaf->aiKeyPtr[2]; + switch( iSlot ){ + case 0: pNew->aiKeyPtr[0] = iTreeKey; break; + case 1: pNew->aiKeyPtr[2] = iTreeKey; break; + case 2: pRight->aiKeyPtr[0] = iTreeKey; break; + case 3: pRight->aiKeyPtr[2] = iTreeKey; break; + } + + rc = treeInsert(pDb, pCsr, iNew, pLeaf->aiKeyPtr[1], iRight, + pCsr->aiCell[pCsr->iNode] + ); + } + }else{ + int iOut = 0; + int i; + for(i=0; i<4; i++){ + if( i==iSlot ) pNew->aiKeyPtr[iOut++] = iTreeKey; + if( i<3 && pLeaf->aiKeyPtr[i] ){ + pNew->aiKeyPtr[iOut++] = pLeaf->aiKeyPtr[i]; + } + } + rc = treeUpdatePtr(pDb, pCsr, iNew); + } + } + + return rc; +} + +/* +** Empty the contents of the in-memory tree. +*/ +void lsmTreeClear(lsm_db *pDb){ + pDb->treehdr.iTreeId++; + pDb->treehdr.iTransId = 1; + pDb->treehdr.iRoot = 0; + pDb->treehdr.nHeight = 0; + pDb->treehdr.nByte = 0; +} + +/* +** This function is called during recovery to initialize the +** tree header. Only the database connections private copy of the tree-header +** is initialized here - it will be copied into shared memory if log file +** recovery is successful. +*/ +void lsmTreeInit(lsm_db *pDb){ + pDb->treehdr.iTransId = 1; + pDb->treehdr.iFirst = 1; + pDb->treehdr.nChunk = 2; + pDb->treehdr.iWrite = LSM_SHM_CHUNK_SIZE + LSM_SHM_CHUNK_HDR; + pDb->treehdr.iTreeId = 1; } /* ** Insert a new entry into the in-memory tree. 
** @@ -736,53 +982,38 @@ void *pKey, /* Pointer to key data */ int nKey, /* Size of key data in bytes */ void *pVal, /* Pointer to value data (or NULL) */ int nVal /* Bytes in value data (or -ve for delete) */ ){ - lsm_env *pEnv = pDb->pEnv; - TreeVersion *pTV = pDb->pTV; - Tree *pTree = pTV->pTree; int rc = LSM_OK; /* Return Code */ TreeKey *pTreeKey; /* New key-value being inserted */ int nTreeKey; /* Number of bytes allocated at pTreeKey */ + u32 iTreeKey; + u8 *a; + TreeHeader *pHdr = &pDb->treehdr; assert( nVal>=0 || pVal==0 ); - assert( pTV==pTree->pWorking ); assert_tree_looks_ok(LSM_OK, pTree); - /* dump_tree_contents(pTree, "before"); */ +#if 0 + dump_tree_contents(pDb, "before"); +#endif /* Allocate and populate a new key-value pair structure */ - nTreeKey = sizeof(TreeKey) + nKey + (nVal>0 ? nVal : 0); - pTreeKey = (TreeKey *)lsmPoolMalloc(pDb->pEnv, pTree->pPool, nTreeKey); - if( !pTreeKey ) return LSM_NOMEM_BKPT; - pTreeKey->pKey = (void *)&pTreeKey[1]; - memcpy(pTreeKey->pKey, pKey, nKey); - if( nVal>0 ){ - pTreeKey->pValue = (void *)&((u8 *)(pTreeKey->pKey))[nKey]; - memcpy(pTreeKey->pValue, pVal, nVal); - }else{ - pTreeKey->pValue = 0; - } - pTreeKey->nValue = nVal; - pTreeKey->nKey = nKey; - - if( pTree->pWorking->pRoot==0 ){ + pTreeKey = newTreeKey(pDb, &iTreeKey, pKey, nKey, pVal, nVal, &rc); + if( rc!=LSM_OK ) return rc; + + if( pHdr->iRoot==0 ){ /* The tree is completely empty. Add a new root node and install ** (pKey/nKey) as the middle entry. Even though it is a leaf at the ** moment, use newTreeNode() to allocate the node (i.e. allocate enough ** space for the fields used by interior nodes). This is because the - ** treeInsert() routine may convert this node to an interior node. 
- */ - TreeNode *pRoot; /* New tree root node */ - pRoot = newTreeNode(pEnv, pTree); - if( !pRoot ){ - rc = LSM_NOMEM_BKPT; - }else{ - pRoot->apKey[1] = pTreeKey; - pTree->pWorking->pRoot = pRoot; - assert( pTree->pWorking->nHeight==0 ); - pTree->pWorking->nHeight = 1; + ** treeInsert() routine may convert this node to an interior node. */ + TreeNode *pRoot = newTreeNode(pDb, &pHdr->iRoot, &rc); + if( rc==LSM_OK ){ + assert( pHdr->nHeight==0 ); + pRoot->aiKeyPtr[1] = iTreeKey; + pHdr->nHeight = 1; } }else{ TreeCursor csr; int res; @@ -791,27 +1022,30 @@ lsmTreeCursorSeek(&csr, pKey, nKey, &res); if( res==0 ){ /* The search found a match within the tree. */ TreeNode *pNew; + u32 iNew; TreeNode *pNode = csr.apTreeNode[csr.iNode]; int iCell = csr.aiCell[csr.iNode]; /* Create a copy of this node */ - if( (csr.iNode>0 && csr.iNode==(pTree->pWorking->nHeight-1)) ){ - pNew = copyTreeLeaf(pEnv, pTree, pNode); + if( (csr.iNode>0 && csr.iNode==(pHdr->nHeight-1)) ){ + pNew = copyTreeLeaf(pDb, (TreeLeaf *)pNode, &iNew, &rc); }else{ - pNew = copyTreeNode(pEnv, pTree, pNode); + pNew = copyTreeNode(pDb, pNode, &iNew, &rc); } - /* Modify the value in the new version */ - pNew->apKey[iCell] = pTreeKey; + if( rc==LSM_OK ){ + /* Modify the value in the new version */ + pNew->aiKeyPtr[iCell] = iTreeKey; - /* Change the pointer in the parent (if any) to point at the new - ** TreeNode */ - csr.iNode--; - treeUpdatePtr(pTree, &csr, pNew); + /* Change the pointer in the parent (if any) to point at the new + ** TreeNode */ + csr.iNode--; + treeUpdatePtr(pDb, &csr, iNew); + } }else{ /* The cursor now points to the leaf node into which the new entry should ** be inserted. There may or may not be a free slot within the leaf for ** the new key-value pair. ** @@ -820,39 +1054,31 @@ ** index of the rightmost key if the new key is larger than all keys ** currently stored in the node). 
*/ int iSlot = csr.aiCell[csr.iNode] + (res<0); if( csr.iNode==0 ){ - rc = treeInsert(pEnv, pTree, &csr, 0, pTreeKey, 0, iSlot); + rc = treeInsert(pDb, &csr, 0, iTreeKey, 0, iSlot); }else{ - rc = treeInsertLeaf(pEnv, pTree, &csr, pTreeKey, iSlot); + rc = treeInsertLeaf(pDb, &csr, iTreeKey, iSlot); } } + tblobFree(pDb, &csr.blob); } - /* dump_tree_contents(pTree, "after"); */ +#if 0 + dump_tree_contents(pDb, "after"); +#endif assert_tree_looks_ok(rc, pTree); return rc; } /* ** Return, in bytes, the amount of memory currently used by the tree ** structure. */ -int lsmTreeSize(TreeVersion *pTV){ - return (lsmPoolUsed(pTV->pTree->pPool) - ROUND8(sizeof(Tree))); -} - -/* -** Return true if the tree is empty. Otherwise false. -** -** The caller is responsible for ensuring that it has exclusive access -** to the Tree structure for this call. -*/ -int lsmTreeIsEmpty(Tree *pTree){ - assert( pTree==0 || pTree->pWorking==0 ); - return (pTree==0 || pTree->pCommit->pRoot==0); +int lsmTreeSize(lsm_db *pDb){ + return pDb->treehdr.nByte; } /* ** Open a cursor on the in-memory tree pTree. */ @@ -869,10 +1095,11 @@ /* ** Close an in-memory tree cursor. */ void lsmTreeCursorDestroy(TreeCursor *pCsr){ if( pCsr ){ + tblobFree(pCsr->pDb, &pCsr->blob); lsmFree(pCsr->pDb->pEnv, pCsr); } } void lsmTreeCursorReset(TreeCursor *pCsr){ @@ -881,21 +1108,20 @@ } #ifndef NDEBUG static int treeCsrCompare(TreeCursor *pCsr, void *pKey, int nKey){ TreeKey *p; - int cmp; + int cmp = 0; + int rc = LSM_OK; assert( pCsr->iNode>=0 ); - p = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; - cmp = memcmp(p->pKey, pKey, LSM_MIN(p->nKey, nKey)); - if( cmp==0 ){ - cmp = p->nKey - nKey; + p = csrGetKey(pCsr, &pCsr->blob, &rc); + if( p ){ + cmp = pCsr->pDb->xCmp(TK_KEY(p), p->nKey, pKey, nKey); } return cmp; } #endif - /* ** Attempt to seek the cursor passed as the first argument to key (pKey/nKey) ** in the tree structure. 
If an exact match for the key is found, leave the @@ -909,87 +1135,99 @@ ** is smaller than the key and set *pRes to -1, or ** ** * If the tree is empty, leave the cursor at EOF and set *pRes to -1. */ int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes){ - TreeVersion *p = pCsr->pDb->pTV; - int (*xCmp)(void *, int, void *, int) = p->pTree->xCmp; - TreeNode *pNode = p->pRoot; /* Current node in search */ + int rc = LSM_OK; /* Return code */ + lsm_db *pDb = pCsr->pDb; + TreeHeader *pHdr = &pCsr->pDb->treehdr; + int (*xCmp)(void *, int, void *, int) = pDb->xCmp; + + u32 iNodePtr; /* Location of current node in search */ /* Discard any saved position data */ treeCursorRestore(pCsr, 0); - if( pNode==0 ){ - /* A special case - the tree is completely empty. */ + iNodePtr = pDb->treehdr.iRoot; + if( iNodePtr==0 ){ + /* Either an error occurred or the tree is completely empty. */ + assert( rc!=LSM_OK || pDb->treehdr.iRoot==0 ); *pRes = -1; pCsr->iNode = -1; }else{ + TreeBlob b = {0, 0}; int res = 0; /* Result of comparison function */ int iNode = -1; - while( pNode ){ + while( iNodePtr ){ + TreeNode *pNode; /* Node at location iNodePtr */ int iTest; /* Index of second key to test (0 or 2) */ TreeKey *pTreeKey; /* Key to compare against */ + pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); iNode++; pCsr->apTreeNode[iNode] = pNode; /* Compare (pKey/nKey) with the key in the middle slot of B-tree node ** pNode. The middle slot is never empty. If the comparison is a match, ** then the search is finished. Break out of the loop. 
*/ - pTreeKey = pNode->apKey[1]; - res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey); + pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[1], TK_LOADKEY, &b, &rc); + if( rc!=LSM_OK ) break; + res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); if( res==0 ){ pCsr->aiCell[iNode] = 1; break; } /* Based on the results of the previous comparison, compare (pKey/nKey) ** to either the left or right key of the B-tree node, if such a key ** exists. */ iTest = (res>0 ? 0 : 2); - pTreeKey = pNode->apKey[iTest]; + pTreeKey = treeShmkey(pDb, pNode->aiKeyPtr[iTest], TK_LOADKEY, &b, &rc); + if( rc ) break; if( pTreeKey==0 ){ iTest = 1; }else{ - res = xCmp(pTreeKey->pKey, pTreeKey->nKey, pKey, nKey); + res = xCmp((void *)&pTreeKey[1], pTreeKey->nKey, pKey, nKey); if( res==0 ){ pCsr->aiCell[iNode] = iTest; break; } } - if( iNode<(p->nHeight-1) ){ - pNode = getChildPtr(pNode, p->iVersion, iTest + (res<0)); + if( iNode<(pHdr->nHeight-1) ){ + iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iTest + (res<0)); }else{ - pNode = 0; + iNodePtr = 0; } - pCsr->aiCell[iNode] = iTest + (pNode && (res<0)); + pCsr->aiCell[iNode] = iTest + (iNodePtr && (res<0)); } *pRes = res; pCsr->iNode = iNode; + tblobFree(pDb, &b); } /* assert() that *pRes has been set properly */ #ifndef NDEBUG - if( lsmTreeCursorValid(pCsr) ){ + if( rc==LSM_OK && lsmTreeCursorValid(pCsr) ){ int cmp = treeCsrCompare(pCsr, pKey, nKey); assert( *pRes==cmp || (*pRes ^ cmp)>0 ); } #endif - return LSM_OK; + return rc; } int lsmTreeCursorNext(TreeCursor *pCsr){ #ifndef NDEBUG TreeKey *pK1; + TreeBlob key1 = {0, 0}; #endif - - TreeVersion *p = pCsr->pDb->pTV; - const int iLeaf = p->nHeight-1; + lsm_db *pDb = pCsr->pDb; + const int iLeaf = pDb->treehdr.nHeight-1; int iCell; + int rc = LSM_OK; TreeNode *pNode; /* Restore the cursor position, if required */ int iRestore = 0; treeCursorRestore(pCsr, &iRestore); @@ -997,11 +1235,12 @@ /* Save a pointer to the current key. 
This is used in an assert() at the ** end of this function - to check that the 'next' key really is larger ** than the current key. */ #ifndef NDEBUG - pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; + pK1 = csrGetKey(pCsr, &key1, &rc); + if( rc!=LSM_OK ) return rc; #endif assert( lsmTreeCursorValid(pCsr) ); assert( pCsr->aiCell[pCsr->iNode]<3 ); @@ -1009,49 +1248,51 @@ iCell = ++pCsr->aiCell[pCsr->iNode]; /* If the current node is not a leaf, and the current cell has sub-tree ** associated with it, descend to the left-most key on the left-most ** leaf of the sub-tree. */ - if( pCsr->iNodeiVersion, iCell) ){ + if( pCsr->iNodetreehdr.iTransId, iCell) ){ do { + u32 iNodePtr; pCsr->iNode++; - pNode = getChildPtr(pNode, p->iVersion, iCell); + iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell); + pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); pCsr->apTreeNode[pCsr->iNode] = pNode; - iCell = pCsr->aiCell[pCsr->iNode] = (pNode->apKey[0]==0); + iCell = pCsr->aiCell[pCsr->iNode] = (pNode->aiKeyPtr[0]==0); }while( pCsr->iNode < iLeaf ); } /* Otherwise, the next key is found by following pointer up the tree ** until there is a key immediately to the right of the pointer followed ** to reach the sub-tree containing the current key. 
*/ - else if( iCell>=3 || pNode->apKey[iCell]==0 ){ + else if( iCell>=3 || pNode->aiKeyPtr[iCell]==0 ){ while( (--pCsr->iNode)>=0 ){ iCell = pCsr->aiCell[pCsr->iNode]; - if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break; + if( iCell<3 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; } } #ifndef NDEBUG if( pCsr->iNode>=0 ){ - TreeKey *pK2; - int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; - pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; - assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)>0 ); + TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); + assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)>0 ); } + tblobFree(pDb, &key1); #endif - return LSM_OK; + return rc; } int lsmTreeCursorPrev(TreeCursor *pCsr){ #ifndef NDEBUG TreeKey *pK1; + TreeBlob key1 = {0, 0}; #endif - - TreeVersion *p = pCsr->pDb->pTV; - const int iLeaf = p->nHeight-1; + lsm_db *pDb = pCsr->pDb; + const int iLeaf = pDb->treehdr.nHeight-1; int iCell; + int rc = LSM_OK; TreeNode *pNode; /* Restore the cursor position, if required */ int iRestore = 0; treeCursorRestore(pCsr, &iRestore); @@ -1059,11 +1300,12 @@ /* Save a pointer to the current key. This is used in an assert() at the ** end of this function - to check that the 'next' key really is smaller ** than the current key. */ #ifndef NDEBUG - pK1 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; + pK1 = csrGetKey(pCsr, &key1, &rc); + if( rc!=LSM_OK ) return rc; #endif assert( lsmTreeCursorValid(pCsr) ); pNode = pCsr->apTreeNode[pCsr->iNode]; iCell = pCsr->aiCell[pCsr->iNode]; @@ -1070,16 +1312,19 @@ assert( iCell>=0 && iCell<3 ); /* If the current node is not a leaf, and the current cell has sub-tree ** associated with it, descend to the right-most key on the right-most ** leaf of the sub-tree. 
*/ - if( pCsr->iNodeiVersion, iCell) ){ + if( pCsr->iNodetreehdr.iTransId, iCell) ){ do { + u32 iNodePtr; pCsr->iNode++; - pNode = getChildPtr(pNode, p->iVersion, iCell); + iNodePtr = getChildPtr(pNode, pDb->treehdr.iTransId, iCell); + pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); + if( rc!=LSM_OK ) break; pCsr->apTreeNode[pCsr->iNode] = pNode; - iCell = 1 + (pNode->apKey[2]!=0) + (pCsr->iNode < iLeaf); + iCell = 1 + (pNode->aiKeyPtr[2]!=0) + (pCsr->iNode < iLeaf); pCsr->aiCell[pCsr->iNode] = iCell; }while( pCsr->iNode < iLeaf ); } /* Otherwise, the next key is found by following pointer up the tree until @@ -1086,89 +1331,107 @@ ** there is a key immediately to the left of the pointer followed to reach ** the sub-tree containing the current key. */ else{ do { iCell = pCsr->aiCell[pCsr->iNode]-1; - if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->apKey[iCell] ) break; + if( iCell>=0 && pCsr->apTreeNode[pCsr->iNode]->aiKeyPtr[iCell] ) break; }while( (--pCsr->iNode)>=0 ); pCsr->aiCell[pCsr->iNode] = iCell; } #ifndef NDEBUG if( pCsr->iNode>=0 ){ - TreeKey *pK2; - int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; - pK2 = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; - assert( xCmp(pK2->pKey, pK2->nKey, pK1->pKey, pK1->nKey)<0 ); + TreeKey *pK2 = csrGetKey(pCsr, &pCsr->blob, &rc); + assert( rc || pDb->xCmp(TK_KEY(pK2), pK2->nKey, TK_KEY(pK1), pK1->nKey)<0 ); } + tblobFree(pDb, &key1); #endif - return LSM_OK; + return rc; } /* ** Move the cursor to the first (bLast==0) or last (bLast!=0) entry in the ** in-memory tree. 
*/ int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast){ - TreeVersion *p = pCsr->pDb->pTV; - TreeNode *pNode = p->pRoot; + lsm_db *pDb = pCsr->pDb; + TreeHeader *pHdr = &pDb->treehdr; + int rc = LSM_OK; + + u32 iNodePtr; pCsr->iNode = -1; /* Discard any saved position data */ treeCursorRestore(pCsr, 0); - while( pNode ){ + iNodePtr = pHdr->iRoot; + while( iNodePtr ){ int iCell; + TreeNode *pNode; + + pNode = (TreeNode *)treeShmptr(pDb, iNodePtr, &rc); + if( rc ) break; + if( bLast ){ - iCell = ((pNode->apKey[2]==0) ? 2 : 3); + iCell = ((pNode->aiKeyPtr[2]==0) ? 2 : 3); }else{ - iCell = ((pNode->apKey[0]==0) ? 1 : 0); + iCell = ((pNode->aiKeyPtr[0]==0) ? 1 : 0); } - pCsr->iNode++; pCsr->apTreeNode[pCsr->iNode] = pNode; - if( pCsr->iNodenHeight-1 ){ - pNode = getChildPtr(pNode, p->iVersion, iCell); - }else{ - pNode = 0; - } - pCsr->aiCell[pCsr->iNode] = iCell - (pNode==0 && bLast); - } - return LSM_OK; + if( pCsr->iNodenHeight-1 ){ + iNodePtr = getChildPtr(pNode, pHdr->iTransId, iCell); + }else{ + iNodePtr = 0; + } + pCsr->aiCell[pCsr->iNode] = iCell - (iNodePtr==0 && bLast); + } + + return rc; } int lsmTreeCursorKey(TreeCursor *pCsr, void **ppKey, int *pnKey){ TreeKey *pTreeKey; + int rc = LSM_OK; + assert( lsmTreeCursorValid(pCsr) ); pTreeKey = pCsr->pSave; if( !pTreeKey ){ - pTreeKey = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; + pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); + } + if( rc==LSM_OK ){ + *pnKey = pTreeKey->nKey; + *ppKey = (void *)&pTreeKey[1]; } - *ppKey = pTreeKey->pKey; - *pnKey = pTreeKey->nKey; - return LSM_OK; + return rc; } int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal){ - TreeKey *pTreeKey; int res = 0; + int rc; - treeCursorRestore(pCsr, &res); + rc = treeCursorRestore(pCsr, &res); if( res==0 ){ - pTreeKey = pCsr->apTreeNode[pCsr->iNode]->apKey[pCsr->aiCell[pCsr->iNode]]; - *ppVal = pTreeKey->pValue; - *pnVal = pTreeKey->nValue; + TreeKey *pTreeKey = csrGetKey(pCsr, &pCsr->blob, &rc); + if( rc==LSM_OK 
){ + *pnVal = pTreeKey->nValue; + if( pTreeKey->nValue>=0 ){ + *ppVal = TK_VAL(pTreeKey); + }else{ + *ppVal = 0; + } + } }else{ *ppVal = 0; *pnVal = 0; } - return LSM_OK; + return rc; } /* ** Return true if the cursor currently points to a valid entry. */ @@ -1175,194 +1438,159 @@ int lsmTreeCursorValid(TreeCursor *pCsr){ return (pCsr && (pCsr->pSave || pCsr->iNode>=0)); } /* -** Roll back to mark pMark. Structure *pMark should have been previously -** populated by a call to lsmTreeMark(). -*/ -void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){ - TreeVersion *pWorking = pDb->pTV; - Tree *pTree = pWorking->pTree; - TreeNode *p; - - assert( lsmTreeIsWriteVersion(pWorking) ); - - pWorking->pRoot = (TreeNode *)pMark->pRoot; - pWorking->nHeight = pMark->nHeight; - - if( pMark->pRollback ){ - p = ((TreeNode *)pMark->pRollback)->pNext; - }else{ - p = pTree->pRbFirst; - } - - while( p ){ - TreeNode *pNext = p->pNext; - assert( p->iV2!=0 ); - assert( pNext || p==pTree->pRbLast ); - p->iV2 = 0; - p->iV2Ptr = 0; - p->pV2Ptr = 0; - p->pNext = 0; - p = pNext; - } - - pTree->pRbLast = (TreeNode *)pMark->pRollback; - if( pTree->pRbLast ){ - pTree->pRbLast->pNext = 0; - }else{ - pTree->pRbFirst = 0; - } - - lsmPoolRollback(pDb->pEnv, pTree->pPool, pMark->pMpChunk, pMark->iMpOff); -} - -/* -** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a -** pointer to the same TreeMark structure may be used to roll the tree -** contents back to their current state. -*/ -void lsmTreeMark(TreeVersion *pTV, TreeMark *pMark){ - Tree *pTree = pTV->pTree; - memset(pMark, 0, sizeof(TreeMark)); - pMark->pRoot = (void *)pTV->pRoot; - pMark->nHeight = pTV->nHeight; - pMark->pRollback = (void *)pTree->pRbLast; - lsmPoolMark(pTree->pPool, &pMark->pMpChunk, &pMark->iMpOff); - - assert( lsmTreeIsWriteVersion(pTV) ); - pTV->iVersion++; -} - -/* -** This is called when a client wishes to upgrade from a read to a write -** transaction. 
If the read-version passed as the second version is the -** most recent one, decrement its ref-count and return a pointer to -** the write-version object. Otherwise return null. So we can do: -** -** // Open read-transaction -** pReadVersion = lsmTreeReadVersion(pTree); -** -** // Later on, attempt to upgrade to write transaction -** if( pWriteVersion = lsmTreeWriteVersion(pTree, pReadVersion) ){ -** // Have upgraded to a write transaction! -** }else{ -** // Reading an out-of-date snapshot. Upgrade fails. -** } -** -** The caller must take care of rejecting a clients attempt to upgrade to -** a write transaction *while* another client has a write transaction -** underway. This mechanism merely prevents writing to an out-of-date -** snapshot. -*/ -int lsmTreeWriteVersion( - lsm_env *pEnv, - Tree *pTree, - TreeVersion **ppVersion -){ - TreeVersion *pRead = *ppVersion; - TreeVersion *pRet; - - /* The caller must ensure that no other write transaction is underway. */ - assert( pTree->pWorking==0 ); - - if( pRead && pTree->pCommit!=pRead ) return LSM_BUSY; - pRet = lsmMallocZero(pEnv, sizeof(TreeVersion)); - if( pRet==0 ) return LSM_NOMEM_BKPT; - pTree->pWorking = pRet; - - memcpy(pRet, pTree->pCommit, sizeof(TreeVersion)); - pRet->nRef = 1; - if( pRead ) pRead->nRef--; - *ppVersion = pRet; - assert( pRet->pTree==pTree ); - return LSM_OK; -} - -static void treeIncrRefcount(Tree *pTree){ - pTree->nTreeRef++; -} - -static void treeDecrRefcount(lsm_env *pEnv, Tree *pTree){ - assert( pTree->nTreeRef>0 ); - pTree->nTreeRef--; - if( pTree->nTreeRef==0 ){ - assert( pTree->pWorking==0 ); - treeDestroy(pEnv, pTree); - } -} - -/* -** Release a reference to the write-version. 
-*/ -int lsmTreeReleaseWriteVersion( - lsm_env *pEnv, - TreeVersion *pWorking, /* Write-version reference */ - int bCommit, /* True for a commit */ - TreeVersion **ppReadVersion /* OUT: Read-version reference */ -){ - Tree *pTree = pWorking->pTree; - - assert( lsmTreeIsWriteVersion(pWorking) ); - assert( pWorking->nRef==1 ); - - if( bCommit ){ - treeIncrRefcount(pTree); - lsmTreeReleaseReadVersion(pEnv, pTree->pCommit); - pTree->pCommit = pWorking; - }else{ - lsmFree(pEnv, pWorking); - } - - pTree->pWorking = 0; - if( ppReadVersion ){ - *ppReadVersion = lsmTreeReadVersion(pTree); - } - return LSM_OK; -} - - -TreeVersion *lsmTreeRecoverVersion(Tree *pTree){ - return pTree->pCommit; -} - -/* -** Return a reference to a TreeVersion structure that may be used to read -** the database. The reference should be released at some point in the future -** by calling lsmTreeReleaseReadVersion(). -*/ -TreeVersion *lsmTreeReadVersion(Tree *pTree){ - TreeVersion *pRet = pTree->pCommit; - assert( pRet->nRef>0 ); - pRet->nRef++; - return pRet; -} - -/* -** Release a reference to a read-version. -*/ -void lsmTreeReleaseReadVersion(lsm_env *pEnv, TreeVersion *pTreeVersion){ - if( pTreeVersion ){ - assert( pTreeVersion->nRef>0 ); - pTreeVersion->nRef--; - if( pTreeVersion->nRef==0 ){ - Tree *pTree = pTreeVersion->pTree; - lsmFree(pEnv, pTreeVersion); - treeDecrRefcount(pEnv, pTree); - } - } -} - -/* -** Return true if the tree-version passed as the first argument is writable. -*/ -int lsmTreeIsWriteVersion(TreeVersion *pTV){ - return (pTV==pTV->pTree->pWorking); -} - -void lsmTreeRelease(lsm_env *pEnv, Tree *pTree){ - if( pTree ){ - assert( pTree->nTreeRef>0 && pTree->pCommit ); - lsmTreeReleaseReadVersion(pEnv, pTree->pCommit); - } -} +** Store a mark in *pMark. Later on, a call to lsmTreeRollback() with a +** pointer to the same TreeMark structure may be used to roll the tree +** contents back to their current state. 
+*/ +void lsmTreeMark(lsm_db *pDb, TreeMark *pMark){ + pMark->iRoot = pDb->treehdr.iRoot; + pMark->nHeight = pDb->treehdr.nHeight; + pMark->iWrite = pDb->treehdr.iWrite; + pMark->nChunk = pDb->treehdr.nChunk; + pMark->iFirst = pDb->treehdr.iFirst; + pMark->iRollback = intArraySize(&pDb->rollback); +} + +/* +** Roll back to mark pMark. Structure *pMark should have been previously +** populated by a call to lsmTreeMark(). +*/ +void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark){ + int rcdummy = LSM_OK; + int iIdx; + int nIdx; + u32 iNext; + ShmChunk *pChunk; + u32 iChunk; + + /* Revert all required v2 pointers. */ + nIdx = intArraySize(&pDb->rollback); + for(iIdx = pMark->iRollback; iIdxrollback, iIdx), &rcdummy); + assert( pNode && rcdummy==LSM_OK ); + pNode->iV2 = 0; + pNode->iV2Child = 0; + pNode->iV2Ptr = 0; + } + intArrayTruncate(&pDb->rollback, pMark->iRollback); + + /* Restore the free-chunk list */ + assert( pMark->iWrite!=0 ); + iChunk = treeOffsetToChunk(pMark->iWrite-1); + pChunk = treeShmChunk(pDb, iChunk); + iNext = pChunk->iNext; + pChunk->iNext = 0; + assert( iNext==0 + || pDb->treehdr.iFirst==pMark->iFirst + || iNext==pMark->iFirst + ); + pDb->treehdr.iFirst = pMark->iFirst; + while( iNext ){ + iChunk = iNext; + pChunk = treeShmChunk(pDb, iChunk); + iNext = pChunk->iNext; + if( iChunknChunk ){ + pChunk->iNext = pDb->treehdr.iFirst; + pChunk->iLastTree = 0; + } + } + + /* Restore the tree-header fields */ + pDb->treehdr.iRoot = pMark->iRoot; + pDb->treehdr.nHeight = pMark->nHeight; + pDb->treehdr.iWrite = pMark->iWrite; + pDb->treehdr.nChunk = pMark->nChunk; +} + +static void treeHeaderChecksum( + TreeHeader *pHdr, + u32 *aCksum +){ + u32 cksum1 = 0x12345678; + u32 cksum2 = 0x9ABCDEF0; + u32 *a = (u32 *)pHdr; + int i; + + assert( (offsetof(TreeHeader, aCksum) + sizeof(u32)*2)==sizeof(TreeHeader) ); + assert( (sizeof(TreeHeader) % (sizeof(u32)*2))==0 ); + + for(i=0; i<(offsetof(TreeHeader, aCksum) / sizeof(u32)); i+=2){ + cksum1 += a[i]; + cksum2 += 
(cksum1 + a[i+1]); + } + aCksum[0] = cksum1; + aCksum[1] = cksum2; +} + +/* +** Return true if the checksum stored in TreeHeader object *pHdr is +** consistent with the contents of its other fields. +*/ +static int treeHeaderChecksumOk(TreeHeader *pHdr){ + u32 aCksum[2]; + treeHeaderChecksum(pHdr, aCksum); + return (0==memcmp(aCksum, pHdr->aCksum, sizeof(aCksum))); +} + +/* +** Load the in-memory tree header from shared-memory into pDb->treehdr. +** If the header cannot be loaded, return LSM_BUSY. +*/ +int lsmTreeLoadHeader(lsm_db *pDb){ + while( 1 ){ + int rc; + ShmHeader *pShm = pDb->pShmhdr; + + memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader)); + if( treeHeaderChecksumOk(&pDb->treehdr) ) return LSM_OK; + + rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); + if( rc==LSM_BUSY ){ + usleep(50); + }else{ + if( rc==LSM_OK ){ + if( treeHeaderChecksumOk(&pShm->hdr1)==0 ){ + memcpy(&pShm->hdr1, &pShm->hdr2, sizeof(TreeHeader)); + } + memcpy(&pDb->treehdr, &pShm->hdr1, sizeof(TreeHeader)); + lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); + + if( treeHeaderChecksumOk(&pDb->treehdr)==0 ){ + rc = LSM_CORRUPT_BKPT; + } + } + return rc; + } + } +} + +/* +** This function is called to conclude a transaction. If argument bCommit +** is true, the transaction is committed. Otherwise it is rolled back. +*/ +int lsmTreeEndTransaction(lsm_db *pDb, int bCommit){ + ShmHeader *pShm = pDb->pShmhdr; + + if( bCommit ){ + treeHeaderChecksum(&pDb->treehdr, pDb->treehdr.aCksum); + memcpy(&pShm->hdr2, &pDb->treehdr, sizeof(TreeHeader)); + lsmShmBarrier(pDb); + memcpy(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)); + } + pShm->bWriter = 0; + intArrayFree(pDb->pEnv, &pDb->rollback); + + return LSM_OK; +} + +/* +** Begin a new transaction. 
+*/ +int lsmTreeBeginTransaction(lsm_db *pDb){ + pDb->treehdr.iTransId++; + return LSM_OK; +} + Index: src/lsm_unix.c ================================================================== --- src/lsm_unix.c +++ src/lsm_unix.c @@ -34,25 +34,39 @@ #include #include #include - #include "lsmInt.h" /* ** An open file is an instance of the following object */ typedef struct PosixFile PosixFile; struct PosixFile { - lsm_env *pEnv; /* The run-time environment */ - int fd; /* The open file descriptor */ - void *pMap; - off_t nMap; + lsm_env *pEnv; /* The run-time environment */ + const char *zName; /* Full path to file */ + int fd; /* The open file descriptor */ + int shmfd; /* Shared memory file-descriptor */ + void *pMap; /* Pointer to mapping of file fd */ + off_t nMap; /* Size of mapping at pMap in bytes */ + int nShm; /* Number of entries in array apShm[] */ + void **apShm; /* Array of 32K shared memory segments */ }; static int lsm_ioerr(void){ return LSM_IOERR; } + +static char *posixShmFile(PosixFile *p){ + char *zShm; + int nName = strlen(p->zName); + zShm = (char *)lsmMalloc(p->pEnv, nName+4+1); + if( zShm ){ + memcpy(zShm, p->zName, nName); + memcpy(&zShm[nName], "-shm", 5); + } + return zShm; +} static int lsmPosixOsOpen( lsm_env *pEnv, const char *zFile, lsm_file **ppFile @@ -63,10 +77,11 @@ p = lsm_malloc(pEnv, sizeof(PosixFile)); if( p==0 ){ rc = LSM_NOMEM; }else{ memset(p, 0, sizeof(PosixFile)); + p->zName = zFile; p->pEnv = pEnv; p->fd = open(zFile, O_RDWR|O_CREAT, 0644); if( p->fd<0 ){ lsm_free(pEnv, p); p = 0; @@ -262,24 +277,133 @@ memcpy(pBuf, &buf.st_dev, sizeof(buf.st_dev)); memcpy(&(((u8 *)pBuf)[sizeof(buf.st_dev)]), &buf.st_ino, sizeof(buf.st_ino)); return LSM_OK; } + +static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){ + int prc = unlink(zFile); + return prc ? 
LSM_IOERR_BKPT : LSM_OK; +} + +int lsmPosixOsLock(lsm_file *pFile, int iLock, int eType){ + int rc = LSM_OK; + PosixFile *p = (PosixFile *)pFile; + static const short aType[3] = { F_UNLCK, F_RDLCK, F_WRLCK }; + struct flock lock; + + assert( aType[LSM_LOCK_UNLOCK]==F_UNLCK ); + assert( aType[LSM_LOCK_SHARED]==F_RDLCK ); + assert( aType[LSM_LOCK_EXCL]==F_WRLCK ); + assert( eType>=0 && eType0 && iLock<=16 ); + + memset(&lock, 0, sizeof(lock)); + lock.l_whence = SEEK_SET; + lock.l_len = 1; + lock.l_type = aType[eType]; + lock.l_start = (4096-iLock); + + if( fcntl(p->fd, F_SETLK, &lock) ){ + int e = errno; + if( e==EACCES || e==EAGAIN ){ + rc = LSM_BUSY; + }else{ + rc = LSM_IOERR; + } + } + + return LSM_OK; +} + +int lsmPosixOsShmMap(lsm_file *pFile, int iChunk, int sz, void **ppShm){ + PosixFile *p = (PosixFile *)pFile; + + *ppShm = 0; + assert( sz==LSM_SHM_CHUNK_SIZE ); + if( iChunk>=p->nShm ){ + int i; + void **apNew; + int nNew = iChunk+1; + off_t nReq = nNew * LSM_SHM_CHUNK_SIZE; + struct stat sStat; + + /* If the shared-memory file has not been opened, open it now. */ + if( p->shmfd<=0 ){ + char *zShm = posixShmFile(p); + if( !zShm ) return LSM_NOMEM_BKPT; + p->shmfd = open(zShm, O_RDWR|O_CREAT, 0644); + lsmFree(p->pEnv, zShm); + if( p->shmfd<0 ){ + return LSM_IOERR_BKPT; + } + } + + /* If the shared-memory file is not large enough to contain the + ** requested chunk, cause it to grow. 
*/ + if( fstat(p->shmfd, &sStat) ){ + return LSM_IOERR_BKPT; + } + if( sStat.st_sizeshmfd, nReq) ){ + return LSM_IOERR_BKPT; + } + } + + apNew = (void **)lsmRealloc(p->pEnv, p->apShm, sizeof(void *) * nNew); + if( !apNew ) return LSM_NOMEM_BKPT; + for(i=p->nShm; iapShm = apNew; + p->nShm = nNew; + } + + if( p->apShm[iChunk]==0 ){ + p->apShm[iChunk] = mmap(0, LSM_SHM_CHUNK_SIZE, + PROT_READ|PROT_WRITE, MAP_SHARED, p->shmfd, iChunk*LSM_SHM_CHUNK_SIZE + ); + if( p->apShm[iChunk]==0 ) return LSM_IOERR; + } + + *ppShm = p->apShm[iChunk]; + return LSM_OK; +} + +void lsmPosixOsShmBarrier(void){ +} + +int lsmPosixOsShmUnmap(lsm_file *pFile, int bDelete){ + PosixFile *p = (PosixFile *)pFile; + if( p->shmfd>0 ){ + int i; + for(i=0; inShm; i++){ + if( p->apShm[i] ){ + munmap(p->apShm[i], LSM_SHM_CHUNK_SIZE); + p->apShm[i] = 0; + } + } + close(p->shmfd); + p->shmfd = 0; + if( bDelete ){ + char *zShm = posixShmFile(p); + if( zShm ) unlink(zShm); + } + } + return LSM_OK; +} + static int lsmPosixOsClose(lsm_file *pFile){ PosixFile *p = (PosixFile *)pFile; + lsmPosixOsShmUnmap(pFile, 0); if( p->pMap ) munmap(p->pMap, p->nMap); close(p->fd); lsm_free(p->pEnv, p); return LSM_OK; } -static int lsmPosixOsUnlink(lsm_env *pEnv, const char *zFile){ - int prc = unlink(zFile); - return prc ? LSM_IOERR_BKPT : LSM_OK; -} - /**************************************************************************** ** Memory allocation routines. 
*/ #define ROUND8(x) (((x)+7)&~7) #define BLOCK_HDR_SIZE ROUND8( sizeof(sqlite4_size_t) ) @@ -530,10 +654,14 @@ lsmPosixOsSectorSize, /* xSectorSize */ lsmPosixOsRemap, /* xRemap */ lsmPosixOsFileid, /* xFileid */ lsmPosixOsClose, /* xClose */ lsmPosixOsUnlink, /* xUnlink */ + lsmPosixOsLock, /* xLock */ + lsmPosixOsShmMap, /* xShmMap */ + lsmPosixOsShmBarrier, /* xShmBarrier */ + lsmPosixOsShmUnmap, /* xShmUnmap */ /***** memory allocation *********/ 0, /* pMemCtx */ lsmPosixOsMalloc, /* xMalloc */ lsmPosixOsRealloc, /* xRealloc */ lsmPosixOsFree, /* xFree */ Index: test/attach.test ================================================================== --- test/attach.test +++ test/attach.test @@ -22,12 +22,11 @@ finish_test return } for {set i 2} {$i<=15} {incr i} { - forcedelete test$i.db - forcedelete test$i.db-journal + db_delete test$i.db } do_test attach-1.1 { execsql { CREATE TABLE t1(a,b); ADDED test/ckpt1.test Index: test/ckpt1.test ================================================================== --- /dev/null +++ test/ckpt1.test @@ -0,0 +1,94 @@ +# 2012 August 29 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# The tests in this file focus on testing that very large checkpoints +# (those that occur when the database contains an unusually large number +# of levels or free blocks) are handled correctly. +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +set testprefix ckpt1 + +# Check that lsm_config(AUTOWORK) seems to be connected to something. 
+# +do_test 1.1 { sqlite4_lsm_config db main autowork 0 } 0 +do_test 1.2 { sqlite4_lsm_config db main autowork 1 } 1 +do_test 1.3 { sqlite4_lsm_config db main autowork -1 } 1 +do_test 1.4 { sqlite4_lsm_config db main autowork 0 } 0 +do_test 1.5 { sqlite4_lsm_config db main autowork -1 } 0 + + +set nLevel 200 +do_execsql_test 2.0 { CREATE TABLE t1(a INTEGER PRIMARY KEY, b INTEGER UNIQUE) } +do_test 2.1 { + for {set i 1} {$i <= $nLevel} {incr i} { + db close + sqlite4 db test.db + sqlite4_lsm_config db main autowork 0 + db eval { INSERT INTO t1 VALUES($i, $i || $i) } + } + db eval { + SELECT count(*) FROM t1; + PRAGMA integrity_check; + } +} [list $nLevel ok] + + +#------------------------------------------------------------------------- +# The point of this test is to add a large number of blocks to the +# free-block list and check that this doesn't seem to cause any +# obvious problems. +# +do_test 3.0 { + db close + forcedelete test.db + sqlite4 db file:test.db?lsm_block_size=65536 + execsql { + CREATE TABLE t1(a PRIMARY KEY, b); + CREATE INDEX i1 ON t1(b); + } +} {} +do_execsql_test 3.1 { + INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)); + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 2 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 4 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 8 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 16 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 32 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 64 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 128 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 256 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 512 + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 1K + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 2K + INSERT INTO t1 
SELECT randstr(100,100), randstr(100,100) FROM t1; -- 4K + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 8K + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 16K + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 32K + INSERT INTO t1 SELECT randstr(100,100), randstr(100,100) FROM t1; -- 64K +} +do_test 3.2 { + sqlite4_lsm_work db main -optimize 1000000 + execsql { SELECT count(*) FROM t1 } +} {65536} +do_test 3.3 { + db close + sqlite4 db test.db + execsql { SELECT count(*) FROM t1 } +} {65536} +do_test 3.4 { + execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) } + sqlite4_lsm_work db main -optimize 1000000 + execsql { SELECT count(*) FROM t1 } +} {65537} + +finish_test + Index: test/manydb.test ================================================================== --- test/manydb.test +++ test/manydb.test @@ -17,28 +17,18 @@ set testdir [file dirname $argv0] source $testdir/tester.tcl set N 300 -# if we're using proxy locks, we use 5 filedescriptors for a db -# that is open and in the middle of writing changes, normally -# sqlite uses 3 (proxy locking adds the conch and the local lock) -set using_proxy 0 -foreach {name value} [array get env SQLITE4_FORCE_PROXY_LOCKING] { - set using_proxy value -} -set num_fd_per_openwrite_db 3 -if {$using_proxy>0} { - set num_fd_per_openwrite_db 5 -} +set num_fd_per_openwrite_db 4 # First test how many file descriptors are available for use. To open a # database for writing SQLite requires 3 file descriptors (the database, the # journal and the directory). 
set filehandles {} catch { - for {set i 0} {$i<($N * 3)} {incr i} { + for {set i 0} {$i<($N * $num_fd_per_openwrite_db)} {incr i} { lappend filehandles [open testfile.1 w] } } foreach fd $filehandles { close $fd Index: test/permutations.test ================================================================== --- test/permutations.test +++ test/permutations.test @@ -131,11 +131,14 @@ # lappend ::testsuitelist xxx test_suite "src4" -prefix "" -description { } -files { - simple.test log1.test log2.test log3.test csr1.test + simple.test + log1.test log2.test log3.test + csr1.test + ckpt1.test aggerror.test attach.test autoindex1.test badutf.test Index: test/test_lsm.c ================================================================== --- test/test_lsm.c +++ test/test_lsm.c @@ -35,10 +35,11 @@ { "log-size", LSM_CONFIG_LOG_SIZE }, { "safety", LSM_CONFIG_SAFETY }, { "write-buffer", LSM_CONFIG_WRITE_BUFFER }, { "mmap", LSM_CONFIG_MMAP }, { "page-size", LSM_CONFIG_PAGE_SIZE }, + { "autowork", LSM_CONFIG_AUTOWORK }, { 0, 0 } }; const char *zDb; /* objv[1] as a string */ const char *zName; /* objv[2] as a string */ Index: test/tester.tcl ================================================================== --- test/tester.tcl +++ test/tester.tcl @@ -19,10 +19,11 @@ # # Commands to manipulate the db and the file-system at a high level: # # copy_file FROM TO # delete_file FILENAME +# db_delete DBNAME # drop_all_tables ?DB? # forcecopy FROM TO # forcedelete FILENAME # # Test the capability of the SQLite version built into the interpreter to @@ -357,17 +358,26 @@ # if {$cmdlinearg(binarylog)} { vfslog new binarylog {} vfslog.bin } } + +# Delete all files associated with LSM database $file. 
That is: +# +# ${file} +# ${file}-log +# ${file}-shm +# +proc db_delete {file} { + forcedelete $file $file-shm $file-log +} # Create a test database # proc reset_db {} { catch {db close} - forcedelete test.db - forcedelete test.db-log + db_delete test.db sqlite4 db ./test.db set ::DB [sqlite4_connection_pointer db] if {[info exists ::SETUP_SQL]} { db eval $::SETUP_SQL } @@ -1034,14 +1044,12 @@ # SQL (in that order) to prepare for the test case. do_test $testname.$n.1 { set ::sqlite_io_error_pending 0 catch {db close} catch {db2 close} - catch {forcedelete test.db} - catch {forcedelete test.db-journal} - catch {forcedelete test2.db} - catch {forcedelete test2.db-journal} + catch {db_delete test.db} + catch {db_delete test2.db} set ::DB [sqlite4 db test.db; sqlite4_connection_pointer db] sqlite4_extended_result_codes $::DB $::ioerropts(-erc) if {[info exists ::ioerropts(-tclprep)]} { eval $::ioerropts(-tclprep) } @@ -1466,11 +1474,11 @@ hexio_write test.db 92 $B return "" } proc db_save {} { - foreach f [glob -nocomplain sv_test.db*] { forcedelete $f } + db_delete sv_test.db foreach f [glob -nocomplain test.db*] { set f2 "sv_$f" forcecopy $f $f2 } } @@ -1478,11 +1486,11 @@ db_save catch { db close } return "" } proc db_restore {} { - foreach f [glob -nocomplain test.db*] { forcedelete $f } + db_delete test.db foreach f2 [glob -nocomplain sv_test.db*] { set f [string range $f2 3 end] forcecopy $f2 $f } } @@ -1491,11 +1499,11 @@ db_restore sqlite4 db $dbfile } proc db_delete_and_reopen {{file test.db}} { catch { db close } - foreach f [glob -nocomplain test.db*] { forcedelete $f } + db_delete $file sqlite4 db $file } # Do an SQL statement. Append the search count to the end of the result. 
# Index: tool/lsmview.tcl ================================================================== --- tool/lsmview.tcl +++ tool/lsmview.tcl @@ -142,11 +142,14 @@ $C bind $tid [list segment_info $C {}] } proc segment_info {C segment} { set w $C - while {[winfo class $w]!="Frame"} {set w [winfo parent $w]} + while {[winfo class $w]!="Frame"} { + set w [winfo parent $w] + if {$w==""} return + } set w $w.info if {$segment==""} { $w config -text "" } else { foreach {iFirst iLast iRoot nSize} $segment break ADDED www/shm.wiki Index: www/shm.wiki ================================================================== --- /dev/null +++ www/shm.wiki @@ -0,0 +1,329 @@ + +Multi-process LSM Notes + + +

+Notes on the changes required for LSM to allow connections from +multiple processes. In other words, notes to do with the contents +of the *-shm file and the way they are accessed and manipulated. + + +

Contents of shared memory

+ +

+Like SQLite 3 WAL mode, LSM uses a *-shm file. It uses the same +"dead man switch" mechanism to ensure it is always initialized to +zero when the first client connects. + +

+The *-shm file contains: + +

    +
  1. A flag indicating whether or not the *-shm has been initialized + (log file recovered into in-memory tree, header fields loaded etc.) +
  2. The meta-page number to which a checkpoint was last successfully + written. +
  3. The client snapshot. +
  4. The worker snapshot. +
  5. The in-memory tree. This takes up most of the space in the file. +
+ +

+The client and worker snapshots are in the same format as those stored +in the header of the database file itself. + +

+Sometimes data from the meta-page identified by the header field is +required. For example it is necessary to know the id of the last +checkpointed snapshot in order to determine which free blocks are safe +to reuse. The associated log file offset is also required to determine +when the log file may be wrapped. These quantities are read directly +from the meta-page in the database itself as required. + +

File locks

+ +

+LSM uses the same ideas as SQLite in WAL mode. Both SHARED and EXCLUSIVE +locks are required. There are three exclusive locks: + +

    +
  • WRITER: Required to write to in-memory tree and its log file. +
  • WORKER: Required to write to body of database file. +
  • CHECKPOINTER: Required to write to database file header. +
+ +

+Only one client may hold each of these locks at one time. In other words, +each of the above is implemented as an exclusive lock on a range of bytes in the file. + +

+There are also N separate locks held by readers. These locks also +work like WAL locks in that they are a combination of a lock and a +value. In WAL mode the value is a 32-bit integer. For LSM, it will +be two 64-bit integers - an in-memory tree id and a snapshot id. + +

Memory allocation

+ +

+Within the *-shm file, memory is allocated in 32KB chunks. + +

+The first chunk of the file is the header chunk. It contains: + +

    +
  1. The client snapshot (4KB) +
  2. The worker snapshot (4KB) +
  3. The "initialized" flag (4 bytes) +
  4. The meta-page number containing the last checkpoint written (4 + bytes) +
  5. The in-memory tree headers (see below). +
+ +

+The second and subsequent chunks are used to store the in-memory tree +data. + +

+The in-memory tree structure is essentially an append-only rb-tree +with some modifications to reduce the amount of data written. +Multiple trees will sometimes be present in the file, to cope with +circumstances like the following: + +

    +
  • Writer builds tree A. +
  • Reader takes a read lock on tree A. +
  • Tree A is flushed to the db. +
  • Writer begins building tree B. +
  • Reader continues reading from tree A. +
+ +

+In this case, the chunks used by tree A may not be reused until after +the active read transaction has concluded. + +

+Each chunk begins with three 32-bit integer fields: +

    +
  • Id of first tree for which data is stored on the chunk, +
  • Id of last tree for which data is stored on the chunk, +
  • Chunk number of chunk written after this one (or zero, if this + is the most recently written chunk). +
+ +

+The third field described above links all tree chunks in the file, +in-use or otherwise, into a single list. To allocate a new chunk, +a writer first checks if the chunk at the head of the list can be +recycled. If so, it moves it to the end of the list and begins +writing to it. Otherwise, it allocates a new chunk at the end of +the file, appends that to the list and continues writing. + +

Crash recovery: But, what happens if a writer crashes while +writing a transaction to the database? + +

If a writer crashes during a write transaction, readers can +often continue as normal. However, the next writer must roll +back any changes made to the db before it can commence a new +transaction. Or, if a writer fails when updating the in-memory +tree header, it may not be possible for readers to continue. +This is resolved by having one reader become a writer, restore +the db, then "commit" the empty transaction. + +

+The pattern used by a writer is: +

    +
  1. Obtain WRITER lock. This is a barrier operation (on Linux, an + fcntl(F_SETLK)). +
  2. Update shared memory region. +
  3. Release WRITER lock. Another barrier (on Linux, another F_SETLK). +
+ +

Or, if a failure occurs during step 2, the unlock operation is done +automatically by the OS. Either way, assume that the unlock is also a +barrier (see Documentation/memory-barriers.txt in kernel source tree). It +can therefore be assumed that from the point of view of the subsequent +writer, all writes to the shared memory region completed by the failed +writer appear to have been performed in order - there is no need to +worry that the hardware has reordered the writes made by the failed +writer. The compiler may reorder them, of course, but this should be +easy enough to avoid. + +

+Also assumed is that 32-bit writes are atomic, in the sense that it +is not possible for a failure in a writer process to result in some +bits of a 32-bit word being updated and some remaining in their +original state. + +

+Crashes are then managed by the following: + +

    +
  • When a write transaction is opened, a flag is set in the in-memory + tree header. This indicates that a transaction is underway. The same + flag is cleared right before the WRITER lock is released to commit or + roll back the transaction. + +
  • When a recyclable chunk is moved from the start of the linked list + to the end, the first thing done is that the "first tree" field is + updated. Then the "last tree". Then the header pointer is set to point + to the next element in the list. + +
  • If the header flag is already set when the writer grabs the WRITER + lock, then a crash must have occurred. In this case the free-list must + be recovered. + +
  • Recovering the free list involves two steps: First a linear scan + of the current tree to identify those chunks in use (and also for + another reason, see below). Second, a scan of the remainder of the + file checking the "first tree" field of all chunks that either belong + to an earlier tree or appear to belong to the current tree but are not + linked in anywhere. Based on this, the new writer can rebuild the + free-list. + +
+ + +

In-memory tree format

+ +

+Header fields: + +

    +
  • 32-bits: Tree id (incremented for each new tree). +
  • 32-bits: Transaction id (incremented for each new transaction). +
  • 32-bits: Pointer to head of tree (an offset within the *-shm + file). +
  • 32-bits: Height of tree. +
  • 64-bits: Last checkpoint id for which log file space has already + been reclaimed. +
  • DbLog structure (see lsmInt.h). +
  • 32-bits: Header checksum 1. +
  • 32-bits: Header checksum 2. +
+ +

+There are two copies of the in-memory tree header, copy 1 and copy 2. +Both are stored on the *-shm header chunk. + +

+To commit a transaction, a writer does the following: + +

    +
  1. Updates copy 2 of the header, +
  2. Invokes a memory barrier, +
  3. Updates copy 1 of the header, +
  4. Clears the "transaction in progress" flag, +
  5. Drops the WRITER lock. +
+ +

+To open a read transaction, the reader: + +

    +
  1. Reads copy 1 of the header. + +
  2. If the checksum fails, attempt to obtain the WRITER lock. If + successful, do the equivalent of opening and committing an + empty transaction (see below). Either way, return to 1 and + attempt to reread the in-memory tree header. If copy 1 cannot be + read within some reasonable amount of time...? + +
  3. Read the client snapshot from shared memory. If the checksum + fails, attempt to obtain the WORKER lock. If successful, copy + the worker snapshot over the client snapshot and drop the WORKER + lock. Successful or otherwise, attempt to reread the snapshot. + If this cannot be completed within some reasonable amount of + time...? + +
  4. Grab a read-lock corresponding to the tree id and snapshot ids + just read (note: assume that this is a memory barrier). + +
  5. Check that the shared memory tree header and client snapshot + still contain the ids for which the lock was obtained. If not, + drop the lock and go back to step 1. +
+ +

To open a write transaction, the writer: + +

    +
  1. Opens a read transaction, if one is not already open. + +
  2. Obtain the WRITER lock. + +
  3. Check the "transaction in progress" flag. If it is set, + perform the emergency rollback and freelist recovery, then + clear the flag. + +
  4. Check that copy 1 of the header still matches the copy read + when the read transaction was opened. If not, drop the lock + and return LSM_BUSY. + +
  5. Set the "transaction in progress" flag. +
+ +

+Emergency rollback and recovery: +

    +
  1. If the checksum of copy 1 of the header fails, replace it with + the contents of copy 2. + +
  2. Iterate through the entire tree, rolling back any nodes with + transaction ids that indicate they require it. Record the blocks + occupied by the current tree. + +
  3. Scan through the entire *-shm memory file, inspecting the "first + tree" fields of each chunk. +
+ +

+ Large values or keys may overflow chunks. + +

Client and worker snapshots

+ +

+The client and worker snapshots stored in the *-shm file use the +same format as the checkpoint written to the database file. Except, +they are always in native byte order. Each is stored in a dedicated +4KB slot, as in the database file. A client must hold the WORKER +lock to modify either of the two snapshots. + +

+To work on the database file, a worker performs the following: +

    +
  1. Obtain the WORKER lock. + +
  2. Copy the worker snapshot from the shared-memory region into + heap memory and verify that the checksum computes. + +
  3. If the checksum of the worker snapshot does not compute, copy + the client snapshot over the top of the worker and reload it. + If the checksum still does not compute, return LSM_CORRUPT. + +
  4. Perform some merging work on the database. Generate a new + worker snapshot. Write it over the top of the old. + +
  5. Optionally, copy the new worker snapshot over the top of the + client snapshot. TODO: Copying the worker snapshot into the + client slot makes the worker read-only.... Currently, LSM + distinguishes between read-only and read-write worker snapshots. + But that would mean an extra flag in shared-memory. Perhaps it's + better to consider all worker snapshots to be read-only. Or, + change the format slightly to include a "read-write" flag that + can be set for those snapshots not copied into the client slot. + UPDATE: Current code already treats all worker snapshots as read-only. + +
  6. Release the WORKER lock. +
+ +

+To checkpoint a snapshot: +

    +
  1. Obtain the CHECKPOINTER lock. +
  2. Read the client snapshot. +
  3. Sync the database file. +
  4. Write the client snapshot into the appropriate meta-page (based + on the "last checkpoint slot" field in the *-shm header). +
  5. Sync the database file. +
  6. Update the "last checkpoint slot" field. +
  7. Drop the CHECKPOINTER lock. +