/* ** 2012-01-23 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** ** Utilities used to help multiple LSM clients to coexist within the ** same process space. */ #include "lsmInt.h" /* ** Global data. All global variables used by code in this file are grouped ** into the following structure instance. ** ** pDatabase: ** Linked list of all Database objects allocated within this process. ** This list may not be traversed without holding the global mutex (see ** functions enterGlobalMutex() and leaveGlobalMutex()). */ static struct SharedData { Database *pDatabase; /* Linked list of all Database objects */ } gShared; /* ** Database structure. There is one such structure for each distinct ** database accessed by this process. They are stored in the singly linked ** list starting at global variable gShared.pDatabase. Database objects are ** reference counted. Once the number of connections to the associated ** database drops to zero, they are removed from the linked list and deleted. ** ** pFile: ** In multi-process mode, this file descriptor is used to obtain locks ** and to access shared-memory. In single process mode, its only job is ** to hold the exclusive lock on the file. ** */ struct Database { /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */ char *zName; /* Canonical path to database file */ int nName; /* strlen(zName) */ int nDbRef; /* Number of associated lsm_db handles */ Database *pDbNext; /* Next Database structure in global list */ /* Protected by the local mutex (pClientMutex) */ int bReadonly; /* True if Database.pFile is read-only */ int bMultiProc; /* True if running in multi-process mode */ lsm_file *pFile; /* Used for locks/shm in multi-proc mode */ LsmFile *pLsmFile; /* List of deferred closes */ lsm_mutex *pClientMutex; /* Protects the apShmChunk[] and pConn */ int nShmChunk; /* Number of entries in apShmChunk[] array */ void **apShmChunk; /* Array of "shared" memory regions */ lsm_db *pConn; /* List of connections to this db. */ }; /* ** Functions to enter and leave the global mutex. This mutex is used ** to protect the global linked-list headed at gShared.pDatabase. */ static int enterGlobalMutex(lsm_env *pEnv){ lsm_mutex *p; int rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p); if( rc==LSM_OK ) lsmMutexEnter(pEnv, p); return rc; } static void leaveGlobalMutex(lsm_env *pEnv){ lsm_mutex *p; lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p); lsmMutexLeave(pEnv, p); } #ifdef LSM_DEBUG static int holdingGlobalMutex(lsm_env *pEnv){ lsm_mutex *p; lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &p); return lsmMutexHeld(pEnv, p); } #endif #if 0 static void assertNotInFreelist(Freelist *p, int iBlk){ int i; for(i=0; inEntry; i++){ assert( p->aEntry[i].iBlk!=iBlk ); } } #else # define assertNotInFreelist(x,y) #endif /* ** Append an entry to the free-list. If (iId==-1), this is a delete. */ int freelistAppend(lsm_db *db, u32 iBlk, i64 iId){ lsm_env *pEnv = db->pEnv; Freelist *p; int i; assert( iId==-1 || iId>=0 ); p = db->bUseFreelist ? db->pFreelist : &db->pWorker->freelist; /* Extend the space allocated for the freelist, if required */ assert( p->nAlloc>=p->nEntry ); if( p->nAlloc==p->nEntry ){ int nNew; int nByte; FreelistEntry *aNew; nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2); nByte = sizeof(FreelistEntry) * nNew; aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry, nByte); if( !aNew ) return LSM_NOMEM_BKPT; p->nAlloc = nNew; p->aEntry = aNew; } for(i=0; inEntry; i++){ assert( i==0 || p->aEntry[i].iBlk > p->aEntry[i-1].iBlk ); if( p->aEntry[i].iBlk>=iBlk ) break; } if( inEntry && p->aEntry[i].iBlk==iBlk ){ /* Clobber an existing entry */ p->aEntry[i].iId = iId; }else{ /* Insert a new entry into the list */ int nByte = sizeof(FreelistEntry)*(p->nEntry-i); memmove(&p->aEntry[i+1], &p->aEntry[i], nByte); p->aEntry[i].iBlk = iBlk; p->aEntry[i].iId = iId; p->nEntry++; } return LSM_OK; } /* ** This function frees all resources held by the Database structure passed ** as the only argument. */ static void freeDatabase(lsm_env *pEnv, Database *p){ assert( holdingGlobalMutex(pEnv) ); if( p ){ /* Free the mutexes */ lsmMutexDel(pEnv, p->pClientMutex); if( p->pFile ){ lsmEnvClose(pEnv, p->pFile); } /* Free the array of shm pointers */ lsmFree(pEnv, p->apShmChunk); /* Free the memory allocated for the Database struct itself */ lsmFree(pEnv, p); } } typedef struct DbTruncateCtx DbTruncateCtx; struct DbTruncateCtx { int nBlock; i64 iInUse; }; static int dbTruncateCb(void *pCtx, int iBlk, i64 iSnapshot){ DbTruncateCtx *p = (DbTruncateCtx *)pCtx; if( iBlk!=p->nBlock || (p->iInUse>=0 && iSnapshot>=p->iInUse) ) return 1; p->nBlock--; return 0; } static int dbTruncate(lsm_db *pDb, i64 iInUse){ int rc = LSM_OK; #if 0 int i; DbTruncateCtx ctx; assert( pDb->pWorker ); ctx.nBlock = pDb->pWorker->nBlock; ctx.iInUse = iInUse; rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx); for(i=ctx.nBlock+1; rc==LSM_OK && i<=pDb->pWorker->nBlock; i++){ rc = freelistAppend(pDb, i, -1); } if( rc==LSM_OK ){ #ifdef LSM_LOG_FREELIST if( ctx.nBlock!=pDb->pWorker->nBlock ){ lsmLogMessage(pDb, 0, "dbTruncate(): truncated db to %d blocks",ctx.nBlock ); } #endif pDb->pWorker->nBlock = ctx.nBlock; } #endif return rc; } /* ** This function is called during database shutdown (when the number of ** connections drops from one to zero). It truncates the database file ** to as small a size as possible without truncating away any blocks that ** contain data. */ static int dbTruncateFile(lsm_db *pDb){ int rc; assert( pDb->pWorker==0 ); assert( lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) ); rc = lsmCheckpointLoadWorker(pDb); if( rc==LSM_OK ){ DbTruncateCtx ctx; /* Walk the database free-block-list in reverse order. Set ctx.nBlock ** to the block number of the last block in the database that actually ** contains data. */ ctx.nBlock = pDb->pWorker->nBlock; ctx.iInUse = -1; rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx); /* If the last block that contains data is not already the last block in ** the database file, truncate the database file so that it is. */ if( rc==LSM_OK ){ rc = lsmFsTruncateDb( pDb->pFS, (i64)ctx.nBlock*lsmFsBlockSize(pDb->pFS) ); } } lsmFreeSnapshot(pDb->pEnv, pDb->pWorker); pDb->pWorker = 0; return rc; } static void doDbDisconnect(lsm_db *pDb){ int rc; if( pDb->bReadonly ){ lsmShmLock(pDb, LSM_LOCK_DMS3, LSM_LOCK_UNLOCK, 0); }else{ /* Block for an exclusive lock on DMS1. This lock serializes all calls ** to doDbConnect() and doDbDisconnect() across all processes. */ rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1); if( rc==LSM_OK ){ lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0); /* Try an exclusive lock on DMS2. If successful, this is the last ** connection to the database. In this case flush the contents of the ** in-memory tree to disk and write a checkpoint. */ rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 1, LSM_LOCK_EXCL); if( rc==LSM_OK ){ rc = lsmShmTestLock(pDb, LSM_LOCK_CHECKPOINTER, 1, LSM_LOCK_EXCL); } if( rc==LSM_OK ){ int bReadonly = 0; /* True if there exist read-only conns. */ /* Flush the in-memory tree, if required. If there is data to flush, ** this will create a new client snapshot in Database.pClient. The ** checkpoint (serialization) of this snapshot may be written to disk ** by the following block. ** ** There is no need to take a WRITER lock here. That there are no ** other locks on DMS2 guarantees that there are no other read-write ** connections at this time (and the lock on DMS1 guarantees that ** no new ones may appear). */ rc = lsmTreeLoadHeader(pDb, 0); if( rc==LSM_OK && (lsmTreeHasOld(pDb) || lsmTreeSize(pDb)>0) ){ rc = lsmFlushTreeToDisk(pDb); } /* Now check if there are any read-only connections. If there are, ** then do not truncate the db file or unlink the shared-memory ** region. */ if( rc==LSM_OK ){ rc = lsmShmTestLock(pDb, LSM_LOCK_DMS3, 1, LSM_LOCK_EXCL); if( rc==LSM_BUSY ){ bReadonly = 1; rc = LSM_OK; } } /* Write a checkpoint to disk. */ if( rc==LSM_OK ){ rc = lsmCheckpointWrite(pDb, 0); } /* If the checkpoint was written successfully, delete the log file ** and, if possible, truncate the database file. */ if( rc==LSM_OK ){ int bRotrans = 0; Database *p = pDb->pDatabase; /* The log file may only be deleted if there are no clients ** read-only clients running rotrans transactions. */ rc = lsmDetectRoTrans(pDb, &bRotrans); if( rc==LSM_OK && bRotrans==0 ){ lsmFsCloseAndDeleteLog(pDb->pFS); } /* The database may only be truncated if there exist no read-only ** clients - either connected or running rotrans transactions. */ if( bReadonly==0 && bRotrans==0 ){ lsmFsUnmap(pDb->pFS); dbTruncateFile(pDb); if( p->pFile && p->bMultiProc ){ lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1); } } } } } if( pDb->iRwclient>=0 ){ lsmShmLock(pDb, LSM_LOCK_RWCLIENT(pDb->iRwclient), LSM_LOCK_UNLOCK, 0); pDb->iRwclient = -1; } lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); } pDb->pShmhdr = 0; } static int doDbConnect(lsm_db *pDb){ const int nUsMax = 100000; /* Max value for nUs */ int nUs = 1000; /* us to wait between DMS1 attempts */ int rc; /* Obtain a pointer to the shared-memory header */ assert( pDb->pShmhdr==0 ); assert( pDb->bReadonly==0 ); /* Block for an exclusive lock on DMS1. This lock serializes all calls ** to doDbConnect() and doDbDisconnect() across all processes. */ while( 1 ){ rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1); if( rc!=LSM_BUSY ) break; lsmEnvSleep(pDb->pEnv, nUs); nUs = nUs * 2; if( nUs>nUsMax ) nUs = nUsMax; } if( rc==LSM_OK ){ rc = lsmShmCacheChunks(pDb, 1); } if( rc!=LSM_OK ) return rc; pDb->pShmhdr = (ShmHeader *)pDb->apShm[0]; /* Try an exclusive lock on DMS2/DMS3. If successful, this is the first ** and only connection to the database. In this case initialize the ** shared-memory and run log file recovery. */ assert( LSM_LOCK_DMS3==1+LSM_LOCK_DMS2 ); rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 2, LSM_LOCK_EXCL); if( rc==LSM_OK ){ memset(pDb->pShmhdr, 0, sizeof(ShmHeader)); rc = lsmCheckpointRecover(pDb); if( rc==LSM_OK ){ rc = lsmLogRecover(pDb); } if( rc==LSM_OK ){ ShmHeader *pShm = pDb->pShmhdr; pShm->aReader[0].iLsmId = lsmCheckpointId(pShm->aSnap1, 0); pShm->aReader[0].iTreeId = pDb->treehdr.iUsedShmid; } }else if( rc==LSM_BUSY ){ rc = LSM_OK; } /* Take a shared lock on DMS2. In multi-process mode this lock "cannot" ** fail, as connections may only hold an exclusive lock on DMS2 if they ** first hold an exclusive lock on DMS1. And this connection is currently ** holding the exclusive lock on DSM1. ** ** However, if some other connection has the database open in single-process ** mode, this operation will fail. In this case, return the error to the ** caller - the attempt to connect to the db has failed. */ if( rc==LSM_OK ){ rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0); } /* If anything went wrong, unlock DMS2. Otherwise, try to take an exclusive ** lock on one of the LSM_LOCK_RWCLIENT() locks. Unlock DMS1 in any case. */ if( rc!=LSM_OK ){ pDb->pShmhdr = 0; }else{ int i; for(i=0; iiRwclient = i; if( rc2!=LSM_BUSY ){ rc = rc2; break; } } } lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); return rc; } static int dbOpenSharedFd(lsm_env *pEnv, Database *p, int bRoOk){ int rc; rc = lsmEnvOpen(pEnv, p->zName, 0, &p->pFile); if( rc==LSM_IOERR && bRoOk ){ rc = lsmEnvOpen(pEnv, p->zName, LSM_OPEN_READONLY, &p->pFile); p->bReadonly = 1; } return rc; } /* ** Return a reference to the shared Database handle for the database ** identified by canonical path zName. If this is the first connection to ** the named database, a new Database object is allocated. Otherwise, a ** pointer to an existing object is returned. ** ** If successful, *ppDatabase is set to point to the shared Database ** structure and LSM_OK returned. Otherwise, *ppDatabase is set to NULL ** and and LSM error code returned. ** ** Each successful call to this function should be (eventually) matched ** by a call to lsmDbDatabaseRelease(). */ int lsmDbDatabaseConnect( lsm_db *pDb, /* Database handle */ const char *zName /* Full-path to db file */ ){ lsm_env *pEnv = pDb->pEnv; int rc; /* Return code */ Database *p = 0; /* Pointer returned via *ppDatabase */ int nName = lsmStrlen(zName); assert( pDb->pDatabase==0 ); rc = enterGlobalMutex(pEnv); if( rc==LSM_OK ){ /* Search the global list for an existing object. TODO: Need something ** better than the memcmp() below to figure out if a given Database ** object represents the requested file. */ for(p=gShared.pDatabase; p; p=p->pDbNext){ if( nName==p->nName && 0==memcmp(zName, p->zName, nName) ) break; } /* If no suitable Database object was found, allocate a new one. */ if( p==0 ){ p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nName+1, &rc); /* If the allocation was successful, fill in other fields and ** allocate the client mutex. */ if( rc==LSM_OK ){ p->bMultiProc = pDb->bMultiProc; p->zName = (char *)&p[1]; p->nName = nName; memcpy((void *)p->zName, zName, nName+1); rc = lsmMutexNew(pEnv, &p->pClientMutex); } /* If nothing has gone wrong so far, open the shared fd. And if that ** succeeds and this connection requested single-process mode, ** attempt to take the exclusive lock on DMS2. */ if( rc==LSM_OK ){ int bReadonly = (pDb->bReadonly && pDb->bMultiProc); rc = dbOpenSharedFd(pDb->pEnv, p, bReadonly); } if( rc==LSM_OK && p->bMultiProc==0 ){ /* Hold an exclusive lock DMS1 while grabbing DMS2. This ensures ** that any ongoing call to doDbDisconnect() (even one in another ** process) is finished before proceeding. */ assert( p->bReadonly==0 ); rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_EXCL); if( rc==LSM_OK ){ rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS2, LSM_LOCK_EXCL); lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK); } } if( rc==LSM_OK ){ p->pDbNext = gShared.pDatabase; gShared.pDatabase = p; }else{ freeDatabase(pEnv, p); p = 0; } } if( p ){ p->nDbRef++; } leaveGlobalMutex(pEnv); if( p ){ lsmMutexEnter(pDb->pEnv, p->pClientMutex); pDb->pNext = p->pConn; p->pConn = pDb; lsmMutexLeave(pDb->pEnv, p->pClientMutex); } } pDb->pDatabase = p; if( rc==LSM_OK ){ assert( p ); rc = lsmFsOpen(pDb, zName, p->bReadonly); } /* If the db handle is read-write, then connect to the system now. Run ** recovery as necessary. Or, if this is a read-only database handle, ** defer attempting to connect to the system until a read-transaction ** is opened. */ if( rc==LSM_OK ){ rc = lsmFsConfigure(pDb); } if( rc==LSM_OK && pDb->bReadonly==0 ){ rc = doDbConnect(pDb); } return rc; } static void dbDeferClose(lsm_db *pDb){ if( pDb->pFS ){ LsmFile *pLsmFile; Database *p = pDb->pDatabase; pLsmFile = lsmFsDeferClose(pDb->pFS); pLsmFile->pNext = p->pLsmFile; p->pLsmFile = pLsmFile; } } LsmFile *lsmDbRecycleFd(lsm_db *db){ LsmFile *pRet; Database *p = db->pDatabase; lsmMutexEnter(db->pEnv, p->pClientMutex); if( (pRet = p->pLsmFile)!=0 ){ p->pLsmFile = pRet->pNext; } lsmMutexLeave(db->pEnv, p->pClientMutex); return pRet; } /* ** Release a reference to a Database object obtained from ** lsmDbDatabaseConnect(). There should be exactly one call to this function ** for each successful call to Find(). */ void lsmDbDatabaseRelease(lsm_db *pDb){ Database *p = pDb->pDatabase; if( p ){ lsm_db **ppDb; if( pDb->pShmhdr ){ doDbDisconnect(pDb); } lsmFsUnmap(pDb->pFS); lsmMutexEnter(pDb->pEnv, p->pClientMutex); for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext)); *ppDb = pDb->pNext; dbDeferClose(pDb); lsmMutexLeave(pDb->pEnv, p->pClientMutex); enterGlobalMutex(pDb->pEnv); p->nDbRef--; if( p->nDbRef==0 ){ LsmFile *pIter; LsmFile *pNext; Database **pp; /* Remove the Database structure from the linked list. */ for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext)); *pp = p->pDbNext; /* If they were allocated from the heap, free the shared memory chunks */ if( p->bMultiProc==0 ){ int i; for(i=0; inShmChunk; i++){ lsmFree(pDb->pEnv, p->apShmChunk[i]); } } /* Close any outstanding file descriptors */ for(pIter=p->pLsmFile; pIter; pIter=pNext){ pNext = pIter->pNext; lsmEnvClose(pDb->pEnv, pIter->pFile); lsmFree(pDb->pEnv, pIter); } freeDatabase(pDb->pEnv, p); } leaveGlobalMutex(pDb->pEnv); } } Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){ return pSnapshot->pLevel; } void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){ pSnap->pLevel = pLevel; } /* TODO: Shuffle things around to get rid of this */ static int firstSnapshotInUse(lsm_db *, i64 *); /* ** Context object used by the lsmWalkFreelist() utility. */ typedef struct WalkFreelistCtx WalkFreelistCtx; struct WalkFreelistCtx { lsm_db *pDb; int bReverse; Freelist *pFreelist; int iFree; int (*xUsr)(void *, int, i64); /* User callback function */ void *pUsrctx; /* User callback context */ int bDone; /* Set to true after xUsr() returns true */ }; /* ** Callback used by lsmWalkFreelist(). */ static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){ WalkFreelistCtx *p = (WalkFreelistCtx *)pCtx; const int iDir = (p->bReverse ? -1 : 1); Freelist *pFree = p->pFreelist; assert( p->bDone==0 ); assert( iBlk>=0 ); if( pFree ){ while( (p->iFree < pFree->nEntry) && p->iFree>=0 ){ FreelistEntry *pEntry = &pFree->aEntry[p->iFree]; if( (p->bReverse==0 && pEntry->iBlk>(u32)iBlk) || (p->bReverse!=0 && pEntry->iBlk<(u32)iBlk) ){ break; }else{ p->iFree += iDir; if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){ p->bDone = 1; return 1; } if( pEntry->iBlk==(u32)iBlk ) return 0; } } } if( p->xUsr(p->pUsrctx, iBlk, iSnapshot) ){ p->bDone = 1; return 1; } return 0; } /* ** The database handle passed as the first argument must be the worker ** connection. This function iterates through the contents of the current ** free block list, invoking the supplied callback once for each list ** element. ** ** The difference between this function and lsmSortedWalkFreelist() is ** that lsmSortedWalkFreelist() only considers those free-list elements ** stored within the LSM. This function also merges in any in-memory ** elements. */ int lsmWalkFreelist( lsm_db *pDb, /* Database handle (must be worker) */ int bReverse, /* True to iterate from largest to smallest */ int (*x)(void *, int, i64), /* Callback function */ void *pCtx /* First argument to pass to callback */ ){ const int iDir = (bReverse ? -1 : 1); int rc; int iCtx; WalkFreelistCtx ctx[2]; ctx[0].pDb = pDb; ctx[0].bReverse = bReverse; ctx[0].pFreelist = &pDb->pWorker->freelist; if( ctx[0].pFreelist && bReverse ){ ctx[0].iFree = ctx[0].pFreelist->nEntry-1; }else{ ctx[0].iFree = 0; } ctx[0].xUsr = walkFreelistCb; ctx[0].pUsrctx = (void *)&ctx[1]; ctx[0].bDone = 0; ctx[1].pDb = pDb; ctx[1].bReverse = bReverse; ctx[1].pFreelist = pDb->pFreelist; if( ctx[1].pFreelist && bReverse ){ ctx[1].iFree = ctx[1].pFreelist->nEntry-1; }else{ ctx[1].iFree = 0; } ctx[1].xUsr = x; ctx[1].pUsrctx = pCtx; ctx[1].bDone = 0; rc = lsmSortedWalkFreelist(pDb, bReverse, walkFreelistCb, (void *)&ctx[0]); if( ctx[0].bDone==0 ){ for(iCtx=0; iCtx<2; iCtx++){ int i; WalkFreelistCtx *p = &ctx[iCtx]; for(i=p->iFree; p->pFreelist && rc==LSM_OK && ipFreelist->nEntry && i>=0; i += iDir ){ FreelistEntry *pEntry = &p->pFreelist->aEntry[i]; if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){ return LSM_OK; } } } } return rc; } typedef struct FindFreeblockCtx FindFreeblockCtx; struct FindFreeblockCtx { i64 iInUse; int iRet; int bNotOne; }; static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){ FindFreeblockCtx *p = (FindFreeblockCtx *)pCtx; if( iSnapshotiInUse && (iBlk!=1 || p->bNotOne==0) ){ p->iRet = iBlk; return 1; } return 0; } static int findFreeblock(lsm_db *pDb, i64 iInUse, int bNotOne, int *piRet){ int rc; /* Return code */ FindFreeblockCtx ctx; /* Context object */ ctx.iInUse = iInUse; ctx.iRet = 0; ctx.bNotOne = bNotOne; rc = lsmWalkFreelist(pDb, 0, findFreeblockCb, (void *)&ctx); *piRet = ctx.iRet; return rc; } /* ** Allocate a new database file block to write data to, either by extending ** the database file or by recycling a free-list entry. The worker snapshot ** must be held in order to call this function. ** ** If successful, *piBlk is set to the block number allocated and LSM_OK is ** returned. Otherwise, *piBlk is zeroed and an lsm error code returned. */ int lsmBlockAllocate(lsm_db *pDb, int iBefore, int *piBlk){ Snapshot *p = pDb->pWorker; int iRet = 0; /* Block number of allocated block */ int rc = LSM_OK; i64 iInUse = 0; /* Snapshot id still in use */ i64 iSynced = 0; /* Snapshot id synced to disk */ assert( p ); #ifdef LSM_LOG_FREELIST { static int nCall = 0; char *zFree = 0; nCall++; rc = lsmInfoFreelist(pDb, &zFree); if( rc!=LSM_OK ) return rc; lsmLogMessage(pDb, 0, "lsmBlockAllocate(): %d freelist: %s", nCall, zFree); lsmFree(pDb->pEnv, zFree); } #endif /* Set iInUse to the smallest snapshot id that is either: ** ** * Currently in use by a database client, ** * May be used by a database client in the future, or ** * Is the most recently checkpointed snapshot (i.e. the one that will ** be used following recovery if a failure occurs at this point). */ rc = lsmCheckpointSynced(pDb, &iSynced, 0, 0); if( rc==LSM_OK && iSynced==0 ) iSynced = p->iId; iInUse = iSynced; if( rc==LSM_OK && pDb->iReader>=0 ){ assert( pDb->pClient ); iInUse = LSM_MIN(iInUse, pDb->pClient->iId); } if( rc==LSM_OK ) rc = firstSnapshotInUse(pDb, &iInUse); #ifdef LSM_LOG_FREELIST { lsmLogMessage(pDb, 0, "lsmBlockAllocate(): " "snapshot-in-use: %lld (iSynced=%lld) (client-id=%lld)", iInUse, iSynced, (pDb->iReader>=0 ? pDb->pClient->iId : 0) ); } #endif /* Unless there exists a read-only transaction (which prevents us from ** recycling any blocks regardless, query the free block list for a ** suitable block to reuse. ** ** It might seem more natural to check for a read-only transaction at ** the start of this function. However, it is better do wait until after ** the call to lsmCheckpointSynced() to do so. */ if( rc==LSM_OK ){ int bRotrans; rc = lsmDetectRoTrans(pDb, &bRotrans); if( rc==LSM_OK && bRotrans==0 ){ rc = findFreeblock(pDb, iInUse, (iBefore>0), &iRet); } } if( iBefore>0 && (iRet<=0 || iRet>=iBefore) ){ iRet = 0; }else if( rc==LSM_OK ){ /* If a block was found in the free block list, use it and remove it from ** the list. Otherwise, if no suitable block was found, allocate one from ** the end of the file. */ if( iRet>0 ){ #ifdef LSM_LOG_FREELIST lsmLogMessage(pDb, 0, "reusing block %d (snapshot-in-use=%lld)", iRet, iInUse); #endif rc = freelistAppend(pDb, iRet, -1); if( rc==LSM_OK ){ rc = dbTruncate(pDb, iInUse); } }else{ iRet = ++(p->nBlock); #ifdef LSM_LOG_FREELIST lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet); #endif } } assert( iBefore>0 || iRet>0 || rc!=LSM_OK ); *piBlk = iRet; return rc; } /* ** Free a database block. The worker snapshot must be held in order to call ** this function. ** ** If successful, LSM_OK is returned. Otherwise, an lsm error code (e.g. ** LSM_NOMEM). */ int lsmBlockFree(lsm_db *pDb, int iBlk){ Snapshot *p = pDb->pWorker; assert( lsmShmAssertWorker(pDb) ); #ifdef LSM_LOG_FREELIST lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk); #endif return freelistAppend(pDb, iBlk, p->iId); } /* ** Refree a database block. The worker snapshot must be held in order to call ** this function. ** ** Refreeing is required when a block is allocated using lsmBlockAllocate() ** but then not used. This function is used to push the block back onto ** the freelist. Refreeing a block is different from freeing is, as a refreed ** block may be reused immediately. Whereas a freed block can not be reused ** until (at least) after the next checkpoint. */ int lsmBlockRefree(lsm_db *pDb, int iBlk){ int rc = LSM_OK; /* Return code */ #ifdef LSM_LOG_FREELIST lsmLogMessage(pDb, LSM_OK, "lsmBlockRefree(): Refree block %d", iBlk); #endif rc = freelistAppend(pDb, iBlk, 0); return rc; } /* ** If required, copy a database checkpoint from shared memory into the ** database itself. ** ** The WORKER lock must not be held when this is called. This is because ** this function may indirectly call fsync(). And the WORKER lock should ** not be held that long (in case it is required by a client flushing an ** in-memory tree to disk). */ int lsmCheckpointWrite(lsm_db *pDb, u32 *pnWrite){ int rc; /* Return Code */ u32 nWrite = 0; assert( pDb->pWorker==0 ); assert( 1 || pDb->pClient==0 ); assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) ); rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0); if( rc!=LSM_OK ) return rc; rc = lsmCheckpointLoad(pDb, 0); if( rc==LSM_OK ){ int nBlock = lsmCheckpointNBlock(pDb->aSnapshot); ShmHeader *pShm = pDb->pShmhdr; int bDone = 0; /* True if checkpoint is already stored */ /* Check if this checkpoint has already been written to the database ** file. If so, set variable bDone to true. */ if( pShm->iMetaPage ){ MetaPage *pPg; /* Meta page */ u8 *aData; /* Meta-page data buffer */ int nData; /* Size of aData[] in bytes */ i64 iCkpt; /* Id of checkpoint just loaded */ i64 iDisk = 0; /* Id of checkpoint already stored in db */ iCkpt = lsmCheckpointId(pDb->aSnapshot, 0); rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg); if( rc==LSM_OK ){ aData = lsmFsMetaPageData(pPg, &nData); iDisk = lsmCheckpointId((u32 *)aData, 1); nWrite = lsmCheckpointNWrite((u32 *)aData, 1); lsmFsMetaPageRelease(pPg); } bDone = (iDisk>=iCkpt); } if( rc==LSM_OK && bDone==0 ){ int iMeta = (pShm->iMetaPage % 2) + 1; if( pDb->eSafety!=LSM_SAFETY_OFF ){ rc = lsmFsSyncDb(pDb->pFS, nBlock); } if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta); if( rc==LSM_OK && pDb->eSafety!=LSM_SAFETY_OFF){ rc = lsmFsSyncDb(pDb->pFS, 0); } if( rc==LSM_OK ){ pShm->iMetaPage = iMeta; nWrite = lsmCheckpointNWrite(pDb->aSnapshot, 0) - nWrite; } #ifdef LSM_LOG_WORK lsmLogMessage(pDb, 0, "finish checkpoint %d", (int)lsmCheckpointId(pDb->aSnapshot, 0) ); #endif } } lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0); if( pnWrite && rc==LSM_OK ) *pnWrite = nWrite; return rc; } int lsmBeginWork(lsm_db *pDb){ int rc; /* Attempt to take the WORKER lock */ rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0); /* Deserialize the current worker snapshot */ if( rc==LSM_OK ){ rc = lsmCheckpointLoadWorker(pDb); } return rc; } void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){ if( p ){ lsmSortedFreeLevel(pEnv, p->pLevel); lsmFree(pEnv, p->freelist.aEntry); lsmFree(pEnv, p->redirect.a); lsmFree(pEnv, p); } } /* ** Attempt to populate one of the read-lock slots to contain lock values ** iLsm/iShm. Or, if such a slot exists already, this function is a no-op. ** ** It is not an error if no slot can be populated because the write-lock ** cannot be obtained. If any other error occurs, return an LSM error code. ** Otherwise, LSM_OK. ** ** This function is called at various points to try to ensure that there ** always exists at least one read-lock slot that can be used by a read-only ** client. And so that, in the usual case, there is an "exact match" available ** whenever a read transaction is opened by any client. At present this ** function is called when: ** ** * A write transaction that called lsmTreeDiscardOld() is committed, and ** * Whenever the working snapshot is updated (i.e. lsmFinishWork()). */ static int dbSetReadLock(lsm_db *db, i64 iLsm, u32 iShm){ int rc = LSM_OK; ShmHeader *pShm = db->pShmhdr; int i; /* Check if there is already a slot containing the required values. */ for(i=0; iaReader[i]; if( p->iLsmId==iLsm && p->iTreeId==iShm ) return LSM_OK; } /* Iterate through all read-lock slots, attempting to take a write-lock ** on each of them. If a write-lock succeeds, populate the locked slot ** with the required values and break out of the loop. */ for(i=0; rc==LSM_OK && iaReader[i]; p->iLsmId = iLsm; p->iTreeId = iShm; lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0); break; } } return rc; } /* ** Release the read-lock currently held by connection db. */ int dbReleaseReadlock(lsm_db *db){ int rc = LSM_OK; if( db->iReader>=0 ){ rc = lsmShmLock(db, LSM_LOCK_READER(db->iReader), LSM_LOCK_UNLOCK, 0); db->iReader = -1; } db->bRoTrans = 0; return rc; } /* ** Argument bFlush is true if the contents of the in-memory tree has just ** been flushed to disk. The significance of this is that once the snapshot ** created to hold the updated state of the database is synced to disk, log ** file space can be recycled. */ void lsmFinishWork(lsm_db *pDb, int bFlush, int *pRc){ int rc = *pRc; assert( rc!=0 || pDb->pWorker ); if( pDb->pWorker ){ /* If no error has occurred, serialize the worker snapshot and write ** it to shared memory. */ if( rc==LSM_OK ){ rc = lsmSaveWorker(pDb, bFlush); } /* Assuming no error has occurred, update a read lock slot with the ** new snapshot id (see comments above function dbSetReadLock()). */ if( rc==LSM_OK ){ if( pDb->iReader<0 ){ rc = lsmTreeLoadHeader(pDb, 0); } if( rc==LSM_OK ){ rc = dbSetReadLock(pDb, pDb->pWorker->iId, pDb->treehdr.iUsedShmid); } } /* Free the snapshot object. */ lsmFreeSnapshot(pDb->pEnv, pDb->pWorker); pDb->pWorker = 0; } lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0); *pRc = rc; } /* ** Called when recovery is finished. */ int lsmFinishRecovery(lsm_db *pDb){ lsmTreeEndTransaction(pDb, 1); return LSM_OK; } /* ** Check if the currently configured compression functions ** (LSM_CONFIG_SET_COMPRESSION) are compatible with a database that has its ** compression id set to iReq. Compression routines are compatible if iReq ** is zero (indicating the database is empty), or if it is equal to the ** compression id of the configured compression routines. ** ** If the check shows that the current compression are incompatible and there ** is a compression factory registered, give it a chance to install new ** compression routines. ** ** If, after any registered factory is invoked, the compression functions ** are still incompatible, return LSM_MISMATCH. Otherwise, LSM_OK. */ int lsmCheckCompressionId(lsm_db *pDb, u32 iReq){ if( iReq!=LSM_COMPRESSION_EMPTY && pDb->compress.iId!=iReq ){ if( pDb->factory.xFactory ){ pDb->bInFactory = 1; pDb->factory.xFactory(pDb->factory.pCtx, pDb, iReq); pDb->bInFactory = 0; } if( pDb->compress.iId!=iReq ){ /* Incompatible */ return LSM_MISMATCH; } } /* Compatible */ return LSM_OK; } /* ** Begin a read transaction. This function is a no-op if the connection ** passed as the only argument already has an open read transaction. */ int lsmBeginReadTrans(lsm_db *pDb){ const int MAX_READLOCK_ATTEMPTS = 10; const int nMaxAttempt = (pDb->bRoTrans ? 1 : MAX_READLOCK_ATTEMPTS); int rc = LSM_OK; /* Return code */ int iAttempt = 0; assert( pDb->pWorker==0 ); while( rc==LSM_OK && pDb->iReader<0 && (iAttempt++)pCsr==0 && pDb->nTransOpen==0 ); /* Load the in-memory tree header. */ rc = lsmTreeLoadHeader(pDb, &iTreehdr); /* Load the database snapshot */ if( rc==LSM_OK ){ if( lsmCheckpointClientCacheOk(pDb)==0 ){ lsmFreeSnapshot(pDb->pEnv, pDb->pClient); pDb->pClient = 0; lsmMCursorFreeCache(pDb); lsmFsPurgeCache(pDb->pFS); rc = lsmCheckpointLoad(pDb, &iSnap); }else{ iSnap = 1; } } /* Take a read-lock on the tree and snapshot just loaded. Then check ** that the shared-memory still contains the same values. If so, proceed. ** Otherwise, relinquish the read-lock and retry the whole procedure ** (starting with loading the in-memory tree header). */ if( rc==LSM_OK ){ u32 iShmMax = pDb->treehdr.iUsedShmid; u32 iShmMin = pDb->treehdr.iNextShmid+1-LSM_MAX_SHMCHUNKS; rc = lsmReadlock( pDb, lsmCheckpointId(pDb->aSnapshot, 0), iShmMin, iShmMax ); if( rc==LSM_OK ){ if( lsmTreeLoadHeaderOk(pDb, iTreehdr) && lsmCheckpointLoadOk(pDb, iSnap) ){ /* Read lock has been successfully obtained. Deserialize the ** checkpoint just loaded. TODO: This will be removed after ** lsm_sorted.c is changed to work directly from the serialized ** version of the snapshot. */ if( pDb->pClient==0 ){ rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot,&pDb->pClient); } assert( (rc==LSM_OK)==(pDb->pClient!=0) ); assert( pDb->iReader>=0 ); /* Check that the client has the right compression hooks loaded. ** If not, set rc to LSM_MISMATCH. */ if( rc==LSM_OK ){ rc = lsmCheckCompressionId(pDb, pDb->pClient->iCmpId); } }else{ rc = dbReleaseReadlock(pDb); } } if( rc==LSM_BUSY ){ rc = LSM_OK; } } #if 0 if( rc==LSM_OK && pDb->pClient ){ fprintf(stderr, "reading %p: snapshot:%d used-shmid:%d trans-id:%d iOldShmid=%d\n", (void *)pDb, (int)pDb->pClient->iId, (int)pDb->treehdr.iUsedShmid, (int)pDb->treehdr.root.iTransId, (int)pDb->treehdr.iOldShmid ); } #endif } if( rc==LSM_OK ){ rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk); } if( rc!=LSM_OK ){ dbReleaseReadlock(pDb); } if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY; return rc; } /* ** This function is used by a read-write connection to determine if there ** are currently one or more read-only transactions open on the database ** (in this context a read-only transaction is one opened by a read-only ** connection on a non-live database). ** ** If no error occurs, LSM_OK is returned and *pbExists is set to true if ** some other connection has a read-only transaction open, or false ** otherwise. If an error occurs an LSM error code is returned and the final ** value of *pbExist is undefined. */ int lsmDetectRoTrans(lsm_db *db, int *pbExist){ int rc; /* Only a read-write connection may use this function. */ assert( db->bReadonly==0 ); rc = lsmShmTestLock(db, LSM_LOCK_ROTRANS, 1, LSM_LOCK_EXCL); if( rc==LSM_BUSY ){ *pbExist = 1; rc = LSM_OK; }else{ *pbExist = 0; } return rc; } /* ** db is a read-only database handle in the disconnected state. This function ** attempts to open a read-transaction on the database. This may involve ** connecting to the database system (opening shared memory etc.). */ int lsmBeginRoTrans(lsm_db *db){ int rc = LSM_OK; assert( db->bReadonly && db->pShmhdr==0 ); assert( db->iReader<0 ); if( db->bRoTrans==0 ){ /* Attempt a shared-lock on DMS1. */ rc = lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_SHARED, 0); if( rc!=LSM_OK ) return rc; rc = lsmShmTestLock( db, LSM_LOCK_RWCLIENT(0), LSM_LOCK_NREADER, LSM_LOCK_SHARED ); if( rc==LSM_OK ){ /* System is not live. Take a SHARED lock on the ROTRANS byte and ** release DMS1. Locking ROTRANS tells all read-write clients that they ** may not recycle any disk space from within the database or log files, ** as a read-only client may be using it. */ rc = lsmShmLock(db, LSM_LOCK_ROTRANS, LSM_LOCK_SHARED, 0); lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); if( rc==LSM_OK ){ db->bRoTrans = 1; rc = lsmShmCacheChunks(db, 1); if( rc==LSM_OK ){ db->pShmhdr = (ShmHeader *)db->apShm[0]; memset(db->pShmhdr, 0, sizeof(ShmHeader)); rc = lsmCheckpointRecover(db); if( rc==LSM_OK ){ rc = lsmLogRecover(db); } } } }else if( rc==LSM_BUSY ){ /* System is live! */ rc = lsmShmLock(db, LSM_LOCK_DMS3, LSM_LOCK_SHARED, 0); lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0); if( rc==LSM_OK ){ rc = lsmShmCacheChunks(db, 1); if( rc==LSM_OK ){ db->pShmhdr = (ShmHeader *)db->apShm[0]; } } } if( rc==LSM_OK ){ rc = lsmBeginReadTrans(db); } } return rc; } /* ** Close the currently open read transaction. */ void lsmFinishReadTrans(lsm_db *pDb){ /* Worker connections should not be closing read transactions. And ** read transactions should only be closed after all cursors and write ** transactions have been closed. Finally pClient should be non-NULL ** only iff pDb->iReader>=0. */ assert( pDb->pWorker==0 ); assert( pDb->pCsr==0 && pDb->nTransOpen==0 ); if( pDb->bRoTrans ){ int i; for(i=0; inShm; i++){ lsmFree(pDb->pEnv, pDb->apShm[i]); } lsmFree(pDb->pEnv, pDb->apShm); pDb->apShm = 0; pDb->nShm = 0; pDb->pShmhdr = 0; lsmShmLock(pDb, LSM_LOCK_ROTRANS, LSM_LOCK_UNLOCK, 0); } dbReleaseReadlock(pDb); } /* ** Open a write transaction. */ int lsmBeginWriteTrans(lsm_db *pDb){ int rc = LSM_OK; /* Return code */ ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */ assert( pDb->nTransOpen==0 ); assert( pDb->bDiscardOld==0 ); assert( pDb->bReadonly==0 ); /* If there is no read-transaction open, open one now. */ if( pDb->iReader<0 ){ rc = lsmBeginReadTrans(pDb); } /* Attempt to take the WRITER lock */ if( rc==LSM_OK ){ rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0); } /* If the previous writer failed mid-transaction, run emergency rollback. */ if( rc==LSM_OK && pShm->bWriter ){ rc = lsmTreeRepair(pDb); if( rc==LSM_OK ) pShm->bWriter = 0; } /* Check that this connection is currently reading from the most recent ** version of the database. If not, return LSM_BUSY. */ if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){ rc = LSM_BUSY; } if( rc==LSM_OK ){ rc = lsmLogBegin(pDb); } /* If everything was successful, set the "transaction-in-progress" flag ** and return LSM_OK. Otherwise, if some error occurred, relinquish the ** WRITER lock and return an error code. */ if( rc==LSM_OK ){ TreeHeader *p = &pDb->treehdr; pShm->bWriter = 1; p->root.iTransId++; if( lsmTreeHasOld(pDb) && p->iOldLog==pDb->pClient->iLogOff ){ lsmTreeDiscardOld(pDb); pDb->bDiscardOld = 1; } }else{ lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb); } return rc; } /* ** End the current write transaction. The connection is left with an open ** read transaction. It is an error to call this if there is no open write ** transaction. ** ** If the transaction was committed, then a commit record has already been ** written into the log file when this function is called. Or, if the ** transaction was rolled back, both the log file and in-memory tree ** structure have already been restored. In either case, this function ** merely releases locks and other resources held by the write-transaction. ** ** LSM_OK is returned if successful, or an LSM error code otherwise. */ int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){ int rc = LSM_OK; int bFlush = 0; lsmLogEnd(pDb, bCommit); if( rc==LSM_OK && bCommit && lsmTreeSize(pDb)>pDb->nTreeLimit ){ bFlush = 1; lsmTreeMakeOld(pDb); } lsmTreeEndTransaction(pDb, bCommit); if( rc==LSM_OK ){ if( bFlush && pDb->bAutowork ){ rc = lsmSortedAutoWork(pDb, 1); }else if( bCommit && pDb->bDiscardOld ){ rc = dbSetReadLock(pDb, pDb->pClient->iId, pDb->treehdr.iUsedShmid); } } pDb->bDiscardOld = 0; lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0); if( bFlush && pDb->bAutowork==0 && pDb->xWork ){ pDb->xWork(pDb, pDb->pWorkCtx); } return rc; } /* ** Return non-zero if the caller is holding the client mutex. */ #ifdef LSM_DEBUG int lsmHoldingClientMutex(lsm_db *pDb){ return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex); } #endif static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){ return( p->iLsmId && p->iLsmId<=iLsm && shm_sequence_ge(iShmMax, p->iTreeId) && shm_sequence_ge(p->iTreeId, iShmMin) ); } /* ** Obtain a read-lock on database version identified by the combination ** of snapshot iLsm and tree iTree. Return LSM_OK if successful, or ** an LSM error code otherwise. */ int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){ int rc = LSM_OK; ShmHeader *pShm = db->pShmhdr; int i; assert( db->iReader<0 ); assert( shm_sequence_ge(iShmMax, iShmMin) ); /* This is a no-op if the read-only transaction flag is set. */ if( db->bRoTrans ){ db->iReader = 0; return LSM_OK; } /* Search for an exact match. */ for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){ db->iReader = i; }else if( rc==LSM_BUSY ){ rc = LSM_OK; } } } /* Try to obtain a write-lock on each slot, in order. If successful, set ** the slot values to iLsm/iTree. */ for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; p->iLsmId = iLsm; p->iTreeId = iShmMax; rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); assert( rc!=LSM_BUSY ); if( rc==LSM_OK ) db->iReader = i; } } /* Search for any usable slot */ for(i=0; db->iReader<0 && rc==LSM_OK && iaReader[i]; if( slotIsUsable(p, iLsm, iShmMin, iShmMax) ){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0); if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMin, iShmMax) ){ db->iReader = i; }else if( rc==LSM_BUSY ){ rc = LSM_OK; } } } if( rc==LSM_OK && db->iReader<0 ){ rc = LSM_BUSY; } return rc; } /* ** This is used to check if there exists a read-lock locking a particular ** version of either the in-memory tree or database file. ** ** If iLsmId is non-zero, then it is a snapshot id. If there exists a ** read-lock using this snapshot or newer, set *pbInUse to true. Or, ** if there is no such read-lock, set it to false. ** ** Or, if iLsmId is zero, then iShmid is a shared-memory sequence id. ** Search for a read-lock using this sequence id or newer. etc. */ static int isInUse(lsm_db *db, i64 iLsmId, u32 iShmid, int *pbInUse){ ShmHeader *pShm = db->pShmhdr; int i; int rc = LSM_OK; for(i=0; rc==LSM_OK && iaReader[i]; if( p->iLsmId ){ if( (iLsmId!=0 && p->iLsmId!=0 && iLsmId>=p->iLsmId) || (iLsmId==0 && shm_sequence_ge(p->iTreeId, iShmid)) ){ rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0); if( rc==LSM_OK ){ p->iLsmId = 0; lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0); } } } } if( rc==LSM_BUSY ){ *pbInUse = 1; return LSM_OK; } *pbInUse = 0; return rc; } /* ** This function is called by worker connections to determine the smallest ** snapshot id that is currently in use by a database client. The worker ** connection uses this result to determine whether or not it is safe to ** recycle a database block. */ static int firstSnapshotInUse( lsm_db *db, /* Database handle */ i64 *piInUse /* IN/OUT: Smallest snapshot id in use */ ){ ShmHeader *pShm = db->pShmhdr; i64 iInUse = *piInUse; int i; assert( iInUse>0 ); for(i=0; iaReader[i]; if( p->iLsmId ){ i64 iThis = p->iLsmId; if( iThis!=0 && iInUse>iThis ){ int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0); if( rc==LSM_OK ){ p->iLsmId = 0; lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0); }else if( rc==LSM_BUSY ){ iInUse = iThis; }else{ /* Some error other than LSM_BUSY. Return the error code to ** the caller in this case. */ return rc; } } } } *piInUse = iInUse; return LSM_OK; } int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){ if( db->treehdr.iUsedShmid==iShmid ){ *pbInUse = 1; return LSM_OK; } return isInUse(db, 0, iShmid, pbInUse); } int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){ if( db->pClient && db->pClient->iId<=iLsmId ){ *pbInUse = 1; return LSM_OK; } return isInUse(db, iLsmId, 0, pbInUse); } /* ** This function may only be called after a successful call to ** lsmDbDatabaseConnect(). It returns true if the connection is in ** multi-process mode, or false otherwise. */ int lsmDbMultiProc(lsm_db *pDb){ return pDb->pDatabase && pDb->pDatabase->bMultiProc; } /************************************************************************* ************************************************************************** ************************************************************************** ************************************************************************** ************************************************************************** *************************************************************************/ /* ** Ensure that database connection db has cached pointers to at least the ** first nChunk chunks of shared memory. */ int lsmShmCacheChunks(lsm_db *db, int nChunk){ int rc = LSM_OK; if( nChunk>db->nShm ){ static const int NINCR = 16; Database *p = db->pDatabase; lsm_env *pEnv = db->pEnv; int nAlloc; int i; /* Ensure that the db->apShm[] array is large enough. If an attempt to ** allocate memory fails, return LSM_NOMEM immediately. The apShm[] array ** is always extended in multiples of 16 entries - so the actual allocated ** size can be inferred from nShm. */ nAlloc = ((db->nShm + NINCR - 1) / NINCR) * NINCR; while( nChunk>=nAlloc ){ void **apShm; nAlloc += NINCR; apShm = lsmRealloc(pEnv, db->apShm, sizeof(void*)*nAlloc); if( !apShm ) return LSM_NOMEM_BKPT; db->apShm = apShm; } if( db->bRoTrans ){ for(i=db->nShm; rc==LSM_OK && iapShm[i] = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc); db->nShm++; } }else{ /* Enter the client mutex */ lsmMutexEnter(pEnv, p->pClientMutex); /* Extend the Database objects apShmChunk[] array if necessary. Using the ** same pattern as for the lsm_db.apShm[] array above. */ nAlloc = ((p->nShmChunk + NINCR - 1) / NINCR) * NINCR; while( nChunk>=nAlloc ){ void **apShm; nAlloc += NINCR; apShm = lsmRealloc(pEnv, p->apShmChunk, sizeof(void*)*nAlloc); if( !apShm ){ rc = LSM_NOMEM_BKPT; break; } p->apShmChunk = apShm; } for(i=db->nShm; rc==LSM_OK && i=p->nShmChunk ){ void *pChunk = 0; if( p->bMultiProc==0 ){ /* Single process mode */ pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc); }else{ /* Multi-process mode */ rc = lsmEnvShmMap(pEnv, p->pFile, i, LSM_SHM_CHUNK_SIZE, &pChunk); } if( rc==LSM_OK ){ p->apShmChunk[i] = pChunk; p->nShmChunk++; } } if( rc==LSM_OK ){ db->apShm[i] = p->apShmChunk[i]; db->nShm++; } } /* Release the client mutex */ lsmMutexLeave(pEnv, p->pClientMutex); } } return rc; } static int lockSharedFile(lsm_env *pEnv, Database *p, int iLock, int eOp){ int rc = LSM_OK; if( p->bMultiProc ){ rc = lsmEnvLock(pEnv, p->pFile, iLock, eOp); } return rc; } /* ** Test if it would be possible for connection db to obtain a lock of type ** eType on the nLock locks starting at iLock. If so, return LSM_OK. If it ** would not be possible to obtain the lock due to a lock held by another ** connection, return LSM_BUSY. If an IO or other error occurs (i.e. in the ** lsm_env.xTestLock function), return some other LSM error code. ** ** Note that this function never actually locks the database - it merely ** queries the system to see if there exists a lock that would prevent ** it from doing so. */ int lsmShmTestLock( lsm_db *db, int iLock, int nLock, int eOp ){ int rc = LSM_OK; lsm_db *pIter; Database *p = db->pDatabase; int i; u64 mask = 0; for(i=iLock; i<(iLock+nLock); i++){ mask |= ((u64)1 << (iLock-1)); if( eOp==LSM_LOCK_EXCL ) mask |= ((u64)1 << (iLock+32-1)); } lsmMutexEnter(db->pEnv, p->pClientMutex); for(pIter=p->pConn; pIter; pIter=pIter->pNext){ if( pIter!=db && (pIter->mLock & mask) ){ assert( pIter!=db ); break; } } if( pIter ){ rc = LSM_BUSY; }else if( p->bMultiProc ){ rc = lsmEnvTestLock(db->pEnv, p->pFile, iLock, nLock, eOp); } lsmMutexLeave(db->pEnv, p->pClientMutex); return rc; } /* ** Attempt to obtain the lock identified by the iLock and bExcl parameters. ** If successful, return LSM_OK. If the lock cannot be obtained because ** there exists some other conflicting lock, return LSM_BUSY. If some other ** error occurs, return an LSM error code. ** ** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER, ** or else a value returned by the LSM_LOCK_READER macro. */ int lsmShmLock( lsm_db *db, int iLock, int eOp, /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */ int bBlock /* True for a blocking lock */ ){ lsm_db *pIter; const u64 me = ((u64)1 << (iLock-1)); const u64 ms = ((u64)1 << (iLock+32-1)); int rc = LSM_OK; Database *p = db->pDatabase; assert( eOp!=LSM_LOCK_EXCL || p->bReadonly==0 ); assert( iLock>=1 && iLock<=LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1) ); assert( LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1)<=32 ); assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL ); /* Check for a no-op. Proceed only if this is not one of those. */ if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0) || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms) || (eOp==LSM_LOCK_EXCL && (db->mLock & me)==0) ){ int nExcl = 0; /* Number of connections holding EXCLUSIVE */ int nShared = 0; /* Number of connections holding SHARED */ lsmMutexEnter(db->pEnv, p->pClientMutex); /* Figure out the locks currently held by this process on iLock, not ** including any held by connection db. */ for(pIter=p->pConn; pIter; pIter=pIter->pNext){ assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 ); if( pIter!=db ){ if( pIter->mLock & me ){ nExcl++; }else if( pIter->mLock & ms ){ nShared++; } } } assert( nExcl==0 || nExcl==1 ); assert( nExcl==0 || nShared==0 ); assert( nExcl==0 || (db->mLock & (me|ms))==0 ); switch( eOp ){ case LSM_LOCK_UNLOCK: if( nShared==0 ){ lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_UNLOCK); } db->mLock &= ~(me|ms); break; case LSM_LOCK_SHARED: if( nExcl ){ rc = LSM_BUSY; }else{ if( nShared==0 ){ rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_SHARED); } if( rc==LSM_OK ){ db->mLock |= ms; db->mLock &= ~me; } } break; default: assert( eOp==LSM_LOCK_EXCL ); if( nExcl || nShared ){ rc = LSM_BUSY; }else{ rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_EXCL); if( rc==LSM_OK ){ db->mLock |= (me|ms); } } break; } lsmMutexLeave(db->pEnv, p->pClientMutex); } return rc; } #ifdef LSM_DEBUG int shmLockType(lsm_db *db, int iLock){ const u64 me = ((u64)1 << (iLock-1)); const u64 ms = ((u64)1 << (iLock+32-1)); if( db->mLock & me ) return LSM_LOCK_EXCL; if( db->mLock & ms ) return LSM_LOCK_SHARED; return LSM_LOCK_UNLOCK; } /* ** The arguments passed to this function are similar to those passed to ** the lsmShmLock() function. However, instead of obtaining a new lock ** this function returns true if the specified connection already holds ** (or does not hold) such a lock, depending on the value of eOp. As ** follows: ** ** (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock ** (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock. ** (eOp==LSM_LOCK_EXCL) -> true if db has an EXCLUSIVE lock on iLock. */ int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){ int ret = 0; int eHave; assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) ); assert( iLock<=16 ); assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL ); eHave = shmLockType(db, iLock); switch( eOp ){ case LSM_LOCK_UNLOCK: ret = (eHave==LSM_LOCK_UNLOCK); break; case LSM_LOCK_SHARED: ret = (eHave!=LSM_LOCK_UNLOCK); break; case LSM_LOCK_EXCL: ret = (eHave==LSM_LOCK_EXCL); break; default: assert( !"bad eOp value passed to lsmShmAssertLock()" ); break; } return ret; } int lsmShmAssertWorker(lsm_db *db){ return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker; } /* ** This function does not contribute to library functionality, and is not ** included in release builds. It is intended to be called from within ** an interactive debugger. ** ** When called, this function prints a single line of human readable output ** to stdout describing the locks currently held by the connection. For ** example: ** ** (gdb) call print_db_locks(pDb) ** (shared on dms2) (exclusive on writer) */ void print_db_locks(lsm_db *db){ int iLock; for(iLock=0; iLock<16; iLock++){ int bOne = 0; const char *azLock[] = {0, "shared", "exclusive"}; const char *azName[] = { 0, "dms1", "dms2", "writer", "worker", "checkpointer", "reader0", "reader1", "reader2", "reader3", "reader4", "reader5" }; int eHave = shmLockType(db, iLock); if( azLock[eHave] ){ printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]); bOne = 1; } } printf("\n"); } void print_all_db_locks(lsm_db *db){ lsm_db *p; for(p=db->pDatabase->pConn; p; p=p->pNext){ printf("%s connection %p ", ((p==db)?"*":""), p); print_db_locks(p); } } #endif void lsmShmBarrier(lsm_db *db){ lsmEnvShmBarrier(db->pEnv); } int lsm_checkpoint(lsm_db *pDb, int *pnKB){ int rc; /* Return code */ u32 nWrite = 0; /* Number of pages checkpointed */ /* Attempt the checkpoint. If successful, nWrite is set to the number of ** pages written between this and the previous checkpoint. */ rc = lsmCheckpointWrite(pDb, &nWrite); /* If required, calculate the output variable (KB of data checkpointed). ** Set it to zero if an error occured. */ if( pnKB ){ int nKB = 0; if( rc==LSM_OK && nWrite ){ nKB = (((i64)nWrite * lsmFsPageSize(pDb->pFS)) + 1023) / 1024; } *pnKB = nKB; } return rc; }