Index: src/btree.c ================================================================== --- src/btree.c +++ src/btree.c @@ -2544,11 +2544,11 @@ } if( rc!=SQLITE_OK ){ unlockBtreeIfUnused(pBt); } - }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && + }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE && btreeInvokeBusyHandler(pBt) ); if( rc==SQLITE_OK ){ if( p->inTrans==TRANS_NONE ){ pBt->nTransaction++; Index: src/os.c ================================================================== --- src/os.c +++ src/os.c @@ -108,12 +108,12 @@ return id->pMethods->xShmGet(id, reqSize, pSize, pp); } int sqlite3OsShmRelease(sqlite3_file *id){ return id->pMethods->xShmRelease(id); } -int sqlite3OsShmLock(sqlite3_file *id, int desiredLock, int *pGotLock){ - return id->pMethods->xShmLock(id, desiredLock, pGotLock); +int sqlite3OsShmLock(sqlite3_file *id, int offset, int n, int flags){ + return id->pMethods->xShmLock(id, offset, n, flags); } void sqlite3OsShmBarrier(sqlite3_file *id){ id->pMethods->xShmBarrier(id); } int sqlite3OsShmClose(sqlite3_file *id, int deleteFlag){ Index: src/os.h ================================================================== --- src/os.h +++ src/os.h @@ -245,11 +245,11 @@ int sqlite3OsDeviceCharacteristics(sqlite3_file *id); int sqlite3OsShmOpen(sqlite3_file *id); int sqlite3OsShmSize(sqlite3_file *id, int, int*); int sqlite3OsShmGet(sqlite3_file *id, int, int*, void volatile**); int sqlite3OsShmRelease(sqlite3_file *id); -int sqlite3OsShmLock(sqlite3_file *id, int, int*); +int sqlite3OsShmLock(sqlite3_file *id, int, int, int); void sqlite3OsShmBarrier(sqlite3_file *id); int sqlite3OsShmClose(sqlite3_file *id, int); /* ** Functions for accessing sqlite3_vfs methods Index: src/os_unix.c ================================================================== --- src/os_unix.c +++ src/os_unix.c @@ -2064,11 +2064,11 @@ if( id ){ unixFile *pFile = (unixFile*)id; semUnlock(id, NO_LOCK); assert( pFile ); unixEnterMutex(); - 
releaseLockInfo(pFile->pInode); + releaseInodeInfo(pFile->pInode); unixLeaveMutex(); closeUnixFile(id); } return SQLITE_OK; } @@ -2531,11 +2531,11 @@ ** descriptor to pInode->aPending. It will be automatically closed when ** the last lock is cleared. */ setPendingFd(pFile); } - releaseLockInfo(pFile->pInode); + releaseInodeInfo(pFile->pInode); sqlite3_free(pFile->lockingContext); rc = closeUnixFile(id); unixLeaveMutex(); } return rc; @@ -3166,270 +3166,97 @@ ** while accessing any read/write fields. */ struct unixShm { unixShmNode *pShmNode; /* The underlying unixShmNode object */ unixShm *pNext; /* Next unixShm with the same unixShmNode */ - u8 lockState; /* Current lock state */ u8 hasMutex; /* True if holding the unixShmNode mutex */ u8 hasMutexBuf; /* True if holding pFile->mutexBuf */ - u8 sharedMask; /* Mask of shared locks held */ - u8 exclMask; /* Mask of exclusive locks held */ + u16 sharedMask; /* Mask of shared locks held */ + u16 exclMask; /* Mask of exclusive locks held */ #ifdef SQLITE_DEBUG u8 id; /* Id of this connection within its unixShmNode */ #endif }; -/* -** Size increment by which shared memory grows -*/ -#define SQLITE_UNIX_SHM_INCR 4096 - /* ** Constants used for locking */ -#define UNIX_SHM_BASE 80 /* Byte offset of the first lock byte */ -#define UNIX_SHM_DMS 0x01 /* Mask for Dead-Man-Switch lock */ -#define UNIX_SHM_A 0x10 /* Mask for region locks... */ -#define UNIX_SHM_B 0x20 -#define UNIX_SHM_C 0x40 -#define UNIX_SHM_D 0x80 - -#ifdef SQLITE_DEBUG -/* -** Return a pointer to a nul-terminated string in static memory that -** describes a locking mask. The string is of the form "MSABCD" with -** each character representing a lock. "M" for MUTEX, "S" for DMS, -** and "A" through "D" for the region locks. If a lock is held, the -** letter is shown. If the lock is not held, the letter is converted -** to ".". -** -** This routine is for debugging purposes only and does not appear -** in a production build. 
-*/ -static const char *unixShmLockString(u8 mask){ - static char zBuf[48]; - static int iBuf = 0; - char *z; - - z = &zBuf[iBuf]; - iBuf += 8; - if( iBuf>=sizeof(zBuf) ) iBuf = 0; - - z[0] = (mask & UNIX_SHM_DMS) ? 'S' : '.'; - z[1] = (mask & UNIX_SHM_A) ? 'A' : '.'; - z[2] = (mask & UNIX_SHM_B) ? 'B' : '.'; - z[3] = (mask & UNIX_SHM_C) ? 'C' : '.'; - z[4] = (mask & UNIX_SHM_D) ? 'D' : '.'; - z[5] = 0; - return z; -} -#endif /* SQLITE_DEBUG */ - -/* -** Apply posix advisory locks for all bytes identified in lockMask. -** -** lockMask might contain multiple bits but all bits are guaranteed -** to be contiguous. +#define UNIX_SHM_BASE ((18+SQLITE_SHM_NLOCK)*4) /* first lock byte */ +#define UNIX_SHM_DMS (UNIX_SHM_BASE+SQLITE_SHM_NLOCK) /* deadman switch */ + +/* +** Apply posix advisory locks for all bytes from ofst through ofst+n-1. ** ** Locks block if the mask is exactly UNIX_SHM_C and are non-blocking ** otherwise. */ static int unixShmSystemLock( unixShmNode *pShmNode, /* Apply locks to this open shared-memory segment */ int lockType, /* F_UNLCK, F_RDLCK, or F_WRLCK */ - u8 lockMask /* Which bytes to lock or unlock */ + int ofst, /* First byte of the locking range */ + int n /* Number of bytes to lock */ ){ struct flock f; /* The posix advisory locking structure */ - int lockOp; /* The opcode for fcntl() */ - int i; /* Offset into the locking byte range */ - int rc; /* Result code form fcntl() */ - u8 mask; /* Mask of bits in lockMask */ + int rc = SQLITE_OK; /* Result code form fcntl() */ /* Access to the unixShmNode object is serialized by the caller */ assert( sqlite3_mutex_held(pShmNode->mutex) || pShmNode->nRef==0 ); + + /* Shared locks never span more than one byte */ + assert( n==1 || lockType!=F_RDLCK ); + + /* Locks are within range */ + assert( n>=1 && nh, lockOp, &f); + f.l_start = ofst; + f.l_len = n; + + rc = fcntl(pShmNode->h, F_SETLK, &f); rc = (rc!=(-1)) ? 
SQLITE_OK : SQLITE_BUSY; /* Update the global lock state and do debug tracing */ #ifdef SQLITE_DEBUG + { u16 mask; OSTRACE(("SHM-LOCK ")); + mask = (1<<(ofst+n)) - (1<exclMask &= ~lockMask; - pShmNode->sharedMask &= ~lockMask; + OSTRACE(("unlock %d ok", ofst)); + pShmNode->exclMask &= ~mask; + pShmNode->sharedMask &= ~mask; }else if( lockType==F_RDLCK ){ - OSTRACE(("read-lock ok")); - pShmNode->exclMask &= ~lockMask; - pShmNode->sharedMask |= lockMask; + OSTRACE(("read-lock %d ok", ofst)); + pShmNode->exclMask &= ~mask; + pShmNode->sharedMask |= mask; }else{ assert( lockType==F_WRLCK ); - OSTRACE(("write-lock ok")); - pShmNode->exclMask |= lockMask; - pShmNode->sharedMask &= ~lockMask; + OSTRACE(("write-lock %d ok", ofst)); + pShmNode->exclMask |= mask; + pShmNode->sharedMask &= ~mask; } }else{ if( lockType==F_UNLCK ){ - OSTRACE(("unlock failed")); + OSTRACE(("unlock %d failed", ofst)); }else if( lockType==F_RDLCK ){ OSTRACE(("read-lock failed")); }else{ assert( lockType==F_WRLCK ); - OSTRACE(("write-lock failed")); + OSTRACE(("write-lock %d failed", ofst)); } } - OSTRACE((" - change requested %s - afterwards %s:%s\n", - unixShmLockString(lockMask), - unixShmLockString(pShmNode->sharedMask), - unixShmLockString(pShmNode->exclMask))); + OSTRACE((" - afterwards %03x,%03x\n", + pShmNode->sharedMask, pShmNode->exclMask)); + } #endif return rc; } -/* -** For connection p, unlock all of the locks identified by the unlockMask -** parameter. 
-*/ -static int unixShmUnlock( - unixShmNode *pShmNode, /* The underlying shared-memory file */ - unixShm *p, /* The connection to be unlocked */ - u8 unlockMask /* Mask of locks to be unlocked */ -){ - int rc; /* Result code */ - unixShm *pX; /* For looping over all sibling connections */ - u8 allMask; /* Union of locks held by connections other than "p" */ - - /* Access to the unixShmNode object is serialized by the caller */ - assert( sqlite3_mutex_held(pShmNode->mutex) ); - - /* Compute locks held by sibling connections */ - allMask = 0; - for(pX=pShmNode->pFirst; pX; pX=pX->pNext){ - if( pX==p ) continue; - assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 ); - allMask |= pX->sharedMask; - } - - /* Unlock the system-level locks */ - if( (unlockMask & allMask)!=unlockMask ){ - rc = unixShmSystemLock(pShmNode, F_UNLCK, unlockMask & ~allMask); - }else{ - rc = SQLITE_OK; - } - - /* Undo the local locks */ - if( rc==SQLITE_OK ){ - p->exclMask &= ~unlockMask; - p->sharedMask &= ~unlockMask; - } - return rc; -} - -/* -** Get reader locks for connection p on all locks in the readMask parameter. -*/ -static int unixShmSharedLock( - unixShmNode *pShmNode, /* The underlying shared-memory file */ - unixShm *p, /* The connection to get the shared locks */ - u8 readMask /* Mask of shared locks to be acquired */ -){ - int rc; /* Result code */ - unixShm *pX; /* For looping over all sibling connections */ - u8 allShared; /* Union of locks held by connections other than "p" */ - - /* Access to the unixShmNode object is serialized by the caller */ - assert( sqlite3_mutex_held(pShmNode->mutex) ); - - /* Find out which shared locks are already held by sibling connections. - ** If any sibling already holds an exclusive lock, go ahead and return - ** SQLITE_BUSY. 
- */ - allShared = 0; - for(pX=pShmNode->pFirst; pX; pX=pX->pNext){ - if( pX==p ) continue; - if( (pX->exclMask & readMask)!=0 ) return SQLITE_BUSY; - allShared |= pX->sharedMask; - } - - /* Get shared locks at the system level, if necessary */ - if( (~allShared) & readMask ){ - rc = unixShmSystemLock(pShmNode, F_RDLCK, readMask); - }else{ - rc = SQLITE_OK; - } - - /* Get the local shared locks */ - if( rc==SQLITE_OK ){ - p->sharedMask |= readMask; - } - return rc; -} - -/* -** For connection p, get an exclusive lock on all locks identified in -** the writeMask parameter. -*/ -static int unixShmExclusiveLock( - unixShmNode *pShmNode, /* The underlying shared-memory file */ - unixShm *p, /* The connection to get the exclusive locks */ - u8 writeMask /* Mask of exclusive locks to be acquired */ -){ - int rc; /* Result code */ - unixShm *pX; /* For looping over all sibling connections */ - - /* Access to the unixShmNode object is serialized by the caller */ - assert( sqlite3_mutex_held(pShmNode->mutex) ); - - /* Make sure no sibling connections hold locks that will block this - ** lock. If any do, return SQLITE_BUSY right away. - */ - for(pX=pShmNode->pFirst; pX; pX=pX->pNext){ - if( pX==p ) continue; - if( (pX->exclMask & writeMask)!=0 ) return SQLITE_BUSY; - if( (pX->sharedMask & writeMask)!=0 ) return SQLITE_BUSY; - } - - /* Get the exclusive locks at the system level. Then if successful - ** also mark the local connection as being locked. - */ - rc = unixShmSystemLock(pShmNode, F_WRLCK, writeMask); - if( rc==SQLITE_OK ){ - p->sharedMask &= ~writeMask; - p->exclMask |= writeMask; - } - return rc; -} /* ** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0. ** ** This is not a VFS shared-memory method; it is a utility function called @@ -3518,17 +3345,17 @@ /* Check to see if another process is holding the dead-man switch. ** If not, truncate the file to zero length. 
*/ rc = SQLITE_OK; - if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS)==SQLITE_OK ){ + if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS, 1)==SQLITE_OK ){ if( ftruncate(pShmNode->h, 0) ){ rc = SQLITE_IOERR; } } if( rc==SQLITE_OK ){ - rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS); + rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS, 1); } if( rc ) goto shm_open_err; } /* Make the new connection a child of the unixShmNode */ @@ -3570,21 +3397,18 @@ pShmNode = p->pShmNode; assert( pShmNode==pDbFd->pInode->pShmNode ); assert( pShmNode->pInode==pDbFd->pInode ); - /* Verify that the connection being closed holds no locks */ - assert( p->exclMask==0 ); - assert( p->sharedMask==0 ); - /* Remove connection p from the set of connections associated ** with pShmNode */ sqlite3_mutex_enter(pShmNode->mutex); for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){} *pp = p->pNext; /* Free the connection p */ + assert( p->hasMutexBuf==0 ); sqlite3_free(p); pDbFd->pShm = 0; sqlite3_mutex_leave(pShmNode->mutex); /* If pShmNode->nRef has reached 0, then close the underlying @@ -3639,10 +3463,31 @@ reqSize = -1; } return rc; } +/* +** Release the lock held on the shared memory segment to that other +** threads are free to resize it if necessary. +** +** If the lock is not currently held, this routine is a harmless no-op. +** +** If the shared-memory object is in lock state RECOVER, then we do not +** really want to release the lock, so in that case too, this routine +** is a no-op. +*/ +static int unixShmRelease(sqlite3_file *fd){ + unixFile *pDbFd = (unixFile*)fd; + unixShm *p = pDbFd->pShm; + + if( p->hasMutexBuf ){ + assert( sqlite3_mutex_notheld(p->pShmNode->mutex) ); + sqlite3_mutex_leave(p->pShmNode->mutexBuf); + p->hasMutexBuf = 0; + } + return SQLITE_OK; +} /* ** Map the shared storage into memory. 
** ** If reqMapSize is positive, then an attempt is made to make the @@ -3685,11 +3530,11 @@ int rc = SQLITE_OK; assert( pShmNode==pDbFd->pInode->pShmNode ); assert( pShmNode->pInode==pDbFd->pInode ); - if( p->lockState!=SQLITE_SHM_CHECKPOINT && p->hasMutexBuf==0 ){ + if( p->hasMutexBuf==0 ){ assert( sqlite3_mutex_notheld(pShmNode->mutex) ); sqlite3_mutex_enter(pShmNode->mutexBuf); p->hasMutexBuf = 1; } sqlite3_mutex_enter(pShmNode->mutex); @@ -3712,176 +3557,127 @@ } } *pNewMapSize = pShmNode->szMap; *ppBuf = pShmNode->pMMapBuf; sqlite3_mutex_leave(pShmNode->mutex); + if( *ppBuf==0 ){ + /* Do not hold the mutex if a NULL pointer is being returned. */ + unixShmRelease(fd); + } return rc; } -/* -** Release the lock held on the shared memory segment to that other -** threads are free to resize it if necessary. -** -** If the lock is not currently held, this routine is a harmless no-op. -** -** If the shared-memory object is in lock state RECOVER, then we do not -** really want to release the lock, so in that case too, this routine -** is a no-op. -*/ -static int unixShmRelease(sqlite3_file *fd){ - unixFile *pDbFd = (unixFile*)fd; - unixShm *p = pDbFd->pShm; - - if( p->hasMutexBuf && p->lockState!=SQLITE_SHM_RECOVER ){ - assert( sqlite3_mutex_notheld(p->pShmNode->mutex) ); - sqlite3_mutex_leave(p->pShmNode->mutexBuf); - p->hasMutexBuf = 0; - } - return SQLITE_OK; -} - -/* -** Symbolic names for LOCK states used for debugging. -*/ -#ifdef SQLITE_DEBUG -static const char *azLkName[] = { - "UNLOCK", - "READ", - "READ_FULL", - "WRITE", - "PENDING", - "CHECKPOINT", - "RECOVER" -}; -#endif - /* ** Change the lock state for a shared-memory segment. +** +** Note that the relationship between SHAREd and EXCLUSIVE locks is a little +** different here than in posix. In xShmLock(), one can go from unlocked +** to shared and back or from unlocked to exclusive and back. But one may +** not go from shared to exclusive or from exclusive to shared. 
*/ static int unixShmLock( sqlite3_file *fd, /* Database file holding the shared memory */ - int desiredLock, /* One of SQLITE_SHM_xxxxx locking states */ - int *pGotLock /* The lock you actually got */ + int ofst, /* First lock to acquire or release */ + int n, /* Number of locks to acquire or release */ + int flags /* What to do with the lock */ ){ - unixFile *pDbFd = (unixFile*)fd; - unixShm *p = pDbFd->pShm; - unixShmNode *pShmNode = p->pShmNode; - int rc = SQLITE_PROTOCOL; + unixFile *pDbFd = (unixFile*)fd; /* Connection holding shared memory */ + unixShm *p = pDbFd->pShm; /* The shared memory being locked */ + unixShm *pX; /* For looping over all siblings */ + unixShmNode *pShmNode = p->pShmNode; /* The underlying file iNode */ + int rc = SQLITE_OK; /* Result code */ + u16 mask; /* Mask of locks to take or release */ assert( pShmNode==pDbFd->pInode->pShmNode ); assert( pShmNode->pInode==pDbFd->pInode ); - - /* Note that SQLITE_SHM_READ_FULL and SQLITE_SHM_PENDING are never - ** directly requested; they are side effects from requesting - ** SQLITE_SHM_READ and SQLITE_SHM_CHECKPOINT, respectively. - */ - assert( desiredLock==SQLITE_SHM_UNLOCK - || desiredLock==SQLITE_SHM_READ - || desiredLock==SQLITE_SHM_WRITE - || desiredLock==SQLITE_SHM_CHECKPOINT - || desiredLock==SQLITE_SHM_RECOVER ); - - /* Return directly if this is just a lock state query, or if - ** the connection is already in the desired locking state. 
- */ - if( desiredLock==p->lockState - || (desiredLock==SQLITE_SHM_READ && p->lockState==SQLITE_SHM_READ_FULL) - ){ - OSTRACE(("SHM-LOCK shmid-%d, pid-%d request %s and got %s\n", - p->id, getpid(), azLkName[desiredLock], azLkName[p->lockState])); - if( pGotLock ) *pGotLock = p->lockState; - return SQLITE_OK; - } - - OSTRACE(("SHM-LOCK shmid-%d, pid-%d request %s->%s\n", - p->id, getpid(), azLkName[p->lockState], azLkName[desiredLock])); - - if( desiredLock==SQLITE_SHM_RECOVER && !p->hasMutexBuf ){ - assert( sqlite3_mutex_notheld(pShmNode->mutex) ); - sqlite3_mutex_enter(pShmNode->mutexBuf); - p->hasMutexBuf = 1; - } - sqlite3_mutex_enter(pShmNode->mutex); - switch( desiredLock ){ - case SQLITE_SHM_UNLOCK: { - assert( p->lockState!=SQLITE_SHM_RECOVER ); - unixShmUnlock(pShmNode, p, UNIX_SHM_A|UNIX_SHM_B|UNIX_SHM_C|UNIX_SHM_D); - rc = SQLITE_OK; - p->lockState = SQLITE_SHM_UNLOCK; - break; - } - case SQLITE_SHM_READ: { - if( p->lockState==SQLITE_SHM_UNLOCK ){ - int nAttempt; - rc = SQLITE_BUSY; - assert( p->lockState==SQLITE_SHM_UNLOCK ); - for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){ - rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_A|UNIX_SHM_B); - if( rc==SQLITE_BUSY ){ - rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_D); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_READ_FULL; - } - }else{ - unixShmUnlock(pShmNode, p, UNIX_SHM_B); - p->lockState = SQLITE_SHM_READ; - } - } - }else{ - assert( p->lockState==SQLITE_SHM_WRITE - || p->lockState==SQLITE_SHM_RECOVER ); - rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_A); - unixShmUnlock(pShmNode, p, UNIX_SHM_C|UNIX_SHM_D); - p->lockState = SQLITE_SHM_READ; - } - break; - } - case SQLITE_SHM_WRITE: { - assert( p->lockState==SQLITE_SHM_READ - || p->lockState==SQLITE_SHM_READ_FULL ); - rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_C|UNIX_SHM_D); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_WRITE; - } - break; - } - case SQLITE_SHM_CHECKPOINT: { - assert( p->lockState==SQLITE_SHM_UNLOCK - || 
p->lockState==SQLITE_SHM_PENDING - ); - if( p->lockState==SQLITE_SHM_UNLOCK ){ - rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_B|UNIX_SHM_C); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_PENDING; - } - } - if( p->lockState==SQLITE_SHM_PENDING ){ - rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_A); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_CHECKPOINT; - } - } - break; - } - default: { - assert( desiredLock==SQLITE_SHM_RECOVER ); - assert( p->lockState==SQLITE_SHM_READ - || p->lockState==SQLITE_SHM_READ_FULL - ); - assert( sqlite3_mutex_held(pShmNode->mutexBuf) ); - rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_C); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_RECOVER; - } - break; + assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK ); + assert( n>=1 ); + assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) + || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) + || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) + || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) ); + assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 ); + + mask = (1<<(ofst+n)) - (1<1 || mask==(1<mutex); + if( flags & SQLITE_SHM_UNLOCK ){ + u16 allMask = 0; /* Mask of locks held by siblings */ + + /* See if any siblings hold this same lock */ + for(pX=pShmNode->pFirst; pX; pX=pX->pNext){ + if( pX==p ) continue; + assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 ); + allMask |= pX->sharedMask; + } + + /* Unlock the system-level locks */ + if( (mask & allMask)==0 ){ + rc = unixShmSystemLock(pShmNode, F_UNLCK, ofst+UNIX_SHM_BASE, n); + }else{ + rc = SQLITE_OK; + } + + /* Undo the local locks */ + if( rc==SQLITE_OK ){ + p->exclMask &= ~mask; + p->sharedMask &= ~mask; + } + }else if( flags & SQLITE_SHM_SHARED ){ + u16 allShared = 0; /* Union of locks held by connections other than "p" */ + + /* Find out which shared locks are already held by sibling connections. + ** If any sibling already holds an exclusive lock, go ahead and return + ** SQLITE_BUSY. 
+ */ + for(pX=pShmNode->pFirst; pX; pX=pX->pNext){ + if( (pX->exclMask & mask)!=0 ){ + rc = SQLITE_BUSY; + break; + } + allShared |= pX->sharedMask; + } + + /* Get shared locks at the system level, if necessary */ + if( rc==SQLITE_OK ){ + if( (allShared & mask)==0 ){ + rc = unixShmSystemLock(pShmNode, F_RDLCK, ofst+UNIX_SHM_BASE, n); + }else{ + rc = SQLITE_OK; + } + } + + /* Get the local shared locks */ + if( rc==SQLITE_OK ){ + p->sharedMask |= mask; + } + }else{ + /* Make sure no sibling connections hold locks that will block this + ** lock. If any do, return SQLITE_BUSY right away. + */ + for(pX=pShmNode->pFirst; pX; pX=pX->pNext){ + if( (pX->exclMask & mask)!=0 || (pX->sharedMask & mask)!=0 ){ + rc = SQLITE_BUSY; + break; + } + } + + /* Get the exclusive locks at the system level. Then if successful + ** also mark the local connection as being locked. + */ + if( rc==SQLITE_OK ){ + rc = unixShmSystemLock(pShmNode, F_WRLCK, ofst+UNIX_SHM_BASE, n); + if( rc==SQLITE_OK ){ + assert( (p->sharedMask & mask)==0 ); + p->exclMask |= mask; + } } } sqlite3_mutex_leave(pShmNode->mutex); - OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %s\n", - p->id, getpid(), azLkName[p->lockState])); - if( pGotLock ) *pGotLock = p->lockState; + OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %03x,%03x\n", + p->id, getpid(), p->sharedMask, p->exclMask)); return rc; } /* ** Implement a memory barrier or memory fence on shared memory. @@ -3890,16 +3686,12 @@ ** any load or store begun after the barrier. */ static void unixShmBarrier( sqlite3_file *fd /* Database file holding the shared memory */ ){ -#ifdef __GNUC__ - __sync_synchronize(); -#else - unixMutexEnter(); - unixMutexLeave(); -#endif + unixEnterMutex(); + unixLeaveMutex(); } #else # define unixShmOpen 0 Index: src/os_win.c ================================================================== --- src/os_win.c +++ src/os_win.c @@ -1225,12 +1225,10 @@ char *pMMapBuf; /* Where currently mmapped(). 
NULL if unmapped */ int nRef; /* Number of winShm objects pointing to this */ winShm *pFirst; /* All winShm objects pointing to this */ winShmNode *pNext; /* Next in list of all winShmNode objects */ #ifdef SQLITE_DEBUG - u8 exclMask; /* Mask of exclusive locks held */ - u8 sharedMask; /* Mask of shared locks held */ u8 nextShmId; /* Next available winShm.id value */ #endif }; /* @@ -1251,310 +1249,64 @@ ** and is read-only thereafter. */ struct winShm { winShmNode *pShmNode; /* The underlying winShmNode object */ winShm *pNext; /* Next winShm with the same winShmNode */ - u8 lockState; /* Current lock state */ u8 hasMutex; /* True if holding the winShmNode mutex */ u8 hasMutexBuf; /* True if holding pFile->mutexBuf */ - u8 sharedMask; /* Mask of shared locks held */ - u8 exclMask; /* Mask of exclusive locks held */ #ifdef SQLITE_DEBUG u8 id; /* Id of this connection with its winShmNode */ #endif }; -/* -** Size increment by which shared memory grows -*/ -#define SQLITE_WIN_SHM_INCR 4096 - /* ** Constants used for locking */ -#define WIN_SHM_BASE 80 /* Byte offset of the first lock byte */ -#define WIN_SHM_DMS 0x01 /* Mask for Dead-Man-Switch lock */ -#define WIN_SHM_A 0x10 /* Mask for region locks... */ -#define WIN_SHM_B 0x20 -#define WIN_SHM_C 0x40 -#define WIN_SHM_D 0x80 - -#ifdef SQLITE_DEBUG -/* -** Return a pointer to a nul-terminated string in static memory that -** describes a locking mask. The string is of the form "MSABCD" with -** each character representing a lock. "M" for MUTEX, "S" for DMS, -** and "A" through "D" for the region locks. If a lock is held, the -** letter is shown. If the lock is not held, the letter is converted -** to ".". -** -** This routine is for debugging purposes only and does not appear -** in a production build. -*/ -static const char *winShmLockString(u8 mask){ - static char zBuf[48]; - static int iBuf = 0; - char *z; - - z = &zBuf[iBuf]; - iBuf += 8; - if( iBuf>=sizeof(zBuf) ) iBuf = 0; - - z[0] = (mask & WIN_SHM_DMS) ? 
'S' : '.'; - z[1] = (mask & WIN_SHM_A) ? 'A' : '.'; - z[2] = (mask & WIN_SHM_B) ? 'B' : '.'; - z[3] = (mask & WIN_SHM_C) ? 'C' : '.'; - z[4] = (mask & WIN_SHM_D) ? 'D' : '.'; - z[5] = 0; - return z; -} -#endif /* SQLITE_DEBUG */ - -/* -** Apply posix advisory locks for all bytes identified in lockMask. -** -** lockMask might contain multiple bits but all bits are guaranteed -** to be contiguous. -** -** Locks block if the mask is exactly WIN_SHM_C and are non-blocking -** otherwise. +#define WIN_SHM_BASE ((18+SQLITE_SHM_NLOCK)*4) /* first lock byte */ +#define WIN_SHM_DMS (WIN_SHM_BASE+SQLITE_SHM_NLOCK) /* deadman switch */ + +/* +** Apply advisory locks for all n bytes beginning at ofst. */ #define _SHM_UNLCK 1 #define _SHM_RDLCK 2 #define _SHM_WRLCK 3 static int winShmSystemLock( winShmNode *pFile, /* Apply locks to this open shared-memory segment */ int lockType, /* _SHM_UNLCK, _SHM_RDLCK, or _SHM_WRLCK */ - u8 lockMask /* Which bytes to lock or unlock */ + int ofst, /* Offset to first byte to be locked/unlocked */ + int nByte /* Number of bytes to lock or unlock */ ){ OVERLAPPED ovlp; DWORD dwFlags; - int nBytes; /* Number of bytes to lock */ - int i; /* Offset into the locking byte range */ int rc = 0; /* Result code form Lock/UnlockFileEx() */ - u8 mask; /* Mask of bits in lockMask */ /* Access to the winShmNode object is serialized by the caller */ assert( sqlite3_mutex_held(pFile->mutex) || pFile->nRef==0 ); /* Initialize the locking parameters */ - if( lockMask==WIN_SHM_C && lockType!=_SHM_UNLCK ){ - dwFlags = 0; - OSTRACE(("SHM-LOCK %d requesting blocking lock %s\n", - pFile->hFile.h, - winShmLockString(lockMask))); - }else{ - dwFlags = LOCKFILE_FAIL_IMMEDIATELY; - OSTRACE(("SHM-LOCK %d requesting %s %s\n", - pFile->hFile.h, - lockType!=_SHM_UNLCK ? 
"lock" : "unlock", - winShmLockString(lockMask))); - } + dwFlags = LOCKFILE_FAIL_IMMEDIATELY; if( lockType == _SHM_WRLCK ) dwFlags |= LOCKFILE_EXCLUSIVE_LOCK; /* Find the first bit in lockMask that is set */ - for(i=0, mask=0x01; mask!=0 && (lockMask&mask)==0; mask <<= 1, i++){} - assert( mask!=0 ); memset(&ovlp, 0, sizeof(OVERLAPPED)); - ovlp.Offset = i+WIN_SHM_BASE; - nBytes = 1; - - /* Extend the locking range for each additional bit that is set */ - mask <<= 1; - while( mask!=0 && (lockMask & mask)!=0 ){ - nBytes++; - mask <<= 1; - } - - /* Verify that all bits set in lockMask are contiguous */ - assert( mask==0 || (lockMask & ~(mask | (mask-1)))==0 ); + ovlp.Offset = ofst; /* Release/Acquire the system-level lock */ if( lockType==_SHM_UNLCK ){ - for(i=0; ihFile.h, 0, 1, 0, &ovlp); - if( !rc ) break; - } + rc = UnlockFileEx(pFile->hFile.h, 0, nByte, 0, &ovlp); }else{ - /* release old individual byte locks (if any) - ** and set new individual byte locks */ - for(i=0; ihFile.h, 0, 1, 0, &ovlp); - rc = LockFileEx(pFile->hFile.h, dwFlags, 0, 1, 0, &ovlp); - if( !rc ) break; - } + rc = LockFileEx(pFile->hFile.h, dwFlags, 0, nByte, 0, &ovlp); } if( !rc ){ OSTRACE(("SHM-LOCK %d %s ERROR 0x%08lx\n", pFile->hFile.h, lockType==_SHM_UNLCK ? "UnlockFileEx" : "LockFileEx", GetLastError())); - /* release individual byte locks (if any) */ - ovlp.Offset-=i; - for(i=0; ihFile.h, 0, 1, 0, &ovlp); - } } rc = (rc!=0) ? 
SQLITE_OK : SQLITE_BUSY; - /* Update the global lock state and do debug tracing */ -#ifdef SQLITE_DEBUG - OSTRACE(("SHM-LOCK %d ", pFile->hFile.h)); - if( rc==SQLITE_OK ){ - if( lockType==_SHM_UNLCK ){ - OSTRACE(("unlock ok")); - pFile->exclMask &= ~lockMask; - pFile->sharedMask &= ~lockMask; - }else if( lockType==_SHM_RDLCK ){ - OSTRACE(("read-lock ok")); - pFile->exclMask &= ~lockMask; - pFile->sharedMask |= lockMask; - }else{ - assert( lockType==_SHM_WRLCK ); - OSTRACE(("write-lock ok")); - pFile->exclMask |= lockMask; - pFile->sharedMask &= ~lockMask; - } - }else{ - if( lockType==_SHM_UNLCK ){ - OSTRACE(("unlock failed")); - }else if( lockType==_SHM_RDLCK ){ - OSTRACE(("read-lock failed")); - }else{ - assert( lockType==_SHM_WRLCK ); - OSTRACE(("write-lock failed")); - } - } - OSTRACE((" - change requested %s - afterwards %s:%s\n", - winShmLockString(lockMask), - winShmLockString(pFile->sharedMask), - winShmLockString(pFile->exclMask))); -#endif - - return rc; -} - -/* -** For connection p, unlock all of the locks identified by the unlockMask -** parameter. 
-*/ -static int winShmUnlock( - winShmNode *pFile, /* The underlying shared-memory file */ - winShm *p, /* The connection to be unlocked */ - u8 unlockMask /* Mask of locks to be unlocked */ -){ - int rc; /* Result code */ - winShm *pX; /* For looping over all sibling connections */ - u8 allMask; /* Union of locks held by connections other than "p" */ - - /* Access to the winShmNode object is serialized by the caller */ - assert( sqlite3_mutex_held(pFile->mutex) ); - - /* don't attempt to unlock anything we don't have locks for */ - if( (unlockMask & (p->exclMask|p->sharedMask)) != unlockMask ){ - OSTRACE(("SHM-LOCK %d unlocking more than we have locked - requested %s - have %s\n", - pFile->hFile.h, - winShmLockString(unlockMask), - winShmLockString(p->exclMask|p->sharedMask))); - unlockMask &= (p->exclMask|p->sharedMask); - } - - /* Compute locks held by sibling connections */ - allMask = 0; - for(pX=pFile->pFirst; pX; pX=pX->pNext){ - if( pX==p ) continue; - assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 ); - allMask |= pX->sharedMask; - } - - /* Unlock the system-level locks */ - if( (unlockMask & allMask)!=unlockMask ){ - rc = winShmSystemLock(pFile, _SHM_UNLCK, unlockMask & ~allMask); - }else{ - rc = SQLITE_OK; - } - - /* Undo the local locks */ - if( rc==SQLITE_OK ){ - p->exclMask &= ~unlockMask; - p->sharedMask &= ~unlockMask; - } - return rc; -} - -/* -** Get reader locks for connection p on all locks in the readMask parameter. 
-*/ -static int winShmSharedLock( - winShmNode *pFile, /* The underlying shared-memory file */ - winShm *p, /* The connection to get the shared locks */ - u8 readMask /* Mask of shared locks to be acquired */ -){ - int rc; /* Result code */ - winShm *pX; /* For looping over all sibling connections */ - u8 allShared; /* Union of locks held by connections other than "p" */ - - /* Access to the winShmNode object is serialized by the caller */ - assert( sqlite3_mutex_held(pFile->mutex) ); - - /* Find out which shared locks are already held by sibling connections. - ** If any sibling already holds an exclusive lock, go ahead and return - ** SQLITE_BUSY. - */ - allShared = 0; - for(pX=pFile->pFirst; pX; pX=pX->pNext){ - if( pX==p ) continue; - if( (pX->exclMask & readMask)!=0 ) return SQLITE_BUSY; - allShared |= pX->sharedMask; - } - - /* Get shared locks at the system level, if necessary */ - if( (~allShared) & readMask ){ - rc = winShmSystemLock(pFile, _SHM_RDLCK, readMask); - }else{ - rc = SQLITE_OK; - } - - /* Get the local shared locks */ - if( rc==SQLITE_OK ){ - p->sharedMask |= readMask; - } - return rc; -} - -/* -** For connection p, get an exclusive lock on all locks identified in -** the writeMask parameter. -*/ -static int winShmExclusiveLock( - winShmNode *pFile, /* The underlying shared-memory file */ - winShm *p, /* The connection to get the exclusive locks */ - u8 writeMask /* Mask of exclusive locks to be acquired */ -){ - int rc; /* Result code */ - winShm *pX; /* For looping over all sibling connections */ - - /* Access to the winShmNode object is serialized by the caller */ - assert( sqlite3_mutex_held(pFile->mutex) ); - - /* Make sure no sibling connections hold locks that will block this - ** lock. If any do, return SQLITE_BUSY right away. 
- */ - for(pX=pFile->pFirst; pX; pX=pX->pNext){ - if( pX==p ) continue; - if( (pX->exclMask & writeMask)!=0 ) return SQLITE_BUSY; - if( (pX->sharedMask & writeMask)!=0 ) return SQLITE_BUSY; - } - - /* Get the exclusive locks at the system level. Then if successful - ** also mark the local connection as being locked. - */ - rc = winShmSystemLock(pFile, _SHM_WRLCK, writeMask); - if( rc==SQLITE_OK ){ - p->sharedMask &= ~writeMask; - p->exclMask |= writeMask; - } return rc; } /* ** Purge the winShmNodeList list of all entries with winShmNode.nRef==0. @@ -1678,15 +1430,16 @@ } /* Check to see if another process is holding the dead-man switch. ** If not, truncate the file to zero length. */ - if( winShmSystemLock(pShmNode, _SHM_WRLCK, WIN_SHM_DMS)==SQLITE_OK ){ + if( winShmSystemLock(pShmNode, _SHM_WRLCK, WIN_SHM_DMS, 1)==SQLITE_OK ){ rc = winTruncate((sqlite3_file *)&pShmNode->hFile, 0); } if( rc==SQLITE_OK ){ - rc = winShmSystemLock(pShmNode, _SHM_RDLCK, WIN_SHM_DMS); + winShmSystemLock(pShmNode, _SHM_UNLCK, WIN_SHM_DMS, 1); + rc = winShmSystemLock(pShmNode, _SHM_RDLCK, WIN_SHM_DMS, 1); } if( rc ) goto shm_open_err; } /* Make the new connection a child of the winShmNode */ @@ -1701,11 +1454,11 @@ winShmLeaveMutex(); return SQLITE_OK; /* Jump here on any error */ shm_open_err: - winShmSystemLock(pShmNode, _SHM_UNLCK, WIN_SHM_DMS); + winShmSystemLock(pShmNode, _SHM_UNLCK, WIN_SHM_DMS, 1); winShmPurge(); /* This call frees pShmNode if required */ sqlite3_free(p); sqlite3_free(pNew); winShmLeaveMutex(); return rc; @@ -1726,14 +1479,10 @@ pDbFd = (winFile*)fd; p = pDbFd->pShm; pShmNode = p->pShmNode; - /* Verify that the connection being closed holds no locks */ - assert( p->exclMask==0 ); - assert( p->sharedMask==0 ); - /* Remove connection p from the set of connections associated ** with pShmNode */ sqlite3_mutex_enter(pShmNode->mutex); for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){} *pp = p->pNext; @@ -1780,16 +1529,12 @@ *pNewSize = 0; if( reqSize>=0 ){ 
sqlite3_int64 sz; rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz); - if( SQLITE_OK==rc ){ - reqSize = (reqSize + SQLITE_WIN_SHM_INCR - 1)/SQLITE_WIN_SHM_INCR; - reqSize *= SQLITE_WIN_SHM_INCR; - if( reqSize>sz ){ - rc = winTruncate((sqlite3_file *)&pShmNode->hFile, reqSize); - } + if( SQLITE_OK==rc && reqSize>sz ){ + rc = winTruncate((sqlite3_file *)&pShmNode->hFile, reqSize); } } if( SQLITE_OK==rc ){ sqlite3_int64 sz; rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz); @@ -1831,18 +1576,18 @@ */ static int winShmGet( sqlite3_file *fd, /* The database file holding the shared memory */ int reqMapSize, /* Requested size of mapping. -1 means don't care */ int *pNewMapSize, /* Write new size of mapping here */ - void **ppBuf /* Write mapping buffer origin here */ + void volatile **ppBuf /* Write mapping buffer origin here */ ){ winFile *pDbFd = (winFile*)fd; winShm *p = pDbFd->pShm; winShmNode *pShmNode = p->pShmNode; int rc = SQLITE_OK; - if( p->lockState!=SQLITE_SHM_CHECKPOINT && p->hasMutexBuf==0 ){ + if( p->hasMutexBuf==0 ){ assert( sqlite3_mutex_notheld(pShmNode->mutex) ); sqlite3_mutex_enter(pShmNode->mutexBuf); p->hasMutexBuf = 1; } sqlite3_mutex_enter(pShmNode->mutex); @@ -1918,162 +1663,52 @@ ** is a no-op. */ static int winShmRelease(sqlite3_file *fd){ winFile *pDbFd = (winFile*)fd; winShm *p = pDbFd->pShm; - if( p->hasMutexBuf && p->lockState!=SQLITE_SHM_RECOVER ){ + if( p->hasMutexBuf ){ winShmNode *pShmNode = p->pShmNode; assert( sqlite3_mutex_notheld(pShmNode->mutex) ); sqlite3_mutex_leave(pShmNode->mutexBuf); p->hasMutexBuf = 0; } return SQLITE_OK; } - -/* -** Symbolic names for LOCK states used for debugging. -*/ -#ifdef SQLITE_DEBUG -static const char *azLkName[] = { - "UNLOCK", - "READ", - "READ_FULL", - "WRITE", - "PENDING", - "CHECKPOINT", - "RECOVER" -}; -#endif - /* ** Change the lock state for a shared-memory segment. 
*/ static int winShmLock( - sqlite3_file *fd, /* Database holding the shared memory */ - int desiredLock, /* One of SQLITE_SHM_xxxxx locking states */ - int *pGotLock /* The lock you actually got */ + sqlite3_file *fd, /* Database file holding the shared memory */ + int ofst, /* First lock to acquire or release */ + int n, /* Number of locks to acquire or release */ + int flags /* What to do with the lock */ ){ winFile *pDbFd = (winFile*)fd; winShm *p = pDbFd->pShm; winShmNode *pShmNode = p->pShmNode; int rc = SQLITE_PROTOCOL; - /* Note that SQLITE_SHM_READ_FULL and SQLITE_SHM_PENDING are never - ** directly requested; they are side effects from requesting - ** SQLITE_SHM_READ and SQLITE_SHM_CHECKPOINT, respectively. - */ - assert( desiredLock==SQLITE_SHM_UNLOCK - || desiredLock==SQLITE_SHM_READ - || desiredLock==SQLITE_SHM_WRITE - || desiredLock==SQLITE_SHM_CHECKPOINT - || desiredLock==SQLITE_SHM_RECOVER ); - - /* Return directly if this is just a lock state query, or if - ** the connection is already in the desired locking state. 
- */ - if( desiredLock==p->lockState - || (desiredLock==SQLITE_SHM_READ && p->lockState==SQLITE_SHM_READ_FULL) - ){ - OSTRACE(("SHM-LOCK %d shmid-%d, pid-%d request %s and got %s\n", - pShmNode->hFile.h, - p->id, (int)GetCurrentProcessId(), azLkName[desiredLock], - azLkName[p->lockState])); - if( pGotLock ) *pGotLock = p->lockState; - return SQLITE_OK; - } - - OSTRACE(("SHM-LOCK %d shmid-%d, pid-%d request %s->%s\n", - pShmNode->hFile.h, - p->id, (int)GetCurrentProcessId(), azLkName[p->lockState], - azLkName[desiredLock])); - - if( desiredLock==SQLITE_SHM_RECOVER && !p->hasMutexBuf ){ - assert( sqlite3_mutex_notheld(pShmNode->mutex) ); - sqlite3_mutex_enter(pShmNode->mutexBuf); - p->hasMutexBuf = 1; - } + assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK ); + assert( n>=1 ); + assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED) + || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE) + || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED) + || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) ); + assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 ); + sqlite3_mutex_enter(pShmNode->mutex); - switch( desiredLock ){ - case SQLITE_SHM_UNLOCK: { - assert( p->lockState!=SQLITE_SHM_RECOVER ); - winShmUnlock(pShmNode, p, WIN_SHM_A|WIN_SHM_B|WIN_SHM_C|WIN_SHM_D); - rc = SQLITE_OK; - p->lockState = SQLITE_SHM_UNLOCK; - break; - } - case SQLITE_SHM_READ: { - if( p->lockState==SQLITE_SHM_UNLOCK ){ - int nAttempt; - rc = SQLITE_BUSY; - assert( p->lockState==SQLITE_SHM_UNLOCK ); - for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){ - rc = winShmSharedLock(pShmNode, p, WIN_SHM_A|WIN_SHM_B); - if( rc==SQLITE_BUSY ){ - rc = winShmSharedLock(pShmNode, p, WIN_SHM_D); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_READ_FULL; - } - }else{ - winShmUnlock(pShmNode, p, WIN_SHM_B); - p->lockState = SQLITE_SHM_READ; - } - } - }else{ - assert( p->lockState==SQLITE_SHM_WRITE - || p->lockState==SQLITE_SHM_RECOVER ); - rc = winShmSharedLock(pShmNode, p, WIN_SHM_A); - winShmUnlock(pShmNode, p, 
WIN_SHM_C|WIN_SHM_D); - p->lockState = SQLITE_SHM_READ; - } - break; - } - case SQLITE_SHM_WRITE: { - assert( p->lockState==SQLITE_SHM_READ - || p->lockState==SQLITE_SHM_READ_FULL ); - rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_C|WIN_SHM_D); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_WRITE; - } - break; - } - case SQLITE_SHM_CHECKPOINT: { - assert( p->lockState==SQLITE_SHM_UNLOCK - || p->lockState==SQLITE_SHM_PENDING - ); - if( p->lockState==SQLITE_SHM_UNLOCK ){ - rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_B|WIN_SHM_C); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_PENDING; - } - } - if( p->lockState==SQLITE_SHM_PENDING ){ - rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_A); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_CHECKPOINT; - } - } - break; - } - default: { - assert( desiredLock==SQLITE_SHM_RECOVER ); - assert( p->lockState==SQLITE_SHM_READ - || p->lockState==SQLITE_SHM_READ_FULL - ); - assert( sqlite3_mutex_held(pShmNode->mutexBuf) ); - rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_C); - if( rc==SQLITE_OK ){ - p->lockState = SQLITE_SHM_RECOVER; - } - break; - } + if( flags & SQLITE_SHM_UNLOCK ){ + rc = winShmSystemLock(pShmNode, _SHM_UNLCK, ofst+WIN_SHM_BASE, n); + }else if( flags & SQLITE_SHM_SHARED ){ + rc = winShmSystemLock(pShmNode, _SHM_RDLCK, ofst+WIN_SHM_BASE, n); + }else{ + rc = winShmSystemLock(pShmNode, _SHM_WRLCK, ofst+WIN_SHM_BASE, n); } sqlite3_mutex_leave(pShmNode->mutex); - OSTRACE(("SHM-LOCK %d shmid-%d, pid-%d got %s\n", - pShmNode->hFile.h, - p->id, (int)GetCurrentProcessId(), azLkName[p->lockState])); - if( pGotLock ) *pGotLock = p->lockState; + OSTRACE(("SHM-LOCK shmid-%d, pid-%d %s\n", + p->id, (int)GetCurrentProcessId(), rc ? "failed" : "ok")); return rc; } /* ** Implement a memory barrier or memory fence on shared memory. 
Index: src/pager.c ================================================================== --- src/pager.c +++ src/pager.c @@ -1201,11 +1201,11 @@ #else # define pagerUseWal(x) 0 # define pagerRollbackWal(x) 0 # define pagerWalFrames(v,w,x,y,z) 0 # define pagerOpenWalIfPresent(z) SQLITE_OK -# define pagerOpenSnapshot(z) SQLITE_OK +# define pagerBeginReadTransaction(z) SQLITE_OK #endif /* ** Unlock the database file. This function is a no-op if the pager ** is in exclusive mode. @@ -1236,11 +1236,11 @@ ** Clearing the page size cache here is being conservative. */ pPager->dbSizeValid = 0; if( pagerUseWal(pPager) ){ - sqlite3WalCloseSnapshot(pPager->pWal); + sqlite3WalEndReadTransaction(pPager->pWal); }else{ rc = osUnlock(pPager->fd, NO_LOCK); } if( rc ){ pPager->errCode = rc; @@ -1435,21 +1435,20 @@ pPager->pInJournal = 0; pPager->nRec = 0; sqlite3PcacheCleanAll(pPager->pPCache); if( pagerUseWal(pPager) ){ - rc2 = sqlite3WalWriteLock(pPager->pWal, 0); + rc2 = sqlite3WalEndWriteTransaction(pPager->pWal); pPager->state = PAGER_SHARED; /* If the connection was in locking_mode=exclusive mode but is no longer, ** drop the EXCLUSIVE lock held on the database file. */ if( rc2==SQLITE_OK && !pPager->exclusiveMode - && sqlite3WalExclusiveMode(pPager->pWal, -1) + && sqlite3WalExclusiveMode(pPager->pWal, 0) ){ - sqlite3WalExclusiveMode(pPager->pWal, 0); rc2 = osUnlock(pPager->fd, SHARED_LOCK); } }else if( !pPager->exclusiveMode ){ rc2 = osUnlock(pPager->fd, SHARED_LOCK); pPager->state = PAGER_SHARED; @@ -2360,19 +2359,31 @@ } return rc; } /* -** Open a WAL snapshot on the log file this pager is connected to. +** Begin a read transaction on the WAL. +** +** This routine used to be called "pagerOpenSnapshot()" because it essentially +** makes a snapshot of the database at the current point in time and preserves +** that snapshot for use by the reader in spite of concurrent changes by +** other writers or checkpointers.
*/ -static int pagerOpenSnapshot(Pager *pPager){ +static int pagerBeginReadTransaction(Pager *pPager){ int rc; /* Return code */ int changed = 0; /* True if cache must be reset */ assert( pagerUseWal(pPager) ); - rc = sqlite3WalOpenSnapshot(pPager->pWal, &changed); + /* sqlite3WalEndReadTransaction() was not called for the previous + ** transaction in locking_mode=EXCLUSIVE. So call it now. If we + ** are in locking_mode=NORMAL and EndRead() was previously called, + ** the duplicate call is harmless. + */ + sqlite3WalEndReadTransaction(pPager->pWal); + + rc = sqlite3WalBeginReadTransaction(pPager->pWal, &changed); if( rc==SQLITE_OK ){ int dummy; if( changed ){ pager_reset(pPager); assert( pPager->errCode || pPager->dbSizeValid==0 ); @@ -2426,11 +2437,11 @@ if( rc==SQLITE_OK ){ if( isWal ){ pager_reset(pPager); rc = sqlite3PagerOpenWal(pPager, 0); if( rc==SQLITE_OK ){ - rc = pagerOpenSnapshot(pPager); + rc = pagerBeginReadTransaction(pPager); } }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){ pPager->journalMode = PAGER_JOURNALMODE_DELETE; } } @@ -4000,11 +4011,11 @@ pPager->errCode = SQLITE_OK; pager_reset(pPager); } if( pagerUseWal(pPager) ){ - rc = pagerOpenSnapshot(pPager); + rc = pagerBeginReadTransaction(pPager); }else if( pPager->state==PAGER_UNLOCK || isErrorReset ){ sqlite3_vfs * const pVfs = pPager->pVfs; int isHotJournal = 0; assert( !MEMDB ); assert( sqlite3PcacheRefCount(pPager->pPCache)==0 ); @@ -4539,11 +4550,11 @@ if( pagerUseWal(pPager) ){ /* If the pager is configured to use locking_mode=exclusive, and an ** exclusive lock on the database is not already held, obtain it now. */ - if( pPager->exclusiveMode && !sqlite3WalExclusiveMode(pPager->pWal, -1) ){ + if( pPager->exclusiveMode && sqlite3WalExclusiveMode(pPager->pWal, -1) ){ rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK); pPager->state = PAGER_SHARED; if( rc!=SQLITE_OK ){ return rc; } @@ -4559,11 +4570,11 @@ ** transaction, but never to PAGER_EXCLUSIVE. 
This is because in ** PAGER_EXCLUSIVE state the code to roll back savepoint transactions ** may copy data from the sub-journal into the database file as well ** as into the page cache. Which would be incorrect in WAL mode. */ - rc = sqlite3WalWriteLock(pPager->pWal, 1); + rc = sqlite3WalBeginWriteTransaction(pPager->pWal); if( rc==SQLITE_OK ){ pPager->dbOrigSize = pPager->dbSize; pPager->state = PAGER_RESERVED; pPager->journalOff = 0; } @@ -5890,12 +5901,11 @@ int rc = SQLITE_OK; if( pPager->pWal ){ u8 *zBuf = (u8 *)pPager->pTmpSpace; rc = sqlite3WalCheckpoint(pPager->pWal, (pPager->noSync ? 0 : pPager->sync_flags), - pPager->pageSize, zBuf, - pPager->xBusyHandler, pPager->pBusyHandlerArg + pPager->pageSize, zBuf ); } return rc; } Index: src/sqlite.h.in ================================================================== --- src/sqlite.h.in +++ src/sqlite.h.in @@ -442,11 +442,12 @@ #define SQLITE_IOERR_ACCESS (SQLITE_IOERR | (13<<8)) #define SQLITE_IOERR_CHECKRESERVEDLOCK (SQLITE_IOERR | (14<<8)) #define SQLITE_IOERR_LOCK (SQLITE_IOERR | (15<<8)) #define SQLITE_IOERR_CLOSE (SQLITE_IOERR | (16<<8)) #define SQLITE_IOERR_DIR_CLOSE (SQLITE_IOERR | (17<<8)) -#define SQLITE_LOCKED_SHAREDCACHE (SQLITE_LOCKED | (1<<8) ) +#define SQLITE_LOCKED_SHAREDCACHE (SQLITE_LOCKED | (1<<8)) +#define SQLITE_BUSY_RECOVERY (SQLITE_BUSY | (1<<8)) /* ** CAPI3REF: Flags For File Open Operations ** ** These bit values are intended for use in the @@ -656,11 +657,11 @@ /* Methods above are valid for version 1 */ int (*xShmOpen)(sqlite3_file*); int (*xShmSize)(sqlite3_file*, int reqSize, int *pNewSize); int (*xShmGet)(sqlite3_file*, int reqSize, int *pSize, void volatile**); int (*xShmRelease)(sqlite3_file*); - int (*xShmLock)(sqlite3_file*, int desiredLock, int *gotLock); + int (*xShmLock)(sqlite3_file*, int offset, int n, int flags); void (*xShmBarrier)(sqlite3_file*); int (*xShmClose)(sqlite3_file*, int deleteFlag); /* Methods above are valid for version 2 */ /* Additional methods may be added 
in future releases */ }; @@ -886,20 +887,44 @@ #define SQLITE_ACCESS_READ 2 /* ** CAPI3REF: Flags for the xShmLock VFS method ** -** These integer constants define the various locking states that -** an sqlite3_shm object can be in. +** These integer constants define the various locking operations +** allowed by the xShmLock method of [sqlite3_io_methods]. The +** following are the only legal combinations of flags to the +** xShmLock method: +** +**
    +**
  • SQLITE_SHM_LOCK | SQLITE_SHM_SHARED +**
  • SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE +**
  • SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED +**
  • SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE +**
+** +** When unlocking, the same SHARED or EXCLUSIVE flag must be supplied as +** was given on the corresponding lock. +** +** The xShmLock method can transition between unlocked and SHARED or +** between unlocked and EXCLUSIVE. It cannot transition between SHARED +** and EXCLUSIVE. */ -#define SQLITE_SHM_UNLOCK 0 -#define SQLITE_SHM_READ 1 -#define SQLITE_SHM_READ_FULL 2 -#define SQLITE_SHM_WRITE 3 -#define SQLITE_SHM_PENDING 4 -#define SQLITE_SHM_CHECKPOINT 5 -#define SQLITE_SHM_RECOVER 6 +#define SQLITE_SHM_UNLOCK 1 +#define SQLITE_SHM_LOCK 2 +#define SQLITE_SHM_SHARED 4 +#define SQLITE_SHM_EXCLUSIVE 8 + +/* +** CAPI3REF: Maximum xShmLock index +** +** The xShmLock method on [sqlite3_io_methods] may use values +** between 0 and this upper bound as its "offset" argument. +** The SQLite core will never attempt to acquire or release a +** lock outside of this range +*/ +#define SQLITE_SHM_NLOCK 8 + /* ** CAPI3REF: Initialize The SQLite Library ** ** ^The sqlite3_initialize() routine initializes the Index: src/test1.c ================================================================== --- src/test1.c +++ src/test1.c @@ -4607,11 +4607,11 @@ } return TCL_OK; } /* -** tclcmd: file_control_lockproxy_test DB +** tclcmd: file_control_lockproxy_test DB PWD ** ** This TCL command runs the sqlite3_file_control interface and ** verifies correct operation of the SQLITE_GET_LOCKPROXYFILE and ** SQLITE_SET_LOCKPROXYFILE verbs.
*/ @@ -4620,19 +4620,22 @@ Tcl_Interp *interp, /* The TCL interpreter that invoked this command */ int objc, /* Number of arguments */ Tcl_Obj *CONST objv[] /* Command arguments */ ){ sqlite3 *db; + const char *zPwd; + int nPwd; - if( objc!=2 ){ + if( objc!=3 ){ Tcl_AppendResult(interp, "wrong # args: should be \"", - Tcl_GetStringFromObj(objv[0], 0), " DB", 0); + Tcl_GetStringFromObj(objv[0], 0), " DB PWD", 0); return TCL_ERROR; } if( getDbPointer(interp, Tcl_GetString(objv[1]), &db) ){ return TCL_ERROR; } + zPwd = Tcl_GetStringFromObj(objv[2], &nPwd); #if !defined(SQLITE_ENABLE_LOCKING_STYLE) # if defined(__APPLE__) # define SQLITE_ENABLE_LOCKING_STYLE 1 # else @@ -4639,13 +4642,19 @@ # define SQLITE_ENABLE_LOCKING_STYLE 0 # endif #endif #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__) { - char *proxyPath = "test.proxy"; char *testPath; int rc; + char proxyPath[400]; + + if( sizeof(proxyPath)pRealFile, reqSize, pSize, pp); } static int cfShmRelease(sqlite3_file *pFile){ return sqlite3OsShmRelease(((CrashFile*)pFile)->pRealFile); } -static int cfShmLock(sqlite3_file *pFile, int desired, int *pGot){ - return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, desired, pGot); +static int cfShmLock(sqlite3_file *pFile, int ofst, int n, int flags){ + return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, ofst, n, flags); } static void cfShmBarrier(sqlite3_file *pFile){ sqlite3OsShmBarrier(((CrashFile*)pFile)->pRealFile); } static int cfShmClose(sqlite3_file *pFile, int delFlag){ Index: src/test_devsym.c ================================================================== --- src/test_devsym.c +++ src/test_devsym.c @@ -52,11 +52,11 @@ static int devsymDeviceCharacteristics(sqlite3_file*); static int devsymShmOpen(sqlite3_file*); static int devsymShmSize(sqlite3_file*,int,int*); static int devsymShmGet(sqlite3_file*,int,int*,volatile void**); static int devsymShmRelease(sqlite3_file*); -static int devsymShmLock(sqlite3_file*,int,int*); +static int 
devsymShmLock(sqlite3_file*,int,int,int); static void devsymShmBarrier(sqlite3_file*); static int devsymShmClose(sqlite3_file*,int); /* ** Method declarations for devsym_vfs. @@ -261,13 +261,13 @@ } static int devsymShmRelease(sqlite3_file *pFile){ devsym_file *p = (devsym_file *)pFile; return sqlite3OsShmRelease(p->pReal); } -static int devsymShmLock(sqlite3_file *pFile, int desired, int *pGot){ +static int devsymShmLock(sqlite3_file *pFile, int ofst, int n, int flags){ devsym_file *p = (devsym_file *)pFile; - return sqlite3OsShmLock(p->pReal, desired, pGot); + return sqlite3OsShmLock(p->pReal, ofst, n, flags); } static void devsymShmBarrier(sqlite3_file *pFile){ devsym_file *p = (devsym_file *)pFile; sqlite3OsShmBarrier(p->pReal); } Index: src/test_osinst.c ================================================================== --- src/test_osinst.c +++ src/test_osinst.c @@ -153,11 +153,11 @@ static int vfslogShmOpen(sqlite3_file *pFile); static int vfslogShmSize(sqlite3_file *pFile, int reqSize, int *pNewSize); static int vfslogShmGet(sqlite3_file *pFile, int,int*,volatile void **); static int vfslogShmRelease(sqlite3_file *pFile); -static int vfslogShmLock(sqlite3_file *pFile, int desiredLock, int *gotLock); +static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags); static void vfslogShmBarrier(sqlite3_file*); static int vfslogShmClose(sqlite3_file *pFile, int deleteFlag); /* ** Method declarations for vfslog_vfs. 
@@ -458,16 +458,16 @@ rc = p->pReal->pMethods->xShmRelease(p->pReal); t = vfslog_time() - t; vfslog_call(p->pVfslog, OS_SHMRELEASE, p->iFileId, t, rc, 0, 0); return rc; } -static int vfslogShmLock(sqlite3_file *pFile, int desiredLock, int *gotLock){ +static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags){ int rc; sqlite3_uint64 t; VfslogFile *p = (VfslogFile *)pFile; t = vfslog_time(); - rc = p->pReal->pMethods->xShmLock(p->pReal, desiredLock, gotLock); + rc = p->pReal->pMethods->xShmLock(p->pReal, ofst, n, flags); t = vfslog_time() - t; vfslog_call(p->pVfslog, OS_SHMLOCK, p->iFileId, t, rc, 0, 0); return rc; } static void vfslogShmBarrier(sqlite3_file *pFile){ Index: src/test_vfs.c ================================================================== --- src/test_vfs.c +++ src/test_vfs.c @@ -100,11 +100,11 @@ static int tvfsShmOpen(sqlite3_file*); static int tvfsShmSize(sqlite3_file*, int , int *); static int tvfsShmGet(sqlite3_file*, int , int *, volatile void **); static int tvfsShmRelease(sqlite3_file*); -static int tvfsShmLock(sqlite3_file*, int , int *); +static int tvfsShmLock(sqlite3_file*, int , int, int); static void tvfsShmBarrier(sqlite3_file*); static int tvfsShmClose(sqlite3_file*, int); static sqlite3_io_methods tvfs_io_methods = { 2, /* iVersion */ @@ -542,35 +542,38 @@ return rc; } static int tvfsShmLock( sqlite3_file *pFile, - int desiredLock, - int *gotLock + int ofst, + int n, + int flags ){ int rc = SQLITE_OK; TestvfsFile *pFd = (TestvfsFile *)pFile; Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData); - char *zLock = ""; - - switch( desiredLock ){ - case SQLITE_SHM_READ: zLock = "READ"; break; - case SQLITE_SHM_WRITE: zLock = "WRITE"; break; - case SQLITE_SHM_CHECKPOINT: zLock = "CHECKPOINT"; break; - case SQLITE_SHM_RECOVER: zLock = "RECOVER"; break; - case SQLITE_SHM_PENDING: zLock = "PENDING"; break; - case SQLITE_SHM_UNLOCK: zLock = "UNLOCK"; break; + int nLock; + char zLock[80]; + + sqlite3_snprintf(sizeof(zLock), zLock, "%d 
%d", ofst, n); + nLock = strlen(zLock); + if( flags & SQLITE_SHM_LOCK ){ + strcpy(&zLock[nLock], " lock"); + }else{ + strcpy(&zLock[nLock], " unlock"); + } + nLock += strlen(&zLock[nLock]); + if( flags & SQLITE_SHM_SHARED ){ + strcpy(&zLock[nLock], " shared"); + }else{ + strcpy(&zLock[nLock], " exclusive"); } tvfsExecTcl(p, "xShmLock", Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId, Tcl_NewStringObj(zLock, -1) ); tvfsResultCode(p, &rc); - if( rc==SQLITE_OK ){ - *gotLock = desiredLock; - } - return rc; } static void tvfsShmBarrier(sqlite3_file *pFile){ int rc = SQLITE_OK; @@ -714,13 +717,11 @@ ** When the xShmLock method is invoked by SQLite, the following script is ** run: ** ** SCRIPT xShmLock FILENAME ID LOCK ** -** where LOCK is one of "UNLOCK", "READ", "READ_FULL", "WRITE", "PENDING", -** "CHECKPOINT" or "RECOVER". The script should return an SQLite error -** code. +** where LOCK is of the form "OFFSET NBYTE lock/unlock shared/exclusive" */ static int testvfs_cmd( ClientData cd, Tcl_Interp *interp, int objc, Index: src/vdbe.c ================================================================== --- src/vdbe.c +++ src/vdbe.c @@ -478,26 +478,10 @@ ** flag on jump instructions, we get a (small) speed improvement. */ #define CHECK_FOR_INTERRUPT \ if( db->u1.isInterrupted ) goto abort_due_to_interrupt; -#ifdef SQLITE_DEBUG -static int fileExists(sqlite3 *db, const char *zFile){ - int res = 0; - int rc = SQLITE_OK; -#ifdef SQLITE_TEST - /* If we are currently testing IO errors, then do not call OsAccess() to - ** test for the presence of zFile. This is because any IO error that - ** occurs here will not be reported, causing the test to fail. - */ - extern int sqlite3_io_error_pending; - if( sqlite3_io_error_pending<=0 ) -#endif - rc = sqlite3OsAccess(db->pVfs, zFile, SQLITE_ACCESS_EXISTS, &res); - return (res && rc==SQLITE_OK); -} -#endif #ifndef NDEBUG /* ** This function is only called from within an assert() expression. 
It ** checks that the sqlite3.nTransaction variable is correctly set to @@ -592,23 +576,18 @@ #ifndef SQLITE_OMIT_PROGRESS_CALLBACK checkProgress = db->xProgress!=0; #endif #ifdef SQLITE_DEBUG sqlite3BeginBenignMalloc(); - if( p->pc==0 - && ((p->db->flags & SQLITE_VdbeListing) || fileExists(db, "vdbe_explain")) - ){ + if( p->pc==0 && (p->db->flags & SQLITE_VdbeListing)!=0 ){ int i; printf("VDBE Program Listing:\n"); sqlite3VdbePrintSql(p); for(i=0; inOp; i++){ sqlite3VdbePrintOp(stdout, i, &aOp[i]); } } - if( fileExists(db, "vdbe_trace") ){ - p->trace = stdout; - } sqlite3EndBenignMalloc(); #endif for(pc=p->pc; rc==SQLITE_OK; pc++){ assert( pc>=0 && pcnOp ); if( db->mallocFailed ) goto no_mem; @@ -626,17 +605,10 @@ printf("VDBE Execution Trace:\n"); sqlite3VdbePrintSql(p); } sqlite3VdbePrintOp(p->trace, pc, pOp); } - if( p->trace==0 && pc==0 ){ - sqlite3BeginBenignMalloc(); - if( fileExists(db, "vdbe_sqltrace") ){ - sqlite3VdbePrintSql(p); - } - sqlite3EndBenignMalloc(); - } #endif /* Check to see if we need to simulate an interrupt. This only happens ** if we have a special test build. Index: src/wal.c ================================================================== --- src/wal.c +++ src/wal.c @@ -91,16 +91,26 @@ ** ** READER ALGORITHM ** ** To read a page from the database (call it page number P), a reader ** first checks the WAL to see if it contains page P. If so, then the -** last valid instance of page P that is or is followed by a commit frame -** become the value read. If the WAL contains no copies of page P that -** are valid and which are or are followed by a commit frame, then page -** P is read from the database file. +** last valid instance of page P that is followed by a commit frame +** or is a commit frame itself becomes the value read. If the WAL +** contains no copies of page P that are valid and which are a commit +** frame or are followed by a commit frame, then page P is read from +** the database file.
** -** The reader algorithm in the previous paragraph works correctly, but +** To start a read transaction, the reader records the index of the last +** valid frame in the WAL. The reader uses this recorded "mxFrame" value +** for all subsequent read operations. New transactions can be appended +** to the WAL, but as long as the reader uses its original mxFrame value +** and ignores the newly appended content, it will see a consistent snapshot +** of the database from a single point in time. This technique allows +** multiple concurrent readers to view different versions of the database +** content simultaneously. +** +** The reader algorithm in the previous paragraphs works correctly, but ** because frames for page P can appear anywhere within the WAL, the ** reader has to scan the entire WAL looking for page P frames. If the ** WAL is large (multiple megabytes is typical) that scan can be slow, ** and read performance suffers. To overcome this problem, a separate ** data structure called the wal-index is maintained to expedite the @@ -159,12 +169,11 @@ ** 1-based index of an entry in the mapping section of the same ** index block. Let K be the 1-based index of the largest entry in ** the mapping section. (For index blocks other than the last, K will ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block ** K will be (mxFrame%HASHTABLE_NPAGE).) Unused slots of the hash table -** contain a value greater than K. Note that no hash table slot ever -** contains a zero value. +** contain a value of 0. ** ** To look for page P in the hash table, first compute a hash iKey on ** P as follows: ** ** iKey = (P * 383) % HASHTABLE_NSLOT @@ -212,14 +221,37 @@ */ #ifndef SQLITE_OMIT_WAL #include "wal.h" +/* +** Trace output macros +*/ +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) +int sqlite3WalTrace = 0; +# define WALTRACE(X) if(sqlite3WalTrace) sqlite3DebugPrintf X +#else +# define WALTRACE(X) +#endif + + +/* +** Indices of various locking bytes. 
WAL_NREADER is the number +** of available reader locks and should be at least 3. +*/ +#define WAL_WRITE_LOCK 0 +#define WAL_ALL_BUT_WRITE 1 +#define WAL_CKPT_LOCK 1 +#define WAL_RECOVER_LOCK 2 +#define WAL_READ_LOCK(I) (3+(I)) +#define WAL_NREADER (SQLITE_SHM_NLOCK-3) + /* Object declarations */ typedef struct WalIndexHdr WalIndexHdr; typedef struct WalIterator WalIterator; +typedef struct WalCkptInfo WalCkptInfo; /* ** The following object holds a copy of the wal-index header content. ** @@ -226,26 +258,79 @@ ** The actual header in the wal-index consists of two copies of this ** object. */ struct WalIndexHdr { u32 iChange; /* Counter incremented each transaction */ - u16 bigEndCksum; /* True if checksums in WAL are big-endian */ + u8 isInit; /* 1 when initialized */ + u8 bigEndCksum; /* True if checksums in WAL are big-endian */ u16 szPage; /* Database page size in bytes */ u32 mxFrame; /* Index of last valid frame in the WAL */ u32 nPage; /* Size of database in pages */ u32 aFrameCksum[2]; /* Checksum of last frame in log */ u32 aSalt[2]; /* Two salt values copied from WAL header */ u32 aCksum[2]; /* Checksum over all prior fields */ }; + +/* +** A copy of the following object occurs in the wal-index immediately +** following the second copy of the WalIndexHdr. This object stores +** information used by checkpoint. +** +** nBackfill is the number of frames in the WAL that have been written +** back into the database. (We call the act of moving content from WAL to +** database "backfilling".) The nBackfill number is never greater than +** WalIndexHdr.mxFrame. nBackfill can only be increased by threads +** holding the WAL_CKPT_LOCK lock (which includes a recovery thread). +** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from +** mxFrame back to zero when the WAL is reset. +** +** There is one entry in aReadMark[] for each reader lock. 
If a reader +** holds read-lock K, then the value in aReadMark[K] is no greater than +** the mxFrame for that reader. aReadMark[0] is a special case. It +** always holds zero. Readers holding WAL_READ_LOCK(0) always ignore +** the entire WAL and read all content directly from the database. +** +** The value of aReadMark[K] may only be changed by a thread that +** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of +** aReadMark[K] cannot be changed while there is a reader using that mark +** since the reader will be holding a shared lock on WAL_READ_LOCK(K). +** +** The checkpointer may only transfer frames from WAL to database where +** the frame numbers are less than or equal to every aReadMark[] that is +** in use (that is, every aReadMark[j] for which there is a corresponding +** WAL_READ_LOCK(j)). New readers (usually) pick the aReadMark[] with the +** largest value and will increase an unused aReadMark[] to mxFrame if there +** is not already an aReadMark[] equal to mxFrame. The exception to the +** previous sentence is when nBackfill equals mxFrame (meaning that everything +** in the WAL has been backfilled into the database) then new readers +** will choose aReadMark[0] which has value 0 and hence such readers will +** get all their content directly from the database file and ignore +** the WAL. +** +** Writers normally append new frames to the end of the WAL. However, +** if nBackfill equals mxFrame (meaning that all WAL content has been +** written back into the database) and if no readers are using the WAL +** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then +** the writer will first "reset" the WAL back to the beginning and start +** writing new content beginning at frame 1. +** +** We assume that 32-bit loads are atomic and so no locks are needed in +** order to read from any aReadMark[] entries.
+*/ +struct WalCkptInfo { + u32 nBackfill; /* Number of WAL frames backfilled into DB */ + u32 aReadMark[WAL_NREADER]; /* Reader marks */ +}; + /* A block of WALINDEX_LOCK_RESERVED bytes beginning at ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems ** only support mandatory file-locks, we do not read or write data ** from the region of the file on which locks are applied. */ -#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2) -#define WALINDEX_LOCK_RESERVED 8 +#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo)) +#define WALINDEX_LOCK_RESERVED 16 #define WALINDEX_HDR_SIZE (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED) /* Size of header before each frame in wal */ #define WAL_FRAME_HDRSIZE 24 @@ -275,25 +360,34 @@ /* ** An open write-ahead log file is represented by an instance of the ** following object. */ struct Wal { - sqlite3_vfs *pVfs; /* The VFS used to create pFd */ + sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */ sqlite3_file *pDbFd; /* File handle for the database file */ sqlite3_file *pWalFd; /* File handle for WAL file */ u32 iCallback; /* Value to pass to log callback (or 0) */ int szWIndex; /* Size of the wal-index that is mapped in mem */ volatile u32 *pWiData; /* Pointer to wal-index content in memory */ - u8 lockState; /* SQLITE_SHM_xxxx constant showing lock state */ - u8 readerType; /* SQLITE_SHM_READ or SQLITE_SHM_READ_FULL */ + u16 szPage; /* Database page size */ + i16 readLock; /* Which read lock is being held. 
-1 for none */ u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */ - u8 isWindexOpen; /* True if ShmOpen() called on pDbFd */ - WalIndexHdr hdr; /* Wal-index for current snapshot */ + u8 isWIndexOpen; /* True if ShmOpen() called on pDbFd */ + u8 writeLock; /* True if in a write transaction */ + u8 ckptLock; /* True if holding a checkpoint lock */ + WalIndexHdr hdr; /* Wal-index header for current transaction */ char *zWalName; /* Name of WAL file */ - int szPage; /* Database page size */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ }; + +/* +** Return a pointer to the WalCkptInfo structure in the wal-index. +*/ +static volatile WalCkptInfo *walCkptInfo(Wal *pWal){ + assert( pWal->pWiData!=0 ); + return (volatile WalCkptInfo*)&pWal->pWiData[sizeof(WalIndexHdr)/2]; +} /* ** This structure is used to implement an iterator that loops through ** all frames in the WAL in database page order. Where two or more frames @@ -377,47 +471,26 @@ aOut[0] = s1; aOut[1] = s2; } -/* -** Attempt to change the lock status. -** -** When changing the lock status to SQLITE_SHM_READ, store the -** type of reader lock (either SQLITE_SHM_READ or SQLITE_SHM_READ_FULL) -** in pWal->readerType. -*/ -static int walSetLock(Wal *pWal, int desiredStatus){ - int rc = SQLITE_OK; /* Return code */ - if( pWal->exclusiveMode || pWal->lockState==desiredStatus ){ - pWal->lockState = desiredStatus; - }else{ - int got = pWal->lockState; - rc = sqlite3OsShmLock(pWal->pDbFd, desiredStatus, &got); - pWal->lockState = got; - if( got==SQLITE_SHM_READ_FULL || got==SQLITE_SHM_READ ){ - pWal->readerType = got; - pWal->lockState = SQLITE_SHM_READ; - } - } - return rc; -} - /* ** Write the header information in pWal->hdr into the wal-index. ** ** The checksum on pWal->hdr is updated before it is written. 
*/ static void walIndexWriteHdr(Wal *pWal){ WalIndexHdr *aHdr; - walChecksumBytes(1, (u8*)&pWal->hdr, - sizeof(pWal->hdr) - sizeof(pWal->hdr.aCksum), + + assert( pWal->writeLock ); + pWal->hdr.isInit = 1; + walChecksumBytes(1, (u8*)&pWal->hdr, offsetof(WalIndexHdr, aCksum), 0, pWal->hdr.aCksum); aHdr = (WalIndexHdr*)pWal->pWiData; - memcpy(&aHdr[1], &pWal->hdr, sizeof(pWal->hdr)); + memcpy(&aHdr[1], &pWal->hdr, sizeof(WalIndexHdr)); sqlite3OsShmBarrier(pWal->pDbFd); - memcpy(&aHdr[0], &pWal->hdr, sizeof(pWal->hdr)); + memcpy(&aHdr[0], &pWal->hdr, sizeof(WalIndexHdr)); } /* ** This function encodes a single frame header and writes it to a buffer ** supplied by the caller. A frame-header is made up of a series of @@ -517,10 +590,71 @@ #define HASHTABLE_NPAGE 4096 /* Must be power of 2 and multiple of 256 */ #define HASHTABLE_DATATYPE u16 #define HASHTABLE_HASH_1 383 /* Should be prime */ #define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) /* Must be a power of 2 */ #define HASHTABLE_NBYTE (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT) + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) +/* +** Names of locks. This routine is used to provide debugging output and is not +** a part of an ordinary build. +*/ +static const char *walLockName(int lockIdx){ + if( lockIdx==WAL_WRITE_LOCK ){ + return "WRITE-LOCK"; + }else if( lockIdx==WAL_CKPT_LOCK ){ + return "CKPT-LOCK"; + }else if( lockIdx==WAL_RECOVER_LOCK ){ + return "RECOVER-LOCK"; + }else{ + static char zName[15]; + sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]", + lockIdx-WAL_READ_LOCK(0)); + return zName; + } +} +#endif /*defined(SQLITE_TEST) || defined(SQLITE_DEBUG) */ + + +/* +** Set or release locks on the WAL. Locks are either shared or exclusive. +** A lock cannot be moved directly between shared and exclusive - it must go +** through the unlocked state first. +** +** In locking_mode=EXCLUSIVE, all of these routines become no-ops. 
+*/ +static int walLockShared(Wal *pWal, int lockIdx){ + int rc; + if( pWal->exclusiveMode ) return SQLITE_OK; + rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1, + SQLITE_SHM_LOCK | SQLITE_SHM_SHARED); + WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal, + walLockName(lockIdx), rc ? "failed" : "ok")); + return rc; +} +static void walUnlockShared(Wal *pWal, int lockIdx){ + if( pWal->exclusiveMode ) return; + (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1, + SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED); + WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx))); +} +static int walLockExclusive(Wal *pWal, int lockIdx, int n){ + int rc; + if( pWal->exclusiveMode ) return SQLITE_OK; + rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n, + SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE); + WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal, + walLockName(lockIdx), n, rc ? "failed" : "ok")); + return rc; +} +static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){ + if( pWal->exclusiveMode ) return; + (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n, + SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE); + WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal, + walLockName(lockIdx), n)); +} /* ** Return the index in the Wal.pWiData array that corresponds to ** frame iFrame. ** @@ -541,14 +675,14 @@ + (iFrame-1) ); } /* -** Return the minimum mapping size in bytes that can be used to read the -** wal-index up to and including frame iFrame. If iFrame is the last frame -** in a block of 256 frames, the returned byte-count includes the space -** required by the 256-byte index block. +** Return the minimum size of the shared-memory, in bytes, that is needed +** to support a wal-index containing frame iFrame. The value returned +** includes the wal-index header and the complete "block" containing iFrame, +** including the hash table segment that follows the block. 
*/ static int walMappingSize(u32 iFrame){ const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ; return ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED @@ -598,11 +732,11 @@ ** storage to be at least as big as enlargeTo before remapping. */ static int walIndexRemap(Wal *pWal, int enlargeTo){ int rc; int sz; - assert( pWal->lockState>=SQLITE_SHM_WRITE ); + assert( pWal->writeLock ); rc = sqlite3OsShmSize(pWal->pDbFd, enlargeTo, &sz); if( rc==SQLITE_OK && sz>pWal->szWIndex ){ walIndexUnmap(pWal); rc = walIndexMap(pWal, sz); } @@ -610,11 +744,12 @@ return rc; } /* ** Compute a hash on a page number. The resulting hash value must land -** between 0 and (HASHTABLE_NSLOT-1). +** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances +** the hash to the next value in the event of a collision. */ static int walHash(u32 iPage){ assert( iPage>0 ); assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 ); return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1); @@ -673,26 +808,32 @@ ** than pWal->hdr.mxFrame. ** ** This function is called whenever pWal->hdr.mxFrame is decreased due ** to a rollback or savepoint. ** -** At most only the very last hash table needs to be updated. Any -** later hash tables will be automatically cleared when pWal->hdr.mxFrame -** advances to the point where those hash tables are actually needed. +** At most only the hash table containing pWal->hdr.mxFrame needs to be +** updated. Any later hash tables will be automatically cleared when +** pWal->hdr.mxFrame advances to the point where those hash tables are +** actually needed. 
*/ static void walCleanupHash(Wal *pWal){ volatile HASHTABLE_DATATYPE *aHash; /* Pointer to hash table to clear */ volatile u32 *aPgno; /* Unused return from walHashFind() */ u32 iZero; /* frame == (aHash[x]+iZero) */ - int iLimit; /* Zero values greater than this */ + int iLimit = 0; /* Zero values greater than this */ - assert( pWal->lockState==SQLITE_SHM_WRITE ); - walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero); - iLimit = pWal->hdr.mxFrame - iZero; - if( iLimit>0 ){ + assert( pWal->writeLock ); + testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE-1 ); + testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE ); + testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE+1 ); + if( (pWal->hdr.mxFrame % HASHTABLE_NPAGE)>0 ){ int nByte; /* Number of bytes to zero in aPgno[] */ int i; /* Used to iterate through aHash[] */ + + walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero); + iLimit = pWal->hdr.mxFrame - iZero; + assert( iLimit>0 ); for(i=0; i<HASHTABLE_NSLOT; i++){ if( aHash[i]>iLimit ){ aHash[i] = 0; } } @@ -707,11 +848,11 @@ #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT /* Verify that the every entry in the mapping region is still reachable ** via the hash table even after the cleanup. */ - { + if( iLimit ){ int i; /* Loop counter */ int iKey; /* Hash key */ for(i=1; i<=iLimit; i++){ for(iKey=walHash(aPgno[i+iZero]); aHash[iKey]; iKey=walNextHash(iKey)){ if( aHash[iKey]==i ) break; @@ -808,23 +949,48 @@ } /* ** Recover the wal-index by reading the write-ahead log file. -** The caller must hold RECOVER lock on the wal-index file. +** +** This routine first tries to establish an exclusive lock on the +** wal-index to prevent other threads/processes from doing anything +** with the WAL or wal-index while recovery is running. The +** WAL_RECOVER_LOCK is also held so that other threads will know +** that this thread is running recovery. If unable to establish +** the necessary locks, this routine returns SQLITE_BUSY. 
*/ static int walIndexRecover(Wal *pWal){ int rc; /* Return Code */ i64 nSize; /* Size of log file */ u32 aFrameCksum[2] = {0, 0}; + int iLock; /* Lock offset to lock for checkpoint */ + int nLock; /* Number of locks to hold */ - assert( pWal->lockState>SQLITE_SHM_READ ); + /* Obtain an exclusive lock on all bytes in the locking range not already + ** locked by the caller. The caller is guaranteed to have locked the + ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte. + ** If successful, the same bytes that are locked here are unlocked before + ** this function returns. + */ + assert( pWal->ckptLock==1 || pWal->ckptLock==0 ); + assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 ); + assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE ); + assert( pWal->writeLock ); + iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock; + nLock = SQLITE_SHM_NLOCK - iLock; + rc = walLockExclusive(pWal, iLock, nLock); + if( rc ){ + return rc; + } + WALTRACE(("WAL%p: recovery begin...\n", pWal)); + memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); rc = sqlite3OsFileSize(pWal->pWalFd, &nSize); if( rc!=SQLITE_OK ){ - return rc; + goto recovery_error; } if( nSize>WAL_HDRSIZE ){ u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ @@ -836,11 +1002,11 @@ u32 magic; /* Magic value read from WAL header */ /* Read in the WAL header. */ rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); if( rc!=SQLITE_OK ){ - return rc; + goto recovery_error; } /* If the database page size is not a power of two, or is greater than ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid ** data. Similarly, if the 'magic' value is invalid, ignore the whole @@ -865,11 +1031,12 @@ /* Malloc a buffer to read frames into. 
*/ szFrame = szPage + WAL_FRAME_HDRSIZE; aFrame = (u8 *)sqlite3_malloc(szFrame); if( !aFrame ){ - return SQLITE_NOMEM; + rc = SQLITE_NOMEM; + goto recovery_error; } aData = &aFrame[WAL_FRAME_HDRSIZE]; /* Read all frames from the log file. */ iFrame = 0; @@ -905,35 +1072,41 @@ } if( rc==SQLITE_OK ){ pWal->hdr.aFrameCksum[0] = aFrameCksum[0]; pWal->hdr.aFrameCksum[1] = aFrameCksum[1]; walIndexWriteHdr(pWal); + + /* Zero the checkpoint-header. This is safe because this thread is + ** currently holding locks that exclude all other readers, writers and + ** checkpointers. + */ + memset((void *)walCkptInfo(pWal), 0, sizeof(WalCkptInfo)); } + +recovery_error: + WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok")); + walUnlockExclusive(pWal, iLock, nLock); return rc; } /* ** Close an open wal-index. */ static void walIndexClose(Wal *pWal, int isDelete){ - if( pWal->isWindexOpen ){ - int notUsed; - sqlite3OsShmLock(pWal->pDbFd, SQLITE_SHM_UNLOCK, &notUsed); + if( pWal->isWIndexOpen ){ sqlite3OsShmClose(pWal->pDbFd, isDelete); - pWal->isWindexOpen = 0; + pWal->isWIndexOpen = 0; } } /* -** Open a connection to the log file associated with database zDb. The -** database file does not actually have to exist. zDb is used only to -** figure out the name of the log file to open. If the log file does not -** exist it is created by this call. +** Open a connection to the WAL file associated with database zDbName. +** The database file must already be opened on connection pDbFd. ** ** A SHARED lock should be held on the database file when this function ** is called. The purpose of this SHARED lock is to prevent any other -** client from unlinking the log or wal-index file. If another process +** client from unlinking the WAL or wal-index file. If another process ** were to do this just after this client opened one of these files, the ** system would be badly broken. 
** ** If the log file is successfully opened, SQLITE_OK is returned and ** *ppWal is set to point to a new WAL handle. If an error occurs, @@ -976,18 +1149,19 @@ pRet->pVfs = pVfs; pRet->pWalFd = (sqlite3_file *)&pRet[1]; pRet->pDbFd = pDbFd; pRet->szWIndex = -1; + pRet->readLock = -1; sqlite3_randomness(8, &pRet->hdr.aSalt); pRet->zWalName = zWal = pVfs->szOsFile + (char*)pRet->pWalFd; sqlite3_snprintf(nWal, zWal, "%s-wal", zDbName); rc = sqlite3OsShmOpen(pDbFd); /* Open file handle on the write-ahead log file. */ if( rc==SQLITE_OK ){ - pRet->isWindexOpen = 1; + pRet->isWIndexOpen = 1; flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL); rc = sqlite3OsOpen(pVfs, zWal, pRet->pWalFd, flags, &flags); } if( rc!=SQLITE_OK ){ @@ -994,10 +1168,11 @@ walIndexClose(pRet, 0); sqlite3OsClose(pRet->pWalFd); sqlite3_free(pRet); }else{ *ppWal = pRet; + WALTRACE(("WAL%d: opened\n", pRet)); } return rc; } /* @@ -1131,11 +1306,11 @@ /* This routine only runs while holding SQLITE_SHM_CHECKPOINT. No other ** thread is able to write to shared memory while this routine is ** running (or, indeed, while the WalIterator object exists). Hence, ** we can cast off the volatile qualifacation from shared memory */ - assert( pWal->lockState==SQLITE_SHM_CHECKPOINT ); + assert( pWal->ckptLock ); aData = (u32*)pWal->pWiData; /* Allocate space for the WalIterator object */ iLast = pWal->hdr.mxFrame; nSegment = (iLast >> 8) + 1; @@ -1177,13 +1352,42 @@ ** Free an iterator allocated by walIteratorInit(). */ static void walIteratorFree(WalIterator *p){ sqlite3_free(p); } + /* -** Checkpoint the contents of the log file. +** Copy as much content as we can from the WAL back into the database file +** in response to an sqlite3_wal_checkpoint() request or the equivalent. +** +** The amount of information copied from WAL to database might be limited +** by active readers. This routine will never overwrite a database page +** that a concurrent reader might be using. 
+** +** All I/O barrier operations (a.k.a. fsyncs) occur in this routine when +** SQLite is in WAL-mode in synchronous=NORMAL. That means that if +** checkpoints are always run by a background thread or background +** process, foreground threads will never block on a lengthy fsync call. +** +** Fsync is called on the WAL before writing content out of the WAL and +** into the database. This ensures that the new content is persistent +** in the WAL and can be recovered following a power-loss or hard reset. +** +** Fsync is also called on the database file if (and only if) the entire +** WAL content is copied into the database file. This second fsync makes +** it safe to delete the WAL since the new content will persist in the +** database file. +** +** This routine uses and updates the nBackfill field of the wal-index header. +** This is the only routine that will increase the value of nBackfill. +** (A WAL reset or recovery will revert nBackfill to zero, but not increase +** its value.) +** +** The caller must be holding sufficient locks to ensure that no other +** checkpoint is running (in any other thread or process) at the same +** time. 
*/ static int walCheckpoint( Wal *pWal, /* Wal connection */ int sync_flags, /* Flags for OsSync() (or 0) */ int nBuf, /* Size of zBuf in bytes */ @@ -1192,55 +1396,92 @@ int rc; /* Return code */ int szPage = pWal->hdr.szPage; /* Database page-size */ WalIterator *pIter = 0; /* Wal iterator context */ u32 iDbpage = 0; /* Next database page to write */ u32 iFrame = 0; /* Wal frame containing data for iDbpage */ + u32 mxSafeFrame; /* Max frame that can be backfilled */ + int i; /* Loop counter */ + volatile WalIndexHdr *pHdr; /* The actual wal-index header in SHM */ + volatile WalCkptInfo *pInfo; /* The checkpoint status information */ /* Allocate the iterator */ rc = walIteratorInit(pWal, &pIter); if( rc!=SQLITE_OK || pWal->hdr.mxFrame==0 ){ - goto out; + walIteratorFree(pIter); + return rc; } + /*** TODO: Move this test out to the caller. Make it an assert() here ***/ if( pWal->hdr.szPage!=nBuf ){ - rc = SQLITE_CORRUPT_BKPT; - goto out; - } - - /* Sync the log file to disk */ - if( sync_flags ){ - rc = sqlite3OsSync(pWal->pWalFd, sync_flags); - if( rc!=SQLITE_OK ) goto out; - } - - /* Iterate through the contents of the log, copying data to the db file. */ - while( 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){ - rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, - walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE - ); - if( rc!=SQLITE_OK ) goto out; - rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage); - if( rc!=SQLITE_OK ) goto out; - } - - /* Truncate the database file */ - rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage)); - if( rc!=SQLITE_OK ) goto out; - - /* Sync the database file. If successful, update the wal-index. 
*/ - if( sync_flags ){ - rc = sqlite3OsSync(pWal->pDbFd, sync_flags); - if( rc!=SQLITE_OK ) goto out; - } - pWal->hdr.mxFrame = 0; - pWal->nCkpt++; - sqlite3Put4byte((u8*)pWal->hdr.aSalt, - 1 + sqlite3Get4byte((u8*)pWal->hdr.aSalt)); - sqlite3_randomness(4, &pWal->hdr.aSalt[1]); - walIndexWriteHdr(pWal); - - out: + walIteratorFree(pIter); + return SQLITE_CORRUPT_BKPT; + } + + /* Compute in mxSafeFrame the index of the last frame of the WAL that is + ** safe to write into the database. Frames beyond mxSafeFrame might + ** overwrite database pages that are in use by active readers and thus + ** cannot be backfilled from the WAL. + */ + mxSafeFrame = pWal->hdr.mxFrame; + pHdr = (volatile WalIndexHdr*)pWal->pWiData; + pInfo = (volatile WalCkptInfo*)&pHdr[2]; + assert( pInfo==walCkptInfo(pWal) ); + for(i=1; i<WAL_NREADER; i++){ + u32 y = pInfo->aReadMark[i]; + if( y>0 && (mxSafeFrame==0 || mxSafeFrame>=y) ){ + if( y<=pWal->hdr.mxFrame + && walLockExclusive(pWal, WAL_READ_LOCK(i), 1)==SQLITE_OK + ){ + pInfo->aReadMark[i] = 0; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + }else{ + mxSafeFrame = y-1; + } + } + } + + if( pInfo->nBackfill<mxSafeFrame + && (rc = walLockExclusive(pWal, WAL_READ_LOCK(0), 1))==SQLITE_OK + ){ + u32 nBackfill = pInfo->nBackfill; + + /* Sync the WAL to disk */ + if( sync_flags ){ + rc = sqlite3OsSync(pWal->pWalFd, sync_flags); + } + + /* Iterate through the contents of the WAL, copying data to the db file. */ + while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){ + if( iFrame<=nBackfill || iFrame>mxSafeFrame ) continue; + rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, + walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE + ); + if( rc!=SQLITE_OK ) break; + rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage); + if( rc!=SQLITE_OK ) break; + } + + /* If work was actually accomplished... 
*/ + if( rc==SQLITE_OK && pInfo->nBackfill<mxSafeFrame ){ + pInfo->nBackfill = mxSafeFrame; + if( mxSafeFrame==pHdr[0].mxFrame && sync_flags ){ + rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage)); + if( rc==SQLITE_OK && sync_flags ){ + rc = sqlite3OsSync(pWal->pDbFd, sync_flags); + } + } + } + + /* Release the reader lock held while backfilling */ + walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); + }else{ + /* Reset the return code so as not to report a checkpoint failure + ** just because active readers prevent any backfill. + */ + rc = SQLITE_OK; + } + walIteratorFree(pIter); return rc; } /* @@ -1264,11 +1505,12 @@ ** ** The EXCLUSIVE lock is not released before returning. */ rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE); if( rc==SQLITE_OK ){ - rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf, 0, 0); + pWal->exclusiveMode = 1; + rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf); if( rc==SQLITE_OK ){ isDelete = 1; } walIndexUnmap(pWal); } @@ -1276,10 +1518,11 @@ walIndexClose(pWal, isDelete); sqlite3OsClose(pWal->pWalFd); if( isDelete ){ sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0); } + WALTRACE(("WAL%p: closed\n", pWal)); sqlite3_free(pWal); } return rc; } @@ -1288,11 +1531,12 @@ ** there is a problem. ** ** The wal-index is in shared memory. Another thread or process might ** be writing the header at the same time this procedure is trying to ** read it, which might result in inconsistency. A dirty read is detected -** by verifying a checksum on the header. +** by verifying that both copies of the header are the same and also by +** a checksum on the header. ** ** If and only if the read is consistent and the header is different from ** pWal->hdr, then pWal->hdr is updated to the content of the new header ** and *pChanged is set to 1. ** @@ -1309,13 +1553,13 @@ ** header is invalid. */ return 1; } assert( pWal->pWiData ); - /* Read the header. 
The caller may or may not have an exclusive - ** (WRITE, PENDING, CHECKPOINT or RECOVER) lock on the wal-index - ** file, meaning it is possible that an inconsistent snapshot is read + /* Read the header. This might happen concurrently with a write to the + ** same area of shared memory on a different CPU in an SMP, + ** meaning it is possible that an inconsistent snapshot is read ** from the file. If this happens, return non-zero. ** ** There are two copies of the header at the beginning of the wal-index. ** When reading, read [0] first then [1]. Writes are in the reverse order. ** Memory barriers are used to prevent the compiler or the hardware from @@ -1327,11 +1571,11 @@ memcpy(&h2, &aHdr[1], sizeof(h2)); if( memcmp(&h1, &h2, sizeof(h1))!=0 ){ return 1; /* Dirty read */ } - if( h1.szPage==0 ){ + if( h1.isInit==0 ){ return 1; /* Malformed header - probably all zeros */ } walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum); if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){ return 1; /* Checksum does not match */ @@ -1365,56 +1609,44 @@ ** If the wal-index header is successfully read, return SQLITE_OK. ** Otherwise an SQLite error code. */ static int walIndexReadHdr(Wal *pWal, int *pChanged){ int rc; /* Return code */ - int lockState; /* pWal->lockState before running recovery */ + int badHdr; /* True if a header read failed */ - assert( pWal->lockState>=SQLITE_SHM_READ ); assert( pChanged ); rc = walIndexMap(pWal, walMappingSize(1)); if( rc!=SQLITE_OK ){ return rc; } - /* First attempt to read the wal-index header. This may fail for one - ** of two reasons: (a) the wal-index does not yet exist or has been - ** corrupted and needs to be constructed by running recovery, or (b) - ** the caller is only holding a READ lock and made a dirty read of - ** the wal-index header. 
- ** - ** A dirty read of the wal-index header occurs if another thread or - ** process happens to be writing to the wal-index header at roughly - ** the same time as this thread is reading it. In this case it is - ** possible that an inconsistent header is read (which is detected - ** using the header checksum mechanism). - */ - if( walIndexTryHdr(pWal, pChanged)!=0 ){ - - /* If the first attempt to read the header failed, lock the wal-index - ** file with an exclusive lock and try again. If the header checksum - ** verification fails again, we can be sure that it is not simply a - ** dirty read, but that the wal-index really does need to be - ** reconstructed by running log recovery. - ** - ** In the paragraph above, an "exclusive lock" may be any of WRITE, - ** PENDING, CHECKPOINT or RECOVER. If any of these are already held, - ** no locking operations are required. If the caller currently holds - ** a READ lock, then upgrade to a RECOVER lock before re-reading the - ** wal-index header and revert to a READ lock before returning. - */ - lockState = pWal->lockState; - if( lockState>SQLITE_SHM_READ - || SQLITE_OK==(rc = walSetLock(pWal, SQLITE_SHM_RECOVER)) - ){ - if( walIndexTryHdr(pWal, pChanged) ){ + /* Try once to read the header straight out. This works most of the + ** time. + */ + badHdr = walIndexTryHdr(pWal, pChanged); + + /* If the first attempt failed, it might have been due to a race + ** with a writer. So get a WRITE lock and try again. + */ + assert( badHdr==0 || pWal->writeLock==0 ); + if( badHdr ){ + rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1); + if( rc==SQLITE_OK ){ + pWal->writeLock = 1; + badHdr = walIndexTryHdr(pWal, pChanged); + if( badHdr ){ + /* If the wal-index header is still malformed even while holding + ** a WRITE lock, it can only mean that the header is corrupted and + ** needs to be reconstructed. So run recovery to do exactly that. 
+ */ + rc = walIndexRecover(pWal); *pChanged = 1; - rc = walIndexRecover(pWal); } - if( lockState==SQLITE_SHM_READ ){ - walSetLock(pWal, SQLITE_SHM_READ); - } + walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1); + pWal->writeLock = 0; + }else if( rc!=SQLITE_BUSY ){ + return rc; } } /* Make sure the mapping is large enough to cover the entire wal-index */ if( rc==SQLITE_OK ){ @@ -1426,57 +1658,196 @@ return rc; } /* -** Take a snapshot of the state of the WAL and wal-index for the current +** This is the value that walTryBeginRead returns when it needs to +** be retried. +*/ +#define WAL_RETRY (-1) + +/* +** Attempt to start a read transaction. This might fail due to a race or +** other transient condition. When that happens, it returns WAL_RETRY to +** indicate to the caller that it is safe to retry immediately. +** +** On success return SQLITE_OK. On a permanent failure (such as an +** I/O error or an SQLITE_BUSY because another process is running +** recovery) return a positive error code. +** +** On success, this routine obtains a read lock on +** WAL_READ_LOCK(pWal->readLock). The pWal->readLock integer is +** in the range 0 <= pWal->readLock < WAL_NREADER. If pWal->readLock==(-1) +** that means the Wal does not hold any read lock. The reader must not +** access any database page that is modified by a WAL frame up to and +** including frame number aReadMark[pWal->readLock]. The reader will +** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0 +** Or if pWal->readLock==0, then the reader will ignore the WAL +** completely and get all content directly from the database file. +** When the read transaction is completed, the caller must release the +** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1. +** +** This routine uses the nBackfill and aReadMark[] fields of the header +** to select a particular WAL_READ_LOCK() that strives to let the +** checkpoint process do as much work as possible. 
This routine might +** update values of the aReadMark[] array in the header, but if it does +** so it takes care to hold an exclusive lock on the corresponding +** WAL_READ_LOCK() while changing values. +*/ +static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal){ + volatile WalIndexHdr *pHdr; /* Header of the wal-index */ + volatile WalCkptInfo *pInfo; /* Checkpoint information in wal-index */ + u32 mxReadMark; /* Largest aReadMark[] value */ + int mxI; /* Index of largest aReadMark[] value */ + int i; /* Loop counter */ + int rc; /* Return code */ + + assert( pWal->readLock<0 ); /* Not currently locked */ + + if( !useWal ){ + rc = walIndexReadHdr(pWal, pChanged); + if( rc==SQLITE_BUSY ){ + /* If there is not a recovery running in another thread or process + ** then convert BUSY errors to WAL_RETRY. If recovery is known to + ** be running, convert BUSY to BUSY_RECOVERY. There is a race here + ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY + ** would be technically correct. But the race is benign since with + ** WAL_RETRY this routine will be called again and will probably be + ** right on the second iteration. + */ + rc = walLockShared(pWal, WAL_RECOVER_LOCK); + if( rc==SQLITE_OK ){ + walUnlockShared(pWal, WAL_RECOVER_LOCK); + rc = WAL_RETRY; + }else if( rc==SQLITE_BUSY ){ + rc = SQLITE_BUSY_RECOVERY; + } + } + }else{ + rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame)); + } + if( rc!=SQLITE_OK ){ + return rc; + } + + pHdr = (volatile WalIndexHdr*)pWal->pWiData; + pInfo = (volatile WalCkptInfo*)&pHdr[2]; + assert( pInfo==walCkptInfo(pWal) ); + if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame ){ + /* The WAL has been completely backfilled (or it is empty). + ** and can be safely ignored. 
+ */ + rc = walLockShared(pWal, WAL_READ_LOCK(0)); + if( rc==SQLITE_OK ){ + if( pHdr->mxFrame!=pWal->hdr.mxFrame ){ + walUnlockShared(pWal, WAL_READ_LOCK(0)); + return WAL_RETRY; + } + pWal->readLock = 0; + return SQLITE_OK; + }else if( rc!=SQLITE_BUSY ){ + return rc; + } + } + + /* If we get this far, it means that the reader will want to use + ** the WAL to get at content from recent commits. The job now is + ** to select one of the aReadMark[] entries that is closest to + ** but not exceeding pWal->hdr.mxFrame and lock that entry. + */ + mxReadMark = 0; + mxI = 0; + for(i=1; iaReadMark[i]; + if( mxReadMarkaReadMark[1] = pWal->hdr.mxFrame+1; + walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1); + } + return WAL_RETRY; + }else{ + if( mxReadMark < pWal->hdr.mxFrame ){ + for(i=1; iaReadMark[i] = pWal->hdr.mxFrame+1; + mxI = i; + walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); + break; + } + } + } + + rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); + if( rc ){ + return rc==SQLITE_BUSY ? WAL_RETRY : rc; + } + if( pInfo->aReadMark[mxI]!=mxReadMark + || pHdr[0].mxFrame!=pWal->hdr.mxFrame + || (sqlite3OsShmBarrier(pWal->pDbFd), pHdr[1].mxFrame!=pWal->hdr.mxFrame) + ){ + walUnlockShared(pWal, WAL_READ_LOCK(mxI)); + return WAL_RETRY; + }else{ + pWal->readLock = mxI; + } + } + return rc; +} + +/* +** Begin a read transaction on the database. +** +** This routine used to be called sqlite3OpenSnapshot() and with good reason: +** it takes a snapshot of the state of the WAL and wal-index for the current ** instant in time. The current thread will continue to use this snapshot. -** Other threads might containing appending to the WAL and wal-index but -** the extra content appended will be ignored by the current thread. -** -** A snapshot is like a read transaction. -** -** No other threads are allowed to run a checkpoint while this thread is -** holding the snapshot since a checkpoint would remove data out from under -** this thread. 
-** -** If this call obtains a new read-lock and the database contents have been -** modified since the most recent call to WalCloseSnapshot() on this Wal -** connection, then *pChanged is set to 1 before returning. Otherwise, it -** is left unmodified. This is used by the pager layer to determine whether -** or not any cached pages may be safely reused. -*/ -int sqlite3WalOpenSnapshot(Wal *pWal, int *pChanged){ +** Other threads might append new content to the WAL and wal-index but +** that extra content is ignored by the current thread. +** +** If the database contents have changed since the previous read +** transaction, then *pChanged is set to 1 before returning. The +** Pager layer will use this to know that its cache is stale and +** needs to be flushed. +*/ +int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ int rc; /* Return code */ - rc = walSetLock(pWal, SQLITE_SHM_READ); - assert( rc!=SQLITE_OK || pWal->lockState==SQLITE_SHM_READ ); - - if( rc==SQLITE_OK ){ - rc = walIndexReadHdr(pWal, pChanged); - if( rc!=SQLITE_OK ){ - /* An error occured while attempting log recovery. */ - sqlite3WalCloseSnapshot(pWal); - } - } - + do{ + rc = walTryBeginRead(pWal, pChanged, 0); + }while( rc==WAL_RETRY ); walIndexUnmap(pWal); return rc; } /* -** Unlock the current snapshot. +** Finish with a read transaction. All this does is release the +** read-lock. */ -void sqlite3WalCloseSnapshot(Wal *pWal){ - assert( pWal->lockState==SQLITE_SHM_READ - || pWal->lockState==SQLITE_SHM_UNLOCK - ); - walSetLock(pWal, SQLITE_SHM_UNLOCK); +void sqlite3WalEndReadTransaction(Wal *pWal){ + if( pWal->readLock>=0 ){ + walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + pWal->readLock = -1; + } } /* -** Read a page from the log, if it is present. +** Read a page from the WAL, if it is present in the WAL and if the +** current read transaction is configured to use the WAL. +** +** The *pInWal is set to 1 if the requested page is in the WAL and +** has been loaded. 
Or *pInWal is set to 0 if the page was not in +** the WAL and needs to be read out of the database. */ int sqlite3WalRead( Wal *pWal, /* WAL handle */ Pgno pgno, /* Database page number to read data for */ int *pInWal, /* OUT: True if data is read from WAL */ @@ -1485,22 +1856,26 @@ ){ int rc; /* Return code */ u32 iRead = 0; /* If !=0, WAL frame to return data from */ u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */ int iHash; /* Used to loop through N hash tables */ + + /* This routine is only called from within a read transaction */ + assert( pWal->readLock>=0 ); /* If the "last page" field of the wal-index header snapshot is 0, then ** no data will be read from the wal under any circumstances. Return early - ** in this case to avoid the walIndexMap/Unmap overhead. + ** in this case to avoid the walIndexMap/Unmap overhead. Likewise, if + ** pWal->readLock==0, then the WAL is ignored by the reader so + ** return early, as if the WAL were empty. */ - if( iLast==0 ){ + if( iLast==0 || pWal->readLock==0 ){ *pInWal = 0; return SQLITE_OK; } /* Ensure the wal-index is mapped. */ - assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE ); rc = walIndexMap(pWal, walMappingSize(iLast)); if( rc!=SQLITE_OK ){ return rc; } @@ -1605,64 +1980,120 @@ /* ** Set *pPgno to the size of the database file (or zero, if unknown). */ void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){ - assert( pWal->lockState==SQLITE_SHM_READ - || pWal->lockState==SQLITE_SHM_WRITE ); + assert( pWal->readLock>=0 ); *pPgno = pWal->hdr.nPage; } -/* -** This function returns SQLITE_OK if the caller may write to the database. -** Otherwise, if the caller is operating on a snapshot that has already -** been overwritten by another writer, SQLITE_BUSY is returned. 
-*/ -int sqlite3WalWriteLock(Wal *pWal, int op){ - int rc = SQLITE_OK; - if( op ){ - assert( pWal->lockState==SQLITE_SHM_READ ); - rc = walSetLock(pWal, SQLITE_SHM_WRITE); - - /* If this connection is not reading the most recent database snapshot, - ** it is not possible to write to the database. In this case release - ** the write locks and return SQLITE_BUSY. - */ - if( rc==SQLITE_OK ){ - rc = walIndexMap(pWal, walMappingSize(1)); - assert( pWal->szWIndex>=WALINDEX_HDR_SIZE || rc!=SQLITE_OK ); - if( rc==SQLITE_OK - && memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr)) - ){ - rc = SQLITE_BUSY; - } - walIndexUnmap(pWal); - if( rc!=SQLITE_OK ){ - walSetLock(pWal, SQLITE_SHM_READ); - } - } - }else if( pWal->lockState==SQLITE_SHM_WRITE ){ - rc = walSetLock(pWal, SQLITE_SHM_READ); - } - return rc; + +/* +** This function starts a write transaction on the WAL. +** +** A read transaction must have already been started by a prior call +** to sqlite3WalBeginReadTransaction(). +** +** If another thread or process has written into the database since +** the read transaction was started, then it is not possible for this +** thread to write as doing so would cause a fork. So this routine +** returns SQLITE_BUSY in that case and no write transaction is started. +** +** There can only be a single writer active at a time. +*/ +int sqlite3WalBeginWriteTransaction(Wal *pWal){ + int rc; + volatile WalCkptInfo *pInfo; + + /* Cannot start a write transaction without first holding a read + ** transaction. */ + assert( pWal->readLock>=0 ); + + /* Only one writer allowed at a time. Get the write lock. Return + ** SQLITE_BUSY if unable. + */ + rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1); + if( rc ){ + return rc; + } + pWal->writeLock = 1; + + /* If another connection has written to the database file since the + ** time the read transaction on this connection was started, then + ** the write is disallowed. 
+ */ + rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame)); + if( rc ){ + walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1); + pWal->writeLock = 0; + return rc; + } + if( memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr))!=0 ){ + walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1); + pWal->writeLock = 0; + walIndexUnmap(pWal); + return SQLITE_BUSY; + } + + pInfo = walCkptInfo(pWal); + if( pWal->readLock==0 ){ + assert( pInfo->nBackfill==pWal->hdr.mxFrame ); + if( pInfo->nBackfill>0 ){ + rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); + if( rc==SQLITE_OK ){ + /* If all readers are using WAL_READ_LOCK(0) (in other words if no + ** readers are currently using the WAL) */ + pWal->nCkpt++; + pWal->hdr.mxFrame = 0; + sqlite3Put4byte((u8*)pWal->hdr.aSalt, + 1 + sqlite3Get4byte((u8*)pWal->hdr.aSalt)); + sqlite3_randomness(4, &pWal->hdr.aSalt[1]); + walIndexWriteHdr(pWal); + pInfo->nBackfill = 0; + memset((void*)&pInfo->aReadMark[1], 0, + sizeof(pInfo->aReadMark)-sizeof(u32)); + rc = sqlite3OsTruncate(pWal->pDbFd, + ((i64)pWal->hdr.nPage*(i64)pWal->szPage)); + walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1); + } + } + walUnlockShared(pWal, WAL_READ_LOCK(0)); + pWal->readLock = -1; + do{ + int notUsed; + rc = walTryBeginRead(pWal, &notUsed, 1); + }while( rc==WAL_RETRY ); + } + walIndexUnmap(pWal); + return rc; +} + +/* +** End a write transaction. The commit has already been done. This +** routine merely releases the lock. +*/ +int sqlite3WalEndWriteTransaction(Wal *pWal){ + walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1); + pWal->writeLock = 0; + return SQLITE_OK; } /* ** If any data has been written (but not committed) to the log file, this ** function moves the write-pointer back to the start of the transaction. ** ** Additionally, the callback function is invoked for each frame written -** to the log since the start of the transaction. If the callback returns +** to the WAL since the start of the transaction.
If the callback returns ** other than SQLITE_OK, it is not invoked again and the error code is ** returned to the caller. ** ** Otherwise, if the callback function does not return an error, this ** function returns SQLITE_OK. */ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){ int rc = SQLITE_OK; - if( pWal->lockState==SQLITE_SHM_WRITE ){ + if( pWal->writeLock ){ int unused; Pgno iMax = pWal->hdr.mxFrame; Pgno iFrame; assert( pWal->pWiData==0 ); @@ -1670,11 +2101,11 @@ if( rc==SQLITE_OK ){ rc = walIndexMap(pWal, walMappingSize(iMax)); } if( rc==SQLITE_OK ){ for(iFrame=pWal->hdr.mxFrame+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){ - assert( pWal->lockState==SQLITE_SHM_WRITE ); + assert( pWal->writeLock ); rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]); } walCleanupHash(pWal); } walIndexUnmap(pWal); @@ -1687,11 +2118,11 @@ ** values. This function populates the array with values required to ** "rollback" the write position of the WAL handle back to the current ** point in the event of a savepoint rollback (via WalSavepointUndo()). */ void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){ - assert( pWal->lockState==SQLITE_SHM_WRITE ); + assert( pWal->writeLock ); aWalData[0] = pWal->hdr.mxFrame; aWalData[1] = pWal->hdr.aFrameCksum[0]; aWalData[2] = pWal->hdr.aFrameCksum[1]; } @@ -1701,11 +2132,11 @@ ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated ** by a call to WalSavepoint(). */ int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){ int rc = SQLITE_OK; - assert( pWal->lockState==SQLITE_SHM_WRITE ); + assert( pWal->writeLock ); assert( aWalData[0]<=pWal->hdr.mxFrame ); if( aWalData[0]<pWal->hdr.mxFrame ){ rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame)); pWal->hdr.mxFrame = aWalData[0]; @@ -1737,12 +2168,19 @@ PgHdr *p; /* Iterator to run through pList with.
*/ PgHdr *pLast = 0; /* Last frame in list */ int nLast = 0; /* Number of extra copies of last page */ assert( pList ); - assert( pWal->lockState==SQLITE_SHM_WRITE ); + assert( pWal->writeLock ); assert( pWal->pWiData==0 ); + +#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) + { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){} + WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n", + pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill")); + } +#endif /* If this is the first frame written into the log, write the WAL ** header to the start of the WAL file. See comments at the top of ** this source file for a description of the WAL header format. */ @@ -1755,10 +2193,11 @@ pWal->szPage = szPage; pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN; sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt); memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8); rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0); + WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok")); if( rc!=SQLITE_OK ){ return rc; } walChecksumBytes(1, aWalHdr, sizeof(aWalHdr), 0, pWal->hdr.aFrameCksum); } @@ -1846,52 +2285,42 @@ pWal->iCallback = iFrame; } } walIndexUnmap(pWal); + WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok")); return rc; } /* -** Checkpoint the database: +** This routine is called to implement sqlite3_wal_checkpoint() and +** related interfaces. ** -** 1. Acquire a CHECKPOINT lock -** 2. Copy the contents of the log into the database file. -** 3. Zero the wal-index header (so new readers will ignore the log). -** 4. Drop the CHECKPOINT lock. +** Obtain a CHECKPOINT lock and then backfill as much information as +** we can from WAL into the database. 
*/ int sqlite3WalCheckpoint( Wal *pWal, /* Wal connection */ int sync_flags, /* Flags to sync db file with (or 0) */ int nBuf, /* Size of temporary buffer */ - u8 *zBuf, /* Temporary buffer to use */ - int (*xBusyHandler)(void *), /* Pointer to busy-handler function */ - void *pBusyHandlerArg /* Argument to pass to xBusyHandler */ + u8 *zBuf /* Temporary buffer to use */ ){ int rc; /* Return code */ int isChanged = 0; /* True if a new wal-index header is loaded */ assert( pWal->pWiData==0 ); - - /* Get the CHECKPOINT lock. - ** - ** Normally, the connection will be in UNLOCK state at this point. But - ** if the connection is in exclusive-mode it may still be in READ state - ** even though the upper layer has no active read-transaction (because - ** WalCloseSnapshot() is not called in exclusive mode). The state will - ** be set to UNLOCK when this function returns. This is Ok. - */ - assert( (pWal->lockState==SQLITE_SHM_UNLOCK) - || (pWal->lockState==SQLITE_SHM_READ) ); - walSetLock(pWal, SQLITE_SHM_UNLOCK); - do { - rc = walSetLock(pWal, SQLITE_SHM_CHECKPOINT); - }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) ); - if( rc!=SQLITE_OK ){ - walSetLock(pWal, SQLITE_SHM_UNLOCK); + assert( pWal->ckptLock==0 ); + + WALTRACE(("WAL%p: checkpoint begins\n", pWal)); + rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1); + if( rc ){ + /* Usually this is SQLITE_BUSY meaning that another thread or process + ** is already running a checkpoint, or maybe a recovery. But it might + ** also be SQLITE_IOERR. */ return rc; } + pWal->ckptLock = 1; /* Copy data from the log to the database file. */ rc = walIndexReadHdr(pWal, &isChanged); if( rc==SQLITE_OK ){ rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf); @@ -1906,11 +2335,13 @@ memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); } /* Release the locks. */ walIndexUnmap(pWal); - walSetLock(pWal, SQLITE_SHM_UNLOCK); + walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1); + pWal->ckptLock = 0; + WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? 
"failed" : "ok")); return rc; } /* Return the value to pass to a sqlite3_wal_hook callback, the ** number of frames in the WAL at the point of the last commit since @@ -1925,35 +2356,56 @@ } return (int)ret; } /* -** This function is called to set or query the exclusive-mode flag -** associated with the WAL connection passed as the first argument. The -** exclusive-mode flag should be set to indicate that the caller is -** holding an EXCLUSIVE lock on the database file (it does this in -** locking_mode=exclusive mode). If the EXCLUSIVE lock is to be dropped, -** the flag set by this function should be cleared before doing so. -** -** The value of the exclusive-mode flag may only be modified when -** the WAL connection is in READ state. -** -** When the flag is set, this module does not call the VFS xShmLock() -** method to obtain any locks on the wal-index (as it assumes it -** has exclusive access to the wal and wal-index files anyhow). It -** continues to hold (and does not drop) the existing READ lock on -** the wal-index. -** -** To set or clear the flag, the "op" parameter is passed 1 or 0, -** respectively. To query the flag, pass -1. In all cases, the value -** returned is the value of the exclusive-mode flag (after its value -** has been modified, if applicable). +** This function is called to change the WAL subsystem into or out +** of locking_mode=EXCLUSIVE. +** +** If op is zero, then attempt to change from locking_mode=EXCLUSIVE +** into locking_mode=NORMAL. This means that we must acquire a lock +** on the pWal->readLock byte. If the WAL is already in locking_mode=NORMAL +** or if the acquisition of the lock fails, then return 0. If the +** transition out of exclusive-mode is successful, return 1. This +** operation must occur while the pager is still holding the exclusive +** lock on the main database file. +** +** If op is one, then change from locking_mode=NORMAL into +** locking_mode=EXCLUSIVE. 
This means that the pWal->readLock must +** be released. Return 1 if the transition is made and 0 if the +** WAL is already in exclusive-locking mode - meaning that this +** routine is a no-op. The pager must already hold the exclusive lock +** on the main database file before invoking this operation. +** +** If op is negative, then do a dry-run of the op==1 case but do +** not actually change anything. The pager uses this to see if it +** should acquire the database exclusive lock prior to invoking +** the op==1 case. */ int sqlite3WalExclusiveMode(Wal *pWal, int op){ - if( op>=0 ){ - assert( pWal->lockState==SQLITE_SHM_READ ); - pWal->exclusiveMode = (u8)op; + int rc; + assert( pWal->writeLock==0 && pWal->readLock>=0 ); + if( op==0 ){ + if( pWal->exclusiveMode ){ + pWal->exclusiveMode = 0; + if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){ + pWal->exclusiveMode = 1; + } + rc = pWal->exclusiveMode==0; + }else{ + /* No changes. Either already in locking_mode=NORMAL or else the + ** acquisition of the read-lock failed. The pager must continue to + ** hold the database exclusive lock. 
*/ + rc = 0; + } + }else if( op>0 ){ + assert( pWal->exclusiveMode==0 ); + walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); + pWal->exclusiveMode = 1; + rc = 1; + }else{ + rc = pWal->exclusiveMode==0; } - return pWal->exclusiveMode; + return rc; } #endif /* #ifndef SQLITE_OMIT_WAL */ Index: src/wal.h ================================================================== --- src/wal.h +++ src/wal.h @@ -18,23 +18,24 @@ #define _WAL_H_ #include "sqliteInt.h" #ifdef SQLITE_OMIT_WAL -# define sqlite3WalOpen(x,y,z) 0 -# define sqlite3WalClose(w,x,y,z) 0 -# define sqlite3WalOpenSnapshot(y,z) 0 -# define sqlite3WalCloseSnapshot(z) -# define sqlite3WalRead(v,w,x,y,z) 0 +# define sqlite3WalOpen(x,y,z) 0 +# define sqlite3WalClose(w,x,y,z) 0 +# define sqlite3WalBeginReadTransaction(y,z) 0 +# define sqlite3WalEndReadTransaction(z) +# define sqlite3WalRead(v,w,x,y,z) 0 # define sqlite3WalDbsize(y,z) -# define sqlite3WalWriteLock(y,z) 0 -# define sqlite3WalUndo(x,y,z) 0 +# define sqlite3WalBeginWriteTransaction(y) 0 +# define sqlite3WalEndWRiteTransaction(x) 0 +# define sqlite3WalUndo(x,y,z) 0 # define sqlite3WalSavepoint(y,z) -# define sqlite3WalSavepointUndo(y,z) 0 -# define sqlite3WalFrames(u,v,w,x,y,z) 0 -# define sqlite3WalCheckpoint(u,v,w,x,y,z) 0 -# define sqlite3WalCallback(z) 0 +# define sqlite3WalSavepointUndo(y,z) 0 +# define sqlite3WalFrames(u,v,w,x,y,z) 0 +# define sqlite3WalCheckpoint(u,v,w,x) 0 +# define sqlite3WalCallback(z) 0 #else #define WAL_SAVEPOINT_NDATA 3 /* Connection to a write-ahead log (WAL) file. @@ -51,22 +52,23 @@ ** at an instant in time. sqlite3WalOpenSnapshot gets a read lock and ** preserves the current state even if the other threads or processes ** write to or checkpoint the WAL. sqlite3WalCloseSnapshot() closes the ** transaction and releases the lock. 
*/ -int sqlite3WalOpenSnapshot(Wal *pWal, int *); -void sqlite3WalCloseSnapshot(Wal *pWal); +int sqlite3WalBeginReadTransaction(Wal *pWal, int *); +void sqlite3WalEndReadTransaction(Wal *pWal); /* Read a page from the write-ahead log, if it is present. */ int sqlite3WalRead(Wal *pWal, Pgno pgno, int *pInWal, int nOut, u8 *pOut); /* Return the size of the database as it existed at the beginning ** of the snapshot */ void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno); /* Obtain or release the WRITER lock. */ -int sqlite3WalWriteLock(Wal *pWal, int op); +int sqlite3WalBeginWriteTransaction(Wal *pWal); +int sqlite3WalEndWriteTransaction(Wal *pWal); /* Undo any frames written (but not committed) to the log */ int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx); /* Return an integer that records the current (uncommitted) write @@ -83,13 +85,11 @@ /* Copy pages from the log to the database file */ int sqlite3WalCheckpoint( Wal *pWal, /* Write-ahead log connection */ int sync_flags, /* Flags to sync db file with (or 0) */ int nBuf, /* Size of buffer nBuf */ - u8 *zBuf, /* Temporary buffer to use */ - int (*xBusyHandler)(void *), /* Pointer to busy-handler function */ - void *pBusyHandlerArg /* Argument to pass to xBusyHandler */ + u8 *zBuf /* Temporary buffer to use */ ); /* Return the value to pass to a sqlite3_wal_hook callback, the ** number of frames in the WAL at the point of the last commit since ** sqlite3WalCallback() was called. 
If no commits have occurred since Index: test/filectrl.test ================================================================== --- test/filectrl.test +++ test/filectrl.test @@ -32,10 +32,10 @@ file_control_lasterrno_test db } {} do_test filectrl-1.5 { db close sqlite3 db test_control_lockproxy.db - file_control_lockproxy_test db + file_control_lockproxy_test db [pwd] } {} db close file delete -force .test_control_lockproxy.db-conch test.proxy finish_test Index: test/lock_common.tcl ================================================================== --- test/lock_common.tcl +++ test/lock_common.tcl @@ -45,11 +45,17 @@ append r $line } } proc testfixture_nb_cb {varname chan} { - set line [gets $chan] + if {[eof $chan]} { + append ::tfnb($chan) "ERROR: Child process hung up" + set line "OVER" + } else { + set line [gets $chan] + } + if { $line == "OVER" } { set $varname $::tfnb($chan) unset ::tfnb($chan) close $chan } else { Index: test/wal.test ================================================================== --- test/wal.test +++ test/wal.test @@ -569,74 +569,53 @@ do_test wal-10.$tn.11 { sql2 { BEGIN; SELECT * FROM t1 } } {1 2 3 4 5 6 7 8 9 10} do_test wal-10.$tn.12 { catchsql { PRAGMA wal_checkpoint } - } {1 {database is locked}} + } {0 {}} ;# Readers no longer block checkpoints do_test wal-10.$tn.13 { execsql { INSERT INTO t1 VALUES(11, 12) } sql2 {SELECT * FROM t1} } {1 2 3 4 5 6 7 8 9 10} - # Connection [db2] is holding a lock on a snapshot, preventing [db] from - # checkpointing the database. Add a busy-handler to [db]. If [db2] completes - # its transaction from within the busy-handler, [db] is able to complete - # the checkpoint operation. - # - proc busyhandler x { - if {$x==4} { sql2 COMMIT } - if {$x<5} { return 0 } - return 1 - } - db busy busyhandler + # Writers do not block checkpoints any more either. + # do_test wal-10.$tn.14 { - execsql { PRAGMA wal_checkpoint } - } {} - - # Similar to the test above.
Except this time, a new read transaction is - # started (db3) while the checkpointer is waiting for an old one (db2) to - # finish. The checkpointer can finish, but any subsequent write operations - # must wait until after db3 has closed the read transaction, as db3 is a - # "region D" writer. - # - db busy {} - do_test wal-10.$tn.15 { - sql2 { BEGIN; SELECT * FROM t1; } + catchsql { PRAGMA wal_checkpoint } + } {0 {}} + + # The following series of test cases used to verify another blocking + # case in WAL - a case which no longer blocks. + # + do_test wal-10.$tn.15 { + sql2 { COMMIT; BEGIN; SELECT * FROM t1; } } {1 2 3 4 5 6 7 8 9 10 11 12} do_test wal-10.$tn.16 { catchsql { PRAGMA wal_checkpoint } - } {1 {database is locked}} - proc busyhandler x { - if {$x==3} { sql3 { BEGIN; SELECT * FROM t1 } } - if {$x==4} { sql2 COMMIT } - if {$x<5} { return 0 } - return 1 - } - db busy busyhandler + } {0 {}} do_test wal-10.$tn.17 { execsql { PRAGMA wal_checkpoint } } {} do_test wal-10.$tn.18 { - sql3 { SELECT * FROM t1 } + sql3 { BEGIN; SELECT * FROM t1 } } {1 2 3 4 5 6 7 8 9 10 11 12} do_test wal-10.$tn.19 { catchsql { INSERT INTO t1 VALUES(13, 14) } - } {1 {database is locked}} + } {0 {}} do_test wal-10.$tn.20 { execsql { SELECT * FROM t1 } - } {1 2 3 4 5 6 7 8 9 10 11 12} + } {1 2 3 4 5 6 7 8 9 10 11 12 13 14} do_test wal-10.$tn.21 { sql3 COMMIT + sql2 COMMIT } {} do_test wal-10.$tn.22 { - execsql { INSERT INTO t1 VALUES(13, 14) } execsql { SELECT * FROM t1 } } {1 2 3 4 5 6 7 8 9 10 11 12 13 14} - # Set [db3] up as a "region D" reader again. Then upgrade it to a writer - # and back down to a reader. Then, check that a checkpoint is not possible - # (as [db3] still has a snapshot locked). + # Another series of tests that used to demonstrate blocking behavior + # but which now work. 
# do_test wal-10.$tn.23 { execsql { PRAGMA wal_checkpoint } } {} do_test wal-10.$tn.24 { @@ -645,27 +624,25 @@ do_test wal-10.$tn.25 { execsql { PRAGMA wal_checkpoint } } {} do_test wal-10.$tn.26 { catchsql { INSERT INTO t1 VALUES(15, 16) } - } {1 {database is locked}} + } {0 {}} do_test wal-10.$tn.27 { - sql3 { INSERT INTO t1 VALUES(15, 16) } + sql3 { INSERT INTO t1 VALUES(17, 18) } } {} do_test wal-10.$tn.28 { code3 { set ::STMT [sqlite3_prepare db3 "SELECT * FROM t1" -1 TAIL] sqlite3_step $::STMT } - sql3 COMMIT execsql { SELECT * FROM t1 } - } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16} - db busy {} + } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} do_test wal-10.$tn.29 { - execsql { INSERT INTO t1 VALUES(17, 18) } + execsql { INSERT INTO t1 VALUES(19, 20) } catchsql { PRAGMA wal_checkpoint } - } {1 {database is locked}} + } {0 {}} do_test wal-10.$tn.30 { code3 { sqlite3_finalize $::STMT } execsql { PRAGMA wal_checkpoint } } {} @@ -672,24 +649,25 @@ # At one point, if a reader failed to upgrade to a writer because it # was reading an old snapshot, the write-locks were not being released. # Test that this bug has been fixed. # do_test wal-10.$tn.31 { + sql2 COMMIT execsql { BEGIN ; SELECT * FROM t1 } - sql2 { INSERT INTO t1 VALUES(19, 20) } - catchsql { INSERT INTO t1 VALUES(21, 22) } + sql2 { INSERT INTO t1 VALUES(21, 22) } + catchsql { INSERT INTO t1 VALUES(23, 24) } } {1 {database is locked}} do_test wal-10.$tn.32 { # This statement would fail when the bug was present. 
- sql2 { INSERT INTO t1 VALUES(21, 22) } + sql2 { INSERT INTO t1 VALUES(23, 24) } } {} do_test wal-10.$tn.33 { execsql { SELECT * FROM t1 ; COMMIT } - } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18} + } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20} do_test wal-10.$tn.34 { execsql { SELECT * FROM t1 } - } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22} + } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24} # Test that if a checkpointer cannot obtain the required locks, it # releases all locks before returning a busy error. # do_test wal-10.$tn.35 { @@ -701,15 +679,13 @@ sql2 { BEGIN; SELECT * FROM t1; } } {a b c d} - proc busyhandler x { return 1 } - db busy busyhandler do_test wal-10.$tn.36 { catchsql { PRAGMA wal_checkpoint } - } {1 {database is locked}} + } {0 {}} do_test wal-10.$tn.36 { sql3 { INSERT INTO t1 VALUES('e', 'f') } sql2 { SELECT * FROM t1 } } {a b c d} do_test wal-10.$tn.37 { @@ -1057,12 +1033,13 @@ } {SQLITE_LOCKED} do_test wal-15.3.3 { sqlite3_errmsg db } {database table is locked} -# Also test that an error is returned if the db cannot be checkpointed -# because of locks held by another connection. +# Earlier versions returned an error if the db could not be +# checkpointed because of locks held by another connection. Check that +# this is no longer the case. # sqlite3 db2 test.db do_test wal-15.4.1 { execsql { BEGIN; @@ -1070,14 +1047,14 @@ } db2 } {1 2} do_test wal-15.4.2 { execsql { COMMIT } sqlite3_wal_checkpoint db -} {SQLITE_BUSY} +} {SQLITE_OK} do_test wal-15.4.3 { sqlite3_errmsg db -} {database is locked} +} {not an error} # After [db2] drops its lock, [db] may checkpoint the db.
# do_test wal-15.4.4 { execsql { COMMIT } db2 Index: test/wal2.test ================================================================== --- test/wal2.test +++ test/wal2.test @@ -19,21 +19,22 @@ ifcapable !wal {finish_test ; return } proc set_tvfs_hdr {file args} { # Set $nHdr to the number of bytes in the wal-index header: - set nHdr 80 + set nHdr 40 set nInt [expr {$nHdr/4}] if {[llength $args]>1} { return -code error {wrong # args: should be "set_tvfs_hdr fileName ?val?"} } set blob [tvfs shm $file] + if {[llength $args]} { set ia [lindex $args 0] - set tail [string range $blob [expr $nHdr*2] end] + binary scan $blob a[expr $nHdr*2]a* dummy tail set blob [binary format i${nInt}i${nInt}a* $ia $ia $tail] tvfs shm $file $blob } binary scan $blob i${nInt} ints @@ -90,23 +91,32 @@ } {4 10} do_test wal2-1.1 { execsql { SELECT count(a), sum(a) FROM t1 } db2 } {4 10} -foreach {tn iInsert res wal_index_hdr_mod wal_locks} { - 2 5 {5 15} 0 {READ RECOVER READ UNLOCK} - 3 6 {6 21} 1 {READ RECOVER READ UNLOCK} - 4 7 {7 28} 2 {READ RECOVER READ UNLOCK} - 5 8 {8 36} 3 {READ RECOVER READ UNLOCK} - 6 9 {9 45} 4 {READ RECOVER READ UNLOCK} - 7 10 {10 55} 5 {READ RECOVER READ UNLOCK} - 8 11 {11 66} 6 {READ RECOVER READ UNLOCK} - 9 12 {12 78} 7 {READ RECOVER READ UNLOCK} - 10 13 {13 91} 8 {READ RECOVER READ UNLOCK} - 11 14 {14 105} 9 {READ RECOVER READ UNLOCK} - 12 15 {15 120} -1 {READ UNLOCK} -} { +set RECOVER [list \ + {0 1 lock exclusive} {1 7 lock exclusive} \ + {1 7 unlock exclusive} {0 1 unlock exclusive} \ +] +set READ [list \ + {4 1 lock exclusive} {4 1 unlock exclusive} \ + {4 1 lock shared} {4 1 unlock shared} \ +] + +foreach {tn iInsert res wal_index_hdr_mod wal_locks} " + 2 5 {5 15} 0 {$RECOVER $READ} + 3 6 {6 21} 1 {$RECOVER $READ} + 4 7 {7 28} 2 {$RECOVER $READ} + 5 8 {8 36} 3 {$RECOVER $READ} + 6 9 {9 45} 4 {$RECOVER $READ} + 7 10 {10 55} 5 {$RECOVER $READ} + 8 11 {11 66} 6 {$RECOVER $READ} + 9 12 {12 78} 7 {$RECOVER $READ} + 10 13 {13 91} 8 {$RECOVER $READ} + 11 14 {14 
105} 9 {$RECOVER $READ} + 12 15 {15 120} -1 {$READ} +" { do_test wal2-1.$tn.1 { execsql { INSERT INTO t1 VALUES($iInsert) } set ::locks [list] @@ -117,11 +127,10 @@ set ::cb_done 1 if {$::wal_index_hdr_mod >= 0} { incr_tvfs_hdr [lindex $args 0] $::wal_index_hdr_mod 1 } } - if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] } return SQLITE_OK } execsql { SELECT count(a), sum(a) FROM t1 } db2 @@ -148,10 +157,15 @@ # # After this, the header is corrupted again and the reader is allowed # to run recovery. This time, it sees an up-to-date snapshot of the # database file. # +set WRITER [list 0 1 lock exclusive] +set LOCKS [list \ + {0 1 lock exclusive} {0 1 unlock exclusive} \ + {4 1 lock shared} {4 1 unlock shared} \ +] do_test wal2-2.0 { testvfs tvfs tvfs_cb proc tvfs_cb {method args} { if {$method == "xShmOpen"} { set ::shm_file [lindex $args 0] } @@ -204,11 +218,11 @@ } } if {$method == "xShmLock"} { set lock [lindex $args 2] lappend ::locks $lock - if {$lock == "RECOVER"} { + if {$lock == $::WRITER} { set_tvfs_hdr $::shm_file $::oldhdr } } return SQLITE_OK } @@ -216,11 +230,11 @@ execsql { SELECT count(a), sum(a) FROM t1 } db2 } $res0 do_test wal2-2.$tn.3 { set ::locks - } {READ RECOVER READ UNLOCK} + } $LOCKS do_test wal2-2.$tn.4 { set ::locks [list] set ::cb_done 0 proc tvfs_cb {method args} { @@ -243,10 +257,12 @@ db close db2 close tvfs delete file delete -force test.db test.db-wal test.db-journal + +if 0 { #------------------------------------------------------------------------- # This test case - wal2-3.* - tests the response of the library to an # SQLITE_BUSY when attempting to obtain a READ or RECOVER lock. 
# # wal2-3.0 - 2: SQLITE_BUSY when obtaining a READ lock @@ -312,10 +328,12 @@ list [info exists ::sabotage] [info exists ::locked] } {0 0} db close tvfs delete file delete -force test.db test.db-wal test.db-journal + +} #------------------------------------------------------------------------- # Test that a database connection using a VFS that does not support the # xShmXXX interfaces cannot open a WAL database. # @@ -347,10 +365,23 @@ #------------------------------------------------------------------------- # Test that if a database connection is forced to run recovery before it # can perform a checkpoint, it does not transition into RECOVER state. # +# UPDATE: This has now changed. When running a checkpoint, if recovery is +# required the client grabs all exclusive locks (just as it would for a +# recovery performed as a pre-cursor to a normal database transaction). +# +set expected_locks [list] +lappend expected_locks {1 1 lock exclusive} ;# Lock checkpoint +lappend expected_locks {0 1 lock exclusive} ;# Lock writer +lappend expected_locks {2 6 lock exclusive} ;# Lock recovery & all aReadMark[] +lappend expected_locks {2 6 unlock exclusive} ;# Unlock recovery & aReadMark[] +lappend expected_locks {0 1 unlock exclusive} ;# Unlock writer +lappend expected_locks {3 1 lock exclusive} ;# Lock aReadMark[0] +lappend expected_locks {3 1 unlock exclusive} ;# Unlock aReadMark[0] +lappend expected_locks {1 1 unlock exclusive} ;# Unlock checkpoint do_test wal2-5.1 { proc tvfs_cb {method args} { set ::shm_file [lindex $args 0] if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] } return $::tvfs_cb_return @@ -368,11 +399,11 @@ incr_tvfs_hdr $::shm_file 1 1 set ::locks [list] execsql { PRAGMA wal_checkpoint } set ::locks -} {CHECKPOINT UNLOCK} +} $expected_locks db close tvfs delete #------------------------------------------------------------------------- # This block, test cases wal2-6.*, tests the operation of WAL with @@ -533,58 +564,132 @@ do_test wal2-6.3.7 { 
execsql { PRAGMA lock_status } } {main exclusive temp closed} db close + +# This test - wal2-6.4.* - uses a single database connection and the +# [testvfs] instrumentation to test that xShmLock() is being called +# as expected when a WAL database is used with locking_mode=exclusive. +# do_test wal2-6.4.1 { file delete -force test.db test.db-wal test.db-journal proc tvfs_cb {method args} { set ::shm_file [lindex $args 0] if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] } return "SQLITE_OK" } testvfs tvfs tvfs_cb sqlite3 db test.db -vfs tvfs - - execsql { - PRAGMA journal_mode = WAL; - CREATE TABLE t1(x); - INSERT INTO t1 VALUES('Leonard'); - INSERT INTO t1 VALUES('Arthur'); - } - - set ::locks [list] - execsql { PRAGMA locking_mode = exclusive } - set ::locks -} {} -do_test wal2-6.4.2 { - execsql { SELECT * FROM t1 } -} {Leonard Arthur} -do_test wal2-6.4.3 { - set ::locks -} {READ} -do_test wal2-6.4.4 { - execsql { - INSERT INTO t1 VALUES('Julius Henry'); - SELECT * FROM t1; - } -} {Leonard Arthur {Julius Henry}} -do_test wal2-6.4.5 { - set ::locks -} {READ} -do_test wal2-6.4.6 { - execsql { - PRAGMA locking_mode = NORMAL; - DELETE FROM t1; - } - set ::locks -} {READ UNLOCK} -do_test wal2-6.4.7 { - set ::locks [list] - execsql { INSERT INTO t1 VALUES('Karl') } - set ::locks -} {READ WRITE READ UNLOCK} +} {} + +set RECOVERY { + {0 1 lock exclusive} {1 7 lock exclusive} + {1 7 unlock exclusive} {0 1 unlock exclusive} +} +set READMARK0_READ { + {3 1 lock shared} {3 1 unlock shared} +} +set READMARK0_WRITE { + {3 1 lock shared} + {0 1 lock exclusive} {3 1 unlock shared} + {4 1 lock exclusive} {4 1 unlock exclusive} {4 1 lock shared} + {0 1 unlock exclusive} {4 1 unlock shared} +} +set READMARK1_SET { + {4 1 lock exclusive} {4 1 unlock exclusive} +} +set READMARK1_READ { + {4 1 lock shared} {4 1 unlock shared} +} + +foreach {tn sql res expected_locks} { + 2 { + PRAGMA journal_mode = WAL; + BEGIN; + CREATE TABLE t1(x); + INSERT INTO t1 VALUES('Leonard'); + 
INSERT INTO t1 VALUES('Arthur'); + COMMIT; + } {wal} { + $RECOVERY + $READMARK0_READ + $READMARK0_WRITE + } + + 3 { + # This test should do the READMARK1_SET locking to populate the + # aReadMark[1] slot with the current mxFrame value. Followed by + # READMARK1_READ to read the database. + # + SELECT * FROM t1 + } {Leonard Arthur} { + $READMARK1_SET + $READMARK1_READ + } + + 4 { + # aReadMark[1] is already set to mxFrame. So just READMARK1_READ + # this time, not READMARK1_SET. + # + SELECT * FROM t1 ORDER BY x + } {Arthur Leonard} { + $READMARK1_READ + } + + 5 { + PRAGMA locking_mode = exclusive + } {exclusive} { } + + 6 { + INSERT INTO t1 VALUES('Julius Henry'); + SELECT * FROM t1; + } {Leonard Arthur {Julius Henry}} { + $READMARK1_READ + } + + 7 { + INSERT INTO t1 VALUES('Karl'); + SELECT * FROM t1; + } {Leonard Arthur {Julius Henry} Karl} { } + + 8 { + PRAGMA locking_mode = normal + } {normal} { } + + 9 { + SELECT * FROM t1 ORDER BY x + } {Arthur {Julius Henry} Karl Leonard} { } + + 10 { + DELETE FROM t1 + } {} { + $READMARK1_READ + } + + 11 { + SELECT * FROM t1 + } {} { + $READMARK1_SET + $READMARK1_READ + } +} { + + set L [list] + foreach el [subst $expected_locks] { lappend L $el } + + set S "" + foreach sq [split $sql "\n"] { + set sq [string trim $sq] + if {[string match {#*} $sq]==0} {append S "$sq\n"} + } + + set ::locks [list] + do_test wal2-6.4.$tn.1 { execsql $S } $res + do_test wal2-6.4.$tn.2 { set ::locks } $L +} + db close tvfs delete do_test wal2-6.5.1 { sqlite3 db test.db Index: test/walthread.test ================================================================== --- test/walthread.test +++ test/walthread.test @@ -282,10 +282,16 @@ INSERT INTO t1 SELECT md5sum(x) FROM t1; COMMIT; } } + # Turn off auto-checkpoint. Otherwise, an auto-checkpoint run by a + # writer may cause the dedicated checkpoint thread to return an + # SQLITE_BUSY error. 
+ # + db eval { PRAGMA wal_autocheckpoint = 0 } + set nRun 0 while {[tt_continue]} { read_transaction write_transaction incr nRun @@ -387,11 +393,14 @@ } -thread t 10 { set nextwrite $E(pid) proc wal_hook {zDb nEntry} { - if {$nEntry>10} {db eval {PRAGMA wal_checkpoint}} + if {$nEntry>10} { + set rc [catch { db eval {PRAGMA wal_checkpoint} } msg] + if {$rc && $msg != "database is locked"} { error $msg } + } return 0 } db wal_hook wal_hook while {[tt_continue]} {