Index: src/btree.c
==================================================================
--- src/btree.c
+++ src/btree.c
@@ -2544,11 +2544,11 @@
     }
   
     if( rc!=SQLITE_OK ){
       unlockBtreeIfUnused(pBt);
     }
-  }while( rc==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
+  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
           btreeInvokeBusyHandler(pBt) );
 
   if( rc==SQLITE_OK ){
     if( p->inTrans==TRANS_NONE ){
       pBt->nTransaction++;

Index: src/os.c
==================================================================
--- src/os.c
+++ src/os.c
@@ -108,12 +108,12 @@
   return id->pMethods->xShmGet(id, reqSize, pSize, pp);
 }
 int sqlite3OsShmRelease(sqlite3_file *id){
   return id->pMethods->xShmRelease(id);
 }
-int sqlite3OsShmLock(sqlite3_file *id, int desiredLock, int *pGotLock){
-  return id->pMethods->xShmLock(id, desiredLock, pGotLock);
+int sqlite3OsShmLock(sqlite3_file *id, int offset, int n, int flags){
+  return id->pMethods->xShmLock(id, offset, n, flags);
 }
 void sqlite3OsShmBarrier(sqlite3_file *id){
   id->pMethods->xShmBarrier(id);
 }
 int sqlite3OsShmClose(sqlite3_file *id, int deleteFlag){

Index: src/os.h
==================================================================
--- src/os.h
+++ src/os.h
@@ -245,11 +245,11 @@
 int sqlite3OsDeviceCharacteristics(sqlite3_file *id);
 int sqlite3OsShmOpen(sqlite3_file *id);
 int sqlite3OsShmSize(sqlite3_file *id, int, int*);
 int sqlite3OsShmGet(sqlite3_file *id, int, int*, void volatile**);
 int sqlite3OsShmRelease(sqlite3_file *id);
-int sqlite3OsShmLock(sqlite3_file *id, int, int*);
+int sqlite3OsShmLock(sqlite3_file *id, int, int, int);
 void sqlite3OsShmBarrier(sqlite3_file *id);
 int sqlite3OsShmClose(sqlite3_file *id, int);
 
 /* 
 ** Functions for accessing sqlite3_vfs methods 

Index: src/os_unix.c
==================================================================
--- src/os_unix.c
+++ src/os_unix.c
@@ -2064,11 +2064,11 @@
   if( id ){
     unixFile *pFile = (unixFile*)id;
     semUnlock(id, NO_LOCK);
     assert( pFile );
     unixEnterMutex();
-    releaseLockInfo(pFile->pInode);
+    releaseInodeInfo(pFile->pInode);
     unixLeaveMutex();
     closeUnixFile(id);
   }
   return SQLITE_OK;
 }
@@ -2531,11 +2531,11 @@
       ** descriptor to pInode->aPending.  It will be automatically closed when
       ** the last lock is cleared.
       */
       setPendingFd(pFile);
     }
-    releaseLockInfo(pFile->pInode);
+    releaseInodeInfo(pFile->pInode);
     sqlite3_free(pFile->lockingContext);
     rc = closeUnixFile(id);
     unixLeaveMutex();
   }
   return rc;
@@ -3166,270 +3166,97 @@
 ** while accessing any read/write fields.
 */
 struct unixShm {
   unixShmNode *pShmNode;     /* The underlying unixShmNode object */
   unixShm *pNext;            /* Next unixShm with the same unixShmNode */
-  u8 lockState;              /* Current lock state */
   u8 hasMutex;               /* True if holding the unixShmNode mutex */
   u8 hasMutexBuf;            /* True if holding pFile->mutexBuf */
-  u8 sharedMask;             /* Mask of shared locks held */
-  u8 exclMask;               /* Mask of exclusive locks held */
+  u16 sharedMask;            /* Mask of shared locks held */
+  u16 exclMask;              /* Mask of exclusive locks held */
 #ifdef SQLITE_DEBUG
   u8 id;                     /* Id of this connection within its unixShmNode */
 #endif
 };
 
-/*
-** Size increment by which shared memory grows
-*/
-#define SQLITE_UNIX_SHM_INCR  4096
-
 /*
 ** Constants used for locking
 */
-#define UNIX_SHM_BASE      80        /* Byte offset of the first lock byte */
-#define UNIX_SHM_DMS       0x01      /* Mask for Dead-Man-Switch lock */
-#define UNIX_SHM_A         0x10      /* Mask for region locks... */
-#define UNIX_SHM_B         0x20
-#define UNIX_SHM_C         0x40
-#define UNIX_SHM_D         0x80
-
-#ifdef SQLITE_DEBUG
-/*
-** Return a pointer to a nul-terminated string in static memory that
-** describes a locking mask.  The string is of the form "MSABCD" with
-** each character representing a lock.  "M" for MUTEX, "S" for DMS, 
-** and "A" through "D" for the region locks.  If a lock is held, the
-** letter is shown.  If the lock is not held, the letter is converted
-** to ".".
-**
-** This routine is for debugging purposes only and does not appear
-** in a production build.
-*/
-static const char *unixShmLockString(u8 mask){
-  static char zBuf[48];
-  static int iBuf = 0;
-  char *z;
-
-  z = &zBuf[iBuf];
-  iBuf += 8;
-  if( iBuf>=sizeof(zBuf) ) iBuf = 0;
-
-  z[0] = (mask & UNIX_SHM_DMS)   ? 'S' : '.';
-  z[1] = (mask & UNIX_SHM_A)     ? 'A' : '.';
-  z[2] = (mask & UNIX_SHM_B)     ? 'B' : '.';
-  z[3] = (mask & UNIX_SHM_C)     ? 'C' : '.';
-  z[4] = (mask & UNIX_SHM_D)     ? 'D' : '.';
-  z[5] = 0;
-  return z;
-}
-#endif /* SQLITE_DEBUG */
-
-/*
-** Apply posix advisory locks for all bytes identified in lockMask.
-**
-** lockMask might contain multiple bits but all bits are guaranteed
-** to be contiguous.
+#define UNIX_SHM_BASE   ((18+SQLITE_SHM_NLOCK)*4)         /* first lock byte */
+#define UNIX_SHM_DMS    (UNIX_SHM_BASE+SQLITE_SHM_NLOCK)  /* deadman switch */
+
+/*
+** Apply posix advisory locks for all bytes from ofst through ofst+n-1.
 **
-** Locks block if the mask is exactly UNIX_SHM_C and are non-blocking
-** otherwise.
+** Locks are always requested with F_SETLK and therefore never block.
+** SQLITE_OK is returned on success and SQLITE_BUSY if fcntl() fails.
 */
 static int unixShmSystemLock(
   unixShmNode *pShmNode, /* Apply locks to this open shared-memory segment */
   int lockType,          /* F_UNLCK, F_RDLCK, or F_WRLCK */
-  u8 lockMask            /* Which bytes to lock or unlock */
+  int ofst,              /* First byte of the locking range */
+  int n                  /* Number of bytes to lock */
 ){
   struct flock f;       /* The posix advisory locking structure */
-  int lockOp;           /* The opcode for fcntl() */
-  int i;                /* Offset into the locking byte range */
-  int rc;               /* Result code form fcntl() */
-  u8 mask;              /* Mask of bits in lockMask */
+  int rc = SQLITE_OK;   /* Result code from fcntl() */
 
   /* Access to the unixShmNode object is serialized by the caller */
   assert( sqlite3_mutex_held(pShmNode->mutex) || pShmNode->nRef==0 );
+
+  /* Shared locks never span more than one byte */
+  assert( n==1 || lockType!=F_RDLCK );
+  /* The range lies within the lock bytes plus the dead-man switch byte */
+  assert( n>=1 && n<=SQLITE_SHM_NLOCK );
+  assert( ofst>=UNIX_SHM_BASE && ofst+n<=UNIX_SHM_DMS+1 );
 
   /* Initialize the locking parameters */
   memset(&f, 0, sizeof(f));
   f.l_type = lockType;
   f.l_whence = SEEK_SET;
-  if( lockMask==UNIX_SHM_C && lockType!=F_UNLCK ){
-    lockOp = F_SETLKW;
-    OSTRACE(("SHM-LOCK requesting blocking lock\n"));
-  }else{
-    lockOp = F_SETLK;
-  }
-
-  /* Find the first bit in lockMask that is set */
-  for(i=0, mask=0x01; mask!=0 && (lockMask&mask)==0; mask <<= 1, i++){}
-  assert( mask!=0 );
-  f.l_start = i+UNIX_SHM_BASE;
-  f.l_len = 1;
-
-  /* Extend the locking range for each additional bit that is set */
-  mask <<= 1;
-  while( mask!=0 && (lockMask & mask)!=0 ){
-    f.l_len++;
-    mask <<= 1;
-  }
-
-  /* Verify that all bits set in lockMask are contiguous */
-  assert( mask==0 || (lockMask & ~(mask | (mask-1)))==0 );
-
-  /* Acquire the system-level lock */
-  rc = fcntl(pShmNode->h, lockOp, &f);
+  f.l_start = ofst;
+  f.l_len = n;
+
+  rc = fcntl(pShmNode->h, F_SETLK, &f);
   rc = (rc!=(-1)) ? SQLITE_OK : SQLITE_BUSY;
 
   /* Update the global lock state and do debug tracing */
 #ifdef SQLITE_DEBUG
+  { u16 mask;
   OSTRACE(("SHM-LOCK "));
+  mask = (u16)(((1u<<n)-1) << (ofst-UNIX_SHM_BASE));  /* bit i == lock byte i */
   if( rc==SQLITE_OK ){
     if( lockType==F_UNLCK ){
-      OSTRACE(("unlock ok"));
-      pShmNode->exclMask &= ~lockMask;
-      pShmNode->sharedMask &= ~lockMask;
+      OSTRACE(("unlock %d ok", ofst));
+      pShmNode->exclMask &= ~mask;
+      pShmNode->sharedMask &= ~mask;
     }else if( lockType==F_RDLCK ){
-      OSTRACE(("read-lock ok"));
-      pShmNode->exclMask &= ~lockMask;
-      pShmNode->sharedMask |= lockMask;
+      OSTRACE(("read-lock %d ok", ofst));
+      pShmNode->exclMask &= ~mask;
+      pShmNode->sharedMask |= mask;
     }else{
       assert( lockType==F_WRLCK );
-      OSTRACE(("write-lock ok"));
-      pShmNode->exclMask |= lockMask;
-      pShmNode->sharedMask &= ~lockMask;
+      OSTRACE(("write-lock %d ok", ofst));
+      pShmNode->exclMask |= mask;
+      pShmNode->sharedMask &= ~mask;
     }
   }else{
     if( lockType==F_UNLCK ){
-      OSTRACE(("unlock failed"));
+      OSTRACE(("unlock %d failed", ofst));
     }else if( lockType==F_RDLCK ){
       OSTRACE(("read-lock failed"));
     }else{
       assert( lockType==F_WRLCK );
-      OSTRACE(("write-lock failed"));
+      OSTRACE(("write-lock %d failed", ofst));
     }
   }
-  OSTRACE((" - change requested %s - afterwards %s:%s\n",
-           unixShmLockString(lockMask),
-           unixShmLockString(pShmNode->sharedMask),
-           unixShmLockString(pShmNode->exclMask)));
+  OSTRACE((" - afterwards %03x,%03x\n",
+           pShmNode->sharedMask, pShmNode->exclMask));
+  }
 #endif
 
   return rc;        
 }
 
-/*
-** For connection p, unlock all of the locks identified by the unlockMask
-** parameter.
-*/
-static int unixShmUnlock(
-  unixShmNode *pShmNode,   /* The underlying shared-memory file */
-  unixShm *p,              /* The connection to be unlocked */
-  u8 unlockMask            /* Mask of locks to be unlocked */
-){
-  int rc;      /* Result code */
-  unixShm *pX; /* For looping over all sibling connections */
-  u8 allMask;  /* Union of locks held by connections other than "p" */
-
-  /* Access to the unixShmNode object is serialized by the caller */
-  assert( sqlite3_mutex_held(pShmNode->mutex) );
-
-  /* Compute locks held by sibling connections */
-  allMask = 0;
-  for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
-    if( pX==p ) continue;
-    assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 );
-    allMask |= pX->sharedMask;
-  }
-
-  /* Unlock the system-level locks */
-  if( (unlockMask & allMask)!=unlockMask ){
-    rc = unixShmSystemLock(pShmNode, F_UNLCK, unlockMask & ~allMask);
-  }else{
-    rc = SQLITE_OK;
-  }
-
-  /* Undo the local locks */
-  if( rc==SQLITE_OK ){
-    p->exclMask &= ~unlockMask;
-    p->sharedMask &= ~unlockMask;
-  } 
-  return rc;
-}
-
-/*
-** Get reader locks for connection p on all locks in the readMask parameter.
-*/
-static int unixShmSharedLock(
-  unixShmNode *pShmNode,   /* The underlying shared-memory file */
-  unixShm *p,              /* The connection to get the shared locks */
-  u8 readMask              /* Mask of shared locks to be acquired */
-){
-  int rc;        /* Result code */
-  unixShm *pX;   /* For looping over all sibling connections */
-  u8 allShared;  /* Union of locks held by connections other than "p" */
-
-  /* Access to the unixShmNode object is serialized by the caller */
-  assert( sqlite3_mutex_held(pShmNode->mutex) );
-
-  /* Find out which shared locks are already held by sibling connections.
-  ** If any sibling already holds an exclusive lock, go ahead and return
-  ** SQLITE_BUSY.
-  */
-  allShared = 0;
-  for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
-    if( pX==p ) continue;
-    if( (pX->exclMask & readMask)!=0 ) return SQLITE_BUSY;
-    allShared |= pX->sharedMask;
-  }
-
-  /* Get shared locks at the system level, if necessary */
-  if( (~allShared) & readMask ){
-    rc = unixShmSystemLock(pShmNode, F_RDLCK, readMask);
-  }else{
-    rc = SQLITE_OK;
-  }
-
-  /* Get the local shared locks */
-  if( rc==SQLITE_OK ){
-    p->sharedMask |= readMask;
-  }
-  return rc;
-}
-
-/*
-** For connection p, get an exclusive lock on all locks identified in
-** the writeMask parameter.
-*/
-static int unixShmExclusiveLock(
-  unixShmNode *pShmNode,    /* The underlying shared-memory file */
-  unixShm *p,               /* The connection to get the exclusive locks */
-  u8 writeMask              /* Mask of exclusive locks to be acquired */
-){
-  int rc;        /* Result code */
-  unixShm *pX;   /* For looping over all sibling connections */
-
-  /* Access to the unixShmNode object is serialized by the caller */
-  assert( sqlite3_mutex_held(pShmNode->mutex) );
-
-  /* Make sure no sibling connections hold locks that will block this
-  ** lock.  If any do, return SQLITE_BUSY right away.
-  */
-  for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
-    if( pX==p ) continue;
-    if( (pX->exclMask & writeMask)!=0 ) return SQLITE_BUSY;
-    if( (pX->sharedMask & writeMask)!=0 ) return SQLITE_BUSY;
-  }
-
-  /* Get the exclusive locks at the system level.  Then if successful
-  ** also mark the local connection as being locked.
-  */
-  rc = unixShmSystemLock(pShmNode, F_WRLCK, writeMask);
-  if( rc==SQLITE_OK ){
-    p->sharedMask &= ~writeMask;
-    p->exclMask |= writeMask;
-  }
-  return rc;
-}
 
 /*
 ** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0.
 **
 ** This is not a VFS shared-memory method; it is a utility function called
@@ -3518,17 +3345,17 @@
 
     /* Check to see if another process is holding the dead-man switch.
     ** If not, truncate the file to zero length. 
     */
     rc = SQLITE_OK;
-    if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS)==SQLITE_OK ){
+    if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS, 1)==SQLITE_OK ){
       if( ftruncate(pShmNode->h, 0) ){
         rc = SQLITE_IOERR;
       }
     }
     if( rc==SQLITE_OK ){
-      rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS);
+      rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS, 1);
     }
     if( rc ) goto shm_open_err;
   }
 
   /* Make the new connection a child of the unixShmNode */
@@ -3570,21 +3397,18 @@
   pShmNode = p->pShmNode;
 
   assert( pShmNode==pDbFd->pInode->pShmNode );
   assert( pShmNode->pInode==pDbFd->pInode );
 
-  /* Verify that the connection being closed holds no locks */
-  assert( p->exclMask==0 );
-  assert( p->sharedMask==0 );
-
   /* Remove connection p from the set of connections associated
   ** with pShmNode */
   sqlite3_mutex_enter(pShmNode->mutex);
   for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){}
   *pp = p->pNext;
 
   /* Free the connection p */
+  assert( p->hasMutexBuf==0 );
   sqlite3_free(p);
   pDbFd->pShm = 0;
   sqlite3_mutex_leave(pShmNode->mutex);
 
   /* If pShmNode->nRef has reached 0, then close the underlying
@@ -3639,10 +3463,31 @@
     reqSize = -1;
   }
   return rc;
 }
 
+/*
+** Release the lock held on the shared memory segment so that other
+** threads are free to resize it if necessary.
+**
+** If the lock is not currently held, this routine is a harmless no-op.
+**
+** The lock in question is the pShmNode->mutexBuf mutex; whether this
+** connection holds it is tracked by the hasMutexBuf flag.  This routine
+** always returns SQLITE_OK.
+*/
+static int unixShmRelease(sqlite3_file *fd){
+  unixFile *pDbFd = (unixFile*)fd;
+  unixShm *p = pDbFd->pShm;
+
+  if( p->hasMutexBuf ){
+    assert( sqlite3_mutex_notheld(p->pShmNode->mutex) );
+    sqlite3_mutex_leave(p->pShmNode->mutexBuf);
+    p->hasMutexBuf = 0;
+  }
+  return SQLITE_OK;
+}
 
 /*
 ** Map the shared storage into memory. 
 **
 ** If reqMapSize is positive, then an attempt is made to make the
@@ -3685,11 +3530,11 @@
   int rc = SQLITE_OK;
 
   assert( pShmNode==pDbFd->pInode->pShmNode );
   assert( pShmNode->pInode==pDbFd->pInode );
 
-  if( p->lockState!=SQLITE_SHM_CHECKPOINT && p->hasMutexBuf==0 ){
+  if( p->hasMutexBuf==0 ){
     assert( sqlite3_mutex_notheld(pShmNode->mutex) );
     sqlite3_mutex_enter(pShmNode->mutexBuf);
     p->hasMutexBuf = 1;
   }
   sqlite3_mutex_enter(pShmNode->mutex);
@@ -3712,176 +3557,127 @@
     }
   }
   *pNewMapSize = pShmNode->szMap;
   *ppBuf = pShmNode->pMMapBuf;
   sqlite3_mutex_leave(pShmNode->mutex);
+  if( *ppBuf==0 ){
+    /* Do not hold the mutex if a NULL pointer is being returned. */
+    unixShmRelease(fd);
+  }
   return rc;
 }
 
-/*
-** Release the lock held on the shared memory segment to that other
-** threads are free to resize it if necessary.
-**
-** If the lock is not currently held, this routine is a harmless no-op.
-**
-** If the shared-memory object is in lock state RECOVER, then we do not
-** really want to release the lock, so in that case too, this routine
-** is a no-op.
-*/
-static int unixShmRelease(sqlite3_file *fd){
-  unixFile *pDbFd = (unixFile*)fd;
-  unixShm *p = pDbFd->pShm;
-
-  if( p->hasMutexBuf && p->lockState!=SQLITE_SHM_RECOVER ){
-    assert( sqlite3_mutex_notheld(p->pShmNode->mutex) );
-    sqlite3_mutex_leave(p->pShmNode->mutexBuf);
-    p->hasMutexBuf = 0;
-  }
-  return SQLITE_OK;
-}
-
-/*
-** Symbolic names for LOCK states used for debugging.
-*/
-#ifdef SQLITE_DEBUG
-static const char *azLkName[] = {
-  "UNLOCK",
-  "READ",
-  "READ_FULL",
-  "WRITE",
-  "PENDING",
-  "CHECKPOINT",
-  "RECOVER"
-};
-#endif
-
 
 /*
 ** Change the lock state for a shared-memory segment.
+**
+** Note that the relationship between SHARED and EXCLUSIVE locks is a little
+** different here than in posix.  In xShmLock(), one can go from unlocked
+** to shared and back or from unlocked to exclusive and back.  But one may
+** not go from shared to exclusive or from exclusive to shared.
 */
 static int unixShmLock(
   sqlite3_file *fd,          /* Database file holding the shared memory */
-  int desiredLock,           /* One of SQLITE_SHM_xxxxx locking states */
-  int *pGotLock              /* The lock you actually got */
+  int ofst,                  /* First lock to acquire or release */
+  int n,                     /* Number of locks to acquire or release */
+  int flags                  /* What to do with the lock */
 ){
-  unixFile *pDbFd = (unixFile*)fd;
-  unixShm *p = pDbFd->pShm;
-  unixShmNode *pShmNode = p->pShmNode;
-  int rc = SQLITE_PROTOCOL;
+  unixFile *pDbFd = (unixFile*)fd;      /* Connection holding shared memory */
+  unixShm *p = pDbFd->pShm;             /* The shared memory being locked */
+  unixShm *pX;                          /* For looping over all siblings */
+  unixShmNode *pShmNode = p->pShmNode;  /* The underlying file iNode */
+  int rc = SQLITE_OK;                   /* Result code */
+  u16 mask;                             /* Mask of locks to take or release */
 
   assert( pShmNode==pDbFd->pInode->pShmNode );
   assert( pShmNode->pInode==pDbFd->pInode );
-
-  /* Note that SQLITE_SHM_READ_FULL and SQLITE_SHM_PENDING are never
-  ** directly requested; they are side effects from requesting
-  ** SQLITE_SHM_READ and SQLITE_SHM_CHECKPOINT, respectively.
-  */
-  assert( desiredLock==SQLITE_SHM_UNLOCK
-       || desiredLock==SQLITE_SHM_READ
-       || desiredLock==SQLITE_SHM_WRITE
-       || desiredLock==SQLITE_SHM_CHECKPOINT
-       || desiredLock==SQLITE_SHM_RECOVER );
-
-  /* Return directly if this is just a lock state query, or if
-  ** the connection is already in the desired locking state.
-  */
-  if( desiredLock==p->lockState
-   || (desiredLock==SQLITE_SHM_READ && p->lockState==SQLITE_SHM_READ_FULL)
-  ){
-    OSTRACE(("SHM-LOCK shmid-%d, pid-%d request %s and got %s\n",
-             p->id, getpid(), azLkName[desiredLock], azLkName[p->lockState]));
-    if( pGotLock ) *pGotLock = p->lockState;
-    return SQLITE_OK;
-  }
-
-  OSTRACE(("SHM-LOCK shmid-%d, pid-%d request %s->%s\n",
-            p->id, getpid(), azLkName[p->lockState], azLkName[desiredLock]));
-  
-  if( desiredLock==SQLITE_SHM_RECOVER && !p->hasMutexBuf ){
-    assert( sqlite3_mutex_notheld(pShmNode->mutex) );
-    sqlite3_mutex_enter(pShmNode->mutexBuf);
-    p->hasMutexBuf = 1;
-  }
-  sqlite3_mutex_enter(pShmNode->mutex);
-  switch( desiredLock ){
-    case SQLITE_SHM_UNLOCK: {
-      assert( p->lockState!=SQLITE_SHM_RECOVER );
-      unixShmUnlock(pShmNode, p, UNIX_SHM_A|UNIX_SHM_B|UNIX_SHM_C|UNIX_SHM_D);
-      rc = SQLITE_OK;
-      p->lockState = SQLITE_SHM_UNLOCK;
-      break;
-    }
-    case SQLITE_SHM_READ: {
-      if( p->lockState==SQLITE_SHM_UNLOCK ){
-        int nAttempt;
-        rc = SQLITE_BUSY;
-        assert( p->lockState==SQLITE_SHM_UNLOCK );
-        for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
-          rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_A|UNIX_SHM_B);
-          if( rc==SQLITE_BUSY ){
-            rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_D);
-            if( rc==SQLITE_OK ){
-              p->lockState = SQLITE_SHM_READ_FULL;
-            }
-          }else{
-            unixShmUnlock(pShmNode, p, UNIX_SHM_B);
-            p->lockState = SQLITE_SHM_READ;
-          }
-        }
-      }else{
-       assert( p->lockState==SQLITE_SHM_WRITE
-               || p->lockState==SQLITE_SHM_RECOVER );
-        rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_A);
-        unixShmUnlock(pShmNode, p, UNIX_SHM_C|UNIX_SHM_D);
-        p->lockState = SQLITE_SHM_READ;
-      }
-      break;
-    }
-    case SQLITE_SHM_WRITE: {
-      assert( p->lockState==SQLITE_SHM_READ 
-              || p->lockState==SQLITE_SHM_READ_FULL );
-      rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_C|UNIX_SHM_D);
-      if( rc==SQLITE_OK ){
-        p->lockState = SQLITE_SHM_WRITE;
-      }
-      break;
-    }
-    case SQLITE_SHM_CHECKPOINT: {
-      assert( p->lockState==SQLITE_SHM_UNLOCK
-           || p->lockState==SQLITE_SHM_PENDING
-      );
-      if( p->lockState==SQLITE_SHM_UNLOCK ){
-        rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_B|UNIX_SHM_C);
-        if( rc==SQLITE_OK ){
-          p->lockState = SQLITE_SHM_PENDING;
-        }
-      }
-      if( p->lockState==SQLITE_SHM_PENDING ){
-        rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_A);
-        if( rc==SQLITE_OK ){
-          p->lockState = SQLITE_SHM_CHECKPOINT;
-        }
-      }
-      break;
-    }
-    default: {
-      assert( desiredLock==SQLITE_SHM_RECOVER );
-      assert( p->lockState==SQLITE_SHM_READ
-           || p->lockState==SQLITE_SHM_READ_FULL
-      );
-      assert( sqlite3_mutex_held(pShmNode->mutexBuf) );
-      rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_C);
-      if( rc==SQLITE_OK ){
-        p->lockState = SQLITE_SHM_RECOVER;
-      }
-      break;
+  assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK );
+  assert( n>=1 );
+  assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)
+       || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)
+       || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)
+       || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) );
+  assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 );
+
+  mask = (1<<(ofst+n)) - (1<<ofst);
+  assert( n>1 || mask==(1<<ofst) );
+  sqlite3_mutex_enter(pShmNode->mutex);
+  if( flags & SQLITE_SHM_UNLOCK ){
+    u16 allMask = 0; /* Mask of locks held by siblings */
+
+    /* See if any siblings hold this same lock */
+    for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
+      if( pX==p ) continue;
+      assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 );
+      allMask |= pX->sharedMask;
+    }
+
+    /* Unlock the system-level locks */
+    if( (mask & allMask)==0 ){
+      rc = unixShmSystemLock(pShmNode, F_UNLCK, ofst+UNIX_SHM_BASE, n);
+    }else{
+      rc = SQLITE_OK;
+    }
+
+    /* Undo the local locks */
+    if( rc==SQLITE_OK ){
+      p->exclMask &= ~mask;
+      p->sharedMask &= ~mask;
+    } 
+  }else if( flags & SQLITE_SHM_SHARED ){
+    u16 allShared = 0;  /* Union of locks held by connections other than "p" */
+
+    /* Find out which shared locks are already held by sibling connections.
+    ** If any sibling already holds an exclusive lock, go ahead and return
+    ** SQLITE_BUSY.
+    */
+    for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
+      if( (pX->exclMask & mask)!=0 ){
+        rc = SQLITE_BUSY;
+        break;
+      }
+      allShared |= pX->sharedMask;
+    }
+
+    /* Get shared locks at the system level, if necessary */
+    if( rc==SQLITE_OK ){
+      if( (allShared & mask)==0 ){
+        rc = unixShmSystemLock(pShmNode, F_RDLCK, ofst+UNIX_SHM_BASE, n);
+      }else{
+        rc = SQLITE_OK;
+      }
+    }
+
+    /* Get the local shared locks */
+    if( rc==SQLITE_OK ){
+      p->sharedMask |= mask;
+    }
+  }else{
+    /* Make sure no sibling connections hold locks that will block this
+    ** lock.  If any do, return SQLITE_BUSY right away.
+    */
+    for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
+      if( (pX->exclMask & mask)!=0 || (pX->sharedMask & mask)!=0 ){
+        rc = SQLITE_BUSY;
+        break;
+      }
+    }
+  
+    /* Get the exclusive locks at the system level.  Then if successful
+    ** also mark the local connection as being locked.
+    */
+    if( rc==SQLITE_OK ){
+      rc = unixShmSystemLock(pShmNode, F_WRLCK, ofst+UNIX_SHM_BASE, n);
+      if( rc==SQLITE_OK ){
+        assert( (p->sharedMask & mask)==0 );
+        p->exclMask |= mask;
+      }
     }
   }
   sqlite3_mutex_leave(pShmNode->mutex);
-  OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %s\n",
-           p->id, getpid(), azLkName[p->lockState]));
-  if( pGotLock ) *pGotLock = p->lockState;
+  OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %03x,%03x\n",
+           p->id, getpid(), p->sharedMask, p->exclMask));
   return rc;
 }
 
 /*
 ** Implement a memory barrier or memory fence on shared memory.  
@@ -3890,16 +3686,12 @@
 ** any load or store begun after the barrier.
 */
 static void unixShmBarrier(
   sqlite3_file *fd           /* Database file holding the shared memory */
 ){
-#ifdef __GNUC__
-  __sync_synchronize();
-#else
-  unixMutexEnter();
-  unixMutexLeave();
-#endif
+  unixEnterMutex();
+  unixLeaveMutex();
 }
 
 
 #else
 # define unixShmOpen    0

Index: src/os_win.c
==================================================================
--- src/os_win.c
+++ src/os_win.c
@@ -1225,12 +1225,10 @@
   char *pMMapBuf;            /* Where currently mmapped().  NULL if unmapped */
   int nRef;                  /* Number of winShm objects pointing to this */
   winShm *pFirst;            /* All winShm objects pointing to this */
   winShmNode *pNext;         /* Next in list of all winShmNode objects */
 #ifdef SQLITE_DEBUG
-  u8 exclMask;               /* Mask of exclusive locks held */
-  u8 sharedMask;             /* Mask of shared locks held */
   u8 nextShmId;              /* Next available winShm.id value */
 #endif
 };
 
 /*
@@ -1251,310 +1249,64 @@
 ** and is read-only thereafter.
 */
 struct winShm {
   winShmNode *pShmNode;      /* The underlying winShmNode object */
   winShm *pNext;             /* Next winShm with the same winShmNode */
-  u8 lockState;              /* Current lock state */
   u8 hasMutex;               /* True if holding the winShmNode mutex */
   u8 hasMutexBuf;            /* True if holding pFile->mutexBuf */
-  u8 sharedMask;             /* Mask of shared locks held */
-  u8 exclMask;               /* Mask of exclusive locks held */
 #ifdef SQLITE_DEBUG
   u8 id;                     /* Id of this connection with its winShmNode */
 #endif
 };
 
-/*
-** Size increment by which shared memory grows
-*/
-#define SQLITE_WIN_SHM_INCR  4096
-
 /*
 ** Constants used for locking
 */
-#define WIN_SHM_BASE      80        /* Byte offset of the first lock byte */
-#define WIN_SHM_DMS       0x01      /* Mask for Dead-Man-Switch lock */
-#define WIN_SHM_A         0x10      /* Mask for region locks... */
-#define WIN_SHM_B         0x20
-#define WIN_SHM_C         0x40
-#define WIN_SHM_D         0x80
-
-#ifdef SQLITE_DEBUG
-/*
-** Return a pointer to a nul-terminated string in static memory that
-** describes a locking mask.  The string is of the form "MSABCD" with
-** each character representing a lock.  "M" for MUTEX, "S" for DMS, 
-** and "A" through "D" for the region locks.  If a lock is held, the
-** letter is shown.  If the lock is not held, the letter is converted
-** to ".".
-**
-** This routine is for debugging purposes only and does not appear
-** in a production build.
-*/
-static const char *winShmLockString(u8 mask){
-  static char zBuf[48];
-  static int iBuf = 0;
-  char *z;
-
-  z = &zBuf[iBuf];
-  iBuf += 8;
-  if( iBuf>=sizeof(zBuf) ) iBuf = 0;
-
-  z[0] = (mask & WIN_SHM_DMS)   ? 'S' : '.';
-  z[1] = (mask & WIN_SHM_A)     ? 'A' : '.';
-  z[2] = (mask & WIN_SHM_B)     ? 'B' : '.';
-  z[3] = (mask & WIN_SHM_C)     ? 'C' : '.';
-  z[4] = (mask & WIN_SHM_D)     ? 'D' : '.';
-  z[5] = 0;
-  return z;
-}
-#endif /* SQLITE_DEBUG */
-
-/*
-** Apply posix advisory locks for all bytes identified in lockMask.
-**
-** lockMask might contain multiple bits but all bits are guaranteed
-** to be contiguous.
-**
-** Locks block if the mask is exactly WIN_SHM_C and are non-blocking
-** otherwise.
+#define WIN_SHM_BASE   ((18+SQLITE_SHM_NLOCK)*4)        /* first lock byte */
+#define WIN_SHM_DMS    (WIN_SHM_BASE+SQLITE_SHM_NLOCK)  /* deadman switch */
+
+/*
+** Apply advisory locks for all n bytes beginning at ofst.
 */
 #define _SHM_UNLCK  1
 #define _SHM_RDLCK  2
 #define _SHM_WRLCK  3
 static int winShmSystemLock(
   winShmNode *pFile,    /* Apply locks to this open shared-memory segment */
   int lockType,         /* _SHM_UNLCK, _SHM_RDLCK, or _SHM_WRLCK */
-  u8 lockMask           /* Which bytes to lock or unlock */
+  int ofst,             /* Offset to first byte to be locked/unlocked */
+  int nByte             /* Number of bytes to lock or unlock */
 ){
   OVERLAPPED ovlp;
   DWORD dwFlags;
-  int nBytes;           /* Number of bytes to lock */
-  int i;                /* Offset into the locking byte range */
   int rc = 0;           /* Result code form Lock/UnlockFileEx() */
-  u8 mask;              /* Mask of bits in lockMask */
 
   /* Access to the winShmNode object is serialized by the caller */
   assert( sqlite3_mutex_held(pFile->mutex) || pFile->nRef==0 );
 
   /* Initialize the locking parameters */
-  if( lockMask==WIN_SHM_C && lockType!=_SHM_UNLCK ){
-    dwFlags = 0;
-    OSTRACE(("SHM-LOCK %d requesting blocking lock %s\n", 
-             pFile->hFile.h,
-             winShmLockString(lockMask)));
-  }else{
-    dwFlags = LOCKFILE_FAIL_IMMEDIATELY;
-    OSTRACE(("SHM-LOCK %d requesting %s %s\n", 
-             pFile->hFile.h,
-             lockType!=_SHM_UNLCK ? "lock" : "unlock", 
-             winShmLockString(lockMask)));
-  }
+  dwFlags = LOCKFILE_FAIL_IMMEDIATELY;
   if( lockType == _SHM_WRLCK ) dwFlags |= LOCKFILE_EXCLUSIVE_LOCK;
 
   /* Find the first bit in lockMask that is set */
-  for(i=0, mask=0x01; mask!=0 && (lockMask&mask)==0; mask <<= 1, i++){}
-  assert( mask!=0 );
   memset(&ovlp, 0, sizeof(OVERLAPPED));
-  ovlp.Offset = i+WIN_SHM_BASE;
-  nBytes = 1;
-
-  /* Extend the locking range for each additional bit that is set */
-  mask <<= 1;
-  while( mask!=0 && (lockMask & mask)!=0 ){
-    nBytes++;
-    mask <<= 1;
-  }
-
-  /* Verify that all bits set in lockMask are contiguous */
-  assert( mask==0 || (lockMask & ~(mask | (mask-1)))==0 );
+  ovlp.Offset = ofst;
 
   /* Release/Acquire the system-level lock */
   if( lockType==_SHM_UNLCK ){
-    for(i=0; i<nBytes; i++, ovlp.Offset++){
-      rc = UnlockFileEx(pFile->hFile.h, 0, 1, 0, &ovlp);
-      if( !rc ) break;
-    }
+    rc = UnlockFileEx(pFile->hFile.h, 0, nByte, 0, &ovlp);
   }else{
-    /* release old individual byte locks (if any)
-    ** and set new individual byte locks */
-    for(i=0; i<nBytes; i++, ovlp.Offset++){
-      UnlockFileEx(pFile->hFile.h, 0, 1, 0, &ovlp);
-      rc = LockFileEx(pFile->hFile.h, dwFlags, 0, 1, 0, &ovlp);
-      if( !rc ) break;
-    }
+    rc = LockFileEx(pFile->hFile.h, dwFlags, 0, nByte, 0, &ovlp);
   }
   if( !rc ){
     OSTRACE(("SHM-LOCK %d %s ERROR 0x%08lx\n", 
              pFile->hFile.h,
              lockType==_SHM_UNLCK ? "UnlockFileEx" : "LockFileEx",
              GetLastError()));
-    /* release individual byte locks (if any) */
-    ovlp.Offset-=i;
-    for(i=0; i<nBytes; i++, ovlp.Offset++){
-      UnlockFileEx(pFile->hFile.h, 0, 1, 0, &ovlp);
-    }
   }
   rc = (rc!=0) ? SQLITE_OK : SQLITE_BUSY;
 
-  /* Update the global lock state and do debug tracing */
-#ifdef SQLITE_DEBUG
-  OSTRACE(("SHM-LOCK %d ", pFile->hFile.h));
-  if( rc==SQLITE_OK ){
-    if( lockType==_SHM_UNLCK ){
-      OSTRACE(("unlock ok"));
-      pFile->exclMask &= ~lockMask;
-      pFile->sharedMask &= ~lockMask;
-    }else if( lockType==_SHM_RDLCK ){
-      OSTRACE(("read-lock ok"));
-      pFile->exclMask &= ~lockMask;
-      pFile->sharedMask |= lockMask;
-    }else{
-      assert( lockType==_SHM_WRLCK );
-      OSTRACE(("write-lock ok"));
-      pFile->exclMask |= lockMask;
-      pFile->sharedMask &= ~lockMask;
-    }
-  }else{
-    if( lockType==_SHM_UNLCK ){
-      OSTRACE(("unlock failed"));
-    }else if( lockType==_SHM_RDLCK ){
-      OSTRACE(("read-lock failed"));
-    }else{
-      assert( lockType==_SHM_WRLCK );
-      OSTRACE(("write-lock failed"));
-    }
-  }
-  OSTRACE((" - change requested %s - afterwards %s:%s\n",
-           winShmLockString(lockMask),
-           winShmLockString(pFile->sharedMask),
-           winShmLockString(pFile->exclMask)));
-#endif
-
-  return rc;
-}
-
-/*
-** For connection p, unlock all of the locks identified by the unlockMask
-** parameter.
-*/
-static int winShmUnlock(
-  winShmNode *pFile,   /* The underlying shared-memory file */
-  winShm *p,           /* The connection to be unlocked */
-  u8 unlockMask         /* Mask of locks to be unlocked */
-){
-  int rc;      /* Result code */
-  winShm *pX; /* For looping over all sibling connections */
-  u8 allMask;  /* Union of locks held by connections other than "p" */
-
-  /* Access to the winShmNode object is serialized by the caller */
-  assert( sqlite3_mutex_held(pFile->mutex) );
-
-  /* don't attempt to unlock anything we don't have locks for */
-  if( (unlockMask & (p->exclMask|p->sharedMask)) != unlockMask ){
-    OSTRACE(("SHM-LOCK %d unlocking more than we have locked - requested %s - have %s\n",
-             pFile->hFile.h,
-             winShmLockString(unlockMask),
-             winShmLockString(p->exclMask|p->sharedMask)));
-    unlockMask &= (p->exclMask|p->sharedMask);
-  }
-
-  /* Compute locks held by sibling connections */
-  allMask = 0;
-  for(pX=pFile->pFirst; pX; pX=pX->pNext){
-    if( pX==p ) continue;
-    assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 );
-    allMask |= pX->sharedMask;
-  }
-
-  /* Unlock the system-level locks */
-  if( (unlockMask & allMask)!=unlockMask ){
-    rc = winShmSystemLock(pFile, _SHM_UNLCK, unlockMask & ~allMask);
-  }else{
-    rc = SQLITE_OK;
-  }
-
-  /* Undo the local locks */
-  if( rc==SQLITE_OK ){
-    p->exclMask &= ~unlockMask;
-    p->sharedMask &= ~unlockMask;
-  } 
-  return rc;
-}
-
-/*
-** Get reader locks for connection p on all locks in the readMask parameter.
-*/
-static int winShmSharedLock(
-  winShmNode *pFile,   /* The underlying shared-memory file */
-  winShm *p,           /* The connection to get the shared locks */
-  u8 readMask           /* Mask of shared locks to be acquired */
-){
-  int rc;        /* Result code */
-  winShm *pX;   /* For looping over all sibling connections */
-  u8 allShared;  /* Union of locks held by connections other than "p" */
-
-  /* Access to the winShmNode object is serialized by the caller */
-  assert( sqlite3_mutex_held(pFile->mutex) );
-
-  /* Find out which shared locks are already held by sibling connections.
-  ** If any sibling already holds an exclusive lock, go ahead and return
-  ** SQLITE_BUSY.
-  */
-  allShared = 0;
-  for(pX=pFile->pFirst; pX; pX=pX->pNext){
-    if( pX==p ) continue;
-    if( (pX->exclMask & readMask)!=0 ) return SQLITE_BUSY;
-    allShared |= pX->sharedMask;
-  }
-
-  /* Get shared locks at the system level, if necessary */
-  if( (~allShared) & readMask ){
-    rc = winShmSystemLock(pFile, _SHM_RDLCK, readMask);
-  }else{
-    rc = SQLITE_OK;
-  }
-
-  /* Get the local shared locks */
-  if( rc==SQLITE_OK ){
-    p->sharedMask |= readMask;
-  }
-  return rc;
-}
-
-/*
-** For connection p, get an exclusive lock on all locks identified in
-** the writeMask parameter.
-*/
-static int winShmExclusiveLock(
-  winShmNode *pFile,    /* The underlying shared-memory file */
-  winShm *p,            /* The connection to get the exclusive locks */
-  u8 writeMask           /* Mask of exclusive locks to be acquired */
-){
-  int rc;        /* Result code */
-  winShm *pX;   /* For looping over all sibling connections */
-
-  /* Access to the winShmNode object is serialized by the caller */
-  assert( sqlite3_mutex_held(pFile->mutex) );
-
-  /* Make sure no sibling connections hold locks that will block this
-  ** lock.  If any do, return SQLITE_BUSY right away.
-  */
-  for(pX=pFile->pFirst; pX; pX=pX->pNext){
-    if( pX==p ) continue;
-    if( (pX->exclMask & writeMask)!=0 ) return SQLITE_BUSY;
-    if( (pX->sharedMask & writeMask)!=0 ) return SQLITE_BUSY;
-  }
-
-  /* Get the exclusive locks at the system level.  Then if successful
-  ** also mark the local connection as being locked.
-  */
-  rc = winShmSystemLock(pFile, _SHM_WRLCK, writeMask);
-  if( rc==SQLITE_OK ){
-    p->sharedMask &= ~writeMask;
-    p->exclMask |= writeMask;
-  }
   return rc;
 }
 
 /*
 ** Purge the winShmNodeList list of all entries with winShmNode.nRef==0.
@@ -1678,15 +1430,16 @@
     }
 
     /* Check to see if another process is holding the dead-man switch.
     ** If not, truncate the file to zero length. 
     */
-    if( winShmSystemLock(pShmNode, _SHM_WRLCK, WIN_SHM_DMS)==SQLITE_OK ){
+    if( winShmSystemLock(pShmNode, _SHM_WRLCK, WIN_SHM_DMS, 1)==SQLITE_OK ){
       rc = winTruncate((sqlite3_file *)&pShmNode->hFile, 0);
     }
     if( rc==SQLITE_OK ){
-      rc = winShmSystemLock(pShmNode, _SHM_RDLCK, WIN_SHM_DMS);
+      winShmSystemLock(pShmNode, _SHM_UNLCK, WIN_SHM_DMS, 1);
+      rc = winShmSystemLock(pShmNode, _SHM_RDLCK, WIN_SHM_DMS, 1);
     }
     if( rc ) goto shm_open_err;
   }
 
   /* Make the new connection a child of the winShmNode */
@@ -1701,11 +1454,11 @@
   winShmLeaveMutex();
   return SQLITE_OK;
 
   /* Jump here on any error */
 shm_open_err:
-  winShmSystemLock(pShmNode, _SHM_UNLCK, WIN_SHM_DMS);
+  winShmSystemLock(pShmNode, _SHM_UNLCK, WIN_SHM_DMS, 1);
   winShmPurge();                 /* This call frees pShmNode if required */
   sqlite3_free(p);
   sqlite3_free(pNew);
   winShmLeaveMutex();
   return rc;
@@ -1726,14 +1479,10 @@
 
   pDbFd = (winFile*)fd;
   p = pDbFd->pShm;
   pShmNode = p->pShmNode;
 
-  /* Verify that the connection being closed holds no locks */
-  assert( p->exclMask==0 );
-  assert( p->sharedMask==0 );
-
   /* Remove connection p from the set of connections associated
   ** with pShmNode */
   sqlite3_mutex_enter(pShmNode->mutex);
   for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){}
   *pp = p->pNext;
@@ -1780,16 +1529,12 @@
 
   *pNewSize = 0;
   if( reqSize>=0 ){
     sqlite3_int64 sz;
     rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz);
-    if( SQLITE_OK==rc ){
-      reqSize = (reqSize + SQLITE_WIN_SHM_INCR - 1)/SQLITE_WIN_SHM_INCR;
-      reqSize *= SQLITE_WIN_SHM_INCR;
-      if( reqSize>sz ){
-        rc = winTruncate((sqlite3_file *)&pShmNode->hFile, reqSize);
-      }
+    if( SQLITE_OK==rc && reqSize>sz ){
+      rc = winTruncate((sqlite3_file *)&pShmNode->hFile, reqSize);
     }
   }
   if( SQLITE_OK==rc ){
     sqlite3_int64 sz;
     rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz);
@@ -1831,18 +1576,18 @@
 */
 static int winShmGet(
   sqlite3_file *fd,        /* The database file holding the shared memory */
   int reqMapSize,          /* Requested size of mapping. -1 means don't care */
   int *pNewMapSize,        /* Write new size of mapping here */
-  void **ppBuf             /* Write mapping buffer origin here */
+  void volatile **ppBuf    /* Write mapping buffer origin here */
 ){
   winFile *pDbFd = (winFile*)fd;
   winShm *p = pDbFd->pShm;
   winShmNode *pShmNode = p->pShmNode;
   int rc = SQLITE_OK;
 
-  if( p->lockState!=SQLITE_SHM_CHECKPOINT && p->hasMutexBuf==0 ){
+  if( p->hasMutexBuf==0 ){
     assert( sqlite3_mutex_notheld(pShmNode->mutex) );
     sqlite3_mutex_enter(pShmNode->mutexBuf);
     p->hasMutexBuf = 1;
   }
   sqlite3_mutex_enter(pShmNode->mutex);
@@ -1918,162 +1663,52 @@
 ** is a no-op.
 */
 static int winShmRelease(sqlite3_file *fd){
   winFile *pDbFd = (winFile*)fd;
   winShm *p = pDbFd->pShm;
-  if( p->hasMutexBuf && p->lockState!=SQLITE_SHM_RECOVER ){
+  if( p->hasMutexBuf ){
     winShmNode *pShmNode = p->pShmNode;
     assert( sqlite3_mutex_notheld(pShmNode->mutex) );
     sqlite3_mutex_leave(pShmNode->mutexBuf);
     p->hasMutexBuf = 0;
   }
   return SQLITE_OK;
 }
-
-/*
-** Symbolic names for LOCK states used for debugging.
-*/
-#ifdef SQLITE_DEBUG
-static const char *azLkName[] = {
-  "UNLOCK",
-  "READ",
-  "READ_FULL",
-  "WRITE",
-  "PENDING",
-  "CHECKPOINT",
-  "RECOVER"
-};
-#endif
-
 
 /*
 ** Change the lock state for a shared-memory segment.
 */
 static int winShmLock(
-  sqlite3_file *fd,          /* Database holding the shared memory */
-  int desiredLock,           /* One of SQLITE_SHM_xxxxx locking states */
-  int *pGotLock              /* The lock you actually got */
+  sqlite3_file *fd,          /* Database file holding the shared memory */
+  int ofst,                  /* First lock to acquire or release */
+  int n,                     /* Number of locks to acquire or release */
+  int flags                  /* What to do with the lock */
 ){
   winFile *pDbFd = (winFile*)fd;
   winShm *p = pDbFd->pShm;
   winShmNode *pShmNode = p->pShmNode;
   int rc = SQLITE_PROTOCOL;
 
-  /* Note that SQLITE_SHM_READ_FULL and SQLITE_SHM_PENDING are never
-  ** directly requested; they are side effects from requesting
-  ** SQLITE_SHM_READ and SQLITE_SHM_CHECKPOINT, respectively.
-  */
-  assert( desiredLock==SQLITE_SHM_UNLOCK
-       || desiredLock==SQLITE_SHM_READ
-       || desiredLock==SQLITE_SHM_WRITE
-       || desiredLock==SQLITE_SHM_CHECKPOINT
-       || desiredLock==SQLITE_SHM_RECOVER );
-
-  /* Return directly if this is just a lock state query, or if
-  ** the connection is already in the desired locking state.
-  */
-  if( desiredLock==p->lockState
-   || (desiredLock==SQLITE_SHM_READ && p->lockState==SQLITE_SHM_READ_FULL)
-  ){
-    OSTRACE(("SHM-LOCK %d shmid-%d, pid-%d request %s and got %s\n",
-             pShmNode->hFile.h,
-             p->id, (int)GetCurrentProcessId(), azLkName[desiredLock],
-             azLkName[p->lockState]));
-    if( pGotLock ) *pGotLock = p->lockState;
-    return SQLITE_OK;
-  }
-
-  OSTRACE(("SHM-LOCK %d shmid-%d, pid-%d request %s->%s\n",
-           pShmNode->hFile.h,
-           p->id, (int)GetCurrentProcessId(), azLkName[p->lockState], 
-           azLkName[desiredLock]));
-  
-  if( desiredLock==SQLITE_SHM_RECOVER && !p->hasMutexBuf ){
-    assert( sqlite3_mutex_notheld(pShmNode->mutex) );
-    sqlite3_mutex_enter(pShmNode->mutexBuf);
-    p->hasMutexBuf = 1;
-  }
+  assert( ofst>=0 && ofst+n<=SQLITE_SHM_NLOCK );
+  assert( n>=1 );
+  assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)
+       || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)
+       || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)
+       || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) );
+  assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 );
+
   sqlite3_mutex_enter(pShmNode->mutex);
-  switch( desiredLock ){
-    case SQLITE_SHM_UNLOCK: {
-      assert( p->lockState!=SQLITE_SHM_RECOVER );
-      winShmUnlock(pShmNode, p, WIN_SHM_A|WIN_SHM_B|WIN_SHM_C|WIN_SHM_D);
-      rc = SQLITE_OK;
-      p->lockState = SQLITE_SHM_UNLOCK;
-      break;
-    }
-    case SQLITE_SHM_READ: {
-      if( p->lockState==SQLITE_SHM_UNLOCK ){
-        int nAttempt;
-        rc = SQLITE_BUSY;
-        assert( p->lockState==SQLITE_SHM_UNLOCK );
-        for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
-          rc = winShmSharedLock(pShmNode, p, WIN_SHM_A|WIN_SHM_B);
-          if( rc==SQLITE_BUSY ){
-            rc = winShmSharedLock(pShmNode, p, WIN_SHM_D);
-            if( rc==SQLITE_OK ){
-              p->lockState = SQLITE_SHM_READ_FULL;
-            }
-          }else{
-            winShmUnlock(pShmNode, p, WIN_SHM_B);
-            p->lockState = SQLITE_SHM_READ;
-          }
-        }
-      }else{
-       assert( p->lockState==SQLITE_SHM_WRITE
-               || p->lockState==SQLITE_SHM_RECOVER );
-        rc = winShmSharedLock(pShmNode, p, WIN_SHM_A);
-        winShmUnlock(pShmNode, p, WIN_SHM_C|WIN_SHM_D);
-        p->lockState = SQLITE_SHM_READ;
-      }
-      break;
-    }
-    case SQLITE_SHM_WRITE: {
-      assert( p->lockState==SQLITE_SHM_READ 
-              || p->lockState==SQLITE_SHM_READ_FULL );
-      rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_C|WIN_SHM_D);
-      if( rc==SQLITE_OK ){
-        p->lockState = SQLITE_SHM_WRITE;
-      }
-      break;
-    }
-    case SQLITE_SHM_CHECKPOINT: {
-      assert( p->lockState==SQLITE_SHM_UNLOCK
-           || p->lockState==SQLITE_SHM_PENDING
-      );
-      if( p->lockState==SQLITE_SHM_UNLOCK ){
-        rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_B|WIN_SHM_C);
-        if( rc==SQLITE_OK ){
-          p->lockState = SQLITE_SHM_PENDING;
-        }
-      }
-      if( p->lockState==SQLITE_SHM_PENDING ){
-        rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_A);
-        if( rc==SQLITE_OK ){
-          p->lockState = SQLITE_SHM_CHECKPOINT;
-        }
-      }
-      break;
-    }
-    default: {
-      assert( desiredLock==SQLITE_SHM_RECOVER );
-      assert( p->lockState==SQLITE_SHM_READ
-           || p->lockState==SQLITE_SHM_READ_FULL
-      );
-      assert( sqlite3_mutex_held(pShmNode->mutexBuf) );
-      rc = winShmExclusiveLock(pShmNode, p, WIN_SHM_C);
-      if( rc==SQLITE_OK ){
-        p->lockState = SQLITE_SHM_RECOVER;
-      }
-      break;
-    }
+  if( flags & SQLITE_SHM_UNLOCK ){
+    rc = winShmSystemLock(pShmNode, _SHM_UNLCK, ofst+WIN_SHM_BASE, n);
+  }else if( flags & SQLITE_SHM_SHARED ){
+    rc = winShmSystemLock(pShmNode, _SHM_RDLCK, ofst+WIN_SHM_BASE, n);
+  }else{
+    rc = winShmSystemLock(pShmNode, _SHM_WRLCK, ofst+WIN_SHM_BASE, n);
   }
   sqlite3_mutex_leave(pShmNode->mutex);
-  OSTRACE(("SHM-LOCK %d shmid-%d, pid-%d got %s\n",
-           pShmNode->hFile.h, 
-           p->id, (int)GetCurrentProcessId(), azLkName[p->lockState]));
-  if( pGotLock ) *pGotLock = p->lockState;
+  OSTRACE(("SHM-LOCK shmid-%d, pid-%d %s\n",
+           p->id, (int)GetCurrentProcessId(), rc ? "failed" : "ok"));
   return rc;
 }
 
 /*
 ** Implement a memory barrier or memory fence on shared memory.  

Index: src/pager.c
==================================================================
--- src/pager.c
+++ src/pager.c
@@ -1201,11 +1201,11 @@
 #else
 # define pagerUseWal(x) 0
 # define pagerRollbackWal(x) 0
 # define pagerWalFrames(v,w,x,y,z) 0
 # define pagerOpenWalIfPresent(z) SQLITE_OK
-# define pagerOpenSnapshot(z) SQLITE_OK
+# define pagerBeginReadTransaction(z) SQLITE_OK
 #endif
 
 /*
 ** Unlock the database file. This function is a no-op if the pager
 ** is in exclusive mode.
@@ -1236,11 +1236,11 @@
     ** Clearing the page size cache here is being conservative.
     */
     pPager->dbSizeValid = 0;
 
     if( pagerUseWal(pPager) ){
-      sqlite3WalCloseSnapshot(pPager->pWal);
+      sqlite3WalEndReadTransaction(pPager->pWal);
     }else{
       rc = osUnlock(pPager->fd, NO_LOCK);
     }
     if( rc ){
       pPager->errCode = rc;
@@ -1435,21 +1435,20 @@
   pPager->pInJournal = 0;
   pPager->nRec = 0;
   sqlite3PcacheCleanAll(pPager->pPCache);
 
   if( pagerUseWal(pPager) ){
-    rc2 = sqlite3WalWriteLock(pPager->pWal, 0);
+    rc2 = sqlite3WalEndWriteTransaction(pPager->pWal);
     pPager->state = PAGER_SHARED;
 
     /* If the connection was in locking_mode=exclusive mode but is no longer,
     ** drop the EXCLUSIVE lock held on the database file.
     */
     if( rc2==SQLITE_OK 
      && !pPager->exclusiveMode 
-     && sqlite3WalExclusiveMode(pPager->pWal, -1) 
+     && sqlite3WalExclusiveMode(pPager->pWal, 0) 
     ){
-      sqlite3WalExclusiveMode(pPager->pWal, 0);
       rc2 = osUnlock(pPager->fd, SHARED_LOCK);
     }
   }else if( !pPager->exclusiveMode ){
     rc2 = osUnlock(pPager->fd, SHARED_LOCK);
     pPager->state = PAGER_SHARED;
@@ -2360,19 +2359,31 @@
   }
   return rc;
 }
 
 /*
-** Open a WAL snapshot on the log file this pager is connected to.
+** Begin a read transaction on the WAL.
+**
+** This routine used to be called "pagerOpenSnapshot()" because it essentially
+** makes a snapshot of the database at the current point in time and preserves
+** that snapshot for use by the reader in spite of concurrent changes by
+** other writers or checkpointers.
 */
-static int pagerOpenSnapshot(Pager *pPager){
+static int pagerBeginReadTransaction(Pager *pPager){
   int rc;                         /* Return code */
   int changed = 0;                /* True if cache must be reset */
 
   assert( pagerUseWal(pPager) );
 
-  rc = sqlite3WalOpenSnapshot(pPager->pWal, &changed);
+  /* sqlite3WalEndReadTransaction() was not called for the previous
+  ** transaction in locking_mode=EXCLUSIVE.  So call it now.  If we
+  ** are in locking_mode=NORMAL and EndRead() was previously called,
+  ** the duplicate call is harmless.
+  */
+  sqlite3WalEndReadTransaction(pPager->pWal);
+
+  rc = sqlite3WalBeginReadTransaction(pPager->pWal, &changed);
   if( rc==SQLITE_OK ){
     int dummy;
     if( changed ){
       pager_reset(pPager);
       assert( pPager->errCode || pPager->dbSizeValid==0 );
@@ -2426,11 +2437,11 @@
     if( rc==SQLITE_OK ){
       if( isWal ){
         pager_reset(pPager);
         rc = sqlite3PagerOpenWal(pPager, 0);
         if( rc==SQLITE_OK ){
-          rc = pagerOpenSnapshot(pPager);
+          rc = pagerBeginReadTransaction(pPager);
         }
       }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){
         pPager->journalMode = PAGER_JOURNALMODE_DELETE;
       }
     }
@@ -4000,11 +4011,11 @@
     pPager->errCode = SQLITE_OK;
     pager_reset(pPager);
   }
 
   if( pagerUseWal(pPager) ){
-    rc = pagerOpenSnapshot(pPager);
+    rc = pagerBeginReadTransaction(pPager);
   }else if( pPager->state==PAGER_UNLOCK || isErrorReset ){
     sqlite3_vfs * const pVfs = pPager->pVfs;
     int isHotJournal = 0;
     assert( !MEMDB );
     assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );
@@ -4539,11 +4550,11 @@
 
     if( pagerUseWal(pPager) ){
       /* If the pager is configured to use locking_mode=exclusive, and an
       ** exclusive lock on the database is not already held, obtain it now.
       */
-      if( pPager->exclusiveMode && !sqlite3WalExclusiveMode(pPager->pWal, -1) ){
+      if( pPager->exclusiveMode && sqlite3WalExclusiveMode(pPager->pWal, -1) ){
         rc = sqlite3OsLock(pPager->fd, EXCLUSIVE_LOCK);
         pPager->state = PAGER_SHARED;
         if( rc!=SQLITE_OK ){
           return rc;
         }
@@ -4559,11 +4570,11 @@
       ** transaction, but never to PAGER_EXCLUSIVE. This is because in 
       ** PAGER_EXCLUSIVE state the code to roll back savepoint transactions
       ** may copy data from the sub-journal into the database file as well
       ** as into the page cache. Which would be incorrect in WAL mode.
       */
-      rc = sqlite3WalWriteLock(pPager->pWal, 1);
+      rc = sqlite3WalBeginWriteTransaction(pPager->pWal);
       if( rc==SQLITE_OK ){
         pPager->dbOrigSize = pPager->dbSize;
         pPager->state = PAGER_RESERVED;
         pPager->journalOff = 0;
       }
@@ -5890,12 +5901,11 @@
   int rc = SQLITE_OK;
   if( pPager->pWal ){
     u8 *zBuf = (u8 *)pPager->pTmpSpace;
     rc = sqlite3WalCheckpoint(pPager->pWal,
         (pPager->noSync ? 0 : pPager->sync_flags),
-        pPager->pageSize, zBuf, 
-        pPager->xBusyHandler, pPager->pBusyHandlerArg
+        pPager->pageSize, zBuf
     );
   }
   return rc;
 }
 

Index: src/sqlite.h.in
==================================================================
--- src/sqlite.h.in
+++ src/sqlite.h.in
@@ -442,11 +442,12 @@
 #define SQLITE_IOERR_ACCESS            (SQLITE_IOERR | (13<<8))
 #define SQLITE_IOERR_CHECKRESERVEDLOCK (SQLITE_IOERR | (14<<8))
 #define SQLITE_IOERR_LOCK              (SQLITE_IOERR | (15<<8))
 #define SQLITE_IOERR_CLOSE             (SQLITE_IOERR | (16<<8))
 #define SQLITE_IOERR_DIR_CLOSE         (SQLITE_IOERR | (17<<8))
-#define SQLITE_LOCKED_SHAREDCACHE      (SQLITE_LOCKED | (1<<8) )
+#define SQLITE_LOCKED_SHAREDCACHE      (SQLITE_LOCKED |  (1<<8))
+#define SQLITE_BUSY_RECOVERY           (SQLITE_BUSY   |  (1<<8))
 
 /*
 ** CAPI3REF: Flags For File Open Operations
 **
 ** These bit values are intended for use in the
@@ -656,11 +657,11 @@
   /* Methods above are valid for version 1 */
   int (*xShmOpen)(sqlite3_file*);
   int (*xShmSize)(sqlite3_file*, int reqSize, int *pNewSize);
   int (*xShmGet)(sqlite3_file*, int reqSize, int *pSize, void volatile**);
   int (*xShmRelease)(sqlite3_file*);
-  int (*xShmLock)(sqlite3_file*, int desiredLock, int *gotLock);
+  int (*xShmLock)(sqlite3_file*, int offset, int n, int flags);
   void (*xShmBarrier)(sqlite3_file*);
   int (*xShmClose)(sqlite3_file*, int deleteFlag);
   /* Methods above are valid for version 2 */
   /* Additional methods may be added in future releases */
 };
@@ -886,20 +887,44 @@
 #define SQLITE_ACCESS_READ      2
 
 /*
 ** CAPI3REF: Flags for the xShmLock VFS method
 **
-** These integer constants define the various locking states that
-** an sqlite3_shm object can be in.
+** These integer constants define the various locking operations
+** allowed by the xShmLock method of [sqlite3_io_methods].  The
+** following are the only legal combinations of flags to the
+** xShmLock method:
+**
+** <ul>
+** <li>  SQLITE_SHM_LOCK | SQLITE_SHM_SHARED
+** <li>  SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE
+** <li>  SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED
+** <li>  SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE
+** </ul>
+**
+** When unlocking, the same SHARED or EXCLUSIVE flag must be supplied as
+** was given on the corresponding lock.  
+**
+** The xShmLock method can transition between unlocked and SHARED or
+** between unlocked and EXCLUSIVE.  It cannot transition between SHARED
+** and EXCLUSIVE.
 */
-#define SQLITE_SHM_UNLOCK       0
-#define SQLITE_SHM_READ         1
-#define SQLITE_SHM_READ_FULL    2
-#define SQLITE_SHM_WRITE        3
-#define SQLITE_SHM_PENDING      4
-#define SQLITE_SHM_CHECKPOINT   5
-#define SQLITE_SHM_RECOVER      6
+#define SQLITE_SHM_UNLOCK       1
+#define SQLITE_SHM_LOCK         2
+#define SQLITE_SHM_SHARED       4
+#define SQLITE_SHM_EXCLUSIVE    8
+
+/*
+** CAPI3REF: Maximum xShmLock index
+**
+** The xShmLock method on [sqlite3_io_methods] may use values
+** between 0 and this upper bound as its "offset" argument.
+** The SQLite core will never attempt to acquire or release a
+** lock outside of this range.
+*/
+#define SQLITE_SHM_NLOCK        8
+
 
 /*
 ** CAPI3REF: Initialize The SQLite Library
 **
 ** ^The sqlite3_initialize() routine initializes the

Index: src/test1.c
==================================================================
--- src/test1.c
+++ src/test1.c
@@ -4607,11 +4607,11 @@
   }
   return TCL_OK;  
 }
 
 /*
-** tclcmd:   file_control_lockproxy_test DB
+** tclcmd:   file_control_lockproxy_test DB PWD
 **
 ** This TCL command runs the sqlite3_file_control interface and
 ** verifies correct operation of the SQLITE_GET_LOCKPROXYFILE and
 ** SQLITE_SET_LOCKPROXYFILE verbs.
 */
@@ -4620,19 +4620,22 @@
   Tcl_Interp *interp,    /* The TCL interpreter that invoked this command */
   int objc,              /* Number of arguments */
   Tcl_Obj *CONST objv[]  /* Command arguments */
 ){
   sqlite3 *db;
+  const char *zPwd;
+  int nPwd;
   
-  if( objc!=2 ){
+  if( objc!=3 ){
     Tcl_AppendResult(interp, "wrong # args: should be \"",
-                     Tcl_GetStringFromObj(objv[0], 0), " DB", 0);
+                     Tcl_GetStringFromObj(objv[0], 0), " DB PWD", 0);
     return TCL_ERROR;
   }
   if( getDbPointer(interp, Tcl_GetString(objv[1]), &db) ){
    return TCL_ERROR;
   }
+  zPwd = Tcl_GetStringFromObj(objv[2], &nPwd);
   
 #if !defined(SQLITE_ENABLE_LOCKING_STYLE)
 #  if defined(__APPLE__)
 #    define SQLITE_ENABLE_LOCKING_STYLE 1
 #  else
@@ -4639,13 +4642,19 @@
 #    define SQLITE_ENABLE_LOCKING_STYLE 0
 #  endif
 #endif
 #if SQLITE_ENABLE_LOCKING_STYLE && defined(__APPLE__)
   {
-    char *proxyPath = "test.proxy";
     char *testPath;
     int rc;
+    char proxyPath[400];
+    
+    if( sizeof(proxyPath)<nPwd+20 ){
+      Tcl_AppendResult(interp, "PWD too big", (void*)0);
+      return TCL_ERROR;
+    }
+    sprintf(proxyPath, "%s/test.proxy", zPwd);
     rc = sqlite3_file_control(db, NULL, SQLITE_SET_LOCKPROXYFILE, proxyPath);
     if( rc ){
       Tcl_SetObjResult(interp, Tcl_NewIntObj(rc)); 
       return TCL_ERROR;
     }
@@ -5134,10 +5143,11 @@
 #endif
 #ifdef SQLITE_DEBUG
   extern int sqlite3WhereTrace;
   extern int sqlite3OSTrace;
   extern int sqlite3VdbeAddopTrace;
+  extern int sqlite3WalTrace;
 #endif
 #ifdef SQLITE_TEST
   extern char sqlite3_query_plan[];
   static char *query_plan = sqlite3_query_plan;
 #ifdef SQLITE_ENABLE_FTS3
@@ -5201,10 +5211,12 @@
       (char*)&sqlite3VdbeAddopTrace, TCL_LINK_INT);
   Tcl_LinkVar(interp, "sqlite_where_trace",
       (char*)&sqlite3WhereTrace, TCL_LINK_INT);
   Tcl_LinkVar(interp, "sqlite_os_trace",
       (char*)&sqlite3OSTrace, TCL_LINK_INT);
+  Tcl_LinkVar(interp, "sqlite_wal_trace",
+      (char*)&sqlite3WalTrace, TCL_LINK_INT);
 #endif
 #ifndef SQLITE_OMIT_DISKIO
   Tcl_LinkVar(interp, "sqlite_opentemp_count",
       (char*)&sqlite3_opentemp_count, TCL_LINK_INT);
 #endif

Index: src/test6.c
==================================================================
--- src/test6.c
+++ src/test6.c
@@ -538,12 +538,12 @@
   return sqlite3OsShmGet(((CrashFile*)pFile)->pRealFile, reqSize, pSize, pp);
 }
 static int cfShmRelease(sqlite3_file *pFile){
   return sqlite3OsShmRelease(((CrashFile*)pFile)->pRealFile);
 }
-static int cfShmLock(sqlite3_file *pFile, int desired, int *pGot){
-  return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, desired, pGot);
+static int cfShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
+  return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, ofst, n, flags);
 }
 static void cfShmBarrier(sqlite3_file *pFile){
   sqlite3OsShmBarrier(((CrashFile*)pFile)->pRealFile);
 }
 static int cfShmClose(sqlite3_file *pFile, int delFlag){

Index: src/test_devsym.c
==================================================================
--- src/test_devsym.c
+++ src/test_devsym.c
@@ -52,11 +52,11 @@
 static int devsymDeviceCharacteristics(sqlite3_file*);
 static int devsymShmOpen(sqlite3_file*);
 static int devsymShmSize(sqlite3_file*,int,int*);
 static int devsymShmGet(sqlite3_file*,int,int*,volatile void**);
 static int devsymShmRelease(sqlite3_file*);
-static int devsymShmLock(sqlite3_file*,int,int*);
+static int devsymShmLock(sqlite3_file*,int,int,int);
 static void devsymShmBarrier(sqlite3_file*);
 static int devsymShmClose(sqlite3_file*,int);
 
 /*
 ** Method declarations for devsym_vfs.
@@ -261,13 +261,13 @@
 }
 static int devsymShmRelease(sqlite3_file *pFile){
   devsym_file *p = (devsym_file *)pFile;
   return sqlite3OsShmRelease(p->pReal);
 }
-static int devsymShmLock(sqlite3_file *pFile, int desired, int *pGot){
+static int devsymShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   devsym_file *p = (devsym_file *)pFile;
-  return sqlite3OsShmLock(p->pReal, desired, pGot);
+  return sqlite3OsShmLock(p->pReal, ofst, n, flags);
 }
 static void devsymShmBarrier(sqlite3_file *pFile){
   devsym_file *p = (devsym_file *)pFile;
   sqlite3OsShmBarrier(p->pReal);
 }

Index: src/test_osinst.c
==================================================================
--- src/test_osinst.c
+++ src/test_osinst.c
@@ -153,11 +153,11 @@
 
 static int vfslogShmOpen(sqlite3_file *pFile);
 static int vfslogShmSize(sqlite3_file *pFile, int reqSize, int *pNewSize);
 static int vfslogShmGet(sqlite3_file *pFile, int,int*,volatile void **);
 static int vfslogShmRelease(sqlite3_file *pFile);
-static int vfslogShmLock(sqlite3_file *pFile, int desiredLock, int *gotLock);
+static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags);
 static void vfslogShmBarrier(sqlite3_file*);
 static int vfslogShmClose(sqlite3_file *pFile, int deleteFlag);
 
 /*
 ** Method declarations for vfslog_vfs.
@@ -458,16 +458,16 @@
   rc = p->pReal->pMethods->xShmRelease(p->pReal);
   t = vfslog_time() - t;
   vfslog_call(p->pVfslog, OS_SHMRELEASE, p->iFileId, t, rc, 0, 0);
   return rc;
 }
-static int vfslogShmLock(sqlite3_file *pFile, int desiredLock, int *gotLock){
+static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   int rc;
   sqlite3_uint64 t;
   VfslogFile *p = (VfslogFile *)pFile;
   t = vfslog_time();
-  rc = p->pReal->pMethods->xShmLock(p->pReal, desiredLock, gotLock);
+  rc = p->pReal->pMethods->xShmLock(p->pReal, ofst, n, flags);
   t = vfslog_time() - t;
   vfslog_call(p->pVfslog, OS_SHMLOCK, p->iFileId, t, rc, 0, 0);
   return rc;
 }
 static void vfslogShmBarrier(sqlite3_file *pFile){

Index: src/test_vfs.c
==================================================================
--- src/test_vfs.c
+++ src/test_vfs.c
@@ -100,11 +100,11 @@
 
 static int tvfsShmOpen(sqlite3_file*);
 static int tvfsShmSize(sqlite3_file*, int , int *);
 static int tvfsShmGet(sqlite3_file*, int , int *, volatile void **);
 static int tvfsShmRelease(sqlite3_file*);
-static int tvfsShmLock(sqlite3_file*, int , int *);
+static int tvfsShmLock(sqlite3_file*, int , int, int);
 static void tvfsShmBarrier(sqlite3_file*);
 static int tvfsShmClose(sqlite3_file*, int);
 
 static sqlite3_io_methods tvfs_io_methods = {
   2,                            /* iVersion */
@@ -542,35 +542,38 @@
   return rc;
 }
 
 static int tvfsShmLock(
   sqlite3_file *pFile,
-  int desiredLock,
-  int *gotLock
+  int ofst,
+  int n,
+  int flags
 ){
   int rc = SQLITE_OK;
   TestvfsFile *pFd = (TestvfsFile *)pFile;
   Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData);
-  char *zLock = "";
-
-  switch( desiredLock ){
-    case SQLITE_SHM_READ:         zLock = "READ";       break;
-    case SQLITE_SHM_WRITE:        zLock = "WRITE";      break;
-    case SQLITE_SHM_CHECKPOINT:   zLock = "CHECKPOINT"; break;
-    case SQLITE_SHM_RECOVER:      zLock = "RECOVER";    break;
-    case SQLITE_SHM_PENDING:      zLock = "PENDING";    break;
-    case SQLITE_SHM_UNLOCK:       zLock = "UNLOCK";     break;
+  int nLock;
+  char zLock[80];
+
+  sqlite3_snprintf(sizeof(zLock), zLock, "%d %d", ofst, n);
+  nLock = strlen(zLock);
+  if( flags & SQLITE_SHM_LOCK ){
+    strcpy(&zLock[nLock], " lock");
+  }else{
+    strcpy(&zLock[nLock], " unlock");
+  }
+  nLock += strlen(&zLock[nLock]);
+  if( flags & SQLITE_SHM_SHARED ){
+    strcpy(&zLock[nLock], " shared");
+  }else{
+    strcpy(&zLock[nLock], " exclusive");
   }
   tvfsExecTcl(p, "xShmLock", 
       Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId,
       Tcl_NewStringObj(zLock, -1)
   );
   tvfsResultCode(p, &rc);
-  if( rc==SQLITE_OK ){
-    *gotLock = desiredLock;
-  }
-
   return rc;
 }
 
 static void tvfsShmBarrier(sqlite3_file *pFile){
   int rc = SQLITE_OK;
@@ -714,13 +717,11 @@
 ** When the xShmLock method is invoked by SQLite, the following script is
 ** run:
 **
 **   SCRIPT xShmLock    FILENAME ID LOCK
 **
-** where LOCK is one of "UNLOCK", "READ", "READ_FULL", "WRITE", "PENDING",
-** "CHECKPOINT" or "RECOVER". The script should return an SQLite error
-** code.
+** where LOCK is of the form "OFFSET NBYTE lock/unlock shared/exclusive"
 */
 static int testvfs_cmd(
   ClientData cd,
   Tcl_Interp *interp,
   int objc,

Index: src/vdbe.c
==================================================================
--- src/vdbe.c
+++ src/vdbe.c
@@ -478,26 +478,10 @@
 ** flag on jump instructions, we get a (small) speed improvement.
 */
 #define CHECK_FOR_INTERRUPT \
    if( db->u1.isInterrupted ) goto abort_due_to_interrupt;
 
-#ifdef SQLITE_DEBUG
-static int fileExists(sqlite3 *db, const char *zFile){
-  int res = 0;
-  int rc = SQLITE_OK;
-#ifdef SQLITE_TEST
-  /* If we are currently testing IO errors, then do not call OsAccess() to
-  ** test for the presence of zFile. This is because any IO error that
-  ** occurs here will not be reported, causing the test to fail.
-  */
-  extern int sqlite3_io_error_pending;
-  if( sqlite3_io_error_pending<=0 )
-#endif
-    rc = sqlite3OsAccess(db->pVfs, zFile, SQLITE_ACCESS_EXISTS, &res);
-  return (res && rc==SQLITE_OK);
-}
-#endif
 
 #ifndef NDEBUG
 /*
 ** This function is only called from within an assert() expression. It
 ** checks that the sqlite3.nTransaction variable is correctly set to
@@ -592,23 +576,18 @@
 #ifndef SQLITE_OMIT_PROGRESS_CALLBACK
   checkProgress = db->xProgress!=0;
 #endif
 #ifdef SQLITE_DEBUG
   sqlite3BeginBenignMalloc();
-  if( p->pc==0 
-   && ((p->db->flags & SQLITE_VdbeListing) || fileExists(db, "vdbe_explain"))
-  ){
+  if( p->pc==0  && (p->db->flags & SQLITE_VdbeListing)!=0 ){
     int i;
     printf("VDBE Program Listing:\n");
     sqlite3VdbePrintSql(p);
     for(i=0; i<p->nOp; i++){
       sqlite3VdbePrintOp(stdout, i, &aOp[i]);
     }
   }
-  if( fileExists(db, "vdbe_trace") ){
-    p->trace = stdout;
-  }
   sqlite3EndBenignMalloc();
 #endif
   for(pc=p->pc; rc==SQLITE_OK; pc++){
     assert( pc>=0 && pc<p->nOp );
     if( db->mallocFailed ) goto no_mem;
@@ -626,17 +605,10 @@
         printf("VDBE Execution Trace:\n");
         sqlite3VdbePrintSql(p);
       }
       sqlite3VdbePrintOp(p->trace, pc, pOp);
     }
-    if( p->trace==0 && pc==0 ){
-      sqlite3BeginBenignMalloc();
-      if( fileExists(db, "vdbe_sqltrace") ){
-        sqlite3VdbePrintSql(p);
-      }
-      sqlite3EndBenignMalloc();
-    }
 #endif
       
 
     /* Check to see if we need to simulate an interrupt.  This only happens
     ** if we have a special test build.

Index: src/wal.c
==================================================================
--- src/wal.c
+++ src/wal.c
@@ -91,16 +91,26 @@
 **
 ** READER ALGORITHM
 **
 ** To read a page from the database (call it page number P), a reader
 ** first checks the WAL to see if it contains page P.  If so, then the
-** last valid instance of page P that is or is followed by a commit frame
-** become the value read.  If the WAL contains no copies of page P that
-** are valid and which are or are followed by a commit frame, then page
-** P is read from the database file.
+** last valid instance of page P that is followed by a commit frame
+** or is a commit frame itself becomes the value read.  If the WAL
+** contains no copies of page P that are valid and which are a commit
+** frame or are followed by a commit frame, then page P is read from
+** the database file.
 **
-** The reader algorithm in the previous paragraph works correctly, but 
+** To start a read transaction, the reader records the index of the last
+** valid frame in the WAL.  The reader uses this recorded "mxFrame" value
+** for all subsequent read operations.  New transactions can be appended
+** to the WAL, but as long as the reader uses its original mxFrame value
+** and ignores the newly appended content, it will see a consistent snapshot
+** of the database from a single point in time.  This technique allows
+** multiple concurrent readers to view different versions of the database
+** content simultaneously.
+**
+** The reader algorithm in the previous paragraphs works correctly, but 
 ** because frames for page P can appear anywhere within the WAL, the
 ** reader has to scan the entire WAL looking for page P frames.  If the
 ** WAL is large (multiple megabytes is typical) that scan can be slow,
 ** and read performance suffers.  To overcome this problem, a separate
 ** data structure called the wal-index is maintained to expedite the
@@ -159,12 +169,11 @@
 ** 1-based index of an entry in the mapping section of the same
 ** index block.   Let K be the 1-based index of the largest entry in
 ** the mapping section.  (For index blocks other than the last, K will
 ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
 ** K will be (mxFrame%HASHTABLE_NPAGE).)  Unused slots of the hash table
-** contain a value greater than K.  Note that no hash table slot ever
-** contains a zero value.
+** contain a value of 0.
 **
 ** To look for page P in the hash table, first compute a hash iKey on
 ** P as follows:
 **
 **      iKey = (P * 383) % HASHTABLE_NSLOT
@@ -212,14 +221,37 @@
 */
 #ifndef SQLITE_OMIT_WAL
 
 #include "wal.h"
 
+/*
+** Trace output macros
+*/
+#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
+int sqlite3WalTrace = 0;
+# define WALTRACE(X)  if(sqlite3WalTrace) sqlite3DebugPrintf X
+#else
+# define WALTRACE(X)
+#endif
+
+
+/*
+** Indices of various locking bytes.   WAL_NREADER is the number
+** of available reader locks and should be at least 3.
+*/
+#define WAL_WRITE_LOCK         0
+#define WAL_ALL_BUT_WRITE      1
+#define WAL_CKPT_LOCK          1
+#define WAL_RECOVER_LOCK       2
+#define WAL_READ_LOCK(I)       (3+(I))
+#define WAL_NREADER            (SQLITE_SHM_NLOCK-3)
+
 
 /* Object declarations */
 typedef struct WalIndexHdr WalIndexHdr;
 typedef struct WalIterator WalIterator;
+typedef struct WalCkptInfo WalCkptInfo;
 
 
 /*
 ** The following object holds a copy of the wal-index header content.
 **
@@ -226,26 +258,79 @@
 ** The actual header in the wal-index consists of two copies of this
 ** object.
 */
 struct WalIndexHdr {
   u32 iChange;                    /* Counter incremented each transaction */
-  u16 bigEndCksum;                /* True if checksums in WAL are big-endian */
+  u8 isInit;                      /* 1 when initialized */
+  u8 bigEndCksum;                 /* True if checksums in WAL are big-endian */
   u16 szPage;                     /* Database page size in bytes */
   u32 mxFrame;                    /* Index of last valid frame in the WAL */
   u32 nPage;                      /* Size of database in pages */
   u32 aFrameCksum[2];             /* Checksum of last frame in log */
   u32 aSalt[2];                   /* Two salt values copied from WAL header */
   u32 aCksum[2];                  /* Checksum over all prior fields */
 };
+
+/*
+** A copy of the following object occurs in the wal-index immediately
+** following the second copy of the WalIndexHdr.  This object stores
+** information used by checkpoint.
+**
+** nBackfill is the number of frames in the WAL that have been written
+** back into the database. (We call the act of moving content from WAL to
+** database "backfilling".)  The nBackfill number is never greater than
+** WalIndexHdr.mxFrame.  nBackfill can only be increased by threads
+** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
+** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
+** mxFrame back to zero when the WAL is reset.
+**
+** There is one entry in aReadMark[] for each reader lock.  If a reader
+** holds read-lock K, then the value in aReadMark[K] is no greater than
+** the mxFrame for that reader.  aReadMark[0] is a special case.  It
+** always holds zero.  Readers holding WAL_READ_LOCK(0) always ignore 
+** the entire WAL and read all content directly from the database.
+**
+** The value of aReadMark[K] may only be changed by a thread that
+** is holding an exclusive lock on WAL_READ_LOCK(K).  Thus, the value of
+** aReadMark[K] cannot change while there is a reader using that mark
+** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
+**
+** The checkpointer may only transfer frames from WAL to database where
+** the frame numbers are less than or equal to every aReadMark[] that is
+** in use (that is, every aReadMark[j] for which there is a corresponding
+** WAL_READ_LOCK(j)).  New readers (usually) pick the aReadMark[] with the
+** largest value and will increase an unused aReadMark[] to mxFrame if there
+** is not already an aReadMark[] equal to mxFrame.  The exception to the
+** previous sentence is when nBackfill equals mxFrame (meaning that everything
+** in the WAL has been backfilled into the database) then new readers
+** will choose aReadMark[0] which has value 0 and hence such readers will
+** get all their content directly from the database file and ignore
+** the WAL.
+**
+** Writers normally append new frames to the end of the WAL.  However,
+** if nBackfill equals mxFrame (meaning that all WAL content has been
+** written back into the database) and if no readers are using the WAL
+** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
+** the writer will first "reset" the WAL back to the beginning and start
+** writing new content beginning at frame 1.
+**
+** We assume that 32-bit loads are atomic and so no locks are needed in
+** order to read from any aReadMark[] entries.
+*/
+struct WalCkptInfo {
+  u32 nBackfill;                  /* Number of WAL frames backfilled into DB */
+  u32 aReadMark[WAL_NREADER];     /* Reader marks */
+};
+
 
 /* A block of WALINDEX_LOCK_RESERVED bytes beginning at
 ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
 ** only support mandatory file-locks, we do not read or write data
 ** from the region of the file on which locks are applied.
 */
-#define WALINDEX_LOCK_OFFSET   (sizeof(WalIndexHdr)*2)
-#define WALINDEX_LOCK_RESERVED 8
+#define WALINDEX_LOCK_OFFSET   (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo))
+#define WALINDEX_LOCK_RESERVED 16
 #define WALINDEX_HDR_SIZE      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)
 
 /* Size of header before each frame in wal */
 #define WAL_FRAME_HDRSIZE 24
 
@@ -275,25 +360,34 @@
 /*
 ** An open write-ahead log file is represented by an instance of the
 ** following object.
 */
 struct Wal {
-  sqlite3_vfs *pVfs;         /* The VFS used to create pFd */
+  sqlite3_vfs *pVfs;         /* The VFS used to create pDbFd */
   sqlite3_file *pDbFd;       /* File handle for the database file */
   sqlite3_file *pWalFd;      /* File handle for WAL file */
   u32 iCallback;             /* Value to pass to log callback (or 0) */
   int szWIndex;              /* Size of the wal-index that is mapped in mem */
   volatile u32 *pWiData;     /* Pointer to wal-index content in memory */
-  u8 lockState;              /* SQLITE_SHM_xxxx constant showing lock state */
-  u8 readerType;             /* SQLITE_SHM_READ or SQLITE_SHM_READ_FULL */
+  u16 szPage;                /* Database page size */
+  i16 readLock;              /* Which read lock is being held.  -1 for none */
   u8 exclusiveMode;          /* Non-zero if connection is in exclusive mode */
-  u8 isWindexOpen;           /* True if ShmOpen() called on pDbFd */
-  WalIndexHdr hdr;           /* Wal-index for current snapshot */
+  u8 isWIndexOpen;           /* True if ShmOpen() called on pDbFd */
+  u8 writeLock;              /* True if in a write transaction */
+  u8 ckptLock;               /* True if holding a checkpoint lock */
+  WalIndexHdr hdr;           /* Wal-index header for current transaction */
   char *zWalName;            /* Name of WAL file */
-  int szPage;                /* Database page size */
   u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
 };
+
+/*
+** Return a pointer to the WalCkptInfo structure in the wal-index.
+*/
+static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
+  assert( pWal->pWiData!=0 );
+  return (volatile WalCkptInfo*)&pWal->pWiData[sizeof(WalIndexHdr)/2];
+}
 
 
 /*
 ** This structure is used to implement an iterator that loops through
 ** all frames in the WAL in database page order. Where two or more frames
@@ -377,47 +471,26 @@
 
   aOut[0] = s1;
   aOut[1] = s2;
 }
 
-/*
-** Attempt to change the lock status.
-**
-** When changing the lock status to SQLITE_SHM_READ, store the
-** type of reader lock (either SQLITE_SHM_READ or SQLITE_SHM_READ_FULL)
-** in pWal->readerType.
-*/
-static int walSetLock(Wal *pWal, int desiredStatus){
-  int rc = SQLITE_OK;             /* Return code */
-  if( pWal->exclusiveMode || pWal->lockState==desiredStatus ){
-    pWal->lockState = desiredStatus;
-  }else{
-    int got = pWal->lockState;
-    rc = sqlite3OsShmLock(pWal->pDbFd, desiredStatus, &got);
-    pWal->lockState = got;
-    if( got==SQLITE_SHM_READ_FULL || got==SQLITE_SHM_READ ){
-      pWal->readerType = got;
-      pWal->lockState = SQLITE_SHM_READ;
-    }
-  }
-  return rc;
-}
-
 /*
 ** Write the header information in pWal->hdr into the wal-index.
 **
 ** The checksum on pWal->hdr is updated before it is written.
 */
 static void walIndexWriteHdr(Wal *pWal){
   WalIndexHdr *aHdr;
-  walChecksumBytes(1, (u8*)&pWal->hdr,
-                   sizeof(pWal->hdr) - sizeof(pWal->hdr.aCksum),
+
+  assert( pWal->writeLock );
+  pWal->hdr.isInit = 1;
+  walChecksumBytes(1, (u8*)&pWal->hdr, offsetof(WalIndexHdr, aCksum),
                    0, pWal->hdr.aCksum);
   aHdr = (WalIndexHdr*)pWal->pWiData;
-  memcpy(&aHdr[1], &pWal->hdr, sizeof(pWal->hdr));
+  memcpy(&aHdr[1], &pWal->hdr, sizeof(WalIndexHdr));
   sqlite3OsShmBarrier(pWal->pDbFd);
-  memcpy(&aHdr[0], &pWal->hdr, sizeof(pWal->hdr));
+  memcpy(&aHdr[0], &pWal->hdr, sizeof(WalIndexHdr));
 }
 
 /*
 ** This function encodes a single frame header and writes it to a buffer
 ** supplied by the caller. A frame-header is made up of a series of 
@@ -517,10 +590,71 @@
 #define HASHTABLE_NPAGE      4096  /* Must be power of 2 and multiple of 256 */
 #define HASHTABLE_DATATYPE   u16
 #define HASHTABLE_HASH_1     383                  /* Should be prime */
 #define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
 #define HASHTABLE_NBYTE      (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)
+
+#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
+/*
+** Names of locks.  This routine is used to provide debugging output and is not
+** a part of an ordinary build.
+*/
+static const char *walLockName(int lockIdx){
+  if( lockIdx==WAL_WRITE_LOCK ){
+    return "WRITE-LOCK";
+  }else if( lockIdx==WAL_CKPT_LOCK ){
+    return "CKPT-LOCK";
+  }else if( lockIdx==WAL_RECOVER_LOCK ){
+    return "RECOVER-LOCK";
+  }else{
+    static char zName[15];
+    sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
+                     lockIdx-WAL_READ_LOCK(0));
+    return zName;
+  }
+}
+#endif /* defined(SQLITE_TEST) && defined(SQLITE_DEBUG) */
+    
+
+/*
+** Set or release locks on the WAL.  Locks are either shared or exclusive.
+** A lock cannot be moved directly between shared and exclusive - it must go
+** through the unlocked state first.
+**
+** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
+*/
+static int walLockShared(Wal *pWal, int lockIdx){
+  int rc;
+  if( pWal->exclusiveMode ) return SQLITE_OK;
+  rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
+                        SQLITE_SHM_LOCK | SQLITE_SHM_SHARED);
+  WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,
+            walLockName(lockIdx), rc ? "failed" : "ok"));
+  return rc;
+}
+static void walUnlockShared(Wal *pWal, int lockIdx){
+  if( pWal->exclusiveMode ) return;
+  (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
+                         SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED);
+  WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));
+}
+static int walLockExclusive(Wal *pWal, int lockIdx, int n){
+  int rc;
+  if( pWal->exclusiveMode ) return SQLITE_OK;
+  rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
+                        SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);
+  WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,
+            walLockName(lockIdx), n, rc ? "failed" : "ok"));
+  return rc;
+}
+static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
+  if( pWal->exclusiveMode ) return;
+  (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
+                         SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
+  WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
+             walLockName(lockIdx), n));
+}
 
 /*
 ** Return the index in the Wal.pWiData array that corresponds to 
 ** frame iFrame.
 **
@@ -541,14 +675,14 @@
     + (iFrame-1)
   );
 }
 
 /*
-** Return the minimum mapping size in bytes that can be used to read the
-** wal-index up to and including frame iFrame. If iFrame is the last frame
-** in a block of 256 frames, the returned byte-count includes the space
-** required by the 256-byte index block.
+** Return the minimum size of the shared-memory, in bytes, that is needed
+** to support a wal-index containing frame iFrame.  The value returned
+** includes the wal-index header and the complete "block" containing iFrame,
+** including the hash table segment that follows the block.
 */
 static int walMappingSize(u32 iFrame){
   const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ;
   return ( WALINDEX_LOCK_OFFSET 
          + WALINDEX_LOCK_RESERVED 
@@ -598,11 +732,11 @@
 ** storage to be at least as big as enlargeTo before remapping.
 */
 static int walIndexRemap(Wal *pWal, int enlargeTo){
   int rc;
   int sz;
-  assert( pWal->lockState>=SQLITE_SHM_WRITE );
+  assert( pWal->writeLock );
   rc = sqlite3OsShmSize(pWal->pDbFd, enlargeTo, &sz);
   if( rc==SQLITE_OK && sz>pWal->szWIndex ){
     walIndexUnmap(pWal);
     rc = walIndexMap(pWal, sz);
   }
@@ -610,11 +744,12 @@
   return rc;
 }
 
 /*
 ** Compute a hash on a page number.  The resulting hash value must land
-** between 0 and (HASHTABLE_NSLOT-1).
+** between 0 and (HASHTABLE_NSLOT-1).  The walNextHash() function advances
+** the hash to the next value in the event of a collision.
 */
 static int walHash(u32 iPage){
   assert( iPage>0 );
   assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
   return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);
@@ -673,26 +808,32 @@
 ** than pWal->hdr.mxFrame.
 **
 ** This function is called whenever pWal->hdr.mxFrame is decreased due
 ** to a rollback or savepoint.
 **
-** At most only the very last hash table needs to be updated.  Any
-** later hash tables will be automatically cleared when pWal->hdr.mxFrame
-** advances to the point where those hash tables are actually needed.
+** At most only the hash table containing pWal->hdr.mxFrame needs to be
+** updated.  Any later hash tables will be automatically cleared when
+** pWal->hdr.mxFrame advances to the point where those hash tables are
+** actually needed.
 */
 static void walCleanupHash(Wal *pWal){
   volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table to clear */
   volatile u32 *aPgno;                 /* Unused return from walHashFind() */
   u32 iZero;                           /* frame == (aHash[x]+iZero) */
-  int iLimit;                          /* Zero values greater than this */
+  int iLimit = 0;                      /* Zero values greater than this */
 
-  assert( pWal->lockState==SQLITE_SHM_WRITE );
-  walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero);
-  iLimit = pWal->hdr.mxFrame - iZero;
-  if( iLimit>0 ){
+  assert( pWal->writeLock );
+  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE-1 );
+  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE );
+  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE+1 );
+  if( (pWal->hdr.mxFrame % HASHTABLE_NPAGE)>0 ){
     int nByte;                    /* Number of bytes to zero in aPgno[] */
     int i;                        /* Used to iterate through aHash[] */
+
+    walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero);
+    iLimit = pWal->hdr.mxFrame - iZero;
+    assert( iLimit>0 );
     for(i=0; i<HASHTABLE_NSLOT; i++){
       if( aHash[i]>iLimit ){
         aHash[i] = 0;
       }
     }
@@ -707,11 +848,11 @@
 
 #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
   /* Verify that the every entry in the mapping region is still reachable
   ** via the hash table even after the cleanup.
   */
-  {
+  if( iLimit ){
     int i;           /* Loop counter */
     int iKey;        /* Hash key */
     for(i=1; i<=iLimit; i++){
       for(iKey=walHash(aPgno[i+iZero]); aHash[iKey]; iKey=walNextHash(iKey)){
         if( aHash[iKey]==i ) break;
@@ -808,23 +949,48 @@
 }
 
 
 /*
 ** Recover the wal-index by reading the write-ahead log file. 
-** The caller must hold RECOVER lock on the wal-index file.
+**
+** This routine first tries to establish an exclusive lock on the
+** wal-index to prevent other threads/processes from doing anything
+** with the WAL or wal-index while recovery is running.  The
+** WAL_RECOVER_LOCK is also held so that other threads will know
+** that this thread is running recovery.  If unable to establish
+** the necessary locks, this routine returns SQLITE_BUSY.
 */
 static int walIndexRecover(Wal *pWal){
   int rc;                         /* Return Code */
   i64 nSize;                      /* Size of log file */
   u32 aFrameCksum[2] = {0, 0};
+  int iLock;                      /* Lock offset to lock for checkpoint */
+  int nLock;                      /* Number of locks to hold */
 
-  assert( pWal->lockState>SQLITE_SHM_READ );
+  /* Obtain an exclusive lock on all bytes in the locking range not already
+  ** locked by the caller. The caller is guaranteed to have locked the
+  ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.
+  ** If successful, the same bytes that are locked here are unlocked before
+  ** this function returns.
+  */
+  assert( pWal->ckptLock==1 || pWal->ckptLock==0 );
+  assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );
+  assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );
+  assert( pWal->writeLock );
+  iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock;
+  nLock = SQLITE_SHM_NLOCK - iLock;
+  rc = walLockExclusive(pWal, iLock, nLock);
+  if( rc ){
+    return rc;
+  }
+  WALTRACE(("WAL%p: recovery begin...\n", pWal));
+
   memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
 
   rc = sqlite3OsFileSize(pWal->pWalFd, &nSize);
   if( rc!=SQLITE_OK ){
-    return rc;
+    goto recovery_error;
   }
 
   if( nSize>WAL_HDRSIZE ){
     u8 aBuf[WAL_HDRSIZE];         /* Buffer to load WAL header into */
     u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
@@ -836,11 +1002,11 @@
     u32 magic;                    /* Magic value read from WAL header */
 
     /* Read in the WAL header. */
     rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
     if( rc!=SQLITE_OK ){
-      return rc;
+      goto recovery_error;
     }
 
     /* If the database page size is not a power of two, or is greater than
     ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid 
     ** data. Similarly, if the 'magic' value is invalid, ignore the whole
@@ -865,11 +1031,12 @@
 
     /* Malloc a buffer to read frames into. */
     szFrame = szPage + WAL_FRAME_HDRSIZE;
     aFrame = (u8 *)sqlite3_malloc(szFrame);
     if( !aFrame ){
-      return SQLITE_NOMEM;
+      rc = SQLITE_NOMEM;
+      goto recovery_error;
     }
     aData = &aFrame[WAL_FRAME_HDRSIZE];
 
     /* Read all frames from the log file. */
     iFrame = 0;
@@ -905,35 +1072,41 @@
   }
   if( rc==SQLITE_OK ){
     pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
     pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
     walIndexWriteHdr(pWal);
+
+    /* Zero the checkpoint-header. This is safe because this thread is 
+    ** currently holding locks that exclude all other readers, writers and
+    ** checkpointers.
+    */
+    memset((void *)walCkptInfo(pWal), 0, sizeof(WalCkptInfo));
   }
+
+recovery_error:
+  WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
+  walUnlockExclusive(pWal, iLock, nLock);
   return rc;
 }
 
 /*
 ** Close an open wal-index.
 */
 static void walIndexClose(Wal *pWal, int isDelete){
-  if( pWal->isWindexOpen ){
-    int notUsed;
-    sqlite3OsShmLock(pWal->pDbFd, SQLITE_SHM_UNLOCK, &notUsed);
+  if( pWal->isWIndexOpen ){
     sqlite3OsShmClose(pWal->pDbFd, isDelete);
-    pWal->isWindexOpen = 0;
+    pWal->isWIndexOpen = 0;
   }
 }
 
 /* 
-** Open a connection to the log file associated with database zDb. The
-** database file does not actually have to exist. zDb is used only to
-** figure out the name of the log file to open. If the log file does not 
-** exist it is created by this call.
+** Open a connection to the WAL file associated with database zDbName.
+** The database file must already be opened on connection pDbFd.
 **
 ** A SHARED lock should be held on the database file when this function
 ** is called. The purpose of this SHARED lock is to prevent any other
-** client from unlinking the log or wal-index file. If another process
+** client from unlinking the WAL or wal-index file. If another process
 ** were to do this just after this client opened one of these files, the
 ** system would be badly broken.
 **
 ** If the log file is successfully opened, SQLITE_OK is returned and 
 ** *ppWal is set to point to a new WAL handle. If an error occurs,
@@ -976,18 +1149,19 @@
 
   pRet->pVfs = pVfs;
   pRet->pWalFd = (sqlite3_file *)&pRet[1];
   pRet->pDbFd = pDbFd;
   pRet->szWIndex = -1;
+  pRet->readLock = -1;
   sqlite3_randomness(8, &pRet->hdr.aSalt);
   pRet->zWalName = zWal = pVfs->szOsFile + (char*)pRet->pWalFd;
   sqlite3_snprintf(nWal, zWal, "%s-wal", zDbName);
   rc = sqlite3OsShmOpen(pDbFd);
 
   /* Open file handle on the write-ahead log file. */
   if( rc==SQLITE_OK ){
-    pRet->isWindexOpen = 1;
+    pRet->isWIndexOpen = 1;
     flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
     rc = sqlite3OsOpen(pVfs, zWal, pRet->pWalFd, flags, &flags);
   }
 
   if( rc!=SQLITE_OK ){
@@ -994,10 +1168,11 @@
     walIndexClose(pRet, 0);
     sqlite3OsClose(pRet->pWalFd);
     sqlite3_free(pRet);
   }else{
     *ppWal = pRet;
+    WALTRACE(("WAL%p: opened\n", pRet));  /* %p: pRet is a pointer */
   }
   return rc;
 }
 
 /*
@@ -1131,11 +1306,11 @@
   /* This routine only runs while holding SQLITE_SHM_CHECKPOINT.  No other
   ** thread is able to write to shared memory while this routine is
   ** running (or, indeed, while the WalIterator object exists).  Hence,
   ** we can cast off the volatile qualifacation from shared memory
   */
-  assert( pWal->lockState==SQLITE_SHM_CHECKPOINT );
+  assert( pWal->ckptLock );
   aData = (u32*)pWal->pWiData;
 
   /* Allocate space for the WalIterator object */
   iLast = pWal->hdr.mxFrame;
   nSegment = (iLast >> 8) + 1;
@@ -1177,13 +1352,42 @@
 ** Free an iterator allocated by walIteratorInit().
 */
 static void walIteratorFree(WalIterator *p){
   sqlite3_free(p);
 }
+
 
 /*
-** Checkpoint the contents of the log file.
+** Copy as much content as we can from the WAL back into the database file
+** in response to an sqlite3_wal_checkpoint() request or the equivalent.
+**
+** The amount of information copied from WAL to database might be limited
+** by active readers.  This routine will never overwrite a database page
+** that a concurrent reader might be using.
+**
+** All I/O barrier operations (a.k.a fsyncs) occur in this routine when
+** SQLite is in WAL-mode in synchronous=NORMAL.  That means that if 
+** checkpoints are always run by a background thread or background 
+** process, foreground threads will never block on a lengthy fsync call.
+**
+** Fsync is called on the WAL before writing content out of the WAL and
+** into the database.  This ensures that the new content is persistent
+** in the WAL and can be recovered following a power-loss or hard reset.
+**
+** Fsync is also called on the database file if (and only if) the entire
+** WAL content is copied into the database file.  This second fsync makes
+** it safe to delete the WAL since the new content will persist in the
+** database file.
+**
+** This routine uses and updates the nBackfill field of the wal-index header.
+** This is the only routine that will increase the value of nBackfill.
+** (A WAL reset or recovery will revert nBackfill to zero, but not increase
+** its value.)
+**
+** The caller must be holding sufficient locks to ensure that no other
+** checkpoint is running (in any other thread or process) at the same
+** time.
 */
 static int walCheckpoint(
   Wal *pWal,                      /* Wal connection */
   int sync_flags,                 /* Flags for OsSync() (or 0) */
   int nBuf,                       /* Size of zBuf in bytes */
@@ -1192,55 +1396,92 @@
   int rc;                         /* Return code */
   int szPage = pWal->hdr.szPage;  /* Database page-size */
   WalIterator *pIter = 0;         /* Wal iterator context */
   u32 iDbpage = 0;                /* Next database page to write */
   u32 iFrame = 0;                 /* Wal frame containing data for iDbpage */
+  u32 mxSafeFrame;                /* Max frame that can be backfilled */
+  int i;                          /* Loop counter */
+  volatile WalIndexHdr *pHdr;     /* The actual wal-index header in SHM */
+  volatile WalCkptInfo *pInfo;    /* The checkpoint status information */
 
   /* Allocate the iterator */
   rc = walIteratorInit(pWal, &pIter);
   if( rc!=SQLITE_OK || pWal->hdr.mxFrame==0 ){
-    goto out;
+    walIteratorFree(pIter);
+    return rc;
   }
 
+  /*** TODO:  Move this test out to the caller.  Make it an assert() here ***/
   if( pWal->hdr.szPage!=nBuf ){
-    rc = SQLITE_CORRUPT_BKPT;
-    goto out;
-  }
-
-  /* Sync the log file to disk */
-  if( sync_flags ){
-    rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
-    if( rc!=SQLITE_OK ) goto out;
-  }
-
-  /* Iterate through the contents of the log, copying data to the db file. */
-  while( 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
-    rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, 
-        walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE
-    );
-    if( rc!=SQLITE_OK ) goto out;
-    rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage);
-    if( rc!=SQLITE_OK ) goto out;
-  }
-
-  /* Truncate the database file */
-  rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage));
-  if( rc!=SQLITE_OK ) goto out;
-
-  /* Sync the database file. If successful, update the wal-index. */
-  if( sync_flags ){
-    rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
-    if( rc!=SQLITE_OK ) goto out;
-  }
-  pWal->hdr.mxFrame = 0;
-  pWal->nCkpt++;
-  sqlite3Put4byte((u8*)pWal->hdr.aSalt,
-                   1 + sqlite3Get4byte((u8*)pWal->hdr.aSalt));
-  sqlite3_randomness(4, &pWal->hdr.aSalt[1]);
-  walIndexWriteHdr(pWal);
-
- out:
+    walIteratorFree(pIter);
+    return SQLITE_CORRUPT_BKPT;
+  }
+
+  /* Compute in mxSafeFrame the index of the last frame of the WAL that is
+  ** safe to write into the database.  Frames beyond mxSafeFrame might
+  ** overwrite database pages that are in use by active readers and thus
+  ** cannot be backfilled from the WAL.
+  */
+  mxSafeFrame = pWal->hdr.mxFrame;
+  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
+  pInfo = (volatile WalCkptInfo*)&pHdr[2];
+  assert( pInfo==walCkptInfo(pWal) );
+  for(i=1; i<WAL_NREADER; i++){
+    u32 y = pInfo->aReadMark[i];
+    if( y>0 && (mxSafeFrame==0 || mxSafeFrame>=y) ){
+      if( y<=pWal->hdr.mxFrame
+       && walLockExclusive(pWal, WAL_READ_LOCK(i), 1)==SQLITE_OK
+      ){
+        pInfo->aReadMark[i] = 0;
+        walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
+      }else{
+        mxSafeFrame = y-1;
+      }
+    }
+  }
+
+  if( pInfo->nBackfill<mxSafeFrame
+   && (rc = walLockExclusive(pWal, WAL_READ_LOCK(0), 1))==SQLITE_OK
+  ){
+    u32 nBackfill = pInfo->nBackfill;
+
+    /* Sync the WAL to disk */
+    if( sync_flags ){
+      rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
+    }
+
+    /* Iterate through the contents of the WAL, copying data to the db file. */
+    while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
+      if( iFrame<=nBackfill || iFrame>mxSafeFrame ) continue;
+      rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, 
+          walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE
+      );
+      if( rc!=SQLITE_OK ) break;
+      rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage);
+      if( rc!=SQLITE_OK ) break;
+    }
+
+    /* If work was done, record it.  Truncate even when sync_flags==0. */
+    if( rc==SQLITE_OK && pInfo->nBackfill<mxSafeFrame ){
+      pInfo->nBackfill = mxSafeFrame;
+      if( mxSafeFrame==pHdr[0].mxFrame ){
+        rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage));
+        if( rc==SQLITE_OK && sync_flags ){
+          rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
+        }
+      }
+    }
+
+    /* Release the reader lock held while backfilling */
+    walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
+  }else{
+    /* Reset the return code so as not to report a checkpoint failure
+    ** just because active readers prevent any backfill.
+    */
+    rc = SQLITE_OK;
+  }
+
   walIteratorFree(pIter);
   return rc;
 }
 
 /*
@@ -1264,11 +1505,12 @@
     **
     ** The EXCLUSIVE lock is not released before returning.
     */
     rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE);
     if( rc==SQLITE_OK ){
-      rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf, 0, 0);
+      pWal->exclusiveMode = 1;
+      rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf);
       if( rc==SQLITE_OK ){
         isDelete = 1;
       }
       walIndexUnmap(pWal);
     }
@@ -1276,10 +1518,11 @@
     walIndexClose(pWal, isDelete);
     sqlite3OsClose(pWal->pWalFd);
     if( isDelete ){
       sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
     }
+    WALTRACE(("WAL%p: closed\n", pWal));
     sqlite3_free(pWal);
   }
   return rc;
 }
 
@@ -1288,11 +1531,12 @@
 ** there is a problem.
 **
 ** The wal-index is in shared memory.  Another thread or process might
 ** be writing the header at the same time this procedure is trying to
 ** read it, which might result in inconsistency.  A dirty read is detected
-** by verifying a checksum on the header.
+** by verifying that both copies of the header are the same and also by
+** a checksum on the header.
 **
 ** If and only if the read is consistent and the header is different from
 ** pWal->hdr, then pWal->hdr is updated to the content of the new header
 ** and *pChanged is set to 1.
 **
@@ -1309,13 +1553,13 @@
     ** header is invalid. */
     return 1;
   }
   assert( pWal->pWiData );
 
-  /* Read the header. The caller may or may not have an exclusive 
-  ** (WRITE, PENDING, CHECKPOINT or RECOVER) lock on the wal-index
-  ** file, meaning it is possible that an inconsistent snapshot is read
+  /* Read the header. This might happen concurrently with a write to the
+  ** same area of shared memory on a different CPU in a SMP,
+  ** meaning it is possible that an inconsistent snapshot is read
   ** from the file. If this happens, return non-zero.
   **
   ** There are two copies of the header at the beginning of the wal-index.
   ** When reading, read [0] first then [1].  Writes are in the reverse order.
   ** Memory barriers are used to prevent the compiler or the hardware from
@@ -1327,11 +1571,11 @@
   memcpy(&h2, &aHdr[1], sizeof(h2));
 
   if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
     return 1;   /* Dirty read */
   }  
-  if( h1.szPage==0 ){
+  if( h1.isInit==0 ){
     return 1;   /* Malformed header - probably all zeros */
   }
   walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);
   if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){
     return 1;   /* Checksum does not match */
@@ -1365,56 +1609,44 @@
 ** If the wal-index header is successfully read, return SQLITE_OK. 
 ** Otherwise an SQLite error code.
 */
 static int walIndexReadHdr(Wal *pWal, int *pChanged){
   int rc;                         /* Return code */
-  int lockState;                  /* pWal->lockState before running recovery */
+  int badHdr;                     /* True if a header read failed */
 
-  assert( pWal->lockState>=SQLITE_SHM_READ );
   assert( pChanged );
   rc = walIndexMap(pWal, walMappingSize(1));
   if( rc!=SQLITE_OK ){
     return rc;
   }
 
-  /* First attempt to read the wal-index header. This may fail for one
-  ** of two reasons: (a) the wal-index does not yet exist or has been
-  ** corrupted and needs to be constructed by running recovery, or (b)
-  ** the caller is only holding a READ lock and made a dirty read of
-  ** the wal-index header.
-  **
-  ** A dirty read of the wal-index header occurs if another thread or
-  ** process happens to be writing to the wal-index header at roughly
-  ** the same time as this thread is reading it. In this case it is 
-  ** possible that an inconsistent header is read (which is detected
-  ** using the header checksum mechanism).
-  */
-  if( walIndexTryHdr(pWal, pChanged)!=0 ){
-
-    /* If the first attempt to read the header failed, lock the wal-index
-    ** file with an exclusive lock and try again. If the header checksum 
-    ** verification fails again, we can be sure that it is not simply a
-    ** dirty read, but that the wal-index really does need to be 
-    ** reconstructed by running log recovery.
-    **
-    ** In the paragraph above, an "exclusive lock" may be any of WRITE,
-    ** PENDING, CHECKPOINT or RECOVER. If any of these are already held,
-    ** no locking operations are required. If the caller currently holds
-    ** a READ lock, then upgrade to a RECOVER lock before re-reading the
-    ** wal-index header and revert to a READ lock before returning.
-    */
-    lockState = pWal->lockState;
-    if( lockState>SQLITE_SHM_READ
-     || SQLITE_OK==(rc = walSetLock(pWal, SQLITE_SHM_RECOVER)) 
-    ){
-      if( walIndexTryHdr(pWal, pChanged) ){
+  /* Try once to read the header straight out.  This works most of the
+  ** time.
+  */
+  badHdr = walIndexTryHdr(pWal, pChanged);
+
+  /* If the first attempt failed, it might have been due to a race
+  ** with a writer.  So get a WRITE lock and try again.
+  */
+  assert( badHdr==0 || pWal->writeLock==0 );
+  if( badHdr ){
+    rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
+    if( rc==SQLITE_OK ){
+      pWal->writeLock = 1;
+      badHdr = walIndexTryHdr(pWal, pChanged);
+      if( badHdr ){
+        /* If the wal-index header is still malformed even while holding
+        ** a WRITE lock, it can only mean that the header is corrupted and
+        ** needs to be reconstructed.  So run recovery to do exactly that.
+        */
+        rc = walIndexRecover(pWal);
         *pChanged = 1;
-        rc = walIndexRecover(pWal);
       }
-      if( lockState==SQLITE_SHM_READ ){
-        walSetLock(pWal, SQLITE_SHM_READ);
-      }
+      walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
+      pWal->writeLock = 0;
+    }else if( rc!=SQLITE_BUSY ){
+      return rc;
     }
   }
 
   /* Make sure the mapping is large enough to cover the entire wal-index */
   if( rc==SQLITE_OK ){
@@ -1426,57 +1658,196 @@
 
   return rc;
 }
 
 /*
-** Take a snapshot of the state of the WAL and wal-index for the current
+** This is the value that walTryBeginRead returns when it needs to
+** be retried.
+*/
+#define WAL_RETRY  (-1)
+
+/*
+** Attempt to start a read transaction.  This might fail due to a race or
+** other transient condition.  When that happens, it returns WAL_RETRY to
+** indicate to the caller that it is safe to retry immediately.
+**
+** On success return SQLITE_OK.  On a permanent failure (such as an
+** I/O error or an SQLITE_BUSY because another process is running
+** recovery) return a positive error code.
+**
+** On success, this routine obtains a read lock on 
+** WAL_READ_LOCK(pWal->readLock).  The pWal->readLock integer is
+** in the range 0 <= pWal->readLock < WAL_NREADER.  If pWal->readLock==(-1)
+** that means the Wal does not hold any read lock.  The reader must not
+** access any database page that is modified by a WAL frame up to and
+** including frame number aReadMark[pWal->readLock].  The reader will
+** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0.
+** Or if pWal->readLock==0, then the reader will ignore the WAL
+** completely and get all content directly from the database file.
+** When the read transaction is completed, the caller must release the
+** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1.
+**
+** This routine uses the nBackfill and aReadMark[] fields of the header
+** to select a particular WAL_READ_LOCK() that strives to let the
+** checkpoint process do as much work as possible.  This routine might
+** update values of the aReadMark[] array in the header, but if it does
+** so it takes care to hold an exclusive lock on the corresponding
+** WAL_READ_LOCK() while changing values.
+*/
+static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal){
+  volatile WalIndexHdr *pHdr;     /* Header of the wal-index */
+  volatile WalCkptInfo *pInfo;    /* Checkpoint information in wal-index */
+  u32 mxReadMark;                 /* Largest aReadMark[] value */
+  int mxI;                        /* Index of largest aReadMark[] value */
+  int i;                          /* Loop counter */
+  int rc;                         /* Return code  */
+
+  assert( pWal->readLock<0 );     /* Not currently locked */
+
+  if( !useWal ){
+    rc = walIndexReadHdr(pWal, pChanged);
+    if( rc==SQLITE_BUSY ){
+      /* If there is not a recovery running in another thread or process
+      ** then convert BUSY errors to WAL_RETRY.  If recovery is known to
+      ** be running, convert BUSY to BUSY_RECOVERY.  There is a race here
+      ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
+      ** would be technically correct.  But the race is benign since with
+      ** WAL_RETRY this routine will be called again and will probably be
+      ** right on the second iteration.
+      */
+      rc = walLockShared(pWal, WAL_RECOVER_LOCK);
+      if( rc==SQLITE_OK ){
+        walUnlockShared(pWal, WAL_RECOVER_LOCK);
+        rc = WAL_RETRY;
+      }else if( rc==SQLITE_BUSY ){
+        rc = SQLITE_BUSY_RECOVERY;
+      }
+    }
+  }else{
+    rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
+  }
+  if( rc!=SQLITE_OK ){
+    return rc;
+  }
+
+  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
+  pInfo = (volatile WalCkptInfo*)&pHdr[2];
+  assert( pInfo==walCkptInfo(pWal) );
+  if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame ){
+    /* The WAL has been completely backfilled (or it is empty)
+    ** and can be safely ignored.
+    */
+    rc = walLockShared(pWal, WAL_READ_LOCK(0));
+    if( rc==SQLITE_OK ){
+      if( pHdr->mxFrame!=pWal->hdr.mxFrame ){
+        walUnlockShared(pWal, WAL_READ_LOCK(0));
+        return WAL_RETRY;
+      }
+      pWal->readLock = 0;
+      return SQLITE_OK;
+    }else if( rc!=SQLITE_BUSY ){
+      return rc;
+    }
+  }
+
+  /* If we get this far, it means that the reader will want to use
+  ** the WAL to get at content from recent commits.  The job now is
+  ** to select one of the aReadMark[] entries that is closest to
+  ** but not exceeding pWal->hdr.mxFrame and lock that entry.
+  */
+  mxReadMark = 0;
+  mxI = 0;
+  for(i=1; i<WAL_NREADER; i++){
+    u32 thisMark = pInfo->aReadMark[i];
+    if( mxReadMark<thisMark ){
+      mxReadMark = thisMark;
+      mxI = i;
+    }
+  }
+  if( mxI==0 ){
+    /* If we get here, it means that all of the aReadMark[] entries between
+    ** 1 and WAL_NREADER-1 are zero.  Try to initialize aReadMark[1] to
+    ** be mxFrame, then retry.
+    */
+    rc = walLockExclusive(pWal, WAL_READ_LOCK(1), 1);
+    if( rc==SQLITE_OK ){
+      pInfo->aReadMark[1] = pWal->hdr.mxFrame+1;
+      walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1);
+    }
+    return WAL_RETRY;
+  }else{
+    if( mxReadMark < pWal->hdr.mxFrame ){
+      for(i=1; i<WAL_NREADER; i++){
+        rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
+        if( rc==SQLITE_OK ){
+          mxReadMark = pInfo->aReadMark[i] = pWal->hdr.mxFrame+1;
+          mxI = i;
+          walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
+          break;
+        }
+      }
+    }
+
+    rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
+    if( rc ){
+      return rc==SQLITE_BUSY ? WAL_RETRY : rc;
+    }
+    if( pInfo->aReadMark[mxI]!=mxReadMark
+     || pHdr[0].mxFrame!=pWal->hdr.mxFrame
+     || (sqlite3OsShmBarrier(pWal->pDbFd), pHdr[1].mxFrame!=pWal->hdr.mxFrame)
+    ){
+      walUnlockShared(pWal, WAL_READ_LOCK(mxI));
+      return WAL_RETRY;
+    }else{
+      pWal->readLock = mxI;
+    }
+  }
+  return rc;
+}
+
+/*
+** Begin a read transaction on the database.
+**
+** This routine used to be called sqlite3OpenSnapshot() and with good reason:
+** it takes a snapshot of the state of the WAL and wal-index for the current
 ** instant in time.  The current thread will continue to use this snapshot.
-** Other threads might containing appending to the WAL and wal-index but
-** the extra content appended will be ignored by the current thread.
-**
-** A snapshot is like a read transaction.
-**
-** No other threads are allowed to run a checkpoint while this thread is
-** holding the snapshot since a checkpoint would remove data out from under
-** this thread.
-**
-** If this call obtains a new read-lock and the database contents have been
-** modified since the most recent call to WalCloseSnapshot() on this Wal
-** connection, then *pChanged is set to 1 before returning. Otherwise, it 
-** is left unmodified. This is used by the pager layer to determine whether 
-** or not any cached pages may be safely reused.
-*/
-int sqlite3WalOpenSnapshot(Wal *pWal, int *pChanged){
+** Other threads might append new content to the WAL and wal-index but
+** that extra content is ignored by the current thread.
+**
+** If the database contents have changed since the previous read
+** transaction, then *pChanged is set to 1 before returning.  The
+** Pager layer will use this to know that its cache is stale and
+** needs to be flushed.
+*/
+int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
   int rc;                         /* Return code */
 
-  rc = walSetLock(pWal, SQLITE_SHM_READ);
-  assert( rc!=SQLITE_OK || pWal->lockState==SQLITE_SHM_READ );
-
-  if( rc==SQLITE_OK ){
-    rc = walIndexReadHdr(pWal, pChanged);
-    if( rc!=SQLITE_OK ){
-      /* An error occured while attempting log recovery. */
-      sqlite3WalCloseSnapshot(pWal);
-    }
-  }
-
+  do{
+    rc = walTryBeginRead(pWal, pChanged, 0);
+  }while( rc==WAL_RETRY );
   walIndexUnmap(pWal);
   return rc;
 }
 
 /*
-** Unlock the current snapshot.
+** Finish with a read transaction.  All this does is release the
+** read-lock.
 */
-void sqlite3WalCloseSnapshot(Wal *pWal){
-  assert( pWal->lockState==SQLITE_SHM_READ
-       || pWal->lockState==SQLITE_SHM_UNLOCK
-  );
-  walSetLock(pWal, SQLITE_SHM_UNLOCK);
+void sqlite3WalEndReadTransaction(Wal *pWal){
+  if( pWal->readLock>=0 ){
+    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
+    pWal->readLock = -1;
+  }
 }
 
 /*
-** Read a page from the log, if it is present. 
+** Read a page from the WAL, if it is present in the WAL and if the 
+** current read transaction is configured to use the WAL.  
+**
+** The *pInWal is set to 1 if the requested page is in the WAL and
+** has been loaded.  Or *pInWal is set to 0 if the page was not in 
+** the WAL and needs to be read out of the database.
 */
 int sqlite3WalRead(
   Wal *pWal,                      /* WAL handle */
   Pgno pgno,                      /* Database page number to read data for */
   int *pInWal,                    /* OUT: True if data is read from WAL */
@@ -1485,22 +1856,26 @@
 ){
   int rc;                         /* Return code */
   u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
   u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
   int iHash;                      /* Used to loop through N hash tables */
+
+  /* This routine is only called from within a read transaction */
+  assert( pWal->readLock>=0 );
 
   /* If the "last page" field of the wal-index header snapshot is 0, then
   ** no data will be read from the wal under any circumstances. Return early
-  ** in this case to avoid the walIndexMap/Unmap overhead.
+  ** in this case to avoid the walIndexMap/Unmap overhead.  Likewise, if
+  ** pWal->readLock==0, then the WAL is ignored by the reader so
+  ** return early, as if the WAL were empty.
   */
-  if( iLast==0 ){
+  if( iLast==0 || pWal->readLock==0 ){
     *pInWal = 0;
     return SQLITE_OK;
   }
 
   /* Ensure the wal-index is mapped. */
-  assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
   rc = walIndexMap(pWal, walMappingSize(iLast));
   if( rc!=SQLITE_OK ){
     return rc;
   }
 
@@ -1605,64 +1980,120 @@
 
 /* 
 ** Set *pPgno to the size of the database file (or zero, if unknown).
 */
 void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){
-  assert( pWal->lockState==SQLITE_SHM_READ
-       || pWal->lockState==SQLITE_SHM_WRITE );
+  assert( pWal->readLock>=0 );
   *pPgno = pWal->hdr.nPage;
 }
 
-/* 
-** This function returns SQLITE_OK if the caller may write to the database.
-** Otherwise, if the caller is operating on a snapshot that has already
-** been overwritten by another writer, SQLITE_BUSY is returned.
-*/
-int sqlite3WalWriteLock(Wal *pWal, int op){
-  int rc = SQLITE_OK;
-  if( op ){
-    assert( pWal->lockState==SQLITE_SHM_READ );
-    rc = walSetLock(pWal, SQLITE_SHM_WRITE);
-
-    /* If this connection is not reading the most recent database snapshot,
-    ** it is not possible to write to the database. In this case release
-    ** the write locks and return SQLITE_BUSY.
-    */
-    if( rc==SQLITE_OK ){
-      rc = walIndexMap(pWal, walMappingSize(1));
-      assert( pWal->szWIndex>=WALINDEX_HDR_SIZE || rc!=SQLITE_OK );
-      if( rc==SQLITE_OK
-       && memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr))
-      ){
-        rc = SQLITE_BUSY;
-      }
-      walIndexUnmap(pWal);
-      if( rc!=SQLITE_OK ){
-        walSetLock(pWal, SQLITE_SHM_READ);
-      }
-    }
-  }else if( pWal->lockState==SQLITE_SHM_WRITE ){
-    rc = walSetLock(pWal, SQLITE_SHM_READ);
-  }
-  return rc;
+
+/* 
+** This function starts a write transaction on the WAL.
+**
+** A read transaction must have already been started by a prior call
+** to sqlite3WalBeginReadTransaction().
+**
+** If another thread or process has written into the database since
+** the read transaction was started, then it is not possible for this
+** thread to write as doing so would cause a fork.  So this routine
+** returns SQLITE_BUSY in that case and no write transaction is started.
+**
+** There can only be a single writer active at a time.
+*/
+int sqlite3WalBeginWriteTransaction(Wal *pWal){
+  int rc;
+  volatile WalCkptInfo *pInfo;
+
+  /* Cannot start a write transaction without first holding a read
+  ** transaction. */
+  assert( pWal->readLock>=0 );
+
+  /* Only one writer allowed at a time.  Get the write lock.  Return
+  ** SQLITE_BUSY if unable.
+  */
+  rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
+  if( rc ){
+    return rc;
+  }
+  pWal->writeLock = 1;
+
+  /* If another connection has written to the database file since the
+  ** time the read transaction on this connection was started, then
+  ** the write is disallowed.
+  */
+  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
+  if( rc ){
+    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
+    pWal->writeLock = 0;
+    return rc;
+  }
+  if( memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr))!=0 ){
+    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
+    pWal->writeLock = 0;
+    walIndexUnmap(pWal);
+    return SQLITE_BUSY;
+  }
+
+  pInfo = walCkptInfo(pWal);
+  if( pWal->readLock==0 ){
+    assert( pInfo->nBackfill==pWal->hdr.mxFrame );
+    if( pInfo->nBackfill>0 ){
+      rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
+      if( rc==SQLITE_OK ){
+        /* If all readers are using WAL_READ_LOCK(0) (in other words if no
+        ** readers are currently using the WAL) */
+        pWal->nCkpt++;
+        pWal->hdr.mxFrame = 0;
+        sqlite3Put4byte((u8*)pWal->hdr.aSalt,
+                         1 + sqlite3Get4byte((u8*)pWal->hdr.aSalt));
+        sqlite3_randomness(4, &pWal->hdr.aSalt[1]);
+        walIndexWriteHdr(pWal);
+        pInfo->nBackfill = 0;
+        memset((void*)&pInfo->aReadMark[1], 0,
+               sizeof(pInfo->aReadMark)-sizeof(u32));
+        rc = sqlite3OsTruncate(pWal->pDbFd, 
+                               ((i64)pWal->hdr.nPage*(i64)pWal->szPage));
+        walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
+      }
+    }
+    walUnlockShared(pWal, WAL_READ_LOCK(0));
+    pWal->readLock = -1;
+    do{
+      int notUsed;
+      rc = walTryBeginRead(pWal, &notUsed, 1);
+    }while( rc==WAL_RETRY );
+  }
+  walIndexUnmap(pWal);
+  return rc;
+}
+
+/*
+** End a write transaction.  The commit has already been done.  This
+** routine merely releases the lock.
+*/
+int sqlite3WalEndWriteTransaction(Wal *pWal){
+  walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
+  pWal->writeLock = 0;
+  return SQLITE_OK;
 }
 
 /*
 ** If any data has been written (but not committed) to the log file, this
 ** function moves the write-pointer back to the start of the transaction.
 **
 ** Additionally, the callback function is invoked for each frame written
-** to the log since the start of the transaction. If the callback returns
+** to the WAL since the start of the transaction. If the callback returns
 ** other than SQLITE_OK, it is not invoked again and the error code is
 ** returned to the caller.
 **
 ** Otherwise, if the callback function does not return an error, this
 ** function returns SQLITE_OK.
 */
 int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
   int rc = SQLITE_OK;
-  if( pWal->lockState==SQLITE_SHM_WRITE ){
+  if( pWal->writeLock ){
     int unused;
     Pgno iMax = pWal->hdr.mxFrame;
     Pgno iFrame;
   
     assert( pWal->pWiData==0 );
@@ -1670,11 +2101,11 @@
     if( rc==SQLITE_OK ){
       rc = walIndexMap(pWal, walMappingSize(iMax));
     }
     if( rc==SQLITE_OK ){
       for(iFrame=pWal->hdr.mxFrame+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){
-        assert( pWal->lockState==SQLITE_SHM_WRITE );
+        assert( pWal->writeLock );
         rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
       }
       walCleanupHash(pWal);
     }
     walIndexUnmap(pWal);
@@ -1687,11 +2118,11 @@
 ** values. This function populates the array with values required to 
 ** "rollback" the write position of the WAL handle back to the current 
 ** point in the event of a savepoint rollback (via WalSavepointUndo()).
 */
 void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){
-  assert( pWal->lockState==SQLITE_SHM_WRITE );
+  assert( pWal->writeLock );
   aWalData[0] = pWal->hdr.mxFrame;
   aWalData[1] = pWal->hdr.aFrameCksum[0];
   aWalData[2] = pWal->hdr.aFrameCksum[1];
 }
 
@@ -1701,11 +2132,11 @@
 ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated
 ** by a call to WalSavepoint().
 */
 int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
   int rc = SQLITE_OK;
-  assert( pWal->lockState==SQLITE_SHM_WRITE );
+  assert( pWal->writeLock );
 
   assert( aWalData[0]<=pWal->hdr.mxFrame );
   if( aWalData[0]<pWal->hdr.mxFrame ){
     rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
     pWal->hdr.mxFrame = aWalData[0];
@@ -1737,12 +2168,19 @@
   PgHdr *p;                       /* Iterator to run through pList with. */
   PgHdr *pLast = 0;               /* Last frame in list */
   int nLast = 0;                  /* Number of extra copies of last page */
 
   assert( pList );
-  assert( pWal->lockState==SQLITE_SHM_WRITE );
+  assert( pWal->writeLock );
   assert( pWal->pWiData==0 );
+
+#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
+  { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
+    WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
+              pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));
+  }
+#endif
 
   /* If this is the first frame written into the log, write the WAL
   ** header to the start of the WAL file. See comments at the top of
   ** this source file for a description of the WAL header format.
   */
@@ -1755,10 +2193,11 @@
     pWal->szPage = szPage;
     pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;
     sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);
     memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);
     rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
+    WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
     if( rc!=SQLITE_OK ){
       return rc;
     }
     walChecksumBytes(1, aWalHdr, sizeof(aWalHdr), 0, pWal->hdr.aFrameCksum);
   }
@@ -1846,52 +2285,42 @@
       pWal->iCallback = iFrame;
     }
   }
 
   walIndexUnmap(pWal);
+  WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));
   return rc;
 }
 
 /* 
-** Checkpoint the database:
+** This routine is called to implement sqlite3_wal_checkpoint() and
+** related interfaces.
 **
-**   1. Acquire a CHECKPOINT lock
-**   2. Copy the contents of the log into the database file.
-**   3. Zero the wal-index header (so new readers will ignore the log).
-**   4. Drop the CHECKPOINT lock.
+** Obtain a CHECKPOINT lock and then backfill as much information as
+** we can from WAL into the database.
 */
 int sqlite3WalCheckpoint(
   Wal *pWal,                      /* Wal connection */
   int sync_flags,                 /* Flags to sync db file with (or 0) */
   int nBuf,                       /* Size of temporary buffer */
-  u8 *zBuf,                       /* Temporary buffer to use */
-  int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
-  void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
+  u8 *zBuf                        /* Temporary buffer to use */
 ){
   int rc;                         /* Return code */
   int isChanged = 0;              /* True if a new wal-index header is loaded */
 
   assert( pWal->pWiData==0 );
-
-  /* Get the CHECKPOINT lock. 
-  **
-  ** Normally, the connection will be in UNLOCK state at this point. But
-  ** if the connection is in exclusive-mode it may still be in READ state
-  ** even though the upper layer has no active read-transaction (because
-  ** WalCloseSnapshot() is not called in exclusive mode). The state will
-  ** be set to UNLOCK when this function returns. This is Ok.
-  */
-  assert( (pWal->lockState==SQLITE_SHM_UNLOCK)
-       || (pWal->lockState==SQLITE_SHM_READ) );
-  walSetLock(pWal, SQLITE_SHM_UNLOCK);
-  do {
-    rc = walSetLock(pWal, SQLITE_SHM_CHECKPOINT);
-  }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
-  if( rc!=SQLITE_OK ){
-    walSetLock(pWal, SQLITE_SHM_UNLOCK);
+  assert( pWal->ckptLock==0 );
+
+  WALTRACE(("WAL%p: checkpoint begins\n", pWal));
+  rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
+  if( rc ){
+    /* Usually this is SQLITE_BUSY meaning that another thread or process
+    ** is already running a checkpoint, or maybe a recovery.  But it might
+    ** also be SQLITE_IOERR. */
     return rc;
   }
+  pWal->ckptLock = 1;
 
   /* Copy data from the log to the database file. */
   rc = walIndexReadHdr(pWal, &isChanged);
   if( rc==SQLITE_OK ){
     rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);
@@ -1906,11 +2335,13 @@
     memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
   }
 
   /* Release the locks. */
   walIndexUnmap(pWal);
-  walSetLock(pWal, SQLITE_SHM_UNLOCK);
+  walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
+  pWal->ckptLock = 0;
+  WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
   return rc;
 }
 
 /* Return the value to pass to a sqlite3_wal_hook callback, the
 ** number of frames in the WAL at the point of the last commit since
@@ -1925,35 +2356,56 @@
   }
   return (int)ret;
 }
 
 /*
-** This function is called to set or query the exclusive-mode flag 
-** associated with the WAL connection passed as the first argument. The
-** exclusive-mode flag should be set to indicate that the caller is
-** holding an EXCLUSIVE lock on the database file (it does this in
-** locking_mode=exclusive mode). If the EXCLUSIVE lock is to be dropped,
-** the flag set by this function should be cleared before doing so.
-**
-** The value of the exclusive-mode flag may only be modified when
-** the WAL connection is in READ state.
-**
-** When the flag is set, this module does not call the VFS xShmLock()
-** method to obtain any locks on the wal-index (as it assumes it
-** has exclusive access to the wal and wal-index files anyhow). It
-** continues to hold (and does not drop) the existing READ lock on
-** the wal-index.
-**
-** To set or clear the flag, the "op" parameter is passed 1 or 0,
-** respectively. To query the flag, pass -1. In all cases, the value
-** returned is the value of the exclusive-mode flag (after its value
-** has been modified, if applicable).
+** This function is called to change the WAL subsystem into or out
+** of locking_mode=EXCLUSIVE.
+**
+** If op is zero, then attempt to change from locking_mode=EXCLUSIVE
+** into locking_mode=NORMAL.  This means that we must acquire a lock
+** on the pWal->readLock byte.  If the WAL is already in locking_mode=NORMAL
+** or if the acquisition of the lock fails, then return 0.  If the
+** transition out of exclusive-mode is successful, return 1.  This
+** operation must occur while the pager is still holding the exclusive
+** lock on the main database file.
+**
+** If op is one, then change from locking_mode=NORMAL into 
+** locking_mode=EXCLUSIVE.  This means that the pWal->readLock must
+** be released.  Return 1 if the transition is made and 0 if the
+** WAL is already in exclusive-locking mode - meaning that this
+** routine is a no-op.  The pager must already hold the exclusive lock
+** on the main database file before invoking this operation.
+**
+** If op is negative, then do a dry-run of the op==1 case but do
+** not actually change anything.  The pager uses this to see if it
+** should acquire the database exclusive lock prior to invoking
+** the op==1 case.
 */
 int sqlite3WalExclusiveMode(Wal *pWal, int op){
-  if( op>=0 ){
-    assert( pWal->lockState==SQLITE_SHM_READ );
-    pWal->exclusiveMode = (u8)op;
+  int rc;
+  assert( pWal->writeLock==0 && pWal->readLock>=0 );
+  if( op==0 ){
+    if( pWal->exclusiveMode ){
+      pWal->exclusiveMode = 0;
+      if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){
+        pWal->exclusiveMode = 1;
+      }
+      rc = pWal->exclusiveMode==0;
+    }else{
+      /* No changes.  Either already in locking_mode=NORMAL or else the 
+      ** acquisition of the read-lock failed.  The pager must continue to
+      ** hold the database exclusive lock. */
+      rc = 0;
+    }
+  }else if( op>0 ){
+    assert( pWal->exclusiveMode==0 );
+    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
+    pWal->exclusiveMode = 1;
+    rc = 1;
+  }else{
+    rc = pWal->exclusiveMode==0;
   }
-  return pWal->exclusiveMode;
+  return rc;
 }
 
 #endif /* #ifndef SQLITE_OMIT_WAL */

Index: src/wal.h
==================================================================
--- src/wal.h
+++ src/wal.h
@@ -18,23 +18,24 @@
 #define _WAL_H_
 
 #include "sqliteInt.h"
 
 #ifdef SQLITE_OMIT_WAL
-# define sqlite3WalOpen(x,y,z)             0
-# define sqlite3WalClose(w,x,y,z)          0
-# define sqlite3WalOpenSnapshot(y,z)       0
-# define sqlite3WalCloseSnapshot(z) 
-# define sqlite3WalRead(v,w,x,y,z)         0
+# define sqlite3WalOpen(x,y,z)                 0
+# define sqlite3WalClose(w,x,y,z)              0
+# define sqlite3WalBeginReadTransaction(y,z)   0
+# define sqlite3WalEndReadTransaction(z)
+# define sqlite3WalRead(v,w,x,y,z)             0
 # define sqlite3WalDbsize(y,z)
-# define sqlite3WalWriteLock(y,z)          0
-# define sqlite3WalUndo(x,y,z)             0
+# define sqlite3WalBeginWriteTransaction(y)    0
+# define sqlite3WalEndWriteTransaction(x)      0
+# define sqlite3WalUndo(x,y,z)                 0
 # define sqlite3WalSavepoint(y,z)
-# define sqlite3WalSavepointUndo(y,z)      0
-# define sqlite3WalFrames(u,v,w,x,y,z)     0
-# define sqlite3WalCheckpoint(u,v,w,x,y,z) 0
-# define sqlite3WalCallback(z)             0
+# define sqlite3WalSavepointUndo(y,z)          0
+# define sqlite3WalFrames(u,v,w,x,y,z)         0
+# define sqlite3WalCheckpoint(u,v,w,x)         0
+# define sqlite3WalCallback(z)                 0
 #else
 
 #define WAL_SAVEPOINT_NDATA 3
 
 /* Connection to a write-ahead log (WAL) file. 
@@ -51,22 +52,23 @@
 ** at an instant in time.  sqlite3WalOpenSnapshot gets a read lock and
 ** preserves the current state even if the other threads or processes
 ** write to or checkpoint the WAL.  sqlite3WalCloseSnapshot() closes the
 ** transaction and releases the lock.
 */
-int sqlite3WalOpenSnapshot(Wal *pWal, int *);
-void sqlite3WalCloseSnapshot(Wal *pWal);
+int sqlite3WalBeginReadTransaction(Wal *pWal, int *);
+void sqlite3WalEndReadTransaction(Wal *pWal);
 
 /* Read a page from the write-ahead log, if it is present. */
 int sqlite3WalRead(Wal *pWal, Pgno pgno, int *pInWal, int nOut, u8 *pOut);
 
 /* Return the size of the database as it existed at the beginning
 ** of the snapshot */
 void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno);
 
 /* Obtain or release the WRITER lock. */
-int sqlite3WalWriteLock(Wal *pWal, int op);
+int sqlite3WalBeginWriteTransaction(Wal *pWal);
+int sqlite3WalEndWriteTransaction(Wal *pWal);
 
 /* Undo any frames written (but not committed) to the log */
 int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx);
 
 /* Return an integer that records the current (uncommitted) write
@@ -83,13 +85,11 @@
 /* Copy pages from the log to the database file */ 
 int sqlite3WalCheckpoint(
   Wal *pWal,                      /* Write-ahead log connection */
   int sync_flags,                 /* Flags to sync db file with (or 0) */
   int nBuf,                       /* Size of buffer nBuf */
-  u8 *zBuf,                       /* Temporary buffer to use */
-  int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
-  void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
+  u8 *zBuf                        /* Temporary buffer to use */
 );
 
 /* Return the value to pass to a sqlite3_wal_hook callback, the
 ** number of frames in the WAL at the point of the last commit since
 ** sqlite3WalCallback() was called.  If no commits have occurred since

Index: test/filectrl.test
==================================================================
--- test/filectrl.test
+++ test/filectrl.test
@@ -32,10 +32,10 @@
   file_control_lasterrno_test db
 } {}
 do_test filectrl-1.5 {
   db close
   sqlite3 db test_control_lockproxy.db
-  file_control_lockproxy_test db
+  file_control_lockproxy_test db [pwd]
 } {}
 db close
 file delete -force .test_control_lockproxy.db-conch test.proxy
 finish_test

Index: test/lock_common.tcl
==================================================================
--- test/lock_common.tcl
+++ test/lock_common.tcl
@@ -45,11 +45,17 @@
     append r $line
   }
 }
 
 proc testfixture_nb_cb {varname chan} {
-  set line [gets $chan]
+  if {[eof $chan]} {
+    append ::tfnb($chan) "ERROR: Child process hung up"
+    set line "OVER"
+  } else {
+    set line [gets $chan]
+  }
+
   if { $line == "OVER" } {
     set $varname $::tfnb($chan)
     unset ::tfnb($chan)
     close $chan
   } else {

Index: test/wal.test
==================================================================
--- test/wal.test
+++ test/wal.test
@@ -569,74 +569,53 @@
   do_test wal-10.$tn.11 {
     sql2 { BEGIN; SELECT * FROM t1 }
   } {1 2 3 4 5 6 7 8 9 10}
   do_test wal-10.$tn.12 {
     catchsql { PRAGMA wal_checkpoint } 
-  } {1 {database is locked}}
+  } {0 {}}   ;# Readers no longer block checkpoints
   do_test wal-10.$tn.13 {
     execsql { INSERT INTO t1 VALUES(11, 12) }
     sql2 {SELECT * FROM t1}
   } {1 2 3 4 5 6 7 8 9 10}
 
-  # Connection [db2] is holding a lock on a snapshot, preventing [db] from
-  # checkpointing the database. Add a busy-handler to [db]. If [db2] completes
-  # its transaction from within the busy-handler, [db] is able to complete
-  # the checkpoint operation.
-  #
-  proc busyhandler x {
-    if {$x==4} { sql2 COMMIT }
-    if {$x<5} { return 0 }
-    return 1
-  }
-  db busy busyhandler
+  # Writers do not block checkpoints any more either.
+  #
   do_test wal-10.$tn.14 {
-    execsql { PRAGMA wal_checkpoint } 
-  } {}
-
-  # Similar to the test above. Except this time, a new read transaction is
-  # started (db3) while the checkpointer is waiting for an old one (db2) to 
-  # finish. The checkpointer can finish, but any subsequent write operations 
-  # must wait until after db3 has closed the read transaction, as db3 is a
-  # "region D" writer.
-  #
-  db busy {}
-  do_test wal-10.$tn.15 {
-    sql2 { BEGIN; SELECT * FROM t1; }
+    catchsql { PRAGMA wal_checkpoint } 
+  } {0 {}}
+
+  # The following series of test cases used to verify another blocking
+  # case in WAL - a case which no longer blocks.
+  #
+  do_test wal-10.$tn.15 {
+    sql2 { COMMIT; BEGIN; SELECT * FROM t1; }
   } {1 2 3 4 5 6 7 8 9 10 11 12}
   do_test wal-10.$tn.16 {
     catchsql { PRAGMA wal_checkpoint } 
-  } {1 {database is locked}}
-  proc busyhandler x {
-    if {$x==3} { sql3 { BEGIN; SELECT * FROM t1 } }
-    if {$x==4} { sql2 COMMIT }
-    if {$x<5}  { return 0 }
-    return 1
-  }
-  db busy busyhandler
+  } {0 {}}
   do_test wal-10.$tn.17 {
     execsql { PRAGMA wal_checkpoint } 
   } {}
   do_test wal-10.$tn.18 {
-    sql3 { SELECT * FROM t1 }
+    sql3 { BEGIN; SELECT * FROM t1 }
   } {1 2 3 4 5 6 7 8 9 10 11 12}
   do_test wal-10.$tn.19 {
     catchsql { INSERT INTO t1 VALUES(13, 14) }
-  } {1 {database is locked}}
+  } {0 {}}
   do_test wal-10.$tn.20 {
     execsql { SELECT * FROM t1 }
-  } {1 2 3 4 5 6 7 8 9 10 11 12}
+  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
   do_test wal-10.$tn.21 {
     sql3 COMMIT
+    sql2 COMMIT
   } {}
   do_test wal-10.$tn.22 {
-    execsql { INSERT INTO t1 VALUES(13, 14) }
     execsql { SELECT * FROM t1 }
   } {1 2 3 4 5 6 7 8 9 10 11 12 13 14}
 
-  # Set [db3] up as a "region D" reader again. Then upgrade it to a writer
-  # and back down to a reader. Then, check that a checkpoint is not possible
-  # (as [db3] still has a snapshot locked).
+  # Another series of tests that used to demonstrate blocking behavior
+  # but which now work.
   #
   do_test wal-10.$tn.23 {
     execsql { PRAGMA wal_checkpoint }
   } {}
   do_test wal-10.$tn.24 {
@@ -645,27 +624,25 @@
   do_test wal-10.$tn.25 {
     execsql { PRAGMA wal_checkpoint }
   } {}
   do_test wal-10.$tn.26 {
     catchsql { INSERT INTO t1 VALUES(15, 16) }
-  } {1 {database is locked}}
+  } {0 {}}
   do_test wal-10.$tn.27 {
-    sql3 { INSERT INTO t1 VALUES(15, 16) }
+    sql3 { INSERT INTO t1 VALUES(17, 18) }
   } {}
   do_test wal-10.$tn.28 {
     code3 {
       set ::STMT [sqlite3_prepare db3 "SELECT * FROM t1" -1 TAIL]
       sqlite3_step $::STMT
     }
-    sql3 COMMIT
     execsql { SELECT * FROM t1 }
-  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16}
-  db busy {}
+  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18}
   do_test wal-10.$tn.29 {
-    execsql { INSERT INTO t1 VALUES(17, 18) }
+    execsql { INSERT INTO t1 VALUES(19, 20) }
     catchsql { PRAGMA wal_checkpoint }
-  } {1 {database is locked}}
+  } {0 {}}
   do_test wal-10.$tn.30 {
     code3 { sqlite3_finalize $::STMT }
     execsql { PRAGMA wal_checkpoint }
   } {}
 
@@ -672,24 +649,25 @@
   # At one point, if a reader failed to upgrade to a writer because it
   # was reading an old snapshot, the write-locks were not being released.
   # Test that this bug has been fixed.
   #
   do_test wal-10.$tn.31 {
+    sql2 COMMIT
     execsql { BEGIN ; SELECT * FROM t1 }
-    sql2 { INSERT INTO t1 VALUES(19, 20) }
-    catchsql { INSERT INTO t1 VALUES(21, 22) }
+    sql2 { INSERT INTO t1 VALUES(21, 22) }
+    catchsql { INSERT INTO t1 VALUES(23, 24) }
   } {1 {database is locked}}
   do_test wal-10.$tn.32 {
     # This statement would fail when the bug was present.
-    sql2 { INSERT INTO t1 VALUES(21, 22) }
+    sql2 { INSERT INTO t1 VALUES(23, 24) }
   } {}
   do_test wal-10.$tn.33 {
     execsql { SELECT * FROM t1 ; COMMIT }
-  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18}
+  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20}
   do_test wal-10.$tn.34 {
     execsql { SELECT * FROM t1 }
-  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22}
+  } {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24}
 
   # Test that if a checkpointer cannot obtain the required locks, it
   # releases all locks before returning a busy error.
   #
   do_test wal-10.$tn.35 {
@@ -701,15 +679,13 @@
     sql2 {
       BEGIN;
         SELECT * FROM t1;
     }
   } {a b c d}
-  proc busyhandler x { return 1 }
-  db busy busyhandler
   do_test wal-10.$tn.36 {
     catchsql { PRAGMA wal_checkpoint }
-  } {1 {database is locked}}
+  } {0 {}}
   do_test wal-10.$tn.36 {
     sql3 { INSERT INTO t1 VALUES('e', 'f') }
     sql2 { SELECT * FROM t1 }
   } {a b c d}
   do_test wal-10.$tn.37 {
@@ -1057,12 +1033,13 @@
 } {SQLITE_LOCKED}
 do_test wal-15.3.3 {
   sqlite3_errmsg db
 } {database table is locked}
 
-# Also test that an error is returned if the db cannot be checkpointed
-# because of locks held by another connection.
+# Earlier versions returned an error if the db could not be
+# checkpointed because of locks held by another connection. Check that
+# this is no longer the case.
 #
 sqlite3 db2 test.db
 do_test wal-15.4.1 {
   execsql {
     BEGIN;
@@ -1070,14 +1047,14 @@
   } db2
 } {1 2}
 do_test wal-15.4.2 {
   execsql { COMMIT }
   sqlite3_wal_checkpoint db
-} {SQLITE_BUSY}
+} {SQLITE_OK}
 do_test wal-15.4.3 {
   sqlite3_errmsg db
-} {database is locked}
+} {not an error}
 
 # After [db2] drops its lock, [db] may checkpoint the db.
 #
 do_test wal-15.4.4 {
   execsql { COMMIT } db2

Index: test/wal2.test
==================================================================
--- test/wal2.test
+++ test/wal2.test
@@ -19,21 +19,22 @@
 ifcapable !wal {finish_test ; return }
 
 proc set_tvfs_hdr {file args} {
 
   # Set $nHdr to the number of bytes in the wal-index header:
-  set nHdr 80
+  set nHdr 40
   set nInt [expr {$nHdr/4}]
 
   if {[llength $args]>1} {
     return -code error {wrong # args: should be "set_tvfs_hdr fileName ?val?"}
   }
 
   set blob [tvfs shm $file]
+
   if {[llength $args]} {
     set ia [lindex $args 0]
-    set tail [string range $blob [expr $nHdr*2] end]
+    binary scan $blob a[expr $nHdr*2]a* dummy tail
     set blob [binary format i${nInt}i${nInt}a* $ia $ia $tail]
     tvfs shm $file $blob
   }
 
   binary scan $blob i${nInt} ints
@@ -90,23 +91,32 @@
 } {4 10}
 do_test wal2-1.1 {
   execsql { SELECT count(a), sum(a) FROM t1 } db2
 } {4 10}
 
-foreach {tn iInsert res wal_index_hdr_mod wal_locks} {
-         2    5   {5 15}    0             {READ RECOVER READ UNLOCK}
-         3    6   {6 21}    1             {READ RECOVER READ UNLOCK}
-         4    7   {7 28}    2             {READ RECOVER READ UNLOCK}
-         5    8   {8 36}    3             {READ RECOVER READ UNLOCK}
-         6    9   {9 45}    4             {READ RECOVER READ UNLOCK}
-         7   10   {10 55}   5             {READ RECOVER READ UNLOCK}
-         8   11   {11 66}   6             {READ RECOVER READ UNLOCK}
-         9   12   {12 78}   7             {READ RECOVER READ UNLOCK}
-        10   13   {13 91}   8             {READ RECOVER READ UNLOCK}
-        11   14   {14 105}  9             {READ RECOVER READ UNLOCK}
-        12   15   {15 120}  -1            {READ UNLOCK}
-} {
+set RECOVER [list                                      \
+  {0 1 lock exclusive}   {1 7 lock exclusive}          \
+  {1 7 unlock exclusive} {0 1 unlock exclusive}        \
+]
+set READ [list                                         \
+  {4 1 lock exclusive} {4 1 unlock exclusive}          \
+  {4 1 lock shared}    {4 1 unlock shared}             \
+]
+
+foreach {tn iInsert res wal_index_hdr_mod wal_locks} "
+         2    5   {5 15}    0             {$RECOVER $READ}
+         3    6   {6 21}    1             {$RECOVER $READ}
+         4    7   {7 28}    2             {$RECOVER $READ}
+         5    8   {8 36}    3             {$RECOVER $READ}
+         6    9   {9 45}    4             {$RECOVER $READ}
+         7   10   {10 55}   5             {$RECOVER $READ}
+         8   11   {11 66}   6             {$RECOVER $READ}
+         9   12   {12 78}   7             {$RECOVER $READ}
+        10   13   {13 91}   8             {$RECOVER $READ}
+        11   14   {14 105}  9             {$RECOVER $READ}
+        12   15   {15 120}  -1            {$READ}
+" {
 
   do_test wal2-1.$tn.1 {
     execsql { INSERT INTO t1 VALUES($iInsert) }
 
     set ::locks [list]
@@ -117,11 +127,10 @@
         set ::cb_done 1
         if {$::wal_index_hdr_mod >= 0} {
           incr_tvfs_hdr [lindex $args 0] $::wal_index_hdr_mod 1
         }
       }
-
       if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] }
       return SQLITE_OK
     }
 
     execsql { SELECT count(a), sum(a) FROM t1 } db2
@@ -148,10 +157,15 @@
 #
 # After this, the header is corrupted again and the reader is allowed
 # to run recovery. This time, it sees an up-to-date snapshot of the
 # database file.
 #
+set WRITER [list 0 1 lock exclusive]
+set LOCKS  [list \
+  {0 1 lock exclusive} {0 1 unlock exclusive} \
+  {4 1 lock shared}    {4 1 unlock shared}    \
+]
 do_test wal2-2.0 {
 
   testvfs tvfs tvfs_cb
   proc tvfs_cb {method args} {
     if {$method == "xShmOpen"} { set ::shm_file [lindex $args 0] }
@@ -204,11 +218,11 @@
         }
       }
       if {$method == "xShmLock"} {
         set lock [lindex $args 2]
         lappend ::locks $lock
-        if {$lock == "RECOVER"} {
+        if {$lock == $::WRITER} {
           set_tvfs_hdr $::shm_file $::oldhdr
         }
       }
       return SQLITE_OK
     }
@@ -216,11 +230,11 @@
     execsql { SELECT count(a), sum(a) FROM t1 } db2
   } $res0
 
   do_test wal2-2.$tn.3 {
     set ::locks
-  } {READ RECOVER READ UNLOCK}
+  } $LOCKS
 
   do_test wal2-2.$tn.4 {
     set ::locks [list]
     set ::cb_done 0
     proc tvfs_cb {method args} {
@@ -243,10 +257,12 @@
 db close
 db2 close
 tvfs delete
 file delete -force test.db test.db-wal test.db-journal
 
+
+if 0 {
 #-------------------------------------------------------------------------
 # This test case - wal2-3.* - tests the response of the library to an
 # SQLITE_BUSY when attempting to obtain a READ or RECOVER lock.
 #
 #   wal2-3.0 - 2: SQLITE_BUSY when obtaining a READ lock
@@ -312,10 +328,12 @@
   list [info exists ::sabotage] [info exists ::locked]
 } {0 0}
 db close
 tvfs delete
 file delete -force test.db test.db-wal test.db-journal
+
+}
 
 #-------------------------------------------------------------------------
 # Test that a database connection using a VFS that does not support the
 # xShmXXX interfaces cannot open a WAL database.
 #
@@ -347,10 +365,23 @@
 
 #-------------------------------------------------------------------------
 # Test that if a database connection is forced to run recovery before it
 # can perform a checkpoint, it does not transition into RECOVER state.
 #
+# UPDATE: This has now changed. When running a checkpoint, if recovery is
+# required the client grabs all exclusive locks (just as it would for a
+# recovery performed as a precursor to a normal database transaction).
+#
+set expected_locks [list]
+lappend expected_locks {1 1 lock exclusive}   ;# Lock checkpoint
+lappend expected_locks {0 1 lock exclusive}   ;# Lock writer
+lappend expected_locks {2 6 lock exclusive}   ;# Lock recovery & all aReadMark[]
+lappend expected_locks {2 6 unlock exclusive} ;# Unlock recovery & aReadMark[]
+lappend expected_locks {0 1 unlock exclusive} ;# Unlock writer
+lappend expected_locks {3 1 lock exclusive}   ;# Lock aReadMark[0]
+lappend expected_locks {3 1 unlock exclusive} ;# Unlock aReadMark[0]
+lappend expected_locks {1 1 unlock exclusive} ;# Unlock checkpoint
 do_test wal2-5.1 {
   proc tvfs_cb {method args} {
     set ::shm_file [lindex $args 0]
     if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] }
     return $::tvfs_cb_return
@@ -368,11 +399,11 @@
 
   incr_tvfs_hdr $::shm_file 1 1
   set ::locks [list]
   execsql { PRAGMA wal_checkpoint }
   set ::locks
-} {CHECKPOINT UNLOCK}
+} $expected_locks
 db close
 tvfs delete
 
 #-------------------------------------------------------------------------
 # This block, test cases wal2-6.*, tests the operation of WAL with
@@ -533,58 +564,132 @@
 do_test wal2-6.3.7 {
   execsql { PRAGMA lock_status }
 } {main exclusive temp closed}
 db close
 
+
+# This test - wal2-6.4.* - uses a single database connection and the
+# [testvfs] instrumentation to test that xShmLock() is being called
+# as expected when a WAL database is used with locking_mode=exclusive.
+#
 do_test wal2-6.4.1 {
   file delete -force test.db test.db-wal test.db-journal
   proc tvfs_cb {method args} {
     set ::shm_file [lindex $args 0]
     if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] }
     return "SQLITE_OK"
   }
   testvfs tvfs tvfs_cb
   sqlite3 db test.db -vfs tvfs
-
-  execsql {
-    PRAGMA journal_mode = WAL;
-    CREATE TABLE t1(x);
-    INSERT INTO t1 VALUES('Leonard');
-    INSERT INTO t1 VALUES('Arthur');
-  }
-
-  set ::locks [list]
-  execsql { PRAGMA locking_mode = exclusive }
-  set ::locks
-} {}
-do_test wal2-6.4.2 {
-  execsql { SELECT * FROM t1 }
-} {Leonard Arthur}
-do_test wal2-6.4.3 {
-  set ::locks
-} {READ}
-do_test wal2-6.4.4 {
-  execsql { 
-    INSERT INTO t1 VALUES('Julius Henry');
-    SELECT * FROM t1;
-  }
-} {Leonard Arthur {Julius Henry}}
-do_test wal2-6.4.5 {
-  set ::locks
-} {READ}
-do_test wal2-6.4.6 {
-  execsql {
-    PRAGMA locking_mode = NORMAL;
-    DELETE FROM t1;
-  }
-  set ::locks
-} {READ UNLOCK}
-do_test wal2-6.4.7 {
-  set ::locks [list]
-  execsql { INSERT INTO t1 VALUES('Karl') }
-  set ::locks
-} {READ WRITE READ UNLOCK}
+} {}
+
+set RECOVERY {
+  {0 1 lock exclusive} {1 7 lock exclusive} 
+  {1 7 unlock exclusive} {0 1 unlock exclusive}
+}
+set READMARK0_READ {
+  {3 1 lock shared} {3 1 unlock shared}
+}
+set READMARK0_WRITE {
+  {3 1 lock shared} 
+  {0 1 lock exclusive} {3 1 unlock shared} 
+  {4 1 lock exclusive} {4 1 unlock exclusive} {4 1 lock shared} 
+  {0 1 unlock exclusive} {4 1 unlock shared}
+}
+set READMARK1_SET {
+  {4 1 lock exclusive} {4 1 unlock exclusive}
+}
+set READMARK1_READ {
+  {4 1 lock shared} {4 1 unlock shared}
+}
+
+foreach {tn sql res expected_locks} {
+  2 {
+    PRAGMA journal_mode = WAL;
+    BEGIN;
+      CREATE TABLE t1(x);
+      INSERT INTO t1 VALUES('Leonard');
+      INSERT INTO t1 VALUES('Arthur');
+    COMMIT;
+  } {wal} {
+    $RECOVERY 
+    $READMARK0_READ 
+    $READMARK0_WRITE
+  }
+
+  3 {
+    # This test should do the READMARK1_SET locking to populate the 
+    # aReadMark[1] slot with the current mxFrame value. Followed by
+    # READMARK1_READ to read the database.
+    #
+    SELECT * FROM t1
+  } {Leonard Arthur} {
+    $READMARK1_SET
+    $READMARK1_READ
+  }
+
+  4 {
+    # aReadMark[1] is already set to mxFrame. So just READMARK1_READ
+    # this time, not READMARK1_SET.
+    #
+    SELECT * FROM t1 ORDER BY x
+  } {Arthur Leonard} { 
+    $READMARK1_READ 
+  }
+
+  5 {
+    PRAGMA locking_mode = exclusive
+  } {exclusive} { } 
+
+  6 {
+    INSERT INTO t1 VALUES('Julius Henry');
+    SELECT * FROM t1;
+  } {Leonard Arthur {Julius Henry}} {
+    $READMARK1_READ
+  }
+
+  7 {
+    INSERT INTO t1 VALUES('Karl');
+    SELECT * FROM t1;
+  } {Leonard Arthur {Julius Henry} Karl} { }
+
+  8 {
+    PRAGMA locking_mode = normal
+  } {normal} { }
+
+  9 {
+    SELECT * FROM t1 ORDER BY x
+  } {Arthur {Julius Henry} Karl Leonard} { }
+
+  10 {
+    DELETE FROM t1
+  } {} {
+    $READMARK1_READ
+  }
+
+  11 {
+    SELECT * FROM t1
+  } {} {
+    $READMARK1_SET
+    $READMARK1_READ
+  }
+} {
+
+  set L [list]
+  foreach el [subst $expected_locks] { lappend L $el }
+
+  set S ""
+  foreach sq [split $sql "\n"] { 
+    set sq [string trim $sq]
+    if {[string match {#*} $sq]==0} {append S "$sq\n"}
+  }
+
+  set ::locks [list]
+  do_test wal2-6.4.$tn.1 { execsql $S } $res
+  do_test wal2-6.4.$tn.2 { set ::locks  } $L
+}
+
 db close
 tvfs delete
 
 do_test wal2-6.5.1 {
   sqlite3 db test.db

Index: test/walthread.test
==================================================================
--- test/walthread.test
+++ test/walthread.test
@@ -282,10 +282,16 @@
         INSERT INTO t1 SELECT md5sum(x) FROM t1;
       COMMIT;
     }
   }
 
+  # Turn off auto-checkpoint. Otherwise, an auto-checkpoint run by a
+  # writer may cause the dedicated checkpoint thread to return an
+  # SQLITE_BUSY error.
+  #
+  db eval { PRAGMA wal_autocheckpoint = 0 }
+
   set nRun 0
   while {[tt_continue]} {
     read_transaction
     write_transaction 
     incr nRun
@@ -387,11 +393,14 @@
 } -thread t 10 {
 
   set nextwrite $E(pid)
 
   proc wal_hook {zDb nEntry} {
-    if {$nEntry>10} {db eval {PRAGMA wal_checkpoint}}
+    if {$nEntry>10} { 
+      set rc [catch { db eval {PRAGMA wal_checkpoint} } msg]
+      if {$rc && $msg != "database is locked"} { error $msg }
+    }
     return 0
   }
   db wal_hook wal_hook
 
   while {[tt_continue]} {