/ Check-in [ef3ba7a1]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Initial code for incremental checkpoint in WAL mode. This check-in compiles on unix and runs as long as you do not engage WAL mode. WAL mode crashes and burns. Consider this check-in a baseline implementation for getting the new capability up and running.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | wal-incr-ckpt
Files: files | file ages | folders
SHA1: ef3ba7a17ff90674d702e5694b9e792851ab6998
User & Date: drh 2010-05-30 19:55:16
Context
2010-05-31
01:41
WAL runs but quickly deadlocks. check-in: ace58acb user: drh tags: wal-incr-ckpt
2010-05-30
19:55
Initial code for incremental checkpoint in WAL mode. This check-in compiles on unix and runs as long as you do not engage WAL mode. WAL mode crashes and burns. Consider this check-in a baseline implementation for getting the new capability up and running. check-in: ef3ba7a1 user: drh tags: wal-incr-ckpt
2010-05-29
08:40
Add tests to fkey2.test to check that ON CONFLICT clauses do not affect SQLite's behaviour when an FK constraint is violated. check-in: e9e5b100 user: dan tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/os.c.

   106    106   }
   107    107   int sqlite3OsShmGet(sqlite3_file *id,int reqSize,int *pSize,void volatile **pp){
   108    108     return id->pMethods->xShmGet(id, reqSize, pSize, pp);
   109    109   }
   110    110   int sqlite3OsShmRelease(sqlite3_file *id){
   111    111     return id->pMethods->xShmRelease(id);
   112    112   }
   113         -int sqlite3OsShmLock(sqlite3_file *id, int desiredLock, int *pGotLock){
   114         -  return id->pMethods->xShmLock(id, desiredLock, pGotLock);
          113  +int sqlite3OsShmLock(sqlite3_file *id, int offset, int n, int flags){
          114  +  return id->pMethods->xShmLock(id, offset, n, flags);
   115    115   }
   116    116   void sqlite3OsShmBarrier(sqlite3_file *id){
   117    117     id->pMethods->xShmBarrier(id);
   118    118   }
   119    119   int sqlite3OsShmClose(sqlite3_file *id, int deleteFlag){
   120    120     return id->pMethods->xShmClose(id, deleteFlag);
   121    121   }

Changes to src/os.h.

   243    243   #define SQLITE_FCNTL_DB_UNCHANGED 0xca093fa0
   244    244   int sqlite3OsSectorSize(sqlite3_file *id);
   245    245   int sqlite3OsDeviceCharacteristics(sqlite3_file *id);
   246    246   int sqlite3OsShmOpen(sqlite3_file *id);
   247    247   int sqlite3OsShmSize(sqlite3_file *id, int, int*);
   248    248   int sqlite3OsShmGet(sqlite3_file *id, int, int*, void volatile**);
   249    249   int sqlite3OsShmRelease(sqlite3_file *id);
   250         -int sqlite3OsShmLock(sqlite3_file *id, int, int*);
          250  +int sqlite3OsShmLock(sqlite3_file *id, int, int, int);
   251    251   void sqlite3OsShmBarrier(sqlite3_file *id);
   252    252   int sqlite3OsShmClose(sqlite3_file *id, int);
   253    253   
   254    254   /* 
   255    255   ** Functions for accessing sqlite3_vfs methods 
   256    256   */
   257    257   int sqlite3OsOpen(sqlite3_vfs *, const char *, sqlite3_file*, int, int *);

Changes to src/os_unix.c.

  3164   3164   **
  3165   3165   ** All other fields are read/write.  The unixShm.pFile->mutex must be held
  3166   3166   ** while accessing any read/write fields.
  3167   3167   */
  3168   3168   struct unixShm {
  3169   3169     unixShmNode *pShmNode;     /* The underlying unixShmNode object */
  3170   3170     unixShm *pNext;            /* Next unixShm with the same unixShmNode */
  3171         -  u8 lockState;              /* Current lock state */
  3172   3171     u8 hasMutex;               /* True if holding the unixShmNode mutex */
  3173   3172     u8 hasMutexBuf;            /* True if holding pFile->mutexBuf */
  3174         -  u8 sharedMask;             /* Mask of shared locks held */
  3175         -  u8 exclMask;               /* Mask of exclusive locks held */
         3173  +  u16 sharedMask;            /* Mask of shared locks held */
         3174  +  u16 exclMask;              /* Mask of exclusive locks held */
  3176   3175   #ifdef SQLITE_DEBUG
  3177   3176     u8 id;                     /* Id of this connection within its unixShmNode */
  3178   3177   #endif
  3179   3178   };
  3180   3179   
  3181         -/*
  3182         -** Size increment by which shared memory grows
  3183         -*/
  3184         -#define SQLITE_UNIX_SHM_INCR  4096
  3185         -
  3186   3180   /*
  3187   3181   ** Constants used for locking
  3188   3182   */
  3189   3183   #define UNIX_SHM_BASE      80        /* Byte offset of the first lock byte */
  3190         -#define UNIX_SHM_DMS       0x01      /* Mask for Dead-Man-Switch lock */
  3191         -#define UNIX_SHM_A         0x10      /* Mask for region locks... */
  3192         -#define UNIX_SHM_B         0x20
  3193         -#define UNIX_SHM_C         0x40
  3194         -#define UNIX_SHM_D         0x80
         3184  +#define UNIX_SHM_DMS       80        /* The deadman switch lock */
  3195   3185   
  3196   3186   #ifdef SQLITE_DEBUG
  3197   3187   /*
  3198   3188   ** Return a pointer to a nul-terminated string in static memory that
  3199   3189   ** describes a pair of locking masks.  The string has one character for
  3200   3190   ** each of the SQLITE_SHM_NLOCK locks, lowest-numbered lock first.  The
  3201   3191   ** character is "s" if the lock is set in maskShared, "E" if it is set
  3202   3192   ** in maskExclusive (and not in maskShared), and "." if it is set in
  3203   3193   ** neither mask.
  3204   3194   **
  3205   3195   ** This routine is for debugging purposes only and does not appear
  3206   3196   ** in a production build.
  3207   3197   */
  3208         -static const char *unixShmLockString(u8 mask){
  3209         -  static char zBuf[48];
         3198  +static const char *unixShmLockString(u16 maskShared, u16 maskExclusive){
         3199  +  static char zBuf[52];
  3210   3200     static int iBuf = 0;
         3201  +  int i;
         3202  +  u16 mask;
  3211   3203     char *z;
  3212   3204   
  3213   3205     z = &zBuf[iBuf];
  3214         -  iBuf += 8;
         3206  +  iBuf += 16;
  3215   3207     if( iBuf>=sizeof(zBuf) ) iBuf = 0;
  3216         -
  3217         -  z[0] = (mask & UNIX_SHM_DMS)   ? 'S' : '.';
  3218         -  z[1] = (mask & UNIX_SHM_A)     ? 'A' : '.';
  3219         -  z[2] = (mask & UNIX_SHM_B)     ? 'B' : '.';
  3220         -  z[3] = (mask & UNIX_SHM_C)     ? 'C' : '.';
  3221         -  z[4] = (mask & UNIX_SHM_D)     ? 'D' : '.';
  3222         -  z[5] = 0;
         3208  +  for(i=0, mask=1; i<SQLITE_SHM_NLOCK; i++, mask += mask){
         3209  +    if( mask & maskShared ){
         3210  +      z[i] = 's';
         3211  +    }else if( mask & maskExclusive ){
         3212  +      z[i] = 'E';
         3213  +    }else{
         3214  +      z[i] = '.';
         3215  +    }
         3216  +  }
         3217  +  z[i] = 0;
  3223   3218     return z;
  3224   3219   }
  3225   3220   #endif /* SQLITE_DEBUG */
  3226   3221   
  3227   3222   /*
  3228         -** Apply posix advisory locks for all bytes identified in lockMask.
  3229         -**
  3230         -** lockMask might contain multiple bits but all bits are guaranteed
  3231         -** to be contiguous.
         3223  +** Apply posix advisory locks for all bytes from ofst through ofst+n-1.
  3232   3224   **
  3233   3225   ** Locks block if the mask is exactly UNIX_SHM_C and are non-blocking
  3234   3226   ** otherwise.
  3235   3227   */
  3236   3228   static int unixShmSystemLock(
  3237   3229     unixShmNode *pShmNode, /* Apply locks to this open shared-memory segment */
  3238   3230     int lockType,          /* F_UNLCK, F_RDLCK, or F_WRLCK */
  3239         -  u8 lockMask            /* Which bytes to lock or unlock */
         3231  +  int ofst,              /* First byte of the locking range */
         3232  +  int n                  /* Number of bytes to lock */
  3240   3233   ){
  3241   3234     struct flock f;       /* The posix advisory locking structure */
  3242         -  int lockOp;           /* The opcode for fcntl() */
  3243         -  int i;                /* Offset into the locking byte range */
  3244         -  int rc;               /* Result code form fcntl() */
  3245         -  u8 mask;              /* Mask of bits in lockMask */
         3235  +  int rc = SQLITE_OK;   /* Result code from fcntl() */
  3246   3236   
  3247   3237     /* Access to the unixShmNode object is serialized by the caller */
  3248   3238     assert( sqlite3_mutex_held(pShmNode->mutex) || pShmNode->nRef==0 );
  3249   3239   
         3240  +  /* Shared locks never span more than one byte */
         3241  +  assert( n==1 || lockType!=F_RDLCK );
         3242  +
         3243  +  /* Locks are within range */
         3244  +  assert( n>=1 && ofst>=0 && ofst+n<SQLITE_SHM_NLOCK );
         3245  +
  3250   3246     /* Initialize the locking parameters */
  3251   3247     memset(&f, 0, sizeof(f));
  3252   3248     f.l_type = lockType;
  3253   3249     f.l_whence = SEEK_SET;
  3254         -  if( lockMask==UNIX_SHM_C && lockType!=F_UNLCK ){
  3255         -    lockOp = F_SETLKW;
  3256         -    OSTRACE(("SHM-LOCK requesting blocking lock\n"));
  3257         -  }else{
  3258         -    lockOp = F_SETLK;
  3259         -  }
         3250  +  f.l_start = ofst+UNIX_SHM_BASE;
         3251  +  f.l_len = n;
  3260   3252   
  3261         -  /* Find the first bit in lockMask that is set */
  3262         -  for(i=0, mask=0x01; mask!=0 && (lockMask&mask)==0; mask <<= 1, i++){}
  3263         -  assert( mask!=0 );
  3264         -  f.l_start = i+UNIX_SHM_BASE;
  3265         -  f.l_len = 1;
  3266         -
  3267         -  /* Extend the locking range for each additional bit that is set */
  3268         -  mask <<= 1;
  3269         -  while( mask!=0 && (lockMask & mask)!=0 ){
  3270         -    f.l_len++;
  3271         -    mask <<= 1;
  3272         -  }
  3273         -
  3274         -  /* Verify that all bits set in lockMask are contiguous */
  3275         -  assert( mask==0 || (lockMask & ~(mask | (mask-1)))==0 );
  3276         -
  3277         -  /* Acquire the system-level lock */
  3278         -  rc = fcntl(pShmNode->h, lockOp, &f);
         3253  +  rc = fcntl(pShmNode->h, F_SETLK, &f);
  3279   3254     rc = (rc!=(-1)) ? SQLITE_OK : SQLITE_BUSY;
  3280   3255   
  3281   3256     /* Update the global lock state and do debug tracing */
  3282   3257   #ifdef SQLITE_DEBUG
         3258  +  { u16 mask;
  3283   3259     OSTRACE(("SHM-LOCK "));
         3260  +  mask = (1<<(ofst+n)) - (1<<ofst);
  3284   3261     if( rc==SQLITE_OK ){
  3285   3262       if( lockType==F_UNLCK ){
  3286         -      OSTRACE(("unlock ok"));
  3287         -      pShmNode->exclMask &= ~lockMask;
  3288         -      pShmNode->sharedMask &= ~lockMask;
         3263  +      OSTRACE(("unlock %d ok", ofst));
         3264  +      pShmNode->exclMask &= ~mask;
         3265  +      pShmNode->sharedMask &= ~mask;
  3289   3266       }else if( lockType==F_RDLCK ){
  3290         -      OSTRACE(("read-lock ok"));
  3291         -      pShmNode->exclMask &= ~lockMask;
  3292         -      pShmNode->sharedMask |= lockMask;
         3267  +      OSTRACE(("read-lock %d ok", ofst));
         3268  +      pShmNode->exclMask &= ~mask;
         3269  +      pShmNode->sharedMask |= mask;
  3293   3270       }else{
  3294   3271         assert( lockType==F_WRLCK );
  3295         -      OSTRACE(("write-lock ok"));
  3296         -      pShmNode->exclMask |= lockMask;
  3297         -      pShmNode->sharedMask &= ~lockMask;
         3272  +      OSTRACE(("write-lock %d ok", ofst));
         3273  +      pShmNode->exclMask |= mask;
         3274  +      pShmNode->sharedMask &= ~mask;
  3298   3275       }
  3299   3276     }else{
  3300   3277       if( lockType==F_UNLCK ){
  3301         -      OSTRACE(("unlock failed"));
         3278  +      OSTRACE(("unlock %d failed", ofst));
  3302   3279       }else if( lockType==F_RDLCK ){
  3303   3280         OSTRACE(("read-lock failed"));
  3304   3281       }else{
  3305   3282         assert( lockType==F_WRLCK );
  3306         -      OSTRACE(("write-lock failed"));
         3283  +      OSTRACE(("write-lock %d failed", ofst));
  3307   3284       }
  3308   3285     }
  3309         -  OSTRACE((" - change requested %s - afterwards %s:%s\n",
  3310         -           unixShmLockString(lockMask),
  3311         -           unixShmLockString(pShmNode->sharedMask),
  3312         -           unixShmLockString(pShmNode->exclMask)));
         3286  +  OSTRACE((" - afterwards %s\n",
         3287  +           unixShmLockString(pShmNode->sharedMask, pShmNode->exclMask)));
         3288  +  }
  3313   3289   #endif
  3314   3290   
  3315   3291     return rc;        
  3316   3292   }
  3317   3293   
  3318         -/*
  3319         -** For connection p, unlock all of the locks identified by the unlockMask
  3320         -** parameter.
  3321         -*/
  3322         -static int unixShmUnlock(
  3323         -  unixShmNode *pShmNode,   /* The underlying shared-memory file */
  3324         -  unixShm *p,              /* The connection to be unlocked */
  3325         -  u8 unlockMask            /* Mask of locks to be unlocked */
  3326         -){
  3327         -  int rc;      /* Result code */
  3328         -  unixShm *pX; /* For looping over all sibling connections */
  3329         -  u8 allMask;  /* Union of locks held by connections other than "p" */
  3330         -
  3331         -  /* Access to the unixShmNode object is serialized by the caller */
  3332         -  assert( sqlite3_mutex_held(pShmNode->mutex) );
  3333         -
  3334         -  /* Compute locks held by sibling connections */
  3335         -  allMask = 0;
  3336         -  for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
  3337         -    if( pX==p ) continue;
  3338         -    assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 );
  3339         -    allMask |= pX->sharedMask;
  3340         -  }
  3341         -
  3342         -  /* Unlock the system-level locks */
  3343         -  if( (unlockMask & allMask)!=unlockMask ){
  3344         -    rc = unixShmSystemLock(pShmNode, F_UNLCK, unlockMask & ~allMask);
  3345         -  }else{
  3346         -    rc = SQLITE_OK;
  3347         -  }
  3348         -
  3349         -  /* Undo the local locks */
  3350         -  if( rc==SQLITE_OK ){
  3351         -    p->exclMask &= ~unlockMask;
  3352         -    p->sharedMask &= ~unlockMask;
  3353         -  } 
  3354         -  return rc;
  3355         -}
  3356         -
  3357         -/*
  3358         -** Get reader locks for connection p on all locks in the readMask parameter.
  3359         -*/
  3360         -static int unixShmSharedLock(
  3361         -  unixShmNode *pShmNode,   /* The underlying shared-memory file */
  3362         -  unixShm *p,              /* The connection to get the shared locks */
  3363         -  u8 readMask              /* Mask of shared locks to be acquired */
  3364         -){
  3365         -  int rc;        /* Result code */
  3366         -  unixShm *pX;   /* For looping over all sibling connections */
  3367         -  u8 allShared;  /* Union of locks held by connections other than "p" */
  3368         -
  3369         -  /* Access to the unixShmNode object is serialized by the caller */
  3370         -  assert( sqlite3_mutex_held(pShmNode->mutex) );
  3371         -
  3372         -  /* Find out which shared locks are already held by sibling connections.
  3373         -  ** If any sibling already holds an exclusive lock, go ahead and return
  3374         -  ** SQLITE_BUSY.
  3375         -  */
  3376         -  allShared = 0;
  3377         -  for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
  3378         -    if( pX==p ) continue;
  3379         -    if( (pX->exclMask & readMask)!=0 ) return SQLITE_BUSY;
  3380         -    allShared |= pX->sharedMask;
  3381         -  }
  3382         -
  3383         -  /* Get shared locks at the system level, if necessary */
  3384         -  if( (~allShared) & readMask ){
  3385         -    rc = unixShmSystemLock(pShmNode, F_RDLCK, readMask);
  3386         -  }else{
  3387         -    rc = SQLITE_OK;
  3388         -  }
  3389         -
  3390         -  /* Get the local shared locks */
  3391         -  if( rc==SQLITE_OK ){
  3392         -    p->sharedMask |= readMask;
  3393         -  }
  3394         -  return rc;
  3395         -}
  3396         -
  3397         -/*
  3398         -** For connection p, get an exclusive lock on all locks identified in
  3399         -** the writeMask parameter.
  3400         -*/
  3401         -static int unixShmExclusiveLock(
  3402         -  unixShmNode *pShmNode,    /* The underlying shared-memory file */
  3403         -  unixShm *p,               /* The connection to get the exclusive locks */
  3404         -  u8 writeMask              /* Mask of exclusive locks to be acquired */
  3405         -){
  3406         -  int rc;        /* Result code */
  3407         -  unixShm *pX;   /* For looping over all sibling connections */
  3408         -
  3409         -  /* Access to the unixShmNode object is serialized by the caller */
  3410         -  assert( sqlite3_mutex_held(pShmNode->mutex) );
  3411         -
  3412         -  /* Make sure no sibling connections hold locks that will block this
  3413         -  ** lock.  If any do, return SQLITE_BUSY right away.
  3414         -  */
  3415         -  for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
  3416         -    if( pX==p ) continue;
  3417         -    if( (pX->exclMask & writeMask)!=0 ) return SQLITE_BUSY;
  3418         -    if( (pX->sharedMask & writeMask)!=0 ) return SQLITE_BUSY;
  3419         -  }
  3420         -
  3421         -  /* Get the exclusive locks at the system level.  Then if successful
  3422         -  ** also mark the local connection as being locked.
  3423         -  */
  3424         -  rc = unixShmSystemLock(pShmNode, F_WRLCK, writeMask);
  3425         -  if( rc==SQLITE_OK ){
  3426         -    p->sharedMask &= ~writeMask;
  3427         -    p->exclMask |= writeMask;
  3428         -  }
  3429         -  return rc;
  3430         -}
  3431   3294   
  3432   3295   /*
  3433   3296   ** Purge the unixShmNodeList list of all entries with unixShmNode.nRef==0.
  3434   3297   **
  3435   3298   ** This is not a VFS shared-memory method; it is a utility function called
  3436   3299   ** by VFS shared-memory methods.
  3437   3300   */
................................................................................
  3516   3379         goto shm_open_err;
  3517   3380       }
  3518   3381   
  3519   3382       /* Check to see if another process is holding the dead-man switch.
  3520   3383       ** If not, truncate the file to zero length. 
  3521   3384       */
  3522   3385       rc = SQLITE_OK;
  3523         -    if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS)==SQLITE_OK ){
         3386  +    if( unixShmSystemLock(pShmNode, F_WRLCK, UNIX_SHM_DMS, 1)==SQLITE_OK ){
  3524   3387         if( ftruncate(pShmNode->h, 0) ){
  3525   3388           rc = SQLITE_IOERR;
  3526   3389         }
  3527   3390       }
  3528   3391       if( rc==SQLITE_OK ){
  3529         -      rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS);
         3392  +      rc = unixShmSystemLock(pShmNode, F_RDLCK, UNIX_SHM_DMS, 1);
  3530   3393       }
  3531   3394       if( rc ) goto shm_open_err;
  3532   3395     }
  3533   3396   
  3534   3397     /* Make the new connection a child of the unixShmNode */
  3535   3398     p->pShmNode = pShmNode;
  3536   3399     p->pNext = pShmNode->pFirst;
................................................................................
  3683   3546     unixShm *p = pDbFd->pShm;
  3684   3547     unixShmNode *pShmNode = p->pShmNode;
  3685   3548     int rc = SQLITE_OK;
  3686   3549   
  3687   3550     assert( pShmNode==pDbFd->pInode->pShmNode );
  3688   3551     assert( pShmNode->pInode==pDbFd->pInode );
  3689   3552   
  3690         -  if( p->lockState!=SQLITE_SHM_CHECKPOINT && p->hasMutexBuf==0 ){
         3553  +  if( p->hasMutexBuf==0 ){
  3691   3554       assert( sqlite3_mutex_notheld(pShmNode->mutex) );
  3692   3555       sqlite3_mutex_enter(pShmNode->mutexBuf);
  3693   3556       p->hasMutexBuf = 1;
  3694   3557     }
  3695   3558     sqlite3_mutex_enter(pShmNode->mutex);
  3696   3559     if( pShmNode->szMap==0 || reqMapSize>pShmNode->szMap ){
  3697   3560       int actualSize;
................................................................................
  3727   3590   ** really want to release the lock, so in that case too, this routine
  3728   3591   ** is a no-op.
  3729   3592   */
  3730   3593   static int unixShmRelease(sqlite3_file *fd){
  3731   3594     unixFile *pDbFd = (unixFile*)fd;
  3732   3595     unixShm *p = pDbFd->pShm;
  3733   3596   
  3734         -  if( p->hasMutexBuf && p->lockState!=SQLITE_SHM_RECOVER ){
         3597  +  if( p->hasMutexBuf ){
  3735   3598       assert( sqlite3_mutex_notheld(p->pShmNode->mutex) );
  3736   3599       sqlite3_mutex_leave(p->pShmNode->mutexBuf);
  3737   3600       p->hasMutexBuf = 0;
  3738   3601     }
  3739   3602     return SQLITE_OK;
  3740   3603   }
  3741   3604   
  3742         -/*
  3743         -** Symbolic names for LOCK states used for debugging.
  3744         -*/
  3745         -#ifdef SQLITE_DEBUG
  3746         -static const char *azLkName[] = {
  3747         -  "UNLOCK",
  3748         -  "READ",
  3749         -  "READ_FULL",
  3750         -  "WRITE",
  3751         -  "PENDING",
  3752         -  "CHECKPOINT",
  3753         -  "RECOVER"
  3754         -};
  3755         -#endif
  3756         -
  3757   3605   
  3758   3606   /*
  3759   3607   ** Change the lock state for a shared-memory segment.
  3760   3608   */
  3761   3609   static int unixShmLock(
  3762   3610     sqlite3_file *fd,          /* Database file holding the shared memory */
  3763         -  int desiredLock,           /* One of SQLITE_SHM_xxxxx locking states */
  3764         -  int *pGotLock              /* The lock you actually got */
         3611  +  int ofst,                  /* First lock to acquire or release */
         3612  +  int n,                     /* Number of locks to acquire or release */
         3613  +  int flags                  /* What to do with the lock */
  3765   3614   ){
  3766         -  unixFile *pDbFd = (unixFile*)fd;
  3767         -  unixShm *p = pDbFd->pShm;
  3768         -  unixShmNode *pShmNode = p->pShmNode;
  3769         -  int rc = SQLITE_PROTOCOL;
         3615  +  unixFile *pDbFd = (unixFile*)fd;      /* Connection holding shared memory */
         3616  +  unixShm *p = pDbFd->pShm;             /* The shared memory being locked */
         3617  +  unixShm *pX;                          /* For looping over all siblings */
         3618  +  unixShmNode *pShmNode = p->pShmNode;  /* The underlying file iNode */
         3619  +  int rc = SQLITE_OK;                   /* Result code */
         3620  +  u16 mask;                             /* Mask of locks to take or release */
  3770   3621   
  3771   3622     assert( pShmNode==pDbFd->pInode->pShmNode );
  3772   3623     assert( pShmNode->pInode==pDbFd->pInode );
         3624  +  assert( ofst>=0 && ofst+n<SQLITE_SHM_NLOCK );
         3625  +  assert( n>=1 );
         3626  +  assert( flags==(SQLITE_SHM_LOCK | SQLITE_SHM_SHARED)
         3627  +       || flags==(SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE)
         3628  +       || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED)
         3629  +       || flags==(SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE) );
         3630  +  assert( n==1 || (flags & SQLITE_SHM_EXCLUSIVE)!=0 );
  3773   3631   
  3774         -  /* Note that SQLITE_SHM_READ_FULL and SQLITE_SHM_PENDING are never
  3775         -  ** directly requested; they are side effects from requesting
  3776         -  ** SQLITE_SHM_READ and SQLITE_SHM_CHECKPOINT, respectively.
  3777         -  */
  3778         -  assert( desiredLock==SQLITE_SHM_UNLOCK
  3779         -       || desiredLock==SQLITE_SHM_READ
  3780         -       || desiredLock==SQLITE_SHM_WRITE
  3781         -       || desiredLock==SQLITE_SHM_CHECKPOINT
  3782         -       || desiredLock==SQLITE_SHM_RECOVER );
  3783         -
  3784         -  /* Return directly if this is just a lock state query, or if
  3785         -  ** the connection is already in the desired locking state.
  3786         -  */
  3787         -  if( desiredLock==p->lockState
  3788         -   || (desiredLock==SQLITE_SHM_READ && p->lockState==SQLITE_SHM_READ_FULL)
  3789         -  ){
  3790         -    OSTRACE(("SHM-LOCK shmid-%d, pid-%d request %s and got %s\n",
  3791         -             p->id, getpid(), azLkName[desiredLock], azLkName[p->lockState]));
  3792         -    if( pGotLock ) *pGotLock = p->lockState;
  3793         -    return SQLITE_OK;
  3794         -  }
  3795         -
  3796         -  OSTRACE(("SHM-LOCK shmid-%d, pid-%d request %s->%s\n",
  3797         -            p->id, getpid(), azLkName[p->lockState], azLkName[desiredLock]));
  3798         -  
  3799         -  if( desiredLock==SQLITE_SHM_RECOVER && !p->hasMutexBuf ){
  3800         -    assert( sqlite3_mutex_notheld(pShmNode->mutex) );
  3801         -    sqlite3_mutex_enter(pShmNode->mutexBuf);
  3802         -    p->hasMutexBuf = 1;
  3803         -  }
         3632  +  mask = (1<<(ofst+n+1)) - (1<<(ofst+1));
         3633  +  assert( n>1 || mask==(1<<ofst) );
  3804   3634     sqlite3_mutex_enter(pShmNode->mutex);
  3805         -  switch( desiredLock ){
  3806         -    case SQLITE_SHM_UNLOCK: {
  3807         -      assert( p->lockState!=SQLITE_SHM_RECOVER );
  3808         -      unixShmUnlock(pShmNode, p, UNIX_SHM_A|UNIX_SHM_B|UNIX_SHM_C|UNIX_SHM_D);
         3635  +  if( flags & SQLITE_SHM_UNLOCK ){
         3636  +    u16 allMask = 0; /* Mask of locks held by siblings */
         3637  +
         3638  +    /* See if any siblings hold this same lock */
         3639  +    for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
         3640  +      if( pX==p ) continue;
         3641  +      assert( (pX->exclMask & (p->exclMask|p->sharedMask))==0 );
         3642  +      allMask |= pX->sharedMask;
         3643  +    }
         3644  +
         3645  +    /* Unlock the system-level locks */
         3646  +    if( (mask & allMask)==0 ){
         3647  +      rc = unixShmSystemLock(pShmNode, F_UNLCK, ofst+1, n);
         3648  +    }else{
  3809   3649         rc = SQLITE_OK;
  3810         -      p->lockState = SQLITE_SHM_UNLOCK;
  3811         -      break;
  3812   3650       }
  3813         -    case SQLITE_SHM_READ: {
  3814         -      if( p->lockState==SQLITE_SHM_UNLOCK ){
  3815         -        int nAttempt;
         3651  +
         3652  +    /* Undo the local locks */
         3653  +    if( rc==SQLITE_OK ){
         3654  +      p->exclMask &= ~mask;
         3655  +      p->sharedMask &= ~mask;
         3656  +    } 
         3657  +  }else if( flags & SQLITE_SHM_SHARED ){
         3658  +    u16 allShared = 0;  /* Union of locks held by connections other than "p" */
         3659  +
         3660  +    /* Find out which shared locks are already held by sibling connections.
         3661  +    ** If any sibling already holds an exclusive lock, go ahead and return
         3662  +    ** SQLITE_BUSY.
         3663  +    */
         3664  +    for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
         3665  +      if( pX==p ) continue;
         3666  +      if( (pX->exclMask & mask)!=0 ){
  3816   3667           rc = SQLITE_BUSY;
  3817         -        assert( p->lockState==SQLITE_SHM_UNLOCK );
  3818         -        for(nAttempt=0; nAttempt<5 && rc==SQLITE_BUSY; nAttempt++){
  3819         -          rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_A|UNIX_SHM_B);
  3820         -          if( rc==SQLITE_BUSY ){
  3821         -            rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_D);
  3822         -            if( rc==SQLITE_OK ){
  3823         -              p->lockState = SQLITE_SHM_READ_FULL;
  3824         -            }
  3825         -          }else{
  3826         -            unixShmUnlock(pShmNode, p, UNIX_SHM_B);
  3827         -            p->lockState = SQLITE_SHM_READ;
  3828         -          }
  3829         -        }
         3668  +        break;
         3669  +      }
         3670  +      allShared |= pX->sharedMask;
         3671  +    }
         3672  +
         3673  +    /* Get shared locks at the system level, if necessary */
         3674  +    if( rc==SQLITE_OK ){
         3675  +      if( (allShared & mask)==0 ){
         3676  +        rc = unixShmSystemLock(pShmNode, F_RDLCK, ofst+1, n);
  3830   3677         }else{
  3831         -       assert( p->lockState==SQLITE_SHM_WRITE
  3832         -               || p->lockState==SQLITE_SHM_RECOVER );
  3833         -        rc = unixShmSharedLock(pShmNode, p, UNIX_SHM_A);
  3834         -        unixShmUnlock(pShmNode, p, UNIX_SHM_C|UNIX_SHM_D);
  3835         -        p->lockState = SQLITE_SHM_READ;
         3678  +        rc = SQLITE_OK;
  3836   3679         }
  3837         -      break;
  3838   3680       }
  3839         -    case SQLITE_SHM_WRITE: {
  3840         -      assert( p->lockState==SQLITE_SHM_READ 
  3841         -              || p->lockState==SQLITE_SHM_READ_FULL );
  3842         -      rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_C|UNIX_SHM_D);
         3681  +
         3682  +    /* Get the local shared locks */
         3683  +    if( rc==SQLITE_OK ){
         3684  +      p->sharedMask |= mask;
         3685  +    }
         3686  +  }else{
         3687  +    /* Make sure no sibling connections hold locks that will block this
         3688  +    ** lock.  If any do, return SQLITE_BUSY right away.
         3689  +    */
         3690  +    for(pX=pShmNode->pFirst; pX; pX=pX->pNext){
         3691  +      if( pX==p ) continue;
         3692  +      if( (pX->exclMask & mask)!=0 || (pX->sharedMask & mask)!=0 ){
         3693  +        rc = SQLITE_BUSY;
         3694  +        break;
         3695  +      }
         3696  +    }
         3697  +  
         3698  +    /* Get the exclusive locks at the system level.  Then if successful
         3699  +    ** also mark the local connection as being locked.
         3700  +    */
         3701  +    if( rc==SQLITE_OK ){
         3702  +      rc = unixShmSystemLock(pShmNode, F_WRLCK, ofst+1, n);
  3843   3703         if( rc==SQLITE_OK ){
  3844         -        p->lockState = SQLITE_SHM_WRITE;
         3704  +        p->sharedMask &= ~mask;
         3705  +        p->exclMask |= mask;
  3845   3706         }
  3846         -      break;
  3847         -    }
  3848         -    case SQLITE_SHM_CHECKPOINT: {
  3849         -      assert( p->lockState==SQLITE_SHM_UNLOCK
  3850         -           || p->lockState==SQLITE_SHM_PENDING
  3851         -      );
  3852         -      if( p->lockState==SQLITE_SHM_UNLOCK ){
  3853         -        rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_B|UNIX_SHM_C);
  3854         -        if( rc==SQLITE_OK ){
  3855         -          p->lockState = SQLITE_SHM_PENDING;
  3856         -        }
  3857         -      }
  3858         -      if( p->lockState==SQLITE_SHM_PENDING ){
  3859         -        rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_A);
  3860         -        if( rc==SQLITE_OK ){
  3861         -          p->lockState = SQLITE_SHM_CHECKPOINT;
  3862         -        }
  3863         -      }
  3864         -      break;
  3865         -    }
  3866         -    default: {
  3867         -      assert( desiredLock==SQLITE_SHM_RECOVER );
  3868         -      assert( p->lockState==SQLITE_SHM_READ
  3869         -           || p->lockState==SQLITE_SHM_READ_FULL
  3870         -      );
  3871         -      assert( sqlite3_mutex_held(pShmNode->mutexBuf) );
  3872         -      rc = unixShmExclusiveLock(pShmNode, p, UNIX_SHM_C);
  3873         -      if( rc==SQLITE_OK ){
  3874         -        p->lockState = SQLITE_SHM_RECOVER;
  3875         -      }
  3876         -      break;
  3877   3707       }
  3878   3708     }
  3879   3709     sqlite3_mutex_leave(pShmNode->mutex);
  3880   3710     OSTRACE(("SHM-LOCK shmid-%d, pid-%d got %s\n",
  3881         -           p->id, getpid(), azLkName[p->lockState]));
  3882         -  if( pGotLock ) *pGotLock = p->lockState;
         3711  +           p->id, getpid(), unixShmLockString(p->sharedMask, p->exclMask)));
  3883   3712     return rc;
  3884   3713   }
  3885   3714   
  3886   3715   /*
  3887   3716   ** Implement a memory barrier or memory fence on shared memory.  
  3888   3717   **
  3889   3718   ** All loads and stores begun before the barrier must complete before

Changes to src/pager.c.

  1199   1199     return (pPager->pWal!=0);
  1200   1200   }
  1201   1201   #else
  1202   1202   # define pagerUseWal(x) 0
  1203   1203   # define pagerRollbackWal(x) 0
  1204   1204   # define pagerWalFrames(v,w,x,y,z) 0
  1205   1205   # define pagerOpenWalIfPresent(z) SQLITE_OK
  1206         -# define pagerOpenSnapshot(z) SQLITE_OK
         1206  +# define pagerBeginReadTransaction(z) SQLITE_OK
  1207   1207   #endif
  1208   1208   
  1209   1209   /*
  1210   1210   ** Unlock the database file. This function is a no-op if the pager
  1211   1211   ** is in exclusive mode.
  1212   1212   **
  1213   1213   ** If the pager is currently in error state, discard the contents of 
................................................................................
  1234   1234       ** this happens.  One can argue that this doesn't need to be cleared
  1235   1235       ** until the change-counter check fails in PagerSharedLock().
  1236   1236       ** Clearing the page size cache here is being conservative.
  1237   1237       */
  1238   1238       pPager->dbSizeValid = 0;
  1239   1239   
  1240   1240       if( pagerUseWal(pPager) ){
  1241         -      sqlite3WalCloseSnapshot(pPager->pWal);
         1241  +      sqlite3WalEndReadTransaction(pPager->pWal);
  1242   1242       }else{
  1243   1243         rc = osUnlock(pPager->fd, NO_LOCK);
  1244   1244       }
  1245   1245       if( rc ){
  1246   1246         pPager->errCode = rc;
  1247   1247       }
  1248   1248       IOTRACE(("UNLOCK %p\n", pPager))
................................................................................
  1433   1433     }
  1434   1434     sqlite3BitvecDestroy(pPager->pInJournal);
  1435   1435     pPager->pInJournal = 0;
  1436   1436     pPager->nRec = 0;
  1437   1437     sqlite3PcacheCleanAll(pPager->pPCache);
  1438   1438   
  1439   1439     if( pagerUseWal(pPager) ){
  1440         -    rc2 = sqlite3WalWriteLock(pPager->pWal, 0);
         1440  +    rc2 = sqlite3WalEndWriteTransaction(pPager->pWal);
  1441   1441       pPager->state = PAGER_SHARED;
  1442   1442   
  1443   1443       /* If the connection was in locking_mode=exclusive mode but is no longer,
  1444   1444       ** drop the EXCLUSIVE lock held on the database file.
  1445   1445       */
  1446   1446       if( rc2==SQLITE_OK 
  1447   1447        && !pPager->exclusiveMode 
................................................................................
  2358   2358         sqlite3BackupUpdate(pPager->pBackup, p->pgno, (u8 *)p->pData);
  2359   2359       }
  2360   2360     }
  2361   2361     return rc;
  2362   2362   }
  2363   2363   
  2364   2364   /*
  2365         -** Open a WAL snapshot on the log file this pager is connected to.
         2365  +** Begin a read transaction on the WAL.
         2366  +**
         2367  +** This routine used to be called "pagerOpenSnapshot()" because it essentially
         2368  +** makes a snapshot of the database at the current point in time and preserves
         2369  +** that snapshot for use by the reader in spite of concurrent changes by
         2370  +** other writers or checkpointers.
  2366   2371   */
  2367         -static int pagerOpenSnapshot(Pager *pPager){
         2372  +static int pagerBeginReadTransaction(Pager *pPager){
  2368   2373     int rc;                         /* Return code */
  2369   2374     int changed = 0;                /* True if cache must be reset */
  2370   2375   
  2371   2376     assert( pagerUseWal(pPager) );
  2372   2377   
  2373         -  rc = sqlite3WalOpenSnapshot(pPager->pWal, &changed);
         2378  +  rc = sqlite3WalBeginReadTransaction(pPager->pWal, &changed);
  2374   2379     if( rc==SQLITE_OK ){
  2375   2380       int dummy;
  2376   2381       if( changed ){
  2377   2382         pager_reset(pPager);
  2378   2383         assert( pPager->errCode || pPager->dbSizeValid==0 );
  2379   2384       }
  2380   2385       rc = sqlite3PagerPagecount(pPager, &dummy);
................................................................................
  2424   2429       int isWal;                    /* True if WAL file exists */
  2425   2430       rc = pagerHasWAL(pPager, &isWal);
  2426   2431       if( rc==SQLITE_OK ){
  2427   2432         if( isWal ){
  2428   2433           pager_reset(pPager);
  2429   2434           rc = sqlite3PagerOpenWal(pPager, 0);
  2430   2435           if( rc==SQLITE_OK ){
  2431         -          rc = pagerOpenSnapshot(pPager);
         2436  +          rc = pagerBeginReadTransaction(pPager);
  2432   2437           }
  2433   2438         }else if( pPager->journalMode==PAGER_JOURNALMODE_WAL ){
  2434   2439           pPager->journalMode = PAGER_JOURNALMODE_DELETE;
  2435   2440         }
  2436   2441       }
  2437   2442     }
  2438   2443     return rc;
................................................................................
  3998   4003         isErrorReset = 1;
  3999   4004       }
  4000   4005       pPager->errCode = SQLITE_OK;
  4001   4006       pager_reset(pPager);
  4002   4007     }
  4003   4008   
  4004   4009     if( pagerUseWal(pPager) ){
  4005         -    rc = pagerOpenSnapshot(pPager);
         4010  +    rc = pagerBeginReadTransaction(pPager);
  4006   4011     }else if( pPager->state==PAGER_UNLOCK || isErrorReset ){
  4007   4012       sqlite3_vfs * const pVfs = pPager->pVfs;
  4008   4013       int isHotJournal = 0;
  4009   4014       assert( !MEMDB );
  4010   4015       assert( sqlite3PcacheRefCount(pPager->pPCache)==0 );
  4011   4016       if( pPager->noReadlock ){
  4012   4017         assert( pPager->readOnly );
................................................................................
  4557   4562         **
  4558   4563         ** WAL mode sets Pager.state to PAGER_RESERVED when it has an open
  4559   4564         ** transaction, but never to PAGER_EXCLUSIVE. This is because in 
  4560   4565         ** PAGER_EXCLUSIVE state the code to roll back savepoint transactions
  4561   4566         ** may copy data from the sub-journal into the database file as well
  4562   4567         ** as into the page cache. Which would be incorrect in WAL mode.
  4563   4568         */
  4564         -      rc = sqlite3WalWriteLock(pPager->pWal, 1);
         4569  +      rc = sqlite3WalBeginWriteTransaction(pPager->pWal);
  4565   4570         if( rc==SQLITE_OK ){
  4566   4571           pPager->dbOrigSize = pPager->dbSize;
  4567   4572           pPager->state = PAGER_RESERVED;
  4568   4573           pPager->journalOff = 0;
  4569   4574         }
  4570   4575   
  4571   4576         assert( rc!=SQLITE_OK || pPager->state==PAGER_RESERVED );
................................................................................
  5888   5893   */
  5889   5894   int sqlite3PagerCheckpoint(Pager *pPager){
  5890   5895     int rc = SQLITE_OK;
  5891   5896     if( pPager->pWal ){
  5892   5897       u8 *zBuf = (u8 *)pPager->pTmpSpace;
  5893   5898       rc = sqlite3WalCheckpoint(pPager->pWal,
  5894   5899           (pPager->noSync ? 0 : pPager->sync_flags),
  5895         -        pPager->pageSize, zBuf, 
  5896         -        pPager->xBusyHandler, pPager->pBusyHandlerArg
         5900  +        pPager->pageSize, zBuf
  5897   5901       );
  5898   5902     }
  5899   5903     return rc;
  5900   5904   }
  5901   5905   
  5902   5906   int sqlite3PagerWalCallback(Pager *pPager){
  5903   5907     return sqlite3WalCallback(pPager->pWal);

Changes to src/sqlite.h.in.

   440    440   #define SQLITE_IOERR_BLOCKED           (SQLITE_IOERR | (11<<8))
   441    441   #define SQLITE_IOERR_NOMEM             (SQLITE_IOERR | (12<<8))
   442    442   #define SQLITE_IOERR_ACCESS            (SQLITE_IOERR | (13<<8))
   443    443   #define SQLITE_IOERR_CHECKRESERVEDLOCK (SQLITE_IOERR | (14<<8))
   444    444   #define SQLITE_IOERR_LOCK              (SQLITE_IOERR | (15<<8))
   445    445   #define SQLITE_IOERR_CLOSE             (SQLITE_IOERR | (16<<8))
   446    446   #define SQLITE_IOERR_DIR_CLOSE         (SQLITE_IOERR | (17<<8))
   447         -#define SQLITE_LOCKED_SHAREDCACHE      (SQLITE_LOCKED | (1<<8) )
          447  +#define SQLITE_LOCKED_SHAREDCACHE      (SQLITE_LOCKED |  (1<<8))
          448  +#define SQLITE_BUSY_RECOVERY           (SQLITE_BUSY   |  (1<<8))
   448    449   
   449    450   /*
   450    451   ** CAPI3REF: Flags For File Open Operations
   451    452   **
   452    453   ** These bit values are intended for use in the
   453    454   ** 3rd parameter to the [sqlite3_open_v2()] interface and
   454    455   ** in the 4th parameter to the xOpen method of the
................................................................................
   654    655     int (*xSectorSize)(sqlite3_file*);
   655    656     int (*xDeviceCharacteristics)(sqlite3_file*);
   656    657     /* Methods above are valid for version 1 */
   657    658     int (*xShmOpen)(sqlite3_file*);
   658    659     int (*xShmSize)(sqlite3_file*, int reqSize, int *pNewSize);
   659    660     int (*xShmGet)(sqlite3_file*, int reqSize, int *pSize, void volatile**);
   660    661     int (*xShmRelease)(sqlite3_file*);
   661         -  int (*xShmLock)(sqlite3_file*, int desiredLock, int *gotLock);
          662  +  int (*xShmLock)(sqlite3_file*, int offset, int n, int flags);
   662    663     void (*xShmBarrier)(sqlite3_file*);
   663    664     int (*xShmClose)(sqlite3_file*, int deleteFlag);
   664    665     /* Methods above are valid for version 2 */
   665    666     /* Additional methods may be added in future releases */
   666    667   };
   667    668   
   668    669   /*
................................................................................
   884    885   #define SQLITE_ACCESS_EXISTS    0
   885    886   #define SQLITE_ACCESS_READWRITE 1
   886    887   #define SQLITE_ACCESS_READ      2
   887    888   
   888    889   /*
   889    890   ** CAPI3REF: Flags for the xShmLock VFS method
   890    891   **
   891         -** These integer constants define the various locking states that
   892         -** an sqlite3_shm object can be in.
          892  +** These integer constants define the various locking operations
          893  +** allowed by the xShmLock method of [sqlite3_io_methods].  The
          894  +** following are the only legal combinations of flags to the
          895  +** xShmLock method:
          896  +**
          897  +** <ul>
          898  +** <li>  SQLITE_SHM_LOCK | SQLITE_SHM_SHARED
          899  +** <li>  SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE
          900  +** <li>  SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED
          901  +** <li>  SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE
          902  +** </ul>
          903  +**
          904  +** When unlocking, the same SHARED or EXCLUSIVE flag must be supplied as
          905  +** was given on the corresponding lock.  
          906  +**
          907  +** The xShmLock method can transition between unlocked and SHARED or
          908  +** between unlocked and EXCLUSIVE.  It cannot transition between SHARED
          909  +** and EXCLUSIVE.
   893    910   */
   894         -#define SQLITE_SHM_UNLOCK       0
   895         -#define SQLITE_SHM_READ         1
   896         -#define SQLITE_SHM_READ_FULL    2
   897         -#define SQLITE_SHM_WRITE        3
   898         -#define SQLITE_SHM_PENDING      4
   899         -#define SQLITE_SHM_CHECKPOINT   5
   900         -#define SQLITE_SHM_RECOVER      6
          911  +#define SQLITE_SHM_UNLOCK       1
          912  +#define SQLITE_SHM_LOCK         2
          913  +#define SQLITE_SHM_SHARED       4
          914  +#define SQLITE_SHM_EXCLUSIVE    8
          915  +
          916  +/*
          917  +** CAPI3REF: Maximum xShmLock index
          918  +**
          919  +** The xShmLock method on [sqlite3_io_methods] may use values
          920  +** between 0 and this upper bound as its "offset" argument.
          921  +** The SQLite core will never attempt to acquire or release a
          922  +** lock outside of this range.
          923  +*/
          924  +#define SQLITE_SHM_NLOCK        8
          925  +
   901    926   
   902    927   /*
   903    928   ** CAPI3REF: Initialize The SQLite Library
   904    929   **
   905    930   ** ^The sqlite3_initialize() routine initializes the
   906    931   ** SQLite library.  ^The sqlite3_shutdown() routine
   907    932   ** deallocates any resources that were allocated by sqlite3_initialize().

Changes to src/test6.c.

   536    536     void volatile **pp
   537    537   ){
   538    538     return sqlite3OsShmGet(((CrashFile*)pFile)->pRealFile, reqSize, pSize, pp);
   539    539   }
   540    540   static int cfShmRelease(sqlite3_file *pFile){
   541    541     return sqlite3OsShmRelease(((CrashFile*)pFile)->pRealFile);
   542    542   }
   543         -static int cfShmLock(sqlite3_file *pFile, int desired, int *pGot){
   544         -  return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, desired, pGot);
          543  +static int cfShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
          544  +  return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, ofst, n, flags);
   545    545   }
   546    546   static void cfShmBarrier(sqlite3_file *pFile){
   547    547     sqlite3OsShmBarrier(((CrashFile*)pFile)->pRealFile);
   548    548   }
   549    549   static int cfShmClose(sqlite3_file *pFile, int delFlag){
   550    550     return sqlite3OsShmClose(((CrashFile*)pFile)->pRealFile, delFlag);
   551    551   }

Changes to src/test_devsym.c.

    50     50   static int devsymFileControl(sqlite3_file*, int op, void *pArg);
    51     51   static int devsymSectorSize(sqlite3_file*);
    52     52   static int devsymDeviceCharacteristics(sqlite3_file*);
    53     53   static int devsymShmOpen(sqlite3_file*);
    54     54   static int devsymShmSize(sqlite3_file*,int,int*);
    55     55   static int devsymShmGet(sqlite3_file*,int,int*,volatile void**);
    56     56   static int devsymShmRelease(sqlite3_file*);
    57         -static int devsymShmLock(sqlite3_file*,int,int*);
           57  +static int devsymShmLock(sqlite3_file*,int,int,int);
    58     58   static void devsymShmBarrier(sqlite3_file*);
    59     59   static int devsymShmClose(sqlite3_file*,int);
    60     60   
    61     61   /*
    62     62   ** Method declarations for devsym_vfs.
    63     63   */
    64     64   static int devsymOpen(sqlite3_vfs*, const char *, sqlite3_file*, int , int *);
................................................................................
   259    259     devsym_file *p = (devsym_file *)pFile;
   260    260     return sqlite3OsShmGet(p->pReal, reqSz, pSize, pp);
   261    261   }
   262    262   static int devsymShmRelease(sqlite3_file *pFile){
   263    263     devsym_file *p = (devsym_file *)pFile;
   264    264     return sqlite3OsShmRelease(p->pReal);
   265    265   }
   266         -static int devsymShmLock(sqlite3_file *pFile, int desired, int *pGot){
          266  +static int devsymShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   267    267     devsym_file *p = (devsym_file *)pFile;
   268         -  return sqlite3OsShmLock(p->pReal, desired, pGot);
          268  +  return sqlite3OsShmLock(p->pReal, ofst, n, flags);
   269    269   }
   270    270   static void devsymShmBarrier(sqlite3_file *pFile){
   271    271     devsym_file *p = (devsym_file *)pFile;
   272    272     sqlite3OsShmBarrier(p->pReal);
   273    273   }
   274    274   static int devsymShmClose(sqlite3_file *pFile, int delFlag){
   275    275     devsym_file *p = (devsym_file *)pFile;

Changes to src/test_osinst.c.

   151    151   static int vfslogSectorSize(sqlite3_file*);
   152    152   static int vfslogDeviceCharacteristics(sqlite3_file*);
   153    153   
   154    154   static int vfslogShmOpen(sqlite3_file *pFile);
   155    155   static int vfslogShmSize(sqlite3_file *pFile, int reqSize, int *pNewSize);
   156    156   static int vfslogShmGet(sqlite3_file *pFile, int,int*,volatile void **);
   157    157   static int vfslogShmRelease(sqlite3_file *pFile);
   158         -static int vfslogShmLock(sqlite3_file *pFile, int desiredLock, int *gotLock);
          158  +static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags);
   159    159   static void vfslogShmBarrier(sqlite3_file*);
   160    160   static int vfslogShmClose(sqlite3_file *pFile, int deleteFlag);
   161    161   
   162    162   /*
   163    163   ** Method declarations for vfslog_vfs.
   164    164   */
   165    165   static int vfslogOpen(sqlite3_vfs*, const char *, sqlite3_file*, int , int *);
................................................................................
   456    456     VfslogFile *p = (VfslogFile *)pFile;
   457    457     t = vfslog_time();
   458    458     rc = p->pReal->pMethods->xShmRelease(p->pReal);
   459    459     t = vfslog_time() - t;
   460    460     vfslog_call(p->pVfslog, OS_SHMRELEASE, p->iFileId, t, rc, 0, 0);
   461    461     return rc;
   462    462   }
   463         -static int vfslogShmLock(sqlite3_file *pFile, int desiredLock, int *gotLock){
          463  +static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   464    464     int rc;
   465    465     sqlite3_uint64 t;
   466    466     VfslogFile *p = (VfslogFile *)pFile;
   467    467     t = vfslog_time();
   468         -  rc = p->pReal->pMethods->xShmLock(p->pReal, desiredLock, gotLock);
          468  +  rc = p->pReal->pMethods->xShmLock(p->pReal, ofst, n, flags);
   469    469     t = vfslog_time() - t;
   470    470     vfslog_call(p->pVfslog, OS_SHMLOCK, p->iFileId, t, rc, 0, 0);
   471    471     return rc;
   472    472   }
   473    473   static void vfslogShmBarrier(sqlite3_file *pFile){
   474    474     sqlite3_uint64 t;
   475    475     VfslogFile *p = (VfslogFile *)pFile;

Changes to src/test_vfs.c.

    98     98   static int tvfsSleep(sqlite3_vfs*, int microseconds);
    99     99   static int tvfsCurrentTime(sqlite3_vfs*, double*);
   100    100   
   101    101   static int tvfsShmOpen(sqlite3_file*);
   102    102   static int tvfsShmSize(sqlite3_file*, int , int *);
   103    103   static int tvfsShmGet(sqlite3_file*, int , int *, volatile void **);
   104    104   static int tvfsShmRelease(sqlite3_file*);
   105         -static int tvfsShmLock(sqlite3_file*, int , int *);
          105  +static int tvfsShmLock(sqlite3_file*, int , int, int);
   106    106   static void tvfsShmBarrier(sqlite3_file*);
   107    107   static int tvfsShmClose(sqlite3_file*, int);
   108    108   
   109    109   static sqlite3_io_methods tvfs_io_methods = {
   110    110     2,                            /* iVersion */
   111    111     tvfsClose,                      /* xClose */
   112    112     tvfsRead,                       /* xRead */
................................................................................
   540    540     tvfsResultCode(p, &rc);
   541    541   
   542    542     return rc;
   543    543   }
   544    544   
   545    545   static int tvfsShmLock(
   546    546     sqlite3_file *pFile,
   547         -  int desiredLock,
   548         -  int *gotLock
          547  +  int ofst,
          548  +  int n,
          549  +  int flags
   549    550   ){
   550    551     int rc = SQLITE_OK;
   551    552     TestvfsFile *pFd = (TestvfsFile *)pFile;
   552    553     Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData);
   553         -  char *zLock = "";
          554  +  int nLock;
          555  +  char zLock[80];
   554    556   
   555         -  switch( desiredLock ){
   556         -    case SQLITE_SHM_READ:         zLock = "READ";       break;
   557         -    case SQLITE_SHM_WRITE:        zLock = "WRITE";      break;
   558         -    case SQLITE_SHM_CHECKPOINT:   zLock = "CHECKPOINT"; break;
   559         -    case SQLITE_SHM_RECOVER:      zLock = "RECOVER";    break;
   560         -    case SQLITE_SHM_PENDING:      zLock = "PENDING";    break;
   561         -    case SQLITE_SHM_UNLOCK:       zLock = "UNLOCK";     break;
          557  +  sqlite3_snprintf(sizeof(zLock), zLock, "%d %d", ofst, n);
          558  +  nLock = strlen(zLock);
          559  +  if( flags & SQLITE_SHM_LOCK ){
          560  +    strcpy(&zLock[nLock], " lock");
          561  +  }else{
          562  +    strcpy(&zLock[nLock], " unlock");
          563  +  }
          564  +  nLock += strlen(&zLock[nLock]);
          565  +  if( flags & SQLITE_SHM_SHARED ){
          566  +    strcpy(&zLock[nLock], " shared");
          567  +  }else{
          568  +    strcpy(&zLock[nLock], " exclusive");
   562    569     }
   563    570     tvfsExecTcl(p, "xShmLock", 
   564    571         Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId,
   565    572         Tcl_NewStringObj(zLock, -1)
   566    573     );
   567    574     tvfsResultCode(p, &rc);
   568         -  if( rc==SQLITE_OK ){
   569         -    *gotLock = desiredLock;
   570         -  }
   571         -
   572    575     return rc;
   573    576   }
   574    577   
   575    578   static void tvfsShmBarrier(sqlite3_file *pFile){
   576    579     int rc = SQLITE_OK;
   577    580     TestvfsFile *pFd = (TestvfsFile *)pFile;
   578    581     Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData);
................................................................................
   712    715   **   VFSNAME shm FILENAME ?NEWVALUE?
   713    716   **
   714    717   ** When the xShmLock method is invoked by SQLite, the following script is
   715    718   ** run:
   716    719   **
   717    720   **   SCRIPT xShmLock    FILENAME ID LOCK
   718    721   **
   719         -** where LOCK is one of "UNLOCK", "READ", "READ_FULL", "WRITE", "PENDING",
   720         -** "CHECKPOINT" or "RECOVER". The script should return an SQLite error
   721         -** code.
          722  +** where LOCK is of the form "OFFSET NBYTE lock/unlock shared/exclusive"
   722    723   */
   723    724   static int testvfs_cmd(
   724    725     ClientData cd,
   725    726     Tcl_Interp *interp,
   726    727     int objc,
   727    728     Tcl_Obj *CONST objv[]
   728    729   ){

Changes to src/wal.c.

    89     89   ** being considered valid at the same time and being checkpointing together
    90     90   ** following a crash.
    91     91   **
    92     92   ** READER ALGORITHM
    93     93   **
    94     94   ** To read a page from the database (call it page number P), a reader
    95     95   ** first checks the WAL to see if it contains page P.  If so, then the
    96         -** last valid instance of page P that is or is followed by a commit frame
    97         -** become the value read.  If the WAL contains no copies of page P that
    98         -** are valid and which are or are followed by a commit frame, then page
    99         -** P is read from the database file.
           96  +** last valid instance of page P that is followed by a commit frame
           97  +** or is a commit frame itself becomes the value read.  If the WAL
           98  +** contains no copies of page P that are valid and which are a commit
           99  +** frame or are followed by a commit frame, then page P is read from
          100  +** the database file.
   100    101   **
   101         -** The reader algorithm in the previous paragraph works correctly, but 
          102  +** To start a read transaction, the reader records the index of the last
          103  +** valid frame in the WAL.  The reader uses this recorded "mxFrame" value
          104  +** for all subsequent read operations.  New transactions can be appended
          105  +** to the WAL, but as long as the reader uses its original mxFrame value
          106  +** and ignores the newly appended content, it will see a consistent snapshot
          107  +** of the database from a single point in time.  This technique allows
          108  +** multiple concurrent readers to view different versions of the database
          109  +** content simultaneously.
          110  +**
          111  +** The reader algorithm in the previous paragraphs works correctly, but 
   102    112   ** because frames for page P can appear anywhere within the WAL, the
   103    113   ** reader has to scan the entire WAL looking for page P frames.  If the
   104    114   ** WAL is large (multiple megabytes is typical) that scan can be slow,
   105    115   ** and read performance suffers.  To overcome this problem, a separate
   106    116   ** data structure called the wal-index is maintained to expedite the
   107    117   ** search for frames of a particular page.
   108    118   ** 
................................................................................
   157    167   ** table is never more than half full.  The expected number of collisions 
   158    168   ** prior to finding a match is 1.  Each entry of the hash table is an
   159    169   ** 1-based index of an entry in the mapping section of the same
   160    170   ** index block.   Let K be the 1-based index of the largest entry in
   161    171   ** the mapping section.  (For index blocks other than the last, K will
   162    172   ** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
   163    173   ** K will be (mxFrame%HASHTABLE_NPAGE).)  Unused slots of the hash table
   164         -** contain a value greater than K.  Note that no hash table slot ever
   165         -** contains a zero value.
          174  +** contain a value of 0.
   166    175   **
   167    176   ** To look for page P in the hash table, first compute a hash iKey on
   168    177   ** P as follows:
   169    178   **
   170    179   **      iKey = (P * 383) % HASHTABLE_NSLOT
   171    180   **
   172    181   ** Then start scanning entries of the hash table, starting with iKey
................................................................................
   210    219   ** that correspond to frames greater than the new K value are removed
   211    220   ** from the hash table at this point.
   212    221   */
   213    222   #ifndef SQLITE_OMIT_WAL
   214    223   
   215    224   #include "wal.h"
   216    225   
          226  +/*
          227  +** Indices of various locking bytes.   WAL_NREADER is the number
          228  +** of available reader locks and should be at least 3.
          229  +*/
          230  +#define WAL_WRITE_LOCK         0
          231  +#define WAL_ALL_BUT_WRITE      1
          232  +#define WAL_CKPT_LOCK          1
          233  +#define WAL_RECOVER_LOCK       2
          234  +#define WAL_READ_LOCK(I)       (3+(I))
          235  +#define WAL_NREADER            (SQLITE_SHM_NLOCK-3)
          236  +
   217    237   
   218    238   /* Object declarations */
   219    239   typedef struct WalIndexHdr WalIndexHdr;
   220    240   typedef struct WalIterator WalIterator;
          241  +typedef struct WalCkptInfo WalCkptInfo;
   221    242   
   222    243   
   223    244   /*
   224    245   ** The following object holds a copy of the wal-index header content.
   225    246   **
   226    247   ** The actual header in the wal-index consists of two copies of this
   227    248   ** object.
................................................................................
   232    253     u16 szPage;                     /* Database page size in bytes */
   233    254     u32 mxFrame;                    /* Index of last valid frame in the WAL */
   234    255     u32 nPage;                      /* Size of database in pages */
   235    256     u32 aFrameCksum[2];             /* Checksum of last frame in log */
   236    257     u32 aSalt[2];                   /* Two salt values copied from WAL header */
   237    258     u32 aCksum[2];                  /* Checksum over all prior fields */
   238    259   };
          260  +
          261  +/*
          262  +** A copy of the following object occurs in the wal-index immediately
          263  +** following the second copy of the WalIndexHdr.  This object stores
          264  +** information used by checkpoint.
          265  +**
          266  +** nBackfill is the number of frames in the WAL that have been written
          267  +** back into the database. (We call the act of moving content from WAL to
          268  +** database "backfilling".)  The nBackfill number is never greater than
          269  +** WalIndexHdr.mxFrame.  nBackfill can only be increased by threads
          270  +** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
          271  +** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
          272  +** mxFrame back to zero when the WAL is reset.
          273  +**
          274  +** There is one entry in aReadMark[] for each reader lock.  If a reader
          275  +** holds read-lock K, then the value in aReadMark[K] is no greater than
          276  +** the mxFrame for that reader.  aReadMark[0] is a special case.  It
          277  +** always holds zero.  Readers holding WAL_READ_LOCK(0) always ignore 
          278  +** the entire WAL and read all content directly from the database.
          279  +**
          280  +** The value of aReadMark[K] may only be changed by a thread that
          281  +** is holding an exclusive lock on WAL_READ_LOCK(K).  Thus, the value of
          282  +** aReadMark[K] cannot be changed while a reader is using that mark
          283  +** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
          284  +**
          285  +** The checkpointer may only transfer frames from WAL to database where
          286  +** the frame numbers are less than or equal to every aReadMark[] that is
          287  +** in use (that is, every aReadMark[j] for which there is a corresponding
          288  +** WAL_READ_LOCK(j)).  New readers (usually) pick the aReadMark[] with the
          289  +** largest value and will increase an unused aReadMark[] to mxFrame if there
          290  +** is not already an aReadMark[] equal to mxFrame.  The exception to the
          291  +** previous sentence is when nBackfill equals mxFrame (meaning that everything
          292  +** in the WAL has been backfilled into the database) then new readers
          293  +** will choose aReadMark[0] which has value 0 and hence such readers will
          294  +** get all their content directly from the database file and ignore 
          295  +** the WAL.
          296  +**
          297  +** Writers normally append new frames to the end of the WAL.  However,
          298  +** if nBackfill equals mxFrame (meaning that all WAL content has been
          299  +** written back into the database) and if no readers are using the WAL
          300  +** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
          301  +** the writer will first "reset" the WAL back to the beginning and start
          302  +** writing new content beginning at frame 1.
          303  +**
          304  +** We assume that 32-bit loads are atomic and so no locks are needed in
          305  +** order to read from any aReadMark[] entries.
          306  +*/
          307  +struct WalCkptInfo {
          308  +  u32 nBackfill;                  /* Number of WAL frames backfilled into DB */
          309  +  u32 aReadMark[WAL_NREADER];     /* Reader marks */
          310  +};
          311  +
   239    312   
   240    313   /* A block of WALINDEX_LOCK_RESERVED bytes beginning at
   241    314   ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
   242    315   ** only support mandatory file-locks, we do not read or write data
   243    316   ** from the region of the file on which locks are applied.
   244    317   */
   245         -#define WALINDEX_LOCK_OFFSET   (sizeof(WalIndexHdr)*2)
   246         -#define WALINDEX_LOCK_RESERVED 8
          318  +#define WALINDEX_LOCK_OFFSET   (sizeof(WalIndexHdr)*2 + sizeof(WalCkptInfo))
          319  +#define WALINDEX_LOCK_RESERVED 16
   247    320   #define WALINDEX_HDR_SIZE      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)
   248    321   
   249    322   /* Size of header before each frame in wal */
   250    323   #define WAL_FRAME_HDRSIZE 24
   251    324   
   252    325   /* Size of write ahead log header */
   253    326   #define WAL_HDRSIZE 24
................................................................................
   273    346   )
   274    347   
   275    348   /*
   276    349   ** An open write-ahead log file is represented by an instance of the
   277    350   ** following object.
   278    351   */
   279    352   struct Wal {
   280         -  sqlite3_vfs *pVfs;         /* The VFS used to create pFd */
          353  +  sqlite3_vfs *pVfs;         /* The VFS used to create pDbFd */
   281    354     sqlite3_file *pDbFd;       /* File handle for the database file */
   282    355     sqlite3_file *pWalFd;      /* File handle for WAL file */
   283    356     u32 iCallback;             /* Value to pass to log callback (or 0) */
   284    357     int szWIndex;              /* Size of the wal-index that is mapped in mem */
   285    358     volatile u32 *pWiData;     /* Pointer to wal-index content in memory */
   286         -  u8 lockState;              /* SQLITE_SHM_xxxx constant showing lock state */
   287         -  u8 readerType;             /* SQLITE_SHM_READ or SQLITE_SHM_READ_FULL */
          359  +  u16 szPage;                /* Database page size */
          360  +  i16 readLock;              /* Which read lock is being held.  -1 for none */
   288    361     u8 exclusiveMode;          /* Non-zero if connection is in exclusive mode */
   289         -  u8 isWindexOpen;           /* True if ShmOpen() called on pDbFd */
   290         -  WalIndexHdr hdr;           /* Wal-index for current snapshot */
          362  +  u8 isWIndexOpen;           /* True if ShmOpen() called on pDbFd */
          363  +  u8 writeLock;              /* True if in a write transaction */
          364  +  u8 ckptLock;               /* True if holding a checkpoint lock */
          365  +  WalIndexHdr hdr;           /* Wal-index header for current transaction */
   291    366     char *zWalName;            /* Name of WAL file */
   292         -  int szPage;                /* Database page size */
   293    367     u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
   294    368   };
          369  +
          370  +/*
          371  +** Return a pointer to the WalCkptInfo structure in the wal-index.
          372  +*/
          373  +static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
          374  +  assert( pWal->pWiData!=0 );
          375  +  return (volatile WalCkptInfo*)&pWal->pWiData[sizeof(WalIndexHdr)/2];
          376  +}
   295    377   
   296    378   
   297    379   /*
   298    380   ** This structure is used to implement an iterator that loops through
   299    381   ** all frames in the WAL in database page order. Where two or more frames
   300    382   ** correspond to the same database page, the iterator visits only the 
   301    383   ** frame most recently written to the WAL (in other words, the frame with
................................................................................
   375    457       }while( aData<aEnd );
   376    458     }
   377    459   
   378    460     aOut[0] = s1;
   379    461     aOut[1] = s2;
   380    462   }
   381    463   
   382         -/*
   383         -** Attempt to change the lock status.
   384         -**
   385         -** When changing the lock status to SQLITE_SHM_READ, store the
   386         -** type of reader lock (either SQLITE_SHM_READ or SQLITE_SHM_READ_FULL)
   387         -** in pWal->readerType.
   388         -*/
   389         -static int walSetLock(Wal *pWal, int desiredStatus){
   390         -  int rc = SQLITE_OK;             /* Return code */
   391         -  if( pWal->exclusiveMode || pWal->lockState==desiredStatus ){
   392         -    pWal->lockState = desiredStatus;
   393         -  }else{
   394         -    int got = pWal->lockState;
   395         -    rc = sqlite3OsShmLock(pWal->pDbFd, desiredStatus, &got);
   396         -    pWal->lockState = got;
   397         -    if( got==SQLITE_SHM_READ_FULL || got==SQLITE_SHM_READ ){
   398         -      pWal->readerType = got;
   399         -      pWal->lockState = SQLITE_SHM_READ;
   400         -    }
   401         -  }
   402         -  return rc;
   403         -}
   404         -
   405    464   /*
   406    465   ** Write the header information in pWal->hdr into the wal-index.
   407    466   **
   408    467   ** The checksum on pWal->hdr is updated before it is written.
   409    468   */
   410    469   static void walIndexWriteHdr(Wal *pWal){
   411    470     WalIndexHdr *aHdr;
   412         -  walChecksumBytes(1, (u8*)&pWal->hdr,
   413         -                   sizeof(pWal->hdr) - sizeof(pWal->hdr.aCksum),
          471  +
          472  +  assert( pWal->writeLock );
          473  +  walChecksumBytes(1, (u8*)&pWal->hdr, offsetof(WalIndexHdr, aCksum),
   414    474                      0, pWal->hdr.aCksum);
   415    475     aHdr = (WalIndexHdr*)pWal->pWiData;
   416         -  memcpy(&aHdr[1], &pWal->hdr, sizeof(pWal->hdr));
          476  +  memcpy(&aHdr[1], &pWal->hdr, sizeof(WalIndexHdr));
   417    477     sqlite3OsShmBarrier(pWal->pDbFd);
   418         -  memcpy(&aHdr[0], &pWal->hdr, sizeof(pWal->hdr));
          478  +  memcpy(&aHdr[0], &pWal->hdr, sizeof(WalIndexHdr));
   419    479   }
   420    480   
   421    481   /*
   422    482   ** This function encodes a single frame header and writes it to a buffer
   423    483   ** supplied by the caller. A frame-header is made up of a series of 
   424    484   ** 4-byte big-endian integers, as follows:
   425    485   **
................................................................................
   515    575   ** create incompatibilities.
   516    576   */
   517    577   #define HASHTABLE_NPAGE      4096  /* Must be power of 2 and multiple of 256 */
   518    578   #define HASHTABLE_DATATYPE   u16
   519    579   #define HASHTABLE_HASH_1     383                  /* Should be prime */
   520    580   #define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
   521    581   #define HASHTABLE_NBYTE      (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)
          582  +
          583  +/*
          584  +** Set or release locks.
          585  +**
          586  +** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
          587  +*/
          588  +static int walLockShared(Wal *pWal, int lockIdx){
          589  +  if( pWal->exclusiveMode ) return SQLITE_OK;
          590  +  return sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
          591  +                          SQLITE_SHM_LOCK | SQLITE_SHM_SHARED);
          592  +}
          593  +static void walUnlockShared(Wal *pWal, int lockIdx){
          594  +  if( pWal->exclusiveMode ) return;
          595  +  (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
          596  +                         SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED);
          597  +}
          598  +static int walLockExclusive(Wal *pWal, int lockIdx, int n){
          599  +  if( pWal->exclusiveMode ) return SQLITE_OK;
          600  +  return sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
          601  +                          SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);
          602  +}
          603  +static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
          604  +  if( pWal->exclusiveMode ) return;
          605  +  (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
          606  +                         SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
          607  +}
   522    608   
   523    609   /*
   524    610   ** Return the index in the Wal.pWiData array that corresponds to 
   525    611   ** frame iFrame.
   526    612   **
   527    613   ** Wal.pWiData is an array of u32 elements that is the wal-index.
   528    614   ** The array begins with a header and is then followed by alternating
................................................................................
   596    682   **
   597    683   ** If enlargeTo is non-negative, then increase the size of the underlying
   598    684   ** storage to be at least as big as enlargeTo before remapping.
   599    685   */
   600    686   static int walIndexRemap(Wal *pWal, int enlargeTo){
   601    687     int rc;
   602    688     int sz;
   603         -  assert( pWal->lockState>=SQLITE_SHM_WRITE );
          689  +  assert( pWal->writeLock );
   604    690     rc = sqlite3OsShmSize(pWal->pDbFd, enlargeTo, &sz);
   605    691     if( rc==SQLITE_OK && sz>pWal->szWIndex ){
   606    692       walIndexUnmap(pWal);
   607    693       rc = walIndexMap(pWal, sz);
   608    694     }
   609    695     assert( pWal->szWIndex>=enlargeTo || rc!=SQLITE_OK );
   610    696     return rc;
................................................................................
   681    767   */
   682    768   static void walCleanupHash(Wal *pWal){
   683    769     volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table to clear */
   684    770     volatile u32 *aPgno;                 /* Unused return from walHashFind() */
   685    771     u32 iZero;                           /* frame == (aHash[x]+iZero) */
   686    772     int iLimit;                          /* Zero values greater than this */
   687    773   
   688         -  assert( pWal->lockState==SQLITE_SHM_WRITE );
          774  +  assert( pWal->writeLock );
   689    775     walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero);
   690    776     iLimit = pWal->hdr.mxFrame - iZero;
   691    777     if( iLimit>0 ){
   692    778       int nByte;                    /* Number of bytes to zero in aPgno[] */
   693    779       int i;                        /* Used to iterate through aHash[] */
   694    780       for(i=0; i<HASHTABLE_NSLOT; i++){
   695    781         if( aHash[i]>iLimit ){
................................................................................
   806    892   
   807    893     return rc;
   808    894   }
   809    895   
   810    896   
   811    897   /*
   812    898   ** Recover the wal-index by reading the write-ahead log file. 
   813         -** The caller must hold RECOVER lock on the wal-index file.
          899  +**
          900  +** This routine first tries to establish an exclusive lock on the
          901  +** wal-index to prevent other threads/processes from doing anything
          902  +** with the WAL or wal-index while recovery is running.  The
          903  +** WAL_RECOVER_LOCK is also held so that other threads will know
          904  +** that this thread is running recovery.  If unable to establish
          905  +** the necessary locks, this routine returns SQLITE_BUSY.
   814    906   */
   815    907   static int walIndexRecover(Wal *pWal){
   816    908     int rc;                         /* Return Code */
   817    909     i64 nSize;                      /* Size of log file */
   818    910     u32 aFrameCksum[2] = {0, 0};
   819    911   
   820         -  assert( pWal->lockState>SQLITE_SHM_READ );
          912  +  rc = walLockExclusive(pWal, WAL_ALL_BUT_WRITE, SQLITE_SHM_NLOCK-1);
          913  +  if( rc ){
          914  +    return rc;
          915  +  }
          916  +
   821    917     memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
   822    918   
   823    919     rc = sqlite3OsFileSize(pWal->pWalFd, &nSize);
   824    920     if( rc!=SQLITE_OK ){
   825         -    return rc;
          921  +    goto recovery_error;
   826    922     }
   827    923   
   828    924     if( nSize>WAL_HDRSIZE ){
   829    925       u8 aBuf[WAL_HDRSIZE];         /* Buffer to load WAL header into */
   830    926       u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
   831    927       int szFrame;                  /* Number of bytes in buffer aFrame[] */
   832    928       u8 *aData;                    /* Pointer to data part of aFrame buffer */
................................................................................
   834    930       i64 iOffset;                  /* Next offset to read from log file */
   835    931       int szPage;                   /* Page size according to the log */
   836    932       u32 magic;                    /* Magic value read from WAL header */
   837    933   
   838    934       /* Read in the WAL header. */
   839    935       rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
   840    936       if( rc!=SQLITE_OK ){
   841         -      return rc;
          937  +      goto recovery_error;
   842    938       }
   843    939   
   844    940       /* If the database page size is not a power of two, or is greater than
   845    941       ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid 
   846    942       ** data. Similarly, if the 'magic' value is invalid, ignore the whole
   847    943       ** WAL file.
   848    944       */
................................................................................
   863    959           aBuf, WAL_HDRSIZE, 0, pWal->hdr.aFrameCksum
   864    960       );
   865    961   
   866    962       /* Malloc a buffer to read frames into. */
   867    963       szFrame = szPage + WAL_FRAME_HDRSIZE;
   868    964       aFrame = (u8 *)sqlite3_malloc(szFrame);
   869    965       if( !aFrame ){
   870         -      return SQLITE_NOMEM;
          966  +      rc = SQLITE_NOMEM;
          967  +      goto recovery_error;
   871    968       }
   872    969       aData = &aFrame[WAL_FRAME_HDRSIZE];
   873    970   
   874    971       /* Read all frames from the log file. */
   875    972       iFrame = 0;
   876    973       for(iOffset=WAL_HDRSIZE; (iOffset+szFrame)<=nSize; iOffset+=szFrame){
   877    974         u32 pgno;                   /* Database page number for frame */
................................................................................
   904   1001       rc = walIndexRemap(pWal, walMappingSize(1));
   905   1002     }
   906   1003     if( rc==SQLITE_OK ){
   907   1004       pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
   908   1005       pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
   909   1006       walIndexWriteHdr(pWal);
   910   1007     }
         1008  +
         1009  +recovery_error:
         1010  +  walUnlockExclusive(pWal, WAL_ALL_BUT_WRITE, SQLITE_SHM_NLOCK-1);
   911   1011     return rc;
   912   1012   }
   913   1013   
   914   1014   /*
   915   1015   ** Close an open wal-index.
   916   1016   */
   917   1017   static void walIndexClose(Wal *pWal, int isDelete){
   918         -  if( pWal->isWindexOpen ){
   919         -    int notUsed;
   920         -    sqlite3OsShmLock(pWal->pDbFd, SQLITE_SHM_UNLOCK, &notUsed);
         1018  +  if( pWal->isWIndexOpen ){
   921   1019       sqlite3OsShmClose(pWal->pDbFd, isDelete);
   922         -    pWal->isWindexOpen = 0;
         1020  +    pWal->isWIndexOpen = 0;
   923   1021     }
   924   1022   }
   925   1023   
   926   1024   /* 
   927   1025   ** Open a connection to the log file associated with database zDb. The
   928   1026   ** database file does not actually have to exist. zDb is used only to
   929   1027   ** figure out the name of the log file to open. If the log file does not 
................................................................................
   974   1072       return SQLITE_NOMEM;
   975   1073     }
   976   1074   
   977   1075     pRet->pVfs = pVfs;
   978   1076     pRet->pWalFd = (sqlite3_file *)&pRet[1];
   979   1077     pRet->pDbFd = pDbFd;
   980   1078     pRet->szWIndex = -1;
         1079  +  pRet->readLock = -1;
   981   1080     sqlite3_randomness(8, &pRet->hdr.aSalt);
   982   1081     pRet->zWalName = zWal = pVfs->szOsFile + (char*)pRet->pWalFd;
   983   1082     sqlite3_snprintf(nWal, zWal, "%s-wal", zDbName);
   984   1083     rc = sqlite3OsShmOpen(pDbFd);
   985   1084   
   986   1085     /* Open file handle on the write-ahead log file. */
   987   1086     if( rc==SQLITE_OK ){
   988         -    pRet->isWindexOpen = 1;
         1087  +    pRet->isWIndexOpen = 1;
   989   1088       flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_MAIN_JOURNAL);
   990   1089       rc = sqlite3OsOpen(pVfs, zWal, pRet->pWalFd, flags, &flags);
   991   1090     }
   992   1091   
   993   1092     if( rc!=SQLITE_OK ){
   994   1093       walIndexClose(pRet, 0);
   995   1094       sqlite3OsClose(pRet->pWalFd);
................................................................................
  1129   1228     }
  1130   1229   
  1131   1230     /* This routine only runs while holding SQLITE_SHM_CHECKPOINT.  No other
  1132   1231     ** thread is able to write to shared memory while this routine is
  1133   1232     ** running (or, indeed, while the WalIterator object exists).  Hence,
  1134   1233     ** we can cast off the volatile qualification from shared memory
  1135   1234     */
  1136         -  assert( pWal->lockState==SQLITE_SHM_CHECKPOINT );
         1235  +  assert( pWal->ckptLock );
  1137   1236     aData = (u32*)pWal->pWiData;
  1138   1237   
  1139   1238     /* Allocate space for the WalIterator object */
  1140   1239     iLast = pWal->hdr.mxFrame;
  1141   1240     nSegment = (iLast >> 8) + 1;
  1142   1241     nFinal = (iLast & 0x000000FF);
  1143   1242     nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
................................................................................
  1175   1274   
  1176   1275   /* 
  1177   1276   ** Free an iterator allocated by walIteratorInit().
  1178   1277   */
  1179   1278   static void walIteratorFree(WalIterator *p){
  1180   1279     sqlite3_free(p);
  1181   1280   }
         1281  +
  1182   1282   
  1183   1283   /*
  1184         -** Checkpoint the contents of the log file.
         1284  +** Copy as much content as we can from the WAL back into the database file
         1285  +** in response to an sqlite3_wal_checkpoint() request or the equivalent.
         1286  +**
         1287  +** The amount of information copied from WAL to database might be limited
         1288  +** by active readers.  This routine will never overwrite a database page
         1289  +** that a concurrent reader might be using.
         1290  +**
         1291  +** All I/O barrier operations (a.k.a fsyncs) occur in this routine when
         1292  +** SQLite is in WAL-mode in synchronous=NORMAL.  That means that if 
         1293  +** checkpoints are always run by a background thread or background 
         1294  +** process, foreground threads will never block on a lengthy fsync call.
         1295  +**
         1296  +** Fsync is called on the WAL before writing content out of the WAL and
         1297  +** into the database.  This ensures that the new content is persistent
         1298  +** in the WAL and can be recovered following a power-loss or hard reset.
         1299  +**
         1300  +** Fsync is also called on the database file if (and only if) the entire
         1301  +** WAL content is copied into the database file.  This second fsync makes
         1302  +** it safe to delete the WAL since the new content will persist in the
         1303  +** database file.
         1304  +**
         1305  +** This routine uses and updates the nBackfill field of the wal-index header.
         1306  +** This is the only routine that will increase the value of nBackfill.  
         1307  +** (A WAL reset or recovery will revert nBackfill to zero, but not increase
         1308  +** its value.)
         1309  +**
         1310  +** The caller must be holding sufficient locks to ensure that no other
         1311  +** checkpoint is running (in any other thread or process) at the same
         1312  +** time.
  1185   1313   */
  1186   1314   static int walCheckpoint(
  1187   1315     Wal *pWal,                      /* Wal connection */
  1188   1316     int sync_flags,                 /* Flags for OsSync() (or 0) */
  1189   1317     int nBuf,                       /* Size of zBuf in bytes */
  1190   1318     u8 *zBuf                        /* Temporary buffer to use */
  1191   1319   ){
  1192   1320     int rc;                         /* Return code */
  1193   1321     int szPage = pWal->hdr.szPage;  /* Database page-size */
  1194   1322     WalIterator *pIter = 0;         /* Wal iterator context */
  1195   1323     u32 iDbpage = 0;                /* Next database page to write */
  1196   1324     u32 iFrame = 0;                 /* Wal frame containing data for iDbpage */
         1325  +  u32 mxSafeFrame;                /* Max frame that can be backfilled */
         1326  +  int i;                          /* Loop counter */
         1327  +  volatile WalIndexHdr *pHdr;     /* The actual wal-index header in SHM */
         1328  +  volatile WalCkptInfo *pInfo;    /* The checkpoint status information */
  1197   1329   
  1198   1330     /* Allocate the iterator */
  1199   1331     rc = walIteratorInit(pWal, &pIter);
  1200   1332     if( rc!=SQLITE_OK || pWal->hdr.mxFrame==0 ){
  1201         -    goto out;
         1333  +    walIteratorFree(pIter);
         1334  +    return rc;
  1202   1335     }
  1203   1336   
         1337  +  /*** TODO:  Move this test out to the caller.  Make it an assert() here ***/
  1204   1338     if( pWal->hdr.szPage!=nBuf ){
  1205         -    rc = SQLITE_CORRUPT_BKPT;
  1206         -    goto out;
  1207         -  }
  1208         -
  1209         -  /* Sync the log file to disk */
  1210         -  if( sync_flags ){
  1211         -    rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
  1212         -    if( rc!=SQLITE_OK ) goto out;
  1213         -  }
  1214         -
  1215         -  /* Iterate through the contents of the log, copying data to the db file. */
  1216         -  while( 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
  1217         -    rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, 
  1218         -        walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE
  1219         -    );
  1220         -    if( rc!=SQLITE_OK ) goto out;
  1221         -    rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage);
  1222         -    if( rc!=SQLITE_OK ) goto out;
  1223         -  }
  1224         -
  1225         -  /* Truncate the database file */
  1226         -  rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage));
  1227         -  if( rc!=SQLITE_OK ) goto out;
  1228         -
  1229         -  /* Sync the database file. If successful, update the wal-index. */
  1230         -  if( sync_flags ){
  1231         -    rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
  1232         -    if( rc!=SQLITE_OK ) goto out;
  1233         -  }
  1234         -  pWal->hdr.mxFrame = 0;
  1235         -  pWal->nCkpt++;
  1236         -  sqlite3Put4byte((u8*)pWal->hdr.aSalt,
  1237         -                   1 + sqlite3Get4byte((u8*)pWal->hdr.aSalt));
  1238         -  sqlite3_randomness(4, &pWal->hdr.aSalt[1]);
  1239         -  walIndexWriteHdr(pWal);
  1240         -
  1241         - out:
         1339  +    walIteratorFree(pIter);
         1340  +    return SQLITE_CORRUPT_BKPT;
         1341  +  }
         1342  +
         1343  +  /* Compute in mxSafeFrame the index of the last frame of the WAL that is
         1344  +  ** safe to write into the database.  Frames beyond mxSafeFrame might
         1345  +  ** overwrite database pages that are in use by active readers and thus
         1346  +  ** cannot be backfilled from the WAL.
         1347  +  */
         1348  +  mxSafeFrame = 0;
         1349  +  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
         1350  +  pInfo = (volatile WalCkptInfo*)&pHdr[2];
         1351  +  assert( pInfo==walCkptInfo(pWal) );
         1352  +  for(i=1; i<WAL_NREADER; i++){
         1353  +    u32 y = pInfo->aReadMark[i];
         1354  +    if( y>0 && (mxSafeFrame==0 || mxSafeFrame<y) ){
         1355  +      if( y<pWal->hdr.mxFrame
         1356  +       && (rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1))==SQLITE_OK
         1357  +      ){
         1358  +        pInfo->aReadMark[i] = 0;
         1359  +        walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
         1360  +      }else{
         1361  +        mxSafeFrame = y;
         1362  +      }
         1363  +    }
         1364  +  }
         1365  +
         1366  +  if( pInfo->nBackfill<mxSafeFrame
         1367  +   && (rc = walLockExclusive(pWal, WAL_READ_LOCK(0), 1))==SQLITE_OK
         1368  +  ){
         1369  +    u32 nBackfill = pInfo->nBackfill;
         1370  +
         1371  +    /* Sync the WAL to disk */
         1372  +    if( sync_flags ){
         1373  +      rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
         1374  +    }
         1375  +
         1376  +    /* Iterate through the contents of the WAL, copying data to the db file. */
         1377  +    while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
         1378  +      if( iFrame<=nBackfill || iFrame>mxSafeFrame ) continue;
         1379  +      rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, 
         1380  +          walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE
         1381  +      );
         1382  +      if( rc!=SQLITE_OK ) break;
         1383  +      rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage);
         1384  +      if( rc!=SQLITE_OK ) break;
         1385  +    }
         1386  +
         1387  +    /* If work was actually accomplished... */
         1388  +    if( rc==SQLITE_OK && pInfo->nBackfill<mxSafeFrame ){
         1389  +      pInfo->nBackfill = mxSafeFrame;
         1390  +      if( mxSafeFrame==pHdr[0].mxFrame && sync_flags ){
         1391  +        rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage));
         1392  +        if( rc==SQLITE_OK && sync_flags ){
         1393  +          rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
         1394  +        }
         1395  +      }
         1396  +    }
         1397  +
         1398  +    /* Release the reader lock held while backfilling */
         1399  +    walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
         1400  +  }
         1401  +
  1242   1402     walIteratorFree(pIter);
  1243   1403     return rc;
  1244   1404   }
  1245   1405   
  1246   1406   /*
  1247   1407   ** Close a connection to a log file.
  1248   1408   */
................................................................................
  1262   1422       ** the database. In this case checkpoint the database and unlink both
  1263   1423       ** the wal and wal-index files.
  1264   1424       **
  1265   1425       ** The EXCLUSIVE lock is not released before returning.
  1266   1426       */
  1267   1427       rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE);
  1268   1428       if( rc==SQLITE_OK ){
  1269         -      rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf, 0, 0);
         1429  +      pWal->exclusiveMode = 1;
         1430  +      rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);
  1270   1431         if( rc==SQLITE_OK ){
  1271   1432           isDelete = 1;
  1272   1433         }
  1273   1434         walIndexUnmap(pWal);
  1274   1435       }
  1275   1436   
  1276   1437       walIndexClose(pWal, isDelete);
................................................................................
  1286   1447   /*
  1287   1448   ** Try to read the wal-index header.  Return 0 on success and 1 if
  1288   1449   ** there is a problem.
  1289   1450   **
  1290   1451   ** The wal-index is in shared memory.  Another thread or process might
  1291   1452   ** be writing the header at the same time this procedure is trying to
  1292   1453   ** read it, which might result in inconsistency.  A dirty read is detected
  1293         -** by verifying a checksum on the header.
         1454  +** by verifying that both copies of the header are the same and also by
         1455  +** a checksum on the header.
  1294   1456   **
  1295   1457   ** If and only if the read is consistent and the header is different from
  1296   1458   ** pWal->hdr, then pWal->hdr is updated to the content of the new header
  1297   1459   ** and *pChanged is set to 1.
  1298   1460   **
  1299   1461   ** If the checksum cannot be verified return non-zero. If the header
  1300   1462   ** is read successfully and the checksum verified, return zero.
................................................................................
  1307   1469     if( pWal->szWIndex < WALINDEX_HDR_SIZE ){
  1308   1470       /* If the wal-index is not large enough to hold the header, then assume
  1309   1471       ** header is invalid. */
  1310   1472       return 1;
  1311   1473     }
  1312   1474     assert( pWal->pWiData );
  1313   1475   
  1314         -  /* Read the header. The caller may or may not have an exclusive 
  1315         -  ** (WRITE, PENDING, CHECKPOINT or RECOVER) lock on the wal-index
  1316         -  ** file, meaning it is possible that an inconsistent snapshot is read
         1476  +  /* Read the header. This might happen concurrently with a write to the
         1477  +  ** same area of shared memory on a different CPU in an SMP system,
         1478  +  ** meaning it is possible that an inconsistent snapshot is read
  1317   1479     ** from the file. If this happens, return non-zero.
  1318   1480     **
  1319   1481     ** There are two copies of the header at the beginning of the wal-index.
  1320   1482     ** When reading, read [0] first then [1].  Writes are in the reverse order.
  1321   1483     ** Memory barriers are used to prevent the compiler or the hardware from
  1322   1484     ** reordering the reads and writes.
  1323   1485     */
................................................................................
  1363   1525   ** after this routine returns.
  1364   1526   **
  1365   1527   ** If the wal-index header is successfully read, return SQLITE_OK. 
  1366   1528   ** Otherwise an SQLite error code.
  1367   1529   */
  1368   1530   static int walIndexReadHdr(Wal *pWal, int *pChanged){
  1369   1531     int rc;                         /* Return code */
  1370         -  int lockState;                  /* pWal->lockState before running recovery */
         1532  +  int badHdr;                     /* True if a header read failed */
  1371   1533   
  1372         -  assert( pWal->lockState>=SQLITE_SHM_READ );
  1373   1534     assert( pChanged );
  1374   1535     rc = walIndexMap(pWal, walMappingSize(1));
  1375   1536     if( rc!=SQLITE_OK ){
  1376   1537       return rc;
  1377   1538     }
  1378   1539   
  1379         -  /* First attempt to read the wal-index header. This may fail for one
  1380         -  ** of two reasons: (a) the wal-index does not yet exist or has been
  1381         -  ** corrupted and needs to be constructed by running recovery, or (b)
  1382         -  ** the caller is only holding a READ lock and made a dirty read of
  1383         -  ** the wal-index header.
  1384         -  **
  1385         -  ** A dirty read of the wal-index header occurs if another thread or
  1386         -  ** process happens to be writing to the wal-index header at roughly
  1387         -  ** the same time as this thread is reading it. In this case it is 
  1388         -  ** possible that an inconsistent header is read (which is detected
  1389         -  ** using the header checksum mechanism).
         1540  +  /* Try once to read the header straight out.  This works most of the
         1541  +  ** time.
         1542  +  */
         1543  +  badHdr = walIndexTryHdr(pWal, pChanged);
         1544  +
         1545  +  /* If the first attempt failed, it might have been due to a race
         1546  +  ** with a writer.  So get a WRITE lock and try again.
  1390   1547     */
  1391         -  if( walIndexTryHdr(pWal, pChanged)!=0 ){
  1392         -
  1393         -    /* If the first attempt to read the header failed, lock the wal-index
  1394         -    ** file with an exclusive lock and try again. If the header checksum 
  1395         -    ** verification fails again, we can be sure that it is not simply a
  1396         -    ** dirty read, but that the wal-index really does need to be 
  1397         -    ** reconstructed by running log recovery.
  1398         -    **
  1399         -    ** In the paragraph above, an "exclusive lock" may be any of WRITE,
  1400         -    ** PENDING, CHECKPOINT or RECOVER. If any of these are already held,
  1401         -    ** no locking operations are required. If the caller currently holds
  1402         -    ** a READ lock, then upgrade to a RECOVER lock before re-reading the
  1403         -    ** wal-index header and revert to a READ lock before returning.
  1404         -    */
  1405         -    lockState = pWal->lockState;
  1406         -    if( lockState>SQLITE_SHM_READ
  1407         -     || SQLITE_OK==(rc = walSetLock(pWal, SQLITE_SHM_RECOVER)) 
  1408         -    ){
  1409         -      if( walIndexTryHdr(pWal, pChanged) ){
  1410         -        *pChanged = 1;
         1548  +  assert( pWal->writeLock==0 );
         1549  +  if( badHdr ){
         1550  +    rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
         1551  +    if( rc==SQLITE_OK ){
         1552  +      pWal->writeLock = 1;
         1553  +      badHdr = walIndexTryHdr(pWal, pChanged);
         1554  +      if( badHdr ){
         1555  +        /* If the wal-index header is still malformed even while holding
         1556  +        ** a WRITE lock, it can only mean that the header is corrupted and
         1557  +        ** needs to be reconstructed.  So run recovery to do exactly that.
         1558  +        */
  1411   1559           rc = walIndexRecover(pWal);
  1412   1560         }
  1413         -      if( lockState==SQLITE_SHM_READ ){
  1414         -        walSetLock(pWal, SQLITE_SHM_READ);
  1415         -      }
         1561  +      walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
         1562  +      pWal->writeLock = 0;
         1563  +    }else if( rc!=SQLITE_BUSY ){
         1564  +      return rc;
  1416   1565       }
  1417   1566     }
  1418   1567   
  1419   1568     /* Make sure the mapping is large enough to cover the entire wal-index */
  1420   1569     if( rc==SQLITE_OK ){
  1421   1570       int szWanted = walMappingSize(pWal->hdr.mxFrame);
  1422   1571       if( pWal->szWIndex<szWanted ){
................................................................................
  1424   1573       }
  1425   1574     }
  1426   1575   
  1427   1576     return rc;
  1428   1577   }
  1429   1578   
  1430   1579   /*
  1431         -** Take a snapshot of the state of the WAL and wal-index for the current
  1432         -** instant in time.  The current thread will continue to use this snapshot.
  1433         -** Other threads might containing appending to the WAL and wal-index but
  1434         -** the extra content appended will be ignored by the current thread.
         1580  +** This is the value that walTryBeginRead returns when it needs to
         1581  +** be retried.
         1582  +*/
         1583  +#define WAL_RETRY  (-1)
         1584  +
         1585  +/*
         1586  +** Attempt to start a read transaction.  This might fail due to a race or
         1587  +** other transient condition.  When that happens, it returns WAL_RETRY to
         1588  +** indicate to the caller that it is safe to retry immediately.
         1589  +**
         1590  +** On success return SQLITE_OK.  On a permanent failure (such as an
         1591  +** I/O error or an SQLITE_BUSY because another process is running
         1592  +** recovery) return a positive error code.
         1593  +**
         1594  +** On success, this routine obtains a read lock on 
         1595  +** WAL_READ_LOCK(pWal->readLock).  The pWal->readLock integer is
         1596  +** in the range 0 <= pWal->readLock < WAL_NREADER.  If pWal->readLock==(-1)
         1597  +** that means the Wal does not hold any read lock.  The reader must not
         1598  +** access any database page that is modified by a WAL frame up to and
         1599  +** including frame number aReadMark[pWal->readLock].  The reader will
         1600  +** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0.
         1601  +** Or if pWal->readLock==0, then the reader will ignore the WAL
         1602  +** completely and get all content directly from the database file.
         1603  +** When the read transaction is completed, the caller must release the
         1604  +** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1.
  1435   1605   **
  1436         -** A snapshot is like a read transaction.
         1606  +** This routine uses the nBackfill and aReadMark[] fields of the header
         1607  +** to select a particular WAL_READ_LOCK() that strives to let the
         1608  +** checkpoint process do as much work as possible.  This routine might
         1609  +** update values of the aReadMark[] array in the header, but if it does
         1610  +** so it takes care to hold an exclusive lock on the corresponding
         1611  +** WAL_READ_LOCK() while changing values.
         1612  +*/
         1613  +static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal){
         1614  +  volatile WalIndexHdr *pHdr;     /* Header of the wal-index */
         1615  +  volatile WalCkptInfo *pInfo;    /* Checkpoint information in wal-index */
         1616  +  u32 mxReadMark;                 /* Largest aReadMark[] value */
         1617  +  int mxI;                        /* Index of largest aReadMark[] value */
         1618  +  int i;                          /* Loop counter */
         1619  +  int rc;                         /* Return code  */
         1620  +
         1621  +  assert( pWal->readLock<0 );  /* No read lock held on entry */
         1622  +
         1623  +  if( !useWal ){
         1624  +    rc = walIndexReadHdr(pWal, pChanged);
         1625  +    if( rc==SQLITE_BUSY ){
         1626  +      /* If there is not a recovery running in another thread or process
         1627  +      ** then convert BUSY errors to WAL_RETRY.  If recovery is known to
         1628  +      ** be running, convert BUSY to BUSY_RECOVERY.  There is a race here
         1629  +      ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
         1630  +      ** would be technically correct.  But the race is benign since with
         1631  +      ** WAL_RETRY this routine will be called again and will probably be
         1632  +      ** right on the second iteration.
         1633  +      */
         1634  +      rc = walLockShared(pWal, WAL_RECOVER_LOCK);
         1635  +      if( rc==SQLITE_OK ){
         1636  +        walUnlockShared(pWal, WAL_RECOVER_LOCK);
         1637  +        rc = WAL_RETRY;
         1638  +      }else if( rc==SQLITE_BUSY ){
         1639  +        rc = SQLITE_BUSY_RECOVERY;
         1640  +      }
         1641  +    }
         1642  +  }else{
         1643  +    rc = walIndexMap(pWal, pWal->hdr.mxFrame);
         1644  +  }
         1645  +  if( rc!=SQLITE_OK ){
         1646  +    return rc;
         1647  +  }
         1648  +
         1649  +  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
         1650  +  pInfo = (volatile WalCkptInfo*)&pHdr[2];
         1651  +  assert( pInfo==walCkptInfo(pWal) );
         1652  +  if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame ){
         1653  +    /* The WAL has been completely backfilled (or it is empty)
         1654  +    ** and can be safely ignored.
         1655  +    */
         1656  +    rc = walLockShared(pWal, WAL_READ_LOCK(0));
         1657  +    if( rc==SQLITE_OK ){
         1658  +      if( pHdr->mxFrame!=pWal->hdr.mxFrame ){
         1659  +        walUnlockShared(pWal, WAL_READ_LOCK(0));
         1660  +        return WAL_RETRY;
         1661  +      }
         1662  +      pWal->readLock = 0;
         1663  +      return SQLITE_OK;
         1664  +    }else if( rc!=SQLITE_BUSY ){
         1665  +      return rc;
         1666  +    }
         1667  +  }
         1668  +
         1669  +  /* If we get this far, it means that the reader will want to use
         1670  +  ** the WAL to get at content from recent commits.  The job now is
         1671  +  ** to select one of the aReadMark[] entries that is closest to
         1672  +  ** but not exceeding pWal->hdr.mxFrame and lock that entry.
         1673  +  */
         1674  +  mxReadMark = 0;
         1675  +  mxI = 0;
         1676  +  for(i=1; i<WAL_NREADER; i++){
         1677  +    u32 thisMark = pInfo->aReadMark[i];
         1678  +    if( mxReadMark<thisMark ){
         1679  +      mxReadMark = thisMark;
         1680  +      mxI = i;
         1681  +    }
         1682  +  }
         1683  +  if( mxI==0 ){
         1684  +    /* If we get here, it means that all of the aReadMark[] entries between
         1685  +    ** 1 and WAL_NREADER-1 are zero.  Try to initialize aReadMark[1] to
         1686  +    ** be mxFrame, then retry.
         1687  +    */
         1688  +    rc = walLockExclusive(pWal, WAL_READ_LOCK(1), 1);
         1689  +    if( rc==SQLITE_OK ){
         1690  +      pInfo->aReadMark[1] = pWal->hdr.mxFrame;
         1691  +      walUnlockExclusive(pWal, WAL_READ_LOCK(1), 1);
         1692  +    }
         1693  +    return WAL_RETRY;
         1694  +  }else{
         1695  +    if( mxReadMark < pWal->hdr.mxFrame ){
         1696  +      for(i=0; i<WAL_NREADER; i++){
         1697  +        rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
         1698  +        if( rc==SQLITE_OK ){
         1699  +          pInfo->aReadMark[i] = pWal->hdr.mxFrame;
         1700  +          mxReadMark = pWal->hdr.mxFrame;
         1701  +          mxI = i;
         1702  +          walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
         1703  +          break;
         1704  +        }
         1705  +      }
         1706  +    }
         1707  +
         1708  +    rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
         1709  +    if( rc ){
         1710  +      return rc==SQLITE_BUSY ? WAL_RETRY : rc;
         1711  +    }
         1712  +    if( pInfo->aReadMark[mxI]!=mxReadMark
         1713  +     || pHdr[0].mxFrame!=pWal->hdr.mxFrame
         1714  +     || (sqlite3OsShmBarrier(pWal->pDbFd), pHdr[1].mxFrame!=pWal->hdr.mxFrame)
         1715  +    ){
         1716  +      walUnlockShared(pWal, WAL_READ_LOCK(mxI));
         1717  +      return WAL_RETRY;
         1718  +    }else{
         1719  +      pWal->readLock = mxI;
         1720  +    }
         1721  +  }
         1722  +  return rc;
         1723  +}
         1724  +
         1725  +/*
         1726  +** Begin a read transaction on the database.
  1437   1727   **
  1438         -** No other threads are allowed to run a checkpoint while this thread is
  1439         -** holding the snapshot since a checkpoint would remove data out from under
  1440         -** this thread.
         1728  +** This routine used to be called sqlite3WalOpenSnapshot() and with good reason:
         1729  +** it takes a snapshot of the state of the WAL and wal-index for the current
         1730  +** instant in time.  The current thread will continue to use this snapshot.
         1731  +** Other threads might append new content to the WAL and wal-index but
         1732  +** that extra content is ignored by the current thread.
  1441   1733   **
  1442         -** If this call obtains a new read-lock and the database contents have been
  1443         -** modified since the most recent call to WalCloseSnapshot() on this Wal
  1444         -** connection, then *pChanged is set to 1 before returning. Otherwise, it 
  1445         -** is left unmodified. This is used by the pager layer to determine whether 
  1446         -** or not any cached pages may be safely reused.
         1734  +** If the database contents have changed since the previous read
         1735  +** transaction, then *pChanged is set to 1 before returning.  The
         1736  +** Pager layer will use this to know that its cache is stale and
         1737  +** needs to be flushed.
  1447   1738   */
  1448         -int sqlite3WalOpenSnapshot(Wal *pWal, int *pChanged){
         1739  +int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
  1449   1740     int rc;                         /* Return code */
  1450   1741   
  1451         -  rc = walSetLock(pWal, SQLITE_SHM_READ);
  1452         -  assert( rc!=SQLITE_OK || pWal->lockState==SQLITE_SHM_READ );
  1453         -
  1454         -  if( rc==SQLITE_OK ){
  1455         -    rc = walIndexReadHdr(pWal, pChanged);
  1456         -    if( rc!=SQLITE_OK ){
  1457         -      /* An error occured while attempting log recovery. */
  1458         -      sqlite3WalCloseSnapshot(pWal);
  1459         -    }
  1460         -  }
  1461         -
         1742  +  do{
         1743  +    rc = walTryBeginRead(pWal, pChanged, 0);
         1744  +  }while( rc==WAL_RETRY );
  1462   1745     walIndexUnmap(pWal);
  1463   1746     return rc;
  1464   1747   }
  1465   1748   
  1466   1749   /*
  1467         -** Unlock the current snapshot.
         1750  +** Finish with a read transaction.  All this does is release the
         1751  +** read-lock.
  1468   1752   */
  1469         -void sqlite3WalCloseSnapshot(Wal *pWal){
  1470         -  assert( pWal->lockState==SQLITE_SHM_READ
  1471         -       || pWal->lockState==SQLITE_SHM_UNLOCK
  1472         -  );
  1473         -  walSetLock(pWal, SQLITE_SHM_UNLOCK);
         1753  +void sqlite3WalEndReadTransaction(Wal *pWal){
         1754  +  if( pWal->readLock>=0 ){
         1755  +    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
         1756  +    pWal->readLock = -1;
         1757  +  }
  1474   1758   }
  1475   1759   
  1476   1760   /*
  1477         -** Read a page from the log, if it is present. 
         1761  +** Read a page from the WAL, if it is present in the WAL and if the 
         1762  +** current read transaction is configured to use the WAL.  
         1763  +**
         1764  +** The *pInWal is set to 1 if the requested page is in the WAL and
         1765  +** has been loaded.  Or *pInWal is set to 0 if the page was not in 
         1766  +** the WAL and needs to be read out of the database.
  1478   1767   */
  1479   1768   int sqlite3WalRead(
  1480   1769     Wal *pWal,                      /* WAL handle */
  1481   1770     Pgno pgno,                      /* Database page number to read data for */
  1482   1771     int *pInWal,                    /* OUT: True if data is read from WAL */
  1483   1772     int nOut,                       /* Size of buffer pOut in bytes */
  1484   1773     u8 *pOut                        /* Buffer to write page data to */
  1485   1774   ){
  1486   1775     int rc;                         /* Return code */
  1487   1776     u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
  1488   1777     u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
  1489   1778     int iHash;                      /* Used to loop through N hash tables */
         1779  +
         1780  +  /* This routine is only called from within a read transaction */
         1781  +  assert( pWal->readLock>=0 );
  1490   1782   
  1491   1783     /* If the "last page" field of the wal-index header snapshot is 0, then
  1492   1784     ** no data will be read from the wal under any circumstances. Return early
  1493         -  ** in this case to avoid the walIndexMap/Unmap overhead.
         1785  +  ** in this case to avoid the walIndexMap/Unmap overhead.  Likewise, if
         1786  +  ** pWal->readLock==0, then the WAL is ignored by the reader so
         1787  +  ** return early, as if the WAL were empty.
  1494   1788     */
  1495         -  if( iLast==0 ){
         1789  +  if( iLast==0 || pWal->readLock==0 ){
  1496   1790       *pInWal = 0;
  1497   1791       return SQLITE_OK;
  1498   1792     }
  1499   1793   
  1500   1794     /* Ensure the wal-index is mapped. */
  1501         -  assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
  1502   1795     rc = walIndexMap(pWal, walMappingSize(iLast));
  1503   1796     if( rc!=SQLITE_OK ){
  1504   1797       return rc;
  1505   1798     }
  1506   1799   
  1507   1800     /* Search the hash table or tables for an entry matching page number
  1508   1801     ** pgno. Each iteration of the following for() loop searches one
................................................................................
  1603   1896   }
  1604   1897   
  1605   1898   
  1606   1899   /* 
  1607   1900   ** Set *pPgno to the size of the database file (or zero, if unknown).
  1608   1901   */
  1609   1902   void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno){
  1610         -  assert( pWal->lockState==SQLITE_SHM_READ
  1611         -       || pWal->lockState==SQLITE_SHM_WRITE );
         1903  +  assert( pWal->readLock>=0 );
  1612   1904     *pPgno = pWal->hdr.nPage;
  1613   1905   }
         1906  +
  1614   1907   
  1615   1908   /* 
  1616         -** This function returns SQLITE_OK if the caller may write to the database.
  1617         -** Otherwise, if the caller is operating on a snapshot that has already
  1618         -** been overwritten by another writer, SQLITE_BUSY is returned.
         1909  +** This function starts a write transaction on the WAL.
         1910  +**
         1911  +** A read transaction must have already been started by a prior call
         1912  +** to sqlite3WalBeginReadTransaction().
         1913  +**
         1914  +** If another thread or process has written into the database since
         1915  +** the read transaction was started, then it is not possible for this
         1916  +** thread to write as doing so would cause a fork.  So this routine
         1917  +** returns SQLITE_BUSY in that case and no write transaction is started.
         1918  +**
         1919  +** There can only be a single writer active at a time.
  1619   1920   */
  1620         -int sqlite3WalWriteLock(Wal *pWal, int op){
  1621         -  int rc = SQLITE_OK;
  1622         -  if( op ){
  1623         -    assert( pWal->lockState==SQLITE_SHM_READ );
  1624         -    rc = walSetLock(pWal, SQLITE_SHM_WRITE);
         1921  +int sqlite3WalBeginWriteTransaction(Wal *pWal){
         1922  +  int rc;
         1923  +  volatile WalCkptInfo *pInfo;
  1625   1924   
  1626         -    /* If this connection is not reading the most recent database snapshot,
  1627         -    ** it is not possible to write to the database. In this case release
  1628         -    ** the write locks and return SQLITE_BUSY.
  1629         -    */
         1925  +  /* Cannot start a write transaction without first holding a read
         1926  +  ** transaction. */
         1927  +  assert( pWal->readLock>=0 );
         1928  +
         1929  +  /* Only one writer allowed at a time.  Get the write lock.  Return
         1930  +  ** SQLITE_BUSY if unable.
         1931  +  */
         1932  +  rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
         1933  +  if( rc ){
         1934  +    return rc;
         1935  +  }
         1936  +
         1937  +  /* If another connection has written to the database file since the
         1938  +  ** time the read transaction on this connection was started, then
         1939  +  ** the write is disallowed.
         1940  +  */
         1941  +  rc = walIndexMap(pWal, pWal->hdr.mxFrame);
         1942  +  if( rc ){
         1943  +    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
         1944  +    return rc;
         1945  +  }
         1946  +  if( memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr))!=0 ){
         1947  +    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
         1948  +    walIndexUnmap(pWal);
         1949  +    return SQLITE_BUSY;
         1950  +  }
         1951  +
         1952  +  pInfo = walCkptInfo(pWal);
         1953  +  if( pWal->readLock==0 && pInfo->nBackfill==pWal->hdr.mxFrame ){
         1954  +    rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
  1630   1955       if( rc==SQLITE_OK ){
  1631         -      rc = walIndexMap(pWal, walMappingSize(1));
  1632         -      assert( pWal->szWIndex>=WALINDEX_HDR_SIZE || rc!=SQLITE_OK );
  1633         -      if( rc==SQLITE_OK
  1634         -       && memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr))
  1635         -      ){
  1636         -        rc = SQLITE_BUSY;
  1637         -      }
  1638         -      walIndexUnmap(pWal);
  1639         -      if( rc!=SQLITE_OK ){
  1640         -        walSetLock(pWal, SQLITE_SHM_READ);
  1641         -      }
         1956  +      /* All readers are using WAL_READ_LOCK(0) (in other words, no readers
         1957  +      ** are currently using the WAL), so the WAL can be restarted. */
         1958  +      pWal->nCkpt++;
         1959  +      pWal->hdr.mxFrame = 0;
         1960  +      sqlite3Put4byte((u8*)pWal->hdr.aSalt,
         1961  +                       1 + sqlite3Get4byte((u8*)pWal->hdr.aSalt));
         1962  +      sqlite3_randomness(4, &pWal->hdr.aSalt[1]);
         1963  +      walIndexWriteHdr(pWal);
         1964  +      pInfo->nBackfill = 0;
         1965  +      memset(&pInfo->aReadMark[1], 0, sizeof(pInfo->aReadMark)-sizeof(u32));
         1966  +      rc = sqlite3OsTruncate(pWal->pDbFd, 
         1967  +                             ((i64)pWal->hdr.nPage*(i64)pWal->szPage));
         1968  +      walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
  1642   1969       }
  1643         -  }else if( pWal->lockState==SQLITE_SHM_WRITE ){
  1644         -    rc = walSetLock(pWal, SQLITE_SHM_READ);
         1970  +    walUnlockShared(pWal, WAL_READ_LOCK(0));
         1971  +    do{
         1972  +      int notUsed;
         1973  +      rc = walTryBeginRead(pWal, &notUsed, 1);
         1974  +    }while( rc==WAL_RETRY );
  1645   1975     }
  1646   1976     return rc;
  1647   1977   }
         1978  +
         1979  +/*
         1980  +** End a write transaction.  The commit has already been done.  This
         1981  +** routine merely releases the lock.
         1982  +*/
         1983  +int sqlite3WalEndWriteTransaction(Wal *pWal){
         1984  +  walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
         1985  +  return SQLITE_OK;
         1986  +}
  1648   1987   
  1649   1988   /*
  1650   1989   ** If any data has been written (but not committed) to the log file, this
  1651   1990   ** function moves the write-pointer back to the start of the transaction.
  1652   1991   **
  1653   1992   ** Additionally, the callback function is invoked for each frame written
  1654         -** to the log since the start of the transaction. If the callback returns
         1993  +** to the WAL since the start of the transaction. If the callback returns
  1655   1994   ** other than SQLITE_OK, it is not invoked again and the error code is
  1656   1995   ** returned to the caller.
  1657   1996   **
  1658   1997   ** Otherwise, if the callback function does not return an error, this
  1659   1998   ** function returns SQLITE_OK.
  1660   1999   */
  1661   2000   int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
  1662   2001     int rc = SQLITE_OK;
  1663         -  if( pWal->lockState==SQLITE_SHM_WRITE ){
         2002  +  if( pWal->writeLock ){
  1664   2003       int unused;
  1665   2004       Pgno iMax = pWal->hdr.mxFrame;
  1666   2005       Pgno iFrame;
  1667   2006     
  1668   2007       assert( pWal->pWiData==0 );
  1669   2008       rc = walIndexReadHdr(pWal, &unused);
  1670   2009       if( rc==SQLITE_OK ){
  1671   2010         rc = walIndexMap(pWal, walMappingSize(iMax));
  1672   2011       }
  1673   2012       if( rc==SQLITE_OK ){
  1674   2013         for(iFrame=pWal->hdr.mxFrame+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){
  1675         -        assert( pWal->lockState==SQLITE_SHM_WRITE );
         2014  +        assert( pWal->writeLock );
  1676   2015           rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
  1677   2016         }
  1678   2017         walCleanupHash(pWal);
  1679   2018       }
  1680   2019       walIndexUnmap(pWal);
  1681   2020     }
  1682   2021     return rc;
................................................................................
  1685   2024   /* 
  1686   2025   ** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32 
  1687   2026   ** values. This function populates the array with values required to 
  1688   2027   ** "rollback" the write position of the WAL handle back to the current 
  1689   2028   ** point in the event of a savepoint rollback (via WalSavepointUndo()).
  1690   2029   */
  1691   2030   void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){
  1692         -  assert( pWal->lockState==SQLITE_SHM_WRITE );
         2031  +  assert( pWal->writeLock );
  1693   2032     aWalData[0] = pWal->hdr.mxFrame;
  1694   2033     aWalData[1] = pWal->hdr.aFrameCksum[0];
  1695   2034     aWalData[2] = pWal->hdr.aFrameCksum[1];
  1696   2035   }
  1697   2036   
  1698   2037   /* 
  1699   2038   ** Move the write position of the WAL back to the point identified by
  1700   2039   ** the values in the aWalData[] array. aWalData must point to an array
  1701   2040   ** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated
  1702   2041   ** by a call to WalSavepoint().
  1703   2042   */
  1704   2043   int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
  1705   2044     int rc = SQLITE_OK;
  1706         -  assert( pWal->lockState==SQLITE_SHM_WRITE );
         2045  +  assert( pWal->writeLock );
  1707   2046   
  1708   2047     assert( aWalData[0]<=pWal->hdr.mxFrame );
  1709   2048     if( aWalData[0]<pWal->hdr.mxFrame ){
  1710   2049       rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
  1711   2050       pWal->hdr.mxFrame = aWalData[0];
  1712   2051       pWal->hdr.aFrameCksum[0] = aWalData[1];
  1713   2052       pWal->hdr.aFrameCksum[1] = aWalData[2];
................................................................................
  1735   2074     u32 iFrame;                     /* Next frame address */
  1736   2075     u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
  1737   2076     PgHdr *p;                       /* Iterator to run through pList with. */
  1738   2077     PgHdr *pLast = 0;               /* Last frame in list */
  1739   2078     int nLast = 0;                  /* Number of extra copies of last page */
  1740   2079   
  1741   2080     assert( pList );
  1742         -  assert( pWal->lockState==SQLITE_SHM_WRITE );
         2081  +  assert( pWal->writeLock );
  1743   2082     assert( pWal->pWiData==0 );
  1744   2083   
  1745   2084     /* If this is the first frame written into the log, write the WAL
  1746   2085     ** header to the start of the WAL file. See comments at the top of
  1747   2086     ** this source file for a description of the WAL header format.
  1748   2087     */
  1749   2088     iFrame = pWal->hdr.mxFrame;
................................................................................
  1848   2187     }
  1849   2188   
  1850   2189     walIndexUnmap(pWal);
  1851   2190     return rc;
  1852   2191   }
  1853   2192   
  1854   2193   /* 
  1855         -** Checkpoint the database:
         2194  +** This routine is called to implement sqlite3_wal_checkpoint() and
         2195  +** related interfaces.
  1856   2196   **
  1857         -**   1. Acquire a CHECKPOINT lock
  1858         -**   2. Copy the contents of the log into the database file.
  1859         -**   3. Zero the wal-index header (so new readers will ignore the log).
  1860         -**   4. Drop the CHECKPOINT lock.
         2197  +** Obtain a CHECKPOINT lock and then backfill as much information as
         2198  +** we can from WAL into the database.
  1861   2199   */
  1862   2200   int sqlite3WalCheckpoint(
  1863   2201     Wal *pWal,                      /* Wal connection */
  1864   2202     int sync_flags,                 /* Flags to sync db file with (or 0) */
  1865   2203     int nBuf,                       /* Size of temporary buffer */
  1866         -  u8 *zBuf,                       /* Temporary buffer to use */
  1867         -  int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
  1868         -  void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
         2204  +  u8 *zBuf                        /* Temporary buffer to use */
  1869   2205   ){
  1870   2206     int rc;                         /* Return code */
  1871   2207     int isChanged = 0;              /* True if a new wal-index header is loaded */
  1872   2208   
  1873   2209     assert( pWal->pWiData==0 );
  1874   2210   
  1875         -  /* Get the CHECKPOINT lock. 
  1876         -  **
  1877         -  ** Normally, the connection will be in UNLOCK state at this point. But
  1878         -  ** if the connection is in exclusive-mode it may still be in READ state
  1879         -  ** even though the upper layer has no active read-transaction (because
  1880         -  ** WalCloseSnapshot() is not called in exclusive mode). The state will
  1881         -  ** be set to UNLOCK when this function returns. This is Ok.
  1882         -  */
  1883         -  assert( (pWal->lockState==SQLITE_SHM_UNLOCK)
  1884         -       || (pWal->lockState==SQLITE_SHM_READ) );
  1885         -  walSetLock(pWal, SQLITE_SHM_UNLOCK);
  1886         -  do {
  1887         -    rc = walSetLock(pWal, SQLITE_SHM_CHECKPOINT);
  1888         -  }while( rc==SQLITE_BUSY && xBusyHandler(pBusyHandlerArg) );
  1889         -  if( rc!=SQLITE_OK ){
  1890         -    walSetLock(pWal, SQLITE_SHM_UNLOCK);
         2211  +  rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
         2212  +  if( rc ){
         2213  +    /* Usually this is SQLITE_BUSY meaning that another thread or process
         2214  +    ** is already running a checkpoint, or maybe a recovery.  But it might
         2215  +    ** also be SQLITE_IOERR. */
  1891   2216       return rc;
  1892   2217     }
  1893   2218   
  1894   2219     /* Copy data from the log to the database file. */
  1895   2220     rc = walIndexReadHdr(pWal, &isChanged);
  1896   2221     if( rc==SQLITE_OK ){
  1897   2222       rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);
................................................................................
  1904   2229       ** the cache needs to be reset.
  1905   2230       */
  1906   2231       memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  1907   2232     }
  1908   2233   
  1909   2234     /* Release the locks. */
  1910   2235     walIndexUnmap(pWal);
  1911         -  walSetLock(pWal, SQLITE_SHM_UNLOCK);
         2236  +  walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
  1912   2237     return rc;
  1913   2238   }
  1914   2239   
  1915   2240   /* Return the value to pass to a sqlite3_wal_hook callback, the
  1916   2241   ** number of frames in the WAL at the point of the last commit since
  1917   2242   ** sqlite3WalCallback() was called.  If no commits have occurred since
  1918   2243   ** the last call, then return 0.
................................................................................
  1930   2255   ** This function is called to set or query the exclusive-mode flag 
  1931   2256   ** associated with the WAL connection passed as the first argument. The
  1932   2257   ** exclusive-mode flag should be set to indicate that the caller is
  1933   2258   ** holding an EXCLUSIVE lock on the database file (it does this in
  1934   2259   ** locking_mode=exclusive mode). If the EXCLUSIVE lock is to be dropped,
  1935   2260   ** the flag set by this function should be cleared before doing so.
  1936   2261   **
  1937         -** The value of the exclusive-mode flag may only be modified when
  1938         -** the WAL connection is in READ state.
  1939         -**
  1940   2262   ** When the flag is set, this module does not call the VFS xShmLock()
  1941   2263   ** method to obtain any locks on the wal-index (as it assumes it
  1942   2264   ** has exclusive access to the wal and wal-index files anyhow). It
  1943   2265   ** continues to hold (and does not drop) the existing READ lock on
  1944   2266   ** the wal-index.
  1945   2267   **
  1946   2268   ** To set or clear the flag, the "op" parameter is passed 1 or 0,
  1947   2269   ** respectively. To query the flag, pass -1. In all cases, the value
  1948   2270   ** returned is the value of the exclusive-mode flag (after its value
  1949   2271   ** has been modified, if applicable).
  1950   2272   */
  1951   2273   int sqlite3WalExclusiveMode(Wal *pWal, int op){
  1952   2274     if( op>=0 ){
  1953         -    assert( pWal->lockState==SQLITE_SHM_READ );
  1954   2275       pWal->exclusiveMode = (u8)op;
  1955   2276     }
  1956   2277     return pWal->exclusiveMode;
  1957   2278   }
  1958   2279   
  1959   2280   #endif /* #ifndef SQLITE_OMIT_WAL */

Changes to src/wal.h.

    16     16   
    17     17   #ifndef _WAL_H_
    18     18   #define _WAL_H_
    19     19   
    20     20   #include "sqliteInt.h"
    21     21   
    22     22   #ifdef SQLITE_OMIT_WAL
    23         -# define sqlite3WalOpen(x,y,z)             0
    24         -# define sqlite3WalClose(w,x,y,z)          0
    25         -# define sqlite3WalOpenSnapshot(y,z)       0
    26         -# define sqlite3WalCloseSnapshot(z) 
    27         -# define sqlite3WalRead(v,w,x,y,z)         0
           23  +# define sqlite3WalOpen(x,y,z)                 0
           24  +# define sqlite3WalClose(w,x,y,z)              0
           25  +# define sqlite3WalBeginReadTransaction(y,z)   0
           26  +# define sqlite3WalEndReadTransaction(z)
           27  +# define sqlite3WalRead(v,w,x,y,z)             0
    28     28   # define sqlite3WalDbsize(y,z)
    29         -# define sqlite3WalWriteLock(y,z)          0
    30         -# define sqlite3WalUndo(x,y,z)             0
           29  +# define sqlite3WalBeginWriteTransaction(y)    0
           30  +# define sqlite3WalEndWriteTransaction(x)      0
           31  +# define sqlite3WalUndo(x,y,z)                 0
    31     32   # define sqlite3WalSavepoint(y,z)
    32         -# define sqlite3WalSavepointUndo(y,z)      0
    33         -# define sqlite3WalFrames(u,v,w,x,y,z)     0
    34         -# define sqlite3WalCheckpoint(u,v,w,x,y,z) 0
    35         -# define sqlite3WalCallback(z)             0
           33  +# define sqlite3WalSavepointUndo(y,z)          0
           34  +# define sqlite3WalFrames(u,v,w,x,y,z)         0
           35  +# define sqlite3WalCheckpoint(u,v,w,x)         0
           36  +# define sqlite3WalCallback(z)                 0
    36     37   #else
    37     38   
    38     39   #define WAL_SAVEPOINT_NDATA 3
    39     40   
    40     41   /* Connection to a write-ahead log (WAL) file. 
    41     42   ** There is one object of this type for each pager. 
    42     43   */
................................................................................
    49     50   /* Used by readers to open (lock) and close (unlock) a snapshot.  A 
    50     51   ** snapshot is like a read-transaction.  It is the state of the database
    51     52   ** at an instant in time.  sqlite3WalBeginReadTransaction() gets a read lock
    52     53   ** and preserves the current state even if other threads or processes
    53     54   ** write to or checkpoint the WAL.  sqlite3WalEndReadTransaction() closes the
    54     55   ** transaction and releases the lock.
    55     56   */
    56         -int sqlite3WalOpenSnapshot(Wal *pWal, int *);
    57         -void sqlite3WalCloseSnapshot(Wal *pWal);
           57  +int sqlite3WalBeginReadTransaction(Wal *pWal, int *);
           58  +void sqlite3WalEndReadTransaction(Wal *pWal);
    58     59   
    59     60   /* Read a page from the write-ahead log, if it is present. */
    60     61   int sqlite3WalRead(Wal *pWal, Pgno pgno, int *pInWal, int nOut, u8 *pOut);
    61     62   
    62     63   /* Return the size of the database as it existed at the beginning
    63     64   ** of the snapshot */
    64     65   void sqlite3WalDbsize(Wal *pWal, Pgno *pPgno);
    65     66   
    66     67   /* Obtain or release the WRITER lock. */
    67         -int sqlite3WalWriteLock(Wal *pWal, int op);
           68  +int sqlite3WalBeginWriteTransaction(Wal *pWal);
           69  +int sqlite3WalEndWriteTransaction(Wal *pWal);
    68     70   
    69     71   /* Undo any frames written (but not committed) to the log */
    70     72   int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx);
    71     73   
    72     74   /* Return an integer that records the current (uncommitted) write
    73     75   ** position in the WAL */
    74     76   void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData);
................................................................................
    81     83   int sqlite3WalFrames(Wal *pWal, int, PgHdr *, Pgno, int, int);
    82     84   
    83     85   /* Copy pages from the log to the database file */ 
    84     86   int sqlite3WalCheckpoint(
    85     87     Wal *pWal,                      /* Write-ahead log connection */
    86     88     int sync_flags,                 /* Flags to sync db file with (or 0) */
    87     89     int nBuf,                       /* Size of buffer zBuf */
    88         -  u8 *zBuf,                       /* Temporary buffer to use */
    89         -  int (*xBusyHandler)(void *),    /* Pointer to busy-handler function */
    90         -  void *pBusyHandlerArg           /* Argument to pass to xBusyHandler */
           90  +  u8 *zBuf                        /* Temporary buffer to use */
    91     91   );
    92     92   
    93     93   /* Return the value to pass to a sqlite3_wal_hook callback, the
    94     94   ** number of frames in the WAL at the point of the last commit since
    95     95   ** sqlite3WalCallback() was called.  If no commits have occurred since
    96     96   ** the last call, then return 0.
    97     97   */