/ Check-in [f295e7ed]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge the experimental shared-memory mmap-by-chunk changes into the trunk.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: f295e7ed5f04f6b8bffdaff8b565be3836ce4e80
User & Date: drh 2010-06-14 17:09:53
Context
2010-06-14
18:01
Move the xShmMap method to in between xShmLock and xShmBarrier, since it seems to fit in there logically. check-in: 58dfd83d user: drh tags: trunk
17:09
Merge the experimental shared-memory mmap-by-chunk changes into the trunk. check-in: f295e7ed user: drh tags: trunk
16:16
Add the new xShmMap (formerly xShmPage) to os_win.c. check-in: 13e7a824 user: dan tags: experimental
2010-06-11
17:01
Refactor and simplify the logic used to change journalmode. check-in: 95cc3f6f user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/os.c.

    97     97   }
    98     98   int sqlite3OsDeviceCharacteristics(sqlite3_file *id){
    99     99     return id->pMethods->xDeviceCharacteristics(id);
   100    100   }
   101    101   int sqlite3OsShmOpen(sqlite3_file *id){
   102    102     return id->pMethods->xShmOpen(id);
   103    103   }
   104         -int sqlite3OsShmSize(sqlite3_file *id, int reqSize, int *pNewSize){
   105         -  return id->pMethods->xShmSize(id, reqSize, pNewSize);
   106         -}
   107         -int sqlite3OsShmGet(sqlite3_file *id,int reqSize,int *pSize,void volatile **pp){
   108         -  return id->pMethods->xShmGet(id, reqSize, pSize, pp);
   109         -}
   110         -int sqlite3OsShmRelease(sqlite3_file *id){
   111         -  return id->pMethods->xShmRelease(id);
   112         -}
   113    104   int sqlite3OsShmLock(sqlite3_file *id, int offset, int n, int flags){
   114    105     return id->pMethods->xShmLock(id, offset, n, flags);
   115    106   }
   116    107   void sqlite3OsShmBarrier(sqlite3_file *id){
   117    108     id->pMethods->xShmBarrier(id);
   118    109   }
   119    110   int sqlite3OsShmClose(sqlite3_file *id, int deleteFlag){
   120    111     return id->pMethods->xShmClose(id, deleteFlag);
   121    112   }
          113  +int sqlite3OsShmMap(
          114  +  sqlite3_file *id, 
          115  +  int iPage, 
          116  +  int pgsz, 
          117  +  int isWrite, 
          118  +  void volatile **pp
          119  +){
          120  +  return id->pMethods->xShmMap(id, iPage, pgsz, isWrite, pp);
          121  +}
   122    122   
   123    123   /*
   124    124   ** The next group of routines are convenience wrappers around the
   125    125   ** VFS methods.
   126    126   */
   127    127   int sqlite3OsOpen(
   128    128     sqlite3_vfs *pVfs, 

Changes to src/os.h.

   244    244   int sqlite3OsUnlock(sqlite3_file*, int);
   245    245   int sqlite3OsCheckReservedLock(sqlite3_file *id, int *pResOut);
   246    246   int sqlite3OsFileControl(sqlite3_file*,int,void*);
   247    247   #define SQLITE_FCNTL_DB_UNCHANGED 0xca093fa0
   248    248   int sqlite3OsSectorSize(sqlite3_file *id);
   249    249   int sqlite3OsDeviceCharacteristics(sqlite3_file *id);
   250    250   int sqlite3OsShmOpen(sqlite3_file *id);
   251         -int sqlite3OsShmSize(sqlite3_file *id, int, int*);
   252         -int sqlite3OsShmGet(sqlite3_file *id, int, int*, void volatile**);
   253         -int sqlite3OsShmRelease(sqlite3_file *id);
   254    251   int sqlite3OsShmLock(sqlite3_file *id, int, int, int);
   255    252   void sqlite3OsShmBarrier(sqlite3_file *id);
   256    253   int sqlite3OsShmClose(sqlite3_file *id, int);
          254  +int sqlite3OsShmMap(sqlite3_file *,int,int,int,void volatile **);
   257    255   
   258    256   /* 
   259    257   ** Functions for accessing sqlite3_vfs methods 
   260    258   */
   261    259   int sqlite3OsOpen(sqlite3_vfs *, const char *, sqlite3_file*, int, int *);
   262    260   int sqlite3OsDelete(sqlite3_vfs *, const char *, int);
   263    261   int sqlite3OsAccess(sqlite3_vfs *, const char *, int, int *pResOut);

Changes to src/os_unix.c.

  3124   3124   ** 
  3125   3125   **      fid
  3126   3126   **      zFilename
  3127   3127   **
  3128   3128   ** Either unixShmNode.mutex must be held or unixShmNode.nRef==0 and
  3129   3129   ** unixMutexHeld() is true when reading or writing any other field
  3130   3130   ** in this structure.
  3131         -**
  3132         -** To avoid deadlocks, mutex and mutexBuf are always released in the
  3133         -** reverse order that they are acquired.  mutexBuf is always acquired
  3134         -** first and released last.  This invariant is check by asserting
  3135         -** sqlite3_mutex_notheld() on mutex whenever mutexBuf is acquired or
  3136         -** released.
  3137   3131   */
  3138   3132   struct unixShmNode {
  3139   3133     unixInodeInfo *pInode;     /* unixInodeInfo that owns this SHM node */
  3140   3134     sqlite3_mutex *mutex;      /* Mutex to access this object */
  3141         -  sqlite3_mutex *mutexBuf;   /* Mutex to access zBuf[] */
  3142   3135     char *zFilename;           /* Name of the mmapped file */
  3143   3136     int h;                     /* Open file descriptor */
  3144         -  int szMap;                 /* Size of the mapping into memory */
  3145         -  char *pMMapBuf;            /* Where currently mmapped().  NULL if unmapped */
         3137  +  int szRegion;              /* Size of shared-memory regions */
         3138  +  int nRegion;               /* Size of array apRegion */
         3139  +  char **apRegion;           /* Array of mapped shared-memory regions */
  3146   3140     int nRef;                  /* Number of unixShm objects pointing to this */
  3147   3141     unixShm *pFirst;           /* All unixShm objects pointing to this */
  3148   3142   #ifdef SQLITE_DEBUG
  3149   3143     u8 exclMask;               /* Mask of exclusive locks held */
  3150   3144     u8 sharedMask;             /* Mask of shared locks held */
  3151   3145     u8 nextShmId;              /* Next available unixShm.id value */
  3152   3146   #endif
................................................................................
  3165   3159   ** All other fields are read/write.  The unixShm.pFile->mutex must be held
  3166   3160   ** while accessing any read/write fields.
  3167   3161   */
  3168   3162   struct unixShm {
  3169   3163     unixShmNode *pShmNode;     /* The underlying unixShmNode object */
  3170   3164     unixShm *pNext;            /* Next unixShm with the same unixShmNode */
  3171   3165     u8 hasMutex;               /* True if holding the unixShmNode mutex */
  3172         -  u8 hasMutexBuf;            /* True if holding pFile->mutexBuf */
  3173   3166     u16 sharedMask;            /* Mask of shared locks held */
  3174   3167     u16 exclMask;              /* Mask of exclusive locks held */
  3175   3168   #ifdef SQLITE_DEBUG
  3176   3169     u8 id;                     /* Id of this connection within its unixShmNode */
  3177   3170   #endif
  3178   3171   };
  3179   3172   
................................................................................
  3262   3255   ** This is not a VFS shared-memory method; it is a utility function called
  3263   3256   ** by VFS shared-memory methods.
  3264   3257   */
  3265   3258   static void unixShmPurge(unixFile *pFd){
  3266   3259     unixShmNode *p = pFd->pInode->pShmNode;
  3267   3260     assert( unixMutexHeld() );
  3268   3261     if( p && p->nRef==0 ){
         3262  +    int i;
  3269   3263       assert( p->pInode==pFd->pInode );
  3270   3264       if( p->mutex ) sqlite3_mutex_free(p->mutex);
  3271         -    if( p->mutexBuf ) sqlite3_mutex_free(p->mutexBuf);
  3272         -    if( p->pMMapBuf ) munmap(p->pMMapBuf, p->szMap);
         3265  +    for(i=0; i<p->nRegion; i++){
         3266  +      munmap(p->apRegion[i], p->szRegion);
         3267  +    }
         3268  +    sqlite3_free(p->apRegion);
  3273   3269       if( p->h>=0 ) close(p->h);
  3274   3270       p->pInode->pShmNode = 0;
  3275   3271       sqlite3_free(p);
  3276   3272     }
  3277   3273   }
  3278   3274   
  3279   3275   /* Forward reference */
................................................................................
  3341   3337       pDbFd->pInode->pShmNode = pShmNode;
  3342   3338       pShmNode->pInode = pDbFd->pInode;
  3343   3339       pShmNode->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
  3344   3340       if( pShmNode->mutex==0 ){
  3345   3341         rc = SQLITE_NOMEM;
  3346   3342         goto shm_open_err;
  3347   3343       }
  3348         -    pShmNode->mutexBuf = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
  3349         -    if( pShmNode->mutexBuf==0 ){
  3350         -      rc = SQLITE_NOMEM;
  3351         -      goto shm_open_err;
  3352         -    }
  3353   3344   
  3354   3345       pShmNode->h = open(pShmNode->zFilename, O_RDWR|O_CREAT, 0664);
  3355   3346       if( pShmNode->h<0 ){
  3356   3347         rc = SQLITE_CANTOPEN_BKPT;
  3357   3348         goto shm_open_err;
  3358   3349       }
  3359   3350   
................................................................................
  3416   3407     /* Remove connection p from the set of connections associated
  3417   3408     ** with pShmNode */
  3418   3409     sqlite3_mutex_enter(pShmNode->mutex);
  3419   3410     for(pp=&pShmNode->pFirst; (*pp)!=p; pp = &(*pp)->pNext){}
  3420   3411     *pp = p->pNext;
  3421   3412   
  3422   3413     /* Free the connection p */
  3423         -  assert( p->hasMutexBuf==0 );
  3424   3414     sqlite3_free(p);
  3425   3415     pDbFd->pShm = 0;
  3426   3416     sqlite3_mutex_leave(pShmNode->mutex);
  3427   3417   
  3428   3418     /* If pShmNode->nRef has reached 0, then close the underlying
  3429   3419     ** shared-memory file, too */
  3430   3420     unixEnterMutex();
................................................................................
  3434   3424       if( deleteFlag ) unlink(pShmNode->zFilename);
  3435   3425       unixShmPurge(pDbFd);
  3436   3426     }
  3437   3427     unixLeaveMutex();
  3438   3428   
  3439   3429     return SQLITE_OK;
  3440   3430   }
  3441         -
  3442         -/*
  3443         -** Changes the size of the underlying storage for  a shared-memory segment.
  3444         -**
  3445         -** The reqSize parameter is the new requested size of the shared memory.
  3446         -** This implementation is free to increase the shared memory size to
  3447         -** any amount greater than or equal to reqSize.  If the shared memory is
  3448         -** already as big or bigger as reqSize, this routine is a no-op.
  3449         -**
  3450         -** The reqSize parameter is the minimum size requested.  The implementation
  3451         -** is free to expand the storage to some larger amount if it chooses.
  3452         -*/
  3453         -static int unixShmSize(
  3454         -  sqlite3_file *fd,         /* The open database file holding SHM */
  3455         -  int reqSize,              /* Requested size.  -1 for query only */
  3456         -  int *pNewSize             /* Write new size here */
  3457         -){
  3458         -  unixFile *pDbFd = (unixFile*)fd;
  3459         -  unixShm *p = pDbFd->pShm;
  3460         -  unixShmNode *pShmNode = p->pShmNode;
  3461         -  int rc = SQLITE_OK;
  3462         -  struct stat sStat;
  3463         -
  3464         -  assert( pShmNode==pDbFd->pInode->pShmNode );
  3465         -  assert( pShmNode->pInode==pDbFd->pInode );
  3466         -
  3467         -  while( 1 ){
  3468         -    if( fstat(pShmNode->h, &sStat)==0 ){
  3469         -      *pNewSize = (int)sStat.st_size;
  3470         -      if( reqSize<=(int)sStat.st_size ) break;
  3471         -    }else{
  3472         -      *pNewSize = 0;
  3473         -      rc = SQLITE_IOERR_SHMSIZE;
  3474         -      break;
  3475         -    }
  3476         -    rc = ftruncate(pShmNode->h, reqSize);
  3477         -    reqSize = -1;
  3478         -  }
  3479         -  return rc;
  3480         -}
  3481         -
  3482         -/*
  3483         -** Release the lock held on the shared memory segment to that other
  3484         -** threads are free to resize it if necessary.
  3485         -**
  3486         -** If the lock is not currently held, this routine is a harmless no-op.
  3487         -**
  3488         -** If the shared-memory object is in lock state RECOVER, then we do not
  3489         -** really want to release the lock, so in that case too, this routine
  3490         -** is a no-op.
  3491         -*/
  3492         -static int unixShmRelease(sqlite3_file *fd){
  3493         -  unixFile *pDbFd = (unixFile*)fd;
  3494         -  unixShm *p = pDbFd->pShm;
  3495         -
  3496         -  if( p->hasMutexBuf ){
  3497         -    assert( sqlite3_mutex_notheld(p->pShmNode->mutex) );
  3498         -    sqlite3_mutex_leave(p->pShmNode->mutexBuf);
  3499         -    p->hasMutexBuf = 0;
  3500         -  }
  3501         -  return SQLITE_OK;
  3502         -}
  3503         -
  3504         -/*
  3505         -** Map the shared storage into memory. 
  3506         -**
  3507         -** If reqMapSize is positive, then an attempt is made to make the
  3508         -** mapping at least reqMapSize bytes in size.  However, the mapping
  3509         -** will never be larger than the size of the underlying shared memory
  3510         -** as set by prior calls to xShmSize().  
  3511         -**
  3512         -** *ppBuf is made to point to the memory which is a mapping of the
  3513         -** underlying storage.  A mutex is acquired to prevent other threads
  3514         -** from running while *ppBuf is in use in order to prevent other threads
  3515         -** remapping *ppBuf out from under this thread.  The unixShmRelease()
  3516         -** call will release the mutex.  However, if the lock state is CHECKPOINT,
  3517         -** the mutex is not acquired because CHECKPOINT will never remap the
  3518         -** buffer.  RECOVER might remap, though, so CHECKPOINT will acquire
  3519         -** the mutex if and when it promotes to RECOVER.
  3520         -**
  3521         -** RECOVER needs to be atomic.  The same mutex that prevents *ppBuf from
  3522         -** being remapped also prevents more than one thread from being in
  3523         -** RECOVER at a time.  But, RECOVER sometimes wants to remap itself.
  3524         -** To prevent RECOVER from losing its lock while remapping, the
  3525         -** mutex is not released by unixShmRelease() when in RECOVER.
  3526         -**
  3527         -** *pNewMapSize is set to the size of the mapping.  Usually *pNewMapSize
  3528         -** will be reqMapSize or larger, though it could be smaller if the
  3529         -** underlying shared memory has never been enlarged to reqMapSize bytes
  3530         -** by prior calls to xShmSize().
  3531         -**
  3532         -** *ppBuf might be NULL and zero if no space has
  3533         -** yet been allocated to the underlying storage.
  3534         -*/
  3535         -static int unixShmGet(
  3536         -  sqlite3_file *fd,        /* Database file holding shared memory */
  3537         -  int reqMapSize,          /* Requested size of mapping. -1 means don't care */
  3538         -  int *pNewMapSize,        /* Write new size of mapping here */
  3539         -  void volatile **ppBuf    /* Write mapping buffer origin here */
  3540         -){
  3541         -  unixFile *pDbFd = (unixFile*)fd;
  3542         -  unixShm *p = pDbFd->pShm;
  3543         -  unixShmNode *pShmNode = p->pShmNode;
  3544         -  int rc = SQLITE_OK;
  3545         -
  3546         -  assert( pShmNode==pDbFd->pInode->pShmNode );
  3547         -  assert( pShmNode->pInode==pDbFd->pInode );
  3548         -
  3549         -  if( p->hasMutexBuf==0 ){
  3550         -    assert( sqlite3_mutex_notheld(pShmNode->mutex) );
  3551         -    sqlite3_mutex_enter(pShmNode->mutexBuf);
  3552         -    p->hasMutexBuf = 1;
  3553         -  }
  3554         -  sqlite3_mutex_enter(pShmNode->mutex);
  3555         -  if( pShmNode->szMap==0 || reqMapSize>pShmNode->szMap ){
  3556         -    int actualSize;
  3557         -    if( unixShmSize(fd, -1, &actualSize)!=SQLITE_OK ){
  3558         -      actualSize = 0;
  3559         -    }
  3560         -    reqMapSize = actualSize;
  3561         -    if( pShmNode->pMMapBuf || reqMapSize<=0 ){
  3562         -      munmap(pShmNode->pMMapBuf, pShmNode->szMap);
  3563         -    }
  3564         -    if( reqMapSize>0 ){
  3565         -      pShmNode->pMMapBuf = mmap(0, reqMapSize, PROT_READ|PROT_WRITE, MAP_SHARED,
  3566         -                             pShmNode->h, 0);
  3567         -      pShmNode->szMap = pShmNode->pMMapBuf ? reqMapSize : 0;
  3568         -    }else{
  3569         -      pShmNode->pMMapBuf = 0;
  3570         -      pShmNode->szMap = 0;
  3571         -    }
  3572         -  }
  3573         -  *pNewMapSize = pShmNode->szMap;
  3574         -  *ppBuf = pShmNode->pMMapBuf;
  3575         -  sqlite3_mutex_leave(pShmNode->mutex);
  3576         -  if( *ppBuf==0 ){
  3577         -    /* Do not hold the mutex if a NULL pointer is being returned. */
  3578         -    unixShmRelease(fd);
  3579         -  }
  3580         -  return rc;
  3581         -}
  3582         -
  3583   3431   
  3584   3432   /*
  3585   3433   ** Change the lock state for a shared-memory segment.
  3586   3434   **
  3587   3435   ** Note that the relationship between SHAREd and EXCLUSIVE locks is a little
  3588   3436   ** different here than in posix.  In xShmLock(), one can go from unlocked
  3589   3437   ** to shared and back or from unlocked to exclusive and back.  But one may
................................................................................
  3696   3544   /*
  3697   3545   ** Implement a memory barrier or memory fence on shared memory.  
  3698   3546   **
  3699   3547   ** All loads and stores begun before the barrier must complete before
  3700   3548   ** any load or store begun after the barrier.
  3701   3549   */
  3702   3550   static void unixShmBarrier(
  3703         -  sqlite3_file *fd           /* Database file holding the shared memory */
         3551  +  sqlite3_file *fd                /* Database file holding the shared memory */
  3704   3552   ){
  3705   3553     unixEnterMutex();
  3706   3554     unixLeaveMutex();
  3707   3555   }
  3708   3556   
         3557  +/*
         3558  +** This function is called to obtain a pointer to region iRegion of the 
         3559  +** shared-memory associated with the database file fd. Shared-memory regions 
         3560  +** are numbered starting from zero. Each shared-memory region is szRegion 
         3561  +** bytes in size.
         3562  +**
         3563  +** If an error occurs, an error code is returned and *pp is set to NULL.
         3564  +**
         3565  +** Otherwise, if the isWrite parameter is 0 and the requested shared-memory
         3566  +** region has not been allocated (by any client, including one running in a
         3567  +** separate process), then *pp is set to NULL and SQLITE_OK returned. If 
         3568  +** isWrite is non-zero and the requested shared-memory region has not yet 
         3569  +** been allocated, it is allocated by this function.
         3570  +**
         3571  +** If the shared-memory region has already been allocated or is allocated by
         3572  +** this call as described above, then it is mapped into this process's 
         3573  +** address space (if it is not already), *pp is set to point to the mapped 
         3574  +** memory and SQLITE_OK returned.
         3575  +*/
         3576  +static int unixShmMap(
         3577  +  sqlite3_file *fd,               /* Handle open on database file */
         3578  +  int iRegion,                    /* Region to retrieve */
         3579  +  int szRegion,                   /* Size of regions */
         3580  +  int isWrite,                    /* True to extend file if necessary */
         3581  +  void volatile **pp              /* OUT: Mapped memory */
         3582  +){
         3583  +  unixFile *pDbFd = (unixFile*)fd;
         3584  +  unixShm *p = pDbFd->pShm;
         3585  +  unixShmNode *pShmNode = p->pShmNode;
         3586  +  int rc = SQLITE_OK;
         3587  +
         3588  +  sqlite3_mutex_enter(pShmNode->mutex);
         3589  +  assert( szRegion==pShmNode->szRegion || pShmNode->nRegion==0 );
         3590  +
         3591  +  if( pShmNode->nRegion<=iRegion ){
         3592  +    char **apNew;                      /* New apRegion[] array */
         3593  +    int nByte = (iRegion+1)*szRegion;  /* Minimum required file size */
         3594  +    struct stat sStat;                 /* Used by fstat() */
         3595  +
         3596  +    pShmNode->szRegion = szRegion;
         3597  +
         3598  +    /* The requested region is not mapped into this process's address space.
         3599  +    ** Check to see if it has been allocated (i.e. if the wal-index file is
         3600  +    ** large enough to contain the requested region).
         3601  +    */
         3602  +    if( fstat(pShmNode->h, &sStat) ){
         3603  +      rc = SQLITE_IOERR_SHMSIZE;
         3604  +      goto shmpage_out;
         3605  +    }
         3606  +
         3607  +    if( sStat.st_size<nByte ){
         3608  +      /* The requested memory region does not exist. If isWrite is set to
         3609  +      ** zero, exit early. *pp will be set to NULL and SQLITE_OK returned.
         3610  +      **
         3611  +      ** Alternatively, if isWrite is non-zero, use ftruncate() to allocate
         3612  +      ** the requested memory region.
         3613  +      */
         3614  +      if( !isWrite ) goto shmpage_out;
         3615  +      if( ftruncate(pShmNode->h, nByte) ){
         3616  +        rc = SQLITE_IOERR_SHMSIZE;
         3617  +        goto shmpage_out;
         3618  +      }  
         3619  +    }
         3620  +
         3621  +    /* Map the requested memory region into this process's address space. */
         3622  +    apNew = (char **)sqlite3_realloc(
         3623  +        pShmNode->apRegion, (iRegion+1)*sizeof(char *)
         3624  +    );
         3625  +    if( !apNew ){
         3626  +      rc = SQLITE_IOERR_NOMEM;
         3627  +      goto shmpage_out;
         3628  +    }
         3629  +    pShmNode->apRegion = apNew;
         3630  +    while(pShmNode->nRegion<=iRegion){
         3631  +      void *pMem = mmap(0, szRegion, PROT_READ|PROT_WRITE, 
         3632  +          MAP_SHARED, pShmNode->h, iRegion*szRegion
         3633  +      );
         3634  +      if( pMem==MAP_FAILED ){
         3635  +        rc = SQLITE_IOERR;
         3636  +        goto shmpage_out;
         3637  +      }
         3638  +      pShmNode->apRegion[pShmNode->nRegion] = pMem;
         3639  +      pShmNode->nRegion++;
         3640  +    }
         3641  +  }
         3642  +
         3643  +shmpage_out:
         3644  +  if( pShmNode->nRegion>iRegion ){
         3645  +    *pp = pShmNode->apRegion[iRegion];
         3646  +  }else{
         3647  +    *pp = 0;
         3648  +  }
         3649  +  sqlite3_mutex_leave(pShmNode->mutex);
         3650  +  return rc;
         3651  +}
  3709   3652   
  3710   3653   #else
  3711   3654   # define unixShmOpen    0
  3712         -# define unixShmSize    0
  3713         -# define unixShmGet     0
  3714         -# define unixShmRelease 0
  3715   3655   # define unixShmLock    0
  3716   3656   # define unixShmBarrier 0
  3717   3657   # define unixShmClose   0
         3658  +# define unixShmMap     0
  3718   3659   #endif /* #ifndef SQLITE_OMIT_WAL */
  3719   3660   
  3720   3661   /*
  3721   3662   ** Here ends the implementation of all sqlite3_file methods.
  3722   3663   **
  3723   3664   ********************** End sqlite3_file Methods *******************************
  3724   3665   ******************************************************************************/
................................................................................
  3769   3710      LOCK,                       /* xLock */                                   \
  3770   3711      UNLOCK,                     /* xUnlock */                                 \
  3771   3712      CKLOCK,                     /* xCheckReservedLock */                      \
  3772   3713      unixFileControl,            /* xFileControl */                            \
  3773   3714      unixSectorSize,             /* xSectorSize */                             \
  3774   3715      unixDeviceCharacteristics,  /* xDeviceCharacteristics */                  \
  3775   3716      unixShmOpen,                /* xShmOpen */                                \
  3776         -   unixShmSize,                /* xShmSize */                                \
  3777         -   unixShmGet,                 /* xShmGet */                                 \
  3778         -   unixShmRelease,             /* xShmRelease */                             \
  3779   3717      unixShmLock,                /* xShmLock */                                \
  3780   3718      unixShmBarrier,             /* xShmBarrier */                             \
  3781         -   unixShmClose                /* xShmClose */                               \
         3719  +   unixShmClose,               /* xShmClose */                               \
         3720  +   unixShmMap                  /* xShmMap */                                 \
  3782   3721   };                                                                           \
  3783   3722   static const sqlite3_io_methods *FINDER##Impl(const char *z, unixFile *p){   \
  3784   3723     UNUSED_PARAMETER(z); UNUSED_PARAMETER(p);                                  \
  3785   3724     return &METHOD;                                                            \
  3786   3725   }                                                                            \
  3787   3726   static const sqlite3_io_methods *(*const FINDER)(const char*,unixFile *p)    \
  3788   3727       = FINDER##Impl;

Changes to src/os_win.c.

  1212   1212   ** reverse order that they are acquired.  mutexBuf is always acquired
  1213   1213   ** first and released last.  This invariant is checked by asserting
  1214   1214   ** sqlite3_mutex_notheld() on mutex whenever mutexBuf is acquired or
  1215   1215   ** released.
  1216   1216   */
  1217   1217   struct winShmNode {
  1218   1218     sqlite3_mutex *mutex;      /* Mutex to access this object */
  1219         -  sqlite3_mutex *mutexBuf;   /* Mutex to access zBuf[] */
  1220   1219     char *zFilename;           /* Name of the file */
  1221   1220     winFile hFile;             /* File handle from winOpen */
  1222         -  HANDLE hMap;               /* File handle from CreateFileMapping */
         1221  +
         1222  +  int szRegion;              /* Size of shared-memory regions */
         1223  +  int nRegion;               /* Size of array aRegion */
         1224  +  struct ShmRegion {
         1225  +    HANDLE hMap;             /* File handle from CreateFileMapping */
         1226  +    void *pMap;
         1227  +  } *aRegion;
  1223   1228     DWORD lastErrno;           /* The Windows errno from the last I/O error */
  1224         -  int szMap;                 /* Size of the mapping of file into memory */
  1225         -  char *pMMapBuf;            /* Where currently mmapped().  NULL if unmapped */
         1229  +
  1226   1230     int nRef;                  /* Number of winShm objects pointing to this */
  1227   1231     winShm *pFirst;            /* All winShm objects pointing to this */
  1228   1232     winShmNode *pNext;         /* Next in list of all winShmNode objects */
  1229   1233   #ifdef SQLITE_DEBUG
  1230   1234     u8 nextShmId;              /* Next available winShm.id value */
  1231   1235   #endif
  1232   1236   };
................................................................................
  1321   1325   static void winShmPurge(sqlite3_vfs *pVfs, int deleteFlag){
  1322   1326     winShmNode **pp;
  1323   1327     winShmNode *p;
  1324   1328     assert( winShmMutexHeld() );
  1325   1329     pp = &winShmNodeList;
  1326   1330     while( (p = *pp)!=0 ){
  1327   1331       if( p->nRef==0 ){
         1332  +      int i;
  1328   1333         if( p->mutex ) sqlite3_mutex_free(p->mutex);
  1329         -      if( p->mutexBuf ) sqlite3_mutex_free(p->mutexBuf);
  1330         -      if( p->pMMapBuf ){
  1331         -        UnmapViewOfFile(p->pMMapBuf);
  1332         -      }
  1333         -      if( INVALID_HANDLE_VALUE != p->hMap ){
  1334         -        CloseHandle(p->hMap);
         1334  +      for(i=0; i<p->nRegion; i++){
         1335  +        UnmapViewOfFile(p->aRegion[i].pMap);
         1336  +        CloseHandle(p->aRegion[i].hMap);
  1335   1337         }
  1336   1338         if( p->hFile.h != INVALID_HANDLE_VALUE ) {
  1337   1339           winClose((sqlite3_file *)&p->hFile);
  1338   1340         }
  1339   1341         if( deleteFlag ) winDelete(pVfs, p->zFilename, 0);
  1340   1342         *pp = p->pNext;
         1343  +      sqlite3_free(p->aRegion);
  1341   1344         sqlite3_free(p);
  1342   1345       }else{
  1343   1346         pp = &p->pNext;
  1344   1347       }
  1345   1348     }
  1346   1349   }
  1347   1350   
................................................................................
  1400   1403       if( sqlite3StrICmp(pShmNode->zFilename, pNew->zFilename)==0 ) break;
  1401   1404     }
  1402   1405     if( pShmNode ){
  1403   1406       sqlite3_free(pNew);
  1404   1407     }else{
  1405   1408       pShmNode = pNew;
  1406   1409       pNew = 0;
  1407         -    pShmNode->pMMapBuf = NULL;
  1408         -    pShmNode->hMap = INVALID_HANDLE_VALUE;
  1409   1410       ((winFile*)(&pShmNode->hFile))->h = INVALID_HANDLE_VALUE;
  1410   1411       pShmNode->pNext = winShmNodeList;
  1411   1412       winShmNodeList = pShmNode;
  1412   1413   
  1413   1414       pShmNode->mutex = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
  1414   1415       if( pShmNode->mutex==0 ){
  1415   1416         rc = SQLITE_NOMEM;
  1416         -      goto shm_open_err;
  1417         -    }
  1418         -    pShmNode->mutexBuf = sqlite3_mutex_alloc(SQLITE_MUTEX_FAST);
  1419         -    if( pShmNode->mutexBuf==0 ){
  1420         -      rc = SQLITE_NOMEM;
  1421   1417         goto shm_open_err;
  1422   1418       }
  1423   1419       rc = winOpen(pDbFd->pVfs,
  1424   1420                    pShmNode->zFilename,             /* Name of the file (UTF-8) */
  1425   1421                    (sqlite3_file*)&pShmNode->hFile,  /* File handle here */
  1426   1422                    SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, /* Mode flags */
  1427   1423                    0);
................................................................................
  1503   1499     }
  1504   1500     winShmLeaveMutex();
  1505   1501   
  1506   1502     return SQLITE_OK;
  1507   1503   }
  1508   1504   
  1509   1505   /*
  1510         -** Increase the size of the underlying storage for a shared-memory segment.
         1506  +** This function is called to obtain a pointer to region iRegion of the 
         1507  +** shared-memory associated with the database file fd. Shared-memory regions 
         1508  +** are numbered starting from zero. Each shared-memory region is szRegion 
         1509  +** bytes in size.
         1510  +**
         1511  +** If an error occurs, an error code is returned and *pp is set to NULL.
  1511   1512   **
  1512         -** The reqSize parameter is the new requested minimum size of the underlying
  1513         -** shared memory.  This routine may choose to make the shared memory larger
  1514         -** than this value (for example to round the shared memory size up to an
  1515         -** operating-system dependent page size.)
         1513  +** Otherwise, if the isWrite parameter is 0 and the requested shared-memory
         1514  +** region has not been allocated (by any client, including one running in a
         1515  +** separate process), then *pp is set to NULL and SQLITE_OK returned. If 
         1516  +** isWrite is non-zero and the requested shared-memory region has not yet 
         1517  +** been allocated, it is allocated by this function.
  1516   1518   **
  1517         -** This routine will only grow the size of shared memory.  A request for
  1518         -** a smaller size is a no-op.
  1519         -*/
  1520         -static int winShmSize(
  1521         -  sqlite3_file *fd,         /* Database holding the shared memory */
  1522         -  int reqSize,              /* Requested size.  -1 for query only */
  1523         -  int *pNewSize             /* Write new size here */
  1524         -){
  1525         -  winFile *pDbFd = (winFile*)fd;
  1526         -  winShm *p = pDbFd->pShm;
  1527         -  winShmNode *pShmNode = p->pShmNode;
  1528         -  int rc = SQLITE_OK;
  1529         -
  1530         -  *pNewSize = 0;
  1531         -  if( reqSize>=0 ){
  1532         -    sqlite3_int64 sz;
  1533         -    rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz);
  1534         -    if( SQLITE_OK==rc && reqSize>sz ){
  1535         -      rc = winTruncate((sqlite3_file *)&pShmNode->hFile, reqSize);
  1536         -    }
  1537         -  }
  1538         -  if( SQLITE_OK==rc ){
  1539         -    sqlite3_int64 sz;
  1540         -    rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz);
  1541         -    if( SQLITE_OK==rc ){
  1542         -      *pNewSize = (int)sz;
  1543         -    }else{
  1544         -      rc = SQLITE_IOERR;
  1545         -    }
  1546         -  }
  1547         -  return rc;
  1548         -}
  1549         -
  1550         -
  1551         -/*
  1552         -** Map the shared storage into memory.  The minimum size of the
  1553         -** mapping should be reqMapSize if reqMapSize is positive.  If
  1554         -** reqMapSize is zero or negative, the implementation can choose
  1555         -** whatever mapping size is convenient.
  1556         -**
  1557         -** *ppBuf is made to point to the memory which is a mapping of the
  1558         -** underlying storage.  A mutex is acquired to prevent other threads
  1559         -** from running while *ppBuf is in use in order to prevent other threads
  1560         -** remapping *ppBuf out from under this thread.  The winShmRelease()
  1561         -** call will release the mutex.  However, if the lock state is CHECKPOINT,
  1562         -** the mutex is not acquired because CHECKPOINT will never remap the
  1563         -** buffer.  RECOVER might remap, though, so CHECKPOINT will acquire
  1564         -** the mutex if and when it promotes to RECOVER.
  1565         -**
  1566         -** RECOVER needs to be atomic.  The same mutex that prevents *ppBuf from
  1567         -** being remapped also prevents more than one thread from being in
  1568         -** RECOVER at a time.  But, RECOVER sometimes wants to remap itself.
  1569         -** To prevent RECOVER from losing its lock while remapping, the
  1570         -** mutex is not released by winShmRelease() when in RECOVER.
  1571         -**
  1572         -** *pNewMapSize is set to the size of the mapping.
  1573         -**
  1574         -** *ppBuf and *pNewMapSize might be NULL and zero if no space has
  1575         -** yet been allocated to the underlying storage.
  1576         -*/
  1577         -static int winShmGet(
  1578         -  sqlite3_file *fd,        /* The database file holding the shared memory */
  1579         -  int reqMapSize,          /* Requested size of mapping. -1 means don't care */
  1580         -  int *pNewMapSize,        /* Write new size of mapping here */
  1581         -  void volatile **ppBuf    /* Write mapping buffer origin here */
         1519  +** If the shared-memory region has already been allocated or is allocated by
         1520  +** this call as described above, then it is mapped into this process's 
         1521  +** address space (if it is not already), *pp is set to point to the mapped 
         1522  +** memory and SQLITE_OK returned.
         1523  +*/
         1524  +static int winShmMap(
         1525  +  sqlite3_file *fd,               /* Handle open on database file */
         1526  +  int iRegion,                    /* Region to retrieve */
         1527  +  int szRegion,                   /* Size of regions */
         1528  +  int isWrite,                    /* True to extend file if necessary */
         1529  +  void volatile **pp              /* OUT: Mapped memory */
  1582   1530   ){
  1583   1531     winFile *pDbFd = (winFile*)fd;
  1584   1532     winShm *p = pDbFd->pShm;
  1585   1533     winShmNode *pShmNode = p->pShmNode;
  1586   1534     int rc = SQLITE_OK;
  1587   1535   
  1588         -  if( p->hasMutexBuf==0 ){
  1589         -    assert( sqlite3_mutex_notheld(pShmNode->mutex) );
  1590         -    sqlite3_mutex_enter(pShmNode->mutexBuf);
  1591         -    p->hasMutexBuf = 1;
  1592         -  }
  1593   1536     sqlite3_mutex_enter(pShmNode->mutex);
  1594         -  if( pShmNode->szMap==0 || reqMapSize>pShmNode->szMap ){
  1595         -    int actualSize;
  1596         -    if( winShmSize(fd, -1, &actualSize)==SQLITE_OK
  1597         -     && reqMapSize<actualSize
  1598         -    ){
  1599         -      reqMapSize = actualSize;
  1600         -    }
  1601         -    if( pShmNode->pMMapBuf ){
  1602         -      if( !UnmapViewOfFile(pShmNode->pMMapBuf) ){
         1537  +  assert( szRegion==pShmNode->szRegion || pShmNode->nRegion==0 );
         1538  +
         1539  +  if( pShmNode->nRegion<=iRegion ){
         1540  +    struct ShmRegion *apNew;           /* New aRegion[] array */
         1541  +    int nByte = (iRegion+1)*szRegion;  /* Minimum required file size */
         1542  +    sqlite3_int64 sz;                  /* Current size of wal-index file */
         1543  +
         1544  +    pShmNode->szRegion = szRegion;
         1545  +
         1546  +    /* The requested region is not mapped into this process's address space.
         1547  +    ** Check to see if it has been allocated (i.e. if the wal-index file is
         1548  +    ** large enough to contain the requested region).
         1549  +    */
         1550  +    rc = winFileSize((sqlite3_file *)&pShmNode->hFile, &sz);
         1551  +    if( rc!=SQLITE_OK ){
         1552  +      goto shmpage_out;
         1553  +    }
         1554  +
         1555  +    if( sz<nByte ){
         1556  +      /* The requested memory region does not exist. If isWrite is set to
         1557  +      ** zero, exit early. *pp will be set to NULL and SQLITE_OK returned.
         1558  +      **
         1559  +      ** Alternatively, if isWrite is non-zero, use winTruncate() to allocate
         1560  +      ** the requested memory region.
         1561  +      */
         1562  +      if( !isWrite ) goto shmpage_out;
         1563  +      rc = winTruncate((sqlite3_file *)&pShmNode->hFile, nByte);
         1564  +      if( rc!=SQLITE_OK ){
         1565  +        goto shmpage_out;
         1566  +      }
         1567  +    }
         1568  +
         1569  +    /* Map the requested memory region into this process's address space. */
         1570  +    apNew = (struct ShmRegion *)sqlite3_realloc(
         1571  +        pShmNode->aRegion, (iRegion+1)*sizeof(apNew[0])
         1572  +    );
         1573  +    if( !apNew ){
         1574  +      rc = SQLITE_IOERR_NOMEM;
         1575  +      goto shmpage_out;
         1576  +    }
         1577  +    pShmNode->aRegion = apNew;
         1578  +
         1579  +    while( pShmNode->nRegion<=iRegion ){
         1580  +      HANDLE hMap;                /* file-mapping handle */
         1581  +      void *pMap = 0;             /* Mapped memory region */
         1582  +     
         1583  +      hMap = CreateFileMapping(pShmNode->hFile.h, 
         1584  +          NULL, PAGE_READWRITE, 0, nByte, NULL
         1585  +      );
         1586  +      if( hMap ){
         1587  +        pMap = MapViewOfFile(hMap, FILE_MAP_WRITE | FILE_MAP_READ,
         1588  +            0, 0, nByte
         1589  +        );
         1590  +      }
         1591  +      if( !pMap ){
  1603   1592           pShmNode->lastErrno = GetLastError();
  1604   1593           rc = SQLITE_IOERR;
         1594  +        if( hMap ) CloseHandle(hMap);
         1595  +        goto shmpage_out;
  1605   1596         }
  1606         -      CloseHandle(pShmNode->hMap);
  1607         -      pShmNode->hMap = INVALID_HANDLE_VALUE;
  1608         -    }
  1609         -    if( SQLITE_OK == rc ){
  1610         -      pShmNode->pMMapBuf = 0;
  1611         -      if( reqMapSize == 0 ){
  1612         -        /* can't create 0 byte file mapping in Windows */
  1613         -        pShmNode->szMap = 0;
  1614         -      }else{
  1615         -        /* create the file mapping object */
  1616         -        if( INVALID_HANDLE_VALUE == pShmNode->hMap ){
  1617         -          /* TBD provide an object name to each file
  1618         -          ** mapping so it can be re-used across processes.
  1619         -          */
  1620         -          pShmNode->hMap = CreateFileMapping(pShmNode->hFile.h,
  1621         -                                          NULL,
  1622         -                                          PAGE_READWRITE,
  1623         -                                          0,
  1624         -                                          reqMapSize,
  1625         -                                          NULL);
  1626         -        }
  1627         -        if( NULL==pShmNode->hMap ){
  1628         -          pShmNode->lastErrno = GetLastError();
  1629         -          rc = SQLITE_IOERR;
  1630         -          pShmNode->szMap = 0;
  1631         -          pShmNode->hMap = INVALID_HANDLE_VALUE;
  1632         -        }else{
  1633         -          pShmNode->pMMapBuf = MapViewOfFile(pShmNode->hMap,
  1634         -                                          FILE_MAP_WRITE | FILE_MAP_READ,
  1635         -                                          0,
  1636         -                                          0,
  1637         -                                          reqMapSize);
  1638         -          if( !pShmNode->pMMapBuf ){
  1639         -            pShmNode->lastErrno = GetLastError();
  1640         -            rc = SQLITE_IOERR;
  1641         -            pShmNode->szMap = 0;
  1642         -          }else{
  1643         -            pShmNode->szMap = reqMapSize;
  1644         -          }
  1645         -        }
  1646         -      }
  1647         -    }
  1648         -  }
  1649         -  *pNewMapSize = pShmNode->szMap;
  1650         -  *ppBuf = pShmNode->pMMapBuf;
         1597  +
         1598  +      pShmNode->aRegion[pShmNode->nRegion].pMap = pMap;
         1599  +      pShmNode->aRegion[pShmNode->nRegion].hMap = hMap;
         1600  +      pShmNode->nRegion++;
         1601  +    }
         1602  +  }
         1603  +
         1604  +shmpage_out:
         1605  +  if( pShmNode->nRegion>iRegion ){
         1606  +    char *p = (char *)pShmNode->aRegion[iRegion].pMap;
         1607  +    *pp = (void *)&p[iRegion*szRegion];
         1608  +  }else{
         1609  +    *pp = 0;
         1610  +  }
  1651   1611     sqlite3_mutex_leave(pShmNode->mutex);
  1652   1612     return rc;
  1653   1613   }
  1654   1614   
  1655         -/*
  1656         -** Release the lock held on the shared memory segment so that other
  1657         -** threads are free to resize it if necessary.
  1658         -**
  1659         -** If the lock is not currently held, this routine is a harmless no-op.
  1660         -**
  1661         -** If the shared-memory object is in lock state RECOVER, then we do not
  1662         -** really want to release the lock, so in that case too, this routine
  1663         -** is a no-op.
  1664         -*/
  1665         -static int winShmRelease(sqlite3_file *fd){
  1666         -  winFile *pDbFd = (winFile*)fd;
  1667         -  winShm *p = pDbFd->pShm;
  1668         -  if( p->hasMutexBuf ){
  1669         -    winShmNode *pShmNode = p->pShmNode;
  1670         -    assert( sqlite3_mutex_notheld(pShmNode->mutex) );
  1671         -    sqlite3_mutex_leave(pShmNode->mutexBuf);
  1672         -    p->hasMutexBuf = 0;
  1673         -  }
  1674         -  return SQLITE_OK;
  1675         -}
  1676         -
  1677   1615   /*
  1678   1616   ** Change the lock state for a shared-memory segment.
  1679   1617   */
  1680   1618   static int winShmLock(
  1681   1619     sqlite3_file *fd,          /* Database file holding the shared memory */
  1682   1620     int ofst,                  /* First lock to acquire or release */
  1683   1621     int n,                     /* Number of locks to acquire or release */
................................................................................
  1752   1690     winLock,
  1753   1691     winUnlock,
  1754   1692     winCheckReservedLock,
  1755   1693     winFileControl,
  1756   1694     winSectorSize,
  1757   1695     winDeviceCharacteristics,
  1758   1696     winShmOpen,              /* xShmOpen */
  1759         -  winShmSize,              /* xShmSize */
  1760         -  winShmGet,               /* xShmGet */
  1761         -  winShmRelease,           /* xShmRelease */
  1762   1697     winShmLock,              /* xShmLock */
  1763   1698     winShmBarrier,           /* xShmBarrier */
  1764         -  winShmClose              /* xShmClose */
         1699  +  winShmClose,             /* xShmClose */
         1700  +  winShmMap                /* xShmMap */
  1765   1701   };
  1766   1702   
  1767   1703   /***************************************************************************
  1768   1704   ** Here ends the I/O methods that form the sqlite3_io_methods object.
  1769   1705   **
  1770   1706   ** The next block of code implements the VFS methods.
  1771   1707   ****************************************************************************/

Changes to src/sqlite.h.in.

   656    656     int (*xUnlock)(sqlite3_file*, int);
   657    657     int (*xCheckReservedLock)(sqlite3_file*, int *pResOut);
   658    658     int (*xFileControl)(sqlite3_file*, int op, void *pArg);
   659    659     int (*xSectorSize)(sqlite3_file*);
   660    660     int (*xDeviceCharacteristics)(sqlite3_file*);
   661    661     /* Methods above are valid for version 1 */
   662    662     int (*xShmOpen)(sqlite3_file*);
   663         -  int (*xShmSize)(sqlite3_file*, int reqSize, int *pNewSize);
   664         -  int (*xShmGet)(sqlite3_file*, int reqSize, int *pSize, void volatile**);
   665         -  int (*xShmRelease)(sqlite3_file*);
   666    663     int (*xShmLock)(sqlite3_file*, int offset, int n, int flags);
   667    664     void (*xShmBarrier)(sqlite3_file*);
   668    665     int (*xShmClose)(sqlite3_file*, int deleteFlag);
          666  +  int (*xShmMap)(sqlite3_file*, int iPage, int pgsz, int, void volatile**);
   669    667     /* Methods above are valid for version 2 */
   670    668     /* Additional methods may be added in future releases */
   671    669   };
   672    670   
   673    671   /*
   674    672   ** CAPI3REF: Standard File Control Opcodes
   675    673   **

Changes to src/test6.c.

   522    522   
   523    523   /*
   524    524   ** Pass-throughs for WAL support.
   525    525   */
   526    526   static int cfShmOpen(sqlite3_file *pFile){
   527    527     return sqlite3OsShmOpen(((CrashFile*)pFile)->pRealFile);
   528    528   }
   529         -static int cfShmSize(sqlite3_file *pFile, int reqSize, int *pNew){
   530         -  return sqlite3OsShmSize(((CrashFile*)pFile)->pRealFile, reqSize, pNew);
   531         -}
   532         -static int cfShmGet(
   533         -  sqlite3_file *pFile,
   534         -  int reqSize,
   535         -  int *pSize,
   536         -  void volatile **pp
   537         -){
   538         -  return sqlite3OsShmGet(((CrashFile*)pFile)->pRealFile, reqSize, pSize, pp);
   539         -}
   540         -static int cfShmRelease(sqlite3_file *pFile){
   541         -  return sqlite3OsShmRelease(((CrashFile*)pFile)->pRealFile);
   542         -}
   543    529   static int cfShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   544    530     return sqlite3OsShmLock(((CrashFile*)pFile)->pRealFile, ofst, n, flags);
   545    531   }
   546    532   static void cfShmBarrier(sqlite3_file *pFile){
   547    533     sqlite3OsShmBarrier(((CrashFile*)pFile)->pRealFile);
   548    534   }
   549    535   static int cfShmClose(sqlite3_file *pFile, int delFlag){
   550    536     return sqlite3OsShmClose(((CrashFile*)pFile)->pRealFile, delFlag);
   551    537   }
   552         -
          538  +static int cfShmMap(
          539  +  sqlite3_file *pFile,            /* Handle open on database file */
          540  +  int iRegion,                    /* Region to retrieve */
          541  +  int sz,                         /* Size of regions */
          542  +  int w,                          /* True to extend file if necessary */
          543  +  void volatile **pp              /* OUT: Mapped memory */
          544  +){
          545  +  return sqlite3OsShmMap(((CrashFile*)pFile)->pRealFile, iRegion, sz, w, pp);
          546  +}
   553    547   
   554    548   static const sqlite3_io_methods CrashFileVtab = {
   555    549     2,                            /* iVersion */
   556    550     cfClose,                      /* xClose */
   557    551     cfRead,                       /* xRead */
   558    552     cfWrite,                      /* xWrite */
   559    553     cfTruncate,                   /* xTruncate */
................................................................................
   562    556     cfLock,                       /* xLock */
   563    557     cfUnlock,                     /* xUnlock */
   564    558     cfCheckReservedLock,          /* xCheckReservedLock */
   565    559     cfFileControl,                /* xFileControl */
   566    560     cfSectorSize,                 /* xSectorSize */
   567    561     cfDeviceCharacteristics,      /* xDeviceCharacteristics */
   568    562     cfShmOpen,                    /* xShmOpen */
   569         -  cfShmSize,                    /* xShmSize */
   570         -  cfShmGet,                     /* xShmGet */
   571         -  cfShmRelease,                 /* xShmRelease */
   572    563     cfShmLock,                    /* xShmLock */
   573    564     cfShmBarrier,                 /* xShmBarrier */
   574         -  cfShmClose                    /* xShmClose */
          565  +  cfShmClose,                   /* xShmClose */
          566  +  cfShmMap                      /* xShmMap */
   575    567   };
   576    568   
   577    569   /*
   578    570   ** Application data for the crash VFS
   579    571   */
   580    572   struct crashAppData {
   581    573     sqlite3_vfs *pOrig;                   /* Wrapped vfs structure */

Changes to src/test_devsym.c.

    47     47   static int devsymLock(sqlite3_file*, int);
    48     48   static int devsymUnlock(sqlite3_file*, int);
    49     49   static int devsymCheckReservedLock(sqlite3_file*, int *);
    50     50   static int devsymFileControl(sqlite3_file*, int op, void *pArg);
    51     51   static int devsymSectorSize(sqlite3_file*);
    52     52   static int devsymDeviceCharacteristics(sqlite3_file*);
    53     53   static int devsymShmOpen(sqlite3_file*);
    54         -static int devsymShmSize(sqlite3_file*,int,int*);
    55         -static int devsymShmGet(sqlite3_file*,int,int*,volatile void**);
    56         -static int devsymShmRelease(sqlite3_file*);
    57     54   static int devsymShmLock(sqlite3_file*,int,int,int);
    58     55   static void devsymShmBarrier(sqlite3_file*);
    59     56   static int devsymShmClose(sqlite3_file*,int);
           57  +static int devsymShmMap(sqlite3_file*,int,int,int, void volatile **);
    60     58   
    61     59   /*
    62     60   ** Method declarations for devsym_vfs.
    63     61   */
    64     62   static int devsymOpen(sqlite3_vfs*, const char *, sqlite3_file*, int , int *);
    65     63   static int devsymDelete(sqlite3_vfs*, const char *zName, int syncDir);
    66     64   static int devsymAccess(sqlite3_vfs*, const char *zName, int flags, int *);
................................................................................
   116    114     devsymLock,                       /* xLock */
   117    115     devsymUnlock,                     /* xUnlock */
   118    116     devsymCheckReservedLock,          /* xCheckReservedLock */
   119    117     devsymFileControl,                /* xFileControl */
   120    118     devsymSectorSize,                 /* xSectorSize */
   121    119     devsymDeviceCharacteristics,      /* xDeviceCharacteristics */
   122    120     devsymShmOpen,                    /* xShmOpen */
   123         -  devsymShmSize,                    /* xShmSize */
   124         -  devsymShmGet,                     /* xShmGet */
   125         -  devsymShmRelease,                 /* xShmRelease */
   126    121     devsymShmLock,                    /* xShmLock */
   127    122     devsymShmBarrier,                 /* xShmBarrier */
   128         -  devsymShmClose                    /* xShmClose */
          123  +  devsymShmClose,                   /* xShmClose */
          124  +  devsymShmMap                     /* xShmMap */
   129    125   };
   130    126   
   131    127   struct DevsymGlobal {
   132    128     sqlite3_vfs *pVfs;
   133    129     int iDeviceChar;
   134    130     int iSectorSize;
   135    131   };
................................................................................
   242    238   /*
   243    239   ** Shared-memory methods are all pass-thrus.
   244    240   */
   245    241   static int devsymShmOpen(sqlite3_file *pFile){
   246    242     devsym_file *p = (devsym_file *)pFile;
   247    243     return sqlite3OsShmOpen(p->pReal);
   248    244   }
   249         -static int devsymShmSize(sqlite3_file *pFile, int reqSize, int *pSize){
   250         -  devsym_file *p = (devsym_file *)pFile;
   251         -  return sqlite3OsShmSize(p->pReal, reqSize, pSize);
   252         -}
   253         -static int devsymShmGet(
   254         -  sqlite3_file *pFile,
   255         -  int reqSz,
   256         -  int *pSize,
   257         -  void volatile **pp
   258         -){
   259         -  devsym_file *p = (devsym_file *)pFile;
   260         -  return sqlite3OsShmGet(p->pReal, reqSz, pSize, pp);
   261         -}
   262         -static int devsymShmRelease(sqlite3_file *pFile){
   263         -  devsym_file *p = (devsym_file *)pFile;
   264         -  return sqlite3OsShmRelease(p->pReal);
   265         -}
   266    245   static int devsymShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   267    246     devsym_file *p = (devsym_file *)pFile;
   268    247     return sqlite3OsShmLock(p->pReal, ofst, n, flags);
   269    248   }
   270    249   static void devsymShmBarrier(sqlite3_file *pFile){
   271    250     devsym_file *p = (devsym_file *)pFile;
   272    251     sqlite3OsShmBarrier(p->pReal);
   273    252   }
   274    253   static int devsymShmClose(sqlite3_file *pFile, int delFlag){
   275    254     devsym_file *p = (devsym_file *)pFile;
   276    255     return sqlite3OsShmClose(p->pReal, delFlag);
   277    256   }
          257  +static int devsymShmMap(
          258  +  sqlite3_file *pFile, 
          259  +  int iRegion, 
          260  +  int szRegion, 
          261  +  int isWrite, 
          262  +  void volatile **pp
          263  +){
          264  +  devsym_file *p = (devsym_file *)pFile;
          265  +  return sqlite3OsShmMap(p->pReal, iRegion, szRegion, isWrite, pp);
          266  +}
   278    267   
   279    268   
   280    269   
   281    270   /*
   282    271   ** Open a devsym file handle.
   283    272   */
   284    273   static int devsymOpen(

Changes to src/test_osinst.c.

    96     96   #define OS_SLEEP             16
    97     97   #define OS_SYNC              17
    98     98   #define OS_TRUNCATE          18
    99     99   #define OS_UNLOCK            19
   100    100   #define OS_WRITE             20
   101    101   #define OS_SHMOPEN           21
   102    102   #define OS_SHMCLOSE          22
   103         -#define OS_SHMGET            23
   104         -#define OS_SHMRELEASE        24
          103  +#define OS_SHMMAP            23
   105    104   #define OS_SHMLOCK           25
   106    105   #define OS_SHMBARRIER        26
   107         -#define OS_SHMSIZE           27
   108    106   #define OS_ANNOTATE          28
   109    107   
   110    108   #define OS_NUMEVENTS         29
   111    109   
   112    110   #define VFSLOG_BUFFERSIZE 8192
   113    111   
   114    112   typedef struct VfslogVfs VfslogVfs;
................................................................................
   148    146   static int vfslogUnlock(sqlite3_file*, int);
   149    147   static int vfslogCheckReservedLock(sqlite3_file*, int *pResOut);
   150    148   static int vfslogFileControl(sqlite3_file*, int op, void *pArg);
   151    149   static int vfslogSectorSize(sqlite3_file*);
   152    150   static int vfslogDeviceCharacteristics(sqlite3_file*);
   153    151   
   154    152   static int vfslogShmOpen(sqlite3_file *pFile);
   155         -static int vfslogShmSize(sqlite3_file *pFile, int reqSize, int *pNewSize);
   156         -static int vfslogShmGet(sqlite3_file *pFile, int,int*,volatile void **);
   157         -static int vfslogShmRelease(sqlite3_file *pFile);
   158    153   static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags);
   159    154   static void vfslogShmBarrier(sqlite3_file*);
   160    155   static int vfslogShmClose(sqlite3_file *pFile, int deleteFlag);
          156  +static int vfslogShmMap(sqlite3_file *pFile,int,int,int,volatile void **);
   161    157   
   162    158   /*
   163    159   ** Method declarations for vfslog_vfs.
   164    160   */
   165    161   static int vfslogOpen(sqlite3_vfs*, const char *, sqlite3_file*, int , int *);
   166    162   static int vfslogDelete(sqlite3_vfs*, const char *zName, int syncDir);
   167    163   static int vfslogAccess(sqlite3_vfs*, const char *zName, int flags, int *);
................................................................................
   212    208     vfslogLock,                     /* xLock */
   213    209     vfslogUnlock,                   /* xUnlock */
   214    210     vfslogCheckReservedLock,        /* xCheckReservedLock */
   215    211     vfslogFileControl,              /* xFileControl */
   216    212     vfslogSectorSize,               /* xSectorSize */
   217    213     vfslogDeviceCharacteristics,    /* xDeviceCharacteristics */
   218    214     vfslogShmOpen,                  /* xShmOpen */
   219         -  vfslogShmSize,                  /* xShmSize */
   220         -  vfslogShmGet,                   /* xShmGet */
   221         -  vfslogShmRelease,               /* xShmRelease */
   222    215     vfslogShmLock,                  /* xShmLock */
   223    216     vfslogShmBarrier,               /* xShmBarrier */
   224         -  vfslogShmClose                  /* xShmClose */
          217  +  vfslogShmClose,                 /* xShmClose */
          218  +  vfslogShmMap                    /* xShmMap */
   225    219   };
   226    220   
   227    221   #if defined(SQLITE_OS_UNIX) && !defined(NO_GETTOD)
   228    222   #include <sys/time.h>
   229    223   static sqlite3_uint64 vfslog_time(){
   230    224     struct timeval sTime;
   231    225     gettimeofday(&sTime, 0);
................................................................................
   437    431     VfslogFile *p = (VfslogFile *)pFile;
   438    432     t = vfslog_time();
   439    433     rc = p->pReal->pMethods->xShmOpen(p->pReal);
   440    434     t = vfslog_time() - t;
   441    435     vfslog_call(p->pVfslog, OS_SHMOPEN, p->iFileId, t, rc, 0, 0);
   442    436     return rc;
   443    437   }
   444         -static int vfslogShmSize(sqlite3_file *pFile, int reqSize, int *pNewSize){
   445         -  int rc;
   446         -  sqlite3_uint64 t;
   447         -  VfslogFile *p = (VfslogFile *)pFile;
   448         -  t = vfslog_time();
   449         -  rc = p->pReal->pMethods->xShmSize(p->pReal, reqSize, pNewSize);
   450         -  t = vfslog_time() - t;
   451         -  vfslog_call(p->pVfslog, OS_SHMSIZE, p->iFileId, t, rc, 0, 0);
   452         -  return rc;
   453         -}
   454         -static int vfslogShmGet(
   455         -  sqlite3_file *pFile,
   456         -  int req,
   457         -  int *pSize,
   458         -  volatile void **pp
   459         -){
   460         -  int rc;
   461         -  sqlite3_uint64 t;
   462         -  VfslogFile *p = (VfslogFile *)pFile;
   463         -  t = vfslog_time();
   464         -  rc = p->pReal->pMethods->xShmGet(p->pReal, req, pSize, pp);
   465         -  t = vfslog_time() - t;
   466         -  vfslog_call(p->pVfslog, OS_SHMGET, p->iFileId, t, rc, 0, 0);
   467         -  return rc;
   468         -}
   469         -static int vfslogShmRelease(sqlite3_file *pFile){
   470         -  int rc;
   471         -  sqlite3_uint64 t;
   472         -  VfslogFile *p = (VfslogFile *)pFile;
   473         -  t = vfslog_time();
   474         -  rc = p->pReal->pMethods->xShmRelease(p->pReal);
   475         -  t = vfslog_time() - t;
   476         -  vfslog_call(p->pVfslog, OS_SHMRELEASE, p->iFileId, t, rc, 0, 0);
   477         -  return rc;
   478         -}
   479    438   static int vfslogShmLock(sqlite3_file *pFile, int ofst, int n, int flags){
   480    439     int rc;
   481    440     sqlite3_uint64 t;
   482    441     VfslogFile *p = (VfslogFile *)pFile;
   483    442     t = vfslog_time();
   484    443     rc = p->pReal->pMethods->xShmLock(p->pReal, ofst, n, flags);
   485    444     t = vfslog_time() - t;
................................................................................
   499    458     sqlite3_uint64 t;
   500    459     VfslogFile *p = (VfslogFile *)pFile;
   501    460     t = vfslog_time();
   502    461     rc = p->pReal->pMethods->xShmClose(p->pReal, deleteFlag);
   503    462     t = vfslog_time() - t;
   504    463     vfslog_call(p->pVfslog, OS_SHMCLOSE, p->iFileId, t, rc, 0, 0);
   505    464     return rc;
          465  +}
          466  +static int vfslogShmMap(
          467  +  sqlite3_file *pFile, 
          468  +  int iRegion, 
          469  +  int szRegion, 
          470  +  int isWrite, 
          471  +  volatile void **pp
          472  +){
          473  +  int rc;
          474  +  sqlite3_uint64 t;
          475  +  VfslogFile *p = (VfslogFile *)pFile;
          476  +  t = vfslog_time();
          477  +  rc = p->pReal->pMethods->xShmMap(p->pReal, iRegion, szRegion, isWrite, pp);
          478  +  t = vfslog_time() - t;
          479  +  vfslog_call(p->pVfslog, OS_SHMMAP, p->iFileId, t, rc, 0, 0);
          480  +  return rc;
   506    481   }
   507    482   
   508    483   
   509    484   /*
   510    485   ** Open a vfslog file handle.
   511    486   */
   512    487   static int vfslogOpen(
................................................................................
   822    797       case OS_FULLPATHNAME:      zEvent = "xFullPathname"; break;
   823    798       case OS_RANDOMNESS:        zEvent = "xRandomness"; break;
   824    799       case OS_SLEEP:             zEvent = "xSleep"; break;
   825    800       case OS_CURRENTTIME:       zEvent = "xCurrentTime"; break;
   826    801   
   827    802       case OS_SHMCLOSE:          zEvent = "xShmClose"; break;
   828    803       case OS_SHMOPEN:           zEvent = "xShmOpen"; break;
   829         -    case OS_SHMGET:            zEvent = "xShmGet"; break;
   830         -    case OS_SHMSIZE:           zEvent = "xShmSize"; break;
   831         -    case OS_SHMRELEASE:        zEvent = "xShmRelease"; break;
   832    804       case OS_SHMLOCK:           zEvent = "xShmLock"; break;
   833    805       case OS_SHMBARRIER:        zEvent = "xShmBarrier"; break;
          806  +    case OS_SHMMAP:            zEvent = "xShmMap"; break;
   834    807   
   835    808       case OS_ANNOTATE:          zEvent = "annotation"; break;
   836    809     }
   837    810   
   838    811     return zEvent;
   839    812   }
   840    813   

Changes to src/test_vfs.c.

    65     65   ** If a bit is clear in Testvfs.mask, then calls made by SQLite to the 
    66     66   ** corresponding VFS method are ignored for purposes of:
    67     67   **
    68     68   **   + Simulating IO errors, and
    69     69   **   + Invoking the Tcl callback script.
    70     70   */
    71     71   #define TESTVFS_SHMOPEN_MASK    0x00000001
    72         -#define TESTVFS_SHMSIZE_MASK    0x00000002
    73         -#define TESTVFS_SHMGET_MASK     0x00000004
    74         -#define TESTVFS_SHMRELEASE_MASK 0x00000008
    75     72   #define TESTVFS_SHMLOCK_MASK    0x00000010
    76     73   #define TESTVFS_SHMBARRIER_MASK 0x00000020
    77     74   #define TESTVFS_SHMCLOSE_MASK   0x00000040
           75  +#define TESTVFS_SHMPAGE_MASK    0x00000080
    78     76   
    79         -#define TESTVFS_OPEN_MASK       0x00000080
    80         -#define TESTVFS_SYNC_MASK       0x00000100
    81         -#define TESTVFS_ALL_MASK        0x000001FF
           77  +#define TESTVFS_OPEN_MASK       0x00000100
           78  +#define TESTVFS_SYNC_MASK       0x00000200
           79  +#define TESTVFS_ALL_MASK        0x000003FF
           80  +
           81  +
           82  +#define TESTVFS_MAX_PAGES 256
    82     83   
    83     84   /*
    84     85   ** A shared-memory buffer. There is one of these objects for each shared
    85     86   ** memory region opened by clients. If two clients open the same file,
    86     87   ** there are two TestvfsFile structures but only one TestvfsBuffer structure.
    87     88   */
    88     89   struct TestvfsBuffer {
    89     90     char *zFile;                    /* Associated file name */
    90         -  int n;                          /* Size of allocated buffer in bytes */
    91         -  u8 *a;                          /* Buffer allocated using ckalloc() */
           91  +  int pgsz;                       /* Page size */
           92  +  u8 *aPage[TESTVFS_MAX_PAGES];   /* Array of ckalloc'd pages */
    92     93     TestvfsFile *pFile;             /* List of open handles */
    93     94     TestvfsBuffer *pNext;           /* Next in linked list of all buffers */
    94     95   };
    95     96   
    96     97   
    97     98   #define PARENTVFS(x) (((Testvfs *)((x)->pAppData))->pParent)
    98     99   
................................................................................
   129    130   static void tvfsDlClose(sqlite3_vfs*, void*);
   130    131   #endif /* SQLITE_OMIT_LOAD_EXTENSION */
   131    132   static int tvfsRandomness(sqlite3_vfs*, int nByte, char *zOut);
   132    133   static int tvfsSleep(sqlite3_vfs*, int microseconds);
   133    134   static int tvfsCurrentTime(sqlite3_vfs*, double*);
   134    135   
   135    136   static int tvfsShmOpen(sqlite3_file*);
   136         -static int tvfsShmSize(sqlite3_file*, int , int *);
   137         -static int tvfsShmGet(sqlite3_file*, int , int *, volatile void **);
   138         -static int tvfsShmRelease(sqlite3_file*);
   139    137   static int tvfsShmLock(sqlite3_file*, int , int, int);
   140    138   static void tvfsShmBarrier(sqlite3_file*);
   141    139   static int tvfsShmClose(sqlite3_file*, int);
          140  +static int tvfsShmPage(sqlite3_file*,int,int,int, void volatile **);
   142    141   
   143    142   static sqlite3_io_methods tvfs_io_methods = {
   144    143     2,                            /* iVersion */
   145    144     tvfsClose,                      /* xClose */
   146    145     tvfsRead,                       /* xRead */
   147    146     tvfsWrite,                      /* xWrite */
   148    147     tvfsTruncate,                   /* xTruncate */
................................................................................
   151    150     tvfsLock,                       /* xLock */
   152    151     tvfsUnlock,                     /* xUnlock */
   153    152     tvfsCheckReservedLock,          /* xCheckReservedLock */
   154    153     tvfsFileControl,                /* xFileControl */
   155    154     tvfsSectorSize,                 /* xSectorSize */
   156    155     tvfsDeviceCharacteristics,      /* xDeviceCharacteristics */
   157    156     tvfsShmOpen,                    /* xShmOpen */
   158         -  tvfsShmSize,                    /* xShmSize */
   159         -  tvfsShmGet,                     /* xShmGet */
   160         -  tvfsShmRelease,                 /* xShmRelease */
   161    157     tvfsShmLock,                    /* xShmLock */
   162    158     tvfsShmBarrier,                 /* xShmBarrier */
   163         -  tvfsShmClose                    /* xShmClose */
          159  +  tvfsShmClose,                   /* xShmClose */
          160  +  tvfsShmPage                     /* xShmPage */
   164    161   };
   165    162   
   166    163   static int tvfsResultCode(Testvfs *p, int *pRc){
   167    164     struct errcode {
   168    165       int eCode;
   169    166       const char *zCode;
   170    167     } aCode[] = {
................................................................................
   439    436     rc = sqlite3OsOpen(PARENTVFS(pVfs), zName, pFd->pReal, flags, pOutFlags);
   440    437     if( pFd->pReal->pMethods ){
   441    438       sqlite3_io_methods *pMethods;
   442    439       pMethods = (sqlite3_io_methods *)ckalloc(sizeof(sqlite3_io_methods));
   443    440       memcpy(pMethods, &tvfs_io_methods, sizeof(sqlite3_io_methods));
   444    441       if( ((Testvfs *)pVfs->pAppData)->isNoshm ){
   445    442         pMethods->xShmOpen = 0;
   446         -      pMethods->xShmGet = 0;
   447         -      pMethods->xShmSize = 0;
   448         -      pMethods->xShmRelease = 0;
   449    443         pMethods->xShmClose = 0;
   450    444         pMethods->xShmLock = 0;
   451    445         pMethods->xShmBarrier = 0;
          446  +      pMethods->xShmMap = 0;
   452    447       }
   453    448       pFile->pMethods = pMethods;
   454    449     }
   455    450   
   456    451     return rc;
   457    452   }
   458    453   
................................................................................
   543    538   /*
   544    539   ** Return the current time as a Julian Day number in *pTimeOut.
   545    540   */
   546    541   static int tvfsCurrentTime(sqlite3_vfs *pVfs, double *pTimeOut){
   547    542     return PARENTVFS(pVfs)->xCurrentTime(PARENTVFS(pVfs), pTimeOut);
   548    543   }
   549    544   
   550         -static void tvfsGrowBuffer(TestvfsFile *pFd, int reqSize, int *pNewSize){
   551         -  TestvfsBuffer *pBuffer = pFd->pShm;
   552         -  if( reqSize>pBuffer->n ){
   553         -    pBuffer->a = (u8 *)ckrealloc((char *)pBuffer->a, reqSize);
   554         -    memset(&pBuffer->a[pBuffer->n], 0x55, reqSize-pBuffer->n);
   555         -    pBuffer->n = reqSize;
   556         -  }
   557         -  *pNewSize = pBuffer->n;
   558         -}
   559         -
   560    545   static int tvfsInjectIoerr(Testvfs *p){
   561    546     int ret = 0;
   562    547     if( p->ioerr ){
   563    548       p->iIoerrCnt--;
   564    549       if( p->iIoerrCnt==0 || (p->iIoerrCnt<0 && p->ioerr==2) ){
   565    550         ret = 1;
   566    551         p->nIoerrFail++;
................................................................................
   615    600     /* Connect the TestvfsBuffer to the new TestvfsShm handle and return. */
   616    601     pFd->pNext = pBuffer->pFile;
   617    602     pBuffer->pFile = pFd;
   618    603     pFd->pShm = pBuffer;
   619    604     return SQLITE_OK;
   620    605   }
   621    606   
   622         -static int tvfsShmSize(
   623         -  sqlite3_file *pFile,
   624         -  int reqSize,
   625         -  int *pNewSize
   626         -){
   627         -  int rc = SQLITE_OK;
   628         -  TestvfsFile *pFd = (TestvfsFile *)pFile;
   629         -  Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData);
   630         -
   631         -  if( p->pScript && p->mask&TESTVFS_SHMSIZE_MASK ){
   632         -    tvfsExecTcl(p, "xShmSize", 
   633         -        Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId, 0
   634         -    );
   635         -    tvfsResultCode(p, &rc);
          607  +static void tvfsAllocPage(TestvfsBuffer *p, int iPage, int pgsz){
          608  +  assert( iPage<TESTVFS_MAX_PAGES );
          609  +  if( p->aPage[iPage]==0 ){
          610  +    p->aPage[iPage] = (u8 *)ckalloc(pgsz);
          611  +    memset(p->aPage[iPage], 0, pgsz);
          612  +    p->pgsz = pgsz;
   636    613     }
   637         -  if( rc==SQLITE_OK && p->mask&TESTVFS_SHMSIZE_MASK && tvfsInjectIoerr(p) ){
   638         -    rc = SQLITE_IOERR;
   639         -  }
   640         -  if( rc==SQLITE_OK ){
   641         -    tvfsGrowBuffer(pFd, reqSize, pNewSize);
   642         -  }
   643         -  return rc;
   644    614   }
   645    615   
   646         -static int tvfsShmGet(
   647         -  sqlite3_file *pFile, 
   648         -  int reqMapSize, 
   649         -  int *pMapSize, 
   650         -  volatile void **pp
          616  +static int tvfsShmPage(
          617  +  sqlite3_file *pFile,            /* Handle open on database file */
          618  +  int iPage,                      /* Page to retrieve */
          619  +  int pgsz,                       /* Size of pages */
          620  +  int isWrite,                    /* True to extend file if necessary */
          621  +  void volatile **pp              /* OUT: Mapped memory */
   651    622   ){
   652    623     int rc = SQLITE_OK;
   653    624     TestvfsFile *pFd = (TestvfsFile *)pFile;
   654    625     Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData);
   655    626   
   656         -  if( p->pScript && p->mask&TESTVFS_SHMGET_MASK ){
   657         -    tvfsExecTcl(p, "xShmGet", 
   658         -        Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId, 
   659         -        Tcl_NewIntObj(reqMapSize)
          627  +  if( p->pScript && p->mask&TESTVFS_SHMPAGE_MASK ){
          628  +    Tcl_Obj *pArg = Tcl_NewObj();
          629  +    Tcl_IncrRefCount(pArg);
          630  +    Tcl_ListObjAppendElement(p->interp, pArg, Tcl_NewIntObj(iPage));
          631  +    Tcl_ListObjAppendElement(p->interp, pArg, Tcl_NewIntObj(pgsz));
          632  +    Tcl_ListObjAppendElement(p->interp, pArg, Tcl_NewIntObj(isWrite));
          633  +    tvfsExecTcl(p, "xShmPage", 
          634  +        Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId, pArg
   660    635       );
   661    636       tvfsResultCode(p, &rc);
          637  +    Tcl_DecrRefCount(pArg);
   662    638     }
   663         -  if( rc==SQLITE_OK && p->mask&TESTVFS_SHMGET_MASK && tvfsInjectIoerr(p) ){
          639  +  if( rc==SQLITE_OK && p->mask&TESTVFS_SHMPAGE_MASK && tvfsInjectIoerr(p) ){
   664    640       rc = SQLITE_IOERR;
   665    641     }
   666    642   
   667         -  *pMapSize = pFd->pShm->n;
   668         -  *pp = pFd->pShm->a;
          643  +  if( rc==SQLITE_OK && isWrite && !pFd->pShm->aPage[iPage] ){
          644  +    tvfsAllocPage(pFd->pShm, iPage, pgsz);
          645  +  }
          646  +  *pp = (void volatile *)pFd->pShm->aPage[iPage];
          647  +
   669    648     return rc;
   670    649   }
   671    650   
   672         -static int tvfsShmRelease(sqlite3_file *pFile){
   673         -  int rc = SQLITE_OK;
   674         -  TestvfsFile *pFd = (TestvfsFile *)pFile;
   675         -  Testvfs *p = (Testvfs *)(pFd->pVfs->pAppData);
   676         -
   677         -  if( p->pScript && p->mask&TESTVFS_SHMRELEASE_MASK ){
   678         -    tvfsExecTcl(p, "xShmRelease", 
   679         -        Tcl_NewStringObj(pFd->pShm->zFile, -1), pFd->pShmId, 0
   680         -    );
   681         -    tvfsResultCode(p, &rc);
   682         -  }
   683         -
   684         -  return rc;
   685         -}
   686    651   
   687    652   static int tvfsShmLock(
   688    653     sqlite3_file *pFile,
   689    654     int ofst,
   690    655     int n,
   691    656     int flags
   692    657   ){
................................................................................
   778    743     }
   779    744   
   780    745     for(ppFd=&pBuffer->pFile; *ppFd!=pFd; ppFd=&((*ppFd)->pNext));
   781    746     assert( (*ppFd)==pFd );
   782    747     *ppFd = pFd->pNext;
   783    748   
   784    749     if( pBuffer->pFile==0 ){
          750  +    int i;
   785    751       TestvfsBuffer **pp;
   786    752       for(pp=&p->pBuffer; *pp!=pBuffer; pp=&((*pp)->pNext));
   787    753       *pp = (*pp)->pNext;
   788         -    ckfree((char *)pBuffer->a);
          754  +    for(i=0; pBuffer->aPage[i]; i++){
          755  +      ckfree((char *)pBuffer->aPage[i]);
          756  +    }
   789    757       ckfree((char *)pBuffer);
   790    758     }
   791    759     pFd->pShm = 0;
   792    760   
   793    761     return rc;
   794    762   }
   795    763   
................................................................................
   817    785     if( Tcl_GetIndexFromObj(interp, objv[1], CMD_strs, "subcommand", 0, &i) ){
   818    786       return TCL_ERROR;
   819    787     }
   820    788     Tcl_ResetResult(interp);
   821    789   
   822    790     switch( (enum DB_enum)i ){
   823    791       case CMD_SHM: {
          792  +      Tcl_Obj *pObj;
          793  +      int i;
   824    794         TestvfsBuffer *pBuffer;
   825    795         char *zName;
   826    796         if( objc!=3 && objc!=4 ){
   827    797           Tcl_WrongNumArgs(interp, 2, objv, "FILE ?VALUE?");
   828    798           return TCL_ERROR;
   829    799         }
   830         -      zName = Tcl_GetString(objv[2]);
          800  +      zName = ckalloc(p->pParent->mxPathname);
          801  +      p->pParent->xFullPathname(
          802  +          p->pParent, Tcl_GetString(objv[2]), 
          803  +          p->pParent->mxPathname, zName
          804  +      );
   831    805         for(pBuffer=p->pBuffer; pBuffer; pBuffer=pBuffer->pNext){
   832    806           if( 0==strcmp(pBuffer->zFile, zName) ) break;
   833    807         }
          808  +      ckfree(zName);
   834    809         if( !pBuffer ){
   835         -        Tcl_AppendResult(interp, "no such file: ", zName, 0);
          810  +        Tcl_AppendResult(interp, "no such file: ", Tcl_GetString(objv[2]), 0);
   836    811           return TCL_ERROR;
   837    812         }
   838    813         if( objc==4 ){
   839    814           int n;
   840    815           u8 *a = Tcl_GetByteArrayFromObj(objv[3], &n);
   841         -        pBuffer->a = (u8 *)ckrealloc((char *)pBuffer->a, n);
   842         -        pBuffer->n = n;
   843         -        memcpy(pBuffer->a, a, n);
          816  +        assert( pBuffer->pgsz==0 || pBuffer->pgsz==32768 );
          817  +        for(i=0; i*32768<n; i++){
          818  +          int nByte = 32768;
          819  +          tvfsAllocPage(pBuffer, i, 32768);
          820  +          if( n-i*32768<32768 ){
          821  +            nByte = n;
          822  +          }
          823  +          memcpy(pBuffer->aPage[i], &a[i*32768], nByte);
          824  +        }
          825  +      }
          826  +
          827  +      pObj = Tcl_NewObj();
          828  +      for(i=0; pBuffer->aPage[i]; i++){
          829  +        Tcl_AppendObjToObj(pObj, Tcl_NewByteArrayObj(pBuffer->aPage[i], 32768));
   844    830         }
   845         -      Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(pBuffer->a, pBuffer->n));
          831  +      Tcl_SetObjResult(interp, pObj);
   846    832         break;
   847    833       }
   848    834   
   849    835       case CMD_FILTER: {
   850    836         static struct VfsMethod {
   851    837           char *zName;
   852    838           int mask;
   853    839         } vfsmethod [] = {
   854    840           { "xShmOpen",    TESTVFS_SHMOPEN_MASK },
   855         -        { "xShmSize",    TESTVFS_SHMSIZE_MASK },
   856         -        { "xShmGet",     TESTVFS_SHMGET_MASK },
   857         -        { "xShmRelease", TESTVFS_SHMRELEASE_MASK },
   858    841           { "xShmLock",    TESTVFS_SHMLOCK_MASK },
   859    842           { "xShmBarrier", TESTVFS_SHMBARRIER_MASK },
   860    843           { "xShmClose",   TESTVFS_SHMCLOSE_MASK },
          844  +        { "xShmPage",    TESTVFS_SHMPAGE_MASK },
   861    845           { "xSync",       TESTVFS_SYNC_MASK },
   862    846           { "xOpen",       TESTVFS_OPEN_MASK },
   863    847         };
   864    848         Tcl_Obj **apElem = 0;
   865    849         int nElem = 0;
   866    850         int i;
   867    851         int mask = 0;
................................................................................
   895    879         if( objc==3 ){
   896    880           int nByte;
   897    881           if( p->pScript ){
   898    882             Tcl_DecrRefCount(p->pScript);
   899    883             ckfree((char *)p->apScript);
   900    884             p->apScript = 0;
   901    885             p->nScript = 0;
          886  +          p->pScript = 0;
   902    887           }
   903    888           Tcl_GetStringFromObj(objv[2], &nByte);
   904    889           if( nByte>0 ){
   905    890             p->pScript = Tcl_DuplicateObj(objv[2]);
   906    891             Tcl_IncrRefCount(p->pScript);
   907    892           }
   908    893         }else if( objc!=2 ){
................................................................................
  1067   1052     }
  1068   1053   
  1069   1054     zVfs = Tcl_GetString(objv[1]);
  1070   1055     nByte = sizeof(Testvfs) + strlen(zVfs)+1;
  1071   1056     p = (Testvfs *)ckalloc(nByte);
  1072   1057     memset(p, 0, nByte);
  1073   1058   
         1059  +  /* Create the new object command before querying SQLite for a default VFS
         1060  +  ** to use for 'real' IO operations. This is because creating the new VFS
         1061  +  ** may delete an existing [testvfs] VFS of the same name. If such a VFS
         1062  +  ** is currently the default, the new [testvfs] may end up calling the 
         1063  +  ** methods of a deleted object.
         1064  +  */
         1065  +  Tcl_CreateObjCommand(interp, zVfs, testvfs_obj_cmd, p, testvfs_obj_del);
  1074   1066     p->pParent = sqlite3_vfs_find(0);
  1075   1067     p->interp = interp;
  1076   1068   
  1077   1069     p->zName = (char *)&p[1];
  1078   1070     memcpy(p->zName, zVfs, strlen(zVfs)+1);
  1079   1071   
  1080   1072     pVfs = (sqlite3_vfs *)ckalloc(sizeof(sqlite3_vfs));
................................................................................
  1083   1075     pVfs->zName = p->zName;
  1084   1076     pVfs->mxPathname = p->pParent->mxPathname;
  1085   1077     pVfs->szOsFile += p->pParent->szOsFile;
  1086   1078     p->pVfs = pVfs;
  1087   1079     p->isNoshm = isNoshm;
  1088   1080     p->mask = TESTVFS_ALL_MASK;
  1089   1081   
  1090         -  Tcl_CreateObjCommand(interp, zVfs, testvfs_obj_cmd, p, testvfs_obj_del);
  1091   1082     sqlite3_vfs_register(pVfs, isDefault);
  1092   1083   
  1093   1084     return TCL_OK;
  1094   1085   
  1095   1086    bad_args:
  1096   1087     Tcl_WrongNumArgs(interp, 1, objv, "VFSNAME ?-noshm BOOL? ?-default BOOL?");
  1097   1088     return TCL_ERROR;

Changes to src/wal.c.

   137    137   ** a page number P, return the index of the last frame for page P in the WAL,
   138    138   ** or return NULL if there are no frames for page P in the WAL.
   139    139   **
   140    140   ** The wal-index consists of a header region, followed by one or
   141    141   ** more index blocks.  
   142    142   **
   143    143   ** The wal-index header contains the total number of frames within the WAL
   144         -** in the the mxFrame field.  Each index block contains information on
   145         -** HASHTABLE_NPAGE frames.  Each index block contains two sections, a
   146         -** mapping which is a database page number for each frame, and a hash
   147         -** table used to look up frames by page number.  The mapping section is
   148         -** an array of HASHTABLE_NPAGE 32-bit page numbers.  The first entry on the
   149         -** array is the page number for the first frame; the second entry is the
   150         -** page number for the second frame; and so forth.  The last index block
   151         -** holds a total of (mxFrame%HASHTABLE_NPAGE) page numbers.  All index
   152         -** blocks other than the last are completely full with HASHTABLE_NPAGE
   153         -** page numbers.  All index blocks are the same size; the mapping section
   154         -** of the last index block merely contains unused entries if mxFrame is
   155         -** not an even multiple of HASHTABLE_NPAGE.
          144  +** in the mxFrame field.  
          145  +**
          146  +** Each index block except for the first contains information on 
          147  +** HASHTABLE_NPAGE frames. The first index block contains information on
          148  +** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and 
          149  +** HASHTABLE_NPAGE are selected so that together the wal-index header and
          150  +** first index block are the same size as all other index blocks in the
          151  +** wal-index.
          152  +**
          153  +** Each index block contains two sections, a page-mapping that contains the
          154  +** database page number associated with each wal frame, and a hash-table 
          155  +** that allows users to query an index block for a specific page number.
          156  +** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE
          157  +** for the first index block) 32-bit page numbers. The first entry in the 
          158  +** first index-block contains the database page number corresponding to the
          159  +** first frame in the WAL file. The first entry in the second index block
          160  +** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in
          161  +** the log, and so on.
          162  +**
          163  +** The last index block in a wal-index usually contains fewer than the full
          164  +** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,
          165  +** depending on the contents of the WAL file. This does not change the
          166  +** allocated size of the page-mapping array - the page-mapping array merely
          167  +** contains unused entries.
   156    168   **
   157    169   ** Even without using the hash table, the last frame for page P
   158         -** can be found by scanning the mapping sections of each index block
          170  +** can be found by scanning the page-mapping sections of each index block
   159    171   ** starting with the last index block and moving toward the first, and
   160    172   ** within each index block, starting at the end and moving toward the
   161    173   ** beginning.  The first entry that equals P corresponds to the frame
   162    174   ** holding the content for that page.
   163    175   **
   164    176   ** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.
   165    177   ** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the
................................................................................
   366    378   ** following object.
   367    379   */
   368    380   struct Wal {
   369    381     sqlite3_vfs *pVfs;         /* The VFS used to create pDbFd */
   370    382     sqlite3_file *pDbFd;       /* File handle for the database file */
   371    383     sqlite3_file *pWalFd;      /* File handle for WAL file */
   372    384     u32 iCallback;             /* Value to pass to log callback (or 0) */
   373         -  int szWIndex;              /* Size of the wal-index that is mapped in mem */
   374         -  volatile u32 *pWiData;     /* Pointer to wal-index content in memory */
          385  +  int nWiData;               /* Size of array apWiData */
          386  +  volatile u32 **apWiData;   /* Pointer to wal-index content in memory */
   375    387     u16 szPage;                /* Database page size */
   376    388     i16 readLock;              /* Which read lock is being held.  -1 for none */
   377    389     u8 exclusiveMode;          /* Non-zero if connection is in exclusive mode */
   378    390     u8 isWIndexOpen;           /* True if ShmOpen() called on pDbFd */
   379    391     u8 writeLock;              /* True if in a write transaction */
   380    392     u8 ckptLock;               /* True if holding a checkpoint lock */
   381    393     WalIndexHdr hdr;           /* Wal-index header for current transaction */
................................................................................
   383    395     u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
   384    396   #ifdef SQLITE_DEBUG
   385    397     u8 lockError;              /* True if a locking error has occurred */
   386    398   #endif
   387    399   };
   388    400   
   389    401   /*
   390         -** Return a pointer to the WalCkptInfo structure in the wal-index.
          402  +** Each page of the wal-index mapping contains a hash-table made up of
          403  +** an array of HASHTABLE_NSLOT elements of the following type.
   391    404   */
   392         -static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
   393         -  assert( pWal->pWiData!=0 );
   394         -  return (volatile WalCkptInfo*)&pWal->pWiData[sizeof(WalIndexHdr)/2];
   395         -}
   396         -
          405  +typedef u16 ht_slot;
   397    406   
   398    407   /*
   399    408   ** This structure is used to implement an iterator that loops through
   400    409   ** all frames in the WAL in database page order. Where two or more frames
   401    410   ** correspond to the same database page, the iterator visits only the 
   402    411   ** frame most recently written to the WAL (in other words, the frame with
   403    412   ** the largest index).
................................................................................
   407    416   **   walIteratorInit() - Create a new iterator,
   408    417   **   walIteratorNext() - Step an iterator,
   409    418   **   walIteratorFree() - Free an iterator.
   410    419   **
   411    420   ** This functionality is used by the checkpoint code (see walCheckpoint()).
   412    421   */
   413    422   struct WalIterator {
   414         -  int iPrior;           /* Last result returned from the iterator */
   415         -  int nSegment;         /* Size of the aSegment[] array */
   416         -  int nFinal;           /* Elements in aSegment[nSegment-1]  */
          423  +  int iPrior;                     /* Last result returned from the iterator */
          424  +  int nSegment;                   /* Size of the aSegment[] array */
   417    425     struct WalSegment {
   418         -    int iNext;              /* Next slot in aIndex[] not previously returned */
   419         -    u8 *aIndex;             /* i0, i1, i2... such that aPgno[iN] ascending */
   420         -    u32 *aPgno;             /* 256 page numbers.  Pointer to Wal.pWiData */
   421         -  } aSegment[1];        /* One for every 256 entries in the WAL */
          426  +    int iNext;                    /* Next slot in aIndex[] not yet returned */
          427  +    ht_slot *aIndex;              /* i0, i1, i2... such that aPgno[iN] ascend */
          428  +    u32 *aPgno;                   /* Array of page numbers. */
          429  +    int nEntry;                   /* Max size of aPgno[] and aIndex[] arrays */
          430  +    int iZero;                    /* Frame number associated with aPgno[0] */
          431  +  } aSegment[1];                  /* One for every 32KB page in the WAL */
   422    432   };
          433  +
          434  +/*
          435  +** Define the parameters of the hash tables in the wal-index file. There
          436  +** is a hash-table following every HASHTABLE_NPAGE page numbers in the
          437  +** wal-index.
          438  +**
          439  +** Changing any of these constants will alter the wal-index format and
          440  +** create incompatibilities.
          441  +*/
          442  +#define HASHTABLE_NPAGE      4096                 /* Must be power of 2 */
          443  +#define HASHTABLE_HASH_1     383                  /* Should be prime */
          444  +#define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
          445  +
          446  +/* 
          447  +** The block of page numbers associated with the first hash-table in a
          448  +** wal-index is smaller than usual. This is so that there is a complete
          449  +** hash-table on each aligned 32KB page of the wal-index.
          450  +*/
          451  +#define HASHTABLE_NPAGE_ONE  (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))
          452  +
          453  +/* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
          454  +#define WALINDEX_PGSZ   (                                         \
          455  +    sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
          456  +)
          457  +
          458  +/*
          459  +** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
          460  +** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
          461  +** numbered from zero.
          462  +**
          463  +** If this call is successful, *ppPage is set to point to the wal-index
          464  +** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,
          465  +** then an SQLite error code is returned and *ppPage is set to 0.
          466  +*/
          467  +static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
          468  +  int rc = SQLITE_OK;
          469  +
          470  +  /* Enlarge the pWal->apWiData[] array if required */
          471  +  if( pWal->nWiData<=iPage ){
          472  +    int nByte = sizeof(u32 *)*(iPage+1);
          473  +    volatile u32 **apNew;
          474  +    apNew = (volatile u32 **)sqlite3_realloc(pWal->apWiData, nByte);
          475  +    if( !apNew ){
          476  +      *ppPage = 0;
          477  +      return SQLITE_NOMEM;
          478  +    }
          479  +    memset(&apNew[pWal->nWiData], 0, sizeof(u32 *)*(iPage+1-pWal->nWiData));
          480  +    pWal->apWiData = apNew;
          481  +    pWal->nWiData = iPage+1;
          482  +  }
          483  +
          484  +  /* Request a pointer to the required page from the VFS */
          485  +  if( pWal->apWiData[iPage]==0 ){
          486  +    rc = sqlite3OsShmMap(pWal->pDbFd, iPage, WALINDEX_PGSZ, 
          487  +        pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]
          488  +    );
          489  +  }
          490  +
          491  +  *ppPage = pWal->apWiData[iPage];
          492  +  assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
          493  +  return rc;
          494  +}
          495  +
          496  +/*
          497  +** Return a pointer to the WalCkptInfo structure in the wal-index.
          498  +*/
          499  +static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
          500  +  assert( pWal->nWiData>0 && pWal->apWiData[0] );
          501  +  return (volatile WalCkptInfo*)&(pWal->apWiData[0][sizeof(WalIndexHdr)/2]);
          502  +}
          503  +
          504  +/*
          505  +** Return a pointer to the WalIndexHdr structure in the wal-index.
          506  +*/
          507  +static volatile WalIndexHdr *walIndexHdr(Wal *pWal){
          508  +  assert( pWal->nWiData>0 && pWal->apWiData[0] );
          509  +  return (volatile WalIndexHdr*)pWal->apWiData[0];
          510  +}
   423    511   
   424    512   /*
   425    513   ** The argument to this macro must be of type u32. On a little-endian
   426    514   ** architecture, it returns the u32 value that results from interpreting
   427    515   ** the 4 bytes as a big-endian value. On a big-endian architecture, it
   428    516   ** returns the value that would be produced by interpreting the 4 bytes
   429    517   ** of the input value as a little-endian integer.
................................................................................
   482    570   
   483    571   /*
   484    572   ** Write the header information in pWal->hdr into the wal-index.
   485    573   **
   486    574   ** The checksum on pWal->hdr is updated before it is written.
   487    575   */
   488    576   static void walIndexWriteHdr(Wal *pWal){
   489         -  WalIndexHdr *aHdr;
          577  +  volatile WalIndexHdr *aHdr = walIndexHdr(pWal);
          578  +  const int nCksum = offsetof(WalIndexHdr, aCksum);
   490    579   
   491    580     assert( pWal->writeLock );
   492    581     pWal->hdr.isInit = 1;
   493         -  walChecksumBytes(1, (u8*)&pWal->hdr, offsetof(WalIndexHdr, aCksum),
   494         -                   0, pWal->hdr.aCksum);
   495         -  aHdr = (WalIndexHdr*)pWal->pWiData;
   496         -  memcpy(&aHdr[1], &pWal->hdr, sizeof(WalIndexHdr));
          582  +  walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum);
          583  +  memcpy((void *)&aHdr[1], (void *)&pWal->hdr, sizeof(WalIndexHdr));
   497    584     sqlite3OsShmBarrier(pWal->pDbFd);
   498         -  memcpy(&aHdr[0], &pWal->hdr, sizeof(WalIndexHdr));
          585  +  memcpy((void *)&aHdr[0], (void *)&pWal->hdr, sizeof(WalIndexHdr));
   499    586   }
   500    587   
   501    588   /*
   502    589   ** This function encodes a single frame header and writes it to a buffer
   503    590   ** supplied by the caller. A frame-header is made up of a series of 
   504    591   ** 4-byte big-endian integers, as follows:
   505    592   **
................................................................................
   582    669     ** and the new database size.
   583    670     */
   584    671     *piPage = pgno;
   585    672     *pnTruncate = sqlite3Get4byte(&aFrame[4]);
   586    673     return 1;
   587    674   }
   588    675   
   589         -/*
   590         -** Define the parameters of the hash tables in the wal-index file. There
   591         -** is a hash-table following every HASHTABLE_NPAGE page numbers in the
   592         -** wal-index.
   593         -**
   594         -** Changing any of these constants will alter the wal-index format and
   595         -** create incompatibilities.
   596         -*/
   597         -#define HASHTABLE_NPAGE      4096  /* Must be power of 2 and multiple of 256 */
   598         -#define HASHTABLE_DATATYPE   u16
   599         -#define HASHTABLE_HASH_1     383                  /* Should be prime */
   600         -#define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
   601         -#define HASHTABLE_NBYTE      (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)
   602    676   
   603    677   #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
   604    678   /*
   605    679   ** Names of locks.  This routine is used to provide debugging output and is not
   606    680   ** a part of an ordinary build.
   607    681   */
   608    682   static const char *walLockName(int lockIdx){
................................................................................
   659    733     if( pWal->exclusiveMode ) return;
   660    734     (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
   661    735                            SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
   662    736     WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
   663    737                walLockName(lockIdx), n));
   664    738   }
   665    739   
   666         -/*
   667         -** Return the index in the Wal.pWiData array that corresponds to 
   668         -** frame iFrame.
   669         -**
   670         -** Wal.pWiData is an array of u32 elements that is the wal-index.
   671         -** The array begins with a header and is then followed by alternating
   672         -** "map" and "hash-table" blocks.  Each "map" block consists of
   673         -** HASHTABLE_NPAGE u32 elements which are page numbers corresponding
   674         -** to frames in the WAL file.  
   675         -**
   676         -** This routine returns an index X such that Wal.pWiData[X] is part
   677         -** of a "map" block that contains the page number of the iFrame-th
   678         -** frame in the WAL file.
   679         -*/
   680         -static int walIndexEntry(u32 iFrame){
   681         -  return (
   682         -      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
   683         -    + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32)
   684         -    + (iFrame-1)
   685         -  );
   686         -}
   687         -
   688         -/*
   689         -** Return the minimum size of the shared-memory, in bytes, that is needed
   690         -** to support a wal-index containing frame iFrame.  The value returned
   691         -** includes the wal-index header and the complete "block" containing iFrame,
   692         -** including the hash table segment that follows the block.
   693         -*/
   694         -static int walMappingSize(u32 iFrame){
   695         -  const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ;
   696         -  return ( WALINDEX_LOCK_OFFSET 
   697         -         + WALINDEX_LOCK_RESERVED 
   698         -         + nByte * ((iFrame + HASHTABLE_NPAGE - 1)/HASHTABLE_NPAGE)
   699         -  );
   700         -}
   701         -
   702         -/*
   703         -** Release our reference to the wal-index memory map, if we are holding
   704         -** it.
   705         -*/
   706         -static void walIndexUnmap(Wal *pWal){
   707         -  if( pWal->pWiData ){
   708         -    sqlite3OsShmRelease(pWal->pDbFd);
   709         -  }
   710         -  pWal->pWiData = 0;
   711         -  pWal->szWIndex = -1;
   712         -}
   713         -
   714         -/*
   715         -** Map the wal-index file into memory if it isn't already. 
   716         -**
   717         -** The reqSize parameter is the requested size of the mapping.  The
   718         -** mapping will be at least this big if the underlying storage is
   719         -** that big.  But the mapping will never grow larger than the underlying
   720         -** storage.  Use the walIndexRemap() to enlarget the storage space.
   721         -*/
   722         -static int walIndexMap(Wal *pWal, int reqSize){
   723         -  int rc = SQLITE_OK;
   724         -  if( pWal->pWiData==0 || reqSize>pWal->szWIndex ){
   725         -    walIndexUnmap(pWal);
   726         -    rc = sqlite3OsShmGet(pWal->pDbFd, reqSize, &pWal->szWIndex,
   727         -                             (void volatile**)(char volatile*)&pWal->pWiData);
   728         -    if( rc!=SQLITE_OK ){
   729         -      walIndexUnmap(pWal);
   730         -    }
   731         -  }
   732         -  return rc;
   733         -}
   734         -
   735         -/*
   736         -** Enlarge the wal-index to be at least enlargeTo bytes in size and
   737         -** Remap the wal-index so that the mapping covers the full size
   738         -** of the underlying file.
   739         -**
   740         -** If enlargeTo is non-negative, then increase the size of the underlying
   741         -** storage to be at least as big as enlargeTo before remapping.
   742         -*/
   743         -static int walIndexRemap(Wal *pWal, int enlargeTo){
   744         -  int rc;
   745         -  int sz;
   746         -  assert( pWal->writeLock );
   747         -  rc = sqlite3OsShmSize(pWal->pDbFd, enlargeTo, &sz);
   748         -  if( rc==SQLITE_OK && sz>pWal->szWIndex ){
   749         -    walIndexUnmap(pWal);
   750         -    rc = walIndexMap(pWal, sz);
   751         -  }
   752         -  assert( pWal->szWIndex>=enlargeTo || rc!=SQLITE_OK );
   753         -  return rc;
   754         -}
   755         -
   756    740   /*
   757    741   ** Compute a hash on a page number.  The resulting hash value must land
   758    742   ** between 0 and (HASHTABLE_NSLOT-1).  The walHashNext() function advances
   759    743   ** the hash to the next value in the event of a collision.
   760    744   */
   761    745   static int walHash(u32 iPage){
   762    746     assert( iPage>0 );
................................................................................
   763    747     assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
   764    748     return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);
   765    749   }
   766    750   static int walNextHash(int iPriorHash){
   767    751     return (iPriorHash+1)&(HASHTABLE_NSLOT-1);
   768    752   }
   769    753   
   770         -
   771    754   /* 
   772         -** Find the hash table and (section of the) page number array used to
   773         -** store data for WAL frame iFrame.
          755  +** Return pointers to the hash table and page number array stored on
          756  +** page iHash of the wal-index. The wal-index is broken into 32KB pages
          757  +** numbered starting from 0.
   774    758   **
   775    759   ** Set output variable *paHash to point to the start of the hash table
   776    760   ** in the wal-index file. Set *piZero to one less than the frame 
   777    761   ** number of the first frame indexed by this hash table. If a
   778    762   ** slot in the hash table is set to N, it refers to frame number 
   779    763   ** (*piZero+N) in the log.
   780    764   **
   781         -** Finally, set *paPgno such that for all frames F between (*piZero+1) and 
   782         -** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number 
   783         -** associated with frame F.
          765  +** Finally, set *paPgno so that (*paPgno)[1] is the page number of the
          766  +** first frame indexed by the hash table, frame (*piZero+1).
   784    767   */
   785         -static void walHashFind(
          768  +static int walHashGet(
   786    769     Wal *pWal,                      /* WAL handle */
   787         -  u32 iFrame,                     /* Find the hash table indexing this frame */
   788         -  volatile HASHTABLE_DATATYPE **paHash,    /* OUT: Pointer to hash index */
          770  +  int iHash,                      /* Find the iHash'th table */
          771  +  volatile ht_slot **paHash,      /* OUT: Pointer to hash index */
   789    772     volatile u32 **paPgno,          /* OUT: Pointer to page number array */
   790    773     u32 *piZero                     /* OUT: Frame associated with *paPgno[0] */
   791    774   ){
   792         -  u32 iZero;
          775  +  int rc;                         /* Return code */
   793    776     volatile u32 *aPgno;
   794         -  volatile HASHTABLE_DATATYPE *aHash;
   795         -
   796         -  iZero = ((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NPAGE;
   797         -  aPgno = &pWal->pWiData[walIndexEntry(iZero+1)-iZero-1];
   798         -  aHash = (HASHTABLE_DATATYPE *)&aPgno[iZero+HASHTABLE_NPAGE+1];
   799         -
   800         -  /* Assert that:
   801         -  **
   802         -  **   + the mapping is large enough for this hash-table, and
   803         -  **
   804         -  **   + that aPgno[iZero+1] really is the database page number associated
   805         -  **     with the first frame indexed by this hash table.
   806         -  */
   807         -  assert( (u32*)(&aHash[HASHTABLE_NSLOT])<=&pWal->pWiData[pWal->szWIndex/4] );
   808         -  assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) );
   809         -
   810         -  *paHash = aHash;
   811         -  *paPgno = aPgno;
   812         -  *piZero = iZero;
          777  +
          778  +  rc = walIndexPage(pWal, iHash, &aPgno);
          779  +  assert( rc==SQLITE_OK || iHash>0 );
          780  +
          781  +  if( rc==SQLITE_OK ){
          782  +    u32 iZero;
          783  +    volatile ht_slot *aHash;
          784  +
          785  +    aHash = (volatile ht_slot *)&aPgno[HASHTABLE_NPAGE];
          786  +    if( iHash==0 ){
          787  +      aPgno = &aPgno[WALINDEX_HDR_SIZE/sizeof(u32)];
          788  +      iZero = 0;
          789  +    }else{
          790  +      iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
          791  +    }
          792  +  
          793  +    *paPgno = &aPgno[-1];
          794  +    *paHash = aHash;
          795  +    *piZero = iZero;
          796  +  }
          797  +  return rc;
          798  +}
          799  +
          800  +/*
          801  +** Return the number of the wal-index page that contains the hash-table
          802  +** and page-number array that contain entries corresponding to WAL frame
          803  +** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages 
          804  +** are numbered starting from 0.
          805  +*/
          806  +static int walFramePage(u32 iFrame){
          807  +  int iHash = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;
          808  +  assert( (iHash==0 || iFrame>HASHTABLE_NPAGE_ONE)
          809  +       && (iHash>=1 || iFrame<=HASHTABLE_NPAGE_ONE)
          810  +       && (iHash<=1 || iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE))
          811  +       && (iHash>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE)
          812  +       && (iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE))
          813  +  );
          814  +  return iHash;
          815  +}
          816  +
          817  +/*
          818  +** Return the page number associated with frame iFrame in this WAL.
          819  +*/
          820  +static u32 walFramePgno(Wal *pWal, u32 iFrame){
          821  +  int iHash = walFramePage(iFrame);
          822  +  if( iHash==0 ){
          823  +    return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];
          824  +  }
          825  +  return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];
   813    826   }
   814    827   
   815    828   /*
   816    829   ** Remove entries from the hash table that point to WAL slots greater
   817    830   ** than pWal->hdr.mxFrame.
   818    831   **
   819    832   ** This function is called whenever pWal->hdr.mxFrame is decreased due
................................................................................
   821    834   **
   822    835   ** At most only the hash table containing pWal->hdr.mxFrame needs to be
   823    836   ** updated.  Any later hash tables will be automatically cleared when
   824    837   ** pWal->hdr.mxFrame advances to the point where those hash tables are
   825    838   ** actually needed.
   826    839   */
   827    840   static void walCleanupHash(Wal *pWal){
   828         -  volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table to clear */
   829         -  volatile u32 *aPgno;                 /* Unused return from walHashFind() */
   830         -  u32 iZero;                           /* frame == (aHash[x]+iZero) */
   831         -  int iLimit = 0;                      /* Zero values greater than this */
          841  +  volatile ht_slot *aHash;        /* Pointer to hash table to clear */
          842  +  volatile u32 *aPgno;            /* Page number array for hash table */
          843  +  u32 iZero;                      /* frame == (aHash[x]+iZero) */
          844  +  int iLimit = 0;                 /* Zero values greater than this */
          845  +  int nByte;                      /* Number of bytes to zero in aPgno[] */
          846  +  int i;                          /* Used to iterate through aHash[] */
   832    847   
   833    848     assert( pWal->writeLock );
   834    849     testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE-1 );
   835    850     testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE );
   836    851     testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE+1 );
   837         -  if( (pWal->hdr.mxFrame % HASHTABLE_NPAGE)>0 ){
   838         -    int nByte;                    /* Number of bytes to zero in aPgno[] */
   839         -    int i;                        /* Used to iterate through aHash[] */
   840         -
   841         -    walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero);
   842         -    iLimit = pWal->hdr.mxFrame - iZero;
   843         -    assert( iLimit>0 );
   844         -    for(i=0; i<HASHTABLE_NSLOT; i++){
   845         -      if( aHash[i]>iLimit ){
   846         -        aHash[i] = 0;
   847         -      }
   848         -    }
   849         -
   850         -    /* Zero the entries in the aPgno array that correspond to frames with
   851         -    ** frame numbers greater than pWal->hdr.mxFrame. 
   852         -    */
   853         -    nByte = sizeof(u32) * (HASHTABLE_NPAGE-iLimit);
   854         -    memset((void *)&aPgno[iZero+iLimit+1], 0, nByte);
   855         -    assert( &((u8 *)&aPgno[iZero+iLimit+1])[nByte]==(u8 *)aHash );
   856         -  }
          852  +
          853  +  if( pWal->hdr.mxFrame==0 ) return;
          854  +
          855  +  /* Obtain pointers to the hash-table and page-number array containing 
          856  +  ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed
          857  +  ** that the page said hash-table and array reside on is already mapped.
          858  +  */
          859  +  assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) );
          860  +  assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] );
          861  +  walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &aHash, &aPgno, &iZero);
          862  +
          863  +  /* Zero all hash-table entries that correspond to frame numbers greater
          864  +  ** than pWal->hdr.mxFrame.
          865  +  */
          866  +  iLimit = pWal->hdr.mxFrame - iZero;
          867  +  assert( iLimit>0 );
          868  +  for(i=0; i<HASHTABLE_NSLOT; i++){
          869  +    if( aHash[i]>iLimit ){
          870  +      aHash[i] = 0;
          871  +    }
          872  +  }
          873  +  
          874  +  /* Zero the entries in the aPgno array that correspond to frames with
          875  +  ** frame numbers greater than pWal->hdr.mxFrame. 
          876  +  */
          877  +  nByte = ((char *)aHash - (char *)&aPgno[iLimit+1]);
          878  +  memset((void *)&aPgno[iLimit+1], 0, nByte);
   857    879   
   858    880   #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
   859    881     /* Verify that every entry in the mapping region is still reachable
   860    882     ** via the hash table even after the cleanup.
   861    883     */
   862    884     if( iLimit ){
   863    885       int i;           /* Loop counter */
   864    886       int iKey;        /* Hash key */
   865    887       for(i=1; i<=iLimit; i++){
   866         -      for(iKey=walHash(aPgno[i+iZero]); aHash[iKey]; iKey=walNextHash(iKey)){
          888  +      for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
   867    889           if( aHash[iKey]==i ) break;
   868    890         }
   869    891         assert( aHash[iKey]==i );
   870    892       }
   871    893     }
   872    894   #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
   873    895   }
................................................................................
   875    897   
   876    898   /*
   877    899   ** Set an entry in the wal-index that will map database page number
   878    900   ** iPage into WAL frame iFrame.
   879    901   */
   880    902   static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
   881    903     int rc;                         /* Return code */
   882         -  int nMapping;                   /* Required mapping size in bytes */
   883         -  
   884         -  /* Make sure the wal-index is mapped. Enlarge the mapping if required. */
   885         -  nMapping = walMappingSize(iFrame);
   886         -  rc = walIndexMap(pWal, nMapping);
   887         -  while( rc==SQLITE_OK && nMapping>pWal->szWIndex ){
   888         -    rc = walIndexRemap(pWal, nMapping);
   889         -  }
          904  +  u32 iZero;                      /* One less than frame number of aPgno[1] */
          905  +  volatile u32 *aPgno;            /* Page number array */
          906  +  volatile ht_slot *aHash;        /* Hash table */
   890    907   
   891         -  /* Assuming the wal-index file was successfully mapped, find the hash 
   892         -  ** table and section of of the page number array that pertain to frame 
   893         -  ** iFrame of the WAL. Then populate the page number array and the hash 
   894         -  ** table entry.
          908  +  rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero);
          909  +
          910  +  /* Assuming the wal-index file was successfully mapped, populate the
          911  +  ** page number array and hash table entry.
   895    912     */
   896    913     if( rc==SQLITE_OK ){
   897    914       int iKey;                     /* Hash table key */
   898         -    u32 iZero;                    /* One less than frame number of aPgno[1] */
   899         -    volatile u32 *aPgno;                 /* Page number array */
   900         -    volatile HASHTABLE_DATATYPE *aHash;  /* Hash table */
   901         -    int idx;                             /* Value to write to hash-table slot */
   902         -    TESTONLY( int nCollide = 0;          /* Number of hash collisions */ )
          915  +    int idx;                      /* Value to write to hash-table slot */
          916  +    TESTONLY( int nCollide = 0;   /* Number of hash collisions */ )
   903    917   
   904         -    walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
   905    918       idx = iFrame - iZero;
          919  +    assert( idx <= HASHTABLE_NSLOT/2 + 1 );
          920  +    
          921  +    /* If this is the first entry to be added to this hash-table, zero the
          922  +    ** entire hash table and aPgno[] array before proceeding. 
          923  +    */
   906    924       if( idx==1 ){
   907         -      memset((void*)&aPgno[iZero+1], 0, HASHTABLE_NPAGE*sizeof(u32));
   908         -      memset((void*)aHash, 0, HASHTABLE_NBYTE);
          925  +      int nByte = (u8 *)&aHash[HASHTABLE_NSLOT] - (u8 *)&aPgno[1];
          926  +      memset((void*)&aPgno[1], 0, nByte);
   909    927       }
   910         -    assert( idx <= HASHTABLE_NSLOT/2 + 1 );
   911    928   
   912         -    if( aPgno[iFrame] ){
   913         -      /* If the entry in aPgno[] is already set, then the previous writer
   914         -      ** must have exited unexpectedly in the middle of a transaction (after
   915         -      ** writing one or more dirty pages to the WAL to free up memory). 
   916         -      ** Remove the remnants of that writers uncommitted transaction from 
   917         -      ** the hash-table before writing any new entries.
   918         -      */
          929  +    /* If the entry in aPgno[] is already set, then the previous writer
          930  +    ** must have exited unexpectedly in the middle of a transaction (after
          931  +    ** writing one or more dirty pages to the WAL to free up memory). 
          932  +    ** Remove the remnants of that writer's uncommitted transaction from 
          933  +    ** the hash-table before writing any new entries.
          934  +    */
          935  +    if( aPgno[idx] ){
   919    936         walCleanupHash(pWal);
   920         -      assert( !aPgno[iFrame] );
          937  +      assert( !aPgno[idx] );
   921    938       }
   922         -    aPgno[iFrame] = iPage;
          939  +
          940  +    /* Write the aPgno[] array entry and the hash-table slot. */
   923    941       for(iKey=walHash(iPage); aHash[iKey]; iKey=walNextHash(iKey)){
   924    942         assert( nCollide++ < idx );
   925    943       }
          944  +    aPgno[idx] = iPage;
   926    945       aHash[iKey] = idx;
   927    946   
   928    947   #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
   929    948       /* Verify that the number of entries in the hash table exactly equals
   930    949       ** the number of entries in the mapping region.
   931    950       */
   932    951       {
................................................................................
   940    959       ** via the hash table.  This turns out to be a really, really expensive
   941    960       ** thing to check, so only do this occasionally - not on every
   942    961       ** iteration.
   943    962       */
   944    963       if( (idx&0x3ff)==0 ){
   945    964         int i;           /* Loop counter */
   946    965         for(i=1; i<=idx; i++){
   947         -        for(iKey=walHash(aPgno[i+iZero]); aHash[iKey]; iKey=walNextHash(iKey)){
          966  +        for(iKey=walHash(aPgno[i]); aHash[iKey]; iKey=walNextHash(iKey)){
   948    967             if( aHash[iKey]==i ) break;
   949    968           }
   950    969           assert( aHash[iKey]==i );
   951    970         }
   952    971       }
   953    972   #endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
   954    973     }
................................................................................
  1072   1091         }
  1073   1092       }
  1074   1093   
  1075   1094       sqlite3_free(aFrame);
  1076   1095     }
  1077   1096   
  1078   1097   finished:
  1079         -  if( rc==SQLITE_OK && pWal->hdr.mxFrame==0 ){
  1080         -    rc = walIndexRemap(pWal, walMappingSize(1));
  1081         -  }
  1082   1098     if( rc==SQLITE_OK ){
  1083   1099       volatile WalCkptInfo *pInfo;
  1084   1100       int i;
  1085   1101       pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
  1086   1102       pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
  1087   1103       walIndexWriteHdr(pWal);
  1088   1104   
................................................................................
  1160   1176     if( !pRet ){
  1161   1177       return SQLITE_NOMEM;
  1162   1178     }
  1163   1179   
  1164   1180     pRet->pVfs = pVfs;
  1165   1181     pRet->pWalFd = (sqlite3_file *)&pRet[1];
  1166   1182     pRet->pDbFd = pDbFd;
  1167         -  pRet->szWIndex = -1;
  1168   1183     pRet->readLock = -1;
  1169   1184     sqlite3_randomness(8, &pRet->hdr.aSalt);
  1170   1185     pRet->zWalName = zWal = pVfs->szOsFile + (char*)pRet->pWalFd;
  1171   1186     sqlite3_snprintf(nWal, zWal, "%s-wal", zDbName);
  1172   1187     rc = sqlite3OsShmOpen(pDbFd);
  1173   1188   
  1174   1189     /* Open file handle on the write-ahead log file. */
................................................................................
  1203   1218     WalIterator *p,               /* Iterator */
  1204   1219     u32 *piPage,                  /* OUT: The page number of the next page */
  1205   1220     u32 *piFrame                  /* OUT: Wal frame index of next page */
  1206   1221   ){
  1207   1222     u32 iMin;                     /* Result pgno must be greater than iMin */
  1208   1223     u32 iRet = 0xFFFFFFFF;        /* 0xffffffff is never a valid page number */
  1209   1224     int i;                        /* For looping through segments */
  1210         -  int nBlock = p->nFinal;       /* Number of entries in current segment */
  1211   1225   
  1212   1226     iMin = p->iPrior;
  1213   1227     assert( iMin<0xffffffff );
  1214   1228     for(i=p->nSegment-1; i>=0; i--){
  1215   1229       struct WalSegment *pSegment = &p->aSegment[i];
  1216         -    while( pSegment->iNext<nBlock ){
         1230  +    while( pSegment->iNext<pSegment->nEntry ){
  1217   1231         u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]];
  1218   1232         if( iPg>iMin ){
  1219   1233           if( iPg<iRet ){
  1220   1234             iRet = iPg;
  1221         -          *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
         1235  +          *piFrame = pSegment->iZero + pSegment->aIndex[pSegment->iNext];
  1222   1236           }
  1223   1237           break;
  1224   1238         }
  1225   1239         pSegment->iNext++;
  1226   1240       }
  1227         -    nBlock = 256;
  1228   1241     }
  1229   1242   
  1230   1243     *piPage = p->iPrior = iRet;
  1231   1244     return (iRet==0xFFFFFFFF);
  1232   1245   }
  1233   1246   
  1234   1247   
  1235         -static void walMergesort8(
  1236         -  Pgno *aContent,                 /* Pages in wal */
  1237         -  u8 *aBuffer,                    /* Buffer of at least *pnList items to use */
  1238         -  u8 *aList,                      /* IN/OUT: List to sort */
         1248  +static void walMergesort(
         1249  +  u32 *aContent,                  /* Pages in wal */
         1250  +  ht_slot *aBuffer,               /* Buffer of at least *pnList items to use */
         1251  +  ht_slot *aList,                 /* IN/OUT: List to sort */
  1239   1252     int *pnList                     /* IN/OUT: Number of elements in aList[] */
  1240   1253   ){
  1241   1254     int nList = *pnList;
  1242   1255     if( nList>1 ){
  1243   1256       int nLeft = nList / 2;        /* Elements in left list */
  1244   1257       int nRight = nList - nLeft;   /* Elements in right list */
  1245         -    u8 *aLeft = aList;            /* Left list */
  1246         -    u8 *aRight = &aList[nLeft];   /* Right list */
  1247   1258       int iLeft = 0;                /* Current index in aLeft */
  1248   1259       int iRight = 0;               /* Current index in aright */
  1249   1260       int iOut = 0;                 /* Current index in output buffer */
         1261  +    ht_slot *aLeft = aList;       /* Left list */
         1262  +    ht_slot *aRight = aList+nLeft;/* Right list */
  1250   1263   
  1251   1264       /* TODO: Change to non-recursive version. */
  1252         -    walMergesort8(aContent, aBuffer, aLeft, &nLeft);
  1253         -    walMergesort8(aContent, aBuffer, aRight, &nRight);
         1265  +    walMergesort(aContent, aBuffer, aLeft, &nLeft);
         1266  +    walMergesort(aContent, aBuffer, aRight, &nRight);
  1254   1267   
  1255   1268       while( iRight<nRight || iLeft<nLeft ){
  1256         -      u8 logpage;
         1269  +      ht_slot logpage;
  1257   1270         Pgno dbpage;
  1258   1271   
  1259   1272         if( (iLeft<nLeft) 
  1260   1273          && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
  1261   1274         ){
  1262   1275           logpage = aLeft[iLeft++];
  1263   1276         }else{
................................................................................
  1280   1293       int i;
  1281   1294       for(i=1; i<*pnList; i++){
  1282   1295         assert( aContent[aList[i]] > aContent[aList[i-1]] );
  1283   1296       }
  1284   1297     }
  1285   1298   #endif
  1286   1299   }
         1300  +
         1301  +/* 
         1302  +** Free an iterator allocated by walIteratorInit().
         1303  +*/
         1304  +static void walIteratorFree(WalIterator *p){
         1305  +  sqlite3_free(p);
         1306  +}
  1287   1307   
  1288   1308   /*
  1289   1309   ** Map the wal-index into memory owned by this thread, if it is not
  1290   1310   ** mapped already.  Then construct a WalIterator object that can be
  1291   1311   ** used to loop over all pages in the WAL in ascending order.  
  1292   1312   **
  1293   1313   ** On success, make *pp point to the newly allocated WalIterator object
................................................................................
  1296   1316   **
  1297   1317   ** The calling routine should invoke walIteratorFree() to destroy the
  1298   1318   ** WalIterator object when it has finished with it.  The caller must
  1299   1319   ** also unmap the wal-index.  But the wal-index must not be unmapped
  1300   1320   ** prior to the WalIterator object being destroyed.
  1301   1321   */
  1302   1322   static int walIteratorInit(Wal *pWal, WalIterator **pp){
  1303         -  u32 *aData;           /* Content of the wal-index file */
  1304         -  WalIterator *p;       /* Return value */
  1305         -  int nSegment;         /* Number of segments to merge */
  1306         -  u32 iLast;            /* Last frame in log */
  1307         -  int nByte;            /* Number of bytes to allocate */
  1308         -  int i;                /* Iterator variable */
  1309         -  int nFinal;           /* Number of unindexed entries */
  1310         -  u8 *aTmp;             /* Temp space used by merge-sort */
  1311         -  u8 *aSpace;           /* Surplus space on the end of the allocation */
  1312         -
  1313         -  /* Make sure the wal-index is mapped into local memory */
  1314         -  assert( pWal->pWiData && pWal->szWIndex>=walMappingSize(pWal->hdr.mxFrame) );
         1323  +  WalIterator *p;                 /* Return value */
         1324  +  int nSegment;                   /* Number of segments to merge */
         1325  +  u32 iLast;                      /* Last frame in log */
         1326  +  int nByte;                      /* Number of bytes to allocate */
         1327  +  int i;                          /* Iterator variable */
         1328  +  ht_slot *aTmp;                  /* Temp space used by merge-sort */
         1329  +  ht_slot *aSpace;                /* Space at the end of the allocation */
  1315   1330   
  1316   1331     /* This routine only runs while holding SQLITE_SHM_CHECKPOINT.  No other
  1317   1332     ** thread is able to write to shared memory while this routine is
  1318   1333     ** running (or, indeed, while the WalIterator object exists).  Hence,
  1319         -  ** we can cast off the volatile qualifacation from shared memory
         1334  +  ** we can cast off the volatile qualification from shared memory
  1320   1335     */
  1321   1336     assert( pWal->ckptLock );
  1322         -  aData = (u32*)pWal->pWiData;
         1337  +  iLast = pWal->hdr.mxFrame;
  1323   1338   
  1324   1339     /* Allocate space for the WalIterator object */
  1325         -  iLast = pWal->hdr.mxFrame;
  1326         -  nSegment = (iLast >> 8) + 1;
  1327         -  nFinal = (iLast & 0x000000FF);
  1328         -  nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
         1340  +  nSegment = walFramePage(iLast) + 1;
         1341  +  nByte = sizeof(WalIterator) 
         1342  +        + nSegment*(sizeof(struct WalSegment))
         1343  +        + (nSegment+1)*(HASHTABLE_NPAGE * sizeof(ht_slot));
  1329   1344     p = (WalIterator *)sqlite3_malloc(nByte);
  1330   1345     if( !p ){
  1331   1346       return SQLITE_NOMEM;
  1332   1347     }
  1333   1348     memset(p, 0, nByte);
  1334   1349   
  1335         -  /* Initialize the WalIterator object.  Each 256-entry segment is
  1336         -  ** presorted in order to make iterating through all entries much
  1337         -  ** faster.
  1338         -  */
         1350  +  /* Initialize the WalIterator object: presort the index of each segment */
  1339   1351     p->nSegment = nSegment;
  1340         -  aSpace = (u8 *)&p->aSegment[nSegment];
  1341         -  aTmp = &aSpace[nSegment*256];
         1352  +  aSpace = (ht_slot *)&p->aSegment[nSegment];
         1353  +  aTmp = &aSpace[HASHTABLE_NPAGE*nSegment];
  1342   1354     for(i=0; i<nSegment; i++){
         1355  +    volatile ht_slot *aHash;
  1343   1356       int j;
  1344         -    int nIndex = (i==nSegment-1) ? nFinal : 256;
  1345         -    p->aSegment[i].aPgno = &aData[walIndexEntry(i*256+1)];
         1357  +    u32 iZero;
         1358  +    int nEntry;
         1359  +    volatile u32 *aPgno;
         1360  +    int rc;
         1361  +
         1362  +    rc = walHashGet(pWal, i, &aHash, &aPgno, &iZero);
         1363  +    if( rc!=SQLITE_OK ){
         1364  +      walIteratorFree(p);
         1365  +      return rc;
         1366  +    }
         1367  +    aPgno++;
         1368  +    nEntry = ((i+1)==nSegment)?iLast-iZero:(u32 *)aHash-(u32 *)aPgno;
         1369  +    iZero++;
         1370  +
         1371  +    for(j=0; j<nEntry; j++){
         1372  +      aSpace[j] = j;
         1373  +    }
         1374  +    walMergesort((u32 *)aPgno, aTmp, aSpace, &nEntry);
         1375  +    p->aSegment[i].iZero = iZero;
         1376  +    p->aSegment[i].nEntry = nEntry;
  1346   1377       p->aSegment[i].aIndex = aSpace;
  1347         -    for(j=0; j<nIndex; j++){
  1348         -      aSpace[j] = j;
  1349         -    }
  1350         -    walMergesort8(p->aSegment[i].aPgno, aTmp, aSpace, &nIndex);
  1351         -    memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex);
  1352         -    aSpace += 256;
  1353         -    p->nFinal = nIndex;
         1378  +    p->aSegment[i].aPgno = (u32 *)aPgno;
         1379  +    aSpace += HASHTABLE_NPAGE;
  1354   1380     }
         1381  +  assert( aSpace==aTmp );
  1355   1382   
  1356         -  /* Return the fully initializd WalIterator object */
         1383  +  /* Return the fully initialized WalIterator object */
  1357   1384     *pp = p;
  1358   1385     return SQLITE_OK ;
  1359   1386   }
  1360   1387   
  1361         -/* 
  1362         -** Free an iterator allocated by walIteratorInit().
  1363         -*/
  1364         -static void walIteratorFree(WalIterator *p){
  1365         -  sqlite3_free(p);
  1366         -}
  1367         -
  1368   1388   /*
  1369   1389   ** Copy as much content as we can from the WAL back into the database file
  1370   1390   ** in response to an sqlite3_wal_checkpoint() request or the equivalent.
  1371   1391   **
  1372   1392   ** The amount of information copied from WAL to database might be limited
  1373   1393   ** by active readers.  This routine will never overwrite a database page
  1374   1394   ** that a concurrent reader might be using.
................................................................................
  1405   1425     int rc;                         /* Return code */
  1406   1426     int szPage = pWal->hdr.szPage;  /* Database page-size */
  1407   1427     WalIterator *pIter = 0;         /* Wal iterator context */
  1408   1428     u32 iDbpage = 0;                /* Next database page to write */
  1409   1429     u32 iFrame = 0;                 /* Wal frame containing data for iDbpage */
  1410   1430     u32 mxSafeFrame;                /* Max frame that can be backfilled */
  1411   1431     int i;                          /* Loop counter */
  1412         -  volatile WalIndexHdr *pHdr;     /* The actual wal-index header in SHM */
  1413   1432     volatile WalCkptInfo *pInfo;    /* The checkpoint status information */
  1414   1433   
  1415   1434     /* Allocate the iterator */
  1416   1435     rc = walIteratorInit(pWal, &pIter);
  1417   1436     if( rc!=SQLITE_OK || pWal->hdr.mxFrame==0 ){
  1418   1437       goto walcheckpoint_out;
  1419   1438     }
................................................................................
  1426   1445   
  1427   1446     /* Compute in mxSafeFrame the index of the last frame of the WAL that is
  1428   1447     ** safe to write into the database.  Frames beyond mxSafeFrame might
  1429   1448     ** overwrite database pages that are in use by active readers and thus
  1430   1449     ** cannot be backfilled from the WAL.
  1431   1450     */
  1432   1451     mxSafeFrame = pWal->hdr.mxFrame;
  1433         -  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
  1434         -  pInfo = (volatile WalCkptInfo*)&pHdr[2];
  1435         -  assert( pInfo==walCkptInfo(pWal) );
         1452  +  pInfo = walCkptInfo(pWal);
  1436   1453     for(i=1; i<WAL_NREADER; i++){
  1437   1454       u32 y = pInfo->aReadMark[i];
  1438   1455       if( mxSafeFrame>=y ){
  1439   1456         assert( y<=pWal->hdr.mxFrame );
  1440   1457         rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
  1441   1458         if( rc==SQLITE_OK ){
  1442   1459           pInfo->aReadMark[i] = READMARK_NOT_USED;
................................................................................
  1457   1474       /* Sync the WAL to disk */
  1458   1475       if( sync_flags ){
  1459   1476         rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
  1460   1477       }
  1461   1478   
  1462   1479       /* Iterate through the contents of the WAL, copying data to the db file. */
  1463   1480       while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
         1481  +      assert( walFramePgno(pWal, iFrame)==iDbpage );
  1464   1482         if( iFrame<=nBackfill || iFrame>mxSafeFrame ) continue;
  1465   1483         rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, 
  1466   1484             walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE
  1467   1485         );
  1468   1486         if( rc!=SQLITE_OK ) break;
  1469   1487         rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, (iDbpage-1)*szPage);
  1470   1488         if( rc!=SQLITE_OK ) break;
  1471   1489       }
  1472   1490   
  1473   1491       /* If work was actually accomplished... */
  1474   1492       if( rc==SQLITE_OK ){
  1475         -      if( mxSafeFrame==pHdr[0].mxFrame ){
         1493  +      if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
  1476   1494           rc = sqlite3OsTruncate(pWal->pDbFd, ((i64)pWal->hdr.nPage*(i64)szPage));
  1477   1495           if( rc==SQLITE_OK && sync_flags ){
  1478   1496             rc = sqlite3OsSync(pWal->pDbFd, sync_flags);
  1479   1497           }
  1480   1498         }
  1481   1499         if( rc==SQLITE_OK ){
  1482   1500           pInfo->nBackfill = mxSafeFrame;
................................................................................
  1521   1539       rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE);
  1522   1540       if( rc==SQLITE_OK ){
  1523   1541         pWal->exclusiveMode = 1;
  1524   1542         rc = sqlite3WalCheckpoint(pWal, sync_flags, nBuf, zBuf);
  1525   1543         if( rc==SQLITE_OK ){
  1526   1544           isDelete = 1;
  1527   1545         }
  1528         -      walIndexUnmap(pWal);
  1529   1546       }
  1530   1547   
  1531   1548       walIndexClose(pWal, isDelete);
  1532   1549       sqlite3OsClose(pWal->pWalFd);
  1533   1550       if( isDelete ){
  1534   1551         sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
  1535   1552       }
  1536   1553       WALTRACE(("WAL%p: closed\n", pWal));
         1554  +    sqlite3_free(pWal->apWiData);
  1537   1555       sqlite3_free(pWal);
  1538   1556     }
  1539   1557     return rc;
  1540   1558   }
  1541   1559   
  1542   1560   /*
  1543   1561   ** Try to read the wal-index header.  Return 0 on success and 1 if
................................................................................
  1553   1571   ** pWal->hdr, then pWal->hdr is updated to the content of the new header
  1554   1572   ** and *pChanged is set to 1.
  1555   1573   **
  1556   1574   ** If the checksum cannot be verified return non-zero. If the header
  1557   1575   ** is read successfully and the checksum verified, return zero.
  1558   1576   */
  1559   1577   int walIndexTryHdr(Wal *pWal, int *pChanged){
  1560         -  u32 aCksum[2];               /* Checksum on the header content */
  1561         -  WalIndexHdr h1, h2;          /* Two copies of the header content */
  1562         -  WalIndexHdr *aHdr;           /* Header in shared memory */
         1578  +  u32 aCksum[2];                  /* Checksum on the header content */
         1579  +  WalIndexHdr h1, h2;             /* Two copies of the header content */
         1580  +  WalIndexHdr volatile *aHdr;     /* Header in shared memory */
  1563   1581   
  1564         -  if( pWal->szWIndex < WALINDEX_HDR_SIZE ){
  1565         -    /* The wal-index is not large enough to hold the header, then assume
  1566         -    ** header is invalid. */
  1567         -    return 1;
  1568         -  }
  1569         -  assert( pWal->pWiData );
         1582  +  /* The first page of the wal-index must be mapped at this point. */
         1583  +  assert( pWal->nWiData>0 && pWal->apWiData[0] );
  1570   1584   
  1571   1585     /* Read the header. This might happen concurrently with a write to the
  1572   1586     ** same area of shared memory on a different CPU in a SMP,
  1573   1587     ** meaning it is possible that an inconsistent snapshot is read
  1574   1588     ** from the file. If this happens, return non-zero.
  1575   1589     **
  1576   1590     ** There are two copies of the header at the beginning of the wal-index.
  1577   1591     ** When reading, read [0] first then [1].  Writes are in the reverse order.
  1578   1592     ** Memory barriers are used to prevent the compiler or the hardware from
  1579   1593     ** reordering the reads and writes.
  1580   1594     */
  1581         -  aHdr = (WalIndexHdr*)pWal->pWiData;
  1582         -  memcpy(&h1, &aHdr[0], sizeof(h1));
         1595  +  aHdr = walIndexHdr(pWal);
         1596  +  memcpy(&h1, (void *)&aHdr[0], sizeof(h1));
  1583   1597     sqlite3OsShmBarrier(pWal->pDbFd);
  1584         -  memcpy(&h2, &aHdr[1], sizeof(h2));
         1598  +  memcpy(&h2, (void *)&aHdr[1], sizeof(h2));
  1585   1599   
  1586   1600     if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
  1587   1601       return 1;   /* Dirty read */
  1588   1602     }  
  1589   1603     if( h1.isInit==0 ){
  1590   1604       return 1;   /* Malformed header - probably all zeros */
  1591   1605     }
................................................................................
  1621   1635   **
  1622   1636   ** If the wal-index header is successfully read, return SQLITE_OK. 
  1623   1637   ** Otherwise an SQLite error code.
  1624   1638   */
  1625   1639   static int walIndexReadHdr(Wal *pWal, int *pChanged){
  1626   1640     int rc;                         /* Return code */
  1627   1641     int badHdr;                     /* True if a header read failed */
         1642  +  volatile u32 *page0;
  1628   1643   
         1644  +  /* Ensure that page 0 of the wal-index (the page that contains the 
         1645  +  ** wal-index header) is mapped. Return early if an error occurs here.
         1646  +  */
  1629   1647     assert( pChanged );
  1630         -  rc = walIndexMap(pWal, walMappingSize(1));
         1648  +  rc = walIndexPage(pWal, 0, &page0);
  1631   1649     if( rc!=SQLITE_OK ){
  1632   1650       return rc;
  1633         -  }
         1651  +  };
         1652  +  assert( page0 || pWal->writeLock==0 );
  1634   1653   
  1635         -  /* Try once to read the header straight out.  This works most of the
  1636         -  ** time.
         1654  +  /* If the first page of the wal-index has been mapped, try to read the
         1655  +  ** wal-index header immediately, without holding any lock. This usually
         1656  +  ** works, but may fail if the wal-index header is corrupt or currently 
         1657  +  ** being modified by another user.
  1637   1658     */
  1638         -  badHdr = walIndexTryHdr(pWal, pChanged);
         1659  +  badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);
  1639   1660   
  1640   1661     /* If the first attempt failed, it might have been due to a race
  1641   1662     ** with a writer.  So get a WRITE lock and try again.
  1642   1663     */
  1643   1664     assert( badHdr==0 || pWal->writeLock==0 );
  1644         -  if( badHdr ){
  1645         -    rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
  1646         -    if( rc==SQLITE_OK ){
  1647         -      pWal->writeLock = 1;
         1665  +  if( badHdr && SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){
         1666  +    pWal->writeLock = 1;
         1667  +    if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
  1648   1668         badHdr = walIndexTryHdr(pWal, pChanged);
  1649   1669         if( badHdr ){
  1650   1670           /* If the wal-index header is still malformed even while holding
  1651   1671           ** a WRITE lock, it can only mean that the header is corrupted and
  1652   1672           ** needs to be reconstructed.  So run recovery to do exactly that.
  1653   1673           */
  1654   1674           rc = walIndexRecover(pWal);
  1655   1675           *pChanged = 1;
  1656   1676         }
  1657         -      walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  1658         -      pWal->writeLock = 0;
  1659   1677       }
  1660         -  }
  1661         -
  1662         -  /* Make sure the mapping is large enough to cover the entire wal-index */
  1663         -  if( rc==SQLITE_OK ){
  1664         -    int szWanted = walMappingSize(pWal->hdr.mxFrame);
  1665         -    if( pWal->szWIndex<szWanted ){
  1666         -      rc = walIndexMap(pWal, szWanted);
  1667         -    }
         1678  +    pWal->writeLock = 0;
         1679  +    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  1668   1680     }
  1669   1681   
  1670   1682     return rc;
  1671   1683   }
  1672   1684   
  1673   1685   /*
  1674   1686   ** This is the value that walTryBeginRead returns when it needs to
................................................................................
  1701   1713   ** to select a particular WAL_READ_LOCK() that strives to let the
  1702   1714   ** checkpoint process do as much work as possible.  This routine might
  1703   1715   ** update values of the aReadMark[] array in the header, but if it does
  1704   1716   ** so it takes care to hold an exclusive lock on the corresponding
  1705   1717   ** WAL_READ_LOCK() while changing values.
  1706   1718   */
  1707   1719   static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
  1708         -  volatile WalIndexHdr *pHdr;     /* Header of the wal-index */
  1709   1720     volatile WalCkptInfo *pInfo;    /* Checkpoint information in wal-index */
  1710   1721     u32 mxReadMark;                 /* Largest aReadMark[] value */
  1711   1722     int mxI;                        /* Index of largest aReadMark[] value */
  1712   1723     int i;                          /* Loop counter */
  1713         -  int rc;                         /* Return code  */
         1724  +  int rc = SQLITE_OK;             /* Return code  */
  1714   1725   
  1715   1726     assert( pWal->readLock<0 );     /* Not currently locked */
  1716   1727   
  1717   1728     /* Take steps to avoid spinning forever if there is a protocol error. */
  1718   1729     if( cnt>5 ){
  1719   1730       if( cnt>100 ) return SQLITE_PROTOCOL;
  1720   1731       sqlite3OsSleep(pWal->pVfs, 1);
................................................................................
  1735   1746         if( rc==SQLITE_OK ){
  1736   1747           walUnlockShared(pWal, WAL_RECOVER_LOCK);
  1737   1748           rc = WAL_RETRY;
  1738   1749         }else if( rc==SQLITE_BUSY ){
  1739   1750           rc = SQLITE_BUSY_RECOVERY;
  1740   1751         }
  1741   1752       }
  1742         -  }else{
  1743         -    rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
  1744   1753     }
  1745   1754     if( rc!=SQLITE_OK ){
  1746   1755       return rc;
  1747   1756     }
  1748   1757   
  1749         -  pHdr = (volatile WalIndexHdr*)pWal->pWiData;
  1750         -  pInfo = (volatile WalCkptInfo*)&pHdr[2];
  1751         -  assert( pInfo==walCkptInfo(pWal) );
         1758  +  pInfo = walCkptInfo(pWal);
  1752   1759     if( !useWal && pInfo->nBackfill==pWal->hdr.mxFrame ){
  1753   1760       /* The WAL has been completely backfilled (or it is empty).
  1754   1761       ** and can be safely ignored.
  1755   1762       */
  1756   1763       rc = walLockShared(pWal, WAL_READ_LOCK(0));
  1757   1764       sqlite3OsShmBarrier(pWal->pDbFd);
  1758   1765       if( rc==SQLITE_OK ){
  1759         -      if( memcmp((void *)pHdr, &pWal->hdr, sizeof(WalIndexHdr)) ){
         1766  +      if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){
  1760   1767           /* It is not safe to allow the reader to continue here if frames
  1761   1768           ** may have been appended to the log before READ_LOCK(0) was obtained.
  1762   1769           ** When holding READ_LOCK(0), the reader ignores the entire log file,
  1763   1770           ** which implies that the database file contains a trustworthy
  1764   1771           ** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from
  1765   1772           ** happening, this is usually correct.
  1766   1773           **
................................................................................
  1846   1853       ** date before proceeding. That would not be possible without somehow
  1847   1854       ** blocking writers. It only guarantees that a dangerous checkpoint or 
  1848   1855       ** log-wrap (either of which would require an exclusive lock on
  1849   1856       ** WAL_READ_LOCK(mxI)) has not occurred since the snapshot was valid.
  1850   1857       */
  1851   1858       sqlite3OsShmBarrier(pWal->pDbFd);
  1852   1859       if( pInfo->aReadMark[mxI]!=mxReadMark
  1853         -     || memcmp((void *)pHdr, &pWal->hdr, sizeof(WalIndexHdr))
         1860  +     || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))
  1854   1861       ){
  1855   1862         walUnlockShared(pWal, WAL_READ_LOCK(mxI));
  1856   1863         return WAL_RETRY;
  1857   1864       }else{
  1858   1865         assert( mxReadMark<=pWal->hdr.mxFrame );
  1859   1866         pWal->readLock = mxI;
  1860   1867       }
................................................................................
  1879   1886   int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
  1880   1887     int rc;                         /* Return code */
  1881   1888     int cnt = 0;                    /* Number of TryBeginRead attempts */
  1882   1889   
  1883   1890     do{
  1884   1891       rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);
  1885   1892     }while( rc==WAL_RETRY );
  1886         -  walIndexUnmap(pWal);
  1887   1893     return rc;
  1888   1894   }
  1889   1895   
  1890   1896   /*
  1891   1897   ** Finish with a read transaction.  All this does is release the
  1892   1898   ** read-lock.
  1893   1899   */
................................................................................
  1909   1915   int sqlite3WalRead(
  1910   1916     Wal *pWal,                      /* WAL handle */
  1911   1917     Pgno pgno,                      /* Database page number to read data for */
  1912   1918     int *pInWal,                    /* OUT: True if data is read from WAL */
  1913   1919     int nOut,                       /* Size of buffer pOut in bytes */
  1914   1920     u8 *pOut                        /* Buffer to write page data to */
  1915   1921   ){
  1916         -  int rc;                         /* Return code */
  1917   1922     u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
  1918   1923     u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
  1919   1924     int iHash;                      /* Used to loop through N hash tables */
  1920   1925   
  1921   1926     /* This routine is only called from within a read transaction. */
  1922   1927     assert( pWal->readLock>=0 || pWal->lockError );
  1923   1928   
................................................................................
  1928   1933     ** return early, as if the WAL were empty.
  1929   1934     */
  1930   1935     if( iLast==0 || pWal->readLock==0 ){
  1931   1936       *pInWal = 0;
  1932   1937       return SQLITE_OK;
  1933   1938     }
  1934   1939   
  1935         -  /* Ensure the wal-index is mapped. */
  1936         -  rc = walIndexMap(pWal, walMappingSize(iLast));
  1937         -  if( rc!=SQLITE_OK ){
  1938         -    return rc;
  1939         -  }
  1940         -
  1941   1940     /* Search the hash table or tables for an entry matching page number
  1942   1941     ** pgno. Each iteration of the following for() loop searches one
  1943   1942     ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
  1944   1943     **
  1945   1944     ** This code may run concurrently to the code in walIndexAppend()
  1946   1945     ** that adds entries to the wal-index (and possibly to this hash 
  1947   1946     ** table). This means the value just read from the hash 
................................................................................
  1959   1958     **   (aPgno[iFrame]==pgno): 
  1960   1959     **     This condition filters out normal hash-table collisions.
  1961   1960     **
  1962   1961     **   (iFrame<=iLast): 
  1963   1962     **     This condition filters out entries that were added to the hash
  1964   1963     **     table after the current read-transaction had started.
  1965   1964     */
  1966         -  for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){
  1967         -    volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table */
  1968         -    volatile u32 *aPgno;                 /* Pointer to array of page numbers */
         1965  +  for(iHash=walFramePage(iLast); iHash>=0 && iRead==0; iHash--){
         1966  +    volatile ht_slot *aHash;      /* Pointer to hash table */
         1967  +    volatile u32 *aPgno;          /* Pointer to array of page numbers */
  1969   1968       u32 iZero;                    /* Frame number corresponding to aPgno[0] */
  1970   1969       int iKey;                     /* Hash slot index */
  1971         -    int mxHash;                   /* upper bound on aHash[] values */
         1970  +    int rc;
  1972   1971   
  1973         -    walHashFind(pWal, iHash, &aHash, &aPgno, &iZero);
  1974         -    mxHash = iLast - iZero;
  1975         -    if( mxHash > HASHTABLE_NPAGE )  mxHash = HASHTABLE_NPAGE;
         1972  +    rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero);
         1973  +    if( rc!=SQLITE_OK ){
         1974  +      return rc;
         1975  +    }
  1976   1976       for(iKey=walHash(pgno); aHash[iKey]; iKey=walNextHash(iKey)){
  1977   1977         u32 iFrame = aHash[iKey] + iZero;
  1978         -      if( iFrame<=iLast && aPgno[iFrame]==pgno ){
         1978  +      if( iFrame<=iLast && aPgno[aHash[iKey]]==pgno ){
  1979   1979           assert( iFrame>iRead );
  1980   1980           iRead = iFrame;
  1981   1981         }
  1982   1982       }
  1983   1983     }
  1984         -  assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno );
  1985   1984   
  1986   1985   #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  1987   1986     /* If expensive assert() statements are available, do a linear search
  1988   1987     ** of the wal-index file content. Make sure the results agree with the
  1989   1988     ** result obtained using the hash indexes above.  */
  1990   1989     {
  1991   1990       u32 iRead2 = 0;
  1992   1991       u32 iTest;
  1993   1992       for(iTest=iLast; iTest>0; iTest--){
  1994         -      if( pWal->pWiData[walIndexEntry(iTest)]==pgno ){
         1993  +      if( walFramePgno(pWal, iTest)==pgno ){
  1995   1994           iRead2 = iTest;
  1996   1995           break;
  1997   1996         }
  1998   1997       }
  1999   1998       assert( iRead==iRead2 );
  2000   1999     }
  2001   2000   #endif
  2002   2001   
  2003   2002     /* If iRead is non-zero, then it is the log frame number that contains the
  2004   2003     ** required page. Read and return data from the log file.
  2005   2004     */
  2006         -  walIndexUnmap(pWal);
  2007   2005     if( iRead ){
  2008   2006       i64 iOffset = walFrameOffset(iRead, pWal->hdr.szPage) + WAL_FRAME_HDRSIZE;
  2009   2007       *pInWal = 1;
  2010   2008       return sqlite3OsRead(pWal->pWalFd, pOut, nOut, iOffset);
  2011   2009     }
  2012   2010   
  2013   2011     *pInWal = 0;
................................................................................
  2053   2051     }
  2054   2052     pWal->writeLock = 1;
  2055   2053   
  2056   2054     /* If another connection has written to the database file since the
  2057   2055     ** time the read transaction on this connection was started, then
  2058   2056     ** the write is disallowed.
  2059   2057     */
  2060         -  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
  2061         -  if( rc ){
  2062         -    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  2063         -    pWal->writeLock = 0;
  2064         -    return rc;
  2065         -  }
  2066         -  if( memcmp(&pWal->hdr, (void*)pWal->pWiData, sizeof(WalIndexHdr))!=0 ){
         2058  +  if( memcmp(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){
  2067   2059       walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
  2068   2060       pWal->writeLock = 0;
  2069   2061       rc = SQLITE_BUSY;
  2070   2062     }
  2071   2063   
  2072         -  walIndexUnmap(pWal);
  2073   2064     return rc;
  2074   2065   }
  2075   2066   
  2076   2067   /*
  2077   2068   ** End a write transaction.  The commit has already been done.  This
  2078   2069   ** routine merely releases the lock.
  2079   2070   */
................................................................................
  2094   2085   **
  2095   2086   ** Otherwise, if the callback function does not return an error, this
  2096   2087   ** function returns SQLITE_OK.
  2097   2088   */
  2098   2089   int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
  2099   2090     int rc = SQLITE_OK;
  2100   2091     if( pWal->writeLock ){
  2101         -    int unused;
  2102   2092       Pgno iMax = pWal->hdr.mxFrame;
  2103   2093       Pgno iFrame;
  2104   2094     
  2105         -    assert( pWal->pWiData==0 );
  2106         -    rc = walIndexReadHdr(pWal, &unused);
  2107         -    if( rc==SQLITE_OK ){
  2108         -      rc = walIndexMap(pWal, walMappingSize(iMax));
  2109         -    }
  2110         -    if( rc==SQLITE_OK ){
  2111         -      for(iFrame=pWal->hdr.mxFrame+1; 
  2112         -          ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; 
  2113         -          iFrame++
  2114         -      ){
  2115         -        /* This call cannot fail. Unless the page for which the page number
  2116         -        ** is passed as the second argument is (a) in the cache and 
  2117         -        ** (b) has an outstanding reference, then xUndo is either a no-op
  2118         -        ** (if (a) is false) or simply expels the page from the cache (if (b)
  2119         -        ** is false).
  2120         -        **
  2121         -        ** If the upper layer is doing a rollback, it is guaranteed that there
  2122         -        ** are no outstanding references to any page other than page 1. And
  2123         -        ** page 1 is never written to the log until the transaction is
  2124         -        ** committed. As a result, the call to xUndo may not fail.
  2125         -        */
  2126         -        assert( pWal->writeLock );
  2127         -        assert( pWal->pWiData[walIndexEntry(iFrame)]!=1 );
  2128         -        rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
  2129         -      }
  2130         -      walCleanupHash(pWal);
         2095  +    /* Restore the client's cache of the wal-index header to the state it
         2096  +    ** was in before the client began writing to the database. 
         2097  +    */
         2098  +    memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));
         2099  +
         2100  +    for(iFrame=pWal->hdr.mxFrame+1; 
         2101  +        ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; 
         2102  +        iFrame++
         2103  +    ){
         2104  +      /* This call cannot fail. Unless the page for which the page number
         2105  +      ** is passed as the second argument is (a) in the cache and 
         2106  +      ** (b) has an outstanding reference, then xUndo is either a no-op
         2107  +      ** (if (a) is false) or simply expels the page from the cache (if (b)
         2108  +      ** is false).
         2109  +      **
         2110  +      ** If the upper layer is doing a rollback, it is guaranteed that there
         2111  +      ** are no outstanding references to any page other than page 1. And
         2112  +      ** page 1 is never written to the log until the transaction is
         2113  +      ** committed. As a result, the call to xUndo may not fail.
         2114  +      */
         2115  +      assert( walFramePgno(pWal, iFrame)!=1 );
         2116  +      rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
  2131   2117       }
  2132         -    walIndexUnmap(pWal);
         2118  +    walCleanupHash(pWal);
  2133   2119     }
         2120  +  assert( rc==SQLITE_OK );
  2134   2121     return rc;
  2135   2122   }
  2136   2123   
  2137   2124   /* 
  2138   2125   ** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32 
  2139   2126   ** values. This function populates the array with values required to 
  2140   2127   ** "rollback" the write position of the WAL handle back to the current 
................................................................................
  2166   2153       ** to the start of the log. Update the savepoint values to match.
  2167   2154       */
  2168   2155       aWalData[0] = 0;
  2169   2156       aWalData[3] = pWal->nCkpt;
  2170   2157     }
  2171   2158   
  2172   2159     if( aWalData[0]<pWal->hdr.mxFrame ){
  2173         -    rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame));
  2174   2160       pWal->hdr.mxFrame = aWalData[0];
  2175   2161       pWal->hdr.aFrameCksum[0] = aWalData[1];
  2176   2162       pWal->hdr.aFrameCksum[1] = aWalData[2];
  2177         -    if( rc==SQLITE_OK ){
  2178         -      walCleanupHash(pWal);
  2179         -    }
         2163  +    walCleanupHash(pWal);
  2180   2164     }
  2181   2165   
  2182         -  walIndexUnmap(pWal);
  2183   2166     return rc;
  2184   2167   }
  2185   2168   
  2186   2169   /*
  2187   2170   ** This function is called just before writing a set of frames to the log
  2188   2171   ** file (see sqlite3WalFrames()). It checks to see if, instead of appending
  2189   2172   ** to the current log file, it is possible to overwrite the start of the
................................................................................
  2195   2178   ** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned
  2196   2179   ** if some error occurs.
  2197   2180   */
  2198   2181   static int walRestartLog(Wal *pWal){
  2199   2182     int rc = SQLITE_OK;
  2200   2183     int cnt;
  2201   2184   
  2202         -  if( pWal->readLock==0 
  2203         -   && SQLITE_OK==(rc = walIndexMap(pWal, walMappingSize(pWal->hdr.mxFrame)))
  2204         -  ){
         2185  +  if( pWal->readLock==0 ){
  2205   2186       volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
  2206   2187       assert( pInfo->nBackfill==pWal->hdr.mxFrame );
  2207   2188       if( pInfo->nBackfill>0 ){
  2208   2189         rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
  2209   2190         if( rc==SQLITE_OK ){
  2210   2191           /* If all readers are using WAL_READ_LOCK(0) (in other words if no
  2211   2192           ** readers are currently using the WAL), then the transactions
................................................................................
  2233   2214       walUnlockShared(pWal, WAL_READ_LOCK(0));
  2234   2215       pWal->readLock = -1;
  2235   2216       cnt = 0;
  2236   2217       do{
  2237   2218         int notUsed;
  2238   2219         rc = walTryBeginRead(pWal, &notUsed, 1, ++cnt);
  2239   2220       }while( rc==WAL_RETRY );
  2240         -
  2241         -    /* Unmap the wal-index before returning. Otherwise the VFS layer may
  2242         -    ** hold a mutex for the duration of the IO performed by WalFrames().
  2243         -    */
  2244         -    walIndexUnmap(pWal);
  2245   2221     }
  2246   2222     return rc;
  2247   2223   }
  2248   2224   
  2249   2225   /* 
  2250   2226   ** Write a set of frames to the log. The caller must hold the write-lock
  2251   2227   ** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
................................................................................
  2263   2239     u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
  2264   2240     PgHdr *p;                       /* Iterator to run through pList with. */
  2265   2241     PgHdr *pLast = 0;               /* Last frame in list */
  2266   2242     int nLast = 0;                  /* Number of extra copies of last page */
  2267   2243   
  2268   2244     assert( pList );
  2269   2245     assert( pWal->writeLock );
  2270         -  assert( pWal->pWiData==0 );
  2271   2246   
  2272   2247   #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
  2273   2248     { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
  2274   2249       WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
  2275   2250                 pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));
  2276   2251     }
  2277   2252   #endif
  2278   2253   
  2279   2254     /* See if it is possible to write these frames into the start of the
  2280   2255     ** log file, instead of appending to it at pWal->hdr.mxFrame.
  2281   2256     */
  2282   2257     if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){
  2283         -    assert( pWal->pWiData==0 );
  2284   2258       return rc;
  2285   2259     }
  2286         -  assert( pWal->pWiData==0 && pWal->readLock>0 );
  2287   2260   
  2288   2261     /* If this is the first frame written into the log, write the WAL
  2289   2262     ** header to the start of the WAL file. See comments at the top of
  2290   2263     ** this source file for a description of the WAL header format.
  2291   2264     */
  2292   2265     iFrame = pWal->hdr.mxFrame;
  2293   2266     if( iFrame==0 ){
................................................................................
  2354   2327         }
  2355   2328         nLast++;
  2356   2329         iOffset += szPage;
  2357   2330       }
  2358   2331   
  2359   2332       rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
  2360   2333     }
  2361         -  assert( pWal->pWiData==0 );
  2362   2334   
  2363   2335     /* Append data to the wal-index. It is not necessary to lock the 
  2364   2336     ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index
  2365   2337     ** guarantees that there are no other writers, and no data that may
  2366   2338     ** be in use by existing readers is being overwritten.
  2367   2339     */
  2368   2340     iFrame = pWal->hdr.mxFrame;
................................................................................
  2387   2359       /* If this is a commit, update the wal-index header too. */
  2388   2360       if( isCommit ){
  2389   2361         walIndexWriteHdr(pWal);
  2390   2362         pWal->iCallback = iFrame;
  2391   2363       }
  2392   2364     }
  2393   2365   
  2394         -  walIndexUnmap(pWal);
  2395   2366     WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));
  2396   2367     return rc;
  2397   2368   }
  2398   2369   
  2399   2370   /* 
  2400   2371   ** This routine is called to implement sqlite3_wal_checkpoint() and
  2401   2372   ** related interfaces.
................................................................................
  2408   2379     int sync_flags,                 /* Flags to sync db file with (or 0) */
  2409   2380     int nBuf,                       /* Size of temporary buffer */
  2410   2381     u8 *zBuf                        /* Temporary buffer to use */
  2411   2382   ){
  2412   2383     int rc;                         /* Return code */
  2413   2384     int isChanged = 0;              /* True if a new wal-index header is loaded */
  2414   2385   
  2415         -  assert( pWal->pWiData==0 );
  2416   2386     assert( pWal->ckptLock==0 );
  2417   2387   
  2418   2388     WALTRACE(("WAL%p: checkpoint begins\n", pWal));
  2419   2389     rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
  2420   2390     if( rc ){
  2421   2391       /* Usually this is SQLITE_BUSY meaning that another thread or process
  2422   2392       ** is already running a checkpoint, or maybe a recovery.  But it might
................................................................................
  2437   2407       ** next time the pager opens a snapshot on this database it knows that
  2438   2408       ** the cache needs to be reset.
  2439   2409       */
  2440   2410       memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  2441   2411     }
  2442   2412   
  2443   2413     /* Release the locks. */
  2444         -  walIndexUnmap(pWal);
  2445   2414     walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
  2446   2415     pWal->ckptLock = 0;
  2447   2416     WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
  2448   2417     return rc;
  2449   2418   }
  2450   2419   
  2451   2420   /* Return the value to pass to a sqlite3_wal_hook callback, the

Changes to test/permutations.test.

     9      9   #
    10     10   #***********************************************************************
    11     11   #
    12     12   # $Id: permutations.test,v 1.51 2009/07/01 18:09:02 danielk1977 Exp $
    13     13   
    14     14   set testdir [file dirname $argv0]
    15     15   source $testdir/tester.tcl
           16  +db close
    16     17   
    17     18   # Argument processing.
    18     19   #
    19     20   #puts "PERM-DEBUG: argv=$argv"
    20     21   namespace eval ::perm {
    21     22     variable testmode [lindex $::argv 0]
    22     23     variable testfile [lindex $::argv 1]

Changes to test/wal2.test.

    71     71   #      of the integer fields (so that the reader ends up with a corrupted
    72     72   #      header).
    73     73   #
    74     74   #   3. Check that the reader recovers the wal-index and reads the correct
    75     75   #      database content.
    76     76   #
    77     77   do_test wal2-1.0 {
    78         -  proc tvfs_cb {method args} { return SQLITE_OK }
           78  +  proc tvfs_cb {method filename args} { 
           79  +    set ::filename $filename
           80  +    return SQLITE_OK 
           81  +  }
           82  +
    79     83     testvfs tvfs
    80     84     tvfs script tvfs_cb
           85  +  tvfs filter xShmOpen
    81     86   
    82     87     sqlite3 db  test.db -vfs tvfs
    83     88     sqlite3 db2 test.db -vfs tvfs
    84     89   
    85     90     execsql {
    86     91       PRAGMA journal_mode = WAL;
    87     92       CREATE TABLE t1(a);
................................................................................
   119    124           10   13   {13 91}   8             {$RECOVER $READ}
   120    125           11   14   {14 105}  9             {$RECOVER $READ}
   121    126           12   15   {15 120}  -1            {$READ}
   122    127   " {
   123    128   
   124    129     do_test wal2-1.$tn.1 {
   125    130       execsql { INSERT INTO t1 VALUES($iInsert) }
   126         -
   127    131       set ::locks [list]
   128         -    set ::cb_done 0
   129         -
   130    132       proc tvfs_cb {method args} {
   131         -      if {$::cb_done == 0 && $method == "xShmGet"} {
   132         -        set ::cb_done 1
   133         -        if {$::wal_index_hdr_mod >= 0} {
   134         -          incr_tvfs_hdr [lindex $args 0] $::wal_index_hdr_mod 1
   135         -        }
   136         -      }
   137         -      if {$method == "xShmLock"} { lappend ::locks [lindex $args 2] }
          133  +      lappend ::locks [lindex $args 2]
   138    134         return SQLITE_OK
   139    135       }
   140         -
          136  +    tvfs filter xShmLock
          137  +    if {$::wal_index_hdr_mod >= 0} {
          138  +      incr_tvfs_hdr $::filename $::wal_index_hdr_mod 1
          139  +    }
   141    140       execsql { SELECT count(a), sum(a) FROM t1 } db2
   142    141     } $res
   143    142   
   144    143     do_test wal2-1.$tn.2 {
   145    144       set ::locks
   146    145     } $wal_locks
   147    146   }
................................................................................
   170    169     {4 1 lock exclusive} {4 1 unlock exclusive} \
   171    170     {4 1 lock shared}    {4 1 unlock shared}    \
   172    171   ]
   173    172   do_test wal2-2.0 {
   174    173   
   175    174     testvfs tvfs
   176    175     tvfs script tvfs_cb
          176  +  tvfs filter xShmOpen
   177    177     proc tvfs_cb {method args} {
   178         -    if {$method == "xShmOpen"} { set ::shm_file [lindex $args 0] }
          178  +    set ::filename [lindex $args 0]
   179    179       return SQLITE_OK
   180    180     }
   181    181   
   182    182     sqlite3 db  test.db -vfs tvfs
   183    183     sqlite3 db2 test.db -vfs tvfs
   184    184   
   185    185     execsql {
................................................................................
   204    204            4    7   {6 21}   {7 28}    2
   205    205            5    8   {7 28}   {8 36}    3
   206    206            6    9   {8 36}   {9 45}    4
   207    207            7   10   {9 45}   {10 55}   5
   208    208            8   11   {10 55}  {11 66}   6
   209    209            9   12   {11 66}  {12 78}   7
   210    210   } {
          211  +  tvfs filter xShmLock
          212  +
   211    213     do_test wal2-2.$tn.1 {
   212         -    set oldhdr [set_tvfs_hdr $::shm_file]
          214  +    set oldhdr [set_tvfs_hdr $::filename]
   213    215       execsql { INSERT INTO t1 VALUES($iInsert) }
   214    216       execsql { SELECT count(a), sum(a) FROM t1 }
   215    217     } $res1
   216    218   
   217    219     do_test wal2-2.$tn.2 {
   218    220       set ::locks [list]
   219         -    set ::cb_done 0
   220    221       proc tvfs_cb {method args} {
   221         -      if {$::cb_done == 0 && $method == "xShmGet"} {
   222         -        set ::cb_done 1
   223         -        if {$::wal_index_hdr_mod >= 0} {
   224         -          incr_tvfs_hdr $::shm_file $::wal_index_hdr_mod 1
   225         -        }
   226         -      }
   227         -      if {$method == "xShmLock"} {
   228         -        set lock [lindex $args 2]
   229         -        lappend ::locks $lock
   230         -        if {$lock == $::WRITER} {
   231         -          set_tvfs_hdr $::shm_file $::oldhdr
   232         -        }
          222  +      set lock [lindex $args 2]
          223  +      lappend ::locks $lock
          224  +      if {$lock == $::WRITER} {
          225  +        set_tvfs_hdr $::filename $::oldhdr
   233    226         }
   234    227         return SQLITE_OK
   235    228       }
   236    229   
          230  +    if {$::wal_index_hdr_mod >= 0} {
          231  +      incr_tvfs_hdr $::filename $::wal_index_hdr_mod 1
          232  +    }
   237    233       execsql { SELECT count(a), sum(a) FROM t1 } db2
   238    234     } $res0
   239    235   
   240    236     do_test wal2-2.$tn.3 {
   241    237       set ::locks
   242    238     } $LOCKS
   243    239   
   244    240     do_test wal2-2.$tn.4 {
   245    241       set ::locks [list]
   246         -    set ::cb_done 0
   247    242       proc tvfs_cb {method args} {
   248         -      if {$::cb_done == 0 && $method == "xShmGet"} {
   249         -        set ::cb_done 1
   250         -        if {$::wal_index_hdr_mod >= 0} {
   251         -          incr_tvfs_hdr $::shm_file $::wal_index_hdr_mod 1
   252         -        }
   253         -      }
   254         -      if {$method == "xShmLock"} {
   255         -        set lock [lindex $args 2]
   256         -        lappend ::locks $lock
   257         -      }
          243  +      set lock [lindex $args 2]
          244  +      lappend ::locks $lock
   258    245         return SQLITE_OK
   259    246       }
   260    247   
          248  +    if {$::wal_index_hdr_mod >= 0} {
          249  +      incr_tvfs_hdr $::filename $::wal_index_hdr_mod 1
          250  +    }
   261    251       execsql { SELECT count(a), sum(a) FROM t1 } db2
   262    252     } $res1
   263    253   }
   264    254   db close
   265    255   db2 close
   266    256   tvfs delete
   267    257   file delete -force test.db test.db-wal test.db-journal

Changes to test/wal3.test.

   349    349   
   350    350   testvfs T -default 1
   351    351   T script method_callback
   352    352   
   353    353   proc method_callback {method args} {
   354    354     if {$method == "xShmBarrier"} {
   355    355       incr ::barrier_count
   356         -    if {$::barrier_count == 1} {
          356  +    if {$::barrier_count == 2} {
   357    357         # This code is executed within the xShmBarrier() callback invoked
   358    358         # by the client running recovery as part of writing the recovered
   359    359         # wal-index header. If a second client attempts to access the 
   360    360         # database now, it reads a corrupt (partially written) wal-index
   361    361         # header. But it cannot even get that far, as the first client
   362    362         # is still holding all the locks (recovery takes an exclusive lock
   363    363         # on *all* db locks, preventing access by any other client).

Changes to test/walfault.test.

   115    115     db eval {
   116    116       DELETE FROM abc;
   117    117       PRAGMA wal_checkpoint;
   118    118     }
   119    119   } -test {
   120    120     faultsim_test_result {0 {}}
   121    121   }
          122  +
   122    123   
   123    124   #--------------------------------------------------------------------------
   124    125   #
   125    126   faultsim_delete_and_reopen
   126    127   faultsim_save_and_close
   127    128   do_faultsim_test walfault-4 -prep {
   128    129     faultsim_restore_and_reopen
................................................................................
   148    149       PRAGMA journal_mode = WAL;
   149    150     }
   150    151     faultsim_save_and_close
   151    152   } {}
   152    153   do_faultsim_test walfault-5 -faults shmerr* -prep {
   153    154     faultsim_restore_and_reopen
   154    155     execsql { PRAGMA wal_autocheckpoint = 0 }
   155         -  shmfault filter xShmSize
          156  +  shmfault filter xShmPage
   156    157   } -body {
   157    158     execsql {
   158    159       CREATE TABLE t1(x);
   159    160       BEGIN;
   160    161         INSERT INTO t1 VALUES(randomblob(400));           /* 1 */
   161    162         INSERT INTO t1 SELECT randomblob(400) FROM t1;    /* 2 */
   162    163         INSERT INTO t1 SELECT randomblob(400) FROM t1;    /* 4 */
................................................................................
   207    208         INSERT INTO t1 SELECT randomblob(400) FROM t1;    /* 16384 */
   208    209       COMMIT;
   209    210     }
   210    211     faultsim_save_and_close
   211    212   } {}
   212    213   do_faultsim_test walfault-6 -faults shmerr* -prep {
   213    214     faultsim_restore_and_reopen
   214         -  shmfault filter xShmSize
          215  +  shmfault filter xShmPage
   215    216   } -body {
   216    217     execsql { SELECT count(*) FROM t1 }
   217    218   } -test {
   218    219     faultsim_test_result {0 16384}
   219    220     faultsim_integrity_check
   220    221     set n [db one {SELECT count(*) FROM t1}]
   221    222     if {$n != 16384 && $n != 0} { error "Incorrect number of rows: $n" }
................................................................................
   322    323     if {$n != 1 && $n != 2} { error "Incorrect number of rows: $n" }
   323    324   }
   324    325   
   325    326   do_test walfault-10-pre1 {
   326    327     faultsim_delete_and_reopen
   327    328     execsql {
   328    329       PRAGMA journal_mode = WAL;
   329         -    PRAGMA wal_checkpoint = 0;
          330  +    PRAGMA wal_autocheckpoint = 0;
   330    331       CREATE TABLE z(zz INTEGER PRIMARY KEY, zzz BLOB);
   331    332       CREATE INDEX zzzz ON z(zzz);
   332    333       INSERT INTO z VALUES(NULL, randomblob(800));
   333    334       INSERT INTO z VALUES(NULL, randomblob(800));
   334    335       INSERT INTO z SELECT NULL, randomblob(800) FROM z;
   335    336       INSERT INTO z SELECT NULL, randomblob(800) FROM z;
   336    337       INSERT INTO z SELECT NULL, randomblob(800) FROM z;
................................................................................
   358    359     faultsim_test_result {0 {}}
   359    360     catch { db eval { ROLLBACK } }
   360    361     faultsim_integrity_check
   361    362   
   362    363     set n [db eval {SELECT count(*), sum(length(zzz)) FROM z}]
   363    364     if {$n != "64 51200"} { error "Incorrect data: $n" }
   364    365   }
          366  +
          367  +#--------------------------------------------------------------------------
          368  +# Test fault injection while checkpointing a large WAL file, if the 
          369  +# checkpoint is the first operation run after opening the database.
          370  +# This means that some of the required wal-index pages are mapped as part of
          371  +# the checkpoint process, which means there are a few more opportunities
          372  +# for IO errors.
          373  +#
          374  +# To speed this up, IO errors are only simulated within xShmPage() calls.
          375  +#
          376  +do_test walfault-11-pre-1 {
          377  +  sqlite3 db test.db
          378  +  execsql {
          379  +    PRAGMA journal_mode = WAL;
          380  +    PRAGMA wal_autocheckpoint = 0;
          381  +    BEGIN;
          382  +      CREATE TABLE abc(a PRIMARY KEY);
          383  +      INSERT INTO abc VALUES(randomblob(1500));
          384  +      INSERT INTO abc VALUES(randomblob(1500));
          385  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --    4
          386  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --    8
          387  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --   16
          388  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --   32
          389  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --   64
          390  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --  128
          391  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --  256
          392  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   --  512
          393  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   -- 1024
          394  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   -- 2048
          395  +      INSERT INTO abc SELECT randomblob(1500) FROM abc;   -- 4096
          396  +    COMMIT;
          397  +  }
          398  +  faultsim_save_and_close
          399  +} {}
          400  +do_faultsim_test walfault-11 -faults shmerr* -prep {
          401  +  catch { db2 close }
          402  +  faultsim_restore_and_reopen
          403  +  shmfault filter xShmPage
          404  +} -body {
          405  +  db eval { SELECT count(*) FROM abc }
          406  +  sqlite3 db2 test.db -vfs shmfault
          407  +  db2 eval { PRAGMA wal_checkpoint }
          408  +} -test {
          409  +  faultsim_test_result {0 {}}
          410  +}
          411  +
          412  +#-------------------------------------------------------------------------
          413  +# Test the handling of the various IO/OOM/SHM errors that may occur during 
          414  +# a log recovery operation undertaken as part of a call to 
          415  +# sqlite3_wal_checkpoint().
          416  +# 
          417  +do_test walfault-12-pre-1 {
          418  +  faultsim_delete_and_reopen
          419  +  execsql {
          420  +    PRAGMA journal_mode = WAL;
          421  +    PRAGMA wal_autocheckpoint = 0;
          422  +    BEGIN;
          423  +      CREATE TABLE abc(a PRIMARY KEY);
          424  +      INSERT INTO abc VALUES(randomblob(1500));
          425  +      INSERT INTO abc VALUES(randomblob(1500));
          426  +    COMMIT;
          427  +  }
          428  +  faultsim_save_and_close
          429  +} {}
          430  +do_faultsim_test walfault-12 -prep {
          431  +  if {[info commands shmfault] == ""} {
          432  +    testvfs shmfault -default true
          433  +  }
          434  +  faultsim_restore_and_reopen
          435  +  db eval { SELECT * FROM sqlite_master }
          436  +  shmfault shm test.db [string repeat "\000" 40]
          437  +} -body {
          438  +  set rc [sqlite3_wal_checkpoint db]
          439  +  if {$rc != "SQLITE_OK"} { error [sqlite3_errmsg db] }
          440  +} -test {
          441  +  db close
          442  +  faultsim_test_result {0 {}}
          443  +}
          444  +
   365    445   
   366    446   finish_test
   367    447