SQLite

Check-in [d190ddabc3]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Allow the database file to be memory mapped in wal mode.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | experimental-mmap
Files: files | file ages | folders
SHA1: d190ddabc386bc9654b99e33fb81b2f6e67b54d6
User & Date: dan 2013-03-22 18:20:14.143
Context
2013-03-22
19:17
Fix a case where the mapped part of a database file may be written during a backup operation. (check-in: c8d67aefff user: dan tags: experimental-mmap)
18:20
Allow the database file to be memory mapped in wal mode. (check-in: d190ddabc3 user: dan tags: experimental-mmap)
17:46
Add a fix for the assert() statements added by the previous commit. (check-in: 19345416ed user: dan tags: experimental-mmap)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/pager.c.
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
** Write nBuf bytes of data from buffer pBuf to offset iOff of the 
** database file. If this part of the database file is memory mapped,
** use memcpy() to do so. Otherwise, call sqlite3OsWrite().
**
** Return SQLITE_OK if successful, or an SQLite error code if an error 
** occurs.
*/
static int pagerWriteData(Pager *pPager, const void *pBuf, int nBuf, i64 iOff){
  int rc = SQLITE_OK;
  if( pPager->nMapValid>=(iOff+nBuf) ){
    memcpy(&((u8 *)(pPager->pMap))[iOff], pBuf, nBuf);
  }else{
    rc = sqlite3OsWrite(pPager->fd, pBuf, nBuf, iOff);
  }
  return rc;







|







2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
** Write nBuf bytes of data from buffer pBuf to offset iOff of the 
** database file. If this part of the database file is memory mapped,
** use memcpy() to do so. Otherwise, call sqlite3OsWrite().
**
** Return SQLITE_OK if successful, or an SQLite error code if an error 
** occurs.
*/
int sqlite3PagerWriteData(Pager *pPager, const void *pBuf, int nBuf, i64 iOff){
  int rc = SQLITE_OK;
  if( pPager->nMapValid>=(iOff+nBuf) ){
    memcpy(&((u8 *)(pPager->pMap))[iOff], pBuf, nBuf);
  }else{
    rc = sqlite3OsWrite(pPager->fd, pBuf, nBuf, iOff);
  }
  return rc;
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
  if( isOpen(pPager->fd)
   && (pPager->eState>=PAGER_WRITER_DBMOD || pPager->eState==PAGER_OPEN)
   && isSynced
  ){
    i64 ofst = (pgno-1)*(i64)pPager->pageSize;
    testcase( !isSavepnt && pPg!=0 && (pPg->flags&PGHDR_NEED_SYNC)!=0 );
    assert( !pagerUseWal(pPager) );
    rc = pagerWriteData(pPager, aData, pPager->pageSize, ofst);
    if( pgno>pPager->dbFileSize ){
      pPager->dbFileSize = pgno;
    }
    if( pPager->pBackup ){
      CODEC1(pPager, aData, pgno, 3, rc=SQLITE_NOMEM);
      sqlite3BackupUpdate(pPager->pBackup, pgno, (u8*)aData);
      CODEC2(pPager, aData, pgno, 7, rc=SQLITE_NOMEM, aData);







|







2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
  if( isOpen(pPager->fd)
   && (pPager->eState>=PAGER_WRITER_DBMOD || pPager->eState==PAGER_OPEN)
   && isSynced
  ){
    i64 ofst = (pgno-1)*(i64)pPager->pageSize;
    testcase( !isSavepnt && pPg!=0 && (pPg->flags&PGHDR_NEED_SYNC)!=0 );
    assert( !pagerUseWal(pPager) );
    rc = sqlite3PagerWriteData(pPager, aData, pPager->pageSize, ofst);
    if( pgno>pPager->dbFileSize ){
      pPager->dbFileSize = pgno;
    }
    if( pPager->pBackup ){
      CODEC1(pPager, aData, pgno, 3, rc=SQLITE_NOMEM);
      sqlite3BackupUpdate(pPager->pBackup, pgno, (u8*)aData);
      CODEC2(pPager, aData, pgno, 7, rc=SQLITE_NOMEM, aData);
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
**
** If page 1 is read, then the value of Pager.dbFileVers[] is set to
** the value read from the database file.
**
** If an IO error occurs, then the IO error is returned to the caller.
** Otherwise, SQLITE_OK is returned.
*/
static int readDbPage(PgHdr *pPg){
  Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */
  Pgno pgno = pPg->pgno;       /* Page number to read */
  int rc = SQLITE_OK;          /* Return code */
  int isInWal = 0;             /* True if page is in log file */
  int pgsz = pPager->pageSize; /* Number of bytes to read */

  assert( pPager->eState>=PAGER_READER && !MEMDB );
  assert( isOpen(pPager->fd) );

  if( NEVER(!isOpen(pPager->fd)) ){
    assert( pPager->tempFile );
    memset(pPg->pData, 0, pPager->pageSize);
    return SQLITE_OK;
  }

  if( pagerUseWal(pPager) ){
    /* Try to pull the page from the write-ahead log. */
    rc = sqlite3WalRead(pPager->pWal, pgno, &isInWal, pgsz, pPg->pData);
  }
  if( rc==SQLITE_OK && !isInWal ){
    i64 iOffset = (pgno-1)*(i64)pPager->pageSize;
    if( pPager->pMap && pPager->nMapValid>=iOffset+pPager->pageSize ){
      memcpy(pPg->pData, &((u8 *)(pPager->pMap))[iOffset], pPager->pageSize);
    }else{
      rc = sqlite3OsRead(pPager->fd, pPg->pData, pgsz, iOffset);
      if( rc==SQLITE_IOERR_SHORT_READ ){
        rc = SQLITE_OK;







|



<











|

|
|
<







2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870

2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885

2886
2887
2888
2889
2890
2891
2892
**
** If page 1 is read, then the value of Pager.dbFileVers[] is set to
** the value read from the database file.
**
** If an IO error occurs, then the IO error is returned to the caller.
** Otherwise, SQLITE_OK is returned.
*/
static int readDbPage(PgHdr *pPg, u32 iFrame){
  Pager *pPager = pPg->pPager; /* Pager object associated with page pPg */
  Pgno pgno = pPg->pgno;       /* Page number to read */
  int rc = SQLITE_OK;          /* Return code */

  int pgsz = pPager->pageSize; /* Number of bytes to read */

  assert( pPager->eState>=PAGER_READER && !MEMDB );
  assert( isOpen(pPager->fd) );

  if( NEVER(!isOpen(pPager->fd)) ){
    assert( pPager->tempFile );
    memset(pPg->pData, 0, pPager->pageSize);
    return SQLITE_OK;
  }

  if( iFrame ){
    /* Try to pull the page from the write-ahead log. */
    rc = sqlite3WalReadFrame(pPager->pWal, iFrame, pgsz, pPg->pData);
  }else{

    i64 iOffset = (pgno-1)*(i64)pPager->pageSize;
    if( pPager->pMap && pPager->nMapValid>=iOffset+pPager->pageSize ){
      memcpy(pPg->pData, &((u8 *)(pPager->pMap))[iOffset], pPager->pageSize);
    }else{
      rc = sqlite3OsRead(pPager->fd, pPg->pData, pgsz, iOffset);
      if( rc==SQLITE_IOERR_SHORT_READ ){
        rc = SQLITE_OK;
2968
2969
2970
2971
2972
2973
2974





2975

2976
2977
2978
2979
2980
2981
2982
  PgHdr *pPg;

  pPg = sqlite3PagerLookup(pPager, iPg);
  if( pPg ){
    if( sqlite3PcachePageRefcount(pPg)==1 ){
      sqlite3PcacheDrop(pPg);
    }else{





      rc = readDbPage(pPg);

      if( rc==SQLITE_OK ){
        pPager->xReiniter(pPg);
      }
      sqlite3PagerUnref(pPg);
    }
  }








>
>
>
>
>
|
>







2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
  PgHdr *pPg;

  pPg = sqlite3PagerLookup(pPager, iPg);
  if( pPg ){
    if( sqlite3PcachePageRefcount(pPg)==1 ){
      sqlite3PcacheDrop(pPg);
    }else{
      u32 iFrame = 0;
      if( pagerUseWal(pPager) ){
        rc = sqlite3WalFindFrame(pPager->pWal, pPg->pgno, &iFrame);
      }
      if( rc==SQLITE_OK ){
        rc = readDbPage(pPg, iFrame);
      }
      if( rc==SQLITE_OK ){
        pPager->xReiniter(pPg);
      }
      sqlite3PagerUnref(pPg);
    }
  }

3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
** If this connection should not use mmap at all, set nMapLimit to zero.
*/
static void pagerFixMaplimit(Pager *pPager){
  if( isOpen(pPager->fd)==0 
   || pPager->fd->pMethods->iVersion<3 
   || pPager->fd->pMethods->xMremap==0 
   || pPager->tempFile 
   || pPager->pWal 
  ){
    pPager->nMapLimit = 0;
  }else if( pPager->nMapCfgLimit<0 ){
    pPager->nMapLimit = (i64)pPager->nMapCfgLimit * -1024;
  }else{
    pPager->nMapLimit = (i64)pPager->nMapCfgLimit * pPager->pageSize;
  }







<







3388
3389
3390
3391
3392
3393
3394

3395
3396
3397
3398
3399
3400
3401
** If this connection should not use mmap at all, set nMapLimit to zero.
*/
static void pagerFixMaplimit(Pager *pPager){
  if( isOpen(pPager->fd)==0 
   || pPager->fd->pMethods->iVersion<3 
   || pPager->fd->pMethods->xMremap==0 
   || pPager->tempFile 

  ){
    pPager->nMapLimit = 0;
  }else if( pPager->nMapCfgLimit<0 ){
    pPager->nMapLimit = (i64)pPager->nMapCfgLimit * -1024;
  }else{
    pPager->nMapLimit = (i64)pPager->nMapCfgLimit * pPager->pageSize;
  }
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
** Create, or recreate, the memory mapping of the database file.
*/
static int pagerMap(Pager *pPager, int bExtend){
  int rc = SQLITE_OK;             /* Return code */
  Pgno nPg;                       /* Size of mapping to request in pages */
  i64 sz;                         /* Size of mapping to request in bytes */

  assert( pPager->pWal==0 && isOpen(pPager->fd) && pPager->tempFile==0 );
  assert( pPager->pMap==0 || pPager->nMap>0 );
  assert( pPager->eState>=1 );
  assert( pPager->nMmapOut==0 );
  assert( pPager->nMapLimit>0 );

  /* Figure out how large a mapping to request. Set variable sz to this 
  ** value in bytes. */
  nPg = (pPager->eState==1) ? pPager->dbSize : pPager->dbFileSize;
  sz = (i64)nPg * pPager->pageSize;







|

|







3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
** Create, or recreate, the memory mapping of the database file.
*/
static int pagerMap(Pager *pPager, int bExtend){
  int rc = SQLITE_OK;             /* Return code */
  Pgno nPg;                       /* Size of mapping to request in pages */
  i64 sz;                         /* Size of mapping to request in bytes */

  assert( isOpen(pPager->fd) && pPager->tempFile==0 );
  assert( pPager->pMap==0 || pPager->nMap>0 );
  /* assert( pPager->eState>=1 ); */
  assert( pPager->nMmapOut==0 );
  assert( pPager->nMapLimit>0 );

  /* Figure out how large a mapping to request. Set variable sz to this 
  ** value in bytes. */
  nPg = (pPager->eState==1) ? pPager->dbSize : pPager->dbFileSize;
  sz = (i64)nPg * pPager->pageSize;
4208
4209
4210
4211
4212
4213
4214








































4215
4216
4217
4218
4219
4220
4221
  ** all pages.
  */
  sqlite3PcacheClearSyncFlags(pPager->pPCache);
  pPager->eState = PAGER_WRITER_DBMOD;
  assert( assert_pager_state(pPager) );
  return SQLITE_OK;
}









































/*
** The argument is the first in a linked list of dirty pages connected
** by the PgHdr.pDirty pointer. This function writes each one of the
** in-memory pages in the list to the database file. The argument may
** be NULL, representing an empty list. In this case this function is
** a no-op.







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







4211
4212
4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
4227
4228
4229
4230
4231
4232
4233
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244
4245
4246
4247
4248
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
  ** all pages.
  */
  sqlite3PcacheClearSyncFlags(pPager->pPCache);
  pPager->eState = PAGER_WRITER_DBMOD;
  assert( assert_pager_state(pPager) );
  return SQLITE_OK;
}

/*
** This is called by the wal.c module at the start of a checkpoint. If the
** checkpoint runs to completion, it will set the database file size to
** szReq bytes. This function performs two tasks:
**
**   * If the file is currently less than szReq bytes in size, an
**     xFileControl(SQLITE_FNCTL_SIZE_HINT) is issued to inform the OS
**     layer of the expected file size, and
**
**   * If mmap is being used, then the mapping is extended to szReq
**     bytes in size.
**
** SQLITE_OK is returned if successful, or an error code if an error occurs.
*/
int sqlite3PagerSetFilesize(Pager *pPager, i64 szReq){
  int rc;
  sqlite3_int64 sz;

  assert( pPager->eState==PAGER_OPEN );
  assert( pPager->nMmapOut==0 );

  rc = sqlite3OsFileSize(pPager->fd, &sz);
  if( rc==SQLITE_OK ){
    if( sz>szReq ){
      sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_SIZE_HINT, &sz);
    }
  }

  if( rc==SQLITE_OK 
   && pPager->nMapLimit>0 
   && pPager->nMapValid<szReq 
   && pPager->nMapValid<pPager->nMapLimit 
  ){
    pPager->dbFileSize = (szReq / pPager->pageSize);
    rc = pagerMap(pPager, 1);
  }

  return rc;
}

/*
** The argument is the first in a linked list of dirty pages connected
** by the PgHdr.pDirty pointer. This function writes each one of the
** in-memory pages in the list to the database file. The argument may
** be NULL, representing an empty list. In this case this function is
** a no-op.
4271
4272
4273
4274
4275
4276
4277
4278
4279
4280
4281
4282
4283
4284
4285
  ){
    sqlite3_int64 szFile = pPager->pageSize * (sqlite3_int64)pPager->dbSize;
    sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_SIZE_HINT, &szFile);
    pPager->dbHintSize = pPager->dbSize;

    if( pPager->nMmapOut==0 && pPager->nMapLimit>0 ){
      pPager->dbFileSize = pPager->dbSize;
      pagerMap(pPager, 1);
    }
  }

  while( rc==SQLITE_OK && pList ){
    Pgno pgno = pList->pgno;

    /* If there are dirty pages in the page cache with page numbers greater







|







4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
  ){
    sqlite3_int64 szFile = pPager->pageSize * (sqlite3_int64)pPager->dbSize;
    sqlite3OsFileControlHint(pPager->fd, SQLITE_FCNTL_SIZE_HINT, &szFile);
    pPager->dbHintSize = pPager->dbSize;

    if( pPager->nMmapOut==0 && pPager->nMapLimit>0 ){
      pPager->dbFileSize = pPager->dbSize;
      rc = pagerMap(pPager, 1);
    }
  }

  while( rc==SQLITE_OK && pList ){
    Pgno pgno = pList->pgno;

    /* If there are dirty pages in the page cache with page numbers greater
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
      assert( (pList->flags&PGHDR_NEED_SYNC)==0 );
      if( pList->pgno==1 ) pager_write_changecounter(pList);

      /* Encode the database */
      CODEC2(pPager, pList->pData, pgno, 6, return SQLITE_NOMEM, pData);

      /* Write out the page data. */
      pagerWriteData(pPager, pData, pPager->pageSize, offset);

      /* If page 1 was just written, update Pager.dbFileVers to match
      ** the value now stored in the database file. If writing this 
      ** page caused the database file to grow, update dbFileSize. 
      */
      if( pgno==1 ){
        memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));







|







4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
      assert( (pList->flags&PGHDR_NEED_SYNC)==0 );
      if( pList->pgno==1 ) pager_write_changecounter(pList);

      /* Encode the database */
      CODEC2(pPager, pList->pData, pgno, 6, return SQLITE_NOMEM, pData);

      /* Write out the page data. */
      rc = sqlite3PagerWriteData(pPager, pData, pPager->pageSize, offset);

      /* If page 1 was just written, update Pager.dbFileVers to match
      ** the value now stored in the database file. If writing this 
      ** page caused the database file to grow, update dbFileSize. 
      */
      if( pgno==1 ){
        memcpy(&pPager->dbFileVers, &pData[24], sizeof(pPager->dbFileVers));
5271
5272
5273
5274
5275
5276
5277

5278
5279
5280
5281
5282
5283
5284
5285
5286
5287
5288
5289

5290
5291
5292
5293
5294
5295
5296
5297
5298
5299
5300
5301





5302
5303
5304
5305
5306
5307
5308
  Pager *pPager,      /* The pager open on the database file */
  Pgno pgno,          /* Page number to fetch */
  DbPage **ppPage,    /* Write a pointer to the page here */
  int flags           /* PAGER_ACQUIRE_XXX flags */
){
  int rc = SQLITE_OK;
  PgHdr *pPg = 0;

  const int noContent = (flags & PAGER_ACQUIRE_NOCONTENT);

  /* It is acceptable to use a read-only (mmap) page for any page except
  ** page 1 if there is no write-transaction open or the ACQUIRE_READONLY
  ** flag was specified by the caller. And so long as the db is not a 
  ** temporary or in-memory database.  */
  const int bMmapOk = (pPager->nMapLimit>0 && pgno!=1
   && (pPager->eState==PAGER_READER || (flags & PAGER_ACQUIRE_READONLY))
  );

  assert( pPager->eState>=PAGER_READER );
  assert( assert_pager_state(pPager) );


  if( pgno==0 ){
    return SQLITE_CORRUPT_BKPT;
  }

  /* If the pager is in the error state, return an error immediately. 
  ** Otherwise, request the page from the PCache layer. */
  if( pPager->errCode!=SQLITE_OK ){
    rc = pPager->errCode;
  }else{

    if( bMmapOk ){





      if( pPager->pMap==0 || (pPager->bMapResize && pPager->nMmapOut==0) ){
        rc = pagerMap(pPager, 0);
      }
      if( rc==SQLITE_OK && pPager->nMap>=((i64)pgno * pPager->pageSize) ){
        if( pPager->eState>PAGER_READER ){
          (void)sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &pPg);
        }







>












>











|
>
>
>
>
>







5314
5315
5316
5317
5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
  Pager *pPager,      /* The pager open on the database file */
  Pgno pgno,          /* Page number to fetch */
  DbPage **ppPage,    /* Write a pointer to the page here */
  int flags           /* PAGER_ACQUIRE_XXX flags */
){
  int rc = SQLITE_OK;
  PgHdr *pPg = 0;
  u32 iFrame = 0;                 /* Frame to read from WAL file */
  const int noContent = (flags & PAGER_ACQUIRE_NOCONTENT);

  /* It is acceptable to use a read-only (mmap) page for any page except
  ** page 1 if there is no write-transaction open or the ACQUIRE_READONLY
  ** flag was specified by the caller. And so long as the db is not a 
  ** temporary or in-memory database.  */
  const int bMmapOk = (pPager->nMapLimit>0 && pgno!=1
   && (pPager->eState==PAGER_READER || (flags & PAGER_ACQUIRE_READONLY))
  );

  assert( pPager->eState>=PAGER_READER );
  assert( assert_pager_state(pPager) );
  assert( noContent==0 || bMmapOk==0 );

  if( pgno==0 ){
    return SQLITE_CORRUPT_BKPT;
  }

  /* If the pager is in the error state, return an error immediately. 
  ** Otherwise, request the page from the PCache layer. */
  if( pPager->errCode!=SQLITE_OK ){
    rc = pPager->errCode;
  }else{

    if( bMmapOk && pagerUseWal(pPager) ){
      rc = sqlite3WalFindFrame(pPager->pWal, pgno, &iFrame);
      if( rc!=SQLITE_OK ) goto pager_acquire_err;
    }

    if( iFrame==0 && bMmapOk ){
      if( pPager->pMap==0 || (pPager->bMapResize && pPager->nMmapOut==0) ){
        rc = pagerMap(pPager, 0);
      }
      if( rc==SQLITE_OK && pPager->nMap>=((i64)pgno * pPager->pageSize) ){
        if( pPager->eState>PAGER_READER ){
          (void)sqlite3PcacheFetch(pPager->pPCache, pgno, 0, &pPg);
        }
5374
5375
5376
5377
5378
5379
5380




5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
        TESTONLY( rc = ) addToSavepointBitvecs(pPager, pgno);
        testcase( rc==SQLITE_NOMEM );
        sqlite3EndBenignMalloc();
      }
      memset(pPg->pData, 0, pPager->pageSize);
      IOTRACE(("ZERO %p %d\n", pPager, pgno));
    }else{




      assert( pPg->pPager==pPager );
      pPager->aStat[PAGER_STAT_MISS]++;
      rc = readDbPage(pPg);
      if( rc!=SQLITE_OK ){
        goto pager_acquire_err;
      }
    }
    pager_set_pagehash(pPg);
  }








>
>
>
>


|







5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
        TESTONLY( rc = ) addToSavepointBitvecs(pPager, pgno);
        testcase( rc==SQLITE_NOMEM );
        sqlite3EndBenignMalloc();
      }
      memset(pPg->pData, 0, pPager->pageSize);
      IOTRACE(("ZERO %p %d\n", pPager, pgno));
    }else{
      if( pagerUseWal(pPager) && bMmapOk==0 ){
        rc = sqlite3WalFindFrame(pPager->pWal, pgno, &iFrame);
        if( rc!=SQLITE_OK ) goto pager_acquire_err;
      }
      assert( pPg->pPager==pPager );
      pPager->aStat[PAGER_STAT_MISS]++;
      rc = readDbPage(pPg, iFrame);
      if( rc!=SQLITE_OK ){
        goto pager_acquire_err;
      }
    }
    pager_set_pagehash(pPg);
  }

7059
7060
7061
7062
7063
7064
7065
7066
7067
7068
7069
7070
7071
7072
7073
    rc = pagerExclusiveLock(pPager);
  }

  /* Open the connection to the log file. If this operation fails, 
  ** (e.g. due to malloc() failure), return an error code.
  */
  if( rc==SQLITE_OK ){
    rc = sqlite3WalOpen(pPager->pVfs, 
        pPager->fd, pPager->zWal, pPager->exclusiveMode,
        pPager->journalSizeLimit, &pPager->pWal
    );
  }
  pagerFixMaplimit(pPager);

  return rc;







|







7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
    rc = pagerExclusiveLock(pPager);
  }

  /* Open the connection to the log file. If this operation fails, 
  ** (e.g. due to malloc() failure), return an error code.
  */
  if( rc==SQLITE_OK ){
    rc = sqlite3WalOpen(pPager->pVfs, pPager,
        pPager->fd, pPager->zWal, pPager->exclusiveMode,
        pPager->journalSizeLimit, &pPager->pWal
    );
  }
  pagerFixMaplimit(pPager);

  return rc;
Changes to src/pager.h.
170
171
172
173
174
175
176




177
178
179
180
181
182
183
int sqlite3PagerIsMemdb(Pager*);
void sqlite3PagerCacheStat(Pager *, int, int, int *);
void sqlite3PagerClearCache(Pager *);
int sqlite3SectorSize(sqlite3_file *);

/* Functions used to truncate the database file. */
void sqlite3PagerTruncateImage(Pager*,Pgno);





#if defined(SQLITE_HAS_CODEC) && !defined(SQLITE_OMIT_WAL)
void *sqlite3PagerCodec(DbPage *);
#endif

/* Functions to support testing and debugging. */
#if !defined(NDEBUG) || defined(SQLITE_TEST)







>
>
>
>







170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
int sqlite3PagerIsMemdb(Pager*);
void sqlite3PagerCacheStat(Pager *, int, int, int *);
void sqlite3PagerClearCache(Pager *);
int sqlite3SectorSize(sqlite3_file *);

/* Functions used to truncate the database file. */
void sqlite3PagerTruncateImage(Pager*,Pgno);
int sqlite3PagerSetFilesize(Pager *, i64);

/* Write data to the database file */
int sqlite3PagerWriteData(Pager *pPager, const void *pBuf, int nBuf, i64 iOff);

#if defined(SQLITE_HAS_CODEC) && !defined(SQLITE_OMIT_WAL)
void *sqlite3PagerCodec(DbPage *);
#endif

/* Functions to support testing and debugging. */
#if !defined(NDEBUG) || defined(SQLITE_TEST)
Changes to src/wal.c.
408
409
410
411
412
413
414

415
416
417
418
419
420
421
** An open write-ahead log file is represented by an instance of the
** following object.
*/
struct Wal {
  sqlite3_vfs *pVfs;         /* The VFS used to create pDbFd */
  sqlite3_file *pDbFd;       /* File handle for the database file */
  sqlite3_file *pWalFd;      /* File handle for WAL file */

  u32 iCallback;             /* Value to pass to log callback (or 0) */
  i64 mxWalSize;             /* Truncate WAL to this size upon reset */
  int nWiData;               /* Size of array apWiData */
  int szFirstBlock;          /* Size of first block written to WAL file */
  volatile u32 **apWiData;   /* Pointer to wal-index content in memory */
  u32 szPage;                /* Database page size */
  i16 readLock;              /* Which read lock is being held.  -1 for none */







>







408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
** An open write-ahead log file is represented by an instance of the
** following object.
*/
struct Wal {
  sqlite3_vfs *pVfs;         /* The VFS used to create pDbFd */
  sqlite3_file *pDbFd;       /* File handle for the database file */
  sqlite3_file *pWalFd;      /* File handle for WAL file */
  Pager *pPager;             /* Pager object */
  u32 iCallback;             /* Value to pass to log callback (or 0) */
  i64 mxWalSize;             /* Truncate WAL to this size upon reset */
  int nWiData;               /* Size of array apWiData */
  int szFirstBlock;          /* Size of first block written to WAL file */
  volatile u32 **apWiData;   /* Pointer to wal-index content in memory */
  u32 szPage;                /* Database page size */
  i16 readLock;              /* Which read lock is being held.  -1 for none */
1247
1248
1249
1250
1251
1252
1253

1254
1255
1256
1257
1258
1259
1260
**
** If the log file is successfully opened, SQLITE_OK is returned and 
** *ppWal is set to point to a new WAL handle. If an error occurs,
** an SQLite error code is returned and *ppWal is left unmodified.
*/
int sqlite3WalOpen(
  sqlite3_vfs *pVfs,              /* vfs module to open wal and wal-index */

  sqlite3_file *pDbFd,            /* The open database file */
  const char *zWalName,           /* Name of the WAL file */
  int bNoShm,                     /* True to run in heap-memory mode */
  i64 mxWalSize,                  /* Truncate WAL to this size on reset */
  Wal **ppWal                     /* OUT: Allocated Wal handle */
){
  int rc;                         /* Return Code */







>







1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
**
** If the log file is successfully opened, SQLITE_OK is returned and 
** *ppWal is set to point to a new WAL handle. If an error occurs,
** an SQLite error code is returned and *ppWal is left unmodified.
*/
int sqlite3WalOpen(
  sqlite3_vfs *pVfs,              /* vfs module to open wal and wal-index */
  Pager *pPager,                  /* Pager object handle */
  sqlite3_file *pDbFd,            /* The open database file */
  const char *zWalName,           /* Name of the WAL file */
  int bNoShm,                     /* True to run in heap-memory mode */
  i64 mxWalSize,                  /* Truncate WAL to this size on reset */
  Wal **ppWal                     /* OUT: Allocated Wal handle */
){
  int rc;                         /* Return Code */
1287
1288
1289
1290
1291
1292
1293

1294
1295
1296
1297
1298
1299
1300
  pRet->pWalFd = (sqlite3_file *)&pRet[1];
  pRet->pDbFd = pDbFd;
  pRet->readLock = -1;
  pRet->mxWalSize = mxWalSize;
  pRet->zWalName = zWalName;
  pRet->syncHeader = 1;
  pRet->padToSectorBoundary = 1;

  pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);

  /* Open file handle on the write-ahead log file. */
  flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
  rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
  if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
    pRet->readOnly = WAL_RDONLY;







>







1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
  pRet->pWalFd = (sqlite3_file *)&pRet[1];
  pRet->pDbFd = pDbFd;
  pRet->readLock = -1;
  pRet->mxWalSize = mxWalSize;
  pRet->zWalName = zWalName;
  pRet->syncHeader = 1;
  pRet->padToSectorBoundary = 1;
  pRet->pPager = pPager;
  pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);

  /* Open file handle on the write-ahead log file. */
  flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
  rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
  if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
    pRet->readOnly = WAL_RDONLY;
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
    }

    /* If the database file may grow as a result of this checkpoint, hint
    ** about the eventual size of the db file to the VFS layer. 
    */
    if( rc==SQLITE_OK ){
      i64 nReq = ((i64)mxPage * szPage);
      rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
      if( rc==SQLITE_OK && nSize<nReq ){
        sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT, &nReq);
      }
    }

    /* Iterate through the contents of the WAL, copying data to the db file. */
    while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
      i64 iOffset;
      assert( walFramePgno(pWal, iFrame)==iDbpage );
      if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ) continue;
      iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
      /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
      rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
      if( rc!=SQLITE_OK ) break;
      iOffset = (iDbpage-1)*(i64)szPage;
      testcase( IS_BIG_INT(iOffset) );
      rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);
      if( rc!=SQLITE_OK ) break;
    }

    /* If work was actually accomplished... */
    if( rc==SQLITE_OK ){
      if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
        i64 szDb = pWal->hdr.nPage*(i64)szPage;







|
<
<
<













|







1726
1727
1728
1729
1730
1731
1732
1733



1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
    }

    /* If the database file may grow as a result of this checkpoint, hint
    ** about the eventual size of the db file to the VFS layer. 
    */
    if( rc==SQLITE_OK ){
      i64 nReq = ((i64)mxPage * szPage);
      rc = sqlite3PagerSetFilesize(pWal->pPager, nReq); 



    }

    /* Iterate through the contents of the WAL, copying data to the db file. */
    while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
      i64 iOffset;
      assert( walFramePgno(pWal, iFrame)==iDbpage );
      if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ) continue;
      iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
      /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
      rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
      if( rc!=SQLITE_OK ) break;
      iOffset = (iDbpage-1)*(i64)szPage;
      testcase( IS_BIG_INT(iOffset) );
      rc = sqlite3PagerWriteData(pWal->pPager, zBuf, szPage, iOffset);
      if( rc!=SQLITE_OK ) break;
    }

    /* If work was actually accomplished... */
    if( rc==SQLITE_OK ){
      if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
        i64 szDb = pWal->hdr.nPage*(i64)szPage;
2283
2284
2285
2286
2287
2288
2289
2290

2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
  if( pWal->readLock>=0 ){
    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
    pWal->readLock = -1;
  }
}

/*
** Read a page from the WAL, if it is present in the WAL and if the 

** current read transaction is configured to use the WAL.  
**
** The *pInWal is set to 1 if the requested page is in the WAL and
** has been loaded.  Or *pInWal is set to 0 if the page was not in 
** the WAL and needs to be read out of the database.
*/
int sqlite3WalRead(
  Wal *pWal,                      /* WAL handle */
  Pgno pgno,                      /* Database page number to read data for */
  int *pInWal,                    /* OUT: True if data is read from WAL */
  int nOut,                       /* Size of buffer pOut in bytes */
  u8 *pOut                        /* Buffer to write page data to */
){
  u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
  u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
  int iHash;                      /* Used to loop through N hash tables */

  /* This routine is only be called from within a read transaction. */
  assert( pWal->readLock>=0 || pWal->lockError );

  /* If the "last page" field of the wal-index header snapshot is 0, then
  ** no data will be read from the wal under any circumstances. Return early
  ** in this case as an optimization.  Likewise, if pWal->readLock==0, 
  ** then the WAL is ignored by the reader so return early, as if the 
  ** WAL were empty.
  */
  if( iLast==0 || pWal->readLock==0 ){
    *pInWal = 0;
    return SQLITE_OK;
  }

  /* Search the hash table or tables for an entry matching page number
  ** pgno. Each iteration of the following for() loop searches one
  ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
  **







|
>
|

<
|
|

|


|
<
<















|







2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293

2294
2295
2296
2297
2298
2299
2300


2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
  if( pWal->readLock>=0 ){
    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
    pWal->readLock = -1;
  }
}

/*
** Search the wal file for page pgno. If found, set *piRead to the frame that
** contains the page. Otherwise, if pgno is not in the wal file, set *piRead
** to zero.
**

** Return SQLITE_OK if successful, or an error code if an error occurs. If an
** error does occur, the final value of *piRead is undefined.
*/
int sqlite3WalFindFrame(
  Wal *pWal,                      /* WAL handle */
  Pgno pgno,                      /* Database page number to read data for */
  u32 *piRead                     /* OUT: Frame number (or zero) */


){
  u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
  u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
  int iHash;                      /* Used to loop through N hash tables */

  /* This routine is only be called from within a read transaction. */
  assert( pWal->readLock>=0 || pWal->lockError );

  /* If the "last page" field of the wal-index header snapshot is 0, then
  ** no data will be read from the wal under any circumstances. Return early
  ** in this case as an optimization.  Likewise, if pWal->readLock==0, 
  ** then the WAL is ignored by the reader so return early, as if the 
  ** WAL were empty.
  */
  if( iLast==0 || pWal->readLock==0 ){
    *piRead = 0;
    return SQLITE_OK;
  }

  /* Search the hash table or tables for an entry matching page number
  ** pgno. Each iteration of the following for() loop searches one
  ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
  **
2382
2383
2384
2385
2386
2387
2388
2389


2390




2391





2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
        break;
      }
    }
    assert( iRead==iRead2 );
  }
#endif

  /* If iRead is non-zero, then it is the log frame number that contains the


  ** required page. Read and return data from the log file.




  */





  if( iRead ){
    int sz;
    i64 iOffset;
    sz = pWal->hdr.szPage;
    sz = (sz&0xfe00) + ((sz&0x0001)<<16);
    testcase( sz<=32768 );
    testcase( sz>=65536 );
    iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;
    *pInWal = 1;
    /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
    return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);
  }

  *pInWal = 0;
  return SQLITE_OK;
}


/* 
** Return the size of the database in pages (or zero, if unknown).
*/
Pgno sqlite3WalDbsize(Wal *pWal){
  if( pWal && ALWAYS(pWal->readLock>=0) ){
    return pWal->hdr.nPage;







|
>
>
|
>
>
>
>
|
>
>
>
>
>
|
|
|
|
|
|
|
|
<
|
|
|
<
<
<
<
<







2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408

2409
2410
2411





2412
2413
2414
2415
2416
2417
2418
        break;
      }
    }
    assert( iRead==iRead2 );
  }
#endif

  *piRead = iRead;
  return SQLITE_OK;
}

/*
** Read the contents of frame iRead from the wal file into buffer pOut
** (which is nOut bytes in size). Return SQLITE_OK if successful, or an
** error code otherwise.
*/
int sqlite3WalReadFrame(
  Wal *pWal,                      /* WAL handle */
  u32 iRead,                      /* Frame to read */
  int nOut,                       /* Size of buffer pOut in bytes */
  u8 *pOut                        /* Buffer to write page data to */
){
  int sz;
  i64 iOffset;
  sz = pWal->hdr.szPage;
  sz = (sz&0xfe00) + ((sz&0x0001)<<16);
  testcase( sz<=32768 );
  testcase( sz>=65536 );
  iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;

  /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
  return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);
}






/* 
** Return the size of the database in pages (or zero, if unknown).
*/
Pgno sqlite3WalDbsize(Wal *pWal){
  if( pWal && ALWAYS(pWal->readLock>=0) ){
    return pWal->hdr.nPage;
Changes to src/wal.h.
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

#ifdef SQLITE_OMIT_WAL
# define sqlite3WalOpen(x,y,z)                   0
# define sqlite3WalLimit(x,y)
# define sqlite3WalClose(w,x,y,z)                0
# define sqlite3WalBeginReadTransaction(y,z)     0
# define sqlite3WalEndReadTransaction(z)
# define sqlite3WalRead(v,w,x,y,z)               0
# define sqlite3WalDbsize(y)                     0
# define sqlite3WalBeginWriteTransaction(y)      0
# define sqlite3WalEndWriteTransaction(x)        0
# define sqlite3WalUndo(x,y,z)                   0
# define sqlite3WalSavepoint(y,z)
# define sqlite3WalSavepointUndo(y,z)            0
# define sqlite3WalFrames(u,v,w,x,y,z)           0







<







27
28
29
30
31
32
33

34
35
36
37
38
39
40

#ifdef SQLITE_OMIT_WAL
# define sqlite3WalOpen(x,y,z)                   0
# define sqlite3WalLimit(x,y)
# define sqlite3WalClose(w,x,y,z)                0
# define sqlite3WalBeginReadTransaction(y,z)     0
# define sqlite3WalEndReadTransaction(z)

# define sqlite3WalDbsize(y)                     0
# define sqlite3WalBeginWriteTransaction(y)      0
# define sqlite3WalEndWriteTransaction(x)        0
# define sqlite3WalUndo(x,y,z)                   0
# define sqlite3WalSavepoint(y,z)
# define sqlite3WalSavepointUndo(y,z)            0
# define sqlite3WalFrames(u,v,w,x,y,z)           0
50
51
52
53
54
55
56
57

58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

74
75
76
77
78
79
80
81

/* Connection to a write-ahead log (WAL) file. 
** There is one object of this type for each pager. 
*/
typedef struct Wal Wal;

/* Open and close a connection to a write-ahead log. */
int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *, int, i64, Wal**);

int sqlite3WalClose(Wal *pWal, int sync_flags, int, u8 *);

/* Set the limiting size of a WAL file. */
void sqlite3WalLimit(Wal*, i64);

/* Used by readers to open (lock) and close (unlock) a snapshot.  A 
** snapshot is like a read-transaction.  It is the state of the database
** at an instant in time.  sqlite3WalOpenSnapshot gets a read lock and
** preserves the current state even if the other threads or processes
** write to or checkpoint the WAL.  sqlite3WalCloseSnapshot() closes the
** transaction and releases the lock.
*/
int sqlite3WalBeginReadTransaction(Wal *pWal, int *);
void sqlite3WalEndReadTransaction(Wal *pWal);

/* Read a page from the write-ahead log, if it is present. */

int sqlite3WalRead(Wal *pWal, Pgno pgno, int *pInWal, int nOut, u8 *pOut);

/* If the WAL is not empty, return the size of the database. */
Pgno sqlite3WalDbsize(Wal *pWal);

/* Obtain or release the WRITER lock. */
int sqlite3WalBeginWriteTransaction(Wal *pWal);
int sqlite3WalEndWriteTransaction(Wal *pWal);







|
>
















>
|







49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

/* Connection to a write-ahead log (WAL) file. 
** There is one object of this type for each pager. 
*/
typedef struct Wal Wal;

/* Open and close a connection to a write-ahead log. */
int sqlite3WalOpen(
  sqlite3_vfs*, Pager *, sqlite3_file*, const char *, int, i64, Wal**);
int sqlite3WalClose(Wal *pWal, int sync_flags, int, u8 *);

/* Set the limiting size of a WAL file. */
void sqlite3WalLimit(Wal*, i64);

/* Used by readers to open (lock) and close (unlock) a snapshot.  A 
** snapshot is like a read-transaction.  It is the state of the database
** at an instant in time.  sqlite3WalOpenSnapshot gets a read lock and
** preserves the current state even if the other threads or processes
** write to or checkpoint the WAL.  sqlite3WalCloseSnapshot() closes the
** transaction and releases the lock.
*/
int sqlite3WalBeginReadTransaction(Wal *pWal, int *);
void sqlite3WalEndReadTransaction(Wal *pWal);

/* Read a page from the write-ahead log, if it is present. */
int sqlite3WalFindFrame(Wal *, Pgno, u32 *);
int sqlite3WalReadFrame(Wal *, u32, int, u8 *);

/* If the WAL is not empty, return the size of the database. */
Pgno sqlite3WalDbsize(Wal *pWal);

/* Obtain or release the WRITER lock. */
int sqlite3WalBeginWriteTransaction(Wal *pWal);
int sqlite3WalEndWriteTransaction(Wal *pWal);