/ Check-in [1008f536]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the xShmPage method to the "crash" vfs in test6.c.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: 1008f536440840da7d56c01ec147a25295fd1fd4
User & Date: dan 2010-06-14 10:30:12
Context
2010-06-14
11:18
Change the interface to internal function walGetHash() to make it easier to follow. check-in: 5e8e2e97 user: dan tags: experimental
10:30
Add the xShmPage method to the "crash" vfs in test6.c. check-in: 1008f536 user: dan tags: experimental
07:53
Add some fault-injection tests to improve coverage. check-in: 37b26d12 user: dan tags: experimental
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/test6.c.

545
546
547
548
549
550
551









552
553
554
555
556
557
558
...
567
568
569
570
571
572
573
574

575
576
577
578
579
580
581
}
static void cfShmBarrier(sqlite3_file *pFile){
  sqlite3OsShmBarrier(((CrashFile*)pFile)->pRealFile);
}
static int cfShmClose(sqlite3_file *pFile, int delFlag){
  return sqlite3OsShmClose(((CrashFile*)pFile)->pRealFile, delFlag);
}











static const sqlite3_io_methods CrashFileVtab = {
  2,                            /* iVersion */
  cfClose,                      /* xClose */
  cfRead,                       /* xRead */
  cfWrite,                      /* xWrite */
................................................................................
  cfDeviceCharacteristics,      /* xDeviceCharacteristics */
  cfShmOpen,                    /* xShmOpen */
  cfShmSize,                    /* xShmSize */
  cfShmGet,                     /* xShmGet */
  cfShmRelease,                 /* xShmRelease */
  cfShmLock,                    /* xShmLock */
  cfShmBarrier,                 /* xShmBarrier */
  cfShmClose                    /* xShmClose */

};

/*
** Application data for the crash VFS
*/
struct crashAppData {
  sqlite3_vfs *pOrig;                   /* Wrapped vfs structure */







>
>
>
>
>
>
>
>
>







 







|
>







545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
...
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
}
static void cfShmBarrier(sqlite3_file *pFile){
  sqlite3OsShmBarrier(((CrashFile*)pFile)->pRealFile);
}
static int cfShmClose(sqlite3_file *pFile, int delFlag){
  return sqlite3OsShmClose(((CrashFile*)pFile)->pRealFile, delFlag);
}
static int cfShmPage(
  sqlite3_file *pFile,            /* Handle open on database file */
  int iPage,                      /* Page to retrieve */
  int pgsz,                       /* Size of pages */
  int w,                          /* True to extend file if necessary */
  void volatile **pp              /* OUT: Mapped memory */
){
  return sqlite3OsShmPage(((CrashFile*)pFile)->pRealFile, iPage, pgsz, w, pp);
}


static const sqlite3_io_methods CrashFileVtab = {
  2,                            /* iVersion */
  cfClose,                      /* xClose */
  cfRead,                       /* xRead */
  cfWrite,                      /* xWrite */
................................................................................
  cfDeviceCharacteristics,      /* xDeviceCharacteristics */
  cfShmOpen,                    /* xShmOpen */
  cfShmSize,                    /* xShmSize */
  cfShmGet,                     /* xShmGet */
  cfShmRelease,                 /* xShmRelease */
  cfShmLock,                    /* xShmLock */
  cfShmBarrier,                 /* xShmBarrier */
  cfShmClose,                   /* xShmClose */
  cfShmPage                     /* xShmPage */
};

/*
** Application data for the crash VFS
*/
struct crashAppData {
  sqlite3_vfs *pOrig;                   /* Wrapped vfs structure */

Changes to src/wal.c.

382
383
384
385
386
387
388






389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409

410

411
412
413
414
415
416
417
418
419
420
421
...
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
...
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
...
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
...
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
...
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
....
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
....
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
....
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
....
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
....
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
  char *zWalName;            /* Name of WAL file */
  u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
#ifdef SQLITE_DEBUG
  u8 lockError;              /* True if a locking error has occurred */
#endif
};







/*
** Define the parameters of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
**
** Changing any of these constants will alter the wal-index format and
** create incompatibilities.
*/
#define HASHTABLE_NPAGE      4096  /* Must be power of 2 and multiple of 256 */
#define HASHTABLE_DATATYPE   u16
#define HASHTABLE_HASH_1     383                  /* Should be prime */
#define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
#define HASHTABLE_NBYTE      (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)

/* The block of page numbers associated with the first hash-table in a
** wal-index is smaller than usual. This is so that there is a complete
** hash-table on each aligned 32KB page of the wal-index.
*/
#define HASHTABLE_NPAGE_ONE  (4096 - (WALINDEX_HDR_SIZE/sizeof(u32)))

/* The wal-index is divided into pages of HASHTABLE_PAGESIZE bytes each. */

#define HASHTABLE_PAGESIZE   (HASHTABLE_NBYTE + HASHTABLE_NPAGE*sizeof(u32))


/*
** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
** is broken into pages of HASHTABLE_PAGESIZE bytes. Wal-index pages are
** numbered from zero.
**
** If this call is successful, *ppPage is set to point to the wal-index
** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,
** then an SQLite error code is returned and *ppPage is set to 0.
*/
static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
................................................................................
    memset(&apNew[pWal->nWiData], 0, sizeof(u32 *)*(iPage+1-pWal->nWiData));
    pWal->apWiData = apNew;
    pWal->nWiData = iPage+1;
  }

  /* Request a pointer to the required page from the VFS */
  if( pWal->apWiData[iPage]==0 ){
    rc = sqlite3OsShmPage(pWal->pDbFd, iPage, HASHTABLE_PAGESIZE, 
        pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]
    );
  }

  *ppPage = pWal->apWiData[iPage];
  assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
  return rc;
................................................................................
**   walIteratorInit() - Create a new iterator,
**   walIteratorNext() - Step an iterator,
**   walIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see walCheckpoint()).
*/
struct WalIterator {
  int iPrior;           /* Last result returned from the iterator */
  int nSegment;         /* Size of the aSegment[] array */
  struct WalSegment {
    int iNext;                    /* Next slot in aIndex[] not yet returned */
    HASHTABLE_DATATYPE *aIndex;   /* i0, i1, i2... such that aPgno[iN] ascend */
    u32 *aPgno;                   /* Array of page numbers. */
    int nEntry;                   /* Max size of aPgno[] and aIndex[] arrays */
    int iZero;                    /* Frame number associated with aPgno[0] */
  } aSegment[1];        /* One for every 32KB page in the WAL */
};

/*
** The argument to this macro must be of type u32. On a little-endian
** architecture, it returns the u32 value that results from interpreting
** the 4 bytes as a big-endian value. On a big-endian architecture, it
** returns the value that would be produced by intepreting the 4 bytes
................................................................................
** Finally, set *paPgno such that for all frames F between (*piZero+1) and 
** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number 
** associated with frame F.
*/
static int walHashGet(
  Wal *pWal,                      /* WAL handle */
  int iHash,                      /* Find the iHash'th table */
  volatile HASHTABLE_DATATYPE **paHash,     /* OUT: Pointer to hash index */
  volatile u32 **paPgno,          /* OUT: Pointer to page number array */
  u32 *piZero                     /* OUT: Frame associated with *paPgno[0] */
){
  int rc;                         /* Return code */
  volatile u32 *aPgno;

  rc = walIndexPage(pWal, iHash, &aPgno);
  assert( rc==SQLITE_OK || iHash>0 );

  if( rc==SQLITE_OK ){
    u32 iZero;
    volatile HASHTABLE_DATATYPE *aHash;

    aHash = (volatile HASHTABLE_DATATYPE *)&aPgno[HASHTABLE_NPAGE];
    if( iHash==0 ){
      aPgno = &aPgno[WALINDEX_HDR_SIZE/sizeof(u32)-1];
      iZero = 0;
    }else{
      iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
      aPgno = &aPgno[-1*iZero-1];
    }
................................................................................
**
** At most only the hash table containing pWal->hdr.mxFrame needs to be
** updated.  Any later hash tables will be automatically cleared when
** pWal->hdr.mxFrame advances to the point where those hash tables are
** actually needed.
*/
static void walCleanupHash(Wal *pWal){
  volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table to clear */
  volatile u32 *aPgno;                 /* Page number array for hash table */
  u32 iZero;                           /* frame == (aHash[x]+iZero) */
  int iLimit = 0;                      /* Zero values greater than this */
  int nByte;                           /* Number of bytes to zero in aPgno[] */
  int i;                               /* Used to iterate through aHash[] */

  assert( pWal->writeLock );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE-1 );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE+1 );

  if( pWal->hdr.mxFrame==0 ) return;
................................................................................
** Set an entry in the wal-index that will map database page number
** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
  u32 iZero;                      /* One less than frame number of aPgno[1] */
  volatile u32 *aPgno;            /* Page number array */
  volatile HASHTABLE_DATATYPE *aHash;   /* Hash table */

  rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero);

  /* Assuming the wal-index file was successfully mapped, populate the
  ** page number array and hash table entry.
  */
  if( rc==SQLITE_OK ){
................................................................................
  *piPage = p->iPrior = iRet;
  return (iRet==0xFFFFFFFF);
}


static void walMergesort(
  u32 *aContent,                  /* Pages in wal */
  HASHTABLE_DATATYPE *aBuffer,    /* Buffer of at least *pnList items to use */
  HASHTABLE_DATATYPE *aList,      /* IN/OUT: List to sort */
  int *pnList                     /* IN/OUT: Number of elements in aList[] */
){
  int nList = *pnList;
  if( nList>1 ){
    int nLeft = nList / 2;        /* Elements in left list */
    int nRight = nList - nLeft;   /* Elements in right list */
    int iLeft = 0;                /* Current index in aLeft */
    int iRight = 0;               /* Current index in aright */
    int iOut = 0;                 /* Current index in output buffer */
    HASHTABLE_DATATYPE *aLeft = aList;           /* Left list */
    HASHTABLE_DATATYPE *aRight = &aList[nLeft];  /* Right list */

    /* TODO: Change to non-recursive version. */
    walMergesort(aContent, aBuffer, aLeft, &nLeft);
    walMergesort(aContent, aBuffer, aRight, &nRight);

    while( iRight<nRight || iLeft<nLeft ){
      HASHTABLE_DATATYPE logpage;
      Pgno dbpage;

      if( (iLeft<nLeft) 
       && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
      ){
        logpage = aLeft[iLeft++];
      }else{
................................................................................
**
** The calling routine should invoke walIteratorFree() to destroy the
** WalIterator object when it has finished with it.  The caller must
** also unmap the wal-index.  But the wal-index must not be unmapped
** prior to the WalIterator object being destroyed.
*/
static int walIteratorInit(Wal *pWal, WalIterator **pp){
  WalIterator *p;       /* Return value */
  int nSegment;         /* Number of segments to merge */
  u32 iLast;            /* Last frame in log */
  int nByte;            /* Number of bytes to allocate */
  int i;                /* Iterator variable */
  HASHTABLE_DATATYPE *aTmp;       /* Temp space used by merge-sort */
  HASHTABLE_DATATYPE *aSpace;     /* Space at the end of the allocation */

  /* This routine only runs while holding SQLITE_SHM_CHECKPOINT.  No other
  ** thread is able to write to shared memory while this routine is
  ** running (or, indeed, while the WalIterator object exists).  Hence,
  ** we can cast off the volatile qualification from shared memory
  */
  assert( pWal->ckptLock );
  iLast = pWal->hdr.mxFrame;

  /* Allocate space for the WalIterator object */
  nSegment = walFramePage(iLast) + 1;
  nByte = sizeof(WalIterator) 
        + nSegment*(sizeof(struct WalSegment))
        + (nSegment+1)*(HASHTABLE_NPAGE * sizeof(HASHTABLE_DATATYPE));
  p = (WalIterator *)sqlite3_malloc(nByte);
  if( !p ){
    return SQLITE_NOMEM;
  }
  memset(p, 0, nByte);

  /* Allocate space for the WalIterator object */
  p->nSegment = nSegment;
  aSpace = (HASHTABLE_DATATYPE *)&p->aSegment[nSegment];
  aTmp = &aSpace[HASHTABLE_NPAGE*nSegment];
  for(i=0; i<nSegment; i++){
    volatile HASHTABLE_DATATYPE *aHash;
    int j;
    u32 iZero;
    int nEntry;
    volatile u32 *aPgno;
    int rc;

    rc = walHashGet(pWal, i, &aHash, &aPgno, &iZero);
................................................................................
  **     This condition filters out normal hash-table collisions.
  **
  **   (iFrame<=iLast): 
  **     This condition filters out entries that were added to the hash
  **     table after the current read-transaction had started.
  */
  for(iHash=walFramePage(iLast); iHash>=0 && iRead==0; iHash--){
    volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table */
    volatile u32 *aPgno;                 /* Pointer to array of page numbers */
    u32 iZero;                    /* Frame number corresponding to aPgno[0] */
    int iKey;                     /* Hash slot index */
    int rc;

    rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero);
    if( rc!=SQLITE_OK ){
      return rc;
................................................................................
**
** Otherwise, if the callback function does not return an error, this
** function returns SQLITE_OK.
*/
int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
  int rc = SQLITE_OK;
  if( pWal->writeLock ){
    int unused;
    Pgno iMax = pWal->hdr.mxFrame;
    Pgno iFrame;
  
    /* Restore the clients cache of the wal-index header to the state it
    ** was in before the client began writing to the database. 
    */
    memcpy(&pWal->hdr, walIndexHdr(pWal), sizeof(WalIndexHdr));

    for(iFrame=pWal->hdr.mxFrame+1; 
        ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; 
        iFrame++
    ){
      /* This call cannot fail. Unless the page for which the page number
      ** is passed as the second argument is (a) in the cache and 
................................................................................
      ** is false).
      **
      ** If the upper layer is doing a rollback, it is guaranteed that there
      ** are no outstanding references to any page other than page 1. And
      ** page 1 is never written to the log until the transaction is
      ** committed. As a result, the call to xUndo may not fail.
      */
      assert( pWal->writeLock );
      assert( walFramePgno(pWal, iFrame)!=1 );
      rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
    }
    walCleanupHash(pWal);
  }
  assert( rc==SQLITE_OK );
  return rc;







>
>
>
>
>
>








|
<


<





|

|
>
|
>



|







 







|







 







|
|


|



|







 







|











|

|







 







|
|
|
|
|
|







 







|







 







|
|









|
|






|







 







|
|
|
|
|
|
|













|








|


|







 







|
|







 







<






|







 







<







382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403

404
405

406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
...
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
...
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
...
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
...
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
...
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
....
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
....
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
....
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
....
2075
2076
2077
2078
2079
2080
2081

2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
....
2098
2099
2100
2101
2102
2103
2104

2105
2106
2107
2108
2109
2110
2111
  char *zWalName;            /* Name of WAL file */
  u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
#ifdef SQLITE_DEBUG
  u8 lockError;              /* True if a locking error has occurred */
#endif
};

/*
** Each page of the wal-index mapping contains a hash-table made up of
** an array of HASHTABLE_NSLOT elements of the following type.
*/
typedef u16 ht_slot;

/*
** Define the parameters of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
**
** Changing any of these constants will alter the wal-index format and
** create incompatibilities.
*/
#define HASHTABLE_NPAGE      4096                 /* Must be power of 2 */

#define HASHTABLE_HASH_1     383                  /* Should be prime */
#define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */


/* The block of page numbers associated with the first hash-table in a
** wal-index is smaller than usual. This is so that there is a complete
** hash-table on each aligned 32KB page of the wal-index.
*/
#define HASHTABLE_NPAGE_ONE  (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))

/* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
#define WALINDEX_PGSZ   (                                         \
    sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
)

/*
** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
** numbered from zero.
**
** If this call is successful, *ppPage is set to point to the wal-index
** page and SQLITE_OK is returned. If an error (an OOM or VFS error) occurs,
** then an SQLite error code is returned and *ppPage is set to 0.
*/
static int walIndexPage(Wal *pWal, int iPage, volatile u32 **ppPage){
................................................................................
    memset(&apNew[pWal->nWiData], 0, sizeof(u32 *)*(iPage+1-pWal->nWiData));
    pWal->apWiData = apNew;
    pWal->nWiData = iPage+1;
  }

  /* Request a pointer to the required page from the VFS */
  if( pWal->apWiData[iPage]==0 ){
    rc = sqlite3OsShmPage(pWal->pDbFd, iPage, WALINDEX_PGSZ, 
        pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]
    );
  }

  *ppPage = pWal->apWiData[iPage];
  assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
  return rc;
................................................................................
**   walIteratorInit() - Create a new iterator,
**   walIteratorNext() - Step an iterator,
**   walIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see walCheckpoint()).
*/
struct WalIterator {
  int iPrior;                     /* Last result returned from the iterator */
  int nSegment;                   /* Size of the aSegment[] array */
  struct WalSegment {
    int iNext;                    /* Next slot in aIndex[] not yet returned */
    ht_slot *aIndex;              /* i0, i1, i2... such that aPgno[iN] ascend */
    u32 *aPgno;                   /* Array of page numbers. */
    int nEntry;                   /* Max size of aPgno[] and aIndex[] arrays */
    int iZero;                    /* Frame number associated with aPgno[0] */
  } aSegment[1];                  /* One for every 32KB page in the WAL */
};

/*
** The argument to this macro must be of type u32. On a little-endian
** architecture, it returns the u32 value that results from interpreting
** the 4 bytes as a big-endian value. On a big-endian architecture, it
** returns the value that would be produced by intepreting the 4 bytes
................................................................................
** Finally, set *paPgno such that for all frames F between (*piZero+1) and 
** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number 
** associated with frame F.
*/
static int walHashGet(
  Wal *pWal,                      /* WAL handle */
  int iHash,                      /* Find the iHash'th table */
  volatile ht_slot **paHash,      /* OUT: Pointer to hash index */
  volatile u32 **paPgno,          /* OUT: Pointer to page number array */
  u32 *piZero                     /* OUT: Frame associated with *paPgno[0] */
){
  int rc;                         /* Return code */
  volatile u32 *aPgno;

  rc = walIndexPage(pWal, iHash, &aPgno);
  assert( rc==SQLITE_OK || iHash>0 );

  if( rc==SQLITE_OK ){
    u32 iZero;
    volatile ht_slot *aHash;

    aHash = (volatile ht_slot *)&aPgno[HASHTABLE_NPAGE];
    if( iHash==0 ){
      aPgno = &aPgno[WALINDEX_HDR_SIZE/sizeof(u32)-1];
      iZero = 0;
    }else{
      iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
      aPgno = &aPgno[-1*iZero-1];
    }
................................................................................
**
** At most only the hash table containing pWal->hdr.mxFrame needs to be
** updated.  Any later hash tables will be automatically cleared when
** pWal->hdr.mxFrame advances to the point where those hash tables are
** actually needed.
*/
static void walCleanupHash(Wal *pWal){
  volatile ht_slot *aHash;        /* Pointer to hash table to clear */
  volatile u32 *aPgno;            /* Page number array for hash table */
  u32 iZero;                      /* frame == (aHash[x]+iZero) */
  int iLimit = 0;                 /* Zero values greater than this */
  int nByte;                      /* Number of bytes to zero in aPgno[] */
  int i;                          /* Used to iterate through aHash[] */

  assert( pWal->writeLock );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE-1 );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE+1 );

  if( pWal->hdr.mxFrame==0 ) return;
................................................................................
** Set an entry in the wal-index that will map database page number
** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
  u32 iZero;                      /* One less than frame number of aPgno[1] */
  volatile u32 *aPgno;            /* Page number array */
  volatile ht_slot *aHash;        /* Hash table */

  rc = walHashGet(pWal, walFramePage(iFrame), &aHash, &aPgno, &iZero);

  /* Assuming the wal-index file was successfully mapped, populate the
  ** page number array and hash table entry.
  */
  if( rc==SQLITE_OK ){
................................................................................
  *piPage = p->iPrior = iRet;
  return (iRet==0xFFFFFFFF);
}


static void walMergesort(
  u32 *aContent,                  /* Pages in wal */
  ht_slot *aBuffer,               /* Buffer of at least *pnList items to use */
  ht_slot *aList,                 /* IN/OUT: List to sort */
  int *pnList                     /* IN/OUT: Number of elements in aList[] */
){
  int nList = *pnList;
  if( nList>1 ){
    int nLeft = nList / 2;        /* Elements in left list */
    int nRight = nList - nLeft;   /* Elements in right list */
    int iLeft = 0;                /* Current index in aLeft */
    int iRight = 0;               /* Current index in aright */
    int iOut = 0;                 /* Current index in output buffer */
    ht_slot *aLeft = aList;       /* Left list */
    ht_slot *aRight = aList+nLeft;/* Right list */

    /* TODO: Change to non-recursive version. */
    walMergesort(aContent, aBuffer, aLeft, &nLeft);
    walMergesort(aContent, aBuffer, aRight, &nRight);

    while( iRight<nRight || iLeft<nLeft ){
      ht_slot logpage;
      Pgno dbpage;

      if( (iLeft<nLeft) 
       && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
      ){
        logpage = aLeft[iLeft++];
      }else{
................................................................................
**
** The calling routine should invoke walIteratorFree() to destroy the
** WalIterator object when it has finished with it.  The caller must
** also unmap the wal-index.  But the wal-index must not be unmapped
** prior to the WalIterator object being destroyed.
*/
static int walIteratorInit(Wal *pWal, WalIterator **pp){
  WalIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  ht_slot *aTmp;                  /* Temp space used by merge-sort */
  ht_slot *aSpace;                /* Space at the end of the allocation */

  /* This routine only runs while holding SQLITE_SHM_CHECKPOINT.  No other
  ** thread is able to write to shared memory while this routine is
  ** running (or, indeed, while the WalIterator object exists).  Hence,
  ** we can cast off the volatile qualification from shared memory
  */
  assert( pWal->ckptLock );
  iLast = pWal->hdr.mxFrame;

  /* Allocate space for the WalIterator object */
  nSegment = walFramePage(iLast) + 1;
  nByte = sizeof(WalIterator) 
        + nSegment*(sizeof(struct WalSegment))
        + (nSegment+1)*(HASHTABLE_NPAGE * sizeof(ht_slot));
  p = (WalIterator *)sqlite3_malloc(nByte);
  if( !p ){
    return SQLITE_NOMEM;
  }
  memset(p, 0, nByte);

  /* Allocate space for the WalIterator object */
  p->nSegment = nSegment;
  aSpace = (ht_slot *)&p->aSegment[nSegment];
  aTmp = &aSpace[HASHTABLE_NPAGE*nSegment];
  for(i=0; i<nSegment; i++){
    volatile ht_slot *aHash;
    int j;
    u32 iZero;
    int nEntry;
    volatile u32 *aPgno;
    int rc;

    rc = walHashGet(pWal, i, &aHash, &aPgno, &iZero);
................................................................................
  **     This condition filters out normal hash-table collisions.
  **
  **   (iFrame<=iLast): 
  **     This condition filters out entries that were added to the hash
  **     table after the current read-transaction had started.
  */
  for(iHash=walFramePage(iLast); iHash>=0 && iRead==0; iHash--){
    volatile ht_slot *aHash;      /* Pointer to hash table */
    volatile u32 *aPgno;          /* Pointer to array of page numbers */
    u32 iZero;                    /* Frame number corresponding to aPgno[0] */
    int iKey;                     /* Hash slot index */
    int rc;

    rc = walHashGet(pWal, iHash, &aHash, &aPgno, &iZero);
    if( rc!=SQLITE_OK ){
      return rc;
................................................................................
**
** Otherwise, if the callback function does not return an error, this
** function returns SQLITE_OK.
*/
int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
  int rc = SQLITE_OK;
  if( pWal->writeLock ){

    Pgno iMax = pWal->hdr.mxFrame;
    Pgno iFrame;
  
    /* Restore the clients cache of the wal-index header to the state it
    ** was in before the client began writing to the database. 
    */
    memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));

    for(iFrame=pWal->hdr.mxFrame+1; 
        ALWAYS(rc==SQLITE_OK) && iFrame<=iMax; 
        iFrame++
    ){
      /* This call cannot fail. Unless the page for which the page number
      ** is passed as the second argument is (a) in the cache and 
................................................................................
      ** is false).
      **
      ** If the upper layer is doing a rollback, it is guaranteed that there
      ** are no outstanding references to any page other than page 1. And
      ** page 1 is never written to the log until the transaction is
      ** committed. As a result, the call to xUndo may not fail.
      */

      assert( walFramePgno(pWal, iFrame)!=1 );
      rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
    }
    walCleanupHash(pWal);
  }
  assert( rc==SQLITE_OK );
  return rc;