/ Check-in [b5b60fdc]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Refactoring of the WalIterator implementation.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: b5b60fdcc5dcf41f2c79912075ac241f7ce220d6
User & Date: drh 2010-05-18 18:01:09
Context
2010-05-18
23:29
Update the wal-index hash format so that hash-table space is reused following a rollback, thus preventing hash table overflows. Add assert()s to verify that hash tables do not overfill. Further refactoring of the wal-index code. check-in: ada9a8c7 user: drh tags: trunk
18:01
Refactoring of the WalIterator implementation. check-in: b5b60fdc user: drh tags: trunk
13:27
Mark the shared-memory in the WAL implementation as volatile. check-in: 0a678790 user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/wal.c.

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
...
130
131
132
133
134
135
136
137
138
139
140

141
142
143
144
145
146
147
148
149
150
151
152

153
154
155
156
157

158
159
160
161
162
163
164
...
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372









373
374
375
376
377
378
379
...
711
712
713
714
715
716
717










718
719
720
721
722
723

724
725

726
727


728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747




































































748
749
750
751
752
753
754
755
756
757

758

759
760
761
762







763


764
765
766
767
768
769
770
771
772
773

774
775
776





777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793

794
795
796
797
798
799
800
801
802
803
804
805
806
...
920
921
922
923
924
925
926
927
928
929
930
931






932
933
934
935
936


937
938
939
940
941
942
943
...
947
948
949
950
951
952
953
954



955
956
957
958
959
960
961
...
965
966
967
968
969
970
971
972
973
974










975
976
977
978
979
980
981
....
1028
1029
1030
1031
1032
1033
1034
1035









1036
1037
1038
1039
1040
1041
1042
....
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
....
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
....
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
....
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
**     8: Checksum value 1.
**    12: Checksum value 2.
*/

/* 
** WAL-INDEX FILE FORMAT
**
** The wal-index consists of a 32-byte header region, followed by an 
** 8-byte region that contains no useful data (used to apply byte-range locks
** in some implementations), followed by the data region. 
**
** The contents of both the header and data region are specified in terms
** of 1, 2 and 4 byte unsigned integers. All integers are stored in 
** machine-endian order.  The wal-index is not a persistent file and
** so it does not need to be portable across archtectures.
................................................................................
  u8 isWindexOpen;           /* True if ShmOpen() called on pDbFd */
  WalIndexHdr hdr;           /* Wal-index for current snapshot */
  char *zWalName;            /* Name of WAL file */
};


/*
** This structure is used to implement an iterator that iterates through
** all frames in the log in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the 
** frame most recently written to the log.

**
** The internals of this structure are only accessed by:
**
**   walIteratorInit() - Create a new iterator,
**   walIteratorNext() - Step an iterator,
**   walIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see walCheckpoint()).
*/
struct WalIterator {
  int nSegment;                   /* Size of WalIterator.aSegment[] array */
  int nFinal;                     /* Elements in segment nSegment-1 */

  struct WalSegment {
    int iNext;                    /* Next aIndex index */
    u8 *aIndex;                   /* Pointer to index array */
    u32 *aDbPage;                 /* Pointer to db page array */
  } aSegment[1];

};


/*
** Generate an 8 byte checksum based on the data in array aByte[] and the
** initial values of aCksum[0] and aCksum[1]. The checksum is written into
** aCksum[] before returning.
................................................................................
  }

  *piPage = sqlite3Get4byte(&aFrame[0]);
  *pnTruncate = sqlite3Get4byte(&aFrame[4]);
  return 1;
}

static void walMergesort8(
  Pgno *aContent,                 /* Pages in wal */
  u8 *aBuffer,                    /* Buffer of at least *pnList items to use */
  u8 *aList,                      /* IN/OUT: List to sort */
  int *pnList                     /* IN/OUT: Number of elements in aList[] */
){
  int nList = *pnList;
  if( nList>1 ){
    int nLeft = nList / 2;        /* Elements in left list */
    int nRight = nList - nLeft;   /* Elements in right list */
    u8 *aLeft = aList;            /* Left list */
    u8 *aRight = &aList[nLeft];   /* Right list */
    int iLeft = 0;                /* Current index in aLeft */
    int iRight = 0;               /* Current index in aright */
    int iOut = 0;                 /* Current index in output buffer */

    /* TODO: Change to non-recursive version. */
    walMergesort8(aContent, aBuffer, aLeft, &nLeft);
    walMergesort8(aContent, aBuffer, aRight, &nRight);

    while( iRight<nRight || iLeft<nLeft ){
      u8 logpage;
      Pgno dbpage;

      if( (iLeft<nLeft) 
       && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
      ){
        logpage = aLeft[iLeft++];
      }else{
        logpage = aRight[iRight++];
      }
      dbpage = aContent[logpage];

      aBuffer[iOut++] = logpage;
      if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;

      assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
      assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
    }
    memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
    *pnList = iOut;
  }

#ifdef SQLITE_DEBUG
  {
    int i;
    for(i=1; i<*pnList; i++){
      assert( aContent[aList[i]] > aContent[aList[i-1]] );
    }
  }
#endif
}

/*
** Define the size of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
*/
#define HASHTABLE_NPAGE      4096
#define HASHTABLE_DATATYPE   u16

#define HASHTABLE_NSLOT     (HASHTABLE_NPAGE*2)
#define HASHTABLE_NBYTE     (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)

/*
** Return the index in the WalIndex.aData array that corresponds to 
** frame iFrame. The wal-index file consists of a header, followed by
** alternating "map" and "index" blocks.









*/
static int walIndexEntry(u32 iFrame){
  return (
      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
    + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32)
    + (iFrame-1)
  );
................................................................................
    sqlite3_free(pRet);
  }else{
    *ppWal = pRet;
  }
  return rc;
}











static int walIteratorNext(
  WalIterator *p,               /* Iterator */
  u32 *piPage,                  /* OUT: Next db page to write */
  u32 *piFrame                  /* OUT: Wal frame to read from */
){
  u32 iMin = *piPage;

  u32 iRet = 0xFFFFFFFF;
  int i;

  int nBlock = p->nFinal;



  for(i=p->nSegment-1; i>=0; i--){
    struct WalSegment *pSegment = &p->aSegment[i];
    while( pSegment->iNext<nBlock ){
      u32 iPg = pSegment->aDbPage[pSegment->aIndex[pSegment->iNext]];
      if( iPg>iMin ){
        if( iPg<iRet ){
          iRet = iPg;
          *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
        }
        break;
      }
      pSegment->iNext++;
    }
    nBlock = 256;
  }

  *piPage = iRet;
  return (iRet==0xFFFFFFFF);
}





































































static int walIteratorInit(Wal *pWal, WalIterator **pp){
  u32 *aData;                     /* Content of the wal-index file */
  WalIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  int nFinal;                     /* Number of unindexed entries */
  u8 *aTmp;                       /* Temp space used by merge-sort */
  int rc;                         /* Return code of walIndexMap() */



  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
  if( rc!=SQLITE_OK ){
    return rc;
  }







  aData = (u32*)pWal->pWiData;


  iLast = pWal->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
  p = (WalIterator *)sqlite3_malloc(nByte);
  if( !p ){
    rc = SQLITE_NOMEM;
  }else{
    u8 *aSpace;

    memset(p, 0, nByte);
    p->nSegment = nSegment;






    aSpace = (u8 *)&p->aSegment[nSegment];
    aTmp = &aSpace[nSegment*256];
    for(i=0; i<nSegment; i++){
      int j;
      int nIndex = (i==nSegment-1) ? nFinal : 256;
      p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
      p->aSegment[i].aIndex = aSpace;
      for(j=0; j<nIndex; j++){
        aSpace[j] = j;
      }
      walMergesort8(p->aSegment[i].aDbPage, aTmp, aSpace, &nIndex);
      memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex);
      aSpace += 256;
      p->nFinal = nIndex;
    }
  }


  *pp = p;
  return rc;
}

/* 
** Free a log iterator allocated by walIteratorInit().
*/
static void walIteratorFree(WalIterator *p){
  sqlite3_free(p);
}

/*
** Checkpoint the contents of the log file.
................................................................................
    }
    sqlite3_free(pWal);
  }
  return rc;
}

/*
** Try to read the wal-index header. Attempt to verify the header
** checksum. If the checksum can be verified, copy the wal-index
** header into structure pWal->hdr. If the contents of pWal->hdr are
** modified by this and pChanged is not NULL, set *pChanged to 1. 
** Otherwise leave *pChanged unmodified.






**
** If the checksum cannot be verified return non-zero. If the header
** is read successfully and the checksum verified, return zero.
*/
int walIndexTryHdr(Wal *pWal, int *pChanged){


  u32 aCksum[2] = {1, 1};
  u32 aHdr[WALINDEX_HDR_NFIELD+2];

  assert( pWal->pWiData );
  if( pWal->szWIndex==0 ){
    /* The wal-index is of size 0 bytes. This is handled in the same way
    ** as an invalid header. The caller will run recovery to construct
................................................................................
  }

  /* Read the header. The caller may or may not have an exclusive 
  ** (WRITE, PENDING, CHECKPOINT or RECOVER) lock on the wal-index
  ** file, meaning it is possible that an inconsistent snapshot is read
  ** from the file. If this happens, return non-zero.
  */
  memcpy(aHdr, (void*)pWal->pWiData, sizeof(aHdr));



  walChecksumBytes((u8*)aHdr, sizeof(u32)*WALINDEX_HDR_NFIELD, aCksum);
  if( aCksum[0]!=aHdr[WALINDEX_HDR_NFIELD]
   || aCksum[1]!=aHdr[WALINDEX_HDR_NFIELD+1]
  ){
    return 1;
  }

................................................................................
  }

  /* The header was successfully read. Return zero. */
  return 0;
}

/*
** Read the wal-index header from the wal-index file into structure 
** pWal->hdr. If attempting to verify the header checksum fails, try
** to recover the log before returning.










**
** If the wal-index header is successfully read, return SQLITE_OK. 
** Otherwise an SQLite error code.
*/
static int walIndexReadHdr(Wal *pWal, int *pChanged){
  int rc;                         /* Return code */
  int lockState;                  /* pWal->lockState before running recovery */
................................................................................
    }
  }

  return rc;
}

/*
** Lock a snapshot.









**
** If this call obtains a new read-lock and the database contents have been
** modified since the most recent call to WalCloseSnapshot() on this Wal
** connection, then *pChanged is set to 1 before returning. Otherwise, it 
** is left unmodified. This is used by the pager layer to determine whether 
** or not any cached pages may be safely reused.
*/
................................................................................
  int nLast = 0;                  /* Number of extra copies of last page */

  assert( WAL_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
  assert( pList );
  assert( pWal->lockState==SQLITE_SHM_WRITE );
  assert( pWal->pWiData==0 );

  /* If this is the first frame written into the log, write the log 
  ** header to the start of the log file. See comments at the top of
  ** this file for a description of the log-header format.
  */
  assert( WAL_FRAME_HDRSIZE>=WAL_HDRSIZE );
  iFrame = pWal->hdr.iLastPg;
  if( iFrame==0 ){
    sqlite3Put4byte(aFrame, nPgsz);
    sqlite3_randomness(8, &aFrame[4]);
    pWal->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
................................................................................
    walEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
    rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    /* Write the page data */
    rc = sqlite3OsWrite(pWal->pWalFd, p->pData, nPgsz, iOffset + sizeof(aFrame));
    if( rc!=SQLITE_OK ){
      return rc;
    }
    pLast = p;
  }

  /* Sync the log file if the 'isSync' flag was specified. */
................................................................................
    }

    rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
  }
  assert( pWal->pWiData==0 );

  /* Append data to the wal-index. It is not necessary to lock the 
  ** wal-index to do this as the RESERVED lock held on the db file
  ** guarantees that there are no other writers, and no data that may
  ** be in use by existing readers is being overwritten.
  */
  iFrame = pWal->hdr.iLastPg;
  for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
    iFrame++;
    rc = walIndexAppend(pWal, iFrame, p->pgno);
................................................................................
  /* Copy data from the log to the database file. */
  rc = walIndexReadHdr(pWal, &isChanged);
  if( rc==SQLITE_OK ){
    rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);
  }
  if( isChanged ){
    /* If a new wal-index header was loaded before the checkpoint was 
    ** performed, then the pager-cache associated with log pWal is now
    ** out of date. So zero the cached wal-index header to ensure that
    ** next time the pager opens a snapshot on this database it knows that
    ** the cache needs to be reset.
    */
    memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  }








|







 







|
|

|
>










|
|
>

|
|
<
|
>







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<












|
|
|
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>


|
|

<
>
|
<
>
|

>
>



|












|



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
|
|
|
|
|
|
|
|
>

>




>
>
>
>
>
>
>

>
>



<



|
<
<
>
|
<

>
>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<
>

|



|







 







|
|
|
|
|
>
>
>
>
>
>





>
>







 







|
>
>
>







 







|
|
|
>
>
>
>
>
>
>
>
>
>







 







|
>
>
>
>
>
>
>
>
>







 







|
|
|







 







|







 







|







 







|







40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
...
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157

158
159
160
161
162
163
164
165
166
...
300
301
302
303
304
305
306





















































307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
...
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690

691
692

693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815

816
817
818
819


820
821

822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843

844
845
846
847
848
849
850
851
852
853
854
855
856
857
...
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
....
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
....
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
....
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
....
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
....
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
....
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
....
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
**     8: Checksum value 1.
**    12: Checksum value 2.
*/

/* 
** WAL-INDEX FILE FORMAT
**
** The wal-index consists of a header region, followed by an 
** 8-byte region that contains no useful data (used to apply byte-range locks
** in some implementations), followed by the data region. 
**
** The contents of both the header and data region are specified in terms
** of 1, 2 and 4 byte unsigned integers. All integers are stored in 
** machine-endian order.  The wal-index is not a persistent file and
** so it does not need to be portable across archtectures.
................................................................................
  u8 isWindexOpen;           /* True if ShmOpen() called on pDbFd */
  WalIndexHdr hdr;           /* Wal-index for current snapshot */
  char *zWalName;            /* Name of WAL file */
};


/*
** This structure is used to implement an iterator that loops through
** all frames in the WAL in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the 
** frame most recently written to the WAL (in other words, the frame with
** the largest index).
**
** The internals of this structure are only accessed by:
**
**   walIteratorInit() - Create a new iterator,
**   walIteratorNext() - Step an iterator,
**   walIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see walCheckpoint()).
*/
struct WalIterator {
  int iPrior;           /* Last result returned from the iterator */
  int nSegment;         /* Size of the aSegment[] array */
  int nFinal;           /* Elements in aSegment[nSegment-1]  */
  struct WalSegment {
    int iNext;              /* Next slot in aIndex[] not previously returned */
    u8 *aIndex;             /* i0, i1, i2... such that aPgno[iN] ascending */

    u32 *aPgno;             /* 256 page numbers.  Pointer to Wal.pWiData */
  } aSegment[1];        /* One for every 256 entries in the WAL */
};


/*
** Generate an 8 byte checksum based on the data in array aByte[] and the
** initial values of aCksum[0] and aCksum[1]. The checksum is written into
** aCksum[] before returning.
................................................................................
  }

  *piPage = sqlite3Get4byte(&aFrame[0]);
  *pnTruncate = sqlite3Get4byte(&aFrame[4]);
  return 1;
}






















































/*
** Define the size of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
*/
#define HASHTABLE_NPAGE      4096
#define HASHTABLE_DATATYPE   u16

#define HASHTABLE_NSLOT     (HASHTABLE_NPAGE*2)
#define HASHTABLE_NBYTE     (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)

/*
** Return the index in the Wal.pWiData array that corresponds to 
** frame iFrame.
**
** Wal.pWiData is an array of u32 elements that is the wal-index.
** The array begins with a header and is then followed by alternating
** "map" and "hash-table" blocks.  Each "map" block consists of
** HASHTABLE_NPAGE u32 elements which are page numbers corresponding
** to frames in the WAL file.  
**
** This routine returns an index X such that Wal.pWiData[X] is part
** of a "map" block that contains the page number of the iFrame-th
** frame in the WAL file.
*/
static int walIndexEntry(u32 iFrame){
  return (
      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
    + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32)
    + (iFrame-1)
  );
................................................................................
    sqlite3_free(pRet);
  }else{
    *ppWal = pRet;
  }
  return rc;
}

/*
** Find the smallest page number out of all pages held in the WAL that
** has not been returned by any prior invocation of this method on the
** same WalIterator object.   Write into *piFrame the frame index where
** that page was last written into the WAL.  Write into *piPage the page
** number.
**
** Return 0 on success.  If there are no pages in the WAL with a page
** number larger than *piPage, then return 1.
*/
static int walIteratorNext(
  WalIterator *p,               /* Iterator */
  u32 *piPage,                  /* OUT: The page number of the next page */
  u32 *piFrame                  /* OUT: Wal frame index of next page */
){

  u32 iMin;                     /* Result pgno must be greater than iMin */
  u32 iRet = 0xFFFFFFFF;        /* 0xffffffff is never a valid page number */

  int i;                        /* For looping through segments */
  int nBlock = p->nFinal;       /* Number of entries in current segment */

  iMin = p->iPrior;
  assert( iMin<0xffffffff );
  for(i=p->nSegment-1; i>=0; i--){
    struct WalSegment *pSegment = &p->aSegment[i];
    while( pSegment->iNext<nBlock ){
      u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]];
      if( iPg>iMin ){
        if( iPg<iRet ){
          iRet = iPg;
          *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
        }
        break;
      }
      pSegment->iNext++;
    }
    nBlock = 256;
  }

  *piPage = p->iPrior = iRet;
  return (iRet==0xFFFFFFFF);
}


static void walMergesort8(
  Pgno *aContent,                 /* Pages in wal */
  u8 *aBuffer,                    /* Buffer of at least *pnList items to use */
  u8 *aList,                      /* IN/OUT: List to sort */
  int *pnList                     /* IN/OUT: Number of elements in aList[] */
){
  int nList = *pnList;
  if( nList>1 ){
    int nLeft = nList / 2;        /* Elements in left list */
    int nRight = nList - nLeft;   /* Elements in right list */
    u8 *aLeft = aList;            /* Left list */
    u8 *aRight = &aList[nLeft];   /* Right list */
    int iLeft = 0;                /* Current index in aLeft */
    int iRight = 0;               /* Current index in aright */
    int iOut = 0;                 /* Current index in output buffer */

    /* TODO: Change to non-recursive version. */
    walMergesort8(aContent, aBuffer, aLeft, &nLeft);
    walMergesort8(aContent, aBuffer, aRight, &nRight);

    while( iRight<nRight || iLeft<nLeft ){
      u8 logpage;
      Pgno dbpage;

      if( (iLeft<nLeft) 
       && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
      ){
        logpage = aLeft[iLeft++];
      }else{
        logpage = aRight[iRight++];
      }
      dbpage = aContent[logpage];

      aBuffer[iOut++] = logpage;
      if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;

      assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
      assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
    }
    memcpy(aList, aBuffer, sizeof(aList[0])*iOut);
    *pnList = iOut;
  }

#ifdef SQLITE_DEBUG
  {
    int i;
    for(i=1; i<*pnList; i++){
      assert( aContent[aList[i]] > aContent[aList[i-1]] );
    }
  }
#endif
}

/*
** Map the wal-index into memory owned by this thread, if it is not
** mapped already.  Then construct a WalInterator object that can be
** used to loop over all pages in the WAL in ascending order.  
**
** On success, make *pp point to the newly allocated WalInterator object
** return SQLITE_OK.  Otherwise, leave *pp unchanged and return an error
** code.
**
** The calling routine should invoke walIteratorFree() to destroy the
** WalIterator object when it has finished with it.  The caller must
** also unmap the wal-index.  But the wal-index must not be unmapped
** prior to the WalIterator object being destroyed.
*/
static int walIteratorInit(Wal *pWal, WalIterator **pp){
  u32 *aData;           /* Content of the wal-index file */
  WalIterator *p;       /* Return value */
  int nSegment;         /* Number of segments to merge */
  u32 iLast;            /* Last frame in log */
  int nByte;            /* Number of bytes to allocate */
  int i;                /* Iterator variable */
  int nFinal;           /* Number of unindexed entries */
  u8 *aTmp;             /* Temp space used by merge-sort */
  int rc;               /* Return code of walIndexMap() */
  u8 *aSpace;           /* Surplus space on the end of the allocation */

  /* Make sure the wal-index is mapped into local memory */
  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* This routine only runs while holding SQLITE_SHM_CHECKPOINT.  No other
  ** thread is able to write to shared memory while this routine is
  ** running (or, indeed, while the WalIterator object exists).  Hence,
  ** we can cast off the volatile qualifacation from shared memory
  */
  assert( pWal->lockState==SQLITE_SHM_CHECKPOINT );
  aData = (u32*)pWal->pWiData;

  /* Allocate space for the WalIterator object */
  iLast = pWal->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
  p = (WalIterator *)sqlite3_malloc(nByte);
  if( !p ){
    return SQLITE_NOMEM;


  }
  memset(p, 0, nByte);


  /* Initialize the WalIterator object.  Each 256-entry segment is
  ** presorted in order to make iterating through all entries much
  ** faster.
  */
  p->nSegment = nSegment;
  aSpace = (u8 *)&p->aSegment[nSegment];
  aTmp = &aSpace[nSegment*256];
  for(i=0; i<nSegment; i++){
    int j;
    int nIndex = (i==nSegment-1) ? nFinal : 256;
    p->aSegment[i].aPgno = &aData[walIndexEntry(i*256+1)];
    p->aSegment[i].aIndex = aSpace;
    for(j=0; j<nIndex; j++){
      aSpace[j] = j;
    }
    walMergesort8(p->aSegment[i].aPgno, aTmp, aSpace, &nIndex);
    memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex);
    aSpace += 256;
    p->nFinal = nIndex;
  }


  /* Return the fully initializd WalIterator object */
  *pp = p;
  return SQLITE_OK ;
}

/* 
** Free an iterator allocated by walIteratorInit().
*/
static void walIteratorFree(WalIterator *p){
  sqlite3_free(p);
}

/*
** Checkpoint the contents of the log file.
................................................................................
    }
    sqlite3_free(pWal);
  }
  return rc;
}

/*
** Try to read the wal-index header.  Return 0 on success and 1 if
** there is a problem.
**
** The wal-index is in shared memory.  Another thread or process might
** be writing the header at the same time this procedure is trying to
** read it, which might result in inconsistency.  A dirty read is detected
** by verifying a checksum on the header.
**
** If and only if the read is consistent and the header is different from
** pWal->hdr, then pWal->hdr is updated to the content of the new header
** and *pChanged is set to 1.
**
** If the checksum cannot be verified return non-zero. If the header
** is read successfully and the checksum verified, return zero.
*/
int walIndexTryHdr(Wal *pWal, int *pChanged){
  int i;
  volatile u32 *aWiData;
  u32 aCksum[2] = {1, 1};
  u32 aHdr[WALINDEX_HDR_NFIELD+2];

  assert( pWal->pWiData );
  if( pWal->szWIndex==0 ){
    /* The wal-index is of size 0 bytes. This is handled in the same way
    ** as an invalid header. The caller will run recovery to construct
................................................................................
  }

  /* Read the header. The caller may or may not have an exclusive 
  ** (WRITE, PENDING, CHECKPOINT or RECOVER) lock on the wal-index
  ** file, meaning it is possible that an inconsistent snapshot is read
  ** from the file. If this happens, return non-zero.
  */
  aWiData = pWal->pWiData;
  for(i=0; i<WALINDEX_HDR_NFIELD+2; i++){
    aHdr[i] = aWiData[i];
  }
  walChecksumBytes((u8*)aHdr, sizeof(u32)*WALINDEX_HDR_NFIELD, aCksum);
  if( aCksum[0]!=aHdr[WALINDEX_HDR_NFIELD]
   || aCksum[1]!=aHdr[WALINDEX_HDR_NFIELD+1]
  ){
    return 1;
  }

................................................................................
  }

  /* The header was successfully read. Return zero. */
  return 0;
}

/*
** Read the wal-index header from the wal-index and into pWal->hdr.
** If the wal-header appears to be corrupt, try to recover the log
** before returning.
**
** Set *pChanged to 1 if the wal-index header value in pWal->hdr is
** changed by this opertion.  If pWal->hdr is unchanged, set *pChanged
** to 0.
**
** This routine also maps the wal-index content into memory and assigns
** ownership of that mapping to the current thread.  In some implementations,
** only one thread at a time can hold a mapping of the wal-index.  Hence,
** the caller should strive to invoke walIndexUnmap() as soon as possible
** after this routine returns.
**
** If the wal-index header is successfully read, return SQLITE_OK. 
** Otherwise an SQLite error code.
*/
static int walIndexReadHdr(Wal *pWal, int *pChanged){
  int rc;                         /* Return code */
  int lockState;                  /* pWal->lockState before running recovery */
................................................................................
    }
  }

  return rc;
}

/*
** Take a snapshot of the state of the WAL and wal-index for the current
** instant in time.  The current thread will continue to use this snapshot.
** Other threads might containing appending to the WAL and wal-index but
** the extra content appended will be ignored by the current thread.
**
** A snapshot is like a read transaction.
**
** No other threads are allowed to run a checkpoint while this thread is
** holding the snapshot since a checkpoint would remove data out from under
** this thread.
**
** If this call obtains a new read-lock and the database contents have been
** modified since the most recent call to WalCloseSnapshot() on this Wal
** connection, then *pChanged is set to 1 before returning. Otherwise, it 
** is left unmodified. This is used by the pager layer to determine whether 
** or not any cached pages may be safely reused.
*/
................................................................................
  int nLast = 0;                  /* Number of extra copies of last page */

  assert( WAL_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) );
  assert( pList );
  assert( pWal->lockState==SQLITE_SHM_WRITE );
  assert( pWal->pWiData==0 );

  /* If this is the first frame written into the log, write the WAL
  ** header to the start of the WAL file. See comments at the top of
  ** this source file for a description of the WAL header format.
  */
  assert( WAL_FRAME_HDRSIZE>=WAL_HDRSIZE );
  iFrame = pWal->hdr.iLastPg;
  if( iFrame==0 ){
    sqlite3Put4byte(aFrame, nPgsz);
    sqlite3_randomness(8, &aFrame[4]);
    pWal->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]);
................................................................................
    walEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame);
    rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    /* Write the page data */
    rc = sqlite3OsWrite(pWal->pWalFd, p->pData, nPgsz, iOffset+sizeof(aFrame));
    if( rc!=SQLITE_OK ){
      return rc;
    }
    pLast = p;
  }

  /* Sync the log file if the 'isSync' flag was specified. */
................................................................................
    }

    rc = sqlite3OsSync(pWal->pWalFd, sync_flags);
  }
  assert( pWal->pWiData==0 );

  /* Append data to the wal-index. It is not necessary to lock the 
  ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index
  ** guarantees that there are no other writers, and no data that may
  ** be in use by existing readers is being overwritten.
  */
  iFrame = pWal->hdr.iLastPg;
  for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
    iFrame++;
    rc = walIndexAppend(pWal, iFrame, p->pgno);
................................................................................
  /* Copy data from the log to the database file. */
  rc = walIndexReadHdr(pWal, &isChanged);
  if( rc==SQLITE_OK ){
    rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf);
  }
  if( isChanged ){
    /* If a new wal-index header was loaded before the checkpoint was 
    ** performed, then the pager-cache associated with pWal is now
    ** out of date. So zero the cached wal-index header to ensure that
    ** next time the pager opens a snapshot on this database it knows that
    ** the cache needs to be reset.
    */
    memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  }