/ Check-in [40b0a635]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Modify the format of the wal-index to use a hash table to index log file segments.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 40b0a6357b160e04326ab176955a68a1cf3f8b7c
User & Date: dan 2010-05-10 14:46:09
References
2010-05-10
18:10
Merge [96d6eaf4d2] and [40b0a6357b]. check-in: c67756c4 user: dan tags: trunk
Context
2010-05-10
18:10
Merge [96d6eaf4d2] and [40b0a6357b]. check-in: c67756c4 user: dan tags: trunk
14:46
Modify the format of the wal-index to use a hash table to index log file segments. check-in: 40b0a635 user: dan tags: trunk
14:10
If an ATTACH command fails due to OP_JournalMode but still attaches the database, make sure VACUUM still detaches it when done. check-in: 6ecdc7ba user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/wal.c.

352
353
354
355
356
357
358










359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379

380
381
382

383
384
385
386
387
388
389
...
436
437
438
439
440
441
442


















































443
444
445
446
447
448
449
450
451
452
453
454


455
456
457
458
459

460
461
462
463
464
465
466
467
468
469
470
471
472
473




474
475
476
477
478
479

480




481
482
483
484






485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
...
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
...
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720

721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737

738
739
740
741
742
743
744
....
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033










1034
1035
1036
1037
1038
1039
1040
1041
1042
1043















































1044
1045
1046
1047
1048
1049
1050
1051





1052





1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068




1069
1070
1071
1072
1073
1074

1075
1076


1077

1078
1079

1080
1081
1082

1083
1084
1085
1086

1087
1088
1089
1090
1091
1092
1093
    for(i=1; i<*pnList; i++){
      assert( aContent[aList[i]] > aContent[aList[i-1]] );
    }
  }
#endif
}












/*
** Return the index in the WalIndex.aData array that corresponds to 
** frame iFrame. The wal-index file consists of a header, followed by
** alternating "map" and "index" blocks.
*/
static int walIndexEntry(u32 iFrame){
  return (
      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
    + (((iFrame-1)>>8)<<6)        /* Indexes that occur before iFrame */
    + iFrame-1                    /* Db page numbers that occur before iFrame */
  );
}

/*
** Return the minimum mapping size in bytes that can be used to read the
** wal-index up to and including frame iFrame. If iFrame is the last frame
** in a block of 256 frames, the returned byte-count includes the space
** required by the 256-byte index block.
*/
static int walMappingSize(u32 iFrame){

  return ( WALINDEX_LOCK_OFFSET + WALINDEX_LOCK_RESERVED 
         + iFrame*sizeof(u32)
         + (iFrame>>8)*256

  );
}

/*
** Release our reference to the wal-index memory map, if we are holding
** it.
*/
................................................................................
  return rc;
}

/*
** Increment by which to increase the wal-index file size.
*/
#define WALINDEX_MMAP_INCREMENT (64*1024)



















































/*
** Set an entry in the wal-index map to map log frame iFrame to db 
** page iPage. Values are always appended to the wal-index (i.e. the
** value of iFrame is always exactly one more than the value passed to
** the previous call), but that restriction is not enforced or asserted
** here.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;
  u32 iSlot = walIndexEntry(iFrame);
  


  rc = walIndexMap(pWal, -1);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  while( ((iSlot+128)*sizeof(u32))>=pWal->szWIndex ){

    int nByte = pWal->szWIndex + WALINDEX_MMAP_INCREMENT;

    /* Enlarge the storage, then remap it. */
    rc = walIndexRemap(pWal, nByte);
    if( rc!=SQLITE_OK ){
      return rc;
    }
  }

  /* Set the wal-index entry itself */
  pWal->pWiData[iSlot] = iPage;

  /* If the frame number is a multiple of 256 (frames are numbered starting
  ** at 1), build an index of the most recently added 256 frames.




  */
  if( (iFrame&0x000000FF)==0 ){
    int i;                        /* Iterator used while initializing aIndex */
    u32 *aFrame;                  /* Pointer to array of 256 frames */
    int nIndex;                   /* Number of entries in index */
    u8 *aIndex;                   /* 256 bytes to build index in */

    u8 *aTmp;                     /* Scratch space to use while sorting */





    aFrame = &pWal->pWiData[iSlot-255];
    aIndex = (u8 *)&pWal->pWiData[iSlot+1];
    aTmp = &aIndex[256];







    nIndex = 256;
    for(i=0; i<256; i++) aIndex[i] = (u8)i;
    walMergesort8(aFrame, aTmp, aIndex, &nIndex);
    memset(&aIndex[nIndex], aIndex[nIndex-1], 256-nIndex);
  }

  return SQLITE_OK;
}


/*
** Recover the wal-index by reading the write-ahead log file. 
** The caller must hold RECOVER lock on the wal-index file.
*/
................................................................................
          iRet = iPg;
          *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
        }
        break;
      }
      pSegment->iNext++;
    }

    nBlock = 256;
  }

  *piPage = iRet;
  return (iRet==0xFFFFFFFF);
}

................................................................................
  u32 *aData;                     /* Content of the wal-index file */
  WalIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  int nFinal;                     /* Number of unindexed entries */
  struct WalSegment *pFinal;      /* Final (unindexed) segment */
  u8 *aTmp;                       /* Temp space used by merge-sort */
  int rc;                         /* Return code of walIndexMap() */

  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
  if( rc!=SQLITE_OK ){
    return rc;
  }
  aData = pWal->pWiData;
  iLast = pWal->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(WalIterator) + (nSegment-1)*sizeof(struct WalSegment) + 512;
  p = (WalIterator *)sqlite3_malloc(nByte);
  if( !p ){
    rc = SQLITE_NOMEM;
  }else{

    memset(p, 0, nByte);
    p->nSegment = nSegment;

    for(i=0; i<nSegment-1; i++){
      p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
      p->aSegment[i].aIndex = (u8 *)&aData[walIndexEntry(i*256+1)+256];
    }
    pFinal = &p->aSegment[nSegment-1];
  
    pFinal->aDbPage = &aData[walIndexEntry((nSegment-1)*256+1)];
    pFinal->aIndex = (u8 *)&pFinal[1];
    aTmp = &pFinal->aIndex[256];
    for(i=0; i<nFinal; i++){
      pFinal->aIndex[i] = i;
    }
    walMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
    p->nFinal = nFinal;

  }

  *pp = p;
  return rc;
}

/* 
................................................................................
  walSetLock(pWal, SQLITE_SHM_UNLOCK);
}

/*
** Read a page from the log, if it is present. 
*/
int sqlite3WalRead(
  Wal *pWal, 
  Pgno pgno, 
  int *pInWal, 
  int nOut,
  u8 *pOut
){
  int rc;                         /* Return code */
  u32 iRead = 0;
  u32 *aData; 
  int iFrame = (pWal->hdr.iLastPg & 0xFFFFFF00);











  assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* Do a linear search of the unindexed block of page-numbers (if any) 
  ** at the end of the wal-index. An alternative to this would be to
  ** build an index in private memory each time a read transaction is
  ** opened on a new snapshot.















































  */
  aData = pWal->pWiData;
  if( pWal->hdr.iLastPg ){
    u32 *pi = &aData[walIndexEntry(pWal->hdr.iLastPg)];
    u32 *piStop = pi - (pWal->hdr.iLastPg & 0xFF);
    while( *pi!=pgno && pi!=piStop ) pi--;
    if( pi!=piStop ){
      iRead = (pi-piStop) + iFrame;





    }





  }
  assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );

  while( iRead==0 && iFrame>0 ){
    int iLow = 0;
    int iHigh = 255;
    u32 *aFrame;
    u8 *aIndex;

    iFrame -= 256;
    aFrame = &aData[walIndexEntry(iFrame+1)];
    aIndex = (u8 *)&aFrame[256];

    while( iLow<=iHigh ){
      int iTest = (iLow+iHigh)>>1;
      u32 iPg = aFrame[aIndex[iTest]];





      if( iPg==pgno ){
        iRead = iFrame + 1 + aIndex[iTest];
        break;
      }
      else if( iPg<pgno ){

        iLow = iTest+1;
      }else{


        iHigh = iTest-1;

      }
    }

  }
  assert( iRead==0 || aData[walIndexEntry(iRead)]==pgno );
  walIndexUnmap(pWal);


  /* If iRead is non-zero, then it is the log frame number that contains the
  ** required page. Read and return data from the log file.
  */

  if( iRead ){
    i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE;
    *pInWal = 1;
    return sqlite3OsRead(pWal->pFd, pOut, nOut, iOffset);
  }

  *pInWal = 0;







>
>
>
>
>
>
>
>
>
>









|
|










>
|
<
|
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>









|
|

>
>

<
<
<
<
>

<
<

<
<
|
|
<
<
<
<
<
<
>
>
>
>

<
<
<
<
<
>
|
>
>
>
>

<
<
<
>
>
>
>
>
>
|
<
<
<
<
|
<
|







 







<







 







<












|




>



|
|
|
|
|
|
|
|
|
|
|
|
|
|
>







 







|
|
|
|
|


|
|
|

>
>
>
>
>
>
>
>
>
>

|




<
<
<
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

<
<
<
<
<
<
<
>
>
>
>
>
|
>
>
>
>
>
|
<
|
<
<
<
<
<
|
<
|
<

<
<
<
>
>
>
>
|
<
<
<
<
<
>
|
<
>
>
|
>


>

<
<
>




>







352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391

392
393
394
395
396
397
398
399
400
...
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518




519
520


521


522
523






524
525
526
527
528





529
530
531
532
533
534
535



536
537
538
539
540
541
542




543

544
545
546
547
548
549
550
551
...
732
733
734
735
736
737
738

739
740
741
742
743
744
745
...
747
748
749
750
751
752
753

754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
....
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101




1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149







1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161

1162





1163

1164

1165



1166
1167
1168
1169
1170





1171
1172

1173
1174
1175
1176
1177
1178
1179
1180


1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
    for(i=1; i<*pnList; i++){
      assert( aContent[aList[i]] > aContent[aList[i-1]] );
    }
  }
#endif
}

/*
** Define the size of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
*/
#define HASHTABLE_NPAGE      4096
#define HASHTABLE_DATATYPE   u16

#define HASHTABLE_NSLOT     (HASHTABLE_NPAGE*2)
#define HASHTABLE_NBYTE     (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)

/*
** Return the index in the WalIndex.aData array that corresponds to 
** frame iFrame. The wal-index file consists of a header, followed by
** alternating "map" and "index" blocks.
*/
static int walIndexEntry(u32 iFrame){
  return (
      (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32)
    + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32)
    + (iFrame-1)
  );
}

/*
** Return the minimum mapping size in bytes that can be used to read the
** wal-index up to and including frame iFrame. If iFrame is the last frame
** in a block of 256 frames, the returned byte-count includes the space
** required by the 256-byte index block.
*/
static int walMappingSize(u32 iFrame){
  const int nByte = (sizeof(u32)*HASHTABLE_NPAGE + HASHTABLE_NBYTE) ;
  return ( WALINDEX_LOCK_OFFSET 

         + WALINDEX_LOCK_RESERVED 
         + nByte * ((iFrame + HASHTABLE_NPAGE - 1)/HASHTABLE_NPAGE)
  );
}

/*
** Release our reference to the wal-index memory map, if we are holding
** it.
*/
................................................................................
  return rc;
}

/*
** Increment by which to increase the wal-index file size.
*/
#define WALINDEX_MMAP_INCREMENT (64*1024)

static int walHashKey(u32 iPage){
  return (iPage*2) % (HASHTABLE_NSLOT-1);
}


/* 
** Find the hash table and (section of the) page number array used to
** store data for WAL frame iFrame.
**
** Set output variable *paHash to point to the start of the hash table
** in the wal-index file. Set *piZero to one less than the frame 
** number of the first frame indexed by this hash table. If a
** slot in the hash table is set to N, it refers to frame number 
** (*piZero+N) in the log.
**
** Finally, set *paPgno such that for all frames F between (*piZero+1) and 
** (*piZero+HASHTABLE_NPAGE), (*paPgno)[F] is the database page number 
** associated with frame F.
*/
static void walHashFind(
  Wal *pWal,                      /* WAL handle */
  u32 iFrame,                     /* Find the hash table indexing this frame */
  HASHTABLE_DATATYPE **paHash,    /* OUT: Pointer to hash index */
  u32 **paPgno,                   /* OUT: Pointer to page number array */
  u32 *piZero                     /* OUT: Frame associated with *paPgno[0] */
){
  u32 iZero;
  u32 *aPgno;
  HASHTABLE_DATATYPE *aHash;

  iZero = ((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NPAGE;
  aPgno = &pWal->pWiData[walIndexEntry(iZero+1)-iZero-1];
  aHash = (HASHTABLE_DATATYPE *)&aPgno[iZero+HASHTABLE_NPAGE+1];

  /* Assert that:
  **
  **   + the mapping is large enough for this hash-table, and
  **
  **   + that aPgno[iZero+1] really is the database page number associated
  **     with the first frame indexed by this hash table.
  */
  assert( (u32*)(&aHash[HASHTABLE_NSLOT])<=&pWal->pWiData[pWal->szWIndex/4] );
  assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) );

  *paHash = aHash;
  *paPgno = aPgno;
  *piZero = iZero;
}


/*
** Set an entry in the wal-index map to map log frame iFrame to db 
** page iPage. Values are always appended to the wal-index (i.e. the
** value of iFrame is always exactly one more than the value passed to
** the previous call), but that restriction is not enforced or asserted
** here.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
  int nMapping;                   /* Required mapping size in bytes */
  
  /* Make sure the wal-index is mapped. Enlarge the mapping if required. */
  nMapping = walMappingSize(iFrame);
  rc = walIndexMap(pWal, -1);




  while( rc==SQLITE_OK && nMapping>pWal->szWIndex ){
    int nByte = pWal->szWIndex + WALINDEX_MMAP_INCREMENT;


    rc = walIndexRemap(pWal, nByte);


  }







  /* Assuming the wal-index file was successfully mapped, find the hash 
  ** table and section of of the page number array that pertain to frame 
  ** iFrame of the WAL. Then populate the page number array and the hash 
  ** table entry.
  */





  if( rc==SQLITE_OK ){
    int iKey;                     /* Hash table key */
    u32 iZero;                    /* One less than frame number of aPgno[1] */
    u32 *aPgno;                   /* Page number array */
    HASHTABLE_DATATYPE *aHash;    /* Hash table */
    int idx;                      /* Value to write to hash-table slot */




    walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
    idx = iFrame - iZero;
    if( idx==1 ) memset(aHash, 0, HASHTABLE_NBYTE);
    aPgno[iFrame] = iPage;
    for(iKey=walHashKey(iPage); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT);
    aHash[iKey] = idx;
  }






  return rc;
}


/*
** Recover the wal-index by reading the write-ahead log file. 
** The caller must hold RECOVER lock on the wal-index file.
*/
................................................................................
          iRet = iPg;
          *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext];
        }
        break;
      }
      pSegment->iNext++;
    }

    nBlock = 256;
  }

  *piPage = iRet;
  return (iRet==0xFFFFFFFF);
}

................................................................................
  u32 *aData;                     /* Content of the wal-index file */
  WalIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  int nFinal;                     /* Number of unindexed entries */

  u8 *aTmp;                       /* Temp space used by merge-sort */
  int rc;                         /* Return code of walIndexMap() */

  rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg));
  if( rc!=SQLITE_OK ){
    return rc;
  }
  aData = pWal->pWiData;
  iLast = pWal->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256);
  p = (WalIterator *)sqlite3_malloc(nByte);
  if( !p ){
    rc = SQLITE_NOMEM;
  }else{
    u8 *aSpace;
    memset(p, 0, nByte);
    p->nSegment = nSegment;

    aSpace = (u8 *)&p->aSegment[nSegment];
    aTmp = &aSpace[nSegment*256];
    for(i=0; i<nSegment; i++){
      int j;
      int nIndex = (i==nSegment-1) ? nFinal : 256;
      p->aSegment[i].aDbPage = &aData[walIndexEntry(i*256+1)];
      p->aSegment[i].aIndex = aSpace;
      for(j=0; j<nIndex; j++){
        aSpace[j] = j;
      }
      walMergesort8(p->aSegment[i].aDbPage, aTmp, aSpace, &nIndex);
      memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex);
      aSpace += 256;
      p->nFinal = nIndex;
    }
  }

  *pp = p;
  return rc;
}

/* 
................................................................................
  walSetLock(pWal, SQLITE_SHM_UNLOCK);
}

/*
** Read a page from the log, if it is present. 
*/
int sqlite3WalRead(
  Wal *pWal,                      /* WAL handle */
  Pgno pgno,                      /* Database page number to read data for */
  int *pInWal,                    /* OUT: True if data is read from WAL */
  int nOut,                       /* Size of buffer pOut in bytes */
  u8 *pOut                        /* Buffer to write page data to */
){
  int rc;                         /* Return code */
  u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
  u32 iLast = pWal->hdr.iLastPg;  /* Last page in WAL for this reader */
  int iHash;                      /* Used to loop through N hash tables */

  /* If the "last page" field of the wal-index header snapshot is 0, then
  ** no data will be read from the wal under any circumstances. Return early
  ** in this case to avoid the walIndexMap/Unmap overhead.
  */
  if( iLast==0 ){
    *pInWal = 0;
    return SQLITE_OK;
  }

  /* Ensure the wal-index is mapped. */
  assert( pWal->lockState==SQLITE_SHM_READ||pWal->lockState==SQLITE_SHM_WRITE );
  rc = walIndexMap(pWal, walMappingSize(iLast));
  if( rc!=SQLITE_OK ){
    return rc;
  }





  /* Search the hash table or tables for an entry matching page number
  ** pgno. Each iteration of the following for() loop searches one
  ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
  **
  ** This code may run concurrently to the code in walIndexAppend()
  ** that adds entries to the wal-index (and possibly to this hash 
  ** table). This means the non-zero value just read from the hash 
  ** slot (aHash[iKey]) may have been added before or after the 
  ** current read transaction was opened. Values added after the
  ** read transaction was opened may have been written incorrectly -
  ** i.e. these slots may contain garbage data. However, we assume
  ** that any slots written before the current read transaction was
  ** opened remain unmodified.
  **
  ** For the reasons above, the if(...) condition featured in the inner
  ** loop of the following block is more stringent that would be required 
  ** if we had exclusive access to the hash-table:
  **
  **   (aPgno[iFrame]==pgno): 
  **     This condition filters out normal hash-table collisions.
  **
  **   (iFrame<=iLast): 
  **     This condition filters out entries that were added to the hash
  **     table after the current read-transaction had started.
  **
  **   (iFrame>iRead): 
  **     This filters out a dangerous class of garbage data. The 
  **     garbage hash slot may refer to a frame with the correct page 
  **     number, but not the most recent version of the frame. For
  **     example, if at the start of the read-transaction the log 
  **     contains three copies of the desired page in frames 2, 3 and 4,
  **     the hash table may contain the following:
  **
  **       { ..., 2, 3, 4, 0, 0, ..... }
  **
  **     The correct answer is to read data from frame 4. But a 
  **     dirty-read may potentially cause the hash-table to appear as 
  **     follows to the reader:
  **
  **       { ..., 2, 3, 4, 3, 0, ..... }
  **
  **     Without this part of the if(...) clause, the reader might
  **     incorrectly read data from frame 3 instead of 4. This would be
  **     an error.
  **
  ** It is not actually clear to the developers that such a dirty-read
  ** can occur. But if it does, it should not cause any problems.
  */







  for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){
    HASHTABLE_DATATYPE *aHash;    /* Pointer to hash table */
    u32 *aPgno;                   /* Pointer to array of page numbers */
    u32 iZero;                    /* Frame number corresponding to aPgno[0] */
    int iKey;                     /* Hash slot index */

    walHashFind(pWal, iHash, &aHash, &aPgno, &iZero);
    for(iKey=walHashKey(pgno); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT){
      u32 iFrame = aHash[iKey] + iZero;
      if( iFrame<=iLast && aPgno[iFrame]==pgno && iFrame>iRead ){
        iRead = iFrame;
      }

    }





  }

  assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno );





#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  /* If expensive assert() statements are available, do a linear search
  ** of the wal-index file content. Make sure the results agree with the
  ** result obtained using the hash indexes above.  */
  {





    u32 iRead2 = 0;
    u32 iTest;

    for(iTest=iLast; iTest>0; iTest--){
      if( pWal->pWiData[walIndexEntry(iTest)]==pgno ){
        iRead2 = iTest;
        break;
      }
    }
    assert( iRead==iRead2 );
  }


#endif

  /* If iRead is non-zero, then it is the log frame number that contains the
  ** required page. Read and return data from the log file.
  */
  walIndexUnmap(pWal);
  if( iRead ){
    i64 iOffset = walFrameOffset(iRead, pWal->hdr.pgsz) + WAL_FRAME_HDRSIZE;
    *pInWal = 1;
    return sqlite3OsRead(pWal->pFd, pOut, nOut, iOffset);
  }

  *pInWal = 0;