SQLite4
Check-in [3c32332c59]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix various multi-client bugs preventing the multi-threaded tests from passing.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 3c32332c592c584f96a09dd415787fde5d8b2400
User & Date: dan 2013-11-04 18:21:18
Context
2013-11-07
20:22
Begin adding tests to check that bt database transactions are robust in the face of system failure. check-in: 5d57889261 user: dan tags: trunk
2013-11-04
18:21
Fix various multi-client bugs preventing the multi-threaded tests from passing. check-in: 3c32332c59 user: dan tags: trunk
2013-11-02
18:28
Fix a problem with skipping past a section of log still in use. check-in: f96d760355 user: dan tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/btInt.h.

282
283
284
285
286
287
288


289
290
291


292
293
294

/*
** End of utility interface.
*************************************************************************/

/*
** Debugging hooks.
**
** When NDEBUG is defined, sqlite4BtDebugReadPage() expands to nothing and
** btErrorBkpt(x) simply evaluates to its argument. Otherwise both are real
** functions defined elsewhere.
**
** The expansion of btErrorBkpt() is parenthesised so that it behaves like
** a function call inside any larger expression.
** NOTE(review): btErrorBkpt() is presumably a convenient place to set a
** debugger breakpoint on error codes - confirm against its definition.
*/
#ifdef NDEBUG
# define sqlite4BtDebugReadPage(a,b,c,d)


# define btErrorBkpt(x) (x)
#else
void sqlite4BtDebugReadPage(BtLock *pLock, u32 pgno, u8 *aData, int pgsz);


int btErrorBkpt(int rc);
#endif








>
>



>
>



282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298

/*
** End of utility interface.
*************************************************************************/

/*
** Debugging hooks.
**
** When NDEBUG is defined, the three sqlite4BtDebugXXX() trace hooks expand
** to nothing and btErrorBkpt(x) simply evaluates to its argument. Otherwise
** all four are real functions defined elsewhere.
**
** Because the trace hooks are function-like macros under NDEBUG, the
** definitions of the corresponding functions must themselves be compiled
** only when NDEBUG is not defined.
**
** The expansion of btErrorBkpt() is parenthesised so that it behaves like
** a function call inside any larger expression.
** NOTE(review): btErrorBkpt() is presumably a convenient place to set a
** debugger breakpoint on error codes - confirm against its definition.
*/
#ifdef NDEBUG
# define sqlite4BtDebugReadPage(a,b,c,d)
# define sqlite4BtDebugKV(a,b,c,d,e,f)
# define sqlite4BtDebugReadlock(a,b,c)
# define btErrorBkpt(x) (x)
#else
void sqlite4BtDebugReadPage(BtLock *pLock, u32 pgno, u8 *aData, int pgsz);
void sqlite4BtDebugKV(BtLock*, const char*,u8 *pK, int nK, u8 *pV, int nV);
void sqlite4BtDebugReadlock(BtLock *pLock, u32 iFirst, u32 iLast);
int btErrorBkpt(int rc);
#endif

Changes to src/bt_lock.c.

468
469
470
471
472
473
474


475
476
477
478
479
480
481
...
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
...
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
        rc = btLockLockop(pLock, BT_LOCK_READER0 + i, BT_LOCK_SHARED, 0);
        if( rc==SQLITE4_OK ){
          int iSF = sqlite4BtLogFrameToIdx(aLog, aSlot[i].iFirst);
          int iSL = sqlite4BtLogFrameToIdx(aLog, aSlot[i].iLast);
          if( iSF>iIdxFirst || iSL>iIdxLast || iSF<0 || iSL<0 ){
            btLockLockop(pLock, BT_LOCK_READER0 + i, BT_LOCK_UNLOCK, 0);
            rc = SQLITE4_BUSY;


          }
        }else if( rc==SQLITE4_BUSY && nAttempt>(nMaxRetry/2) ){
          btLockDelay();
        }
      }
    }
  }
................................................................................
  BtLock *pLock,                  /* Lock handle */
  u32 *aLog,                      /* Current log topology */
  BtReadSlot *aSlot,              /* Array of BT_NREADER read slots */
  u32 *piOut,                     /* OUT: Query result */
  int *piDblocked                 /* OUT: True if READER_DB_ONLY is locked */
){
  u32 iOut = 0;
  u32 iIdxOut = 0;
  int bLast = (piDblocked!=0);
  int rc = SQLITE4_OK;
  int i;

  if( piDblocked ){
    rc = btLockLockop(pLock, BT_LOCK_READER_DBONLY, BT_LOCK_EXCL, 0);
    if( rc==SQLITE4_OK ){
................................................................................
        rc = btLockLockop(pLock, BT_LOCK_READER0 + iSlot, BT_LOCK_EXCL, 0);
        if( rc==SQLITE4_OK ){
          aSlot[iSlot].iFirst = 0;
          aSlot[iSlot].iLast = 0;
          btLockLockop(pLock, BT_LOCK_READER0 + iSlot, BT_LOCK_UNLOCK, 0);
        }else if( rc==SQLITE4_BUSY ){
          int iIdx = sqlite4BtLogFrameToIdx(aLog, iVal);
          if( iOut==0 || iIdx<iIdxOut ){
            iIdxOut = iIdx;
            iOut = iVal;
          }
        }else{
          return rc;
        }
      }







>
>







 







|







 







|







468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
...
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
...
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
        rc = btLockLockop(pLock, BT_LOCK_READER0 + i, BT_LOCK_SHARED, 0);
        if( rc==SQLITE4_OK ){
          int iSF = sqlite4BtLogFrameToIdx(aLog, aSlot[i].iFirst);
          int iSL = sqlite4BtLogFrameToIdx(aLog, aSlot[i].iLast);
          if( iSF>iIdxFirst || iSL>iIdxLast || iSF<0 || iSL<0 ){
            btLockLockop(pLock, BT_LOCK_READER0 + i, BT_LOCK_UNLOCK, 0);
            rc = SQLITE4_BUSY;
          }else{
            sqlite4BtDebugReadlock(pLock, aSlot[i].iFirst, aSlot[i].iLast);
          }
        }else if( rc==SQLITE4_BUSY && nAttempt>(nMaxRetry/2) ){
          btLockDelay();
        }
      }
    }
  }
................................................................................
  BtLock *pLock,                  /* Lock handle */
  u32 *aLog,                      /* Current log topology */
  BtReadSlot *aSlot,              /* Array of BT_NREADER read slots */
  u32 *piOut,                     /* OUT: Query result */
  int *piDblocked                 /* OUT: True if READER_DB_ONLY is locked */
){
  u32 iOut = 0;
  int iIdxOut = 0;
  int bLast = (piDblocked!=0);
  int rc = SQLITE4_OK;
  int i;

  if( piDblocked ){
    rc = btLockLockop(pLock, BT_LOCK_READER_DBONLY, BT_LOCK_EXCL, 0);
    if( rc==SQLITE4_OK ){
................................................................................
        rc = btLockLockop(pLock, BT_LOCK_READER0 + iSlot, BT_LOCK_EXCL, 0);
        if( rc==SQLITE4_OK ){
          aSlot[iSlot].iFirst = 0;
          aSlot[iSlot].iLast = 0;
          btLockLockop(pLock, BT_LOCK_READER0 + iSlot, BT_LOCK_UNLOCK, 0);
        }else if( rc==SQLITE4_BUSY ){
          int iIdx = sqlite4BtLogFrameToIdx(aLog, iVal);
          if( iIdx>=0 && (iOut==0 || iIdx<iIdxOut) ){
            iIdxOut = iIdx;
            iOut = iVal;
          }
        }else{
          return rc;
        }
      }

Changes to src/bt_log.c.

241
242
243
244
245
246
247

248
249
250
251
252
253
254
255










256
257
258
259
260
261
262
263
264
265
266
267
268
269
270





















271
272
273
274
275
276
277
278
279
280
281
282
283
284
...
305
306
307
308
309
310
311

































312
313
314
315
316
317
318
319
320
321
322
323
...
775
776
777
778
779
780
781

782
783
784
785
786
787
788
...
930
931
932
933
934
935
936
937
938
939



940
941
942
943
944
945
946
...
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
....
1195
1196
1197
1198
1199
1200
1201

















1202
1203
1204
1205
1206
1207
1208
....
1236
1237
1238
1239
1240
1241
1242






1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
....
1304
1305
1306
1307
1308
1309
1310




1311
1312
1313
1314
1315
1316
1317
....
1332
1333
1334
1335
1336
1337
1338


1339
1340
1341
1342
1343
1344
1345
....
1439
1440
1441
1442
1443
1444
1445




1446
1447
1448
1449
1450
1451







1452
1453
1454
1455
1456
1457
1458
....
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483







1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
....
1557
1558
1559
1560
1561
1562
1563
1564
1565










1566
1567
1568
1569
1570
1571
1572
1573
1574
1575

1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602

1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637

1638
1639
1640
1641
1642
1643
1644
){
  assert( (nByte&0x00000007)==4 && nByte>=8 );
  btLogChecksum(nativeCksum, a, 8, aIn, aOut);
  btLogChecksum(nativeCksum, &a[4], nByte-4, aOut, aOut);
}

#define BT_PAGE_DEBUG 0


/*
** Debugging aid: print the six entries of the log topology array aLog[]
** to stderr as three "first..last" pairs, prefixed by the debug id of
** pLock, the caller-supplied label zStr and the hash-table side iSide.
**
** The body is compiled only when BT_PAGE_DEBUG is non-zero; otherwise
** this function does nothing.
*/
static void btDebugTopology(BtLock *pLock, char *zStr, int iSide, u32 *aLog){
#if BT_PAGE_DEBUG
  fprintf(stderr, "%d:%s: (side=%d) %d..%d  %d..%d  %d..%d\n", 
      pLock->iDebugId, zStr, iSide,
      (int)aLog[0], (int)aLog[1], (int)aLog[2], 
      (int)aLog[3], (int)aLog[4], (int)aLog[5]
  );










  /* Flush immediately so the trace line is not held back in the buffer. */
  fflush(stderr);
#endif
}

/*
** Sanity check on a snapshot header: if iNextFrame is 1, then the first
** four entries of aLog[] must all be zero.
** NOTE(review): iNextFrame==1 presumably means no frame has been written
** to the log yet - confirm against the BtShmHdr definition.
**
** In NDEBUG builds this is a no-op macro. The macro takes exactly one
** argument, matching both the function above it and every call site.
*/
#ifndef NDEBUG
static void btDebugCheckSnapshot(BtShmHdr *pHdr){
  u32 *aLog = pHdr->aLog;
  assert( pHdr->iNextFrame!=1 ||
      (aLog[0]==0 && aLog[1]==0 && aLog[2]==0 && aLog[3]==0)
  );
}
#else
#define btDebugCheckSnapshot(x)
#endif






















/*
** Debugging aid: report on stderr that page pgno is being checkpointed.
** Each line carries a running call counter, the page number and the
** two-word result of btLogChecksum() over the pgsz bytes at aData.
**
** The body is compiled only when BT_PAGE_DEBUG is non-zero.
*/
static void btDebugCkptPage(u32 pgno, u8 *aData, int pgsz){
#if BT_PAGE_DEBUG
  static int nTrace = 0;          /* Number of lines printed so far */
  u32 aSum[2];                    /* Checksum of the page image */
  int iTrace = nTrace++;          /* Sequence number of this line */

  btLogChecksum(1, aData, pgsz, 0, aSum);
  fprintf(stderr, "%d: Ckpt page %d (cksum=%08x%08x)\n",
      iTrace, (int)pgno, aSum[0], aSum[1]);
  fflush(stderr);
#endif
}

static void btDebugLogPage(
    BtLock *pLock, u32 pgno, u32 iFrame, u8 *aData, int pgsz, int bCommit
................................................................................
      nCall++, (int)pgno, aCksum[0], aCksum[1]
  );
  fflush(stderr);
#endif
}
#endif


































/*
** Debugging aid: report on stderr the outcome of searching the log for
** page pgno. iFrame is the frame value supplied by the caller. The line
** is prefixed with the debug id of pLock and a running call counter.
**
** The body is compiled only when BT_PAGE_DEBUG is non-zero.
*/
static void btDebugLogSearch(BtLock *pLock, u32 pgno, u32 iFrame){
#if BT_PAGE_DEBUG
  static int nTrace = 0;          /* Number of lines printed so far */
  int iTrace = nTrace++;          /* Sequence number of this line */

  fprintf(stderr, "%d:%d: Search log for page %d - frame %d\n",
      pLock->iDebugId, iTrace, (int)pgno, (int)iFrame);
  fflush(stderr);
#endif
}

static void btDebugSetPgno(
    BtLock *pLock, int iHash, u32 *aPgno, int iFrame, int iZero, u32 pgno
................................................................................
      /* One or more transactions were recovered from the log file. */
      BtShm *pShm = btLogShm(pLog);
      pShm->ckpt.iWalHdr = (iSlot<<2) + pHdr->iCnt;
      pShm->ckpt.iFirstRead = pHdr->iFirstFrame;
      pShm->ckpt.iFirstRecover = pHdr->iFirstFrame;
      rc = btLogRollbackRecovery(pLog, &ctx);
      pLog->snapshot.iNextFrame = ctx.iNextFrame;

    }
  }

  if( rc==SQLITE4_OK && ctx.iLast==0 ){
    /* No transactions were recovered from the log file. */
    BtDbhdr dbhdr;            /* Database header */
    btLogZeroSnapshot(pLog);
................................................................................
** SQLITE4_NOTFOUND.
*/
int btLogRead(BtLog *pLog, u32 pgno, u8 *aData, u32 iSafe){
  const int pgsz = sqlite4BtPagerPagesize((BtPager*)(pLog->pLock));
  int rc = SQLITE4_NOTFOUND;
  u32 iFrame = 0;
  int i;

  int bSeen = (iSafe==0);




  /* Loop through regions (c), (b) and (a) of the log file. In that order. */
  for(i=2; i>=0 && rc==SQLITE4_NOTFOUND; i--){
    u32 iLo = pLog->snapshot.aLog[i*2+0];
    u32 iHi = pLog->snapshot.aLog[i*2+1];
    int iSide;
    int iHash;
    int iHashLast;
................................................................................

    for( ; rc==SQLITE4_NOTFOUND && iHash>=iHashLast; iHash--){
      rc = btLogHashSearch(pLog, iSide, iHash, iHi, pgno, &iFrame);
      if( rc==SQLITE4_OK ){
        if( iFrame<iLo || iFrame>iHi ){
          rc = SQLITE4_NOTFOUND;
        }else{
          if( iSafe>=iLo && iSafe<=iHi ){
            if( iFrame>iSafe ) return SQLITE4_NOTFOUND;
          }else if( bSeen==0 ){
            return SQLITE4_NOTFOUND;
          }
        }
      }
    }
    if( (iSafe>=iLo && iSafe<=iHi) ){
      bSeen = 1;
    }
  }

  btDebugLogSearch(pLog->pLock, pgno, (rc==SQLITE4_OK ? iFrame : 0));

  if( rc==SQLITE4_OK ){
    bt_env *pVfs = pLog->pLock->pVfs;
    i64 iOff;
    assert( rc==SQLITE4_OK );
    iOff = btLogFrameOffset(pLog, pgsz, iFrame);
    rc = pVfs->xRead(pLog->pFd, iOff + sizeof(BtFrameHdr), aData, pgsz);
................................................................................
    }

    if( nAttempt==0 ) rc = SQLITE4_PROTOCOL;
  }

  return rc;
}


















int sqlite4BtLogSnapshotOpen(BtLog *pLog){
  u32 *aLog = pLog->snapshot.aLog;
  int rc = SQLITE4_NOTFOUND;
  BtShmHdr shmhdr;
  u32 iFirstRead = 0;

................................................................................
      }
    }
    
    if( rc!=SQLITE4_OK ){
      sqlite4BtLockReaderUnlock(pLog->pLock);
    }
  }







  /* If a snapshot was successfully read, adjust it so that the aLog[] 
  ** array specifies that no frames before iFirstRead is ever read from 
  ** the log file.  */
  if( rc==SQLITE4_OK ){
    int iRegion;
    for(iRegion=0; iRegion<3; iRegion++){
      if( aLog[iRegion*2] ){
        if( iFirstRead>=aLog[iRegion*2] && iFirstRead<=aLog[iRegion*2+1] ){
          aLog[iRegion*2] = iFirstRead;
          break;
        }else{
          aLog[iRegion*2] = 0;
          aLog[iRegion*2+1] = 0;
        }
      }
    }
  }

if( rc==SQLITE4_OK ){
  btDebugTopology(
      pLog->pLock, "snapshot", pLog->snapshot.iHashSide, pLog->snapshot.aLog
  );
}

  return rc;
}

int sqlite4BtLogSnapshotClose(BtLog *pLog){
  sqlite4BtLockReaderUnlock(pLog->pLock);
  return SQLITE4_OK;
................................................................................
    ** that it contains a map of all frames that are currently in use
    ** by any reader, or may be used by any future reader or recovery
    ** process.  */
    if( rc==SQLITE4_OK ){
      u32 *aLog = shmhdr.aLog;
      u32 iRecover = pShm->ckpt.iFirstRecover;
      u32 iRead = 0;




      btDebugCheckSnapshot(&pLog->snapshot);

      rc = sqlite4BtLockReaderQuery(pLock, aLog, pShm->aReadlock, &iRead, 0);

      if( rc==SQLITE4_OK ){
        /* Now "trim" the snapshot so that it accesses nothing earlier than
        ** either iRecover or iRead (whichever occurs first in the log). */
................................................................................
              break;
            }else{
              aLog[2*i] = aLog[2*i+1] = 0;
            }
          }
        }
      }



      if( rc==SQLITE4_OK ){
        memcpy(pLog->snapshot.aLog, aLog, sizeof(u32)*6);
      }
      btDebugCheckSnapshot(&pLog->snapshot);
    }
  }
................................................................................
  u32 *aSpace;                    /* Temporary space used by merge-sort */
  int nMax;
  int rc = SQLITE4_OK;
  int iRegion;
  int bLocked;
  u32 iSafe;                      /* Last frame in log it is safe to gather */





  *paPgno = 0;
  *pnPgno = 0;
  *piLastFrame = 0;

  rc = sqlite4BtLockReaderQuery(pLock, aLog, pShm->aReadlock, &iSafe, &bLocked);
  if( rc!=SQLITE4_OK || bLocked ) return rc;








  /* Determine an upper limit on the number of distinct page numbers. This
  ** limit is used to allocate space for the returned array.  */
  nMax = 0;
  for(iRegion=0; iRegion<3; iRegion++){
    if( aLog[iRegion*2] ){
      nMax += 1 + aLog[iRegion*2+1] - aLog[iRegion*2+0];
................................................................................
  nPgno = 0;

  /* Copy the required page numbers into the allocated array */
  for(iRegion=0; iRegion<3; iRegion++){
    u32 iFirst = aLog[iRegion*2];
    u32 iLast = aLog[iRegion*2+1];
    if( iFirst ){
      /* If the last frame that it is safe to gather is part of this 
      ** region, gather no frames that lie beyond it.  */
      if( iSafe>=iFirst && iSafe<=iLast ){
        iLast = iSafe;
      }

      for(i=iFirst; rc==SQLITE4_OK && i<=iLast; i++){
        int iHash = btLogFrameHash(pLog, i);
        u32 *aPage;
        ht_slot *aHash;
        u32 iZero;








        /* It doesn't matter which 'side' of the hash table is requested here,
        ** as only the page-number array, not the aHash[] table, will be used.
        ** And it is the same for both sides. Hence the constant 0 passed as
        ** the second argument to btLogFindHash().  */
        rc = btLogFindHash(pLog, 0, iHash, &aHash, &aPage, &iZero);
        if( rc==SQLITE4_OK ){
          aPgno[nPgno++] = aPage[i-iZero];
        }
      }
      *piLastFrame = iLast;

      /* If the last frame of this region is the last frame that it is
      ** safe to gather, break out of the loop.  */
      if( iLast==iSafe ) break;
    }
  }

  /* Sort the contents of the array in ascending order. This step also 
  ** eliminates any  duplicate page numbers. */
  if( rc==SQLITE4_OK ){
    btLogMergeSort(aPgno, &nPgno, aSpace);
................................................................................
    int nPgno;                    /* Number of entries in aPgno[] */
    int i;                        /* Used to loop through aPgno[] */
    u8 *aBuf;                     /* Buffer to load page data into */
    u32 iFirstRead;               /* First frame not checkpointed */

    rc = btLogSnapshot(pLog, &pLog->snapshot);

    /* Allocate space to load log data into */
    if( rc==SQLITE4_OK ){










      aBuf = sqlite4_malloc(pLock->pEnv, pgsz);
      if( aBuf==0 ) rc = btErrorBkpt(SQLITE4_NOMEM);
    }
    
    /* Figure out the set of page numbers stored in the part of the log 
    ** file being checkpointed. Remove any duplicates and sort them in 
    ** ascending order.  */
    if( rc==SQLITE4_OK ){
      rc = btLogGatherPgno(pLog, &aPgno, &nPgno, &iLast);
    }

    if( rc==SQLITE4_OK ){
      i64 iOff = btLogFrameOffset(pLog, pgsz, iLast);
      rc = btLogReadData(pLog, iOff, (u8*)&fhdr, sizeof(BtFrameHdr));
      iFirstRead = fhdr.iNext;
    }

    /* Copy data from the log file to the database file. */
    for(i=0; rc==SQLITE4_OK && i<nPgno; i++){
      u32 pgno = aPgno[i];
      rc = btLogRead(pLog, pgno, aBuf, iLast);
      if( rc==SQLITE4_OK ){
        i64 iOff = (i64)pgsz * (pgno-1);
        if( pgno==1 ){
          btLogUpdateDbhdr(pLog, aBuf);
        }
        btDebugCkptPage(pgno, aBuf, pgsz);
        rc = pVfs->xWrite(pFd, iOff, aBuf, pgsz);
      }else if( rc==SQLITE4_NOTFOUND ){
        rc = SQLITE4_OK;
      }
    }

    /* Update the first field of the checkpoint-header. This tells readers
    ** that they need not consider anything that in the log before this
    ** point (since the data has already been copied into the database
    ** file).  */
    if( rc==SQLITE4_OK ){

      pShm = btLogShm(pLog);
      pShm->ckpt.iFirstRead = iFirstRead;
      pVfs->xShmBarrier(pLog->pFd);
    }

    /* Write a new header into the log file. This tells any future recovery
    ** where it should start reading the log. Once this new header is synced
    ** to disk, the space cleared by this checkpoint operation can be 
    ** reused.  */
    if( rc==SQLITE4_OK ){
      int iSlot = ((pShm->ckpt.iWalHdr >> 2) + 1) % 2;
      BtWalHdr hdr;

      memset(&hdr, 0, sizeof(BtWalHdr));
      hdr.iMagic = BT_WAL_MAGIC;
      hdr.iVersion = BT_WAL_VERSION;
      hdr.iCnt = (((pShm->ckpt.iWalHdr & 0x03) + 1) % 3);
      hdr.nSector = pLog->snapshot.nSector;
      hdr.nPgsz = pgsz;
      hdr.iFirstFrame = iFirstRead;

      hdr.iSalt1 = fhdr.aCksum[0];
      hdr.iSalt2 = fhdr.aCksum[1];
      rc = btLogWriteHeader(pLog, iSlot, &hdr);
      if( rc==SQLITE4_OK ){
        pShm->ckpt.iWalHdr = (iSlot<<2) + hdr.iCnt;
      }
    }

    /* Update the second field of the checkpoint header. This tells future
    ** writers that it is now safe to recycle pages before this point
    ** (assuming all live readers are cleared).  */
    if( rc==SQLITE4_OK ){
      pShm->ckpt.iFirstRecover = iFirstRead;
      pVfs->xShmBarrier(pLog->pFd);

    }

    /* Free buffers and drop the checkpointer lock */
    sqlite4_free(pLock->pEnv, aBuf);
    sqlite4_free(pLock->pEnv, aPgno);
    sqlite4BtLockCkptUnlock(pLog->pLock);
  }







>








>
>
>
>
>
>
>
>
>
>












|


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|




|
|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|


|
|







 







>







 







<


>
>
>







 







|
|
<





<
<
|
|
<
|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>





|
<
<
<
<
<
<
<
<
|
|
<
<
<
|
|
|
|
|







 







>
>
>
>







 







>
>







 







>
>
>
>






>
>
>
>
>
>
>







 







<
<
<
<
|
<
|




>
>
>
>
>
>
>










<
<
<
<
<







 







<

>
>
>
>
>
>
>
>
>
>










>
|



|
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
>
|
|
|
|

|
|
|
|
|
|
|

|
|
|
|
|
|
|

|
|
|
|
|
|
|

|
|
|
|
|
|
>







241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
...
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
...
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
...
996
997
998
999
1000
1001
1002

1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
....
1019
1020
1021
1022
1023
1024
1025
1026
1027

1028
1029
1030
1031
1032


1033
1034

1035
1036
1037
1038
1039
1040
1041
1042
....
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
....
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335








1336
1337



1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
....
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
....
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
....
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
....
1559
1560
1561
1562
1563
1564
1565




1566

1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588





1589
1590
1591
1592
1593
1594
1595
....
1647
1648
1649
1650
1651
1652
1653

1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680

1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
){
  assert( (nByte&0x00000007)==4 && nByte>=8 );
  btLogChecksum(nativeCksum, a, 8, aIn, aOut);
  btLogChecksum(nativeCksum, &a[4], nByte-4, aOut, aOut);
}

#define BT_PAGE_DEBUG 0
#define BT_VAL_DEBUG  0

/*
** Debugging aid: print the six entries of the log topology array aLog[]
** to stderr as three "first..last" pairs, prefixed by the debug id of
** pLock, the caller-supplied label zStr and the hash-table side iSide.
**
** The body is compiled only when BT_PAGE_DEBUG is non-zero.
*/
static void btDebugTopology(BtLock *pLock, char *zStr, int iSide, u32 *aLog){
#if BT_PAGE_DEBUG
  int aEntry[6];                  /* aLog[] values converted for printing */
  int j;

  for(j=0; j<6; j++){
    aEntry[j] = (int)aLog[j];
  }
  fprintf(stderr, "%d:%s: (side=%d) %d..%d  %d..%d  %d..%d\n", 
      pLock->iDebugId, zStr, iSide,
      aEntry[0], aEntry[1], aEntry[2], aEntry[3], aEntry[4], aEntry[5]);
  fflush(stderr);
#endif
}

/*
** Debugging aid: report on stderr the range of frames (iFirst..iLast)
** recorded in the read-lock slot just taken. The line is prefixed with
** the debug id of pLock and a running call counter. Nothing is printed
** unless BT_PAGE_DEBUG is non-zero.
**
** The definition is compiled only when NDEBUG is not defined. In NDEBUG
** builds btInt.h replaces sqlite4BtDebugReadlock() with a function-like
** macro that expands to nothing; an unguarded definition here would then
** have its own name and parameter list swallowed by that macro and fail
** to compile.
*/
#ifndef NDEBUG
void sqlite4BtDebugReadlock(BtLock *pLock, u32 iFirst, u32 iLast){
#if BT_PAGE_DEBUG
  static int nCall = 0;
  fprintf(stderr, "%d:%d: readlock=(%d..%d)\n",
      pLock->iDebugId, nCall++, (int)iFirst, (int)iLast
  );
  fflush(stderr);
#endif
}
#endif

/*
** Sanity check on a snapshot header: if iNextFrame is 1, then the first
** four entries of aLog[] must all be zero.
** NOTE(review): iNextFrame==1 presumably means no frame has been written
** to the log yet - confirm against the BtShmHdr definition.
**
** In NDEBUG builds this is a no-op macro.
*/
#ifndef NDEBUG
static void btDebugCheckSnapshot(BtShmHdr *pHdr){
  u32 *aLog = pHdr->aLog;
  assert( pHdr->iNextFrame!=1 ||
      (aLog[0]==0 && aLog[1]==0 && aLog[2]==0 && aLog[3]==0)
  );
}
#else
#define btDebugCheckSnapshot(x)
#endif

/*
** Placeholder for a hash-table consistency check. The function body is
** currently empty, so calling it has no effect. In NDEBUG builds it is a
** no-op macro.
*/
#ifndef NDEBUG
static void btDebugCheckHash(BtLock *pLock){
}
#else
#define btDebugCheckHash(x)
#endif

/*
** Debugging aid: report on stderr the "safe point" frame iSafe computed
** for a checkpoint. The line is prefixed with the debug id of pLock and
** a running call counter. Nothing is printed unless BT_PAGE_DEBUG is
** non-zero.
**
** In NDEBUG builds this is a no-op macro. The macro takes two arguments,
** the same as the function, so that call sites compile in both kinds of
** build.
*/
#ifndef NDEBUG
static void btDebugLogSafepoint(BtLock *pLock, u32 iSafe){
#if BT_PAGE_DEBUG
  static int nCall = 0;
  fprintf(stderr, "%d:%d: checkpoint safepoint=%d\n",
      pLock->iDebugId, nCall++, (int)iSafe
  );
  fflush(stderr);
#endif
}
#else
#define btDebugLogSafepoint(x,y)
#endif

/*
** Debugging aid: report on stderr that page pgno is being checkpointed.
** Each line carries the debug id of pLock, a running call counter, the
** page number and the two-word result of btLogChecksum() over the pgsz
** bytes at aData.
**
** The body is compiled only when BT_PAGE_DEBUG is non-zero.
*/
static void btDebugCkptPage(BtLock *pLock, u32 pgno, u8 *aData, int pgsz){
#if BT_PAGE_DEBUG
  static int nTrace = 0;          /* Number of lines printed so far */
  u32 aSum[2];                    /* Checksum of the page image */
  int iTrace;                     /* Sequence number of this line */

  btLogChecksum(1, aData, pgsz, 0, aSum);
  iTrace = nTrace++;
  fprintf(stderr, "%d:%d: Ckpt page %d (cksum=%08x%08x)\n", 
      pLock->iDebugId, iTrace, (int)pgno, aSum[0], aSum[1]);
  fflush(stderr);
#endif
}

static void btDebugLogPage(
    BtLock *pLock, u32 pgno, u32 iFrame, u8 *aData, int pgsz, int bCommit
................................................................................
      nCall++, (int)pgno, aCksum[0], aCksum[1]
  );
  fflush(stderr);
#endif
}
#endif

#ifndef NDEBUG
#include <ctype.h>
/*
** Render the nIn bytes at pIn as a nul-terminated, printable string in
** the nOut byte buffer pOut. Bytes for which isprint() is false are
** replaced by '.'. At most (nOut-1) bytes are converted, so the output
** is truncated rather than overrun.
**
** If nOut is zero or negative there is no room even for the terminator
** and pOut is left untouched. If nIn is zero or negative the output is
** the empty string.
*/
static void binToStr(u8 *pIn, int nIn, u8 *pOut, int nOut){
  int i;
  int nCopy;

  if( nOut<=0 ) return;
  nCopy = MIN(nIn, (nOut-1));
  for(i=0; i<nCopy; i++){
    if( isprint(pIn[i]) ){
      pOut[i] = pIn[i];
    }else{
      /* A character constant, not a string literal: pOut[] holds bytes. */
      pOut[i] = '.';
    }
  }
  pOut[i] = '\0';
}
/*
** Debugging aid: print a key/value pair to stderr. The line is prefixed
** with the debug id of pLock, a running call counter and the label zStr.
** Key (pK/nK) and value (pV/nV) are first passed through binToStr() into
** 40-byte buffers, so each is shown in printable form and truncated to
** fit.
**
** The body is compiled only when BT_VAL_DEBUG is non-zero.
*/
void sqlite4BtDebugKV(
    BtLock *pLock, const char *zStr, u8 *pK, int nK, u8 *pV, int nV
){
#if BT_VAL_DEBUG
  static int nTrace = 0;          /* Number of lines printed so far */
  u8 zKey[40];                    /* Printable rendering of the key */
  u8 zVal[40];                    /* Printable rendering of the value */
  int iTrace = nTrace++;          /* Sequence number of this line */

  binToStr(pK, nK, zKey, sizeof(zKey));
  binToStr(pV, nV, zVal, sizeof(zVal));

  fprintf(stderr, "%d:%d: %s \"%s\" -> \"%s\"\n", 
      pLock->iDebugId, iTrace, zStr, zKey, zVal);
  fflush(stderr);
#endif
}
#endif

/*
** Debugging aid: report on stderr the outcome of searching the log for
** page pgno. iSafe and iFrame are printed exactly as supplied by the
** caller. The line is prefixed with the debug id of pLock and a running
** call counter.
**
** The body is compiled only when BT_PAGE_DEBUG is non-zero.
*/
static void btDebugLogSearch(BtLock *pLock, u32 pgno, u32 iSafe, u32 iFrame){
#if BT_PAGE_DEBUG
  static int nTrace = 0;          /* Number of lines printed so far */
  int iTrace = nTrace++;          /* Sequence number of this line */

  fprintf(stderr, "%d:%d: Search log for page %d (safe=%d) - frame %d\n", 
      pLock->iDebugId, iTrace, (int)pgno, (int)iSafe, (int)iFrame);
  fflush(stderr);
#endif
}

static void btDebugSetPgno(
    BtLock *pLock, int iHash, u32 *aPgno, int iFrame, int iZero, u32 pgno
................................................................................
      /* One or more transactions were recovered from the log file. */
      BtShm *pShm = btLogShm(pLog);
      pShm->ckpt.iWalHdr = (iSlot<<2) + pHdr->iCnt;
      pShm->ckpt.iFirstRead = pHdr->iFirstFrame;
      pShm->ckpt.iFirstRecover = pHdr->iFirstFrame;
      rc = btLogRollbackRecovery(pLog, &ctx);
      pLog->snapshot.iNextFrame = ctx.iNextFrame;
      assert( pShm->ckpt.iFirstRead>0 );
    }
  }

  if( rc==SQLITE4_OK && ctx.iLast==0 ){
    /* No transactions were recovered from the log file. */
    BtDbhdr dbhdr;            /* Database header */
    btLogZeroSnapshot(pLog);
................................................................................
** SQLITE4_NOTFOUND.
*/
int btLogRead(BtLog *pLog, u32 pgno, u8 *aData, u32 iSafe){
  const int pgsz = sqlite4BtPagerPagesize((BtPager*)(pLog->pLock));
  int rc = SQLITE4_NOTFOUND;
  u32 iFrame = 0;
  int i;

  int bSeen = (iSafe==0);

  u32 *aLog = pLog->snapshot.aLog;
  int iSafeIdx = sqlite4BtLogFrameToIdx(aLog, iSafe);

  /* Loop through regions (c), (b) and (a) of the log file. In that order. */
  for(i=2; i>=0 && rc==SQLITE4_NOTFOUND; i--){
    u32 iLo = pLog->snapshot.aLog[i*2+0];
    u32 iHi = pLog->snapshot.aLog[i*2+1];
    int iSide;
    int iHash;
    int iHashLast;
................................................................................

    for( ; rc==SQLITE4_NOTFOUND && iHash>=iHashLast; iHash--){
      rc = btLogHashSearch(pLog, iSide, iHash, iHi, pgno, &iFrame);
      if( rc==SQLITE4_OK ){
        if( iFrame<iLo || iFrame>iHi ){
          rc = SQLITE4_NOTFOUND;
        }else{
          assert( sqlite4BtLogFrameToIdx(aLog, iFrame)>=0 );
          if( iSafeIdx>=0 && sqlite4BtLogFrameToIdx(aLog, iFrame)>iSafeIdx ){

            return SQLITE4_NOTFOUND;
          }
        }
      }
    }


  }


  btDebugLogSearch(pLog->pLock, pgno, iSafe, (rc==SQLITE4_OK ? iFrame : 0));

  if( rc==SQLITE4_OK ){
    bt_env *pVfs = pLog->pLock->pVfs;
    i64 iOff;
    assert( rc==SQLITE4_OK );
    iOff = btLogFrameOffset(pLog, pgsz, iFrame);
    rc = pVfs->xRead(pLog->pFd, iOff + sizeof(BtFrameHdr), aData, pgsz);
................................................................................
    }

    if( nAttempt==0 ) rc = SQLITE4_PROTOCOL;
  }

  return rc;
}

/*
** Trim the log topology aLog[] (three first/last pairs) so that it no
** longer references any frame that precedes iFirst.
**
** Regions are visited in array order. Each non-empty region (one whose
** first entry is non-zero) that does not contain iFirst is cleared to
** 0..0. The first region that does contain iFirst has its start moved up
** to iFirst, and the scan stops there; later regions are left alone.
**
** If iFirst is zero, aLog[] is not modified. If no region contains
** iFirst, every non-empty region ends up cleared.
*/
static void btLogSnapshotTrim(u32 *aLog, u32 iFirst){
  int i;

  if( iFirst==0 ) return;

  for(i=0; i<3; i++){
    u32 *aPair = &aLog[i*2];      /* aPair[0]..aPair[1] is region i */

    if( aPair[0]==0 ) continue;   /* Region is empty */

    if( iFirst>=aPair[0] && iFirst<=aPair[1] ){
      aPair[0] = iFirst;
      return;
    }
    aPair[0] = 0;
    aPair[1] = 0;
  }
}

int sqlite4BtLogSnapshotOpen(BtLog *pLog){
  u32 *aLog = pLog->snapshot.aLog;
  int rc = SQLITE4_NOTFOUND;
  BtShmHdr shmhdr;
  u32 iFirstRead = 0;

................................................................................
      }
    }
    
    if( rc!=SQLITE4_OK ){
      sqlite4BtLockReaderUnlock(pLog->pLock);
    }
  }

  if( rc==SQLITE4_OK ){
    btDebugTopology(
        pLog->pLock, "snapshotA", pLog->snapshot.iHashSide, pLog->snapshot.aLog
    );
  }

  /* If a snapshot was successfully read, adjust it so that the aLog[] 
  ** array specifies that no frames before iFirstRead are ever read from 
  ** the log file.  */
  if( rc==SQLITE4_OK ){
    btLogSnapshotTrim(aLog, iFirstRead);








  }




  if( rc==SQLITE4_OK ){
    btDebugTopology(
        pLog->pLock, "snapshotB", pLog->snapshot.iHashSide, pLog->snapshot.aLog
    );
  }

  return rc;
}

int sqlite4BtLogSnapshotClose(BtLog *pLog){
  sqlite4BtLockReaderUnlock(pLog->pLock);
  return SQLITE4_OK;
................................................................................
    ** that it contains a map of all frames that are currently in use
    ** by any reader, or may be used by any future reader or recovery
    ** process.  */
    if( rc==SQLITE4_OK ){
      u32 *aLog = shmhdr.aLog;
      u32 iRecover = pShm->ckpt.iFirstRecover;
      u32 iRead = 0;

      btDebugTopology(pLog->pLock, "snapshotC", shmhdr.iHashSide, aLog);

      assert( shmhdr.iHashSide==pLog->snapshot.iHashSide );
      btDebugCheckSnapshot(&pLog->snapshot);

      rc = sqlite4BtLockReaderQuery(pLock, aLog, pShm->aReadlock, &iRead, 0);

      if( rc==SQLITE4_OK ){
        /* Now "trim" the snapshot so that it accesses nothing earlier than
        ** either iRecover or iRead (whichever occurs first in the log). */
................................................................................
              break;
            }else{
              aLog[2*i] = aLog[2*i+1] = 0;
            }
          }
        }
      }

      btDebugTopology(pLog->pLock, "snapshotD", shmhdr.iHashSide, aLog);

      if( rc==SQLITE4_OK ){
        memcpy(pLog->snapshot.aLog, aLog, sizeof(u32)*6);
      }
      btDebugCheckSnapshot(&pLog->snapshot);
    }
  }
................................................................................
  u32 *aSpace;                    /* Temporary space used by merge-sort */
  int nMax;
  int rc = SQLITE4_OK;
  int iRegion;
  int bLocked;
  u32 iSafe;                      /* Last frame in log it is safe to gather */

  int iSafeIdx = -1;
  int iFirstIdx = -1;
  int iIdx = 0;

  *paPgno = 0;
  *pnPgno = 0;
  *piLastFrame = 0;

  rc = sqlite4BtLockReaderQuery(pLock, aLog, pShm->aReadlock, &iSafe, &bLocked);
  if( rc!=SQLITE4_OK || bLocked ) return rc;
  btDebugLogSafepoint(pLock, iSafe);
  btDebugTopology(
      pLock, "checkpointer", pLog->snapshot.iHashSide, pLog->snapshot.aLog
  );

  iSafeIdx = sqlite4BtLogFrameToIdx(aLog, iSafe);
  iFirstIdx = sqlite4BtLogFrameToIdx(aLog, pShm->ckpt.iFirstRecover);

  /* Determine an upper limit on the number of distinct page numbers. This
  ** limit is used to allocate space for the returned array.  */
  nMax = 0;
  for(iRegion=0; iRegion<3; iRegion++){
    if( aLog[iRegion*2] ){
      nMax += 1 + aLog[iRegion*2+1] - aLog[iRegion*2+0];
................................................................................
  nPgno = 0;

  /* Copy the required page numbers into the allocated array */
  for(iRegion=0; iRegion<3; iRegion++){
    u32 iFirst = aLog[iRegion*2];
    u32 iLast = aLog[iRegion*2+1];
    if( iFirst ){






      for(i=iFirst; rc==SQLITE4_OK && i<=iLast; i++, iIdx++){
        int iHash = btLogFrameHash(pLog, i);
        u32 *aPage;
        ht_slot *aHash;
        u32 iZero;

        if( (iFirstIdx>=0 && iIdx<iFirstIdx) 
         || (iSafeIdx>=0 && iIdx>iSafeIdx) 
        ){
          continue;
        }
        *piLastFrame = i;

        /* It doesn't matter which 'side' of the hash table is requested here,
        ** as only the page-number array, not the aHash[] table, will be used.
        ** And it is the same for both sides. Hence the constant 0 passed as
        ** the second argument to btLogFindHash().  */
        rc = btLogFindHash(pLog, 0, iHash, &aHash, &aPage, &iZero);
        if( rc==SQLITE4_OK ){
          aPgno[nPgno++] = aPage[i-iZero];
        }
      }





    }
  }

  /* Sort the contents of the array in ascending order. This step also 
  ** eliminates any  duplicate page numbers. */
  if( rc==SQLITE4_OK ){
    btLogMergeSort(aPgno, &nPgno, aSpace);
................................................................................
    int nPgno;                    /* Number of entries in aPgno[] */
    int i;                        /* Used to loop through aPgno[] */
    u8 *aBuf;                     /* Buffer to load page data into */
    u32 iFirstRead;               /* First frame not checkpointed */

    rc = btLogSnapshot(pLog, &pLog->snapshot);


    if( rc==SQLITE4_OK ){
      /* Ensure that the checkpoint does not read any frames from the log
      ** that occur earlier than iFirstRecover. This is not just an 
      ** optimization - there is a chance that such frames may be 
      ** overwritten by a writer running concurrently with this checkpoint. */
#if 0
      pShm = btLogShm(pLog);
      btLogSnapshotTrim(pLog->snapshot.aLog, pShm->ckpt.iFirstRecover);
#endif

      /* Allocate space to load log data into */
      aBuf = sqlite4_malloc(pLock->pEnv, pgsz);
      if( aBuf==0 ) rc = btErrorBkpt(SQLITE4_NOMEM);
    }
    
    /* Figure out the set of page numbers stored in the part of the log 
    ** file being checkpointed. Remove any duplicates and sort them in 
    ** ascending order.  */
    if( rc==SQLITE4_OK ){
      rc = btLogGatherPgno(pLog, &aPgno, &nPgno, &iLast);
    }

    if( rc==SQLITE4_OK && nPgno>0 ){
      i64 iOff = btLogFrameOffset(pLog, pgsz, iLast);
      rc = btLogReadData(pLog, iOff, (u8*)&fhdr, sizeof(BtFrameHdr));
      iFirstRead = fhdr.iNext;


      /* Copy data from the log file to the database file. */
      for(i=0; rc==SQLITE4_OK && i<nPgno; i++){
        u32 pgno = aPgno[i];
        rc = btLogRead(pLog, pgno, aBuf, iLast);
        if( rc==SQLITE4_OK ){
          i64 iOff = (i64)pgsz * (pgno-1);
          if( pgno==1 ){
            btLogUpdateDbhdr(pLog, aBuf);
          }
          btDebugCkptPage(pLog->pLock, pgno, aBuf, pgsz);
          rc = pVfs->xWrite(pFd, iOff, aBuf, pgsz);
        }else if( rc==SQLITE4_NOTFOUND ){
          rc = SQLITE4_OK;
        }
      }

      /* Update the first field of the checkpoint-header. This tells readers
      ** that they need not consider anything in the log before this
      ** point (since the data has already been copied into the database
      ** file).  */
      if( rc==SQLITE4_OK ){
        assert( iFirstRead>0 );
        pShm = btLogShm(pLog);
        pShm->ckpt.iFirstRead = iFirstRead;
        pVfs->xShmBarrier(pLog->pFd);
      }

      /* Write a new header into the log file. This tells any future recovery
      ** where it should start reading the log. Once this new header is synced
      ** to disk, the space cleared by this checkpoint operation can be 
      ** reused.  */
      if( rc==SQLITE4_OK ){
        int iSlot = ((pShm->ckpt.iWalHdr >> 2) + 1) % 2;
        BtWalHdr hdr;

        memset(&hdr, 0, sizeof(BtWalHdr));
        hdr.iMagic = BT_WAL_MAGIC;
        hdr.iVersion = BT_WAL_VERSION;
        hdr.iCnt = (((pShm->ckpt.iWalHdr & 0x03) + 1) % 3);
        hdr.nSector = pLog->snapshot.nSector;
        hdr.nPgsz = pgsz;
        hdr.iFirstFrame = iFirstRead;

        hdr.iSalt1 = fhdr.aCksum[0];
        hdr.iSalt2 = fhdr.aCksum[1];
        rc = btLogWriteHeader(pLog, iSlot, &hdr);
        if( rc==SQLITE4_OK ){
          pShm->ckpt.iWalHdr = (iSlot<<2) + hdr.iCnt;
        }
      }

      /* Update the second field of the checkpoint header. This tells future
      ** writers that it is now safe to recycle pages before this point
      ** (assuming all live readers are cleared).  */
      if( rc==SQLITE4_OK ){
        pShm->ckpt.iFirstRecover = iFirstRead;
        pVfs->xShmBarrier(pLog->pFd);
      }
    }

    /* Free buffers and drop the checkpointer lock */
    sqlite4_free(pLock->pEnv, aBuf);
    sqlite4_free(pLock->pEnv, aPgno);
    sqlite4BtLockCkptUnlock(pLog->pLock);
  }

Changes to src/bt_main.c.

1045
1046
1047
1048
1049
1050
1051











1052
1053
1054
1055
1056
1057
1058
....
2281
2282
2283
2284
2285
2286
2287


2288
2289
2290
2291
2292
2293
2294
      *ppV = &pCsr->ovfl.pBuf[pCsr->ovfl.nKey];
      *pnV = pCsr->ovfl.nVal;
    }
  }else{
    *ppV = pCell;
    *pnV = (nV-1);
  }












  return rc;
}

/*
** The argument points to a buffer containing an overflow array. Return
** the size of the overflow array in bytes. 
................................................................................

/*
** Insert a new key/value pair or replace an existing one.
*/
int sqlite4BtReplace(bt_db *db, const void *pK, int nK, const void *pV, int nV){
  int rc = SQLITE4_OK;
  bt_cursor csr;



  btCheckPageRefs(db);
  btCsrSetup(db, &csr);
  rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, 1);
  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek







>
>
>
>
>
>
>
>
>
>
>







 







>
>







1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
....
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
      *ppV = &pCsr->ovfl.pBuf[pCsr->ovfl.nKey];
      *pnV = pCsr->ovfl.nVal;
    }
  }else{
    *ppV = pCell;
    *pnV = (nV-1);
  }

#ifndef NDEBUG
  if( rc==SQLITE4_OK ){
    const void *pK; int nK;
    rc = sqlite4BtCsrKey(pCsr, &pK, &nK);
    if( rc==SQLITE4_OK ){
      BtLock *pLock = (BtLock*)pCsr->pDb->pPager;
      sqlite4BtDebugKV(pLock, "select", (u8*)pK, nK, (u8*)*ppV, *pnV);
    }
  }
#endif

  return rc;
}

/*
** The argument points to a buffer containing an overflow array. Return
** the size of the overflow array in bytes. 
................................................................................

/*
** Insert a new key/value pair or replace an existing one.
*/
int sqlite4BtReplace(bt_db *db, const void *pK, int nK, const void *pV, int nV){
  int rc = SQLITE4_OK;
  bt_cursor csr;

  sqlite4BtDebugKV((BtLock*)db->pPager, "replace", pK, nK, pV, nV);

  btCheckPageRefs(db);
  btCsrSetup(db, &csr);
  rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, 1);
  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek