Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Save sub-tree root page numbers, instead of block numbers, in the meta-tree.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 6003e7dcc2485c89d7e6ff12e8953a8655fb9e1b
User & Date: dan 2014-01-03 19:36:12.337
Context
2014-01-06
20:30
Add merging code. check-in: ad91883237 user: dan tags: trunk
2014-01-03
19:36
Save sub-tree root page numbers, instead of block numbers, in the meta-tree. check-in: 6003e7dcc2 user: dan tags: trunk
2014-01-02
18:53
Changes to FiCursor object to support reading merged blocks. check-in: 27248a1ebc user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btInt.h.
141
142
143
144
145
146
147
148

149
150
151





152
153
154
155
156
157
158

/*
** Query for the database page size. Requires an open read transaction.
*/
int sqlite4BtPagerPagesize(BtPager*);

/* 
** Query for the db header values. Requires an open read transaction.

*/
BtDbHdr *sqlite4BtPagerDbhdr(BtPager*);






/*
** Read, write and trim existing database pages.
*/
int sqlite4BtPageGet(BtPager*, u32 pgno, BtPage **ppPage);
int sqlite4BtPageTrimPgno(BtPager*, u32 pgno);
int sqlite4BtPageWrite(BtPage*);
int sqlite4BtPageTrim(BtPage*);







|
>



>
>
>
>
>







141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

/*
** Query for the database page size. Requires an open read transaction.
*/
int sqlite4BtPagerPagesize(BtPager*);

/* 
** Query for the db header values. Requires an open read transaction or
** an active checkpoint.
*/
BtDbHdr *sqlite4BtPagerDbhdr(BtPager*);

/*
** Used by checkpointers to specify the header to use during a checkpoint.
*/
void sqlite4BtPagerSetDbhdr(BtPager *, BtDbHdr *);

/*
** Read, write and trim existing database pages.
*/
int sqlite4BtPageGet(BtPager*, u32 pgno, BtPage **ppPage);
int sqlite4BtPageTrimPgno(BtPager*, u32 pgno);
int sqlite4BtPageWrite(BtPage*);
int sqlite4BtPageTrim(BtPage*);
Changes to src/bt_log.c.
1824
1825
1826
1827
1828
1829
1830

1831
1832
1833
1834
1835
1836
1837
    u32 *aPgno = 0;               /* Array of page numbers to checkpoint */
    int nPgno;                    /* Number of entries in aPgno[] */
    int i;                        /* Used to loop through aPgno[] */
    u8 *aBuf;                     /* Buffer to load page data into */
    u32 iFirstRead;               /* First frame not checkpointed */

    rc = btLogSnapshot(pLog, &pLog->snapshot);

    pgsz = pLog->snapshot.dbhdr.pgsz;

    if( rc==SQLITE4_OK ){
      /* Allocate space to load log data into */
      aBuf = sqlite4_malloc(pLock->pEnv, pgsz);
      if( aBuf==0 ) rc = btErrorBkpt(SQLITE4_NOMEM);
    }







>







1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
    u32 *aPgno = 0;               /* Array of page numbers to checkpoint */
    int nPgno;                    /* Number of entries in aPgno[] */
    int i;                        /* Used to loop through aPgno[] */
    u8 *aBuf;                     /* Buffer to load page data into */
    u32 iFirstRead;               /* First frame not checkpointed */

    rc = btLogSnapshot(pLog, &pLog->snapshot);
    sqlite4BtPagerSetDbhdr((BtPager*)pLock, &pLog->snapshot.dbhdr);
    pgsz = pLog->snapshot.dbhdr.pgsz;

    if( rc==SQLITE4_OK ){
      /* Allocate space to load log data into */
      aBuf = sqlite4_malloc(pLock->pEnv, pgsz);
      if( aBuf==0 ) rc = btErrorBkpt(SQLITE4_NOMEM);
    }
1925
1926
1927
1928
1929
1930
1931
1932

1933

1934
1935
1936
1937
1938
1939
1940
        pVfs->xShmBarrier(pLog->pFd);
      }
    }

    /* Free buffers and drop the checkpointer lock */
    sqlite4_free(pLock->pEnv, aBuf);
    sqlite4_free(pLock->pEnv, aPgno);
    sqlite4BtLockCkptUnlock(pLog->pLock);

  }

  return rc;
}

#if 0
/*
** Return the database page size in bytes.
*/







|
>

>







1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
        pVfs->xShmBarrier(pLog->pFd);
      }
    }

    /* Free buffers and drop the checkpointer lock */
    sqlite4_free(pLock->pEnv, aBuf);
    sqlite4_free(pLock->pEnv, aPgno);
    sqlite4BtLockCkptUnlock(pLock);
    sqlite4BtPagerSetDbhdr((BtPager*)pLock, 0);
  }

  return rc;
}

#if 0
/*
** Return the database page size in bytes.
*/
Changes to src/bt_main.c.
52
53
54
55
56
57
58




59
60
61
62
63

64
65
66
67
68
69
70
/*
** Values that make up the bt_cursor.flags mask.
**
** CSR_NEXT_OK, CSR_PREV_OK:
**   These are only used by fast-insert cursors. The CSR_NEXT_OK flag is
**   set if xNext() may be safely called on the cursor. CSR_PREV_OK is
**   true if xPrev() is Ok.




*/
#define CSR_TYPE_BT   0x0001
#define CSR_TYPE_FAST 0x0002
#define CSR_NEXT_OK   0x0004
#define CSR_PREV_OK   0x0008


#define IsBtCsr(pCsr) (((pCsr)->flags & CSR_TYPE_BT)!=0)

/* 
** Base class for both cursor types (BtCursor and FiCursor).
*/
struct bt_cursor {







>
>
>
>





>







52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
/*
** Values that make up the bt_cursor.flags mask.
**
** CSR_NEXT_OK, CSR_PREV_OK:
**   These are only used by fast-insert cursors. The CSR_NEXT_OK flag is
**   set if xNext() may be safely called on the cursor. CSR_PREV_OK is
**   true if xPrev() is Ok.
**
** CSR_VISIT_DEL:
**   If this flag is set, do not skip over delete keys that occur in the
**   merged cursor output. This is used by checkpoint merges.
*/
#define CSR_TYPE_BT   0x0001
#define CSR_TYPE_FAST 0x0002
#define CSR_NEXT_OK   0x0004
#define CSR_PREV_OK   0x0008
#define CSR_VISIT_DEL 0x0010

#define IsBtCsr(pCsr) (((pCsr)->flags & CSR_TYPE_BT)!=0)

/* 
** Base class for both cursor types (BtCursor and FiCursor).
*/
struct bt_cursor {
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561

  if( pgno==pHdr->iSRoot ){
    int i;
    BtSchedule s;
    sqlite4BtBufAppendf(pBuf, "(schedule page) ");
    memcpy(&s, aData, sizeof(s));

    sqlite4BtBufAppendf(pBuf, "bBusy=%d ", (int)s.bBusy);
    sqlite4BtBufAppendf(pBuf, "iAge=%d ", (int)s.iAge);
    sqlite4BtBufAppendf(pBuf, "iMinLevel=%d ", (int)s.iMinLevel);
    sqlite4BtBufAppendf(pBuf, "iMaxLevel=%d ", (int)s.iMaxLevel);
    sqlite4BtBufAppendf(pBuf, "iOutLevel=%d ", (int)s.iOutLevel);
    sqlite4BtBufAppendf(pBuf, "blocks=(");
    for(i=0; s.aBlock[i] && i<array_size(s.aBlock); i++){
      sqlite4BtBufAppendf(pBuf, "%s%d", i==0 ? "" : " ", (int)s.aBlock[i]);
    }
    sqlite4BtBufAppendf(pBuf, ") ");

    sqlite4BtBufAppendf(pBuf, "iNextPg=%d ", (int)s.iNextPg);
    sqlite4BtBufAppendf(pBuf, "iNextCell=%d ", (int)s.iNextCell);
    sqlite4BtBufAppendf(pBuf, "nUsed=%d ", (int)s.nUsed);
    sqlite4BtBufAppendf(pBuf, "iFreeList=%d", (int)s.iFreeList);
  }else{
    sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell);
    sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData));
    sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData));
    if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
      sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1]));
    }







|
|
|
|
|
|



|

|
|
|
|







538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566

  if( pgno==pHdr->iSRoot ){
    int i;
    BtSchedule s;
    sqlite4BtBufAppendf(pBuf, "(schedule page) ");
    memcpy(&s, aData, sizeof(s));

    sqlite4BtBufAppendf(pBuf, "  bBusy=%d\n", (int)s.bBusy);
    sqlite4BtBufAppendf(pBuf, "  iAge=%d\n", (int)s.iAge);
    sqlite4BtBufAppendf(pBuf, "  iMinLevel=%d\n", (int)s.iMinLevel);
    sqlite4BtBufAppendf(pBuf, "  iMaxLevel=%d\n", (int)s.iMaxLevel);
    sqlite4BtBufAppendf(pBuf, "  iOutLevel=%d\n", (int)s.iOutLevel);
    sqlite4BtBufAppendf(pBuf, "  blocks=(");
    for(i=0; s.aBlock[i] && i<array_size(s.aBlock); i++){
      sqlite4BtBufAppendf(pBuf, "%s%d", i==0 ? "" : " ", (int)s.aBlock[i]);
    }
    sqlite4BtBufAppendf(pBuf, ")\n");

    sqlite4BtBufAppendf(pBuf, "  iNextPg=%d\n", (int)s.iNextPg);
    sqlite4BtBufAppendf(pBuf, "  iNextCell=%d\n", (int)s.iNextCell);
    sqlite4BtBufAppendf(pBuf, "  nUsed=%d\n", (int)s.nUsed);
    sqlite4BtBufAppendf(pBuf, "  iFreeList=%d\n", (int)s.iFreeList);
  }else{
    sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell);
    sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData));
    sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData));
    if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
      sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1]));
    }
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
          /* Interpret the meta-tree entry */
          if( nKey==sizeof(aSummaryKey) && 0==memcmp(pKey, aSummaryKey, nKey) ){
            sqlite4BtBufAppendf(pBuf, "  [summary]");
          }else{
            u32 iAge = btGetU32(&pKey[0]);
            u32 iLevel = ~btGetU32(&pKey[4]);
            u32 iBlk = btGetU32(pVal);
            sqlite4BtBufAppendf(pBuf, "  [age=%d level=%d block=%d]", 
                (int)iAge, (int)iLevel, (int)iBlk
                );
          }
        }
      }
      sqlite4BtBufAppendf(pBuf, "\n");
    }







|







604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
          /* Interpret the meta-tree entry */
          if( nKey==sizeof(aSummaryKey) && 0==memcmp(pKey, aSummaryKey, nKey) ){
            sqlite4BtBufAppendf(pBuf, "  [summary]");
          }else{
            u32 iAge = btGetU32(&pKey[0]);
            u32 iLevel = ~btGetU32(&pKey[4]);
            u32 iBlk = btGetU32(pVal);
            sqlite4BtBufAppendf(pBuf, "  [age=%d level=%d root=%d]", 
                (int)iAge, (int)iLevel, (int)iBlk
                );
          }
        }
      }
      sqlite4BtBufAppendf(pBuf, "\n");
    }
1002
1003
1004
1005
1006
1007
1008



1009
1010
1011
1012
1013
1014
1015
1016
      rc = btErrorBkpt(SQLITE4_NOMEM);
    }
  }

  return rc;
}




static u32 btBlockToRoot(BtDbHdr *pHdr, u32 iBlk){
  assert( iBlk>0 );
  return (iBlk - 1) * (pHdr->blksz / pHdr->pgsz) + 1;
}

/*
** Return true if the cell that the argument cursor currently points to
** is a delete marker.







>
>
>
|







1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
      rc = btErrorBkpt(SQLITE4_NOMEM);
    }
  }

  return rc;
}

/*
** Return the page number of the first page on block iBlk.
*/
static u32 btFirstOfBlock(BtDbHdr *pHdr, u32 iBlk){
  assert( iBlk>0 );
  return (iBlk - 1) * (pHdr->blksz / pHdr->pgsz) + 1;
}

/*
** Return true if the cell that the argument cursor currently points to
** is a delete marker.
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
        rc = SQLITE4_NOTFOUND;
      }else{
        rc = btCsrData(&pSub->mcsr, 0, 4, &pV, &nV);
      }
    }
    if( rc==SQLITE4_OK ){
      BtDbHdr *pHdr = sqlite4BtPagerDbhdr(pCsr->base.pDb->pPager);
      pSub->csr.iRoot = btBlockToRoot(pHdr, sqlite4BtGetU32((const u8*)pV));
      rc = btCsrEnd(&pSub->csr, !bNext);
    }
  }

  return rc;
}








|







1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
        rc = SQLITE4_NOTFOUND;
      }else{
        rc = btCsrData(&pSub->mcsr, 0, 4, &pV, &nV);
      }
    }
    if( rc==SQLITE4_OK ){
      BtDbHdr *pHdr = sqlite4BtPagerDbhdr(pCsr->base.pDb->pPager);
      pSub->csr.iRoot = sqlite4BtGetU32((const u8*)pV);
      rc = btCsrEnd(&pSub->csr, !bNext);
    }
  }

  return rc;
}

1423
1424
1425
1426
1427
1428
1429

1430
1431
1432
1433
1434
1435
1436
  while( p->iLvl<(int)iMin ){
    p->iAge++;
    if( p->iAge>=(p->nSum)/6 ) return 1;
    btReadSummary(p->aSum, p->iAge, &iMin, &nLevel, 0);
    p->iLvl = (int)iMin + (int)nLevel - 1;
  }


  return 0;
}

static int fiLevelIterInit(bt_db *db, FiLevelIter *p){
  int rc;                         /* Return code */

  memset(p, 0, sizeof(FiLevelIter));







>







1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
  while( p->iLvl<(int)iMin ){
    p->iAge++;
    if( p->iAge>=(p->nSum)/6 ) return 1;
    btReadSummary(p->aSum, p->iAge, &iMin, &nLevel, 0);
    p->iLvl = (int)iMin + (int)nLevel - 1;
  }

  assert( p->iSub<p->nSub );
  return 0;
}

static int fiLevelIterInit(bt_db *db, FiLevelIter *p){
  int rc;                         /* Return code */

  memset(p, 0, sizeof(FiLevelIter));
1450
1451
1452
1453
1454
1455
1456









1457
1458
1459
1460
1461
1462
1463

  return rc;
}

static void fiLevelIterCleanup(FiLevelIter *p){
  btCsrReset(&p->csr, 1);
}










/*
** Seek a fast-insert cursor.
*/
static int fiCsrSeek(FiCursor *pCsr, const void *pK, int nK, int eSeek){
  u8 aPrefix[8];
  int rc = SQLITE4_NOTFOUND;      /* Return code */







>
>
>
>
>
>
>
>
>







1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481

  return rc;
}

static void fiLevelIterCleanup(FiLevelIter *p){
  btCsrReset(&p->csr, 1);
}

/*
** Format values iAge and iLvl into an 8 byte prefix as used in the
** meta-tree.
*/
static void fiFormatPrefix(u8 *aPrefix, u32 iAge, u32 iLvl){
  btPutU32(&aPrefix[0], iAge);
  btPutU32(&aPrefix[4], ~(u32)iLvl);
}

/*
** Seek a fast-insert cursor.
*/
static int fiCsrSeek(FiCursor *pCsr, const void *pK, int nK, int eSeek){
  u8 aPrefix[8];
  int rc = SQLITE4_NOTFOUND;      /* Return code */
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
    /* Initialize the iterator used to skip through database levels */
    rc = fiLevelIterInit(db, &iter);
    if( rc!=SQLITE4_OK ) return rc;

    if( eSeek==BT_SEEK_EQ ){
      FiSubCursor *pSub;
      BtCursor *pM;
      int iAge;

      /* A BT_SEEK_EQ is a special case. There is no need to set up a cursor
      ** that can be advanced (in either direction) in this case. All that
      ** is required is to search each level in order for the requested key 
      ** (or a corresponding delete marker). Once a match is found, there
      ** is no need to search any further. As a result, only a single
      ** sub-cursor is required.  */
      rc = fiCsrAllocateSubs(db, pCsr, 1);
      pSub = pCsr->aSub;
      pM = &pSub->mcsr;

      btCsrSetup(db, pHdr->iMRoot, pM);
      while( 0==fiLevelIterNext(&iter) ){

        u16 iMin;
        u16 nLevel;
        u16 iMerge;
        int iLvl;

        btPutU32(aPrefix, iter.iAge);
        btPutU32(&aPrefix[4], ~(u32)iter.iLvl);
        rc = btCsrSeek(pM, aPrefix, pK, nK, BT_SEEK_LE, BT_CSRSEEK_SEEK);

        if( rc==SQLITE4_NOTFOUND ){
          /* All keys in this level are greater than pK/nK. */
          /* no-op */
        }else if( rc==SQLITE4_OK || rc==SQLITE4_INEXACT ){
          const void *pV;
          int nV;
          int iBlk;
          sqlite4BtCsrData(&pM->base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(&pSub->csr, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), &pSub->csr);

          rc = btCsrSeek(&pSub->csr, 0, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK);
          assert( rc!=SQLITE4_INEXACT );
          if( rc!=SQLITE4_NOTFOUND ){
            /* A hit on the requested key or an error has occurred. Either
             ** way, break out of the loop. If this is a hit, set iBt to
             ** zero so that the BtCsrKey() and BtCsrData() routines know







<














<
<
<
<
<
|
<








|

|

|







1491
1492
1493
1494
1495
1496
1497

1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511





1512

1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
    /* Initialize the iterator used to skip through database levels */
    rc = fiLevelIterInit(db, &iter);
    if( rc!=SQLITE4_OK ) return rc;

    if( eSeek==BT_SEEK_EQ ){
      FiSubCursor *pSub;
      BtCursor *pM;


      /* A BT_SEEK_EQ is a special case. There is no need to set up a cursor
      ** that can be advanced (in either direction) in this case. All that
      ** is required is to search each level in order for the requested key 
      ** (or a corresponding delete marker). Once a match is found, there
      ** is no need to search any further. As a result, only a single
      ** sub-cursor is required.  */
      rc = fiCsrAllocateSubs(db, pCsr, 1);
      pSub = pCsr->aSub;
      pM = &pSub->mcsr;

      btCsrSetup(db, pHdr->iMRoot, pM);
      while( 0==fiLevelIterNext(&iter) ){






        fiFormatPrefix(aPrefix, iter.iAge, iter.iLvl);

        rc = btCsrSeek(pM, aPrefix, pK, nK, BT_SEEK_LE, BT_CSRSEEK_SEEK);

        if( rc==SQLITE4_NOTFOUND ){
          /* All keys in this level are greater than pK/nK. */
          /* no-op */
        }else if( rc==SQLITE4_OK || rc==SQLITE4_INEXACT ){
          const void *pV;
          int nV;
          u32 iRoot;
          sqlite4BtCsrData(&pM->base, 0, 4, &pV, &nV);
          iRoot = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(&pSub->csr, 1);
          btCsrSetup(db, iRoot, &pSub->csr);

          rc = btCsrSeek(&pSub->csr, 0, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK);
          assert( rc!=SQLITE4_INEXACT );
          if( rc!=SQLITE4_NOTFOUND ){
            /* A hit on the requested key or an error has occurred. Either
             ** way, break out of the loop. If this is a hit, set iBt to
             ** zero so that the BtCsrKey() and BtCsrData() routines know
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575

        if( rc==SQLITE4_NOTFOUND ){
          /* No keys to visit in this level */
          rc = SQLITE4_OK;
        }else if( rc==SQLITE4_OK || rc==SQLITE4_INEXACT ){
          const void *pV;
          int nV;
          int iBlk;
          sqlite4BtCsrData(&pM->base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(&pSub->csr, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), &pSub->csr);

          rc = btCsrSeek(&pSub->csr, 0, pK, nK, eSeek, BT_CSRSEEK_SEEK);
          if( rc==SQLITE4_OK ) bMatch = 1;
          if( rc==SQLITE4_INEXACT ) bHit = 1;
          if( rc==SQLITE4_INEXACT || rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;

        }else{







|

|

|







1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586

        if( rc==SQLITE4_NOTFOUND ){
          /* No keys to visit in this level */
          rc = SQLITE4_OK;
        }else if( rc==SQLITE4_OK || rc==SQLITE4_INEXACT ){
          const void *pV;
          int nV;
          u32 iRoot;
          sqlite4BtCsrData(&pM->base, 0, 4, &pV, &nV);
          iRoot = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(&pSub->csr, 1);
          btCsrSetup(db, iRoot, &pSub->csr);

          rc = btCsrSeek(&pSub->csr, 0, pK, nK, eSeek, BT_CSRSEEK_SEEK);
          if( rc==SQLITE4_OK ) bMatch = 1;
          if( rc==SQLITE4_INEXACT ) bHit = 1;
          if( rc==SQLITE4_INEXACT || rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;

        }else{
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
      }
    }
    if( rc==SQLITE4_INEXACT ) rc = SQLITE4_OK;

    if( rc==SQLITE4_OK ){
      const void *pV;
      int nV;
      int iBlk;
      btCsrData(&pSub->mcsr, 0, 4, &pV, &nV);
      iBlk = sqlite4BtGetU32((const u8*)pV);
      btCsrReset(&pSub->csr, 1);
      btCsrSetup(db, btBlockToRoot(pHdr, iBlk), &pSub->csr);
      rc = btCsrEnd(&pSub->csr, bLast);
    }
  }
  fiLevelIterCleanup(&iter);

  if( rc==SQLITE4_OK ){
    pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK);







|

|

|







1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
      }
    }
    if( rc==SQLITE4_INEXACT ) rc = SQLITE4_OK;

    if( rc==SQLITE4_OK ){
      const void *pV;
      int nV;
      int iRoot;
      btCsrData(&pSub->mcsr, 0, 4, &pV, &nV);
      iRoot = sqlite4BtGetU32((const u8*)pV);
      btCsrReset(&pSub->csr, 1);
      btCsrSetup(db, iRoot, &pSub->csr);
      rc = btCsrEnd(&pSub->csr, bLast);
    }
  }
  fiLevelIterCleanup(&iter);

  if( rc==SQLITE4_OK ){
    pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK);
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
*/
static int btAllocateNonOverflow(bt_db *db, BtPage **ppPg){
  int rc;
  if( db->bFastInsertOp ){
    BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
    u32 iPg;

    iPg = pHdr->nSubPg + btBlockToRoot(pHdr, pHdr->iSubBlock);
    pHdr->nSubPg++;
    rc = sqlite4BtPageGet(db->pPager, iPg, ppPg);
    if( rc==SQLITE4_OK ){
      rc = sqlite4BtPageWrite(*ppPg);
      if( rc!=SQLITE4_OK ){
        sqlite4BtPageRelease(*ppPg);
      }







|







2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
*/
static int btAllocateNonOverflow(bt_db *db, BtPage **ppPg){
  int rc;
  if( db->bFastInsertOp ){
    BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
    u32 iPg;

    iPg = pHdr->nSubPg + btFirstOfBlock(pHdr, pHdr->iSubBlock);
    pHdr->nSubPg++;
    rc = sqlite4BtPageGet(db->pPager, iPg, ppPg);
    if( rc==SQLITE4_OK ){
      rc = sqlite4BtPageWrite(*ppPg);
      if( rc!=SQLITE4_OK ){
        sqlite4BtPageRelease(*ppPg);
      }
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
          rc = btInsertAndBalance(&csr, 1, &kv);

          /* Unless this is a block-full error, break out of the loop */
          if( rc!=BT_BLOCKFULL ) break;
          assert( iRoot==0 );

          /* Try to schedule a merge operation */
#if 0
          rc = btScheduleMerge(db);
#endif
          rc = SQLITE4_OK;

          if( rc==SQLITE4_OK ){
            rc = btFastInsertRoot(db, pHdr, &iRootPg);
          }
          if( rc==SQLITE4_OK ){
            btCsrReset(&csr, 1);
            btCsrSetup(db, iRootPg, &csr);







<

<
<







3325
3326
3327
3328
3329
3330
3331

3332


3333
3334
3335
3336
3337
3338
3339
          rc = btInsertAndBalance(&csr, 1, &kv);

          /* Unless this is a block-full error, break out of the loop */
          if( rc!=BT_BLOCKFULL ) break;
          assert( iRoot==0 );

          /* Try to schedule a merge operation */

          rc = btScheduleMerge(db);



          if( rc==SQLITE4_OK ){
            rc = btFastInsertRoot(db, pHdr, &iRootPg);
          }
          if( rc==SQLITE4_OK ){
            btCsrReset(&csr, 1);
            btCsrSetup(db, iRootPg, &csr);
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
        /* Find the input age and maximum level */
        btReadSummary(aSum, iBestAge, &iMin, &nLevel, &iMerge);
        *piMinLevel = (u32)iMin;
        *piMaxLevel = (u32)(iMin + nLevel - 1);
        *piAge = iBestAge;

        /* Find the output level */
        btReadSummary(aSum, iBestAge+1, &iMin, &nLevel, &iMerge);
        *piOutLevel = iMin + nLevel;

        /* Update the summary record for the output segment. */
        btWriteSummary(aNew, iBestAge+1, iMin, nLevel+1, iMerge);

        /* Write the updated summary record back to the db. */
        rc = btReplaceEntry(







|







3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
        /* Find the input age and maximum level */
        btReadSummary(aSum, iBestAge, &iMin, &nLevel, &iMerge);
        *piMinLevel = (u32)iMin;
        *piMaxLevel = (u32)(iMin + nLevel - 1);
        *piAge = iBestAge;

        /* Find the output level */
        btReadSummary(aNew, iBestAge+1, &iMin, &nLevel, &iMerge);
        *piOutLevel = iMin + nLevel;

        /* Update the summary record for the output segment. */
        btWriteSummary(aNew, iBestAge+1, iMin, nLevel+1, iMerge);

        /* Write the updated summary record back to the db. */
        rc = btReplaceEntry(
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726


3727











































3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743

3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761

3762
3763
3764
3765
3766
3767
3768

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);
      btPutU32(&aVal[0], iSubBlock);
      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 4);
    }
  }

  if( rc==SQLITE4_OK ){
    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
  }
  db->bFastInsertOp = 1;
  return rc;
}

/*
** Set up a fast-insert cursor to read the input data for a merge operation.
*/
static int fiSetupMergeCsr(
  bt_db *db,                      /* Database handle */
  BtDbHdr *pHdr,                  /* Current database header values */
  BtSchedule *p,                  /* Description of merge operation */
  FiCursor *pCsr                  /* Populate this object before returning */
){
  int iBt;                        /* Used to loop through component cursors */
  int rc;                         /* Return code */

  memset(pCsr, 0, sizeof(FiCursor));
  pCsr->base.flags = CSR_TYPE_FAST;
  pCsr->base.pDb = db;

  rc = fiCsrAllocateSubs(db, pCsr, (p->iMaxLevel - p->iMinLevel) + 1);
  assert( rc==SQLITE4_OK || pCsr->nBt==0 );


  for(iBt=0; iBt<pCsr->nBt && rc==SQLITE4_OK; iBt++){











































  }

  return rc;
}

/*
** This is called by a checkpointer to handle a schedule page.
*/
int sqlite4BtMerge(bt_db *db, BtDbHdr *pHdr, u8 *aSched){
  BtSchedule s;                   /* Deserialized schedule object */
  FiCursor fcsr;                  /* FiCursor used to read input */
  int rc;                         /* Return code */

  /* Set up the input cursor. */
  btReadSchedule(db, aSched, &s);
  assert( s.bBusy );

  rc = fiSetupMergeCsr(db, pHdr, &s, &fcsr);

  /* The following loop runs once for each key copied from the input to
  ** the output segments. It terminates either when the input is exhausted
  ** or when all available output blocks are full.  */
  while( rc==SQLITE4_OK ){
    rc = fiCsrStep(&fcsr);
  }

  /* Assuming no error has occurred, update the serialized BtSchedule
  ** structure stored in buffer aSched[]. The caller will write this
  ** buffer to the database file as page (pHdr->iSRoot).  */
  if( rc==SQLITE4_OK || rc==SQLITE4_NOTFOUND ){
    rc = SQLITE4_OK;
    btWriteSchedule(aSched, &s, &rc);
  }

  fiCsrReset(&fcsr);

  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending







|





|














|



|

<


>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










<
|



|
>
|

|
|
|
|
|
|

|
|
|
|
|
|
|

|
>







3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731

3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789

3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);
      btPutU32(&aVal[0], btFirstOfBlock(pHdr, iSubBlock));
      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 4);
    }
  }

  if( rc==SQLITE4_OK ){
    *piRoot = btFirstOfBlock(pHdr, pHdr->iSubBlock);
  }
  db->bFastInsertOp = 1;
  return rc;
}

/*
** Set up a fast-insert cursor to read the input data for a merge operation.
*/
static int fiSetupMergeCsr(
  bt_db *db,                      /* Database handle */
  BtDbHdr *pHdr,                  /* Current database header values */
  BtSchedule *p,                  /* Description of merge operation */
  FiCursor *pCsr                  /* Populate this object before returning */
){
  int iSub;                       /* Used to loop through component cursors */
  int rc;                         /* Return code */

  memset(pCsr, 0, sizeof(FiCursor));
  pCsr->base.flags = CSR_TYPE_FAST | CSR_NEXT_OK | CSR_VISIT_DEL;
  pCsr->base.pDb = db;

  rc = fiCsrAllocateSubs(db, pCsr, (p->iMaxLevel - p->iMinLevel) + 1);
  assert( rc==SQLITE4_OK || pCsr->nBt==0 );

  /* Initialize each sub-cursor */
  for(iSub=0; iSub<pCsr->nBt && rc==SQLITE4_OK; iSub++){
    u32 iLvl = p->iMaxLevel - iSub;
    FiSubCursor *pSub = &pCsr->aSub[iSub];
    BtCursor *pM = &pSub->mcsr;
    const void *pKey = 0; int nKey = 0;

    /* Seek the meta-tree cursor to the first entry (smallest keys) for the
    ** current level. If an earlier merge operation completely emptied the
    ** level, the sought entry may not exist at all.  */
    fiFormatPrefix(pSub->aPrefix, p->iAge, iLvl);
    btCsrSetup(db, pHdr->iMRoot, pM);
    rc = btCsrSeek(pM, 0, pSub->aPrefix, sizeof(pSub->aPrefix), BT_SEEK_GE, 0);

    if( rc==SQLITE4_INEXACT ){
      const int nPrefix = sizeof(pSub->aPrefix);
      rc = btCsrKey(pM, &pKey, &nKey);
      if( rc==SQLITE4_OK ){
        if( nKey<nPrefix || memcmp(pKey, pSub->aPrefix, nPrefix) ){
          /* Level is completely empty. Nothing to do for this level. */
          rc = SQLITE4_NOTFOUND;
        }else{
          nKey -= nPrefix;
          pKey = (const void*)(((const u8*)pKey) + nPrefix);
        }
      }
    }

    /* Assuming the process above found a block, set up the block cursor and
    ** seek it to the smallest first valid key.  */
    if( rc==SQLITE4_OK ){
      const void *pVal = 0; int nVal = 0;
      rc = btCsrData(pM, 0, 4, &pVal, &nVal);
      if( rc==SQLITE4_OK ){
        u32 iRoot = sqlite4BtGetU32((const u8*)pVal);
        btCsrSetup(db, iRoot, &pSub->csr);
        rc = btCsrSeek(&pSub->csr, 0, pKey, nKey, BT_SEEK_GE, 0);
        if( rc==SQLITE4_INEXACT ) rc = SQLITE4_OK;
        if( rc==SQLITE4_NOTFOUND ) rc = btErrorBkpt(SQLITE4_CORRUPT);
      }
    }
  }

  if( rc==SQLITE4_OK ){
    rc = fiCsrSetCurrent(pCsr);
  }

  return rc;
}

/*
** This is called by a checkpointer to handle a schedule page.
*/
int sqlite4BtMerge(bt_db *db, BtDbHdr *pHdr, u8 *aSched){
  BtSchedule s;                   /* Deserialized schedule object */

  int rc = SQLITE4_OK;            /* Return code */

  /* Set up the input cursor. */
  btReadSchedule(db, aSched, &s);
  if( s.bBusy ){
    FiCursor fcsr;                /* FiCursor used to read input */
    rc = fiSetupMergeCsr(db, pHdr, &s, &fcsr);

    /* The following loop runs once for each key copied from the input to
    ** the output segments. It terminates either when the input is exhausted
    ** or when all available output blocks are full.  */
    while( rc==SQLITE4_OK ){
      rc = fiCsrStep(&fcsr);
    }

    /* Assuming no error has occurred, update the serialized BtSchedule
    ** structure stored in buffer aSched[]. The caller will write this
    ** buffer to the database file as page (pHdr->iSRoot).  */
    if( rc==SQLITE4_OK || rc==SQLITE4_NOTFOUND ){
      rc = SQLITE4_OK;
      btWriteSchedule(aSched, &s, &rc);
    }

    fiCsrReset(&fcsr);
  }
  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending
Changes to src/bt_pager.c.
760
761
762
763
764
765
766





767
768
769
770
771
772
773

/* 
** Query for the root page number. Requires an open read transaction.
*/
BtDbHdr *sqlite4BtPagerDbhdr(BtPager *p){
  return p->pHdr;
}






/*
** Request a reference to page pgno of the database.
*/
int sqlite4BtPageGet(BtPager *p, u32 pgno, BtPage **ppPg){
  int rc = SQLITE4_OK;            /* Return code */
  BtPage *pRet;                   /* Returned page handle */







>
>
>
>
>







760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778

/* 
** Query for the root page number. Requires an open read transaction.
*/
BtDbHdr *sqlite4BtPagerDbhdr(BtPager *p){
  return p->pHdr;
}

void sqlite4BtPagerSetDbhdr(BtPager *p, BtDbHdr *pHdr){
  assert( p->pHdr==0 || pHdr==0 );
  p->pHdr = pHdr;
}

/*
** Request a reference to page pgno of the database.
*/
int sqlite4BtPageGet(BtPager *p, u32 pgno, BtPage **ppPg){
  int rc = SQLITE4_OK;            /* Return code */
  BtPage *pRet;                   /* Returned page handle */