SQLite4
Check-in [27248a1ebc]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
SHA1 Hash:27248a1ebcd52fa9c0f51c0f2a235d0d8c0ee9de
Date: 2014-01-02 18:53:22
User: dan
Comment:Changes to FiCursor object to support reading merged blocks.
Tags And Properties
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/btInt.h

94
95
96
97
98
99
100

101
102
103
104
105
106
107
108
109


110
111
112
113
114
115
116
/*
** This struct defines the format of database "schedule" pages.
*/
typedef struct BtSchedule BtSchedule;
struct BtSchedule {
  u32 bBusy;                      /* True if page is busy */
  u32 iAge;                       /* Age of input segments */

  u32 iMaxLevel;                  /* Maximum level of input segments */
  u32 iOutLevel;                  /* Level at which to write output */
  u32 aBlock[32];                 /* Allocated blocks */

  u32 iNextPg;                    /* Page that contains next input key */
  u32 iNextCell;                  /* Cell that contains next input key */
  u32 nUsed;                      /* Number of output blocks actually written */
  u32 iFreeList;                  /* First page of new free-list (if any) */
};



/*************************************************************************
** Interface to bt_pager.c functionality.
*/
typedef struct BtPage BtPage;
typedef struct BtPager BtPager;








>









>
>







94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
/*
** This struct defines the format of database "schedule" pages.
*/
typedef struct BtSchedule BtSchedule;
struct BtSchedule {
  u32 bBusy;                      /* True if page is busy */
  u32 iAge;                       /* Age of input segments */
  u32 iMinLevel;                  /* Minimum level of input segments */
  u32 iMaxLevel;                  /* Maximum level of input segments */
  u32 iOutLevel;                  /* Level at which to write output */
  u32 aBlock[32];                 /* Allocated blocks */

  u32 iNextPg;                    /* Page that contains next input key */
  u32 iNextCell;                  /* Cell that contains next input key */
  u32 nUsed;                      /* Number of output blocks actually written */
  u32 iFreeList;                  /* First page of new free-list (if any) */
};

int sqlite4BtMerge(bt_db *db, BtDbHdr *pHdr, u8 *aSched);

/*************************************************************************
** Interface to bt_pager.c functionality.
*/
typedef struct BtPage BtPage;
typedef struct BtPager BtPager;

Changes to src/bt_log.c

1797
1798
1799
1800
1801
1802
1803






1804
1805
1806
1807
1808
1809
1810
....
1851
1852
1853
1854
1855
1856
1857
1858


1859

1860
1861

1862
1863
1864
1865
1866
1867
1868
    + (pLog->snapshot.aLog[0]!=0)
    + (int)pLog->snapshot.aLog[3] - (int)pLog->snapshot.aLog[2]
    + (pLog->snapshot.aLog[3]!=0)
    + (int)pLog->snapshot.aLog[5] - (int)pLog->snapshot.aLog[4]
    + (pLog->snapshot.aLog[5]!=0)
  ;
}







int sqlite4BtLogCheckpoint(BtLog *pLog, int nFrameBuffer){
  BtLock *pLock = pLog->pLock;
  int rc;

  /* Take the CHECKPOINTER lock. */
  rc = sqlite4BtLockCkpt(pLock);
................................................................................
      /* Copy data from the log file to the database file. */
      for(i=0; rc==SQLITE4_OK && i<nPgno; i++){
        u32 pgno = aPgno[i];
        rc = btLogRead(pLog, pgno, aBuf, iLast);
        if( rc==SQLITE4_OK ){
          i64 iOff = (i64)pgsz * (pgno-1);
          if( pgno==1 ){
            btLogUpdateDbhdr(pLog, aBuf);


          }

          btDebugCkptPage(pLog->pLock, pgno, aBuf, pgsz);
          rc = pVfs->xWrite(pFd, iOff, aBuf, pgsz);

        }else if( rc==SQLITE4_NOTFOUND ){
          rc = SQLITE4_OK;
        }
      }

      /* Sync the database file to disk. */
      if( rc==SQLITE4_OK ){







>
>
>
>
>
>







 







|
>
>

>
|
|
>







1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
....
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
    + (pLog->snapshot.aLog[0]!=0)
    + (int)pLog->snapshot.aLog[3] - (int)pLog->snapshot.aLog[2]
    + (pLog->snapshot.aLog[3]!=0)
    + (int)pLog->snapshot.aLog[5] - (int)pLog->snapshot.aLog[4]
    + (pLog->snapshot.aLog[5]!=0)
  ;
}

static int btLogMerge(BtLog *pLog, u8 *aBuf){
  bt_db *db = (bt_db*)sqlite4BtPagerExtra((BtPager*)pLog->pLock);
  return sqlite4BtMerge(db, &pLog->snapshot.dbhdr, aBuf);
}


int sqlite4BtLogCheckpoint(BtLog *pLog, int nFrameBuffer){
  BtLock *pLock = pLog->pLock;
  int rc;

  /* Take the CHECKPOINTER lock. */
  rc = sqlite4BtLockCkpt(pLock);
................................................................................
      /* Copy data from the log file to the database file. */
      for(i=0; rc==SQLITE4_OK && i<nPgno; i++){
        u32 pgno = aPgno[i];
        rc = btLogRead(pLog, pgno, aBuf, iLast);
        if( rc==SQLITE4_OK ){
          i64 iOff = (i64)pgsz * (pgno-1);
          if( pgno==1 ){
            rc = btLogUpdateDbhdr(pLog, aBuf);
          }else if( pgno==pLog->snapshot.dbhdr.iSRoot ){
            rc = btLogMerge(pLog, aBuf);
          }
          if( rc==SQLITE4_OK ){
            btDebugCkptPage(pLog->pLock, pgno, aBuf, pgsz);
            rc = pVfs->xWrite(pFd, iOff, aBuf, pgsz);
          }
        }else if( rc==SQLITE4_NOTFOUND ){
          rc = SQLITE4_OK;
        }
      }

      /* Sync the database file to disk. */
      if( rc==SQLITE4_OK ){

Changes to src/bt_main.c

27
28
29
30
31
32
33

34
35
36
37
38
39
40
..
89
90
91
92
93
94
95





96
97
98
99
100
101
102







103
104
105
106
107
108
109
...
123
124
125
126
127
128
129

130
131
132
133
134
135
136
137
...
330
331
332
333
334
335
336
337

338
339
340
341
342
343
344
345
346
347
...
520
521
522
523
524
525
526

527
528
529
530
531
532
533
...
649
650
651
652
653
654
655

656
657
658
659
660
661
662
...
686
687
688
689
690
691
692











693
694
695
696
697
698
699

700
701
702
703
704
705
706
707
708
709
710

711
712
713
714
715
716
717
...
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
...
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
...
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
....
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
....
1268
1269
1270
1271
1272
1273
1274














































1275
1276
1277
1278
1279
1280
1281
....
1284
1285
1286
1287
1288
1289
1290

1291
1292

1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325























































1326
1327
1328
1329

1330
1331
1332
1333
1334
1335
1336
1337






1338




1339
1340
1341
1342
1343

1344
1345
1346
1347
1348

1349
1350
1351
1352
1353
1354

1355
1356
1357
1358
1359




1360



1361







1362
1363
1364
1365

1366

1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426





1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453

1454

1455
1456
1457

1458
1459
1460
1461
1462

1463
1464









1465
1466

1467
1468
1469
1470
1471
1472

1473




1474
1475

1476

1477
1478
1479
1480
1481

1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
....
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
....
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
....
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
....
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
3133
3134
3135
....
3150
3151
3152
3153
3154
3155
3156

3157


3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
....
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
....
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
....
3307
3308
3309
3310
3311
3312
3313












3314
3315
3316
3317
3318
3319
3320

3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
....
3340
3341
3342
3343
3344
3345
3346

3347
3348
3349
3350
3351
3352
3353
....
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367

3368
3369
3370
3371
3372
3373
3374
....
3392
3393
3394
3395
3396
3397
3398

3399
3400
3401
3402
3403
3404
3405
....
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434

3435
3436
3437
3438
3439


3440
3441
3442
3443
3444
3445
3446
....
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471

3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
....
3524
3525
3526
3527
3528
3529
3530
























































3531
3532
3533
3534
3535
3536
3537
....
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
#define BT_PGFLAGS_METATREE 0x02  /* True for a meta-tree page */
#define BT_PGFLAGS_SCHEDULE 0x04  /* True for a schedule-tree page */

/* #define BT_STDERR_DEBUG 1 */

typedef struct BtCursor BtCursor;
typedef struct FiCursor FiCursor;


struct bt_db {
  sqlite4_env *pEnv;              /* SQLite environment */
  BtPager *pPager;                /* Underlying page-based database */
  bt_cursor *pAllCsr;             /* List of all open cursors */
  int nMinMerge;
  int nScheduleAlloc;
................................................................................
  int bSkipNext;                  /* True if next CsrNext() is a no-op */
  int bSkipPrev;                  /* True if next CsrPrev() is a no-op */
};

/*
** Database f-tree cursor handle.
*/





struct FiCursor {
  bt_cursor base;                 /* Base cursor class */
  int iBt;                        /* Current sub-tree (or -1 for EOF) */
  int nBt;                        /* Number of entries in aBt[] array */
  BtCursor *aBt;                  /* Array of sub-tree cursors */
};








/* 
** Meta-table summary key.
*/
static const u8 aSummaryKey[] = {0xFF, 0xFF, 0xFF, 0xFF};

#ifndef btErrorBkpt
int btErrorBkpt(int rc){
................................................................................
    if( IsBtCsr(pCsr) ){
      BtCursor *p = (BtCursor*)pCsr;
      if( p->nPg>0 ) nExpect += p->nPg;
    }else{
      FiCursor *p = (FiCursor*)pCsr;
      int i;
      for(i=0; i<p->nBt; i++){

        if( p->aBt[i].nPg>0 ) nExpect += p->aBt[i].nPg;
      }
    }
  }
  nActual = sqlite4BtPagerRefcount(pDb->pPager);
  assert( nActual==nExpect );
}
#else
................................................................................
  pCsr->bRequireReseek = 0;
}

static void fiCsrReset(FiCursor *pCsr){
  int i;
  bt_db *db = pCsr->base.pDb;
  for(i=0; i<pCsr->nBt; i++){
    btCsrReset(&pCsr->aBt[i], 1);

  }
  sqlite4_free(db->pEnv, pCsr->aBt);
  pCsr->aBt = 0;
  pCsr->nBt = 0;
  pCsr->iBt = -1;
}

int sqlite4BtCsrClose(bt_cursor *pCsr){
  if( pCsr ){
    bt_cursor **pp;
................................................................................
    int i;
    BtSchedule s;
    sqlite4BtBufAppendf(pBuf, "(schedule page) ");
    memcpy(&s, aData, sizeof(s));

    sqlite4BtBufAppendf(pBuf, "bBusy=%d ", (int)s.bBusy);
    sqlite4BtBufAppendf(pBuf, "iAge=%d ", (int)s.iAge);

    sqlite4BtBufAppendf(pBuf, "iMaxLevel=%d ", (int)s.iMaxLevel);
    sqlite4BtBufAppendf(pBuf, "iOutLevel=%d ", (int)s.iOutLevel);
    sqlite4BtBufAppendf(pBuf, "blocks=(");
    for(i=0; s.aBlock[i] && i<array_size(s.aBlock); i++){
      sqlite4BtBufAppendf(pBuf, "%s%d", i==0 ? "" : " ", (int)s.aBlock[i]);
    }
    sqlite4BtBufAppendf(pBuf, ") ");
................................................................................
**
** In other words, *piRes is +ve, zero or -ve if C is respectively larger, 
** equal to or smaller than K.
*/
static int btCellKeyCompare(
  BtCursor *pCsr,                 /* Cursor handle */
  int bLeaf,                      /* True if cursor currently points to leaf */

  const void *pK, int nK,         /* Key to compare against cursor key */
  int *piRes                      /* OUT: Result of comparison */
){
  const void *pCsrKey;
  int nCsrKey;
  int nCmp;
  int nAscend = 0;
................................................................................
        iCell = 0;
      }
      rc = sqlite4BtCsrKey((bt_cursor*)pCsr, &pCsrKey, &nCsrKey);
    }
  }

  if( rc==SQLITE4_OK ){











    nCmp = MIN(nCsrKey, nK);
    res = memcmp(pCsrKey, pK, nCmp);
    if( res==0 ){
      res = nCsrKey - nK;
    }
  }


  btCsrAscend(pCsr, nAscend);
  *piRes = res;
  return rc;
}

#define BT_CSRSEEK_SEEK   0
#define BT_CSRSEEK_UPDATE 1
#define BT_CSRSEEK_RESEEK 2

static int btCsrSeek(
  BtCursor *pCsr, 

  const void *pK,                 /* Key to seek for */
  int nK,                         /* Size of key pK in bytes */
  int eSeek,                      /* Seek mode (a BT_SEEK_XXX constant) */
  int eCsrseek
){
  const int pgsz = sqlite4BtPagerPagesize(pCsr->base.pDb->pPager);
  u32 pgno;                       /* Page number for next page to load */
................................................................................
        rc = SQLITE4_NOTFOUND;
        break;
      }

      while( iHi>iLo ){
        int iTst = (iHi+iLo)/2;   /* Cell to compare to pK/nK */
        pCsr->aiCell[pCsr->nPg-1] = iTst;
        rc = btCellKeyCompare(pCsr, bLeaf, pK, nK, &res);
        if( rc!=SQLITE4_OK || res==0 ){
          /* Cell iTst is EQUAL to pK/nK */
          iHi = iLo = iTst;
        }else if( res<0 ){
          /* Cell iTst is SMALLER than pK/nK */
          iLo = iTst+1;
        }else{
................................................................................
  }
  return rc;
}

static int btCsrReseek(BtCursor *pCsr){
  int rc = SQLITE4_OK;
  if( pCsr->bRequireReseek ){
    BtOvfl ovfl;
    memcpy(&ovfl, &pCsr->ovfl, sizeof(BtOvfl));

    pCsr->ovfl.buf.n = 0;
    pCsr->ovfl.buf.p = 0;
    pCsr->bSkipNext = 0;
    pCsr->bRequireReseek = 0;

    rc = btCsrSeek(pCsr, ovfl.buf.p, ovfl.nKey, BT_SEEK_EQ, BT_CSRSEEK_RESEEK);
    assert( rc!=SQLITE4_INEXACT );
    if( pCsr->ovfl.buf.p==0 ){
      pCsr->ovfl.buf.p = ovfl.buf.p;
    }else{
      sqlite4_buffer_clear(&ovfl.buf);
    }
  }
  return rc;
}

/*
** This function does the work of both sqlite4BtCsrNext() (if parameter
................................................................................
  return rc;
}


static int fiCsrAllocateSubs(bt_db *db, FiCursor *pCsr, int nBt){
  int rc = SQLITE4_OK;            /* Return code */
  if( nBt>pCsr->nBt ){
    int nByte = sizeof(BtCursor) * nBt;
    BtCursor *aNew;               /* Allocated array */

    aNew = (BtCursor*)sqlite4_realloc(db->pEnv, pCsr->aBt, nByte);
    if( aNew ){
      memset(&aNew[pCsr->nBt], 0, sizeof(BtCursor)*(nBt-pCsr->nBt));
      pCsr->aBt = aNew;
      pCsr->nBt = nBt;
    }else{
      rc = btErrorBkpt(SQLITE4_NOMEM);
    }
  }

  return rc;
................................................................................
  int rc = SQLITE4_OK;
  int mul;

  assert( pCsr->base.flags & (CSR_NEXT_OK | CSR_PREV_OK) );
  mul = ((pCsr->base.flags & CSR_NEXT_OK) ? 1 : -1);

  for(i=0; i<pCsr->nBt && rc==SQLITE4_OK; i++){
    BtCursor *pSub = &pCsr->aBt[i];
    if( pSub->nPg>0 ){
      int nKey;
      const void *pKey;

      rc = btCsrKey(pSub, &pKey, &nKey);
      if( rc==SQLITE4_OK 
          && (iMin<0 || (mul * btKeyCompare(pKey, nKey, pMin, nMin))<0) 
................................................................................
    }
  }

  if( iMin<0 && rc==SQLITE4_OK ) rc = SQLITE4_NOTFOUND;
  pCsr->iBt = iMin;
  return rc;
}















































/*
** Advance the cursor. The direction (xPrev or xNext) is implied by the
** cursor itself - as fast-insert cursors may only be advanced in one
** direction.
*/
static int fiCsrStep(FiCursor *pCsr){
................................................................................
  const void *pKey; int nKey;     /* Current key that cursor points to */
  int i;

  assert( pCsr->base.flags & (CSR_NEXT_OK | CSR_PREV_OK) );
  assert( pCsr->iBt>=0 );

  do{

    /* Advance all bt cursors that share a key with the current bt cursor. */
    rc = btCsrKey(&pCsr->aBt[pCsr->iBt], &pKey, &nKey);

    for(i=0; rc==SQLITE4_OK && i<pCsr->nBt; i++){
      BtCursor *pSub = &pCsr->aBt[i];
      if( i!=pCsr->iBt && pSub->nPg>0 ){
        const void *p; int n;       /* Key that this sub-cursor points to */
        rc = btCsrKey(&pCsr->aBt[i], &p, &n);
        if( rc==SQLITE4_OK && btKeyCompare(p, n, pKey, nKey)==0 ){
          rc = btCsrStep(pSub, bNext);
          if( rc==SQLITE4_NOTFOUND ){
            assert( pSub->nPg==0 );
            rc = SQLITE4_OK;
          }
        }
      }
    }

    /* Advance the current bt cursor */
    if( rc==SQLITE4_OK ){
      rc = btCsrStep(&pCsr->aBt[pCsr->iBt], bNext);
      if( rc==SQLITE4_NOTFOUND ){
        assert( pCsr->aBt[pCsr->iBt].nPg==0 );
        rc = SQLITE4_OK;
      }
    }

    /* Figure out a new current bt cursor */
    if( rc==SQLITE4_OK ){
      rc = fiCsrSetCurrent(pCsr);
    }
  }while( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aBt[pCsr->iBt]) );

  return rc;
}
























































/*
** Seek a fast-insert cursor.
*/
static int fiCsrSeek(FiCursor *pCsr, const void *pK, int nK, int eSeek){

  int rc = SQLITE4_OK;            /* Return code */
  bt_db *db = pCsr->base.pDb;     /* Database handle */
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);

  assert( eSeek==BT_SEEK_LE || eSeek==BT_SEEK_EQ || eSeek==BT_SEEK_GE );
  fiCsrReset(pCsr);

  if( pHdr->iMRoot ){






    if( eSeek==BT_SEEK_EQ ){




      /* A BT_SEEK_EQ is a special case. There is no need to set up a cursor
      ** that can be advanced (in either direction) in this case. All that
      ** is required is to search each level in order for the requested key 
      ** (or a corresponding delete marker). Once a match is found, there
      ** is no need to search any further.  */

      rc = fiCsrAllocateSubs(db, pCsr, 1);

      if( rc==SQLITE4_OK ){
        BtCursor mcsr;            /* Used to iterate through the meta-tree */
        BtCursor *pSub = pCsr->aBt;


        /* Loop through the set of f-tree levels */
        btCsrSetup(db, pHdr->iMRoot, &mcsr);
        for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
          const void *pV; int nV; /* Value associated with meta-tree entry */
          u32 iBlk;               /* Block containing sub-tree */


          btCsrKey(&mcsr, &pV, &nV);
          if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){
            rc = SQLITE4_NOTFOUND;
            break;




          }











          sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(pSub, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);

          rc = btCsrSeek(pSub, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK);

          if( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ){
            /* A hit on the requested key or an error has occurred. Either
            ** way, break out of the loop. If this is a hit, set iBt to
            ** zero so that the BtCsrKey() and BtCsrData() routines know
            ** to return data from the first (only) sub-cursor. */
            assert( pCsr->iBt<0 );
            if( rc==SQLITE4_OK ){
              if( 0==fiCsrIsDelete(pSub) ){
                pCsr->iBt = 0;
              }else{
                rc = SQLITE4_NOTFOUND;
              }
            }
            break;
          }
        }
        btCsrReset(&mcsr, 1);
      }
    }else{
      int bMatch = 0;           /* Found an exact match */
      int bHit = 0;             /* Found at least one entry */
      int iSub = 0;
      BtCursor mcsr;            /* Used to iterate through the meta-tree */

      /* Loop through the set of f-tree levels. Open a cursor on each. */
      btCsrSetup(db, pHdr->iMRoot, &mcsr);
      for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
        const void *pV; int nV;   /* Value associated with meta-tree entry */
        u32 iBlk;                 /* Block containing sub-tree */
        BtCursor *pSub;           /* Sub-cursor for this block */

        rc = fiCsrAllocateSubs(db, pCsr, iSub+1);
        if( rc!=SQLITE4_OK ) break;
        pSub = &pCsr->aBt[iSub];
        iSub++;

        btCsrKey(&mcsr, &pV, &nV);
        if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){
          rc = SQLITE4_NOTFOUND;
          break;
        }

        sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
        iBlk = sqlite4BtGetU32((const u8*)pV);

        btCsrReset(pSub, 1);
        btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);
        rc = btCsrSeek(pSub, pK, nK, eSeek, BT_CSRSEEK_SEEK);
        if( rc==SQLITE4_OK ){
          bMatch = 1;
        }else if( rc==SQLITE4_INEXACT ){
          bHit = 1;
        }else if( rc!=SQLITE4_NOTFOUND ){
          break;
        }
      }
      assert( rc!=SQLITE4_OK && rc!=SQLITE4_INEXACT );
      btCsrReset(&mcsr, 1);

      if( rc==SQLITE4_NOTFOUND ){





        pCsr->base.flags |= (eSeek==BT_SEEK_GE ? CSR_NEXT_OK : CSR_PREV_OK);
        rc = fiCsrSetCurrent(pCsr);
        if( rc==SQLITE4_OK ){
          BtCursor *pSub = &pCsr->aBt[pCsr->iBt];
          if( fiCsrIsDelete(pSub) ){
            rc = fiCsrStep(pCsr);
            if( rc==SQLITE4_OK ) rc = SQLITE4_INEXACT;
          }else if( bMatch==0 ){
            rc = (bHit ? SQLITE4_INEXACT : SQLITE4_NOTFOUND);
          }
        }
      }
    }
  }else{
    rc = SQLITE4_NOTFOUND;
  }

  return rc;
}

static int fiCsrEnd(FiCursor *pCsr, int bLast){
  bt_db *db = pCsr->base.pDb;
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int iSub = 0;             /* Index of sub-cursor to seek */
  BtCursor mcsr;            /* Used to iterate through the meta-tree */
  int rc;                   /* Return code */


  fiCsrReset(pCsr);


  /* Loop through the set of f-tree levels. Open a cursor on each. */
  btCsrSetup(db, pHdr->iMRoot, &mcsr);

  for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
    const void *pV; int nV;   /* Value associated with meta-tree entry */
    u32 iBlk;                 /* Block containing sub-tree */
    BtCursor *pSub;           /* Sub-cursor for this block */


    btCsrKey(&mcsr, &pV, &nV);
    if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){









      rc = SQLITE4_NOTFOUND;
      break;

    }

    rc = fiCsrAllocateSubs(db, pCsr, iSub+1);
    if( rc!=SQLITE4_OK ) break;
    pSub = &pCsr->aBt[iSub];
    iSub++;






    sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
    iBlk = sqlite4BtGetU32((const u8*)pV);

    btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);


    rc = btCsrEnd(pSub, bLast);
    if( rc!=SQLITE4_OK && rc!=SQLITE4_NOTFOUND ) break;
  }
  btCsrReset(&mcsr, 1);


  if( rc==SQLITE4_NOTFOUND ){
    pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK);
    pCsr->base.flags |= (bLast ? CSR_PREV_OK : CSR_NEXT_OK);
    rc = fiCsrSetCurrent(pCsr);
    if( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aBt[pCsr->iBt]) ){
      rc = fiCsrStep(pCsr);
    }
  }

  return rc;
}

................................................................................
  int nK,                         /* Size of key pK in bytes */
  int eSeek                       /* Seek mode (a BT_SEEK_XXX constant) */
){
  int rc;
  btCheckPageRefs(pBase->pDb);
  if( IsBtCsr(pBase) ){
    BtCursor *pCsr = (BtCursor*)pBase;
    rc = btCsrSeek(pCsr, pK, nK, eSeek, BT_CSRSEEK_SEEK);
  }else{
    rc = fiCsrSeek((FiCursor*)pBase, pK, nK, eSeek);
  }
  btCheckPageRefs(pBase->pDb);
  return rc;
}

................................................................................
  int rc = SQLITE4_OK;            /* Return code */
  
  if( IsBtCsr(pBase) ){
    rc = btCsrKey((BtCursor*)pBase, ppK, pnK);
  }else{
    FiCursor *pCsr = (FiCursor*)pBase;
    assert( pCsr->iBt>=0 );
    rc = btCsrKey(&pCsr->aBt[pCsr->iBt], ppK, pnK);
  }

  return rc;
}

int sqlite4BtCsrData(
  bt_cursor *pBase,               /* Cursor handle */
................................................................................
  int rc = SQLITE4_OK;            /* Return code */
  
  if( IsBtCsr(pBase) ){
    rc = btCsrData((BtCursor*)pBase, iOffset, nByte, ppV, pnV);
  }else{
    FiCursor *pCsr = (FiCursor*)pBase;
    assert( pCsr->iBt>=0 );
    rc = btCsrData(&pCsr->aBt[pCsr->iBt], iOffset, nByte, ppV, pnV);
  }

  return rc;
}

/*
** The argument points to a buffer containing an overflow array. Return
................................................................................
    rc = btFastInsertRoot(db, pHdr, &iRootPg);
  }
  btCsrSetup(db, iRootPg, &csr);

  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  if( rc==SQLITE4_OK ){
    rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
  }

  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek
    ** the cursor.  */
    rc = sqlite4BtDelete(&csr.base);
    if( rc==SQLITE4_OK && (nV>=0 || iRoot==0) ){
      rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
      if( rc==SQLITE4_OK ) rc = btErrorBkpt(SQLITE4_CORRUPT);
    }
  }


  if( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT ){
    if( nV<0 && iRoot!=0 ){
................................................................................
          rc = btInsertAndBalance(&csr, 1, &kv);

          /* Unless this is a block-full error, break out of the loop */
          if( rc!=BT_BLOCKFULL ) break;
          assert( iRoot==0 );

          /* Try to schedule a merge operation */

          rc = btScheduleMerge(db);



          if( rc==SQLITE4_OK ){
            rc = btFastInsertRoot(db, pHdr, &iRootPg);
          }
          if( rc==SQLITE4_OK ){
            btCsrReset(&csr, 1);
            btCsrSetup(db, iRootPg, &csr);
            rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
          }
        }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT );
      }

      if( kv.eType==KV_CELL ){
        sqlite4_free(db->pEnv, (void*)kv.pV);
      }
................................................................................
){
  static const u8 aZero[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int rc;

  btCsrSetup(db, pHdr->iMRoot, p);
  rc = btCsrSeek(
      p, aSummaryKey, sizeof(aSummaryKey), BT_SEEK_EQ, BT_CSRSEEK_SEEK
  );
  if( rc==SQLITE4_OK ){
    rc = btCsrData(p, 0, -1, (const void**)paSummary, pnSummary);
  }else if( rc==SQLITE4_NOTFOUND ){
    *paSummary = aZero;
    *pnSummary = sizeof(aZero);
    rc = SQLITE4_OK;
................................................................................

static void btReadSummary(
  const u8 *aSum, int iAge, 
  u16 *piMinLevel,
  u16 *pnLevel,
  u16 *piMergeLevel
){
  *piMinLevel = btGetU16(&aSum[iAge * 6]);
  *pnLevel = btGetU16(&aSum[iAge * 6 + 2]);
  *piMergeLevel = btGetU16(&aSum[iAge * 6 + 4]);
}

static void btWriteSummary(
  u8 *aSum, int iAge,
  u16 iMinLevel,
  u16 nLevel,
  u16 iMergeLevel
................................................................................
    *piNext = (iMin + nLevel);
  }

  btCsrReset(&csr, 1);
  return rc;
}













static void btWriteSchedule(BtPage *pPg, BtSchedule *p, int *pRc){
  if( *pRc==SQLITE4_OK ){
    int rc = sqlite4BtPageWrite(pPg);
    if( rc==SQLITE4_OK ){
      /* TODO: convert values to big-endian format */
      u8 *aData = sqlite4BtPageData(pPg);
      memcpy(aData, p, sizeof(BtSchedule));

    }
    *pRc = rc;
  }
}
static int btReadSchedule(bt_db *db, BtPage *pPg, BtSchedule *p){
  /* TODO: convert values to big-endian format */
  u8 *aData = sqlite4BtPageData(pPg);
  memcpy(p, aData, sizeof(BtSchedule));
  return SQLITE4_OK;
}

static int btAllocateBlock(
  bt_db *db, 
  int nBlk,
  u32 *aiBlk
){
  return sqlite4BtBlockAllocate(db->pPager, nBlk, aiBlk);
................................................................................
/*
** This is a helper function for btScheduleMerge(). It determines the
** age and range of levels to be used as inputs by the merge (if any).
*/
static int btFindMerge(
  bt_db *db,                      /* Database handle */
  u32 *piAge,                     /* OUT: Age of input segments to merge */

  u32 *piMaxLevel,                /* OUT: Maximum input level value */
  u32 *piOutLevel                 /* OUT: Output level value */
){
  BtCursor csr;                   /* Cursor used to read summary record */
  int rc;                         /* Return code */
  const u8 *aSum;
  int nSum;
................................................................................
  if( rc==SQLITE4_OK ){
    int iAge;
    int iBestAge = -1;            /* Best age to merge levels from */
    int nBest = (db->nMinMerge-1);/* Number of levels merged at iBestAge */
    u16 iMin, nLevel, iMerge;     /* Summary of current age */

    rc = SQLITE4_NOTFOUND;
    for(iAge=0; iAge<(nSum/3); iAge++){
      btReadSummary(aSum, iAge, &iMin, &nLevel, &iMerge);
      if( iMerge ){
        int n = 1 + (iMerge-iMin);
        if( n>nBest ){

          *piMaxLevel = iMerge;
          *piAge = iAge;
          btReadSummary(aSum, iAge+1, &iMin, &nLevel, &iMerge);
          *piOutLevel = (iMin + nLevel - 1);
          rc = SQLITE4_OK;
        }
        break;
................................................................................

        /* Create a copy of the summary record */
        memcpy(aNew, aSum, nSum);
        if( nByte>nSum ) btWriteSummary(aNew, iBestAge+1, 0, 0, 0);

        /* Find the input age and maximum level */
        btReadSummary(aSum, iBestAge, &iMin, &nLevel, &iMerge);

        *piMaxLevel = (u32)(iMin + nLevel - 1);
        *piAge = iBestAge;

        /* Find the output level */
        btReadSummary(aSum, iBestAge+1, &iMin, &nLevel, &iMerge);
        *piOutLevel = iMin + nLevel;

................................................................................
**
** The merge operation is selected based on the following criteria:
**
**   * The more levels involved in the merge the better, and
**   * It is better to merge younger segments than older ones.
*/
static int btScheduleMerge(bt_db *db){

  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  BtPage *pPg = 0;                /* Schedule page */
  u8 *aData;                      /* Schedule page data */


  int rc;
  u32 iAge;
  u32 iLvl;
  u32 iOutLvl;



  /* Find the schedule page. If there is no schedule page, allocate it now. */
  if( pHdr->iSRoot==0 ){
    rc = sqlite4BtPageAllocate(db->pPager, &pPg);
    if( rc==SQLITE4_OK ){
      u8 *aData = sqlite4BtPageData(pPg);
      memset(aData, 0, pHdr->pgsz);
................................................................................
  ** scheduled. If the schedule page is not busy, call btFindMerge() to
  ** figure out which levels should be scheduled for merge.  */
  if( rc==SQLITE4_OK ){
    aData = sqlite4BtPageData(pPg);
    if( btGetU32(aData) ){
      rc = SQLITE4_NOTFOUND;
    }else{
      rc = btFindMerge(db, &iAge, &iLvl, &iOutLvl);
    }
  }

  if( rc==SQLITE4_OK ){
    BtSchedule s;
    memset(&s, 0, sizeof(BtSchedule));

    s.bBusy = 1;
    s.iAge = iAge;
    s.iMaxLevel = iLvl;

    s.iOutLevel = iOutLvl;
    rc = btAllocateBlock(db, db->nScheduleAlloc, s.aBlock);

    btWriteSchedule(pPg, &s, &rc);
  }

  sqlite4BtPageRelease(pPg);
  if( rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;
  return rc;
}

................................................................................

  if( rc==SQLITE4_OK ){
    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
  }
  db->bFastInsertOp = 1;
  return rc;
}

























































/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending
** on whether or not the db->bFastInsertOp flag is set.
*/
................................................................................
    if( rc==SQLITE4_OK ){
      rc = btBalanceIfUnderfull(pCsr);
    }

    btCsrReleaseAll(pCsr);
  }else{
    FiCursor *pCsr = (FiCursor*)pBase;
    BtCursor *pSub = &pCsr->aBt[pCsr->iBt];

    void *pKey;
    int nKey;

    rc = btCsrBuffer(pSub, 0);
    pKey = pSub->ovfl.buf.p;
    nKey = pSub->ovfl.nKey;







>







 







>
>
>
>
>




|


>
>
>
>
>
>
>







 







>
|







 







|
>

|
|







 







>







 







>







 







>
>
>
>
>
>
>
>
>
>
>







>










|
>







 







|







 







|
|






|


|

|







 







|
|

|

|
|







 







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
|
<
>

|
|

|

|

|






|

|

|








|




>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>




>
|







>
>
>
>
>
>

>
>
>
>




|
>

<
<
<
|
>

<
|
<
<
<
>

<
<
<
<
>
>
>
>
|
>
>
>

>
>
>
>
>
>
>
|

|
|
>
|
>
|

|
|
|


|








<




|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>



|
|








|
|








<
|


>
|
>
|
<
<
>
|
|
|
<

>
|
<
>
>
>
>
>
>
>
>
>
|
<
>
|
|
<
<
<
<
>

>
>
>
>
|
|
>
|
>
|
<
<

<
>

|



|







 







|







 







|







 







|







 







|








|







 







>

>
>







|







 







|







 







|
|
|







 







>
>
>
>
>
>
>
>
>
>
>
>
|



<

<
>




<
<
<
<
<
<







 







>







 







|




>







 







>







 







<



>

<
|
|
|
>
>







 







|









|
>



|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
..
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
...
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
...
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
...
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
...
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
...
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
...
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
...
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
...
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
....
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
....
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
....
1360
1361
1362
1363
1364
1365
1366
1367
1368

1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488



1489
1490
1491

1492



1493
1494




1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533

1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604

1605
1606
1607
1608
1609
1610
1611


1612
1613
1614
1615

1616
1617
1618

1619
1620
1621
1622
1623
1624
1625
1626
1627
1628

1629
1630
1631




1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643


1644

1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
....
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
....
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
....
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
....
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
....
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
....
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
....
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
....
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496

3497

3498
3499
3500
3501
3502






3503
3504
3505
3506
3507
3508
3509
....
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
....
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
....
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
....
3599
3600
3601
3602
3603
3604
3605

3606
3607
3608
3609
3610

3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
....
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
....
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3726
3727
3728
3729
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
....
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
3850
3851
#define BT_PGFLAGS_METATREE 0x02  /* True for a meta-tree page */
#define BT_PGFLAGS_SCHEDULE 0x04  /* True for a schedule-tree page */

/* #define BT_STDERR_DEBUG 1 */

typedef struct BtCursor BtCursor;
typedef struct FiCursor FiCursor;
typedef struct FiSubCursor FiSubCursor;

struct bt_db {
  sqlite4_env *pEnv;              /* SQLite environment */
  BtPager *pPager;                /* Underlying page-based database */
  bt_cursor *pAllCsr;             /* List of all open cursors */
  int nMinMerge;
  int nScheduleAlloc;
................................................................................
  int bSkipNext;                  /* True if next CsrNext() is a no-op */
  int bSkipPrev;                  /* True if next CsrPrev() is a no-op */
};

/*
** Database f-tree cursor handle.
*/
struct FiSubCursor {
  u8 aPrefix[8];                  /* Meta-tree key prefix for this age/level */
  BtCursor mcsr;                  /* Cursor opened on meta table (scans only) */
  BtCursor csr;                   /* Cursor opened on current sub-tree */
};
struct FiCursor {
  bt_cursor base;                 /* Base cursor class */
  int iBt;                        /* Current sub-tree (or -1 for EOF) */
  int nBt;                        /* Number of entries in aBt[] array */
  FiSubCursor *aSub;              /* Array of sub-tree cursors */
};

/*
** TODO: Rearrange things so these are not required!
*/
static int fiLoadSummary(bt_db *, BtCursor *, const u8 **, int *);
static void btReadSummary(const u8 *, int, u16 *, u16 *, u16 *);
static int btCsrData(BtCursor *, int, int, const void **, int *);

/* 
** Meta-table summary key.
*/
static const u8 aSummaryKey[] = {0xFF, 0xFF, 0xFF, 0xFF};

#ifndef btErrorBkpt
int btErrorBkpt(int rc){
................................................................................
    if( IsBtCsr(pCsr) ){
      BtCursor *p = (BtCursor*)pCsr;
      if( p->nPg>0 ) nExpect += p->nPg;
    }else{
      FiCursor *p = (FiCursor*)pCsr;
      int i;
      for(i=0; i<p->nBt; i++){
        if( p->aSub[i].mcsr.nPg>0 ) nExpect += p->aSub[i].mcsr.nPg;
        if( p->aSub[i].csr.nPg>0 ) nExpect += p->aSub[i].csr.nPg;
      }
    }
  }
  nActual = sqlite4BtPagerRefcount(pDb->pPager);
  assert( nActual==nExpect );
}
#else
................................................................................
  pCsr->bRequireReseek = 0;
}

static void fiCsrReset(FiCursor *pCsr){
  int i;
  bt_db *db = pCsr->base.pDb;
  for(i=0; i<pCsr->nBt; i++){
    btCsrReset(&pCsr->aSub[i].csr, 1);
    btCsrReset(&pCsr->aSub[i].mcsr, 1);
  }
  sqlite4_free(db->pEnv, pCsr->aSub);
  pCsr->aSub = 0;
  pCsr->nBt = 0;
  pCsr->iBt = -1;
}

int sqlite4BtCsrClose(bt_cursor *pCsr){
  if( pCsr ){
    bt_cursor **pp;
................................................................................
    int i;
    BtSchedule s;
    sqlite4BtBufAppendf(pBuf, "(schedule page) ");
    memcpy(&s, aData, sizeof(s));

    sqlite4BtBufAppendf(pBuf, "bBusy=%d ", (int)s.bBusy);
    sqlite4BtBufAppendf(pBuf, "iAge=%d ", (int)s.iAge);
    sqlite4BtBufAppendf(pBuf, "iMinLevel=%d ", (int)s.iMinLevel);
    sqlite4BtBufAppendf(pBuf, "iMaxLevel=%d ", (int)s.iMaxLevel);
    sqlite4BtBufAppendf(pBuf, "iOutLevel=%d ", (int)s.iOutLevel);
    sqlite4BtBufAppendf(pBuf, "blocks=(");
    for(i=0; s.aBlock[i] && i<array_size(s.aBlock); i++){
      sqlite4BtBufAppendf(pBuf, "%s%d", i==0 ? "" : " ", (int)s.aBlock[i]);
    }
    sqlite4BtBufAppendf(pBuf, ") ");
................................................................................
**
** In other words, *piRes is +ve, zero or -ve if C is respectively larger, 
** equal to or smaller than K.
*/
static int btCellKeyCompare(
  BtCursor *pCsr,                 /* Cursor handle */
  int bLeaf,                      /* True if cursor currently points to leaf */
  const void *aPrefix,
  const void *pK, int nK,         /* Key to compare against cursor key */
  int *piRes                      /* OUT: Result of comparison */
){
  const void *pCsrKey;
  int nCsrKey;
  int nCmp;
  int nAscend = 0;
................................................................................
        iCell = 0;
      }
      rc = sqlite4BtCsrKey((bt_cursor*)pCsr, &pCsrKey, &nCsrKey);
    }
  }

  if( rc==SQLITE4_OK ){
    if( aPrefix ){
      if( nCsrKey<8 ){
        res = 1;
        goto keycompare_done;
      }
      res = memcmp(pCsrKey, aPrefix, 8);
      if( res ) goto keycompare_done;
      nCsrKey -= 8;
      pCsrKey = (void*)(((u8*)pCsrKey) + 8);
    }

    nCmp = MIN(nCsrKey, nK);
    res = memcmp(pCsrKey, pK, nCmp);
    if( res==0 ){
      res = nCsrKey - nK;
    }
  }

 keycompare_done:
  btCsrAscend(pCsr, nAscend);
  *piRes = res;
  return rc;
}

#define BT_CSRSEEK_SEEK   0
#define BT_CSRSEEK_UPDATE 1
#define BT_CSRSEEK_RESEEK 2

static int btCsrSeek(
  BtCursor *pCsr,                 /* Cursor object to seek */
  u8 *aPrefix,                    /* 8-byte key prefix, or NULL */
  const void *pK,                 /* Key to seek for */
  int nK,                         /* Size of key pK in bytes */
  int eSeek,                      /* Seek mode (a BT_SEEK_XXX constant) */
  int eCsrseek
){
  const int pgsz = sqlite4BtPagerPagesize(pCsr->base.pDb->pPager);
  u32 pgno;                       /* Page number for next page to load */
................................................................................
        rc = SQLITE4_NOTFOUND;
        break;
      }

      while( iHi>iLo ){
        int iTst = (iHi+iLo)/2;   /* Cell to compare to pK/nK */
        pCsr->aiCell[pCsr->nPg-1] = iTst;
        rc = btCellKeyCompare(pCsr, bLeaf, aPrefix, pK, nK, &res);
        if( rc!=SQLITE4_OK || res==0 ){
          /* Cell iTst is EQUAL to pK/nK */
          iHi = iLo = iTst;
        }else if( res<0 ){
          /* Cell iTst is SMALLER than pK/nK */
          iLo = iTst+1;
        }else{
................................................................................
  }
  return rc;
}

static int btCsrReseek(BtCursor *pCsr){
  int rc = SQLITE4_OK;
  if( pCsr->bRequireReseek ){
    BtOvfl o;                     /* Copy of initial overflow buffer */
    memcpy(&o, &pCsr->ovfl, sizeof(BtOvfl));

    pCsr->ovfl.buf.n = 0;
    pCsr->ovfl.buf.p = 0;
    pCsr->bSkipNext = 0;
    pCsr->bRequireReseek = 0;

    rc = btCsrSeek(pCsr, 0, o.buf.p, o.nKey, BT_SEEK_EQ, BT_CSRSEEK_RESEEK);
    assert( rc!=SQLITE4_INEXACT );
    if( pCsr->ovfl.buf.p==0 ){
      pCsr->ovfl.buf.p = o.buf.p;
    }else{
      sqlite4_buffer_clear(&o.buf);
    }
  }
  return rc;
}

/*
** This function does the work of both sqlite4BtCsrNext() (if parameter
................................................................................
  return rc;
}


static int fiCsrAllocateSubs(bt_db *db, FiCursor *pCsr, int nBt){
  int rc = SQLITE4_OK;            /* Return code */
  if( nBt>pCsr->nBt ){
    int nByte = sizeof(FiSubCursor) * nBt;
    FiSubCursor *aNew;            /* Allocated array */

    aNew = (FiSubCursor*)sqlite4_realloc(db->pEnv, pCsr->aSub, nByte);
    if( aNew ){
      memset(&aNew[pCsr->nBt], 0, sizeof(FiSubCursor)*(nBt-pCsr->nBt));
      pCsr->aSub = aNew;
      pCsr->nBt = nBt;
    }else{
      rc = btErrorBkpt(SQLITE4_NOMEM);
    }
  }

  return rc;
................................................................................
  int rc = SQLITE4_OK;
  int mul;

  assert( pCsr->base.flags & (CSR_NEXT_OK | CSR_PREV_OK) );
  mul = ((pCsr->base.flags & CSR_NEXT_OK) ? 1 : -1);

  for(i=0; i<pCsr->nBt && rc==SQLITE4_OK; i++){
    BtCursor *pSub = &pCsr->aSub[i].csr;
    if( pSub->nPg>0 ){
      int nKey;
      const void *pKey;

      rc = btCsrKey(pSub, &pKey, &nKey);
      if( rc==SQLITE4_OK 
          && (iMin<0 || (mul * btKeyCompare(pKey, nKey, pMin, nMin))<0) 
................................................................................
    }
  }

  if( iMin<0 && rc==SQLITE4_OK ) rc = SQLITE4_NOTFOUND;
  pCsr->iBt = iMin;
  return rc;
}

/*
** Return SQLITE4_OK if the cursor is successfully stepped, or 
** SQLITE4_NOTFOUND if an EOF is encountered.
**
** If an error occurs (e.g. an IO error or OOM condition), return the
** relevant error code.
*/
static int fiSubCsrStep( 
  FiCursor *pCsr,                 /* Parent cursor */
  FiSubCursor *pSub,              /* Sub-cursor to advance */
  int bNext                       /* True for xNext(), false for xPrev() */
){
  int rc;

  rc = btCsrStep(&pSub->csr, bNext);
  if( rc==SQLITE4_NOTFOUND ){
    const void *pV;
    int nV;
#if 1
    rc = btCsrKey(&pSub->mcsr, &pV, &nV);
    assert( rc==SQLITE4_OK && memcmp(pV, pSub->aPrefix, 8)==0 );
#endif
    rc = btCsrStep(&pSub->mcsr, bNext);

    if( rc==SQLITE4_OK ){
      rc = btCsrKey(&pSub->mcsr, &pV, &nV);
    }
    if( rc==SQLITE4_OK ){
      if( nV<sizeof(pSub->aPrefix) 
       || memcmp(pSub->aPrefix, pV, sizeof(pSub->aPrefix)) 
      ){
        rc = SQLITE4_NOTFOUND;
      }else{
        rc = btCsrData(&pSub->mcsr, 0, 4, &pV, &nV);
      }
    }
    if( rc==SQLITE4_OK ){
      BtDbHdr *pHdr = sqlite4BtPagerDbhdr(pCsr->base.pDb->pPager);
      pSub->csr.iRoot = btBlockToRoot(pHdr, sqlite4BtGetU32((const u8*)pV));
      rc = btCsrEnd(&pSub->csr, !bNext);
    }
  }

  return rc;
}

/*
** Advance the cursor. The direction (xPrev or xNext) is implied by the
** cursor itself - as fast-insert cursors may only be advanced in one
** direction.
*/
static int fiCsrStep(FiCursor *pCsr){
................................................................................
  const void *pKey; int nKey;     /* Current key that cursor points to */
  int i;

  assert( pCsr->base.flags & (CSR_NEXT_OK | CSR_PREV_OK) );
  assert( pCsr->iBt>=0 );

  do{
    /* Load the current key in to pKey/nKey. Then advance all sub-cursors 
    ** that share a key with the current sub-cursor. */

    rc = sqlite4BtCsrKey(&pCsr->base, &pKey, &nKey);
    for(i=0; rc==SQLITE4_OK && i<pCsr->nBt; i++){
      FiSubCursor *pSub = &pCsr->aSub[i];
      if( i!=pCsr->iBt && pSub->csr.nPg>0 ){
        const void *p; int n;       /* Key that this sub-cursor points to */
        rc = btCsrKey(&pSub->csr, &p, &n);
        if( rc==SQLITE4_OK && btKeyCompare(p, n, pKey, nKey)==0 ){
          rc = fiSubCsrStep(pCsr, pSub, bNext);
          if( rc==SQLITE4_NOTFOUND ){
            assert( pSub->csr.nPg==0 );
            rc = SQLITE4_OK;
          }
        }
      }
    }

    /* Advance the current sub-cursor */
    if( rc==SQLITE4_OK ){
      rc = fiSubCsrStep(pCsr, &pCsr->aSub[pCsr->iBt], bNext);
      if( rc==SQLITE4_NOTFOUND ){
        assert( pCsr->aSub[pCsr->iBt].csr.nPg==0 );
        rc = SQLITE4_OK;
      }
    }

    /* Figure out a new current bt cursor */
    if( rc==SQLITE4_OK ){
      rc = fiCsrSetCurrent(pCsr);
    }
  }while( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aSub[pCsr->iBt].csr) );

  return rc;
}

typedef struct FiLevelIter FiLevelIter;
struct FiLevelIter {
  /* Used internally */
  BtCursor csr;                   /* Cursor used to read summary blob */
  const u8 *aSum;                 /* Summary blob */
  int nSum;                       /* Size of summary blob in bytes */

  /* Output values */
  int nSub;                       /* Total number of expected levels */
  int iAge;                       /* Current age */
  int iLvl;                       /* Current level */
  int iSub;                       /* Current sub-cursor */
};

static int fiLevelIterNext(FiLevelIter *p){
  u16 iMin, nLevel;

  p->iSub++;
  p->iLvl--;
  btReadSummary(p->aSum, 0, &iMin, &nLevel, 0);
  while( p->iLvl<(int)iMin ){
    p->iAge++;
    if( p->iAge>=(p->nSum)/6 ) return 1;
    btReadSummary(p->aSum, p->iAge, &iMin, &nLevel, 0);
    p->iLvl = (int)iMin + (int)nLevel - 1;
  }

  return 0;
}

static int fiLevelIterInit(bt_db *db, FiLevelIter *p){
  int rc;                         /* Return code */

  memset(p, 0, sizeof(FiLevelIter));
  rc = fiLoadSummary(db, &p->csr, &p->aSum, &p->nSum);
  if( rc==SQLITE4_OK ){
    int iAge;
    for(iAge=0; iAge<(p->nSum/6); iAge++){
      u16 iMin, nLevel;
      btReadSummary(p->aSum, iAge, &iMin, &nLevel, 0);
      p->nSub += nLevel;
      if( iAge==0 ){
        p->iLvl = ((int)iMin + nLevel);
        p->iSub = -1;
      }
    }
  }

  return rc;
}

static void fiLevelIterCleanup(FiLevelIter *p){
  btCsrReset(&p->csr, 1);
}

/*
** Seek a fast-insert cursor.
*/
static int fiCsrSeek(FiCursor *pCsr, const void *pK, int nK, int eSeek){
  u8 aPrefix[8];
  int rc = SQLITE4_NOTFOUND;      /* Return code */
  bt_db *db = pCsr->base.pDb;     /* Database handle */
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);

  assert( eSeek==BT_SEEK_LE || eSeek==BT_SEEK_EQ || eSeek==BT_SEEK_GE );
  fiCsrReset(pCsr);

  if( pHdr->iMRoot ){
    FiLevelIter iter;

    /* Initialize the iterator used to skip through database levels */
    rc = fiLevelIterInit(db, &iter);
    if( rc!=SQLITE4_OK ) return rc;

    if( eSeek==BT_SEEK_EQ ){
      FiSubCursor *pSub;
      BtCursor *pM;
      int iAge;

      /* A BT_SEEK_EQ is a special case. There is no need to set up a cursor
      ** that can be advanced (in either direction) in this case. All that
      ** is required is to search each level in order for the requested key 
      ** (or a corresponding delete marker). Once a match is found, there
      ** is no need to search any further. As a result, only a single
      ** sub-cursor is required.  */
      rc = fiCsrAllocateSubs(db, pCsr, 1);



      pSub = pCsr->aSub;
      pM = &pSub->mcsr;


      btCsrSetup(db, pHdr->iMRoot, pM);



      while( 0==fiLevelIterNext(&iter) ){





        u16 iMin;
        u16 nLevel;
        u16 iMerge;
        int iLvl;

        btPutU32(aPrefix, iter.iAge);
        btPutU32(&aPrefix[4], ~(u32)iter.iLvl);
        rc = btCsrSeek(pM, aPrefix, pK, nK, BT_SEEK_LE, BT_CSRSEEK_SEEK);

        if( rc==SQLITE4_NOTFOUND ){
          /* All keys in this level are greater than pK/nK. */
          /* no-op */
        }else if( rc==SQLITE4_OK || rc==SQLITE4_INEXACT ){
          const void *pV;
          int nV;
          int iBlk;
          sqlite4BtCsrData(&pM->base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(&pSub->csr, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), &pSub->csr);

          rc = btCsrSeek(&pSub->csr, 0, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK);
          assert( rc!=SQLITE4_INEXACT );
          if( rc!=SQLITE4_NOTFOUND ){
            /* A hit on the requested key or an error has occurred. Either
             ** way, break out of the loop. If this is a hit, set iBt to
             ** zero so that the BtCsrKey() and BtCsrData() routines know
             ** to return data from the first (only) sub-cursor. */
            assert( pCsr->iBt<0 );
            if( rc==SQLITE4_OK ){
              if( 0==fiCsrIsDelete(&pSub->csr) ){
                pCsr->iBt = 0;
              }else{
                rc = SQLITE4_NOTFOUND;
              }
            }
            break;
          }
        }

      }
    }else{
      int bMatch = 0;           /* Found an exact match */
      int bHit = 0;             /* Found at least one entry */

      /* Allocate required sub-cursors. */
      if( rc==SQLITE4_OK ){
        rc = fiCsrAllocateSubs(db, pCsr, iter.nSub);
      }

      /* This loop runs once for each sub-cursor */
      while( rc==SQLITE4_OK && 0==fiLevelIterNext(&iter) ){
        FiSubCursor *pSub = &pCsr->aSub[iter.iSub];
        BtCursor *pM = &pSub->mcsr;
        btCsrSetup(db, pHdr->iMRoot, pM);

        btPutU32(&pSub->aPrefix[0], (u32)iter.iAge);
        btPutU32(&pSub->aPrefix[4], ~(u32)iter.iLvl);

        rc = btCsrSeek(pM, pSub->aPrefix, pK, nK, BT_SEEK_LE, BT_CSRSEEK_SEEK);
        if( rc==SQLITE4_NOTFOUND && eSeek==BT_SEEK_GE ){
          rc = btCsrEnd(pM, 0);
        }

        if( rc==SQLITE4_NOTFOUND ){
          /* No keys to visit in this level */
          rc = SQLITE4_OK;
        }else if( rc==SQLITE4_OK || rc==SQLITE4_INEXACT ){
          const void *pV;
          int nV;
          int iBlk;
          sqlite4BtCsrData(&pM->base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(&pSub->csr, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), &pSub->csr);

          rc = btCsrSeek(&pSub->csr, 0, pK, nK, eSeek, BT_CSRSEEK_SEEK);
          if( rc==SQLITE4_OK ) bMatch = 1;
          if( rc==SQLITE4_INEXACT ) bHit = 1;
          if( rc==SQLITE4_INEXACT || rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;

        }else{
          /* An error */
        }
      }
      assert( rc!=SQLITE4_OK || iter.iSub==iter.nSub );

      if( rc==SQLITE4_OK ){
        pCsr->base.flags |= (eSeek==BT_SEEK_GE ? CSR_NEXT_OK : CSR_PREV_OK);
        rc = fiCsrSetCurrent(pCsr);
        if( rc==SQLITE4_OK ){
          BtCursor *pCurrent = &pCsr->aSub[pCsr->iBt].csr;
          if( fiCsrIsDelete(pCurrent) ){
            rc = fiCsrStep(pCsr);
            if( rc==SQLITE4_OK ) rc = SQLITE4_INEXACT;
          }else if( bMatch==0 ){
            rc = (bHit ? SQLITE4_INEXACT : SQLITE4_NOTFOUND);
          }
        }
      }
    }

    fiLevelIterCleanup(&iter);
  }

  return rc;
}

static int fiCsrEnd(FiCursor *pCsr, int bLast){
  bt_db *db = pCsr->base.pDb;
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);

  FiLevelIter iter;         /* Used to iterate through all f-tree levels */
  int rc;                   /* Return code */

  rc = fiLevelIterInit(db, &iter);
  if( rc==SQLITE4_OK ){
    rc = fiCsrAllocateSubs(db, pCsr, iter.nSub);
  }



  while( rc==SQLITE4_OK && 0==fiLevelIterNext(&iter) ){
    FiSubCursor *pSub = &pCsr->aSub[iter.iSub];
    const int n = (int)sizeof(pSub->aPrefix);


    btPutU32(pSub->aPrefix, (u32)iter.iAge);
    btPutU32(&pSub->aPrefix[4], ~(u32)iter.iLvl);


    btCsrSetup(db, pHdr->iMRoot, &pSub->mcsr);
    if( bLast==0 ){
      rc = btCsrSeek(&pSub->mcsr, 0, pSub->aPrefix, n, BT_SEEK_GE, 0);
    }else{
      u8 aPrefix[8];
      btPutU32(aPrefix, (u32)iter.iAge + (iter.iLvl==0 ? 1 : 0));
      btPutU32(&aPrefix[4], ~(u32)(iter.iLvl - (iter.iLvl==0 ? 0 : 1)));
      rc = btCsrSeek(&pSub->mcsr, 0, aPrefix, n, BT_SEEK_LE, 0);
      if( rc==SQLITE4_OK ){

        rc = btCsrStep(&pSub->mcsr, 0);
      }
    }




    if( rc==SQLITE4_INEXACT ) rc = SQLITE4_OK;

    if( rc==SQLITE4_OK ){
      const void *pV;
      int nV;
      int iBlk;
      btCsrData(&pSub->mcsr, 0, 4, &pV, &nV);
      iBlk = sqlite4BtGetU32((const u8*)pV);
      btCsrReset(&pSub->csr, 1);
      btCsrSetup(db, btBlockToRoot(pHdr, iBlk), &pSub->csr);
      rc = btCsrEnd(&pSub->csr, bLast);
    }


  }

  fiLevelIterCleanup(&iter);

  if( rc==SQLITE4_OK ){
    pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK);
    pCsr->base.flags |= (bLast ? CSR_PREV_OK : CSR_NEXT_OK);
    rc = fiCsrSetCurrent(pCsr);
    if( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aSub[pCsr->iBt].csr) ){
      rc = fiCsrStep(pCsr);
    }
  }

  return rc;
}

................................................................................
  int nK,                         /* Size of key pK in bytes */
  int eSeek                       /* Seek mode (a BT_SEEK_XXX constant) */
){
  int rc;
  btCheckPageRefs(pBase->pDb);
  if( IsBtCsr(pBase) ){
    BtCursor *pCsr = (BtCursor*)pBase;
    rc = btCsrSeek(pCsr, 0, pK, nK, eSeek, BT_CSRSEEK_SEEK);
  }else{
    rc = fiCsrSeek((FiCursor*)pBase, pK, nK, eSeek);
  }
  btCheckPageRefs(pBase->pDb);
  return rc;
}

................................................................................
  int rc = SQLITE4_OK;            /* Return code */
  
  if( IsBtCsr(pBase) ){
    rc = btCsrKey((BtCursor*)pBase, ppK, pnK);
  }else{
    FiCursor *pCsr = (FiCursor*)pBase;
    assert( pCsr->iBt>=0 );
    rc = btCsrKey(&pCsr->aSub[pCsr->iBt].csr, ppK, pnK);
  }

  return rc;
}

int sqlite4BtCsrData(
  bt_cursor *pBase,               /* Cursor handle */
................................................................................
  int rc = SQLITE4_OK;            /* Return code */
  
  if( IsBtCsr(pBase) ){
    rc = btCsrData((BtCursor*)pBase, iOffset, nByte, ppV, pnV);
  }else{
    FiCursor *pCsr = (FiCursor*)pBase;
    assert( pCsr->iBt>=0 );
    rc = btCsrData(&pCsr->aSub[pCsr->iBt].csr, iOffset, nByte, ppV, pnV);
  }

  return rc;
}

/*
** The argument points to a buffer containing an overflow array. Return
................................................................................
    rc = btFastInsertRoot(db, pHdr, &iRootPg);
  }
  btCsrSetup(db, iRootPg, &csr);

  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  if( rc==SQLITE4_OK ){
    rc = btCsrSeek(&csr, 0, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
  }

  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek
    ** the cursor.  */
    rc = sqlite4BtDelete(&csr.base);
    if( rc==SQLITE4_OK && (nV>=0 || iRoot==0) ){
      rc = btCsrSeek(&csr, 0, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
      if( rc==SQLITE4_OK ) rc = btErrorBkpt(SQLITE4_CORRUPT);
    }
  }


  if( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT ){
    if( nV<0 && iRoot!=0 ){
................................................................................
          rc = btInsertAndBalance(&csr, 1, &kv);

          /* Unless this is a block-full error, break out of the loop */
          if( rc!=BT_BLOCKFULL ) break;
          assert( iRoot==0 );

          /* Try to schedule a merge operation */
#if 0
          rc = btScheduleMerge(db);
#endif
          rc = SQLITE4_OK;

          if( rc==SQLITE4_OK ){
            rc = btFastInsertRoot(db, pHdr, &iRootPg);
          }
          if( rc==SQLITE4_OK ){
            btCsrReset(&csr, 1);
            btCsrSetup(db, iRootPg, &csr);
            rc = btCsrSeek(&csr, 0, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
          }
        }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT );
      }

      if( kv.eType==KV_CELL ){
        sqlite4_free(db->pEnv, (void*)kv.pV);
      }
................................................................................
){
  static const u8 aZero[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int rc;

  btCsrSetup(db, pHdr->iMRoot, p);
  rc = btCsrSeek(
      p, 0, aSummaryKey, sizeof(aSummaryKey), BT_SEEK_EQ, BT_CSRSEEK_SEEK
  );
  if( rc==SQLITE4_OK ){
    rc = btCsrData(p, 0, -1, (const void**)paSummary, pnSummary);
  }else if( rc==SQLITE4_NOTFOUND ){
    *paSummary = aZero;
    *pnSummary = sizeof(aZero);
    rc = SQLITE4_OK;
................................................................................

static void btReadSummary(
  const u8 *aSum, int iAge, 
  u16 *piMinLevel,
  u16 *pnLevel,
  u16 *piMergeLevel
){
  if( piMinLevel ) *piMinLevel = btGetU16(&aSum[iAge * 6]);
  if( pnLevel ) *pnLevel = btGetU16(&aSum[iAge * 6 + 2]);
  if( piMergeLevel ) *piMergeLevel = btGetU16(&aSum[iAge * 6 + 4]);
}

static void btWriteSummary(
  u8 *aSum, int iAge,
  u16 iMinLevel,
  u16 nLevel,
  u16 iMergeLevel
................................................................................
    *piNext = (iMin + nLevel);
  }

  btCsrReset(&csr, 1);
  return rc;
}

static void btWriteSchedule(u8 *aData, BtSchedule *p, int *pRc){
  if( *pRc==SQLITE4_OK ){
    /* TODO: convert values to big-endian format */
    memcpy(aData, p, sizeof(BtSchedule));
  }
}
static int btReadSchedule(bt_db *db, u8 *aData, BtSchedule *p){
  /* TODO: convert values to big-endian format */
  memcpy(p, aData, sizeof(BtSchedule));
  return SQLITE4_OK;
}

static void btWriteSchedulePage(BtPage *pPg, BtSchedule *p, int *pRc){
  if( *pRc==SQLITE4_OK ){
    int rc = sqlite4BtPageWrite(pPg);
    if( rc==SQLITE4_OK ){

      u8 *aData = sqlite4BtPageData(pPg);

      btWriteSchedule(aData, p, &rc);
    }
    *pRc = rc;
  }
}







static int btAllocateBlock(
  bt_db *db, 
  int nBlk,
  u32 *aiBlk
){
  return sqlite4BtBlockAllocate(db->pPager, nBlk, aiBlk);
................................................................................
/*
** This is a helper function for btScheduleMerge(). It determines the
** age and range of levels to be used as inputs by the merge (if any).
*/
static int btFindMerge(
  bt_db *db,                      /* Database handle */
  u32 *piAge,                     /* OUT: Age of input segments to merge */
  u32 *piMinLevel,                /* OUT: Minimum input level value */
  u32 *piMaxLevel,                /* OUT: Maximum input level value */
  u32 *piOutLevel                 /* OUT: Output level value */
){
  BtCursor csr;                   /* Cursor used to read summary record */
  int rc;                         /* Return code */
  const u8 *aSum;
  int nSum;
................................................................................
  if( rc==SQLITE4_OK ){
    int iAge;
    int iBestAge = -1;            /* Best age to merge levels from */
    int nBest = (db->nMinMerge-1);/* Number of levels merged at iBestAge */
    u16 iMin, nLevel, iMerge;     /* Summary of current age */

    rc = SQLITE4_NOTFOUND;
    for(iAge=0; iAge<(nSum/6); iAge++){
      btReadSummary(aSum, iAge, &iMin, &nLevel, &iMerge);
      if( iMerge ){
        int n = 1 + (iMerge-iMin);
        if( n>nBest ){
          *piMinLevel = iMin;
          *piMaxLevel = iMerge;
          *piAge = iAge;
          btReadSummary(aSum, iAge+1, &iMin, &nLevel, &iMerge);
          *piOutLevel = (iMin + nLevel - 1);
          rc = SQLITE4_OK;
        }
        break;
................................................................................

        /* Create a copy of the summary record */
        memcpy(aNew, aSum, nSum);
        if( nByte>nSum ) btWriteSummary(aNew, iBestAge+1, 0, 0, 0);

        /* Find the input age and maximum level */
        btReadSummary(aSum, iBestAge, &iMin, &nLevel, &iMerge);
        *piMinLevel = (u32)iMin;
        *piMaxLevel = (u32)(iMin + nLevel - 1);
        *piAge = iBestAge;

        /* Find the output level */
        btReadSummary(aSum, iBestAge+1, &iMin, &nLevel, &iMerge);
        *piOutLevel = iMin + nLevel;

................................................................................
**
** The merge operation is selected based on the following criteria:
**
**   * The more levels involved in the merge the better, and
**   * It is better to merge younger segments than older ones.
*/
static int btScheduleMerge(bt_db *db){

  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  BtPage *pPg = 0;                /* Schedule page */
  u8 *aData;                      /* Schedule page data */
  int rc;                         /* Return code */


  /* Details of proposed merge: */
  u32 iAge;                       /* Input age */
  u32 iMin;                       /* Minimum input level number */
  u32 iMax;                       /* Maximum input level number */
  u32 iOutLvl;                    /* Output level number */

  /* Find the schedule page. If there is no schedule page, allocate it now. */
  if( pHdr->iSRoot==0 ){
    rc = sqlite4BtPageAllocate(db->pPager, &pPg);
    if( rc==SQLITE4_OK ){
      u8 *aData = sqlite4BtPageData(pPg);
      memset(aData, 0, pHdr->pgsz);
................................................................................
  ** scheduled. If the schedule page is not busy, call btFindMerge() to
  ** figure out which levels should be scheduled for merge.  */
  if( rc==SQLITE4_OK ){
    aData = sqlite4BtPageData(pPg);
    if( btGetU32(aData) ){
      rc = SQLITE4_NOTFOUND;
    }else{
      rc = btFindMerge(db, &iAge, &iMin, &iMax, &iOutLvl);
    }
  }

  if( rc==SQLITE4_OK ){
    BtSchedule s;
    memset(&s, 0, sizeof(BtSchedule));

    s.bBusy = 1;
    s.iAge = iAge;
    s.iMaxLevel = iMax;
    s.iMinLevel = iMin;
    s.iOutLevel = iOutLvl;
    rc = btAllocateBlock(db, db->nScheduleAlloc, s.aBlock);

    btWriteSchedulePage(pPg, &s, &rc);
  }

  sqlite4BtPageRelease(pPg);
  if( rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;
  return rc;
}

................................................................................

  if( rc==SQLITE4_OK ){
    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
  }
  db->bFastInsertOp = 1;
  return rc;
}

/*
** Set up a fast-insert cursor to read the input data for a merge operation.
*/
static int fiSetupMergeCsr(
  bt_db *db,                      /* Database handle */
  BtDbHdr *pHdr,                  /* Current database header values */
  BtSchedule *p,                  /* Description of merge operation */
  FiCursor *pCsr                  /* Populate this object before returning */
){
  int iBt;                        /* Used to loop through component cursors */
  int rc;                         /* Return code */

  memset(pCsr, 0, sizeof(FiCursor));
  pCsr->base.flags = CSR_TYPE_FAST;
  pCsr->base.pDb = db;

  rc = fiCsrAllocateSubs(db, pCsr, (p->iMaxLevel - p->iMinLevel) + 1);
  assert( rc==SQLITE4_OK || pCsr->nBt==0 );
  for(iBt=0; iBt<pCsr->nBt && rc==SQLITE4_OK; iBt++){
  }

  return rc;
}

/*
** This is called by a checkpointer to handle a schedule page.
*/
int sqlite4BtMerge(bt_db *db, BtDbHdr *pHdr, u8 *aSched){
  BtSchedule s;                   /* Deserialized schedule object */
  FiCursor fcsr;                  /* FiCursor used to read input */
  int rc;                         /* Return code */

  /* Set up the input cursor. */
  btReadSchedule(db, aSched, &s);
  assert( s.bBusy );
  rc = fiSetupMergeCsr(db, pHdr, &s, &fcsr);

  /* The following loop runs once for each key copied from the input to
  ** the output segments. It terminates either when the input is exhausted
  ** or when all available output blocks are full.  */
  while( rc==SQLITE4_OK ){
    rc = fiCsrStep(&fcsr);
  }

  /* Assuming no error has occurred, update the serialized BtSchedule
  ** structure stored in buffer aSched[]. The caller will write this
  ** buffer to the database file as page (pHdr->iSRoot).  */
  if( rc==SQLITE4_OK || rc==SQLITE4_NOTFOUND ){
    rc = SQLITE4_OK;
    btWriteSchedule(aSched, &s, &rc);
  }

  fiCsrReset(&fcsr);
  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending
** on whether or not the db->bFastInsertOp flag is set.
*/
................................................................................
    if( rc==SQLITE4_OK ){
      rc = btBalanceIfUnderfull(pCsr);
    }

    btCsrReleaseAll(pCsr);
  }else{
    FiCursor *pCsr = (FiCursor*)pBase;
    BtCursor *pSub = &pCsr->aSub[pCsr->iBt].csr;

    void *pKey;
    int nKey;

    rc = btCsrBuffer(pSub, 0);
    pKey = pSub->ovfl.buf.p;
    nKey = pSub->ovfl.nKey;

Changes to www/bt.wiki

698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
resides in the db file (not the log). The writer can tell if the schedule page
resides in the db file or the log by checking the "merge performed" flag.


<p>The page is organized as follows:

<ul>
  <li> Magic number used to identify the state of the page (big-endian u32):
       0x6a846607 if the merge has not yet been performed or 0x1a468c33 if
       it has.

  <li> Age of input segments (big-endian u32).

  <li> Max level of input segments (bit-endian u32). All segments of the
       specified age with a level equal to or less than this are considered
       inputs.

  <li> Level of output segments.

  <li> Number of allocated output blocks (big-endian u32) - <i>nBlock</i>.
       Maximum is (say) 32.

  <li> Block number of each allocated block (array of <i>nBlock</i> big-endian 
       u32 fields).
</ul>

<p>Then, after the merge has taken place:
<ul>
  <li> Page number of input page containing next key to read.







|
<
<









<
<
<







698
699
700
701
702
703
704
705


706
707
708
709
710
711
712
713
714



715
716
717
718
719
720
721
resides in the db file (not the log). The writer can tell if the schedule page
resides in the db file or the log by checking the "merge performed" flag.


<p>The page is organized as follows:

<ul>
  <li> Busy flag.



  <li> Age of input segments (big-endian u32).

  <li> Max level of input segments (bit-endian u32). All segments of the
       specified age with a level equal to or less than this are considered
       inputs.

  <li> Level of output segments.




  <li> Block number of each allocated block (array of <i>nBlock</i> big-endian 
       u32 fields).
</ul>

<p>Then, after the merge has taken place:
<ul>
  <li> Page number of input page containing next key to read.