Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add scheduling of fast-insert merges to bt. Merges are not performed yet, just scheduled.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 590e0410b18526cb3d69afd4372fcddcf77a4262
User & Date: dan 2013-12-14 18:59:18.588
Context
2014-01-02
18:53
Changes to FiCursor object to support reading merged blocks. check-in: 27248a1ebc user: dan tags: trunk
2013-12-14
18:59
Add scheduling of fast-insert merges to bt. Merges are not performed yet, just scheduled. check-in: 590e0410b1 user: dan tags: trunk
2013-12-07
20:38
Add notes describing schedule pages to bt.wiki. check-in: 1df60437f3 user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btInt.h.
86
87
88
89
90
91
92


















93
94
95
96
97
98
99
  u32 nSubPg;                     /* Number of non-overflow pages in sub-tree */

  u32 iCookie;                    /* Current value of schema cookie */
  u32 iFreePg;                    /* First page in free-page list trunk */
  u32 iFreeBlk;                   /* First page in free-block list trunk */
};



















/*************************************************************************
** Interface to bt_pager.c functionality.
*/
typedef struct BtPage BtPage;
typedef struct BtPager BtPager;

/*







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  u32 nSubPg;                     /* Number of non-overflow pages in sub-tree */

  u32 iCookie;                    /* Current value of schema cookie */
  u32 iFreePg;                    /* First page in free-page list trunk */
  u32 iFreeBlk;                   /* First page in free-block list trunk */
};


/*
** This struct defines the format of database "schedule" pages.
*/
typedef struct BtSchedule BtSchedule;
struct BtSchedule {
  u32 bBusy;                      /* True if page is busy */
  u32 iAge;                       /* Age of input segments */
  u32 iMaxLevel;                  /* Maximum level of input segments */
  u32 iOutLevel;                  /* Level at which to write output */
  u32 aBlock[32];                 /* Allocated blocks */

  u32 iNextPg;                    /* Page that contains next input key */
  u32 iNextCell;                  /* Cell that contains next input key */
  u32 nUsed;                      /* Number of output blocks actually written */
  u32 iFreeList;                  /* First page of new free-list (if any) */
};

/*************************************************************************
** Interface to bt_pager.c functionality.
*/
typedef struct BtPage BtPage;
typedef struct BtPager BtPager;

/*
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
int sqlite4BtPageRelease(BtPage*);
void sqlite4BtPageReference(BtPage*);

/*
** Allocate new database pages or blocks.
*/
int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage);
int sqlite4BtBlockAllocate(BtPager*, u32 *piBlk);

/*
** Query page references.
*/
u32 sqlite4BtPagePgno(BtPage*);
void *sqlite4BtPageData(BtPage*);








|







156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
int sqlite4BtPageRelease(BtPage*);
void sqlite4BtPageReference(BtPage*);

/*
** Allocate new database pages or blocks.
*/
int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage);
int sqlite4BtBlockAllocate(BtPager*, int nBlk, u32 *piBlk);

/*
** Query page references.
*/
u32 sqlite4BtPagePgno(BtPage*);
void *sqlite4BtPageData(BtPage*);

Changes to src/bt_main.c.
32
33
34
35
36
37
38


39
40
41
42
43
44
45
typedef struct BtCursor BtCursor;
typedef struct FiCursor FiCursor;

struct bt_db {
  sqlite4_env *pEnv;              /* SQLite environment */
  BtPager *pPager;                /* Underlying page-based database */
  bt_cursor *pAllCsr;             /* List of all open cursors */


  int bFastInsertOp;              /* Set by CONTROL_FAST_INSERT_OP */
};

typedef struct BtOvfl BtOvfl;
struct BtOvfl {
  int nKey;
  int nVal;







>
>







32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
typedef struct BtCursor BtCursor;
typedef struct FiCursor FiCursor;

struct bt_db {
  sqlite4_env *pEnv;              /* SQLite environment */
  BtPager *pPager;                /* Underlying page-based database */
  bt_cursor *pAllCsr;             /* List of all open cursors */
  int nMinMerge;
  int nScheduleAlloc;
  int bFastInsertOp;              /* Set by CONTROL_FAST_INSERT_OP */
};

typedef struct BtOvfl BtOvfl;
struct BtOvfl {
  int nKey;
  int nVal;
94
95
96
97
98
99
100





101
102
103
104
105
106
107
struct FiCursor {
  bt_cursor base;                 /* Base cursor class */
  int iBt;                        /* Current sub-tree (or -1 for EOF) */
  int nBt;                        /* Number of entries in aBt[] array */
  BtCursor *aBt;                  /* Array of sub-tree cursors */
};






#ifndef btErrorBkpt
int btErrorBkpt(int rc){
  static int error_cnt = 0;
  error_cnt++;
  return rc;
}
#endif







>
>
>
>
>







96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
struct FiCursor {
  bt_cursor base;                 /* Base cursor class */
  int iBt;                        /* Current sub-tree (or -1 for EOF) */
  int nBt;                        /* Number of entries in aBt[] array */
  BtCursor *aBt;                  /* Array of sub-tree cursors */
};

/* 
** Meta-table summary key.
*/
static const u8 aSummaryKey[] = {0xFF, 0xFF, 0xFF, 0xFF};

#ifndef btErrorBkpt
int btErrorBkpt(int rc){
  static int error_cnt = 0;
  error_cnt++;
  return rc;
}
#endif
169
170
171
172
173
174
175



176
177
178
179
180
181
182
183
184
185
186



187
188
189
190
191
192
193
}
#define btPutU32(x,y) sqlite4BtPutU32(x,y)

/*
** Allocate a new database handle.
*/
int sqlite4BtNew(sqlite4_env *pEnv, int nExtra, bt_db **ppDb){



  bt_db *db = 0;                  /* New database object */
  BtPager *pPager = 0;            /* Pager object for this database */
  int nReq;                       /* Bytes of space required for bt_db object */
  int rc;                         /* Return code */

  nReq = sizeof(bt_db);
  rc = sqlite4BtPagerNew(pEnv, nExtra + nReq, &pPager);
  if( rc==SQLITE4_OK ){
    db = (bt_db*)sqlite4BtPagerExtra(pPager);
    db->pPager = pPager;
    db->pEnv = pEnv;



  }

  *ppDb = db;
  return rc;
}

/*







>
>
>











>
>
>







176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
}
#define btPutU32(x,y) sqlite4BtPutU32(x,y)

/*
** Allocate a new database handle.
*/
int sqlite4BtNew(sqlite4_env *pEnv, int nExtra, bt_db **ppDb){
  static const int MIN_MERGE = 4;
  static const int SCHEDULE_ALLOC = 4;

  bt_db *db = 0;                  /* New database object */
  BtPager *pPager = 0;            /* Pager object for this database */
  int nReq;                       /* Bytes of space required for bt_db object */
  int rc;                         /* Return code */

  nReq = sizeof(bt_db);
  rc = sqlite4BtPagerNew(pEnv, nExtra + nReq, &pPager);
  if( rc==SQLITE4_OK ){
    db = (bt_db*)sqlite4BtPagerExtra(pPager);
    db->pPager = pPager;
    db->pEnv = pEnv;

    db->nMinMerge = MIN_MERGE;
    db->nScheduleAlloc = SCHEDULE_ALLOC;
  }

  *ppDb = db;
  return rc;
}

/*
492
493
494
495
496
497
498

499
500
501
502
503






















504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547



548
549
550
551
552
553
554
555

556
557
558
559
560
561
562
563
564
565
566
567
568
569

570
571
572
573
574
575
576
static void btPageToAscii(
  u32 pgno,                       /* Page number */
  int bAscii,                     /* True to print keys and values as ASCII */
  BtPager *pPager,                /* Pager object (or NULL) */
  u8 *aData, int nData,           /* Buffer containing page data */
  sqlite4_buffer *pBuf            /* Output buffer */
){

  int i;
  int nCell = (int)btCellCount(aData, nData);
  u8 flags = btFlags(aData);      /* Page flags */

  sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno);






















  sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell);
  sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData));
  sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData));
  if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
    sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1]));
  }
  sqlite4BtBufAppendf(pBuf, "cell-offsets=(");
  for(i=0; i<nCell; i++){
    u8 *ptr = btCellPtrFind(aData, nData, i);
    sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr));
  }
  sqlite4BtBufAppendf(pBuf, ")\n");

  for(i=0; i<nCell; i++){
    u8 *pCell;          /* Cell i */
    int nKey;           /* Number of bytes of key to output */
    u8 *pKey;           /* Buffer containing key. */
    int nVal;           /* Number of bytes of value to output */
    u8 *pVal = 0;       /* Buffer containing value. */

    pCell = btCellFind(aData, nData, i);
    sqlite4BtBufAppendf(pBuf, "  Cell %d: ", i);

    pCell += sqlite4BtVarintGet32(pCell, &nKey);
    pKey = pCell;
    pCell += nKey;
    btBufferAppendBlob(pBuf, bAscii, pKey, nKey);

    if( flags & BT_PGFLAGS_INTERNAL ){
      sqlite4BtBufAppendf(pBuf, "  child=%d ", (int)btGetU32(pCell));
    }else{
      sqlite4BtBufAppendf(pBuf, "  ");
      pCell += sqlite4BtVarintGet32(pCell, &nVal);
      if( nVal>=2 ){
        nVal -= 2;
        pVal = pCell;
      }else{
        sqlite4BtBufAppendf(pBuf, "delete-key");
      }
    }
    if( pVal ){
      btBufferAppendBlob(pBuf, bAscii, pVal, nVal);
      if( flags & BT_PGFLAGS_METATREE ){
        /* Interpret the meta-tree entry */



        u32 iAge = btGetU32(&pKey[0]);
        u32 iLevel = ~btGetU32(&pKey[4]);
        u32 iBlk = btGetU32(pVal);
        sqlite4BtBufAppendf(pBuf, "  [age=%d level=%d block=%d]", 
            (int)iAge, (int)iLevel, (int)iBlk
        );
      }
    }

    sqlite4BtBufAppendf(pBuf, "\n");
  }

  if( pPager && btFlags(aData) & BT_PGFLAGS_INTERNAL ){
    for(i=0; i<btCellCount(aData, nData); i++){
      BtPage *pChild;
      u8 *aChild;
      u32 child;

      child = btChildPgno(aData, nData, i);
      sqlite4BtPageGet(pPager, child, &pChild);
      aChild = sqlite4BtPageData(pChild);
      btPageToAscii(child, bAscii, pPager, aChild, nData, pBuf);
      sqlite4BtPageRelease(pChild);

    }
  }
  sqlite4BtBufAppendf(pBuf, "\n");
}

#ifndef NDEBUG
#include <stdio.h>







>





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
|
|
|
|
|
|
|
|
|

|
|
|
|
|
|

|
|

|
|
|
|

|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
|
|
|
|
|
|
|
|
>
|
|

|
|
|
|
|

|
|
|
|
|
>







505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
static void btPageToAscii(
  u32 pgno,                       /* Page number */
  int bAscii,                     /* True to print keys and values as ASCII */
  BtPager *pPager,                /* Pager object (or NULL) */
  u8 *aData, int nData,           /* Buffer containing page data */
  sqlite4_buffer *pBuf            /* Output buffer */
){
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(pPager);
  int i;
  int nCell = (int)btCellCount(aData, nData);
  u8 flags = btFlags(aData);      /* Page flags */

  sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno);

  if( pgno==pHdr->iSRoot ){
    int i;
    BtSchedule s;
    sqlite4BtBufAppendf(pBuf, "(schedule page) ");
    memcpy(&s, aData, sizeof(s));

    sqlite4BtBufAppendf(pBuf, "bBusy=%d ", (int)s.bBusy);
    sqlite4BtBufAppendf(pBuf, "iAge=%d ", (int)s.iAge);
    sqlite4BtBufAppendf(pBuf, "iMaxLevel=%d ", (int)s.iMaxLevel);
    sqlite4BtBufAppendf(pBuf, "iOutLevel=%d ", (int)s.iOutLevel);
    sqlite4BtBufAppendf(pBuf, "blocks=(");
    for(i=0; s.aBlock[i] && i<array_size(s.aBlock); i++){
      sqlite4BtBufAppendf(pBuf, "%s%d", i==0 ? "" : " ", (int)s.aBlock[i]);
    }
    sqlite4BtBufAppendf(pBuf, ") ");

    sqlite4BtBufAppendf(pBuf, "iNextPg=%d ", (int)s.iNextPg);
    sqlite4BtBufAppendf(pBuf, "iNextCell=%d ", (int)s.iNextCell);
    sqlite4BtBufAppendf(pBuf, "nUsed=%d ", (int)s.nUsed);
    sqlite4BtBufAppendf(pBuf, "iFreeList=%d", (int)s.iFreeList);
  }else{
    sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell);
    sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData));
    sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData));
    if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
      sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1]));
    }
    sqlite4BtBufAppendf(pBuf, "cell-offsets=(");
    for(i=0; i<nCell; i++){
      u8 *ptr = btCellPtrFind(aData, nData, i);
      sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr));
    }
    sqlite4BtBufAppendf(pBuf, ")\n");

    for(i=0; i<nCell; i++){
      u8 *pCell;          /* Cell i */
      int nKey;           /* Number of bytes of key to output */
      u8 *pKey;           /* Buffer containing key. */
      int nVal;           /* Number of bytes of value to output */
      u8 *pVal = 0;       /* Buffer containing value. */

      pCell = btCellFind(aData, nData, i);
      sqlite4BtBufAppendf(pBuf, "  Cell %d: ", i);

      pCell += sqlite4BtVarintGet32(pCell, &nKey);
      pKey = pCell;
      pCell += nKey;
      btBufferAppendBlob(pBuf, bAscii, pKey, nKey);

      if( flags & BT_PGFLAGS_INTERNAL ){
        sqlite4BtBufAppendf(pBuf, "  child=%d ", (int)btGetU32(pCell));
      }else{
        sqlite4BtBufAppendf(pBuf, "  ");
        pCell += sqlite4BtVarintGet32(pCell, &nVal);
        if( nVal>=2 ){
          nVal -= 2;
          pVal = pCell;
        }else{
          sqlite4BtBufAppendf(pBuf, "delete-key");
        }
      }
      if( pVal ){
        btBufferAppendBlob(pBuf, bAscii, pVal, nVal);
        if( flags & BT_PGFLAGS_METATREE ){
          /* Interpret the meta-tree entry */
          if( nKey==sizeof(aSummaryKey) && 0==memcmp(pKey, aSummaryKey, nKey) ){
            sqlite4BtBufAppendf(pBuf, "  [summary]");
          }else{
            u32 iAge = btGetU32(&pKey[0]);
            u32 iLevel = ~btGetU32(&pKey[4]);
            u32 iBlk = btGetU32(pVal);
            sqlite4BtBufAppendf(pBuf, "  [age=%d level=%d block=%d]", 
                (int)iAge, (int)iLevel, (int)iBlk
                );
          }
        }
      }
      sqlite4BtBufAppendf(pBuf, "\n");
    }

    if( pPager && btFlags(aData) & BT_PGFLAGS_INTERNAL ){
      for(i=0; i<btCellCount(aData, nData); i++){
        BtPage *pChild;
        u8 *aChild;
        u32 child;

        child = btChildPgno(aData, nData, i);
        sqlite4BtPageGet(pPager, child, &pChild);
        aChild = sqlite4BtPageData(pChild);
        btPageToAscii(child, bAscii, pPager, aChild, nData, pBuf);
        sqlite4BtPageRelease(pChild);
      }
    }
  }
  sqlite4BtBufAppendf(pBuf, "\n");
}

#ifndef NDEBUG
#include <stdio.h>
1307
1308
1309
1310
1311
1312
1313






1314
1315
1316
1317
1318
1319
1320
        BtCursor *pSub = pCsr->aBt;

        /* Loop through the set of f-tree levels */
        btCsrSetup(db, pHdr->iMRoot, &mcsr);
        for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
          const void *pV; int nV; /* Value associated with meta-tree entry */
          u32 iBlk;               /* Block containing sub-tree */







          sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(pSub, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);
          rc = btCsrSeek(pSub, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK);
          if( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ){







>
>
>
>
>
>







1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
        BtCursor *pSub = pCsr->aBt;

        /* Loop through the set of f-tree levels */
        btCsrSetup(db, pHdr->iMRoot, &mcsr);
        for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
          const void *pV; int nV; /* Value associated with meta-tree entry */
          u32 iBlk;               /* Block containing sub-tree */

          btCsrKey(&mcsr, &pV, &nV);
          if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){
            rc = SQLITE4_NOTFOUND;
            break;
          }

          sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
          iBlk = sqlite4BtGetU32((const u8*)pV);
          btCsrReset(pSub, 1);
          btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);
          rc = btCsrSeek(pSub, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK);
          if( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ){
1348
1349
1350
1351
1352
1353
1354






1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370

1371
1372
1373
1374
1375
1376
1377
        u32 iBlk;                 /* Block containing sub-tree */
        BtCursor *pSub;           /* Sub-cursor for this block */

        rc = fiCsrAllocateSubs(db, pCsr, iSub+1);
        if( rc!=SQLITE4_OK ) break;
        pSub = &pCsr->aBt[iSub];
        iSub++;







        sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
        iBlk = sqlite4BtGetU32((const u8*)pV);

        btCsrReset(pSub, 1);
        btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);
        rc = btCsrSeek(pSub, pK, nK, eSeek, BT_CSRSEEK_SEEK);
        if( rc==SQLITE4_OK ){
          bMatch = 1;
        }else if( rc==SQLITE4_INEXACT ){
          bHit = 1;
        }else if( rc!=SQLITE4_NOTFOUND ){
          break;
        }
      }
      assert( rc!=SQLITE4_OK && rc!=SQLITE4_INEXACT );


      if( rc==SQLITE4_NOTFOUND ){
        pCsr->base.flags |= (eSeek==BT_SEEK_GE ? CSR_NEXT_OK : CSR_PREV_OK);
        rc = fiCsrSetCurrent(pCsr);
        if( rc==SQLITE4_OK ){
          BtCursor *pSub = &pCsr->aBt[pCsr->iBt];
          if( fiCsrIsDelete(pSub) ){







>
>
>
>
>
>
















>







1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
        u32 iBlk;                 /* Block containing sub-tree */
        BtCursor *pSub;           /* Sub-cursor for this block */

        rc = fiCsrAllocateSubs(db, pCsr, iSub+1);
        if( rc!=SQLITE4_OK ) break;
        pSub = &pCsr->aBt[iSub];
        iSub++;

        btCsrKey(&mcsr, &pV, &nV);
        if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){
          rc = SQLITE4_NOTFOUND;
          break;
        }

        sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
        iBlk = sqlite4BtGetU32((const u8*)pV);

        btCsrReset(pSub, 1);
        btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);
        rc = btCsrSeek(pSub, pK, nK, eSeek, BT_CSRSEEK_SEEK);
        if( rc==SQLITE4_OK ){
          bMatch = 1;
        }else if( rc==SQLITE4_INEXACT ){
          bHit = 1;
        }else if( rc!=SQLITE4_NOTFOUND ){
          break;
        }
      }
      assert( rc!=SQLITE4_OK && rc!=SQLITE4_INEXACT );
      btCsrReset(&mcsr, 1);

      if( rc==SQLITE4_NOTFOUND ){
        pCsr->base.flags |= (eSeek==BT_SEEK_GE ? CSR_NEXT_OK : CSR_PREV_OK);
        rc = fiCsrSetCurrent(pCsr);
        if( rc==SQLITE4_OK ){
          BtCursor *pSub = &pCsr->aBt[pCsr->iBt];
          if( fiCsrIsDelete(pSub) ){
1401
1402
1403
1404
1405
1406
1407






1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420

1421
1422
1423
1424
1425
1426
1427

  /* Loop through the set of f-tree levels. Open a cursor on each. */
  btCsrSetup(db, pHdr->iMRoot, &mcsr);
  for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
    const void *pV; int nV;   /* Value associated with meta-tree entry */
    u32 iBlk;                 /* Block containing sub-tree */
    BtCursor *pSub;           /* Sub-cursor for this block */







    rc = fiCsrAllocateSubs(db, pCsr, iSub+1);
    if( rc!=SQLITE4_OK ) break;
    pSub = &pCsr->aBt[iSub];
    iSub++;

    sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
    iBlk = sqlite4BtGetU32((const u8*)pV);
    btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);

    rc = btCsrEnd(pSub, bLast);
    if( rc!=SQLITE4_OK && rc!=SQLITE4_NOTFOUND ) break;
  }


  if( rc==SQLITE4_NOTFOUND ){
    pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK);
    pCsr->base.flags |= (bLast ? CSR_PREV_OK : CSR_NEXT_OK);
    rc = fiCsrSetCurrent(pCsr);
    if( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aBt[pCsr->iBt]) ){
      rc = fiCsrStep(pCsr);







>
>
>
>
>
>













>







1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488

  /* Loop through the set of f-tree levels. Open a cursor on each. */
  btCsrSetup(db, pHdr->iMRoot, &mcsr);
  for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){
    const void *pV; int nV;   /* Value associated with meta-tree entry */
    u32 iBlk;                 /* Block containing sub-tree */
    BtCursor *pSub;           /* Sub-cursor for this block */

    btCsrKey(&mcsr, &pV, &nV);
    if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){
      rc = SQLITE4_NOTFOUND;
      break;
    }

    rc = fiCsrAllocateSubs(db, pCsr, iSub+1);
    if( rc!=SQLITE4_OK ) break;
    pSub = &pCsr->aBt[iSub];
    iSub++;

    sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV);
    iBlk = sqlite4BtGetU32((const u8*)pV);
    btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub);

    rc = btCsrEnd(pSub, bLast);
    if( rc!=SQLITE4_OK && rc!=SQLITE4_NOTFOUND ) break;
  }
  btCsrReset(&mcsr, 1);

  if( rc==SQLITE4_NOTFOUND ){
    pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK);
    pCsr->base.flags |= (bLast ? CSR_PREV_OK : CSR_NEXT_OK);
    rc = fiCsrSetCurrent(pCsr);
    if( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aBt[pCsr->iBt]) ){
      rc = fiCsrStep(pCsr);
3030
3031
3032
3033
3034
3035
3036

3037
3038
3039
3040
3041
3042
3043
    }
  }

  return rc;
}

static int btFastInsertRoot(bt_db *db, BtDbHdr *pHdr, u32 *piRoot);


static int btReplaceEntry(
  bt_db *db,                      /* Database handle */
  u32 iRoot,                      /* Root page of b-tree to update */
  const void *pK, int nK,         /* Key to insert */
  const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
){







>







3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
    }
  }

  return rc;
}

static int btFastInsertRoot(bt_db *db, BtDbHdr *pHdr, u32 *piRoot);
static int btScheduleMerge(bt_db *db);

static int btReplaceEntry(
  bt_db *db,                      /* Database handle */
  u32 iRoot,                      /* Root page of b-tree to update */
  const void *pK, int nK,         /* Key to insert */
  const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
){
3087
3088
3089
3090
3091
3092
3093




3094

3095
3096
3097
3098
3099
3100
3101
          /* Insert the new KV pair into the current leaf. */
          rc = btInsertAndBalance(&csr, 1, &kv);

          /* Unless this is a block-full error, break out of the loop */
          if( rc!=BT_BLOCKFULL ) break;
          assert( iRoot==0 );





          rc = btFastInsertRoot(db, pHdr, &iRootPg);

          if( rc==SQLITE4_OK ){
            btCsrReset(&csr, 1);
            btCsrSetup(db, iRootPg, &csr);
            rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
          }
        }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT );
      }







>
>
>
>
|
>







3149
3150
3151
3152
3153
3154
3155
3156
3157
3158
3159
3160
3161
3162
3163
3164
3165
3166
3167
3168
          /* Insert the new KV pair into the current leaf. */
          rc = btInsertAndBalance(&csr, 1, &kv);

          /* Unless this is a block-full error, break out of the loop */
          if( rc!=BT_BLOCKFULL ) break;
          assert( iRoot==0 );

          /* Try to schedule a merge operation */
          rc = btScheduleMerge(db);

          if( rc==SQLITE4_OK ){
            rc = btFastInsertRoot(db, pHdr, &iRootPg);
          }
          if( rc==SQLITE4_OK ){
            btCsrReset(&csr, 1);
            btCsrSetup(db, iRootPg, &csr);
            rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
          }
        }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT );
      }
3144
3145
3146
3147
3148
3149
3150































































3151
3152
3153
3154
3155

3156
3157



3158
3159
3160




3161






3162
3163
















































3164
3165



3166


3167







3168





3169
3170



3171




3172



3173



3174




3175
















3176
3177
3178
3179





















3180




3181


3182































3183
3184
3185
3186
3187
3188



3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
3200
3201
3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229

3230
3231
3232
3233
3234
3235
3236
      *paKey = &aK[8];
      *pnKey = nK-8;
    }
  }
  return rc;
}
































































static int btFindNextLevel(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piNext
){

  int rc;
  BtCursor csr;




  btCsrSetup(db, pHdr->iMRoot, &csr);
  rc = btCsrEnd(&csr, 0);




  assert( rc!=SQLITE4_INEXACT );







  *piNext = 0;
















































  if( rc==SQLITE4_OK ){
    u32 iAge;



    u32 iLevel;


    rc = btDecodeMetatreeKey(&csr, &iAge, &iLevel, 0, 0);







    if( rc==SQLITE4_OK && iAge==0 ){





      *piNext = iLevel+1;
    }



  }else if( rc==SQLITE4_NOTFOUND ){




    rc = SQLITE4_OK;



  }



  btCsrReset(&csr, 1);





















  return rc;
}

static int btAllocateBlock(bt_db *db, u32 *piBlk){





















  return sqlite4BtBlockAllocate(db->pPager, piBlk);




}


































static int btFastInsertRoot(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;




  /* If the meta-tree has not been created, create it now. */
  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, BT_PGFLAGS_METATREE, &pHdr->iMRoot);
  }

  /* If no writable sub-tree has been discovered, create one now. */
  if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){
    u32 iLevel;                   /* Level number for new sub-tree */
    u32 iSubBlock;                /* New block */

    rc = btFindNextLevel(db, pHdr, &iLevel);
    if( rc==SQLITE4_OK ){
      rc = btAllocateBlock(db, &iSubBlock);
    }

    if( rc==SQLITE4_OK ){
      u8 aKey[8];
      u8 aVal[4];
      pHdr->iSubBlock = iSubBlock;
      pHdr->nSubPg = 1;           /* Root page is automatically allocated */

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);

      btPutU32(&aVal[0], iSubBlock);

      assert( db->bFastInsertOp==1 );
      db->bFastInsertOp = 0;
      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 4);
      db->bFastInsertOp = 1;
    }
  }

  if( rc==SQLITE4_OK ){
    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
  }

  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|




>
|
|
>
>
>

|
|
>
>
>
>
|
>
>
>
>
>
>

|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

|
>
>
>
|
>
>
|
>
>
>
>
>
>
>
|
>
>
>
>
>
|
|
>
>
>
|
>
>
>
>
|
>
>
>
|
>
>
>
|
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
|
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






>
>
>






|




|

|














<

<
<
<

<






>







3211
3212
3213
3214
3215
3216
3217
3218
3219
3220
3221
3222
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429
3430
3431
3432
3433
3434
3435
3436
3437
3438
3439
3440
3441
3442
3443
3444
3445
3446
3447
3448
3449
3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
3492
3493
3494
3495
3496
3497
3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519

3520



3521

3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
      *paKey = &aK[8];
      *pnKey = nK-8;
    }
  }
  return rc;
}

static void *btMalloc(bt_db *db, int nByte, int *pRc){
  void *pRet = 0;
  if( *pRc==SQLITE4_OK ){
    pRet = sqlite4_malloc(db->pEnv, nByte);
    if( !pRet ) *pRc = btErrorBkpt(SQLITE4_NOMEM);
  }
  return pRet;
}
static void btFree(bt_db *db, void *p){
  sqlite4_free(db->pEnv, p);
}

static int fiLoadSummary(
  bt_db *db, 
  BtCursor *p, 
  const u8 **paSummary, 
  int *pnSummary
){
  static const u8 aZero[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int rc;

  btCsrSetup(db, pHdr->iMRoot, p);
  rc = btCsrSeek(
      p, aSummaryKey, sizeof(aSummaryKey), BT_SEEK_EQ, BT_CSRSEEK_SEEK
  );
  if( rc==SQLITE4_OK ){
    rc = btCsrData(p, 0, -1, (const void**)paSummary, pnSummary);
  }else if( rc==SQLITE4_NOTFOUND ){
    *paSummary = aZero;
    *pnSummary = sizeof(aZero);
    rc = SQLITE4_OK;
  }

  return rc;
}

static void btReadSummary(
  const u8 *aSum, int iAge, 
  u16 *piMinLevel,
  u16 *pnLevel,
  u16 *piMergeLevel
){
  *piMinLevel = btGetU16(&aSum[iAge * 6]);
  *pnLevel = btGetU16(&aSum[iAge * 6 + 2]);
  *piMergeLevel = btGetU16(&aSum[iAge * 6 + 4]);
}

static void btWriteSummary(
  u8 *aSum, int iAge,
  u16 iMinLevel,
  u16 nLevel,
  u16 iMergeLevel
){
  btPutU16(&aSum[iAge * 6], iMinLevel);
  btPutU16(&aSum[iAge * 6 + 2], nLevel);
  btPutU16(&aSum[iAge * 6 + 4], iMergeLevel);
}

/*
** Allocate a new level for a new age=0 segment. The new level is always
** one greater than the current largest age=0 level number.
*/
static int btAllocateNewLevel(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piNext
){

  int rc;                         /* Return code */
  BtCursor csr;                   /* Cursor to read meta-table summary */
  int nByte;                      /* Size of buffer aByte[] */
  const u8 *aByte;                /* Summary data */
  u8 *aNew;                       /* New summary data */

  rc = fiLoadSummary(db, &csr, &aByte, &nByte);

  aNew = (u8*)btMalloc(db, nByte, &rc);
  if( rc==SQLITE4_OK ){
    u16 iMin, nLevel, iMerge;
    btReadSummary(aByte, 0, &iMin, &nLevel, &iMerge);

    memcpy(aNew, aByte, nByte);
    btPutU16(&aNew[2], nLevel+1);
    rc = btReplaceEntry(
        db, pHdr->iMRoot, aSummaryKey, sizeof(aSummaryKey), aNew, nByte
    );
    btFree(db, aNew);

    *piNext = (iMin + nLevel);
  }

  btCsrReset(&csr, 1);
  return rc;
}

static void btWriteSchedule(BtPage *pPg, BtSchedule *p, int *pRc){
  if( *pRc==SQLITE4_OK ){
    int rc = sqlite4BtPageWrite(pPg);
    if( rc==SQLITE4_OK ){
      /* TODO: convert values to big-endian format */
      u8 *aData = sqlite4BtPageData(pPg);
      memcpy(aData, p, sizeof(BtSchedule));
    }
    *pRc = rc;
  }
}
static int btReadSchedule(bt_db *db, BtPage *pPg, BtSchedule *p){
  /* TODO: convert values to big-endian format */
  u8 *aData = sqlite4BtPageData(pPg);
  memcpy(p, aData, sizeof(BtSchedule));
  return SQLITE4_OK;
}

static int btAllocateBlock(
  bt_db *db, 
  int nBlk,
  u32 *aiBlk
){
  return sqlite4BtBlockAllocate(db->pPager, nBlk, aiBlk);
}

/*
** This is a helper function for btScheduleMerge(). It determines the
** age and range of levels to be used as inputs by the merge (if any).
*/
static int btFindMerge(
  bt_db *db,                      /* Database handle */
  u32 *piAge,                     /* OUT: Age of input segments to merge */
  u32 *piMaxLevel,                /* OUT: Maximum input level value */
  u32 *piOutLevel                 /* OUT: Output level value */
){
  BtCursor csr;                   /* Cursor used to read summary record */
  int rc;                         /* Return code */
  const u8 *aSum;
  int nSum;

  rc = fiLoadSummary(db, &csr, &aSum, &nSum);
  if( rc==SQLITE4_OK ){
    int iAge;
    int iBestAge = -1;            /* Best age to merge levels from */
    int nBest = (db->nMinMerge-1);/* Number of levels merged at iBestAge */
    u16 iMin, nLevel, iMerge;     /* Summary of current age */

    rc = SQLITE4_NOTFOUND;
    for(iAge=0; iAge<(nSum/3); iAge++){
      btReadSummary(aSum, iAge, &iMin, &nLevel, &iMerge);
      if( iMerge ){
        int n = 1 + (iMerge-iMin);
        if( n>nBest ){
          *piMaxLevel = iMerge;
          *piAge = iAge;
          btReadSummary(aSum, iAge+1, &iMin, &nLevel, &iMerge);
          *piOutLevel = (iMin + nLevel - 1);
          rc = SQLITE4_OK;
        }
        break;
      }else{
        if( nLevel>nBest ){
          iBestAge = iAge;
          nBest = nLevel;
        }
      }
    }

    if( rc==SQLITE4_NOTFOUND && iBestAge>=0 ){
      u8 *aNew;
      int nByte = nSum;
      if( iAge+1>=(nSum/3) ) nByte += 6;

      rc = SQLITE4_OK;
      aNew = (u8*)btMalloc(db, nByte, &rc);
      if( rc==SQLITE4_OK ){
        BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);

        /* Create a copy of the summary record */
        memcpy(aNew, aSum, nSum);
        if( nByte>nSum ) btWriteSummary(aNew, iBestAge+1, 0, 0, 0);

        /* Find the input age and maximum level */
        btReadSummary(aSum, iBestAge, &iMin, &nLevel, &iMerge);
        *piMaxLevel = (u32)(iMin + nLevel - 1);
        *piAge = iBestAge;

        /* Find the output level */
        btReadSummary(aSum, iBestAge+1, &iMin, &nLevel, &iMerge);
        *piOutLevel = iMin + nLevel;

        /* Update the summary record for the output segment. */
        btWriteSummary(aNew, iBestAge+1, iMin, nLevel+1, iMerge);

        /* Write the updated summary record back to the db. */
        rc = btReplaceEntry(
             db, pHdr->iMRoot, aSummaryKey, sizeof(aSummaryKey), aNew, nByte
        );
      }
    }
  }

  btCsrReset(&csr, 1);
  return rc;
}


/*
** If possible, schedule a merge operation. 
**
** The merge operation is selected based on the following criteria:
**
**   * The more levels involved in the merge the better, and
**   * It is better to merge younger segments than older ones.
*/
static int btScheduleMerge(bt_db *db){

  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  BtPage *pPg = 0;                /* Schedule page */
  u8 *aData;                      /* Schedule page data */

  int rc;
  u32 iAge;
  u32 iLvl;
  u32 iOutLvl;

  /* Find the schedule page. If there is no schedule page, allocate it now. */
  if( pHdr->iSRoot==0 ){
    rc = sqlite4BtPageAllocate(db->pPager, &pPg);
    if( rc==SQLITE4_OK ){
      u8 *aData = sqlite4BtPageData(pPg);
      memset(aData, 0, pHdr->pgsz);
      pHdr->iSRoot = sqlite4BtPagePgno(pPg);
    }
  }else{
    rc = sqlite4BtPageGet(db->pPager, pHdr->iSRoot, &pPg);
  }

  /* Check if the schedule page is busy. If so, no new merge may be 
  ** scheduled. If the schedule page is not busy, call btFindMerge() to
  ** figure out which levels should be scheduled for merge.  */
  if( rc==SQLITE4_OK ){
    aData = sqlite4BtPageData(pPg);
    if( btGetU32(aData) ){
      rc = SQLITE4_NOTFOUND;
    }else{
      rc = btFindMerge(db, &iAge, &iLvl, &iOutLvl);
    }
  }

  if( rc==SQLITE4_OK ){
    BtSchedule s;
    memset(&s, 0, sizeof(BtSchedule));

    s.bBusy = 1;
    s.iAge = iAge;
    s.iMaxLevel = iLvl;
    s.iOutLevel = iOutLvl;
    rc = btAllocateBlock(db, db->nScheduleAlloc, s.aBlock);

    btWriteSchedule(pPg, &s, &rc);
  }

  sqlite4BtPageRelease(pPg);
  if( rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;
  return rc;
}

static int btFastInsertRoot(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;

  assert( db->bFastInsertOp );
  db->bFastInsertOp = 0;

  /* If the meta-tree has not been created, create it now. */
  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, BT_PGFLAGS_METATREE, &pHdr->iMRoot);
  }

  /* If no writable sub-tree current exists, create one */ 
  if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){
    u32 iLevel;                   /* Level number for new sub-tree */
    u32 iSubBlock;                /* New block */

    rc = btAllocateNewLevel(db, pHdr, &iLevel);
    if( rc==SQLITE4_OK ){
      rc = btAllocateBlock(db, 1, &iSubBlock);
    }

    if( rc==SQLITE4_OK ){
      u8 aKey[8];
      u8 aVal[4];
      pHdr->iSubBlock = iSubBlock;
      pHdr->nSubPg = 1;           /* Root page is automatically allocated */

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);

      btPutU32(&aVal[0], iSubBlock);



      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 4);

    }
  }

  if( rc==SQLITE4_OK ){
    *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock);
  }
  db->bFastInsertOp = 1;
  return rc;
}

/*
** Insert a new key/value pair or replace an existing one.
**
** This function may modify either the b-tree or fast-insert-tree, depending
3364
3365
3366
3367
3368
3369
3370

3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
    case BT_INFO_PAGEDUMP: {
      int iCtx;                   /* ControlTransaction() context */
      rc = btControlTransaction(db, &iCtx);
      if( rc==SQLITE4_OK ){
        BtPage *pPg = 0;
        rc = sqlite4BtPageGet(db->pPager, pInfo->pgno, &pPg);
        if( rc==SQLITE4_OK ){

          int bAscii = (pInfo->eType==BT_INFO_PAGEDUMP_ASCII);
          u8 *aData;
          int nData;
          aData = sqlite4BtPageData(pPg);
          nData = sqlite4BtPagerPagesize(db->pPager);
          btPageToAscii(pInfo->pgno, bAscii, 0, aData, nData, &pInfo->output);
          sqlite4_buffer_append(&pInfo->output, "", 1);
          sqlite4BtPageRelease(pPg);
        }
        btControlTransactionDone(db, iCtx);
      }
      break;
    }







>




|
|







3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
    case BT_INFO_PAGEDUMP: {
      int iCtx;                   /* ControlTransaction() context */
      rc = btControlTransaction(db, &iCtx);
      if( rc==SQLITE4_OK ){
        BtPage *pPg = 0;
        rc = sqlite4BtPageGet(db->pPager, pInfo->pgno, &pPg);
        if( rc==SQLITE4_OK ){
          BtPager *p = db->pPager;
          int bAscii = (pInfo->eType==BT_INFO_PAGEDUMP_ASCII);
          u8 *aData;
          int nData;
          aData = sqlite4BtPageData(pPg);
          nData = sqlite4BtPagerPagesize(p);
          btPageToAscii(pInfo->pgno, bAscii, p, aData, nData, &pInfo->output);
          sqlite4_buffer_append(&pInfo->output, "", 1);
          sqlite4BtPageRelease(pPg);
        }
        btControlTransactionDone(db, iCtx);
      }
      break;
    }
Changes to src/bt_pager.c.
977
978
979
980
981
982
983
984
985
986
987



988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002

1003
1004
1005
1006
1007
1008
1009
  fprintf(stderr, "allocated page %d\n", pgno);
#endif

  *ppPg = pPg;
  return rc;
}

int sqlite4BtBlockAllocate(BtPager *p, u32 *piBlk){
  int rc = SQLITE4_OK;
  BtDbHdr *pHdr = p->pHdr;
  int nPgPerBlk = (pHdr->blksz / pHdr->pgsz);



  u32 iBlk;
  u32 iRoot;
  u32 iFree;

  /* Figure out the next block in the file. And its root (first) page. */
  iBlk = 1 + (pHdr->nPg + nPgPerBlk - 1) / nPgPerBlk;
  iRoot = (iBlk-1) * nPgPerBlk + 1;
  assert( iBlk>0 );

  for(iFree = pHdr->nPg+1; rc==SQLITE4_OK && iFree<iRoot; iFree++){
    rc = sqlite4BtPageTrimPgno(p, iFree);
  }
  if( rc==SQLITE4_OK ){
    pHdr->nPg = iBlk * nPgPerBlk;
    *piBlk = iBlk;

  }

  return rc;
}

/*
** Return the current page number of the argument page reference.







|



>
>
>
|
|
|

|
|
|
|

|
|
|
|
|
|
>







977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
  fprintf(stderr, "allocated page %d\n", pgno);
#endif

  *ppPg = pPg;
  return rc;
}

int sqlite4BtBlockAllocate(BtPager *p, int nBlk, u32 *aiBlk){
  int rc = SQLITE4_OK;
  BtDbHdr *pHdr = p->pHdr;
  int nPgPerBlk = (pHdr->blksz / pHdr->pgsz);
  int i;

  for(i=0; i<nBlk; i++){
    u32 iBlk;
    u32 iRoot;
    u32 iFree;

    /* Figure out the next block in the file. And its root (first) page. */
    iBlk = 1 + (pHdr->nPg + nPgPerBlk - 1) / nPgPerBlk;
    iRoot = (iBlk-1) * nPgPerBlk + 1;
    assert( iBlk>0 );

    for(iFree = pHdr->nPg+1; rc==SQLITE4_OK && iFree<iRoot; iFree++){
      rc = sqlite4BtPageTrimPgno(p, iFree);
    }
    if( rc==SQLITE4_OK ){
      pHdr->nPg = iBlk * nPgPerBlk;
      aiBlk[i] = iBlk;
    }
  }

  return rc;
}

/*
** Return the current page number of the argument page reference.
Changes to www/bt.wiki.
647
648
649
650
651
652
653












654
655
656
657
658
659
660
  <li> Segment age (big-endian 32-bit integer).
  <li> Level number (ones-compliment of value as a big-endian 32-bit integer).
  <li> Separator key. All keys within the segment are guaranteed to be as
       large or larger than the separator key. The separator key may be
       zero bytes in size.
</ul>













<h3> Schedule-Page Format</h3>

<p>We might need more than one schedule page. Perhaps anyway... But say 
worry about this later on.

<p>The writer adds an entry to the schedule-tree. Schedule-tree entries
are always appended to the tree - the integer key is one greater than the







>
>
>
>
>
>
>
>
>
>
>
>







647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
  <li> Segment age (big-endian 32-bit integer).
  <li> Level number (ones-compliment of value as a big-endian 32-bit integer).
  <li> Separator key. All keys within the segment are guaranteed to be as
       large or larger than the separator key. The separator key may be
       zero bytes in size.
</ul>

<p> As well as the records that index all segments in the db, the meta-table
contains a single summary record. The key is four 0xFF bytes.

<p>For each age between 0 and the largest age in the db, inclusive:

  * current minimum level number (16-bits).
  * total number of levels.
  * maximum level in active merge (16-bits).

Store this as a separate record in the meta-table. Even if there segments with
age=31 in the db, this is still only 6*32=192 bytes.

<h3> Schedule-Page Format</h3>

<p>We might need more than one schedule page. Perhaps anyway... But say 
worry about this later on.

<p>The writer adds an entry to the schedule-tree. Schedule-tree entries
are always appended to the tree - the integer key is one greater than the
675
676
677
678
679
680
681
682
683
684
685
686
687
688



689




690
691
692
693
694


695
696
697
698
699
700
701










702
703
704
705
706
707
708
  <li><p> A flag to indicate that the merge has been performed.

  <li><p> A bitmask (or some other indication) indicating which of the output
          blocks were populated.

  <li><p> A pointer to a list of overflow pages that were freed, if any.

  <li><p> A pointer to the last key read from the input.
</ul>

<p>Note the implication here: A writer may only modify the page if it currently
resides in the db file (not the log). The writer can tell if the schedule page
resides in the db file or the log by checking the "merge performed" flag.




<ul>




  <li> Age of input segments (big-endian u32).

  <li> Max level of input segments (bit-endian u32). All segments of the
       specified age with a level equal to or less than this are considered
       inputs.



  <li> Number of allocated output blocks (big-endian u32) - <i>nBlock</i>.
       Maximum is (say) 32.

  <li> Block number of each allocated block (<i>nBlock</i> big-endian 
       u32 fields).
</ul>












<h2 id=in-memory_tree>3.2. In-Memory Tree</h2>

<p>This design uses the same multi-version in-memory tree that LSM does. 
The tree structure may be located in heap or shared memory. If shared memory
is used then a separate file (*-shm2 or some such) is used for the data. The 







|






>
>
>

>
>
>
>





>
>




|


>
>
>
>
>
>
>
>
>
>







687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
  <li><p> A flag to indicate that the merge has been performed.

  <li><p> A bitmask (or some other indication) indicating which of the output
          blocks were populated.

  <li><p> A pointer to a list of overflow pages that were freed, if any.

  <li><p> A pointer to the next key read from the input levels, if any.
</ul>

<p>Note the implication here: A writer may only modify the page if it currently
resides in the db file (not the log). The writer can tell if the schedule page
resides in the db file or the log by checking the "merge performed" flag.


<p>The page is organized as follows:

<ul>
  <li> Magic number used to identify the state of the page (big-endian u32):
       0x6a846607 if the merge has not yet been performed or 0x1a468c33 if
       it has.

  <li> Age of input segments (big-endian u32).

  <li> Max level of input segments (bit-endian u32). All segments of the
       specified age with a level equal to or less than this are considered
       inputs.

  <li> Level of output segments.

  <li> Number of allocated output blocks (big-endian u32) - <i>nBlock</i>.
       Maximum is (say) 32.

  <li> Block number of each allocated block (array of <i>nBlock</i> big-endian 
       u32 fields).
</ul>

<p>Then, after the merge has taken place:
<ul>
  <li> Page number of input page containing next key to read.
  <li> Cell number of next key to read within the identified page.
  <li> Number of input blocks used (input blocks are always used in the
       order they are specified).
  <li> First page of freed-page list.
</ul>



<h2 id=in-memory_tree>3.2. In-Memory Tree</h2>

<p>This design uses the same multi-version in-memory tree that LSM does. 
The tree structure may be located in heap or shared memory. If shared memory
is used then a separate file (*-shm2 or some such) is used for the data. The