Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add notes describing schedule pages to bt.wiki.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 1df60437f3089489ca0d374bc69bf45ce54d1d20
User & Date: dan 2013-12-07 20:38:27.985
Context
2013-12-14
18:59
Add scheduling of fast-insert merges to bt. Merges are not performed yet, just scheduled. check-in: 590e0410b1 user: dan tags: trunk
2013-12-07
20:38
Add notes describing schedule pages to bt.wiki. check-in: 1df60437f3 user: dan tags: trunk
2013-12-06
20:11
Fix problems with delete markers and range scans. check-in: ccf5a6bb6a user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/bt_main.c.
20
21
22
23
24
25
26


27
28
29
30
31
32
33
#define BT_MAX_DIRECT_OVERFLOW 8  /* Maximum direct overflow pages per cell */

/*
** Values that make up the single byte flags field at the start of
** b-tree pages. 
*/
#define BT_PGFLAGS_INTERNAL 0x01  /* True for non-leaf nodes */



/* #define BT_STDERR_DEBUG 1 */

typedef struct BtCursor BtCursor;
typedef struct FiCursor FiCursor;

struct bt_db {







>
>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#define BT_MAX_DIRECT_OVERFLOW 8  /* Maximum direct overflow pages per cell */

/*
** Values that make up the single byte flags field at the start of
** b-tree pages. 
*/
#define BT_PGFLAGS_INTERNAL 0x01  /* True for non-leaf nodes */
#define BT_PGFLAGS_METATREE 0x02  /* True for a meta-tree page */
#define BT_PGFLAGS_SCHEDULE 0x04  /* True for a schedule-tree page */

/* #define BT_STDERR_DEBUG 1 */

typedef struct BtCursor BtCursor;
typedef struct FiCursor FiCursor;

struct bt_db {
492
493
494
495
496
497
498

499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514

515



516
517
518
519
520
521
522

523
524
525
526
527
528
529
530

531
532
533
534












535
536
537
538
539
540
541
  int bAscii,                     /* True to print keys and values as ASCII */
  BtPager *pPager,                /* Pager object (or NULL) */
  u8 *aData, int nData,           /* Buffer containing page data */
  sqlite4_buffer *pBuf            /* Output buffer */
){
  int i;
  int nCell = (int)btCellCount(aData, nData);


  sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno);
  sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell);
  sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData));
  sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData));
  if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
    sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1]));
  }
  sqlite4BtBufAppendf(pBuf, "cell-offsets=(");
  for(i=0; i<nCell; i++){
    u8 *ptr = btCellPtrFind(aData, nData, i);
    sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr));
  }
  sqlite4BtBufAppendf(pBuf, ")\n");

  for(i=0; i<nCell; i++){

    int nKey;



    int j;
    u8 *pCell = btCellFind(aData, nData, i);
    sqlite4BtBufAppendf(pBuf, "  Cell %d: ", i);

    pCell += sqlite4BtVarintGet32(pCell, &nKey);
    btBufferAppendBlob(pBuf, bAscii, pCell, nKey);
    pCell += nKey;


    if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
      sqlite4BtBufAppendf(pBuf, "  child=%d ", (int)btGetU32(pCell));
    }else{
      int nVal;
      sqlite4BtBufAppendf(pBuf, "  ");
      pCell += sqlite4BtVarintGet32(pCell, &nVal);
      if( nVal>=2 ){

        btBufferAppendBlob(pBuf, bAscii, pCell, nVal-2);
      }else{
        sqlite4BtBufAppendf(pBuf, "delete-key");
      }












    }
    sqlite4BtBufAppendf(pBuf, "\n");
  }

  if( pPager && btFlags(aData) & BT_PGFLAGS_INTERNAL ){
    for(i=0; i<btCellCount(aData, nData); i++){
      BtPage *pChild;







>
















>
|
>
>
>
|
|



|

>

|


<



>
|



>
>
>
>
>
>
>
>
>
>
>
>







494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534

535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
  int bAscii,                     /* True to print keys and values as ASCII */
  BtPager *pPager,                /* Pager object (or NULL) */
  u8 *aData, int nData,           /* Buffer containing page data */
  sqlite4_buffer *pBuf            /* Output buffer */
){
  int i;
  int nCell = (int)btCellCount(aData, nData);
  u8 flags = btFlags(aData);      /* Page flags */

  sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno);
  sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell);
  sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData));
  sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData));
  if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){
    sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1]));
  }
  sqlite4BtBufAppendf(pBuf, "cell-offsets=(");
  for(i=0; i<nCell; i++){
    u8 *ptr = btCellPtrFind(aData, nData, i);
    sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr));
  }
  sqlite4BtBufAppendf(pBuf, ")\n");

  for(i=0; i<nCell; i++){
    u8 *pCell;          /* Cell i */
    int nKey;           /* Number of bytes of key to output */
    u8 *pKey;           /* Buffer containing key. */
    int nVal;           /* Number of bytes of value to output */
    u8 *pVal = 0;       /* Buffer containing value. */

    pCell = btCellFind(aData, nData, i);
    sqlite4BtBufAppendf(pBuf, "  Cell %d: ", i);

    pCell += sqlite4BtVarintGet32(pCell, &nKey);
    pKey = pCell;
    pCell += nKey;
    btBufferAppendBlob(pBuf, bAscii, pKey, nKey);

    if( flags & BT_PGFLAGS_INTERNAL ){
      sqlite4BtBufAppendf(pBuf, "  child=%d ", (int)btGetU32(pCell));
    }else{

      sqlite4BtBufAppendf(pBuf, "  ");
      pCell += sqlite4BtVarintGet32(pCell, &nVal);
      if( nVal>=2 ){
        nVal -= 2;
        pVal = pCell;
      }else{
        sqlite4BtBufAppendf(pBuf, "delete-key");
      }
    }
    if( pVal ){
      btBufferAppendBlob(pBuf, bAscii, pVal, nVal);
      if( flags & BT_PGFLAGS_METATREE ){
        /* Interpret the meta-tree entry */
        u32 iAge = btGetU32(&pKey[0]);
        u32 iLevel = ~btGetU32(&pKey[4]);
        u32 iBlk = btGetU32(pVal);
        sqlite4BtBufAppendf(pBuf, "  [age=%d level=%d block=%d]", 
            (int)iAge, (int)iLevel, (int)iBlk
        );
      }
    }
    sqlite4BtBufAppendf(pBuf, "\n");
  }

  if( pPager && btFlags(aData) & BT_PGFLAGS_INTERNAL ){
    for(i=0; i<btCellCount(aData, nData); i++){
      BtPage *pChild;
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097

3098
3099


3100
3101
3102
3103
3104
3105
3106
    }
  }

  btCsrReset(&csr, 1);
  return rc;
}

static int btAllocateNewRoot(bt_db *db, u32 *piNew){
  u32 iNew = 0;
  BtPage *pPg;
  int rc;


  rc = sqlite4BtPageAllocate(db->pPager, &pPg);
  if( rc==SQLITE4_OK ){


    iNew = sqlite4BtPagePgno(pPg);
    sqlite4BtPageRelease(pPg);
  }

  *piNew = iNew;
  return rc;
}







|




>


>
>







3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
    }
  }

  btCsrReset(&csr, 1);
  return rc;
}

static int btAllocateNewRoot(bt_db *db, int flag, u32 *piNew){
  u32 iNew = 0;
  BtPage *pPg;
  int rc;

  assert( flag==BT_PGFLAGS_METATREE || flag==BT_PGFLAGS_SCHEDULE || flag==0 );
  rc = sqlite4BtPageAllocate(db->pPager, &pPg);
  if( rc==SQLITE4_OK ){
    u8 *aData = sqlite4BtPageData(pPg);
    aData[0] = (flag & 0xFF);
    iNew = sqlite4BtPagePgno(pPg);
    sqlite4BtPageRelease(pPg);
  }

  *piNew = iNew;
  return rc;
}
3162
3163
3164
3165
3166
3167
3168
3169
3170
3171
3172
3173
3174
3175
3176
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;

  /* If the meta-tree has not been created, create it now. */
  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, &pHdr->iMRoot);
  }

  /* If no writable sub-tree has been discovered, create one now. */
  if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){
    u32 iLevel;                   /* Level number for new sub-tree */
    u32 iSubBlock;                /* New block */








|







3185
3186
3187
3188
3189
3190
3191
3192
3193
3194
3195
3196
3197
3198
3199
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;

  /* If the meta-tree has not been created, create it now. */
  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, BT_PGFLAGS_METATREE, &pHdr->iMRoot);
  }

  /* If no writable sub-tree has been discovered, create one now. */
  if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){
    u32 iLevel;                   /* Level number for new sub-tree */
    u32 iSubBlock;                /* New block */

Changes to www/bt.wiki.
646
647
648
649
650
651
652

















































653
654
655
656
657
658
659
<ul>
  <li> Segment age (big-endian 32-bit integer).
  <li> Level number (ones-compliment of value as a big-endian 32-bit integer).
  <li> Separator key. All keys within the segment are guaranteed to be as
       large or larger than the separator key. The separator key may be
       zero bytes in size.
</ul>



















































<h2 id=in-memory_tree>3.2. In-Memory Tree</h2>

<p>This design uses the same multi-version in-memory tree that LSM does. 
The tree structure may be located in heap or shared memory. If shared memory
is used then a separate file (*-shm2 or some such) is used for the data. The 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
<ul>
  <li> Segment age (big-endian 32-bit integer).
  <li> Level number (ones-compliment of value as a big-endian 32-bit integer).
  <li> Separator key. All keys within the segment are guaranteed to be as
       large or larger than the separator key. The separator key may be
       zero bytes in size.
</ul>

<h3> Schedule-Page Format</h3>

<p>We might need more than one schedule page. Perhaps anyway... But say 
worry about this later on.

<p>The writer adds an entry to the schedule-tree. Schedule-tree entries
are always appended to the tree - the integer key is one greater than the
previous greatest key.

<p>The entry identifies the following:
<ul>
  <li><p> the inputs to the merge (age + level range).

  <li><p> space to write the output (allocated block numbers).
</ul>

<p>As well as the above, the entry contains space allocated for output 
fields. Output fields are populated by the checkpointer as it copies the
page image from the log to the database file. Output fields are:

<ul>
  <li><p> A flag to indicate that the merge has been performed.

  <li><p> A bitmask (or some other indication) indicating which of the output
          blocks were populated.

  <li><p> A pointer to a list of overflow pages that were freed, if any.

  <li><p> A pointer to the last key read from the input.
</ul>

<p>Note the implication here: A writer may only modify the page if it currently
resides in the db file (not the log). The writer can tell if the schedule page
resides in the db file or the log by checking the "merge performed" flag.

<ul>
  <li> Age of input segments (big-endian u32).

  <li> Max level of input segments (bit-endian u32). All segments of the
       specified age with a level equal to or less than this are considered
       inputs.

  <li> Number of allocated output blocks (big-endian u32) - <i>nBlock</i>.
       Maximum is (say) 32.

  <li> Block number of each allocated block (<i>nBlock</i> big-endian 
       u32 fields).
</ul>


<h2 id=in-memory_tree>3.2. In-Memory Tree</h2>

<p>This design uses the same multi-version in-memory tree that LSM does. 
The tree structure may be located in heap or shared memory. If shared memory
is used then a separate file (*-shm2 or some such) is used for the data. The