Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add notes describing schedule pages to bt.wiki. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
1df60437f3089489ca0d374bc69bf45c |
User & Date: | dan 2013-12-07 20:38:27.985 |
Context
2013-12-14
| ||
18:59 | Add scheduling of fast-insert merges to bt. Merges are not performed yet, just scheduled. check-in: 590e0410b1 user: dan tags: trunk | |
2013-12-07
| ||
20:38 | Add notes describing schedule pages to bt.wiki. check-in: 1df60437f3 user: dan tags: trunk | |
2013-12-06
| ||
20:11 | Fix problems with delete markers and range scans. check-in: ccf5a6bb6a user: dan tags: trunk | |
Changes
Changes to src/bt_main.c.
︙ | ︙ | |||
20 21 22 23 24 25 26 27 28 29 30 31 32 33 | #define BT_MAX_DIRECT_OVERFLOW 8 /* Maximum direct overflow pages per cell */ /* ** Values that make up the single byte flags field at the start of ** b-tree pages. */ #define BT_PGFLAGS_INTERNAL 0x01 /* True for non-leaf nodes */ /* #define BT_STDERR_DEBUG 1 */ typedef struct BtCursor BtCursor; typedef struct FiCursor FiCursor; struct bt_db { | > > | 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | #define BT_MAX_DIRECT_OVERFLOW 8 /* Maximum direct overflow pages per cell */ /* ** Values that make up the single byte flags field at the start of ** b-tree pages. */ #define BT_PGFLAGS_INTERNAL 0x01 /* True for non-leaf nodes */ #define BT_PGFLAGS_METATREE 0x02 /* True for a meta-tree page */ #define BT_PGFLAGS_SCHEDULE 0x04 /* True for a schedule-tree page */ /* #define BT_STDERR_DEBUG 1 */ typedef struct BtCursor BtCursor; typedef struct FiCursor FiCursor; struct bt_db { |
︙ | ︙ | |||
492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 | int bAscii, /* True to print keys and values as ASCII */ BtPager *pPager, /* Pager object (or NULL) */ u8 *aData, int nData, /* Buffer containing page data */ sqlite4_buffer *pBuf /* Output buffer */ ){ int i; int nCell = (int)btCellCount(aData, nData); sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno); sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell); sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData)); sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData)); if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){ sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1])); } sqlite4BtBufAppendf(pBuf, "cell-offsets=("); for(i=0; i<nCell; i++){ u8 *ptr = btCellPtrFind(aData, nData, i); sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr)); } sqlite4BtBufAppendf(pBuf, ")\n"); for(i=0; i<nCell; i++){ | > > | > > > | | | > | < > | > > > > > > > > > > > > | 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 | int bAscii, /* True to print keys and values as ASCII */ BtPager *pPager, /* Pager object (or NULL) */ u8 *aData, int nData, /* Buffer containing page data */ sqlite4_buffer *pBuf /* Output buffer */ ){ int i; int nCell = (int)btCellCount(aData, nData); u8 flags = btFlags(aData); /* Page flags */ sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno); sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell); sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData)); sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData)); if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){ sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1])); } sqlite4BtBufAppendf(pBuf, "cell-offsets=("); for(i=0; i<nCell; i++){ u8 *ptr = btCellPtrFind(aData, nData, i); sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr)); } sqlite4BtBufAppendf(pBuf, ")\n"); for(i=0; i<nCell; i++){ u8 *pCell; /* Cell i */ int nKey; /* Number of bytes of key to output */ u8 *pKey; /* Buffer containing key. */ int nVal; /* Number of bytes of value to output */ u8 *pVal = 0; /* Buffer containing value. */ pCell = btCellFind(aData, nData, i); sqlite4BtBufAppendf(pBuf, " Cell %d: ", i); pCell += sqlite4BtVarintGet32(pCell, &nKey); pKey = pCell; pCell += nKey; btBufferAppendBlob(pBuf, bAscii, pKey, nKey); if( flags & BT_PGFLAGS_INTERNAL ){ sqlite4BtBufAppendf(pBuf, " child=%d ", (int)btGetU32(pCell)); }else{ sqlite4BtBufAppendf(pBuf, " "); pCell += sqlite4BtVarintGet32(pCell, &nVal); if( nVal>=2 ){ nVal -= 2; pVal = pCell; }else{ sqlite4BtBufAppendf(pBuf, "delete-key"); } } if( pVal ){ btBufferAppendBlob(pBuf, bAscii, pVal, nVal); if( flags & BT_PGFLAGS_METATREE ){ /* Interpret the meta-tree entry */ u32 iAge = btGetU32(&pKey[0]); u32 iLevel = ~btGetU32(&pKey[4]); u32 iBlk = btGetU32(pVal); sqlite4BtBufAppendf(pBuf, " [age=%d level=%d block=%d]", (int)iAge, (int)iLevel, (int)iBlk ); } } sqlite4BtBufAppendf(pBuf, "\n"); } if( pPager && btFlags(aData) & BT_PGFLAGS_INTERNAL ){ for(i=0; i<btCellCount(aData, nData); i++){ BtPage *pChild; |
︙ | ︙ | |||
3086 3087 3088 3089 3090 3091 3092 | } } btCsrReset(&csr, 1); return rc; } | | > > > | 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 | } } btCsrReset(&csr, 1); return rc; } static int btAllocateNewRoot(bt_db *db, int flag, u32 *piNew){ u32 iNew = 0; BtPage *pPg; int rc; assert( flag==BT_PGFLAGS_METATREE || flag==BT_PGFLAGS_SCHEDULE || flag==0 ); rc = sqlite4BtPageAllocate(db->pPager, &pPg); if( rc==SQLITE4_OK ){ u8 *aData = sqlite4BtPageData(pPg); aData[0] = (flag & 0xFF); iNew = sqlite4BtPagePgno(pPg); sqlite4BtPageRelease(pPg); } *piNew = iNew; return rc; } |
︙ | ︙ | |||
3162 3163 3164 3165 3166 3167 3168 | BtDbHdr *pHdr, u32 *piRoot ){ int rc = SQLITE4_OK; /* If the meta-tree has not been created, create it now. */ if( pHdr->iMRoot==0 ){ | | | 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 | BtDbHdr *pHdr, u32 *piRoot ){ int rc = SQLITE4_OK; /* If the meta-tree has not been created, create it now. */ if( pHdr->iMRoot==0 ){ rc = btAllocateNewRoot(db, BT_PGFLAGS_METATREE, &pHdr->iMRoot); } /* If no writable sub-tree has been discovered, create one now. */ if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){ u32 iLevel; /* Level number for new sub-tree */ u32 iSubBlock; /* New block */ |
︙ | ︙ |
Changes to www/bt.wiki.
︙ | ︙ | |||
646 647 648 649 650 651 652 653 654 655 656 657 658 659 | <ul> <li> Segment age (big-endian 32-bit integer). <li> Level number (ones-compliment of value as a big-endian 32-bit integer). <li> Separator key. All keys within the segment are guaranteed to be as large or larger than the separator key. The separator key may be zero bytes in size. </ul> <h2 id=in-memory_tree>3.2. In-Memory Tree</h2> <p>This design uses the same multi-version in-memory tree that LSM does. The tree structure may be located in heap or shared memory. If shared memory is used then a separate file (*-shm2 or some such) is used for the data. The | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 | <ul> <li> Segment age (big-endian 32-bit integer). <li> Level number (ones-compliment of value as a big-endian 32-bit integer). <li> Separator key. All keys within the segment are guaranteed to be as large or larger than the separator key. The separator key may be zero bytes in size. </ul> <h3> Schedule-Page Format</h3> <p>We might need more than one schedule page. Perhaps anyway... But say worry about this later on. <p>The writer adds an entry to the schedule-tree. Schedule-tree entries are always appended to the tree - the integer key is one greater than the previous greatest key. <p>The entry identifies the following: <ul> <li><p> the inputs to the merge (age + level range). <li><p> space to write the output (allocated block numbers). </ul> <p>As well as the above, the entry contains space allocated for output fields. Output fields are populated by the checkpointer as it copies the page image from the log to the database file. Output fields are: <ul> <li><p> A flag to indicate that the merge has been performed. <li><p> A bitmask (or some other indication) indicating which of the output blocks were populated. <li><p> A pointer to a list of overflow pages that were freed, if any. <li><p> A pointer to the last key read from the input. </ul> <p>Note the implication here: A writer may only modify the page if it currently resides in the db file (not the log). The writer can tell if the schedule page resides in the db file or the log by checking the "merge performed" flag. <ul> <li> Age of input segments (big-endian u32). <li> Max level of input segments (bit-endian u32). All segments of the specified age with a level equal to or less than this are considered inputs. <li> Number of allocated output blocks (big-endian u32) - <i>nBlock</i>. Maximum is (say) 32. <li> Block number of each allocated block (<i>nBlock</i> big-endian u32 fields). </ul> <h2 id=in-memory_tree>3.2. In-Memory Tree</h2> <p>This design uses the same multi-version in-memory tree that LSM does. The tree structure may be located in heap or shared memory. If shared memory is used then a separate file (*-shm2 or some such) is used for the data. The |
︙ | ︙ |