Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Have the low-level b-tree insert routine return BT_BLOCKFULL if a level-0 tree is full. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
65642c32ba4415e535a639c71e99c95d |
User & Date: | dan 2013-11-26 20:35:48.418 |
Context
2013-11-27
| ||
14:40 | Allocate blocks of space (not individual pages) within the database file for sub-trees. check-in: 5986afca58 user: dan tags: trunk | |
2013-11-26
| ||
20:35 | Have the low-level b-tree insert routine return BT_BLOCKFULL if a level-0 tree is full. check-in: 65642c32ba user: dan tags: trunk | |
2013-11-25
| ||
20:50 | Begin adding code for blind-writes. check-in: fc9cdc6ca3 user: dan tags: trunk | |
Changes
Changes to src/btInt.h.
︙ | ︙ | |||
18 19 20 21 22 23 24 25 26 27 28 29 30 31 | typedef sqlite4_int64 i64; typedef sqlite4_uint64 u64; typedef unsigned int u32; typedef unsigned short u16; typedef unsigned char u8; /* Number of elements in an array object. */ #define array_size(x) (sizeof(x)/sizeof(x[0])) /* Number of read-lock slots in shared memory */ #define BT_NREADER 4 | > > > > > > > | 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | typedef sqlite4_int64 i64; typedef sqlite4_uint64 u64; typedef unsigned int u32; typedef unsigned short u16; typedef unsigned char u8; /* ** Special error codes used internally. These must be distinct from SQLite4 ** error codes. Which is easy - SQLite4 error codes are all greater than or ** equal to zero. */ #define BT_BLOCKFULL -1 /* Number of elements in an array object. */ #define array_size(x) (sizeof(x)/sizeof(x[0])) /* Number of read-lock slots in shared memory */ #define BT_NREADER 4 |
︙ | ︙ | |||
41 42 43 44 45 46 47 48 49 50 51 52 53 54 | /* By default blocks are 512K bytes in size. */ #define BT_DEFAULT_BLKSZ (512*1024) typedef struct BtDbHdr BtDbHdr; struct BtDbHdr { u32 pgsz; /* Page size in bytes */ u32 nPg; /* Size of database file in pages */ u32 iRoot; /* B-tree root page */ u32 iMRoot; /* Root page of meta-tree */ u32 iSRoot; /* Root page of schedule-tree */ u32 iSubRoot; /* Root of current sub-tree */ | > | 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | /* By default blocks are 512K bytes in size. */ #define BT_DEFAULT_BLKSZ (512*1024) typedef struct BtDbHdr BtDbHdr; struct BtDbHdr { u32 pgsz; /* Page size in bytes */ u32 blksz; /* Block size in bytes */ u32 nPg; /* Size of database file in pages */ u32 iRoot; /* B-tree root page */ u32 iMRoot; /* Root page of meta-tree */ u32 iSRoot; /* Root page of schedule-tree */ u32 iSubRoot; /* Root of current sub-tree */ |
︙ | ︙ |
Changes to src/bt_log.c.
︙ | ︙ | |||
863 864 865 866 867 868 869 870 871 872 873 874 875 876 | memcpy(&hdr, aData, sizeof(BtDbHdrCksum)); btLogChecksum32(1, (u8*)&hdr, offsetof(BtDbHdrCksum, aCksum), 0, aCksum); } if( aData==0 || aCksum[0]!=hdr.aCksum[0] || aCksum[1]!=hdr.aCksum[1] ){ memset(&hdr, 0, sizeof(BtDbHdrCksum)); hdr.hdr.pgsz = BT_DEFAULT_PGSZ; hdr.hdr.nPg = 2; hdr.hdr.iRoot = 2; } memcpy(pHdr, &hdr, sizeof(BtDbHdr)); } | > | 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 | memcpy(&hdr, aData, sizeof(BtDbHdrCksum)); btLogChecksum32(1, (u8*)&hdr, offsetof(BtDbHdrCksum, aCksum), 0, aCksum); } if( aData==0 || aCksum[0]!=hdr.aCksum[0] || aCksum[1]!=hdr.aCksum[1] ){ memset(&hdr, 0, sizeof(BtDbHdrCksum)); hdr.hdr.pgsz = BT_DEFAULT_PGSZ; hdr.hdr.blksz = BT_DEFAULT_BLKSZ; hdr.hdr.nPg = 2; hdr.hdr.iRoot = 2; } memcpy(pHdr, &hdr, sizeof(BtDbHdr)); } |
︙ | ︙ |
Changes to src/bt_main.c.
︙ | ︙ | |||
2430 2431 2432 2433 2434 2435 2436 | btPutU16(&aData[pgsz-6], iWrite); } }else{ /* The new entry will not fit on the leaf page. Entries will have ** to be shuffled between existing leaves and new leaves may need ** to be added to make space for it. */ | > > > > > > > > | | 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 | btPutU16(&aData[pgsz-6], iWrite); } }else{ /* The new entry will not fit on the leaf page. Entries will have ** to be shuffled between existing leaves and new leaves may need ** to be added to make space for it. */ bt_db *db = pCsr->base.pDb; if( db->bFastInsertOp ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); if( pHdr->nSubPg >= (pHdr->blksz / pHdr->pgsz) ){ rc = BT_BLOCKFULL; pHdr->iSubRoot = 0; } } if( rc==SQLITE4_OK && pCsr->nPg==1 ){ rc = btExtendTree(pCsr); } if( rc==SQLITE4_OK ){ rc = btBalance(pCsr, bLeaf, nKV, apKV); } } |
︙ | ︙ | |||
2535 2536 2537 2538 2539 2540 2541 | } } } return rc; } | > > | > > > > > > | | < > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | > > | > | | | < < < < < < | < < < < | < > > > > > > > | > > | | 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 | } } } return rc; } static int btFastInsertRoot(bt_db *db, BtDbHdr *pHdr, u32 *piRoot); static int btReplaceEntry( bt_db *db, /* Database handle */ u32 iRoot, /* Root page of b-tree to update */ const void *pK, int nK, /* Key to insert */ const void *pV, int nV /* Value to insert. (nV<0) -> delete */ ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); int rc; /* Return code */ BtCursor csr; /* Cursor object to seek to insert point */ u32 iRootPg = iRoot; if( rc==SQLITE4_OK && iRoot==0 ){ rc = btFastInsertRoot(db, pHdr, &iRootPg); } /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be ** stored on. */ btCsrSetup(db, iRootPg, &csr); rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE); if( rc==SQLITE4_OK ){ /* The cursor currently points to an entry with key pK/nK. This call ** should therefore replace that entry. So delete it and then re-seek ** the cursor. */ rc = sqlite4BtDelete(&csr.base); if( rc==SQLITE4_OK && nV>=0 ){ rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE); if( rc==SQLITE4_OK ) rc = btErrorBkpt(SQLITE4_CORRUPT); } } if( nV>=0 && (rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT ) ){ KeyValue kv; kv.pgno = 0; kv.eType = KV_VALUE; kv.pK = pK; kv.nK = nK; kv.pV = pV; kv.nV = nV; rc = btOverflowAssign(db, &kv); if( rc==SQLITE4_OK ){ do{ /* Insert the new KV pair into the current leaf. */ rc = btInsertAndBalance(&csr, 1, &kv); /* Unless this is a block-full error, break out of the loop */ if( rc!=BT_BLOCKFULL ) break; assert( iRoot==0 ); rc = btFastInsertRoot(db, pHdr, &iRootPg); if( rc==SQLITE4_OK ){ btCsrReset(&csr, 1); btCsrSetup(db, iRootPg, &csr); rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE); } }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT ); } if( kv.eType==KV_CELL ){ sqlite4_free(db->pEnv, (void*)kv.pV); } } btCsrReset(&csr, 1); return rc; } static int btAllocateNewRoot(bt_db *db, u32 *piNew){ u32 iNew = 0; BtPage *pPg; int rc; rc = btAllocateNonOverflow(db, &pPg); if( rc==SQLITE4_OK ){ iNew = sqlite4BtPagePgno(pPg); sqlite4BtPageRelease(pPg); } *piNew = iNew; return rc; } static int btDecodeMetatreeKey( BtCursor *pCsr, u32 *piAge, u32 *piLevel, u8 **paKey, int *pnKey ){ u8 *aK; int nK; int rc = sqlite4BtCsrKey((bt_cursor*)pCsr, (const void**)&aK, &nK); if( rc==SQLITE4_OK ){ *piAge = btGetU32(&aK[0]); *piLevel = ~btGetU32(&aK[4]); if( paKey ){ *paKey = &aK[8]; *pnKey = nK-8; } } return rc; } static int btFindNextLevel( bt_db *db, BtDbHdr *pHdr, u32 *piNext ){ int rc; BtCursor csr; btCsrSetup(db, pHdr->iMRoot, &csr); rc = btCsrEnd(&csr, 0); assert( rc!=SQLITE4_INEXACT ); *piNext = 0; if( rc==SQLITE4_OK ){ u32 iAge; u32 iLevel; rc = btDecodeMetatreeKey(&csr, &iAge, &iLevel, 0, 0); if( rc==SQLITE4_OK && iAge==0 ){ *piNext = iLevel+1; } }else if( rc==SQLITE4_NOTFOUND ){ rc = SQLITE4_OK; } btCsrReset(&csr, 1); return rc; } static int btFastInsertRoot( bt_db *db, BtDbHdr *pHdr, u32 *piRoot ){ int rc = SQLITE4_OK; u32 iSubRoot = 0; if( pHdr->iMRoot==0 ){ rc = btAllocateNewRoot(db, &pHdr->iMRoot); } iSubRoot = pHdr->iSubRoot; /* If no writable sub-tree has been discovered, create one now. */ if( iSubRoot==0 ){ u32 iLevel; rc = btFindNextLevel(db, pHdr, &iLevel); if( rc==SQLITE4_OK ){ rc = btAllocateNewRoot(db, &iSubRoot); } if( rc==SQLITE4_OK ){ u8 aKey[8]; u8 aVal[8]; pHdr->iSubRoot = iSubRoot; pHdr->nSubPg = 0; /* The key for the new entry consists of the concatentation of two ** 32-bit big-endian integers - the <age> and <level-no>. The age ** of the new segment is 0. The level number is one greater than the ** level number of the previous segment. */ btPutU32(&aKey[0], 0); btPutU32(&aKey[4], ~iLevel); btPutU32(&aVal[0], iSubRoot); btPutU32(&aVal[4], 1); rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 8); } } *piRoot = iSubRoot; return rc; } |
︙ | ︙ | |||
2690 2691 2692 2693 2694 2695 2696 | assert( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ); btCheckPageRefs(db); /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be ** stored on. */ if( rc==SQLITE4_OK ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); | | | < < | | 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 | assert( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ); btCheckPageRefs(db); /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be ** stored on. */ if( rc==SQLITE4_OK ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); u32 iRoot = 0; if( !db->bFastInsertOp ){ iRoot = pHdr->iRoot; } if( rc==SQLITE4_OK ){ rc = btReplaceEntry(db, iRoot, pK, nK, pV, nV); } } btCheckPageRefs(db); db->bFastInsertOp = 0; return rc; } |
︙ | ︙ |
Changes to src/bt_pager.c.
︙ | ︙ | |||
1088 1089 1090 1091 1092 1093 1094 | ** or SQLITE4_OK otherwise. */ int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){ BtDbHdr *pHdr = pPager->pHdr; int rc = SQLITE4_OK; sqlite4BtBufAppendf(pBuf, | | > | | 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 | ** or SQLITE4_OK otherwise. */ int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){ BtDbHdr *pHdr = pPager->pHdr; int rc = SQLITE4_OK; sqlite4BtBufAppendf(pBuf, "pgsz=%d blksz=%d nPg=%d" " iRoot=%d iMRoot=%d iSRoot=%d" " iCookie=%d iFreePg=%d iFreeBlk=%d", pHdr->pgsz, pHdr->blksz, pHdr->nPg, pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot, pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk ); return rc; } #ifndef NDEBUG |
︙ | ︙ |
Changes to www/bt.wiki.
︙ | ︙ | |||
630 631 632 633 634 635 636 637 638 639 640 641 642 643 | <li><p>Each segment is assigned to a level. Each level contains a set of non-overlapping segments. All segments in level N are newer than those in segment (N+1). <li><p>Segments are linked together using a tree-like data structure stored on regular database pages (mixed in with b-tree data pages). </ul> <h2 id=in-memory_tree>3.2. In-Memory Tree</h2> <p>This design uses the same multi-version in-memory tree that LSM does. The tree structure may be located in heap or shared memory. If shared memory is used then a separate file (*-shm2 or some such) is used for the data. The tree-header used by LSM to identify the latest tree version is combined with | > > > > > > > > > > > > > > > | 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 | <li><p>Each segment is assigned to a level. Each level contains a set of non-overlapping segments. All segments in level N are newer than those in segment (N+1). <li><p>Segments are linked together using a tree-like data structure stored on regular database pages (mixed in with b-tree data pages). </ul> <h3> Meta-Tree Format</h3> <p> The meta-tree is a b-tree stored in the same format as the main user b-tree. It serves as an index for all segments in the database. The keys in the meta-tree are formated as follows: <ul> <li> Segment age (big-endian 32-bit integer). <li> Level number (ones-compliment of value as a big-endian 32-bit integer). <li> Separator key. All keys within the segment are guaranteed to be as large or larger than the separator key. The separator key may be zero bytes in size. </ul> <h2 id=in-memory_tree>3.2. In-Memory Tree</h2> <p>This design uses the same multi-version in-memory tree that LSM does. The tree structure may be located in heap or shared memory. If shared memory is used then a separate file (*-shm2 or some such) is used for the data. The tree-header used by LSM to identify the latest tree version is combined with |
︙ | ︙ |