Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add scheduling of fast-insert merges to bt. Merges are not performed yet, just scheduled. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
590e0410b18526cb3d69afd4372fcddc |
User & Date: | dan 2013-12-14 18:59:18.588 |
Context
2014-01-02
| ||
18:53 | Changes to FiCursor object to support reading merged blocks. check-in: 27248a1ebc user: dan tags: trunk | |
2013-12-14
| ||
18:59 | Add scheduling of fast-insert merges to bt. Merges are not performed yet, just scheduled. check-in: 590e0410b1 user: dan tags: trunk | |
2013-12-07
| ||
20:38 | Add notes describing schedule pages to bt.wiki. check-in: 1df60437f3 user: dan tags: trunk | |
Changes
Changes to src/btInt.h.
︙ | ︙ | |||
86 87 88 89 90 91 92 93 94 95 96 97 98 99 | u32 nSubPg; /* Number of non-overflow pages in sub-tree */ u32 iCookie; /* Current value of schema cookie */ u32 iFreePg; /* First page in free-page list trunk */ u32 iFreeBlk; /* First page in free-block list trunk */ }; /************************************************************************* ** Interface to bt_pager.c functionality. */ typedef struct BtPage BtPage; typedef struct BtPager BtPager; /* | > > > > > > > > > > > > > > > > > > | 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | u32 nSubPg; /* Number of non-overflow pages in sub-tree */ u32 iCookie; /* Current value of schema cookie */ u32 iFreePg; /* First page in free-page list trunk */ u32 iFreeBlk; /* First page in free-block list trunk */ }; /* ** This struct defines the format of database "schedule" pages. */ typedef struct BtSchedule BtSchedule; struct BtSchedule { u32 bBusy; /* True if page is busy */ u32 iAge; /* Age of input segments */ u32 iMaxLevel; /* Maximum level of input segments */ u32 iOutLevel; /* Level at which to write output */ u32 aBlock[32]; /* Allocated blocks */ u32 iNextPg; /* Page that contains next input key */ u32 iNextCell; /* Cell that contains next input key */ u32 nUsed; /* Number of output blocks actually written */ u32 iFreeList; /* First page of new free-list (if any) */ }; /************************************************************************* ** Interface to bt_pager.c functionality. */ typedef struct BtPage BtPage; typedef struct BtPager BtPager; /* |
︙ | ︙ | |||
138 139 140 141 142 143 144 | int sqlite4BtPageRelease(BtPage*); void sqlite4BtPageReference(BtPage*); /* ** Allocate new database pages or blocks. */ int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage); | | | 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | int sqlite4BtPageRelease(BtPage*); void sqlite4BtPageReference(BtPage*); /* ** Allocate new database pages or blocks. */ int sqlite4BtPageAllocate(BtPager*, BtPage **ppPage); int sqlite4BtBlockAllocate(BtPager*, int nBlk, u32 *piBlk); /* ** Query page references. */ u32 sqlite4BtPagePgno(BtPage*); void *sqlite4BtPageData(BtPage*); |
︙ | ︙ |
Changes to src/bt_main.c.
︙ | ︙ | |||
32 33 34 35 36 37 38 39 40 41 42 43 44 45 | typedef struct BtCursor BtCursor; typedef struct FiCursor FiCursor; struct bt_db { sqlite4_env *pEnv; /* SQLite environment */ BtPager *pPager; /* Underlying page-based database */ bt_cursor *pAllCsr; /* List of all open cursors */ int bFastInsertOp; /* Set by CONTROL_FAST_INSERT_OP */ }; typedef struct BtOvfl BtOvfl; struct BtOvfl { int nKey; int nVal; | > > | 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | typedef struct BtCursor BtCursor; typedef struct FiCursor FiCursor; struct bt_db { sqlite4_env *pEnv; /* SQLite environment */ BtPager *pPager; /* Underlying page-based database */ bt_cursor *pAllCsr; /* List of all open cursors */ int nMinMerge; int nScheduleAlloc; int bFastInsertOp; /* Set by CONTROL_FAST_INSERT_OP */ }; typedef struct BtOvfl BtOvfl; struct BtOvfl { int nKey; int nVal; |
︙ | ︙ | |||
94 95 96 97 98 99 100 101 102 103 104 105 106 107 | struct FiCursor { bt_cursor base; /* Base cursor class */ int iBt; /* Current sub-tree (or -1 for EOF) */ int nBt; /* Number of entries in aBt[] array */ BtCursor *aBt; /* Array of sub-tree cursors */ }; #ifndef btErrorBkpt int btErrorBkpt(int rc){ static int error_cnt = 0; error_cnt++; return rc; } #endif | > > > > > | 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | struct FiCursor { bt_cursor base; /* Base cursor class */ int iBt; /* Current sub-tree (or -1 for EOF) */ int nBt; /* Number of entries in aBt[] array */ BtCursor *aBt; /* Array of sub-tree cursors */ }; /* ** Meta-table summary key. */ static const u8 aSummaryKey[] = {0xFF, 0xFF, 0xFF, 0xFF}; #ifndef btErrorBkpt int btErrorBkpt(int rc){ static int error_cnt = 0; error_cnt++; return rc; } #endif |
︙ | ︙ | |||
169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 | } #define btPutU32(x,y) sqlite4BtPutU32(x,y) /* ** Allocate a new database handle. */ int sqlite4BtNew(sqlite4_env *pEnv, int nExtra, bt_db **ppDb){ bt_db *db = 0; /* New database object */ BtPager *pPager = 0; /* Pager object for this database */ int nReq; /* Bytes of space required for bt_db object */ int rc; /* Return code */ nReq = sizeof(bt_db); rc = sqlite4BtPagerNew(pEnv, nExtra + nReq, &pPager); if( rc==SQLITE4_OK ){ db = (bt_db*)sqlite4BtPagerExtra(pPager); db->pPager = pPager; db->pEnv = pEnv; } *ppDb = db; return rc; } /* | > > > > > > | 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | } #define btPutU32(x,y) sqlite4BtPutU32(x,y) /* ** Allocate a new database handle. */ int sqlite4BtNew(sqlite4_env *pEnv, int nExtra, bt_db **ppDb){ static const int MIN_MERGE = 4; static const int SCHEDULE_ALLOC = 4; bt_db *db = 0; /* New database object */ BtPager *pPager = 0; /* Pager object for this database */ int nReq; /* Bytes of space required for bt_db object */ int rc; /* Return code */ nReq = sizeof(bt_db); rc = sqlite4BtPagerNew(pEnv, nExtra + nReq, &pPager); if( rc==SQLITE4_OK ){ db = (bt_db*)sqlite4BtPagerExtra(pPager); db->pPager = pPager; db->pEnv = pEnv; db->nMinMerge = MIN_MERGE; db->nScheduleAlloc = SCHEDULE_ALLOC; } *ppDb = db; return rc; } /* |
︙ | ︙ | |||
492 493 494 495 496 497 498 499 500 501 502 503 | static void btPageToAscii( u32 pgno, /* Page number */ int bAscii, /* True to print keys and values as ASCII */ BtPager *pPager, /* Pager object (or NULL) */ u8 *aData, int nData, /* Buffer containing page data */ sqlite4_buffer *pBuf /* Output buffer */ ){ int i; int nCell = (int)btCellCount(aData, nData); u8 flags = btFlags(aData); /* Page flags */ sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno); | > > > > > > > > > > > > > > > > > > > > > > > | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | > > > | | | | | | | | > | | | | | | | | | | | | > | 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | static void btPageToAscii( u32 pgno, /* Page number */ int bAscii, /* True to print keys and values as ASCII */ BtPager *pPager, /* Pager object (or NULL) */ u8 *aData, int nData, /* Buffer containing page data */ sqlite4_buffer *pBuf /* Output buffer */ ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(pPager); int i; int nCell = (int)btCellCount(aData, nData); u8 flags = btFlags(aData); /* Page flags */ sqlite4BtBufAppendf(pBuf, "Page %d: ", pgno); if( pgno==pHdr->iSRoot ){ int i; BtSchedule s; sqlite4BtBufAppendf(pBuf, "(schedule page) "); memcpy(&s, aData, sizeof(s)); sqlite4BtBufAppendf(pBuf, "bBusy=%d ", (int)s.bBusy); sqlite4BtBufAppendf(pBuf, "iAge=%d ", (int)s.iAge); sqlite4BtBufAppendf(pBuf, "iMaxLevel=%d ", (int)s.iMaxLevel); sqlite4BtBufAppendf(pBuf, "iOutLevel=%d ", (int)s.iOutLevel); sqlite4BtBufAppendf(pBuf, "blocks=("); for(i=0; s.aBlock[i] && i<array_size(s.aBlock); i++){ sqlite4BtBufAppendf(pBuf, "%s%d", i==0 ? "" : " ", (int)s.aBlock[i]); } sqlite4BtBufAppendf(pBuf, ") "); sqlite4BtBufAppendf(pBuf, "iNextPg=%d ", (int)s.iNextPg); sqlite4BtBufAppendf(pBuf, "iNextCell=%d ", (int)s.iNextCell); sqlite4BtBufAppendf(pBuf, "nUsed=%d ", (int)s.nUsed); sqlite4BtBufAppendf(pBuf, "iFreeList=%d", (int)s.iFreeList); }else{ sqlite4BtBufAppendf(pBuf, "nCell=%d ", nCell); sqlite4BtBufAppendf(pBuf, "iFree=%d ", (int)btFreeOffset(aData, nData)); sqlite4BtBufAppendf(pBuf, "flags=%d ", (int)btFlags(aData)); if( btFlags(aData) & BT_PGFLAGS_INTERNAL ){ sqlite4BtBufAppendf(pBuf, "rchild=%d ", (int)btGetU32(&aData[1])); } sqlite4BtBufAppendf(pBuf, "cell-offsets=("); for(i=0; i<nCell; i++){ u8 *ptr = btCellPtrFind(aData, nData, i); sqlite4BtBufAppendf(pBuf, "%s%d", i==0?"":" ", (int)btGetU16(ptr)); } sqlite4BtBufAppendf(pBuf, ")\n"); for(i=0; i<nCell; i++){ u8 *pCell; /* Cell i */ int nKey; /* Number of bytes of key to output */ u8 *pKey; /* Buffer containing key. */ int nVal; /* Number of bytes of value to output */ u8 *pVal = 0; /* Buffer containing value. */ pCell = btCellFind(aData, nData, i); sqlite4BtBufAppendf(pBuf, " Cell %d: ", i); pCell += sqlite4BtVarintGet32(pCell, &nKey); pKey = pCell; pCell += nKey; btBufferAppendBlob(pBuf, bAscii, pKey, nKey); if( flags & BT_PGFLAGS_INTERNAL ){ sqlite4BtBufAppendf(pBuf, " child=%d ", (int)btGetU32(pCell)); }else{ sqlite4BtBufAppendf(pBuf, " "); pCell += sqlite4BtVarintGet32(pCell, &nVal); if( nVal>=2 ){ nVal -= 2; pVal = pCell; }else{ sqlite4BtBufAppendf(pBuf, "delete-key"); } } if( pVal ){ btBufferAppendBlob(pBuf, bAscii, pVal, nVal); if( flags & BT_PGFLAGS_METATREE ){ /* Interpret the meta-tree entry */ if( nKey==sizeof(aSummaryKey) && 0==memcmp(pKey, aSummaryKey, nKey) ){ sqlite4BtBufAppendf(pBuf, " [summary]"); }else{ u32 iAge = btGetU32(&pKey[0]); u32 iLevel = ~btGetU32(&pKey[4]); u32 iBlk = btGetU32(pVal); sqlite4BtBufAppendf(pBuf, " [age=%d level=%d block=%d]", (int)iAge, (int)iLevel, (int)iBlk ); } } } sqlite4BtBufAppendf(pBuf, "\n"); } if( pPager && btFlags(aData) & BT_PGFLAGS_INTERNAL ){ for(i=0; i<btCellCount(aData, nData); i++){ BtPage *pChild; u8 *aChild; u32 child; child = btChildPgno(aData, nData, i); sqlite4BtPageGet(pPager, child, &pChild); aChild = sqlite4BtPageData(pChild); btPageToAscii(child, bAscii, pPager, aChild, nData, pBuf); sqlite4BtPageRelease(pChild); } } } sqlite4BtBufAppendf(pBuf, "\n"); } #ifndef NDEBUG #include <stdio.h> |
︙ | ︙ | |||
1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 | BtCursor *pSub = pCsr->aBt; /* Loop through the set of f-tree levels */ btCsrSetup(db, pHdr->iMRoot, &mcsr); for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){ const void *pV; int nV; /* Value associated with meta-tree entry */ u32 iBlk; /* Block containing sub-tree */ sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV); iBlk = sqlite4BtGetU32((const u8*)pV); btCsrReset(pSub, 1); btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub); rc = btCsrSeek(pSub, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK); if( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ){ | > > > > > > | 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 | BtCursor *pSub = pCsr->aBt; /* Loop through the set of f-tree levels */ btCsrSetup(db, pHdr->iMRoot, &mcsr); for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){ const void *pV; int nV; /* Value associated with meta-tree entry */ u32 iBlk; /* Block containing sub-tree */ btCsrKey(&mcsr, &pV, &nV); if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){ rc = SQLITE4_NOTFOUND; break; } sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV); iBlk = sqlite4BtGetU32((const u8*)pV); btCsrReset(pSub, 1); btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub); rc = btCsrSeek(pSub, pK, nK, BT_SEEK_EQ, BT_CSRSEEK_SEEK); if( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT ){ |
︙ | ︙ | |||
1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 | u32 iBlk; /* Block containing sub-tree */ BtCursor *pSub; /* Sub-cursor for this block */ rc = fiCsrAllocateSubs(db, pCsr, iSub+1); if( rc!=SQLITE4_OK ) break; pSub = &pCsr->aBt[iSub]; iSub++; sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV); iBlk = sqlite4BtGetU32((const u8*)pV); btCsrReset(pSub, 1); btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub); rc = btCsrSeek(pSub, pK, nK, eSeek, BT_CSRSEEK_SEEK); if( rc==SQLITE4_OK ){ bMatch = 1; }else if( rc==SQLITE4_INEXACT ){ bHit = 1; }else if( rc!=SQLITE4_NOTFOUND ){ break; } } assert( rc!=SQLITE4_OK && rc!=SQLITE4_INEXACT ); if( rc==SQLITE4_NOTFOUND ){ pCsr->base.flags |= (eSeek==BT_SEEK_GE ? CSR_NEXT_OK : CSR_PREV_OK); rc = fiCsrSetCurrent(pCsr); if( rc==SQLITE4_OK ){ BtCursor *pSub = &pCsr->aBt[pCsr->iBt]; if( fiCsrIsDelete(pSub) ){ | > > > > > > > | 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 | u32 iBlk; /* Block containing sub-tree */ BtCursor *pSub; /* Sub-cursor for this block */ rc = fiCsrAllocateSubs(db, pCsr, iSub+1); if( rc!=SQLITE4_OK ) break; pSub = &pCsr->aBt[iSub]; iSub++; btCsrKey(&mcsr, &pV, &nV); if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){ rc = SQLITE4_NOTFOUND; break; } sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV); iBlk = sqlite4BtGetU32((const u8*)pV); btCsrReset(pSub, 1); btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub); rc = btCsrSeek(pSub, pK, nK, eSeek, BT_CSRSEEK_SEEK); if( rc==SQLITE4_OK ){ bMatch = 1; }else if( rc==SQLITE4_INEXACT ){ bHit = 1; }else if( rc!=SQLITE4_NOTFOUND ){ break; } } assert( rc!=SQLITE4_OK && rc!=SQLITE4_INEXACT ); btCsrReset(&mcsr, 1); if( rc==SQLITE4_NOTFOUND ){ pCsr->base.flags |= (eSeek==BT_SEEK_GE ? CSR_NEXT_OK : CSR_PREV_OK); rc = fiCsrSetCurrent(pCsr); if( rc==SQLITE4_OK ){ BtCursor *pSub = &pCsr->aBt[pCsr->iBt]; if( fiCsrIsDelete(pSub) ){ |
︙ | ︙ | |||
1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 | /* Loop through the set of f-tree levels. Open a cursor on each. */ btCsrSetup(db, pHdr->iMRoot, &mcsr); for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){ const void *pV; int nV; /* Value associated with meta-tree entry */ u32 iBlk; /* Block containing sub-tree */ BtCursor *pSub; /* Sub-cursor for this block */ rc = fiCsrAllocateSubs(db, pCsr, iSub+1); if( rc!=SQLITE4_OK ) break; pSub = &pCsr->aBt[iSub]; iSub++; sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV); iBlk = sqlite4BtGetU32((const u8*)pV); btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub); rc = btCsrEnd(pSub, bLast); if( rc!=SQLITE4_OK && rc!=SQLITE4_NOTFOUND ) break; } if( rc==SQLITE4_NOTFOUND ){ pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK); pCsr->base.flags |= (bLast ? CSR_PREV_OK : CSR_NEXT_OK); rc = fiCsrSetCurrent(pCsr); if( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aBt[pCsr->iBt]) ){ rc = fiCsrStep(pCsr); | > > > > > > > | 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 | /* Loop through the set of f-tree levels. Open a cursor on each. */ btCsrSetup(db, pHdr->iMRoot, &mcsr); for(rc=btCsrEnd(&mcsr, 0); rc==SQLITE4_OK; rc=btCsrStep(&mcsr, 1)){ const void *pV; int nV; /* Value associated with meta-tree entry */ u32 iBlk; /* Block containing sub-tree */ BtCursor *pSub; /* Sub-cursor for this block */ btCsrKey(&mcsr, &pV, &nV); if( nV==sizeof(aSummaryKey) && 0==memcmp(pV, aSummaryKey, nV) ){ rc = SQLITE4_NOTFOUND; break; } rc = fiCsrAllocateSubs(db, pCsr, iSub+1); if( rc!=SQLITE4_OK ) break; pSub = &pCsr->aBt[iSub]; iSub++; sqlite4BtCsrData(&mcsr.base, 0, 4, &pV, &nV); iBlk = sqlite4BtGetU32((const u8*)pV); btCsrSetup(db, btBlockToRoot(pHdr, iBlk), pSub); rc = btCsrEnd(pSub, bLast); if( rc!=SQLITE4_OK && rc!=SQLITE4_NOTFOUND ) break; } btCsrReset(&mcsr, 1); if( rc==SQLITE4_NOTFOUND ){ pCsr->base.flags &= ~(CSR_NEXT_OK | CSR_PREV_OK); pCsr->base.flags |= (bLast ? CSR_PREV_OK : CSR_NEXT_OK); rc = fiCsrSetCurrent(pCsr); if( rc==SQLITE4_OK && fiCsrIsDelete(&pCsr->aBt[pCsr->iBt]) ){ rc = fiCsrStep(pCsr); |
︙ | ︙ | |||
3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 | } } return rc; } static int btFastInsertRoot(bt_db *db, BtDbHdr *pHdr, u32 *piRoot); static int btReplaceEntry( bt_db *db, /* Database handle */ u32 iRoot, /* Root page of b-tree to update */ const void *pK, int nK, /* Key to insert */ const void *pV, int nV /* Value to insert. (nV<0) -> delete */ ){ | > | 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 | } } return rc; } static int btFastInsertRoot(bt_db *db, BtDbHdr *pHdr, u32 *piRoot); static int btScheduleMerge(bt_db *db); static int btReplaceEntry( bt_db *db, /* Database handle */ u32 iRoot, /* Root page of b-tree to update */ const void *pK, int nK, /* Key to insert */ const void *pV, int nV /* Value to insert. (nV<0) -> delete */ ){ |
︙ | ︙ | |||
3087 3088 3089 3090 3091 3092 3093 | /* Insert the new KV pair into the current leaf. */ rc = btInsertAndBalance(&csr, 1, &kv); /* Unless this is a block-full error, break out of the loop */ if( rc!=BT_BLOCKFULL ) break; assert( iRoot==0 ); | > > > > | > | 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 | /* Insert the new KV pair into the current leaf. */ rc = btInsertAndBalance(&csr, 1, &kv); /* Unless this is a block-full error, break out of the loop */ if( rc!=BT_BLOCKFULL ) break; assert( iRoot==0 ); /* Try to schedule a merge operation */ rc = btScheduleMerge(db); if( rc==SQLITE4_OK ){ rc = btFastInsertRoot(db, pHdr, &iRootPg); } if( rc==SQLITE4_OK ){ btCsrReset(&csr, 1); btCsrSetup(db, iRootPg, &csr); rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE); } }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT ); } |
︙ | ︙ | |||
3144 3145 3146 3147 3148 3149 3150 | *paKey = &aK[8]; *pnKey = nK-8; } } return rc; } | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > | | > > > | | > > > > | > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > | > > | > > > > > > > | > > > > > | | > > > | > > > > | > > > | > > > | > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > | > > > > | > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | < < < < < > | 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 | *paKey = &aK[8]; *pnKey = nK-8; } } return rc; } static void *btMalloc(bt_db *db, int nByte, int *pRc){ void *pRet = 0; if( *pRc==SQLITE4_OK ){ pRet = sqlite4_malloc(db->pEnv, nByte); if( !pRet ) *pRc = btErrorBkpt(SQLITE4_NOMEM); } return pRet; } static void btFree(bt_db *db, void *p){ sqlite4_free(db->pEnv, p); } static int fiLoadSummary( bt_db *db, BtCursor *p, const u8 **paSummary, int *pnSummary ){ static const u8 aZero[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); int rc; btCsrSetup(db, pHdr->iMRoot, p); rc = btCsrSeek( p, aSummaryKey, sizeof(aSummaryKey), BT_SEEK_EQ, BT_CSRSEEK_SEEK ); if( rc==SQLITE4_OK ){ rc = btCsrData(p, 0, -1, (const void**)paSummary, pnSummary); }else if( rc==SQLITE4_NOTFOUND ){ *paSummary = aZero; *pnSummary = sizeof(aZero); rc = SQLITE4_OK; } return rc; } static void btReadSummary( const u8 *aSum, int iAge, u16 *piMinLevel, u16 *pnLevel, u16 *piMergeLevel ){ *piMinLevel = btGetU16(&aSum[iAge * 6]); *pnLevel = btGetU16(&aSum[iAge * 6 + 2]); *piMergeLevel = btGetU16(&aSum[iAge * 6 + 4]); } static void btWriteSummary( u8 *aSum, int iAge, u16 iMinLevel, u16 nLevel, u16 iMergeLevel ){ btPutU16(&aSum[iAge * 6], iMinLevel); btPutU16(&aSum[iAge * 6 + 2], nLevel); btPutU16(&aSum[iAge * 6 + 4], iMergeLevel); } /* ** Allocate a new level for a new age=0 segment. The new level is always ** one greater than the current largest age=0 level number. */ static int btAllocateNewLevel( bt_db *db, BtDbHdr *pHdr, u32 *piNext ){ int rc; /* Return code */ BtCursor csr; /* Cursor to read meta-table summary */ int nByte; /* Size of buffer aByte[] */ const u8 *aByte; /* Summary data */ u8 *aNew; /* New summary data */ rc = fiLoadSummary(db, &csr, &aByte, &nByte); aNew = (u8*)btMalloc(db, nByte, &rc); if( rc==SQLITE4_OK ){ u16 iMin, nLevel, iMerge; btReadSummary(aByte, 0, &iMin, &nLevel, &iMerge); memcpy(aNew, aByte, nByte); btPutU16(&aNew[2], nLevel+1); rc = btReplaceEntry( db, pHdr->iMRoot, aSummaryKey, sizeof(aSummaryKey), aNew, nByte ); btFree(db, aNew); *piNext = (iMin + nLevel); } btCsrReset(&csr, 1); return rc; } static void btWriteSchedule(BtPage *pPg, BtSchedule *p, int *pRc){ if( *pRc==SQLITE4_OK ){ int rc = sqlite4BtPageWrite(pPg); if( rc==SQLITE4_OK ){ /* TODO: convert values to big-endian format */ u8 *aData = sqlite4BtPageData(pPg); memcpy(aData, p, sizeof(BtSchedule)); } *pRc = rc; } } static int btReadSchedule(bt_db *db, BtPage *pPg, BtSchedule *p){ /* TODO: convert values to big-endian format */ u8 *aData = sqlite4BtPageData(pPg); memcpy(p, aData, sizeof(BtSchedule)); return SQLITE4_OK; } static int btAllocateBlock( bt_db *db, int nBlk, u32 *aiBlk ){ return sqlite4BtBlockAllocate(db->pPager, nBlk, aiBlk); } /* ** This is a helper function for btScheduleMerge(). It determines the ** age and range of levels to be used as inputs by the merge (if any). */ static int btFindMerge( bt_db *db, /* Database handle */ u32 *piAge, /* OUT: Age of input segments to merge */ u32 *piMaxLevel, /* OUT: Maximum input level value */ u32 *piOutLevel /* OUT: Output level value */ ){ BtCursor csr; /* Cursor used to read summary record */ int rc; /* Return code */ const u8 *aSum; int nSum; rc = fiLoadSummary(db, &csr, &aSum, &nSum); if( rc==SQLITE4_OK ){ int iAge; int iBestAge = -1; /* Best age to merge levels from */ int nBest = (db->nMinMerge-1);/* Number of levels merged at iBestAge */ u16 iMin, nLevel, iMerge; /* Summary of current age */ rc = SQLITE4_NOTFOUND; for(iAge=0; iAge<(nSum/3); iAge++){ btReadSummary(aSum, iAge, &iMin, &nLevel, &iMerge); if( iMerge ){ int n = 1 + (iMerge-iMin); if( n>nBest ){ *piMaxLevel = iMerge; *piAge = iAge; btReadSummary(aSum, iAge+1, &iMin, &nLevel, &iMerge); *piOutLevel = (iMin + nLevel - 1); rc = SQLITE4_OK; } break; }else{ if( nLevel>nBest ){ iBestAge = iAge; nBest = nLevel; } } } if( rc==SQLITE4_NOTFOUND && iBestAge>=0 ){ u8 *aNew; int nByte = nSum; if( iAge+1>=(nSum/3) ) nByte += 6; rc = SQLITE4_OK; aNew = (u8*)btMalloc(db, nByte, &rc); if( rc==SQLITE4_OK ){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); /* Create a copy of the summary record */ memcpy(aNew, aSum, nSum); if( nByte>nSum ) btWriteSummary(aNew, iBestAge+1, 0, 0, 0); /* Find the input age and maximum level */ btReadSummary(aSum, iBestAge, &iMin, &nLevel, &iMerge); *piMaxLevel = (u32)(iMin + nLevel - 1); *piAge = iBestAge; /* Find the output level */ btReadSummary(aSum, iBestAge+1, &iMin, &nLevel, &iMerge); *piOutLevel = iMin + nLevel; /* Update the summary record for the output segment. */ btWriteSummary(aNew, iBestAge+1, iMin, nLevel+1, iMerge); /* Write the updated summary record back to the db. */ rc = btReplaceEntry( db, pHdr->iMRoot, aSummaryKey, sizeof(aSummaryKey), aNew, nByte ); } } } btCsrReset(&csr, 1); return rc; } /* ** If possible, schedule a merge operation. ** ** The merge operation is selected based on the following criteria: ** ** * The more levels involved in the merge the better, and ** * It is better to merge younger segments than older ones. */ static int btScheduleMerge(bt_db *db){ BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager); BtPage *pPg = 0; /* Schedule page */ u8 *aData; /* Schedule page data */ int rc; u32 iAge; u32 iLvl; u32 iOutLvl; /* Find the schedule page. If there is no schedule page, allocate it now. */ if( pHdr->iSRoot==0 ){ rc = sqlite4BtPageAllocate(db->pPager, &pPg); if( rc==SQLITE4_OK ){ u8 *aData = sqlite4BtPageData(pPg); memset(aData, 0, pHdr->pgsz); pHdr->iSRoot = sqlite4BtPagePgno(pPg); } }else{ rc = sqlite4BtPageGet(db->pPager, pHdr->iSRoot, &pPg); } /* Check if the schedule page is busy. If so, no new merge may be ** scheduled. If the schedule page is not busy, call btFindMerge() to ** figure out which levels should be scheduled for merge. */ if( rc==SQLITE4_OK ){ aData = sqlite4BtPageData(pPg); if( btGetU32(aData) ){ rc = SQLITE4_NOTFOUND; }else{ rc = btFindMerge(db, &iAge, &iLvl, &iOutLvl); } } if( rc==SQLITE4_OK ){ BtSchedule s; memset(&s, 0, sizeof(BtSchedule)); s.bBusy = 1; s.iAge = iAge; s.iMaxLevel = iLvl; s.iOutLevel = iOutLvl; rc = btAllocateBlock(db, db->nScheduleAlloc, s.aBlock); btWriteSchedule(pPg, &s, &rc); } sqlite4BtPageRelease(pPg); if( rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK; return rc; } static int btFastInsertRoot( bt_db *db, BtDbHdr *pHdr, u32 *piRoot ){ int rc = SQLITE4_OK; assert( db->bFastInsertOp ); db->bFastInsertOp = 0; /* If the meta-tree has not been created, create it now. */ if( pHdr->iMRoot==0 ){ rc = btAllocateNewRoot(db, BT_PGFLAGS_METATREE, &pHdr->iMRoot); } /* If no writable sub-tree current exists, create one */ if( rc==SQLITE4_OK && pHdr->iSubBlock==0 ){ u32 iLevel; /* Level number for new sub-tree */ u32 iSubBlock; /* New block */ rc = btAllocateNewLevel(db, pHdr, &iLevel); if( rc==SQLITE4_OK ){ rc = btAllocateBlock(db, 1, &iSubBlock); } if( rc==SQLITE4_OK ){ u8 aKey[8]; u8 aVal[4]; pHdr->iSubBlock = iSubBlock; pHdr->nSubPg = 1; /* Root page is automatically allocated */ /* The key for the new entry consists of the concatentation of two ** 32-bit big-endian integers - the <age> and <level-no>. The age ** of the new segment is 0. The level number is one greater than the ** level number of the previous segment. */ btPutU32(&aKey[0], 0); btPutU32(&aKey[4], ~iLevel); btPutU32(&aVal[0], iSubBlock); rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 4); } } if( rc==SQLITE4_OK ){ *piRoot = btBlockToRoot(pHdr, pHdr->iSubBlock); } db->bFastInsertOp = 1; return rc; } /* ** Insert a new key/value pair or replace an existing one. ** ** This function may modify either the b-tree or fast-insert-tree, depending |
︙ | ︙ | |||
3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 | case BT_INFO_PAGEDUMP: { int iCtx; /* ControlTransaction() context */ rc = btControlTransaction(db, &iCtx); if( rc==SQLITE4_OK ){ BtPage *pPg = 0; rc = sqlite4BtPageGet(db->pPager, pInfo->pgno, &pPg); if( rc==SQLITE4_OK ){ int bAscii = (pInfo->eType==BT_INFO_PAGEDUMP_ASCII); u8 *aData; int nData; aData = sqlite4BtPageData(pPg); | > | | | 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 | case BT_INFO_PAGEDUMP: { int iCtx; /* ControlTransaction() context */ rc = btControlTransaction(db, &iCtx); if( rc==SQLITE4_OK ){ BtPage *pPg = 0; rc = sqlite4BtPageGet(db->pPager, pInfo->pgno, &pPg); if( rc==SQLITE4_OK ){ BtPager *p = db->pPager; int bAscii = (pInfo->eType==BT_INFO_PAGEDUMP_ASCII); u8 *aData; int nData; aData = sqlite4BtPageData(pPg); nData = sqlite4BtPagerPagesize(p); btPageToAscii(pInfo->pgno, bAscii, p, aData, nData, &pInfo->output); sqlite4_buffer_append(&pInfo->output, "", 1); sqlite4BtPageRelease(pPg); } btControlTransactionDone(db, iCtx); } break; } |
︙ | ︙ |
Changes to src/bt_pager.c.
︙ | ︙ | |||
977 978 979 980 981 982 983 | fprintf(stderr, "allocated page %d\n", pgno); #endif *ppPg = pPg; return rc; } | | > > > | | | | | | | | | | | | | > | 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 | fprintf(stderr, "allocated page %d\n", pgno); #endif *ppPg = pPg; return rc; } int sqlite4BtBlockAllocate(BtPager *p, int nBlk, u32 *aiBlk){ int rc = SQLITE4_OK; BtDbHdr *pHdr = p->pHdr; int nPgPerBlk = (pHdr->blksz / pHdr->pgsz); int i; for(i=0; i<nBlk; i++){ u32 iBlk; u32 iRoot; u32 iFree; /* Figure out the next block in the file. And its root (first) page. */ iBlk = 1 + (pHdr->nPg + nPgPerBlk - 1) / nPgPerBlk; iRoot = (iBlk-1) * nPgPerBlk + 1; assert( iBlk>0 ); for(iFree = pHdr->nPg+1; rc==SQLITE4_OK && iFree<iRoot; iFree++){ rc = sqlite4BtPageTrimPgno(p, iFree); } if( rc==SQLITE4_OK ){ pHdr->nPg = iBlk * nPgPerBlk; aiBlk[i] = iBlk; } } return rc; } /* ** Return the current page number of the argument page reference. |
︙ | ︙ |
Changes to www/bt.wiki.
︙ | ︙ | |||
647 648 649 650 651 652 653 654 655 656 657 658 659 660 | <li> Segment age (big-endian 32-bit integer). <li> Level number (ones-compliment of value as a big-endian 32-bit integer). <li> Separator key. All keys within the segment are guaranteed to be as large or larger than the separator key. The separator key may be zero bytes in size. </ul> <h3> Schedule-Page Format</h3> <p>We might need more than one schedule page. Perhaps anyway... But say worry about this later on. <p>The writer adds an entry to the schedule-tree. Schedule-tree entries are always appended to the tree - the integer key is one greater than the | > > > > > > > > > > > > | 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 | <li> Segment age (big-endian 32-bit integer). <li> Level number (ones-compliment of value as a big-endian 32-bit integer). <li> Separator key. All keys within the segment are guaranteed to be as large or larger than the separator key. The separator key may be zero bytes in size. </ul> <p> As well as the records that index all segments in the db, the meta-table contains a single summary record. The key is four 0xFF bytes. <p>For each age between 0 and the largest age in the db, inclusive: * current minimum level number (16-bits). * total number of levels. * maximum level in active merge (16-bits). Store this as a separate record in the meta-table. Even if there segments with age=31 in the db, this is still only 6*32=192 bytes. <h3> Schedule-Page Format</h3> <p>We might need more than one schedule page. Perhaps anyway... But say worry about this later on. <p>The writer adds an entry to the schedule-tree. Schedule-tree entries are always appended to the tree - the integer key is one greater than the |
︙ | ︙ | |||
675 676 677 678 679 680 681 | <li><p> A flag to indicate that the merge has been performed. <li><p> A bitmask (or some other indication) indicating which of the output blocks were populated. <li><p> A pointer to a list of overflow pages that were freed, if any. | | > > > > > > > > > | > > > > > > > > > > | 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 | <li><p> A flag to indicate that the merge has been performed. <li><p> A bitmask (or some other indication) indicating which of the output blocks were populated. <li><p> A pointer to a list of overflow pages that were freed, if any. <li><p> A pointer to the next key read from the input levels, if any. </ul> <p>Note the implication here: A writer may only modify the page if it currently resides in the db file (not the log). The writer can tell if the schedule page resides in the db file or the log by checking the "merge performed" flag. <p>The page is organized as follows: <ul> <li> Magic number used to identify the state of the page (big-endian u32): 0x6a846607 if the merge has not yet been performed or 0x1a468c33 if it has. <li> Age of input segments (big-endian u32). <li> Max level of input segments (bit-endian u32). All segments of the specified age with a level equal to or less than this are considered inputs. <li> Level of output segments. <li> Number of allocated output blocks (big-endian u32) - <i>nBlock</i>. Maximum is (say) 32. <li> Block number of each allocated block (array of <i>nBlock</i> big-endian u32 fields). </ul> <p>Then, after the merge has taken place: <ul> <li> Page number of input page containing next key to read. <li> Cell number of next key to read within the identified page. <li> Number of input blocks used (input blocks are always used in the order they are specified). <li> First page of freed-page list. </ul> <h2 id=in-memory_tree>3.2. In-Memory Tree</h2> <p>This design uses the same multi-version in-memory tree that LSM does. The tree structure may be located in heap or shared memory. If shared memory is used then a separate file (*-shm2 or some such) is used for the data. The |
︙ | ︙ |