Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Fixes for b-tree balancing routines. Still incomplete. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
9e8d7525d89a278a7b612121f86ed6f3 |
User & Date: | dan 2013-09-28 11:23:52.880 |
Context
2013-09-28
| ||
15:07 | Still further progress on the same. check-in: 4431ab2769 user: dan tags: trunk | |
11:23 | Fixes for b-tree balancing routines. Still incomplete. check-in: 9e8d7525d8 user: dan tags: trunk | |
2013-09-27
| ||
20:25 | Begin adding routines for b-tree balancing and so on. Incomplete. check-in: 3003f1d99a user: dan tags: trunk | |
Changes
Changes to src/bt_main.c.
︙ | ︙ | |||
144 145 146 147 148 149 150 | static void btCsrSetup(bt_db *db, bt_cursor *pCsr){ memset(pCsr, 0, sizeof(bt_cursor)); pCsr->pDb = db; } int sqlite4BtCsrOpen(bt_db *db, int nExtra, bt_cursor **ppCsr){ | | | 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 | static void btCsrSetup(bt_db *db, bt_cursor *pCsr){ memset(pCsr, 0, sizeof(bt_cursor)); pCsr->pDb = db; } int sqlite4BtCsrOpen(bt_db *db, int nExtra, bt_cursor **ppCsr){ int rc = SQLITE4_OK; /* Return Code */ int nByte; /* Total bytes of space to allocate */ bt_cursor *pCsr; /* New cursor object */ nByte = sizeof(bt_cursor) + nExtra; *ppCsr = pCsr = (bt_cursor*)sqlite4_malloc(db->pEnv, nByte); if( pCsr==0 ){ rc = btErrorBkpt(SQLITE4_NOMEM); |
︙ | ︙ | |||
270 271 272 273 274 275 276 | } /* **************************************************************************/ #ifndef NDEBUG #include <stdio.h> | | > | | | | | > | | 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 | } /* **************************************************************************/ #ifndef NDEBUG #include <stdio.h> static void printPage(FILE *f, u32 pgno, u8 *aData, int nData){ int i; int nCell = (int)btCellCount(aData, nData); fprintf(f, "Page %d: ", pgno); fprintf(f, "nCell=%d ", nCell); fprintf(f, "iFree=%d ", (int)btFreeOffset(aData, nData)); fprintf(f, "flags=%d ", (int)btFlags(aData, nData)); fprintf(f, "cell-offsets=("); for(i=0; i<nCell; i++){ u8 *ptr = btCellPtrFind(aData, nData, i); fprintf(f, "%s%d", i==0?"":" ", (int)btGetU16(ptr)); } fprintf(f, ")\n"); } #endif /* ** This function compares the key passed via parameters pK and nK to the ** key stored as part of cell iCell on the database page stored in buffer |
︙ | ︙ | |||
386 387 388 389 390 391 392 | pgno = btGetU32(&aData[1]); }else{ u8 *pCell; int nByte; pCell = btCellFind(aData, pgsz, 0); pCell += sqlite4BtVarintGet32(pCell, &nByte); pCell += nByte; | | | 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 | pgno = btGetU32(&aData[1]); }else{ u8 *pCell; int nByte; pCell = btCellFind(aData, pgsz, 0); pCell += sqlite4BtVarintGet32(pCell, &nByte); pCell += nByte; pgno = btGetU32(pCell); } }else{ pgno = 0; if( res!=0 ){ assert( BT_SEEK_LEFAST<0 && BT_SEEK_LE<0 ); if( eSeek<0 ){ |
︙ | ︙ | |||
617 618 619 620 621 622 623 624 625 626 627 628 629 630 | */ static int btNewBuffer(bt_db *pDb, u8 **paBuf){ const int pgsz = sqlite4BtPagerPagesize(pDb->pPager); u8 *aBuf; *paBuf = aBuf = sqlite4_malloc(pDb->pEnv, pgsz); if( aBuf==0 ) return btErrorBkpt(SQLITE4_NOMEM); return SQLITE4_OK; } /* ** Discard a page buffer allocated using btNewBuffer. */ static void btFreeBuffer(bt_db *pDb, u8 *aBuf){ | > > > > > > > | 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 | */ static int btNewBuffer(bt_db *pDb, u8 **paBuf){ const int pgsz = sqlite4BtPagerPagesize(pDb->pPager); u8 *aBuf; *paBuf = aBuf = sqlite4_malloc(pDb->pEnv, pgsz); if( aBuf==0 ) return btErrorBkpt(SQLITE4_NOMEM); #ifndef NDEBUG /* This stops valgrind from complaining about unitialized bytes when (if) ** this buffer is eventually written out to disk. */ memset(aBuf, 0x66, pgsz); #endif return SQLITE4_OK; } /* ** Discard a page buffer allocated using btNewBuffer. */ static void btFreeBuffer(bt_db *pDb, u8 *aBuf){ |
︙ | ︙ | |||
786 787 788 789 790 791 792 | const int pgsz = sqlite4BtPagerPagesize(pDb->pPager); int rc; rc = sqlite4BtPageWrite(pPg); if( rc==SQLITE4_OK ){ u8 *aData = sqlite4BtPageData(pPg); int nCell = btCellCount(aData, pgsz); | < > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 | const int pgsz = sqlite4BtPagerPagesize(pDb->pPager); int rc; rc = sqlite4BtPageWrite(pPg); if( rc==SQLITE4_OK ){ u8 *aData = sqlite4BtPageData(pPg); int nCell = btCellCount(aData, pgsz); if( iChild>=nCell ){ btPutU32(&aData[1], pgno); }else{ int nKey; u8 *pCell = btCellFind(aData, pgsz, iChild); pCell += sqlite4BtVarintGet32(pCell, &nKey); pCell += nKey; btPutU32(pCell, pgno); } } return rc; } /* Called recursively by btInsertAndRebalance(). todo: Fix this! */ static int btModifyPage(bt_cursor *, int, int, KeyValue *); #if 0 /* ** An instance of this structure describes all the cells that will be ** redistributed by a single btRebalance() operation. */ struct BtRBIter { /* Input parameters */ int nKV; /* Number of KV pairs to add */ int iKVSib; /* Sibling to add KV pairs to */ int iKVCell; /* Cell offset at which to insert KV pairs */ int nSib; /* Number of sibling pages */ /* Output parameters */ int iSib; /* Sibling page to read cell from (or -ve) */ int iCell; /* Index of cell to read (or -ve) */ }; static void btIterFirst(BtRBIter *p){ p->iSib = 0; p->iCell = 0; if( p->iKVSib==0 && p->iKVCell==0 ){ p->iSib = -1; } } static void btIterNext(BtRBIter *p){ p->iCell++; /* Just finished the KV pairs */ if( p->iSib<0 && p->iCell==p->nKV ){ p->iSib = p->iKVSib; p->iCell = p->iKVCell; } if( p->iSib && p->iCell==p->nKV ){ p->iSib = p->iKVSib; p->iCell = p->iKVCell; } } static void btIterValid(BtRBIter *p){ return (p->iSib>=0 || p->iCell>=0); } #endif typedef struct BalanceCtx BalanceCtx; struct BalanceCtx { int pgsz; /* Database page size */ int bLeaf; /* True if we are rebalancing leaf data */ bt_cursor *pCsr; /* Cursor identifying where to insert pKV */ KeyValue *pKV; /* New KV pair being inserted */ int nPg; /* Number of sibling pages */ BtPage **apPg; /* Array of sibling pages */ int *anOut; /* Cell counts for output pages */ /* Variables used by btBalanceOutput */ int nOut; /* Number of output pages */ int iOut; /* Current output page */ u8 **apOut; /* Buffers to assemble output in */ }; static int btBalanceOutput( BalanceCtx *p, /* Description of balance operation */ int iCell, /* Cell number in this iteration */ u8 *pCell, int nByte, /* Binary cell to copy to output */ KeyValue *pKV /* Key-value cell to write to output */ ){ u8 *aOut; /* Buffer for output page */ int iOff; /* Offset of new cell within page */ int nCell; /* Number of cells already on page */ assert( (pCell==0)!=(pKV==0) ); /* Scribble the new cell into the output page. */ aOut = p->apOut[p->iOut]; iOff = btFreeOffset(aOut, p->pgsz); if( iOff==0 ) iOff = (p->bLeaf ? 1 : 5); nCell = btCellCount(aOut, p->pgsz); btPutU16(btCellPtrFind(aOut, p->pgsz, nCell), iOff); if( pCell ){ memcpy(&aOut[iOff], pCell, nByte); iOff += nByte; }else{ iOff += btKVCellWrite(pKV, p->pgsz, &aOut[iOff]); } btPutU16(&aOut[p->pgsz-2], nCell+1); btPutU16(&aOut[p->pgsz-6], iOff); if( (iCell+1)==p->anOut[p->iOut] ){ /* That was the last cell for this page. Fill in the rest of the ** output page footer and so on. Then increment BalanceCtx.iOut so ** that the next call writes to the next page. */ int nFree; /* Free space remaining on output page */ nFree = p->pgsz - iOff - (6 + 2*(nCell+1)); aOut[0] = (p->bLeaf ? 0 : BT_PGFLAGS_INTERNAL); btPutU16(&aOut[p->pgsz-4], nFree); p->iOut++; } return SQLITE4_OK; } static int btBalanceVisitCells( BalanceCtx *p, int (*xVisit)(BalanceCtx*, int, u8*, int, KeyValue*) ){ const int pgsz = sqlite4BtPagerPagesize(p->pCsr->pDb->pPager); int rc = SQLITE4_OK; /* Return code */ int iPg; /* Current page in apPg[] */ int iCall = 0; BtPage *pIns = p->pCsr->apPage[p->pCsr->nPg-1]; int iIns = p->pCsr->aiCell[p->pCsr->nPg-1]; for(iPg=0; iPg<p->nPg && rc==SQLITE4_OK; iPg++){ BtPage *pPg; /* Current page */ u8 *aData; /* Page data */ int nCell; /* Number of cells on page pPg */ int iCell; /* Current cell in pPg */ pPg = p->apPg[iPg]; aData = sqlite4BtPageData(pPg); nCell = btCellCount(aData, pgsz); for(iCell=0; iCell<nCell && rc==SQLITE4_OK; iCell++){ int nByte; u8 *pCell; if( pPg==pIns && iCell==iIns ){ rc = xVisit(p, iCall++, 0, 0, p->pKV); if( rc!=SQLITE4_OK ) break; } pCell = btCellFindSize(aData, pgsz, iCell, &nByte); rc = xVisit(p, iCall++, pCell, nByte, 0); } if( pPg==pIns && iCell==nCell && rc==SQLITE4_OK ){ rc = xVisit(p, iCall++, 0, 0, p->pKV); } } return rc; } int btInsertAndRebalance(bt_cursor *pCsr, KeyValue *pKV){ bt_db * const pDb = pCsr->pDb; const int pgsz = sqlite4BtPagerPagesize(pDb->pPager); const int nSpacePerPage = (pgsz - 1 - 6); int nIn = 0; /* Number of input pages */ int iPg; /* Used to iterate through pages */ int iCell; /* Used to iterate through cells */ |
︙ | ︙ | |||
827 828 829 830 831 832 833 | KeyValue aPCell[5]; /* Cells to push into the parent page */ BtPage *pPar; /* Parent page */ int iSib; /* Index of left-most sibling */ int nTotal; /* Total bytes of content to distribute */ int rc = SQLITE4_OK; /* Return code */ | | < | 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 | KeyValue aPCell[5]; /* Cells to push into the parent page */ BtPage *pPar; /* Parent page */ int iSib; /* Index of left-most sibling */ int nTotal; /* Total bytes of content to distribute */ int rc = SQLITE4_OK; /* Return code */ BalanceCtx ctx; memset(apOut, 0, sizeof(apOut)); memset(apPg, 0, sizeof(apPg)); memset(aPCell, 0, sizeof(aPCell)); /* Gather the sibling pages from which cells will be redistributed into ** the apPg[] array. */ |
︙ | ︙ | |||
932 933 934 935 936 937 938 939 940 941 | } #endif /* Allocate buffers for the output leaves */ for(iPg=0; iPg<nOut; iPg++){ rc = btNewBuffer(pDb, &apOut[iPg]); if( rc!=SQLITE4_OK ) goto rebalance_out; } /* Populate buffers for the output leaves */ | > > > > > > > > > > > > > > > > | | | 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 | } #endif /* Allocate buffers for the output leaves */ for(iPg=0; iPg<nOut; iPg++){ rc = btNewBuffer(pDb, &apOut[iPg]); if( rc!=SQLITE4_OK ) goto rebalance_out; memset(apOut[iPg] + pgsz-6, 0, 6); } memset(&ctx, 0, sizeof(ctx)); ctx.pCsr = pCsr; ctx.pKV = pKV; ctx.nPg = nIn; ctx.apPg = apPg; ctx.pgsz = pgsz; ctx.bLeaf = 1; /* todo */ ctx.anOut = anOut; ctx.nOut = nOut; ctx.apOut = apOut; rc = btBalanceVisitCells(&ctx, btBalanceOutput); #if 0 /* Populate buffers for the output leaves */ iPgOut = 0; /* Index of current output page */ iPgIn = 0; /* Index of current input page */ iPgInFirst = 0; for(iCell=0; iCell<nCell; iCell++){ int iOff; /* Output page offset */ u8 *aOut; /* Output page buffer */ if( iCell==anOut[iPg] ) iPg++; aOut = apOut[iPg]; |
︙ | ︙ | |||
962 963 964 965 966 967 968 969 970 971 | aIn = sqlite4BtPageData(apPg[iPgIn]); } memcpy(&aOut[iOff], btCellFind(aIn, pgsz, iPgCell), anCellSz[iCell]); iOff += anCellSz[iCell]; } btPutU16(&aOut[pgsz-6], iOff); } /* Clobber the old pages with the new buffers */ for(iPg=0; iPg<nOut; iPg++){ | > > > > > < | < < | > > > > > > > > > > < | 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 | aIn = sqlite4BtPageData(apPg[iPgIn]); } memcpy(&aOut[iOff], btCellFind(aIn, pgsz, iPgCell), anCellSz[iCell]); iOff += anCellSz[iCell]; } btPutU16(&aOut[pgsz-6], iOff); } for(iPg=0; iPg<nOut; iPg++){ u8 *aData = apOut[iPg]; btPutU16(&aData[pgsz-2], anOut[iPg] - (iPg==0 ? 0 : anOut[iPg-1])); } #endif /* Clobber the old pages with the new buffers */ for(iPg=0; iPg<nOut; iPg++){ if( iPg>=nIn ){ rc = sqlite4BtPageAllocate(pDb->pPager, &apPg[iPg]); if( rc!=SQLITE4_OK ) goto rebalance_out; } btSetBuffer(pDb, apPg[iPg], apOut[iPg]); apOut[iPg] = 0; } #ifndef NDEBUG { int iDbg; for(iDbg=0; iDbg<nOut; iDbg++){ u8 *aData = sqlite4BtPageData(apPg[iDbg]); printPage(stderr, sqlite4BtPagePgno(apPg[iDbg]), aData, pgsz); } } #endif /* The leaves are written. Now gather the keys and page numbers to ** push up into the parent page. */ for(iPg=0; iPg<(nOut-1); iPg++){ u8 *aData = sqlite4BtPageData(apPg[iPg]); u8 *pCell; pCell = btCellFind(aData, pgsz, btCellCount(aData, pgsz)-1); aPCell[iPg].pgno = sqlite4BtPagePgno(apPg[iPg]); pCell += sqlite4BtVarintGet32(pCell, &aPCell[iPg].nK); aPCell[iPg].pK = pCell; } assert( nIn==1 && nOut==2 ); rc = btSetChildPgno(pDb, pPar, iSib+nIn-1, sqlite4BtPagePgno(apPg[nOut-1])); if( rc!=SQLITE4_OK ) goto rebalance_out; pCsr->nPg--; rc = btModifyPage(pCsr, nIn-1, nOut-1, aPCell); |
︙ | ︙ | |||
1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 | pCsr->apPage[1] = pNew; pCsr->aiCell[0] = 0; } return rc; } static int btModifyPage( bt_cursor *pCsr, /* Cursor identifying page to modify */ int nDel, /* Number of entries to delete from page */ int nKV, /* Number of entries in apKV */ KeyValue *apKV /* New cells to insert into the page */ ){ int rc = SQLITE4_OK; | > > > > > > > > > > > > > > | 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 | pCsr->apPage[1] = pNew; pCsr->aiCell[0] = 0; } return rc; } /* ** The cursor currently points to a cell on a b-tree page that may or ** may not be a leaf page. This routine modifies the contents of that ** page, balancing the b-tree if necessary. The page is modified as ** follows: ** ** * nDel entries, starting with the one the cursor points to, are ** deleted from the page. ** ** * nKV entries are inserted in their place. ** ** The tree balancing routine is called if this causes the page to ** become either overfull or to contain no entries at all. */ static int btModifyPage( bt_cursor *pCsr, /* Cursor identifying page to modify */ int nDel, /* Number of entries to delete from page */ int nKV, /* Number of entries in apKV */ KeyValue *apKV /* New cells to insert into the page */ ){ int rc = SQLITE4_OK; |
︙ | ︙ |
Changes to src/bt_pager.c.
︙ | ︙ | |||
11 12 13 14 15 16 17 18 19 20 21 22 23 24 | ************************************************************************* */ #include "btInt.h" #include <string.h> #include <assert.h> /* By default pages are 1024 bytes in size. */ #define BT_DEFAULT_PGSZ 1024 typedef struct BtPageHash BtPageHash; typedef struct BtDbhdr BtDbhdr; | > | 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | ************************************************************************* */ #include "btInt.h" #include <string.h> #include <assert.h> #include <stdio.h> /* By default pages are 1024 bytes in size. */ #define BT_DEFAULT_PGSZ 1024 typedef struct BtPageHash BtPageHash; typedef struct BtDbhdr BtDbhdr; |
︙ | ︙ | |||
261 262 263 264 265 266 267 268 269 270 271 272 273 274 | rc = p->pVfs->xSize(p->pFd, &nByte); if( rc==SQLITE4_OK && nByte>0 ){ rc = p->pVfs->xRead(p->pFd, 0, &p->dbhdr, sizeof(p->dbhdr)); }else{ memset(&p->dbhdr, 0, sizeof(p->dbhdr)); p->dbhdr.pgsz = BT_DEFAULT_PGSZ; } if( rc==SQLITE4_OK ){ /* If the read transaction was successfully opened, the transaction ** level is now 1. */ p->iTransactionLevel = 1; } | > | 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 | rc = p->pVfs->xSize(p->pFd, &nByte); if( rc==SQLITE4_OK && nByte>0 ){ rc = p->pVfs->xRead(p->pFd, 0, &p->dbhdr, sizeof(p->dbhdr)); }else{ memset(&p->dbhdr, 0, sizeof(p->dbhdr)); p->dbhdr.pgsz = BT_DEFAULT_PGSZ; p->dbhdr.nPg = 2; } if( rc==SQLITE4_OK ){ /* If the read transaction was successfully opened, the transaction ** level is now 1. */ p->iTransactionLevel = 1; } |
︙ | ︙ | |||
517 518 519 520 521 522 523 | /* ** Allocate a new database page and return a writable reference to it. */ int sqlite4BtPageAllocate(BtPager *p, BtPage **ppPg){ BtPage *pPg = 0; int rc; | | | > > | 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 | /* ** Allocate a new database page and return a writable reference to it. */ int sqlite4BtPageAllocate(BtPager *p, BtPage **ppPg){ BtPage *pPg = 0; int rc; u32 pgno = p->dbhdr.nPg+1; rc = sqlite4BtPageGet(p, pgno, &pPg); if( rc==SQLITE4_OK ){ rc = sqlite4BtPageWrite(pPg); if( rc!=SQLITE4_OK ){ sqlite4BtPageRelease(pPg); pPg = 0; }else{ p->dbhdr.nPg = pgno; } } fprintf(stderr, "allocated page %d\n", pgno); *ppPg = pPg; return rc; } /* ** Return the current page number of the argument page reference. |
︙ | ︙ |
Changes to src/vdbecursor.c.
︙ | ︙ | |||
25 26 27 28 29 30 31 | /* ** Move a VDBE cursor to the first or to the last element of its table. The ** first element is sought if iEnd==+1 and the last element if iEnd==-1. ** ** Return SQLITE4_OK on success. Return SQLITE4_NOTFOUND if the table is empty. | | | 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | /* ** Move a VDBE cursor to the first or to the last element of its table. The ** first element is sought if iEnd==+1 and the last element if iEnd==-1. ** ** Return SQLITE4_OK on success. Return SQLITE4_NOTFOUND if the table is empty. ** Other error codes are also possible for various kinds of errors. */ int sqlite4VdbeSeekEnd(VdbeCursor *pC, int iEnd){ KVCursor *pCur = pC->pKVCur; int rc; KVSize nProbe; KVByteArray aProbe[16]; |
︙ | ︙ |
Changes to test/simple3.test.
︙ | ︙ | |||
76 77 78 79 80 81 82 | } do_execsql_test 2.2 { INSERT INTO t1 VALUES(5, $val); } do_execsql_test 2.3 { | | | > | | | 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | } do_execsql_test 2.2 { INSERT INTO t1 VALUES(5, $val); } do_execsql_test 2.3 { SELECT a, length(b) FROM t1 } {1 200 3 200 4 200 5 200} do_execsql_test 2.4 { INSERT INTO t1 VALUES(6, $val); } breakpoint do_execsql_test 2.5 { SELECT a, length(b) FROM t1 } {1 200 3 200 4 200 5 200 6 200} finish_test |