Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Change the btree balance code so that it does not call balance_nonroot() recursively. (CVS 6729) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
7863db904d6fc36417c923e3d135eb2c |
User & Date: | danielk1977 2009-06-08 14:49:46.000 |
Context
2009-06-08
| ||
17:11 | Clarification of the operation of the OR-term optimizer in where.c. (CVS 6730) (check-in: 6b42dc3d04 user: drh tags: trunk) | |
14:49 | Change the btree balance code so that it does not call balance_nonroot() recursively. (CVS 6729) (check-in: 7863db904d user: danielk1977 tags: trunk) | |
12:52 | Increase the version number to 3.6.15 in preparation for the next release. (CVS 6728) (check-in: 456ea541d6 user: drh tags: trunk) | |
Changes
Changes to src/btree.c.
1 2 3 4 5 6 7 8 9 10 11 | /* ** 2004 April 6 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* | | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | /* ** 2004 April 6 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** $Id: btree.c,v 1.620 2009/06/08 14:49:46 danielk1977 Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** See the header comment on "btreeInt.h" for additional information. ** Including a description of file format and an overview of operation. */ #include "btreeInt.h" /* ** The header string that appears at the beginning of every ** SQLite database. */ static const char zMagicHeader[] = SQLITE_FILE_HEADER; /* ** Set this global variable to 1 to enable tracing using the TRACE ** macro. */ #if 0 int sqlite3BtreeTrace=1; /* True to enable tracing */ # define TRACE(X) if(sqlite3BtreeTrace){printf X;fflush(stdout);} #else # define TRACE(X) #endif |
︙ | ︙ | |||
2705 2706 2707 2708 2709 2710 2711 | } } assert( nRef==sqlite3PagerRefcount(pPager) ); return rc; } | | > > | 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 | } } assert( nRef==sqlite3PagerRefcount(pPager) ); return rc; } #else /* ifndef SQLITE_OMIT_AUTOVACUUM */ # define setChildPtrmaps(x) SQLITE_OK #endif /* ** This routine does the first phase of a two-phase commit. This routine ** causes a rollback journal to be created (if it does not already exist) ** and populated with enough information so that if a power loss occurs ** the database can be restored to its original state by playing back ** the journal. Then the contents of the journal are flushed out to |
︙ | ︙ | |||
4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 | releasePage(pPrevTrunk); if( rc==SQLITE_OK ){ if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ releasePage(*ppPage); return SQLITE_CORRUPT_BKPT; } (*ppPage)->isInit = 0; } return rc; } /* ** This function is used to add page iPage to the database file free-list. ** It is assumed that the page is not already a part of the free-list. | > > | 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 | releasePage(pPrevTrunk); if( rc==SQLITE_OK ){ if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ releasePage(*ppPage); return SQLITE_CORRUPT_BKPT; } (*ppPage)->isInit = 0; }else{ *ppPage = 0; } return rc; } /* ** This function is used to add page iPage to the database file free-list. ** It is assumed that the page is not already a part of the free-list. |
︙ | ︙ | |||
5192 5193 5194 5195 5196 5197 5198 | ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance ** in exchange for a larger degradation in INSERT and UPDATE performance. ** The value of NN appears to give the best results overall. */ #define NN 1 /* Number of neighbors on either side of pPage */ #define NB (NN*2+1) /* Total pages involved in the balance */ | < < > > > > > > | < | > | > > > < | < < < < < < < | > | | | | < | | | | | | | | | | | | | | | | | < < < < < < < | | 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 | ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance ** in exchange for a larger degradation in INSERT and UPDATE performance. ** The value of NN appears to give the best results overall. */ #define NN 1 /* Number of neighbors on either side of pPage */ #define NB (NN*2+1) /* Total pages involved in the balance */ #ifndef SQLITE_OMIT_QUICKBALANCE /* ** This version of balance() handles the common special case where ** a new entry is being inserted on the extreme right-end of the ** tree, in other words, when the new entry will become the largest ** entry in the tree. ** ** Instead of trying balance the 3 right-most leaf pages, just add ** a new page to the right-hand side and put the one new entry in ** that page. This leaves the right side of the tree somewhat ** unbalanced. But odds are that we will be inserting new entries ** at the end soon afterwards so the nearly empty page will quickly ** fill up. On average. ** ** pPage is the leaf page which is the right-most page in the tree. ** pParent is its parent. pPage must have a single overflow entry ** which is also the right-most entry on the page. ** ** The pSpace buffer is used to store a temporary copy of the divider ** cell that will be inserted into pParent. Such a cell consists of a 4 ** byte page number followed by a variable length integer. In other ** words, at most 13 bytes. Hence the pSpace buffer must be at ** least 13 bytes in size. */ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ BtShared *const pBt = pPage->pBt; /* B-Tree Database */ MemPage *pNew = 0; /* Newly allocated page */ int rc; /* Return Code */ Pgno pgnoNew; /* Page number of pNew */ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT; /* Allocate a new page. This page will become the right-sibling of ** pPage. Make the parent page writable, so that the new divider cell ** may be inserted. If both these operations are successful, proceed. */ rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); if( rc==SQLITE_OK ){ u8 *pOut = &pSpace[4]; u8 *pCell = pPage->aOvfl[0].pCell; u16 szCell = cellSizePtr(pPage, pCell); u8 *pStop; assert( sqlite3PagerIswriteable(pNew->pDbPage) ); zeroPage(pNew, pPage->aData[0]); assemblePage(pNew, 1, &pCell, &szCell); pPage->nOverflow = 0; /* Create a divider cell to insert into pParent. The divider cell ** consists of a 4-byte page number (the page number of pPage) and ** a variable length key value (which must be the same value as the ** largest key on pPage). ** ** To find the largest key value on pPage, first find the right-most ** cell on pPage. The first two fields of this cell are the ** record-length (a variable length integer at most 32-bits in size) ** and the key value (a variable length integer, may have any value). ** The first of the while(...) loops below skips over the record-length ** field. The second while(...) loop copies the key value from the ** cell on pPage into the pSpace buffer. */ put4byte(pSpace, pPage->pgno); pCell = findCell(pPage, pPage->nCell-1); pStop = &pCell[9]; while( (*(pCell++)&0x80) && pCell<pStop ); pStop = &pCell[9]; while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); /* Insert the new divider cell into pParent */ insertCell(pParent, pParent->nCell, pSpace, pOut-pSpace, 0, 0); /* Set the right-child pointer of pParent to point to the new page. */ put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); /* If this is an auto-vacuum database, update the pointer map ** with entries for the new page, and any pointer from the ** cell on the page to an overflow page. */ if( ISAUTOVACUUM ){ rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); if( rc==SQLITE_OK ){ rc = ptrmapPutOvfl(pNew, 0); } } /* Release the reference to the new page. */ releasePage(pNew); /* At this point the pPage->nFree variable is not set correctly with ** respect to the content of the page (because it was set to 0 by ** insertCell). So call sqlite3BtreeInitPage() to make sure it is ** correct. ** ** This has to be done even if an error will be returned. Normally, if ** an error occurs during tree balancing, the contents of MemPage are ** not important, as they will be recalculated when the page is rolled ** back. But here, in balance_quick(), it is possible that pPage has ** not yet been marked dirty or written into the journal file. Therefore ** it will not be rolled back and so it is important to make sure that ** the page data and contents of MemPage are consistent. */ pPage->isInit = 0; sqlite3BtreeInitPage(pPage); assert( pPage->nOverflow==0 ); } return rc; } #endif /* SQLITE_OMIT_QUICKBALANCE */ /* ** This routine redistributes Cells on pPage and up to NN*2 siblings ** of pPage so that all pages have about the same amount of free space. |
︙ | ︙ | |||
5344 5345 5346 5347 5348 5349 5350 | ** might become overfull or underfull. If that happens, then this routine ** is called recursively on the parent. ** ** If this routine fails for any reason, it might leave the database ** in a corrupted state. So if this routine fails, the database should ** be rolled back. */ | | < < < < | 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 | ** might become overfull or underfull. If that happens, then this routine ** is called recursively on the parent. ** ** If this routine fails for any reason, it might leave the database ** in a corrupted state. So if this routine fails, the database should ** be rolled back. */ static int balance_nonroot(MemPage *pParent, int iParentIdx, u8 *aSpace2){ BtShared *pBt; /* The whole database */ int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nOld = 0; /* Number of pages in apOld[] */ int nNew = 0; /* Number of pages in apNew[] */ int i, j, k; /* Loop counters */ int nxDiv; /* Next divider slot in pParent->aCell[] */ int rc; /* The return code */ int leafCorrection; /* 4 if pPage is a leaf. 0 if not */ int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ int usableSpace; /* Bytes in pPage beyond the header */ int pageFlags; /* Value of pPage->aData[0] */ int subtotal; /* Subtotal of bytes in cells on one page */ |
︙ | ︙ | |||
5377 5378 5379 5380 5381 5382 5383 | u8 *apDiv[NB]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ int szNew[NB+2]; /* Combined size of cells place on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aCopy[NB]; /* Space for holding data of apCopy[] */ u8 *aSpace1; /* Space for copies of dividers cells before balance */ | < | | < < < < < < < | < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < < | < < < < < < < < < < < < < < | | | | < < < | 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 | u8 *apDiv[NB]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ int szNew[NB+2]; /* Combined size of cells place on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aCopy[NB]; /* Space for holding data of apCopy[] */ u8 *aSpace1; /* Space for copies of dividers cells before balance */ u8 *aFrom = 0; pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); /* Find the sibling pages to balance. Also locate the cells in pParent ** that divide the siblings. An attempt is made to find NN siblings on ** either side of pPage. More siblings are taken from one side, however, ** if there are fewer than NN siblings on the other side. If pParent ** has NB or fewer children then all children of pParent are taken. */ nxDiv = iParentIdx - NN; if( nxDiv + NB > pParent->nCell ){ nxDiv = pParent->nCell - NB + 1; } if( nxDiv<0 ){ nxDiv = 0; } for(i=0, k=nxDiv; i<NB; i++, k++){ if( k<pParent->nCell ){ apDiv[i] = findCell(pParent, k); assert( !pParent->leaf ); pgnoOld[i] = get4byte(apDiv[i]); }else if( k==pParent->nCell ){ pgnoOld[i] = get4byte(&pParent->aData[pParent->hdrOffset+8]); }else{ break; } rc = getAndInitPage(pBt, pgnoOld[i], &apOld[i]); if( rc ) goto balance_cleanup; apCopy[i] = 0; assert( i==nOld ); nOld++; nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; } /* Make nMaxCells a multiple of 4 in order to preserve 8-byte |
︙ | ︙ | |||
5501 5502 5503 5504 5505 5506 5507 | assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */ } aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); if( ISAUTOVACUUM ){ aFrom = &aSpace1[pBt->pageSize]; } | | | 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 | assert( ((aCopy[i] - (u8*)0) & 7)==0 ); /* 8-byte alignment required */ } aSpace1 = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))]; assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); if( ISAUTOVACUUM ){ aFrom = &aSpace1[pBt->pageSize]; } /* aSpace2 = sqlite3PageMalloc(pBt->pageSize); */ if( aSpace2==0 ){ rc = SQLITE_NOMEM; goto balance_cleanup; } /* ** Make copies of the content of pPage and its siblings into aOld[]. |
︙ | ︙ | |||
5537 5538 5539 5540 5541 5542 5543 | ** apCell[] include child pointers. Either way, all cells in apCell[] ** are alike. ** ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ nCell = 0; | | | | 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 | ** apCell[] include child pointers. Either way, all cells in apCell[] ** are alike. ** ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ nCell = 0; leafCorrection = apOld[0]->leaf*4; leafData = apOld[0]->hasData; for(i=0; i<nOld; i++){ MemPage *pOld = apCopy[i]; int limit = pOld->nCell+pOld->nOverflow; for(j=0; j<limit; j++){ assert( nCell<nMaxCells ); apCell[nCell] = findOverflowCell(pOld, j); szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); |
︙ | ︙ | |||
5673 5674 5675 5676 5677 5678 5679 | ** page is page 1 and we are the only child of that page. */ assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); /* ** Allocate k new pages. Reuse old pages where possible. */ | | | | 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 | ** page is page 1 and we are the only child of that page. */ assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); /* ** Allocate k new pages. Reuse old pages where possible. */ assert( apOld[0]->pgno>1 ); pageFlags = apOld[0]->aData[0]; for(i=0; i<k; i++){ MemPage *pNew; if( i<nOld ){ pNew = apNew[i] = apOld[i]; pgnoNew[i] = pgnoOld[i]; apOld[i] = 0; rc = sqlite3PagerWrite(pNew->pDbPage); |
︙ | ︙ | |||
5902 5903 5904 5905 5906 5907 5908 | ** But the parent page will always be initialized. */ assert( pParent->isInit ); sqlite3ScratchFree(apCell); apCell = 0; TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", pPage->pgno, nOld, nNew, nCell)); | < < < < < < | > > | > > > > > > > > > | > | < < < > > > > > | < | < < | | > > > > | | > > > | < | | | > > | > > > > | > > > > > > | < | | | | | | | | | | | | | > > | > > > > | | < < | > | | < | < < | < < | > > | | < < | < < < > | | | | | < < < < < | < < | < < < < | < < < < < > | | | > | > > | > > > > > | > | | < | | | < < < < < < | < < < < < < < < < < < < < < < | > > > | | < < | > > > | | > | | | | | | < < < < < | | > | < | | < | < < | < < < | | | | | | | > > > > > > > | | > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | < > | > > > > > > > > > > > > > > > | < | | > > > > | | | > | > > | < | > | | | > > | > > > > | 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 | ** But the parent page will always be initialized. */ assert( pParent->isInit ); sqlite3ScratchFree(apCell); apCell = 0; TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", pPage->pgno, nOld, nNew, nCell)); /* ** Cleanup before returning. */ balance_cleanup: sqlite3ScratchFree(apCell); for(i=0; i<nOld; i++){ releasePage(apOld[i]); } for(i=0; i<nNew; i++){ releasePage(apNew[i]); } return rc; } /* ** This function is used to copy the contents of the b-tree node stored ** on page pFrom to page pTo. If page pFrom was not a leaf page, then ** the pointer-map entries for each child page are updated so that the ** parent page stored in the pointer map is page pTo. If pFrom contained ** any cells with overflow page pointers, then the corresponding pointer ** map entries are also updated so that the parent page is page pTo. ** ** If pFrom is currently carrying any overflow cells (entries in the ** MemPage.aOvfl[] array), they are not copied to pTo. ** ** Before returning, page pTo is reinitialized using sqlite3BtreeInitPage(). ** ** The performance of this function is not critical. It is only used by ** the balance_shallower() and balance_deeper() procedures, neither of ** which are called often under normal circumstances. */ static int copyNodeContent(MemPage *pFrom, MemPage *pTo){ BtShared * const pBt = pFrom->pBt; u8 * const aFrom = pFrom->aData; u8 * const aTo = pTo->aData; int const iFromHdr = pFrom->hdrOffset; int const iToHdr = ((pTo->pgno==1) ? 100 : 0); int rc = SQLITE_OK; int iData; assert( pFrom->isInit ); assert( pFrom->nFree>=iToHdr ); assert( get2byte(&aFrom[iFromHdr+5])<=pBt->usableSize ); /* Copy the b-tree node content from page pFrom to page pTo. */ iData = get2byte(&aFrom[iFromHdr+5]); memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); /* Reinitialize page pTo so that the contents of the MemPage structure ** match the new data. The initialization of pTo "cannot" fail, as the ** data copied from pFrom is known to be valid. */ pTo->isInit = 0; TESTONLY(rc = ) sqlite3BtreeInitPage(pTo); assert( rc==SQLITE_OK ); /* If this is an auto-vacuum database, update the pointer-map entries ** for any b-tree or overflow pages that pTo now contains the pointers to. */ if( ISAUTOVACUUM ){ rc = setChildPtrmaps(pTo); } return rc; } /* ** This routine is called on the root page of a btree when the root ** page contains no cells. This is an opportunity to make the tree ** shallower by one level. */ static int balance_shallower(MemPage *pRoot){ /* The root page is empty but has one child. Transfer the ** information from that one child into the root page if it ** will fit. This reduces the depth of the tree by one. ** ** If the root page is page 1, it has less space available than ** its child (due to the 100 byte header that occurs at the beginning ** of the database fle), so it might not be able to hold all of the ** information currently contained in the child. If this is the ** case, then do not do the transfer. Leave page 1 empty except ** for the right-pointer to the child page. The child page becomes ** the virtual root of the tree. */ int rc = SQLITE_OK; /* Return code */ int const hdr = pRoot->hdrOffset; /* Offset of root page header */ MemPage *pChild; /* Only child of pRoot */ Pgno const pgnoChild = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); assert( pRoot->nCell==0 ); assert( sqlite3_mutex_held(pRoot->pBt->mutex) ); assert( !pRoot->leaf ); assert( pgnoChild>0 ); assert( pgnoChild<=pagerPagecount(pRoot->pBt) ); assert( hdr==0 || pRoot->pgno==1 ); rc = sqlite3BtreeGetPage(pRoot->pBt, pgnoChild, &pChild, 0); if( rc==SQLITE_OK ){ if( pChild->nFree>=hdr ){ if( hdr ){ rc = defragmentPage(pChild); } if( rc==SQLITE_OK ){ rc = copyNodeContent(pChild, pRoot); } if( rc==SQLITE_OK ){ rc = freePage(pChild); } }else{ /* The child has more information that will fit on the root. ** The tree is already balanced. Do nothing. */ TRACE(("BALANCE: child %d will not fit on page 1\n", pChild->pgno)); } releasePage(pChild); } return rc; } /* ** This function is called when the root page of a b-tree structure is ** overfull (has one or more overflow pages). ** ** A new child page is allocated and the contents of the current root ** page, including overflow cells, are copied into the child. The root ** page is then overwritten to make it an empty page with the right-child ** pointer pointing to the new page. ** ** Before returning, all pointer-map entries corresponding to pages ** that the new child-page now contains pointers to are updated. The ** entry corresponding to the new right-child pointer of the root ** page is also updated. ** ** If successful, *ppChild is set to contain a reference to the child ** page and SQLITE_OK is returned. In this case the caller is required ** to call releasePage() on *ppChild exactly once. If an error occurs, ** an error code is returned and *ppChild is set to 0. */ static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ int rc; /* Return value from subprocedures */ MemPage *pChild = 0; /* Pointer to a new child page */ Pgno pgnoChild; /* Page number of the new child page */ BtShared *pBt = pRoot->pBt; /* The BTree */ assert( pRoot->nOverflow>0 ); assert( sqlite3_mutex_held(pBt->mutex) ); /* Make pRoot, the root page of the b-tree, writable. Allocate a new ** page that will become the new right-child of pPage. Copy the contents ** of the node stored on pRoot into the new child page. */ if( SQLITE_OK!=(rc = sqlite3PagerWrite(pRoot->pDbPage)) || SQLITE_OK!=(rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0)) || SQLITE_OK!=(rc = copyNodeContent(pRoot, pChild)) || (ISAUTOVACUUM && SQLITE_OK!=(rc = ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno))) ){ *ppChild = 0; releasePage(pChild); return rc; } assert( sqlite3PagerIswriteable(pChild->pDbPage) ); assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); assert( pChild->nCell==pRoot->nCell ); TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); /* Copy the overflow cells from pRoot to pChild */ memcpy(pChild->aOvfl, pRoot->aOvfl, pRoot->nOverflow*sizeof(pRoot->aOvfl[0])); pChild->nOverflow = pRoot->nOverflow; pChild->nFree = 0; /* Zero the contents of pRoot. Then install pChild as the right-child. */ zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); *ppChild = pChild; return SQLITE_OK; } /* ** The page that pCur currently points to has just been modified in ** some way. This function figures out if this modification means the ** tree needs to be balanced, and if so calls the appropriate balancing ** routine. Balancing routines are: ** ** balance_quick() ** balance_shallower() ** balance_deeper() ** balance_nonroot() ** ** If built with SQLITE_DEBUG, pCur->pagesShuffled is set to true if ** balance_shallower(), balance_deeper() or balance_nonroot() is called. ** If none of these functions are invoked, pCur->pagesShuffled is left ** unmodified. */ static int balance(BtCursor *pCur){ int rc = SQLITE_OK; const int nMin = pCur->pBt->usableSize * 2 / 3; u8 aBalanceQuickSpace[13]; u8 *pFree = 0; TESTONLY( int balance_quick_called = 0; ); TESTONLY( int balance_deeper_called = 0; ); do { int iPage = pCur->iPage; MemPage *pPage = pCur->apPage[iPage]; if( iPage==0 ){ if( pPage->nOverflow ){ /* The root page of the b-tree is overfull. In this case call the ** balance_deeper() function to create a new child for the root-page ** and copy the current contents of the root-page to it. The ** next iteration of the do-loop will balance the child page. */ assert( (balance_deeper_called++)==0 ); rc = balance_deeper(pPage, &pCur->apPage[1]); if( rc==SQLITE_OK ){ pCur->iPage = 1; pCur->aiIdx[0] = 0; pCur->aiIdx[1] = 0; assert( pCur->apPage[1]->nOverflow ); } VVA_ONLY( pCur->pagesShuffled = 1 ); }else{ /* The root page of the b-tree is now empty. If the root-page is not ** also a leaf page, it will have a single child page. Call ** balance_shallower to attempt to copy the contents of the single ** child-page into the root page (this may not be possible if the ** root page is page 1). ** ** Whether or not this is possible , the tree is now balanced. ** Therefore is no next iteration of the do-loop. */ if( pPage->nCell==0 && !pPage->leaf ){ rc = balance_shallower(pPage); VVA_ONLY( pCur->pagesShuffled = 1 ); } break; } }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ break; }else{ MemPage * const pParent = pCur->apPage[iPage-1]; int const iIdx = pCur->aiIdx[iPage-1]; rc = sqlite3PagerWrite(pParent->pDbPage); if( rc==SQLITE_OK ){ #ifndef SQLITE_OMIT_QUICKBALANCE if( pPage->hasData && pPage->nOverflow==1 && pPage->aOvfl[0].idx==pPage->nCell && pParent->pgno!=1 && pParent->nCell==iIdx ){ /* Call balance_quick() to create a new sibling of pPage on which ** to store the overflow cell. balance_quick() inserts a new cell ** into pParent, which may cause pParent overflow. If this ** happens, the next interation of the do-loop will balance pParent ** use either balance_nonroot() or balance_deeper(). Until this ** happens, the overflow cell is stored in the aBalanceQuickSpace[] ** buffer. ** ** The purpose of the following assert() is to check that only a ** single call to balance_quick() is made for each call to this ** function. If this were not verified, a subtle bug involving reuse ** of the aBalanceQuickSpace[] might sneak in. */ assert( (balance_quick_called++)==0 ); rc = balance_quick(pParent, pPage, aBalanceQuickSpace); }else #endif { /* In this case, call balance_nonroot() to redistribute cells ** between pPage and up to 2 of its sibling pages. This involves ** modifying the contents of pParent, which may cause pParent to ** become overfull or underfull. The next iteration of the do-loop ** will balance the parent page to correct this. ** ** If the parent page becomes overfull, the overflow cell or cells ** are stored in the pSpace buffer allocated immediately below. ** A subsequent iteration of the do-loop will deal with this by ** calling balance_nonroot() (balance_deeper() may be called first, ** but it doesn't deal with overflow cells - just moves them to a ** different page). Once this subsequent call to balance_nonroot() ** has completed, it is safe to release the pSpace buffer used by ** the previous call, as the overflow cell data will have been ** copied either into the body of a database page or into the new ** pSpace buffer passed to the latter call to balance_nonroot(). */ u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); rc = balance_nonroot(pParent, iIdx, pSpace); if( pFree ){ /* If pFree is not NULL, it points to the pSpace buffer used ** by a previous call to balance_nonroot(). Its contents are ** now stored either on real database pages or within the ** new pSpace buffer, so it may be safely freed here. */ sqlite3PageFree(pFree); } /* The pSpace buffer will be freed after the next call to ** balance_nonroot(), or just before this function returns, whichever ** comes first. */ pFree = pSpace; VVA_ONLY( pCur->pagesShuffled = 1 ); } } pPage->nOverflow = 0; /* The next iteration of the do-loop balances the parent page. */ releasePage(pPage); pCur->iPage--; } }while( rc==SQLITE_OK ); if( pFree ){ sqlite3PageFree(pFree); } return rc; } /* ** This routine checks all cursors that point to table pgnoRoot. ** If any of those cursors were opened with wrFlag==0 in a different |
︙ | ︙ | |||
6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 | SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || (!loc && SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc)) )){ return rc; } pPage = pCur->apPage[pCur->iPage]; assert( pPage->intKey || nKey>=0 ); assert( pPage->leaf || !pPage->intKey ); TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", pCur->pgnoRoot, nKey, nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); assert( pPage->isInit ); allocateTempSpace(pBt); | > | 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 | SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || (!loc && SQLITE_OK!=(rc = sqlite3BtreeMoveto(pCur, pKey, nKey, appendBias, &loc)) )){ return rc; } pPage = pCur->apPage[pCur->iPage]; assert( pPage->intKey || nKey>=0 ); assert( pPage->intKey || nKey>=0 ); assert( pPage->leaf || !pPage->intKey ); TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", pCur->pgnoRoot, nKey, nData, pPage->pgno, loc==0 ? "overwrite" : "new entry")); assert( pPage->isInit ); allocateTempSpace(pBt); |
︙ | ︙ | |||
6309 6310 6311 6312 6313 6314 6315 | rc = dropCell(pPage, idx, szOld); if( rc!=SQLITE_OK ) { goto end_insert; } }else if( loc<0 && pPage->nCell>0 ){ assert( pPage->leaf ); idx = ++pCur->aiIdx[pCur->iPage]; | < < | | < | | < > > | > < > | < < < < < < < | | > > > > > > | | < | < < | < < | | | > > | 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 | rc = dropCell(pPage, idx, szOld); if( rc!=SQLITE_OK ) { goto end_insert; } }else if( loc<0 && pPage->nCell>0 ){ assert( pPage->leaf ); idx = ++pCur->aiIdx[pCur->iPage]; }else{ assert( pPage->leaf ); } rc = insertCell(pPage, idx, newCell, szNew, 0, 0); assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); /* If no error has occured and pPage has an overflow cell, call balance() ** to redistribute the cells within the tree. Since balance() may move ** the cursor, zero the BtCursor.info.nSize and BtCursor.validNKey ** variables. ** ** Previous versions of SQLite called moveToRoot() to move the cursor ** back to the root page as balance() used to invalidate the contents ** of BtCursor.apPage[] and BtCursor.aiIdx[]. This is no longer necessary, ** as balance() always leaves the cursor pointing to a valid entry. ** ** There is a subtle but important optimization here too. When inserting ** multiple records into an intkey b-tree using a single cursor (as can ** happen while processing an "INSERT INTO ... SELECT" statement), it ** is advantageous to leave the cursor pointing to the last entry in ** the b-tree if possible. If the cursor is left pointing to the last ** entry in the table, and the next row inserted has an integer key ** larger than the largest existing key, it is possible to insert the ** row without seeking the cursor. This can be a big performance boost. */ pCur->info.nSize = 0; pCur->validNKey = 0; if( rc==SQLITE_OK && pPage->nOverflow ){ pCur->atLast = 0; rc = balance(pCur); /* Must make sure nOverflow is reset to zero even if the balance() ** fails. Internal data structure corruption will result otherwise. */ pCur->apPage[pCur->iPage]->nOverflow = 0; } assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); end_insert: return rc; } /* ** Delete the entry that the cursor is pointing to. The cursor |
︙ | ︙ | |||
6496 6497 6498 6499 6500 6501 6502 | leafCursorInvalid = 1; } if( rc==SQLITE_OK ){ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); put4byte(findOverflowCell(pPage, idx), pgnoChild); VVA_ONLY( pCur->pagesShuffled = 0 ); | | | 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 | leafCursorInvalid = 1; } if( rc==SQLITE_OK ){ assert( sqlite3PagerIswriteable(pPage->pDbPage) ); put4byte(findOverflowCell(pPage, idx), pgnoChild); VVA_ONLY( pCur->pagesShuffled = 0 ); rc = balance(pCur); } if( rc==SQLITE_OK && leafCursorInvalid ){ /* The leaf-node is now underfull and so the tree needs to be ** rebalanced. However, the balance() operation on the internal ** node above may have modified the structure of the B-Tree and ** so the current contents of leafCur.apPage[] and leafCur.aiIdx[] |
︙ | ︙ | |||
6538 6539 6540 6541 6542 6543 6544 | } if( SQLITE_OK==rc && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage)) ){ dropCell(pLeafPage, 0, szNext); VVA_ONLY( leafCur.pagesShuffled = 0 ); | | | | 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 | } if( SQLITE_OK==rc && SQLITE_OK==(rc = sqlite3PagerWrite(pLeafPage->pDbPage)) ){ dropCell(pLeafPage, 0, szNext); VVA_ONLY( leafCur.pagesShuffled = 0 ); rc = balance(&leafCur); assert( leafCursorInvalid || !leafCur.pagesShuffled || !pCur->pagesShuffled ); } } sqlite3BtreeReleaseTempCursor(&leafCur); }else{ TRACE(("DELETE: table=%d delete from leaf %d\n", pCur->pgnoRoot, pPage->pgno)); rc = dropCell(pPage, idx, cellSizePtr(pPage, pCell)); if( rc==SQLITE_OK ){ rc = balance(pCur); } } if( rc==SQLITE_OK ){ moveToRoot(pCur); } return rc; } |
︙ | ︙ |