Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Changes to balance_nonroot() and BtreeDelete() to simplify delete operations and reduce stack/heap usage while balancing b-tree structures. (CVS 6768) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
092b276e7d89bbfa3d8637a90ee5d458 |
User & Date: | danielk1977 2009-06-16 16:50:22.000 |
Context
2009-06-16
| ||
17:49 | Changes sqlite3_prepare_v2() (and related routines) so that if it fails due to a missing table and the schema is out of date, it retries once before returning SQLITE_SCHEMA. Other changes to prepare.c to facilitate coverage testing. (CVS 6769) (check-in: 256ec3c6af user: drh tags: trunk) | |
16:50 | Changes to balance_nonroot() and BtreeDelete() to simplify delete operations and reduce stack/heap usage while balancing b-tree structures. (CVS 6768) (check-in: 092b276e7d user: danielk1977 tags: trunk) | |
14:15 | Fix a link error and warning that can occur in where.c when compiling under MSVC with SQLITE_OMIT_VIRTUALTABLE defined. Ticket #3914. (CVS 6767) (check-in: 793c93be16 user: shane tags: trunk) | |
Changes
Changes to src/btree.c.
1 2 3 4 5 6 7 8 9 10 11 | /* ** 2004 April 6 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 | /* ** 2004 April 6 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** $Id: btree.c,v 1.630 2009/06/16 16:50:22 danielk1977 Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** See the header comment on "btreeInt.h" for additional information. ** Including a description of file format and an overview of operation. */ #include "btreeInt.h" |
︙ | ︙ | |||
634 635 636 637 638 639 640 | if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; return SQLITE_OK; } #else /* if defined SQLITE_OMIT_AUTOVACUUM */ #define ptrmapPut(w,x,y,z) SQLITE_OK #define ptrmapGet(w,x,y,z) SQLITE_OK | < | 634 635 636 637 638 639 640 641 642 643 644 645 646 647 | if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT; return SQLITE_OK; } #else /* if defined SQLITE_OMIT_AUTOVACUUM */ #define ptrmapPut(w,x,y,z) SQLITE_OK #define ptrmapGet(w,x,y,z) SQLITE_OK #endif /* ** Given a btree page and a cell index (0 means the first cell on ** the page, 1 means the second cell, and so forth) return a pointer ** to the cell content. ** |
︙ | ︙ | |||
826 827 828 829 830 831 832 | ** for the overflow page. */ static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ CellInfo info; assert( pCell!=0 ); sqlite3BtreeParseCellPtr(pPage, pCell, &info); assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); | | < < < < < < < < < < < | 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 | ** for the overflow page. */ static int ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell){ CellInfo info; assert( pCell!=0 ); sqlite3BtreeParseCellPtr(pPage, pCell, &info); assert( (info.nData+(pPage->intKey?0:info.nKey))==info.nPayload ); if( info.iOverflow ){ Pgno ovfl = get4byte(&pCell[info.iOverflow]); return ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno); } return SQLITE_OK; } #endif /* ** Defragment the page given. All Cells are moved to the ** end of the page and all free space is collected into one ** big FreeBlk that occurs in between the header and cell |
︙ | ︙ | |||
4569 4570 4571 4572 4573 4574 4575 | goto end_allocate_page; } if( !searchList || iPage==nearby ){ int noContent; Pgno nPage; *pPgno = iPage; nPage = pagerPagecount(pBt); | | | 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 | goto end_allocate_page; } if( !searchList || iPage==nearby ){ int noContent; Pgno nPage; *pPgno = iPage; nPage = pagerPagecount(pBt); if( iPage>nPage ){ /* Free page off the end of the file */ rc = SQLITE_CORRUPT_BKPT; goto end_allocate_page; } TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" ": %d more free pages\n", *pPgno, closest+1, k, pTrunk->pgno, n-1)); |
︙ | ︙ | |||
5059 5060 5061 5062 5063 5064 5065 | */ static int insertCell( MemPage *pPage, /* Page into which we are copying */ int i, /* New cell becomes the i-th cell of the page */ u8 *pCell, /* Content of the new cell */ int sz, /* Bytes of content in pCell */ u8 *pTemp, /* Temp storage space for pCell, if needed */ | | > > > > > | 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 | */ static int insertCell( MemPage *pPage, /* Page into which we are copying */ int i, /* New cell becomes the i-th cell of the page */ u8 *pCell, /* Content of the new cell */ int sz, /* Bytes of content in pCell */ u8 *pTemp, /* Temp storage space for pCell, if needed */ Pgno iChild /* If non-zero, replace first 4 bytes with this value */ ){ int idx; /* Where to write new cell content in data[] */ int j; /* Loop counter */ int top; /* First byte of content for any cell in data[] */ int end; /* First byte past the last cell pointer in data[] */ int ins; /* Index in data[] where new cell pointer is inserted */ int hdr; /* Offset into data[] of the page header */ int cellOffset; /* Address of first cell pointer in data[] */ u8 *data; /* The content of the whole page */ u8 *ptr; /* Used for moving information around in data[] */ int nSkip = (iChild ? 4 : 0); assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); assert( pPage->nCell<=MX_CELL(pPage->pBt) && MX_CELL(pPage->pBt)<=5460 ); assert( pPage->nOverflow<=ArraySize(pPage->aOvfl) ); assert( sz==cellSizePtr(pPage, pCell) ); assert( sqlite3_mutex_held(pPage->pBt->mutex) ); if( pPage->nOverflow || sz+2>pPage->nFree ){ if( pTemp ){ memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); pCell = pTemp; } if( iChild ){ put4byte(pCell, iChild); } j = pPage->nOverflow++; assert( j<(int)(sizeof(pPage->aOvfl)/sizeof(pPage->aOvfl[0])) ); pPage->aOvfl[j].pCell = pCell; pPage->aOvfl[j].idx = (u16)i; }else{ int rc = sqlite3PagerWrite(pPage->pDbPage); if( rc!=SQLITE_OK ){ |
︙ | ︙ | |||
5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 | assert( end <= get2byte(&data[hdr+5]) ); if (idx+sz > pPage->pBt->usableSize) { return SQLITE_CORRUPT_BKPT; } pPage->nCell++; pPage->nFree -= 2; memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ ptr[0] = ptr[-2]; ptr[1] = ptr[-1]; } put2byte(&data[ins], idx); put2byte(&data[hdr+3], pPage->nCell); #ifndef SQLITE_OMIT_AUTOVACUUM if( pPage->pBt->autoVacuum ){ /* The cell may contain a pointer to an overflow page. If so, write ** the entry for the overflow page into the pointer map. */ | > > > < | < < < < < < | 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 | assert( end <= get2byte(&data[hdr+5]) ); if (idx+sz > pPage->pBt->usableSize) { return SQLITE_CORRUPT_BKPT; } pPage->nCell++; pPage->nFree -= 2; memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); if( iChild ){ put4byte(&data[idx], iChild); } for(j=end-2, ptr=&data[j]; j>ins; j-=2, ptr-=2){ ptr[0] = ptr[-2]; ptr[1] = ptr[-1]; } put2byte(&data[ins], idx); put2byte(&data[hdr+3], pPage->nCell); #ifndef SQLITE_OMIT_AUTOVACUUM if( pPage->pBt->autoVacuum ){ /* The cell may contain a pointer to an overflow page. If so, write ** the entry for the overflow page into the pointer map. */ rc = ptrmapPutOvflPtr(pPage, pCell); } #endif } return SQLITE_OK; } |
︙ | ︙ | |||
5222 5223 5224 5225 5226 5227 5228 | ** cell that will be inserted into pParent. Such a cell consists of a 4 ** byte page number followed by a variable length integer. In other ** words, at most 13 bytes. Hence the pSpace buffer must be at ** least 13 bytes in size. */ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ BtShared *const pBt = pPage->pBt; /* B-Tree Database */ | | > > > > > > > > > > > > > > > > > < | | < < < < < < < < < < < > | > > > > > > > > | | | | | < > | | < | | < | < > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | < > | > | < > > > > > > > > > > > > > > | > > > > | | < | | < | < > > > > > > > > | | > > > > > > | > > > > | > | > > | < | < < < < < | | | < > | > > | > > | < < < > > > > > > > > > > > > > > > > > > > > > > < | | < < < < < < < < < < < < < < < < < < < < < < | < > > > > > > | > > > > | < < < < < < < < < < | | < < < < < < < < | | | | | | | | | < < < < | | < | | > | | | | | | | | | | | < | 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 | ** cell that will be inserted into pParent. Such a cell consists of a 4 ** byte page number followed by a variable length integer. In other ** words, at most 13 bytes. Hence the pSpace buffer must be at ** least 13 bytes in size. */ static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ BtShared *const pBt = pPage->pBt; /* B-Tree Database */ MemPage *pNew; /* Newly allocated page */ int rc; /* Return Code */ Pgno pgnoNew; /* Page number of pNew */ assert( sqlite3_mutex_held(pPage->pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); assert( pPage->nOverflow==1 ); if( pPage->nCell<=0 ) return SQLITE_CORRUPT_BKPT; /* Allocate a new page. This page will become the right-sibling of ** pPage. Make the parent page writable, so that the new divider cell ** may be inserted. If both these operations are successful, proceed. */ rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); if( rc==SQLITE_OK ){ u8 *pOut = &pSpace[4]; u8 *pCell = pPage->aOvfl[0].pCell; u16 szCell = cellSizePtr(pPage, pCell); u8 *pStop; assert( sqlite3PagerIswriteable(pNew->pDbPage) ); assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); assemblePage(pNew, 1, &pCell, &szCell); /* If this is an auto-vacuum database, update the pointer map ** with entries for the new page, and any pointer from the ** cell on the page to an overflow page. If either of these ** operations fails, the return code is set, but the contents ** of the parent page are still manipulated by thh code below. ** That is Ok, at this point the parent page is guaranteed to ** be marked as dirty. Returning an error code will cause a ** rollback, undoing any changes made to the parent page. */ if( ISAUTOVACUUM ){ rc = ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno); if( szCell>pNew->minLocal && rc==SQLITE_OK ){ rc = ptrmapPutOvflPtr(pNew, pCell); } } /* Create a divider cell to insert into pParent. The divider cell ** consists of a 4-byte page number (the page number of pPage) and ** a variable length key value (which must be the same value as the ** largest key on pPage). ** ** To find the largest key value on pPage, first find the right-most ** cell on pPage. The first two fields of this cell are the ** record-length (a variable length integer at most 32-bits in size) ** and the key value (a variable length integer, may have any value). ** The first of the while(...) loops below skips over the record-length ** field. The second while(...) loop copies the key value from the ** cell on pPage into the pSpace buffer. */ pCell = findCell(pPage, pPage->nCell-1); pStop = &pCell[9]; while( (*(pCell++)&0x80) && pCell<pStop ); pStop = &pCell[9]; while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); /* Insert the new divider cell into pParent. */ insertCell(pParent,pParent->nCell,pSpace,(int)(pOut-pSpace),0,pPage->pgno); /* Set the right-child pointer of pParent to point to the new page. */ put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); /* Release the reference to the new page. */ releasePage(pNew); } return rc; } #endif /* SQLITE_OMIT_QUICKBALANCE */ #if 0 /* ** This function does not contribute anything to the operation of SQLite. ** it is sometimes activated temporarily while debugging code responsible ** for setting pointer-map entries. */ static int ptrmapCheckPages(MemPage **apPage, int nPage){ int i, j; for(i=0; i<nPage; i++){ Pgno n; u8 e; MemPage *pPage = apPage[i]; BtShared *pBt = pPage->pBt; assert( pPage->isInit ); for(j=0; j<pPage->nCell; j++){ CellInfo info; u8 *z; z = findCell(pPage, j); sqlite3BtreeParseCellPtr(pPage, z, &info); if( info.iOverflow ){ Pgno ovfl = get4byte(&z[info.iOverflow]); ptrmapGet(pBt, ovfl, &e, &n); assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); } if( !pPage->leaf ){ Pgno child = get4byte(z); ptrmapGet(pBt, child, &e, &n); assert( n==pPage->pgno && e==PTRMAP_BTREE ); } } if( !pPage->leaf ){ Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); ptrmapGet(pBt, child, &e, &n); assert( n==pPage->pgno && e==PTRMAP_BTREE ); } } return 1; } #endif /* ** This routine redistributes cells on the iParentIdx'th child of pParent ** (hereafter "the page") and up to 2 siblings so that all pages have about the ** same amount of free space. Usually a single sibling on either side of the ** page are used in the balancing, though both siblings might come from one ** side if the page is the first or last child of its parent. If the page ** has fewer than 2 siblings (something which can only happen if the page ** is a root page or a child of a root page) then all available siblings ** participate in the balancing. ** ** The number of siblings of the page might be increased or decreased by ** one or two in an effort to keep pages nearly full but not over full. ** ** Note that when this routine is called, some of the cells on the page ** might not actually be stored in MemPage.aData[]. This can happen ** if the page is overfull. This routine ensures that all cells allocated ** to the page and its siblings fit into MemPage.aData[] before returning. ** ** In the course of balancing the page and its siblings, cells may be ** inserted into or removed from the parent page (pParent). Doing so ** may cause the parent page to become overfull or underfull. If this ** happens, it is the responsibility of the caller to invoke the correct ** balancing routine to fix this problem (see the balance() routine). ** ** If this routine fails for any reason, it might leave the database ** in a corrupted state. So if this routine fails, the database should ** be rolled back. ** ** The third argument to this function, aOvflSpace, is a pointer to a ** buffer page-size bytes in size. If, in inserting cells into the parent ** page (pParent), the parent page becomes overfull, this buffer is ** used to store the parents overflow cells. Because this function inserts ** a maximum of four divider cells into the parent page, and the maximum ** size of a cell stored within an internal node is always less than 1/4 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large ** enough for all overflow cells. ** ** If aOvflSpace is set to a null pointer, this function returns ** SQLITE_NOMEM. */ static int balance_nonroot( MemPage *pParent, /* Parent page of siblings being balanced */ int iParentIdx, /* Index of "the page" in pParent */ u8 *aOvflSpace /* page-size bytes of space for parent ovfl */ ){ BtShared *pBt; /* The whole database */ int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nNew = 0; /* Number of pages in apNew[] */ int nOld; /* Number of pages in apOld[] */ int i, j, k; /* Loop counters */ int nxDiv; /* Next divider slot in pParent->aCell[] */ int rc = SQLITE_OK; /* The return code */ int leafCorrection; /* 4 if pPage is a leaf. 0 if not */ int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ int usableSpace; /* Bytes in pPage beyond the header */ int pageFlags; /* Value of pPage->aData[0] */ int subtotal; /* Subtotal of bytes in cells on one page */ int iSpace1 = 0; /* First unused byte of aSpace1[] */ int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ int szScratch; /* Size of scratch memory requested */ MemPage *apOld[NB]; /* pPage and up to two siblings */ MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ u8 *pRight; /* Location in parent of right-sibling pointer */ u8 *apDiv[NB-1]; /* Divider cells in pParent */ int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ int szNew[NB+2]; /* Combined size of cells place on i-th page */ u8 **apCell = 0; /* All cells begin balanced */ u16 *szCell; /* Local size of all cells in apCell[] */ u8 *aSpace1; /* Space for copies of dividers cells */ Pgno pgno; /* Temp var to store a page number in */ pBt = pParent->pBt; assert( sqlite3_mutex_held(pBt->mutex) ); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); /* At this point pParent may have at most one overflow cell. And if ** this overflow cell is present, it must be the cell with ** index iParentIdx. This scenario comes about when this function ** is called (indirectly) from sqlite3BtreeDelete(). */ assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); assert( pParent->nOverflow==0 || pParent->aOvfl[0].idx==iParentIdx ); /* Find the sibling pages to balance. Also locate the cells in pParent ** that divide the siblings. An attempt is made to find NN siblings on ** either side of pPage. More siblings are taken from one side, however, ** if there are fewer than NN siblings on the other side. If pParent ** has NB or fewer children then all children of pParent are taken. ** ** This loop also drops the divider cells from the parent page. This ** way, the remainder of the function does not have to deal with any ** overflow cells in the parent page, as if one existed it has already ** been removed. */ i = pParent->nOverflow + pParent->nCell; if( i<2 ){ nxDiv = 0; nOld = i+1; }else{ nOld = 3; if( iParentIdx==0 ){ nxDiv = 0; }else if( iParentIdx==i ){ nxDiv = i-2; }else{ nxDiv = iParentIdx-1; } i = 2; } if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ pRight = &pParent->aData[pParent->hdrOffset+8]; }else{ pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); } pgno = get4byte(pRight); while( 1 ){ rc = getAndInitPage(pBt, pgno, &apOld[i]); if( rc ){ memset(apOld, 0, i*sizeof(MemPage*)); goto balance_cleanup; } nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; if( (i--)==0 ) break; if( pParent->nOverflow && i+nxDiv==pParent->aOvfl[0].idx ){ apDiv[i] = pParent->aOvfl[0].pCell; pgno = get4byte(apDiv[i]); szNew[i] = cellSizePtr(pParent, apDiv[i]); pParent->nOverflow = 0; }else{ apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); pgno = get4byte(apDiv[i]); szNew[i] = cellSizePtr(pParent, apDiv[i]); /* Drop the cell from the parent page. apDiv[i] still points to ** the cell within the parent, even though it has been dropped. ** This is safe because dropping a cell only overwrites the first ** four bytes of it, and this function does not need the first ** four bytes of the divider cell. So the pointer is safe to use ** later on. */ dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i]); } } /* Make nMaxCells a multiple of 4 in order to preserve 8-byte ** alignment */ nMaxCells = (nMaxCells + 3)&~3; /* ** Allocate space for memory structures */ k = pBt->pageSize + ROUND8(sizeof(MemPage)); szScratch = nMaxCells*sizeof(u8*) /* apCell */ + nMaxCells*sizeof(u16) /* szCell */ + pBt->pageSize /* aSpace1 */ + k*nOld; /* Page copies (apCopy) */ apCell = sqlite3ScratchMalloc( szScratch ); if( apCell==0 || aOvflSpace==0 ){ rc = SQLITE_NOMEM; goto balance_cleanup; } szCell = (u16*)&apCell[nMaxCells]; aSpace1 = (u8*)&szCell[nMaxCells]; assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); /* ** Load pointers to all cells on sibling pages and the divider cells ** into the local apCell[] array. Make copies of the divider cells ** into space obtained from aSpace1[] and remove the the divider Cells ** from pParent. ** ** If the siblings are on leaf pages, then the child pointers of the ** divider cells are stripped from the cells before they are copied ** into aSpace1[]. In this way, all cells in apCell[] are without ** child pointers. If siblings are not leaves, then all cell in ** apCell[] include child pointers. Either way, all cells in apCell[] ** are alike. ** ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. ** leafData: 1 if pPage holds key+data and pParent holds only keys. */ leafCorrection = apOld[0]->leaf*4; leafData = apOld[0]->hasData; for(i=0; i<nOld; i++){ int limit; /* Before doing anything else, take a copy of the i'th original sibling ** The rest of this function will use data from the copies rather ** that the original pages since the original pages will be in the ** process of being overwritten. */ MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; memcpy(pOld, apOld[i], sizeof(MemPage)); pOld->aData = (void*)&pOld[1]; memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); limit = pOld->nCell+pOld->nOverflow; for(j=0; j<limit; j++){ assert( nCell<nMaxCells ); apCell[nCell] = findOverflowCell(pOld, j); szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); nCell++; } if( i<nOld-1 && !leafData){ u16 sz = szNew[i]; u8 *pTemp; assert( nCell<nMaxCells ); szCell[nCell] = sz; pTemp = &aSpace1[iSpace1]; iSpace1 += sz; assert( sz<=pBt->pageSize/4 ); assert( iSpace1<=pBt->pageSize ); memcpy(pTemp, apDiv[i], sz); apCell[nCell] = pTemp+leafCorrection; assert( leafCorrection==0 || leafCorrection==4 ); szCell[nCell] -= (u16)leafCorrection; if( !pOld->leaf ){ assert( leafCorrection==0 ); assert( pOld->hdrOffset==0 ); /* The right pointer of the child page pOld becomes the left ** pointer of the divider cell */ memcpy(apCell[nCell], &pOld->aData[8], 4); }else{ assert( leafCorrection==4 ); if( szCell[nCell]<4 ){ /* Do not allow any cells smaller than 4 bytes. */ szCell[nCell] = 4; } } nCell++; } } /* ** Figure out the number of pages needed to hold all nCell cells. ** Store this number in "k". Also compute szNew[] which is the total ** size of all cells on the i-th page and cntNew[] which is the index |
︙ | ︙ | |||
5598 5599 5600 5601 5602 5603 5604 | goto balance_cleanup; } pageFlags = apOld[0]->aData[0]; for(i=0; i<k; i++){ MemPage *pNew; if( i<nOld ){ pNew = apNew[i] = apOld[i]; | < | > > > > > > > > | 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 | goto balance_cleanup; } pageFlags = apOld[0]->aData[0]; for(i=0; i<k; i++){ MemPage *pNew; if( i<nOld ){ pNew = apNew[i] = apOld[i]; apOld[i] = 0; rc = sqlite3PagerWrite(pNew->pDbPage); nNew++; if( rc ) goto balance_cleanup; }else{ assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); if( rc ) goto balance_cleanup; apNew[i] = pNew; nNew++; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ rc = ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno); if( rc!=SQLITE_OK ){ goto balance_cleanup; } } } } /* Free any old pages that were not reused as new pages. */ while( i<nOld ){ rc = freePage(apOld[i]); |
︙ | ︙ | |||
5637 5638 5639 5640 5641 5642 5643 | ** n is never more than NB (a small constant), that should ** not be a problem. ** ** When NB==3, this one optimization makes the database ** about 25% faster for large insertions and deletions. */ for(i=0; i<k-1; i++){ | | | | | < < | | | | | | | | > > > < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 | ** n is never more than NB (a small constant), that should ** not be a problem. ** ** When NB==3, this one optimization makes the database ** about 25% faster for large insertions and deletions. */ for(i=0; i<k-1; i++){ int minV = apNew[i]->pgno; int minI = i; for(j=i+1; j<k; j++){ if( apNew[j]->pgno<(unsigned)minV ){ minI = j; minV = apNew[j]->pgno; } } if( minI>i ){ int t; MemPage *pT; t = apNew[i]->pgno; pT = apNew[i]; apNew[i] = apNew[minI]; apNew[minI] = pT; } } TRACE(("BALANCE: old: %d %d %d new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", apOld[0]->pgno, nOld>=2 ? apOld[1]->pgno : 0, nOld>=3 ? apOld[2]->pgno : 0, apNew[0]->pgno, szNew[0], nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); assert( sqlite3PagerIswriteable(pParent->pDbPage) ); put4byte(pRight, apNew[nNew-1]->pgno); /* ** Evenly distribute the data in apCell[] across the new pages. ** Insert divider cells into pParent as necessary. */ j = 0; for(i=0; i<nNew; i++){ /* Assemble the new sibling page. */ MemPage *pNew = apNew[i]; assert( j<nMaxCells ); zeroPage(pNew, pageFlags); assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); assert( pNew->nOverflow==0 ); j = cntNew[i]; /* If the sibling page assembled above was not the right-most sibling, ** insert a divider cell into the parent page. */ if( i<nNew-1 && j<nCell ){ u8 *pCell; u8 *pTemp; int sz; assert( j<nMaxCells ); pCell = apCell[j]; sz = szCell[j] + leafCorrection; pTemp = &aOvflSpace[iOvflSpace]; if( !pNew->leaf ){ memcpy(&pNew->aData[8], pCell, 4); }else if( leafData ){ /* If the tree is a leaf-data tree, and the siblings are leaves, ** then there is no divider cell in apCell[]. Instead, the divider ** cell consists of the integer key for the right-most cell of ** the sibling-page assembled above only. */ CellInfo info; j--; sqlite3BtreeParseCellPtr(pNew, apCell[j], &info); pCell = pTemp; sz = 4 + putVarint(&pCell[4], info.nKey); pTemp = 0; }else{ pCell -= 4; /* Obscure case for non-leaf-data trees: If the cell at pCell was ** previously stored on a leaf node, and its reported size was 4 ** bytes, then it may actually be smaller than this ** (see sqlite3BtreeParseCellPtr(), 4 bytes is the minimum size of |
︙ | ︙ | |||
5758 5759 5760 5761 5762 5763 5764 | assert(leafCorrection==4); sz = cellSizePtr(pParent, pCell); } } iOvflSpace += sz; assert( sz<=pBt->pageSize/4 ); assert( iOvflSpace<=pBt->pageSize ); | | < < < < < < < < < < < < < < < < < < < < < < < | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | < < < < < < | < | > > > > > > > > > | > > > > > > > > > > | > > > > > > > > > > > > > | > > > > > > | > > > > > > > > > | < < | | 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 | assert(leafCorrection==4); sz = cellSizePtr(pParent, pCell); } } iOvflSpace += sz; assert( sz<=pBt->pageSize/4 ); assert( iOvflSpace<=pBt->pageSize ); rc = insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno); if( rc!=SQLITE_OK ) goto balance_cleanup; assert( sqlite3PagerIswriteable(pParent->pDbPage) ); j++; nxDiv++; } } assert( j==nCell ); assert( nOld>0 ); assert( nNew>0 ); if( (pageFlags & PTF_LEAF)==0 ){ u8 *zChild = &apCopy[nOld-1]->aData[8]; memcpy(&apNew[nNew-1]->aData[8], zChild, 4); } /* Fix the pointer-map entries for all the cells that were shifted around. ** There are several different types of pointer-map entries that need to ** be dealt with by this routine. Some of these have been set already, but ** many have not. The following is a summary: ** ** 1) The entries associated with new sibling pages that were not ** siblings when this function was called. These have already ** been set. We don't need to worry about old siblings that were ** moved to the free-list - the freePage() code has taken care ** of those. ** ** 2) The pointer-map entries associated with the first overflow ** page in any overflow chains used by new divider cells. These ** have also already been taken care of by the insertCell() code. ** ** 3) If the sibling pages are not leaves, then the child pages of ** cells stored on the sibling pages may need to be updated. ** ** 4) If the sibling pages are not internal intkey nodes, then any ** overflow pages used by these cells may need to be updated ** (internal intkey nodes never contain pointers to overflow pages). ** ** 5) If the sibling pages are not leaves, then the pointer-map ** entries for the right-child pages of each sibling may need ** to be updated. ** ** Cases 1 and 2 are dealt with above by other code. The following ** block deals with cases 3 and 4. Since setting a pointer map entry ** is a relatively expensive operation, this code only sets pointer ** map entries for child or overflow pages that have actually moved ** between pages. */ if( ISAUTOVACUUM ){ MemPage *pNew = apNew[0]; MemPage *pOld = apCopy[0]; int nOverflow = pOld->nOverflow; int iNextOld = pOld->nCell + nOverflow; int iOverflow = (nOverflow ? pOld->aOvfl[0].idx : -1); j = 0; /* Current 'old' sibling page */ k = 0; /* Current 'new' sibling page */ for(i=0; i<nCell && rc==SQLITE_OK; i++){ int isDivider = 0; while( i==iNextOld ){ /* Cell i is the cell immediately following the last cell on old ** sibling page j. If the siblings are not leaf pages of an ** intkey b-tree, then cell i was a divider cell. */ pOld = apCopy[++j]; iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; if( pOld->nOverflow ){ nOverflow = pOld->nOverflow; iOverflow = i + !leafData + pOld->aOvfl[0].idx; } isDivider = !leafData; } assert(nOverflow>0 || iOverflow<i ); assert(nOverflow<2 || pOld->aOvfl[0].idx==pOld->aOvfl[1].idx-1); assert(nOverflow<3 || pOld->aOvfl[1].idx==pOld->aOvfl[2].idx-1); if( i==iOverflow ){ isDivider = 1; if( (--nOverflow)>0 ){ iOverflow++; } } if( i==cntNew[k] ){ /* Cell i is the cell immediately following the last cell on new ** sibling page k. If the siblings are not leaf pages of an ** intkey b-tree, then cell i is a divider cell. */ pNew = apNew[++k]; if( !leafData ) continue; } assert( rc==SQLITE_OK ); assert( j<nOld ); assert( k<nNew ); /* If the cell was originally divider cell (and is not now) or ** an overflow cell, or if the cell was located on a different sibling ** page before the balancing, then the pointer map entries associated ** with any child or overflow pages need to be updated. */ if( isDivider || pOld->pgno!=pNew->pgno ){ if( !leafCorrection ){ rc = ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno); } if( szCell[i]>pNew->minLocal && rc==SQLITE_OK ){ rc = ptrmapPutOvflPtr(pNew, apCell[i]); } } } if( !leafCorrection ){ for(i=0; rc==SQLITE_OK && i<nNew; i++){ rc = ptrmapPut( pBt, get4byte(&apNew[i]->aData[8]), PTRMAP_BTREE, apNew[i]->pgno); } } #if 0 /* The ptrmapCheckPages() contains assert() statements that verify that ** all pointer map pages are set correctly. This is helpful while ** debugging. This is usually disabled because a corrupt database may ** cause an assert() statement to fail. */ ptrmapCheckPages(apNew, nNew); ptrmapCheckPages(&pParent, 1); #endif } assert( pParent->isInit ); TRACE(("BALANCE: finished with %d: old=%d new=%d cells=%d\n", pPage->pgno, nOld, nNew, nCell)); /* ** Cleanup before returning. */ balance_cleanup: sqlite3ScratchFree(apCell); for(i=0; i<nOld; i++){ releasePage(apOld[i]); |
︙ | ︙ | |||
6119 6120 6121 6122 6123 6124 6125 | /* If pFree is not NULL, it points to the pSpace buffer used ** by a previous call to balance_nonroot(). Its contents are ** now stored either on real database pages or within the ** new pSpace buffer, so it may be safely freed here. */ sqlite3PageFree(pFree); } | | | | | 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 | /* If pFree is not NULL, it points to the pSpace buffer used ** by a previous call to balance_nonroot(). Its contents are ** now stored either on real database pages or within the ** new pSpace buffer, so it may be safely freed here. */ sqlite3PageFree(pFree); } /* The pSpace buffer will be freed after the next call to ** balance_nonroot(), or just before this function returns, whichever ** comes first. */ pFree = pSpace; VVA_ONLY( pCur->pagesShuffled = 1 ); } } pPage->nOverflow = 0; |
︙ | ︙ | |||
6363 6364 6365 6366 6367 6368 6369 | } /* ** Delete the entry that the cursor is pointing to. The cursor ** is left pointing at a arbitrary location. */ int sqlite3BtreeDelete(BtCursor *pCur){ | < < < < < | > > > > > < < | < | < < | < | < < | < < < < < < < | < | > | < < < < > | < | < < | < < | | > > > > > > < < < < < < < < < < < < < < | | < > > < < | < | < > | > | < < < < < < < | < | | < | > | < < < < < < < < < < < < < < < < | < < < < > > | | < > > > > | | < < < < < < < | | > | > > | < | < > > | | | < < < < < | | | | | | | | < | | > | | | | < < < | < | | < < < < < < | < < < < < < < | | < < < < < < < | | | | 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 | } /* ** Delete the entry that the cursor is pointing to. The cursor ** is left pointing at a arbitrary location. */ int sqlite3BtreeDelete(BtCursor *pCur){ Btree *p = pCur->pBtree; BtShared *pBt = p->pBt; int rc; /* Return code */ MemPage *pPage; /* Page to delete cell from */ unsigned char *pCell; /* Pointer to cell to delete */ int iCellIdx; /* Index of cell to delete */ int iCellDepth; /* Depth of node containing pCell */ assert( cursorHoldsMutex(pCur) ); assert( pBt->inTransaction==TRANS_WRITE ); assert( !pBt->readOnly ); assert( pCur->wrFlag ); if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) || NEVER(pCur->eState!=CURSOR_VALID) ){ return SQLITE_ERROR; /* Something has gone awry. */ } rc = checkForReadConflicts(p, pCur->pgnoRoot, pCur, pCur->info.nKey); if( rc!=SQLITE_OK ){ assert( rc==SQLITE_LOCKED_SHAREDCACHE ); return rc; /* The table pCur points to has a read lock */ } iCellDepth = pCur->iPage; iCellIdx = pCur->aiIdx[iCellDepth]; pPage = pCur->apPage[iCellDepth]; pCell = findCell(pPage, iCellIdx); /* If the page containing the entry to delete is not a leaf page, move ** the cursor to the largest entry in the tree that is smaller than ** the entry being deleted. This cell will replace the cell being deleted ** from the internal node. The 'previous' entry is used for this instead ** of the 'next' entry, as the previous entry is always a part of the ** sub-tree headed by the child page of the cell being deleted. This makes ** balancing the tree following the delete operation easier. */ if( !pPage->leaf ){ int notUsed; if( SQLITE_OK!=(rc = sqlite3BtreePrevious(pCur, ¬Used)) ){ return rc; } } /* Save the positions of any other cursors open on this table before ** making any modifications. Make the page containing the entry to be ** deleted writable. Then free any overflow pages associated with the ** entry and finally remove the cell itself from within the page. */ if( SQLITE_OK!=(rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur)) || SQLITE_OK!=(rc = sqlite3PagerWrite(pPage->pDbPage)) || SQLITE_OK!=(rc = clearCell(pPage, pCell)) || SQLITE_OK!=(rc = dropCell(pPage, iCellIdx, cellSizePtr(pPage, pCell))) ){ return rc; } /* If the cell deleted was not located on a leaf page, then the cursor ** is currently pointing to the largest entry in the sub-tree headed ** by the child-page of the cell that was just deleted from an internal ** node. The cell from the leaf node needs to be moved to the internal ** node to replace the deleted cell. */ if( !pPage->leaf ){ MemPage *pLeaf = pCur->apPage[pCur->iPage]; int nCell; Pgno n = pCur->apPage[iCellDepth+1]->pgno; unsigned char *pTmp; pCell = findCell(pLeaf, pLeaf->nCell-1); nCell = cellSizePtr(pLeaf, pCell); assert( MX_CELL_SIZE(pBt)>=nCell ); allocateTempSpace(pBt); pTmp = pBt->pTmpSpace; if( SQLITE_OK!=(rc = sqlite3PagerWrite(pLeaf->pDbPage)) || SQLITE_OK!=(rc = insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n)) || SQLITE_OK!=(rc = dropCell(pLeaf, pLeaf->nCell-1, nCell)) ){ return rc; } } /* Balance the tree. If the entry deleted was located on a leaf page, ** then the cursor still points to that page. In this case the first ** call to balance() repairs the tree, and the if(...) condition is ** never true. ** ** Otherwise, if the entry deleted was on an internal node page, then ** pCur is pointing to the leaf page from which a cell was removed to ** replace the cell deleted from the internal node. This is slightly ** tricky as the leaf node may be underfull, and the internal node may ** be either under or overfull. In this case run the balancing algorithm ** on the leaf node first. If the balance proceeds far enough up the ** tree that we can be sure that any problem in the internal node has ** been corrected, so be it. Otherwise, after balancing the leaf node, ** walk the cursor up the tree to the internal node and balance it as ** well. */ rc = balance(pCur); if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ while( pCur->iPage>iCellDepth ){ releasePage(pCur->apPage[pCur->iPage--]); } rc = balance(pCur); } if( rc==SQLITE_OK ){ moveToRoot(pCur); } return rc; } /* |
︙ | ︙ |