SQLite

Check-in [29304499ea]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change the balance_nonroot() routine to reduce the amount of memcpy work that takes place. This is a work in progress.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | defrag-opt
Files: files | file ages | folders
SHA1: 29304499ea4b72dbb6701e10cc19b5d41f7e5ac9
User & Date: dan 2014-10-09 19:35:37.452
Context
2014-10-11
20:00
Attempt to further reduce memcpy() in balance_nonroot(). (check-in: fec849dcca user: dan tags: defrag-opt)
2014-10-09
19:35
Change the balance_nonroot() routine to reduce the amount of memcpy work that takes place. This is a work in progress. (check-in: 29304499ea user: dan tags: defrag-opt)
2014-09-27
05:00
Reduce the amount of memcpy() required by defragmentPage(). (check-in: 3edab9957c user: drh tags: defrag-opt)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btree.c.
5973
5974
5975
5976
5977
5978
5979











































5980
5981
5982
5983
5984
5985
5986
  }
  put2byte(&data[hdr+3], nCell);
  put2byte(&data[hdr+5], cellbody);
  pPage->nFree -= (nCell*2 + nUsable - cellbody);
  pPage->nCell = (u16)nCell;
}












































/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







5973
5974
5975
5976
5977
5978
5979
5980
5981
5982
5983
5984
5985
5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
  }
  put2byte(&data[hdr+3], nCell);
  put2byte(&data[hdr+5], cellbody);
  pPage->nFree -= (nCell*2 + nUsable - cellbody);
  pPage->nCell = (u16)nCell;
}


static void rebuildPage(
  MemPage *pPg,                   /* Edit this page */
  int nRemove,                    /* Cells to remove from start of page */
  int nCell,                      /* Final number of cells on page */
  u8 **apCell,                    /* Array of nCell final cells */
  u16 *szCell                     /* Array of nCell cell sizes */
){
  const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
  u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
  const int usableSize = pPg->pBt->usableSize;
  u8 * const pEnd = &aData[usableSize];
  int i;
  u8 *pCellptr = pPg->aCellIdx;
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  u8 *pData;

  i = get2byte(&aData[hdr+5]);
  memcpy(&pTmp[i], &aData[i], usableSize - i);
  pData = &aData[usableSize];

  for(i=0; i<nCell; i++){
    u8 *pCell = apCell[i];
    if( pCell>aData && pCell<pEnd ){
      pCell = &pTmp[pCell - aData];
    }
    pData -= szCell[i];
    memcpy(pData, pCell, szCell[i]);
    put2byte(pCellptr, (pData - aData));
    pCellptr += 2;
    assert( szCell[i]==cellSizePtr(pPg, pCell) );
  }

  pPg->nFree = (pData - pCellptr);
  pPg->nCell = nCell;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+1], 0);
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);
  aData[hdr+7] = 0x00;
}

/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
    releasePage(pNew);
  }

  return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */

#if 0
/*
** This function does not contribute anything to the operation of SQLite.
** it is sometimes activated temporarily while debugging code responsible 
** for setting pointer-map entries.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int i, j;







|







6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
    releasePage(pNew);
  }

  return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */

#if 1
/*
** This function does not contribute anything to the operation of SQLite.
** it is sometimes activated temporarily while debugging code responsible 
** for setting pointer-map entries.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int i, j;
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278







6279
6280
6281
6282
6283
6284
6285
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */
  MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  u8 *pRight;                  /* Location in parent of right-sibling pointer */
  u8 *apDiv[NB-1];             /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aSpace1;                 /* Space for copies of dividers cells */
  Pgno pgno;                   /* Temp var to store a page number in */








  pBt = pParent->pBt;
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );

#if 0
  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
#endif







<










>
>
>
>
>
>
>







6304
6305
6306
6307
6308
6309
6310

6311
6312
6313
6314
6315
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */

  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  u8 *pRight;                  /* Location in parent of right-sibling pointer */
  u8 *apDiv[NB-1];             /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aSpace1;                 /* Space for copies of dividers cells */
  Pgno pgno;                   /* Temp var to store a page number in */

  int aShiftLeft[NB+2];
  int aShiftRight[NB+2];
  u8 abDone[NB+2];
  Pgno aPgno[NB+2];
  u16 aPgFlags[NB+2];

  memset(abDone, 0, sizeof(abDone));
  pBt = pParent->pBt;
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );

#if 0
  TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
#endif
6380
6381
6382
6383
6384
6385
6386
6387
6388
6389
6390
6391
6392
6393
6394
6395
6396
6397
6398
6399
6400
6401
6402
6403
6404
6405
6406
6407
6408
6409
6410
6411
6412
6413
6414
6415
6416
6417
6418
6419
6420
6421
6422
6423
6424
6425
6426
6427
6428
6429
6430
6431
6432
6433
6434
6435
6436
6437
  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures
  */
  k = pBt->pageSize + ROUND8(sizeof(MemPage));
  szScratch =
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */
     + pBt->pageSize                               /* aSpace1 */
     + k*nOld;                                     /* Page copies (apCopy) */
  apCell = sqlite3ScratchMalloc( szScratch ); 
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aSpace1 = (u8*)&szCell[nMaxCells];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained from aSpace1[] and remove the divider cells
  ** from pParent.
  **
  ** If the siblings are on leaf pages, then the child pointers of the
  ** divider cells are stripped from the cells before they are copied
  ** into aSpace1[].  In this way, all cells in apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cell in
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  leafCorrection = apOld[0]->leaf*4;
  leafData = apOld[0]->intKeyLeaf;
  for(i=0; i<nOld; i++){
    int limit;
    
    /* Before doing anything else, take a copy of the i'th original sibling
    ** The rest of this function will use data from the copies rather
    ** that the original pages since the original pages will be in the
    ** process of being overwritten.  */
    MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i];
    memcpy(pOld, apOld[i], sizeof(MemPage));
    pOld->aData = (void*)&pOld[1];
    memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize);

    limit = pOld->nCell+pOld->nOverflow;
    if( pOld->nOverflow>0 ){
      for(j=0; j<limit; j++){
        assert( nCell<nMaxCells );
        apCell[nCell] = findOverflowCell(pOld, j);
        szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);







<



<
|












|
|















<
<
<
<
<
|
<
<
<







6429
6430
6431
6432
6433
6434
6435

6436
6437
6438

6439
6440
6441
6442
6443
6444
6445
6446
6447
6448
6449
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
6468





6469



6470
6471
6472
6473
6474
6475
6476
  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures
  */

  szScratch =
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */

     + pBt->pageSize;                              /* aSpace1 */
  apCell = sqlite3ScratchMalloc( szScratch ); 
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aSpace1 = (u8*)&szCell[nMaxCells];
  assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained from aSpace1[]. The divider cells have already
  ** been removed from pParent.
  **
  ** If the siblings are on leaf pages, then the child pointers of the
  ** divider cells are stripped from the cells before they are copied
  ** into aSpace1[].  In this way, all cells in apCell[] are without
  ** child pointers.  If siblings are not leaves, then all cell in
  ** apCell[] include child pointers.  Either way, all cells in apCell[]
  ** are alike.
  **
  ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
  **       leafData:  1 if pPage holds key+data and pParent holds only keys.
  */
  leafCorrection = apOld[0]->leaf*4;
  leafData = apOld[0]->intKeyLeaf;
  for(i=0; i<nOld; i++){
    int limit;





    MemPage *pOld = apOld[i];




    limit = pOld->nCell+pOld->nOverflow;
    if( pOld->nOverflow>0 ){
      for(j=0; j<limit; j++){
        assert( nCell<nMaxCells );
        apCell[nCell] = findOverflowCell(pOld, j);
        szCell[nCell] = cellSizePtr(pOld, apCell[nCell]);
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
  ** file is corrupt.  The corruption will be detected and reported later
  ** in this procedure so there is no need to act upon it now.
  */
#if 0
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
#endif

  TRACE(("BALANCE: old: %d %d %d  ",
    apOld[0]->pgno, 
    nOld>=2 ? apOld[1]->pgno : 0,
    nOld>=3 ? apOld[2]->pgno : 0
  ));

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  */
  if( apOld[0]->pgno<=1 ){
    rc = SQLITE_CORRUPT_BKPT;







|
|
|
|







6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
  ** file is corrupt.  The corruption will be detected and reported later
  ** in this procedure so there is no need to act upon it now.
  */
#if 0
  assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) );
#endif

  TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
    apOld[0]->pgno, apOld[0]->nCell,
    nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
    nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
  ));

  /*
  ** Allocate k new pages.  Reuse old pages where possible.
  */
  if( apOld[0]->pgno<=1 ){
    rc = SQLITE_CORRUPT_BKPT;
6578
6579
6580
6581
6582
6583
6584

6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623





6624
6625
6626

6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638


6639
6640

6641

6642

6643


6644
6645
6646
6647















6648







6649


6650












6651





6652


6653
6654







6655


6656



6657
6658


6659



6660

6661










6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716















6717
















6718
6719

6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
      if( rc ) goto balance_cleanup;

      apNew[i] = pNew;
      nNew++;

      /* Set the pointer-map entry for the new sibling page. */
      if( ISAUTOVACUUM ){
        ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
      }
    }
  }

  /* Free any old pages that were not reused as new pages.
  */
  while( i<nOld ){
    freePage(apOld[i], &rc);
    if( rc ) goto balance_cleanup;
    releasePage(apOld[i]);
    apOld[i] = 0;
    i++;
  }

  /*
  ** Put the new pages in ascending order.  This helps to
  ** keep entries in the disk file in order so that a scan
  ** of the table is a linear scan through the file.  That
  ** in turn helps the operating system to deliver pages
  ** from the disk more rapidly.
  **
  ** An O(n^2) insertion sort algorithm is used, but since
  ** n is never more than NB (a small constant), that should
  ** not be a problem.
  **
  ** When NB==3, this one optimization makes the database
  ** about 25% faster for large insertions and deletions.
  */
  for(i=0; i<k-1; i++){
    int minV = apNew[i]->pgno;





    int minI = i;
    for(j=i+1; j<k; j++){
      if( apNew[j]->pgno<(unsigned)minV ){

        minI = j;
        minV = apNew[j]->pgno;
      }
    }
    if( minI>i ){
      MemPage *pT;
      pT = apNew[i];
      apNew[i] = apNew[minI];
      apNew[minI] = pT;
    }
  }
  TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n",


    apNew[0]->pgno, szNew[0],
    nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,

    nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,

    nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,

    nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0));



  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  put4byte(pRight, apNew[nNew-1]->pgno);
















  /*







  ** Evenly distribute the data in apCell[] across the new pages.


  ** Insert divider cells into pParent as necessary.












  */





  j = 0;


  for(i=0; i<nNew; i++){
    /* Assemble the new sibling page. */







    MemPage *pNew = apNew[i];


    assert( j<nMaxCells );



    zeroPage(pNew, pageFlags);
    assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]);


    assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) );



    assert( pNew->nOverflow==0 );












    j = cntNew[i];

    /* If the sibling page assembled above was not the right-most sibling,
    ** insert a divider cell into the parent page.
    */
    assert( i<nNew-1 || j==nCell );
    if( j<nCell ){
      u8 *pCell;
      u8 *pTemp;
      int sz;

      assert( j<nMaxCells );
      pCell = apCell[j];
      sz = szCell[j] + leafCorrection;
      pTemp = &aOvflSpace[iOvflSpace];
      if( !pNew->leaf ){
        memcpy(&pNew->aData[8], pCell, 4);
      }else if( leafData ){
        /* If the tree is a leaf-data tree, and the siblings are leaves, 
        ** then there is no divider cell in apCell[]. Instead, the divider 
        ** cell consists of the integer key for the right-most cell of 
        ** the sibling-page assembled above only.
        */
        CellInfo info;
        j--;
        btreeParseCellPtr(pNew, apCell[j], &info);
        pCell = pTemp;
        sz = 4 + putVarint(&pCell[4], info.nKey);
        pTemp = 0;
      }else{
        pCell -= 4;
        /* Obscure case for non-leaf-data trees: If the cell at pCell was
        ** previously stored on a leaf node, and its reported size was 4
        ** bytes, then it may actually be smaller than this 
        ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
        ** any cell). But it is important to pass the correct size to 
        ** insertCell(), so reparse the cell now.
        **
        ** Note that this can never happen in an SQLite data file, as all
        ** cells are at least 4 bytes. It only happens in b-trees used
        ** to evaluate "IN (SELECT ...)" and similar clauses.
        */
        if( szCell[j]==4 ){
          assert(leafCorrection==4);
          sz = cellSizePtr(pParent, pCell);
        }
      }
      iOvflSpace += sz;
      assert( sz<=pBt->maxLocal+23 );
      assert( iOvflSpace <= (int)pBt->pageSize );
      insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc);
      if( rc!=SQLITE_OK ) goto balance_cleanup;
      assert( sqlite3PagerIswriteable(pParent->pDbPage) );

      j++;















      nxDiv++;
















    }
  }

  assert( j==nCell );
  assert( nOld>0 );
  assert( nNew>0 );
  if( (pageFlags & PTF_LEAF)==0 ){
    u8 *zChild = &apCopy[nOld-1]->aData[8];
    memcpy(&apNew[nNew-1]->aData[8], zChild, 4);
  }

  if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
    /* The root page of the b-tree now contains no cells. The only sibling
    ** page is the right-child of the parent. Copy the contents of the
    ** child page into the parent, decreasing the overall height of the
    ** b-tree structure by one. This is described as the "balance-shallower"
    ** sub-algorithm in some documentation.







>













<
<
<
<
<
<
<
<
<
<

|
|
|
|
<

|
|
<

|
|

|
|
>
>
>
>
>
|
|
|
>
|
|


|
<
|
|
|


|
>
>
|

>

>

>
|
>
>




>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
|
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
>
>
|
>
>
|
<
>
>
>
>
>
>
>
|
>
>
|
>
>
>
|
<
>
>
|
>
>
>
|
>
|
>
>
>
>
>
>
>
>
>
>


<
<
<
<
<
<
<
<
<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


>
|


<
<
<
<







6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637










6638
6639
6640
6641
6642

6643
6644
6645

6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665

6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709
6710
6711
6712
6713
6714
6715
6716
6717
6718
6719
6720
6721
6722
6723
6724
6725
6726
6727
6728
6729
6730
6731
6732
6733
6734
6735
6736

6737
6738
6739
6740
6741
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751

6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772









6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854




6855
6856
6857
6858
6859
6860
6861
      rc = sqlite3PagerWrite(pNew->pDbPage);
      nNew++;
      if( rc ) goto balance_cleanup;
    }else{
      assert( i>0 );
      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
      if( rc ) goto balance_cleanup;
      zeroPage(pNew, pageFlags);
      apNew[i] = pNew;
      nNew++;

      /* Set the pointer-map entry for the new sibling page. */
      if( ISAUTOVACUUM ){
        ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
        if( rc!=SQLITE_OK ){
          goto balance_cleanup;
        }
      }
    }
  }











  /*
  ** Reassign page numbers so that the new pages are in ascending order. 
  ** This helps to keep entries in the disk file in order so that a scan
  ** of the table is closer to a linear scan through the file. That in turn 
  ** helps the operating system to deliver pages from the disk more rapidly.

  **
  ** An O(n^2) insertion sort algorithm is used, but since n is never more 
  ** than (NB+2) (a small constant), that should not be a problem.

  **
  ** When NB==3, this one optimization makes the database about 25% faster 
  ** for large insertions and deletions.
  */
  for(i=0; i<nNew; i++){
    aPgno[i] = apNew[i]->pgno;
    aPgFlags[i] = apNew[i]->pDbPage->flags;
  }
  for(i=0; i<nNew; i++){
    Pgno iGt = (i==0 ? 0 : apNew[i-1]->pgno);
    Pgno iMin = 0;
    u16 flags = 0;
    for(j=0; j<nNew; j++){
      Pgno iPgno = aPgno[j];
      if( iPgno>iGt && (iMin==0 || iPgno<iMin) ){
        iMin = iPgno;
        flags = aPgFlags[j];
      }
    }
    if( apNew[i]->pgno!=iMin ){

      apNew[i]->pDbPage->flags = flags;
      sqlite3PagerRekey(apNew[i]->pDbPage, iMin);
      apNew[i]->pgno = iMin;
    }
  }

  TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
         "%d(%d nc=%d) %d(%d nc=%d)\n",
    apNew[0]->pgno, szNew[0], cntNew[0],
    nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
    nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
    nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
    nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
    nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
    nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
    nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
    nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
  ));

  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  put4byte(pRight, apNew[nNew-1]->pgno);

  j = 0;
  for(i=0; i<nNew; i++){
    /* At this point, "j" is the apCell[] index of the first cell currently
    ** stored on page apNew[i]. Or, if apNew[i] was not one of the original 
    ** sibling pages, "j" should be set to nCell. Variable iFirst is set
    ** to the apCell[] index of the first cell that will appear on the
    ** page following this balancing operation.  */
    int iFirst = (i==0 ? 0 : cntNew[i-1] + !leafData);     /* new first cell */
    assert( i<nOld || j==nCell );
    aShiftLeft[i] = j - iFirst;
    j += apNew[i]->nCell + apNew[i]->nOverflow;
    aShiftRight[i] = cntNew[i] - j;
    assert( i!=nOld-1 || j==nCell );
    if( j<nCell ) j += !leafData;
  }

  /* If the sibling pages are not leaves, ensure that the right-child pointer
  ** of the right-most new sibling page is set to the value that was 
  ** originally in the same field of the right-most old sibling page. */
  if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
    MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
    memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
  }

  /* Make any required updates to pointer map entries associated with 
  ** cells stored on sibling pages following the balance operation. Pointer
  ** map entries associated with divider cells are set by the insertCell()
  ** routine. The associated pointer map entries are:
  **
  **   a) if the cell contains a reference to an overflow chain, the
  **      entry associated with the first page in the overflow chain, and
  **
  **   b) if the sibling pages are not leaves, the child page associated
  **      with the cell.
  **
  ** If the sibling pages are not leaves, then the pointer map entry 
  ** associated with the right-child of each sibling may also need to be 
  ** updated. This happens below, after the sibling pages have been 
  ** populated, not here.
  */
  if( ISAUTOVACUUM ){
    MemPage *pNew = apNew[0];
    u8 *aOld = pNew->aData;
    int cntOldNext = pNew->nCell + pNew->nOverflow;
    int usableSize = pBt->usableSize;
    int iNew = 0;
    int iOld = 0;

    for(i=0; i<nCell; i++){

      u8 *pCell = apCell[i];
      if( i==cntOldNext ){
        MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
        cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
        aOld = pOld->aData;
      }
      if( i==cntNew[iNew] ){
        pNew = apNew[++iNew];
        if( !leafData ) continue;
      }

      /* Cell pCell is destined for new sibling page pNew. Originally, it
      ** was either part of sibling page iOld (possibly an overflow page), 
      ** or else the divider cell to the left of sibling page iOld. So,
      ** if sibling page iOld had the same page number as pNew, and if

      ** pCell really was a part of sibling page iOld (not a divider or
      ** overflow cell), we can skip updating the pointer map entries.  */
      if( pNew->pgno!=aPgno[iOld] || pCell<aOld || pCell>=&aOld[usableSize] ){
        if( !leafCorrection ){
          ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
        }
        if( szCell[i]>pNew->minLocal ){
          ptrmapPutOvflPtr(pNew, pCell, &rc);
        }
      }
    }
  }

  /* Insert new divider cells into pParent. */
  for(i=0; i<nNew-1; i++){
    u8 *pCell;
    u8 *pTemp;
    int sz;
    MemPage *pNew = apNew[i];
    j = cntNew[i];










    assert( j<nMaxCells );
    pCell = apCell[j];
    sz = szCell[j] + leafCorrection;
    pTemp = &aOvflSpace[iOvflSpace];
    if( !pNew->leaf ){
      memcpy(&pNew->aData[8], pCell, 4);
    }else if( leafData ){
      /* If the tree is a leaf-data tree, and the siblings are leaves, 
      ** then there is no divider cell in apCell[]. Instead, the divider 
      ** cell consists of the integer key for the right-most cell of 
      ** the sibling-page assembled above only.
      */
      CellInfo info;
      j--;
      btreeParseCellPtr(pNew, apCell[j], &info);
      pCell = pTemp;
      sz = 4 + putVarint(&pCell[4], info.nKey);
      pTemp = 0;
    }else{
      pCell -= 4;
      /* Obscure case for non-leaf-data trees: If the cell at pCell was
      ** previously stored on a leaf node, and its reported size was 4
      ** bytes, then it may actually be smaller than this 
      ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
      ** any cell). But it is important to pass the correct size to 
      ** insertCell(), so reparse the cell now.
      **
      ** Note that this can never happen in an SQLite data file, as all
      ** cells are at least 4 bytes. It only happens in b-trees used
      ** to evaluate "IN (SELECT ...)" and similar clauses.
      */
      if( szCell[j]==4 ){
        assert(leafCorrection==4);
        sz = cellSizePtr(pParent, pCell);
      }
    }
    iOvflSpace += sz;
    assert( sz<=pBt->maxLocal+23 );
    assert( iOvflSpace <= (int)pBt->pageSize );
    insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
    if( rc!=SQLITE_OK ) goto balance_cleanup;
    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  }

  /* Now update the actual sibling pages. The order in which they are updated
  ** is important, as this code needs to avoid disrupting any page from which
  ** cells may still to be read. In practice, this means:
  **
  **   1) If the aShiftLeft[] entry is less than 0, it is not safe to 
  **      update the page until the page to the left of the current page
  **      (apNew[i-1]) has already been updated.
  **
  **   2) If the aShiftRight[] entry is less than 0, it is not safe to 
  **      update the page until the page to the right of the current page
  **      (apNew[i+1]) has already been updated.
  **
  ** If neither of the above apply, the page is safe to update.
  */
  assert( aShiftRight[nNew-1]>=0 && aShiftLeft[0]==0 );
  for(i=0; i<nNew*2; i++){
    int iPg = (i>=nNew ? i-nNew : nNew-1-i);
    if( abDone[iPg]==0 
     && (aShiftLeft[iPg]>=0 || abDone[iPg-1])
     && (aShiftRight[iPg]>=0 || abDone[iPg+1])
    ){
      MemPage *pNew = apNew[iPg];
      int iLeft = ((iPg==0) ? 0 : cntNew[iPg-1] + !leafData);
      rebuildPage(pNew, 
          aShiftLeft[iPg] < 0 ? (aShiftLeft[iPg]*-1) : 0,
          cntNew[iPg] - iLeft,
          &apCell[iLeft],
          &szCell[iLeft]
      );
      abDone[iPg] = 1;
      assert( pNew->nOverflow==0 );
      assert( pNew->nCell==(cntNew[iPg] - (iPg==0?0:cntNew[iPg-1]+!leafData)) );
    }
  }
  assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );

  assert( nOld>0 );
  assert( nNew>0 );





  if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
    /* The root page of the b-tree now contains no cells. The only sibling
    ** page is the right-child of the parent. Copy the contents of the
    ** child page into the parent, decreasing the overall height of the
    ** b-tree structure by one. This is described as the "balance-shallower"
    ** sub-algorithm in some documentation.
6742
6743
6744
6745
6746
6747
6748
6749
6750
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760
6761
6762
6763
6764
6765
6766
6767
6768
6769
6770
6771
6772
6773
6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845










6846

6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
    ** image.  */
    assert( nNew==1 );
    assert( apNew[0]->nFree == 
        (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 
    );
    copyNodeContent(apNew[0], pParent, &rc);
    freePage(apNew[0], &rc);
  }else if( ISAUTOVACUUM ){
    /* Fix the pointer-map entries for all the cells that were shifted around. 
    ** There are several different types of pointer-map entries that need to
    ** be dealt with by this routine. Some of these have been set already, but
    ** many have not. The following is a summary:
    **
    **   1) The entries associated with new sibling pages that were not
    **      siblings when this function was called. These have already
    **      been set. We don't need to worry about old siblings that were
    **      moved to the free-list - the freePage() code has taken care
    **      of those.
    **
    **   2) The pointer-map entries associated with the first overflow
    **      page in any overflow chains used by new divider cells. These 
    **      have also already been taken care of by the insertCell() code.
    **
    **   3) If the sibling pages are not leaves, then the child pages of
    **      cells stored on the sibling pages may need to be updated.
    **
    **   4) If the sibling pages are not internal intkey nodes, then any
    **      overflow pages used by these cells may need to be updated
    **      (internal intkey nodes never contain pointers to overflow pages).
    **
    **   5) If the sibling pages are not leaves, then the pointer-map
    **      entries for the right-child pages of each sibling may need
    **      to be updated.
    **
    ** Cases 1 and 2 are dealt with above by other code. The next
    ** block deals with cases 3 and 4 and the one after that, case 5. Since
    ** setting a pointer map entry is a relatively expensive operation, this
    ** code only sets pointer map entries for child or overflow pages that have
    ** actually moved between pages.  */
    MemPage *pNew = apNew[0];
    MemPage *pOld = apCopy[0];
    int nOverflow = pOld->nOverflow;
    int iNextOld = pOld->nCell + nOverflow;
    int iOverflow = (nOverflow ? pOld->aiOvfl[0] : -1);
    j = 0;                             /* Current 'old' sibling page */
    k = 0;                             /* Current 'new' sibling page */
    for(i=0; i<nCell; i++){
      int isDivider = 0;
      while( i==iNextOld ){
        /* Cell i is the cell immediately following the last cell on old
        ** sibling page j. If the siblings are not leaf pages of an
        ** intkey b-tree, then cell i was a divider cell. */
        assert( j+1 < ArraySize(apCopy) );
        assert( j+1 < nOld );
        pOld = apCopy[++j];
        iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow;
        if( pOld->nOverflow ){
          nOverflow = pOld->nOverflow;
          iOverflow = i + !leafData + pOld->aiOvfl[0];
        }
        isDivider = !leafData;  
      }

      assert(nOverflow>0 || iOverflow<i );
      assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1);
      assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1);
      if( i==iOverflow ){
        isDivider = 1;
        if( (--nOverflow)>0 ){
          iOverflow++;
        }
      }

      if( i==cntNew[k] ){
        /* Cell i is the cell immediately following the last cell on new
        ** sibling page k. If the siblings are not leaf pages of an
        ** intkey b-tree, then cell i is a divider cell.  */
        pNew = apNew[++k];
        if( !leafData ) continue;
      }
      assert( j<nOld );
      assert( k<nNew );

      /* If the cell was originally divider cell (and is not now) or
      ** an overflow cell, or if the cell was located on a different sibling
      ** page before the balancing, then the pointer map entries associated
      ** with any child or overflow pages need to be updated.  */
      if( isDivider || pOld->pgno!=pNew->pgno ){
        if( !leafCorrection ){
          ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc);
        }
        if( szCell[i]>pNew->minLocal ){
          ptrmapPutOvflPtr(pNew, apCell[i], &rc);
        }
      }
    }

    if( !leafCorrection ){
      for(i=0; i<nNew; i++){
        u32 key = get4byte(&apNew[i]->aData[8]);
        ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
      }
    }











#if 0

    /* The ptrmapCheckPages() contains assert() statements that verify that
    ** all pointer map pages are set correctly. This is helpful while 
    ** debugging. This is usually disabled because a corrupt database may
    ** cause an assert() statement to fail.  */
    ptrmapCheckPages(apNew, nNew);
    ptrmapCheckPages(&pParent, 1);
#endif
  }

  assert( pParent->isInit );
  TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
          nOld, nNew, nCell));

  /*
  ** Cleanup before returning.
  */
balance_cleanup:
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){







|
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
|
|
|

>
>
>
>
>
>
>
>
>
>
|
>






<

|
<
<
<







6870
6871
6872
6873
6874
6875
6876
6877











6878










































6879









6880

























6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904

6905
6906



6907
6908
6909
6910
6911
6912
6913
    ** image.  */
    assert( nNew==1 );
    assert( apNew[0]->nFree == 
        (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 
    );
    copyNodeContent(apNew[0], pParent, &rc);
    freePage(apNew[0], &rc);
  }else if( ISAUTOVACUUM && !leafCorrection ){











    /* Fix the pointer map entries associated with the right-child of each










































    ** sibling page. All other pointer map entries have already been taken









    ** care of.  */

























    for(i=0; i<nNew; i++){
      u32 key = get4byte(&apNew[i]->aData[8]);
      ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
    }
  }

  assert( pParent->isInit );
  TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
          nOld, nNew, nCell));

  /* Free any old pages that were not reused as new pages.
  */
  for(i=nNew; i<nOld; i++){
    freePage(apOld[i], &rc);
  }

#if 1
  if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
    /* The ptrmapCheckPages() contains assert() statements that verify that
    ** all pointer map pages are set correctly. This is helpful while 
    ** debugging. This is usually disabled because a corrupt database may
    ** cause an assert() statement to fail.  */
    ptrmapCheckPages(apNew, nNew);
    ptrmapCheckPages(&pParent, 1);

  }
#endif




  /*
  ** Cleanup before returning.
  */
balance_cleanup:
  sqlite3ScratchFree(apCell);
  for(i=0; i<nOld; i++){
Changes to src/pager.c.
6831
6832
6833
6834
6835
6836
6837








6838
6839
6840
6841
6842
6843
6844
    pPgHdr->flags |= PGHDR_NEED_SYNC;
    sqlite3PcacheMakeDirty(pPgHdr);
    sqlite3PagerUnrefNotNull(pPgHdr);
  }

  return SQLITE_OK;
}








#endif

/*
** Return a pointer to the data for the specified page.
*/
void *sqlite3PagerGetData(DbPage *pPg){
  assert( pPg->nRef>0 || pPg->pPager->memDb );







>
>
>
>
>
>
>
>







6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
    pPgHdr->flags |= PGHDR_NEED_SYNC;
    sqlite3PcacheMakeDirty(pPgHdr);
    sqlite3PagerUnrefNotNull(pPgHdr);
  }

  return SQLITE_OK;
}

void sqlite3PagerRekey(DbPage *pPage, Pgno iNew){
  PgHdr *pPg = (PgHdr*)pPage;
  assert( pPg->flags & PGHDR_DIRTY );
  assert( !subjRequiresPage(pPg) );
  sqlite3PcacheMove(pPg, iNew);
}

#endif

/*
** Return a pointer to the data for the specified page.
*/
void *sqlite3PagerGetData(DbPage *pPg){
  assert( pPg->nRef>0 || pPg->pPager->memDb );
7230
7231
7232
7233
7234
7235
7236
7237

7238
** is empty, return 0.
*/
int sqlite3PagerWalFramesize(Pager *pPager){
  assert( pPager->eState>=PAGER_READER );
  return sqlite3WalFramesize(pPager->pWal);
}
#endif


#endif /* SQLITE_OMIT_DISKIO */








>

7238
7239
7240
7241
7242
7243
7244
7245
7246
7247
** is empty, return 0.
*/
int sqlite3PagerWalFramesize(Pager *pPager){
  assert( pPager->eState>=PAGER_READER );
  return sqlite3WalFramesize(pPager->pWal);
}
#endif


#endif /* SQLITE_OMIT_DISKIO */
Changes to src/pager.h.
183
184
185
186
187
188
189


190
191
192
193
194
195
196
int sqlite3PagerIsMemdb(Pager*);
void sqlite3PagerCacheStat(Pager *, int, int, int *);
void sqlite3PagerClearCache(Pager *);
int sqlite3SectorSize(sqlite3_file *);

/* Functions used to truncate the database file. */
void sqlite3PagerTruncateImage(Pager*,Pgno);



#if defined(SQLITE_HAS_CODEC) && !defined(SQLITE_OMIT_WAL)
void *sqlite3PagerCodec(DbPage *);
#endif

/* Functions to support testing and debugging. */
#if !defined(NDEBUG) || defined(SQLITE_TEST)







>
>







183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
int sqlite3PagerIsMemdb(Pager*);
void sqlite3PagerCacheStat(Pager *, int, int, int *);
void sqlite3PagerClearCache(Pager *);
int sqlite3SectorSize(sqlite3_file *);

/* Functions used to truncate the database file. */
void sqlite3PagerTruncateImage(Pager*,Pgno);

void sqlite3PagerRekey(DbPage*, Pgno);

#if defined(SQLITE_HAS_CODEC) && !defined(SQLITE_OMIT_WAL)
void *sqlite3PagerCodec(DbPage *);
#endif

/* Functions to support testing and debugging. */
#if !defined(NDEBUG) || defined(SQLITE_TEST)