SQLite

Check-in [1687287151]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change the way that balance_nonroot() partitions cells between the sibling pages such that a scan of the cell size array is not required.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | btree-opt2
Files: files | file ages | folders
SHA1: 168728715156d756ac8c0df45710d054eee027ec
User & Date: drh 2015-06-22 20:02:04.079
Context
2015-06-23
02:37
Avoid computing cell sizes in balance_nonroot() until they are really needed. This gives an overall 1.7% performance gain for about 1000 extra bytes of code space. (check-in: 43844537e8 user: drh tags: btree-opt2)
2015-06-22
20:02
Change the way that balance_nonroot() partitions cells between the sibling pages such that a scan of the cell size array is not required. (check-in: 1687287151 user: drh tags: btree-opt2)
2015-06-20
14:11
Update the fuzztest data using the latest test vectors discovered by AFL. (check-in: b97f9cf73e user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btree.c.
3202
3203
3204
3205
3206
3207
3208

3209
3210
3211
3212
3213
3214
3215
    u8 *pCell = findCell(pPage, i);

    ptrmapPutOvflPtr(pPage, pCell, &rc);

    if( !pPage->leaf ){
      Pgno childPgno = get4byte(pCell);
      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);

    }
  }

  if( !pPage->leaf ){
    Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  }







>







3202
3203
3204
3205
3206
3207
3208
3209
3210
3211
3212
3213
3214
3215
3216
    u8 *pCell = findCell(pPage, i);

    ptrmapPutOvflPtr(pPage, pCell, &rc);

    if( !pPage->leaf ){
      Pgno childPgno = get4byte(pCell);
      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
      if( rc ) return rc;
    }
  }

  if( !pPage->leaf ){
    Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  }
6291
6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
** Some of the cells in apCell[] may currently be stored in pPg. This
** function works around problems caused by this by making a copy of any 
** such cells before overwriting the page data.
**
** The MemPage.nFree field is invalidated by this function. It is the 
** responsibility of the caller to set it correctly.
*/
static void rebuildPage(
  MemPage *pPg,                   /* Edit this page */
  int nCell,                      /* Final number of cells on page */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
  const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
  u8 * const aData = pPg->aData;           /* Pointer to data for pPg */







|







6292
6293
6294
6295
6296
6297
6298
6299
6300
6301
6302
6303
6304
6305
6306
** Some of the cells in apCell[] may currently be stored in pPg. This
** function works around problems caused by this by making a copy of any 
** such cells before overwriting the page data.
**
** The MemPage.nFree field is invalidated by this function. It is the 
** responsibility of the caller to set it correctly.
*/
static int rebuildPage(
  MemPage *pPg,                   /* Edit this page */
  int nCell,                      /* Final number of cells on page */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
  const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
  u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6316
6317
6318
6319
6320
6321
6322
6323
6324
6325


6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337

6338
6339
6340
6341
6342
6343
6344
  pData = pEnd;
  for(i=0; i<nCell; i++){
    u8 *pCell = apCell[i];
    if( pCell>aData && pCell<pEnd ){
      pCell = &pTmp[pCell - aData];
    }
    pData -= szCell[i];
    memcpy(pData, pCell, szCell[i]);
    put2byte(pCellptr, (pData - aData));
    pCellptr += 2;


    assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
    testcase( szCell[i]==pPg->xCellSize(pPg,pCell) );
  }

  /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
  pPg->nCell = nCell;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+1], 0);
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);
  aData[hdr+7] = 0x00;

}

/*
** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
** contains the size in bytes of each such cell. This function attempts to 
** add the cells stored in the array to page pPg. If it cannot (because 
** the page needs to be defragmented before the cells will fit), non-zero







<


>
>












>







6317
6318
6319
6320
6321
6322
6323

6324
6325
6326
6327
6328
6329
6330
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340
6341
6342
6343
6344
6345
6346
6347
  pData = pEnd;
  for(i=0; i<nCell; i++){
    u8 *pCell = apCell[i];
    if( pCell>aData && pCell<pEnd ){
      pCell = &pTmp[pCell - aData];
    }
    pData -= szCell[i];

    put2byte(pCellptr, (pData - aData));
    pCellptr += 2;
    if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
    memcpy(pData, pCell, szCell[i]);
    assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
    testcase( szCell[i]==pPg->xCellSize(pPg,pCell) );
  }

  /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
  pPg->nCell = nCell;
  pPg->nOverflow = 0;

  put2byte(&aData[hdr+1], 0);
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);
  aData[hdr+7] = 0x00;
  return SQLITE_OK;
}

/*
** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
** contains the size in bytes of each such cell. This function attempts to 
** add the cells stored in the array to page pPg. If it cannot (because 
** the page needs to be defragmented before the cells will fit), non-zero
6450
6451
6452
6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
**
** This routine makes the necessary adjustments to pPg so that it contains
** the correct cells after being balanced.
**
** The pPg->nFree field is invalid when this function returns. It is the
** responsibility of the caller to set it correctly.
*/
static void editPage(
  MemPage *pPg,                   /* Edit this page */
  int iOld,                       /* Index of first cell currently on page */
  int iNew,                       /* Index of new first cell on page */
  int nNew,                       /* Final number of cells on page */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){







|







6453
6454
6455
6456
6457
6458
6459
6460
6461
6462
6463
6464
6465
6466
6467
**
** This routine makes the necessary adjustments to pPg so that it contains
** the correct cells after being balanced.
**
** The pPg->nFree field is invalid when this function returns. It is the
** responsibility of the caller to set it correctly.
*/
static int editPage(
  MemPage *pPg,                   /* Edit this page */
  int iOld,                       /* Index of first cell currently on page */
  int iNew,                       /* Index of new first cell on page */
  int nNew,                       /* Final number of cells on page */
  u8 **apCell,                    /* Array of cells */
  u16 *szCell                     /* Array of cell sizes */
){
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
    if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
      pCell = &pTmp[pCell - aData];
    }
    assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) );
  }
#endif

  return;
 editpage_fail:
  /* Unable to edit this page. Rebuild it from scratch instead. */
  rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]);
}

/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and







|


|







6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
    if( pCell>=aData && pCell<&aData[pPg->pBt->usableSize] ){
      pCell = &pTmp[pCell - aData];
    }
    assert( 0==memcmp(pCell, &aData[iOff], szCell[i+iNew]) );
  }
#endif

  return SQLITE_OK;
 editpage_fail:
  /* Unable to edit this page. Rebuild it from scratch instead. */
  return rebuildPage(pPg, nNew, &apCell[iNew], &szCell[iNew]);
}

/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
6616
6617
6618
6619
6620
6621
6622
6623

6624
6625
6626
6627
6628
6629
6630
    u8 *pCell = pPage->apOvfl[0];
    u16 szCell = pPage->xCellSize(pPage, pCell);
    u8 *pStop;

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
    rebuildPage(pNew, 1, &pCell, &szCell);

    pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the 
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by thh code below.







|
>







6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
    u8 *pCell = pPage->apOvfl[0];
    u16 szCell = pPage->xCellSize(pPage, pCell);
    u8 *pStop;

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
    rc = rebuildPage(pNew, 1, &pCell, &szCell);
    if( rc ) return rc;
    pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the 
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by thh code below.
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
  int i, j, k;                 /* Loop counters */
  int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  int rc = SQLITE_OK;          /* The return code */
  u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */
  int subtotal;                /* Subtotal of bytes in cells on one page */
  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  u8 *pRight;                  /* Location in parent of right-sibling pointer */
  u8 *apDiv[NB-1];             /* Divider cells in pParent */







<







6835
6836
6837
6838
6839
6840
6841

6842
6843
6844
6845
6846
6847
6848
  int i, j, k;                 /* Loop counters */
  int nxDiv;                   /* Next divider slot in pParent->aCell[] */
  int rc = SQLITE_OK;          /* The return code */
  u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
  int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
  int usableSpace;             /* Bytes in pPage beyond the header */
  int pageFlags;               /* Value of pPage->aData[0] */

  int iSpace1 = 0;             /* First unused byte of aSpace1[] */
  int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
  int szScratch;               /* Size of scratch memory requested */
  MemPage *apOld[NB];          /* pPage and up to two siblings */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  u8 *pRight;                  /* Location in parent of right-sibling pointer */
  u8 *apDiv[NB-1];             /* Divider cells in pParent */
7077
7078
7079
7080
7081
7082
7083
7084
7085
7086
7087


7088

7089





7090
7091
7092
7093


7094




7095
7096
7097





7098












7099
7100
7101
7102
7103
7104
7105
  **    szNew[i]: Spaced used on the i-th sibling page.
  **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  **              the right of the i-th sibling page.
  ** usableSpace: Number of bytes of space available on each sibling.
  ** 
  */
  usableSpace = pBt->usableSize - 12 + leafCorrection;
  for(subtotal=k=i=0; i<nCell; i++){
    assert( i<nMaxCells );
    subtotal += szCell[i] + 2;
    if( subtotal > usableSpace ){


      szNew[k] = subtotal - szCell[i] - 2;

      cntNew[k] = i;





      if( leafData ){ i--; }
      subtotal = 0;
      k++;
      if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }


    }




  }
  szNew[k] = subtotal;
  cntNew[k] = nCell;





  k++;













  /*
  ** The packing computed by the previous block is biased toward the siblings
  ** on the left side (siblings with smaller keys). The left siblings are
  ** always nearly full, while the right-most sibling might be nearly empty.
  ** The next block of code attempts to adjust the packing of siblings to
  ** get a better balance.







|
<
|
|
>
>
|
>
|
>
>
>
>
>
|
|
<
|
>
>
|
>
>
>
>
|
|
|
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>







7080
7081
7082
7083
7084
7085
7086
7087

7088
7089
7090
7091
7092
7093
7094
7095
7096
7097
7098
7099
7100
7101

7102
7103
7104
7105
7106
7107
7108
7109
7110
7111
7112
7113
7114
7115
7116
7117
7118
7119
7120
7121
7122
7123
7124
7125
7126
7127
7128
7129
7130
7131
7132
7133
7134
7135
7136
7137
  **    szNew[i]: Spaced used on the i-th sibling page.
  **   cntNew[i]: Index in apCell[] and szCell[] for the first cell to
  **              the right of the i-th sibling page.
  ** usableSpace: Number of bytes of space available on each sibling.
  ** 
  */
  usableSpace = pBt->usableSize - 12 + leafCorrection;
  for(i=0; i<nOld; i++){

    MemPage *p = apOld[i];
    szNew[i] = usableSpace - p->nFree;
    if( szNew[i]<0 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
    for(j=0; j<p->nOverflow; j++){
      szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
    }
    cntNew[i] = cntOld[i];
  }
  k = nOld;
  for(i=0; i<k; i++){
    int sz;
    while( szNew[i]>usableSpace ){
      if( i+1>=k ){
        k = i+2;

        if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
        szNew[k-1] = 0;
        cntNew[k-1] = nCell;
      }
      sz = 2+szCell[cntNew[i]-1];
      szNew[i] -= sz;
      if( !leafData ){
        sz = cntNew[i]<nCell ? 2+szCell[cntNew[i]] : 0;
      }
      szNew[i+1] += sz;
      cntNew[i]--;
    }
    while( cntNew[i]<nCell ){
      sz = 2+szCell[cntNew[i]];
      if( szNew[i]+sz>usableSpace ) break;
      szNew[i] += sz;
      cntNew[i]++;
      if( !leafData ){
        sz = cntNew[i]<nCell ? 2+szCell[cntNew[i]] : 0;
      }
      szNew[i+1] -= sz;
    }
    if( cntNew[i]>=nCell ){
      k = i+1;
    }else if( cntNew[i] - (i>0 ? cntNew[i-1] : 0) <= 0 ){
      rc = SQLITE_CORRUPT_BKPT;
      goto balance_cleanup;
    }
  }

  /*
  ** The packing computed by the previous block is biased toward the siblings
  ** on the left side (siblings with smaller keys). The left siblings are
  ** always nearly full, while the right-most sibling might be nearly empty.
  ** The next block of code attempts to adjust the packing of siblings to
  ** get a better balance.
7120
7121
7122
7123
7124
7125
7126




7127
7128
7129
7130
7131
7132
7133
    assert( r<nMaxCells );
    while( szRight==0 
       || (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2)) 
    ){
      szRight += szCell[d] + 2;
      szLeft -= szCell[r] + 2;
      cntNew[i-1]--;




      r = cntNew[i-1] - 1;
      d = r + 1 - leafData;
    }
    szNew[i] = szRight;
    szNew[i-1] = szLeft;
  }








>
>
>
>







7152
7153
7154
7155
7156
7157
7158
7159
7160
7161
7162
7163
7164
7165
7166
7167
7168
7169
    assert( r<nMaxCells );
    while( szRight==0 
       || (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2)) 
    ){
      szRight += szCell[d] + 2;
      szLeft -= szCell[r] + 2;
      cntNew[i-1]--;
      if( cntNew[i-1] <= 0 ){
        rc = SQLITE_CORRUPT_BKPT;
        goto balance_cleanup;
      }
      r = cntNew[i-1] - 1;
      d = r + 1 - leafData;
    }
    szNew[i] = szRight;
    szNew[i-1] = szLeft;
  }

7290
7291
7292
7293
7294
7295
7296

7297
7298
7299

7300
7301
7302
7303
7304
7305
7306
      if( iOld>=nNew
       || pNew->pgno!=aPgno[iOld]
       || pCell<aOld
       || pCell>=&aOld[usableSize]
      ){
        if( !leafCorrection ){
          ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);

        }
        if( szCell[i]>pNew->minLocal ){
          ptrmapPutOvflPtr(pNew, pCell, &rc);

        }
      }
    }
  }

  /* Insert new divider cells into pParent. */
  for(i=0; i<nNew-1; i++){







>



>







7326
7327
7328
7329
7330
7331
7332
7333
7334
7335
7336
7337
7338
7339
7340
7341
7342
7343
7344
      if( iOld>=nNew
       || pNew->pgno!=aPgno[iOld]
       || pCell<aOld
       || pCell>=&aOld[usableSize]
      ){
        if( !leafCorrection ){
          ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
          if( rc ) goto balance_cleanup;
        }
        if( szCell[i]>pNew->minLocal ){
          ptrmapPutOvflPtr(pNew, pCell, &rc);
          if( rc ) goto balance_cleanup;
        }
      }
    }
  }

  /* Insert new divider cells into pParent. */
  for(i=0; i<nNew-1; i++){
7400
7401
7402
7403
7404
7405
7406
7407

7408
7409
7410
7411
7412
7413
7414
        nNewCell = cntNew[0];
      }else{
        iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell;
        iNew = cntNew[iPg-1] + !leafData;
        nNewCell = cntNew[iPg] - iNew;
      }

      editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell);

      abDone[iPg]++;
      apNew[iPg]->nFree = usableSpace-szNew[iPg];
      assert( apNew[iPg]->nOverflow==0 );
      assert( apNew[iPg]->nCell==nNewCell );
    }
  }








|
>







7438
7439
7440
7441
7442
7443
7444
7445
7446
7447
7448
7449
7450
7451
7452
7453
        nNewCell = cntNew[0];
      }else{
        iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : nCell;
        iNew = cntNew[iPg-1] + !leafData;
        nNewCell = cntNew[iPg] - iNew;
      }

      rc = editPage(apNew[iPg], iOld, iNew, nNewCell, apCell, szCell);
      if( rc ) goto balance_cleanup;
      abDone[iPg]++;
      apNew[iPg]->nFree = usableSpace-szNew[iPg];
      assert( apNew[iPg]->nOverflow==0 );
      assert( apNew[iPg]->nCell==nNewCell );
    }
  }