SQLite

Check-in [8fdbe4abab]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Use 16-bit integers for indexing within a page in btree. Tighter bounds on the maximum number of cells within one page. (CVS 4796)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 8fdbe4abab4e9c292111579b03471f68c1e855fb
User & Date: drh 2008-02-19 14:59:35.000
Context
2008-02-19
15:15
Change non-exported memory interfaces to following the naming conventions. (CVS 4797) (check-in: 94774b4142 user: drh tags: trunk)
14:59
Use 16-bit integers for indexing within a page in btree. Tighter bounds on the maximum number of cells within one page. (CVS 4796) (check-in: 8fdbe4abab user: drh tags: trunk)
2008-02-18
22:24
Add the memory fault simulator to mem5.c. Enable soft heap limit on mem5.c. Limit the size of hash tables and the vdbefifo when using mem5.c. (CVS 4795) (check-in: 63da5d9754 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btree.c.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.438 2008/01/31 14:54:44 drh Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
** Including a description of file format and an overview of operation.
*/
#include "btreeInt.h"












|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.439 2008/02/19 14:59:35 drh Exp $
**
** This file implements a external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information.
** Including a description of file format and an overview of operation.
*/
#include "btreeInt.h"

618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
/*
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
*/
#ifndef NDEBUG
static int cellSize(MemPage *pPage, int iCell){
  CellInfo info;
  sqlite3BtreeParseCell(pPage, iCell, &info);
  return info.nSize;
}
#endif
static int cellSizePtr(MemPage *pPage, u8 *pCell){
  CellInfo info;
  sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  return info.nSize;
}

#ifndef SQLITE_OMIT_AUTOVACUUM
/*







|





|







618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
/*
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
*/
#ifndef NDEBUG
static u16 cellSize(MemPage *pPage, int iCell){
  CellInfo info;
  sqlite3BtreeParseCell(pPage, iCell, &info);
  return info.nSize;
}
#endif
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  CellInfo info;
  sqlite3BtreeParseCellPtr(pPage, pCell, &info);
  return info.nSize;
}

#ifndef SQLITE_OMIT_AUTOVACUUM
/*
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
** Add a list of cells to a page.  The page should be initially empty.
** The cells are guaranteed to fit on the page.
*/
static void assemblePage(
  MemPage *pPage,   /* The page to be assemblied */
  int nCell,        /* The number of cells to add to this page */
  u8 **apCell,      /* Pointers to cell bodies */
  int *aSize        /* Sizes of the cells */
){
  int i;            /* Loop counter */
  int totalSize;    /* Total size of all cells */
  int hdr;          /* Index of page header */
  int cellptr;      /* Address of next cell pointer */
  int cellbody;     /* Address of next cell body */
  u8 *data;         /* Data for the page */







|







4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
** Add a list of cells to a page.  The page should be initially empty.
** The cells are guaranteed to fit on the page.
*/
static void assemblePage(
  MemPage *pPage,   /* The page to be assemblied */
  int nCell,        /* The number of cells to add to this page */
  u8 **apCell,      /* Pointers to cell bodies */
  u16 *aSize        /* Sizes of the cells */
){
  int i;            /* Loop counter */
  int totalSize;    /* Total size of all cells */
  int hdr;          /* Index of page header */
  int cellptr;      /* Address of next cell pointer */
  int cellbody;     /* Address of next cell body */
  u8 *data;         /* Data for the page */
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
** which is also the right-most entry on the page.
*/
static int balance_quick(MemPage *pPage, MemPage *pParent){
  int rc;
  MemPage *pNew;
  Pgno pgnoNew;
  u8 *pCell;
  int szCell;
  CellInfo info;
  BtShared *pBt = pPage->pBt;
  int parentIdx = pParent->nCell;   /* pParent new divider cell index */
  int parentSize;                   /* Size of new divider cell */
  u8 parentCell[64];                /* Space for the new divider cell */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );







|







4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
** which is also the right-most entry on the page.
*/
static int balance_quick(MemPage *pPage, MemPage *pParent){
  int rc;
  MemPage *pNew;
  Pgno pgnoNew;
  u8 *pCell;
  u16 szCell;
  CellInfo info;
  BtShared *pBt = pPage->pBt;
  int parentIdx = pParent->nCell;   /* pParent new divider cell index */
  int parentSize;                   /* Size of new divider cell */
  u8 parentCell[64];                /* Space for the new divider cell */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
  MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
  u8 *apDiv[NB];               /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  int *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aCopy[NB];               /* Space for holding data of apCopy[] */
  u8 *aSpace;                  /* Space to hold copies of dividers cells */
#ifndef SQLITE_OMIT_AUTOVACUUM
  u8 *aFrom = 0;
#endif

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );







|







4793
4794
4795
4796
4797
4798
4799
4800
4801
4802
4803
4804
4805
4806
4807
  MemPage *apCopy[NB];         /* Private copies of apOld[] pages */
  MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
  Pgno pgnoNew[NB+2];          /* Page numbers for each page in apNew[] */
  u8 *apDiv[NB];               /* Divider cells in pParent */
  int cntNew[NB+2];            /* Index in aCell[] of cell after i-th page */
  int szNew[NB+2];             /* Combined size of cells place on i-th page */
  u8 **apCell = 0;             /* All cells begin balanced */
  u16 *szCell;                 /* Local size of all cells in apCell[] */
  u8 *aCopy[NB];               /* Space for holding data of apCopy[] */
  u8 *aSpace;                  /* Space to hold copies of dividers cells */
#ifndef SQLITE_OMIT_AUTOVACUUM
  u8 *aFrom = 0;
#endif

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
    apOld[i]->idxParent = k;
    apCopy[i] = 0;
    assert( i==nOld );
    nOld++;
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  }

  /* Make nMaxCells a multiple of 2 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 1)&~1;

  /*
  ** Allocate space for memory structures
  */
  apCell = sqlite3_malloc( 
       nMaxCells*sizeof(u8*)                           /* apCell */
     + nMaxCells*sizeof(int)                           /* szCell */
     + ROUND8(sizeof(MemPage))*NB                      /* aCopy */
     + pBt->pageSize*(5+NB)                            /* aSpace */
     + (ISAUTOVACUUM ? nMaxCells : 0)                  /* aFrom */
  );
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (int*)&apCell[nMaxCells];
  aCopy[0] = (u8*)&szCell[nMaxCells];
  assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  for(i=1; i<NB; i++){
    aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
    assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  }
  aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];







|

|





|
|
|
|
|





|







4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
    apOld[i]->idxParent = k;
    apCopy[i] = 0;
    assert( i==nOld );
    nOld++;
    nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
  }

  /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
  ** alignment */
  nMaxCells = (nMaxCells + 3)&~3;

  /*
  ** Allocate space for memory structures
  */
  apCell = sqlite3_malloc( 
       nMaxCells*sizeof(u8*)                       /* apCell */
     + nMaxCells*sizeof(u16)                       /* szCell */
     + (ROUND8(sizeof(MemPage))+pBt->pageSize)*NB  /* aCopy */
     + pBt->pageSize*5                             /* aSpace */
     + (ISAUTOVACUUM ? nMaxCells : 0)              /* aFrom */
  );
  if( apCell==0 ){
    rc = SQLITE_NOMEM;
    goto balance_cleanup;
  }
  szCell = (u16*)&apCell[nMaxCells];
  aCopy[0] = (u8*)&szCell[nMaxCells];
  assert( ((aCopy[0] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  for(i=1; i<NB; i++){
    aCopy[i] = &aCopy[i-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
    assert( ((aCopy[i] - (u8*)apCell) & 7)==0 ); /* 8-byte alignment required */
  }
  aSpace = &aCopy[NB-1][pBt->pageSize+ROUND8(sizeof(MemPage))];
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
          }
        }
      }
#endif
      nCell++;
    }
    if( i<nOld-1 ){
      int sz = cellSizePtr(pParent, apDiv[i]);
      if( leafData ){
        /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
        ** are duplicates of keys on the child pages.  We need to remove
        ** the divider cells from pParent, but the dividers cells are not
        ** added to apCell[] because they are duplicates of child cells.
        */
        dropCell(pParent, nxDiv, sz);







|







4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
          }
        }
      }
#endif
      nCell++;
    }
    if( i<nOld-1 ){
      u16 sz = cellSizePtr(pParent, apDiv[i]);
      if( leafData ){
        /* With the LEAFDATA flag, pParent cells hold only INTKEYs that
        ** are duplicates of keys on the child pages.  We need to remove
        ** the divider cells from pParent, but the dividers cells are not
        ** added to apCell[] because they are duplicates of child cells.
        */
        dropCell(pParent, nxDiv, sz);
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
static int balance_shallower(MemPage *pPage){
  MemPage *pChild;             /* The only child page of pPage */
  Pgno pgnoChild;              /* Page number for pChild */
  int rc = SQLITE_OK;          /* Return code from subprocedures */
  BtShared *pBt;                  /* The main BTree structure */
  int mxCellPerPage;           /* Maximum number of cells per page */
  u8 **apCell;                 /* All cells from pages being balanced */
  int *szCell;                 /* Local size of all cells */

  assert( pPage->pParent==0 );
  assert( pPage->nCell==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  pBt = pPage->pBt;
  mxCellPerPage = MX_CELL(pBt);
  apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(int)) );
  if( apCell==0 ) return SQLITE_NOMEM;
  szCell = (int*)&apCell[mxCellPerPage];
  if( pPage->leaf ){
    /* The table is completely empty */
    TRACE(("BALANCE: empty table %d\n", pPage->pgno));
  }else{
    /* The root page is empty but has one child.  Transfer the
    ** information from that one child into the root page if it 
    ** will fit.  This reduces the depth of the tree by one.







|






|

|







5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
static int balance_shallower(MemPage *pPage){
  MemPage *pChild;             /* The only child page of pPage */
  Pgno pgnoChild;              /* Page number for pChild */
  int rc = SQLITE_OK;          /* Return code from subprocedures */
  BtShared *pBt;                  /* The main BTree structure */
  int mxCellPerPage;           /* Maximum number of cells per page */
  u8 **apCell;                 /* All cells from pages being balanced */
  u16 *szCell;                 /* Local size of all cells */

  assert( pPage->pParent==0 );
  assert( pPage->nCell==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  pBt = pPage->pBt;
  mxCellPerPage = MX_CELL(pBt);
  apCell = sqlite3_malloc( mxCellPerPage*(sizeof(u8*)+sizeof(u16)) );
  if( apCell==0 ) return SQLITE_NOMEM;
  szCell = (u16*)&apCell[mxCellPerPage];
  if( pPage->leaf ){
    /* The table is completely empty */
    TRACE(("BALANCE: empty table %d\n", pPage->pgno));
  }else{
    /* The root page is empty but has one child.  Transfer the
    ** information from that one child into the root page if it 
    ** will fit.  This reduces the depth of the tree by one.
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
  newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
  if( newCell==0 ) return SQLITE_NOMEM;
  rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
  if( rc ) goto end_insert;
  assert( szNew==cellSizePtr(pPage, newCell) );
  assert( szNew<=MX_CELL_SIZE(pBt) );
  if( loc==0 && CURSOR_VALID==pCur->eState ){
    int szOld;
    assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ){
      goto end_insert;
    }
    oldCell = findCell(pPage, pCur->idx);
    if( !pPage->leaf ){







|







5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
  newCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
  if( newCell==0 ) return SQLITE_NOMEM;
  rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew);
  if( rc ) goto end_insert;
  assert( szNew==cellSizePtr(pPage, newCell) );
  assert( szNew<=MX_CELL_SIZE(pBt) );
  if( loc==0 && CURSOR_VALID==pCur->eState ){
    u16 szOld;
    assert( pCur->idx>=0 && pCur->idx<pPage->nCell );
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ){
      goto end_insert;
    }
    oldCell = findCell(pPage, pCur->idx);
    if( !pPage->leaf ){
5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
    assert( !pPage->leafData );
    sqlite3BtreeGetTempCursor(pCur, &leafCur);
    rc = sqlite3BtreeNext(&leafCur, &notUsed);
    if( rc==SQLITE_OK ){
      rc = sqlite3PagerWrite(leafCur.pPage->pDbPage);
    }
    if( rc==SQLITE_OK ){
      int szNext;
      TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
         pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
      dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
      pNext = findCell(leafCur.pPage, leafCur.idx);
      szNext = cellSizePtr(leafCur.pPage, pNext);
      assert( MX_CELL_SIZE(pBt)>=szNext+4 );
      tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );







|







5738
5739
5740
5741
5742
5743
5744
5745
5746
5747
5748
5749
5750
5751
5752
    assert( !pPage->leafData );
    sqlite3BtreeGetTempCursor(pCur, &leafCur);
    rc = sqlite3BtreeNext(&leafCur, &notUsed);
    if( rc==SQLITE_OK ){
      rc = sqlite3PagerWrite(leafCur.pPage->pDbPage);
    }
    if( rc==SQLITE_OK ){
      u16 szNext;
      TRACE(("DELETE: table=%d delete internal from %d replace from leaf %d\n",
         pCur->pgnoRoot, pPage->pgno, leafCur.pPage->pgno));
      dropCell(pPage, pCur->idx, cellSizePtr(pPage, pCell));
      pNext = findCell(leafCur.pPage, leafCur.idx);
      szNext = cellSizePtr(leafCur.pPage, pNext);
      assert( MX_CELL_SIZE(pBt)>=szNext+4 );
      tempCell = sqlite3_malloc( MX_CELL_SIZE(pBt) );
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
  hit = sqlite3MallocZero( usableSize );
  if( hit ){
    memset(hit, 1, get2byte(&data[hdr+5]));
    nCell = get2byte(&data[hdr+3]);
    cellStart = hdr + 12 - 4*pPage->leaf;
    for(i=0; i<nCell; i++){
      int pc = get2byte(&data[cellStart+i*2]);
      int size = cellSizePtr(pPage, &data[pc]);
      int j;
      if( (pc+size-1)>=usableSize || pc<0 ){
        checkAppendMsg(pCheck, 0, 
            "Corruption detected in cell %d on page %d",i,iPage,0);
      }else{
        for(j=pc+size-1; j>=pc; j--) hit[j]++;
      }







|







6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
  hit = sqlite3MallocZero( usableSize );
  if( hit ){
    memset(hit, 1, get2byte(&data[hdr+5]));
    nCell = get2byte(&data[hdr+3]);
    cellStart = hdr + 12 - 4*pPage->leaf;
    for(i=0; i<nCell; i++){
      int pc = get2byte(&data[cellStart+i*2]);
      u16 size = cellSizePtr(pPage, &data[pc]);
      int j;
      if( (pc+size-1)>=usableSize || pc<0 ){
        checkAppendMsg(pCheck, 0, 
            "Corruption detected in cell %d on page %d",i,iPage,0);
      }else{
        for(j=pc+size-1; j>=pc; j--) hit[j]++;
      }
Changes to src/btreeInt.h.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btreeInt.h,v 1.15 2007/12/21 04:47:26 danielk1977 Exp $
**
** This file implements a external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
**     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
**     "Sorting And Searching", pages 473-480. Addison-Wesley
**     Publishing Company, Reading, Massachusetts.











|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btreeInt.h,v 1.16 2008/02/19 14:59:35 drh Exp $
**
** This file implements a external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
**     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
**     "Sorting And Searching", pages 473-480. Addison-Wesley
**     Publishing Company, Reading, Massachusetts.
217
218
219
220
221
222
223
224

225
226
227
228
229
230
231
232
233
234

/* The following value is the maximum cell size assuming a maximum page
** size give above.
*/
#define MX_CELL_SIZE(pBt)  (pBt->pageSize-8)

/* The maximum number of cells on a single page of the database.  This
** assumes a minimum cell size of 3 bytes.  Such small cells will be

** exceedingly rare, but they are possible.
*/
#define MX_CELL(pBt) ((pBt->pageSize-8)/3)

/* Forward declarations */
typedef struct MemPage MemPage;
typedef struct BtLock BtLock;

/*
** This is a magic string that appears at the beginning of every







|
>
|

|







217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235

/* The following value is the maximum cell size assuming a maximum page
** size give above.
*/
#define MX_CELL_SIZE(pBt)  (pBt->pageSize-8)

/* The maximum number of cells on a single page of the database.  This
** assumes a minimum cell size of 6 bytes  (4 bytes for the cell itself
** plus 2 bytes for the index to the cell in the page header).  Such
** small cells will be rare, but they are possible.
*/
#define MX_CELL(pBt) ((pBt->pageSize-8)/6)

/* Forward declarations */
typedef struct MemPage MemPage;
typedef struct BtLock BtLock;

/*
** This is a magic string that appears at the beginning of every