/ Check-in [c8d3bdd9]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment::-) (CVS 221)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:c8d3bdd91e333f3fc519558e40c07e7e7c2ebeec
User & Date: drh 2001-05-28 00:41:15
Context
2001-05-28
00:41
:-) (CVS 1720) check-in: d78febd1 user: drh tags: trunk
00:41
:-) (CVS 221) check-in: c8d3bdd9 user: drh tags: trunk
2001-05-26
13:15
:-) (CVS 220) check-in: 45a0e0fc user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to notes/notes2.txt.

     1         -How to do a B*Tree insert:
            1  +How to do a B-Tree insert:
     2      2   
     3         -add_to_page(pageptr, data, pgno){
     4         -  pgno.parent = pageptr
     5         -  if( data+pgno fits on pageptr ){
     6         -    add data+pgno to pageptr
     7         -    return
            3  +  insert(data){
            4  +    create a new cursor
            5  +    move cursor to the entry nearest data
            6  +    if( cursor.key == keyof(data) ){
            7  +      replace cursor.data with dataof(data)
            8  +      return
            9  +    }
           10  +    childpg = NULL
           11  +    add_to_page(cursor, data+childpg)
           12  +    delete the cursor
     8     13     }
     9         -  if( pageptr==root ){
    10         -    split pageptr+(data+pgno) into newpage1, center, newpage2
    11         -    pageptr = ptr(newpage1) + center + ptr(newpage2);
    12         -    return
           14  +
           15  +  add_to_page(cursor, data+childpg ){
           16  +    childpg->parent = cursor.page
           17  +    if( data+childpg fits on cursor.page ){
           18  +      insert data+childpg at cursor
           19  +      return
           20  +    }
           21  +    if( page==root ){
           22  +      split page+(data+childpg) into newpage1, center, newpage2
           23  +      cursor.page = &newpage1 + center + &newpage2;
           24  +      newpage1->parent = cursor.page
           25  +      newpage2->parent = cursor.page
           26  +      return
           27  +    }
           28  +    if( move_some_data_left || move_some_data_right ){
           29  +      insert data+childpg at cursor
           30  +      return
           31  +    }
           32  +    split page+(data+childpg) into page, center, newpage
           33  +    newpage->parent = page->parent
           34  +    move cursor to insertion point of center in parent page.
           35  +    add_to_page(cursor, center, (newpage));
    13     36     }
    14         -  if( move_some_data_left || move_some_data_right ){
    15         -    add data+pgno to pageptr
    16         -    return
           37  +
           38  +How to do a B-Tree delete:
           39  +
           40  +  delete(entry){
           41  +    if( entry is not a leaf ){
           42  +      p = predecessor of entry
           43  +      // note: if entry is not a leaf then p must
           44  +      // exist and must be a leaf
           45  +      free(entry.overflowptr)
           46  +      resize entry so that is is big enough to hold p.payload
           47  +      entry.payload = p.payload
           48  +      entry.overflowptr = p.overflowptr
           49  +      p.overflowptr = NULL
           50  +      delete(p)
           51  +      return
           52  +    }
           53  +    unlink entry from its page
           54  +    refill(page containing entry)
    17     55     }
    18         -  split pageptr+(data+pgno) into pageptr, center, newpage
    19         -  add_to_page(parent(pageptr), center, ptr(newpage));
    20         -  newpage.parent = parent(pageptr)
    21         -}
    22     56   
    23         -Cursor: pageptr, idx
    24         -
    25         -unlink_entry(cursor, olddata){
    26         -  if( cursor.pageptr is not a leaf page ){
    27         -    if( olddata!=nil) copy payload(cursor) into olddata
    28         -    n = next_entry(cursor)
    29         -    if( payloadsize(n) <= freesize(cursor) + payloadsize(cursor) ){
    30         -      copy payload(n) into payload(cursor)
    31         -      unlink_entry(n, nil)
           57  +  refill(page){
           58  +    if( page is more than half full ) return
           59  +    if( page is the root and contains no entries ){
           60  +      copy the one child page into this page thus reducing
           61  +      the height of the tree by one.
    32     62         return
    33     63       }
    34         -    p = prev_entry(cursor)
    35         -    if( payloadsize(p) <= freesize(cursor) + payloadsize(cursor) ){
    36         -      copy payload(p) into payload(cursor)
    37         -      unlink_entry(p, nil)
           64  +    if( able to merge page with neighbors ){
           65  +      do the merge
           66  +      refill(parent page)
    38     67         return
    39     68       }
    40         -    unlink(n, leafdata)
    41         -    pageptr = cursor.pageptr
    42         -    nextpgno = pageptr.aCell[cursor.idx].pgno;
    43         -    convert_cursor_to_free_block(cursor)
    44         -    add_to_page(pageptr, leafdata, nextpgno)
    45         -    return
           69  +    borrow entrys from neighbors
    46     70     }
    47         -  pageptr = cursor.pageptr;
    48         -  convert_cursor_to_free_block(cursor)
    49         -  if( usage(pageptr)<0.65 ){
    50         -    consolidate(pageptr)
    51         -  }
    52         -}
    53         -
    54         -consolidate(pageptr){
    55         -  parentpage = parentof(pageptr)
    56         -  idx = index_of_page(parentpage, pageptr);
    57         -  leftsibling = parentpage.cell[idx].pgno;
    58         -  rightsibling = parentpage.cell[idx+1].pgno;
    59         -  if( idx>0 ){
    60         -    cursor = makecursor(pageptr,idx-1)
    61         -    if( try_to_move_down(cursor) ) return
    62         -  }
    63         -  if( idx<max ){
    64         -    cursor = makecursor(pageptr,idx)
    65         -    try_to_move_down(cursor)
    66         -  }
    67         -  return
    68         -}
    69         -
    70         -try_to_move_down(cursor){
    71         -  pageptr = cursor.pageptr
    72         -  if( payload(cursor)+sizeof(left)+sizeof(right)<=pagesize ){
    73         -    put cursor and content of left into right
    74         -    remove cursor from pageptr
    75         -    if( pageptr is root ){
    76         -      if( cellcount(pageptr)==0 ){
    77         -        copy child into pageptr
    78         -        update parent field of child
    79         -      }
    80         -    }else if( usage(pageptr)<0.65 ){
    81         -      try_to_move_down(cursor)
    82         -    }
    83         -  }
    84         -}
    85         -
    86         -cursor_move_next(cursor){
    87         -  if( cursor.incr_noop ){
    88         -    cursor.incr_noop = FALSE;
    89         -    return;
    90         -  }
    91         -  if( is_leaf(cursor.pageptr) ){
    92         -    if( cursor.idx==cursor.pageptr.ncell ){
    93         -      if( cursor.pageptr==root ){
    94         -        nil cursor
    95         -        return
    96         -      }
    97         -      cursor_move_up(cursor)
    98         -      cursor_move_next(cursor)
    99         -    }else{
   100         -      cursor.idx++;
   101         -    }
   102         -    return
   103         -  }
   104         -  pgno = next_pgno(cursor)
   105         -  loop {
   106         -    cursor.pageptr = get(pgno);
   107         -    if( is_leaf(cursor.pageptr) ) break;
   108         -    pgno = first_pgno(pageptr);
   109         -  }
   110         -  cursor.idx = 0;
   111         -}

Changes to src/btree.c.

    17     17   ** Boston, MA  02111-1307, USA.
    18     18   **
    19     19   ** Author contact information:
    20     20   **   drh@hwaci.com
    21     21   **   http://www.hwaci.com/drh/
    22     22   **
    23     23   *************************************************************************
    24         -** $Id: btree.c,v 1.8 2001/05/26 13:15:44 drh Exp $
           24  +** $Id: btree.c,v 1.9 2001/05/28 00:41:15 drh Exp $
    25     25   */
    26     26   #include "sqliteInt.h"
    27     27   #include "pager.h"
    28     28   #include "btree.h"
    29     29   #include <assert.h>
    30     30   
    31     31   
    32     32   /*
    33     33   ** Primitive data types.  u32 must be 4 bytes and u16 must be 2 bytes.
    34     34   ** Change these typedefs when porting to new architectures.
    35     35   */
    36     36   typedef unsigned int u32;
    37     37   typedef unsigned short int u16;
           38  +typedef unsigned char u8;
    38     39   
    39     40   /*
    40     41   ** Forward declarations of structures used only in this file.
    41     42   */
    42     43   typedef struct Page1Header Page1Header;
    43     44   typedef struct MemPage MemPage;
    44     45   typedef struct PageHdr PageHdr;
................................................................................
    53     54   **
    54     55   ** This might need to change for computer architectures that require
    55     56   ** and 8-byte alignment boundry for structures.
    56     57   */
    57     58   #define ROUNDUP(X)  ((X+3) & ~3)
    58     59   
    59     60   /*
    60         -** The first pages of the database file contains some additional
           61  +** The first page of the database file contains some additional
    61     62   ** information used for housekeeping and sanity checking.  Otherwise,
    62     63   ** the first page is just like any other.  The additional information
    63     64   ** found on the first page is described by the following structure.
    64     65   */
    65     66   struct Page1Header {
    66         -  u32 magic1;       /* A magic number for sanity checking */
    67         -  u32 magic2;       /* A second magic number for sanity checking */
           67  +  u32 magic1;       /* A magic number to verify the file really is a database */
           68  +  u32 magic2;       /* A second magic number to be extra sure */
    68     69     Pgno firstList;   /* First free page in a list of all free pages */
    69     70   };
    70     71   #define MAGIC_1  0x7264dc61
    71     72   #define MAGIC_2  0x54e55d9e
    72     73   
    73     74   /*
    74     75   ** Each database page has a header as follows:
    75     76   **
    76     77   **      page1_header          Optional instance of Page1Header structure
    77     78   **      rightmost_pgno        Page number of the right-most child page
    78         -**      first_cell            Index into MemPage.aPage of first cell
           79  +**      first_cell            Index into MemPage.aDisk of first cell
    79     80   **      first_free            Index of first free block
    80     81   **
    81         -** MemPage.pStart always points to the rightmost_pgno.  First_free is
           82  +** MemPage.pHdr always points to the rightmost_pgno.  First_free is
    82     83   ** 0 if there is no free space on this page.  Otherwise, first_free is
    83         -** the index in MemPage.aPage[] of a FreeBlk structure that describes
           84  +** the index in MemPage.aDisk[] of a FreeBlk structure that describes
    84     85   ** the first block of free space.  All free space is defined by a linked
    85     86   ** list of FreeBlk structures.
    86     87   **
    87     88   ** Data is stored in a linked list of Cell structures.  First_cell is
    88         -** the index into MemPage.aPage[] of the first cell on the page.  The
           89  +** the index into MemPage.aDisk[] of the first cell on the page.  The
    89     90   ** Cells are kept in sorted order.
    90     91   */
    91     92   struct PageHdr {
    92         -  Pgno pgno;      /* Child page that comes after all cells on this page */
    93         -  u16 firstCell;  /* Index in MemPage.aPage[] of the first cell */
    94         -  u16 firstFree;  /* Index in MemPage.aPage[] of the first free block */
           93  +  Pgno rightChild;  /* Child page that comes after all cells on this page */
           94  +  u16 firstCell;    /* Index in MemPage.aDisk[] of the first cell */
           95  +  u16 firstFree;    /* Index in MemPage.aDisk[] of the first free block */
    95     96   };
    96     97   
    97     98   
    98     99   /*
    99    100   ** Entries on a page of the database are called "Cells".  Each Cell
   100    101   ** has a header and data.  This structure defines the header.  The
   101    102   ** definition of the complete Cell including the data is given below.
   102    103   */
   103    104   struct CellHdr {
   104         -  Pgno pgno;      /* Child page that comes before this cell */
          105  +  Pgno leftChild; /* Child page that comes before this cell */
   105    106     u16 nKey;       /* Number of bytes in the key */
   106         -  u16 iNext;      /* Index in MemPage.aPage[] of next cell in sorted order */
          107  +  u16 iNext;      /* Index in MemPage.aDisk[] of next cell in sorted order */
   107    108     u32 nData;      /* Number of bytes of data */
   108    109   }
   109    110   
   110    111   /*
   111    112   ** The minimum size of a complete Cell.  The Cell must contain a header
   112    113   ** and at least 4 bytes of data.
   113    114   */
................................................................................
   125    126   ** extra goes onto overflow pages.
   126    127   */
   127    128   #define MX_LOCAL_PAYLOAD \
   128    129     ((SQLITE_PAGE_SIZE-sizeof(PageHdr))/4-(sizeof(CellHdr)+sizeof(Pgno)))
   129    130   
   130    131   /*
   131    132   ** Data on a database page is stored as a linked list of Cell structures.
   132         -** Both the key and the data are stored in aData[].  The key always comes
   133         -** first.  The aData[] field grows as necessary to hold the key and data,
          133  +** Both the key and the data are stored in aPayload[].  The key always comes
          134  +** first.  The aPayload[] field grows as necessary to hold the key and data,
   134    135   ** up to a maximum of MX_LOCAL_PAYLOAD bytes.  If the size of the key and
   135    136   ** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the
   136    137   ** page number of the first overflow page.
   137    138   **
   138    139   ** Though this structure is fixed in size, the Cell on the database
   139    140   ** page varies in size.  Very cell has a CellHdr and at least 4 bytes
   140    141   ** of payload space.  Additional payload bytes (up to the maximum of
   141    142   ** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as
   142    143   ** needed.
   143    144   */
   144    145   struct Cell {
   145         -  CellHdr h;                     /* The cell header */
   146         -  char aData[MX_LOCAL_PAYLOAD];  /* Key and data */
   147         -  Pgno ovfl;                     /* The first overflow page */
          146  +  CellHdr h;                        /* The cell header */
          147  +  char aPayload[MX_LOCAL_PAYLOAD];  /* Key and data */
          148  +  Pgno ovfl;                        /* The first overflow page */
   148    149   };
   149    150   
   150    151   /*
   151    152   ** Free space on a page is remembered using a linked list of the FreeBlk
   152    153   ** structures.  Space on a database page is allocated in increments of
   153    154   ** at least 4 bytes and is always aligned to a 4-byte boundry.  The
   154    155   ** linked list of freeblocks is always kept in order by address.
   155    156   */
   156    157   struct FreeBlk {
   157    158     u16 iSize;      /* Number of bytes in this block of free space */
   158         -  u16 iNext;      /* Index in MemPage.aPage[] of the next free block */
          159  +  u16 iNext;      /* Index in MemPage.aDisk[] of the next free block */
   159    160   };
   160    161   
   161    162   /*
   162    163   ** Number of bytes on a single overflow page.
   163    164   */
   164    165   #define OVERFLOW_SIZE (SQLITE_PAGE_SIZE-sizeof(Pgno))
   165    166   
................................................................................
   172    173   ** Unused pages in the database are also represented by instances of
   173    174   ** the OverflowPage structure.  The Page1Header.freeList field is the
   174    175   ** page number of the first page in a linked list of unused database
   175    176   ** pages.
   176    177   */
   177    178   struct OverflowPage {
   178    179     Pgno next;
   179         -  char aData[OVERFLOW_SIZE];
          180  +  char aPayload[OVERFLOW_SIZE];
   180    181   };
   181    182   
   182    183   /*
   183    184   ** For every page in the database file, an instance of the following structure
   184         -** is stored in memory.  The aPage[] array contains the data obtained from
          185  +** is stored in memory.  The aDisk[] array contains the data obtained from
   185    186   ** the disk.  The rest is auxiliary data that held in memory only.  The
   186    187   ** auxiliary data is only valid for regular database pages - the auxiliary
   187    188   ** data is meaningless for overflow pages and pages on the freelist.
   188    189   **
   189         -** Of particular interest in the auxiliary data is the aCell[] entry.  Each
   190         -** aCell[] entry is a pointer to a Cell structure in aPage[].  The cells are
          190  +** Of particular interest in the auxiliary data is the apCell[] entry.  Each
          191  +** apCell[] entry is a pointer to a Cell structure in aDisk[].  The cells are
   191    192   ** put in this array so that they can be accessed in constant time, rather
   192    193   ** than in linear time which would be needed if we walked the linked list.
   193    194   **
   194    195   ** The pParent field points back to the parent page.  This allows us to
   195    196   ** walk up the BTree from any leaf to the root.  Care must be taken to
   196    197   ** unref() the parent page pointer when this page is no longer referenced.
   197    198   ** The pageDestructor() routine handles that.
   198    199   */
   199    200   struct MemPage {
   200         -  char aPage[SQLITE_PAGE_SIZE];  /* Page data stored on disk */
   201         -  unsigned char isInit;          /* True if auxiliary data is initialized */
   202         -  unsigned char validLeft;       /* True if MemPage.left is valid */
   203         -  unsigned char validRight;      /* True if MemPage.right is valid */
          201  +  char aDisk[SQLITE_PAGE_SIZE];  /* Page data stored on disk */
          202  +  int isInit;                    /* True if auxiliary data is initialized */
   204    203     MemPage *pParent;              /* The parent of this page.  NULL for root */
   205         -  Pgno left;                     /* Left sibling page.  0==none */
   206         -  Pgno right;                    /* Right sibling page.  0==none */
   207         -  int idxStart;                  /* Index in aPage[] of real data */
   208         -  PageHdr *pStart;               /* Points to aPage[idxStart] */
   209         -  int nFree;                     /* Number of free bytes in aPage[] */
          204  +  int idxStart;                  /* Index in aDisk[] of real data */
          205  +  PageHdr *pHdr;                 /* Points to aDisk[idxStart] */
          206  +  int nFree;                     /* Number of free bytes in aDisk[] */
   210    207     int nCell;                     /* Number of entries on this page */
   211         -  Cell *aCell[MX_CELL];          /* All data entires in sorted order */
          208  +  Cell *apCell[MX_CELL];         /* All data entires in sorted order */
   212    209   }
   213    210   
   214    211   /*
   215    212   ** The in-memory image of a disk page has the auxiliary information appended
   216    213   ** to the end.  EXTRA_SIZE is the number of bytes of space needed to hold
   217    214   ** that extra information.
   218    215   */
................................................................................
   228    225     int inTrans;          /* True if a transaction is in progress */
   229    226   };
   230    227   typedef Btree Bt;
   231    228   
   232    229   /*
   233    230   ** A cursor is a pointer to a particular entry in the BTree.
   234    231   ** The entry is identified by its MemPage and the index in
   235         -** MemPage.aCell[] of the entry.
          232  +** MemPage.apCell[] of the entry.
   236    233   */
   237    234   struct BtCursor {
   238         -  Btree *pBt;                     /* The pointer back to the BTree */
   239         -  BtCursor *pPrev, *pNext;        /* List of all cursors */
   240         -  MemPage *pPage;                 /* Page that contains the entry */
   241         -  int idx;                        /* Index of the entry in pPage->aCell[] */
   242         -  int skip_incr;                  /* */
          235  +  Btree *pBt;               /* The Btree to which this cursor belongs */
          236  +  BtCursor *pPrev, *pNext;  /* List of all cursors */
          237  +  MemPage *pPage;           /* Page that contains the entry */
          238  +  u16 idx;                  /* Index of the entry in pPage->apCell[] */
          239  +  u8 bSkipNext;             /* sqliteBtreeNext() is no-op if true */
          240  +  u8 iMatch;                /* compare result from last sqliteBtreeMoveto() */
   243    241   };
   244    242   
   245    243   /*
   246    244   ** Compute the total number of bytes that a Cell needs on the main
   247         -** database page.  The number returned includes the Cell header, but
   248         -** not any overflow pages.
          245  +** database page.  The number returned includes the Cell header,
          246  +** local payload storage, and the pointer to overflow pages (if
          247  +** applicable).  The point of this routine is that it does not
          248  +** include payload storage on overflow pages.
   249    249   */
   250    250   static int cellSize(Cell *pCell){
   251    251     int n = pCell->h.nKey + pCell->h.nData;
   252    252     if( n>MX_LOCAL_PAYLOAD ){
   253    253       n = MX_LOCAL_PAYLOAD + sizeof(Pgno);
   254    254     }else{
   255    255       n = ROUNDUP(n);
................................................................................
   266    266   static void defragmentPage(MemPage *pPage){
   267    267     int pc;
   268    268     int i, n;
   269    269     FreeBlk *pFBlk;
   270    270     char newPage[SQLITE_PAGE_SIZE];
   271    271   
   272    272     pc = ROUNDUP(pPage->idxStart + sizeof(PageHdr));
   273         -  pPage->pStart->firstCell = pc;
   274         -  memcpy(newPage, pPage->aPage, pc);
          273  +  pPage->pHdr->firstCell = pc;
          274  +  memcpy(newPage, pPage->aDisk, pc);
   275    275     for(i=0; i<pPage->nCell; i++){
   276         -    Cell *pCell = &pPage->aCell[i];
          276  +    Cell *pCell = &pPage->apCell[i];
   277    277       n = cellSize(pCell);
   278    278       pCell->h.iNext = i<pPage->nCell ? pc + n : 0;
   279    279       memcpy(&newPage[pc], pCell, n);
   280         -    pPage->aCell[i] = (Cell*)&pPage->aPage[pc];
          280  +    pPage->apCell[i] = (Cell*)&pPage->aDisk[pc];
   281    281       pc += n;
   282    282     }
   283    283     assert( pPage->nFree==SQLITE_PAGE_SIZE-pc );
   284         -  memcpy(pPage->aPage, newPage, pc);
   285         -  pFBlk = &pPage->aPage[pc];
          284  +  memcpy(pPage->aDisk, newPage, pc);
          285  +  pFBlk = &pPage->aDisk[pc];
   286    286     pFBlk->iSize = SQLITE_PAGE_SIZE - pc;
   287    287     pFBlk->iNext = 0;
   288         -  pPage->pStart->firstFree = pc;
          288  +  pPage->pHdr->firstFree = pc;
   289    289     memset(&pFBlk[1], 0, SQLITE_PAGE_SIZE - pc - sizeof(FreeBlk));
   290    290   }
   291    291   
   292    292   /*
   293    293   ** Allocate space on a page.  The space needs to be at least
   294    294   ** nByte bytes in size.  (Actually, all allocations are rounded
   295    295   ** up to the next even multiple of 4.)  Return the index into
   296         -** pPage->aPage[] of the first byte of the new allocation.
          296  +** pPage->aDisk[] of the first byte of the new allocation.
   297    297   ** Or return 0 if there is not enough free space on the page to
   298    298   ** satisfy the allocation request.
   299    299   **
   300    300   ** If the page contains nBytes of free space but does not contain
   301    301   ** nBytes of contiguous free space, then defragementPage() is
   302    302   ** called to consolidate all free space before allocating the
   303    303   ** new chunk.
   304    304   */
   305    305   static int allocSpace(MemPage *pPage, int nByte){
   306    306     FreeBlk *p;
   307    307     u16 *pIdx;
   308    308     int start;
   309    309   
   310         -  nByte = ROUNDUP(nByte);
          310  +  assert( nByte==ROUNDUP(nByte) );
   311    311     if( pPage->nFree<nByte ) return 0;
   312         -  pIdx = &pPage->pStart->firstFree;
   313         -  p = (FreeBlk*)&pPage->aPage[*pIdx];
          312  +  pIdx = &pPage->pHdr->firstFree;
          313  +  p = (FreeBlk*)&pPage->aDisk[*pIdx];
   314    314     while( p->iSize<nByte ){
   315    315       if( p->iNext==0 ){
   316    316         defragmentPage(pPage);
   317         -      pIdx = &pPage->pStart->firstFree;
          317  +      pIdx = &pPage->pHdr->firstFree;
   318    318       }else{
   319    319         pIdx = &p->iNext;
   320    320       }
   321         -    p = (FreeBlk*)&pPage->aPage[*pIdx];
          321  +    p = (FreeBlk*)&pPage->aDisk[*pIdx];
   322    322     }
   323    323     if( p->iSize==nByte ){
   324    324       start = *pIdx;
   325    325       *pIdx = p->iNext;
   326    326     }else{
   327    327       start = *pIdx;
   328         -    FreeBlk *pNew = (FreeBlk*)&pPage->aPage[start + nByte];
          328  +    FreeBlk *pNew = (FreeBlk*)&pPage->aDisk[start + nByte];
   329    329       pNew->iNext = p->iNext;
   330    330       pNew->iSize = p->iSize - nByte;
   331    331       *pIdx = start + nByte;
   332    332     }
   333    333     pPage->nFree -= nByte;
   334    334     return start;
   335    335   }
   336    336   
   337    337   /*
   338         -** Return a section of the MemPage.aPage[] to the freelist.
   339         -** The first byte of the new free block is pPage->aPage[start]
          338  +** Return a section of the MemPage.aDisk[] to the freelist.
          339  +** The first byte of the new free block is pPage->aDisk[start]
   340    340   ** and the size of the block is "size".
   341    341   **
   342    342   ** Most of the effort here is involved in coalesing adjacent
   343    343   ** free blocks into a single big free block.
   344    344   */
   345    345   static void freeSpace(MemPage *pPage, int start, int size){
   346    346     int end = start + size;
................................................................................
   347    347     u16 *pIdx, idx;
   348    348     FreeBlk *pFBlk;
   349    349     FreeBlk *pNew;
   350    350     FreeBlk *pNext;
   351    351   
   352    352     assert( size == ROUNDUP(size) );
   353    353     assert( start == ROUNDUP(start) );
   354         -  pIdx = &pPage->pStart->firstFree;
          354  +  pIdx = &pPage->pHdr->firstFree;
   355    355     idx = *pIdx;
   356    356     while( idx!=0 && idx<start ){
   357         -    pFBlk = (FreeBlk*)&pPage->aPage[idx];
          357  +    pFBlk = (FreeBlk*)&pPage->aDisk[idx];
   358    358       if( idx + pFBlk->iSize == start ){
   359    359         pFBlk->iSize += size;
   360    360         if( idx + pFBlk->iSize == pFBlk->iNext ){
   361         -        pNext = (FreeBlk*)&pPage->aPage[pFblk->iNext];
          361  +        pNext = (FreeBlk*)&pPage->aDisk[pFblk->iNext];
   362    362           pFBlk->iSize += pNext->iSize;
   363    363           pFBlk->iNext = pNext->iNext;
   364    364         }
   365    365         pPage->nFree += size;
   366    366         return;
   367    367       }
   368    368       pIdx = &pFBlk->iNext;
   369    369       idx = *pIdx;
   370    370     }
   371         -  pNew = (FreeBlk*)&pPage->aPage[start];
          371  +  pNew = (FreeBlk*)&pPage->aDisk[start];
   372    372     if( idx != end ){
   373    373       pNew->iSize = size;
   374    374       pNew->iNext = idx;
   375    375     }else{
   376         -    pNext = (FreeBlk*)&pPage->aPage[idx];
          376  +    pNext = (FreeBlk*)&pPage->aDisk[idx];
   377    377       pNew->iSize = size + pNext->iSize;
   378    378       pNew->iNext = pNext->iNext;
   379    379     }
   380    380     *pIdx = start;
   381    381     pPage->nFree += size;
   382    382   }
   383    383   
   384    384   /*
   385    385   ** Initialize the auxiliary information for a disk block.
          386  +**
          387  +** The pParent field is always
   386    388   **
   387    389   ** Return SQLITE_OK on success.  If we see that the page does
   388    390   ** not contained a well-formed database page, then return 
   389    391   ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
   390    392   ** guarantee that the page is well-formed.  It only shows that
   391    393   ** we failed to detect any corruption.
   392    394   */
   393    395   static int initPage(MemPage *pPage, Pgno pgnoThis, MemPage *pParent){
   394         -  int idx;
   395         -  Cell *pCell;
   396         -  FreeBlk *pFBlk;
          396  +  int idx;           /* An index into pPage->aDisk[] */
          397  +  Cell *pCell;       /* A pointer to a Cell in pPage->aDisk[] */
          398  +  FreeBlk *pFBlk;    /* A pointer to a free block in pPage->aDisk[] */
          399  +  int sz;            /* The size of a Cell in bytes */
          400  +  int freeSpace;     /* Amount of free space on the page */
   397    401   
          402  +  if( pPage->pParent ){
          403  +    assert( pPage->pParent==pParent );
          404  +    return SQLITE_OK;
          405  +  }
          406  +  if( pParent ){
          407  +    pPage->pParent = pParent;
          408  +    sqlitepager_ref(pParent);
          409  +  }
          410  +  if( pPage->isInit ) return SQLITE_OK;
   398    411     pPage->idxStart = (pgnoThis==1) ? sizeof(Page1Header) : 0;
   399         -  pPage->pStart = (PageHdr*)&pPage->aPage[pPage->idxStart];
          412  +  pPage->pHdr = (PageHdr*)&pPage->aDisk[pPage->idxStart];
   400    413     pPage->isInit = 1;
   401         -  assert( pPage->pParent==0 );
   402         -  pPage->pParent = pParent;
   403         -  if( pParent ) sqlitepager_ref(pParent);
   404    414     pPage->nCell = 0;
   405         -  idx = pPage->pStart->firstCell;
          415  +  freeSpace = SQLITE_PAGE_SIZE - pPage->idxStart - sizeof(PageHeader);
          416  +  idx = pPage->pHdr->firstCell;
   406    417     while( idx!=0 ){
   407    418       if( idx>SQLITE_PAGE_SIZE-MN_CELL_SIZE ) goto page_format_error;
   408    419       if( idx<pPage->idxStart + sizeof(PageHeader) ) goto page_format_error;
   409         -    pCell = (Cell*)&pPage->aPage[idx];
   410         -    pPage->aCell[pPage->nCell++] = pCell;
          420  +    pCell = (Cell*)&pPage->aDisk[idx];
          421  +    sz = cellSize(pCell);
          422  +    if( idx+sz > SQLITE_PAGE_SIZE ) goto page_format_error;
          423  +    freeSpace -= sz;
          424  +    pPage->apCell[pPage->nCell++] = pCell;
   411    425       idx = pCell->h.iNext;
   412    426     }
   413    427     pPage->nFree = 0;
   414         -  idx = pPage->pStart->firstFree;
          428  +  idx = pPage->pHdr->firstFree;
   415    429     while( idx!=0 ){
   416    430       if( idx>SQLITE_PAGE_SIZE-sizeof(FreeBlk) ) goto page_format_error;
   417    431       if( idx<pPage->idxStart + sizeof(PageHeader) ) goto page_format_error;
   418         -    pFBlk = (FreeBlk*)&pPage->aPage[idx];
          432  +    pFBlk = (FreeBlk*)&pPage->aDisk[idx];
   419    433       pPage->nFree += pFBlk->iSize;
   420    434       if( pFBlk->iNext <= idx ) goto page_format_error;
   421    435       idx = pFBlk->iNext;
   422    436     }
          437  +  if( pPage->nFree!=freeSpace ) goto page_format_error;
   423    438     return SQLITE_OK;
   424    439   
   425    440   page_format_error:
   426    441     return SQLITE_CORRUPT;
   427    442   }
   428    443   
   429    444   /*
................................................................................
   605    620     pCur->pBt = pBt;
   606    621     rc = sqlitepager_get(pBt->pPager, 1, &pCur->pPage);
   607    622     if( rc!=SQLITE_OK ){
   608    623       sqliteFree(pCur);
   609    624       *ppCur = 0;
   610    625       return rc;
   611    626     }
   612         -  if( !pCur->pPage->isInit ){
   613         -    initPage(pCur->pPage, 1, 0);
   614         -  }
          627  +  initPage(pCur->pPage, 1, 0);
   615    628     pCur->idx = 0;
   616         -  pCur->depth = 0;
   617         -  pCur->aPage[0] = pCur->pPage;
   618    629     *ppCur = pCur;
   619    630     return SQLITE_OK;
   620    631   }
   621    632   
   622    633   /*
   623    634   ** Close a cursor. 
   624    635   */
................................................................................
   635    646     }
   636    647     sqlitepager_unref(pCur->pPage);
   637    648     if( pBt->pCursor==0 && pBt->inTrans==0 ){
   638    649       unlockBtree(pBt);
   639    650     }
   640    651     sqliteFree(pCur);
   641    652   }
          653  +
          654  +/*
          655  +** Make a temporary cursor by filling in the fields of pTempCur.
          656  +** The temporary cursor is not on the cursor list for the Btree.
          657  +*/
          658  +static void createTemporaryCursor(BtCursor *pCur, BtCursor *pTempCur){
          659  +  memcpy(pTempCur, pCur, sizeof(*pCur));
          660  +  pTempCur->pNext = 0;
          661  +  pTempCur->pPrev = 0;
          662  +  sqlitepager_ref(pTempCur->pPage);
          663  +}
          664  +
          665  +/*
          666  +** Delete a temporary cursor such as was made by the createTemporaryCursor()
          667  +** function above.
          668  +*/
          669  +static void destroyTemporaryCursor(BeCursor *pCur){
          670  +  sqlitepager_unref(pCur->pPage);
          671  +}
   642    672   
   643    673   /*
   644    674   ** Write the number of bytes of key for the entry the cursor is
   645    675   ** pointing to into *pSize.  Return SQLITE_OK.  Failure is not
   646    676   ** possible.
   647    677   */
   648    678   int sqliteBtreeKeySize(BtCursor *pCur, int *pSize){
................................................................................
   650    680     MemPage *pPage;
   651    681   
   652    682     pPage = pCur->pPage;
   653    683     assert( pPage!=0 );
   654    684     if( pCur->idx >= pPage->nCell ){
   655    685       *pSize = 0;
   656    686     }else{
   657         -    pCell = pPage->aCell[pCur->idx];
          687  +    pCell = pPage->apCell[pCur->idx];
   658    688       *psize = pCell->h.nKey;
   659    689     }
   660    690     return SQLITE_OK;
   661    691   }
   662    692   
   663    693   /*
   664    694   ** Read payload information from the entry that the pCur cursor is
................................................................................
   665    695   ** pointing to.  Begin reading the payload at "offset" and read
   666    696   ** a total of "amt" bytes.  Put the result in zBuf.
   667    697   **
   668    698   ** This routine does not make a distinction between key and data.
   669    699   ** It just reads bytes from the payload area.
   670    700   */
   671    701   static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){
   672         -  char *aData;
          702  +  char *aPayload;
   673    703     Pgno nextPage;
   674    704     assert( pCur!=0 && pCur->pPage!=0 );
   675    705     assert( pCur->idx>=0 && pCur->idx<pCur->nCell );
   676         -  aData = pCur->pPage->aCell[pCur->idx].aData;
          706  +  aPayload = pCur->pPage->apCell[pCur->idx].aPayload;
   677    707     if( offset<MX_LOCAL_PAYLOAD ){
   678    708       int a = amt;
   679    709       if( a+offset>MX_LOCAL_PAYLOAD ){
   680    710         a = MX_LOCAL_PAYLOAD - offset;
   681    711       }
   682         -    memcpy(zBuf, &aData[offset], a);
          712  +    memcpy(zBuf, &aPayload[offset], a);
   683    713       if( a==amt ){
   684    714         return SQLITE_OK;
   685    715       }
   686    716       offset += a;
   687    717       zBuf += a;
   688    718       amt -= a;
   689    719       if( amt>0 ){
   690    720         assert( a==ROUNDUP(a) );
   691         -      nextPage = *(Pgno*)&aData[a];
          721  +      nextPage = *(Pgno*)&aPayload[a];
   692    722       }
   693    723     }
   694    724     while( amt>0 && nextPage ){
   695    725       OverflowPage *pOvfl;
   696    726       rc = sqlitepager_get(pCur->pBt->pPager, nextPage, &pOvfl);
   697    727       if( rc!=0 ){
   698    728         return rc;
................................................................................
   699    729       }
   700    730       nextPage = pOvfl->next;
   701    731       if( offset<OVERFLOW_SIZE ){
   702    732         int a = amt;
   703    733         if( a + offset > OVERFLOW_SIZE ){
   704    734           a = OVERFLOW_SIZE - offset;
   705    735         }
   706         -      memcpy(zBuf, &pOvfl->aData[offset], a);
          736  +      memcpy(zBuf, &pOvfl->aPayload[offset], a);
   707    737         offset += a;
   708    738         amt -= a;
   709    739         zBuf += a;
   710    740       }
   711    741       sqlitepager_unref(pOvfl);
   712    742     }
   713    743     return amt==0 ? SQLITE_OK : SQLITE_CORRUPT;
................................................................................
   728    758     if( offset<0 ) return SQLITE_ERROR;
   729    759     if( amt==0 ) return SQLITE_OK;
   730    760     pPage = pCur->pPage;
   731    761     assert( pPage!=0 );
   732    762     if( pCur->idx >= pPage->nCell ){
   733    763       return SQLITE_ERROR;
   734    764     }
   735         -  pCell = pPage->aCell[pCur->idx];
          765  +  pCell = pPage->apCell[pCur->idx];
   736    766     if( amt+offset > pCell->h.nKey ){
   737    767     return getPayload(pCur, offset, amt, zBuf);
   738    768   }
   739    769   
   740    770   /*
   741    771   ** Write the number of bytes of data on the entry that the cursor
   742    772   ** is pointing to into *pSize.  Return SQLITE_OK.  Failure is
................................................................................
   747    777     MemPage *pPage;
   748    778   
   749    779     pPage = pCur->pPage;
   750    780     assert( pPage!=0 );
   751    781     if( pCur->idx >= pPage->nCell ){
   752    782       *pSize = 0;
   753    783     }else{
   754         -    pCell = pPage->aCell[pCur->idx];
          784  +    pCell = pPage->apCell[pCur->idx];
   755    785       *pSize = pCell->h.nData;
   756    786     }
   757    787     return SQLITE_OK;
   758    788   }
   759    789   
   760    790   /*
   761    791   ** Read part of the data associated with cursor pCur.  A total
................................................................................
   772    802     if( offset<0 ) return SQLITE_ERROR;
   773    803     if( amt==0 ) return SQLITE_OK;
   774    804     pPage = pCur->pPage;
   775    805     assert( pPage!=0 );
   776    806     if( pCur->idx >= pPage->nCell ){
   777    807       return SQLITE_ERROR;
   778    808     }
   779         -  pCell = pPage->aCell[pCur->idx];
          809  +  pCell = pPage->apCell[pCur->idx];
   780    810     if( amt+offset > pCell->h.nKey ){
   781    811     return getPayload(pCur, offset + pCell->h.nKey, amt, zBuf);
   782    812   }
   783    813   
   784    814   /*
   785    815   ** Compare the key for the entry that pCur points to against the 
   786    816   ** given key (pKey,nKeyOrig).  Put the comparison result in *pResult.
................................................................................
   796    826     Pgno nextPage;
   797    827     int nKey = nKeyOrig;
   798    828     int n;
   799    829     Cell *pCell;
   800    830   
   801    831     assert( pCur->pPage );
   802    832     assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
   803         -  pCell = &pCur->pPage->aCell[pCur->idx];
          833  +  pCell = &pCur->pPage->apCell[pCur->idx];
   804    834     if( nKey > pCell->h.nKey ){
   805    835       nKey = pCell->h.nKey;
   806    836     }
   807    837     n = nKey;
   808    838     if( n>MX_LOCAL_PAYLOAD ){
   809    839       n = MX_LOCAL_PAYLOAD;
   810    840     }
   811         -  c = memcmp(pCell->aData, pKey, n);
          841  +  c = memcmp(pCell->aPayload, pKey, n);
   812    842     if( c!=0 ){
   813    843       *pResult = c;
   814    844       return SQLITE_OK;
   815    845     }
   816    846     pKey += n;
   817    847     nKey -= n;
   818    848     nextPage = pCell->ovfl;
................................................................................
   826    856         return rc;
   827    857       }
   828    858       nextPage = pOvfl->next;
   829    859       n = nKey;
   830    860       if( n>OVERFLOW_SIZE ){
   831    861         n = OVERFLOW_SIZE;
   832    862       }
   833         -    c = memcmp(pOvfl->aData, pKey, n);
          863  +    c = memcmp(pOvfl->aPayload, pKey, n);
   834    864       sqlitepager_unref(pOvfl);
   835    865       if( c!=0 ){
   836    866         *pResult = c;
   837    867         return SQLITE_OK;
   838    868       }
   839    869       nKey -= n;
   840    870       pKey += n;
................................................................................
   843    873     *pResult = c;
   844    874     return SQLITE_OK;
   845    875   }
   846    876   
   847    877   /*
   848    878   ** Move the cursor down to a new child page.
   849    879   */
   850         -static int childPage(BtCursor *pCur, int newPgno){
          880  +static int moveToChild(BtCursor *pCur, int newPgno){
   851    881     int rc;
   852    882     MemPage *pNewPage;
   853    883   
   854    884     rc = sqlitepager_get(pCur->pBt->pPager, newPgno, &pNewPage);
   855    885     if( rc ){
   856    886       return rc;
   857    887     }
   858         -  if( !pNewPage->isInit ){
   859         -    initPage(pNewPage, newPgno, pCur->pPage);
   860         -  }
          888  +  initPage(pNewPage, newPgno, pCur->pPage);
   861    889     sqlitepager_unref(pCur->pPage);
   862    890     pCur->pPage = pNewPage;
   863    891     pCur->idx = 0;
   864    892     return SQLITE_OK;
   865    893   }
   866    894   
   867    895   /*
   868         -** Move the cursor up to the parent page
          896  +** Move the cursor up to the parent page.
          897  +**
          898  +** pCur->idx is set to the cell index that contains the pointer
          899  +** to the page we are coming from.  If we are coming from the
          900  +** right-most child page then pCur->idx is set to one more than
          901  +** the largets cell index.
   869    902   */
   870         -static int parentPage(BtCursor *pCur){
          903  +static int moveToParent(BtCursor *pCur){
   871    904     Pgno oldPgno;
   872    905     MemPage *pParent;
   873    906   
   874    907     pParent = pCur->pPage->pParent;
   875    908     oldPgno = sqlitepager_pagenumber(pCur->pPage);
   876    909     if( pParent==0 ){
   877    910       return SQLITE_INTERNAL;
   878    911     }
   879    912     sqlitepager_ref(pParent);
   880    913     sqlitepager_unref(pCur->pPage);
   881    914     pCur->pPage = pParent;
   882    915     pCur->idx = pPage->nCell;
   883    916     for(i=0; i<pPage->nCell; i++){
   884         -    if( pPage->aCell[i].h.pgno==oldPgno ){
          917  +    if( pPage->apCell[i].h.leftChild==oldPgno ){
   885    918         pCur->idx = i;
   886    919         break;
   887    920       }
   888    921     }
          922  +  return SQLITE_OK;
   889    923   }
   890    924   
   891    925   /*
   892    926   ** Move the cursor to the root page
   893    927   */
   894         -static int rootPage(BtCursor *pCur){
          928  +static int moveToRoot(BtCursor *pCur){
   895    929     MemPage *pNew;
   896    930     pNew = pCur->pBt->page1;
   897    931     sqlitepager_ref(pNew);
   898    932     sqlitepager_unref(pCur->pPage);
   899    933     pCur->pPage = pNew;
   900    934     pCur->idx = 0;
   901    935     return SQLITE_OK;
   902    936   }
          937  +
          938  +/*
          939  +** Move the cursor down to the left-most leaf entry beneath the
          940  +** entry to which it is currently pointing.
          941  +*/
          942  +static int moveToLeftmost(BtCursor *pCur){
          943  +  Pgno pgno;
          944  +  int rc;
          945  +
          946  +  while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
          947  +    rc = moveToChild(pCur, pgno);
          948  +    if( rc ) return rc;
          949  +  }
          950  +  return SQLITE_OK;
          951  +}
          952  +
   903    953   
   904    954   /* Move the cursor so that it points to an entry near pKey.
   905    955   ** Return a success code.
          956  +**
          957  +** If an exact match is not found, then the cursor is always
          958  +** left point at a root page which would hold the entry if it
          959  +** were present.  The cursor might point to an entry that comes
          960  +** before or after the key.
   906    961   **
   907    962   ** If pRes!=NULL, then *pRes is written with an integer code to
   908    963   ** describe the results.  *pRes is set to 0 if the cursor is left 
   909    964   ** pointing at an entry that exactly matches pKey.  *pRes is made
   910    965   ** negative if the cursor is on the largest entry less than pKey.
   911    966   ** *pRes is set positive if the cursor is on the smallest entry
   912    967   ** greater than pKey.  *pRes is not changed if the return value
   913    968   ** is something other than SQLITE_OK;
   914    969   */
   915    970   int sqliteBtreeMoveto(BtCursor *pCur, void *pKey, int nKey, int *pRes){
   916    971     int rc;
   917         -  rc = rootPage(pCur);
          972  +  rc = moveToRoot(pCur);
   918    973     if( rc ) return rc;
   919    974     for(;;){
   920    975       int lwr, upr;
   921    976       Pgno chldPg;
   922    977       MemPage *pPage = pCur->pPage;
   923    978       lwr = 0;
   924    979       upr = pPage->nCell-1;
   925    980       while( lwr<=upr ){
   926    981         int c;
   927    982         pCur->idx = (lwr+upr)/2;
   928    983         rc = compareKey(pCur, pKey, nKey, &c);
   929    984         if( rc ) return rc;
   930    985         if( c==0 ){
          986  +        pCur->iMatch = c;
   931    987           if( pRes ) *pRes = 0;
   932    988           return SQLITE_OK;
   933    989         }
   934    990         if( c<0 ){
   935    991           lwr = pCur->idx+1;
   936    992         }else{
   937    993           upr = pCur->idx-1;
   938    994         }
   939    995       }
   940    996       assert( lwr==upr+1 );
   941    997       if( lwr>=pPage->nCell ){
   942         -      chldPg = pPage->pStart->pgno;
          998  +      chldPg = pPage->pHdr->rightChild;
   943    999       }else{
   944         -      chldPg = pPage->aCell[lwr].pgno;
         1000  +      chldPg = pPage->apCell[lwr]->h.leftChild;
   945   1001       }
   946   1002       if( chldPg==0 ){
         1003  +      pCur->iMatch = c;
   947   1004         if( pRes ) *pRes = c;
   948   1005         return SQLITE_OK;
   949   1006       }
   950         -    rc = childPage(pCur, chldPg);
         1007  +    rc = moveToChild(pCur, chldPg);
   951   1008       if( rc ) return rc;
   952   1009     }
   953   1010   }
   954   1011   
   955   1012   /*
   956   1013   ** Advance the cursor to the next entry in the database.  If pRes!=NULL
   957   1014   ** then set *pRes=0 on success and set *pRes=1 if the cursor was
   958   1015   ** pointing to the last entry in the database.
   959   1016   */
   960   1017   int sqliteBtreeNext(BtCursor *pCur, int *pRes){
   961         -  MemPage *pPage;
   962   1018     int rc;
   963         -  int moved = 0;
   964         -  if( pCur->skip_next ){
   965         -    pCur->skip_next = 0;
         1019  +  if( pCur->bSkipNext ){
         1020  +    pCur->bSkipNext = 0;
   966   1021       if( pRes ) *pRes = 0;
   967   1022       return SQLITE_OK;
   968   1023     }
   969         -  pPage = pCur->pPage;
   970   1024     pCur->idx++;
   971         -  while( pCur->idx>=pPage->nCell ){
   972         -    if( pCur->depth==0 ){
   973         -      if( pRes ) *pRes = 1;
         1025  +  if( pCur->idx>=pCur->pPage->nCell ){
         1026  +    if( pPage->pHdr->rightChild ){
         1027  +      rc = moveToChild(pCur, pPage->pHdr->rightChild);
         1028  +      if( rc ) return rc;
         1029  +      rc = moveToLeftmost(pCur);
         1030  +      if( rc ) return rc;
         1031  +      if( pRes ) *pRes = 0;
   974   1032         return SQLITE_OK;
   975   1033       }
   976         -    rc = parentPage(pCur);
   977         -    if( rc ) return rc;
   978         -    moved = 1;
   979         -    pPage = pCur->pPage;
   980         -  }
   981         -  if( moved ){
         1034  +    do{
         1035  +      if( pCur->pParent==0 ){
         1036  +        if( pRes ) *pRes = 1;
         1037  +        return SQLITE_OK;
         1038  +      }
         1039  +      rc = moveToParent(pCur);
         1040  +      if( rc ) return rc;
         1041  +    }while( pCur->idx>=pCur->pPage->nCell );
   982   1042       if( pRes ) *pRes = 0;
   983   1043       return SQLITE_OK;
   984   1044     }
   985         -  while( pCur->idx<pPage->nCell && pPage->aCell[pCur->idx].pgno>0 ){
   986         -    rc = childPage(pCur, pPage->aCell[pCur->idx].pgno);
   987         -    if( rc ) return rc;
   988         -    pPage = pCur->pPage;
   989         -  }
         1045  +  rc = moveToLeftmost(pCur);
         1046  +  if( rc ) return rc;
   990   1047     if( pRes ) *pRes = 0;
   991   1048     return SQLITE_OK;
   992   1049   }
   993   1050   
   994   1051   /*
   995   1052   ** Allocate a new page from the database file.
   996   1053   **
................................................................................
  1027   1084     }
  1028   1085     return rc;
  1029   1086   }
  1030   1087   
  1031   1088   /*
  1032   1089   ** Add a page of the database file to the freelist.  Either pgno or
  1033   1090   ** pPage but not both may be 0. 
         1091  +**
         1092  +** sqlitepager_unref() is NOT called for pPage.  The calling routine
         1093  +** needs to do that.
  1034   1094   */
  1035   1095   static int freePage(Btree *pBt, void *pPage, Pgno pgno){
  1036   1096     Page1Header *pPage1 = (Page1Header*)pBt->page1;
  1037   1097     OverflowPage *pOvfl = (OverflowPage*)pPage;
  1038   1098     int rc;
  1039   1099     int needOvflUnref = 0;
  1040   1100     if( pgno==0 ){
................................................................................
  1054   1114     rc = sqlitepager_write(pOvfl);
  1055   1115     if( rc ){
  1056   1116       if( needOvflUnref ) sqlitepager_unref(pOvfl);
  1057   1117       return rc;
  1058   1118     }
  1059   1119     pOvfl->next = pPage1->freeList;
  1060   1120     pPage1->freeList = pgno;
  1061         -  memset(pOvfl->aData, 0, OVERFLOW_SIZE);
         1121  +  memset(pOvfl->aPayload, 0, OVERFLOW_SIZE);
  1062   1122     rc = sqlitepager_unref(pOvfl);
  1063   1123     return rc;
  1064   1124   }
  1065   1125   
  1066   1126   /*
  1067   1127   ** Erase all the data out of a cell.  This involves returning overflow
  1068   1128   ** pages back the freelist.
................................................................................
  1070   1130   static int clearCell(Btree *pBt, Cell *pCell){
  1071   1131     Pager *pPager = pBt->pPager;
  1072   1132     OverflowPage *pOvfl;
  1073   1133     Page1Header *pPage1 = (Page1Header*)pBt->page1;
  1074   1134     Pgno ovfl, nextOvfl;
  1075   1135     int rc;
  1076   1136   
         1137  +  if( pCell->h.nKey + pCell->h.nData <= MX_LOCAL_PAYLOAD ){
         1138  +    return SQLITE_OK;
         1139  +  }
  1077   1140     ovfl = pCell->ovfl;
  1078   1141     pCell->ovfl = 0;
  1079   1142     while( ovfl ){
  1080   1143       rc = sqlitepager_get(pPager, ovfl, &pOvfl);
  1081   1144       if( rc ) return rc;
  1082   1145       nextOvfl = pOvfl->next;
  1083   1146       freePage(pBt, pOvfl, ovfl);
  1084   1147       ovfl = nextOvfl;
  1085   1148       sqlitepager_unref(pOvfl);
  1086   1149     }
         1150  +  return SQLITE_OK;
  1087   1151   }
  1088   1152   
  1089   1153   /*
  1090   1154   ** Create a new cell from key and data.  Overflow pages are allocated as
  1091   1155   ** necessary and linked to this cell.  
  1092   1156   */
  1093   1157   static int fillInCell(
................................................................................
  1100   1164     Pgno *pNext;
  1101   1165     int spaceLeft;
  1102   1166     int n;
  1103   1167     int nPayload;
  1104   1168     char *pPayload;
  1105   1169     char *pSpace;
  1106   1170   
  1107         -  pCell->h.pgno = 0;
         1171  +  pCell->h.leftChild = 0;
  1108   1172     pCell->h.nKey = nKey;
  1109   1173     pCell->h.nData = nData;
  1110   1174     pCell->h.iNext = 0;
  1111   1175   
  1112   1176     pNext = &pCell->ovfl;
  1113         -  pSpace = pCell->aData;
         1177  +  pSpace = pCell->aPayload;
  1114   1178     spaceLeft = MX_LOCAL_PAYLOAD;
  1115   1179     pPayload = pKey;
  1116   1180     pKey = 0;
  1117   1181     nPayload = nKey;
  1118   1182     while( nPayload>0 ){
  1119   1183       if( spaceLeft==0 ){
  1120   1184         rc = allocatePage(pBt, &pOvfl, pNext);
  1121   1185         if( rc ){
  1122   1186           *pNext = 0;
  1123         -        clearCell(pCell);
         1187  +        clearCell(pBt, pCell);
  1124   1188           return rc;
  1125   1189         }
  1126   1190         spaceLeft = OVERFLOW_SIZE;
  1127         -      pSpace = pOvfl->aData;
         1191  +      pSpace = pOvfl->aPayload;
  1128   1192         pNextPg = &pOvfl->next;
  1129   1193       }
  1130   1194       n = nPayload;
  1131   1195       if( n>spaceLeft ) n = spaceLeft;
  1132   1196       memcpy(pSpace, pPayload, n);
  1133   1197       nPayload -= n;
  1134   1198       if( nPayload==0 && pData ){
................................................................................
  1143   1207     }
  1144   1208     return SQLITE_OK;
  1145   1209   }
  1146   1210   
  1147   1211   /*
  1148   1212   ** Attempt to move N or more bytes out of the page that the cursor
  1149   1213   ** points to into the left sibling page.  (The left sibling page
  1150         -** contains cells that are less than the cells on this page.)  Return
         1214  +** contains cells that are less than the cells on this page.)  The
         1215  +** entry that the cursor is pointing to cannot be moved.  Return
  1151   1216   ** TRUE if successful and FALSE if not.
  1152   1217   **
  1153   1218   ** Reasons for not being successful include: 
  1154   1219   **
  1155   1220   **    (1) there is no left sibling,
  1156   1221   **    (2) we could only move N-1 bytes or less,
  1157   1222   **    (3) some kind of file I/O error occurred
         1223  +**
         1224  +** Note that a partial rotation may have occurred even if this routine
         1225  +** returns FALSE.  Failure means we could not rotation a fill N bytes.
         1226  +** If it is possible to rotation some smaller number M, then the 
         1227  +** rotation occurs but we still return false.
         1228  +**
         1229  +** Example:  Consider a segment of the Btree that looks like the
         1230  +** figure below prior to rotation.  The cursor is pointing to the
         1231  +** entry *.  The sort order of the entries is A B C D E * F Y.
         1232  +**
         1233  +**
         1234  +**            -------------------------
         1235  +**                ... | C | Y | ...
         1236  +**            -------------------------
         1237  +**                     /     \
         1238  +**            ---------       -----------------
         1239  +**            | A | B |       | D | E | * | F |
         1240  +**            ---------       -----------------
         1241  +**
         1242  +** After rotation of two cells (D and E), the same Btree segment 
         1243  +** looks like this:
         1244  +**
         1245  +**            -------------------------
         1246  +**                ... | E | Y | ...
         1247  +**            -------------------------
         1248  +**                     /     \
         1249  +**    -----------------       ---------
         1250  +**    | A | B | C | D |       | * | F |
         1251  +**    -----------------       ---------
         1252  +**
         1253  +** The size of this rotation is the size by which the page containing
         1254  +** the cursor was reduced.  In this case, the size of D and E.
         1255  +**
  1158   1256   */
  1159   1257   static int rotateLeft(BtCursor *pCur, int N){
         1258  +  return 0;
         1259  +}
         1260  +
         1261  +/*
         1262  +** This routine is the same as rotateLeft() except that it move data
         1263  +** to the right instead of to the left.  See comments on the rotateLeft()
         1264  +** routine for additional information.
         1265  +*/
         1266  +static int rotateRight(BtCursor *pCur, int N){
         1267  +  return 0;
  1160   1268   }
  1161   1269   
  1162   1270   /*
  1163   1271   ** Split a single database page into two roughly equal-sized pages.
  1164   1272   **
  1165   1273   ** The input is an existing page and a new Cell.  The Cell might contain
  1166         -** a valid Cell.pgno field pointing to a child page.
         1274  +** a valid Cell.h.leftChild field pointing to a child page.
  1167   1275   **
  1168   1276   ** The output is the Cell that divides the two new pages.  The content
  1169         -** of this divider Cell is written into *pCenter.  pCenter->pgno points
  1170         -** to the new page that was created to hold the smaller half of the
  1171         -** cells from the divided page.  The larger cells from the divided
  1172         -** page are written to a newly allocated page and *ppOut is made to
  1173         -** point to that page.  Except, if ppOut==NULL then the larger cells
         1277  +** of this divider Cell is written into *pCenter.  pCenter->h.leftChild
         1278  +** holds the page number of the new page that was created to hold the 
         1279  +** smaller of the cells from the divided page.  The larger cells from
         1280  +** the divided page are written to a newly allocated page and *ppOut 
         1281  +** is made to point to that page.  Or if ppOut==NULL then the larger cells
  1174   1282   ** remain on pIn.
         1283  +**
         1284  +** Upon return, pCur should be pointing to the same cell, even if that
         1285  +** cell has moved to a new page.  The cell that pCur points to cannot
         1286  +** be the pCenter cell.
  1175   1287   */
  1176   1288   static int split(
  1177         -  MemPage *pIn,       /* The page that is to be divided */
         1289  +  BtCursor *pCur,     /* A cursor pointing at a cell on the page to be split */
  1178   1290     Cell *pNewCell,     /* A new cell to add to pIn before dividing it up */
  1179   1291     Cell *pCenter,      /* Write the cell that divides the two pages here */
  1180   1292     MemPage **ppOut     /* If not NULL, put larger cells in new page at *ppOut */
  1181   1293   ){
  1182   1294     
  1183   1295   }
  1184   1296   
  1185   1297   /*
         1298  +** Unlink a cell from a database page.  Add the space used by the cell
         1299  +** back to the freelist for the database page on which the cell used to
         1300  +** reside.
         1301  +**
         1302  +** This operation overwrites the cell header and content.
         1303  +*/
         1304  +static void unlinkCell(BtCursor *pCur){
         1305  +  MemPage *pPage;    /* Page containing cell to be unlinked */
         1306  +  int idx;           /* The index of the cell to be unlinked */
         1307  +  Cell *pCell;       /* Pointer to the cell to be unlinked */
         1308  +  u16 *piCell;       /* iNext pointer from prior cell */
         1309  +  int iCell;         /* Index in pPage->aDisk[] of cell to be unlinked */
         1310  +  int i;             /* Loop counter */
         1311  +
         1312  +  pPage = pCur->pPage;
         1313  +  idx = pCur->idx;
         1314  +  pCell = pPage->apCell[idx];
         1315  +  if( idx==0 ){
         1316  +    piCell = &pPage->pHdr->firstCell;
         1317  +  }else{
         1318  +    piCell = &pPage->apCell[idx-1]->h.iNext;
         1319  +  }
         1320  +  iCell = *piCell;
         1321  +  *piCell = pCell->h.iNext;
         1322  +  freeSpace(pPage, iCell, cellSize(pCell));
         1323  +  pPage->nCell--;
         1324  +  for(i=idx; i<pPage->nCell; i++){
         1325  +    pPage->apCell[i] = pPage->apCell[i+1];
         1326  +  }
         1327  +}
         1328  +
         1329  +/*
         1330  +** Add a Cell to a database page at the spot indicated by the cursor.
         1331  +**
  1186   1332   ** With this routine, we know that the Cell pNewCell will fit into the
  1187   1333   ** database page that pCur points to.  The calling routine has made
  1188   1334   ** sure it will fit.  All this routine needs to do is add the Cell
  1189         -** to the page.
         1335  +** to the page.  The addToPage() routine should be used for cases
         1336  +** were it is not know if the new cell will fit.
         1337  +**
         1338  +** The new cell is added to the page either before or after the cell
         1339  +** to which the cursor is pointing.  The new cell is added before
         1340  +** the cursor cell if pCur->iMatch>0 and the new cell is added after
         1341  +** the cursor cell if pCur->iMatch<0.  pCur->iMatch should have been set
         1342  +** by a prior call to sqliteBtreeMoveto() where the key was the key
         1343  +** of the cell being inserted.  If sqliteBtreeMoveto() ended up on a
         1344  +** cell that is larger than the key, then pCur->iMatch was set to a
         1345  +** positive number, hence we insert the new record before the pointer
         1346  +** if pCur->iMatch is positive.  If sqliteBtreeMaveto() ended up on a
         1347  +** cell that is smaller than the key then pCur->iMatch was set to a
         1348  +** negative number, hence we insert the new record after then pointer
         1349  +** if pCur->iMatch is negative.
  1190   1350   */
  1191   1351   static int insertCell(BtCursor *pCur, Cell *pNewCell){
         1352  +  int sz;
         1353  +  int idx;
         1354  +  int i;
         1355  +  Cell *pCell, *pIdx;
         1356  +  MemPage *pPage;
         1357  +
         1358  +  pPage = pCur->pPage;
         1359  +  sz = cellSize(pNewCell);
         1360  +  idx = allocateSpace(pPage, sz);
         1361  +  assert( idx>0 && idx<=SQLITE_PAGE_SIZE - sz );
         1362  +  pCell = (Cell*)&pPage->aDisk[idx];
         1363  +  memcpy(pCell, pNewCell, sz);
         1364  +  pIdx = pPage->aDisk[pCur->idx];
         1365  +  if( pCur->iMatch<0 ){
         1366  +    /* Insert the new cell after the cell pCur points to */
         1367  +    pCell->h.iNext = pIdx->h.iNext;
         1368  +    pIdx->h.iNext = idx;
         1369  +    for(i=pPage->nCell-1; i>pCur->idx; i--){
         1370  +      pPage->apCell[i+1] = pPage->apCell[i];
         1371  +    }
         1372  +    pPage->apCell[pCur->idx+1] = pCell;
         1373  +  }else{
         1374  +    /* Insert the new cell before the cell pCur points to */
         1375  +    pCell->h.iNext = pPage->pHdr->firstCell;
         1376  +    pPage->pHdr->firstCell = idx;
         1377  +    for(i=pPage->nCell; i>0; i++){
         1378  +      pPage->apCell[i] = pPage->apCell[i-1];
         1379  +    }
         1380  +    pPage->apCell[0] = pCell;
         1381  +  }
         1382  +  pPage->nCell++;
         1383  +  if( pCell->h.leftChild ){
         1384  +    MemPage *pChild = sqlitepager_lookup(pCur->pBt, pCell->h.leftChild);
         1385  +    if( pChild && pChild->pParent ){
         1386  +      sqlitepager_unref(pChild->pParent);
         1387  +      pChild->pParent = pPage;
         1388  +      sqlitepager_ref(pChild->pParent);
         1389  +    }
         1390  +  }
         1391  +  return SQLITE_OK;
  1192   1392   }
  1193   1393   
  1194   1394   /*
  1195         -** Insert pNewCell into the database page that pCur is pointing to.
  1196         -** pNewCell->h.pgno points to a child page that comes before pNewCell->data[],
  1197         -** unless pCur is a leaf page.
         1395  +** Insert pNewCell into the database page that pCur is pointing to at
         1396  +** the place where pCur is pointing.  
         1397  +**
         1398  +** This routine works just like insertCell() except that the cell
         1399  +** to be inserted need not fit on the page.  If the new cell does 
         1400  +** not fit, then the page sheds data to its siblings to try to get 
         1401  +** down to a size where the new cell will fit.  If that effort fails,
         1402  +** then the page is split.
  1198   1403   */
  1199   1404   static int addToPage(BtCursor *pCur, Cell *pNewCell){
  1200   1405     Cell tempCell;
  1201   1406     Cell centerCell;
  1202   1407   
  1203   1408     for(;;){
  1204   1409       MemPage *pPage = pCur->pPage;
................................................................................
  1208   1413         return SQLITE_OK;
  1209   1414       }
  1210   1415       if( pPage->pParent==0 ){
  1211   1416         MemPage *pRight;
  1212   1417         PageHdr *pHdr;
  1213   1418         FreeBlk *pFBlk;
  1214   1419         int pc;
  1215         -      rc = split(pPage, pNewCell, &centerCell, &pRight);
  1216         -      pHdr = pPage->pStart;
  1217         -      pHdr->pgno = sqlitepager_pagenumber(pRight);
         1420  +      rc = split(pCur, pNewCell, &centerCell, &pRight);
         1421  +      if( rc ) return rc;
         1422  +      pHdr = pPage->pHdr;
         1423  +      pHdr->right = sqlitepager_pagenumber(pRight);
  1218   1424         sqlitepager_unref(pRight);
  1219   1425         pHdr->firstCell = pc = pPage->idxStart + sizeof(*pHdr);
  1220   1426         sz = cellSize(&centerCell);
  1221         -      memcpy(&pPage->aPage[pc], &centerCell, sz);
         1427  +      memcpy(&pPage->aDisk[pc], &centerCell, sz);
  1222   1428         pc += sz;
  1223   1429         pHdr->firstFree = pc;
  1224         -      pFBlk = (FreeBlk*)&pPage->aPage[pc];
         1430  +      pFBlk = (FreeBlk*)&pPage->aDisk[pc];
  1225   1431         pFBlk->iSize = SQLITE_PAGE_SIZE - pc;
  1226   1432         pFBlk->iNext = 0;
  1227   1433         memset(&pFBlk[1], 0, pFBlk->iSize-sizeof(*pFBlk));
  1228   1434         return SQLITE_OK;
  1229   1435       }
  1230   1436       if( rotateLeft(pCur, sz - pPage->nFree) 
  1231   1437              || rotateRight(pCur, sz - pPage->nFree) ){
  1232   1438         insertCell(pCur, pNewCell);
  1233   1439         return SQLITE_OK;
  1234   1440       }
  1235         -    rc = split(pPage, pNewCell, &centerCell, 0);
  1236         -    parentPage(pCur);
         1441  +    rc = split(pCur, pNewCell, &centerCell, 0);
         1442  +    if( rc ) return rc;
         1443  +    moveToParent(pCur);
  1237   1444       tempCell = centerCell;
  1238   1445       pNewPage = &tempCell;
  1239   1446     }
         1447  +  /* NOT REACHED */
  1240   1448   }
  1241   1449   
  1242   1450   /*
  1243   1451   ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
  1244   1452   ** and the data is given by (pData,nData).  The cursor is used only to
  1245   1453   ** define what database the record should be inserted into.  The cursor
  1246   1454   ** is NOT left pointing at the new record.
................................................................................
  1256   1464     MemPage *pPage;
  1257   1465     Btree *pBt = pCur->pBt;
  1258   1466   
  1259   1467     rc = sqliteBtreeMoveTo(pCur, pKey, nKey, &loc);
  1260   1468     if( rc ) return rc;
  1261   1469     rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData);
  1262   1470     if( rc ) return rc;
  1263         -  newCell.h.pgno = pCur->pPage->aCell[pCur->idx].h.pgno;
  1264   1471     if( loc==0 ){
  1265         -    rc = clearCell(pBt, &pCur->pPage->aCell[pCur->idx]);
  1266         -    if( rc ){
  1267         -      return SQLITE_CORRUPT;
  1268         -    }
         1472  +    newCell.h.leftChild = pCur->pPage->apCell[pCur->idx]->h.leftChild;
         1473  +    rc = clearCell(pBt, pCur->pPage->apCell[pCur->idx]);
         1474  +    if( rc ) return rc;
  1269   1475       unlinkCell(pCur);
  1270   1476     }
  1271   1477     return addToPage(pCur, &newCell);
  1272   1478   }
  1273   1479   
  1274   1480   /*
  1275         -** Delete the record that the cursor is pointing to.  Leave the cursor
  1276         -** pointing at the next record after the one to which it currently points.
  1277         -** Also, set the pCur->skip_next flag so that the next sqliteBtreeNext() 
  1278         -** called for this cursor will be a no-op.
         1481  +** Check the page given as the argument to see if it is less than
         1482  +** half full.  If it is less than half full, then try to increase
         1483  +** its fill factor by grabbing cells from siblings or by merging
         1484  +** the page with siblings.
         1485  +*/
         1486  +static int refillPage(Btree *pBt, MemPage *pPage){
         1487  +  if( pPage->nFree < SQLITE_PAGE_SIZE/2 ){
         1488  +    return SQLITE_OK;
         1489  +  }
         1490  +  if( pPage->nCell==0 ){
         1491  +    assert( pPage->pParent==0 );
         1492  +    if( pPage->pHdr->rightChild ){
         1493  +      
         1494  +    }
         1495  +    return SQLITE_OK;
         1496  +  }
         1497  +  /** merge with siblings **/
         1498  +  /** borrow from siblings **/
         1499  +}
         1500  +
         1501  +/*
         1502  +** Replace the content of the cell that pCur is pointing to with the content
         1503  +** in pNewContent.  The pCur cell is not unlinked or moved in the Btree,
         1504  +** its content is just replaced.
         1505  +**
         1506  +** If the size of pNewContent is greater than the current size of the
         1507  +** cursor cell then the page that cursor points to might have to split.
         1508  +*/
         1509  +static int replaceContent(BtCursor *pCur, Cell *pNewContent){
         1510  +  Cell *pCell;       /* The cell whose content will be changed */
         1511  +  Pgno pgno;         /* Temporary storage for a page number */
         1512  +
         1513  +  pCell = pCur->pPage->apCell[pCur->idx];
         1514  +  rc = clearCell(pCur->pBt, pCell);
         1515  +  if( rc ) return rc;
         1516  +  pgno = pNewCell->h.leftChild;
         1517  +  pNewCell->h.leftChild = pCell->h.leftChild;
         1518  +  unlinkCell(pCur);
         1519  +  rc = addToPage(pCur, pNewCell);
         1520  +  pNewCell->h.leftChild = pgno;
         1521  +  return rc;
         1522  +}
         1523  +
         1524  +/*
         1525  +** Delete the record that the cursor is pointing to.
         1526  +**
         1527  +** The cursor is left point at either the next or the previous
         1528  +** entry.  If left pointing to the next entry, then the pCur->bSkipNext
         1529  +** flag is set which forces the next call to sqliteBtreeNext() to be
         1530  +** a no-op.  That way, you can always call sqliteBtreeNext() after
         1531  +** a delete and the cursor will be left pointing to the first entry
         1532  +** after the deleted entry.
  1279   1533   */
  1280   1534   int sqliteBtreeDelete(BtCursor *pCur){
         1535  +  MemPage *pPage = pCur->pPage;
         1536  +  Cell *pCell;
         1537  +  int rc;
         1538  +  pCell = pPage->apCell[pCur->idx];
         1539  +  if( pPage->pHdr->rightChild ){
         1540  +    /* The entry to be deleted is not on a leaf page.  Non-leaf entries 
         1541  +    ** cannot be deleted directly because they have to be present to
         1542  +    ** hold pointers to subpages.  So what we do is look at the next 
         1543  +    ** entry in sequence.  The next entry is guaranteed to exist and 
         1544  +    ** be a leaf.  We copy the payload from the next entry into this
         1545  +    ** entry, then delete the next entry.
         1546  +    */
         1547  +    BtCursor origCur;
         1548  +    createTemporaryCursor(pCur, &origCur);
         1549  +    rc = sqliteBtreeNext(pCur, 0);
         1550  +    if( rc==SQLITE_OK ){
         1551  +      pPage = pCur->pPage;
         1552  +      pCell = pPage->apCell[pCur->idx];
         1553  +      rc = replaceContent(&origCur, pCell);
         1554  +    }
         1555  +    destroyTemporaryCursor(&origCur);
         1556  +    if( rc ) return rc;
         1557  +  }
         1558  +  rc = clearCell(pCell);
         1559  +  if( rc ) return rc;
         1560  +  unlinkCell(pCur->pBt, pCell);
         1561  +  if( pCur->idx == 0 ){
         1562  +    pCur->bSkipNext = 1;
         1563  +  }else{
         1564  +    pCur->idx--;
         1565  +  }
         1566  +  rc = refillPage(pCur->pBt, pPage);
         1567  +  return rc;
  1281   1568   }