000001  /*
000002  ** 2004 April 6
000003  **
000004  ** The author disclaims copyright to this source code.  In place of
000005  ** a legal notice, here is a blessing:
000006  **
000007  **    May you do good and not evil.
000008  **    May you find forgiveness for yourself and forgive others.
000009  **    May you share freely, never taking more than you give.
000010  **
000011  *************************************************************************
000012  ** This file implements an external (disk-based) database using BTrees.
000013  ** See the header comment on "btreeInt.h" for additional information.
000014  ** Including a description of file format and an overview of operation.
000015  */
000016  #include "btreeInt.h"
000017  
000018  /*
000019  ** The header string that appears at the beginning of every
000020  ** SQLite database.
000021  */
000022  static const char zMagicHeader[] = SQLITE_FILE_HEADER;
000023  
000024  /*
000025  ** Set this global variable to 1 to enable tracing using the TRACE
000026  ** macro.
000027  */
000028  #if 0
000029  int sqlite3BtreeTrace=1;  /* True to enable tracing */
000030  # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
000031  #else
000032  # define TRACE(X)
000033  #endif
000034  
000035  /*
000036  ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
000037  ** But if the value is zero, make it 65536.
000038  **
000039  ** This routine is used to extract the "offset to cell content area" value
000040  ** from the header of a btree page.  If the page size is 65536 and the page
000041  ** is empty, the offset should be 65536, but the 2-byte value stores zero.
000042  ** This routine makes the necessary adjustment to 65536.
000043  */
000044  #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
000045  
000046  /*
000047  ** Values passed as the 5th argument to allocateBtreePage()
000048  */
000049  #define BTALLOC_ANY   0           /* Allocate any page */
000050  #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
000051  #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
000052  
000053  /*
000054  ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 
000055  ** defined, or 0 if it is. For example:
000056  **
000057  **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
000058  */
000059  #ifndef SQLITE_OMIT_AUTOVACUUM
000060  #define IfNotOmitAV(expr) (expr)
000061  #else
000062  #define IfNotOmitAV(expr) 0
000063  #endif
000064  
000065  #ifndef SQLITE_OMIT_SHARED_CACHE
000066  /*
000067  ** A list of BtShared objects that are eligible for participation
000068  ** in shared cache.  This variable has file scope during normal builds,
000069  ** but the test harness needs to access it so we make it global for 
000070  ** test builds.
000071  **
000072  ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
000073  */
000074  #ifdef SQLITE_TEST
000075  BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
000076  #else
000077  static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
000078  #endif
000079  #endif /* SQLITE_OMIT_SHARED_CACHE */
000080  
000081  #ifndef SQLITE_OMIT_SHARED_CACHE
000082  /*
000083  ** Enable or disable the shared pager and schema features.
000084  **
000085  ** This routine has no effect on existing database connections.
000086  ** The shared cache setting effects only future calls to
000087  ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
000088  */
000089  int sqlite3_enable_shared_cache(int enable){
000090    sqlite3GlobalConfig.sharedCacheEnabled = enable;
000091    return SQLITE_OK;
000092  }
000093  #endif
000094  
000095  
000096  
000097  #ifdef SQLITE_OMIT_SHARED_CACHE
000098    /*
000099    ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
000100    ** and clearAllSharedCacheTableLocks()
000101    ** manipulate entries in the BtShared.pLock linked list used to store
000102    ** shared-cache table level locks. If the library is compiled with the
000103    ** shared-cache feature disabled, then there is only ever one user
000104    ** of each BtShared structure and so this locking is not necessary. 
000105    ** So define the lock related functions as no-ops.
000106    */
000107    #define querySharedCacheTableLock(a,b,c) SQLITE_OK
000108    #define setSharedCacheTableLock(a,b,c) SQLITE_OK
000109    #define clearAllSharedCacheTableLocks(a)
000110    #define downgradeAllSharedCacheTableLocks(a)
000111    #define hasSharedCacheTableLock(a,b,c,d) 1
000112    #define hasReadConflicts(a, b) 0
000113  #endif
000114  
000115  #ifndef SQLITE_OMIT_SHARED_CACHE
000116  
000117  #ifdef SQLITE_DEBUG
000118  /*
000119  **** This function is only used as part of an assert() statement. ***
000120  **
000121  ** Check to see if pBtree holds the required locks to read or write to the 
000122  ** table with root page iRoot.   Return 1 if it does and 0 if not.
000123  **
000124  ** For example, when writing to a table with root-page iRoot via 
000125  ** Btree connection pBtree:
000126  **
000127  **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
000128  **
000129  ** When writing to an index that resides in a sharable database, the 
000130  ** caller should have first obtained a lock specifying the root page of
000131  ** the corresponding table. This makes things a bit more complicated,
000132  ** as this module treats each table as a separate structure. To determine
000133  ** the table corresponding to the index being written, this
000134  ** function has to search through the database schema.
000135  **
000136  ** Instead of a lock on the table/index rooted at page iRoot, the caller may
000137  ** hold a write-lock on the schema table (root page 1). This is also
000138  ** acceptable.
000139  */
000140  static int hasSharedCacheTableLock(
000141    Btree *pBtree,         /* Handle that must hold lock */
000142    Pgno iRoot,            /* Root page of b-tree */
000143    int isIndex,           /* True if iRoot is the root of an index b-tree */
000144    int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
000145  ){
000146    Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
000147    Pgno iTab = 0;
000148    BtLock *pLock;
000149  
000150    /* If this database is not shareable, or if the client is reading
000151    ** and has the read-uncommitted flag set, then no lock is required. 
000152    ** Return true immediately.
000153    */
000154    if( (pBtree->sharable==0)
000155     || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommit))
000156    ){
000157      return 1;
000158    }
000159  
000160    /* If the client is reading  or writing an index and the schema is
000161    ** not loaded, then it is too difficult to actually check to see if
000162    ** the correct locks are held.  So do not bother - just return true.
000163    ** This case does not come up very often anyhow.
000164    */
000165    if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
000166      return 1;
000167    }
000168  
000169    /* Figure out the root-page that the lock should be held on. For table
000170    ** b-trees, this is just the root page of the b-tree being read or
000171    ** written. For index b-trees, it is the root page of the associated
000172    ** table.  */
000173    if( isIndex ){
000174      HashElem *p;
000175      for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
000176        Index *pIdx = (Index *)sqliteHashData(p);
000177        if( pIdx->tnum==(int)iRoot ){
000178          if( iTab ){
000179            /* Two or more indexes share the same root page.  There must
000180            ** be imposter tables.  So just return true.  The assert is not
000181            ** useful in that case. */
000182            return 1;
000183          }
000184          iTab = pIdx->pTable->tnum;
000185        }
000186      }
000187    }else{
000188      iTab = iRoot;
000189    }
000190  
000191    /* Search for the required lock. Either a write-lock on root-page iTab, a 
000192    ** write-lock on the schema table, or (if the client is reading) a
000193    ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
000194    for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
000195      if( pLock->pBtree==pBtree 
000196       && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
000197       && pLock->eLock>=eLockType 
000198      ){
000199        return 1;
000200      }
000201    }
000202  
000203    /* Failed to find the required lock. */
000204    return 0;
000205  }
000206  #endif /* SQLITE_DEBUG */
000207  
000208  #ifdef SQLITE_DEBUG
000209  /*
000210  **** This function may be used as part of assert() statements only. ****
000211  **
000212  ** Return true if it would be illegal for pBtree to write into the
000213  ** table or index rooted at iRoot because other shared connections are
000214  ** simultaneously reading that same table or index.
000215  **
000216  ** It is illegal for pBtree to write if some other Btree object that
000217  ** shares the same BtShared object is currently reading or writing
000218  ** the iRoot table.  Except, if the other Btree object has the
000219  ** read-uncommitted flag set, then it is OK for the other object to
000220  ** have a read cursor.
000221  **
000222  ** For example, before writing to any part of the table or index
000223  ** rooted at page iRoot, one should call:
000224  **
000225  **    assert( !hasReadConflicts(pBtree, iRoot) );
000226  */
000227  static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
000228    BtCursor *p;
000229    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000230      if( p->pgnoRoot==iRoot 
000231       && p->pBtree!=pBtree
000232       && 0==(p->pBtree->db->flags & SQLITE_ReadUncommit)
000233      ){
000234        return 1;
000235      }
000236    }
000237    return 0;
000238  }
000239  #endif    /* #ifdef SQLITE_DEBUG */
000240  
000241  /*
000242  ** Query to see if Btree handle p may obtain a lock of type eLock 
000243  ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
000244  ** SQLITE_OK if the lock may be obtained (by calling
000245  ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
000246  */
000247  static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
000248    BtShared *pBt = p->pBt;
000249    BtLock *pIter;
000250  
000251    assert( sqlite3BtreeHoldsMutex(p) );
000252    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000253    assert( p->db!=0 );
000254    assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
000255    
000256    /* If requesting a write-lock, then the Btree must have an open write
000257    ** transaction on this file. And, obviously, for this to be so there 
000258    ** must be an open write transaction on the file itself.
000259    */
000260    assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
000261    assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
000262    
000263    /* This routine is a no-op if the shared-cache is not enabled */
000264    if( !p->sharable ){
000265      return SQLITE_OK;
000266    }
000267  
000268    /* If some other connection is holding an exclusive lock, the
000269    ** requested lock may not be obtained.
000270    */
000271    if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
000272      sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
000273      return SQLITE_LOCKED_SHAREDCACHE;
000274    }
000275  
000276    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000277      /* The condition (pIter->eLock!=eLock) in the following if(...) 
000278      ** statement is a simplification of:
000279      **
000280      **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
000281      **
000282      ** since we know that if eLock==WRITE_LOCK, then no other connection
000283      ** may hold a WRITE_LOCK on any table in this file (since there can
000284      ** only be a single writer).
000285      */
000286      assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
000287      assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
000288      if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
000289        sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
000290        if( eLock==WRITE_LOCK ){
000291          assert( p==pBt->pWriter );
000292          pBt->btsFlags |= BTS_PENDING;
000293        }
000294        return SQLITE_LOCKED_SHAREDCACHE;
000295      }
000296    }
000297    return SQLITE_OK;
000298  }
000299  #endif /* !SQLITE_OMIT_SHARED_CACHE */
000300  
000301  #ifndef SQLITE_OMIT_SHARED_CACHE
000302  /*
000303  ** Add a lock on the table with root-page iTable to the shared-btree used
000304  ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
000305  ** WRITE_LOCK.
000306  **
000307  ** This function assumes the following:
000308  **
000309  **   (a) The specified Btree object p is connected to a sharable
000310  **       database (one with the BtShared.sharable flag set), and
000311  **
000312  **   (b) No other Btree objects hold a lock that conflicts
000313  **       with the requested lock (i.e. querySharedCacheTableLock() has
000314  **       already been called and returned SQLITE_OK).
000315  **
000316  ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 
000317  ** is returned if a malloc attempt fails.
000318  */
000319  static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
000320    BtShared *pBt = p->pBt;
000321    BtLock *pLock = 0;
000322    BtLock *pIter;
000323  
000324    assert( sqlite3BtreeHoldsMutex(p) );
000325    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
000326    assert( p->db!=0 );
000327  
000328    /* A connection with the read-uncommitted flag set will never try to
000329    ** obtain a read-lock using this function. The only read-lock obtained
000330    ** by a connection in read-uncommitted mode is on the sqlite_master 
000331    ** table, and that lock is obtained in BtreeBeginTrans().  */
000332    assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );
000333  
000334    /* This function should only be called on a sharable b-tree after it 
000335    ** has been determined that no other b-tree holds a conflicting lock.  */
000336    assert( p->sharable );
000337    assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
000338  
000339    /* First search the list for an existing lock on this table. */
000340    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
000341      if( pIter->iTable==iTable && pIter->pBtree==p ){
000342        pLock = pIter;
000343        break;
000344      }
000345    }
000346  
000347    /* If the above search did not find a BtLock struct associating Btree p
000348    ** with table iTable, allocate one and link it into the list.
000349    */
000350    if( !pLock ){
000351      pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
000352      if( !pLock ){
000353        return SQLITE_NOMEM_BKPT;
000354      }
000355      pLock->iTable = iTable;
000356      pLock->pBtree = p;
000357      pLock->pNext = pBt->pLock;
000358      pBt->pLock = pLock;
000359    }
000360  
000361    /* Set the BtLock.eLock variable to the maximum of the current lock
000362    ** and the requested lock. This means if a write-lock was already held
000363    ** and a read-lock requested, we don't incorrectly downgrade the lock.
000364    */
000365    assert( WRITE_LOCK>READ_LOCK );
000366    if( eLock>pLock->eLock ){
000367      pLock->eLock = eLock;
000368    }
000369  
000370    return SQLITE_OK;
000371  }
000372  #endif /* !SQLITE_OMIT_SHARED_CACHE */
000373  
000374  #ifndef SQLITE_OMIT_SHARED_CACHE
000375  /*
000376  ** Release all the table locks (locks obtained via calls to
000377  ** the setSharedCacheTableLock() procedure) held by Btree object p.
000378  **
000379  ** This function assumes that Btree p has an open read or write 
000380  ** transaction. If it does not, then the BTS_PENDING flag
000381  ** may be incorrectly cleared.
000382  */
000383  static void clearAllSharedCacheTableLocks(Btree *p){
000384    BtShared *pBt = p->pBt;
000385    BtLock **ppIter = &pBt->pLock;
000386  
000387    assert( sqlite3BtreeHoldsMutex(p) );
000388    assert( p->sharable || 0==*ppIter );
000389    assert( p->inTrans>0 );
000390  
000391    while( *ppIter ){
000392      BtLock *pLock = *ppIter;
000393      assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
000394      assert( pLock->pBtree->inTrans>=pLock->eLock );
000395      if( pLock->pBtree==p ){
000396        *ppIter = pLock->pNext;
000397        assert( pLock->iTable!=1 || pLock==&p->lock );
000398        if( pLock->iTable!=1 ){
000399          sqlite3_free(pLock);
000400        }
000401      }else{
000402        ppIter = &pLock->pNext;
000403      }
000404    }
000405  
000406    assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
000407    if( pBt->pWriter==p ){
000408      pBt->pWriter = 0;
000409      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000410    }else if( pBt->nTransaction==2 ){
000411      /* This function is called when Btree p is concluding its 
000412      ** transaction. If there currently exists a writer, and p is not
000413      ** that writer, then the number of locks held by connections other
000414      ** than the writer must be about to drop to zero. In this case
000415      ** set the BTS_PENDING flag to 0.
000416      **
000417      ** If there is not currently a writer, then BTS_PENDING must
000418      ** be zero already. So this next line is harmless in that case.
000419      */
000420      pBt->btsFlags &= ~BTS_PENDING;
000421    }
000422  }
000423  
000424  /*
000425  ** This function changes all write-locks held by Btree p into read-locks.
000426  */
000427  static void downgradeAllSharedCacheTableLocks(Btree *p){
000428    BtShared *pBt = p->pBt;
000429    if( pBt->pWriter==p ){
000430      BtLock *pLock;
000431      pBt->pWriter = 0;
000432      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
000433      for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
000434        assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
000435        pLock->eLock = READ_LOCK;
000436      }
000437    }
000438  }
000439  
000440  #endif /* SQLITE_OMIT_SHARED_CACHE */
000441  
000442  static void releasePage(MemPage *pPage);         /* Forward reference */
000443  static void releasePageOne(MemPage *pPage);      /* Forward reference */
000444  static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
000445  
000446  /*
000447  ***** This routine is used inside of assert() only ****
000448  **
000449  ** Verify that the cursor holds the mutex on its BtShared
000450  */
000451  #ifdef SQLITE_DEBUG
000452  static int cursorHoldsMutex(BtCursor *p){
000453    return sqlite3_mutex_held(p->pBt->mutex);
000454  }
000455  
000456  /* Verify that the cursor and the BtShared agree about what is the current
000457  ** database connetion. This is important in shared-cache mode. If the database 
000458  ** connection pointers get out-of-sync, it is possible for routines like
000459  ** btreeInitPage() to reference an stale connection pointer that references a
000460  ** a connection that has already closed.  This routine is used inside assert()
000461  ** statements only and for the purpose of double-checking that the btree code
000462  ** does keep the database connection pointers up-to-date.
000463  */
000464  static int cursorOwnsBtShared(BtCursor *p){
000465    assert( cursorHoldsMutex(p) );
000466    return (p->pBtree->db==p->pBt->db);
000467  }
000468  #endif
000469  
/*
** Invalidate the overflow page-list cache of the single cursor passed
** as the argument, by clearing that cursor's BTCF_ValidOvfl flag.
**
** The argument is parenthesized in the expansion so that any
** pointer-valued expression (for example "p+1") may be passed safely.
** The argument is evaluated exactly once.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
000475  
000476  /*
000477  ** Invalidate the overflow page-list cache for all cursors opened
000478  ** on the shared btree structure pBt.
000479  */
000480  static void invalidateAllOverflowCache(BtShared *pBt){
000481    BtCursor *p;
000482    assert( sqlite3_mutex_held(pBt->mutex) );
000483    for(p=pBt->pCursor; p; p=p->pNext){
000484      invalidateOverflowCache(p);
000485    }
000486  }
000487  
000488  #ifndef SQLITE_OMIT_INCRBLOB
000489  /*
000490  ** This function is called before modifying the contents of a table
000491  ** to invalidate any incrblob cursors that are open on the
000492  ** row or one of the rows being modified.
000493  **
000494  ** If argument isClearTable is true, then the entire contents of the
000495  ** table is about to be deleted. In this case invalidate all incrblob
000496  ** cursors open on any row within the table with root-page pgnoRoot.
000497  **
000498  ** Otherwise, if argument isClearTable is false, then the row with
000499  ** rowid iRow is being replaced or deleted. In this case invalidate
000500  ** only those incrblob cursors open on that specific row.
000501  */
000502  static void invalidateIncrblobCursors(
000503    Btree *pBtree,          /* The database file to check */
000504    Pgno pgnoRoot,          /* The table that might be changing */
000505    i64 iRow,               /* The rowid that might be changing */
000506    int isClearTable        /* True if all rows are being deleted */
000507  ){
000508    BtCursor *p;
000509    if( pBtree->hasIncrblobCur==0 ) return;
000510    assert( sqlite3BtreeHoldsMutex(pBtree) );
000511    pBtree->hasIncrblobCur = 0;
000512    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
000513      if( (p->curFlags & BTCF_Incrblob)!=0 ){
000514        pBtree->hasIncrblobCur = 1;
000515        if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
000516          p->eState = CURSOR_INVALID;
000517        }
000518      }
000519    }
000520  }
000521  
000522  #else
000523    /* Stub function when INCRBLOB is omitted */
000524    #define invalidateIncrblobCursors(w,x,y,z)
000525  #endif /* SQLITE_OMIT_INCRBLOB */
000526  
000527  /*
000528  ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 
000529  ** when a page that previously contained data becomes a free-list leaf 
000530  ** page.
000531  **
000532  ** The BtShared.pHasContent bitvec exists to work around an obscure
000533  ** bug caused by the interaction of two useful IO optimizations surrounding
000534  ** free-list leaf pages:
000535  **
000536  **   1) When all data is deleted from a page and the page becomes
000537  **      a free-list leaf page, the page is not written to the database
000538  **      (as free-list leaf pages contain no meaningful data). Sometimes
000539  **      such a page is not even journalled (as it will not be modified,
000540  **      why bother journalling it?).
000541  **
000542  **   2) When a free-list leaf page is reused, its content is not read
000543  **      from the database or written to the journal file (why should it
000544  **      be, if it is not at all meaningful?).
000545  **
000546  ** By themselves, these optimizations work fine and provide a handy
000547  ** performance boost to bulk delete or insert operations. However, if
000548  ** a page is moved to the free-list and then reused within the same
000549  ** transaction, a problem comes up. If the page is not journalled when
000550  ** it is moved to the free-list and it is also not journalled when it
000551  ** is extracted from the free-list and reused, then the original data
000552  ** may be lost. In the event of a rollback, it may not be possible
000553  ** to restore the database to its original configuration.
000554  **
000555  ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 
000556  ** moved to become a free-list leaf page, the corresponding bit is
000557  ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
000558  ** optimization 2 above is omitted if the corresponding bit is already
000559  ** set in BtShared.pHasContent. The contents of the bitvec are cleared
000560  ** at the end of every transaction.
000561  */
000562  static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
000563    int rc = SQLITE_OK;
000564    if( !pBt->pHasContent ){
000565      assert( pgno<=pBt->nPage );
000566      pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
000567      if( !pBt->pHasContent ){
000568        rc = SQLITE_NOMEM_BKPT;
000569      }
000570    }
000571    if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
000572      rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
000573    }
000574    return rc;
000575  }
000576  
000577  /*
000578  ** Query the BtShared.pHasContent vector.
000579  **
000580  ** This function is called when a free-list leaf page is removed from the
000581  ** free-list for reuse. It returns false if it is safe to retrieve the
000582  ** page from the pager layer with the 'no-content' flag set. True otherwise.
000583  */
000584  static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
000585    Bitvec *p = pBt->pHasContent;
000586    return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
000587  }
000588  
000589  /*
000590  ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
000591  ** invoked at the conclusion of each write-transaction.
000592  */
000593  static void btreeClearHasContent(BtShared *pBt){
000594    sqlite3BitvecDestroy(pBt->pHasContent);
000595    pBt->pHasContent = 0;
000596  }
000597  
000598  /*
000599  ** Release all of the apPage[] pages for a cursor.
000600  */
000601  static void btreeReleaseAllCursorPages(BtCursor *pCur){
000602    int i;
000603    if( pCur->iPage>=0 ){
000604      for(i=0; i<pCur->iPage; i++){
000605        releasePageNotNull(pCur->apPage[i]);
000606      }
000607      releasePageNotNull(pCur->pPage);
000608      pCur->iPage = -1;
000609    }
000610  }
000611  
000612  /*
000613  ** The cursor passed as the only argument must point to a valid entry
000614  ** when this function is called (i.e. have eState==CURSOR_VALID). This
000615  ** function saves the current cursor key in variables pCur->nKey and
000616  ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error 
000617  ** code otherwise.
000618  **
000619  ** If the cursor is open on an intkey table, then the integer key
000620  ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
000621  ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is 
000622  ** set to point to a malloced buffer pCur->nKey bytes in size containing 
000623  ** the key.
000624  */
000625  static int saveCursorKey(BtCursor *pCur){
000626    int rc = SQLITE_OK;
000627    assert( CURSOR_VALID==pCur->eState );
000628    assert( 0==pCur->pKey );
000629    assert( cursorHoldsMutex(pCur) );
000630  
000631    if( pCur->curIntKey ){
000632      /* Only the rowid is required for a table btree */
000633      pCur->nKey = sqlite3BtreeIntegerKey(pCur);
000634    }else{
000635      /* For an index btree, save the complete key content */
000636      void *pKey;
000637      pCur->nKey = sqlite3BtreePayloadSize(pCur);
000638      pKey = sqlite3Malloc( pCur->nKey );
000639      if( pKey ){
000640        rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pKey);
000641        if( rc==SQLITE_OK ){
000642          pCur->pKey = pKey;
000643        }else{
000644          sqlite3_free(pKey);
000645        }
000646      }else{
000647        rc = SQLITE_NOMEM_BKPT;
000648      }
000649    }
000650    assert( !pCur->curIntKey || !pCur->pKey );
000651    return rc;
000652  }
000653  
000654  /*
000655  ** Save the current cursor position in the variables BtCursor.nKey 
000656  ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
000657  **
000658  ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
000659  ** prior to calling this routine.  
000660  */
000661  static int saveCursorPosition(BtCursor *pCur){
000662    int rc;
000663  
000664    assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
000665    assert( 0==pCur->pKey );
000666    assert( cursorHoldsMutex(pCur) );
000667  
000668    if( pCur->eState==CURSOR_SKIPNEXT ){
000669      pCur->eState = CURSOR_VALID;
000670    }else{
000671      pCur->skipNext = 0;
000672    }
000673  
000674    rc = saveCursorKey(pCur);
000675    if( rc==SQLITE_OK ){
000676      btreeReleaseAllCursorPages(pCur);
000677      pCur->eState = CURSOR_REQUIRESEEK;
000678    }
000679  
000680    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
000681    return rc;
000682  }
000683  
000684  /* Forward reference */
000685  static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
000686  
000687  /*
000688  ** Save the positions of all cursors (except pExcept) that are open on
000689  ** the table with root-page iRoot.  "Saving the cursor position" means that
000690  ** the location in the btree is remembered in such a way that it can be
000691  ** moved back to the same spot after the btree has been modified.  This
000692  ** routine is called just before cursor pExcept is used to modify the
000693  ** table, for example in BtreeDelete() or BtreeInsert().
000694  **
000695  ** If there are two or more cursors on the same btree, then all such 
000696  ** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
000697  ** routine enforces that rule.  This routine only needs to be called in
000698  ** the uncommon case when pExpect has the BTCF_Multiple flag set.
000699  **
000700  ** If pExpect!=NULL and if no other cursors are found on the same root-page,
000701  ** then the BTCF_Multiple flag on pExpect is cleared, to avoid another
000702  ** pointless call to this routine.
000703  **
000704  ** Implementation note:  This routine merely checks to see if any cursors
000705  ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
000706  ** event that cursors are in need to being saved.
000707  */
000708  static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
000709    BtCursor *p;
000710    assert( sqlite3_mutex_held(pBt->mutex) );
000711    assert( pExcept==0 || pExcept->pBt==pBt );
000712    for(p=pBt->pCursor; p; p=p->pNext){
000713      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
000714    }
000715    if( p ) return saveCursorsOnList(p, iRoot, pExcept);
000716    if( pExcept ) pExcept->curFlags &= ~BTCF_Multiple;
000717    return SQLITE_OK;
000718  }
000719  
000720  /* This helper routine to saveAllCursors does the actual work of saving
000721  ** the cursors if and when a cursor is found that actually requires saving.
000722  ** The common case is that no cursors need to be saved, so this routine is
000723  ** broken out from its caller to avoid unnecessary stack pointer movement.
000724  */
000725  static int SQLITE_NOINLINE saveCursorsOnList(
000726    BtCursor *p,         /* The first cursor that needs saving */
000727    Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
000728    BtCursor *pExcept    /* Do not save this cursor */
000729  ){
000730    do{
000731      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
000732        if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
000733          int rc = saveCursorPosition(p);
000734          if( SQLITE_OK!=rc ){
000735            return rc;
000736          }
000737        }else{
000738          testcase( p->iPage>=0 );
000739          btreeReleaseAllCursorPages(p);
000740        }
000741      }
000742      p = p->pNext;
000743    }while( p );
000744    return SQLITE_OK;
000745  }
000746  
000747  /*
000748  ** Clear the current cursor position.
000749  */
000750  void sqlite3BtreeClearCursor(BtCursor *pCur){
000751    assert( cursorHoldsMutex(pCur) );
000752    sqlite3_free(pCur->pKey);
000753    pCur->pKey = 0;
000754    pCur->eState = CURSOR_INVALID;
000755  }
000756  
000757  /*
000758  ** In this version of BtreeMoveto, pKey is a packed index record
000759  ** such as is generated by the OP_MakeRecord opcode.  Unpack the
000760  ** record and then call BtreeMovetoUnpacked() to do the work.
000761  */
000762  static int btreeMoveto(
000763    BtCursor *pCur,     /* Cursor open on the btree to be searched */
000764    const void *pKey,   /* Packed key if the btree is an index */
000765    i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
000766    int bias,           /* Bias search to the high end */
000767    int *pRes           /* Write search results here */
000768  ){
000769    int rc;                    /* Status code */
000770    UnpackedRecord *pIdxKey;   /* Unpacked index key */
000771  
000772    if( pKey ){
000773      assert( nKey==(i64)(int)nKey );
000774      pIdxKey = sqlite3VdbeAllocUnpackedRecord(pCur->pKeyInfo);
000775      if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
000776      sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
000777      if( pIdxKey->nField==0 ){
000778        rc = SQLITE_CORRUPT_BKPT;
000779        goto moveto_done;
000780      }
000781    }else{
000782      pIdxKey = 0;
000783    }
000784    rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
000785  moveto_done:
000786    if( pIdxKey ){
000787      sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
000788    }
000789    return rc;
000790  }
000791  
000792  /*
000793  ** Restore the cursor to the position it was in (or as close to as possible)
000794  ** when saveCursorPosition() was called. Note that this call deletes the 
000795  ** saved position info stored by saveCursorPosition(), so there can be
000796  ** at most one effective restoreCursorPosition() call after each 
000797  ** saveCursorPosition().
000798  */
000799  static int btreeRestoreCursorPosition(BtCursor *pCur){
000800    int rc;
000801    int skipNext;
000802    assert( cursorOwnsBtShared(pCur) );
000803    assert( pCur->eState>=CURSOR_REQUIRESEEK );
000804    if( pCur->eState==CURSOR_FAULT ){
000805      return pCur->skipNext;
000806    }
000807    pCur->eState = CURSOR_INVALID;
000808    rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
000809    if( rc==SQLITE_OK ){
000810      sqlite3_free(pCur->pKey);
000811      pCur->pKey = 0;
000812      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
000813      pCur->skipNext |= skipNext;
000814      if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
000815        pCur->eState = CURSOR_SKIPNEXT;
000816      }
000817    }
000818    return rc;
000819  }
000820  
/*
** Restore cursor p to its saved position, but only if that is needed.
** When p->eState is CURSOR_REQUIRESEEK or greater the macro calls
** btreeRestoreCursorPosition() and yields its result code; otherwise it
** yields SQLITE_OK without making a function call.
**
** The argument is parenthesized so that any pointer-valued expression
** (for example "pCur+1" or "&aCur[i]") expands correctly.  The argument may
** be evaluated twice and so must be free of side effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
000825  
000826  /*
000827  ** Determine whether or not a cursor has moved from the position where
000828  ** it was last placed, or has been invalidated for any other reason.
000829  ** Cursors can move when the row they are pointing at is deleted out
000830  ** from under them, for example.  Cursor might also move if a btree
000831  ** is rebalanced.
000832  **
000833  ** Calling this routine with a NULL cursor pointer returns false.
000834  **
000835  ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
000836  ** back to where it ought to be if this routine returns true.
000837  */
000838  int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
000839    return pCur->eState!=CURSOR_VALID;
000840  }
000841  
000842  /*
000843  ** Return a pointer to a fake BtCursor object that will always answer
000844  ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
000845  ** cursor returned must not be used with any other Btree interface.
000846  */
000847  BtCursor *sqlite3BtreeFakeValidCursor(void){
000848    static u8 fakeCursor = CURSOR_VALID;
000849    assert( offsetof(BtCursor, eState)==0 );
000850    return (BtCursor*)&fakeCursor;
000851  }
000852  
000853  /*
000854  ** This routine restores a cursor back to its original position after it
000855  ** has been moved by some outside activity (such as a btree rebalance or
000856  ** a row having been deleted out from under the cursor).  
000857  **
000858  ** On success, the *pDifferentRow parameter is false if the cursor is left
000859  ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
000860  ** was pointing to has been deleted, forcing the cursor to point to some
000861  ** nearby row.
000862  **
000863  ** This routine should only be called for a cursor that just returned
000864  ** TRUE from sqlite3BtreeCursorHasMoved().
000865  */
000866  int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
000867    int rc;
000868  
000869    assert( pCur!=0 );
000870    assert( pCur->eState!=CURSOR_VALID );
000871    rc = restoreCursorPosition(pCur);
000872    if( rc ){
000873      *pDifferentRow = 1;
000874      return rc;
000875    }
000876    if( pCur->eState!=CURSOR_VALID ){
000877      *pDifferentRow = 1;
000878    }else{
000879      assert( pCur->skipNext==0 );
000880      *pDifferentRow = 0;
000881    }
000882    return SQLITE_OK;
000883  }
000884  
000885  #ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Provide hints to the cursor.  The particular hint given (and the type
** and number of the varargs parameters) is determined by the eHintType
** parameter.  See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation has an empty body: every argument is ignored and
** nothing is recorded on the cursor.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  /* Used only by systems that substitute their own storage engine */
}
000894  #endif
000895  
000896  /*
000897  ** Provide flag hints to the cursor.
000898  */
000899  void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
000900    assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
000901    pCur->hints = x;
000902  }
000903  
000904  
000905  #ifndef SQLITE_OMIT_AUTOVACUUM
000906  /*
000907  ** Given a page number of a regular database page, return the page
000908  ** number for the pointer-map page that contains the entry for the
000909  ** input page number.
000910  **
000911  ** Return 0 (not a valid page) for pgno==1 since there is
000912  ** no pointer map associated with page 1.  The integrity_check logic
000913  ** requires that ptrmapPageno(*,1)!=1.
000914  */
000915  static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
000916    int nPagesPerMapPage;
000917    Pgno iPtrMap, ret;
000918    assert( sqlite3_mutex_held(pBt->mutex) );
000919    if( pgno<2 ) return 0;
000920    nPagesPerMapPage = (pBt->usableSize/5)+1;
000921    iPtrMap = (pgno-2)/nPagesPerMapPage;
000922    ret = (iPtrMap*nPagesPerMapPage) + 2; 
000923    if( ret==PENDING_BYTE_PAGE(pBt) ){
000924      ret++;
000925    }
000926    return ret;
000927  }
000928  
000929  /*
000930  ** Write an entry into the pointer map.
000931  **
000932  ** This routine updates the pointer map entry for page number 'key'
000933  ** so that it maps to type 'eType' and parent page number 'pgno'.
000934  **
000935  ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
000936  ** a no-op.  If an error occurs, the appropriate error code is written
000937  ** into *pRC.
000938  */
000939  static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
000940    DbPage *pDbPage;  /* The pointer map page */
000941    u8 *pPtrmap;      /* The pointer map data */
000942    Pgno iPtrmap;     /* The pointer map page number */
000943    int offset;       /* Offset in pointer map page */
000944    int rc;           /* Return code from subfunctions */
000945  
000946    if( *pRC ) return;
000947  
000948    assert( sqlite3_mutex_held(pBt->mutex) );
000949    /* The master-journal page number must never be used as a pointer map page */
000950    assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
000951  
000952    assert( pBt->autoVacuum );
000953    if( key==0 ){
000954      *pRC = SQLITE_CORRUPT_BKPT;
000955      return;
000956    }
000957    iPtrmap = PTRMAP_PAGENO(pBt, key);
000958    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
000959    if( rc!=SQLITE_OK ){
000960      *pRC = rc;
000961      return;
000962    }
000963    offset = PTRMAP_PTROFFSET(iPtrmap, key);
000964    if( offset<0 ){
000965      *pRC = SQLITE_CORRUPT_BKPT;
000966      goto ptrmap_exit;
000967    }
000968    assert( offset <= (int)pBt->usableSize-5 );
000969    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
000970  
000971    if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
000972      TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
000973      *pRC= rc = sqlite3PagerWrite(pDbPage);
000974      if( rc==SQLITE_OK ){
000975        pPtrmap[offset] = eType;
000976        put4byte(&pPtrmap[offset+1], parent);
000977      }
000978    }
000979  
000980  ptrmap_exit:
000981    sqlite3PagerUnref(pDbPage);
000982  }
000983  
000984  /*
000985  ** Read an entry from the pointer map.
000986  **
000987  ** This routine retrieves the pointer map entry for page 'key', writing
000988  ** the type and parent page number to *pEType and *pPgno respectively.
000989  ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
000990  */
000991  static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
000992    DbPage *pDbPage;   /* The pointer map page */
000993    int iPtrmap;       /* Pointer map page index */
000994    u8 *pPtrmap;       /* Pointer map page data */
000995    int offset;        /* Offset of entry in pointer map */
000996    int rc;
000997  
000998    assert( sqlite3_mutex_held(pBt->mutex) );
000999  
001000    iPtrmap = PTRMAP_PAGENO(pBt, key);
001001    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
001002    if( rc!=0 ){
001003      return rc;
001004    }
001005    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
001006  
001007    offset = PTRMAP_PTROFFSET(iPtrmap, key);
001008    if( offset<0 ){
001009      sqlite3PagerUnref(pDbPage);
001010      return SQLITE_CORRUPT_BKPT;
001011    }
001012    assert( offset <= (int)pBt->usableSize-5 );
001013    assert( pEType!=0 );
001014    *pEType = pPtrmap[offset];
001015    if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
001016  
001017    sqlite3PagerUnref(pDbPage);
001018    if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
001019    return SQLITE_OK;
001020  }
001021  
001022  #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* Stubs used when SQLITE_OMIT_AUTOVACUUM is defined: ptrmapPut() and
  ** ptrmapPutOvflPtr() expand to nothing, and ptrmapGet() expands to
  ** SQLITE_OK without evaluating or writing through any of its arguments. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, rc)
001026  #endif
001027  
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** findCell(P,I) reads the 2-byte entry for cell I from the cell-pointer
** array (P)->aCellIdx, masks it with (P)->maskPage, and adds the result
** to (P)->aData.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.  It
** does this by using (P)->aDataOfst as the base address instead of
** (P)->aData.
**
** Both macros evaluate P more than once, so P must be free of side effects.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
001042  
001043  
001044  /*
001045  ** This is common tail processing for btreeParseCellPtr() and
001046  ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
001047  ** on a single B-tree page.  Make necessary adjustments to the CellInfo
001048  ** structure.
001049  */
001050  static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
001051    MemPage *pPage,         /* Page containing the cell */
001052    u8 *pCell,              /* Pointer to the cell text. */
001053    CellInfo *pInfo         /* Fill in this structure */
001054  ){
001055    /* If the payload will not fit completely on the local page, we have
001056    ** to decide how much to store locally and how much to spill onto
001057    ** overflow pages.  The strategy is to minimize the amount of unused
001058    ** space on overflow pages while keeping the amount of local storage
001059    ** in between minLocal and maxLocal.
001060    **
001061    ** Warning:  changing the way overflow payload is distributed in any
001062    ** way will result in an incompatible file format.
001063    */
001064    int minLocal;  /* Minimum amount of payload held locally */
001065    int maxLocal;  /* Maximum amount of payload held locally */
001066    int surplus;   /* Overflow payload available for local storage */
001067  
001068    minLocal = pPage->minLocal;
001069    maxLocal = pPage->maxLocal;
001070    surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
001071    testcase( surplus==maxLocal );
001072    testcase( surplus==maxLocal+1 );
001073    if( surplus <= maxLocal ){
001074      pInfo->nLocal = (u16)surplus;
001075    }else{
001076      pInfo->nLocal = (u16)minLocal;
001077    }
001078    pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
001079  }
001080  
001081  /*
001082  ** The following routines are implementations of the MemPage.xParseCell()
001083  ** method.
001084  **
001085  ** Parse a cell content block and fill in the CellInfo structure.
001086  **
001087  ** btreeParseCellPtr()        =>   table btree leaf nodes
001088  ** btreeParseCellNoPayload()  =>   table btree internal nodes
001089  ** btreeParseCellPtrIndex()   =>   index btree nodes
001090  **
001091  ** There is also a wrapper function btreeParseCell() that works for
001092  ** all MemPage types and that references the cell by index rather than
001093  ** by pointer.
001094  */
001095  static void btreeParseCellPtrNoPayload(
001096    MemPage *pPage,         /* Page containing the cell */
001097    u8 *pCell,              /* Pointer to the cell text. */
001098    CellInfo *pInfo         /* Fill in this structure */
001099  ){
001100    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001101    assert( pPage->leaf==0 );
001102    assert( pPage->childPtrSize==4 );
001103  #ifndef SQLITE_DEBUG
001104    UNUSED_PARAMETER(pPage);
001105  #endif
001106    pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
001107    pInfo->nPayload = 0;
001108    pInfo->nLocal = 0;
001109    pInfo->pPayload = 0;
001110    return;
001111  }
/*
** xParseCell implementation for pages with pPage->intKeyLeaf set and no
** child pointer (both asserted below).  The cell is read as a payload-size
** varint, then an integer-key varint, then the payload itself.
**
** On return pInfo->nKey holds the key, pInfo->nPayload the total payload
** size, pInfo->pPayload the address of the first payload byte, and
** pInfo->nLocal / pInfo->nSize the locally stored payload size and the
** total cell size.  When the payload exceeds pPage->maxLocal the last two
** are computed by btreeParseCellAdjustSizeForOverflow().
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the cell text. */
  CellInfo *pInfo         /* Fill in this structure */
){
  u8 *pIter;              /* For scanning through pCell */
  u32 nPayload;           /* Number of bytes of cell payload */
  u64 iKey;               /* Extracted Key value */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );
  pIter = pCell;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint32(pIter, nPayload);
  **
  ** The code is inlined to avoid a function call.
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pEnd = &pIter[8];   /* Never scan beyond this byte */
    nPayload &= 0x7f;
    do{
      /* Each continuation byte contributes its low 7 bits */
      nPayload = (nPayload<<7) | (*++pIter & 0x7f);
    }while( (*pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;

  /* The next block of code is equivalent to:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** The code is inlined to avoid a function call.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 *pEnd = &pIter[7];
    iKey &= 0x7f;
    while(1){
      iKey = (iKey<<7) | (*++pIter & 0x7f);
      if( (*pIter)<0x80 ) break;
      if( pIter>=pEnd ){
        /* Ninth byte of the varint: all 8 of its bits are used */
        iKey = (iKey<<8) | *++pIter;
        break;
      }
    }
  }
  pIter++;

  /* Reinterpret the unsigned 64-bit value as a signed key */
  pInfo->nKey = *(i64*)&iKey;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload<=pPage->maxLocal ){
    /* This is the (easy) common case where the entire payload fits
    ** on the local page.  No overflow is required.
    */
    pInfo->nSize = nPayload + (u16)(pIter - pCell);
    if( pInfo->nSize<4 ) pInfo->nSize = 4;   /* Reported size is never below 4 */
    pInfo->nLocal = (u16)nPayload;
  }else{
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
  }
}
001180  static void btreeParseCellPtrIndex(
001181    MemPage *pPage,         /* Page containing the cell */
001182    u8 *pCell,              /* Pointer to the cell text. */
001183    CellInfo *pInfo         /* Fill in this structure */
001184  ){
001185    u8 *pIter;              /* For scanning through pCell */
001186    u32 nPayload;           /* Number of bytes of cell payload */
001187  
001188    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001189    assert( pPage->leaf==0 || pPage->leaf==1 );
001190    assert( pPage->intKeyLeaf==0 );
001191    pIter = pCell + pPage->childPtrSize;
001192    nPayload = *pIter;
001193    if( nPayload>=0x80 ){
001194      u8 *pEnd = &pIter[8];
001195      nPayload &= 0x7f;
001196      do{
001197        nPayload = (nPayload<<7) | (*++pIter & 0x7f);
001198      }while( *(pIter)>=0x80 && pIter<pEnd );
001199    }
001200    pIter++;
001201    pInfo->nKey = nPayload;
001202    pInfo->nPayload = nPayload;
001203    pInfo->pPayload = pIter;
001204    testcase( nPayload==pPage->maxLocal );
001205    testcase( nPayload==pPage->maxLocal+1 );
001206    if( nPayload<=pPage->maxLocal ){
001207      /* This is the (easy) common case where the entire payload fits
001208      ** on the local page.  No overflow is required.
001209      */
001210      pInfo->nSize = nPayload + (u16)(pIter - pCell);
001211      if( pInfo->nSize<4 ) pInfo->nSize = 4;
001212      pInfo->nLocal = (u16)nPayload;
001213    }else{
001214      btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
001215    }
001216  }
001217  static void btreeParseCell(
001218    MemPage *pPage,         /* Page containing the cell */
001219    int iCell,              /* The cell index.  First cell is 0 */
001220    CellInfo *pInfo         /* Fill in this structure */
001221  ){
001222    pPage->xParseCell(pPage, findCell(pPage, iCell), pInfo);
001223  }
001224  
/*
** The following routines are implementations of the MemPage.xCellSize
** method.
**
** Compute the total number of bytes that a Cell needs in the cell
** data area of the btree-page.  The return number includes the cell
** data header and the local payload, but not any overflow page or
** the space used by the cell pointer.
**
** cellSizePtrNoPayload()    =>   table internal nodes
** cellSizePtr()             =>   all index nodes & table leaf nodes
**
** cellSizePtr() skips pPage->childPtrSize bytes, decodes the payload-size
** varint, skips the integer-key varint when pPage->intKey is set, and then
** applies the same local/overflow split as
** btreeParseCellAdjustSizeForOverflow().  The result is truncated to u16.
*/
static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  u8 *pEnd;                                /* End mark for a varint */
  u32 nSize;                               /* Size value to return */

#ifdef SQLITE_DEBUG
  /* The value returned by this function should always be the same as
  ** the (CellInfo.nSize) value found by doing a full parse of the
  ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  ** this function verifies that this invariant is not violated. */
  CellInfo debuginfo;
  pPage->xParseCell(pPage, pCell, &debuginfo);
#endif

  /* Inline decode of the payload-size varint */
  nSize = *pIter;
  if( nSize>=0x80 ){
    pEnd = &pIter[8];
    nSize &= 0x7f;
    do{
      nSize = (nSize<<7) | (*++pIter & 0x7f);
    }while( *(pIter)>=0x80 && pIter<pEnd );
  }
  pIter++;
  if( pPage->intKey ){
    /* pIter now points at the 64-bit integer key value, a variable length
    ** integer. The following block moves pIter to point at the first byte
    ** past the end of the key value. */
    pEnd = &pIter[9];
    while( (*pIter++)&0x80 && pIter<pEnd );
  }
  testcase( nSize==pPage->maxLocal );
  testcase( nSize==pPage->maxLocal+1 );
  if( nSize<=pPage->maxLocal ){
    /* Entire payload is local: header bytes plus payload, minimum 4 */
    nSize += (u32)(pIter - pCell);
    if( nSize<4 ) nSize = 4;
  }else{
    /* Payload overflows: compute the locally stored portion, then add the
    ** header bytes and the 4-byte overflow page number */
    int minLocal = pPage->minLocal;
    nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
    testcase( nSize==pPage->maxLocal );
    testcase( nSize==pPage->maxLocal+1 );
    if( nSize>pPage->maxLocal ){
      nSize = minLocal;
    }
    nSize += 4 + (u16)(pIter - pCell);
  }
  assert( nSize==debuginfo.nSize || CORRUPT_DB );
  return (u16)nSize;
}
001285  static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
001286    u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
001287    u8 *pEnd;              /* End mark for a varint */
001288  
001289  #ifdef SQLITE_DEBUG
001290    /* The value returned by this function should always be the same as
001291    ** the (CellInfo.nSize) value found by doing a full parse of the
001292    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
001293    ** this function verifies that this invariant is not violated. */
001294    CellInfo debuginfo;
001295    pPage->xParseCell(pPage, pCell, &debuginfo);
001296  #else
001297    UNUSED_PARAMETER(pPage);
001298  #endif
001299  
001300    assert( pPage->childPtrSize==4 );
001301    pEnd = pIter + 9;
001302    while( (*pIter++)&0x80 && pIter<pEnd );
001303    assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
001304    return (u16)(pIter - pCell);
001305  }
001306  
001307  
001308  #ifdef SQLITE_DEBUG
001309  /* This variation on cellSizePtr() is used inside of assert() statements
001310  ** only. */
001311  static u16 cellSize(MemPage *pPage, int iCell){
001312    return pPage->xCellSize(pPage, findCell(pPage, iCell));
001313  }
001314  #endif
001315  
001316  #ifndef SQLITE_OMIT_AUTOVACUUM
001317  /*
001318  ** If the cell pCell, part of page pPage contains a pointer
001319  ** to an overflow page, insert an entry into the pointer-map
001320  ** for the overflow page.
001321  */
001322  static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
001323    CellInfo info;
001324    if( *pRC ) return;
001325    assert( pCell!=0 );
001326    pPage->xParseCell(pPage, pCell, &info);
001327    if( info.nLocal<info.nPayload ){
001328      Pgno ovfl = get4byte(&pCell[info.nSize-4]);
001329      ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
001330    }
001331  }
001332  #endif
001333  
001334  
001335  /*
001336  ** Defragment the page given. This routine reorganizes cells within the
001337  ** page so that there are no free-blocks on the free-block list.
001338  **
001339  ** Parameter nMaxFrag is the maximum amount of fragmented space that may be
001340  ** present in the page after this routine returns.
001341  **
001342  ** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
001343  ** b-tree page so that there are no freeblocks or fragment bytes, all
001344  ** unused bytes are contained in the unallocated space region, and all
001345  ** cells are packed tightly at the end of the page.
001346  */
001347  static int defragmentPage(MemPage *pPage, int nMaxFrag){
001348    int i;                     /* Loop counter */
001349    int pc;                    /* Address of the i-th cell */
001350    int hdr;                   /* Offset to the page header */
001351    int size;                  /* Size of a cell */
001352    int usableSize;            /* Number of usable bytes on a page */
001353    int cellOffset;            /* Offset to the cell pointer array */
001354    int cbrk;                  /* Offset to the cell content area */
001355    int nCell;                 /* Number of cells on the page */
001356    unsigned char *data;       /* The page data */
001357    unsigned char *temp;       /* Temp area for cell content */
001358    unsigned char *src;        /* Source of content */
001359    int iCellFirst;            /* First allowable cell index */
001360    int iCellLast;             /* Last possible cell index */
001361  
001362    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001363    assert( pPage->pBt!=0 );
001364    assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
001365    assert( pPage->nOverflow==0 );
001366    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001367    temp = 0;
001368    src = data = pPage->aData;
001369    hdr = pPage->hdrOffset;
001370    cellOffset = pPage->cellOffset;
001371    nCell = pPage->nCell;
001372    assert( nCell==get2byte(&data[hdr+3]) );
001373    iCellFirst = cellOffset + 2*nCell;
001374    usableSize = pPage->pBt->usableSize;
001375  
001376    /* This block handles pages with two or fewer free blocks and nMaxFrag
001377    ** or fewer fragmented bytes. In this case it is faster to move the
001378    ** two (or one) blocks of cells using memmove() and add the required
001379    ** offsets to each pointer in the cell-pointer array than it is to 
001380    ** reconstruct the entire page.  */
001381    if( (int)data[hdr+7]<=nMaxFrag ){
001382      int iFree = get2byte(&data[hdr+1]);
001383      if( iFree ){
001384        int iFree2 = get2byte(&data[iFree]);
001385  
001386        /* pageFindSlot() has already verified that free blocks are sorted
001387        ** in order of offset within the page, and that no block extends
001388        ** past the end of the page. Provided the two free slots do not 
001389        ** overlap, this guarantees that the memmove() calls below will not
001390        ** overwrite the usableSize byte buffer, even if the database page
001391        ** is corrupt.  */
001392        assert( iFree2==0 || iFree2>iFree );
001393        assert( iFree+get2byte(&data[iFree+2]) <= usableSize );
001394        assert( iFree2==0 || iFree2+get2byte(&data[iFree2+2]) <= usableSize );
001395  
001396        if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
001397          u8 *pEnd = &data[cellOffset + nCell*2];
001398          u8 *pAddr;
001399          int sz2 = 0;
001400          int sz = get2byte(&data[iFree+2]);
001401          int top = get2byte(&data[hdr+5]);
001402          if( top>=iFree ){
001403            return SQLITE_CORRUPT_PGNO(pPage->pgno);
001404          }
001405          if( iFree2 ){
001406            assert( iFree+sz<=iFree2 ); /* Verified by pageFindSlot() */
001407            sz2 = get2byte(&data[iFree2+2]);
001408            assert( iFree+sz+sz2+iFree2-(iFree+sz) <= usableSize );
001409            memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
001410            sz += sz2;
001411          }
001412          cbrk = top+sz;
001413          assert( cbrk+(iFree-top) <= usableSize );
001414          memmove(&data[cbrk], &data[top], iFree-top);
001415          for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
001416            pc = get2byte(pAddr);
001417            if( pc<iFree ){ put2byte(pAddr, pc+sz); }
001418            else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
001419          }
001420          goto defragment_out;
001421        }
001422      }
001423    }
001424  
001425    cbrk = usableSize;
001426    iCellLast = usableSize - 4;
001427    for(i=0; i<nCell; i++){
001428      u8 *pAddr;     /* The i-th cell pointer */
001429      pAddr = &data[cellOffset + i*2];
001430      pc = get2byte(pAddr);
001431      testcase( pc==iCellFirst );
001432      testcase( pc==iCellLast );
001433      /* These conditions have already been verified in btreeInitPage()
001434      ** if PRAGMA cell_size_check=ON.
001435      */
001436      if( pc<iCellFirst || pc>iCellLast ){
001437        return SQLITE_CORRUPT_PGNO(pPage->pgno);
001438      }
001439      assert( pc>=iCellFirst && pc<=iCellLast );
001440      size = pPage->xCellSize(pPage, &src[pc]);
001441      cbrk -= size;
001442      if( cbrk<iCellFirst || pc+size>usableSize ){
001443        return SQLITE_CORRUPT_PGNO(pPage->pgno);
001444      }
001445      assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
001446      testcase( cbrk+size==usableSize );
001447      testcase( pc+size==usableSize );
001448      put2byte(pAddr, cbrk);
001449      if( temp==0 ){
001450        int x;
001451        if( cbrk==pc ) continue;
001452        temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
001453        x = get2byte(&data[hdr+5]);
001454        memcpy(&temp[x], &data[x], (cbrk+size) - x);
001455        src = temp;
001456      }
001457      memcpy(&data[cbrk], &src[pc], size);
001458    }
001459    data[hdr+7] = 0;
001460  
001461   defragment_out:
001462    if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
001463      return SQLITE_CORRUPT_PGNO(pPage->pgno);
001464    }
001465    assert( cbrk>=iCellFirst );
001466    put2byte(&data[hdr+5], cbrk);
001467    data[hdr+1] = 0;
001468    data[hdr+2] = 0;
001469    memset(&data[iCellFirst], 0, cbrk-iCellFirst);
001470    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001471    return SQLITE_OK;
001472  }
001473  
001474  /*
001475  ** Search the free-list on page pPg for space to store a cell nByte bytes in
001476  ** size. If one can be found, return a pointer to the space and remove it
001477  ** from the free-list.
001478  **
001479  ** If no suitable space can be found on the free-list, return NULL.
001480  **
001481  ** This function may detect corruption within pPg.  If corruption is
001482  ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
001483  **
001484  ** Slots on the free list that are between 1 and 3 bytes larger than nByte
001485  ** will be ignored if adding the extra space to the fragmentation count
001486  ** causes the fragmentation count to exceed 60.
001487  */
001488  static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
001489    const int hdr = pPg->hdrOffset;
001490    u8 * const aData = pPg->aData;
001491    int iAddr = hdr + 1;
001492    int pc = get2byte(&aData[iAddr]);
001493    int x;
001494    int usableSize = pPg->pBt->usableSize;
001495    int size;            /* Size of the free slot */
001496  
001497    assert( pc>0 );
001498    while( pc<=usableSize-4 ){
001499      /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
001500      ** freeblock form a big-endian integer which is the size of the freeblock
001501      ** in bytes, including the 4-byte header. */
001502      size = get2byte(&aData[pc+2]);
001503      if( (x = size - nByte)>=0 ){
001504        testcase( x==4 );
001505        testcase( x==3 );
001506        if( size+pc > usableSize ){
001507          *pRc = SQLITE_CORRUPT_PGNO(pPg->pgno);
001508          return 0;
001509        }else if( x<4 ){
001510          /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
001511          ** number of bytes in fragments may not exceed 60. */
001512          if( aData[hdr+7]>57 ) return 0;
001513  
001514          /* Remove the slot from the free-list. Update the number of
001515          ** fragmented bytes within the page. */
001516          memcpy(&aData[iAddr], &aData[pc], 2);
001517          aData[hdr+7] += (u8)x;
001518        }else{
001519          /* The slot remains on the free-list. Reduce its size to account
001520           ** for the portion used by the new allocation. */
001521          put2byte(&aData[pc+2], x);
001522        }
001523        return &aData[pc + x];
001524      }
001525      iAddr = pc;
001526      pc = get2byte(&aData[pc]);
001527      if( pc<iAddr+size ) break;
001528    }
001529    if( pc ){
001530      *pRc = SQLITE_CORRUPT_PGNO(pPg->pgno);
001531    }
001532  
001533    return 0;
001534  }
001535  
001536  /*
001537  ** Allocate nByte bytes of space from within the B-Tree page passed
001538  ** as the first argument. Write into *pIdx the index into pPage->aData[]
001539  ** of the first byte of allocated space. Return either SQLITE_OK or
001540  ** an error code (usually SQLITE_CORRUPT).
001541  **
001542  ** The caller guarantees that there is sufficient space to make the
001543  ** allocation.  This routine might need to defragment in order to bring
001544  ** all the space together, however.  This routine will avoid using
001545  ** the first two bytes past the cell pointer area since presumably this
001546  ** allocation is being made in order to insert a new cell, so we will
001547  ** also end up needing a new cell pointer.
001548  */
001549  static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
001550    const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
001551    u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
001552    int top;                             /* First byte of cell content area */
001553    int rc = SQLITE_OK;                  /* Integer return code */
001554    int gap;        /* First byte of gap between cell pointers and cell content */
001555    
001556    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001557    assert( pPage->pBt );
001558    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001559    assert( nByte>=0 );  /* Minimum cell size is 4 */
001560    assert( pPage->nFree>=nByte );
001561    assert( pPage->nOverflow==0 );
001562    assert( nByte < (int)(pPage->pBt->usableSize-8) );
001563  
001564    assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
001565    gap = pPage->cellOffset + 2*pPage->nCell;
001566    assert( gap<=65536 );
001567    /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
001568    ** and the reserved space is zero (the usual value for reserved space)
001569    ** then the cell content offset of an empty page wants to be 65536.
001570    ** However, that integer is too large to be stored in a 2-byte unsigned
001571    ** integer, so a value of 0 is used in its place. */
001572    top = get2byte(&data[hdr+5]);
001573    assert( top<=(int)pPage->pBt->usableSize ); /* Prevent by getAndInitPage() */
001574    if( gap>top ){
001575      if( top==0 && pPage->pBt->usableSize==65536 ){
001576        top = 65536;
001577      }else{
001578        return SQLITE_CORRUPT_PGNO(pPage->pgno);
001579      }
001580    }
001581  
001582    /* If there is enough space between gap and top for one more cell pointer
001583    ** array entry offset, and if the freelist is not empty, then search the
001584    ** freelist looking for a free slot big enough to satisfy the request.
001585    */
001586    testcase( gap+2==top );
001587    testcase( gap+1==top );
001588    testcase( gap==top );
001589    if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
001590      u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
001591      if( pSpace ){
001592        assert( pSpace>=data && (pSpace - data)<65536 );
001593        *pIdx = (int)(pSpace - data);
001594        return SQLITE_OK;
001595      }else if( rc ){
001596        return rc;
001597      }
001598    }
001599  
001600    /* The request could not be fulfilled using a freelist slot.  Check
001601    ** to see if defragmentation is necessary.
001602    */
001603    testcase( gap+2+nByte==top );
001604    if( gap+2+nByte>top ){
001605      assert( pPage->nCell>0 || CORRUPT_DB );
001606      rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
001607      if( rc ) return rc;
001608      top = get2byteNotZero(&data[hdr+5]);
001609      assert( gap+2+nByte<=top );
001610    }
001611  
001612  
001613    /* Allocate memory from the gap in between the cell pointer array
001614    ** and the cell content area.  The btreeInitPage() call has already
001615    ** validated the freelist.  Given that the freelist is valid, there
001616    ** is no way that the allocation can extend off the end of the page.
001617    ** The assert() below verifies the previous sentence.
001618    */
001619    top -= nByte;
001620    put2byte(&data[hdr+5], top);
001621    assert( top+nByte <= (int)pPage->pBt->usableSize );
001622    *pIdx = top;
001623    return SQLITE_OK;
001624  }
001625  
001626  /*
001627  ** Return a section of the pPage->aData to the freelist.
001628  ** The first byte of the new free block is pPage->aData[iStart]
001629  ** and the size of the block is iSize bytes.
001630  **
001631  ** Adjacent freeblocks are coalesced.
001632  **
001633  ** Note that even though the freeblock list was checked by btreeInitPage(),
001634  ** that routine will not detect overlap between cells or freeblocks.  Nor
001635  ** does it detect cells or freeblocks that encroach into the reserved bytes
001636  ** at the end of the page.  So do additional corruption checks inside this
001637  ** routine and return SQLITE_CORRUPT if any problems are found.
001638  */
static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  u16 iPtr;                             /* Address of ptr to next freeblock */
  u16 iFreeBlk;                         /* Address of the next freeblock */
  u8 hdr;                               /* Page header offset.  0 or 100 */
  u8 nFrag = 0;                         /* Reduction in fragmentation */
  u16 iOrigSize = iSize;                /* Original value of iSize */
  u16 x;                                /* Offset to cell content area */
  u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  unsigned char *data = pPage->aData;   /* Page content */

  /* Parameters:
  **    pPage    Page whose aData[] holds the region being released
  **    iStart   Offset within aData[] of the first byte to release
  **    iSize    Number of bytes to release
  **
  ** Returns SQLITE_OK, or an SQLITE_CORRUPT code if the freeblock list
  ** or the fragment counter is found to be inconsistent.
  */
  assert( pPage->pBt!=0 );
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( iSize>=4 );   /* Minimum cell size is 4 */
  assert( iStart<=pPage->pBt->usableSize-4 );

  /* The list of freeblocks must be in ascending order.  Find the 
  ** spot on the list where iStart should be inserted.
  */
  hdr = pPage->hdrOffset;
  iPtr = hdr + 1;
  if( data[iPtr+1]==0 && data[iPtr]==0 ){
    iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  }else{
    /* Walk the list until reaching the first freeblock at or after iStart.
    ** A "next" offset that is not at least 4 bytes beyond the current
    ** position is either the end-of-list marker (zero) or corruption. */
    while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
      if( iFreeBlk<iPtr+4 ){
        if( iFreeBlk==0 ) break;
        return SQLITE_CORRUPT_PGNO(pPage->pgno);
      }
      iPtr = iFreeBlk;
    }
    /* A freeblock must leave room for its own 4-byte header */
    if( iFreeBlk>pPage->pBt->usableSize-4 ){
      return SQLITE_CORRUPT_PGNO(pPage->pgno);
    }
    assert( iFreeBlk>iPtr || iFreeBlk==0 );
  
    /* At this point:
    **    iFreeBlk:   First freeblock after iStart, or zero if none
    **    iPtr:       The address of a pointer to iFreeBlk
    **
    ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
    ** A gap of up to 3 bytes between the two is absorbed as well; those
    ** bytes are tallied in nFrag so the fragment counter can be reduced.
    */
    if( iFreeBlk && iEnd+3>=iFreeBlk ){
      nFrag = iFreeBlk - iEnd;
      /* The released region overlaps the following freeblock */
      if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
      iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
      if( iEnd > pPage->pBt->usableSize ){
        return SQLITE_CORRUPT_PGNO(pPage->pgno);
      }
      iSize = iEnd - iStart;
      /* The merged block inherits the successor of the absorbed block */
      iFreeBlk = get2byte(&data[iFreeBlk]);
    }
  
    /* If iPtr is another freeblock (that is, if iPtr is not the freelist
    ** pointer in the page header) then check to see if iStart should be
    ** coalesced onto the end of iPtr.
    */
    if( iPtr>hdr+1 ){
      int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
      if( iPtrEnd+3>=iStart ){
        /* The preceding freeblock overlaps the released region */
        if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
        nFrag += iStart - iPtrEnd;
        iSize = iEnd - iPtr;
        iStart = iPtr;
      }
    }
    /* Cannot reclaim more fragment bytes than the header says exist */
    if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
    data[hdr+7] -= nFrag;
  }
  x = get2byte(&data[hdr+5]);
  if( iStart<=x ){
    /* The new freeblock is at the beginning of the cell content area,
    ** so just extend the cell content area rather than create another
    ** freelist entry */
    if( iStart<x || iPtr!=hdr+1 ) return SQLITE_CORRUPT_PGNO(pPage->pgno);
    put2byte(&data[hdr+1], iFreeBlk);
    put2byte(&data[hdr+5], iEnd);
  }else{
    /* Insert the new freeblock into the freelist */
    put2byte(&data[iPtr], iStart);
  }
  if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
    /* Overwrite deleted information with zeros when the secure_delete
    ** option is enabled */
    memset(&data[iStart], 0, iSize);
  }
  /* Write the freeblock header (next pointer, then size).  Note that these
  ** two writes are performed on both branches above, including the one
  ** that extended the cell content area. */
  put2byte(&data[iStart], iFreeBlk);
  put2byte(&data[iStart+2], iSize);
  /* Only the caller's bytes are added here: iOrigSize excludes whatever
  ** was gained by merging with neighbouring freeblocks. */
  pPage->nFree += iOrigSize;
  return SQLITE_OK;
}
001732  
001733  /*
001734  ** Decode the flags byte (the first byte of the header) for a page
001735  ** and initialize fields of the MemPage structure accordingly.
001736  **
001737  ** Only the following combinations are supported.  Anything different
001738  ** indicates a corrupt database file:
001739  **
001740  **         PTF_ZERODATA
001741  **         PTF_ZERODATA | PTF_LEAF
001742  **         PTF_LEAFDATA | PTF_INTKEY
001743  **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
001744  */
001745  static int decodeFlags(MemPage *pPage, int flagByte){
001746    BtShared *pBt;     /* A copy of pPage->pBt */
001747  
001748    assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
001749    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001750    pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
001751    flagByte &= ~PTF_LEAF;
001752    pPage->childPtrSize = 4-4*pPage->leaf;
001753    pPage->xCellSize = cellSizePtr;
001754    pBt = pPage->pBt;
001755    if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
001756      /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
001757      ** interior table b-tree page. */
001758      assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
001759      /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
001760      ** leaf table b-tree page. */
001761      assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
001762      pPage->intKey = 1;
001763      if( pPage->leaf ){
001764        pPage->intKeyLeaf = 1;
001765        pPage->xParseCell = btreeParseCellPtr;
001766      }else{
001767        pPage->intKeyLeaf = 0;
001768        pPage->xCellSize = cellSizePtrNoPayload;
001769        pPage->xParseCell = btreeParseCellPtrNoPayload;
001770      }
001771      pPage->maxLocal = pBt->maxLeaf;
001772      pPage->minLocal = pBt->minLeaf;
001773    }else if( flagByte==PTF_ZERODATA ){
001774      /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
001775      ** interior index b-tree page. */
001776      assert( (PTF_ZERODATA)==2 );
001777      /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
001778      ** leaf index b-tree page. */
001779      assert( (PTF_ZERODATA|PTF_LEAF)==10 );
001780      pPage->intKey = 0;
001781      pPage->intKeyLeaf = 0;
001782      pPage->xParseCell = btreeParseCellPtrIndex;
001783      pPage->maxLocal = pBt->maxLocal;
001784      pPage->minLocal = pBt->minLocal;
001785    }else{
001786      /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
001787      ** an error. */
001788      return SQLITE_CORRUPT_PGNO(pPage->pgno);
001789    }
001790    pPage->max1bytePayload = pBt->max1bytePayload;
001791    return SQLITE_OK;
001792  }
001793  
001794  /*
001795  ** Initialize the auxiliary information for a disk block.
001796  **
001797  ** Return SQLITE_OK on success.  If we see that the page does
001798  ** not contain a well-formed database page, then return 
001799  ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
001800  ** guarantee that the page is well-formed.  It only shows that
001801  ** we failed to detect any corruption.
001802  */
001803  static int btreeInitPage(MemPage *pPage){
001804    int pc;            /* Address of a freeblock within pPage->aData[] */
001805    u8 hdr;            /* Offset to beginning of page header */
001806    u8 *data;          /* Equal to pPage->aData */
001807    BtShared *pBt;        /* The main btree structure */
001808    int usableSize;    /* Amount of usable space on each page */
001809    u16 cellOffset;    /* Offset from start of page to first cell pointer */
001810    int nFree;         /* Number of unused bytes on the page */
001811    int top;           /* First byte of the cell content area */
001812    int iCellFirst;    /* First allowable cell or freeblock offset */
001813    int iCellLast;     /* Last possible cell or freeblock offset */
001814  
001815    assert( pPage->pBt!=0 );
001816    assert( pPage->pBt->db!=0 );
001817    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
001818    assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
001819    assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
001820    assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
001821    assert( pPage->isInit==0 );
001822  
001823    pBt = pPage->pBt;
001824    hdr = pPage->hdrOffset;
001825    data = pPage->aData;
001826    /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
001827    ** the b-tree page type. */
001828    if( decodeFlags(pPage, data[hdr]) ){
001829      return SQLITE_CORRUPT_PGNO(pPage->pgno);
001830    }
001831    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
001832    pPage->maskPage = (u16)(pBt->pageSize - 1);
001833    pPage->nOverflow = 0;
001834    usableSize = pBt->usableSize;
001835    pPage->cellOffset = cellOffset = hdr + 8 + pPage->childPtrSize;
001836    pPage->aDataEnd = &data[usableSize];
001837    pPage->aCellIdx = &data[cellOffset];
001838    pPage->aDataOfst = &data[pPage->childPtrSize];
001839    /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
001840    ** the start of the cell content area. A zero value for this integer is
001841    ** interpreted as 65536. */
001842    top = get2byteNotZero(&data[hdr+5]);
001843    /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
001844    ** number of cells on the page. */
001845    pPage->nCell = get2byte(&data[hdr+3]);
001846    if( pPage->nCell>MX_CELL(pBt) ){
001847      /* To many cells for a single page.  The page must be corrupt */
001848      return SQLITE_CORRUPT_PGNO(pPage->pgno);
001849    }
001850    testcase( pPage->nCell==MX_CELL(pBt) );
001851    /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
001852    ** possible for a root page of a table that contains no rows) then the
001853    ** offset to the cell content area will equal the page size minus the
001854    ** bytes of reserved space. */
001855    assert( pPage->nCell>0 || top==usableSize || CORRUPT_DB );
001856  
001857    /* A malformed database page might cause us to read past the end
001858    ** of page when parsing a cell.  
001859    **
001860    ** The following block of code checks early to see if a cell extends
001861    ** past the end of a page boundary and causes SQLITE_CORRUPT to be 
001862    ** returned if it does.
001863    */
001864    iCellFirst = cellOffset + 2*pPage->nCell;
001865    iCellLast = usableSize - 4;
001866    if( pBt->db->flags & SQLITE_CellSizeCk ){
001867      int i;            /* Index into the cell pointer array */
001868      int sz;           /* Size of a cell */
001869  
001870      if( !pPage->leaf ) iCellLast--;
001871      for(i=0; i<pPage->nCell; i++){
001872        pc = get2byteAligned(&data[cellOffset+i*2]);
001873        testcase( pc==iCellFirst );
001874        testcase( pc==iCellLast );
001875        if( pc<iCellFirst || pc>iCellLast ){
001876          return SQLITE_CORRUPT_PGNO(pPage->pgno);
001877        }
001878        sz = pPage->xCellSize(pPage, &data[pc]);
001879        testcase( pc+sz==usableSize );
001880        if( pc+sz>usableSize ){
001881          return SQLITE_CORRUPT_PGNO(pPage->pgno);
001882        }
001883      }
001884      if( !pPage->leaf ) iCellLast++;
001885    }  
001886  
001887    /* Compute the total free space on the page
001888    ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
001889    ** start of the first freeblock on the page, or is zero if there are no
001890    ** freeblocks. */
001891    pc = get2byte(&data[hdr+1]);
001892    nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
001893    if( pc>0 ){
001894      u32 next, size;
001895      if( pc<iCellFirst ){
001896        /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
001897        ** always be at least one cell before the first freeblock.
001898        */
001899        return SQLITE_CORRUPT_PGNO(pPage->pgno); 
001900      }
001901      while( 1 ){
001902        if( pc>iCellLast ){
001903          /* Freeblock off the end of the page */
001904          return SQLITE_CORRUPT_PGNO(pPage->pgno);
001905        }
001906        next = get2byte(&data[pc]);
001907        size = get2byte(&data[pc+2]);
001908        nFree = nFree + size;
001909        if( next<=pc+size+3 ) break;
001910        pc = next;
001911      }
001912      if( next>0 ){
001913        /* Freeblock not in ascending order */
001914        return SQLITE_CORRUPT_PGNO(pPage->pgno);
001915      }
001916      if( pc+size>(unsigned int)usableSize ){
001917        /* Last freeblock extends past page end */
001918        return SQLITE_CORRUPT_PGNO(pPage->pgno);
001919      }
001920    }
001921  
001922    /* At this point, nFree contains the sum of the offset to the start
001923    ** of the cell-content area plus the number of free bytes within
001924    ** the cell-content area. If this is greater than the usable-size
001925    ** of the page, then the page must be corrupted. This check also
001926    ** serves to verify that the offset to the start of the cell-content
001927    ** area, according to the page header, lies within the page.
001928    */
001929    if( nFree>usableSize ){
001930      return SQLITE_CORRUPT_PGNO(pPage->pgno);
001931    }
001932    pPage->nFree = (u16)(nFree - iCellFirst);
001933    pPage->isInit = 1;
001934    return SQLITE_OK;
001935  }
001936  
001937  /*
001938  ** Set up a raw page so that it looks like a database page holding
001939  ** no entries.
001940  */
001941  static void zeroPage(MemPage *pPage, int flags){
001942    unsigned char *data = pPage->aData;
001943    BtShared *pBt = pPage->pBt;
001944    u8 hdr = pPage->hdrOffset;
001945    u16 first;
001946  
001947    assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
001948    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
001949    assert( sqlite3PagerGetData(pPage->pDbPage) == data );
001950    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
001951    assert( sqlite3_mutex_held(pBt->mutex) );
001952    if( pBt->btsFlags & BTS_FAST_SECURE ){
001953      memset(&data[hdr], 0, pBt->usableSize - hdr);
001954    }
001955    data[hdr] = (char)flags;
001956    first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
001957    memset(&data[hdr+1], 0, 4);
001958    data[hdr+7] = 0;
001959    put2byte(&data[hdr+5], pBt->usableSize);
001960    pPage->nFree = (u16)(pBt->usableSize - first);
001961    decodeFlags(pPage, flags);
001962    pPage->cellOffset = first;
001963    pPage->aDataEnd = &data[pBt->usableSize];
001964    pPage->aCellIdx = &data[first];
001965    pPage->aDataOfst = &data[pPage->childPtrSize];
001966    pPage->nOverflow = 0;
001967    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
001968    pPage->maskPage = (u16)(pBt->pageSize - 1);
001969    pPage->nCell = 0;
001970    pPage->isInit = 1;
001971  }
001972  
001973  
001974  /*
001975  ** Convert a DbPage obtained from the pager into a MemPage used by
001976  ** the btree layer.
001977  */
001978  static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
001979    MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
001980    if( pgno!=pPage->pgno ){
001981      pPage->aData = sqlite3PagerGetData(pDbPage);
001982      pPage->pDbPage = pDbPage;
001983      pPage->pBt = pBt;
001984      pPage->pgno = pgno;
001985      pPage->hdrOffset = pgno==1 ? 100 : 0;
001986    }
001987    assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
001988    return pPage; 
001989  }
001990  
001991  /*
001992  ** Get a page from the pager.  Initialize the MemPage.pBt and
001993  ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
001994  **
001995  ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
001996  ** about the content of the page at this time.  So do not go to the disk
001997  ** to fetch the content.  Just fill in the content with zeros for now.
001998  ** If in the future we call sqlite3PagerWrite() on this page, that
001999  ** means we have started to be concerned about content and the disk
002000  ** read should occur at that point.
002001  */
002002  static int btreeGetPage(
002003    BtShared *pBt,       /* The btree */
002004    Pgno pgno,           /* Number of the page to fetch */
002005    MemPage **ppPage,    /* Return the page in this parameter */
002006    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002007  ){
002008    int rc;
002009    DbPage *pDbPage;
002010  
002011    assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
002012    assert( sqlite3_mutex_held(pBt->mutex) );
002013    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
002014    if( rc ) return rc;
002015    *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
002016    return SQLITE_OK;
002017  }
002018  
002019  /*
002020  ** Retrieve a page from the pager cache. If the requested page is not
002021  ** already in the pager cache return NULL. Initialize the MemPage.pBt and
002022  ** MemPage.aData elements if needed.
002023  */
002024  static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
002025    DbPage *pDbPage;
002026    assert( sqlite3_mutex_held(pBt->mutex) );
002027    pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
002028    if( pDbPage ){
002029      return btreePageFromDbPage(pDbPage, pgno, pBt);
002030    }
002031    return 0;
002032  }
002033  
002034  /*
002035  ** Return the size of the database file in pages. If there is any kind of
002036  ** error, return ((unsigned int)-1).
002037  */
002038  static Pgno btreePagecount(BtShared *pBt){
002039    return pBt->nPage;
002040  }
002041  u32 sqlite3BtreeLastPage(Btree *p){
002042    assert( sqlite3BtreeHoldsMutex(p) );
002043    assert( ((p->pBt->nPage)&0x80000000)==0 );
002044    return btreePagecount(p->pBt);
002045  }
002046  
002047  /*
002048  ** Get a page from the pager and initialize it.
002049  **
002050  ** If pCur!=0 then the page is being fetched as part of a moveToChild()
002051  ** call.  Do additional sanity checking on the page in this case.
002052  ** And if the fetch fails, this routine must decrement pCur->iPage.
002053  **
002054  ** The page is fetched as read-write unless pCur is not NULL and is
002055  ** a read-only cursor.
002056  **
002057  ** If an error occurs, then *ppPage is undefined. It
002058  ** may remain unchanged, or it may be set to an invalid value.
002059  */
002060  static int getAndInitPage(
002061    BtShared *pBt,                  /* The database file */
002062    Pgno pgno,                      /* Number of the page to get */
002063    MemPage **ppPage,               /* Write the page pointer here */
002064    BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
002065    int bReadOnly                   /* True for a read-only page */
002066  ){
002067    int rc;
002068    DbPage *pDbPage;
002069    assert( sqlite3_mutex_held(pBt->mutex) );
002070    assert( pCur==0 || ppPage==&pCur->pPage );
002071    assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
002072    assert( pCur==0 || pCur->iPage>0 );
002073  
002074    if( pgno>btreePagecount(pBt) ){
002075      rc = SQLITE_CORRUPT_BKPT;
002076      goto getAndInitPage_error;
002077    }
002078    rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
002079    if( rc ){
002080      goto getAndInitPage_error;
002081    }
002082    *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
002083    if( (*ppPage)->isInit==0 ){
002084      btreePageFromDbPage(pDbPage, pgno, pBt);
002085      rc = btreeInitPage(*ppPage);
002086      if( rc!=SQLITE_OK ){
002087        releasePage(*ppPage);
002088        goto getAndInitPage_error;
002089      }
002090    }
002091    assert( (*ppPage)->pgno==pgno );
002092    assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
002093  
002094    /* If obtaining a child page for a cursor, we must verify that the page is
002095    ** compatible with the root page. */
002096    if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
002097      rc = SQLITE_CORRUPT_PGNO(pgno);
002098      releasePage(*ppPage);
002099      goto getAndInitPage_error;
002100    }
002101    return SQLITE_OK;
002102  
002103  getAndInitPage_error:
002104    if( pCur ){
002105      pCur->iPage--;
002106      pCur->pPage = pCur->apPage[pCur->iPage];
002107    }
002108    testcase( pgno==0 );
002109    assert( pgno!=0 || rc==SQLITE_CORRUPT );
002110    return rc;
002111  }
002112  
002113  /*
002114  ** Release a MemPage.  This should be called once for each prior
002115  ** call to btreeGetPage.
002116  **
002117  ** Page1 is a special case and must be released using releasePageOne().
002118  */
002119  static void releasePageNotNull(MemPage *pPage){
002120    assert( pPage->aData );
002121    assert( pPage->pBt );
002122    assert( pPage->pDbPage!=0 );
002123    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002124    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002125    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002126    sqlite3PagerUnrefNotNull(pPage->pDbPage);
002127  }
002128  static void releasePage(MemPage *pPage){
002129    if( pPage ) releasePageNotNull(pPage);
002130  }
002131  static void releasePageOne(MemPage *pPage){
002132    assert( pPage!=0 );
002133    assert( pPage->aData );
002134    assert( pPage->pBt );
002135    assert( pPage->pDbPage!=0 );
002136    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
002137    assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
002138    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002139    sqlite3PagerUnrefPageOne(pPage->pDbPage);
002140  }
002141  
002142  /*
002143  ** Get an unused page.
002144  **
002145  ** This works just like btreeGetPage() with the addition:
002146  **
002147  **   *  If the page is already in use for some other purpose, immediately
002148  **      release it and return an SQLITE_CORRUPT error.
002149  **   *  Make sure the isInit flag is clear
002150  */
002151  static int btreeGetUnusedPage(
002152    BtShared *pBt,       /* The btree */
002153    Pgno pgno,           /* Number of the page to fetch */
002154    MemPage **ppPage,    /* Return the page in this parameter */
002155    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
002156  ){
002157    int rc = btreeGetPage(pBt, pgno, ppPage, flags);
002158    if( rc==SQLITE_OK ){
002159      if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
002160        releasePage(*ppPage);
002161        *ppPage = 0;
002162        return SQLITE_CORRUPT_BKPT;
002163      }
002164      (*ppPage)->isInit = 0;
002165    }else{
002166      *ppPage = 0;
002167    }
002168    return rc;
002169  }
002170  
002171  
002172  /*
002173  ** During a rollback, when the pager reloads information into the cache
002174  ** so that the cache is restored to its original state at the start of
002175  ** the transaction, for each page restored this routine is called.
002176  **
002177  ** This routine needs to reset the extra data section at the end of the
002178  ** page to agree with the restored data.
002179  */
002180  static void pageReinit(DbPage *pData){
002181    MemPage *pPage;
002182    pPage = (MemPage *)sqlite3PagerGetExtra(pData);
002183    assert( sqlite3PagerPageRefcount(pData)>0 );
002184    if( pPage->isInit ){
002185      assert( sqlite3_mutex_held(pPage->pBt->mutex) );
002186      pPage->isInit = 0;
002187      if( sqlite3PagerPageRefcount(pData)>1 ){
002188        /* pPage might not be a btree page;  it might be an overflow page
002189        ** or ptrmap page or a free page.  In those cases, the following
002190        ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
002191        ** But no harm is done by this.  And it is very important that
002192        ** btreeInitPage() be called on every btree page so we make
002193        ** the call for every page that comes in for re-initing. */
002194        btreeInitPage(pPage);
002195      }
002196    }
002197  }
002198  
002199  /*
002200  ** Invoke the busy handler for a btree.
002201  */
002202  static int btreeInvokeBusyHandler(void *pArg){
002203    BtShared *pBt = (BtShared*)pArg;
002204    assert( pBt->db );
002205    assert( sqlite3_mutex_held(pBt->db->mutex) );
002206    return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
002207  }
002208  
002209  /*
002210  ** Open a database file.
002211  ** 
002212  ** zFilename is the name of the database file.  If zFilename is NULL
002213  ** then an ephemeral database is created.  The ephemeral database might
002214  ** be exclusively in memory, or it might use a disk-based memory cache.
002215  ** Either way, the ephemeral database will be automatically deleted 
002216  ** when sqlite3BtreeClose() is called.
002217  **
002218  ** If zFilename is ":memory:" then an in-memory database is created
002219  ** that is automatically destroyed when it is closed.
002220  **
002221  ** The "flags" parameter is a bitmask that might contain bits like
002222  ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
002223  **
002224  ** If the database is already opened in the same database connection
002225  ** and we are in shared cache mode, then the open will fail with an
002226  ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
002227  ** objects in the same database connection since doing so will lead
002228  ** to problems with locking.
        **
        ** Return SQLITE_OK and write the new handle into *ppBtree on success.
        ** On failure the result is SQLITE_NOMEM_BKPT (an allocation failed),
        ** SQLITE_CONSTRAINT (the duplicate-open case described above), or an
        ** error code passed through unchanged from sqlite3OsFullPathname(),
        ** sqlite3PagerOpen(), sqlite3PagerReadFileheader() or
        ** sqlite3PagerSetPagesize().  Failures that reach the btree_open_out
        ** label set *ppBtree to 0; the early "return" statements in the body
        ** leave *ppBtree untouched.
002229  */
002230  int sqlite3BtreeOpen(
002231    sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
002232    const char *zFilename,  /* Name of the file containing the BTree database */
002233    sqlite3 *db,            /* Associated database handle */
002234    Btree **ppBtree,        /* Pointer to new Btree object written here */
002235    int flags,              /* Options */
002236    int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
002237  ){
002238    BtShared *pBt = 0;             /* Shared part of btree structure */
002239    Btree *p;                      /* Handle to return */
002240    sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
002241    int rc = SQLITE_OK;            /* Result code from this function */
002242    u8 nReserve;                   /* Byte of unused space on each page */
002243    unsigned char zDbHeader[100];  /* Database header content */
002244  
002245    /* True if opening an ephemeral, temporary database */
002246    const int isTempDb = zFilename==0 || zFilename[0]==0;
002247  
002248    /* Set the variable isMemdb to true for an in-memory database, or 
002249    ** false for a file-based database.
002250    */
002251  #ifdef SQLITE_OMIT_MEMORYDB
002252    const int isMemdb = 0;
002253  #else
002254    const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
002255                         || (isTempDb && sqlite3TempInMemory(db))
002256                         || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
002257  #endif
002258  
002259    assert( db!=0 );
002260    assert( pVfs!=0 );
002261    assert( sqlite3_mutex_held(db->mutex) );
002262    assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
002263  
002264    /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
002265    assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
002266  
002267    /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
002268    assert( (flags & BTREE_SINGLE)==0 || isTempDb );
002269  
002270    if( isMemdb ){
002271      flags |= BTREE_MEMORY;
002272    }
          /* An in-memory or temporary database is never opened as a MAIN_DB
          ** file: swap the SQLITE_OPEN_MAIN_DB bit for SQLITE_OPEN_TEMP_DB. */
002273    if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
002274      vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
002275    }
002276    p = sqlite3MallocZero(sizeof(Btree));
002277    if( !p ){
002278      return SQLITE_NOMEM_BKPT;
002279    }
002280    p->inTrans = TRANS_NONE;
002281    p->db = db;
002282  #ifndef SQLITE_OMIT_SHARED_CACHE
002283    p->lock.pBtree = p;
002284    p->lock.iTable = 1;
002285  #endif
002286  
002287  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002288    /*
002289    ** If this Btree is a candidate for shared cache, try to find an
002290    ** existing BtShared object that we can share with
002291    */
002292    if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
002293      if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
002294        int nFilename = sqlite3Strlen30(zFilename)+1;
002295        int nFullPathname = pVfs->mxPathname+1;
002296        char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
002297        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002298  
002299        p->sharable = 1;
002300        if( !zFullPathname ){
002301          sqlite3_free(p);
002302          return SQLITE_NOMEM_BKPT;
002303        }
              /* Build the key used to match against existing BtShared objects:
              ** the name verbatim for an in-memory database, otherwise the
              ** full pathname as reported by the VFS. */
002304        if( isMemdb ){
002305          memcpy(zFullPathname, zFilename, nFilename);
002306        }else{
002307          rc = sqlite3OsFullPathname(pVfs, zFilename,
002308                                     nFullPathname, zFullPathname);
002309          if( rc ){
002310            sqlite3_free(zFullPathname);
002311            sqlite3_free(p);
002312            return rc;
002313          }
002314        }
              /* mutexOpen stays held until the end of this function (it is
              ** released after the btree_open_out label, or just before the
              ** SQLITE_CONSTRAINT return below); mutexShared is dropped as
              ** soon as the search of the sharing list is finished. */
002315  #if SQLITE_THREADSAFE
002316        mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
002317        sqlite3_mutex_enter(mutexOpen);
002318        mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
002319        sqlite3_mutex_enter(mutexShared);
002320  #endif
002321        for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
002322          assert( pBt->nRef>0 );
002323          if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
002324                   && sqlite3PagerVfs(pBt->pPager)==pVfs ){
002325            int iDb;
                  /* Refuse to reuse pBt if any database already attached to
                  ** this connection is using it. */
002326            for(iDb=db->nDb-1; iDb>=0; iDb--){
002327              Btree *pExisting = db->aDb[iDb].pBt;
002328              if( pExisting && pExisting->pBt==pBt ){
002329                sqlite3_mutex_leave(mutexShared);
002330                sqlite3_mutex_leave(mutexOpen);
002331                sqlite3_free(zFullPathname);
002332                sqlite3_free(p);
002333                return SQLITE_CONSTRAINT;
002334              }
002335            }
002336            p->pBt = pBt;
002337            pBt->nRef++;
002338            break;
002339          }
002340        }
002341        sqlite3_mutex_leave(mutexShared);
002342        sqlite3_free(zFullPathname);
002343      }
002344  #ifdef SQLITE_DEBUG
002345      else{
002346        /* In debug mode, we mark all persistent databases as sharable
002347        ** even when they are not.  This exercises the locking code and
002348        ** gives more opportunity for asserts(sqlite3_mutex_held())
002349        ** statements to find locking problems.
002350        */
002351        p->sharable = 1;
002352      }
002353  #endif
002354    }
002355  #endif
          /* pBt is still 0 here unless a matching shared BtShared was found
          ** above, in which case the whole construction step is skipped. */
002356    if( pBt==0 ){
002357      /*
002358      ** The following asserts make sure that structures used by the btree are
002359      ** the right size.  This is to guard against size changes that result
002360      ** when compiling on a different architecture.
002361      */
002362      assert( sizeof(i64)==8 );
002363      assert( sizeof(u64)==8 );
002364      assert( sizeof(u32)==4 );
002365      assert( sizeof(u16)==2 );
002366      assert( sizeof(Pgno)==4 );
002367    
002368      pBt = sqlite3MallocZero( sizeof(*pBt) );
002369      if( pBt==0 ){
002370        rc = SQLITE_NOMEM_BKPT;
002371        goto btree_open_out;
002372      }
002373      rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
002374                            sizeof(MemPage), flags, vfsFlags, pageReinit);
002375      if( rc==SQLITE_OK ){
002376        sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
002377        rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
002378      }
002379      if( rc!=SQLITE_OK ){
002380        goto btree_open_out;
002381      }
002382      pBt->openFlags = (u8)flags;
002383      pBt->db = db;
002384      sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
002385      p->pBt = pBt;
002386    
002387      pBt->pCursor = 0;
002388      pBt->pPage1 = 0;
002389      if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
002390  #if defined(SQLITE_SECURE_DELETE)
002391      pBt->btsFlags |= BTS_SECURE_DELETE;
002392  #elif defined(SQLITE_FAST_SECURE_DELETE)
002393      pBt->btsFlags |= BTS_OVERWRITE;
002394  #endif
002395      /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
002396      ** determined by the 2-byte integer located at an offset of 16 bytes from
002397      ** the beginning of the database file. */
            /* Byte 16 supplies bits 8..15 and byte 17 is shifted up to bit 16,
            ** so the stored pair 0x00 0x01 decodes to 65536.  Any result that
            ** is not a power of two in range is rejected by the test below. */
002398      pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
002399      if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
002400           || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
              /* No usable page size in the header.  Record 0 here; the value
              ** is handed to sqlite3PagerSetPagesize() by address further
              ** down.  NOTE(review): presumably the pager then substitutes
              ** its own default page size - confirm in the pager code. */
002401        pBt->pageSize = 0;
002402  #ifndef SQLITE_OMIT_AUTOVACUUM
002403        /* If the magic name ":memory:" will create an in-memory database, then
002404        ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
002405        ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
002406        ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
002407        ** regular file-name. In this case the auto-vacuum applies as per normal.
002408        */
002409        if( zFilename && !isMemdb ){
002410          pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
002411          pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
002412        }
002413  #endif
002414        nReserve = 0;
002415      }else{
002416        /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
002417        ** determined by the one-byte unsigned integer found at an offset of 20
002418        ** into the database file header. */
002419        nReserve = zDbHeader[20];
002420        pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002421  #ifndef SQLITE_OMIT_AUTOVACUUM
              /* 36+4*4 and 36+7*4 are header byte offsets 52 and 64; each
              ** 4-byte field is only tested for being non-zero. */
002422        pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
002423        pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
002424  #endif
002425      }
002426      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002427      if( rc ) goto btree_open_out;
002428      pBt->usableSize = pBt->pageSize - nReserve;
002429      assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
002430     
002431  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002432      /* Add the new BtShared object to the linked list sharable BtShareds.
002433      */
002434      pBt->nRef = 1;
002435      if( p->sharable ){
002436        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
002437        MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
              /* A per-BtShared mutex is only allocated when core mutexes are
              ** enabled.  The allocation happens before pBt is pushed onto the
              ** global list, so the failure path never has to unlink it. */
002438        if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
002439          pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
002440          if( pBt->mutex==0 ){
002441            rc = SQLITE_NOMEM_BKPT;
002442            goto btree_open_out;
002443          }
002444        }
002445        sqlite3_mutex_enter(mutexShared);
002446        pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
002447        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
002448        sqlite3_mutex_leave(mutexShared);
002449      }
002450  #endif
002451    }
002452  
002453  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
002454    /* If the new Btree uses a sharable pBtShared, then link the new
002455    ** Btree into the list of all sharable Btrees for the same connection.
002456    ** The list is kept in ascending order by pBt address.
002457    */
002458    if( p->sharable ){
002459      int i;
002460      Btree *pSib;
002461      for(i=0; i<db->nDb; i++){
002462        if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
                /* Rewind to the head of the sibling list, then either insert p
                ** in front of the head or walk forward to the last entry whose
                ** pBt address is below p->pBt and insert after it. */
002463          while( pSib->pPrev ){ pSib = pSib->pPrev; }
002464          if( (uptr)p->pBt<(uptr)pSib->pBt ){
002465            p->pNext = pSib;
002466            p->pPrev = 0;
002467            pSib->pPrev = p;
002468          }else{
002469            while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
002470              pSib = pSib->pNext;
002471            }
002472            p->pNext = pSib->pNext;
002473            p->pPrev = pSib;
002474            if( p->pNext ){
002475              p->pNext->pPrev = p;
002476            }
002477            pSib->pNext = p;
002478          }
002479          break;
002480        }
002481      }
002482    }
002483  #endif
002484    *ppBtree = p;
002485  
        /* Both the success path and every "goto btree_open_out" arrive here. */
002486  btree_open_out:
002487    if( rc!=SQLITE_OK ){
            /* Undo everything: close the pager if one was opened, then free
            ** both the BtShared and the Btree and clear the caller's handle. */
002488      if( pBt && pBt->pPager ){
002489        sqlite3PagerClose(pBt->pPager, 0);
002490      }
002491      sqlite3_free(pBt);
002492      sqlite3_free(p);
002493      *ppBtree = 0;
002494    }else{
002495      sqlite3_file *pFile;
002496  
002497      /* If the B-Tree was successfully opened, set the pager-cache size to the
002498      ** default value. Except, when opening on an existing shared pager-cache,
002499      ** do not change the pager-cache size.
002500      */
002501      if( sqlite3BtreeSchema(p, 0, 0)==0 ){
002502        sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
002503      }
002504  
            /* Pass the address of BtShared.db to the file object as an
            ** SQLITE_FCNTL_PDB hint, but only if the file has a method table. */
002505      pFile = sqlite3PagerFile(pBt->pPager);
002506      if( pFile->pMethods ){
002507        sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
002508      }
002509    }
002510    if( mutexOpen ){
002511      assert( sqlite3_mutex_held(mutexOpen) );
002512      sqlite3_mutex_leave(mutexOpen);
002513    }
002514    assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
002515    return rc;
002516  }
002517  
002518  /*
002519  ** Decrement the BtShared.nRef counter.  When it reaches zero,
002520  ** remove the BtShared structure from the sharing list.  Return
002521  ** true if the BtShared.nRef counter reaches zero and return
002522  ** false if it is still positive.
002523  */
002524  static int removeFromSharingList(BtShared *pBt){
002525  #ifndef SQLITE_OMIT_SHARED_CACHE
002526    MUTEX_LOGIC( sqlite3_mutex *pMaster; )
002527    BtShared *pList;
002528    int removed = 0;
002529  
002530    assert( sqlite3_mutex_notheld(pBt->mutex) );
002531    MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
002532    sqlite3_mutex_enter(pMaster);
002533    pBt->nRef--;
002534    if( pBt->nRef<=0 ){
002535      if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
002536        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
002537      }else{
002538        pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
002539        while( ALWAYS(pList) && pList->pNext!=pBt ){
002540          pList=pList->pNext;
002541        }
002542        if( ALWAYS(pList) ){
002543          pList->pNext = pBt->pNext;
002544        }
002545      }
002546      if( SQLITE_THREADSAFE ){
002547        sqlite3_mutex_free(pBt->mutex);
002548      }
002549      removed = 1;
002550    }
002551    sqlite3_mutex_leave(pMaster);
002552    return removed;
002553  #else
002554    return 1;
002555  #endif
002556  }
002557  
002558  /*
002559  ** Make sure pBt->pTmpSpace points to an allocation of 
002560  ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
002561  ** pointer.
002562  */
002563  static void allocateTempSpace(BtShared *pBt){
002564    if( !pBt->pTmpSpace ){
002565      pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
002566  
002567      /* One of the uses of pBt->pTmpSpace is to format cells before
002568      ** inserting them into a leaf page (function fillInCell()). If
002569      ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
002570      ** by the various routines that manipulate binary cells. Which
002571      ** can mean that fillInCell() only initializes the first 2 or 3
002572      ** bytes of pTmpSpace, but that the first 4 bytes are copied from
002573      ** it into a database page. This is not actually a problem, but it
002574      ** does cause a valgrind error when the 1 or 2 bytes of unitialized 
002575      ** data is passed to system call write(). So to avoid this error,
002576      ** zero the first 4 bytes of temp space here.
002577      **
002578      ** Also:  Provide four bytes of initialized space before the
002579      ** beginning of pTmpSpace as an area available to prepend the
002580      ** left-child pointer to the beginning of a cell.
002581      */
002582      if( pBt->pTmpSpace ){
002583        memset(pBt->pTmpSpace, 0, 8);
002584        pBt->pTmpSpace += 4;
002585      }
002586    }
002587  }
002588  
002589  /*
002590  ** Free the pBt->pTmpSpace allocation
002591  */
002592  static void freeTempSpace(BtShared *pBt){
002593    if( pBt->pTmpSpace ){
002594      pBt->pTmpSpace -= 4;
002595      sqlite3PageFree(pBt->pTmpSpace);
002596      pBt->pTmpSpace = 0;
002597    }
002598  }
002599  
002600  /*
002601  ** Close an open database and invalidate all cursors.
002602  */
002603  int sqlite3BtreeClose(Btree *p){
002604    BtShared *pBt = p->pBt;
002605    BtCursor *pCur;
002606  
002607    /* Close all cursors opened via this handle.  */
002608    assert( sqlite3_mutex_held(p->db->mutex) );
002609    sqlite3BtreeEnter(p);
002610    pCur = pBt->pCursor;
002611    while( pCur ){
002612      BtCursor *pTmp = pCur;
002613      pCur = pCur->pNext;
002614      if( pTmp->pBtree==p ){
002615        sqlite3BtreeCloseCursor(pTmp);
002616      }
002617    }
002618  
002619    /* Rollback any active transaction and free the handle structure.
002620    ** The call to sqlite3BtreeRollback() drops any table-locks held by
002621    ** this handle.
002622    */
002623    sqlite3BtreeRollback(p, SQLITE_OK, 0);
002624    sqlite3BtreeLeave(p);
002625  
002626    /* If there are still other outstanding references to the shared-btree
002627    ** structure, return now. The remainder of this procedure cleans 
002628    ** up the shared-btree.
002629    */
002630    assert( p->wantToLock==0 && p->locked==0 );
002631    if( !p->sharable || removeFromSharingList(pBt) ){
002632      /* The pBt is no longer on the sharing list, so we can access
002633      ** it without having to hold the mutex.
002634      **
002635      ** Clean out and delete the BtShared object.
002636      */
002637      assert( !pBt->pCursor );
002638      sqlite3PagerClose(pBt->pPager, p->db);
002639      if( pBt->xFreeSchema && pBt->pSchema ){
002640        pBt->xFreeSchema(pBt->pSchema);
002641      }
002642      sqlite3DbFree(0, pBt->pSchema);
002643      freeTempSpace(pBt);
002644      sqlite3_free(pBt);
002645    }
002646  
002647  #ifndef SQLITE_OMIT_SHARED_CACHE
002648    assert( p->wantToLock==0 );
002649    assert( p->locked==0 );
002650    if( p->pPrev ) p->pPrev->pNext = p->pNext;
002651    if( p->pNext ) p->pNext->pPrev = p->pPrev;
002652  #endif
002653  
002654    sqlite3_free(p);
002655    return SQLITE_OK;
002656  }
002657  
002658  /*
002659  ** Change the "soft" limit on the number of pages in the cache.
002660  ** Unused and unmodified pages will be recycled when the number of
002661  ** pages in the cache exceeds this soft limit.  But the size of the
002662  ** cache is allowed to grow larger than this limit if it contains
002663  ** dirty pages or pages still in active use.
002664  */
002665  int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
002666    BtShared *pBt = p->pBt;
002667    assert( sqlite3_mutex_held(p->db->mutex) );
002668    sqlite3BtreeEnter(p);
002669    sqlite3PagerSetCachesize(pBt->pPager, mxPage);
002670    sqlite3BtreeLeave(p);
002671    return SQLITE_OK;
002672  }
002673  
002674  /*
002675  ** Change the "spill" limit on the number of pages in the cache.
002676  ** If the number of pages exceeds this limit during a write transaction,
002677  ** the pager might attempt to "spill" pages to the journal early in
002678  ** order to free up memory.
002679  **
002680  ** The value returned is the current spill size.  If zero is passed
002681  ** as an argument, no changes are made to the spill size setting, so
002682  ** using mxPage of 0 is a way to query the current spill size.
002683  */
002684  int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
002685    BtShared *pBt = p->pBt;
002686    int res;
002687    assert( sqlite3_mutex_held(p->db->mutex) );
002688    sqlite3BtreeEnter(p);
002689    res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
002690    sqlite3BtreeLeave(p);
002691    return res;
002692  }
002693  
#if SQLITE_MAX_MMAP_SIZE>0
/*
** Set the maximum number of bytes of the database file that may be
** accessed through memory-mapped I/O.  The request is forwarded to the
** pager while the b-tree mutex is held.
**
** Always returns SQLITE_OK.
*/
int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
#endif /* SQLITE_MAX_MMAP_SIZE>0 */
002708  
002709  /*
002710  ** Change the way data is synced to disk in order to increase or decrease
002711  ** how well the database resists damage due to OS crashes and power
002712  ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
002713  ** there is a high probability of damage)  Level 2 is the default.  There
002714  ** is a very low but non-zero probability of damage.  Level 3 reduces the
002715  ** probability of damage to near zero but with a write performance reduction.
002716  */
002717  #ifndef SQLITE_OMIT_PAGER_PRAGMAS
002718  int sqlite3BtreeSetPagerFlags(
002719    Btree *p,              /* The btree to set the safety level on */
002720    unsigned pgFlags       /* Various PAGER_* flags */
002721  ){
002722    BtShared *pBt = p->pBt;
002723    assert( sqlite3_mutex_held(p->db->mutex) );
002724    sqlite3BtreeEnter(p);
002725    sqlite3PagerSetFlags(pBt->pPager, pgFlags);
002726    sqlite3BtreeLeave(p);
002727    return SQLITE_OK;
002728  }
002729  #endif
002730  
002731  /*
002732  ** Change the default pages size and the number of reserved bytes per page.
002733  ** Or, if the page size has already been fixed, return SQLITE_READONLY 
002734  ** without changing anything.
002735  **
002736  ** The page size must be a power of 2 between 512 and 65536.  If the page
002737  ** size supplied does not meet this constraint then the page size is not
002738  ** changed.
002739  **
002740  ** Page sizes are constrained to be a power of two so that the region
002741  ** of the database file used for locking (beginning at PENDING_BYTE,
002742  ** the first byte past the 1GB boundary, 0x40000000) needs to occur
002743  ** at the beginning of a page.
002744  **
002745  ** If parameter nReserve is less than zero, then the number of reserved
002746  ** bytes per page is left unchanged.
002747  **
002748  ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
002749  ** and autovacuum mode can no longer be changed.
002750  */
002751  int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
002752    int rc = SQLITE_OK;
002753    BtShared *pBt = p->pBt;
002754    assert( nReserve>=-1 && nReserve<=255 );
002755    sqlite3BtreeEnter(p);
002756  #if SQLITE_HAS_CODEC
002757    if( nReserve>pBt->optimalReserve ) pBt->optimalReserve = (u8)nReserve;
002758  #endif
002759    if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
002760      sqlite3BtreeLeave(p);
002761      return SQLITE_READONLY;
002762    }
002763    if( nReserve<0 ){
002764      nReserve = pBt->pageSize - pBt->usableSize;
002765    }
002766    assert( nReserve>=0 && nReserve<=255 );
002767    if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
002768          ((pageSize-1)&pageSize)==0 ){
002769      assert( (pageSize & 7)==0 );
002770      assert( !pBt->pCursor );
002771      pBt->pageSize = (u32)pageSize;
002772      freeTempSpace(pBt);
002773    }
002774    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
002775    pBt->usableSize = pBt->pageSize - (u16)nReserve;
002776    if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
002777    sqlite3BtreeLeave(p);
002778    return rc;
002779  }
002780  
002781  /*
002782  ** Return the currently defined page size
002783  */
002784  int sqlite3BtreeGetPageSize(Btree *p){
002785    return p->pBt->pageSize;
002786  }
002787  
002788  /*
002789  ** This function is similar to sqlite3BtreeGetReserve(), except that it
002790  ** may only be called if it is guaranteed that the b-tree mutex is already
002791  ** held.
002792  **
002793  ** This is useful in one special case in the backup API code where it is
002794  ** known that the shared b-tree mutex is held, but the mutex on the 
002795  ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
002796  ** were to be called, it might collide with some other operation on the
002797  ** database handle that owns *p, causing undefined behavior.
002798  */
002799  int sqlite3BtreeGetReserveNoMutex(Btree *p){
002800    int n;
002801    assert( sqlite3_mutex_held(p->pBt->mutex) );
002802    n = p->pBt->pageSize - p->pBt->usableSize;
002803    return n;
002804  }
002805  
002806  /*
002807  ** Return the number of bytes of space at the end of every page that
002808  ** are intentually left unused.  This is the "reserved" space that is
002809  ** sometimes used by extensions.
002810  **
002811  ** If SQLITE_HAS_MUTEX is defined then the number returned is the
002812  ** greater of the current reserved space and the maximum requested
002813  ** reserve space.
002814  */
002815  int sqlite3BtreeGetOptimalReserve(Btree *p){
002816    int n;
002817    sqlite3BtreeEnter(p);
002818    n = sqlite3BtreeGetReserveNoMutex(p);
002819  #ifdef SQLITE_HAS_CODEC
002820    if( n<p->pBt->optimalReserve ) n = p->pBt->optimalReserve;
002821  #endif
002822    sqlite3BtreeLeave(p);
002823    return n;
002824  }
002825  
002826  
002827  /*
002828  ** Set the maximum page count for a database if mxPage is positive.
002829  ** No changes are made if mxPage is 0 or negative.
002830  ** Regardless of the value of mxPage, return the maximum page count.
002831  */
002832  int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
002833    int n;
002834    sqlite3BtreeEnter(p);
002835    n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
002836    sqlite3BtreeLeave(p);
002837    return n;
002838  }
002839  
002840  /*
002841  ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
002842  **
002843  **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
002844  **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
002845  **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
002846  **    newFlag==(-1)    No changes
002847  **
002848  ** This routine acts as a query if newFlag is less than zero
002849  **
002850  ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
002851  ** freelist leaf pages are not written back to the database.  Thus in-page
002852  ** deleted content is cleared, but freelist deleted content is not.
002853  **
002854  ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
002855  ** that freelist leaf pages are written back into the database, increasing
002856  ** the amount of disk I/O.
002857  */
002858  int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
002859    int b;
002860    if( p==0 ) return 0;
002861    sqlite3BtreeEnter(p);
002862    assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
002863    assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
002864    if( newFlag>=0 ){
002865      p->pBt->btsFlags &= ~BTS_FAST_SECURE;
002866      p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
002867    }
002868    b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
002869    sqlite3BtreeLeave(p);
002870    return b;
002871  }
002872  
002873  /*
002874  ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
002875  ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
002876  ** is disabled. The default value for the auto-vacuum property is 
002877  ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
002878  */
002879  int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
002880  #ifdef SQLITE_OMIT_AUTOVACUUM
002881    return SQLITE_READONLY;
002882  #else
002883    BtShared *pBt = p->pBt;
002884    int rc = SQLITE_OK;
002885    u8 av = (u8)autoVacuum;
002886  
002887    sqlite3BtreeEnter(p);
002888    if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
002889      rc = SQLITE_READONLY;
002890    }else{
002891      pBt->autoVacuum = av ?1:0;
002892      pBt->incrVacuum = av==2 ?1:0;
002893    }
002894    sqlite3BtreeLeave(p);
002895    return rc;
002896  #endif
002897  }
002898  
002899  /*
002900  ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
002901  ** enabled 1 is returned. Otherwise 0.
002902  */
002903  int sqlite3BtreeGetAutoVacuum(Btree *p){
002904  #ifdef SQLITE_OMIT_AUTOVACUUM
002905    return BTREE_AUTOVACUUM_NONE;
002906  #else
002907    int rc;
002908    sqlite3BtreeEnter(p);
002909    rc = (
002910      (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
002911      (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
002912      BTREE_AUTOVACUUM_INCR
002913    );
002914    sqlite3BtreeLeave(p);
002915    return rc;
002916  #endif
002917  }
002918  
/*
** Apply safety_level as the synchronous setting of the database that
** uses pBt, provided that the user has not chosen a level explicitly
** with "PRAGMA synchronous" and that the level is not already in
** effect.
**
** This routine only exists when the default synchronous levels for
** rollback and WAL mode differ and WAL support is compiled in.  In all
** other builds it is an empty macro.
*/
#if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
    && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db = pBt->db;
  Db *pDb;

  if( db==0 || db->aDb==0 ) return;

  /* Find the entry of db->aDb[] whose Btree sits on top of pBt */
  for(pDb=db->aDb; pDb->pBt==0 || pDb->pBt->pBt!=pBt; pDb++){}

  if( pDb->bSyncSet ) return;                       /* Chosen by the user */
  if( pDb->safety_level==safety_level ) return;     /* Already in effect */
  /* NOTE(review): aDb[1] is skipped - presumably the TEMP database; confirm */
  if( pDb==&db->aDb[1] ) return;

  pDb->safety_level = safety_level;
  sqlite3PagerSetFlags(pBt->pPager,
      pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
}
#else
# define setDefaultSyncFlag(pBt,safety_level)
#endif
002945  
002946  /*
002947  ** Get a reference to pPage1 of the database file.  This will
002948  ** also acquire a readlock on that file.
002949  **
002950  ** SQLITE_OK is returned on success.  If the file is not a
002951  ** well-formed database file, then SQLITE_CORRUPT is returned.
002952  ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
002953  ** is returned if we run out of memory. 
002954  */
002955  static int lockBtree(BtShared *pBt){
002956    int rc;              /* Result code from subfunctions */
002957    MemPage *pPage1;     /* Page 1 of the database file */
002958    int nPage;           /* Number of pages in the database */
002959    int nPageFile = 0;   /* Number of pages in the database file */
002960    int nPageHeader;     /* Number of pages in the database according to hdr */
002961  
002962    assert( sqlite3_mutex_held(pBt->mutex) );
002963    assert( pBt->pPage1==0 );
002964    rc = sqlite3PagerSharedLock(pBt->pPager);
002965    if( rc!=SQLITE_OK ) return rc;
002966    rc = btreeGetPage(pBt, 1, &pPage1, 0);
002967    if( rc!=SQLITE_OK ) return rc;
002968  
002969    /* Do some checking to help insure the file we opened really is
002970    ** a valid database file. 
002971    */
002972    nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
002973    sqlite3PagerPagecount(pBt->pPager, &nPageFile);
002974    if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
002975      nPage = nPageFile;
002976    }
002977    if( nPage>0 ){
002978      u32 pageSize;
002979      u32 usableSize;
002980      u8 *page1 = pPage1->aData;
002981      rc = SQLITE_NOTADB;
002982      /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
002983      ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
002984      ** 61 74 20 33 00. */
002985      if( memcmp(page1, zMagicHeader, 16)!=0 ){
002986        goto page1_init_failed;
002987      }
002988  
002989  #ifdef SQLITE_OMIT_WAL
002990      if( page1[18]>1 ){
002991        pBt->btsFlags |= BTS_READ_ONLY;
002992      }
002993      if( page1[19]>1 ){
002994        goto page1_init_failed;
002995      }
002996  #else
002997      if( page1[18]>2 ){
002998        pBt->btsFlags |= BTS_READ_ONLY;
002999      }
003000      if( page1[19]>2 ){
003001        goto page1_init_failed;
003002      }
003003  
003004      /* If the write version is set to 2, this database should be accessed
003005      ** in WAL mode. If the log is not already open, open it now. Then 
003006      ** return SQLITE_OK and return without populating BtShared.pPage1.
003007      ** The caller detects this and calls this function again. This is
003008      ** required as the version of page 1 currently in the page1 buffer
003009      ** may not be the latest version - there may be a newer one in the log
003010      ** file.
003011      */
003012      if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
003013        int isOpen = 0;
003014        rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
003015        if( rc!=SQLITE_OK ){
003016          goto page1_init_failed;
003017        }else{
003018          setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
003019          if( isOpen==0 ){
003020            releasePageOne(pPage1);
003021            return SQLITE_OK;
003022          }
003023        }
003024        rc = SQLITE_NOTADB;
003025      }else{
003026        setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
003027      }
003028  #endif
003029  
003030      /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
003031      ** fractions and the leaf payload fraction values must be 64, 32, and 32.
003032      **
003033      ** The original design allowed these amounts to vary, but as of
003034      ** version 3.6.0, we require them to be fixed.
003035      */
003036      if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
003037        goto page1_init_failed;
003038      }
003039      /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
003040      ** determined by the 2-byte integer located at an offset of 16 bytes from
003041      ** the beginning of the database file. */
003042      pageSize = (page1[16]<<8) | (page1[17]<<16);
003043      /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
003044      ** between 512 and 65536 inclusive. */
003045      if( ((pageSize-1)&pageSize)!=0
003046       || pageSize>SQLITE_MAX_PAGE_SIZE 
003047       || pageSize<=256 
003048      ){
003049        goto page1_init_failed;
003050      }
003051      assert( (pageSize & 7)==0 );
003052      /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
003053      ** integer at offset 20 is the number of bytes of space at the end of
003054      ** each page to reserve for extensions. 
003055      **
003056      ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
003057      ** determined by the one-byte unsigned integer found at an offset of 20
003058      ** into the database file header. */
003059      usableSize = pageSize - page1[20];
003060      if( (u32)pageSize!=pBt->pageSize ){
003061        /* After reading the first page of the database assuming a page size
003062        ** of BtShared.pageSize, we have discovered that the page-size is
003063        ** actually pageSize. Unlock the database, leave pBt->pPage1 at
003064        ** zero and return SQLITE_OK. The caller will call this function
003065        ** again with the correct page-size.
003066        */
003067        releasePageOne(pPage1);
003068        pBt->usableSize = usableSize;
003069        pBt->pageSize = pageSize;
003070        freeTempSpace(pBt);
003071        rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
003072                                     pageSize-usableSize);
003073        return rc;
003074      }
003075      if( (pBt->db->flags & SQLITE_WriteSchema)==0 && nPage>nPageFile ){
003076        rc = SQLITE_CORRUPT_BKPT;
003077        goto page1_init_failed;
003078      }
003079      /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
003080      ** be less than 480. In other words, if the page size is 512, then the
003081      ** reserved space size cannot exceed 32. */
003082      if( usableSize<480 ){
003083        goto page1_init_failed;
003084      }
003085      pBt->pageSize = pageSize;
003086      pBt->usableSize = usableSize;
003087  #ifndef SQLITE_OMIT_AUTOVACUUM
003088      pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
003089      pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
003090  #endif
003091    }
003092  
003093    /* maxLocal is the maximum amount of payload to store locally for
003094    ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
003096    ** Besides the payload, the cell must store:
003097    **     2-byte pointer to the cell
003098    **     4-byte child pointer
003099    **     9-byte nKey value
003100    **     4-byte nData value
003101    **     4-byte overflow page pointer
003102    ** So a cell consists of a 2-byte pointer, a header which is as much as
003103    ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
003104    ** page pointer.
003105    */
003106    pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
003107    pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
003108    pBt->maxLeaf = (u16)(pBt->usableSize - 35);
003109    pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
003110    if( pBt->maxLocal>127 ){
003111      pBt->max1bytePayload = 127;
003112    }else{
003113      pBt->max1bytePayload = (u8)pBt->maxLocal;
003114    }
003115    assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
003116    pBt->pPage1 = pPage1;
003117    pBt->nPage = nPage;
003118    return SQLITE_OK;
003119  
003120  page1_init_failed:
003121    releasePageOne(pPage1);
003122    pBt->pPage1 = 0;
003123    return rc;
003124  }
003125  
003126  #ifndef NDEBUG
003127  /*
003128  ** Return the number of cursors open on pBt. This is for use
003129  ** in assert() expressions, so it is only compiled if NDEBUG is not
003130  ** defined.
003131  **
003132  ** Only write cursors are counted if wrOnly is true.  If wrOnly is
003133  ** false then all cursors are counted.
003134  **
003135  ** For the purposes of this routine, a cursor is any cursor that
003136  ** is capable of reading or writing to the database.  Cursors that
003137  ** have been tripped into the CURSOR_FAULT state are not counted.
003138  */
003139  static int countValidCursors(BtShared *pBt, int wrOnly){
003140    BtCursor *pCur;
003141    int r = 0;
003142    for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
003143      if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
003144       && pCur->eState!=CURSOR_FAULT ) r++; 
003145    }
003146    return r;
003147  }
003148  #endif
003149  
003150  /*
003151  ** If there are no outstanding cursors and we are not in the middle
003152  ** of a transaction but there is a read lock on the database, then
003153  ** this routine unrefs the first page of the database file which 
003154  ** has the effect of releasing the read lock.
003155  **
003156  ** If there is a transaction in progress, this routine is a no-op.
003157  */
003158  static void unlockBtreeIfUnused(BtShared *pBt){
003159    assert( sqlite3_mutex_held(pBt->mutex) );
003160    assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
003161    if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
003162      MemPage *pPage1 = pBt->pPage1;
003163      assert( pPage1->aData );
003164      assert( sqlite3PagerRefcount(pBt->pPager)==1 );
003165      pBt->pPage1 = 0;
003166      releasePageOne(pPage1);
003167    }
003168  }
003169  
003170  /*
003171  ** If pBt points to an empty file then convert that empty file
003172  ** into a new empty database by initializing the first page of
003173  ** the database.
003174  */
003175  static int newDatabase(BtShared *pBt){
003176    MemPage *pP1;
003177    unsigned char *data;
003178    int rc;
003179  
003180    assert( sqlite3_mutex_held(pBt->mutex) );
003181    if( pBt->nPage>0 ){
003182      return SQLITE_OK;
003183    }
003184    pP1 = pBt->pPage1;
003185    assert( pP1!=0 );
003186    data = pP1->aData;
003187    rc = sqlite3PagerWrite(pP1->pDbPage);
003188    if( rc ) return rc;
003189    memcpy(data, zMagicHeader, sizeof(zMagicHeader));
003190    assert( sizeof(zMagicHeader)==16 );
003191    data[16] = (u8)((pBt->pageSize>>8)&0xff);
003192    data[17] = (u8)((pBt->pageSize>>16)&0xff);
003193    data[18] = 1;
003194    data[19] = 1;
003195    assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
003196    data[20] = (u8)(pBt->pageSize - pBt->usableSize);
003197    data[21] = 64;
003198    data[22] = 32;
003199    data[23] = 32;
003200    memset(&data[24], 0, 100-24);
003201    zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
003202    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
003203  #ifndef SQLITE_OMIT_AUTOVACUUM
003204    assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
003205    assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
003206    put4byte(&data[36 + 4*4], pBt->autoVacuum);
003207    put4byte(&data[36 + 7*4], pBt->incrVacuum);
003208  #endif
003209    pBt->nPage = 1;
003210    data[31] = 1;
003211    return SQLITE_OK;
003212  }
003213  
003214  /*
003215  ** Initialize the first page of the database file (creating a database
003216  ** consisting of a single page and no schema objects). Return SQLITE_OK
003217  ** if successful, or an SQLite error code otherwise.
003218  */
003219  int sqlite3BtreeNewDb(Btree *p){
003220    int rc;
003221    sqlite3BtreeEnter(p);
003222    p->pBt->nPage = 0;
003223    rc = newDatabase(p->pBt);
003224    sqlite3BtreeLeave(p);
003225    return rc;
003226  }
003227  
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more, an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any 
** changes to the database.  None of the following routines 
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is 
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** Return value: SQLITE_OK on success.  SQLITE_READONLY if a write
** transaction is requested while BTS_READ_ONLY is set.
** SQLITE_LOCKED_SHAREDCACHE if another connection using the same shared
** b-tree blocks the request.  Otherwise the error code produced by one of
** the helper routines called below.
*/
int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  BtShared *pBt = p->pBt;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;
    /* If another database handle has already opened a write transaction 
    ** on this shared-btree structure and a second write transaction is
    ** requested, return SQLITE_LOCKED.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      /* An exclusive transaction is requested: any lock held by a
      ** different Btree handle on this shared structure blocks it. */
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      /* Record which connection is in the way before reporting failure */
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on 
  ** page 1. So if some other shared-cache client already has a write-lock 
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Remember whether the database had zero pages when this attempt began */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database 
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      /* BTS_READ_ONLY is tested again here, after lockBtree() has run */
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        /* Second argument is true only for an exclusive (wrflag>1) request */
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          /* No-op unless pBt->nPage is zero */
          rc = newDatabase(pBt);
        }
      }
    }
  
    if( rc!=SQLITE_OK ){
      /* On failure, give back the page 1 reference if nothing needs it */
      unlockBtreeIfUnused(pBt);
    }
  /* Retry only when the primary result code is SQLITE_BUSY (the 0xFF mask
  ** strips extended-code bits), no transaction is open on pBt, and
  ** btreeInvokeBusyHandler() returns non-zero. */
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      /* This handle had no transaction before: count the new one and,
      ** for a sharable handle, push its lock entry onto the head of the
      ** pBt->pLock list as a read lock. */
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* Raise the shared transaction state if this handle's is now higher */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }


trans_begun:
  if( rc==SQLITE_OK && wrflag ){
    /* This call makes sure that the pager has the correct number of
    ** open savepoints. If the second parameter is greater than 0 and
    ** the sub-journal is not already open, then it will be opened here.
    */
    rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
003402  
003403  #ifndef SQLITE_OMIT_AUTOVACUUM
003404  
003405  /*
003406  ** Set the pointer-map entries for all children of page pPage. Also, if
003407  ** pPage contains cells that point to overflow pages, set the pointer
003408  ** map entries for the overflow pages as well.
003409  */
003410  static int setChildPtrmaps(MemPage *pPage){
003411    int i;                             /* Counter variable */
003412    int nCell;                         /* Number of cells in page pPage */
003413    int rc;                            /* Return code */
003414    BtShared *pBt = pPage->pBt;
003415    Pgno pgno = pPage->pgno;
003416  
003417    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003418    rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
003419    if( rc!=SQLITE_OK ) return rc;
003420    nCell = pPage->nCell;
003421  
003422    for(i=0; i<nCell; i++){
003423      u8 *pCell = findCell(pPage, i);
003424  
003425      ptrmapPutOvflPtr(pPage, pCell, &rc);
003426  
003427      if( !pPage->leaf ){
003428        Pgno childPgno = get4byte(pCell);
003429        ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003430      }
003431    }
003432  
003433    if( !pPage->leaf ){
003434      Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
003435      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
003436    }
003437  
003438    return rc;
003439  }
003440  
003441  /*
003442  ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
003443  ** that it points to iTo. Parameter eType describes the type of pointer to
003444  ** be modified, as  follows:
003445  **
003446  ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
003447  **                   page of pPage.
003448  **
003449  ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
003450  **                   page pointed to by one of the cells on pPage.
003451  **
003452  ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
003453  **                   overflow page in the list.
003454  */
003455  static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
003456    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
003457    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
003458    if( eType==PTRMAP_OVERFLOW2 ){
003459      /* The pointer is always the first 4 bytes of the page in this case.  */
003460      if( get4byte(pPage->aData)!=iFrom ){
003461        return SQLITE_CORRUPT_PGNO(pPage->pgno);
003462      }
003463      put4byte(pPage->aData, iTo);
003464    }else{
003465      int i;
003466      int nCell;
003467      int rc;
003468  
003469      rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
003470      if( rc ) return rc;
003471      nCell = pPage->nCell;
003472  
003473      for(i=0; i<nCell; i++){
003474        u8 *pCell = findCell(pPage, i);
003475        if( eType==PTRMAP_OVERFLOW1 ){
003476          CellInfo info;
003477          pPage->xParseCell(pPage, pCell, &info);
003478          if( info.nLocal<info.nPayload ){
003479            if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
003480              return SQLITE_CORRUPT_PGNO(pPage->pgno);
003481            }
003482            if( iFrom==get4byte(pCell+info.nSize-4) ){
003483              put4byte(pCell+info.nSize-4, iTo);
003484              break;
003485            }
003486          }
003487        }else{
003488          if( get4byte(pCell)==iFrom ){
003489            put4byte(pCell, iTo);
003490            break;
003491          }
003492        }
003493      }
003494    
003495      if( i==nCell ){
003496        if( eType!=PTRMAP_BTREE || 
003497            get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
003498          return SQLITE_CORRUPT_PGNO(pPage->pgno);
003499        }
003500        put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
003501      }
003502    }
003503    return SQLITE_OK;
003504  }
003505  
003506  
003507  /*
003508  ** Move the open database page pDbPage to location iFreePage in the 
003509  ** database. The pDbPage reference remains valid.
003510  **
003511  ** The isCommit flag indicates that there is no need to remember that
003512  ** the journal needs to be sync()ed before database page pDbPage->pgno 
003513  ** can be written to. The caller has already promised not to write to that
003514  ** page.
003515  */
003516  static int relocatePage(
003517    BtShared *pBt,           /* Btree */
003518    MemPage *pDbPage,        /* Open page to move */
003519    u8 eType,                /* Pointer map 'type' entry for pDbPage */
003520    Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
003521    Pgno iFreePage,          /* The location to move pDbPage to */
003522    int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
003523  ){
003524    MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
003525    Pgno iDbPage = pDbPage->pgno;
003526    Pager *pPager = pBt->pPager;
003527    int rc;
003528  
003529    assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
003530        eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
003531    assert( sqlite3_mutex_held(pBt->mutex) );
003532    assert( pDbPage->pBt==pBt );
003533  
003534    /* Move page iDbPage from its current location to page number iFreePage */
003535    TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
003536        iDbPage, iFreePage, iPtrPage, eType));
003537    rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
003538    if( rc!=SQLITE_OK ){
003539      return rc;
003540    }
003541    pDbPage->pgno = iFreePage;
003542  
003543    /* If pDbPage was a btree-page, then it may have child pages and/or cells
003544    ** that point to overflow pages. The pointer map entries for all these
003545    ** pages need to be changed.
003546    **
003547    ** If pDbPage is an overflow page, then the first 4 bytes may store a
003548    ** pointer to a subsequent overflow page. If this is the case, then
003549    ** the pointer map needs to be updated for the subsequent overflow page.
003550    */
003551    if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
003552      rc = setChildPtrmaps(pDbPage);
003553      if( rc!=SQLITE_OK ){
003554        return rc;
003555      }
003556    }else{
003557      Pgno nextOvfl = get4byte(pDbPage->aData);
003558      if( nextOvfl!=0 ){
003559        ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
003560        if( rc!=SQLITE_OK ){
003561          return rc;
003562        }
003563      }
003564    }
003565  
003566    /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
003567    ** that it points at iFreePage. Also fix the pointer map entry for
003568    ** iPtrPage.
003569    */
003570    if( eType!=PTRMAP_ROOTPAGE ){
003571      rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
003572      if( rc!=SQLITE_OK ){
003573        return rc;
003574      }
003575      rc = sqlite3PagerWrite(pPtrPage->pDbPage);
003576      if( rc!=SQLITE_OK ){
003577        releasePage(pPtrPage);
003578        return rc;
003579      }
003580      rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
003581      releasePage(pPtrPage);
003582      if( rc==SQLITE_OK ){
003583        ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
003584      }
003585    }
003586    return rc;
003587  }
003588  
/* Forward declaration required by incrVacuumStep(), which calls
** allocateBtreePage().  The final (u8) argument takes one of the
** BTALLOC_* values. */
static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
003591  
/*
** Perform a single step of an incremental-vacuum. If successful, return
** SQLITE_OK. If there is no work to do (and therefore no point in 
** calling this function again), return SQLITE_DONE. Or, if an error 
** occurs, return some other error code.
**
** More specifically, this function attempts to re-organize the database so 
** that the last page of the file currently in use is no longer in use.
**
** Parameter nFin is the number of pages that this database would contain
** were this function called until it returns SQLITE_DONE.
**
** Parameter iLastPg is the page that this step tries to vacate. It must
** be greater than nFin.
**
** If the bCommit parameter is non-zero, this function assumes that the 
** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 
** or an error. bCommit is passed true for an auto-vacuum-on-commit 
** operation, or false for an incremental vacuum.
*/
static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  Pgno nFreeList;           /* Number of pages still on the free-list */
  int rc;

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( iLastPg>nFin );

  /* Pointer-map pages and the pending-byte page are never moved or
  ** freed here; for those, only the size bookkeeping below applies. */
  if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
    u8 eType;
    Pgno iPtrPage;

    /* The free-page count is the 4-byte field at offset 36 of page 1.
    ** With no free pages there is nowhere to move iLastPg to. */
    nFreeList = get4byte(&pBt->pPage1->aData[36]);
    if( nFreeList==0 ){
      return SQLITE_DONE;
    }

    rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    /* A root page in this position is treated as corruption */
    if( eType==PTRMAP_ROOTPAGE ){
      return SQLITE_CORRUPT_BKPT;
    }

    if( eType==PTRMAP_FREEPAGE ){
      if( bCommit==0 ){
        /* Remove the page from the files free-list. This is not required
        ** if bCommit is non-zero. In that case, the free-list will be
        ** truncated to zero after this function returns, so it doesn't 
        ** matter if it still contains some garbage entries.
        */
        Pgno iFreePg;
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
        if( rc!=SQLITE_OK ){
          return rc;
        }
        assert( iFreePg==iLastPg );
        releasePage(pFreePg);
      }
    } else {
      Pgno iFreePg;             /* Index of free page to move pLastPg to */
      MemPage *pLastPg;
      u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
      Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */

      rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* If bCommit is zero, this loop runs exactly once and page pLastPg
      ** is swapped with the first free page pulled off the free list.
      **
      ** On the other hand, if bCommit is non-zero, then keep
      ** looping until a free-page located within the first nFin pages
      ** of the file is found.
      */
      if( bCommit==0 ){
        eMode = BTALLOC_LE;
        iNear = nFin;
      }
      do {
        MemPage *pFreePg;
        rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
        if( rc!=SQLITE_OK ){
          /* pLastPg is still referenced; release it before bailing out */
          releasePage(pLastPg);
          return rc;
        }
        /* Only the page number iFreePg is needed, not the page itself */
        releasePage(pFreePg);
      }while( bCommit && iFreePg>nFin );
      assert( iFreePg<iLastPg );
      
      /* Move the content of iLastPg into the free slot just obtained */
      rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
      releasePage(pLastPg);
      if( rc!=SQLITE_OK ){
        return rc;
      }
    }
  }

  /* For an incremental vacuum (bCommit==0), shrink the recorded database
  ** size: step iLastPg down by one and then past any pending-byte or
  ** pointer-map pages, store the result in pBt->nPage and set
  ** pBt->bDoTruncate. Nothing is updated here when bCommit is non-zero.
  */
  if( bCommit==0 ){
    do {
      iLastPg--;
    }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
    pBt->bDoTruncate = 1;
    pBt->nPage = iLastPg;
  }
  return SQLITE_OK;
}
003699  
003700  /*
003701  ** The database opened by the first argument is an auto-vacuum database
003702  ** nOrig pages in size containing nFree free pages. Return the expected 
003703  ** size of the database in pages following an auto-vacuum operation.
003704  */
003705  static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
003706    int nEntry;                     /* Number of entries on one ptrmap page */
003707    Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
003708    Pgno nFin;                      /* Return value */
003709  
003710    nEntry = pBt->usableSize/5;
003711    nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
003712    nFin = nOrig - nFree - nPtrmap;
003713    if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
003714      nFin--;
003715    }
003716    while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
003717      nFin--;
003718    }
003719  
003720    return nFin;
003721  }
003722  
003723  /*
003724  ** A write-transaction must be opened before calling this function.
003725  ** It performs a single unit of work towards an incremental vacuum.
003726  **
003727  ** If the incremental vacuum is finished after this function has run,
003728  ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
003729  ** SQLITE_OK is returned. Otherwise an SQLite error code. 
003730  */
003731  int sqlite3BtreeIncrVacuum(Btree *p){
003732    int rc;
003733    BtShared *pBt = p->pBt;
003734  
003735    sqlite3BtreeEnter(p);
003736    assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
003737    if( !pBt->autoVacuum ){
003738      rc = SQLITE_DONE;
003739    }else{
003740      Pgno nOrig = btreePagecount(pBt);
003741      Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
003742      Pgno nFin = finalDbSize(pBt, nOrig, nFree);
003743  
003744      if( nOrig<nFin ){
003745        rc = SQLITE_CORRUPT_BKPT;
003746      }else if( nFree>0 ){
003747        rc = saveAllCursors(pBt, 0, 0);
003748        if( rc==SQLITE_OK ){
003749          invalidateAllOverflowCache(pBt);
003750          rc = incrVacuumStep(pBt, nFin, nOrig, 0);
003751        }
003752        if( rc==SQLITE_OK ){
003753          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
003754          put4byte(&pBt->pPage1->aData[28], pBt->nPage);
003755        }
003756      }else{
003757        rc = SQLITE_DONE;
003758      }
003759    }
003760    sqlite3BtreeLeave(p);
003761    return rc;
003762  }
003763  
/*
** This routine is called prior to sqlite3PagerCommit when a transaction
** is committed for an auto-vacuum database.
**
** Nothing is reorganized when pBt->incrVacuum is set. Otherwise pages
** are moved so that only the first nFin pages of the file are in use,
** where nFin is the value computed by finalDbSize(). If the free-list
** was not empty, the free-list fields and the page-count field in the
** header on page 1 are rewritten, pBt->nPage is set to nFin and
** pBt->bDoTruncate is set.
**
** Returns SQLITE_OK on success. Returns a corruption error if the last
** page of the database is a pointer-map page or the pending-byte page,
** or if the computed final size exceeds the current size. On any other
** failure the pager is rolled back and the error code is returned.
*/
static int autoVacuumCommit(BtShared *pBt){
  int rc = SQLITE_OK;
  Pager *pPager = pBt->pPager;
  VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )

  assert( sqlite3_mutex_held(pBt->mutex) );
  invalidateAllOverflowCache(pBt);
  assert(pBt->autoVacuum);
  if( !pBt->incrVacuum ){
    Pgno nFin;         /* Number of pages in database after autovacuuming */
    Pgno nFree;        /* Number of pages on the freelist initially */
    Pgno iFree;        /* The next page to be freed */
    Pgno nOrig;        /* Database size before freeing */

    nOrig = btreePagecount(pBt);
    if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
      /* It is not possible to create a database for which the final page
      ** is either a pointer-map page or the pending-byte page. If one
      ** is encountered, this indicates corruption.
      */
      return SQLITE_CORRUPT_BKPT;
    }

    /* Free-page count is the 4-byte header field at offset 36 of page 1 */
    nFree = get4byte(&pBt->pPage1->aData[36]);
    nFin = finalDbSize(pBt, nOrig, nFree);
    if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
    if( nFin<nOrig ){
      rc = saveAllCursors(pBt, 0, 0);
    }
    /* Vacate pages from the end of the file down to (but excluding) nFin.
    ** The loop stops as soon as a step returns anything but SQLITE_OK. */
    for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
      rc = incrVacuumStep(pBt, nFin, iFree, 1);
    }
    if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
      /* Clear the two free-list header fields (offsets 32 and 36) and
      ** store the new page count at offset 28.
      ** NOTE(review): these writes are made even if sqlite3PagerWrite()
      ** fails; in that case the pager is rolled back just below. */
      rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
      put4byte(&pBt->pPage1->aData[32], 0);
      put4byte(&pBt->pPage1->aData[36], 0);
      put4byte(&pBt->pPage1->aData[28], nFin);
      pBt->bDoTruncate = 1;
      pBt->nPage = nFin;
    }
    if( rc!=SQLITE_OK ){
      sqlite3PagerRollback(pPager);
    }
  }

  /* The pager reference count must not have grown during this routine */
  assert( nRef>=sqlite3PagerRefcount(pPager) );
  return rc;
}
003821  
003822  #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
003823  # define setChildPtrmaps(x) SQLITE_OK
003824  #endif
003825  
003826  /*
003827  ** This routine does the first phase of a two-phase commit.  This routine
003828  ** causes a rollback journal to be created (if it does not already exist)
003829  ** and populated with enough information so that if a power loss occurs
003830  ** the database can be restored to its original state by playing back
003831  ** the journal.  Then the contents of the journal are flushed out to
003832  ** the disk.  After the journal is safely on oxide, the changes to the
003833  ** database are written into the database file and flushed to oxide.
003834  ** At the end of this call, the rollback journal still exists on the
003835  ** disk and we are still holding all locks, so the transaction has not
003836  ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
003837  ** commit process.
003838  **
003839  ** This call is a no-op if no write-transaction is currently active on pBt.
003840  **
003841  ** Otherwise, sync the database file for the btree pBt. zMaster points to
003842  ** the name of a master journal file that should be written into the
003843  ** individual journal file, or is NULL, indicating no master journal file 
003844  ** (single database transaction).
003845  **
003846  ** When this is called, the master journal should already have been
003847  ** created, populated with this journal pointer and synced to disk.
003848  **
** Once this routine has returned, the only thing required to commit
003850  ** the write-transaction for this database file is to delete the journal.
003851  */
003852  int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
003853    int rc = SQLITE_OK;
003854    if( p->inTrans==TRANS_WRITE ){
003855      BtShared *pBt = p->pBt;
003856      sqlite3BtreeEnter(p);
003857  #ifndef SQLITE_OMIT_AUTOVACUUM
003858      if( pBt->autoVacuum ){
003859        rc = autoVacuumCommit(pBt);
003860        if( rc!=SQLITE_OK ){
003861          sqlite3BtreeLeave(p);
003862          return rc;
003863        }
003864      }
003865      if( pBt->bDoTruncate ){
003866        sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
003867      }
003868  #endif
003869      rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
003870      sqlite3BtreeLeave(p);
003871    }
003872    return rc;
003873  }
003874  
003875  /*
003876  ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
003877  ** at the conclusion of a transaction.
003878  */
003879  static void btreeEndTransaction(Btree *p){
003880    BtShared *pBt = p->pBt;
003881    sqlite3 *db = p->db;
003882    assert( sqlite3BtreeHoldsMutex(p) );
003883  
003884  #ifndef SQLITE_OMIT_AUTOVACUUM
003885    pBt->bDoTruncate = 0;
003886  #endif
003887    if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
003888      /* If there are other active statements that belong to this database
003889      ** handle, downgrade to a read-only transaction. The other statements
003890      ** may still be reading from the database.  */
003891      downgradeAllSharedCacheTableLocks(p);
003892      p->inTrans = TRANS_READ;
003893    }else{
003894      /* If the handle had any kind of transaction open, decrement the 
003895      ** transaction count of the shared btree. If the transaction count 
003896      ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
003897      ** call below will unlock the pager.  */
003898      if( p->inTrans!=TRANS_NONE ){
003899        clearAllSharedCacheTableLocks(p);
003900        pBt->nTransaction--;
003901        if( 0==pBt->nTransaction ){
003902          pBt->inTransaction = TRANS_NONE;
003903        }
003904      }
003905  
003906      /* Set the current transaction state to TRANS_NONE and unlock the 
003907      ** pager if this call closed the only read or write transaction.  */
003908      p->inTrans = TRANS_NONE;
003909      unlockBtreeIfUnused(pBt);
003910    }
003911  
003912    btreeIntegrity(p);
003913  }
003914  
003915  /*
003916  ** Commit the transaction currently in progress.
003917  **
003918  ** This routine implements the second phase of a 2-phase commit.  The
003919  ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
003920  ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
003921  ** routine did all the work of writing information out to disk and flushing the
003922  ** contents so that they are written onto the disk platter.  All this
** routine has to do is delete or truncate or zero the header in
** the rollback journal (which causes the transaction to commit) and
003925  ** drop locks.
003926  **
003927  ** Normally, if an error occurs while the pager layer is attempting to 
003928  ** finalize the underlying journal file, this function returns an error and
003929  ** the upper layer will attempt a rollback. However, if the second argument
003930  ** is non-zero then this b-tree transaction is part of a multi-file 
003931  ** transaction. In this case, the transaction has already been committed 
003932  ** (by deleting a master journal file) and the caller will ignore this 
** function's return code. So, even if an error occurs in the pager layer,
** reset the b-tree object's internal state to indicate that the write
003935  ** transaction has been closed. This is quite safe, as the pager will have
003936  ** transitioned to the error state.
003937  **
003938  ** This will release the write lock on the database file.  If there
003939  ** are no active cursors, it also releases the read lock.
003940  */
003941  int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
003942  
003943    if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
003944    sqlite3BtreeEnter(p);
003945    btreeIntegrity(p);
003946  
003947    /* If the handle has a write-transaction open, commit the shared-btrees 
003948    ** transaction and set the shared state to TRANS_READ.
003949    */
003950    if( p->inTrans==TRANS_WRITE ){
003951      int rc;
003952      BtShared *pBt = p->pBt;
003953      assert( pBt->inTransaction==TRANS_WRITE );
003954      assert( pBt->nTransaction>0 );
003955      rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
003956      if( rc!=SQLITE_OK && bCleanup==0 ){
003957        sqlite3BtreeLeave(p);
003958        return rc;
003959      }
003960      p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
003961      pBt->inTransaction = TRANS_READ;
003962      btreeClearHasContent(pBt);
003963    }
003964  
003965    btreeEndTransaction(p);
003966    sqlite3BtreeLeave(p);
003967    return SQLITE_OK;
003968  }
003969  
003970  /*
003971  ** Do both phases of a commit.
003972  */
003973  int sqlite3BtreeCommit(Btree *p){
003974    int rc;
003975    sqlite3BtreeEnter(p);
003976    rc = sqlite3BtreeCommitPhaseOne(p, 0);
003977    if( rc==SQLITE_OK ){
003978      rc = sqlite3BtreeCommitPhaseTwo(p, 0);
003979    }
003980    sqlite3BtreeLeave(p);
003981    return rc;
003982  }
003983  
003984  /*
003985  ** This routine sets the state to CURSOR_FAULT and the error
003986  ** code to errCode for every cursor on any BtShared that pBtree
003987  ** references.  Or if the writeOnly flag is set to 1, then only
003988  ** trip write cursors and leave read cursors unchanged.
003989  **
003990  ** Every cursor is a candidate to be tripped, including cursors
003991  ** that belong to other database connections that happen to be
003992  ** sharing the cache with pBtree.
003993  **
003994  ** This routine gets called when a rollback occurs. If the writeOnly
003995  ** flag is true, then only write-cursors need be tripped - read-only
003996  ** cursors save their current positions so that they may continue 
003997  ** following the rollback. Or, if writeOnly is false, all cursors are 
003998  ** tripped. In general, writeOnly is false if the transaction being
003999  ** rolled back modified the database schema. In this case b-tree root
004000  ** pages may be moved or deleted from the database altogether, making
004001  ** it unsafe for read cursors to continue.
004002  **
004003  ** If the writeOnly flag is true and an error is encountered while 
004004  ** saving the current position of a read-only cursor, all cursors, 
004005  ** including all read-cursors are tripped.
004006  **
004007  ** SQLITE_OK is returned if successful, or if an error occurs while
004008  ** saving a cursor position, an SQLite error code.
004009  */
004010  int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
004011    BtCursor *p;
004012    int rc = SQLITE_OK;
004013  
004014    assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
004015    if( pBtree ){
004016      sqlite3BtreeEnter(pBtree);
004017      for(p=pBtree->pBt->pCursor; p; p=p->pNext){
004018        if( writeOnly && (p->curFlags & BTCF_WriteFlag)==0 ){
004019          if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
004020            rc = saveCursorPosition(p);
004021            if( rc!=SQLITE_OK ){
004022              (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
004023              break;
004024            }
004025          }
004026        }else{
004027          sqlite3BtreeClearCursor(p);
004028          p->eState = CURSOR_FAULT;
004029          p->skipNext = errCode;
004030        }
004031        btreeReleaseAllCursorPages(p);
004032      }
004033      sqlite3BtreeLeave(pBtree);
004034    }
004035    return rc;
004036  }
004037  
004038  /*
004039  ** Rollback the transaction in progress.
004040  **
004041  ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
004042  ** Only write cursors are tripped if writeOnly is true but all cursors are
004043  ** tripped if writeOnly is false.  Any attempt to use
004044  ** a tripped cursor will result in an error.
004045  **
004046  ** This will release the write lock on the database file.  If there
004047  ** are no active cursors, it also releases the read lock.
004048  */
004049  int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
004050    int rc;
004051    BtShared *pBt = p->pBt;
004052    MemPage *pPage1;
004053  
004054    assert( writeOnly==1 || writeOnly==0 );
004055    assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
004056    sqlite3BtreeEnter(p);
004057    if( tripCode==SQLITE_OK ){
004058      rc = tripCode = saveAllCursors(pBt, 0, 0);
004059      if( rc ) writeOnly = 0;
004060    }else{
004061      rc = SQLITE_OK;
004062    }
004063    if( tripCode ){
004064      int rc2 = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
004065      assert( rc==SQLITE_OK || (writeOnly==0 && rc2==SQLITE_OK) );
004066      if( rc2!=SQLITE_OK ) rc = rc2;
004067    }
004068    btreeIntegrity(p);
004069  
004070    if( p->inTrans==TRANS_WRITE ){
004071      int rc2;
004072  
004073      assert( TRANS_WRITE==pBt->inTransaction );
004074      rc2 = sqlite3PagerRollback(pBt->pPager);
004075      if( rc2!=SQLITE_OK ){
004076        rc = rc2;
004077      }
004078  
004079      /* The rollback may have destroyed the pPage1->aData value.  So
004080      ** call btreeGetPage() on page 1 again to make
004081      ** sure pPage1->aData is set correctly. */
004082      if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
004083        int nPage = get4byte(28+(u8*)pPage1->aData);
004084        testcase( nPage==0 );
004085        if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
004086        testcase( pBt->nPage!=nPage );
004087        pBt->nPage = nPage;
004088        releasePageOne(pPage1);
004089      }
004090      assert( countValidCursors(pBt, 1)==0 );
004091      pBt->inTransaction = TRANS_READ;
004092      btreeClearHasContent(pBt);
004093    }
004094  
004095    btreeEndTransaction(p);
004096    sqlite3BtreeLeave(p);
004097    return rc;
004098  }
004099  
004100  /*
004101  ** Start a statement subtransaction. The subtransaction can be rolled
004102  ** back independently of the main transaction. You must start a transaction 
004103  ** before starting a subtransaction. The subtransaction is ended automatically 
004104  ** if the main transaction commits or rolls back.
004105  **
004106  ** Statement subtransactions are used around individual SQL statements
004107  ** that are contained within a BEGIN...COMMIT block.  If a constraint
004108  ** error occurs within the statement, the effect of that one statement
004109  ** can be rolled back without having to rollback the entire transaction.
004110  **
004111  ** A statement sub-transaction is implemented as an anonymous savepoint. The
004112  ** value passed as the second parameter is the total number of savepoints,
004113  ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
004114  ** are no active savepoints and no other statement-transactions open,
004115  ** iStatement is 1. This anonymous savepoint can be released or rolled back
004116  ** using the sqlite3BtreeSavepoint() function.
004117  */
004118  int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
004119    int rc;
004120    BtShared *pBt = p->pBt;
004121    sqlite3BtreeEnter(p);
004122    assert( p->inTrans==TRANS_WRITE );
004123    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
004124    assert( iStatement>0 );
004125    assert( iStatement>p->db->nSavepoint );
004126    assert( pBt->inTransaction==TRANS_WRITE );
004127    /* At the pager level, a statement transaction is a savepoint with
004128    ** an index greater than all savepoints created explicitly using
004129    ** SQL statements. It is illegal to open, release or rollback any
004130    ** such savepoints while the statement transaction savepoint is active.
004131    */
004132    rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
004133    sqlite3BtreeLeave(p);
004134    return rc;
004135  }
004136  
004137  /*
004138  ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
004139  ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
004140  ** savepoint identified by parameter iSavepoint, depending on the value 
004141  ** of op.
004142  **
004143  ** Normally, iSavepoint is greater than or equal to zero. However, if op is
004144  ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 
004145  ** contents of the entire transaction are rolled back. This is different
004146  ** from a normal transaction rollback, as no locks are released and the
004147  ** transaction remains open.
004148  */
004149  int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
004150    int rc = SQLITE_OK;
004151    if( p && p->inTrans==TRANS_WRITE ){
004152      BtShared *pBt = p->pBt;
004153      assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
004154      assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
004155      sqlite3BtreeEnter(p);
004156      if( op==SAVEPOINT_ROLLBACK ){
004157        rc = saveAllCursors(pBt, 0, 0);
004158      }
004159      if( rc==SQLITE_OK ){
004160        rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
004161      }
004162      if( rc==SQLITE_OK ){
004163        if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
004164          pBt->nPage = 0;
004165        }
004166        rc = newDatabase(pBt);
004167        pBt->nPage = get4byte(28 + pBt->pPage1->aData);
004168  
004169        /* The database size was written into the offset 28 of the header
004170        ** when the transaction started, so we know that the value at offset
004171        ** 28 is nonzero. */
004172        assert( pBt->nPage>0 );
004173      }
004174      sqlite3BtreeLeave(p);
004175    }
004176    return rc;
004177  }
004178  
004179  /*
004180  ** Create a new cursor for the BTree whose root is on the page
004181  ** iTable. If a read-only cursor is requested, it is assumed that
004182  ** the caller already has at least a read-only transaction open
004183  ** on the database already. If a write-cursor is requested, then
004184  ** the caller is assumed to have an open write transaction.
004185  **
004186  ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
004187  ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
004188  ** can be used for reading or for writing if other conditions for writing
004189  ** are also met.  These are the conditions that must be met in order
004190  ** for writing to be allowed:
004191  **
004192  ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
004193  **
004194  ** 2:  Other database connections that share the same pager cache
004195  **     but which are not in the READ_UNCOMMITTED state may not have
004196  **     cursors open with wrFlag==0 on the same table.  Otherwise
004197  **     the changes made by this write cursor would be visible to
004198  **     the read cursors in the other database connection.
004199  **
004200  ** 3:  The database must be writable (not on read-only media)
004201  **
004202  ** 4:  There must be an active transaction.
004203  **
004204  ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
004205  ** is set.  If FORDELETE is set, that is a hint to the implementation that
004206  ** this cursor will only be used to seek to and delete entries of an index
004207  ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
004208  ** this implementation.  But in a hypothetical alternative storage engine 
004209  ** in which index entries are automatically deleted when corresponding table
004210  ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
004211  ** operations on this cursor can be no-ops and all READ operations can 
004212  ** return a null row (2-bytes: 0x01 0x00).
004213  **
004214  ** No checking is done to make sure that page iTable really is the
004215  ** root page of a b-tree.  If it is not, then the cursor acquired
004216  ** will not work correctly.
004217  **
004218  ** It is assumed that the sqlite3BtreeCursorZero() has been called
004219  ** on pCur to initialize the memory space prior to invoking this routine.
004220  */
004221  static int btreeCursor(
004222    Btree *p,                              /* The btree */
004223    int iTable,                            /* Root page of table to open */
004224    int wrFlag,                            /* 1 to write. 0 read-only */
004225    struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
004226    BtCursor *pCur                         /* Space for new cursor */
004227  ){
004228    BtShared *pBt = p->pBt;                /* Shared b-tree handle */
004229    BtCursor *pX;                          /* Looping over other all cursors */
004230  
004231    assert( sqlite3BtreeHoldsMutex(p) );
004232    assert( wrFlag==0 
004233         || wrFlag==BTREE_WRCSR 
004234         || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE) 
004235    );
004236  
004237    /* The following assert statements verify that if this is a sharable 
004238    ** b-tree database, the connection is holding the required table locks, 
004239    ** and that no other connection has any open cursor that conflicts with 
004240    ** this lock.  */
004241    assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1)) );
004242    assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
004243  
004244    /* Assert that the caller has opened the required transaction. */
004245    assert( p->inTrans>TRANS_NONE );
004246    assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
004247    assert( pBt->pPage1 && pBt->pPage1->aData );
004248    assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );
004249  
004250    if( wrFlag ){
004251      allocateTempSpace(pBt);
004252      if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
004253    }
004254    if( iTable==1 && btreePagecount(pBt)==0 ){
004255      assert( wrFlag==0 );
004256      iTable = 0;
004257    }
004258  
004259    /* Now that no other errors can occur, finish filling in the BtCursor
004260    ** variables and link the cursor into the BtShared list.  */
004261    pCur->pgnoRoot = (Pgno)iTable;
004262    pCur->iPage = -1;
004263    pCur->pKeyInfo = pKeyInfo;
004264    pCur->pBtree = p;
004265    pCur->pBt = pBt;
004266    pCur->curFlags = wrFlag ? BTCF_WriteFlag : 0;
004267    pCur->curPagerFlags = wrFlag ? 0 : PAGER_GET_READONLY;
004268    /* If there are two or more cursors on the same btree, then all such
004269    ** cursors *must* have the BTCF_Multiple flag set. */
004270    for(pX=pBt->pCursor; pX; pX=pX->pNext){
004271      if( pX->pgnoRoot==(Pgno)iTable ){
004272        pX->curFlags |= BTCF_Multiple;
004273        pCur->curFlags |= BTCF_Multiple;
004274      }
004275    }
004276    pCur->pNext = pBt->pCursor;
004277    pBt->pCursor = pCur;
004278    pCur->eState = CURSOR_INVALID;
004279    return SQLITE_OK;
004280  }
004281  int sqlite3BtreeCursor(
004282    Btree *p,                                   /* The btree */
004283    int iTable,                                 /* Root page of table to open */
004284    int wrFlag,                                 /* 1 to write. 0 read-only */
004285    struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
004286    BtCursor *pCur                              /* Write new cursor here */
004287  ){
004288    int rc;
004289    if( iTable<1 ){
004290      rc = SQLITE_CORRUPT_BKPT;
004291    }else{
004292      sqlite3BtreeEnter(p);
004293      rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
004294      sqlite3BtreeLeave(p);
004295    }
004296    return rc;
004297  }
004298  
004299  /*
004300  ** Return the size of a BtCursor object in bytes.
004301  **
** This interface is needed so that users of cursors can preallocate
004303  ** sufficient storage to hold a cursor.  The BtCursor object is opaque
004304  ** to users so they cannot do the sizeof() themselves - they must call
004305  ** this routine.
004306  */
004307  int sqlite3BtreeCursorSize(void){
004308    return ROUND8(sizeof(BtCursor));
004309  }
004310  
004311  /*
004312  ** Initialize memory that will be converted into a BtCursor object.
004313  **
004314  ** The simple approach here would be to memset() the entire object
004315  ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
004316  ** do not need to be zeroed and they are large, so we can save a lot
004317  ** of run-time by skipping the initialization of those elements.
004318  */
004319  void sqlite3BtreeCursorZero(BtCursor *p){
004320    memset(p, 0, offsetof(BtCursor, iPage));
004321  }
004322  
004323  /*
004324  ** Close a cursor.  The read lock on the database file is released
004325  ** when the last cursor is closed.
004326  */
004327  int sqlite3BtreeCloseCursor(BtCursor *pCur){
004328    Btree *pBtree = pCur->pBtree;
004329    if( pBtree ){
004330      BtShared *pBt = pCur->pBt;
004331      sqlite3BtreeEnter(pBtree);
004332      assert( pBt->pCursor!=0 );
004333      if( pBt->pCursor==pCur ){
004334        pBt->pCursor = pCur->pNext;
004335      }else{
004336        BtCursor *pPrev = pBt->pCursor;
004337        do{
004338          if( pPrev->pNext==pCur ){
004339            pPrev->pNext = pCur->pNext;
004340            break;
004341          }
004342          pPrev = pPrev->pNext;
004343        }while( ALWAYS(pPrev) );
004344      }
004345      btreeReleaseAllCursorPages(pCur);
004346      unlockBtreeIfUnused(pBt);
004347      sqlite3_free(pCur->aOverflow);
004348      sqlite3_free(pCur->pKey);
004349      sqlite3BtreeLeave(pBtree);
004350    }
004351    return SQLITE_OK;
004352  }
004353  
004354  /*
004355  ** Make sure the BtCursor* given in the argument has a valid
004356  ** BtCursor.info structure.  If it is not already valid, call
004357  ** btreeParseCell() to fill it in.
004358  **
004359  ** BtCursor.info is a cache of the information in the current cell.
004360  ** Using this cache reduces the number of calls to btreeParseCell().
004361  */
004362  #ifndef NDEBUG
004363    static void assertCellInfo(BtCursor *pCur){
004364      CellInfo info;
004365      memset(&info, 0, sizeof(info));
004366      btreeParseCell(pCur->pPage, pCur->ix, &info);
004367      assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
004368    }
004369  #else
004370    #define assertCellInfo(x)
004371  #endif
004372  static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
004373    if( pCur->info.nSize==0 ){
004374      pCur->curFlags |= BTCF_ValidNKey;
004375      btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
004376    }else{
004377      assertCellInfo(pCur);
004378    }
004379  }
004380  
004381  #ifndef NDEBUG  /* The next routine used only within assert() statements */
004382  /*
004383  ** Return true if the given BtCursor is valid.  A valid cursor is one
004384  ** that is currently pointing to a row in a (non-empty) table.
004385  ** This is a verification routine is used only within assert() statements.
004386  */
004387  int sqlite3BtreeCursorIsValid(BtCursor *pCur){
004388    return pCur && pCur->eState==CURSOR_VALID;
004389  }
004390  #endif /* NDEBUG */
004391  int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
004392    assert( pCur!=0 );
004393    return pCur->eState==CURSOR_VALID;
004394  }
004395  
004396  /*
004397  ** Return the value of the integer key or "rowid" for a table btree.
004398  ** This routine is only valid for a cursor that is pointing into a
004399  ** ordinary table btree.  If the cursor points to an index btree or
004400  ** is invalid, the result of this routine is undefined.
004401  */
004402  i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
004403    assert( cursorHoldsMutex(pCur) );
004404    assert( pCur->eState==CURSOR_VALID );
004405    assert( pCur->curIntKey );
004406    getCellInfo(pCur);
004407    return pCur->info.nKey;
004408  }
004409  
004410  /*
004411  ** Return the number of bytes of payload for the entry that pCur is
004412  ** currently pointing to.  For table btrees, this will be the amount
004413  ** of data.  For index btrees, this will be the size of the key.
004414  **
004415  ** The caller must guarantee that the cursor is pointing to a non-NULL
004416  ** valid entry.  In other words, the calling procedure must guarantee
004417  ** that the cursor has Cursor.eState==CURSOR_VALID.
004418  */
004419  u32 sqlite3BtreePayloadSize(BtCursor *pCur){
004420    assert( cursorHoldsMutex(pCur) );
004421    assert( pCur->eState==CURSOR_VALID );
004422    getCellInfo(pCur);
004423    return pCur->info.nPayload;
004424  }
004425  
004426  /*
004427  ** Given the page number of an overflow page in the database (parameter
004428  ** ovfl), this function finds the page number of the next page in the 
004429  ** linked list of overflow pages. If possible, it uses the auto-vacuum
004430  ** pointer-map data instead of reading the content of page ovfl to do so. 
004431  **
004432  ** If an error occurs an SQLite error code is returned. Otherwise:
004433  **
004434  ** The page number of the next overflow page in the linked list is 
004435  ** written to *pPgnoNext. If page ovfl is the last page in its linked 
004436  ** list, *pPgnoNext is set to zero. 
004437  **
** If ppPage is not NULL, and a reference to the MemPage object corresponding
** to page number ovfl was obtained, then *ppPage is set to point to that
** reference. It is the responsibility of the caller to call releasePage()
** on *ppPage to free the reference. If no reference was obtained (because
** the pointer-map was used to obtain the value for *pPgnoNext), then
** *ppPage is set to zero.
004444  */
004445  static int getOverflowPage(
004446    BtShared *pBt,               /* The database file */
004447    Pgno ovfl,                   /* Current overflow page number */
004448    MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
004449    Pgno *pPgnoNext              /* OUT: Next overflow page number */
004450  ){
004451    Pgno next = 0;
004452    MemPage *pPage = 0;
004453    int rc = SQLITE_OK;
004454  
004455    assert( sqlite3_mutex_held(pBt->mutex) );
004456    assert(pPgnoNext);
004457  
004458  #ifndef SQLITE_OMIT_AUTOVACUUM
004459    /* Try to find the next page in the overflow list using the
004460    ** autovacuum pointer-map pages. Guess that the next page in 
004461    ** the overflow list is page number (ovfl+1). If that guess turns 
004462    ** out to be wrong, fall back to loading the data of page 
004463    ** number ovfl to determine the next page number.
004464    */
004465    if( pBt->autoVacuum ){
004466      Pgno pgno;
004467      Pgno iGuess = ovfl+1;
004468      u8 eType;
004469  
004470      while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
004471        iGuess++;
004472      }
004473  
004474      if( iGuess<=btreePagecount(pBt) ){
004475        rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
004476        if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
004477          next = iGuess;
004478          rc = SQLITE_DONE;
004479        }
004480      }
004481    }
004482  #endif
004483  
004484    assert( next==0 || rc==SQLITE_DONE );
004485    if( rc==SQLITE_OK ){
004486      rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
004487      assert( rc==SQLITE_OK || pPage==0 );
004488      if( rc==SQLITE_OK ){
004489        next = get4byte(pPage->aData);
004490      }
004491    }
004492  
004493    *pPgnoNext = next;
004494    if( ppPage ){
004495      *ppPage = pPage;
004496    }else{
004497      releasePage(pPage);
004498    }
004499    return (rc==SQLITE_DONE ? SQLITE_OK : rc);
004500  }
004501  
004502  /*
004503  ** Copy data from a buffer to a page, or from a page to a buffer.
004504  **
004505  ** pPayload is a pointer to data stored on database page pDbPage.
004506  ** If argument eOp is false, then nByte bytes of data are copied
004507  ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
004508  ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
004509  ** of data are copied from the buffer pBuf to pPayload.
004510  **
004511  ** SQLITE_OK is returned on success, otherwise an error code.
004512  */
004513  static int copyPayload(
004514    void *pPayload,           /* Pointer to page data */
004515    void *pBuf,               /* Pointer to buffer */
004516    int nByte,                /* Number of bytes to copy */
004517    int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
004518    DbPage *pDbPage           /* Page containing pPayload */
004519  ){
004520    if( eOp ){
004521      /* Copy data from buffer to page (a write operation) */
004522      int rc = sqlite3PagerWrite(pDbPage);
004523      if( rc!=SQLITE_OK ){
004524        return rc;
004525      }
004526      memcpy(pPayload, pBuf, nByte);
004527    }else{
004528      /* Copy data from page to buffer (a read operation) */
004529      memcpy(pBuf, pPayload, nByte);
004530    }
004531    return SQLITE_OK;
004532  }
004533  
004534  /*
004535  ** This function is used to read or overwrite payload information
004536  ** for the entry that the pCur cursor is pointing to. The eOp
004537  ** argument is interpreted as follows:
004538  **
004539  **   0: The operation is a read. Populate the overflow cache.
004540  **   1: The operation is a write. Populate the overflow cache.
004541  **
004542  ** A total of "amt" bytes are read or written beginning at "offset".
004543  ** Data is read to or from the buffer pBuf.
004544  **
004545  ** The content being read or written might appear on the main page
004546  ** or be scattered out on multiple overflow pages.
004547  **
004548  ** If the current cursor entry uses one or more overflow pages
004549  ** this function may allocate space for and lazily populate
004550  ** the overflow page-list cache array (BtCursor.aOverflow). 
004551  ** Subsequent calls use this cache to make seeking to the supplied offset 
004552  ** more efficient.
004553  **
004554  ** Once an overflow page-list cache has been allocated, it must be
004555  ** invalidated if some other cursor writes to the same table, or if
004556  ** the cursor is moved to a different row. Additionally, in auto-vacuum
004557  ** mode, the following events may invalidate an overflow page-list cache.
004558  **
004559  **   * An incremental vacuum,
004560  **   * A commit in auto_vacuum="full" mode,
004561  **   * Creating a table (may require moving an overflow page).
004562  */
004563  static int accessPayload(
004564    BtCursor *pCur,      /* Cursor pointing to entry to read from */
004565    u32 offset,          /* Begin reading this far into payload */
004566    u32 amt,             /* Read this many bytes */
004567    unsigned char *pBuf, /* Write the bytes into this buffer */ 
004568    int eOp              /* zero to read. non-zero to write. */
004569  ){
004570    unsigned char *aPayload;
004571    int rc = SQLITE_OK;
004572    int iIdx = 0;
004573    MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
004574    BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
004575  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004576    unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
004577  #endif
004578  
004579    assert( pPage );
004580    assert( eOp==0 || eOp==1 );
004581    assert( pCur->eState==CURSOR_VALID );
004582    assert( pCur->ix<pPage->nCell );
004583    assert( cursorHoldsMutex(pCur) );
004584  
004585    getCellInfo(pCur);
004586    aPayload = pCur->info.pPayload;
004587    assert( offset+amt <= pCur->info.nPayload );
004588  
004589    assert( aPayload > pPage->aData );
004590    if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
004591      /* Trying to read or write past the end of the data is an error.  The
004592      ** conditional above is really:
004593      **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
004594      ** but is recast into its current form to avoid integer overflow problems
004595      */
004596      return SQLITE_CORRUPT_PGNO(pPage->pgno);
004597    }
004598  
004599    /* Check if data must be read/written to/from the btree page itself. */
004600    if( offset<pCur->info.nLocal ){
004601      int a = amt;
004602      if( a+offset>pCur->info.nLocal ){
004603        a = pCur->info.nLocal - offset;
004604      }
004605      rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
004606      offset = 0;
004607      pBuf += a;
004608      amt -= a;
004609    }else{
004610      offset -= pCur->info.nLocal;
004611    }
004612  
004613  
004614    if( rc==SQLITE_OK && amt>0 ){
004615      const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
004616      Pgno nextPage;
004617  
004618      nextPage = get4byte(&aPayload[pCur->info.nLocal]);
004619  
004620      /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
004621      **
004622      ** The aOverflow[] array is sized at one entry for each overflow page
004623      ** in the overflow chain. The page number of the first overflow page is
004624      ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
004625      ** means "not yet known" (the cache is lazily populated).
004626      */
004627      if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
004628        int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
004629        if( nOvfl>pCur->nOvflAlloc ){
004630          Pgno *aNew = (Pgno*)sqlite3Realloc(
004631              pCur->aOverflow, nOvfl*2*sizeof(Pgno)
004632          );
004633          if( aNew==0 ){
004634            return SQLITE_NOMEM_BKPT;
004635          }else{
004636            pCur->nOvflAlloc = nOvfl*2;
004637            pCur->aOverflow = aNew;
004638          }
004639        }
004640        memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
004641        pCur->curFlags |= BTCF_ValidOvfl;
004642      }else{
004643        /* If the overflow page-list cache has been allocated and the
004644        ** entry for the first required overflow page is valid, skip
004645        ** directly to it.
004646        */
004647        if( pCur->aOverflow[offset/ovflSize] ){
004648          iIdx = (offset/ovflSize);
004649          nextPage = pCur->aOverflow[iIdx];
004650          offset = (offset%ovflSize);
004651        }
004652      }
004653  
004654      assert( rc==SQLITE_OK && amt>0 );
004655      while( nextPage ){
004656        /* If required, populate the overflow page-list cache. */
004657        assert( pCur->aOverflow[iIdx]==0
004658                || pCur->aOverflow[iIdx]==nextPage
004659                || CORRUPT_DB );
004660        pCur->aOverflow[iIdx] = nextPage;
004661  
004662        if( offset>=ovflSize ){
004663          /* The only reason to read this page is to obtain the page
004664          ** number for the next page in the overflow chain. The page
004665          ** data is not required. So first try to lookup the overflow
004666          ** page-list cache, if any, then fall back to the getOverflowPage()
004667          ** function.
004668          */
004669          assert( pCur->curFlags & BTCF_ValidOvfl );
004670          assert( pCur->pBtree->db==pBt->db );
004671          if( pCur->aOverflow[iIdx+1] ){
004672            nextPage = pCur->aOverflow[iIdx+1];
004673          }else{
004674            rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
004675          }
004676          offset -= ovflSize;
004677        }else{
004678          /* Need to read this page properly. It contains some of the
004679          ** range of data that is being read (eOp==0) or written (eOp!=0).
004680          */
004681  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004682          sqlite3_file *fd;      /* File from which to do direct overflow read */
004683  #endif
004684          int a = amt;
004685          if( a + offset > ovflSize ){
004686            a = ovflSize - offset;
004687          }
004688  
004689  #ifdef SQLITE_DIRECT_OVERFLOW_READ
004690          /* If all the following are true:
004691          **
004692          **   1) this is a read operation, and 
004693          **   2) data is required from the start of this overflow page, and
004694          **   3) there is no open write-transaction, and
004695          **   4) the database is file-backed, and
004696          **   5) the page is not in the WAL file
004697          **   6) at least 4 bytes have already been read into the output buffer 
004698          **
004699          ** then data can be read directly from the database file into the
004700          ** output buffer, bypassing the page-cache altogether. This speeds
004701          ** up loading large records that span many overflow pages.
004702          */
004703          if( eOp==0                                             /* (1) */
004704           && offset==0                                          /* (2) */
004705           && pBt->inTransaction==TRANS_READ                     /* (3) */
004706           && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (4) */
004707           && 0==sqlite3PagerUseWal(pBt->pPager, nextPage)       /* (5) */
004708           && &pBuf[-4]>=pBufStart                               /* (6) */
004709          ){
004710            u8 aSave[4];
004711            u8 *aWrite = &pBuf[-4];
004712            assert( aWrite>=pBufStart );                         /* due to (6) */
004713            memcpy(aSave, aWrite, 4);
004714            rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
004715            nextPage = get4byte(aWrite);
004716            memcpy(aWrite, aSave, 4);
004717          }else
004718  #endif
004719  
004720          {
004721            DbPage *pDbPage;
004722            rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
004723                (eOp==0 ? PAGER_GET_READONLY : 0)
004724            );
004725            if( rc==SQLITE_OK ){
004726              aPayload = sqlite3PagerGetData(pDbPage);
004727              nextPage = get4byte(aPayload);
004728              rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
004729              sqlite3PagerUnref(pDbPage);
004730              offset = 0;
004731            }
004732          }
004733          amt -= a;
004734          if( amt==0 ) return rc;
004735          pBuf += a;
004736        }
004737        if( rc ) break;
004738        iIdx++;
004739      }
004740    }
004741  
004742    if( rc==SQLITE_OK && amt>0 ){
004743      /* Overflow chain ends prematurely */
004744      return SQLITE_CORRUPT_PGNO(pPage->pgno);
004745    }
004746    return rc;
004747  }
004748  
004749  /*
004750  ** Read part of the payload for the row at which that cursor pCur is currently
004751  ** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
004752  ** begins at "offset".
004753  **
004754  ** pCur can be pointing to either a table or an index b-tree.
004755  ** If pointing to a table btree, then the content section is read.  If
004756  ** pCur is pointing to an index b-tree then the key section is read.
004757  **
004758  ** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
004759  ** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
004760  ** cursor might be invalid or might need to be restored before being read.
004761  **
004762  ** Return SQLITE_OK on success or an error code if anything goes
004763  ** wrong.  An error is returned if "offset+amt" is larger than
004764  ** the available payload.
004765  */
004766  int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
004767    assert( cursorHoldsMutex(pCur) );
004768    assert( pCur->eState==CURSOR_VALID );
004769    assert( pCur->iPage>=0 && pCur->pPage );
004770    assert( pCur->ix<pCur->pPage->nCell );
004771    return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
004772  }
004773  
004774  /*
004775  ** This variant of sqlite3BtreePayload() works even if the cursor has not
004776  ** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
004777  ** interface.
004778  */
004779  #ifndef SQLITE_OMIT_INCRBLOB
004780  static SQLITE_NOINLINE int accessPayloadChecked(
004781    BtCursor *pCur,
004782    u32 offset,
004783    u32 amt,
004784    void *pBuf
004785  ){
004786    int rc;
004787    if ( pCur->eState==CURSOR_INVALID ){
004788      return SQLITE_ABORT;
004789    }
004790    assert( cursorOwnsBtShared(pCur) );
004791    rc = btreeRestoreCursorPosition(pCur);
004792    return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
004793  }
004794  int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
004795    if( pCur->eState==CURSOR_VALID ){
004796      assert( cursorOwnsBtShared(pCur) );
004797      return accessPayload(pCur, offset, amt, pBuf, 0);
004798    }else{
004799      return accessPayloadChecked(pCur, offset, amt, pBuf);
004800    }
004801  }
004802  #endif /* SQLITE_OMIT_INCRBLOB */
004803  
004804  /*
004805  ** Return a pointer to payload information from the entry that the 
004806  ** pCur cursor is pointing to.  The pointer is to the beginning of
004807  ** the key if index btrees (pPage->intKey==0) and is the data for
004808  ** table btrees (pPage->intKey==1). The number of bytes of available
004809  ** key/data is written into *pAmt.  If *pAmt==0, then the value
004810  ** returned will not be a valid pointer.
004811  **
004812  ** This routine is an optimization.  It is common for the entire key
004813  ** and data to fit on the local page and for there to be no overflow
004814  ** pages.  When that is so, this routine can be used to access the
004815  ** key and data without making a copy.  If the key and/or data spills
004816  ** onto overflow pages, then accessPayload() must be used to reassemble
004817  ** the key/data and copy it into a preallocated buffer.
004818  **
004819  ** The pointer returned by this routine looks directly into the cached
004820  ** page of the database.  The data might change or move the next time
004821  ** any btree routine is called.
004822  */
004823  static const void *fetchPayload(
004824    BtCursor *pCur,      /* Cursor pointing to entry to read from */
004825    u32 *pAmt            /* Write the number of available bytes here */
004826  ){
004827    int amt;
004828    assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
004829    assert( pCur->eState==CURSOR_VALID );
004830    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
004831    assert( cursorOwnsBtShared(pCur) );
004832    assert( pCur->ix<pCur->pPage->nCell );
004833    assert( pCur->info.nSize>0 );
004834    assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
004835    assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
004836    amt = pCur->info.nLocal;
004837    if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
004838      /* There is too little space on the page for the expected amount
004839      ** of local content. Database must be corrupt. */
004840      assert( CORRUPT_DB );
004841      amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
004842    }
004843    *pAmt = (u32)amt;
004844    return (void*)pCur->info.pPayload;
004845  }
004846  
004847  
004848  /*
004849  ** For the entry that cursor pCur is point to, return as
004850  ** many bytes of the key or data as are available on the local
004851  ** b-tree page.  Write the number of available bytes into *pAmt.
004852  **
004853  ** The pointer returned is ephemeral.  The key/data may move
004854  ** or be destroyed on the next call to any Btree routine,
004855  ** including calls from other threads against the same cache.
004856  ** Hence, a mutex on the BtShared should be held prior to calling
004857  ** this routine.
004858  **
004859  ** These routines is used to get quick access to key and data
004860  ** in the common case where no overflow pages are used.
004861  */
004862  const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
004863    return fetchPayload(pCur, pAmt);
004864  }
004865  
004866  
004867  /*
004868  ** Move the cursor down to a new child page.  The newPgno argument is the
004869  ** page number of the child page to move to.
004870  **
004871  ** This function returns SQLITE_CORRUPT if the page-header flags field of
004872  ** the new child page does not match the flags field of the parent (i.e.
004873  ** if an intkey page appears to be the parent of a non-intkey page, or
004874  ** vice-versa).
004875  */
004876  static int moveToChild(BtCursor *pCur, u32 newPgno){
004877    BtShared *pBt = pCur->pBt;
004878  
004879    assert( cursorOwnsBtShared(pCur) );
004880    assert( pCur->eState==CURSOR_VALID );
004881    assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
004882    assert( pCur->iPage>=0 );
004883    if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
004884      return SQLITE_CORRUPT_BKPT;
004885    }
004886    pCur->info.nSize = 0;
004887    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
004888    pCur->aiIdx[pCur->iPage] = pCur->ix;
004889    pCur->apPage[pCur->iPage] = pCur->pPage;
004890    pCur->ix = 0;
004891    pCur->iPage++;
004892    return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
004893  }
004894  
#ifdef SQLITE_DEBUG
/*
** Debugging aid.  pParent is an internal (non-leaf) tree page.  Assert
** that page number iChild is the left-child of the iIdx'th cell on
** pParent or, when iIdx equals the number of cells on pParent, that
** iChild is the right-child of the page.
**
** Nothing is checked when CORRUPT_DB is true, because these conditions
** are not guaranteed to hold in a corrupt database.  In builds without
** SQLITE_DEBUG this is a macro that expands to nothing.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ){
    return;
  }
  assert( iIdx<=pParent->nCell );
  if( iIdx!=pParent->nCell ){
    /* Ordinary cell: the child pointer is the first 4 bytes of the cell. */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* One past the last cell: the right-child pointer is read from the
    ** page header at offset hdrOffset+8. */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z) 
#endif
004916  
004917  /*
004918  ** Move the cursor up to the parent page.
004919  **
004920  ** pCur->idx is set to the cell index that contains the pointer
004921  ** to the page we are coming from.  If we are coming from the
004922  ** right-most child page then pCur->idx is set to one more than
004923  ** the largest cell index.
004924  */
004925  static void moveToParent(BtCursor *pCur){
004926    MemPage *pLeaf;
004927    assert( cursorOwnsBtShared(pCur) );
004928    assert( pCur->eState==CURSOR_VALID );
004929    assert( pCur->iPage>0 );
004930    assert( pCur->pPage );
004931    assertParentIndex(
004932      pCur->apPage[pCur->iPage-1], 
004933      pCur->aiIdx[pCur->iPage-1], 
004934      pCur->pPage->pgno
004935    );
004936    testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
004937    pCur->info.nSize = 0;
004938    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
004939    pCur->ix = pCur->aiIdx[pCur->iPage-1];
004940    pLeaf = pCur->pPage;
004941    pCur->pPage = pCur->apPage[--pCur->iPage];
004942    releasePageNotNull(pLeaf);
004943  }
004944  
004945  /*
004946  ** Move the cursor to point to the root page of its b-tree structure.
004947  **
004948  ** If the table has a virtual root page, then the cursor is moved to point
004949  ** to the virtual root page instead of the actual root page. A table has a
004950  ** virtual root page when the actual root page contains no cells and a 
004951  ** single child page. This can only happen with the table rooted at page 1.
004952  **
004953  ** If the b-tree structure is empty, the cursor state is set to 
004954  ** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
004955  ** the cursor is set to point to the first cell located on the root
004956  ** (or virtual root) page and the cursor state is set to CURSOR_VALID.
004957  **
004958  ** If this function returns successfully, it may be assumed that the
004959  ** page-header flags indicate that the [virtual] root-page is the expected 
004960  ** kind of b-tree page (i.e. if when opening the cursor the caller did not
004961  ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
004962  ** indicating a table b-tree, or if the caller did specify a KeyInfo 
004963  ** structure the flags byte is set to 0x02 or 0x0A, indicating an index
004964  ** b-tree).
004965  */
004966  static int moveToRoot(BtCursor *pCur){
004967    MemPage *pRoot;
004968    int rc = SQLITE_OK;
004969  
004970    assert( cursorOwnsBtShared(pCur) );
004971    assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
004972    assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
004973    assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
004974    assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
004975    assert( pCur->pgnoRoot>0 || pCur->iPage<0 );
004976  
004977    if( pCur->iPage>=0 ){
004978      if( pCur->iPage ){
004979        releasePageNotNull(pCur->pPage);
004980        while( --pCur->iPage ){
004981          releasePageNotNull(pCur->apPage[pCur->iPage]);
004982        }
004983        pCur->pPage = pCur->apPage[0];
004984        goto skip_init;
004985      }
004986    }else if( pCur->pgnoRoot==0 ){
004987      pCur->eState = CURSOR_INVALID;
004988      return SQLITE_EMPTY;
004989    }else{
004990      assert( pCur->iPage==(-1) );
004991      if( pCur->eState>=CURSOR_REQUIRESEEK ){
004992        if( pCur->eState==CURSOR_FAULT ){
004993          assert( pCur->skipNext!=SQLITE_OK );
004994          return pCur->skipNext;
004995        }
004996        sqlite3BtreeClearCursor(pCur);
004997      }
004998      rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
004999                          0, pCur->curPagerFlags);
005000      if( rc!=SQLITE_OK ){
005001        pCur->eState = CURSOR_INVALID;
005002        return rc;
005003      }
005004      pCur->iPage = 0;
005005      pCur->curIntKey = pCur->pPage->intKey;
005006    }
005007    pRoot = pCur->pPage;
005008    assert( pRoot->pgno==pCur->pgnoRoot );
005009  
005010    /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
005011    ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
005012    ** NULL, the caller expects a table b-tree. If this is not the case,
005013    ** return an SQLITE_CORRUPT error. 
005014    **
005015    ** Earlier versions of SQLite assumed that this test could not fail
005016    ** if the root page was already loaded when this function was called (i.e.
005017    ** if pCur->iPage>=0). But this is not so if the database is corrupted 
005018    ** in such a way that page pRoot is linked into a second b-tree table 
005019    ** (or the freelist).  */
005020    assert( pRoot->intKey==1 || pRoot->intKey==0 );
005021    if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
005022      return SQLITE_CORRUPT_PGNO(pCur->pPage->pgno);
005023    }
005024  
005025  skip_init:  
005026    pCur->ix = 0;
005027    pCur->info.nSize = 0;
005028    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);
005029  
005030    pRoot = pCur->pPage;
005031    if( pRoot->nCell>0 ){
005032      pCur->eState = CURSOR_VALID;
005033    }else if( !pRoot->leaf ){
005034      Pgno subpage;
005035      if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
005036      subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
005037      pCur->eState = CURSOR_VALID;
005038      rc = moveToChild(pCur, subpage);
005039    }else{
005040      pCur->eState = CURSOR_INVALID;
005041      rc = SQLITE_EMPTY;
005042    }
005043    return rc;
005044  }
005045  
005046  /*
005047  ** Move the cursor down to the left-most leaf entry beneath the
005048  ** entry to which it is currently pointing.
005049  **
005050  ** The left-most leaf is the one with the smallest key - the first
005051  ** in ascending order.
005052  */
005053  static int moveToLeftmost(BtCursor *pCur){
005054    Pgno pgno;
005055    int rc = SQLITE_OK;
005056    MemPage *pPage;
005057  
005058    assert( cursorOwnsBtShared(pCur) );
005059    assert( pCur->eState==CURSOR_VALID );
005060    while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
005061      assert( pCur->ix<pPage->nCell );
005062      pgno = get4byte(findCell(pPage, pCur->ix));
005063      rc = moveToChild(pCur, pgno);
005064    }
005065    return rc;
005066  }
005067  
005068  /*
005069  ** Move the cursor down to the right-most leaf entry beneath the
005070  ** page to which it is currently pointing.  Notice the difference
005071  ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
005072  ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
005073  ** finds the right-most entry beneath the *page*.
005074  **
005075  ** The right-most entry is the one with the largest key - the last
005076  ** key in ascending order.
005077  */
005078  static int moveToRightmost(BtCursor *pCur){
005079    Pgno pgno;
005080    int rc = SQLITE_OK;
005081    MemPage *pPage = 0;
005082  
005083    assert( cursorOwnsBtShared(pCur) );
005084    assert( pCur->eState==CURSOR_VALID );
005085    while( !(pPage = pCur->pPage)->leaf ){
005086      pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005087      pCur->ix = pPage->nCell;
005088      rc = moveToChild(pCur, pgno);
005089      if( rc ) return rc;
005090    }
005091    pCur->ix = pPage->nCell-1;
005092    assert( pCur->info.nSize==0 );
005093    assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
005094    return SQLITE_OK;
005095  }
005096  
005097  /* Move the cursor to the first entry in the table.  Return SQLITE_OK
005098  ** on success.  Set *pRes to 0 if the cursor actually points to something
005099  ** or set *pRes to 1 if the table is empty.
005100  */
005101  int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
005102    int rc;
005103  
005104    assert( cursorOwnsBtShared(pCur) );
005105    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005106    rc = moveToRoot(pCur);
005107    if( rc==SQLITE_OK ){
005108      assert( pCur->pPage->nCell>0 );
005109      *pRes = 0;
005110      rc = moveToLeftmost(pCur);
005111    }else if( rc==SQLITE_EMPTY ){
005112      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005113      *pRes = 1;
005114      rc = SQLITE_OK;
005115    }
005116    return rc;
005117  }
005118  
005119  /* Move the cursor to the last entry in the table.  Return SQLITE_OK
005120  ** on success.  Set *pRes to 0 if the cursor actually points to something
005121  ** or set *pRes to 1 if the table is empty.
005122  */
005123  int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
005124    int rc;
005125   
005126    assert( cursorOwnsBtShared(pCur) );
005127    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005128  
005129    /* If the cursor already points to the last entry, this is a no-op. */
005130    if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
005131  #ifdef SQLITE_DEBUG
005132      /* This block serves to assert() that the cursor really does point 
005133      ** to the last entry in the b-tree. */
005134      int ii;
005135      for(ii=0; ii<pCur->iPage; ii++){
005136        assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
005137      }
005138      assert( pCur->ix==pCur->pPage->nCell-1 );
005139      assert( pCur->pPage->leaf );
005140  #endif
005141      return SQLITE_OK;
005142    }
005143  
005144    rc = moveToRoot(pCur);
005145    if( rc==SQLITE_OK ){
005146      assert( pCur->eState==CURSOR_VALID );
005147      *pRes = 0;
005148      rc = moveToRightmost(pCur);
005149      if( rc==SQLITE_OK ){
005150        pCur->curFlags |= BTCF_AtLast;
005151      }else{
005152        pCur->curFlags &= ~BTCF_AtLast;
005153      }
005154    }else if( rc==SQLITE_EMPTY ){
005155      assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005156      *pRes = 1;
005157      rc = SQLITE_OK;
005158    }
005159    return rc;
005160  }
005161  
005162  /* Move the cursor so that it points to an entry near the key 
005163  ** specified by pIdxKey or intKey.   Return a success code.
005164  **
005165  ** For INTKEY tables, the intKey parameter is used.  pIdxKey 
005166  ** must be NULL.  For index tables, pIdxKey is used and intKey
005167  ** is ignored.
005168  **
005169  ** If an exact match is not found, then the cursor is always
005170  ** left pointing at a leaf page which would hold the entry if it
005171  ** were present.  The cursor might point to an entry that comes
005172  ** before or after the key.
005173  **
005174  ** An integer is written into *pRes which is the result of
005175  ** comparing the key with the entry to which the cursor is 
005176  ** pointing.  The meaning of the integer written into
005177  ** *pRes is as follows:
005178  **
005179  **     *pRes<0      The cursor is left pointing at an entry that
005180  **                  is smaller than intKey/pIdxKey or if the table is empty
005181  **                  and the cursor is therefore left point to nothing.
005182  **
005183  **     *pRes==0     The cursor is left pointing at an entry that
005184  **                  exactly matches intKey/pIdxKey.
005185  **
005186  **     *pRes>0      The cursor is left pointing at an entry that
005187  **                  is larger than intKey/pIdxKey.
005188  **
005189  ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
005190  ** exists an entry in the table that exactly matches pIdxKey.  
005191  */
005192  int sqlite3BtreeMovetoUnpacked(
005193    BtCursor *pCur,          /* The cursor to be moved */
005194    UnpackedRecord *pIdxKey, /* Unpacked index key */
005195    i64 intKey,              /* The table key */
005196    int biasRight,           /* If true, bias the search to the high end */
005197    int *pRes                /* Write search results here */
005198  ){
005199    int rc;
005200    RecordCompare xRecordCompare;
005201  
005202    assert( cursorOwnsBtShared(pCur) );
005203    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005204    assert( pRes );
005205    assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
005206    assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
005207  
005208    /* If the cursor is already positioned at the point we are trying
005209    ** to move to, then just return without doing any work */
005210    if( pIdxKey==0
005211     && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
005212    ){
005213      if( pCur->info.nKey==intKey ){
005214        *pRes = 0;
005215        return SQLITE_OK;
005216      }
005217      if( pCur->info.nKey<intKey ){
005218        if( (pCur->curFlags & BTCF_AtLast)!=0 ){
005219          *pRes = -1;
005220          return SQLITE_OK;
005221        }
005222        /* If the requested key is one more than the previous key, then
005223        ** try to get there using sqlite3BtreeNext() rather than a full
005224        ** binary search.  This is an optimization only.  The correct answer
005225        ** is still obtained without this case, only a little more slowely */
005226        if( pCur->info.nKey+1==intKey && !pCur->skipNext ){
005227          *pRes = 0;
005228          rc = sqlite3BtreeNext(pCur, 0);
005229          if( rc==SQLITE_OK ){
005230            getCellInfo(pCur);
005231            if( pCur->info.nKey==intKey ){
005232              return SQLITE_OK;
005233            }
005234          }else if( rc==SQLITE_DONE ){
005235            rc = SQLITE_OK;
005236          }else{
005237            return rc;
005238          }
005239        }
005240      }
005241    }
005242  
005243    if( pIdxKey ){
005244      xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
005245      pIdxKey->errCode = 0;
005246      assert( pIdxKey->default_rc==1 
005247           || pIdxKey->default_rc==0 
005248           || pIdxKey->default_rc==-1
005249      );
005250    }else{
005251      xRecordCompare = 0; /* All keys are integers */
005252    }
005253  
005254    rc = moveToRoot(pCur);
005255    if( rc ){
005256      if( rc==SQLITE_EMPTY ){
005257        assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
005258        *pRes = -1;
005259        return SQLITE_OK;
005260      }
005261      return rc;
005262    }
005263    assert( pCur->pPage );
005264    assert( pCur->pPage->isInit );
005265    assert( pCur->eState==CURSOR_VALID );
005266    assert( pCur->pPage->nCell > 0 );
005267    assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
005268    assert( pCur->curIntKey || pIdxKey );
005269    for(;;){
005270      int lwr, upr, idx, c;
005271      Pgno chldPg;
005272      MemPage *pPage = pCur->pPage;
005273      u8 *pCell;                          /* Pointer to current cell in pPage */
005274  
005275      /* pPage->nCell must be greater than zero. If this is the root-page
005276      ** the cursor would have been INVALID above and this for(;;) loop
005277      ** not run. If this is not the root-page, then the moveToChild() routine
005278      ** would have already detected db corruption. Similarly, pPage must
005279      ** be the right kind (index or table) of b-tree page. Otherwise
005280      ** a moveToChild() or moveToRoot() call would have detected corruption.  */
005281      assert( pPage->nCell>0 );
005282      assert( pPage->intKey==(pIdxKey==0) );
005283      lwr = 0;
005284      upr = pPage->nCell-1;
005285      assert( biasRight==0 || biasRight==1 );
005286      idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
005287      pCur->ix = (u16)idx;
005288      if( xRecordCompare==0 ){
005289        for(;;){
005290          i64 nCellKey;
005291          pCell = findCellPastPtr(pPage, idx);
005292          if( pPage->intKeyLeaf ){
005293            while( 0x80 <= *(pCell++) ){
005294              if( pCell>=pPage->aDataEnd ){
005295                return SQLITE_CORRUPT_PGNO(pPage->pgno);
005296              }
005297            }
005298          }
005299          getVarint(pCell, (u64*)&nCellKey);
005300          if( nCellKey<intKey ){
005301            lwr = idx+1;
005302            if( lwr>upr ){ c = -1; break; }
005303          }else if( nCellKey>intKey ){
005304            upr = idx-1;
005305            if( lwr>upr ){ c = +1; break; }
005306          }else{
005307            assert( nCellKey==intKey );
005308            pCur->ix = (u16)idx;
005309            if( !pPage->leaf ){
005310              lwr = idx;
005311              goto moveto_next_layer;
005312            }else{
005313              pCur->curFlags |= BTCF_ValidNKey;
005314              pCur->info.nKey = nCellKey;
005315              pCur->info.nSize = 0;
005316              *pRes = 0;
005317              return SQLITE_OK;
005318            }
005319          }
005320          assert( lwr+upr>=0 );
005321          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
005322        }
005323      }else{
005324        for(;;){
005325          int nCell;  /* Size of the pCell cell in bytes */
005326          pCell = findCellPastPtr(pPage, idx);
005327  
005328          /* The maximum supported page-size is 65536 bytes. This means that
005329          ** the maximum number of record bytes stored on an index B-Tree
005330          ** page is less than 16384 bytes and may be stored as a 2-byte
005331          ** varint. This information is used to attempt to avoid parsing 
005332          ** the entire cell by checking for the cases where the record is 
005333          ** stored entirely within the b-tree page by inspecting the first 
005334          ** 2 bytes of the cell.
005335          */
005336          nCell = pCell[0];
005337          if( nCell<=pPage->max1bytePayload ){
005338            /* This branch runs if the record-size field of the cell is a
005339            ** single byte varint and the record fits entirely on the main
005340            ** b-tree page.  */
005341            testcase( pCell+nCell+1==pPage->aDataEnd );
005342            c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
005343          }else if( !(pCell[1] & 0x80) 
005344            && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
005345          ){
005346            /* The record-size field is a 2 byte varint and the record 
005347            ** fits entirely on the main b-tree page.  */
005348            testcase( pCell+nCell+2==pPage->aDataEnd );
005349            c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
005350          }else{
005351            /* The record flows over onto one or more overflow pages. In
005352            ** this case the whole cell needs to be parsed, a buffer allocated
005353            ** and accessPayload() used to retrieve the record into the
005354            ** buffer before VdbeRecordCompare() can be called. 
005355            **
005356            ** If the record is corrupt, the xRecordCompare routine may read
005357            ** up to two varints past the end of the buffer. An extra 18 
005358            ** bytes of padding is allocated at the end of the buffer in
005359            ** case this happens.  */
005360            void *pCellKey;
005361            u8 * const pCellBody = pCell - pPage->childPtrSize;
005362            pPage->xParseCell(pPage, pCellBody, &pCur->info);
005363            nCell = (int)pCur->info.nKey;
005364            testcase( nCell<0 );   /* True if key size is 2^32 or more */
005365            testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
005366            testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
005367            testcase( nCell==2 );  /* Minimum legal index key size */
005368            if( nCell<2 ){
005369              rc = SQLITE_CORRUPT_PGNO(pPage->pgno);
005370              goto moveto_finish;
005371            }
005372            pCellKey = sqlite3Malloc( nCell+18 );
005373            if( pCellKey==0 ){
005374              rc = SQLITE_NOMEM_BKPT;
005375              goto moveto_finish;
005376            }
005377            pCur->ix = (u16)idx;
005378            rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
005379            pCur->curFlags &= ~BTCF_ValidOvfl;
005380            if( rc ){
005381              sqlite3_free(pCellKey);
005382              goto moveto_finish;
005383            }
005384            c = xRecordCompare(nCell, pCellKey, pIdxKey);
005385            sqlite3_free(pCellKey);
005386          }
005387          assert( 
005388              (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
005389           && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
005390          );
005391          if( c<0 ){
005392            lwr = idx+1;
005393          }else if( c>0 ){
005394            upr = idx-1;
005395          }else{
005396            assert( c==0 );
005397            *pRes = 0;
005398            rc = SQLITE_OK;
005399            pCur->ix = (u16)idx;
005400            if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
005401            goto moveto_finish;
005402          }
005403          if( lwr>upr ) break;
005404          assert( lwr+upr>=0 );
005405          idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
005406        }
005407      }
005408      assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
005409      assert( pPage->isInit );
005410      if( pPage->leaf ){
005411        assert( pCur->ix<pCur->pPage->nCell );
005412        pCur->ix = (u16)idx;
005413        *pRes = c;
005414        rc = SQLITE_OK;
005415        goto moveto_finish;
005416      }
005417  moveto_next_layer:
005418      if( lwr>=pPage->nCell ){
005419        chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
005420      }else{
005421        chldPg = get4byte(findCell(pPage, lwr));
005422      }
005423      pCur->ix = (u16)lwr;
005424      rc = moveToChild(pCur, chldPg);
005425      if( rc ) break;
005426    }
005427  moveto_finish:
005428    pCur->info.nSize = 0;
005429    assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005430    return rc;
005431  }
005432  
005433  
005434  /*
005435  ** Return TRUE if the cursor is not pointing at an entry of the table.
005436  **
005437  ** TRUE will be returned after a call to sqlite3BtreeNext() moves
005438  ** past the last entry in the table or sqlite3BtreePrev() moves past
005439  ** the first entry.  TRUE is also returned if the table is empty.
005440  */
005441  int sqlite3BtreeEof(BtCursor *pCur){
005442    /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
005443    ** have been deleted? This API will need to change to return an error code
005444    ** as well as the boolean result value.
005445    */
005446    return (CURSOR_VALID!=pCur->eState);
005447  }
005448  
005449  /*
005450  ** Return an estimate for the number of rows in the table that pCur is
005451  ** pointing to.  Return a negative number if no estimate is currently 
005452  ** available.
005453  */
005454  i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
005455    i64 n;
005456    u8 i;
005457  
005458    assert( cursorOwnsBtShared(pCur) );
005459    assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
005460  
005461    /* Currently this interface is only called by the OP_IfSmaller
005462    ** opcode, and it that case the cursor will always be valid and
005463    ** will always point to a leaf node. */
005464    if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
005465    if( NEVER(pCur->pPage->leaf==0) ) return -1;
005466  
005467    n = pCur->pPage->nCell;
005468    for(i=0; i<pCur->iPage; i++){
005469      n *= pCur->apPage[i]->nCell;
005470    }
005471    return n;
005472  }
005473  
005474  /*
005475  ** Advance the cursor to the next entry in the database. 
005476  ** Return value:
005477  **
005478  **    SQLITE_OK        success
005479  **    SQLITE_DONE      cursor is already pointing at the last element
005480  **    otherwise        some kind of error occurred
005481  **
005482  ** The main entry point is sqlite3BtreeNext().  That routine is optimized
005483  ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
005484  ** to the next cell on the current page.  The (slower) btreeNext() helper
005485  ** routine is called when it is necessary to move to a different page or
005486  ** to restore the cursor.
005487  **
005488  ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
005489  ** cursor corresponds to an SQL index and this routine could have been
005490  ** skipped if the SQL index had been a unique index.  The F argument
005491  ** is a hint to the implement.  SQLite btree implementation does not use
005492  ** this hint, but COMDB2 does.
005493  */
005494  static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
005495    int rc;
005496    int idx;
005497    MemPage *pPage;
005498  
005499    assert( cursorOwnsBtShared(pCur) );
005500    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005501    if( pCur->eState!=CURSOR_VALID ){
005502      assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
005503      rc = restoreCursorPosition(pCur);
005504      if( rc!=SQLITE_OK ){
005505        return rc;
005506      }
005507      if( CURSOR_INVALID==pCur->eState ){
005508        return SQLITE_DONE;
005509      }
005510      if( pCur->skipNext ){
005511        assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
005512        pCur->eState = CURSOR_VALID;
005513        if( pCur->skipNext>0 ){
005514          pCur->skipNext = 0;
005515          return SQLITE_OK;
005516        }
005517        pCur->skipNext = 0;
005518      }
005519    }
005520  
005521    pPage = pCur->pPage;
005522    idx = ++pCur->ix;
005523    assert( pPage->isInit );
005524  
005525    /* If the database file is corrupt, it is possible for the value of idx 
005526    ** to be invalid here. This can only occur if a second cursor modifies
005527    ** the page while cursor pCur is holding a reference to it. Which can
005528    ** only happen if the database is corrupt in such a way as to link the
005529    ** page into more than one b-tree structure. */
005530    testcase( idx>pPage->nCell );
005531  
005532    if( idx>=pPage->nCell ){
005533      if( !pPage->leaf ){
005534        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
005535        if( rc ) return rc;
005536        return moveToLeftmost(pCur);
005537      }
005538      do{
005539        if( pCur->iPage==0 ){
005540          pCur->eState = CURSOR_INVALID;
005541          return SQLITE_DONE;
005542        }
005543        moveToParent(pCur);
005544        pPage = pCur->pPage;
005545      }while( pCur->ix>=pPage->nCell );
005546      if( pPage->intKey ){
005547        return sqlite3BtreeNext(pCur, 0);
005548      }else{
005549        return SQLITE_OK;
005550      }
005551    }
005552    if( pPage->leaf ){
005553      return SQLITE_OK;
005554    }else{
005555      return moveToLeftmost(pCur);
005556    }
005557  }
005558  int sqlite3BtreeNext(BtCursor *pCur, int flags){
005559    MemPage *pPage;
005560    UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
005561    assert( cursorOwnsBtShared(pCur) );
005562    assert( flags==0 || flags==1 );
005563    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005564    pCur->info.nSize = 0;
005565    pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
005566    if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur);
005567    pPage = pCur->pPage;
005568    if( (++pCur->ix)>=pPage->nCell ){
005569      pCur->ix--;
005570      return btreeNext(pCur);
005571    }
005572    if( pPage->leaf ){
005573      return SQLITE_OK;
005574    }else{
005575      return moveToLeftmost(pCur);
005576    }
005577  }
005578  
005579  /*
005580  ** Step the cursor to the back to the previous entry in the database.
005581  ** Return values:
005582  **
005583  **     SQLITE_OK     success
005584  **     SQLITE_DONE   the cursor is already on the first element of the table
005585  **     otherwise     some kind of error occurred
005586  **
005587  ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
005588  ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
005589  ** to the previous cell on the current page.  The (slower) btreePrevious()
005590  ** helper routine is called when it is necessary to move to a different page
005591  ** or to restore the cursor.
005592  **
005593  ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
005594  ** the cursor corresponds to an SQL index and this routine could have been
005595  ** skipped if the SQL index had been a unique index.  The F argument is a
005596  ** hint to the implement.  The native SQLite btree implementation does not
005597  ** use this hint, but COMDB2 does.
005598  */
005599  static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
005600    int rc;
005601    MemPage *pPage;
005602  
005603    assert( cursorOwnsBtShared(pCur) );
005604    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005605    assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
005606    assert( pCur->info.nSize==0 );
005607    if( pCur->eState!=CURSOR_VALID ){
005608      rc = restoreCursorPosition(pCur);
005609      if( rc!=SQLITE_OK ){
005610        return rc;
005611      }
005612      if( CURSOR_INVALID==pCur->eState ){
005613        return SQLITE_DONE;
005614      }
005615      if( pCur->skipNext ){
005616        assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
005617        pCur->eState = CURSOR_VALID;
005618        if( pCur->skipNext<0 ){
005619          pCur->skipNext = 0;
005620          return SQLITE_OK;
005621        }
005622        pCur->skipNext = 0;
005623      }
005624    }
005625  
005626    pPage = pCur->pPage;
005627    assert( pPage->isInit );
005628    if( !pPage->leaf ){
005629      int idx = pCur->ix;
005630      rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
005631      if( rc ) return rc;
005632      rc = moveToRightmost(pCur);
005633    }else{
005634      while( pCur->ix==0 ){
005635        if( pCur->iPage==0 ){
005636          pCur->eState = CURSOR_INVALID;
005637          return SQLITE_DONE;
005638        }
005639        moveToParent(pCur);
005640      }
005641      assert( pCur->info.nSize==0 );
005642      assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
005643  
005644      pCur->ix--;
005645      pPage = pCur->pPage;
005646      if( pPage->intKey && !pPage->leaf ){
005647        rc = sqlite3BtreePrevious(pCur, 0);
005648      }else{
005649        rc = SQLITE_OK;
005650      }
005651    }
005652    return rc;
005653  }
005654  int sqlite3BtreePrevious(BtCursor *pCur, int flags){
005655    assert( cursorOwnsBtShared(pCur) );
005656    assert( flags==0 || flags==1 );
005657    assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
005658    UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
005659    pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
005660    pCur->info.nSize = 0;
005661    if( pCur->eState!=CURSOR_VALID
005662     || pCur->ix==0
005663     || pCur->pPage->leaf==0
005664    ){
005665      return btreePrevious(pCur);
005666    }
005667    pCur->ix--;
005668    return SQLITE_OK;
005669  }
005670  
005671  /*
005672  ** Allocate a new page from the database file.
005673  **
005674  ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
005675  ** has already been called on the new page.)  The new page has also
005676  ** been referenced and the calling routine is responsible for calling
005677  ** sqlite3PagerUnref() on the new page when it is done.
005678  **
005679  ** SQLITE_OK is returned on success.  Any other return value indicates
005680  ** an error.  *ppPage is set to NULL in the event of an error.
005681  **
005682  ** If the "nearby" parameter is not 0, then an effort is made to 
005683  ** locate a page close to the page number "nearby".  This can be used in an
005684  ** attempt to keep related pages close to each other in the database file,
005685  ** which in turn can make database access faster.
005686  **
005687  ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
005688  ** anywhere on the free-list, then it is guaranteed to be returned.  If
005689  ** eMode is BTALLOC_LT then the page returned will be less than or equal
005690  ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
005691  ** are no restrictions on which page is returned.
005692  */
005693  static int allocateBtreePage(
005694    BtShared *pBt,         /* The btree */
005695    MemPage **ppPage,      /* Store pointer to the allocated page here */
005696    Pgno *pPgno,           /* Store the page number here */
005697    Pgno nearby,           /* Search for a page near this one */
005698    u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
005699  ){
005700    MemPage *pPage1;
005701    int rc;
005702    u32 n;     /* Number of pages on the freelist */
005703    u32 k;     /* Number of leaves on the trunk of the freelist */
005704    MemPage *pTrunk = 0;
005705    MemPage *pPrevTrunk = 0;
005706    Pgno mxPage;     /* Total size of the database file */
005707  
005708    assert( sqlite3_mutex_held(pBt->mutex) );
005709    assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
005710    pPage1 = pBt->pPage1;
005711    mxPage = btreePagecount(pBt);
005712    /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
005713    ** stores stores the total number of pages on the freelist. */
005714    n = get4byte(&pPage1->aData[36]);
005715    testcase( n==mxPage-1 );
005716    if( n>=mxPage ){
005717      return SQLITE_CORRUPT_BKPT;
005718    }
005719    if( n>0 ){
005720      /* There are pages on the freelist.  Reuse one of those pages. */
005721      Pgno iTrunk;
005722      u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
005723      u32 nSearch = 0;   /* Count of the number of search attempts */
005724      
005725      /* If eMode==BTALLOC_EXACT and a query of the pointer-map
005726      ** shows that the page 'nearby' is somewhere on the free-list, then
005727      ** the entire-list will be searched for that page.
005728      */
005729  #ifndef SQLITE_OMIT_AUTOVACUUM
005730      if( eMode==BTALLOC_EXACT ){
005731        if( nearby<=mxPage ){
005732          u8 eType;
005733          assert( nearby>0 );
005734          assert( pBt->autoVacuum );
005735          rc = ptrmapGet(pBt, nearby, &eType, 0);
005736          if( rc ) return rc;
005737          if( eType==PTRMAP_FREEPAGE ){
005738            searchList = 1;
005739          }
005740        }
005741      }else if( eMode==BTALLOC_LE ){
005742        searchList = 1;
005743      }
005744  #endif
005745  
005746      /* Decrement the free-list count by 1. Set iTrunk to the index of the
005747      ** first free-list trunk page. iPrevTrunk is initially 1.
005748      */
005749      rc = sqlite3PagerWrite(pPage1->pDbPage);
005750      if( rc ) return rc;
005751      put4byte(&pPage1->aData[36], n-1);
005752  
005753      /* The code within this loop is run only once if the 'searchList' variable
005754      ** is not true. Otherwise, it runs once for each trunk-page on the
005755      ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
005756      ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
005757      */
005758      do {
005759        pPrevTrunk = pTrunk;
005760        if( pPrevTrunk ){
005761          /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
005762          ** is the page number of the next freelist trunk page in the list or
005763          ** zero if this is the last freelist trunk page. */
005764          iTrunk = get4byte(&pPrevTrunk->aData[0]);
005765        }else{
005766          /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
005767          ** stores the page number of the first page of the freelist, or zero if
005768          ** the freelist is empty. */
005769          iTrunk = get4byte(&pPage1->aData[32]);
005770        }
005771        testcase( iTrunk==mxPage );
005772        if( iTrunk>mxPage || nSearch++ > n ){
005773          rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
005774        }else{
005775          rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
005776        }
005777        if( rc ){
005778          pTrunk = 0;
005779          goto end_allocate_page;
005780        }
005781        assert( pTrunk!=0 );
005782        assert( pTrunk->aData!=0 );
005783        /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
005784        ** is the number of leaf page pointers to follow. */
005785        k = get4byte(&pTrunk->aData[4]);
005786        if( k==0 && !searchList ){
005787          /* The trunk has no leaves and the list is not being searched. 
005788          ** So extract the trunk page itself and use it as the newly 
005789          ** allocated page */
005790          assert( pPrevTrunk==0 );
005791          rc = sqlite3PagerWrite(pTrunk->pDbPage);
005792          if( rc ){
005793            goto end_allocate_page;
005794          }
005795          *pPgno = iTrunk;
005796          memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
005797          *ppPage = pTrunk;
005798          pTrunk = 0;
005799          TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
005800        }else if( k>(u32)(pBt->usableSize/4 - 2) ){
005801          /* Value of k is out of range.  Database corruption */
005802          rc = SQLITE_CORRUPT_PGNO(iTrunk);
005803          goto end_allocate_page;
005804  #ifndef SQLITE_OMIT_AUTOVACUUM
005805        }else if( searchList 
005806              && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 
005807        ){
005808          /* The list is being searched and this trunk page is the page
005809          ** to allocate, regardless of whether it has leaves.
005810          */
005811          *pPgno = iTrunk;
005812          *ppPage = pTrunk;
005813          searchList = 0;
005814          rc = sqlite3PagerWrite(pTrunk->pDbPage);
005815          if( rc ){
005816            goto end_allocate_page;
005817          }
005818          if( k==0 ){
005819            if( !pPrevTrunk ){
005820              memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
005821            }else{
005822              rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
005823              if( rc!=SQLITE_OK ){
005824                goto end_allocate_page;
005825              }
005826              memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
005827            }
005828          }else{
005829            /* The trunk page is required by the caller but it contains 
005830            ** pointers to free-list leaves. The first leaf becomes a trunk
005831            ** page in this case.
005832            */
005833            MemPage *pNewTrunk;
005834            Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
005835            if( iNewTrunk>mxPage ){ 
005836              rc = SQLITE_CORRUPT_PGNO(iTrunk);
005837              goto end_allocate_page;
005838            }
005839            testcase( iNewTrunk==mxPage );
005840            rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
005841            if( rc!=SQLITE_OK ){
005842              goto end_allocate_page;
005843            }
005844            rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
005845            if( rc!=SQLITE_OK ){
005846              releasePage(pNewTrunk);
005847              goto end_allocate_page;
005848            }
005849            memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
005850            put4byte(&pNewTrunk->aData[4], k-1);
005851            memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
005852            releasePage(pNewTrunk);
005853            if( !pPrevTrunk ){
005854              assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
005855              put4byte(&pPage1->aData[32], iNewTrunk);
005856            }else{
005857              rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
005858              if( rc ){
005859                goto end_allocate_page;
005860              }
005861              put4byte(&pPrevTrunk->aData[0], iNewTrunk);
005862            }
005863          }
005864          pTrunk = 0;
005865          TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
005866  #endif
005867        }else if( k>0 ){
005868          /* Extract a leaf from the trunk */
005869          u32 closest;
005870          Pgno iPage;
005871          unsigned char *aData = pTrunk->aData;
005872          if( nearby>0 ){
005873            u32 i;
005874            closest = 0;
005875            if( eMode==BTALLOC_LE ){
005876              for(i=0; i<k; i++){
005877                iPage = get4byte(&aData[8+i*4]);
005878                if( iPage<=nearby ){
005879                  closest = i;
005880                  break;
005881                }
005882              }
005883            }else{
005884              int dist;
005885              dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
005886              for(i=1; i<k; i++){
005887                int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
005888                if( d2<dist ){
005889                  closest = i;
005890                  dist = d2;
005891                }
005892              }
005893            }
005894          }else{
005895            closest = 0;
005896          }
005897  
005898          iPage = get4byte(&aData[8+closest*4]);
005899          testcase( iPage==mxPage );
005900          if( iPage>mxPage ){
005901            rc = SQLITE_CORRUPT_PGNO(iTrunk);
005902            goto end_allocate_page;
005903          }
005904          testcase( iPage==mxPage );
005905          if( !searchList 
005906           || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE)) 
005907          ){
005908            int noContent;
005909            *pPgno = iPage;
005910            TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
005911                   ": %d more free pages\n",
005912                   *pPgno, closest+1, k, pTrunk->pgno, n-1));
005913            rc = sqlite3PagerWrite(pTrunk->pDbPage);
005914            if( rc ) goto end_allocate_page;
005915            if( closest<k-1 ){
005916              memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
005917            }
005918            put4byte(&aData[4], k-1);
005919            noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
005920            rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
005921            if( rc==SQLITE_OK ){
005922              rc = sqlite3PagerWrite((*ppPage)->pDbPage);
005923              if( rc!=SQLITE_OK ){
005924                releasePage(*ppPage);
005925                *ppPage = 0;
005926              }
005927            }
005928            searchList = 0;
005929          }
005930        }
005931        releasePage(pPrevTrunk);
005932        pPrevTrunk = 0;
005933      }while( searchList );
005934    }else{
005935      /* There are no pages on the freelist, so append a new page to the
005936      ** database image.
005937      **
005938      ** Normally, new pages allocated by this block can be requested from the
005939      ** pager layer with the 'no-content' flag set. This prevents the pager
005940      ** from trying to read the pages content from disk. However, if the
005941      ** current transaction has already run one or more incremental-vacuum
005942      ** steps, then the page we are about to allocate may contain content
005943      ** that is required in the event of a rollback. In this case, do
005944      ** not set the no-content flag. This causes the pager to load and journal
005945      ** the current page content before overwriting it.
005946      **
005947      ** Note that the pager will not actually attempt to load or journal 
005948      ** content for any page that really does lie past the end of the database
005949      ** file on disk. So the effects of disabling the no-content optimization
005950      ** here are confined to those pages that lie between the end of the
005951      ** database image and the end of the database file.
005952      */
005953      int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
005954  
005955      rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
005956      if( rc ) return rc;
005957      pBt->nPage++;
005958      if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
005959  
005960  #ifndef SQLITE_OMIT_AUTOVACUUM
005961      if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
005962        /* If *pPgno refers to a pointer-map page, allocate two new pages
005963        ** at the end of the file instead of one. The first allocated page
005964        ** becomes a new pointer-map page, the second is used by the caller.
005965        */
005966        MemPage *pPg = 0;
005967        TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
005968        assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
005969        rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
005970        if( rc==SQLITE_OK ){
005971          rc = sqlite3PagerWrite(pPg->pDbPage);
005972          releasePage(pPg);
005973        }
005974        if( rc ) return rc;
005975        pBt->nPage++;
005976        if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
005977      }
005978  #endif
005979      put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
005980      *pPgno = pBt->nPage;
005981  
005982      assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
005983      rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
005984      if( rc ) return rc;
005985      rc = sqlite3PagerWrite((*ppPage)->pDbPage);
005986      if( rc!=SQLITE_OK ){
005987        releasePage(*ppPage);
005988        *ppPage = 0;
005989      }
005990      TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
005991    }
005992  
005993    assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
005994  
005995  end_allocate_page:
005996    releasePage(pTrunk);
005997    releasePage(pPrevTrunk);
005998    assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
005999    assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
006000    return rc;
006001  }
006002  
006003  /*
006004  ** This function is used to add page iPage to the database file free-list. 
006005  ** It is assumed that the page is not already a part of the free-list.
006006  **
006007  ** The value passed as the second argument to this function is optional.
006008  ** If the caller happens to have a pointer to the MemPage object 
006009  ** corresponding to page iPage handy, it may pass it as the second value. 
006010  ** Otherwise, it may pass NULL.
006011  **
006012  ** If a pointer to a MemPage object is passed as the second argument,
006013  ** its reference count is not altered by this function.
006014  */
006015  static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
006016    MemPage *pTrunk = 0;                /* Free-list trunk page */
006017    Pgno iTrunk = 0;                    /* Page number of free-list trunk page */ 
006018    MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
006019    MemPage *pPage;                     /* Page being freed. May be NULL. */
006020    int rc;                             /* Return Code */
006021    int nFree;                          /* Initial number of pages on free-list */
006022  
006023    assert( sqlite3_mutex_held(pBt->mutex) );
006024    assert( CORRUPT_DB || iPage>1 );
006025    assert( !pMemPage || pMemPage->pgno==iPage );
006026  
006027    if( iPage<2 ) return SQLITE_CORRUPT_BKPT;
006028    if( pMemPage ){
006029      pPage = pMemPage;
006030      sqlite3PagerRef(pPage->pDbPage);
006031    }else{
006032      pPage = btreePageLookup(pBt, iPage);
006033    }
006034  
006035    /* Increment the free page count on pPage1 */
006036    rc = sqlite3PagerWrite(pPage1->pDbPage);
006037    if( rc ) goto freepage_out;
006038    nFree = get4byte(&pPage1->aData[36]);
006039    put4byte(&pPage1->aData[36], nFree+1);
006040  
006041    if( pBt->btsFlags & BTS_SECURE_DELETE ){
006042      /* If the secure_delete option is enabled, then
006043      ** always fully overwrite deleted information with zeros.
006044      */
006045      if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
006046       ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
006047      ){
006048        goto freepage_out;
006049      }
006050      memset(pPage->aData, 0, pPage->pBt->pageSize);
006051    }
006052  
006053    /* If the database supports auto-vacuum, write an entry in the pointer-map
006054    ** to indicate that the page is free.
006055    */
006056    if( ISAUTOVACUUM ){
006057      ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
006058      if( rc ) goto freepage_out;
006059    }
006060  
006061    /* Now manipulate the actual database free-list structure. There are two
006062    ** possibilities. If the free-list is currently empty, or if the first
006063    ** trunk page in the free-list is full, then this page will become a
006064    ** new free-list trunk page. Otherwise, it will become a leaf of the
006065    ** first trunk page in the current free-list. This block tests if it
006066    ** is possible to add the page as a new free-list leaf.
006067    */
006068    if( nFree!=0 ){
006069      u32 nLeaf;                /* Initial number of leaf cells on trunk page */
006070  
006071      iTrunk = get4byte(&pPage1->aData[32]);
006072      rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
006073      if( rc!=SQLITE_OK ){
006074        goto freepage_out;
006075      }
006076  
006077      nLeaf = get4byte(&pTrunk->aData[4]);
006078      assert( pBt->usableSize>32 );
006079      if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
006080        rc = SQLITE_CORRUPT_BKPT;
006081        goto freepage_out;
006082      }
006083      if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
006084        /* In this case there is room on the trunk page to insert the page
006085        ** being freed as a new leaf.
006086        **
006087        ** Note that the trunk page is not really full until it contains
006088        ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
006089        ** coded.  But due to a coding error in versions of SQLite prior to
006090        ** 3.6.0, databases with freelist trunk pages holding more than
006091        ** usableSize/4 - 8 entries will be reported as corrupt.  In order
006092        ** to maintain backwards compatibility with older versions of SQLite,
006093        ** we will continue to restrict the number of entries to usableSize/4 - 8
006094        ** for now.  At some point in the future (once everyone has upgraded
006095        ** to 3.6.0 or later) we should consider fixing the conditional above
006096        ** to read "usableSize/4-2" instead of "usableSize/4-8".
006097        **
006098        ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
006099        ** avoid using the last six entries in the freelist trunk page array in
006100        ** order that database files created by newer versions of SQLite can be
006101        ** read by older versions of SQLite.
006102        */
006103        rc = sqlite3PagerWrite(pTrunk->pDbPage);
006104        if( rc==SQLITE_OK ){
006105          put4byte(&pTrunk->aData[4], nLeaf+1);
006106          put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
006107          if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
006108            sqlite3PagerDontWrite(pPage->pDbPage);
006109          }
006110          rc = btreeSetHasContent(pBt, iPage);
006111        }
006112        TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
006113        goto freepage_out;
006114      }
006115    }
006116  
006117    /* If control flows to this point, then it was not possible to add the
006118    ** the page being freed as a leaf page of the first trunk in the free-list.
006119    ** Possibly because the free-list is empty, or possibly because the 
006120    ** first trunk in the free-list is full. Either way, the page being freed
006121    ** will become the new first trunk page in the free-list.
006122    */
006123    if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
006124      goto freepage_out;
006125    }
006126    rc = sqlite3PagerWrite(pPage->pDbPage);
006127    if( rc!=SQLITE_OK ){
006128      goto freepage_out;
006129    }
006130    put4byte(pPage->aData, iTrunk);
006131    put4byte(&pPage->aData[4], 0);
006132    put4byte(&pPage1->aData[32], iPage);
006133    TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
006134  
006135  freepage_out:
006136    if( pPage ){
006137      pPage->isInit = 0;
006138    }
006139    releasePage(pPage);
006140    releasePage(pTrunk);
006141    return rc;
006142  }
006143  static void freePage(MemPage *pPage, int *pRC){
006144    if( (*pRC)==SQLITE_OK ){
006145      *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
006146    }
006147  }
006148  
006149  /*
006150  ** Free any overflow pages associated with the given Cell.  Write the
006151  ** local Cell size (the number of bytes on the original page, omitting
006152  ** overflow) into *pnSize.
006153  */
006154  static int clearCell(
006155    MemPage *pPage,          /* The page that contains the Cell */
006156    unsigned char *pCell,    /* First byte of the Cell */
006157    CellInfo *pInfo          /* Size information about the cell */
006158  ){
006159    BtShared *pBt;
006160    Pgno ovflPgno;
006161    int rc;
006162    int nOvfl;
006163    u32 ovflPageSize;
006164  
006165    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006166    pPage->xParseCell(pPage, pCell, pInfo);
006167    if( pInfo->nLocal==pInfo->nPayload ){
006168      return SQLITE_OK;  /* No overflow pages. Return without doing anything */
006169    }
006170    if( pCell+pInfo->nSize-1 > pPage->aData+pPage->maskPage ){
006171      /* Cell extends past end of page */
006172      return SQLITE_CORRUPT_PGNO(pPage->pgno);
006173    }
006174    ovflPgno = get4byte(pCell + pInfo->nSize - 4);
006175    pBt = pPage->pBt;
006176    assert( pBt->usableSize > 4 );
006177    ovflPageSize = pBt->usableSize - 4;
006178    nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
006179    assert( nOvfl>0 || 
006180      (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
006181    );
006182    while( nOvfl-- ){
006183      Pgno iNext = 0;
006184      MemPage *pOvfl = 0;
006185      if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
006186        /* 0 is not a legal page number and page 1 cannot be an 
006187        ** overflow page. Therefore if ovflPgno<2 or past the end of the 
006188        ** file the database must be corrupt. */
006189        return SQLITE_CORRUPT_BKPT;
006190      }
006191      if( nOvfl ){
006192        rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
006193        if( rc ) return rc;
006194      }
006195  
006196      if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
006197       && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
006198      ){
006199        /* There is no reason any cursor should have an outstanding reference 
006200        ** to an overflow page belonging to a cell that is being deleted/updated.
006201        ** So if there exists more than one reference to this page, then it 
006202        ** must not really be an overflow page and the database must be corrupt. 
006203        ** It is helpful to detect this before calling freePage2(), as 
006204        ** freePage2() may zero the page contents if secure-delete mode is
006205        ** enabled. If this 'overflow' page happens to be a page that the
006206        ** caller is iterating through or using in some other way, this
006207        ** can be problematic.
006208        */
006209        rc = SQLITE_CORRUPT_BKPT;
006210      }else{
006211        rc = freePage2(pBt, pOvfl, ovflPgno);
006212      }
006213  
006214      if( pOvfl ){
006215        sqlite3PagerUnref(pOvfl->pDbPage);
006216      }
006217      if( rc ) return rc;
006218      ovflPgno = iNext;
006219    }
006220    return SQLITE_OK;
006221  }
006222  
/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** For an intKey page the payload is pX->nData bytes taken from pX->pData
** followed by pX->nZero bytes of zeros, and the cell header holds the
** payload size and the key pX->nKey as varints.  For any other page the
** payload is the pX->nKey bytes at pX->pKey and the header holds only
** the payload size.  In both cases the header is written starting
** pPage->childPtrSize bytes into pCell[].
**
** The size of the local part of the cell is written into *pnSize.
**
** Return SQLITE_OK on success, or the error code from
** allocateBtreePage() or ptrmapPut() if an overflow page could not be
** set up.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;                  /* Payload bytes not yet written */
  const u8 *pSrc;                /* Next byte of source content to copy */
  int nSrc, n, rc, mn;           /* Source bytes left, chunk size, rc, minLocal */
  int spaceLeft;                 /* Bytes still available at pPayload */
  MemPage *pToRelease;           /* Overflow page currently being filled */
  unsigned char *pPrior;         /* Where to store the next overflow pgno */
  unsigned char *pPayload;       /* Where to write the next payload byte */
  BtShared *pBt;                 /* The b-tree that owns pPage */
  Pgno pgnoOvfl;                 /* Most recently allocated overflow page */
  int nHeader;                   /* Bytes of cell header before the payload */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }
  
  /* Fill in the payload */
  pPayload = &pCell[nHeader];
  if( nPayload<=pPage->maxLocal ){
    /* This is the common case where everything fits on the btree page
    ** and no overflow pages are required. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ) n = 4;             /* Reported cell size is never less than 4 */
    *pnSize = n;
    assert( nSrc<=nPayload );
    testcase( nSrc<nPayload );
    memcpy(pPayload, pSrc, nSrc);
    memset(pPayload+nSrc, 0, nPayload-nSrc);   /* Trailing zero-filled bytes */
    return SQLITE_OK;
  }

  /* If we reach this point, it means that some of the content will need
  ** to spill onto overflow pages.
  */
  mn = pPage->minLocal;
  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  testcase( n==pPage->maxLocal );
  testcase( n==pPage->maxLocal+1 );
  if( n > pPage->maxLocal ) n = mn;
  spaceLeft = n;
  *pnSize = n + nHeader + 4;     /* +4 for the first overflow page number */
  pPrior = &pCell[nHeader+n];
  pToRelease = 0;
  pgnoOvfl = 0;
  pBt = pPage->pBt;

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to pPage->xParseCell() to verify that the values above
  ** were computed correctly.
  */
#ifdef SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( 1 ){
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    /* Copy source content while any remains.  Once the source is
    ** exhausted, the rest of the payload is written as zeros. */
    if( nSrc>=n ){
      memcpy(pPayload, pSrc, n);
    }else if( nSrc>0 ){
      n = nSrc;
      memcpy(pPayload, pSrc, n);
    }else{
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    if( nPayload<=0 ) break;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( spaceLeft==0 ){
      /* The current destination is full.  Allocate another overflow page
      ** and link it to the end of the chain. */
      MemPage *pOvfl = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance to the next page number that is neither a pointer-map
        ** page nor the pending-byte page.  The result is passed to
        ** allocateBtreePage() below. */
        do{
          pgnoOvfl++;
        } while( 
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 
        );
      }
#endif
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now. 
      **
      ** If this is the first overflow page, then write a partial entry 
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the destination
      ** for subsequent payload.  Its own "next page" field starts at 0. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
006416  
006417  /*
006418  ** Remove the i-th cell from pPage.  This routine effects pPage only.
006419  ** The cell content is not freed or deallocated.  It is assumed that
006420  ** the cell content has been copied someplace else.  This routine just
006421  ** removes the reference to the cell from pPage.
006422  **
006423  ** "sz" must be the number of bytes in the cell.
006424  */
006425  static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
006426    u32 pc;         /* Offset to cell content of cell being deleted */
006427    u8 *data;       /* pPage->aData */
006428    u8 *ptr;        /* Used to move bytes around within data[] */
006429    int rc;         /* The return code */
006430    int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
006431  
006432    if( *pRC ) return;
006433    assert( idx>=0 && idx<pPage->nCell );
006434    assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
006435    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
006436    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006437    data = pPage->aData;
006438    ptr = &pPage->aCellIdx[2*idx];
006439    pc = get2byte(ptr);
006440    hdr = pPage->hdrOffset;
006441    testcase( pc==get2byte(&data[hdr+5]) );
006442    testcase( pc+sz==pPage->pBt->usableSize );
006443    if( pc+sz > pPage->pBt->usableSize ){
006444      *pRC = SQLITE_CORRUPT_BKPT;
006445      return;
006446    }
006447    rc = freeSpace(pPage, pc, sz);
006448    if( rc ){
006449      *pRC = rc;
006450      return;
006451    }
006452    pPage->nCell--;
006453    if( pPage->nCell==0 ){
006454      memset(&data[hdr+1], 0, 4);
006455      data[hdr+7] = 0;
006456      put2byte(&data[hdr+5], pPage->pBt->usableSize);
006457      pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
006458                         - pPage->childPtrSize - 8;
006459    }else{
006460      memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
006461      put2byte(&data[hdr+3], pPage->nCell);
006462      pPage->nFree += 2;
006463    }
006464  }
006465  
006466  /*
006467  ** Insert a new cell on pPage at cell index "i".  pCell points to the
006468  ** content of the cell.
006469  **
006470  ** If the cell content will fit on the page, then put it there.  If it
006471  ** will not fit, then make a copy of the cell content into pTemp if
006472  ** pTemp is not null.  Regardless of pTemp, allocate a new entry
006473  ** in pPage->apOvfl[] and make it point to the cell content (either
006474  ** in pTemp or the original pCell) and also record its index. 
006475  ** Allocating a new entry in pPage->aCell[] implies that 
006476  ** pPage->nOverflow is incremented.
006477  **
006478  ** *pRC must be SQLITE_OK when this routine is called.
006479  */
006480  static void insertCell(
006481    MemPage *pPage,   /* Page into which we are copying */
006482    int i,            /* New cell becomes the i-th cell of the page */
006483    u8 *pCell,        /* Content of the new cell */
006484    int sz,           /* Bytes of content in pCell */
006485    u8 *pTemp,        /* Temp storage space for pCell, if needed */
006486    Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
006487    int *pRC          /* Read and write return code from here */
006488  ){
006489    int idx = 0;      /* Where to write new cell content in data[] */
006490    int j;            /* Loop counter */
006491    u8 *data;         /* The content of the whole page */
006492    u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
006493  
006494    assert( *pRC==SQLITE_OK );
006495    assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
006496    assert( MX_CELL(pPage->pBt)<=10921 );
006497    assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
006498    assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
006499    assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
006500    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006501    /* The cell should normally be sized correctly.  However, when moving a
006502    ** malformed cell from a leaf page to an interior page, if the cell size
006503    ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size
006504    ** might be less than 8 (leaf-size + pointer) on the interior node.  Hence
006505    ** the term after the || in the following assert(). */
006506    assert( sz==pPage->xCellSize(pPage, pCell) || (sz==8 && iChild>0) );
006507    if( pPage->nOverflow || sz+2>pPage->nFree ){
006508      if( pTemp ){
006509        memcpy(pTemp, pCell, sz);
006510        pCell = pTemp;
006511      }
006512      if( iChild ){
006513        put4byte(pCell, iChild);
006514      }
006515      j = pPage->nOverflow++;
006516      /* Comparison against ArraySize-1 since we hold back one extra slot
006517      ** as a contingency.  In other words, never need more than 3 overflow
006518      ** slots but 4 are allocated, just to be safe. */
006519      assert( j < ArraySize(pPage->apOvfl)-1 );
006520      pPage->apOvfl[j] = pCell;
006521      pPage->aiOvfl[j] = (u16)i;
006522  
006523      /* When multiple overflows occur, they are always sequential and in
006524      ** sorted order.  This invariants arise because multiple overflows can
006525      ** only occur when inserting divider cells into the parent page during
006526      ** balancing, and the dividers are adjacent and sorted.
006527      */
006528      assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
006529      assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
006530    }else{
006531      int rc = sqlite3PagerWrite(pPage->pDbPage);
006532      if( rc!=SQLITE_OK ){
006533        *pRC = rc;
006534        return;
006535      }
006536      assert( sqlite3PagerIswriteable(pPage->pDbPage) );
006537      data = pPage->aData;
006538      assert( &data[pPage->cellOffset]==pPage->aCellIdx );
006539      rc = allocateSpace(pPage, sz, &idx);
006540      if( rc ){ *pRC = rc; return; }
006541      /* The allocateSpace() routine guarantees the following properties
006542      ** if it returns successfully */
006543      assert( idx >= 0 );
006544      assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
006545      assert( idx+sz <= (int)pPage->pBt->usableSize );
006546      pPage->nFree -= (u16)(2 + sz);
006547      memcpy(&data[idx], pCell, sz);
006548      if( iChild ){
006549        put4byte(&data[idx], iChild);
006550      }
006551      pIns = pPage->aCellIdx + i*2;
006552      memmove(pIns+2, pIns, 2*(pPage->nCell - i));
006553      put2byte(pIns, idx);
006554      pPage->nCell++;
006555      /* increment the cell count */
006556      if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
006557      assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell );
006558  #ifndef SQLITE_OMIT_AUTOVACUUM
006559      if( pPage->pBt->autoVacuum ){
006560        /* The cell may contain a pointer to an overflow page. If so, write
006561        ** the entry for the overflow page into the pointer map.
006562        */
006563        ptrmapPutOvflPtr(pPage, pCell, pRC);
006564      }
006565  #endif
006566    }
006567  }
006568  
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** Entries of szCell[] are filled in lazily: a value of zero means the
** size of the corresponding cell has not been computed yet.  Sizes are
** computed using the xCellSize() method of the reference page pRef.
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page; its xCellSize() sizes the cells */
  u8 **apCell;            /* Pointers to the nCell cells */
  u16 *szCell;            /* Local size of all cells in apCell[]. 0 if unknown */
};
006580  
006581  /*
006582  ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
006583  ** computed.
006584  */
006585  static void populateCellCache(CellArray *p, int idx, int N){
006586    assert( idx>=0 && idx+N<=p->nCell );
006587    while( N>0 ){
006588      assert( p->apCell[idx]!=0 );
006589      if( p->szCell[idx]==0 ){
006590        p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
006591      }else{
006592        assert( CORRUPT_DB ||
006593                p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
006594      }
006595      idx++;
006596      N--;
006597    }
006598  }
006599  
006600  /*
006601  ** Return the size of the Nth element of the cell array
006602  */
006603  static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
006604    assert( N>=0 && N<p->nCell );
006605    assert( p->szCell[N]==0 );
006606    p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
006607    return p->szCell[N];
006608  }
006609  static u16 cachedCellSize(CellArray *p, int N){
006610    assert( N>=0 && N<p->nCell );
006611    if( p->szCell[N] ) return p->szCell[N];
006612    return computeCellSize(p, N);
006613  }
006614  
006615  /*
006616  ** Array apCell[] contains pointers to nCell b-tree page cells. The 
006617  ** szCell[] array contains the size in bytes of each cell. This function
006618  ** replaces the current contents of page pPg with the contents of the cell
006619  ** array.
006620  **
006621  ** Some of the cells in apCell[] may currently be stored in pPg. This
006622  ** function works around problems caused by this by making a copy of any 
006623  ** such cells before overwriting the page data.
006624  **
006625  ** The MemPage.nFree field is invalidated by this function. It is the 
006626  ** responsibility of the caller to set it correctly.
006627  */
006628  static int rebuildPage(
006629    MemPage *pPg,                   /* Edit this page */
006630    int nCell,                      /* Final number of cells on page */
006631    u8 **apCell,                    /* Array of cells */
006632    u16 *szCell                     /* Array of cell sizes */
006633  ){
006634    const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
006635    u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
006636    const int usableSize = pPg->pBt->usableSize;
006637    u8 * const pEnd = &aData[usableSize];
006638    int i;
006639    u8 *pCellptr = pPg->aCellIdx;
006640    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
006641    u8 *pData;
006642  
006643    i = get2byte(&aData[hdr+5]);
006644    memcpy(&pTmp[i], &aData[i], usableSize - i);
006645  
006646    pData = pEnd;
006647    for(i=0; i<nCell; i++){
006648      u8 *pCell = apCell[i];
006649      if( SQLITE_WITHIN(pCell,aData,pEnd) ){
006650        pCell = &pTmp[pCell - aData];
006651      }
006652      pData -= szCell[i];
006653      put2byte(pCellptr, (pData - aData));
006654      pCellptr += 2;
006655      if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
006656      memcpy(pData, pCell, szCell[i]);
006657      assert( szCell[i]==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
006658      testcase( szCell[i]!=pPg->xCellSize(pPg,pCell) );
006659    }
006660  
006661    /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
006662    pPg->nCell = nCell;
006663    pPg->nOverflow = 0;
006664  
006665    put2byte(&aData[hdr+1], 0);
006666    put2byte(&aData[hdr+3], pPg->nCell);
006667    put2byte(&aData[hdr+5], pData - aData);
006668    aData[hdr+7] = 0x00;
006669    return SQLITE_OK;
006670  }
006671  
006672  /*
006673  ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell
006674  ** contains the size in bytes of each such cell. This function attempts to 
006675  ** add the cells stored in the array to page pPg. If it cannot (because 
006676  ** the page needs to be defragmented before the cells will fit), non-zero
006677  ** is returned. Otherwise, if the cells are added successfully, zero is
006678  ** returned.
006679  **
006680  ** Argument pCellptr points to the first entry in the cell-pointer array
006681  ** (part of page pPg) to populate. After cell apCell[0] is written to the
006682  ** page body, a 16-bit offset is written to pCellptr. And so on, for each
006683  ** cell in the array. It is the responsibility of the caller to ensure
006684  ** that it is safe to overwrite this part of the cell-pointer array.
006685  **
006686  ** When this function is called, *ppData points to the start of the 
006687  ** content area on page pPg. If the size of the content area is extended,
006688  ** *ppData is updated to point to the new start of the content area
006689  ** before returning.
006690  **
006691  ** Finally, argument pBegin points to the byte immediately following the
006692  ** end of the space required by this page for the cell-pointer area (for
006693  ** all cells - not just those inserted by the current call). If the content
006694  ** area must be extended to before this point in order to accomodate all
006695  ** cells in apCell[], then the cells do not fit and non-zero is returned.
006696  */
006697  static int pageInsertArray(
006698    MemPage *pPg,                   /* Page to add cells to */
006699    u8 *pBegin,                     /* End of cell-pointer array */
006700    u8 **ppData,                    /* IN/OUT: Page content -area pointer */
006701    u8 *pCellptr,                   /* Pointer to cell-pointer area */
006702    int iFirst,                     /* Index of first cell to add */
006703    int nCell,                      /* Number of cells to add to pPg */
006704    CellArray *pCArray              /* Array of cells */
006705  ){
006706    int i;
006707    u8 *aData = pPg->aData;
006708    u8 *pData = *ppData;
006709    int iEnd = iFirst + nCell;
006710    assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
006711    for(i=iFirst; i<iEnd; i++){
006712      int sz, rc;
006713      u8 *pSlot;
006714      sz = cachedCellSize(pCArray, i);
006715      if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
006716        if( (pData - pBegin)<sz ) return 1;
006717        pData -= sz;
006718        pSlot = pData;
006719      }
006720      /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
006721      ** database.  But they might for a corrupt database.  Hence use memmove()
006722      ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
006723      assert( (pSlot+sz)<=pCArray->apCell[i]
006724           || pSlot>=(pCArray->apCell[i]+sz)
006725           || CORRUPT_DB );
006726      memmove(pSlot, pCArray->apCell[i], sz);
006727      put2byte(pCellptr, (pSlot - aData));
006728      pCellptr += 2;
006729    }
006730    *ppData = pData;
006731    return 0;
006732  }
006733  
006734  /*
006735  ** Array apCell[] contains nCell pointers to b-tree cells. Array szCell 
006736  ** contains the size in bytes of each such cell. This function adds the
006737  ** space associated with each cell in the array that is currently stored 
006738  ** within the body of pPg to the pPg free-list. The cell-pointers and other
006739  ** fields of the page are not updated.
006740  **
006741  ** This function returns the total number of cells added to the free-list.
006742  */
006743  static int pageFreeArray(
006744    MemPage *pPg,                   /* Page to edit */
006745    int iFirst,                     /* First cell to delete */
006746    int nCell,                      /* Cells to delete */
006747    CellArray *pCArray              /* Array of cells */
006748  ){
006749    u8 * const aData = pPg->aData;
006750    u8 * const pEnd = &aData[pPg->pBt->usableSize];
006751    u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
006752    int nRet = 0;
006753    int i;
006754    int iEnd = iFirst + nCell;
006755    u8 *pFree = 0;
006756    int szFree = 0;
006757  
006758    for(i=iFirst; i<iEnd; i++){
006759      u8 *pCell = pCArray->apCell[i];
006760      if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
006761        int sz;
006762        /* No need to use cachedCellSize() here.  The sizes of all cells that
006763        ** are to be freed have already been computing while deciding which
006764        ** cells need freeing */
006765        sz = pCArray->szCell[i];  assert( sz>0 );
006766        if( pFree!=(pCell + sz) ){
006767          if( pFree ){
006768            assert( pFree>aData && (pFree - aData)<65536 );
006769            freeSpace(pPg, (u16)(pFree - aData), szFree);
006770          }
006771          pFree = pCell;
006772          szFree = sz;
006773          if( pFree+sz>pEnd ) return 0;
006774        }else{
006775          pFree = pCell;
006776          szFree += sz;
006777        }
006778        nRet++;
006779      }
006780    }
006781    if( pFree ){
006782      assert( pFree>aData && (pFree - aData)<65536 );
006783      freeSpace(pPg, (u16)(pFree - aData), szFree);
006784    }
006785    return nRet;
006786  }
006787  
006788  /*
** apCell[] and szCell[] contain pointers to and sizes of all cells in the
006790  ** pages being balanced.  The current page, pPg, has pPg->nCell cells starting
006791  ** with apCell[iOld].  After balancing, this page should hold nNew cells
006792  ** starting at apCell[iNew].
006793  **
006794  ** This routine makes the necessary adjustments to pPg so that it contains
006795  ** the correct cells after being balanced.
006796  **
006797  ** The pPg->nFree field is invalid when this function returns. It is the
006798  ** responsibility of the caller to set it correctly.
006799  */
006800  static int editPage(
006801    MemPage *pPg,                   /* Edit this page */
006802    int iOld,                       /* Index of first cell currently on page */
006803    int iNew,                       /* Index of new first cell on page */
006804    int nNew,                       /* Final number of cells on page */
006805    CellArray *pCArray              /* Array of cells and sizes */
006806  ){
006807    u8 * const aData = pPg->aData;
006808    const int hdr = pPg->hdrOffset;
006809    u8 *pBegin = &pPg->aCellIdx[nNew * 2];
006810    int nCell = pPg->nCell;       /* Cells stored on pPg */
006811    u8 *pData;
006812    u8 *pCellptr;
006813    int i;
006814    int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;
006815    int iNewEnd = iNew + nNew;
006816  
006817  #ifdef SQLITE_DEBUG
006818    u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
006819    memcpy(pTmp, aData, pPg->pBt->usableSize);
006820  #endif
006821  
006822    /* Remove cells from the start and end of the page */
006823    if( iOld<iNew ){
006824      int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
006825      memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
006826      nCell -= nShift;
006827    }
006828    if( iNewEnd < iOldEnd ){
006829      nCell -= pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
006830    }
006831  
006832    pData = &aData[get2byteNotZero(&aData[hdr+5])];
006833    if( pData<pBegin ) goto editpage_fail;
006834  
006835    /* Add cells to the start of the page */
006836    if( iNew<iOld ){
006837      int nAdd = MIN(nNew,iOld-iNew);
006838      assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
006839      pCellptr = pPg->aCellIdx;
006840      memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
006841      if( pageInsertArray(
006842            pPg, pBegin, &pData, pCellptr,
006843            iNew, nAdd, pCArray
006844      ) ) goto editpage_fail;
006845      nCell += nAdd;
006846    }
006847  
006848    /* Add any overflow cells */
006849    for(i=0; i<pPg->nOverflow; i++){
006850      int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
006851      if( iCell>=0 && iCell<nNew ){
006852        pCellptr = &pPg->aCellIdx[iCell * 2];
006853        memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
006854        nCell++;
006855        if( pageInsertArray(
006856              pPg, pBegin, &pData, pCellptr,
006857              iCell+iNew, 1, pCArray
006858        ) ) goto editpage_fail;
006859      }
006860    }
006861  
006862    /* Append cells to the end of the page */
006863    pCellptr = &pPg->aCellIdx[nCell*2];
006864    if( pageInsertArray(
006865          pPg, pBegin, &pData, pCellptr,
006866          iNew+nCell, nNew-nCell, pCArray
006867    ) ) goto editpage_fail;
006868  
006869    pPg->nCell = nNew;
006870    pPg->nOverflow = 0;
006871  
006872    put2byte(&aData[hdr+3], pPg->nCell);
006873    put2byte(&aData[hdr+5], pData - aData);
006874  
006875  #ifdef SQLITE_DEBUG
006876    for(i=0; i<nNew && !CORRUPT_DB; i++){
006877      u8 *pCell = pCArray->apCell[i+iNew];
006878      int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
006879      if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
006880        pCell = &pTmp[pCell - aData];
006881      }
006882      assert( 0==memcmp(pCell, &aData[iOff],
006883              pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
006884    }
006885  #endif
006886  
006887    return SQLITE_OK;
006888   editpage_fail:
006889    /* Unable to edit this page. Rebuild it from scratch instead. */
006890    populateCellCache(pCArray, iNew, nNew);
006891    return rebuildPage(pPg, nNew, &pCArray->apCell[iNew], &pCArray->szCell[iNew]);
006892  }
006893  
006894  /*
006895  ** The following parameters determine how many adjacent pages get involved
006896  ** in a balancing operation.  NN is the number of neighbors on either side
006897  ** of the page that participate in the balancing operation.  NB is the
006898  ** total number of pages that participate, including the target page and
006899  ** NN neighbors on either side.
006900  **
006901  ** The minimum value of NN is 1 (of course).  Increasing NN above 1
006902  ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
006903  ** in exchange for a larger degradation in INSERT and UPDATE performance.
** A value of 1 for NN appears to give the best results overall.
006905  */
#define NN 1             /* Neighbor pages taken from each side of pPage */
#define NB (2*NN+1)      /* Pages in one balance: pPage plus NN on each side */
006908  
006909  
006910  #ifndef SQLITE_OMIT_QUICKBALANCE
006911  /*
006912  ** This version of balance() handles the common special case where
006913  ** a new entry is being inserted on the extreme right-end of the
006914  ** tree, in other words, when the new entry will become the largest
006915  ** entry in the tree.
006916  **
006917  ** Instead of trying to balance the 3 right-most leaf pages, just add
006918  ** a new page to the right-hand side and put the one new entry in
006919  ** that page.  This leaves the right side of the tree somewhat
006920  ** unbalanced.  But odds are that we will be inserting new entries
006921  ** at the end soon afterwards so the nearly empty page will quickly
006922  ** fill up.  On average.
006923  **
006924  ** pPage is the leaf page which is the right-most page in the tree.
006925  ** pParent is its parent.  pPage must have a single overflow entry
006926  ** which is also the right-most entry on the page.
006927  **
006928  ** The pSpace buffer is used to store a temporary copy of the divider
006929  ** cell that will be inserted into pParent. Such a cell consists of a 4
006930  ** byte page number followed by a variable length integer. In other
006931  ** words, at most 13 bytes. Hence the pSpace buffer must be at
006932  ** least 13 bytes in size.
006933  */
006934  static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
006935    BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
006936    MemPage *pNew;                       /* Newly allocated page */
006937    int rc;                              /* Return Code */
006938    Pgno pgnoNew;                        /* Page number of pNew */
006939  
006940    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
006941    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
006942    assert( pPage->nOverflow==1 );
006943  
006944    /* This error condition is now caught prior to reaching this function */
006945    if( NEVER(pPage->nCell==0) ) return SQLITE_CORRUPT_BKPT;
006946  
006947    /* Allocate a new page. This page will become the right-sibling of 
006948    ** pPage. Make the parent page writable, so that the new divider cell
006949    ** may be inserted. If both these operations are successful, proceed.
006950    */
006951    rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);
006952  
006953    if( rc==SQLITE_OK ){
006954  
006955      u8 *pOut = &pSpace[4];
006956      u8 *pCell = pPage->apOvfl[0];
006957      u16 szCell = pPage->xCellSize(pPage, pCell);
006958      u8 *pStop;
006959  
006960      assert( sqlite3PagerIswriteable(pNew->pDbPage) );
006961      assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
006962      zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);
006963      rc = rebuildPage(pNew, 1, &pCell, &szCell);
006964      if( NEVER(rc) ) return rc;
006965      pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;
006966  
006967      /* If this is an auto-vacuum database, update the pointer map
006968      ** with entries for the new page, and any pointer from the 
006969      ** cell on the page to an overflow page. If either of these
006970      ** operations fails, the return code is set, but the contents
006971      ** of the parent page are still manipulated by thh code below.
006972      ** That is Ok, at this point the parent page is guaranteed to
006973      ** be marked as dirty. Returning an error code will cause a
006974      ** rollback, undoing any changes made to the parent page.
006975      */
006976      if( ISAUTOVACUUM ){
006977        ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
006978        if( szCell>pNew->minLocal ){
006979          ptrmapPutOvflPtr(pNew, pCell, &rc);
006980        }
006981      }
006982    
006983      /* Create a divider cell to insert into pParent. The divider cell
006984      ** consists of a 4-byte page number (the page number of pPage) and
006985      ** a variable length key value (which must be the same value as the
006986      ** largest key on pPage).
006987      **
006988      ** To find the largest key value on pPage, first find the right-most 
006989      ** cell on pPage. The first two fields of this cell are the 
006990      ** record-length (a variable length integer at most 32-bits in size)
006991      ** and the key value (a variable length integer, may have any value).
006992      ** The first of the while(...) loops below skips over the record-length
006993      ** field. The second while(...) loop copies the key value from the
006994      ** cell on pPage into the pSpace buffer.
006995      */
006996      pCell = findCell(pPage, pPage->nCell-1);
006997      pStop = &pCell[9];
006998      while( (*(pCell++)&0x80) && pCell<pStop );
006999      pStop = &pCell[9];
007000      while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );
007001  
007002      /* Insert the new divider cell into pParent. */
007003      if( rc==SQLITE_OK ){
007004        insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
007005                     0, pPage->pgno, &rc);
007006      }
007007  
007008      /* Set the right-child pointer of pParent to point to the new page. */
007009      put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);
007010    
007011      /* Release the reference to the new page. */
007012      releasePage(pNew);
007013    }
007014  
007015    return rc;
007016  }
007017  #endif /* SQLITE_OMIT_QUICKBALANCE */
007018  
007019  #if 0
007020  /*
007021  ** This function does not contribute anything to the operation of SQLite.
007022  ** it is sometimes activated temporarily while debugging code responsible 
007023  ** for setting pointer-map entries.
007024  */
007025  static int ptrmapCheckPages(MemPage **apPage, int nPage){
007026    int i, j;
007027    for(i=0; i<nPage; i++){
007028      Pgno n;
007029      u8 e;
007030      MemPage *pPage = apPage[i];
007031      BtShared *pBt = pPage->pBt;
007032      assert( pPage->isInit );
007033  
007034      for(j=0; j<pPage->nCell; j++){
007035        CellInfo info;
007036        u8 *z;
007037       
007038        z = findCell(pPage, j);
007039        pPage->xParseCell(pPage, z, &info);
007040        if( info.nLocal<info.nPayload ){
007041          Pgno ovfl = get4byte(&z[info.nSize-4]);
007042          ptrmapGet(pBt, ovfl, &e, &n);
007043          assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 );
007044        }
007045        if( !pPage->leaf ){
007046          Pgno child = get4byte(z);
007047          ptrmapGet(pBt, child, &e, &n);
007048          assert( n==pPage->pgno && e==PTRMAP_BTREE );
007049        }
007050      }
007051      if( !pPage->leaf ){
007052        Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]);
007053        ptrmapGet(pBt, child, &e, &n);
007054        assert( n==pPage->pgno && e==PTRMAP_BTREE );
007055      }
007056    }
007057    return 1;
007058  }
007059  #endif
007060  
007061  /*
007062  ** This function is used to copy the contents of the b-tree node stored 
007063  ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
007064  ** the pointer-map entries for each child page are updated so that the
007065  ** parent page stored in the pointer map is page pTo. If pFrom contained
007066  ** any cells with overflow page pointers, then the corresponding pointer
007067  ** map entries are also updated so that the parent page is page pTo.
007068  **
007069  ** If pFrom is currently carrying any overflow cells (entries in the
007070  ** MemPage.apOvfl[] array), they are not copied to pTo. 
007071  **
007072  ** Before returning, page pTo is reinitialized using btreeInitPage().
007073  **
007074  ** The performance of this function is not critical. It is only used by 
007075  ** the balance_shallower() and balance_deeper() procedures, neither of
007076  ** which are called often under normal circumstances.
007077  */
007078  static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
007079    if( (*pRC)==SQLITE_OK ){
007080      BtShared * const pBt = pFrom->pBt;
007081      u8 * const aFrom = pFrom->aData;
007082      u8 * const aTo = pTo->aData;
007083      int const iFromHdr = pFrom->hdrOffset;
007084      int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
007085      int rc;
007086      int iData;
007087    
007088    
007089      assert( pFrom->isInit );
007090      assert( pFrom->nFree>=iToHdr );
007091      assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
007092    
007093      /* Copy the b-tree node content from page pFrom to page pTo. */
007094      iData = get2byte(&aFrom[iFromHdr+5]);
007095      memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
007096      memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
007097    
007098      /* Reinitialize page pTo so that the contents of the MemPage structure
007099      ** match the new data. The initialization of pTo can actually fail under
007100      ** fairly obscure circumstances, even though it is a copy of initialized 
007101      ** page pFrom.
007102      */
007103      pTo->isInit = 0;
007104      rc = btreeInitPage(pTo);
007105      if( rc!=SQLITE_OK ){
007106        *pRC = rc;
007107        return;
007108      }
007109    
007110      /* If this is an auto-vacuum database, update the pointer-map entries
007111      ** for any b-tree or overflow pages that pTo now contains the pointers to.
007112      */
007113      if( ISAUTOVACUUM ){
007114        *pRC = setChildPtrmaps(pTo);
007115      }
007116    }
007117  }
007118  
007119  /*
007120  ** This routine redistributes cells on the iParentIdx'th child of pParent
007121  ** (hereafter "the page") and up to 2 siblings so that all pages have about the
007122  ** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
007124  ** side if the page is the first or last child of its parent. If the page 
007125  ** has fewer than 2 siblings (something which can only happen if the page
007126  ** is a root page or a child of a root page) then all available siblings
007127  ** participate in the balancing.
007128  **
007129  ** The number of siblings of the page might be increased or decreased by 
007130  ** one or two in an effort to keep pages nearly full but not over full. 
007131  **
007132  ** Note that when this routine is called, some of the cells on the page
007133  ** might not actually be stored in MemPage.aData[]. This can happen
007134  ** if the page is overfull. This routine ensures that all cells allocated
007135  ** to the page and its siblings fit into MemPage.aData[] before returning.
007136  **
007137  ** In the course of balancing the page and its siblings, cells may be
007138  ** inserted into or removed from the parent page (pParent). Doing so
007139  ** may cause the parent page to become overfull or underfull. If this
007140  ** happens, it is the responsibility of the caller to invoke the correct
007141  ** balancing routine to fix this problem (see the balance() routine). 
007142  **
007143  ** If this routine fails for any reason, it might leave the database
007144  ** in a corrupted state. So if this routine fails, the database should
007145  ** be rolled back.
007146  **
007147  ** The third argument to this function, aOvflSpace, is a pointer to a
007148  ** buffer big enough to hold one page. If while inserting cells into the parent
007149  ** page (pParent) the parent page becomes overfull, this buffer is
007150  ** used to store the parent's overflow cells. Because this function inserts
007151  ** a maximum of four divider cells into the parent page, and the maximum
007152  ** size of a cell stored within an internal node is always less than 1/4
007153  ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
007154  ** enough for all overflow cells.
007155  **
007156  ** If aOvflSpace is set to a null pointer, this function returns 
007157  ** SQLITE_NOMEM.
007158  */
007159  static int balance_nonroot(
007160    MemPage *pParent,               /* Parent page of siblings being balanced */
007161    int iParentIdx,                 /* Index of "the page" in pParent */
007162    u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
007163    int isRoot,                     /* True if pParent is a root-page */
007164    int bBulk                       /* True if this call is part of a bulk load */
007165  ){
007166    BtShared *pBt;               /* The whole database */
007167    int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
007168    int nNew = 0;                /* Number of pages in apNew[] */
007169    int nOld;                    /* Number of pages in apOld[] */
007170    int i, j, k;                 /* Loop counters */
007171    int nxDiv;                   /* Next divider slot in pParent->aCell[] */
007172    int rc = SQLITE_OK;          /* The return code */
007173    u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
007174    int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
007175    int usableSpace;             /* Bytes in pPage beyond the header */
007176    int pageFlags;               /* Value of pPage->aData[0] */
007177    int iSpace1 = 0;             /* First unused byte of aSpace1[] */
007178    int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
007179    int szScratch;               /* Size of scratch memory requested */
007180    MemPage *apOld[NB];          /* pPage and up to two siblings */
007181    MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
007182    u8 *pRight;                  /* Location in parent of right-sibling pointer */
007183    u8 *apDiv[NB-1];             /* Divider cells in pParent */
007184    int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
007185    int cntOld[NB+2];            /* Old index in b.apCell[] */
007186    int szNew[NB+2];             /* Combined size of cells placed on i-th page */
007187    u8 *aSpace1;                 /* Space for copies of dividers cells */
007188    Pgno pgno;                   /* Temp var to store a page number in */
007189    u8 abDone[NB+2];             /* True after i'th new page is populated */
007190    Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
007191    Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
007192    u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
007193    CellArray b;                  /* Parsed information on cells being balanced */
007194  
007195    memset(abDone, 0, sizeof(abDone));
007196    b.nCell = 0;
007197    b.apCell = 0;
007198    pBt = pParent->pBt;
007199    assert( sqlite3_mutex_held(pBt->mutex) );
007200    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007201  
007202  #if 0
007203    TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno));
007204  #endif
007205  
007206    /* At this point pParent may have at most one overflow cell. And if
007207    ** this overflow cell is present, it must be the cell with 
007208    ** index iParentIdx. This scenario comes about when this function
007209    ** is called (indirectly) from sqlite3BtreeDelete().
007210    */
007211    assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
007212    assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
007213  
007214    if( !aOvflSpace ){
007215      return SQLITE_NOMEM_BKPT;
007216    }
007217  
007218    /* Find the sibling pages to balance. Also locate the cells in pParent 
007219    ** that divide the siblings. An attempt is made to find NN siblings on 
007220    ** either side of pPage. More siblings are taken from one side, however, 
007221    ** if there are fewer than NN siblings on the other side. If pParent
007222    ** has NB or fewer children then all children of pParent are taken.  
007223    **
007224    ** This loop also drops the divider cells from the parent page. This
007225    ** way, the remainder of the function does not have to deal with any
007226    ** overflow cells in the parent page, since if any existed they will
007227    ** have already been removed.
007228    */
007229    i = pParent->nOverflow + pParent->nCell;
007230    if( i<2 ){
007231      nxDiv = 0;
007232    }else{
007233      assert( bBulk==0 || bBulk==1 );
007234      if( iParentIdx==0 ){                 
007235        nxDiv = 0;
007236      }else if( iParentIdx==i ){
007237        nxDiv = i-2+bBulk;
007238      }else{
007239        nxDiv = iParentIdx-1;
007240      }
007241      i = 2-bBulk;
007242    }
007243    nOld = i+1;
007244    if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
007245      pRight = &pParent->aData[pParent->hdrOffset+8];
007246    }else{
007247      pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
007248    }
007249    pgno = get4byte(pRight);
007250    while( 1 ){
007251      rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
007252      if( rc ){
007253        memset(apOld, 0, (i+1)*sizeof(MemPage*));
007254        goto balance_cleanup;
007255      }
007256      nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow;
007257      if( (i--)==0 ) break;
007258  
007259      if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
007260        apDiv[i] = pParent->apOvfl[0];
007261        pgno = get4byte(apDiv[i]);
007262        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
007263        pParent->nOverflow = 0;
007264      }else{
007265        apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
007266        pgno = get4byte(apDiv[i]);
007267        szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
007268  
007269        /* Drop the cell from the parent page. apDiv[i] still points to
007270        ** the cell within the parent, even though it has been dropped.
007271        ** This is safe because dropping a cell only overwrites the first
007272        ** four bytes of it, and this function does not need the first
007273        ** four bytes of the divider cell. So the pointer is safe to use
007274        ** later on.  
007275        **
007276        ** But not if we are in secure-delete mode. In secure-delete mode,
007277        ** the dropCell() routine will overwrite the entire cell with zeroes.
007278        ** In this case, temporarily copy the cell into the aOvflSpace[]
007279        ** buffer. It will be copied out again as soon as the aSpace[] buffer
007280        ** is allocated.  */
007281        if( pBt->btsFlags & BTS_FAST_SECURE ){
007282          int iOff;
007283  
007284          iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
007285          if( (iOff+szNew[i])>(int)pBt->usableSize ){
007286            rc = SQLITE_CORRUPT_BKPT;
007287            memset(apOld, 0, (i+1)*sizeof(MemPage*));
007288            goto balance_cleanup;
007289          }else{
007290            memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
007291            apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
007292          }
007293        }
007294        dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
007295      }
007296    }
007297  
007298    /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
007299    ** alignment */
007300    nMaxCells = (nMaxCells + 3)&~3;
007301  
007302    /*
007303    ** Allocate space for memory structures
007304    */
007305    szScratch =
007306         nMaxCells*sizeof(u8*)                       /* b.apCell */
007307       + nMaxCells*sizeof(u16)                       /* b.szCell */
007308       + pBt->pageSize;                              /* aSpace1 */
007309  
007310    assert( szScratch<=6*(int)pBt->pageSize );
007311    b.apCell = sqlite3StackAllocRaw(0, szScratch );
007312    if( b.apCell==0 ){
007313      rc = SQLITE_NOMEM_BKPT;
007314      goto balance_cleanup;
007315    }
007316    b.szCell = (u16*)&b.apCell[nMaxCells];
007317    aSpace1 = (u8*)&b.szCell[nMaxCells];
007318    assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
007319  
007320    /*
007321    ** Load pointers to all cells on sibling pages and the divider cells
007322    ** into the local b.apCell[] array.  Make copies of the divider cells
007323    ** into space obtained from aSpace1[]. The divider cells have already
007324    ** been removed from pParent.
007325    **
007326    ** If the siblings are on leaf pages, then the child pointers of the
007327    ** divider cells are stripped from the cells before they are copied
007328    ** into aSpace1[].  In this way, all cells in b.apCell[] are without
007329    ** child pointers.  If siblings are not leaves, then all cell in
007330    ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
007331    ** are alike.
007332    **
007333    ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
007334    **       leafData:  1 if pPage holds key+data and pParent holds only keys.
007335    */
007336    b.pRef = apOld[0];
007337    leafCorrection = b.pRef->leaf*4;
007338    leafData = b.pRef->intKeyLeaf;
007339    for(i=0; i<nOld; i++){
007340      MemPage *pOld = apOld[i];
007341      int limit = pOld->nCell;
007342      u8 *aData = pOld->aData;
007343      u16 maskPage = pOld->maskPage;
007344      u8 *piCell = aData + pOld->cellOffset;
007345      u8 *piEnd;
007346  
007347      /* Verify that all sibling pages are of the same "type" (table-leaf,
007348      ** table-interior, index-leaf, or index-interior).
007349      */
007350      if( pOld->aData[0]!=apOld[0]->aData[0] ){
007351        rc = SQLITE_CORRUPT_BKPT;
007352        goto balance_cleanup;
007353      }
007354  
007355      /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
007356      ** constains overflow cells, include them in the b.apCell[] array
007357      ** in the correct spot.
007358      **
007359      ** Note that when there are multiple overflow cells, it is always the
007360      ** case that they are sequential and adjacent.  This invariant arises
007361      ** because multiple overflows can only occurs when inserting divider
007362      ** cells into a parent on a prior balance, and divider cells are always
007363      ** adjacent and are inserted in order.  There is an assert() tagged
007364      ** with "NOTE 1" in the overflow cell insertion loop to prove this
007365      ** invariant.
007366      **
007367      ** This must be done in advance.  Once the balance starts, the cell
007368      ** offset section of the btree page will be overwritten and we will no
007369      ** long be able to find the cells if a pointer to each cell is not saved
007370      ** first.
007371      */
007372      memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
007373      if( pOld->nOverflow>0 ){
007374        limit = pOld->aiOvfl[0];
007375        for(j=0; j<limit; j++){
007376          b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
007377          piCell += 2;
007378          b.nCell++;
007379        }
007380        for(k=0; k<pOld->nOverflow; k++){
007381          assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
007382          b.apCell[b.nCell] = pOld->apOvfl[k];
007383          b.nCell++;
007384        }
007385      }
007386      piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
007387      while( piCell<piEnd ){
007388        assert( b.nCell<nMaxCells );
007389        b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
007390        piCell += 2;
007391        b.nCell++;
007392      }
007393  
007394      cntOld[i] = b.nCell;
007395      if( i<nOld-1 && !leafData){
007396        u16 sz = (u16)szNew[i];
007397        u8 *pTemp;
007398        assert( b.nCell<nMaxCells );
007399        b.szCell[b.nCell] = sz;
007400        pTemp = &aSpace1[iSpace1];
007401        iSpace1 += sz;
007402        assert( sz<=pBt->maxLocal+23 );
007403        assert( iSpace1 <= (int)pBt->pageSize );
007404        memcpy(pTemp, apDiv[i], sz);
007405        b.apCell[b.nCell] = pTemp+leafCorrection;
007406        assert( leafCorrection==0 || leafCorrection==4 );
007407        b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
007408        if( !pOld->leaf ){
007409          assert( leafCorrection==0 );
007410          assert( pOld->hdrOffset==0 );
007411          /* The right pointer of the child page pOld becomes the left
007412          ** pointer of the divider cell */
007413          memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
007414        }else{
007415          assert( leafCorrection==4 );
007416          while( b.szCell[b.nCell]<4 ){
007417            /* Do not allow any cells smaller than 4 bytes. If a smaller cell
007418            ** does exist, pad it with 0x00 bytes. */
007419            assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
007420            assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
007421            aSpace1[iSpace1++] = 0x00;
007422            b.szCell[b.nCell]++;
007423          }
007424        }
007425        b.nCell++;
007426      }
007427    }
007428  
007429    /*
007430    ** Figure out the number of pages needed to hold all b.nCell cells.
007431    ** Store this number in "k".  Also compute szNew[] which is the total
007432    ** size of all cells on the i-th page and cntNew[] which is the index
007433    ** in b.apCell[] of the cell that divides page i from page i+1.  
007434    ** cntNew[k] should equal b.nCell.
007435    **
007436    ** Values computed by this block:
007437    **
007438    **           k: The total number of sibling pages
007439    **    szNew[i]: Spaced used on the i-th sibling page.
007440    **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
007441    **              the right of the i-th sibling page.
007442    ** usableSpace: Number of bytes of space available on each sibling.
007443    ** 
007444    */
007445    usableSpace = pBt->usableSize - 12 + leafCorrection;
007446    for(i=0; i<nOld; i++){
007447      MemPage *p = apOld[i];
007448      szNew[i] = usableSpace - p->nFree;
007449      for(j=0; j<p->nOverflow; j++){
007450        szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
007451      }
007452      cntNew[i] = cntOld[i];
007453    }
007454    k = nOld;
007455    for(i=0; i<k; i++){
007456      int sz;
007457      while( szNew[i]>usableSpace ){
007458        if( i+1>=k ){
007459          k = i+2;
007460          if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
007461          szNew[k-1] = 0;
007462          cntNew[k-1] = b.nCell;
007463        }
007464        sz = 2 + cachedCellSize(&b, cntNew[i]-1);
007465        szNew[i] -= sz;
007466        if( !leafData ){
007467          if( cntNew[i]<b.nCell ){
007468            sz = 2 + cachedCellSize(&b, cntNew[i]);
007469          }else{
007470            sz = 0;
007471          }
007472        }
007473        szNew[i+1] += sz;
007474        cntNew[i]--;
007475      }
007476      while( cntNew[i]<b.nCell ){
007477        sz = 2 + cachedCellSize(&b, cntNew[i]);
007478        if( szNew[i]+sz>usableSpace ) break;
007479        szNew[i] += sz;
007480        cntNew[i]++;
007481        if( !leafData ){
007482          if( cntNew[i]<b.nCell ){
007483            sz = 2 + cachedCellSize(&b, cntNew[i]);
007484          }else{
007485            sz = 0;
007486          }
007487        }
007488        szNew[i+1] -= sz;
007489      }
007490      if( cntNew[i]>=b.nCell ){
007491        k = i+1;
007492      }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
007493        rc = SQLITE_CORRUPT_BKPT;
007494        goto balance_cleanup;
007495      }
007496    }
007497  
007498    /*
007499    ** The packing computed by the previous block is biased toward the siblings
007500    ** on the left side (siblings with smaller keys). The left siblings are
007501    ** always nearly full, while the right-most sibling might be nearly empty.
007502    ** The next block of code attempts to adjust the packing of siblings to
007503    ** get a better balance.
007504    **
007505    ** This adjustment is more than an optimization.  The packing above might
007506    ** be so out of balance as to be illegal.  For example, the right-most
007507    ** sibling might be completely empty.  This adjustment is not optional.
007508    */
007509    for(i=k-1; i>0; i--){
007510      int szRight = szNew[i];  /* Size of sibling on the right */
007511      int szLeft = szNew[i-1]; /* Size of sibling on the left */
007512      int r;              /* Index of right-most cell in left sibling */
007513      int d;              /* Index of first cell to the left of right sibling */
007514  
007515      r = cntNew[i-1] - 1;
007516      d = r + 1 - leafData;
007517      (void)cachedCellSize(&b, d);
007518      do{
007519        assert( d<nMaxCells );
007520        assert( r<nMaxCells );
007521        (void)cachedCellSize(&b, r);
007522        if( szRight!=0
007523         && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
007524          break;
007525        }
007526        szRight += b.szCell[d] + 2;
007527        szLeft -= b.szCell[r] + 2;
007528        cntNew[i-1] = r;
007529        r--;
007530        d--;
007531      }while( r>=0 );
007532      szNew[i] = szRight;
007533      szNew[i-1] = szLeft;
007534      if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
007535        rc = SQLITE_CORRUPT_BKPT;
007536        goto balance_cleanup;
007537      }
007538    }
007539  
007540    /* Sanity check:  For a non-corrupt database file one of the follwing
007541    ** must be true:
007542    **    (1) We found one or more cells (cntNew[0])>0), or
007543    **    (2) pPage is a virtual root page.  A virtual root page is when
007544    **        the real root page is page 1 and we are the only child of
007545    **        that page.
007546    */
007547    assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
007548    TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
007549      apOld[0]->pgno, apOld[0]->nCell,
007550      nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
007551      nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
007552    ));
007553  
007554    /*
007555    ** Allocate k new pages.  Reuse old pages where possible.
007556    */
007557    pageFlags = apOld[0]->aData[0];
007558    for(i=0; i<k; i++){
007559      MemPage *pNew;
007560      if( i<nOld ){
007561        pNew = apNew[i] = apOld[i];
007562        apOld[i] = 0;
007563        rc = sqlite3PagerWrite(pNew->pDbPage);
007564        nNew++;
007565        if( rc ) goto balance_cleanup;
007566      }else{
007567        assert( i>0 );
007568        rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
007569        if( rc ) goto balance_cleanup;
007570        zeroPage(pNew, pageFlags);
007571        apNew[i] = pNew;
007572        nNew++;
007573        cntOld[i] = b.nCell;
007574  
007575        /* Set the pointer-map entry for the new sibling page. */
007576        if( ISAUTOVACUUM ){
007577          ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
007578          if( rc!=SQLITE_OK ){
007579            goto balance_cleanup;
007580          }
007581        }
007582      }
007583    }
007584  
007585    /*
007586    ** Reassign page numbers so that the new pages are in ascending order. 
007587    ** This helps to keep entries in the disk file in order so that a scan
007588    ** of the table is closer to a linear scan through the file. That in turn 
007589    ** helps the operating system to deliver pages from the disk more rapidly.
007590    **
007591    ** An O(n^2) insertion sort algorithm is used, but since n is never more 
007592    ** than (NB+2) (a small constant), that should not be a problem.
007593    **
007594    ** When NB==3, this one optimization makes the database about 25% faster 
007595    ** for large insertions and deletions.
007596    */
007597    for(i=0; i<nNew; i++){
007598      aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
007599      aPgFlags[i] = apNew[i]->pDbPage->flags;
007600      for(j=0; j<i; j++){
007601        if( aPgno[j]==aPgno[i] ){
007602          /* This branch is taken if the set of sibling pages somehow contains
007603          ** duplicate entries. This can happen if the database is corrupt. 
007604          ** It would be simpler to detect this as part of the loop below, but
007605          ** we do the detection here in order to avoid populating the pager
007606          ** cache with two separate objects associated with the same
007607          ** page number.  */
007608          assert( CORRUPT_DB );
007609          rc = SQLITE_CORRUPT_BKPT;
007610          goto balance_cleanup;
007611        }
007612      }
007613    }
007614    for(i=0; i<nNew; i++){
007615      int iBest = 0;                /* aPgno[] index of page number to use */
007616      for(j=1; j<nNew; j++){
007617        if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
007618      }
007619      pgno = aPgOrder[iBest];
007620      aPgOrder[iBest] = 0xffffffff;
007621      if( iBest!=i ){
007622        if( iBest>i ){
007623          sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
007624        }
007625        sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
007626        apNew[i]->pgno = pgno;
007627      }
007628    }
007629  
007630    TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
007631           "%d(%d nc=%d) %d(%d nc=%d)\n",
007632      apNew[0]->pgno, szNew[0], cntNew[0],
007633      nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
007634      nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
007635      nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
007636      nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
007637      nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
007638      nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
007639      nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
007640      nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
007641    ));
007642  
007643    assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007644    put4byte(pRight, apNew[nNew-1]->pgno);
007645  
007646    /* If the sibling pages are not leaves, ensure that the right-child pointer
007647    ** of the right-most new sibling page is set to the value that was 
007648    ** originally in the same field of the right-most old sibling page. */
007649    if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
007650      MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
007651      memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
007652    }
007653  
007654    /* Make any required updates to pointer map entries associated with 
007655    ** cells stored on sibling pages following the balance operation. Pointer
007656    ** map entries associated with divider cells are set by the insertCell()
007657    ** routine. The associated pointer map entries are:
007658    **
007659    **   a) if the cell contains a reference to an overflow chain, the
007660    **      entry associated with the first page in the overflow chain, and
007661    **
007662    **   b) if the sibling pages are not leaves, the child page associated
007663    **      with the cell.
007664    **
007665    ** If the sibling pages are not leaves, then the pointer map entry 
007666    ** associated with the right-child of each sibling may also need to be 
007667    ** updated. This happens below, after the sibling pages have been 
007668    ** populated, not here.
007669    */
007670    if( ISAUTOVACUUM ){
007671      MemPage *pNew = apNew[0];
007672      u8 *aOld = pNew->aData;
007673      int cntOldNext = pNew->nCell + pNew->nOverflow;
007674      int usableSize = pBt->usableSize;
007675      int iNew = 0;
007676      int iOld = 0;
007677  
007678      for(i=0; i<b.nCell; i++){
007679        u8 *pCell = b.apCell[i];
007680        if( i==cntOldNext ){
007681          MemPage *pOld = (++iOld)<nNew ? apNew[iOld] : apOld[iOld];
007682          cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
007683          aOld = pOld->aData;
007684        }
007685        if( i==cntNew[iNew] ){
007686          pNew = apNew[++iNew];
007687          if( !leafData ) continue;
007688        }
007689  
007690        /* Cell pCell is destined for new sibling page pNew. Originally, it
007691        ** was either part of sibling page iOld (possibly an overflow cell), 
007692        ** or else the divider cell to the left of sibling page iOld. So,
007693        ** if sibling page iOld had the same page number as pNew, and if
007694        ** pCell really was a part of sibling page iOld (not a divider or
007695        ** overflow cell), we can skip updating the pointer map entries.  */
007696        if( iOld>=nNew
007697         || pNew->pgno!=aPgno[iOld]
007698         || !SQLITE_WITHIN(pCell,aOld,&aOld[usableSize])
007699        ){
007700          if( !leafCorrection ){
007701            ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
007702          }
007703          if( cachedCellSize(&b,i)>pNew->minLocal ){
007704            ptrmapPutOvflPtr(pNew, pCell, &rc);
007705          }
007706          if( rc ) goto balance_cleanup;
007707        }
007708      }
007709    }
007710  
007711    /* Insert new divider cells into pParent. */
007712    for(i=0; i<nNew-1; i++){
007713      u8 *pCell;
007714      u8 *pTemp;
007715      int sz;
007716      MemPage *pNew = apNew[i];
007717      j = cntNew[i];
007718  
007719      assert( j<nMaxCells );
007720      assert( b.apCell[j]!=0 );
007721      pCell = b.apCell[j];
007722      sz = b.szCell[j] + leafCorrection;
007723      pTemp = &aOvflSpace[iOvflSpace];
007724      if( !pNew->leaf ){
007725        memcpy(&pNew->aData[8], pCell, 4);
007726      }else if( leafData ){
007727        /* If the tree is a leaf-data tree, and the siblings are leaves, 
007728        ** then there is no divider cell in b.apCell[]. Instead, the divider 
007729        ** cell consists of the integer key for the right-most cell of 
007730        ** the sibling-page assembled above only.
007731        */
007732        CellInfo info;
007733        j--;
007734        pNew->xParseCell(pNew, b.apCell[j], &info);
007735        pCell = pTemp;
007736        sz = 4 + putVarint(&pCell[4], info.nKey);
007737        pTemp = 0;
007738      }else{
007739        pCell -= 4;
007740        /* Obscure case for non-leaf-data trees: If the cell at pCell was
007741        ** previously stored on a leaf node, and its reported size was 4
007742        ** bytes, then it may actually be smaller than this 
007743        ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
007744        ** any cell). But it is important to pass the correct size to 
007745        ** insertCell(), so reparse the cell now.
007746        **
007747        ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
007748        ** and WITHOUT ROWID tables with exactly one column which is the
007749        ** primary key.
007750        */
007751        if( b.szCell[j]==4 ){
007752          assert(leafCorrection==4);
007753          sz = pParent->xCellSize(pParent, pCell);
007754        }
007755      }
007756      iOvflSpace += sz;
007757      assert( sz<=pBt->maxLocal+23 );
007758      assert( iOvflSpace <= (int)pBt->pageSize );
007759      insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
007760      if( rc!=SQLITE_OK ) goto balance_cleanup;
007761      assert( sqlite3PagerIswriteable(pParent->pDbPage) );
007762    }
007763  
007764    /* Now update the actual sibling pages. The order in which they are updated
007765    ** is important, as this code needs to avoid disrupting any page from which
007766    ** cells may still to be read. In practice, this means:
007767    **
007768    **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
007769    **      then it is not safe to update page apNew[iPg] until after
007770    **      the left-hand sibling apNew[iPg-1] has been updated.
007771    **
007772    **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
007773    **      then it is not safe to update page apNew[iPg] until after
007774    **      the right-hand sibling apNew[iPg+1] has been updated.
007775    **
007776    ** If neither of the above apply, the page is safe to update.
007777    **
007778    ** The iPg value in the following loop starts at nNew-1 goes down
007779    ** to 0, then back up to nNew-1 again, thus making two passes over
007780    ** the pages.  On the initial downward pass, only condition (1) above
007781    ** needs to be tested because (2) will always be true from the previous
007782    ** step.  On the upward pass, both conditions are always true, so the
007783    ** upwards pass simply processes pages that were missed on the downward
007784    ** pass.
007785    */
007786    for(i=1-nNew; i<nNew; i++){
007787      int iPg = i<0 ? -i : i;
007788      assert( iPg>=0 && iPg<nNew );
007789      if( abDone[iPg] ) continue;         /* Skip pages already processed */
007790      if( i>=0                            /* On the upwards pass, or... */
007791       || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
007792      ){
007793        int iNew;
007794        int iOld;
007795        int nNewCell;
007796  
007797        /* Verify condition (1):  If cells are moving left, update iPg
007798        ** only after iPg-1 has already been updated. */
007799        assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
007800  
007801        /* Verify condition (2):  If cells are moving right, update iPg
007802        ** only after iPg+1 has already been updated. */
007803        assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
007804  
007805        if( iPg==0 ){
007806          iNew = iOld = 0;
007807          nNewCell = cntNew[0];
007808        }else{
007809          iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
007810          iNew = cntNew[iPg-1] + !leafData;
007811          nNewCell = cntNew[iPg] - iNew;
007812        }
007813  
007814        rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
007815        if( rc ) goto balance_cleanup;
007816        abDone[iPg]++;
007817        apNew[iPg]->nFree = usableSpace-szNew[iPg];
007818        assert( apNew[iPg]->nOverflow==0 );
007819        assert( apNew[iPg]->nCell==nNewCell );
007820      }
007821    }
007822  
007823    /* All pages have been processed exactly once */
007824    assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
007825  
007826    assert( nOld>0 );
007827    assert( nNew>0 );
007828  
007829    if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
007830      /* The root page of the b-tree now contains no cells. The only sibling
007831      ** page is the right-child of the parent. Copy the contents of the
007832      ** child page into the parent, decreasing the overall height of the
007833      ** b-tree structure by one. This is described as the "balance-shallower"
007834      ** sub-algorithm in some documentation.
007835      **
007836      ** If this is an auto-vacuum database, the call to copyNodeContent() 
007837      ** sets all pointer-map entries corresponding to database image pages 
007838      ** for which the pointer is stored within the content being copied.
007839      **
007840      ** It is critical that the child page be defragmented before being
007841      ** copied into the parent, because if the parent is page 1 then it will
007842      ** by smaller than the child due to the database header, and so all the
007843      ** free space needs to be up front.
007844      */
007845      assert( nNew==1 || CORRUPT_DB );
007846      rc = defragmentPage(apNew[0], -1);
007847      testcase( rc!=SQLITE_OK );
007848      assert( apNew[0]->nFree == 
007849          (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2)
007850        || rc!=SQLITE_OK
007851      );
007852      copyNodeContent(apNew[0], pParent, &rc);
007853      freePage(apNew[0], &rc);
007854    }else if( ISAUTOVACUUM && !leafCorrection ){
007855      /* Fix the pointer map entries associated with the right-child of each
007856      ** sibling page. All other pointer map entries have already been taken
007857      ** care of.  */
007858      for(i=0; i<nNew; i++){
007859        u32 key = get4byte(&apNew[i]->aData[8]);
007860        ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
007861      }
007862    }
007863  
007864    assert( pParent->isInit );
007865    TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
007866            nOld, nNew, b.nCell));
007867  
007868    /* Free any old pages that were not reused as new pages.
007869    */
007870    for(i=nNew; i<nOld; i++){
007871      freePage(apOld[i], &rc);
007872    }
007873  
007874  #if 0
007875    if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
007876      /* The ptrmapCheckPages() contains assert() statements that verify that
007877      ** all pointer map pages are set correctly. This is helpful while 
007878      ** debugging. This is usually disabled because a corrupt database may
007879      ** cause an assert() statement to fail.  */
007880      ptrmapCheckPages(apNew, nNew);
007881      ptrmapCheckPages(&pParent, 1);
007882    }
007883  #endif
007884  
007885    /*
007886    ** Cleanup before returning.
007887    */
007888  balance_cleanup:
007889    sqlite3StackFree(0, b.apCell);
007890    for(i=0; i<nOld; i++){
007891      releasePage(apOld[i]);
007892    }
007893    for(i=0; i<nNew; i++){
007894      releasePage(apNew[i]);
007895    }
007896  
007897    return rc;
007898  }
007899  
007900  
007901  /*
007902  ** This function is called when the root page of a b-tree structure is
007903  ** overfull (has one or more overflow cells).
007904  **
007905  ** A new child page is allocated and the contents of the current root
007906  ** page, including overflow cells, are copied into the child. The root
007907  ** page is then overwritten to make it an empty page with the right-child 
007908  ** pointer pointing to the new page.
007909  **
007910  ** Before returning, all pointer-map entries corresponding to pages 
007911  ** that the new child-page now contains pointers to are updated. The
007912  ** entry corresponding to the new right-child pointer of the root
007913  ** page is also updated.
007914  **
007915  ** If successful, *ppChild is set to contain a reference to the child 
007916  ** page and SQLITE_OK is returned. In this case the caller is required
007917  ** to call releasePage() on *ppChild exactly once. If an error occurs,
007918  ** an error code is returned and *ppChild is set to 0.
007919  */
007920  static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
007921    int rc;                        /* Return value from subprocedures */
007922    MemPage *pChild = 0;           /* Pointer to a new child page */
007923    Pgno pgnoChild = 0;            /* Page number of the new child page */
007924    BtShared *pBt = pRoot->pBt;    /* The BTree */
007925  
007926    assert( pRoot->nOverflow>0 );
007927    assert( sqlite3_mutex_held(pBt->mutex) );
007928  
007929    /* Make pRoot, the root page of the b-tree, writable. Allocate a new 
007930    ** page that will become the new right-child of pPage. Copy the contents
007931    ** of the node stored on pRoot into the new child page.
007932    */
007933    rc = sqlite3PagerWrite(pRoot->pDbPage);
007934    if( rc==SQLITE_OK ){
007935      rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0);
007936      copyNodeContent(pRoot, pChild, &rc);
007937      if( ISAUTOVACUUM ){
007938        ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc);
007939      }
007940    }
007941    if( rc ){
007942      *ppChild = 0;
007943      releasePage(pChild);
007944      return rc;
007945    }
007946    assert( sqlite3PagerIswriteable(pChild->pDbPage) );
007947    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
007948    assert( pChild->nCell==pRoot->nCell );
007949  
007950    TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno));
007951  
007952    /* Copy the overflow cells from pRoot to pChild */
007953    memcpy(pChild->aiOvfl, pRoot->aiOvfl,
007954           pRoot->nOverflow*sizeof(pRoot->aiOvfl[0]));
007955    memcpy(pChild->apOvfl, pRoot->apOvfl,
007956           pRoot->nOverflow*sizeof(pRoot->apOvfl[0]));
007957    pChild->nOverflow = pRoot->nOverflow;
007958  
007959    /* Zero the contents of pRoot. Then install pChild as the right-child. */
007960    zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF);
007961    put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild);
007962  
007963    *ppChild = pChild;
007964    return SQLITE_OK;
007965  }
007966  
007967  /*
007968  ** The page that pCur currently points to has just been modified in
007969  ** some way. This function figures out if this modification means the
007970  ** tree needs to be balanced, and if so calls the appropriate balancing 
007971  ** routine. Balancing routines are:
007972  **
007973  **   balance_quick()
007974  **   balance_deeper()
007975  **   balance_nonroot()
007976  */
007977  static int balance(BtCursor *pCur){
007978    int rc = SQLITE_OK;
007979    const int nMin = pCur->pBt->usableSize * 2 / 3;
007980    u8 aBalanceQuickSpace[13];
007981    u8 *pFree = 0;
007982  
007983    VVA_ONLY( int balance_quick_called = 0 );
007984    VVA_ONLY( int balance_deeper_called = 0 );
007985  
007986    do {
007987      int iPage = pCur->iPage;
007988      MemPage *pPage = pCur->pPage;
007989  
007990      if( iPage==0 ){
007991        if( pPage->nOverflow ){
007992          /* The root page of the b-tree is overfull. In this case call the
007993          ** balance_deeper() function to create a new child for the root-page
007994          ** and copy the current contents of the root-page to it. The
007995          ** next iteration of the do-loop will balance the child page.
007996          */ 
007997          assert( balance_deeper_called==0 );
007998          VVA_ONLY( balance_deeper_called++ );
007999          rc = balance_deeper(pPage, &pCur->apPage[1]);
008000          if( rc==SQLITE_OK ){
008001            pCur->iPage = 1;
008002            pCur->ix = 0;
008003            pCur->aiIdx[0] = 0;
008004            pCur->apPage[0] = pPage;
008005            pCur->pPage = pCur->apPage[1];
008006            assert( pCur->pPage->nOverflow );
008007          }
008008        }else{
008009          break;
008010        }
008011      }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
008012        break;
008013      }else{
008014        MemPage * const pParent = pCur->apPage[iPage-1];
008015        int const iIdx = pCur->aiIdx[iPage-1];
008016  
008017        rc = sqlite3PagerWrite(pParent->pDbPage);
008018        if( rc==SQLITE_OK ){
008019  #ifndef SQLITE_OMIT_QUICKBALANCE
008020          if( pPage->intKeyLeaf
008021           && pPage->nOverflow==1
008022           && pPage->aiOvfl[0]==pPage->nCell
008023           && pParent->pgno!=1
008024           && pParent->nCell==iIdx
008025          ){
008026            /* Call balance_quick() to create a new sibling of pPage on which
008027            ** to store the overflow cell. balance_quick() inserts a new cell
008028            ** into pParent, which may cause pParent overflow. If this
008029            ** happens, the next iteration of the do-loop will balance pParent 
008030            ** use either balance_nonroot() or balance_deeper(). Until this
008031            ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
008032            ** buffer. 
008033            **
008034            ** The purpose of the following assert() is to check that only a
008035            ** single call to balance_quick() is made for each call to this
008036            ** function. If this were not verified, a subtle bug involving reuse
008037            ** of the aBalanceQuickSpace[] might sneak in.
008038            */
008039            assert( balance_quick_called==0 ); 
008040            VVA_ONLY( balance_quick_called++ );
008041            rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
008042          }else
008043  #endif
008044          {
008045            /* In this case, call balance_nonroot() to redistribute cells
008046            ** between pPage and up to 2 of its sibling pages. This involves
008047            ** modifying the contents of pParent, which may cause pParent to
008048            ** become overfull or underfull. The next iteration of the do-loop
008049            ** will balance the parent page to correct this.
008050            ** 
008051            ** If the parent page becomes overfull, the overflow cell or cells
008052            ** are stored in the pSpace buffer allocated immediately below. 
008053            ** A subsequent iteration of the do-loop will deal with this by
008054            ** calling balance_nonroot() (balance_deeper() may be called first,
008055            ** but it doesn't deal with overflow cells - just moves them to a
008056            ** different page). Once this subsequent call to balance_nonroot() 
008057            ** has completed, it is safe to release the pSpace buffer used by
008058            ** the previous call, as the overflow cell data will have been 
008059            ** copied either into the body of a database page or into the new
008060            ** pSpace buffer passed to the latter call to balance_nonroot().
008061            */
008062            u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
008063            rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
008064                                 pCur->hints&BTREE_BULKLOAD);
008065            if( pFree ){
008066              /* If pFree is not NULL, it points to the pSpace buffer used 
008067              ** by a previous call to balance_nonroot(). Its contents are
008068              ** now stored either on real database pages or within the 
008069              ** new pSpace buffer, so it may be safely freed here. */
008070              sqlite3PageFree(pFree);
008071            }
008072  
008073            /* The pSpace buffer will be freed after the next call to
008074            ** balance_nonroot(), or just before this function returns, whichever
008075            ** comes first. */
008076            pFree = pSpace;
008077          }
008078        }
008079  
008080        pPage->nOverflow = 0;
008081  
008082        /* The next iteration of the do-loop balances the parent page. */
008083        releasePage(pPage);
008084        pCur->iPage--;
008085        assert( pCur->iPage>=0 );
008086        pCur->pPage = pCur->apPage[pCur->iPage];
008087      }
008088    }while( rc==SQLITE_OK );
008089  
008090    if( pFree ){
008091      sqlite3PageFree(pFree);
008092    }
008093    return rc;
008094  }
008095  
008096  
008097  /*
008098  ** Insert a new record into the BTree.  The content of the new record
008099  ** is described by the pX object.  The pCur cursor is used only to
008100  ** define what table the record should be inserted into, and is left
008101  ** pointing at a random location.
008102  **
008103  ** For a table btree (used for rowid tables), only the pX.nKey value of
008104  ** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
008105  ** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
008106  ** hold the content of the row.
008107  **
008108  ** For an index btree (used for indexes and WITHOUT ROWID tables), the
008109  ** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The 
008110  ** pX.pData,nData,nZero fields must be zero.
008111  **
008112  ** If the seekResult parameter is non-zero, then a successful call to
008113  ** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
008114  ** been performed.  In other words, if seekResult!=0 then the cursor
008115  ** is currently pointing to a cell that will be adjacent to the cell
008116  ** to be inserted.  If seekResult<0 then pCur points to a cell that is
008117  ** smaller than (pKey,nKey).  If seekResult>0 then pCur points to a cell
008118  ** that is larger than (pKey,nKey).
008119  **
008120  ** If seekResult==0, that means pCur is pointing at some unknown location.
008121  ** In that case, this routine must seek the cursor to the correct insertion
008122  ** point for (pKey,nKey) before doing the insertion.  For index btrees,
008123  ** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
008124  ** key values and pX->aMem can be used instead of pX->pKey to avoid having
008125  ** to decode the key.
008126  */
008127  int sqlite3BtreeInsert(
008128    BtCursor *pCur,                /* Insert data into the table of this cursor */
008129    const BtreePayload *pX,        /* Content of the row to be inserted */
008130    int flags,                     /* True if this is likely an append */
008131    int seekResult                 /* Result of prior MovetoUnpacked() call */
008132  ){
008133    int rc;
008134    int loc = seekResult;          /* -1: before desired location  +1: after */
008135    int szNew = 0;
008136    int idx;
008137    MemPage *pPage;
008138    Btree *p = pCur->pBtree;
008139    BtShared *pBt = p->pBt;
008140    unsigned char *oldCell;
008141    unsigned char *newCell = 0;
008142  
008143    assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );
008144  
008145    if( pCur->eState==CURSOR_FAULT ){
008146      assert( pCur->skipNext!=SQLITE_OK );
008147      return pCur->skipNext;
008148    }
008149  
008150    assert( cursorOwnsBtShared(pCur) );
008151    assert( (pCur->curFlags & BTCF_WriteFlag)!=0
008152                && pBt->inTransaction==TRANS_WRITE
008153                && (pBt->btsFlags & BTS_READ_ONLY)==0 );
008154    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
008155  
008156    /* Assert that the caller has been consistent. If this cursor was opened
008157    ** expecting an index b-tree, then the caller should be inserting blob
008158    ** keys with no associated data. If the cursor was opened expecting an
008159    ** intkey table, the caller should be inserting integer keys with a
008160    ** blob of associated data.  */
008161    assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );
008162  
008163    /* Save the positions of any other cursors open on this table.
008164    **
008165    ** In some cases, the call to btreeMoveto() below is a no-op. For
008166    ** example, when inserting data into a table with auto-generated integer
008167    ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 
008168    ** integer key to use. It then calls this function to actually insert the 
008169    ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
008170    ** that the cursor is already where it needs to be and returns without
008171    ** doing any work. To avoid thwarting these optimizations, it is important
008172    ** not to clear the cursor here.
008173    */
008174    if( pCur->curFlags & BTCF_Multiple ){
008175      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
008176      if( rc ) return rc;
008177    }
008178  
008179    if( pCur->pKeyInfo==0 ){
008180      assert( pX->pKey==0 );
008181      /* If this is an insert into a table b-tree, invalidate any incrblob 
008182      ** cursors open on the row being replaced */
008183      invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);
008184  
008185      /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing 
008186      ** to a row with the same key as the new entry being inserted.  */
008187      assert( (flags & BTREE_SAVEPOSITION)==0 || 
008188              ((pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey) );
008189  
008190      /* If the cursor is currently on the last row and we are appending a
008191      ** new row onto the end, set the "loc" to avoid an unnecessary
008192      ** btreeMoveto() call */
008193      if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
008194        loc = 0;
008195      }else if( loc==0 ){
008196        rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
008197        if( rc ) return rc;
008198      }
008199    }else if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
008200      if( pX->nMem ){
008201        UnpackedRecord r;
008202        r.pKeyInfo = pCur->pKeyInfo;
008203        r.aMem = pX->aMem;
008204        r.nField = pX->nMem;
008205        r.default_rc = 0;
008206        r.errCode = 0;
008207        r.r1 = 0;
008208        r.r2 = 0;
008209        r.eqSeen = 0;
008210        rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
008211      }else{
008212        rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
008213      }
008214      if( rc ) return rc;
008215    }
008216    assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) );
008217  
008218    pPage = pCur->pPage;
008219    assert( pPage->intKey || pX->nKey>=0 );
008220    assert( pPage->leaf || !pPage->intKey );
008221  
008222    TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
008223            pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
008224            loc==0 ? "overwrite" : "new entry"));
008225    assert( pPage->isInit );
008226    newCell = pBt->pTmpSpace;
008227    assert( newCell!=0 );
008228    rc = fillInCell(pPage, newCell, pX, &szNew);
008229    if( rc ) goto end_insert;
008230    assert( szNew==pPage->xCellSize(pPage, newCell) );
008231    assert( szNew <= MX_CELL_SIZE(pBt) );
008232    idx = pCur->ix;
008233    if( loc==0 ){
008234      CellInfo info;
008235      assert( idx<pPage->nCell );
008236      rc = sqlite3PagerWrite(pPage->pDbPage);
008237      if( rc ){
008238        goto end_insert;
008239      }
008240      oldCell = findCell(pPage, idx);
008241      if( !pPage->leaf ){
008242        memcpy(newCell, oldCell, 4);
008243      }
008244      rc = clearCell(pPage, oldCell, &info);
008245      if( info.nSize==szNew && info.nLocal==info.nPayload 
008246       && (!ISAUTOVACUUM || szNew<pPage->minLocal)
008247      ){
008248        /* Overwrite the old cell with the new if they are the same size.
008249        ** We could also try to do this if the old cell is smaller, then add
008250        ** the leftover space to the free list.  But experiments show that
008251        ** doing that is no faster then skipping this optimization and just
008252        ** calling dropCell() and insertCell(). 
008253        **
008254        ** This optimization cannot be used on an autovacuum database if the
008255        ** new entry uses overflow pages, as the insertCell() call below is
008256        ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
008257        assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
008258        if( oldCell+szNew > pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT;
008259        memcpy(oldCell, newCell, szNew);
008260        return SQLITE_OK;
008261      }
008262      dropCell(pPage, idx, info.nSize, &rc);
008263      if( rc ) goto end_insert;
008264    }else if( loc<0 && pPage->nCell>0 ){
008265      assert( pPage->leaf );
008266      idx = ++pCur->ix;
008267      pCur->curFlags &= ~BTCF_ValidNKey;
008268    }else{
008269      assert( pPage->leaf );
008270    }
008271    insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
008272    assert( pPage->nOverflow==0 || rc==SQLITE_OK );
008273    assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );
008274  
008275    /* If no error has occurred and pPage has an overflow cell, call balance() 
008276    ** to redistribute the cells within the tree. Since balance() may move
008277    ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
008278    ** variables.
008279    **
008280    ** Previous versions of SQLite called moveToRoot() to move the cursor
008281    ** back to the root page as balance() used to invalidate the contents
008282    ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
008283    ** set the cursor state to "invalid". This makes common insert operations
008284    ** slightly faster.
008285    **
008286    ** There is a subtle but important optimization here too. When inserting
008287    ** multiple records into an intkey b-tree using a single cursor (as can
008288    ** happen while processing an "INSERT INTO ... SELECT" statement), it
008289    ** is advantageous to leave the cursor pointing to the last entry in
008290    ** the b-tree if possible. If the cursor is left pointing to the last
008291    ** entry in the table, and the next row inserted has an integer key
008292    ** larger than the largest existing key, it is possible to insert the
008293    ** row without seeking the cursor. This can be a big performance boost.
008294    */
008295    pCur->info.nSize = 0;
008296    if( pPage->nOverflow ){
008297      assert( rc==SQLITE_OK );
008298      pCur->curFlags &= ~(BTCF_ValidNKey);
008299      rc = balance(pCur);
008300  
008301      /* Must make sure nOverflow is reset to zero even if the balance()
008302      ** fails. Internal data structure corruption will result otherwise. 
008303      ** Also, set the cursor state to invalid. This stops saveCursorPosition()
008304      ** from trying to save the current position of the cursor.  */
008305      pCur->pPage->nOverflow = 0;
008306      pCur->eState = CURSOR_INVALID;
008307      if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
008308        btreeReleaseAllCursorPages(pCur);
008309        if( pCur->pKeyInfo ){
008310          assert( pCur->pKey==0 );
008311          pCur->pKey = sqlite3Malloc( pX->nKey );
008312          if( pCur->pKey==0 ){
008313            rc = SQLITE_NOMEM;
008314          }else{
008315            memcpy(pCur->pKey, pX->pKey, pX->nKey);
008316          }
008317        }
008318        pCur->eState = CURSOR_REQUIRESEEK;
008319        pCur->nKey = pX->nKey;
008320      }
008321    }
008322    assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );
008323  
008324  end_insert:
008325    return rc;
008326  }
008327  
008328  /*
008329  ** Delete the entry that the cursor is pointing to. 
008330  **
008331  ** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
008332  ** the cursor is left pointing at an arbitrary location after the delete.
008333  ** But if that bit is set, then the cursor is left in a state such that
008334  ** the next call to BtreeNext() or BtreePrev() moves it to the same row
008335  ** as it would have been on if the call to BtreeDelete() had been omitted.
008336  **
008337  ** The BTREE_AUXDELETE bit of flags indicates that is one of several deletes
008338  ** associated with a single table entry and its indexes.  Only one of those
008339  ** deletes is considered the "primary" delete.  The primary delete occurs
008340  ** on a cursor that is not a BTREE_FORDELETE cursor.  All but one delete
008341  ** operation on non-FORDELETE cursors is tagged with the AUXDELETE flag.
008342  ** The BTREE_AUXDELETE bit is a hint that is not used by this implementation,
008343  ** but which might be used by alternative storage engines.
008344  */
008345  int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
008346    Btree *p = pCur->pBtree;
008347    BtShared *pBt = p->pBt;              
008348    int rc;                              /* Return code */
008349    MemPage *pPage;                      /* Page to delete cell from */
008350    unsigned char *pCell;                /* Pointer to cell to delete */
008351    int iCellIdx;                        /* Index of cell to delete */
008352    int iCellDepth;                      /* Depth of node containing pCell */ 
008353    CellInfo info;                       /* Size of the cell being deleted */
008354    int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
008355    u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */
008356  
008357    assert( cursorOwnsBtShared(pCur) );
008358    assert( pBt->inTransaction==TRANS_WRITE );
008359    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
008360    assert( pCur->curFlags & BTCF_WriteFlag );
008361    assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
008362    assert( !hasReadConflicts(p, pCur->pgnoRoot) );
008363    assert( pCur->ix<pCur->pPage->nCell );
008364    assert( pCur->eState==CURSOR_VALID );
008365    assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
008366  
008367    iCellDepth = pCur->iPage;
008368    iCellIdx = pCur->ix;
008369    pPage = pCur->pPage;
008370    pCell = findCell(pPage, iCellIdx);
008371  
008372    /* If the bPreserve flag is set to true, then the cursor position must
008373    ** be preserved following this delete operation. If the current delete
008374    ** will cause a b-tree rebalance, then this is done by saving the cursor
008375    ** key and leaving the cursor in CURSOR_REQUIRESEEK state before 
008376    ** returning. 
008377    **
008378    ** Or, if the current delete will not cause a rebalance, then the cursor
008379    ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
008380    ** before or after the deleted entry. In this case set bSkipnext to true.  */
008381    if( bPreserve ){
008382      if( !pPage->leaf 
008383       || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
008384      ){
008385        /* A b-tree rebalance will be required after deleting this entry.
008386        ** Save the cursor key.  */
008387        rc = saveCursorKey(pCur);
008388        if( rc ) return rc;
008389      }else{
008390        bSkipnext = 1;
008391      }
008392    }
008393  
008394    /* If the page containing the entry to delete is not a leaf page, move
008395    ** the cursor to the largest entry in the tree that is smaller than
008396    ** the entry being deleted. This cell will replace the cell being deleted
008397    ** from the internal node. The 'previous' entry is used for this instead
008398    ** of the 'next' entry, as the previous entry is always a part of the
008399    ** sub-tree headed by the child page of the cell being deleted. This makes
008400    ** balancing the tree following the delete operation easier.  */
008401    if( !pPage->leaf ){
008402      rc = sqlite3BtreePrevious(pCur, 0);
008403      assert( rc!=SQLITE_DONE );
008404      if( rc ) return rc;
008405    }
008406  
008407    /* Save the positions of any other cursors open on this table before
008408    ** making any modifications.  */
008409    if( pCur->curFlags & BTCF_Multiple ){
008410      rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
008411      if( rc ) return rc;
008412    }
008413  
008414    /* If this is a delete operation to remove a row from a table b-tree,
008415    ** invalidate any incrblob cursors open on the row being deleted.  */
008416    if( pCur->pKeyInfo==0 ){
008417      invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
008418    }
008419  
008420    /* Make the page containing the entry to be deleted writable. Then free any
008421    ** overflow pages associated with the entry and finally remove the cell
008422    ** itself from within the page.  */
008423    rc = sqlite3PagerWrite(pPage->pDbPage);
008424    if( rc ) return rc;
008425    rc = clearCell(pPage, pCell, &info);
008426    dropCell(pPage, iCellIdx, info.nSize, &rc);
008427    if( rc ) return rc;
008428  
008429    /* If the cell deleted was not located on a leaf page, then the cursor
008430    ** is currently pointing to the largest entry in the sub-tree headed
008431    ** by the child-page of the cell that was just deleted from an internal
008432    ** node. The cell from the leaf node needs to be moved to the internal
008433    ** node to replace the deleted cell.  */
008434    if( !pPage->leaf ){
008435      MemPage *pLeaf = pCur->pPage;
008436      int nCell;
008437      Pgno n;
008438      unsigned char *pTmp;
008439  
008440      if( iCellDepth<pCur->iPage-1 ){
008441        n = pCur->apPage[iCellDepth+1]->pgno;
008442      }else{
008443        n = pCur->pPage->pgno;
008444      }
008445      pCell = findCell(pLeaf, pLeaf->nCell-1);
008446      if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
008447      nCell = pLeaf->xCellSize(pLeaf, pCell);
008448      assert( MX_CELL_SIZE(pBt) >= nCell );
008449      pTmp = pBt->pTmpSpace;
008450      assert( pTmp!=0 );
008451      rc = sqlite3PagerWrite(pLeaf->pDbPage);
008452      if( rc==SQLITE_OK ){
008453        insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
008454      }
008455      dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
008456      if( rc ) return rc;
008457    }
008458  
008459    /* Balance the tree. If the entry deleted was located on a leaf page,
008460    ** then the cursor still points to that page. In this case the first
008461    ** call to balance() repairs the tree, and the if(...) condition is
008462    ** never true.
008463    **
008464    ** Otherwise, if the entry deleted was on an internal node page, then
008465    ** pCur is pointing to the leaf page from which a cell was removed to
008466    ** replace the cell deleted from the internal node. This is slightly
008467    ** tricky as the leaf node may be underfull, and the internal node may
008468    ** be either under or overfull. In this case run the balancing algorithm
008469    ** on the leaf node first. If the balance proceeds far enough up the
008470    ** tree that we can be sure that any problem in the internal node has
008471    ** been corrected, so be it. Otherwise, after balancing the leaf node,
008472    ** walk the cursor up the tree to the internal node and balance it as 
008473    ** well.  */
008474    rc = balance(pCur);
008475    if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
008476      releasePageNotNull(pCur->pPage);
008477      pCur->iPage--;
008478      while( pCur->iPage>iCellDepth ){
008479        releasePage(pCur->apPage[pCur->iPage--]);
008480      }
008481      pCur->pPage = pCur->apPage[pCur->iPage];
008482      rc = balance(pCur);
008483    }
008484  
008485    if( rc==SQLITE_OK ){
008486      if( bSkipnext ){
008487        assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
008488        assert( pPage==pCur->pPage || CORRUPT_DB );
008489        assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
008490        pCur->eState = CURSOR_SKIPNEXT;
008491        if( iCellIdx>=pPage->nCell ){
008492          pCur->skipNext = -1;
008493          pCur->ix = pPage->nCell-1;
008494        }else{
008495          pCur->skipNext = 1;
008496        }
008497      }else{
008498        rc = moveToRoot(pCur);
008499        if( bPreserve ){
008500          btreeReleaseAllCursorPages(pCur);
008501          pCur->eState = CURSOR_REQUIRESEEK;
008502        }
008503        if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
008504      }
008505    }
008506    return rc;
008507  }
008508  
008509  /*
008510  ** Create a new BTree table.  Write into *piTable the page
008511  ** number for the root page of the new table.
008512  **
008513  ** The type of type is determined by the flags parameter.  Only the
008514  ** following values of flags are currently in use.  Other values for
008515  ** flags might not work:
008516  **
008517  **     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
008518  **     BTREE_ZERODATA                  Used for SQL indices
008519  */
008520  static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){
008521    BtShared *pBt = p->pBt;
008522    MemPage *pRoot;
008523    Pgno pgnoRoot;
008524    int rc;
008525    int ptfFlags;          /* Page-type flage for the root page of new table */
008526  
008527    assert( sqlite3BtreeHoldsMutex(p) );
008528    assert( pBt->inTransaction==TRANS_WRITE );
008529    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
008530  
008531  #ifdef SQLITE_OMIT_AUTOVACUUM
008532    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
008533    if( rc ){
008534      return rc;
008535    }
008536  #else
008537    if( pBt->autoVacuum ){
008538      Pgno pgnoMove;      /* Move a page here to make room for the root-page */
008539      MemPage *pPageMove; /* The page to move to. */
008540  
008541      /* Creating a new table may probably require moving an existing database
008542      ** to make room for the new tables root page. In case this page turns
008543      ** out to be an overflow page, delete all overflow page-map caches
008544      ** held by open cursors.
008545      */
008546      invalidateAllOverflowCache(pBt);
008547  
008548      /* Read the value of meta[3] from the database to determine where the
008549      ** root page of the new table should go. meta[3] is the largest root-page
008550      ** created so far, so the new root-page is (meta[3]+1).
008551      */
008552      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
008553      pgnoRoot++;
008554  
008555      /* The new root-page may not be allocated on a pointer-map page, or the
008556      ** PENDING_BYTE page.
008557      */
008558      while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
008559          pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
008560        pgnoRoot++;
008561      }
008562      assert( pgnoRoot>=3 || CORRUPT_DB );
008563      testcase( pgnoRoot<3 );
008564  
008565      /* Allocate a page. The page that currently resides at pgnoRoot will
008566      ** be moved to the allocated page (unless the allocated page happens
008567      ** to reside at pgnoRoot).
008568      */
008569      rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
008570      if( rc!=SQLITE_OK ){
008571        return rc;
008572      }
008573  
008574      if( pgnoMove!=pgnoRoot ){
008575        /* pgnoRoot is the page that will be used for the root-page of
008576        ** the new table (assuming an error did not occur). But we were
008577        ** allocated pgnoMove. If required (i.e. if it was not allocated
008578        ** by extending the file), the current page at position pgnoMove
008579        ** is already journaled.
008580        */
008581        u8 eType = 0;
008582        Pgno iPtrPage = 0;
008583  
008584        /* Save the positions of any open cursors. This is required in
008585        ** case they are holding a reference to an xFetch reference
008586        ** corresponding to page pgnoRoot.  */
008587        rc = saveAllCursors(pBt, 0, 0);
008588        releasePage(pPageMove);
008589        if( rc!=SQLITE_OK ){
008590          return rc;
008591        }
008592  
008593        /* Move the page currently at pgnoRoot to pgnoMove. */
008594        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
008595        if( rc!=SQLITE_OK ){
008596          return rc;
008597        }
008598        rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
008599        if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
008600          rc = SQLITE_CORRUPT_BKPT;
008601        }
008602        if( rc!=SQLITE_OK ){
008603          releasePage(pRoot);
008604          return rc;
008605        }
008606        assert( eType!=PTRMAP_ROOTPAGE );
008607        assert( eType!=PTRMAP_FREEPAGE );
008608        rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
008609        releasePage(pRoot);
008610  
008611        /* Obtain the page at pgnoRoot */
008612        if( rc!=SQLITE_OK ){
008613          return rc;
008614        }
008615        rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
008616        if( rc!=SQLITE_OK ){
008617          return rc;
008618        }
008619        rc = sqlite3PagerWrite(pRoot->pDbPage);
008620        if( rc!=SQLITE_OK ){
008621          releasePage(pRoot);
008622          return rc;
008623        }
008624      }else{
008625        pRoot = pPageMove;
008626      } 
008627  
008628      /* Update the pointer-map and meta-data with the new root-page number. */
008629      ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
008630      if( rc ){
008631        releasePage(pRoot);
008632        return rc;
008633      }
008634  
008635      /* When the new root page was allocated, page 1 was made writable in
008636      ** order either to increase the database filesize, or to decrement the
008637      ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
008638      */
008639      assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
008640      rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
008641      if( NEVER(rc) ){
008642        releasePage(pRoot);
008643        return rc;
008644      }
008645  
008646    }else{
008647      rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
008648      if( rc ) return rc;
008649    }
008650  #endif
008651    assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
008652    if( createTabFlags & BTREE_INTKEY ){
008653      ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
008654    }else{
008655      ptfFlags = PTF_ZERODATA | PTF_LEAF;
008656    }
008657    zeroPage(pRoot, ptfFlags);
008658    sqlite3PagerUnref(pRoot->pDbPage);
008659    assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
008660    *piTable = (int)pgnoRoot;
008661    return SQLITE_OK;
008662  }
008663  int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){
008664    int rc;
008665    sqlite3BtreeEnter(p);
008666    rc = btreeCreateTable(p, piTable, flags);
008667    sqlite3BtreeLeave(p);
008668    return rc;
008669  }
008670  
008671  /*
008672  ** Erase the given database page and all its children.  Return
008673  ** the page to the freelist.
008674  */
008675  static int clearDatabasePage(
008676    BtShared *pBt,           /* The BTree that contains the table */
008677    Pgno pgno,               /* Page number to clear */
008678    int freePageFlag,        /* Deallocate page if true */
008679    int *pnChange            /* Add number of Cells freed to this counter */
008680  ){
008681    MemPage *pPage;
008682    int rc;
008683    unsigned char *pCell;
008684    int i;
008685    int hdr;
008686    CellInfo info;
008687  
008688    assert( sqlite3_mutex_held(pBt->mutex) );
008689    if( pgno>btreePagecount(pBt) ){
008690      return SQLITE_CORRUPT_BKPT;
008691    }
008692    rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
008693    if( rc ) return rc;
008694    if( pPage->bBusy ){
008695      rc = SQLITE_CORRUPT_BKPT;
008696      goto cleardatabasepage_out;
008697    }
008698    pPage->bBusy = 1;
008699    hdr = pPage->hdrOffset;
008700    for(i=0; i<pPage->nCell; i++){
008701      pCell = findCell(pPage, i);
008702      if( !pPage->leaf ){
008703        rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
008704        if( rc ) goto cleardatabasepage_out;
008705      }
008706      rc = clearCell(pPage, pCell, &info);
008707      if( rc ) goto cleardatabasepage_out;
008708    }
008709    if( !pPage->leaf ){
008710      rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
008711      if( rc ) goto cleardatabasepage_out;
008712    }else if( pnChange ){
008713      assert( pPage->intKey || CORRUPT_DB );
008714      testcase( !pPage->intKey );
008715      *pnChange += pPage->nCell;
008716    }
008717    if( freePageFlag ){
008718      freePage(pPage, &rc);
008719    }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
008720      zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
008721    }
008722  
008723  cleardatabasepage_out:
008724    pPage->bBusy = 0;
008725    releasePage(pPage);
008726    return rc;
008727  }
008728  
008729  /*
008730  ** Delete all information from a single table in the database.  iTable is
008731  ** the page number of the root of the table.  After this routine returns,
008732  ** the root page is empty, but still exists.
008733  **
008734  ** This routine will fail with SQLITE_LOCKED if there are any open
008735  ** read cursors on the table.  Open write cursors are moved to the
008736  ** root of the table.
008737  **
008738  ** If pnChange is not NULL, then table iTable must be an intkey table. The
008739  ** integer value pointed to by pnChange is incremented by the number of
008740  ** entries in the table.
008741  */
008742  int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
008743    int rc;
008744    BtShared *pBt = p->pBt;
008745    sqlite3BtreeEnter(p);
008746    assert( p->inTrans==TRANS_WRITE );
008747  
008748    rc = saveAllCursors(pBt, (Pgno)iTable, 0);
008749  
008750    if( SQLITE_OK==rc ){
008751      /* Invalidate all incrblob cursors open on table iTable (assuming iTable
008752      ** is the root of a table b-tree - if it is not, the following call is
008753      ** a no-op).  */
008754      invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
008755      rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
008756    }
008757    sqlite3BtreeLeave(p);
008758    return rc;
008759  }
008760  
008761  /*
008762  ** Delete all information from the single table that pCur is open on.
008763  **
008764  ** This routine only work for pCur on an ephemeral table.
008765  */
008766  int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
008767    return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
008768  }
008769  
008770  /*
008771  ** Erase all information in a table and add the root of the table to
008772  ** the freelist.  Except, the root of the principle table (the one on
008773  ** page 1) is never added to the freelist.
008774  **
008775  ** This routine will fail with SQLITE_LOCKED if there are any open
008776  ** cursors on the table.
008777  **
008778  ** If AUTOVACUUM is enabled and the page at iTable is not the last
008779  ** root page in the database file, then the last root page 
008780  ** in the database file is moved into the slot formerly occupied by
008781  ** iTable and that last slot formerly occupied by the last root page
008782  ** is added to the freelist instead of iTable.  In this say, all
008783  ** root pages are kept at the beginning of the database file, which
008784  ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the 
008785  ** page number that used to be the last root page in the file before
008786  ** the move.  If no page gets moved, *piMoved is set to 0.
008787  ** The last root page is recorded in meta[3] and the value of
008788  ** meta[3] is updated by this procedure.
008789  */
008790  static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
008791    int rc;
008792    MemPage *pPage = 0;
008793    BtShared *pBt = p->pBt;
008794  
008795    assert( sqlite3BtreeHoldsMutex(p) );
008796    assert( p->inTrans==TRANS_WRITE );
008797    assert( iTable>=2 );
008798  
008799    rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
008800    if( rc ) return rc;
008801    rc = sqlite3BtreeClearTable(p, iTable, 0);
008802    if( rc ){
008803      releasePage(pPage);
008804      return rc;
008805    }
008806  
008807    *piMoved = 0;
008808  
008809  #ifdef SQLITE_OMIT_AUTOVACUUM
008810    freePage(pPage, &rc);
008811    releasePage(pPage);
008812  #else
008813    if( pBt->autoVacuum ){
008814      Pgno maxRootPgno;
008815      sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
008816  
008817      if( iTable==maxRootPgno ){
008818        /* If the table being dropped is the table with the largest root-page
008819        ** number in the database, put the root page on the free list. 
008820        */
008821        freePage(pPage, &rc);
008822        releasePage(pPage);
008823        if( rc!=SQLITE_OK ){
008824          return rc;
008825        }
008826      }else{
008827        /* The table being dropped does not have the largest root-page
008828        ** number in the database. So move the page that does into the 
008829        ** gap left by the deleted root-page.
008830        */
008831        MemPage *pMove;
008832        releasePage(pPage);
008833        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
008834        if( rc!=SQLITE_OK ){
008835          return rc;
008836        }
008837        rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
008838        releasePage(pMove);
008839        if( rc!=SQLITE_OK ){
008840          return rc;
008841        }
008842        pMove = 0;
008843        rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
008844        freePage(pMove, &rc);
008845        releasePage(pMove);
008846        if( rc!=SQLITE_OK ){
008847          return rc;
008848        }
008849        *piMoved = maxRootPgno;
008850      }
008851  
008852      /* Set the new 'max-root-page' value in the database header. This
008853      ** is the old value less one, less one more if that happens to
008854      ** be a root-page number, less one again if that is the
008855      ** PENDING_BYTE_PAGE.
008856      */
008857      maxRootPgno--;
008858      while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
008859             || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
008860        maxRootPgno--;
008861      }
008862      assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
008863  
008864      rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
008865    }else{
008866      freePage(pPage, &rc);
008867      releasePage(pPage);
008868    }
008869  #endif
008870    return rc;  
008871  }
008872  int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
008873    int rc;
008874    sqlite3BtreeEnter(p);
008875    rc = btreeDropTable(p, iTable, piMoved);
008876    sqlite3BtreeLeave(p);
008877    return rc;
008878  }
008879  
008880  
008881  /*
008882  ** This function may only be called if the b-tree connection already
008883  ** has a read or write transaction open on the database.
008884  **
008885  ** Read the meta-information out of a database file.  Meta[0]
008886  ** is the number of free pages currently in the database.  Meta[1]
008887  ** through meta[15] are available for use by higher layers.  Meta[0]
008888  ** is read-only, the others are read/write.
008889  ** 
008890  ** The schema layer numbers meta values differently.  At the schema
008891  ** layer (and the SetCookie and ReadCookie opcodes) the number of
008892  ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
008893  **
008894  ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
008895  ** of reading the value out of the header, it instead loads the "DataVersion"
008896  ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
008897  ** database file.  It is a number computed by the pager.  But its access
008898  ** pattern is the same as header meta values, and so it is convenient to
008899  ** read it from this routine.
008900  */
008901  void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
008902    BtShared *pBt = p->pBt;
008903  
008904    sqlite3BtreeEnter(p);
008905    assert( p->inTrans>TRANS_NONE );
008906    assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) );
008907    assert( pBt->pPage1 );
008908    assert( idx>=0 && idx<=15 );
008909  
008910    if( idx==BTREE_DATA_VERSION ){
008911      *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
008912    }else{
008913      *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
008914    }
008915  
008916    /* If auto-vacuum is disabled in this build and this is an auto-vacuum
008917    ** database, mark the database as read-only.  */
008918  #ifdef SQLITE_OMIT_AUTOVACUUM
008919    if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
008920      pBt->btsFlags |= BTS_READ_ONLY;
008921    }
008922  #endif
008923  
008924    sqlite3BtreeLeave(p);
008925  }
008926  
008927  /*
008928  ** Write meta-information back into the database.  Meta[0] is
008929  ** read-only and may not be written.
008930  */
008931  int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
008932    BtShared *pBt = p->pBt;
008933    unsigned char *pP1;
008934    int rc;
008935    assert( idx>=1 && idx<=15 );
008936    sqlite3BtreeEnter(p);
008937    assert( p->inTrans==TRANS_WRITE );
008938    assert( pBt->pPage1!=0 );
008939    pP1 = pBt->pPage1->aData;
008940    rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
008941    if( rc==SQLITE_OK ){
008942      put4byte(&pP1[36 + idx*4], iMeta);
008943  #ifndef SQLITE_OMIT_AUTOVACUUM
008944      if( idx==BTREE_INCR_VACUUM ){
008945        assert( pBt->autoVacuum || iMeta==0 );
008946        assert( iMeta==0 || iMeta==1 );
008947        pBt->incrVacuum = (u8)iMeta;
008948      }
008949  #endif
008950    }
008951    sqlite3BtreeLeave(p);
008952    return rc;
008953  }
008954  
008955  #ifndef SQLITE_OMIT_BTREECOUNT
008956  /*
008957  ** The first argument, pCur, is a cursor opened on some b-tree. Count the
008958  ** number of entries in the b-tree and write the result to *pnEntry.
008959  **
008960  ** SQLITE_OK is returned if the operation is successfully executed. 
008961  ** Otherwise, if an error is encountered (i.e. an IO error or database
008962  ** corruption) an SQLite error code is returned.
008963  */
008964  int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){
008965    i64 nEntry = 0;                      /* Value to return in *pnEntry */
008966    int rc;                              /* Return code */
008967  
008968    rc = moveToRoot(pCur);
008969    if( rc==SQLITE_EMPTY ){
008970      *pnEntry = 0;
008971      return SQLITE_OK;
008972    }
008973  
008974    /* Unless an error occurs, the following loop runs one iteration for each
008975    ** page in the B-Tree structure (not including overflow pages). 
008976    */
008977    while( rc==SQLITE_OK ){
008978      int iIdx;                          /* Index of child node in parent */
008979      MemPage *pPage;                    /* Current page of the b-tree */
008980  
008981      /* If this is a leaf page or the tree is not an int-key tree, then 
008982      ** this page contains countable entries. Increment the entry counter
008983      ** accordingly.
008984      */
008985      pPage = pCur->pPage;
008986      if( pPage->leaf || !pPage->intKey ){
008987        nEntry += pPage->nCell;
008988      }
008989  
008990      /* pPage is a leaf node. This loop navigates the cursor so that it 
008991      ** points to the first interior cell that it points to the parent of
008992      ** the next page in the tree that has not yet been visited. The
008993      ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
008994      ** of the page, or to the number of cells in the page if the next page
008995      ** to visit is the right-child of its parent.
008996      **
008997      ** If all pages in the tree have been visited, return SQLITE_OK to the
008998      ** caller.
008999      */
009000      if( pPage->leaf ){
009001        do {
009002          if( pCur->iPage==0 ){
009003            /* All pages of the b-tree have been visited. Return successfully. */
009004            *pnEntry = nEntry;
009005            return moveToRoot(pCur);
009006          }
009007          moveToParent(pCur);
009008        }while ( pCur->ix>=pCur->pPage->nCell );
009009  
009010        pCur->ix++;
009011        pPage = pCur->pPage;
009012      }
009013  
009014      /* Descend to the child node of the cell that the cursor currently 
009015      ** points at. This is the right-child if (iIdx==pPage->nCell).
009016      */
009017      iIdx = pCur->ix;
009018      if( iIdx==pPage->nCell ){
009019        rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
009020      }else{
009021        rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
009022      }
009023    }
009024  
009025    /* An error has occurred. Return an error code. */
009026    return rc;
009027  }
009028  #endif
009029  
009030  /*
009031  ** Return the pager associated with a BTree.  This routine is used for
009032  ** testing and debugging only.
009033  */
009034  Pager *sqlite3BtreePager(Btree *p){
009035    return p->pBt->pPager;
009036  }
009037  
009038  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
009039  /*
009040  ** Append a message to the error message string.
009041  */
009042  static void checkAppendMsg(
009043    IntegrityCk *pCheck,
009044    const char *zFormat,
009045    ...
009046  ){
009047    va_list ap;
009048    if( !pCheck->mxErr ) return;
009049    pCheck->mxErr--;
009050    pCheck->nErr++;
009051    va_start(ap, zFormat);
009052    if( pCheck->errMsg.nChar ){
009053      sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1);
009054    }
009055    if( pCheck->zPfx ){
009056      sqlite3XPrintf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
009057    }
009058    sqlite3VXPrintf(&pCheck->errMsg, zFormat, ap);
009059    va_end(ap);
009060    if( pCheck->errMsg.accError==STRACCUM_NOMEM ){
009061      pCheck->mallocFailed = 1;
009062    }
009063  }
009064  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009065  
009066  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
009067  
009068  /*
009069  ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
009070  ** corresponds to page iPg is already set.
009071  */
009072  static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
009073    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
009074    return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
009075  }
009076  
009077  /*
009078  ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
009079  */
009080  static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
009081    assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
009082    pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
009083  }
009084  
009085  
009086  /*
009087  ** Add 1 to the reference count for page iPage.  If this is the second
009088  ** reference to the page, add an error message to pCheck->zErrMsg.
009089  ** Return 1 if there are 2 or more references to the page and 0 if
009090  ** if this is the first reference to the page.
009091  **
009092  ** Also check that the page number is in bounds.
009093  */
009094  static int checkRef(IntegrityCk *pCheck, Pgno iPage){
009095    if( iPage==0 ) return 1;
009096    if( iPage>pCheck->nPage ){
009097      checkAppendMsg(pCheck, "invalid page number %d", iPage);
009098      return 1;
009099    }
009100    if( getPageReferenced(pCheck, iPage) ){
009101      checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
009102      return 1;
009103    }
009104    setPageReferenced(pCheck, iPage);
009105    return 0;
009106  }
009107  
009108  #ifndef SQLITE_OMIT_AUTOVACUUM
009109  /*
009110  ** Check that the entry in the pointer-map for page iChild maps to 
009111  ** page iParent, pointer type ptrType. If not, append an error message
009112  ** to pCheck.
009113  */
009114  static void checkPtrmap(
009115    IntegrityCk *pCheck,   /* Integrity check context */
009116    Pgno iChild,           /* Child page number */
009117    u8 eType,              /* Expected pointer map type */
009118    Pgno iParent           /* Expected pointer map parent page number */
009119  ){
009120    int rc;
009121    u8 ePtrmapType;
009122    Pgno iPtrmapParent;
009123  
009124    rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
009125    if( rc!=SQLITE_OK ){
009126      if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1;
009127      checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
009128      return;
009129    }
009130  
009131    if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
009132      checkAppendMsg(pCheck,
009133        "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 
009134        iChild, eType, iParent, ePtrmapType, iPtrmapParent);
009135    }
009136  }
009137  #endif
009138  
009139  /*
009140  ** Check the integrity of the freelist or of an overflow page list.
009141  ** Verify that the number of pages on the list is N.
009142  */
009143  static void checkList(
009144    IntegrityCk *pCheck,  /* Integrity checking context */
009145    int isFreeList,       /* True for a freelist.  False for overflow page list */
009146    int iPage,            /* Page number for first page in the list */
009147    int N                 /* Expected number of pages in the list */
009148  ){
009149    int i;
009150    int expected = N;
009151    int iFirst = iPage;
009152    while( N-- > 0 && pCheck->mxErr ){
009153      DbPage *pOvflPage;
009154      unsigned char *pOvflData;
009155      if( iPage<1 ){
009156        checkAppendMsg(pCheck,
009157           "%d of %d pages missing from overflow list starting at %d",
009158            N+1, expected, iFirst);
009159        break;
009160      }
009161      if( checkRef(pCheck, iPage) ) break;
009162      if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
009163        checkAppendMsg(pCheck, "failed to get page %d", iPage);
009164        break;
009165      }
009166      pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
009167      if( isFreeList ){
009168        int n = get4byte(&pOvflData[4]);
009169  #ifndef SQLITE_OMIT_AUTOVACUUM
009170        if( pCheck->pBt->autoVacuum ){
009171          checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
009172        }
009173  #endif
009174        if( n>(int)pCheck->pBt->usableSize/4-2 ){
009175          checkAppendMsg(pCheck,
009176             "freelist leaf count too big on page %d", iPage);
009177          N--;
009178        }else{
009179          for(i=0; i<n; i++){
009180            Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
009181  #ifndef SQLITE_OMIT_AUTOVACUUM
009182            if( pCheck->pBt->autoVacuum ){
009183              checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
009184            }
009185  #endif
009186            checkRef(pCheck, iFreePage);
009187          }
009188          N -= n;
009189        }
009190      }
009191  #ifndef SQLITE_OMIT_AUTOVACUUM
009192      else{
009193        /* If this database supports auto-vacuum and iPage is not the last
009194        ** page in this overflow list, check that the pointer-map entry for
009195        ** the following page matches iPage.
009196        */
009197        if( pCheck->pBt->autoVacuum && N>0 ){
009198          i = get4byte(pOvflData);
009199          checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
009200        }
009201      }
009202  #endif
009203      iPage = get4byte(pOvflData);
009204      sqlite3PagerUnref(pOvflPage);
009205  
009206      if( isFreeList && N<(iPage!=0) ){
009207        checkAppendMsg(pCheck, "free-page count in header is too small");
009208      }
009209    }
009210  }
009211  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009212  
009213  /*
009214  ** An implementation of a min-heap.
009215  **
009216  ** aHeap[0] is the number of elements on the heap.  aHeap[1] is the
009217  ** root element.  The daughter nodes of aHeap[N] are aHeap[N*2]
009218  ** and aHeap[N*2+1].
009219  **
009220  ** The heap property is this:  Every node is less than or equal to both
009221  ** of its daughter nodes.  A consequence of the heap property is that the
009222  ** root node aHeap[1] is always the minimum value currently in the heap.
009223  **
009224  ** The btreeHeapInsert() routine inserts an unsigned 32-bit number onto
009225  ** the heap, preserving the heap property.  The btreeHeapPull() routine
009226  ** removes the root element from the heap (the minimum value in the heap)
009227  ** and then moves other nodes around as necessary to preserve the heap
009228  ** property.
009229  **
009230  ** This heap is used for cell overlap and coverage testing.  Each u32
009231  ** entry represents the span of a cell or freeblock on a btree page.  
009232  ** The upper 16 bits are the index of the first byte of a range and the
009233  ** lower 16 bits are the index of the last byte of that range.
009234  */
009235  static void btreeHeapInsert(u32 *aHeap, u32 x){
009236    u32 j, i = ++aHeap[0];
009237    aHeap[i] = x;
009238    while( (j = i/2)>0 && aHeap[j]>aHeap[i] ){
009239      x = aHeap[j];
009240      aHeap[j] = aHeap[i];
009241      aHeap[i] = x;
009242      i = j;
009243    }
009244  }
009245  static int btreeHeapPull(u32 *aHeap, u32 *pOut){
009246    u32 j, i, x;
009247    if( (x = aHeap[0])==0 ) return 0;
009248    *pOut = aHeap[1];
009249    aHeap[1] = aHeap[x];
009250    aHeap[x] = 0xffffffff;
009251    aHeap[0]--;
009252    i = 1;
009253    while( (j = i*2)<=aHeap[0] ){
009254      if( aHeap[j]>aHeap[j+1] ) j++;
009255      if( aHeap[i]<aHeap[j] ) break;
009256      x = aHeap[i];
009257      aHeap[i] = aHeap[j];
009258      aHeap[j] = x;
009259      i = j;
009260    }
009261    return 1;  
009262  }
009263  
009264  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
009265  /*
009266  ** Do various sanity checks on a single page of a tree.  Return
009267  ** the tree depth.  Root pages return 0.  Parents of root pages
009268  ** return 1, and so forth.
009269  ** 
009270  ** These checks are done:
009271  **
009272  **      1.  Make sure that cells and freeblocks do not overlap
009273  **          but combine to completely cover the page.
009274  **      2.  Make sure integer cell keys are in order.
009275  **      3.  Check the integrity of overflow pages.
009276  **      4.  Recursively call checkTreePage on all children.
009277  **      5.  Verify that the depth of all children is the same.
009278  */
009279  static int checkTreePage(
009280    IntegrityCk *pCheck,  /* Context for the sanity check */
009281    int iPage,            /* Page number of the page to check */
009282    i64 *piMinKey,        /* Write minimum integer primary key here */
009283    i64 maxKey            /* Error if integer primary key greater than this */
009284  ){
009285    MemPage *pPage = 0;      /* The page being analyzed */
009286    int i;                   /* Loop counter */
009287    int rc;                  /* Result code from subroutine call */
009288    int depth = -1, d2;      /* Depth of a subtree */
009289    int pgno;                /* Page number */
009290    int nFrag;               /* Number of fragmented bytes on the page */
009291    int hdr;                 /* Offset to the page header */
009292    int cellStart;           /* Offset to the start of the cell pointer array */
009293    int nCell;               /* Number of cells */
009294    int doCoverageCheck = 1; /* True if cell coverage checking should be done */
009295    int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
009296                             ** False if IPK must be strictly less than maxKey */
009297    u8 *data;                /* Page content */
009298    u8 *pCell;               /* Cell content */
009299    u8 *pCellIdx;            /* Next element of the cell pointer array */
009300    BtShared *pBt;           /* The BtShared object that owns pPage */
009301    u32 pc;                  /* Address of a cell */
009302    u32 usableSize;          /* Usable size of the page */
009303    u32 contentOffset;       /* Offset to the start of the cell content area */
009304    u32 *heap = 0;           /* Min-heap used for checking cell coverage */
009305    u32 x, prev = 0;         /* Next and previous entry on the min-heap */
009306    const char *saved_zPfx = pCheck->zPfx;
009307    int saved_v1 = pCheck->v1;
009308    int saved_v2 = pCheck->v2;
009309    u8 savedIsInit = 0;
009310  
009311    /* Check that the page exists
009312    */
009313    pBt = pCheck->pBt;
009314    usableSize = pBt->usableSize;
009315    if( iPage==0 ) return 0;
009316    if( checkRef(pCheck, iPage) ) return 0;
009317    pCheck->zPfx = "Page %d: ";
009318    pCheck->v1 = iPage;
009319    if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
009320      checkAppendMsg(pCheck,
009321         "unable to get the page. error code=%d", rc);
009322      goto end_of_check;
009323    }
009324  
009325    /* Clear MemPage.isInit to make sure the corruption detection code in
009326    ** btreeInitPage() is executed.  */
009327    savedIsInit = pPage->isInit;
009328    pPage->isInit = 0;
009329    if( (rc = btreeInitPage(pPage))!=0 ){
009330      assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
009331      checkAppendMsg(pCheck,
009332                     "btreeInitPage() returns error code %d", rc);
009333      goto end_of_check;
009334    }
009335    data = pPage->aData;
009336    hdr = pPage->hdrOffset;
009337  
009338    /* Set up for cell analysis */
009339    pCheck->zPfx = "On tree page %d cell %d: ";
009340    contentOffset = get2byteNotZero(&data[hdr+5]);
009341    assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
009342  
009343    /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
009344    ** number of cells on the page. */
009345    nCell = get2byte(&data[hdr+3]);
009346    assert( pPage->nCell==nCell );
009347  
009348    /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
009349    ** immediately follows the b-tree page header. */
009350    cellStart = hdr + 12 - 4*pPage->leaf;
009351    assert( pPage->aCellIdx==&data[cellStart] );
009352    pCellIdx = &data[cellStart + 2*(nCell-1)];
009353  
009354    if( !pPage->leaf ){
009355      /* Analyze the right-child page of internal pages */
009356      pgno = get4byte(&data[hdr+8]);
009357  #ifndef SQLITE_OMIT_AUTOVACUUM
009358      if( pBt->autoVacuum ){
009359        pCheck->zPfx = "On page %d at right child: ";
009360        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
009361      }
009362  #endif
009363      depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
009364      keyCanBeEqual = 0;
009365    }else{
009366      /* For leaf pages, the coverage check will occur in the same loop
009367      ** as the other cell checks, so initialize the heap.  */
009368      heap = pCheck->heap;
009369      heap[0] = 0;
009370    }
009371  
009372    /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
009373    ** integer offsets to the cell contents. */
009374    for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
009375      CellInfo info;
009376  
009377      /* Check cell size */
009378      pCheck->v2 = i;
009379      assert( pCellIdx==&data[cellStart + i*2] );
009380      pc = get2byteAligned(pCellIdx);
009381      pCellIdx -= 2;
009382      if( pc<contentOffset || pc>usableSize-4 ){
009383        checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
009384                               pc, contentOffset, usableSize-4);
009385        doCoverageCheck = 0;
009386        continue;
009387      }
009388      pCell = &data[pc];
009389      pPage->xParseCell(pPage, pCell, &info);
009390      if( pc+info.nSize>usableSize ){
009391        checkAppendMsg(pCheck, "Extends off end of page");
009392        doCoverageCheck = 0;
009393        continue;
009394      }
009395  
009396      /* Check for integer primary key out of range */
009397      if( pPage->intKey ){
009398        if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
009399          checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
009400        }
009401        maxKey = info.nKey;
009402        keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
009403      }
009404  
009405      /* Check the content overflow list */
009406      if( info.nPayload>info.nLocal ){
009407        int nPage;       /* Number of pages on the overflow chain */
009408        Pgno pgnoOvfl;   /* First page of the overflow chain */
009409        assert( pc + info.nSize - 4 <= usableSize );
009410        nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
009411        pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
009412  #ifndef SQLITE_OMIT_AUTOVACUUM
009413        if( pBt->autoVacuum ){
009414          checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
009415        }
009416  #endif
009417        checkList(pCheck, 0, pgnoOvfl, nPage);
009418      }
009419  
009420      if( !pPage->leaf ){
009421        /* Check sanity of left child page for internal pages */
009422        pgno = get4byte(pCell);
009423  #ifndef SQLITE_OMIT_AUTOVACUUM
009424        if( pBt->autoVacuum ){
009425          checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
009426        }
009427  #endif
009428        d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
009429        keyCanBeEqual = 0;
009430        if( d2!=depth ){
009431          checkAppendMsg(pCheck, "Child page depth differs");
009432          depth = d2;
009433        }
009434      }else{
009435        /* Populate the coverage-checking heap for leaf pages */
009436        btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
009437      }
009438    }
009439    *piMinKey = maxKey;
009440  
009441    /* Check for complete coverage of the page
009442    */
009443    pCheck->zPfx = 0;
009444    if( doCoverageCheck && pCheck->mxErr>0 ){
009445      /* For leaf pages, the min-heap has already been initialized and the
009446      ** cells have already been inserted.  But for internal pages, that has
009447      ** not yet been done, so do it now */
009448      if( !pPage->leaf ){
009449        heap = pCheck->heap;
009450        heap[0] = 0;
009451        for(i=nCell-1; i>=0; i--){
009452          u32 size;
009453          pc = get2byteAligned(&data[cellStart+i*2]);
009454          size = pPage->xCellSize(pPage, &data[pc]);
009455          btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
009456        }
009457      }
009458      /* Add the freeblocks to the min-heap
009459      **
009460      ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
009461      ** is the offset of the first freeblock, or zero if there are no
009462      ** freeblocks on the page. 
009463      */
009464      i = get2byte(&data[hdr+1]);
009465      while( i>0 ){
009466        int size, j;
009467        assert( (u32)i<=usableSize-4 );     /* Enforced by btreeInitPage() */
009468        size = get2byte(&data[i+2]);
009469        assert( (u32)(i+size)<=usableSize );  /* Enforced by btreeInitPage() */
009470        btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
009471        /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
009472        ** big-endian integer which is the offset in the b-tree page of the next
009473        ** freeblock in the chain, or zero if the freeblock is the last on the
009474        ** chain. */
009475        j = get2byte(&data[i]);
009476        /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
009477        ** increasing offset. */
009478        assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
009479        assert( (u32)j<=usableSize-4 );   /* Enforced by btreeInitPage() */
009480        i = j;
009481      }
009482      /* Analyze the min-heap looking for overlap between cells and/or 
009483      ** freeblocks, and counting the number of untracked bytes in nFrag.
009484      ** 
009485      ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
009486      ** There is an implied first entry the covers the page header, the cell
009487      ** pointer index, and the gap between the cell pointer index and the start
009488      ** of cell content.  
009489      **
009490      ** The loop below pulls entries from the min-heap in order and compares
009491      ** the start_address against the previous end_address.  If there is an
009492      ** overlap, that means bytes are used multiple times.  If there is a gap,
009493      ** that gap is added to the fragmentation count.
009494      */
009495      nFrag = 0;
009496      prev = contentOffset - 1;   /* Implied first min-heap entry */
009497      while( btreeHeapPull(heap,&x) ){
009498        if( (prev&0xffff)>=(x>>16) ){
009499          checkAppendMsg(pCheck,
009500            "Multiple uses for byte %u of page %d", x>>16, iPage);
009501          break;
009502        }else{
009503          nFrag += (x>>16) - (prev&0xffff) - 1;
009504          prev = x;
009505        }
009506      }
009507      nFrag += usableSize - (prev&0xffff) - 1;
009508      /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
009509      ** is stored in the fifth field of the b-tree page header.
009510      ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
009511      ** number of fragmented free bytes within the cell content area.
009512      */
009513      if( heap[0]==0 && nFrag!=data[hdr+7] ){
009514        checkAppendMsg(pCheck,
009515            "Fragmentation of %d bytes reported as %d on page %d",
009516            nFrag, data[hdr+7], iPage);
009517      }
009518    }
009519  
009520  end_of_check:
009521    if( !doCoverageCheck ) pPage->isInit = savedIsInit;
009522    releasePage(pPage);
009523    pCheck->zPfx = saved_zPfx;
009524    pCheck->v1 = saved_v1;
009525    pCheck->v2 = saved_v2;
009526    return depth+1;
009527  }
009528  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009529  
009530  #ifndef SQLITE_OMIT_INTEGRITY_CHECK
009531  /*
009532  ** This routine does a complete check of the given BTree file.  aRoot[] is
009533  ** an array of pages numbers were each page number is the root page of
009534  ** a table.  nRoot is the number of entries in aRoot.
009535  **
009536  ** A read-only or read-write transaction must be opened before calling
009537  ** this function.
009538  **
009539  ** Write the number of error seen in *pnErr.  Except for some memory
009540  ** allocation errors,  an error message held in memory obtained from
009541  ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
009542  ** returned.  If a memory allocation error occurs, NULL is returned.
009543  */
009544  char *sqlite3BtreeIntegrityCheck(
009545    Btree *p,     /* The btree to be checked */
009546    int *aRoot,   /* An array of root pages numbers for individual trees */
009547    int nRoot,    /* Number of entries in aRoot[] */
009548    int mxErr,    /* Stop reporting errors after this many */
009549    int *pnErr    /* Write number of errors seen to this variable */
009550  ){
009551    Pgno i;
009552    IntegrityCk sCheck;
009553    BtShared *pBt = p->pBt;
009554    int savedDbFlags = pBt->db->flags;
009555    char zErr[100];
009556    VVA_ONLY( int nRef );
009557  
009558    sqlite3BtreeEnter(p);
009559    assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
009560    VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
009561    assert( nRef>=0 );
009562    sCheck.pBt = pBt;
009563    sCheck.pPager = pBt->pPager;
009564    sCheck.nPage = btreePagecount(sCheck.pBt);
009565    sCheck.mxErr = mxErr;
009566    sCheck.nErr = 0;
009567    sCheck.mallocFailed = 0;
009568    sCheck.zPfx = 0;
009569    sCheck.v1 = 0;
009570    sCheck.v2 = 0;
009571    sCheck.aPgRef = 0;
009572    sCheck.heap = 0;
009573    sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
009574    sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
009575    if( sCheck.nPage==0 ){
009576      goto integrity_ck_cleanup;
009577    }
009578  
009579    sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
009580    if( !sCheck.aPgRef ){
009581      sCheck.mallocFailed = 1;
009582      goto integrity_ck_cleanup;
009583    }
009584    sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
009585    if( sCheck.heap==0 ){
009586      sCheck.mallocFailed = 1;
009587      goto integrity_ck_cleanup;
009588    }
009589  
009590    i = PENDING_BYTE_PAGE(pBt);
009591    if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
009592  
009593    /* Check the integrity of the freelist
009594    */
009595    sCheck.zPfx = "Main freelist: ";
009596    checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
009597              get4byte(&pBt->pPage1->aData[36]));
009598    sCheck.zPfx = 0;
009599  
009600    /* Check all the tables.
009601    */
009602    testcase( pBt->db->flags & SQLITE_CellSizeCk );
009603    pBt->db->flags &= ~SQLITE_CellSizeCk;
009604    for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
009605      i64 notUsed;
009606      if( aRoot[i]==0 ) continue;
009607  #ifndef SQLITE_OMIT_AUTOVACUUM
009608      if( pBt->autoVacuum && aRoot[i]>1 ){
009609        checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
009610      }
009611  #endif
009612      checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
009613    }
009614    pBt->db->flags = savedDbFlags;
009615  
009616    /* Make sure every page in the file is referenced
009617    */
009618    for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
009619  #ifdef SQLITE_OMIT_AUTOVACUUM
009620      if( getPageReferenced(&sCheck, i)==0 ){
009621        checkAppendMsg(&sCheck, "Page %d is never used", i);
009622      }
009623  #else
009624      /* If the database supports auto-vacuum, make sure no tables contain
009625      ** references to pointer-map pages.
009626      */
009627      if( getPageReferenced(&sCheck, i)==0 && 
009628         (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
009629        checkAppendMsg(&sCheck, "Page %d is never used", i);
009630      }
009631      if( getPageReferenced(&sCheck, i)!=0 && 
009632         (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
009633        checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
009634      }
009635  #endif
009636    }
009637  
009638    /* Clean  up and report errors.
009639    */
009640  integrity_ck_cleanup:
009641    sqlite3PageFree(sCheck.heap);
009642    sqlite3_free(sCheck.aPgRef);
009643    if( sCheck.mallocFailed ){
009644      sqlite3StrAccumReset(&sCheck.errMsg);
009645      sCheck.nErr++;
009646    }
009647    *pnErr = sCheck.nErr;
009648    if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg);
009649    /* Make sure this analysis did not leave any unref() pages. */
009650    assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
009651    sqlite3BtreeLeave(p);
009652    return sqlite3StrAccumFinish(&sCheck.errMsg);
009653  }
009654  #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
009655  
009656  /*
009657  ** Return the full pathname of the underlying database file.  Return
009658  ** an empty string if the database is in-memory or a TEMP database.
009659  **
009660  ** The pager filename is invariant as long as the pager is
009661  ** open so it is safe to access without the BtShared mutex.
009662  */
009663  const char *sqlite3BtreeGetFilename(Btree *p){
009664    assert( p->pBt->pPager!=0 );
009665    return sqlite3PagerFilename(p->pBt->pPager, 1);
009666  }
009667  
009668  /*
009669  ** Return the pathname of the journal file for this database. The return
009670  ** value of this routine is the same regardless of whether the journal file
009671  ** has been created or not.
009672  **
009673  ** The pager journal filename is invariant as long as the pager is
009674  ** open so it is safe to access without the BtShared mutex.
009675  */
009676  const char *sqlite3BtreeGetJournalname(Btree *p){
009677    assert( p->pBt->pPager!=0 );
009678    return sqlite3PagerJournalname(p->pBt->pPager);
009679  }
009680  
009681  /*
009682  ** Return non-zero if a transaction is active.
009683  */
009684  int sqlite3BtreeIsInTrans(Btree *p){
009685    assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
009686    return (p && (p->inTrans==TRANS_WRITE));
009687  }
009688  
009689  #ifndef SQLITE_OMIT_WAL
009690  /*
009691  ** Run a checkpoint on the Btree passed as the first argument.
009692  **
009693  ** Return SQLITE_LOCKED if this or any other connection has an open 
009694  ** transaction on the shared-cache the argument Btree is connected to.
009695  **
009696  ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
009697  */
009698  int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
009699    int rc = SQLITE_OK;
009700    if( p ){
009701      BtShared *pBt = p->pBt;
009702      sqlite3BtreeEnter(p);
009703      if( pBt->inTransaction!=TRANS_NONE ){
009704        rc = SQLITE_LOCKED;
009705      }else{
009706        rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
009707      }
009708      sqlite3BtreeLeave(p);
009709    }
009710    return rc;
009711  }
009712  #endif
009713  
009714  /*
009715  ** Return non-zero if a read (or write) transaction is active.
009716  */
009717  int sqlite3BtreeIsInReadTrans(Btree *p){
009718    assert( p );
009719    assert( sqlite3_mutex_held(p->db->mutex) );
009720    return p->inTrans!=TRANS_NONE;
009721  }
009722  
009723  int sqlite3BtreeIsInBackup(Btree *p){
009724    assert( p );
009725    assert( sqlite3_mutex_held(p->db->mutex) );
009726    return p->nBackup!=0;
009727  }
009728  
009729  /*
009730  ** This function returns a pointer to a blob of memory associated with
009731  ** a single shared-btree. The memory is used by client code for its own
009732  ** purposes (for example, to store a high-level schema associated with 
009733  ** the shared-btree). The btree layer manages reference counting issues.
009734  **
009735  ** The first time this is called on a shared-btree, nBytes bytes of memory
009736  ** are allocated, zeroed, and returned to the caller. For each subsequent 
009737  ** call the nBytes parameter is ignored and a pointer to the same blob
009738  ** of memory returned. 
009739  **
009740  ** If the nBytes parameter is 0 and the blob of memory has not yet been
009741  ** allocated, a null pointer is returned. If the blob has already been
009742  ** allocated, it is returned as normal.
009743  **
009744  ** Just before the shared-btree is closed, the function passed as the 
009745  ** xFree argument when the memory allocation was made is invoked on the 
009746  ** blob of allocated memory. The xFree function should not call sqlite3_free()
009747  ** on the memory, the btree layer does that.
009748  */
009749  void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
009750    BtShared *pBt = p->pBt;
009751    sqlite3BtreeEnter(p);
009752    if( !pBt->pSchema && nBytes ){
009753      pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
009754      pBt->xFreeSchema = xFree;
009755    }
009756    sqlite3BtreeLeave(p);
009757    return pBt->pSchema;
009758  }
009759  
009760  /*
009761  ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 
009762  ** btree as the argument handle holds an exclusive lock on the 
009763  ** sqlite_master table. Otherwise SQLITE_OK.
009764  */
009765  int sqlite3BtreeSchemaLocked(Btree *p){
009766    int rc;
009767    assert( sqlite3_mutex_held(p->db->mutex) );
009768    sqlite3BtreeEnter(p);
009769    rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
009770    assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
009771    sqlite3BtreeLeave(p);
009772    return rc;
009773  }
009774  
009775  
009776  #ifndef SQLITE_OMIT_SHARED_CACHE
009777  /*
009778  ** Obtain a lock on the table whose root page is iTab.  The
009779  ** lock is a write lock if isWritelock is true or a read lock
009780  ** if it is false.
009781  */
009782  int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
009783    int rc = SQLITE_OK;
009784    assert( p->inTrans!=TRANS_NONE );
009785    if( p->sharable ){
009786      u8 lockType = READ_LOCK + isWriteLock;
009787      assert( READ_LOCK+1==WRITE_LOCK );
009788      assert( isWriteLock==0 || isWriteLock==1 );
009789  
009790      sqlite3BtreeEnter(p);
009791      rc = querySharedCacheTableLock(p, iTab, lockType);
009792      if( rc==SQLITE_OK ){
009793        rc = setSharedCacheTableLock(p, iTab, lockType);
009794      }
009795      sqlite3BtreeLeave(p);
009796    }
009797    return rc;
009798  }
009799  #endif
009800  
009801  #ifndef SQLITE_OMIT_INCRBLOB
009802  /*
009803  ** Argument pCsr must be a cursor opened for writing on an 
009804  ** INTKEY table currently pointing at a valid table entry. 
009805  ** This function modifies the data stored as part of that entry.
009806  **
009807  ** Only the data content may only be modified, it is not possible to 
009808  ** change the length of the data stored. If this function is called with
009809  ** parameters that attempt to write past the end of the existing data,
009810  ** no modifications are made and SQLITE_CORRUPT is returned.
009811  */
009812  int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
009813    int rc;
009814    assert( cursorOwnsBtShared(pCsr) );
009815    assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
009816    assert( pCsr->curFlags & BTCF_Incrblob );
009817  
009818    rc = restoreCursorPosition(pCsr);
009819    if( rc!=SQLITE_OK ){
009820      return rc;
009821    }
009822    assert( pCsr->eState!=CURSOR_REQUIRESEEK );
009823    if( pCsr->eState!=CURSOR_VALID ){
009824      return SQLITE_ABORT;
009825    }
009826  
009827    /* Save the positions of all other cursors open on this table. This is
009828    ** required in case any of them are holding references to an xFetch
009829    ** version of the b-tree page modified by the accessPayload call below.
009830    **
009831    ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
009832    ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
009833    ** saveAllCursors can only return SQLITE_OK.
009834    */
009835    VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
009836    assert( rc==SQLITE_OK );
009837  
009838    /* Check some assumptions: 
009839    **   (a) the cursor is open for writing,
009840    **   (b) there is a read/write transaction open,
009841    **   (c) the connection holds a write-lock on the table (if required),
009842    **   (d) there are no conflicting read-locks, and
009843    **   (e) the cursor points at a valid row of an intKey table.
009844    */
009845    if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
009846      return SQLITE_READONLY;
009847    }
009848    assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
009849                && pCsr->pBt->inTransaction==TRANS_WRITE );
009850    assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
009851    assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
009852    assert( pCsr->pPage->intKey );
009853  
009854    return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
009855  }
009856  
009857  /* 
009858  ** Mark this cursor as an incremental blob cursor.
009859  */
009860  void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
009861    pCur->curFlags |= BTCF_Incrblob;
009862    pCur->pBtree->hasIncrblobCur = 1;
009863  }
009864  #endif
009865  
009866  /*
009867  ** Set both the "read version" (single byte at byte offset 18) and 
009868  ** "write version" (single byte at byte offset 19) fields in the database
009869  ** header to iVersion.
009870  */
009871  int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
009872    BtShared *pBt = pBtree->pBt;
009873    int rc;                         /* Return code */
009874   
009875    assert( iVersion==1 || iVersion==2 );
009876  
009877    /* If setting the version fields to 1, do not automatically open the
009878    ** WAL connection, even if the version fields are currently set to 2.
009879    */
009880    pBt->btsFlags &= ~BTS_NO_WAL;
009881    if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
009882  
009883    rc = sqlite3BtreeBeginTrans(pBtree, 0);
009884    if( rc==SQLITE_OK ){
009885      u8 *aData = pBt->pPage1->aData;
009886      if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
009887        rc = sqlite3BtreeBeginTrans(pBtree, 2);
009888        if( rc==SQLITE_OK ){
009889          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
009890          if( rc==SQLITE_OK ){
009891            aData[18] = (u8)iVersion;
009892            aData[19] = (u8)iVersion;
009893          }
009894        }
009895      }
009896    }
009897  
009898    pBt->btsFlags &= ~BTS_NO_WAL;
009899    return rc;
009900  }
009901  
009902  /*
009903  ** Return true if the cursor has a hint specified.  This routine is
009904  ** only used from within assert() statements
009905  */
009906  int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
009907    return (pCsr->hints & mask)!=0;
009908  }
009909  
009910  /*
009911  ** Return true if the given Btree is read-only.
009912  */
009913  int sqlite3BtreeIsReadonly(Btree *p){
009914    return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
009915  }
009916  
009917  /*
009918  ** Return the size of the header added to each page by this module.
009919  */
009920  int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
009921  
009922  #if !defined(SQLITE_OMIT_SHARED_CACHE)
009923  /*
009924  ** Return true if the Btree passed as the only argument is sharable.
009925  */
009926  int sqlite3BtreeSharable(Btree *p){
009927    return p->sharable;
009928  }
009929  
009930  /*
009931  ** Return the number of connections to the BtShared object accessed by
009932  ** the Btree handle passed as the only argument. For private caches 
009933  ** this is always 1. For shared caches it may be 1 or greater.
009934  */
009935  int sqlite3BtreeConnectionCount(Btree *p){
009936    testcase( p->sharable );
009937    return p->pBt->nRef;
009938  }
009939  #endif