/ Artifact [ede8348a]
Login

Artifact ede8348a7d623257ee6c06ca4796ceaee13b8657:


     1  /*
     2  ** 2004 April 6
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** This file implements an external (disk-based) database using BTrees.
    13  ** See the header comment on "btreeInt.h" for additional information.
    14  ** Including a description of file format and an overview of operation.
    15  */
    16  #include "btreeInt.h"
    17  
/*
** The header string that appears at the beginning of every
** SQLite database.  The text itself is supplied by the
** SQLITE_FILE_HEADER macro; this file keeps it in a file-scope,
** read-only character array.
*/
static const char zMagicHeader[] = SQLITE_FILE_HEADER;
    23  
    24  /*
    25  ** Set this global variable to 1 to enable tracing using the TRACE
    26  ** macro.
    27  */
    28  #if 0
    29  int sqlite3BtreeTrace=1;  /* True to enable tracing */
    30  # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
    31  #else
    32  # define TRACE(X)
    33  #endif
    34  
    35  /*
    36  ** Extract a 2-byte big-endian integer from an array of unsigned bytes.
    37  ** But if the value is zero, make it 65536.
    38  **
    39  ** This routine is used to extract the "offset to cell content area" value
    40  ** from the header of a btree page.  If the page size is 65536 and the page
    41  ** is empty, the offset should be 65536, but the 2-byte value stores zero.
    42  ** This routine makes the necessary adjustment to 65536.
    43  */
    44  #define get2byteNotZero(X)  (((((int)get2byte(X))-1)&0xffff)+1)
    45  
/*
** Values passed as the 5th argument to allocateBtreePage().  Each value
** selects one allocation mode; the "parameter" mentioned below is the
** page number supplied alongside the mode.
*/
#define BTALLOC_ANY   0           /* Allocate any page */
#define BTALLOC_EXACT 1           /* Allocate exact page if possible */
#define BTALLOC_LE    2           /* Allocate any page <= the parameter */
    52  
    53  /*
    54  ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not 
    55  ** defined, or 0 if it is. For example:
    56  **
    57  **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
    58  */
    59  #ifndef SQLITE_OMIT_AUTOVACUUM
    60  #define IfNotOmitAV(expr) (expr)
    61  #else
    62  #define IfNotOmitAV(expr) 0
    63  #endif
    64  
    65  #ifndef SQLITE_OMIT_SHARED_CACHE
    66  /*
    67  ** A list of BtShared objects that are eligible for participation
    68  ** in shared cache.  This variable has file scope during normal builds,
    69  ** but the test harness needs to access it so we make it global for 
    70  ** test builds.
    71  **
    72  ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MASTER.
    73  */
    74  #ifdef SQLITE_TEST
    75  BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    76  #else
    77  static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
    78  #endif
    79  #endif /* SQLITE_OMIT_SHARED_CACHE */
    80  
#ifndef SQLITE_OMIT_SHARED_CACHE
/*
** Enable or disable the shared pager and schema features.
**
** The value of "enable" is stored as-is in the global configuration
** (sqlite3GlobalConfig.sharedCacheEnabled).  The routine always returns
** SQLITE_OK.
**
** This routine has no effect on existing database connections.
** The shared cache setting affects only future calls to
** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
*/
int sqlite3_enable_shared_cache(int enable){
  sqlite3GlobalConfig.sharedCacheEnabled = enable;
  return SQLITE_OK;
}
#endif
    94  
    95  
    96  
#ifdef SQLITE_OMIT_SHARED_CACHE
  /*
  ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
  ** clearAllSharedCacheTableLocks() and downgradeAllSharedCacheTableLocks()
  ** manipulate entries in the BtShared.pLock linked list used to store
  ** shared-cache table level locks. If the library is compiled with the
  ** shared-cache feature disabled, then there is only ever one user
  ** of each BtShared structure and so this locking is not necessary. 
  ** So define the lock related functions as no-ops:
  **
  **   *  the query and set operations always report SQLITE_OK,
  **   *  the clear and downgrade operations expand to nothing,
  **   *  hasSharedCacheTableLock() is always true (1), and
  **   *  hasReadConflicts() is always false (0).
  */
  #define querySharedCacheTableLock(a,b,c) SQLITE_OK
  #define setSharedCacheTableLock(a,b,c) SQLITE_OK
  #define clearAllSharedCacheTableLocks(a)
  #define downgradeAllSharedCacheTableLocks(a)
  #define hasSharedCacheTableLock(a,b,c,d) 1
  #define hasReadConflicts(a, b) 0
#endif
   114  
   115  #ifndef SQLITE_OMIT_SHARED_CACHE
   116  
   117  #ifdef SQLITE_DEBUG
   118  /*
   119  **** This function is only used as part of an assert() statement. ***
   120  **
   121  ** Check to see if pBtree holds the required locks to read or write to the 
   122  ** table with root page iRoot.   Return 1 if it does and 0 if not.
   123  **
   124  ** For example, when writing to a table with root-page iRoot via 
   125  ** Btree connection pBtree:
   126  **
   127  **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
   128  **
   129  ** When writing to an index that resides in a sharable database, the 
   130  ** caller should have first obtained a lock specifying the root page of
   131  ** the corresponding table. This makes things a bit more complicated,
   132  ** as this module treats each table as a separate structure. To determine
   133  ** the table corresponding to the index being written, this
   134  ** function has to search through the database schema.
   135  **
   136  ** Instead of a lock on the table/index rooted at page iRoot, the caller may
   137  ** hold a write-lock on the schema table (root page 1). This is also
   138  ** acceptable.
   139  */
   140  static int hasSharedCacheTableLock(
   141    Btree *pBtree,         /* Handle that must hold lock */
   142    Pgno iRoot,            /* Root page of b-tree */
   143    int isIndex,           /* True if iRoot is the root of an index b-tree */
   144    int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
   145  ){
   146    Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
   147    Pgno iTab = 0;
   148    BtLock *pLock;
   149  
   150    /* If this database is not shareable, or if the client is reading
   151    ** and has the read-uncommitted flag set, then no lock is required. 
   152    ** Return true immediately.
   153    */
   154    if( (pBtree->sharable==0)
   155     || (eLockType==READ_LOCK && (pBtree->db->flags & SQLITE_ReadUncommitted))
   156    ){
   157      return 1;
   158    }
   159  
   160    /* If the client is reading  or writing an index and the schema is
   161    ** not loaded, then it is too difficult to actually check to see if
   162    ** the correct locks are held.  So do not bother - just return true.
   163    ** This case does not come up very often anyhow.
   164    */
   165    if( isIndex && (!pSchema || (pSchema->schemaFlags&DB_SchemaLoaded)==0) ){
   166      return 1;
   167    }
   168  
   169    /* Figure out the root-page that the lock should be held on. For table
   170    ** b-trees, this is just the root page of the b-tree being read or
   171    ** written. For index b-trees, it is the root page of the associated
   172    ** table.  */
   173    if( isIndex ){
   174      HashElem *p;
   175      for(p=sqliteHashFirst(&pSchema->idxHash); p; p=sqliteHashNext(p)){
   176        Index *pIdx = (Index *)sqliteHashData(p);
   177        if( pIdx->tnum==(int)iRoot ){
   178          iTab = pIdx->pTable->tnum;
   179        }
   180      }
   181    }else{
   182      iTab = iRoot;
   183    }
   184  
   185    /* Search for the required lock. Either a write-lock on root-page iTab, a 
   186    ** write-lock on the schema table, or (if the client is reading) a
   187    ** read-lock on iTab will suffice. Return 1 if any of these are found.  */
   188    for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
   189      if( pLock->pBtree==pBtree 
   190       && (pLock->iTable==iTab || (pLock->eLock==WRITE_LOCK && pLock->iTable==1))
   191       && pLock->eLock>=eLockType 
   192      ){
   193        return 1;
   194      }
   195    }
   196  
   197    /* Failed to find the required lock. */
   198    return 0;
   199  }
   200  #endif /* SQLITE_DEBUG */
   201  
   202  #ifdef SQLITE_DEBUG
   203  /*
   204  **** This function may be used as part of assert() statements only. ****
   205  **
   206  ** Return true if it would be illegal for pBtree to write into the
   207  ** table or index rooted at iRoot because other shared connections are
   208  ** simultaneously reading that same table or index.
   209  **
   210  ** It is illegal for pBtree to write if some other Btree object that
   211  ** shares the same BtShared object is currently reading or writing
   212  ** the iRoot table.  Except, if the other Btree object has the
   213  ** read-uncommitted flag set, then it is OK for the other object to
   214  ** have a read cursor.
   215  **
   216  ** For example, before writing to any part of the table or index
   217  ** rooted at page iRoot, one should call:
   218  **
   219  **    assert( !hasReadConflicts(pBtree, iRoot) );
   220  */
   221  static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
   222    BtCursor *p;
   223    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
   224      if( p->pgnoRoot==iRoot 
   225       && p->pBtree!=pBtree
   226       && 0==(p->pBtree->db->flags & SQLITE_ReadUncommitted)
   227      ){
   228        return 1;
   229      }
   230    }
   231    return 0;
   232  }
   233  #endif    /* #ifdef SQLITE_DEBUG */
   234  
   235  /*
   236  ** Query to see if Btree handle p may obtain a lock of type eLock 
   237  ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
   238  ** SQLITE_OK if the lock may be obtained (by calling
   239  ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
   240  */
   241  static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
   242    BtShared *pBt = p->pBt;
   243    BtLock *pIter;
   244  
   245    assert( sqlite3BtreeHoldsMutex(p) );
   246    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   247    assert( p->db!=0 );
   248    assert( !(p->db->flags&SQLITE_ReadUncommitted)||eLock==WRITE_LOCK||iTab==1 );
   249    
   250    /* If requesting a write-lock, then the Btree must have an open write
   251    ** transaction on this file. And, obviously, for this to be so there 
   252    ** must be an open write transaction on the file itself.
   253    */
   254    assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
   255    assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
   256    
   257    /* This routine is a no-op if the shared-cache is not enabled */
   258    if( !p->sharable ){
   259      return SQLITE_OK;
   260    }
   261  
   262    /* If some other connection is holding an exclusive lock, the
   263    ** requested lock may not be obtained.
   264    */
   265    if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
   266      sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
   267      return SQLITE_LOCKED_SHAREDCACHE;
   268    }
   269  
   270    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   271      /* The condition (pIter->eLock!=eLock) in the following if(...) 
   272      ** statement is a simplification of:
   273      **
   274      **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
   275      **
   276      ** since we know that if eLock==WRITE_LOCK, then no other connection
   277      ** may hold a WRITE_LOCK on any table in this file (since there can
   278      ** only be a single writer).
   279      */
   280      assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
   281      assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
   282      if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
   283        sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
   284        if( eLock==WRITE_LOCK ){
   285          assert( p==pBt->pWriter );
   286          pBt->btsFlags |= BTS_PENDING;
   287        }
   288        return SQLITE_LOCKED_SHAREDCACHE;
   289      }
   290    }
   291    return SQLITE_OK;
   292  }
   293  #endif /* !SQLITE_OMIT_SHARED_CACHE */
   294  
   295  #ifndef SQLITE_OMIT_SHARED_CACHE
   296  /*
   297  ** Add a lock on the table with root-page iTable to the shared-btree used
   298  ** by Btree handle p. Parameter eLock must be either READ_LOCK or 
   299  ** WRITE_LOCK.
   300  **
   301  ** This function assumes the following:
   302  **
   303  **   (a) The specified Btree object p is connected to a sharable
   304  **       database (one with the BtShared.sharable flag set), and
   305  **
   306  **   (b) No other Btree objects hold a lock that conflicts
   307  **       with the requested lock (i.e. querySharedCacheTableLock() has
   308  **       already been called and returned SQLITE_OK).
   309  **
   310  ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM 
   311  ** is returned if a malloc attempt fails.
   312  */
   313  static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
   314    BtShared *pBt = p->pBt;
   315    BtLock *pLock = 0;
   316    BtLock *pIter;
   317  
   318    assert( sqlite3BtreeHoldsMutex(p) );
   319    assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
   320    assert( p->db!=0 );
   321  
   322    /* A connection with the read-uncommitted flag set will never try to
   323    ** obtain a read-lock using this function. The only read-lock obtained
   324    ** by a connection in read-uncommitted mode is on the sqlite_master 
   325    ** table, and that lock is obtained in BtreeBeginTrans().  */
   326    assert( 0==(p->db->flags&SQLITE_ReadUncommitted) || eLock==WRITE_LOCK );
   327  
   328    /* This function should only be called on a sharable b-tree after it 
   329    ** has been determined that no other b-tree holds a conflicting lock.  */
   330    assert( p->sharable );
   331    assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );
   332  
   333    /* First search the list for an existing lock on this table. */
   334    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
   335      if( pIter->iTable==iTable && pIter->pBtree==p ){
   336        pLock = pIter;
   337        break;
   338      }
   339    }
   340  
   341    /* If the above search did not find a BtLock struct associating Btree p
   342    ** with table iTable, allocate one and link it into the list.
   343    */
   344    if( !pLock ){
   345      pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
   346      if( !pLock ){
   347        return SQLITE_NOMEM;
   348      }
   349      pLock->iTable = iTable;
   350      pLock->pBtree = p;
   351      pLock->pNext = pBt->pLock;
   352      pBt->pLock = pLock;
   353    }
   354  
   355    /* Set the BtLock.eLock variable to the maximum of the current lock
   356    ** and the requested lock. This means if a write-lock was already held
   357    ** and a read-lock requested, we don't incorrectly downgrade the lock.
   358    */
   359    assert( WRITE_LOCK>READ_LOCK );
   360    if( eLock>pLock->eLock ){
   361      pLock->eLock = eLock;
   362    }
   363  
   364    return SQLITE_OK;
   365  }
   366  #endif /* !SQLITE_OMIT_SHARED_CACHE */
   367  
   368  #ifndef SQLITE_OMIT_SHARED_CACHE
   369  /*
   370  ** Release all the table locks (locks obtained via calls to
   371  ** the setSharedCacheTableLock() procedure) held by Btree object p.
   372  **
   373  ** This function assumes that Btree p has an open read or write 
   374  ** transaction. If it does not, then the BTS_PENDING flag
   375  ** may be incorrectly cleared.
   376  */
   377  static void clearAllSharedCacheTableLocks(Btree *p){
   378    BtShared *pBt = p->pBt;
   379    BtLock **ppIter = &pBt->pLock;
   380  
   381    assert( sqlite3BtreeHoldsMutex(p) );
   382    assert( p->sharable || 0==*ppIter );
   383    assert( p->inTrans>0 );
   384  
   385    while( *ppIter ){
   386      BtLock *pLock = *ppIter;
   387      assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
   388      assert( pLock->pBtree->inTrans>=pLock->eLock );
   389      if( pLock->pBtree==p ){
   390        *ppIter = pLock->pNext;
   391        assert( pLock->iTable!=1 || pLock==&p->lock );
   392        if( pLock->iTable!=1 ){
   393          sqlite3_free(pLock);
   394        }
   395      }else{
   396        ppIter = &pLock->pNext;
   397      }
   398    }
   399  
   400    assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
   401    if( pBt->pWriter==p ){
   402      pBt->pWriter = 0;
   403      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
   404    }else if( pBt->nTransaction==2 ){
   405      /* This function is called when Btree p is concluding its 
   406      ** transaction. If there currently exists a writer, and p is not
   407      ** that writer, then the number of locks held by connections other
   408      ** than the writer must be about to drop to zero. In this case
   409      ** set the BTS_PENDING flag to 0.
   410      **
   411      ** If there is not currently a writer, then BTS_PENDING must
   412      ** be zero already. So this next line is harmless in that case.
   413      */
   414      pBt->btsFlags &= ~BTS_PENDING;
   415    }
   416  }
   417  
   418  /*
   419  ** This function changes all write-locks held by Btree p into read-locks.
   420  */
   421  static void downgradeAllSharedCacheTableLocks(Btree *p){
   422    BtShared *pBt = p->pBt;
   423    if( pBt->pWriter==p ){
   424      BtLock *pLock;
   425      pBt->pWriter = 0;
   426      pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
   427      for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
   428        assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
   429        pLock->eLock = READ_LOCK;
   430      }
   431    }
   432  }
   433  
   434  #endif /* SQLITE_OMIT_SHARED_CACHE */
   435  
   436  static void releasePage(MemPage *pPage);  /* Forward reference */
   437  
/*
***** This routine is used inside of assert() only ****
**
** Verify that the cursor holds the mutex on its BtShared.  The result is
** whatever sqlite3_mutex_held() reports for p->pBt->mutex.  The routine
** is compiled only when SQLITE_DEBUG is defined.
*/
#ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  return sqlite3_mutex_held(p->pBt->mutex);
}
#endif
   448  
   449  /*
   450  ** Invalidate the overflow cache of the cursor passed as the first argument.
   451  ** on the shared btree structure pBt.
   452  */
   453  #define invalidateOverflowCache(pCur) (pCur->curFlags &= ~BTCF_ValidOvfl)
   454  
   455  /*
   456  ** Invalidate the overflow page-list cache for all cursors opened
   457  ** on the shared btree structure pBt.
   458  */
   459  static void invalidateAllOverflowCache(BtShared *pBt){
   460    BtCursor *p;
   461    assert( sqlite3_mutex_held(pBt->mutex) );
   462    for(p=pBt->pCursor; p; p=p->pNext){
   463      invalidateOverflowCache(p);
   464    }
   465  }
   466  
   467  #ifndef SQLITE_OMIT_INCRBLOB
   468  /*
   469  ** This function is called before modifying the contents of a table
   470  ** to invalidate any incrblob cursors that are open on the
   471  ** row or one of the rows being modified.
   472  **
   473  ** If argument isClearTable is true, then the entire contents of the
   474  ** table is about to be deleted. In this case invalidate all incrblob
   475  ** cursors open on any row within the table with root-page pgnoRoot.
   476  **
   477  ** Otherwise, if argument isClearTable is false, then the row with
   478  ** rowid iRow is being replaced or deleted. In this case invalidate
   479  ** only those incrblob cursors open on that specific row.
   480  */
   481  static void invalidateIncrblobCursors(
   482    Btree *pBtree,          /* The database file to check */
   483    i64 iRow,               /* The rowid that might be changing */
   484    int isClearTable        /* True if all rows are being deleted */
   485  ){
   486    BtCursor *p;
   487    BtShared *pBt = pBtree->pBt;
   488    assert( sqlite3BtreeHoldsMutex(pBtree) );
   489    for(p=pBt->pCursor; p; p=p->pNext){
   490      if( (p->curFlags & BTCF_Incrblob)!=0
   491       && (isClearTable || p->info.nKey==iRow)
   492      ){
   493        p->eState = CURSOR_INVALID;
   494      }
   495    }
   496  }
   497  
   498  #else
   499    /* Stub function when INCRBLOB is omitted */
   500    #define invalidateIncrblobCursors(x,y,z)
   501  #endif /* SQLITE_OMIT_INCRBLOB */
   502  
   503  /*
   504  ** Set bit pgno of the BtShared.pHasContent bitvec. This is called 
   505  ** when a page that previously contained data becomes a free-list leaf 
   506  ** page.
   507  **
   508  ** The BtShared.pHasContent bitvec exists to work around an obscure
   509  ** bug caused by the interaction of two useful IO optimizations surrounding
   510  ** free-list leaf pages:
   511  **
   512  **   1) When all data is deleted from a page and the page becomes
   513  **      a free-list leaf page, the page is not written to the database
   514  **      (as free-list leaf pages contain no meaningful data). Sometimes
   515  **      such a page is not even journalled (as it will not be modified,
   516  **      why bother journalling it?).
   517  **
   518  **   2) When a free-list leaf page is reused, its content is not read
   519  **      from the database or written to the journal file (why should it
   520  **      be, if it is not at all meaningful?).
   521  **
   522  ** By themselves, these optimizations work fine and provide a handy
   523  ** performance boost to bulk delete or insert operations. However, if
   524  ** a page is moved to the free-list and then reused within the same
   525  ** transaction, a problem comes up. If the page is not journalled when
   526  ** it is moved to the free-list and it is also not journalled when it
   527  ** is extracted from the free-list and reused, then the original data
   528  ** may be lost. In the event of a rollback, it may not be possible
   529  ** to restore the database to its original configuration.
   530  **
   531  ** The solution is the BtShared.pHasContent bitvec. Whenever a page is 
   532  ** moved to become a free-list leaf page, the corresponding bit is
   533  ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
   534  ** optimization 2 above is omitted if the corresponding bit is already
   535  ** set in BtShared.pHasContent. The contents of the bitvec are cleared
   536  ** at the end of every transaction.
   537  */
   538  static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
   539    int rc = SQLITE_OK;
   540    if( !pBt->pHasContent ){
   541      assert( pgno<=pBt->nPage );
   542      pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
   543      if( !pBt->pHasContent ){
   544        rc = SQLITE_NOMEM;
   545      }
   546    }
   547    if( rc==SQLITE_OK && pgno<=sqlite3BitvecSize(pBt->pHasContent) ){
   548      rc = sqlite3BitvecSet(pBt->pHasContent, pgno);
   549    }
   550    return rc;
   551  }
   552  
   553  /*
   554  ** Query the BtShared.pHasContent vector.
   555  **
   556  ** This function is called when a free-list leaf page is removed from the
   557  ** free-list for reuse. It returns false if it is safe to retrieve the
   558  ** page from the pager layer with the 'no-content' flag set. True otherwise.
   559  */
   560  static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
   561    Bitvec *p = pBt->pHasContent;
   562    return (p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTest(p, pgno)));
   563  }
   564  
/*
** Clear (destroy) the BtShared.pHasContent bitvec. This should be
** invoked at the conclusion of each write-transaction.
**
** The pointer is reset to 0 after the bitvec is destroyed, so that the
** next call to btreeSetHasContent() creates a fresh bitvec.
*/
static void btreeClearHasContent(BtShared *pBt){
  sqlite3BitvecDestroy(pBt->pHasContent);
  pBt->pHasContent = 0;
}
   573  
   574  /*
   575  ** Release all of the apPage[] pages for a cursor.
   576  */
   577  static void btreeReleaseAllCursorPages(BtCursor *pCur){
   578    int i;
   579    for(i=0; i<=pCur->iPage; i++){
   580      releasePage(pCur->apPage[i]);
   581      pCur->apPage[i] = 0;
   582    }
   583    pCur->iPage = -1;
   584  }
   585  
   586  
   587  /*
   588  ** Save the current cursor position in the variables BtCursor.nKey 
   589  ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
   590  **
   591  ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
   592  ** prior to calling this routine.  
   593  */
   594  static int saveCursorPosition(BtCursor *pCur){
   595    int rc;
   596  
   597    assert( CURSOR_VALID==pCur->eState );
   598    assert( 0==pCur->pKey );
   599    assert( cursorHoldsMutex(pCur) );
   600  
   601    rc = sqlite3BtreeKeySize(pCur, &pCur->nKey);
   602    assert( rc==SQLITE_OK );  /* KeySize() cannot fail */
   603  
   604    /* If this is an intKey table, then the above call to BtreeKeySize()
   605    ** stores the integer key in pCur->nKey. In this case this value is
   606    ** all that is required. Otherwise, if pCur is not open on an intKey
   607    ** table, then malloc space for and store the pCur->nKey bytes of key 
   608    ** data.
   609    */
   610    if( 0==pCur->apPage[0]->intKey ){
   611      void *pKey = sqlite3Malloc( pCur->nKey );
   612      if( pKey ){
   613        rc = sqlite3BtreeKey(pCur, 0, (int)pCur->nKey, pKey);
   614        if( rc==SQLITE_OK ){
   615          pCur->pKey = pKey;
   616        }else{
   617          sqlite3_free(pKey);
   618        }
   619      }else{
   620        rc = SQLITE_NOMEM;
   621      }
   622    }
   623    assert( !pCur->apPage[0]->intKey || !pCur->pKey );
   624  
   625    if( rc==SQLITE_OK ){
   626      btreeReleaseAllCursorPages(pCur);
   627      pCur->eState = CURSOR_REQUIRESEEK;
   628    }
   629  
   630    invalidateOverflowCache(pCur);
   631    return rc;
   632  }
   633  
   634  /* Forward reference */
   635  static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
   636  
   637  /*
   638  ** Save the positions of all cursors (except pExcept) that are open on
   639  ** the table with root-page iRoot.  "Saving the cursor position" means that
   640  ** the location in the btree is remembered in such a way that it can be
   641  ** moved back to the same spot after the btree has been modified.  This
   642  ** routine is called just before cursor pExcept is used to modify the
   643  ** table, for example in BtreeDelete() or BtreeInsert().
   644  **
   645  ** Implementation note:  This routine merely checks to see if any cursors
   646  ** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
   647  ** event that cursors are in need to being saved.
   648  */
   649  static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
   650    BtCursor *p;
   651    assert( sqlite3_mutex_held(pBt->mutex) );
   652    assert( pExcept==0 || pExcept->pBt==pBt );
   653    for(p=pBt->pCursor; p; p=p->pNext){
   654      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ) break;
   655    }
   656    return p ? saveCursorsOnList(p, iRoot, pExcept) : SQLITE_OK;
   657  }
   658  
   659  /* This helper routine to saveAllCursors does the actual work of saving
   660  ** the cursors if and when a cursor is found that actually requires saving.
   661  ** The common case is that no cursors need to be saved, so this routine is
   662  ** broken out from its caller to avoid unnecessary stack pointer movement.
   663  */
   664  static int SQLITE_NOINLINE saveCursorsOnList(
   665    BtCursor *p,         /* The first cursor that needs saving */
   666    Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
   667    BtCursor *pExcept    /* Do not save this cursor */
   668  ){
   669    do{
   670      if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
   671        if( p->eState==CURSOR_VALID ){
   672          int rc = saveCursorPosition(p);
   673          if( SQLITE_OK!=rc ){
   674            return rc;
   675          }
   676        }else{
   677          testcase( p->iPage>0 );
   678          btreeReleaseAllCursorPages(p);
   679        }
   680      }
   681      p = p->pNext;
   682    }while( p );
   683    return SQLITE_OK;
   684  }
   685  
/*
** Clear the current cursor position.
**
** Any saved key attached to the cursor (pCur->pKey) is freed and the
** pointer reset to 0, and the cursor state is set to CURSOR_INVALID.
*/
void sqlite3BtreeClearCursor(BtCursor *pCur){
  assert( cursorHoldsMutex(pCur) );
  sqlite3_free(pCur->pKey);
  pCur->pKey = 0;
  pCur->eState = CURSOR_INVALID;
}
   695  
   696  /*
   697  ** In this version of BtreeMoveto, pKey is a packed index record
   698  ** such as is generated by the OP_MakeRecord opcode.  Unpack the
   699  ** record and then call BtreeMovetoUnpacked() to do the work.
   700  */
   701  static int btreeMoveto(
   702    BtCursor *pCur,     /* Cursor open on the btree to be searched */
   703    const void *pKey,   /* Packed key if the btree is an index */
   704    i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
   705    int bias,           /* Bias search to the high end */
   706    int *pRes           /* Write search results here */
   707  ){
   708    int rc;                    /* Status code */
   709    UnpackedRecord *pIdxKey;   /* Unpacked index key */
   710    char aSpace[200];          /* Temp space for pIdxKey - to avoid a malloc */
   711    char *pFree = 0;
   712  
   713    if( pKey ){
   714      assert( nKey==(i64)(int)nKey );
   715      pIdxKey = sqlite3VdbeAllocUnpackedRecord(
   716          pCur->pKeyInfo, aSpace, sizeof(aSpace), &pFree
   717      );
   718      if( pIdxKey==0 ) return SQLITE_NOMEM;
   719      sqlite3VdbeRecordUnpack(pCur->pKeyInfo, (int)nKey, pKey, pIdxKey);
   720      if( pIdxKey->nField==0 ){
   721        sqlite3DbFree(pCur->pKeyInfo->db, pFree);
   722        return SQLITE_CORRUPT_BKPT;
   723      }
   724    }else{
   725      pIdxKey = 0;
   726    }
   727    rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
   728    if( pFree ){
   729      sqlite3DbFree(pCur->pKeyInfo->db, pFree);
   730    }
   731    return rc;
   732  }
   733  
   734  /*
   735  ** Restore the cursor to the position it was in (or as close to as possible)
   736  ** when saveCursorPosition() was called. Note that this call deletes the 
   737  ** saved position info stored by saveCursorPosition(), so there can be
   738  ** at most one effective restoreCursorPosition() call after each 
   739  ** saveCursorPosition().
   740  */
   741  static int btreeRestoreCursorPosition(BtCursor *pCur){
   742    int rc;
   743    assert( cursorHoldsMutex(pCur) );
   744    assert( pCur->eState>=CURSOR_REQUIRESEEK );
   745    if( pCur->eState==CURSOR_FAULT ){
   746      return pCur->skipNext;
   747    }
   748    pCur->eState = CURSOR_INVALID;
   749    rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &pCur->skipNext);
   750    if( rc==SQLITE_OK ){
   751      sqlite3_free(pCur->pKey);
   752      pCur->pKey = 0;
   753      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
   754      if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
   755        pCur->eState = CURSOR_SKIPNEXT;
   756      }
   757    }
   758    return rc;
   759  }
   760  
   761  #define restoreCursorPosition(p) \
   762    (p->eState>=CURSOR_REQUIRESEEK ? \
   763           btreeRestoreCursorPosition(p) : \
   764           SQLITE_OK)
   765  
   766  /*
   767  ** Determine whether or not a cursor has moved from the position where
   768  ** it was last placed, or has been invalidated for any other reason.
   769  ** Cursors can move when the row they are pointing at is deleted out
   770  ** from under them, for example.  Cursor might also move if a btree
   771  ** is rebalanced.
   772  **
   773  ** Calling this routine with a NULL cursor pointer returns false.
        ** For a non-NULL cursor the result is true whenever pCur->eState is
        ** anything other than CURSOR_VALID.
   774  **
   775  ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
   776  ** back to where it ought to be if this routine returns true.
   777  */
   778  int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
   779    return pCur && pCur->eState!=CURSOR_VALID;
   780  }
   781  
   782  /*
   783  ** This routine restores a cursor back to its original position after it
   784  ** has been moved by some outside activity (such as a btree rebalance or
   785  ** a row having been deleted out from under the cursor).  
   786  **
   787  ** On success, the *pDifferentRow parameter is false if the cursor is left
   788  ** pointing at exactly the same row.  *pDifferentRow is true if the row the
   789  ** cursor was pointing to has been deleted, forcing the cursor to point to
   790  ** some nearby row.
        **
        ** If the position cannot be restored, *pDifferentRow is set to true and
        ** the error code from the restore attempt is returned.
   791  **
   792  ** This routine should only be called for a cursor that just returned
   793  ** TRUE from sqlite3BtreeCursorHasMoved().
   794  */
   795  int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
   796    int rc;
   797  
   798    assert( pCur!=0 );
   799    assert( pCur->eState!=CURSOR_VALID );
   800    rc = restoreCursorPosition(pCur);
   801    if( rc ){
   802      *pDifferentRow = 1;
   803      return rc;
   804    }
          /* The row counts as "the same" only if the cursor ended up valid
          ** and skipNext is zero. */
   805    if( pCur->eState!=CURSOR_VALID || NEVER(pCur->skipNext!=0) ){
   806      *pDifferentRow = 1;
   807    }else{
   808      *pDifferentRow = 0;
   809    }
   810    return SQLITE_OK;
   811  }
   812  
   813  #ifndef SQLITE_OMIT_AUTOVACUUM
   814  /*
   815  ** Given a page number of a regular database page, return the page
   816  ** number for the pointer-map page that contains the entry for the
   817  ** input page number.
   818  **
   819  ** Return 0 (not a valid page) for pgno==1 since there is
   820  ** no pointer map associated with page 1.  The integrity_check logic
   821  ** requires that ptrmapPageno(*,1)!=1.
        ** (The code returns 0 for any pgno less than 2.)
        **
        ** The computation places pointer-map pages at page 2 and then at every
        ** (usableSize/5)+1 pages after that.  If the computed page number equals
        ** PENDING_BYTE_PAGE(pBt), the following page number is returned instead.
   822  */
   823  static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
   824    int nPagesPerMapPage;
   825    Pgno iPtrMap, ret;
   826    assert( sqlite3_mutex_held(pBt->mutex) );
   827    if( pgno<2 ) return 0;
          /* One pointer-map page plus the usableSize/5 pages counted with it */
   828    nPagesPerMapPage = (pBt->usableSize/5)+1;
          /* Zero-based index of the pointer-map group that pgno falls in */
   829    iPtrMap = (pgno-2)/nPagesPerMapPage;
   830    ret = (iPtrMap*nPagesPerMapPage) + 2; 
   831    if( ret==PENDING_BYTE_PAGE(pBt) ){
   832      ret++;
   833    }
   834    return ret;
   835  }
   836  
   837  /*
   838  ** Write an entry into the pointer map.
   839  **
   840  ** This routine updates the pointer map entry for page number 'key'
   841  ** so that it maps to type 'eType' and parent page number 'parent'.
   842  **
   843  ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
   844  ** a no-op.  If an error occurs, the appropriate error code is written
   845  ** into *pRC.
        **
        ** A key of 0, or a negative entry offset, is reported as
        ** SQLITE_CORRUPT_BKPT.  The pointer-map page is only made writable and
        ** modified when the stored entry differs from the requested one.
   846  */
   847  static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
   848    DbPage *pDbPage;  /* The pointer map page */
   849    u8 *pPtrmap;      /* The pointer map data */
   850    Pgno iPtrmap;     /* The pointer map page number */
   851    int offset;       /* Offset in pointer map page */
   852    int rc;           /* Return code from subfunctions */
   853  
   854    if( *pRC ) return;
   855  
   856    assert( sqlite3_mutex_held(pBt->mutex) );
   857    /* The pending-byte page must never be used as a pointer map page */
   858    assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
   859  
   860    assert( pBt->autoVacuum );
   861    if( key==0 ){
   862      *pRC = SQLITE_CORRUPT_BKPT;
   863      return;
   864    }
          /* Fetch the pointer-map page that holds the entry for 'key' */
   865    iPtrmap = PTRMAP_PAGENO(pBt, key);
   866    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
   867    if( rc!=SQLITE_OK ){
   868      *pRC = rc;
   869      return;
   870    }
   871    offset = PTRMAP_PTROFFSET(iPtrmap, key);
   872    if( offset<0 ){
   873      *pRC = SQLITE_CORRUPT_BKPT;
   874      goto ptrmap_exit;
   875    }
   876    assert( offset <= (int)pBt->usableSize-5 );
   877    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   878  
          /* Each entry is a 1-byte type followed by a 4-byte parent page
          ** number.  Skip the write entirely if nothing would change. */
   879    if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
   880      TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
   881      *pRC= rc = sqlite3PagerWrite(pDbPage);
   882      if( rc==SQLITE_OK ){
   883        pPtrmap[offset] = eType;
   884        put4byte(&pPtrmap[offset+1], parent);
   885      }
   886    }
   887  
        /* Common exit: drop the reference taken by sqlite3PagerGet() */
   888  ptrmap_exit:
   889    sqlite3PagerUnref(pDbPage);
   890  }
   891  
   892  /*
   893  ** Read an entry from the pointer map.
   894  **
   895  ** This routine retrieves the pointer map entry for page 'key', writing
   896  ** the type and parent page number to *pEType and *pPgno respectively.
   897  ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
        **
        ** pEType must not be NULL.  pPgno may be NULL, in which case the parent
        ** page number is not reported.  SQLITE_CORRUPT_BKPT is returned if the
        ** entry offset is negative or if the stored type is outside the range
        ** 1 through 5; in the latter case *pEType (and *pPgno, if supplied) have
        ** already been written.
   898  */
   899  static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
   900    DbPage *pDbPage;   /* The pointer map page */
   901    int iPtrmap;       /* Pointer map page index */
   902    u8 *pPtrmap;       /* Pointer map page data */
   903    int offset;        /* Offset of entry in pointer map */
   904    int rc;
   905  
   906    assert( sqlite3_mutex_held(pBt->mutex) );
   907  
          /* Fetch the pointer-map page that holds the entry for 'key' */
   908    iPtrmap = PTRMAP_PAGENO(pBt, key);
   909    rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage);
   910    if( rc!=0 ){
   911      return rc;
   912    }
   913    pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
   914  
   915    offset = PTRMAP_PTROFFSET(iPtrmap, key);
   916    if( offset<0 ){
   917      sqlite3PagerUnref(pDbPage);
   918      return SQLITE_CORRUPT_BKPT;
   919    }
   920    assert( offset <= (int)pBt->usableSize-5 );
   921    assert( pEType!=0 );
          /* Entry layout: 1-byte type followed by a 4-byte parent page number */
   922    *pEType = pPtrmap[offset];
   923    if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
   924  
   925    sqlite3PagerUnref(pDbPage);
   926    if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_BKPT;
   927    return SQLITE_OK;
   928  }
   929  
   930  #else /* if defined SQLITE_OMIT_AUTOVACUUM */
   931    #define ptrmapPut(w,x,y,z,rc)           /* expands to nothing */
   932    #define ptrmapGet(w,x,y,z) SQLITE_OK    /* always evaluates to SQLITE_OK */
   933    #define ptrmapPutOvflPtr(x, y, rc)      /* expands to nothing */
   934  #endif
   935  
   936  /*
   937  ** Given a btree page and a cell index (0 means the first cell on
   938  ** the page, 1 means the second cell, and so forth) return a pointer
   939  ** to the cell content.
   940  **
   941  ** This routine works only for pages that do not contain overflow cells.
        **
        ** findCellv2() performs the same computation on raw values instead of
        ** a MemPage.  By analogy with findCell(): D is the page data pointer,
        ** M is the offset mask, O is the offset of the cell pointer array
        ** within D, and I is the cell index.  Every macro parameter is
        ** parenthesized so that arbitrary expressions may be passed.  Note that
        ** P (in findCell) and D (in findCellv2) are each evaluated more than
        ** once.
   942  */
   943  #define findCell(P,I) \
   944    ((P)->aData + ((P)->maskPage & get2byte(&(P)->aCellIdx[2*(I)])))
   945  #define findCellv2(D,M,O,I) ((D)+((M)&get2byte((D)+((O)+2*(I)))))
   946  
   947  
   948  /*
   949  ** This is a more complex version of findCell() that works for
   950  ** pages that do contain overflow cells.
        **
        ** The overflow entries are scanned from last to first.  If entry i has
        ** index aiOvfl[i] equal to iCell, the matching apOvfl[i] pointer is
        ** returned.  Each overflow entry whose index is below iCell reduces
        ** iCell by one; whatever remains is looked up with findCell().
   951  */
   952  static u8 *findOverflowCell(MemPage *pPage, int iCell){
   953    int i;
   954    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   955    for(i=pPage->nOverflow-1; i>=0; i--){
   956      int k;
   957      k = pPage->aiOvfl[i];
   958      if( k<=iCell ){
   959        if( k==iCell ){
   960          return pPage->apOvfl[i];
   961        }
              /* An overflow cell sits before iCell: adjust the index */
   962        iCell--;
   963      }
   964    }
   965    return findCell(pPage, iCell);
   966  }
   967  
   968  /*
   969  ** Parse a cell content block and fill in the CellInfo structure.  There
   970  ** are two versions of this function.  btreeParseCell() takes a 
   971  ** cell index as the second argument and btreeParseCellPtr() 
   972  ** takes a pointer to the body of the cell as its second argument.
        **
        ** Fields written to *pInfo: nKey, nPayload, pPayload, nLocal, iOverflow
        ** and nSize.  iOverflow is left at 0 when the whole payload is stored
        ** locally; otherwise it is the offset, measured from pCell, of the
        ** 4 bytes that follow the locally stored payload.
   973  */
   974  static void btreeParseCellPtr(
   975    MemPage *pPage,         /* Page containing the cell */
   976    u8 *pCell,              /* Pointer to the cell text. */
   977    CellInfo *pInfo         /* Fill in this structure */
   978  ){
   979    u8 *pIter;              /* For scanning through pCell */
   980    u32 nPayload;           /* Number of bytes of cell payload */
   981  
   982    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
   983    assert( pPage->leaf==0 || pPage->leaf==1 );
   984    if( pPage->intKeyLeaf ){
            /* Cell starts with a varint payload size followed by a varint
            ** integer key.  There is no child pointer. */
   985      assert( pPage->childPtrSize==0 );
   986      pIter = pCell + getVarint32(pCell, nPayload);
   987      pIter += getVarint(pIter, (u64*)&pInfo->nKey);
   988    }else if( pPage->noPayload ){
            /* Cell is a 4-byte child pointer followed by a varint key and
            ** nothing else, so all payload fields are zeroed and we are done. */
   989      assert( pPage->childPtrSize==4 );
   990      pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
   991      pInfo->nPayload = 0;
   992      pInfo->nLocal = 0;
   993      pInfo->iOverflow = 0;
   994      pInfo->pPayload = 0;
   995      return;
   996    }else{
            /* Skip the child pointer (if any), then read the varint payload
            ** size.  nKey is set to the payload size in this case. */
   997      pIter = pCell + pPage->childPtrSize;
   998      pIter += getVarint32(pIter, nPayload);
   999      pInfo->nKey = nPayload;
  1000    }
  1001    pInfo->nPayload = nPayload;
  1002    pInfo->pPayload = pIter;
  1003    testcase( nPayload==pPage->maxLocal );
  1004    testcase( nPayload==pPage->maxLocal+1 );
  1005    if( nPayload<=pPage->maxLocal ){
  1006      /* This is the (easy) common case where the entire payload fits
  1007      ** on the local page.  No overflow is required.
            ** The reported cell size is never less than 4.
  1008      */
  1009      pInfo->nSize = nPayload + (u16)(pIter - pCell);
  1010      if( pInfo->nSize<4 ) pInfo->nSize = 4;
  1011      pInfo->nLocal = (u16)nPayload;
  1012      pInfo->iOverflow = 0;
  1013    }else{
  1014      /* If the payload will not fit completely on the local page, we have
  1015      ** to decide how much to store locally and how much to spill onto
  1016      ** overflow pages.  The strategy is to minimize the amount of unused
  1017      ** space on overflow pages while keeping the amount of local storage
  1018      ** in between minLocal and maxLocal.
  1019      **
  1020      ** Warning:  changing the way overflow payload is distributed in any
  1021      ** way will result in an incompatible file format.
  1022      */
  1023      int minLocal;  /* Minimum amount of payload held locally */
  1024      int maxLocal;  /* Maximum amount of payload held locally */
  1025      int surplus;   /* Overflow payload available for local storage */
  1026  
  1027      minLocal = pPage->minLocal;
  1028      maxLocal = pPage->maxLocal;
  1029      surplus = minLocal + (nPayload - minLocal)%(pPage->pBt->usableSize - 4);
  1030      testcase( surplus==maxLocal );
  1031      testcase( surplus==maxLocal+1 );
  1032      if( surplus <= maxLocal ){
  1033        pInfo->nLocal = (u16)surplus;
  1034      }else{
  1035        pInfo->nLocal = (u16)minLocal;
  1036      }
            /* The cell ends 4 bytes after the locally stored payload */
  1037      pInfo->iOverflow = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell);
  1038      pInfo->nSize = pInfo->iOverflow + 4;
  1039    }
  1040  }
  1041  static void btreeParseCell(
  1042    MemPage *pPage,         /* Page containing the cell */
  1043    int iCell,              /* The cell index.  First cell is 0 */
  1044    CellInfo *pInfo         /* Fill in this structure */
  1045  ){
          /* Index-based wrapper: locate the cell with findCell() and hand it
          ** to btreeParseCellPtr(). */
  1046    btreeParseCellPtr(pPage, findCell(pPage, iCell), pInfo);
  1047  }
  1048  
  1049  /*
  1050  ** Compute the total number of bytes that a Cell needs in the cell
  1051  ** data area of the btree-page.  The return number includes the cell
  1052  ** data header and the local payload, but not any overflow page or
  1053  ** the space used by the cell pointer.
        **
        ** The result is computed without filling in a CellInfo.  When
        ** SQLITE_DEBUG is defined it is cross-checked against the nSize value
        ** produced by btreeParseCellPtr().
  1054  */
  1055  static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
  1056    u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
  1057    u8 *pEnd;                                /* End mark for a varint */
  1058    u32 nSize;                               /* Size value to return */
  1059  
  1060  #ifdef SQLITE_DEBUG
  1061    /* The value returned by this function should always be the same as
  1062    ** the (CellInfo.nSize) value found by doing a full parse of the
  1063    ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
  1064    ** this function verifies that this invariant is not violated. */
  1065    CellInfo debuginfo;
  1066    btreeParseCellPtr(pPage, pCell, &debuginfo);
  1067  #endif
  1068  
  1069    if( pPage->noPayload ){
            /* Cell is a child pointer followed only by a varint key.  Step
            ** over the varint (scanning at most 9 bytes) and return the
            ** distance covered. */
  1070      pEnd = &pIter[9];
  1071      while( (*pIter++)&0x80 && pIter<pEnd );
  1072      assert( pPage->childPtrSize==4 );
  1073      return (u16)(pIter - pCell);
  1074    }
          /* Decode the payload-size varint into nSize, 7 bits per byte */
  1075    nSize = *pIter;
  1076    if( nSize>=0x80 ){
  1077      pEnd = &pIter[9];
  1078      nSize &= 0x7f;
  1079      do{
  1080        nSize = (nSize<<7) | (*++pIter & 0x7f);
  1081      }while( *(pIter)>=0x80 && pIter<pEnd );
  1082    }
  1083    pIter++;
  1084    if( pPage->intKey ){
  1085      /* pIter now points at the 64-bit integer key value, a variable length 
  1086      ** integer. The following block moves pIter to point at the first byte
  1087      ** past the end of the key value. */
  1088      pEnd = &pIter[9];
  1089      while( (*pIter++)&0x80 && pIter<pEnd );
  1090    }
  1091    testcase( nSize==pPage->maxLocal );
  1092    testcase( nSize==pPage->maxLocal+1 );
  1093    if( nSize<=pPage->maxLocal ){
            /* Whole payload is local: header bytes plus payload, minimum 4 */
  1094      nSize += (u32)(pIter - pCell);
  1095      if( nSize<4 ) nSize = 4;
  1096    }else{
            /* Payload spills to overflow: work out the locally stored amount
            ** using the same formula as btreeParseCellPtr(), then add the
            ** header bytes and 4 trailing bytes. */
  1097      int minLocal = pPage->minLocal;
  1098      nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
  1099      testcase( nSize==pPage->maxLocal );
  1100      testcase( nSize==pPage->maxLocal+1 );
  1101      if( nSize>pPage->maxLocal ){
  1102        nSize = minLocal;
  1103      }
  1104      nSize += 4 + (u16)(pIter - pCell);
  1105    }
  1106    assert( nSize==debuginfo.nSize || CORRUPT_DB );
  1107    return (u16)nSize;
  1108  }
  1109  
  1110  #ifdef SQLITE_DEBUG
  1111  /* This variation on cellSizePtr() is used inside of assert() statements
  1112  ** only.  It takes a cell index rather than a cell pointer, locates the
        ** cell with findCell(), and returns what cellSizePtr() reports for it.
        ** It is compiled only when SQLITE_DEBUG is defined. */
  1113  static u16 cellSize(MemPage *pPage, int iCell){
  1114    return cellSizePtr(pPage, findCell(pPage, iCell));
  1115  }
  1116  #endif
  1117  
  1118  #ifndef SQLITE_OMIT_AUTOVACUUM
  1119  /*
  1120  ** If the cell pCell, part of page pPage contains a pointer
  1121  ** to an overflow page, insert an entry into the pointer-map
  1122  ** for the overflow page.
        **
        ** This routine is a no-op if *pRC is already non-zero on entry.  Any
        ** error from ptrmapPut() is reported through *pRC.
  1123  */
  1124  static void ptrmapPutOvflPtr(MemPage *pPage, u8 *pCell, int *pRC){
  1125    CellInfo info;
  1126    if( *pRC ) return;
  1127    assert( pCell!=0 );
  1128    btreeParseCellPtr(pPage, pCell, &info);
          /* A non-zero iOverflow is the offset within the cell of the 4-byte
          ** overflow page number.  Record that page as type PTRMAP_OVERFLOW1
          ** with this page as its parent. */
  1129    if( info.iOverflow ){
  1130      Pgno ovfl = get4byte(&pCell[info.iOverflow]);
  1131      ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
  1132    }
  1133  }
  1134  #endif
  1135  
  1136  
  1137  /*
  1138  ** Defragment the page given.  All Cells are moved to the
  1139  ** end of the page and all free space is collected into one
  1140  ** big FreeBlk that occurs in between the header and cell
  1141  ** pointer array and the cell content area.
        **
        ** Returns SQLITE_OK on success.  SQLITE_CORRUPT_BKPT is returned if a
        ** cell pointer or cell extent is out of range, or if the amount of free
        ** space found after repacking does not equal pPage->nFree.  The page
        ** must have no overflow cells and must already be writable.
  1142  */
  1143  static int defragmentPage(MemPage *pPage){
  1144    int i;                     /* Loop counter */
  1145    int pc;                    /* Address of the i-th cell */
  1146    int hdr;                   /* Offset to the page header */
  1147    int size;                  /* Size of a cell */
  1148    int usableSize;            /* Number of usable bytes on a page */
  1149    int cellOffset;            /* Offset to the cell pointer array */
  1150    int cbrk;                  /* Offset to the cell content area */
  1151    int nCell;                 /* Number of cells on the page */
  1152    unsigned char *data;       /* The page data */
  1153    unsigned char *temp;       /* Temp area for cell content */
  1154    int iCellFirst;            /* First allowable cell index */
  1155    int iCellLast;             /* Last possible cell index */
  1156  
  1157  
  1158    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1159    assert( pPage->pBt!=0 );
  1160    assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  1161    assert( pPage->nOverflow==0 );
  1162    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1163    temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
  1164    data = pPage->aData;
  1165    hdr = pPage->hdrOffset;
  1166    cellOffset = pPage->cellOffset;
  1167    nCell = pPage->nCell;
  1168    assert( nCell==get2byte(&data[hdr+3]) );
  1169    usableSize = pPage->pBt->usableSize;
          /* Snapshot the current cell content area into temp[] at the same
          ** offsets, so cells are read from the copy while data[] is being
          ** rewritten.  cbrk then restarts at the end of the usable area. */
  1170    cbrk = get2byte(&data[hdr+5]);
  1171    memcpy(&temp[cbrk], &data[cbrk], usableSize - cbrk);
  1172    cbrk = usableSize;
  1173    iCellFirst = cellOffset + 2*nCell;
  1174    iCellLast = usableSize - 4;
          /* Repack the cells one at a time, in cell-pointer order, working
          ** downward from the end of the page. */
  1175    for(i=0; i<nCell; i++){
  1176      u8 *pAddr;     /* The i-th cell pointer */
  1177      pAddr = &data[cellOffset + i*2];
  1178      pc = get2byte(pAddr);
  1179      testcase( pc==iCellFirst );
  1180      testcase( pc==iCellLast );
  1181  #if !defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
  1182      /* These conditions have already been verified in btreeInitPage()
  1183      ** if SQLITE_ENABLE_OVERSIZE_CELL_CHECK is defined 
  1184      */
  1185      if( pc<iCellFirst || pc>iCellLast ){
  1186        return SQLITE_CORRUPT_BKPT;
  1187      }
  1188  #endif
  1189      assert( pc>=iCellFirst && pc<=iCellLast );
  1190      size = cellSizePtr(pPage, &temp[pc]);
  1191      cbrk -= size;
  1192  #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
  1193      if( cbrk<iCellFirst ){
  1194        return SQLITE_CORRUPT_BKPT;
  1195      }
  1196  #else
  1197      if( cbrk<iCellFirst || pc+size>usableSize ){
  1198        return SQLITE_CORRUPT_BKPT;
  1199      }
  1200  #endif
  1201      assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
  1202      testcase( cbrk+size==usableSize );
  1203      testcase( pc+size==usableSize );
            /* Copy the cell to its new home and repoint the cell pointer */
  1204      memcpy(&data[cbrk], &temp[pc], size);
  1205      put2byte(pAddr, cbrk);
  1206    }
  1207    assert( cbrk>=iCellFirst );
          /* Record the new start of the cell content area, clear the
          ** first-freeblock pointer (hdr+1..hdr+2) and the fragmented-byte
          ** count (hdr+7), and zero the gap that is now free. */
  1208    put2byte(&data[hdr+5], cbrk);
  1209    data[hdr+1] = 0;
  1210    data[hdr+2] = 0;
  1211    data[hdr+7] = 0;
  1212    memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  1213    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
          /* The single remaining gap must account for all of the free space */
  1214    if( cbrk-iCellFirst!=pPage->nFree ){
  1215      return SQLITE_CORRUPT_BKPT;
  1216    }
  1217    return SQLITE_OK;
  1218  }
  1219  
  1220  /*
  1221  ** Allocate nByte bytes of space from within the B-Tree page passed
  1222  ** as the first argument. Write into *pIdx the index into pPage->aData[]
  1223  ** of the first byte of allocated space. Return either SQLITE_OK or
  1224  ** an error code (usually SQLITE_CORRUPT).
  1225  **
  1226  ** The caller guarantees that there is sufficient space to make the
  1227  ** allocation.  This routine might need to defragment in order to bring
  1228  ** all the space together, however.  This routine will avoid using
  1229  ** the first two bytes past the cell pointer area since presumably this
  1230  ** allocation is being made in order to insert a new cell, so we will
  1231  ** also end up needing a new cell pointer.
        **
        ** The allocation is taken from a freelist slot when one is large
        ** enough, and otherwise from the gap between the cell pointer array and
        ** the cell content area, defragmenting the page first if that gap is
        ** too small.  This routine does not itself modify pPage->nFree.
  1232  */
  1233  static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
  1234    const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
  1235    u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
  1236    int top;                             /* First byte of cell content area */
  1237    int gap;        /* First byte of gap between cell pointers and cell content */
  1238    int rc;         /* Integer return code */
  1239    int usableSize; /* Usable size of the page */
  1240    
  1241    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1242    assert( pPage->pBt );
  1243    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1244    assert( nByte>=0 );  /* Minimum cell size is 4 */
  1245    assert( pPage->nFree>=nByte );
  1246    assert( pPage->nOverflow==0 );
  1247    usableSize = pPage->pBt->usableSize;
  1248    assert( nByte < usableSize-8 );
  1249  
  1250    assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
  1251    gap = pPage->cellOffset + 2*pPage->nCell;
  1252    assert( gap<=65536 );
  1253    top = get2byte(&data[hdr+5]);
          /* The content area can only start before the end of the cell
          ** pointer array if the stored offset is 0, which stands for 65536.
          ** Anything else is corruption. */
  1254    if( gap>top ){
  1255      if( top==0 ){
  1256        top = 65536;
  1257      }else{
  1258        return SQLITE_CORRUPT_BKPT;
  1259      }
  1260    }
  1261  
  1262    /* If there is enough space between gap and top for one more cell pointer
  1263    ** array entry offset, and if the freelist is not empty, then search the
  1264    ** freelist looking for a free slot big enough to satisfy the request.
  1265    */
  1266    testcase( gap+2==top );
  1267    testcase( gap+1==top );
  1268    testcase( gap==top );
  1269    if( gap+2<=top && (data[hdr+1] || data[hdr+2]) ){
  1270      int pc, addr;
            /* addr is where the pointer to slot pc is stored; the walk begins
            ** at the first-freeblock field in the page header. */
  1271      for(addr=hdr+1; (pc = get2byte(&data[addr]))>0; addr=pc){
  1272        int size;            /* Size of the free slot */
  1273        if( pc>usableSize-4 || pc<addr+4 ){
  1274          return SQLITE_CORRUPT_BKPT;
  1275        }
  1276        size = get2byte(&data[pc+2]);
  1277        if( size>=nByte ){
                /* x is the number of bytes left over in this slot */
  1278          int x = size - nByte;
  1279          testcase( x==4 );
  1280          testcase( x==3 );
  1281          if( x<4 ){
                  /* Fewer than 4 bytes would remain.  Give up on the freelist
                  ** and defragment if the fragment count is already 60 or
                  ** more; otherwise consume the whole slot. */
  1282            if( data[hdr+7]>=60 ) goto defragment_page;
  1283            /* Remove the slot from the free-list. Update the number of
  1284            ** fragmented bytes within the page. */
  1285            memcpy(&data[addr], &data[pc], 2);
  1286            data[hdr+7] += (u8)x;
  1287          }else if( size+pc > usableSize ){
  1288            return SQLITE_CORRUPT_BKPT;
  1289          }else{
  1290            /* The slot remains on the free-list. Reduce its size to account
  1291            ** for the portion used by the new allocation. */
  1292            put2byte(&data[pc+2], x);
  1293          }
                /* The allocation is carved from the tail end of the slot */
  1294          *pIdx = pc + x;
  1295          return SQLITE_OK;
  1296        }
  1297      }
  1298    }
  1299  
  1300    /* The request could not be fulfilled using a freelist slot.  Check
  1301    ** to see if defragmentation is necessary.
  1302    */
  1303    testcase( gap+2+nByte==top );
  1304    if( gap+2+nByte>top ){
  1305  defragment_page:
  1306      testcase( pPage->nCell==0 );
  1307      rc = defragmentPage(pPage);
  1308      if( rc ) return rc;
            /* Re-read the start of the content area, mapping 0 to 65536 */
  1309      top = get2byteNotZero(&data[hdr+5]);
  1310      assert( gap+nByte<=top );
  1311    }
  1312  
  1313  
  1314    /* Allocate memory from the gap in between the cell pointer array
  1315    ** and the cell content area.  The btreeInitPage() call has already
  1316    ** validated the freelist.  Given that the freelist is valid, there
  1317    ** is no way that the allocation can extend off the end of the page.
  1318    ** The assert() below verifies the previous sentence.
  1319    */
  1320    top -= nByte;
  1321    put2byte(&data[hdr+5], top);
  1322    assert( top+nByte <= (int)pPage->pBt->usableSize );
  1323    *pIdx = top;
  1324    return SQLITE_OK;
  1325  }
  1326  
  1327  /*
  1328  ** Return a section of the pPage->aData to the freelist.
  1329  ** The first byte of the new free block is pPage->aData[iStart]
  1330  ** and the size of the block is iSize bytes.
  1331  **
  1332  ** Adjacent freeblocks are coalesced.
  1333  **
  1334  ** Note that even though the freeblock list was checked by btreeInitPage(),
  1335  ** that routine will not detect overlap between cells or freeblocks.  Nor
  1336  ** does it detect cells or freeblocks that encroach into the reserved bytes
  1337  ** at the end of the page.  So do additional corruption checks inside this
  1338  ** routine and return SQLITE_CORRUPT if any problems are found.
        **
        ** On success SQLITE_OK is returned and pPage->nFree is increased by the
        ** original iSize.  Neighbouring freeblocks separated from the new block
        ** by 3 bytes or fewer are merged with it, and the bytes absorbed that way
        ** are subtracted from the fragmented-byte count at data[hdr+7].
  1339  */
  1340  static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
  1341    u16 iPtr;                             /* Address of ptr to next freeblock */
  1342    u16 iFreeBlk;                         /* Address of the next freeblock */
  1343    u8 hdr;                               /* Page header size.  0 or 100 */
  1344    u8 nFrag = 0;                         /* Reduction in fragmentation */
  1345    u16 iOrigSize = iSize;                /* Original value of iSize */
  1346    u32 iLast = pPage->pBt->usableSize-4; /* Largest possible freeblock offset */
  1347    u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
  1348    unsigned char *data = pPage->aData;   /* Page content */
  1349  
  1350    assert( pPage->pBt!=0 );
  1351    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1352    assert( iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
  1353    assert( iEnd <= pPage->pBt->usableSize );
  1354    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1355    assert( iSize>=4 );   /* Minimum cell size is 4 */
  1356    assert( iStart<=iLast );
  1357  
  1358    /* Overwrite deleted information with zeros when the secure_delete
  1359    ** option is enabled */
  1360    if( pPage->pBt->btsFlags & BTS_SECURE_DELETE ){
  1361      memset(&data[iStart], 0, iSize);
  1362    }
  1363  
  1364    /* The list of freeblocks must be in ascending order.  Find the 
  1365    ** spot on the list where iStart should be inserted.
  1366    */
  1367    hdr = pPage->hdrOffset;
  1368    iPtr = hdr + 1;
  1369    if( data[iPtr+1]==0 && data[iPtr]==0 ){
  1370      iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
  1371    }else{
            /* Walk the list until reaching its end or the first freeblock at
            ** or beyond iStart.  Each link must advance by at least 4 bytes. */
  1372      while( (iFreeBlk = get2byte(&data[iPtr]))>0 && iFreeBlk<iStart ){
  1373        if( iFreeBlk<iPtr+4 ) return SQLITE_CORRUPT_BKPT;
  1374        iPtr = iFreeBlk;
  1375      }
  1376      if( iFreeBlk>iLast ) return SQLITE_CORRUPT_BKPT;
  1377      assert( iFreeBlk>iPtr || iFreeBlk==0 );
  1378    
  1379      /* At this point:
  1380      **    iFreeBlk:   First freeblock after iStart, or zero if none
  1381      **    iPtr:       The address of a pointer iFreeBlk
  1382      **
  1383      ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
  1384      */
  1385      if( iFreeBlk && iEnd+3>=iFreeBlk ){
  1386        nFrag = iFreeBlk - iEnd;
  1387        if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_BKPT;
              /* Absorb the following freeblock and step past it in the list */
  1388        iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
  1389        iSize = iEnd - iStart;
  1390        iFreeBlk = get2byte(&data[iFreeBlk]);
  1391      }
  1392    
  1393      /* If iPtr is another freeblock (that is, if iPtr is not the freelist
  1394      ** pointer in the page header) then check to see if iStart should be
  1395      ** coalesced onto the end of iPtr.
  1396      */
  1397      if( iPtr>hdr+1 ){
  1398        int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
  1399        if( iPtrEnd+3>=iStart ){
  1400          if( iPtrEnd>iStart ) return SQLITE_CORRUPT_BKPT;
  1401          nFrag += iStart - iPtrEnd;
                /* The merged block now begins at the preceding freeblock */
  1402          iSize = iEnd - iPtr;
  1403          iStart = iPtr;
  1404        }
  1405      }
            /* Cannot reclaim more fragment bytes than the header records */
  1406      if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_BKPT;
  1407      data[hdr+7] -= nFrag;
  1408    }
  1409    if( iStart==get2byte(&data[hdr+5]) ){
  1410      /* The new freeblock is at the beginning of the cell content area,
  1411      ** so just extend the cell content area rather than create another
  1412      ** freelist entry */
  1413      if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_BKPT;
  1414      put2byte(&data[hdr+1], iFreeBlk);
  1415      put2byte(&data[hdr+5], iEnd);
  1416    }else{
  1417      /* Insert the new freeblock into the freelist */
  1418      put2byte(&data[iPtr], iStart);
  1419      put2byte(&data[iStart], iFreeBlk);
  1420      put2byte(&data[iStart+2], iSize);
  1421    }
          /* Only the bytes actually released by the caller are added to nFree */
  1422    pPage->nFree += iOrigSize;
  1423    return SQLITE_OK;
  1424  }
  1425  
  1426  /*
  1427  ** Decode the flags byte (the first byte of the header) for a page
  1428  ** and initialize fields of the MemPage structure accordingly.
  1429  **
  1430  ** Only the following combinations are supported.  Anything different
  1431  ** indicates a corrupt database file:
  1432  **
  1433  **         PTF_ZERODATA
  1434  **         PTF_ZERODATA | PTF_LEAF
  1435  **         PTF_LEAFDATA | PTF_INTKEY
  1436  **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
        **
        ** Returns SQLITE_OK for a supported combination and SQLITE_CORRUPT_BKPT
        ** otherwise.  The fields set are leaf, childPtrSize, intKey, intKeyLeaf,
        ** noPayload, maxLocal, minLocal and (on success only) max1bytePayload.
  1437  */
  1438  static int decodeFlags(MemPage *pPage, int flagByte){
  1439    BtShared *pBt;     /* A copy of pPage->pBt */
  1440  
  1441    assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
  1442    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
          /* Extract the leaf flag (bit 3), then clear it so that the remaining
          ** bits can be compared against the supported page types. */
  1443    pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
  1444    flagByte &= ~PTF_LEAF;
          /* 4 bytes of child pointer per cell on interior pages, 0 on leaves */
  1445    pPage->childPtrSize = 4-4*pPage->leaf;
  1446    pBt = pPage->pBt;
  1447    if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
            /* Integer-key page: leaves carry payload, interior pages do not.
            ** Local payload limits come from pBt->maxLeaf and pBt->minLeaf. */
  1448      pPage->intKey = 1;
  1449      pPage->intKeyLeaf = pPage->leaf;
  1450      pPage->noPayload = !pPage->leaf;
  1451      pPage->maxLocal = pBt->maxLeaf;
  1452      pPage->minLocal = pBt->minLeaf;
  1453    }else if( flagByte==PTF_ZERODATA ){
            /* Non-integer-key page: every cell carries payload.  Local payload
            ** limits come from pBt->maxLocal and pBt->minLocal. */
  1454      pPage->intKey = 0;
  1455      pPage->intKeyLeaf = 0;
  1456      pPage->noPayload = 0;
  1457      pPage->maxLocal = pBt->maxLocal;
  1458      pPage->minLocal = pBt->minLocal;
  1459    }else{
  1460      return SQLITE_CORRUPT_BKPT;
  1461    }
  1462    pPage->max1bytePayload = pBt->max1bytePayload;
  1463    return SQLITE_OK;
  1464  }
  1465  
  1466  /*
  1467  ** Initialize the auxiliary information for a disk block.
  1468  **
  1469  ** Return SQLITE_OK on success.  If we see that the page does
  1470  ** not contain a well-formed database page, then return 
  1471  ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
  1472  ** guarantee that the page is well-formed.  It only shows that
  1473  ** we failed to detect any corruption.
  1474  */
  1475  static int btreeInitPage(MemPage *pPage){
  1476  
  1477    assert( pPage->pBt!=0 );
  1478    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1479    assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
  1480    assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
  1481    assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
  1482  
  1483    if( !pPage->isInit ){
  1484      u16 pc;            /* Address of a freeblock within pPage->aData[] */
  1485      u8 hdr;            /* Offset to beginning of page header */
  1486      u8 *data;          /* Equal to pPage->aData */
  1487      BtShared *pBt;        /* The main btree structure */
  1488      int usableSize;    /* Amount of usable space on each page */
  1489      u16 cellOffset;    /* Offset from start of page to first cell pointer */
  1490      int nFree;         /* Number of unused bytes on the page */
  1491      int top;           /* First byte of the cell content area */
  1492      int iCellFirst;    /* First allowable cell or freeblock offset */
  1493      int iCellLast;     /* Last possible cell or freeblock offset */
  1494  
  1495      pBt = pPage->pBt;
  1496  
  1497      hdr = pPage->hdrOffset;
  1498      data = pPage->aData;
  1499      if( decodeFlags(pPage, data[hdr]) ) return SQLITE_CORRUPT_BKPT;
  1500      assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  1501      pPage->maskPage = (u16)(pBt->pageSize - 1);
  1502      pPage->nOverflow = 0;
  1503      usableSize = pBt->usableSize;
  1504      pPage->cellOffset = cellOffset = hdr + 12 - 4*pPage->leaf;
  1505      pPage->aDataEnd = &data[usableSize];
  1506      pPage->aCellIdx = &data[cellOffset];
  1507      top = get2byteNotZero(&data[hdr+5]);
  1508      pPage->nCell = get2byte(&data[hdr+3]);
  1509      if( pPage->nCell>MX_CELL(pBt) ){
  1510        /* Too many cells for a single page.  The page must be corrupt */
  1511        return SQLITE_CORRUPT_BKPT;
  1512      }
  1513      testcase( pPage->nCell==MX_CELL(pBt) );
  1514  
  1515      /* A malformed database page might cause us to read past the end
  1516      ** of page when parsing a cell.  
  1517      **
  1518      ** The following block of code checks early to see if a cell extends
  1519      ** past the end of a page boundary and causes SQLITE_CORRUPT to be 
  1520      ** returned if it does.
  1521      */
  1522      iCellFirst = cellOffset + 2*pPage->nCell;
  1523      iCellLast = usableSize - 4;
  1524  #if defined(SQLITE_ENABLE_OVERSIZE_CELL_CHECK)
  1525      {
  1526        int i;            /* Index into the cell pointer array */
  1527        int sz;           /* Size of a cell */
  1528  
  1529        if( !pPage->leaf ) iCellLast--;
  1530        for(i=0; i<pPage->nCell; i++){
  1531          pc = get2byte(&data[cellOffset+i*2]);
  1532          testcase( pc==iCellFirst );
  1533          testcase( pc==iCellLast );
  1534          if( pc<iCellFirst || pc>iCellLast ){
  1535            return SQLITE_CORRUPT_BKPT;
  1536          }
  1537          sz = cellSizePtr(pPage, &data[pc]);
  1538          testcase( pc+sz==usableSize );
  1539          if( pc+sz>usableSize ){
  1540            return SQLITE_CORRUPT_BKPT;
  1541          }
  1542        }
  1543        if( !pPage->leaf ) iCellLast++;
  1544      }  
  1545  #endif
  1546  
  1547      /* Compute the total free space on the page */
  1548      pc = get2byte(&data[hdr+1]);
  1549      nFree = data[hdr+7] + top;
  1550      while( pc>0 ){
  1551        u16 next, size;
  1552        if( pc<iCellFirst || pc>iCellLast ){
  1553          /* Start of free block is off the page */
  1554          return SQLITE_CORRUPT_BKPT; 
  1555        }
  1556        next = get2byte(&data[pc]);
  1557        size = get2byte(&data[pc+2]);
  1558        if( (next>0 && next<=pc+size+3) || pc+size>usableSize ){
  1559          /* Free blocks must be in ascending order. And the last byte of
  1560          ** the free-block must lie on the database page.  */
  1561          return SQLITE_CORRUPT_BKPT; 
  1562        }
  1563        nFree = nFree + size;
  1564        pc = next;
  1565      }
  1566  
  1567      /* At this point, nFree contains the sum of the offset to the start
  1568      ** of the cell-content area plus the number of free bytes within
  1569      ** the cell-content area. If this is greater than the usable-size
  1570      ** of the page, then the page must be corrupted. This check also
  1571      ** serves to verify that the offset to the start of the cell-content
  1572      ** area, according to the page header, lies within the page.
  1573      */
  1574      if( nFree>usableSize ){
  1575        return SQLITE_CORRUPT_BKPT; 
  1576      }
  1577      pPage->nFree = (u16)(nFree - iCellFirst);
  1578      pPage->isInit = 1;
  1579    }
  1580    return SQLITE_OK;
  1581  }
  1582  
  1583  /*
  1584  ** Set up a raw page so that it looks like a database page holding
  1585  ** no entries.
  1586  */
  1587  static void zeroPage(MemPage *pPage, int flags){
  1588    unsigned char *data = pPage->aData;
  1589    BtShared *pBt = pPage->pBt;
  1590    u8 hdr = pPage->hdrOffset;
  1591    u16 first;
  1592  
  1593    assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
  1594    assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1595    assert( sqlite3PagerGetData(pPage->pDbPage) == data );
  1596    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  1597    assert( sqlite3_mutex_held(pBt->mutex) );
  1598    if( pBt->btsFlags & BTS_SECURE_DELETE ){
  1599      memset(&data[hdr], 0, pBt->usableSize - hdr);
  1600    }
  1601    data[hdr] = (char)flags;
  1602    first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
  1603    memset(&data[hdr+1], 0, 4);
  1604    data[hdr+7] = 0;
  1605    put2byte(&data[hdr+5], pBt->usableSize);
  1606    pPage->nFree = (u16)(pBt->usableSize - first);
  1607    decodeFlags(pPage, flags);
  1608    pPage->cellOffset = first;
  1609    pPage->aDataEnd = &data[pBt->usableSize];
  1610    pPage->aCellIdx = &data[first];
  1611    pPage->nOverflow = 0;
  1612    assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
  1613    pPage->maskPage = (u16)(pBt->pageSize - 1);
  1614    pPage->nCell = 0;
  1615    pPage->isInit = 1;
  1616  }
  1617  
  1618  
  1619  /*
  1620  ** Convert a DbPage obtained from the pager into a MemPage used by
  1621  ** the btree layer.
  1622  */
  1623  static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
  1624    MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
  1625    pPage->aData = sqlite3PagerGetData(pDbPage);
  1626    pPage->pDbPage = pDbPage;
  1627    pPage->pBt = pBt;
  1628    pPage->pgno = pgno;
  1629    pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
  1630    return pPage; 
  1631  }
  1632  
  1633  /*
  1634  ** Get a page from the pager.  Initialize the MemPage.pBt and
  1635  ** MemPage.aData elements if needed.
  1636  **
  1637  ** If the noContent flag is set, it means that we do not care about
  1638  ** the content of the page at this time.  So do not go to the disk
  1639  ** to fetch the content.  Just fill in the content with zeros for now.
  1640  ** If in the future we call sqlite3PagerWrite() on this page, that
  1641  ** means we have started to be concerned about content and the disk
  1642  ** read should occur at that point.
  1643  */
  1644  static int btreeGetPage(
  1645    BtShared *pBt,       /* The btree */
  1646    Pgno pgno,           /* Number of the page to fetch */
  1647    MemPage **ppPage,    /* Return the page in this parameter */
  1648    int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
  1649  ){
  1650    int rc;
  1651    DbPage *pDbPage;
  1652  
  1653    assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
  1654    assert( sqlite3_mutex_held(pBt->mutex) );
  1655    rc = sqlite3PagerAcquire(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
  1656    if( rc ) return rc;
  1657    *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
  1658    return SQLITE_OK;
  1659  }
  1660  
  1661  /*
  1662  ** Retrieve a page from the pager cache. If the requested page is not
  1663  ** already in the pager cache return NULL. Initialize the MemPage.pBt and
  1664  ** MemPage.aData elements if needed.
  1665  */
  1666  static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
  1667    DbPage *pDbPage;
  1668    assert( sqlite3_mutex_held(pBt->mutex) );
  1669    pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
  1670    if( pDbPage ){
  1671      return btreePageFromDbPage(pDbPage, pgno, pBt);
  1672    }
  1673    return 0;
  1674  }
  1675  
  1676  /*
  1677  ** Return the size of the database file in pages. If there is any kind of
  1678  ** error, return ((unsigned int)-1).
  1679  */
  1680  static Pgno btreePagecount(BtShared *pBt){
  1681    return pBt->nPage;
  1682  }
  1683  u32 sqlite3BtreeLastPage(Btree *p){
  1684    assert( sqlite3BtreeHoldsMutex(p) );
  1685    assert( ((p->pBt->nPage)&0x8000000)==0 );
  1686    return btreePagecount(p->pBt);
  1687  }
  1688  
  1689  /*
  1690  ** Get a page from the pager and initialize it.  This routine is just a
  1691  ** convenience wrapper around separate calls to btreeGetPage() and 
  1692  ** btreeInitPage().
  1693  **
  1694  ** If an error occurs, then the value *ppPage is set to is undefined. It
  1695  ** may remain unchanged, or it may be set to an invalid value.
  1696  */
  1697  static int getAndInitPage(
  1698    BtShared *pBt,                  /* The database file */
  1699    Pgno pgno,                      /* Number of the page to get */
  1700    MemPage **ppPage,               /* Write the page pointer here */
  1701    int bReadonly                   /* PAGER_GET_READONLY or 0 */
  1702  ){
  1703    int rc;
  1704    assert( sqlite3_mutex_held(pBt->mutex) );
  1705    assert( bReadonly==PAGER_GET_READONLY || bReadonly==0 );
  1706  
  1707    if( pgno>btreePagecount(pBt) ){
  1708      rc = SQLITE_CORRUPT_BKPT;
  1709    }else{
  1710      rc = btreeGetPage(pBt, pgno, ppPage, bReadonly);
  1711      if( rc==SQLITE_OK && (*ppPage)->isInit==0 ){
  1712        rc = btreeInitPage(*ppPage);
  1713        if( rc!=SQLITE_OK ){
  1714          releasePage(*ppPage);
  1715        }
  1716      }
  1717    }
  1718  
  1719    testcase( pgno==0 );
  1720    assert( pgno!=0 || rc==SQLITE_CORRUPT );
  1721    return rc;
  1722  }
  1723  
  1724  /*
  1725  ** Release a MemPage.  This should be called once for each prior
  1726  ** call to btreeGetPage.
  1727  */
  1728  static void releasePage(MemPage *pPage){
  1729    if( pPage ){
  1730      assert( pPage->aData );
  1731      assert( pPage->pBt );
  1732      assert( pPage->pDbPage!=0 );
  1733      assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  1734      assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  1735      assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1736      sqlite3PagerUnrefNotNull(pPage->pDbPage);
  1737    }
  1738  }
  1739  
  1740  /*
  1741  ** During a rollback, when the pager reloads information into the cache
  1742  ** so that the cache is restored to its original state at the start of
  1743  ** the transaction, for each page restored this routine is called.
  1744  **
  1745  ** This routine needs to reset the extra data section at the end of the
  1746  ** page to agree with the restored data.
  1747  */
  1748  static void pageReinit(DbPage *pData){
  1749    MemPage *pPage;
  1750    pPage = (MemPage *)sqlite3PagerGetExtra(pData);
  1751    assert( sqlite3PagerPageRefcount(pData)>0 );
  1752    if( pPage->isInit ){
  1753      assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  1754      pPage->isInit = 0;
  1755      if( sqlite3PagerPageRefcount(pData)>1 ){
  1756        /* pPage might not be a btree page;  it might be an overflow page
  1757        ** or ptrmap page or a free page.  In those cases, the following
  1758        ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
  1759        ** But no harm is done by this.  And it is very important that
  1760        ** btreeInitPage() be called on every btree page so we make
  1761        ** the call for every page that comes in for re-initing. */
  1762        btreeInitPage(pPage);
  1763      }
  1764    }
  1765  }
  1766  
  1767  /*
  1768  ** Invoke the busy handler for a btree.
  1769  */
  1770  static int btreeInvokeBusyHandler(void *pArg){
  1771    BtShared *pBt = (BtShared*)pArg;
  1772    assert( pBt->db );
  1773    assert( sqlite3_mutex_held(pBt->db->mutex) );
  1774    return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
  1775  }
  1776  
  1777  /*
  1778  ** Open a database file.
  1779  ** 
  1780  ** zFilename is the name of the database file.  If zFilename is NULL
  1781  ** then an ephemeral database is created.  The ephemeral database might
  1782  ** be exclusively in memory, or it might use a disk-based memory cache.
  1783  ** Either way, the ephemeral database will be automatically deleted 
  1784  ** when sqlite3BtreeClose() is called.
  1785  **
  1786  ** If zFilename is ":memory:" then an in-memory database is created
  1787  ** that is automatically destroyed when it is closed.
  1788  **
  1789  ** The "flags" parameter is a bitmask that might contain bits like
  1790  ** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.
  1791  **
  1792  ** If the database is already opened in the same database connection
  1793  ** and we are in shared cache mode, then the open will fail with an
  1794  ** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
  1795  ** objects in the same database connection since doing so will lead
  1796  ** to problems with locking.
  1797  */
  1798  int sqlite3BtreeOpen(
  1799    sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
  1800    const char *zFilename,  /* Name of the file containing the BTree database */
  1801    sqlite3 *db,            /* Associated database handle */
  1802    Btree **ppBtree,        /* Pointer to new Btree object written here */
  1803    int flags,              /* Options */
  1804    int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
  1805  ){
  1806    BtShared *pBt = 0;             /* Shared part of btree structure */
  1807    Btree *p;                      /* Handle to return */
  1808    sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
  1809    int rc = SQLITE_OK;            /* Result code from this function */
  1810    u8 nReserve;                   /* Byte of unused space on each page */
  1811    unsigned char zDbHeader[100];  /* Database header content */
  1812  
  1813    /* True if opening an ephemeral, temporary database */
  1814    const int isTempDb = zFilename==0 || zFilename[0]==0;
  1815  
  1816    /* Set the variable isMemdb to true for an in-memory database, or 
  1817    ** false for a file-based database.
  1818    */
  1819  #ifdef SQLITE_OMIT_MEMORYDB
  1820    const int isMemdb = 0;
  1821  #else
  1822    const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
  1823                         || (isTempDb && sqlite3TempInMemory(db))
  1824                         || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
  1825  #endif
  1826  
  1827    assert( db!=0 );
  1828    assert( pVfs!=0 );
  1829    assert( sqlite3_mutex_held(db->mutex) );
  1830    assert( (flags&0xff)==flags );   /* flags fit in 8 bits */
  1831  
  1832    /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
  1833    assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );
  1834  
  1835    /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
  1836    assert( (flags & BTREE_SINGLE)==0 || isTempDb );
  1837  
  1838    if( isMemdb ){
  1839      flags |= BTREE_MEMORY;
  1840    }
  1841    if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
  1842      vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
  1843    }
  1844    p = sqlite3MallocZero(sizeof(Btree));
  1845    if( !p ){
  1846      return SQLITE_NOMEM;
  1847    }
  1848    p->inTrans = TRANS_NONE;
  1849    p->db = db;
  1850  #ifndef SQLITE_OMIT_SHARED_CACHE
  1851    p->lock.pBtree = p;
  1852    p->lock.iTable = 1;
  1853  #endif
  1854  
  1855  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1856    /*
  1857    ** If this Btree is a candidate for shared cache, try to find an
  1858    ** existing BtShared object that we can share with
  1859    */
  1860    if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
  1861      if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
  1862        int nFullPathname = pVfs->mxPathname+1;
  1863        char *zFullPathname = sqlite3Malloc(nFullPathname);
  1864        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
  1865        p->sharable = 1;
  1866        if( !zFullPathname ){
  1867          sqlite3_free(p);
  1868          return SQLITE_NOMEM;
  1869        }
  1870        if( isMemdb ){
  1871          memcpy(zFullPathname, zFilename, sqlite3Strlen30(zFilename)+1);
  1872        }else{
  1873          rc = sqlite3OsFullPathname(pVfs, zFilename,
  1874                                     nFullPathname, zFullPathname);
  1875          if( rc ){
  1876            sqlite3_free(zFullPathname);
  1877            sqlite3_free(p);
  1878            return rc;
  1879          }
  1880        }
  1881  #if SQLITE_THREADSAFE
  1882        mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
  1883        sqlite3_mutex_enter(mutexOpen);
  1884        mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);
  1885        sqlite3_mutex_enter(mutexShared);
  1886  #endif
  1887        for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
  1888          assert( pBt->nRef>0 );
  1889          if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
  1890                   && sqlite3PagerVfs(pBt->pPager)==pVfs ){
  1891            int iDb;
  1892            for(iDb=db->nDb-1; iDb>=0; iDb--){
  1893              Btree *pExisting = db->aDb[iDb].pBt;
  1894              if( pExisting && pExisting->pBt==pBt ){
  1895                sqlite3_mutex_leave(mutexShared);
  1896                sqlite3_mutex_leave(mutexOpen);
  1897                sqlite3_free(zFullPathname);
  1898                sqlite3_free(p);
  1899                return SQLITE_CONSTRAINT;
  1900              }
  1901            }
  1902            p->pBt = pBt;
  1903            pBt->nRef++;
  1904            break;
  1905          }
  1906        }
  1907        sqlite3_mutex_leave(mutexShared);
  1908        sqlite3_free(zFullPathname);
  1909      }
  1910  #ifdef SQLITE_DEBUG
  1911      else{
  1912        /* In debug mode, we mark all persistent databases as sharable
  1913        ** even when they are not.  This exercises the locking code and
  1914        ** gives more opportunity for asserts(sqlite3_mutex_held())
  1915        ** statements to find locking problems.
  1916        */
  1917        p->sharable = 1;
  1918      }
  1919  #endif
  1920    }
  1921  #endif
  1922    if( pBt==0 ){
  1923      /*
  1924      ** The following asserts make sure that structures used by the btree are
  1925      ** the right size.  This is to guard against size changes that result
  1926      ** when compiling on a different architecture.
  1927      */
  1928      assert( sizeof(i64)==8 || sizeof(i64)==4 );
  1929      assert( sizeof(u64)==8 || sizeof(u64)==4 );
  1930      assert( sizeof(u32)==4 );
  1931      assert( sizeof(u16)==2 );
  1932      assert( sizeof(Pgno)==4 );
  1933    
  1934      pBt = sqlite3MallocZero( sizeof(*pBt) );
  1935      if( pBt==0 ){
  1936        rc = SQLITE_NOMEM;
  1937        goto btree_open_out;
  1938      }
  1939      rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
  1940                            EXTRA_SIZE, flags, vfsFlags, pageReinit);
  1941      if( rc==SQLITE_OK ){
  1942        sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
  1943        rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
  1944      }
  1945      if( rc!=SQLITE_OK ){
  1946        goto btree_open_out;
  1947      }
  1948      pBt->openFlags = (u8)flags;
  1949      pBt->db = db;
  1950      sqlite3PagerSetBusyhandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
  1951      p->pBt = pBt;
  1952    
  1953      pBt->pCursor = 0;
  1954      pBt->pPage1 = 0;
  1955      if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
  1956  #ifdef SQLITE_SECURE_DELETE
  1957      pBt->btsFlags |= BTS_SECURE_DELETE;
  1958  #endif
  1959      pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
  1960      if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
  1961           || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
  1962        pBt->pageSize = 0;
  1963  #ifndef SQLITE_OMIT_AUTOVACUUM
  1964        /* If the magic name ":memory:" will create an in-memory database, then
  1965        ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
  1966        ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
  1967        ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
  1968        ** regular file-name. In this case the auto-vacuum applies as per normal.
  1969        */
  1970        if( zFilename && !isMemdb ){
  1971          pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
  1972          pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
  1973        }
  1974  #endif
  1975        nReserve = 0;
  1976      }else{
  1977        nReserve = zDbHeader[20];
  1978        pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  1979  #ifndef SQLITE_OMIT_AUTOVACUUM
  1980        pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
  1981        pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
  1982  #endif
  1983      }
  1984      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
  1985      if( rc ) goto btree_open_out;
  1986      pBt->usableSize = pBt->pageSize - nReserve;
  1987      assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */
  1988     
  1989  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  1990      /* Add the new BtShared object to the linked list sharable BtShareds.
  1991      */
  1992      if( p->sharable ){
  1993        MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
  1994        pBt->nRef = 1;
  1995        MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER);)
  1996        if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
  1997          pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
  1998          if( pBt->mutex==0 ){
  1999            rc = SQLITE_NOMEM;
  2000            db->mallocFailed = 0;
  2001            goto btree_open_out;
  2002          }
  2003        }
  2004        sqlite3_mutex_enter(mutexShared);
  2005        pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
  2006        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
  2007        sqlite3_mutex_leave(mutexShared);
  2008      }
  2009  #endif
  2010    }
  2011  
  2012  #if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  2013    /* If the new Btree uses a sharable pBtShared, then link the new
  2014    ** Btree into the list of all sharable Btrees for the same connection.
  2015    ** The list is kept in ascending order by pBt address.
  2016    */
  2017    if( p->sharable ){
  2018      int i;
  2019      Btree *pSib;
  2020      for(i=0; i<db->nDb; i++){
  2021        if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
  2022          while( pSib->pPrev ){ pSib = pSib->pPrev; }
  2023          if( p->pBt<pSib->pBt ){
  2024            p->pNext = pSib;
  2025            p->pPrev = 0;
  2026            pSib->pPrev = p;
  2027          }else{
  2028            while( pSib->pNext && pSib->pNext->pBt<p->pBt ){
  2029              pSib = pSib->pNext;
  2030            }
  2031            p->pNext = pSib->pNext;
  2032            p->pPrev = pSib;
  2033            if( p->pNext ){
  2034              p->pNext->pPrev = p;
  2035            }
  2036            pSib->pNext = p;
  2037          }
  2038          break;
  2039        }
  2040      }
  2041    }
  2042  #endif
  2043    *ppBtree = p;
  2044  
  2045  btree_open_out:
  2046    if( rc!=SQLITE_OK ){
  2047      if( pBt && pBt->pPager ){
  2048        sqlite3PagerClose(pBt->pPager);
  2049      }
  2050      sqlite3_free(pBt);
  2051      sqlite3_free(p);
  2052      *ppBtree = 0;
  2053    }else{
  2054      /* If the B-Tree was successfully opened, set the pager-cache size to the
  2055      ** default value. Except, when opening on an existing shared pager-cache,
  2056      ** do not change the pager-cache size.
  2057      */
  2058      if( sqlite3BtreeSchema(p, 0, 0)==0 ){
  2059        sqlite3PagerSetCachesize(p->pBt->pPager, SQLITE_DEFAULT_CACHE_SIZE);
  2060      }
  2061    }
  2062    if( mutexOpen ){
  2063      assert( sqlite3_mutex_held(mutexOpen) );
  2064      sqlite3_mutex_leave(mutexOpen);
  2065    }
  2066    return rc;
  2067  }
  2068  
  2069  /*
  2070  ** Decrement the BtShared.nRef counter.  When it reaches zero,
  2071  ** remove the BtShared structure from the sharing list.  Return
  2072  ** true if the BtShared.nRef counter reaches zero and return
  2073  ** false if it is still positive.
  2074  */
  2075  static int removeFromSharingList(BtShared *pBt){
  2076  #ifndef SQLITE_OMIT_SHARED_CACHE
  2077    MUTEX_LOGIC( sqlite3_mutex *pMaster; )
  2078    BtShared *pList;
  2079    int removed = 0;
  2080  
  2081    assert( sqlite3_mutex_notheld(pBt->mutex) );
  2082    MUTEX_LOGIC( pMaster = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER); )
  2083    sqlite3_mutex_enter(pMaster);
  2084    pBt->nRef--;
  2085    if( pBt->nRef<=0 ){
  2086      if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
  2087        GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
  2088      }else{
  2089        pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
  2090        while( ALWAYS(pList) && pList->pNext!=pBt ){
  2091          pList=pList->pNext;
  2092        }
  2093        if( ALWAYS(pList) ){
  2094          pList->pNext = pBt->pNext;
  2095        }
  2096      }
  2097      if( SQLITE_THREADSAFE ){
  2098        sqlite3_mutex_free(pBt->mutex);
  2099      }
  2100      removed = 1;
  2101    }
  2102    sqlite3_mutex_leave(pMaster);
  2103    return removed;
  2104  #else
  2105    return 1;
  2106  #endif
  2107  }
  2108  
  2109  /*
  2110  ** Make sure pBt->pTmpSpace points to an allocation of 
  2111  ** MX_CELL_SIZE(pBt) bytes.
  2112  */
  2113  static void allocateTempSpace(BtShared *pBt){
  2114    if( !pBt->pTmpSpace ){
  2115      pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
  2116  
  2117      /* One of the uses of pBt->pTmpSpace is to format cells before
  2118      ** inserting them into a leaf page (function fillInCell()). If
  2119      ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
  2120      ** by the various routines that manipulate binary cells. Which
  2121      ** can mean that fillInCell() only initializes the first 2 or 3
  2122      ** bytes of pTmpSpace, but that the first 4 bytes are copied from
  2123      ** it into a database page. This is not actually a problem, but it
  2124      ** does cause a valgrind error when the 1 or 2 bytes of unitialized 
  2125      ** data is passed to system call write(). So to avoid this error,
  2126      ** zero the first 4 bytes of temp space here.  */
  2127      if( pBt->pTmpSpace ) memset(pBt->pTmpSpace, 0, 4);
  2128    }
  2129  }
  2130  
  2131  /*
  2132  ** Free the pBt->pTmpSpace allocation
  2133  */
  2134  static void freeTempSpace(BtShared *pBt){
  2135    sqlite3PageFree( pBt->pTmpSpace);
  2136    pBt->pTmpSpace = 0;
  2137  }
  2138  
  2139  /*
  2140  ** Close an open database and invalidate all cursors.
  2141  */
  2142  int sqlite3BtreeClose(Btree *p){
  2143    BtShared *pBt = p->pBt;
  2144    BtCursor *pCur;
  2145  
  2146    /* Close all cursors opened via this handle.  */
  2147    assert( sqlite3_mutex_held(p->db->mutex) );
  2148    sqlite3BtreeEnter(p);
  2149    pCur = pBt->pCursor;
  2150    while( pCur ){
  2151      BtCursor *pTmp = pCur;
  2152      pCur = pCur->pNext;
  2153      if( pTmp->pBtree==p ){
  2154        sqlite3BtreeCloseCursor(pTmp);
  2155      }
  2156    }
  2157  
  2158    /* Rollback any active transaction and free the handle structure.
  2159    ** The call to sqlite3BtreeRollback() drops any table-locks held by
  2160    ** this handle.
  2161    */
  2162    sqlite3BtreeRollback(p, SQLITE_OK);
  2163    sqlite3BtreeLeave(p);
  2164  
  2165    /* If there are still other outstanding references to the shared-btree
  2166    ** structure, return now. The remainder of this procedure cleans 
  2167    ** up the shared-btree.
  2168    */
  2169    assert( p->wantToLock==0 && p->locked==0 );
  2170    if( !p->sharable || removeFromSharingList(pBt) ){
  2171      /* The pBt is no longer on the sharing list, so we can access
  2172      ** it without having to hold the mutex.
  2173      **
  2174      ** Clean out and delete the BtShared object.
  2175      */
  2176      assert( !pBt->pCursor );
  2177      sqlite3PagerClose(pBt->pPager);
  2178      if( pBt->xFreeSchema && pBt->pSchema ){
  2179        pBt->xFreeSchema(pBt->pSchema);
  2180      }
  2181      sqlite3DbFree(0, pBt->pSchema);
  2182      freeTempSpace(pBt);
  2183      sqlite3_free(pBt);
  2184    }
  2185  
  2186  #ifndef SQLITE_OMIT_SHARED_CACHE
  2187    assert( p->wantToLock==0 );
  2188    assert( p->locked==0 );
  2189    if( p->pPrev ) p->pPrev->pNext = p->pNext;
  2190    if( p->pNext ) p->pNext->pPrev = p->pPrev;
  2191  #endif
  2192  
  2193    sqlite3_free(p);
  2194    return SQLITE_OK;
  2195  }
  2196  
  2197  /*
  2198  ** Change the limit on the number of pages allowed in the cache.
  2199  **
  2200  ** The maximum number of cache pages is set to the absolute
  2201  ** value of mxPage.  If mxPage is negative, the pager will
  2202  ** operate asynchronously - it will not stop to do fsync()s
  2203  ** to insure data is written to the disk surface before
  2204  ** continuing.  Transactions still work if synchronous is off,
  2205  ** and the database cannot be corrupted if this program
  2206  ** crashes.  But if the operating system crashes or there is
  2207  ** an abrupt power failure when synchronous is off, the database
  2208  ** could be left in an inconsistent and unrecoverable state.
  2209  ** Synchronous is on by default so database corruption is not
  2210  ** normally a worry.
  2211  */
  2212  int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
  2213    BtShared *pBt = p->pBt;
  2214    assert( sqlite3_mutex_held(p->db->mutex) );
  2215    sqlite3BtreeEnter(p);
  2216    sqlite3PagerSetCachesize(pBt->pPager, mxPage);
  2217    sqlite3BtreeLeave(p);
  2218    return SQLITE_OK;
  2219  }
  2220  
  #if SQLITE_MAX_MMAP_SIZE>0
  /*
  ** Set the upper bound, in bytes, on how much of the database file may
  ** be memory mapped.  szMmap is forwarded to the pager while the b-tree
  ** mutex is held.  The caller must hold the database connection mutex.
  ** Always returns SQLITE_OK.
  */
  int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
    assert( sqlite3_mutex_held(p->db->mutex) );
    sqlite3BtreeEnter(p);
    sqlite3PagerSetMmapLimit(p->pBt->pPager, szMmap);
    sqlite3BtreeLeave(p);
    return SQLITE_OK;
  }
  #endif /* SQLITE_MAX_MMAP_SIZE>0 */
  2235  
  2236  /*
  2237  ** Change the way data is synced to disk in order to increase or decrease
  2238  ** how well the database resists damage due to OS crashes and power
  2239  ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
  2240  ** there is a high probability of damage)  Level 2 is the default.  There
  2241  ** is a very low but non-zero probability of damage.  Level 3 reduces the
  2242  ** probability of damage to near zero but with a write performance reduction.
  2243  */
  2244  #ifndef SQLITE_OMIT_PAGER_PRAGMAS
  2245  int sqlite3BtreeSetPagerFlags(
  2246    Btree *p,              /* The btree to set the safety level on */
  2247    unsigned pgFlags       /* Various PAGER_* flags */
  2248  ){
  2249    BtShared *pBt = p->pBt;
  2250    assert( sqlite3_mutex_held(p->db->mutex) );
  2251    sqlite3BtreeEnter(p);
  2252    sqlite3PagerSetFlags(pBt->pPager, pgFlags);
  2253    sqlite3BtreeLeave(p);
  2254    return SQLITE_OK;
  2255  }
  2256  #endif
  2257  
  2258  /*
  2259  ** Return TRUE if the given btree is set to safety level 1.  In other
  2260  ** words, return TRUE if no sync() occurs on the disk files.
  2261  */
  2262  int sqlite3BtreeSyncDisabled(Btree *p){
  2263    BtShared *pBt = p->pBt;
  2264    int rc;
  2265    assert( sqlite3_mutex_held(p->db->mutex) );  
  2266    sqlite3BtreeEnter(p);
  2267    assert( pBt && pBt->pPager );
  2268    rc = sqlite3PagerNosync(pBt->pPager);
  2269    sqlite3BtreeLeave(p);
  2270    return rc;
  2271  }
  2272  
  2273  /*
  2274  ** Change the default pages size and the number of reserved bytes per page.
  2275  ** Or, if the page size has already been fixed, return SQLITE_READONLY 
  2276  ** without changing anything.
  2277  **
  2278  ** The page size must be a power of 2 between 512 and 65536.  If the page
  2279  ** size supplied does not meet this constraint then the page size is not
  2280  ** changed.
  2281  **
  2282  ** Page sizes are constrained to be a power of two so that the region
  2283  ** of the database file used for locking (beginning at PENDING_BYTE,
  2284  ** the first byte past the 1GB boundary, 0x40000000) needs to occur
  2285  ** at the beginning of a page.
  2286  **
  2287  ** If parameter nReserve is less than zero, then the number of reserved
  2288  ** bytes per page is left unchanged.
  2289  **
  2290  ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
  2291  ** and autovacuum mode can no longer be changed.
  2292  */
  2293  int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
  2294    int rc = SQLITE_OK;
  2295    BtShared *pBt = p->pBt;
  2296    assert( nReserve>=-1 && nReserve<=255 );
  2297    sqlite3BtreeEnter(p);
  2298    if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
  2299      sqlite3BtreeLeave(p);
  2300      return SQLITE_READONLY;
  2301    }
  2302    if( nReserve<0 ){
  2303      nReserve = pBt->pageSize - pBt->usableSize;
  2304    }
  2305    assert( nReserve>=0 && nReserve<=255 );
  2306    if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
  2307          ((pageSize-1)&pageSize)==0 ){
  2308      assert( (pageSize & 7)==0 );
  2309      assert( !pBt->pPage1 && !pBt->pCursor );
  2310      pBt->pageSize = (u32)pageSize;
  2311      freeTempSpace(pBt);
  2312    }
  2313    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
  2314    pBt->usableSize = pBt->pageSize - (u16)nReserve;
  2315    if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  2316    sqlite3BtreeLeave(p);
  2317    return rc;
  2318  }
  2319  
  2320  /*
  2321  ** Return the currently defined page size
  2322  */
  2323  int sqlite3BtreeGetPageSize(Btree *p){
  2324    return p->pBt->pageSize;
  2325  }
  2326  
  2327  #if defined(SQLITE_HAS_CODEC) || defined(SQLITE_DEBUG)
  2328  /*
  2329  ** This function is similar to sqlite3BtreeGetReserve(), except that it
  2330  ** may only be called if it is guaranteed that the b-tree mutex is already
  2331  ** held.
  2332  **
  2333  ** This is useful in one special case in the backup API code where it is
  2334  ** known that the shared b-tree mutex is held, but the mutex on the 
  2335  ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
  2336  ** were to be called, it might collide with some other operation on the
  2337  ** database handle that owns *p, causing undefined behavior.
  2338  */
  2339  int sqlite3BtreeGetReserveNoMutex(Btree *p){
  2340    assert( sqlite3_mutex_held(p->pBt->mutex) );
  2341    return p->pBt->pageSize - p->pBt->usableSize;
  2342  }
  2343  #endif /* SQLITE_HAS_CODEC || SQLITE_DEBUG */
  2344  
  2345  #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
  2346  /*
  2347  ** Return the number of bytes of space at the end of every page that
  2348  ** are intentually left unused.  This is the "reserved" space that is
  2349  ** sometimes used by extensions.
  2350  */
  2351  int sqlite3BtreeGetReserve(Btree *p){
  2352    int n;
  2353    sqlite3BtreeEnter(p);
  2354    n = p->pBt->pageSize - p->pBt->usableSize;
  2355    sqlite3BtreeLeave(p);
  2356    return n;
  2357  }
  2358  
  2359  /*
  2360  ** Set the maximum page count for a database if mxPage is positive.
  2361  ** No changes are made if mxPage is 0 or negative.
  2362  ** Regardless of the value of mxPage, return the maximum page count.
  2363  */
  2364  int sqlite3BtreeMaxPageCount(Btree *p, int mxPage){
  2365    int n;
  2366    sqlite3BtreeEnter(p);
  2367    n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
  2368    sqlite3BtreeLeave(p);
  2369    return n;
  2370  }
  2371  
  2372  /*
  2373  ** Set the BTS_SECURE_DELETE flag if newFlag is 0 or 1.  If newFlag is -1,
  2374  ** then make no changes.  Always return the value of the BTS_SECURE_DELETE
  2375  ** setting after the change.
  2376  */
  2377  int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
  2378    int b;
  2379    if( p==0 ) return 0;
  2380    sqlite3BtreeEnter(p);
  2381    if( newFlag>=0 ){
  2382      p->pBt->btsFlags &= ~BTS_SECURE_DELETE;
  2383      if( newFlag ) p->pBt->btsFlags |= BTS_SECURE_DELETE;
  2384    } 
  2385    b = (p->pBt->btsFlags & BTS_SECURE_DELETE)!=0;
  2386    sqlite3BtreeLeave(p);
  2387    return b;
  2388  }
  2389  #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
  2390  
  2391  /*
  2392  ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
  2393  ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
  2394  ** is disabled. The default value for the auto-vacuum property is 
  2395  ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
  2396  */
  2397  int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
  2398  #ifdef SQLITE_OMIT_AUTOVACUUM
  2399    return SQLITE_READONLY;
  2400  #else
  2401    BtShared *pBt = p->pBt;
  2402    int rc = SQLITE_OK;
  2403    u8 av = (u8)autoVacuum;
  2404  
  2405    sqlite3BtreeEnter(p);
  2406    if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
  2407      rc = SQLITE_READONLY;
  2408    }else{
  2409      pBt->autoVacuum = av ?1:0;
  2410      pBt->incrVacuum = av==2 ?1:0;
  2411    }
  2412    sqlite3BtreeLeave(p);
  2413    return rc;
  2414  #endif
  2415  }
  2416  
  2417  /*
  2418  ** Return the value of the 'auto-vacuum' property. If auto-vacuum is 
  2419  ** enabled 1 is returned. Otherwise 0.
  2420  */
  2421  int sqlite3BtreeGetAutoVacuum(Btree *p){
  2422  #ifdef SQLITE_OMIT_AUTOVACUUM
  2423    return BTREE_AUTOVACUUM_NONE;
  2424  #else
  2425    int rc;
  2426    sqlite3BtreeEnter(p);
  2427    rc = (
  2428      (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
  2429      (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
  2430      BTREE_AUTOVACUUM_INCR
  2431    );
  2432    sqlite3BtreeLeave(p);
  2433    return rc;
  2434  #endif
  2435  }
  2436  
  2437  
/*
** Get a reference to pPage1 of the database file.  This will
** also acquire a readlock on that file.
**
** On entry pBt->pPage1 must be 0 and the BtShared mutex must be held.
**
** Return values:
**
**   SQLITE_OK with pBt->pPage1 set:  page 1 passed every check; the
**       payload limits (maxLocal, minLocal, maxLeaf, minLeaf,
**       max1bytePayload) and pBt->nPage have been filled in.
**
**   SQLITE_OK with pBt->pPage1 still 0:  the caller is expected to call
**       this function again.  This happens when the WAL was opened by
**       this call, or when the page size in the file header differs from
**       pBt->pageSize (in which case pBt->pageSize and pBt->usableSize
**       are updated first).
**
**   SQLITE_NOTADB:  the header of page 1 failed one of the format checks.
**
**   SQLITE_CORRUPT:  the header claims more pages than the file holds
**       (not reported when SQLITE_RecoveryMode is set).
**
**   Anything else:  an error passed through from sqlite3PagerSharedLock(),
**       btreeGetPage(), sqlite3PagerOpenWal() or sqlite3PagerSetPagesize().
**       The original notes mention SQLITE_BUSY if the database is locked
**       and SQLITE_NOMEM if we run out of memory.
*/
static int lockBtree(BtShared *pBt){
  int rc;              /* Result code from subfunctions */
  MemPage *pPage1;     /* Page 1 of the database file */
  int nPage;           /* Number of pages in the database */
  int nPageFile = 0;   /* Number of pages in the database file */
  int nPageHeader;     /* Number of pages in the database according to hdr */

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pBt->pPage1==0 );
  rc = sqlite3PagerSharedLock(pBt->pPager);
  if( rc!=SQLITE_OK ) return rc;
  rc = btreeGetPage(pBt, 1, &pPage1, 0);
  if( rc!=SQLITE_OK ) return rc;

  /* Do some checking to help ensure the file we opened really is
  ** a valid database file.
  **
  ** The page count is taken from the 4-byte field at offset 28 of page 1.
  ** If that field is zero, or if the 4 bytes at offset 24 differ from the
  ** 4 bytes at offset 92, the page count reported by the pager is used
  ** instead.
  */
  nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
  sqlite3PagerPagecount(pBt->pPager, &nPageFile);
  if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
    nPage = nPageFile;
  }
  if( nPage>0 ){
    u32 pageSize;
    u32 usableSize;
    u8 *page1 = pPage1->aData;
    /* Every "goto page1_init_failed" below reports SQLITE_NOTADB unless rc
    ** is overwritten first. */
    rc = SQLITE_NOTADB;
    if( memcmp(page1, zMagicHeader, 16)!=0 ){
      goto page1_init_failed;
    }

    /* Bytes 18 and 19 of the header are compared against the highest
    ** value this build supports (1 without WAL support, 2 with it).  A
    ** larger byte 18 makes the btree read-only; a larger byte 19 rejects
    ** the file.
    */
#ifdef SQLITE_OMIT_WAL
    if( page1[18]>1 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>1 ){
      goto page1_init_failed;
    }
#else
    if( page1[18]>2 ){
      pBt->btsFlags |= BTS_READ_ONLY;
    }
    if( page1[19]>2 ){
      goto page1_init_failed;
    }

    /* If the write version is set to 2, this database should be accessed
    ** in WAL mode. If the log is not already open, open it now. Then
    ** return SQLITE_OK and return without populating BtShared.pPage1.
    ** The caller detects this and calls this function again. This is
    ** required as the version of page 1 currently in the page1 buffer
    ** may not be the latest version - there may be a newer one in the log
    ** file.
    */
    if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
      int isOpen = 0;
      rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
      if( rc!=SQLITE_OK ){
        goto page1_init_failed;
      }else if( isOpen==0 ){
        releasePage(pPage1);
        return SQLITE_OK;
      }
      /* Restore the default failure code for the checks that follow */
      rc = SQLITE_NOTADB;
    }
#endif

    /* The maximum embedded fraction must be exactly 25%.  And the minimum
    ** embedded fraction must be 12.5% for both leaf-data and non-leaf-data.
    ** The original design allowed these amounts to vary, but as of
    ** version 3.6.0, we require them to be fixed.  The three bytes at
    ** offset 21 must therefore be 64, 32, 32.
    */
    if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
      goto page1_init_failed;
    }
    /* Byte 16 supplies bits 8..15 of the page size and byte 17 supplies
    ** bits 16..23, so the stored pair 0x00 0x01 decodes to 65536. */
    pageSize = (page1[16]<<8) | (page1[17]<<16);
    if( ((pageSize-1)&pageSize)!=0
     || pageSize>SQLITE_MAX_PAGE_SIZE
     || pageSize<=256
    ){
      goto page1_init_failed;
    }
    assert( (pageSize & 7)==0 );
    /* Byte 20 is the number of bytes reserved at the end of each page */
    usableSize = pageSize - page1[20];
    if( (u32)pageSize!=pBt->pageSize ){
      /* After reading the first page of the database assuming a page size
      ** of BtShared.pageSize, we have discovered that the page-size is
      ** actually pageSize. Unlock the database, leave pBt->pPage1 at
      ** zero and return SQLITE_OK. The caller will call this function
      ** again with the correct page-size.
      */
      releasePage(pPage1);
      pBt->usableSize = usableSize;
      pBt->pageSize = pageSize;
      freeTempSpace(pBt);
      rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
                                   pageSize-usableSize);
      return rc;
    }
    /* Outside recovery mode, a header that claims more pages than the
    ** file actually contains is treated as corruption. */
    if( (pBt->db->flags & SQLITE_RecoveryMode)==0 && nPage>nPageFile ){
      rc = SQLITE_CORRUPT_BKPT;
      goto page1_init_failed;
    }
    if( usableSize<480 ){
      goto page1_init_failed;
    }
    pBt->pageSize = pageSize;
    pBt->usableSize = usableSize;
#ifndef SQLITE_OMIT_AUTOVACUUM
    /* The 4-byte fields at offsets 36+4*4 and 36+7*4 are reduced to the
    ** boolean auto-vacuum and incremental-vacuum settings. */
    pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
    pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
#endif
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
  ** Besides the payload, the cell must store:
  **     2-byte pointer to the cell
  **     4-byte child pointer
  **     9-byte nKey value
  **     4-byte nData value
  **     4-byte overflow page pointer
  ** So a cell consists of a 2-byte pointer, a header which is as much as
  ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
  ** page pointer.
  **
  ** This point is also reached when nPage is 0, in which case the limits
  ** are derived from the usableSize already stored in pBt.
  */
  pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
  pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
  pBt->maxLeaf = (u16)(pBt->usableSize - 35);
  pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
  /* max1bytePayload is maxLocal capped at 127 */
  if( pBt->maxLocal>127 ){
    pBt->max1bytePayload = 127;
  }else{
    pBt->max1bytePayload = (u8)pBt->maxLocal;
  }
  assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
  pBt->pPage1 = pPage1;
  pBt->nPage = nPage;
  return SQLITE_OK;

  /* Common failure exit: drop the reference to page 1 and report rc */
page1_init_failed:
  releasePage(pPage1);
  pBt->pPage1 = 0;
  return rc;
}
  2593  
  2594  #ifndef NDEBUG
  2595  /*
  2596  ** Return the number of cursors open on pBt. This is for use
  2597  ** in assert() expressions, so it is only compiled if NDEBUG is not
  2598  ** defined.
  2599  **
  2600  ** Only write cursors are counted if wrOnly is true.  If wrOnly is
  2601  ** false then all cursors are counted.
  2602  **
  2603  ** For the purposes of this routine, a cursor is any cursor that
  2604  ** is capable of reading or writing to the database.  Cursors that
  2605  ** have been tripped into the CURSOR_FAULT state are not counted.
  2606  */
  2607  static int countValidCursors(BtShared *pBt, int wrOnly){
  2608    BtCursor *pCur;
  2609    int r = 0;
  2610    for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
  2611      if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
  2612       && pCur->eState!=CURSOR_FAULT ) r++; 
  2613    }
  2614    return r;
  2615  }
  2616  #endif
  2617  
  2618  /*
  2619  ** If there are no outstanding cursors and we are not in the middle
  2620  ** of a transaction but there is a read lock on the database, then
  2621  ** this routine unrefs the first page of the database file which 
  2622  ** has the effect of releasing the read lock.
  2623  **
  2624  ** If there is a transaction in progress, this routine is a no-op.
  2625  */
  2626  static void unlockBtreeIfUnused(BtShared *pBt){
  2627    assert( sqlite3_mutex_held(pBt->mutex) );
  2628    assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
  2629    if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
  2630      MemPage *pPage1 = pBt->pPage1;
  2631      assert( pPage1->aData );
  2632      assert( sqlite3PagerRefcount(pBt->pPager)==1 );
  2633      pBt->pPage1 = 0;
  2634      releasePage(pPage1);
  2635    }
  2636  }
  2637  
  2638  /*
  2639  ** If pBt points to an empty file then convert that empty file
  2640  ** into a new empty database by initializing the first page of
  2641  ** the database.
  2642  */
  2643  static int newDatabase(BtShared *pBt){
  2644    MemPage *pP1;
  2645    unsigned char *data;
  2646    int rc;
  2647  
  2648    assert( sqlite3_mutex_held(pBt->mutex) );
  2649    if( pBt->nPage>0 ){
  2650      return SQLITE_OK;
  2651    }
  2652    pP1 = pBt->pPage1;
  2653    assert( pP1!=0 );
  2654    data = pP1->aData;
  2655    rc = sqlite3PagerWrite(pP1->pDbPage);
  2656    if( rc ) return rc;
  2657    memcpy(data, zMagicHeader, sizeof(zMagicHeader));
  2658    assert( sizeof(zMagicHeader)==16 );
  2659    data[16] = (u8)((pBt->pageSize>>8)&0xff);
  2660    data[17] = (u8)((pBt->pageSize>>16)&0xff);
  2661    data[18] = 1;
  2662    data[19] = 1;
  2663    assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
  2664    data[20] = (u8)(pBt->pageSize - pBt->usableSize);
  2665    data[21] = 64;
  2666    data[22] = 32;
  2667    data[23] = 32;
  2668    memset(&data[24], 0, 100-24);
  2669    zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
  2670    pBt->btsFlags |= BTS_PAGESIZE_FIXED;
  2671  #ifndef SQLITE_OMIT_AUTOVACUUM
  2672    assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
  2673    assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
  2674    put4byte(&data[36 + 4*4], pBt->autoVacuum);
  2675    put4byte(&data[36 + 7*4], pBt->incrVacuum);
  2676  #endif
  2677    pBt->nPage = 1;
  2678    data[31] = 1;
  2679    return SQLITE_OK;
  2680  }
  2681  
  2682  /*
  2683  ** Initialize the first page of the database file (creating a database
  2684  ** consisting of a single page and no schema objects). Return SQLITE_OK
  2685  ** if successful, or an SQLite error code otherwise.
  2686  */
  2687  int sqlite3BtreeNewDb(Btree *p){
  2688    int rc;
  2689    sqlite3BtreeEnter(p);
  2690    p->pBt->nPage = 0;
  2691    rc = newDatabase(p->pBt);
  2692    sqlite3BtreeLeave(p);
  2693    return rc;
  2694  }
  2695  
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more, an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** Result codes set directly by this routine:
**
**   SQLITE_READONLY             a write transaction was requested on a
**                               btree flagged BTS_READ_ONLY
**   SQLITE_LOCKED_SHAREDCACHE   another connection sharing this BtShared
**                               blocks the request
**
** Any other non-OK value is passed through from a subfunction.
*/
int sqlite3BtreeBeginTrans(Btree *p, int wrflag){
  sqlite3 *pBlock = 0;   /* Connection blocking this request, if any */
  BtShared *pBt = p->pBt;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  /* If another database handle has already opened a write transaction
  ** on this shared-btree structure and a second write transaction is
  ** requested, return SQLITE_LOCKED.  The same applies to any request
  ** made while the BTS_PENDING flag is set.  For an exclusive request
  ** (wrflag>1), any lock held by a different Btree also blocks.
  */
  if( (wrflag && pBt->inTransaction==TRANS_WRITE)
   || (pBt->btsFlags & BTS_PENDING)!=0
  ){
    pBlock = pBt->pWriter->db;
  }else if( wrflag>1 ){
    BtLock *pIter;
    for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
      if( pIter->pBtree!=p ){
        pBlock = pIter->pBtree->db;
        break;
      }
    }
  }
  if( pBlock ){
    sqlite3ConnectionBlocked(p->db, pBlock);
    rc = SQLITE_LOCKED_SHAREDCACHE;
    goto trans_begun;
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Record whether the database had no pages when this attempt started */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    /* For a write transaction, re-check the read-only flag (tested again
    ** here, after lockBtree() has run), then start the pager write
    ** transaction and initialize the file if it is empty. */
    if( rc==SQLITE_OK && wrflag ){
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pBt->pPager,wrflag>1,sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }
      }
    }

    if( rc!=SQLITE_OK ){
      unlockBtreeIfUnused(pBt);
    }
    /* Retry only while the low byte of rc is SQLITE_BUSY, no transaction
    ** is open on the shared btree, and the busy handler returns non-zero.
    */
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );

  if( rc==SQLITE_OK ){
    /* First transaction on this handle: count it and, for a sharable
    ** btree, push its page-1 read lock onto the shared lock list. */
    if( p->inTrans==TRANS_NONE ){
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    /* The shared state is only ever raised here, never lowered */
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }


trans_begun:
  if( rc==SQLITE_OK && wrflag ){
    /* This call makes sure that the pager has the correct number of
    ** open savepoints. If the second parameter is greater than 0 and
    ** the sub-journal is not already open, then it will be opened here.
    */
    rc = sqlite3PagerOpenSavepoint(pBt->pPager, p->db->nSavepoint);
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
  2868  
  2869  #ifndef SQLITE_OMIT_AUTOVACUUM
  2870  
  2871  /*
  2872  ** Set the pointer-map entries for all children of page pPage. Also, if
  2873  ** pPage contains cells that point to overflow pages, set the pointer
  2874  ** map entries for the overflow pages as well.
  2875  */
  2876  static int setChildPtrmaps(MemPage *pPage){
  2877    int i;                             /* Counter variable */
  2878    int nCell;                         /* Number of cells in page pPage */
  2879    int rc;                            /* Return code */
  2880    BtShared *pBt = pPage->pBt;
  2881    u8 isInitOrig = pPage->isInit;
  2882    Pgno pgno = pPage->pgno;
  2883  
  2884    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2885    rc = btreeInitPage(pPage);
  2886    if( rc!=SQLITE_OK ){
  2887      goto set_child_ptrmaps_out;
  2888    }
  2889    nCell = pPage->nCell;
  2890  
  2891    for(i=0; i<nCell; i++){
  2892      u8 *pCell = findCell(pPage, i);
  2893  
  2894      ptrmapPutOvflPtr(pPage, pCell, &rc);
  2895  
  2896      if( !pPage->leaf ){
  2897        Pgno childPgno = get4byte(pCell);
  2898        ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  2899      }
  2900    }
  2901  
  2902    if( !pPage->leaf ){
  2903      Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
  2904      ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
  2905    }
  2906  
  2907  set_child_ptrmaps_out:
  2908    pPage->isInit = isInitOrig;
  2909    return rc;
  2910  }
  2911  
  2912  /*
  2913  ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
  2914  ** that it points to iTo. Parameter eType describes the type of pointer to
  2915  ** be modified, as  follows:
  2916  **
  2917  ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child 
  2918  **                   page of pPage.
  2919  **
  2920  ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
  2921  **                   page pointed to by one of the cells on pPage.
  2922  **
  2923  ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
  2924  **                   overflow page in the list.
  2925  */
  2926  static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
  2927    assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  2928    assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  2929    if( eType==PTRMAP_OVERFLOW2 ){
  2930      /* The pointer is always the first 4 bytes of the page in this case.  */
  2931      if( get4byte(pPage->aData)!=iFrom ){
  2932        return SQLITE_CORRUPT_BKPT;
  2933      }
  2934      put4byte(pPage->aData, iTo);
  2935    }else{
  2936      u8 isInitOrig = pPage->isInit;
  2937      int i;
  2938      int nCell;
  2939  
  2940      btreeInitPage(pPage);
  2941      nCell = pPage->nCell;
  2942  
  2943      for(i=0; i<nCell; i++){
  2944        u8 *pCell = findCell(pPage, i);
  2945        if( eType==PTRMAP_OVERFLOW1 ){
  2946          CellInfo info;
  2947          btreeParseCellPtr(pPage, pCell, &info);
  2948          if( info.iOverflow
  2949           && pCell+info.iOverflow+3<=pPage->aData+pPage->maskPage
  2950           && iFrom==get4byte(&pCell[info.iOverflow])
  2951          ){
  2952            put4byte(&pCell[info.iOverflow], iTo);
  2953            break;
  2954          }
  2955        }else{
  2956          if( get4byte(pCell)==iFrom ){
  2957            put4byte(pCell, iTo);
  2958            break;
  2959          }
  2960        }
  2961      }
  2962    
  2963      if( i==nCell ){
  2964        if( eType!=PTRMAP_BTREE || 
  2965            get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
  2966          return SQLITE_CORRUPT_BKPT;
  2967        }
  2968        put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
  2969      }
  2970  
  2971      pPage->isInit = isInitOrig;
  2972    }
  2973    return SQLITE_OK;
  2974  }
  2975  
  2976  
  2977  /*
  2978  ** Move the open database page pDbPage to location iFreePage in the 
  2979  ** database. The pDbPage reference remains valid.
  2980  **
  2981  ** The isCommit flag indicates that there is no need to remember that
  2982  ** the journal needs to be sync()ed before database page pDbPage->pgno 
  2983  ** can be written to. The caller has already promised not to write to that
  2984  ** page.
  2985  */
  2986  static int relocatePage(
  2987    BtShared *pBt,           /* Btree */
  2988    MemPage *pDbPage,        /* Open page to move */
  2989    u8 eType,                /* Pointer map 'type' entry for pDbPage */
  2990    Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
  2991    Pgno iFreePage,          /* The location to move pDbPage to */
  2992    int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
  2993  ){
  2994    MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
  2995    Pgno iDbPage = pDbPage->pgno;
  2996    Pager *pPager = pBt->pPager;
  2997    int rc;
  2998  
  2999    assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 || 
  3000        eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
  3001    assert( sqlite3_mutex_held(pBt->mutex) );
  3002    assert( pDbPage->pBt==pBt );
  3003  
  3004    /* Move page iDbPage from its current location to page number iFreePage */
  3005    TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n", 
  3006        iDbPage, iFreePage, iPtrPage, eType));
  3007    rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
  3008    if( rc!=SQLITE_OK ){
  3009      return rc;
  3010    }
  3011    pDbPage->pgno = iFreePage;
  3012  
  3013    /* If pDbPage was a btree-page, then it may have child pages and/or cells
  3014    ** that point to overflow pages. The pointer map entries for all these
  3015    ** pages need to be changed.
  3016    **
  3017    ** If pDbPage is an overflow page, then the first 4 bytes may store a
  3018    ** pointer to a subsequent overflow page. If this is the case, then
  3019    ** the pointer map needs to be updated for the subsequent overflow page.
  3020    */
  3021    if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
  3022      rc = setChildPtrmaps(pDbPage);
  3023      if( rc!=SQLITE_OK ){
  3024        return rc;
  3025      }
  3026    }else{
  3027      Pgno nextOvfl = get4byte(pDbPage->aData);
  3028      if( nextOvfl!=0 ){
  3029        ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
  3030        if( rc!=SQLITE_OK ){
  3031          return rc;
  3032        }
  3033      }
  3034    }
  3035  
  3036    /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
  3037    ** that it points at iFreePage. Also fix the pointer map entry for
  3038    ** iPtrPage.
  3039    */
  3040    if( eType!=PTRMAP_ROOTPAGE ){
  3041      rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
  3042      if( rc!=SQLITE_OK ){
  3043        return rc;
  3044      }
  3045      rc = sqlite3PagerWrite(pPtrPage->pDbPage);
  3046      if( rc!=SQLITE_OK ){
  3047        releasePage(pPtrPage);
  3048        return rc;
  3049      }
  3050      rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
  3051      releasePage(pPtrPage);
  3052      if( rc==SQLITE_OK ){
  3053        ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
  3054      }
  3055    }
  3056    return rc;
  3057  }
  3058  
  3059  /* Forward declaration required by incrVacuumStep(). */
  3060  static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
  3061  
  3062  /*
  3063  ** Perform a single step of an incremental-vacuum. If successful, return
  3064  ** SQLITE_OK. If there is no work to do (and therefore no point in 
  3065  ** calling this function again), return SQLITE_DONE. Or, if an error 
  3066  ** occurs, return some other error code.
  3067  **
  3068  ** More specifically, this function attempts to re-organize the database so 
  3069  ** that the last page of the file currently in use is no longer in use.
  3070  **
  3071  ** Parameter nFin is the number of pages that this database would contain
  3072  ** were this function called until it returns SQLITE_DONE.
  3073  **
  3074  ** If the bCommit parameter is non-zero, this function assumes that the 
  3075  ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE 
  3076  ** or an error. bCommit is passed true for an auto-vacuum-on-commit 
  3077  ** operation, or false for an incremental vacuum.
  3078  */
  3079  static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
  3080    Pgno nFreeList;           /* Number of pages still on the free-list */
  3081    int rc;
  3082  
  3083    assert( sqlite3_mutex_held(pBt->mutex) );
  3084    assert( iLastPg>nFin );
  3085  
  3086    if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
  3087      u8 eType;
  3088      Pgno iPtrPage;
  3089  
  3090      nFreeList = get4byte(&pBt->pPage1->aData[36]);
  3091      if( nFreeList==0 ){
  3092        return SQLITE_DONE;
  3093      }
  3094  
  3095      rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
  3096      if( rc!=SQLITE_OK ){
  3097        return rc;
  3098      }
  3099      if( eType==PTRMAP_ROOTPAGE ){
  3100        return SQLITE_CORRUPT_BKPT;
  3101      }
  3102  
  3103      if( eType==PTRMAP_FREEPAGE ){
  3104        if( bCommit==0 ){
  3105          /* Remove the page from the files free-list. This is not required
  3106          ** if bCommit is non-zero. In that case, the free-list will be
  3107          ** truncated to zero after this function returns, so it doesn't 
  3108          ** matter if it still contains some garbage entries.
  3109          */
  3110          Pgno iFreePg;
  3111          MemPage *pFreePg;
  3112          rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
  3113          if( rc!=SQLITE_OK ){
  3114            return rc;
  3115          }
  3116          assert( iFreePg==iLastPg );
  3117          releasePage(pFreePg);
  3118        }
  3119      } else {
  3120        Pgno iFreePg;             /* Index of free page to move pLastPg to */
  3121        MemPage *pLastPg;
  3122        u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
  3123        Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
  3124  
  3125        rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
  3126        if( rc!=SQLITE_OK ){
  3127          return rc;
  3128        }
  3129  
  3130        /* If bCommit is zero, this loop runs exactly once and page pLastPg
  3131        ** is swapped with the first free page pulled off the free list.
  3132        **
  3133        ** On the other hand, if bCommit is greater than zero, then keep
  3134        ** looping until a free-page located within the first nFin pages
  3135        ** of the file is found.
  3136        */
  3137        if( bCommit==0 ){
  3138          eMode = BTALLOC_LE;
  3139          iNear = nFin;
  3140        }
  3141        do {
  3142          MemPage *pFreePg;
  3143          rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
  3144          if( rc!=SQLITE_OK ){
  3145            releasePage(pLastPg);
  3146            return rc;
  3147          }
  3148          releasePage(pFreePg);
  3149        }while( bCommit && iFreePg>nFin );
  3150        assert( iFreePg<iLastPg );
  3151        
  3152        rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
  3153        releasePage(pLastPg);
  3154        if( rc!=SQLITE_OK ){
  3155          return rc;
  3156        }
  3157      }
  3158    }
  3159  
  3160    if( bCommit==0 ){
  3161      do {
  3162        iLastPg--;
  3163      }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
  3164      pBt->bDoTruncate = 1;
  3165      pBt->nPage = iLastPg;
  3166    }
  3167    return SQLITE_OK;
  3168  }
  3169  
  3170  /*
  3171  ** The database opened by the first argument is an auto-vacuum database
  3172  ** nOrig pages in size containing nFree free pages. Return the expected 
  3173  ** size of the database in pages following an auto-vacuum operation.
  3174  */
  3175  static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
  3176    int nEntry;                     /* Number of entries on one ptrmap page */
  3177    Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
  3178    Pgno nFin;                      /* Return value */
  3179  
  3180    nEntry = pBt->usableSize/5;
  3181    nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
  3182    nFin = nOrig - nFree - nPtrmap;
  3183    if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
  3184      nFin--;
  3185    }
  3186    while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
  3187      nFin--;
  3188    }
  3189  
  3190    return nFin;
  3191  }
  3192  
  3193  /*
  3194  ** A write-transaction must be opened before calling this function.
  3195  ** It performs a single unit of work towards an incremental vacuum.
  3196  **
  3197  ** If the incremental vacuum is finished after this function has run,
  3198  ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
  3199  ** SQLITE_OK is returned. Otherwise an SQLite error code. 
  3200  */
  3201  int sqlite3BtreeIncrVacuum(Btree *p){
  3202    int rc;
  3203    BtShared *pBt = p->pBt;
  3204  
  3205    sqlite3BtreeEnter(p);
  3206    assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
  3207    if( !pBt->autoVacuum ){
  3208      rc = SQLITE_DONE;
  3209    }else{
  3210      Pgno nOrig = btreePagecount(pBt);
  3211      Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
  3212      Pgno nFin = finalDbSize(pBt, nOrig, nFree);
  3213  
  3214      if( nOrig<nFin ){
  3215        rc = SQLITE_CORRUPT_BKPT;
  3216      }else if( nFree>0 ){
  3217        rc = saveAllCursors(pBt, 0, 0);
  3218        if( rc==SQLITE_OK ){
  3219          invalidateAllOverflowCache(pBt);
  3220          rc = incrVacuumStep(pBt, nFin, nOrig, 0);
  3221        }
  3222        if( rc==SQLITE_OK ){
  3223          rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  3224          put4byte(&pBt->pPage1->aData[28], pBt->nPage);
  3225        }
  3226      }else{
  3227        rc = SQLITE_DONE;
  3228      }
  3229    }
  3230    sqlite3BtreeLeave(p);
  3231    return rc;
  3232  }
  3233  
  3234  /*
  3235  ** This routine is called prior to sqlite3PagerCommit when a transaction
  3236  ** is committed for an auto-vacuum database.
  3237  **
  3238  ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
  3239  ** the database file should be truncated to during the commit process. 
  3240  ** i.e. the database has been reorganized so that only the first *pnTrunc
  3241  ** pages are in use.
  3242  */
  3243  static int autoVacuumCommit(BtShared *pBt){
  3244    int rc = SQLITE_OK;
  3245    Pager *pPager = pBt->pPager;
  3246    VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager) );
  3247  
  3248    assert( sqlite3_mutex_held(pBt->mutex) );
  3249    invalidateAllOverflowCache(pBt);
  3250    assert(pBt->autoVacuum);
  3251    if( !pBt->incrVacuum ){
  3252      Pgno nFin;         /* Number of pages in database after autovacuuming */
  3253      Pgno nFree;        /* Number of pages on the freelist initially */
  3254      Pgno iFree;        /* The next page to be freed */
  3255      Pgno nOrig;        /* Database size before freeing */
  3256  
  3257      nOrig = btreePagecount(pBt);
  3258      if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
  3259        /* It is not possible to create a database for which the final page
  3260        ** is either a pointer-map page or the pending-byte page. If one
  3261        ** is encountered, this indicates corruption.
  3262        */
  3263        return SQLITE_CORRUPT_BKPT;
  3264      }
  3265  
  3266      nFree = get4byte(&pBt->pPage1->aData[36]);
  3267      nFin = finalDbSize(pBt, nOrig, nFree);
  3268      if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
  3269      if( nFin<nOrig ){
  3270        rc = saveAllCursors(pBt, 0, 0);
  3271      }
  3272      for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
  3273        rc = incrVacuumStep(pBt, nFin, iFree, 1);
  3274      }
  3275      if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
  3276        rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
  3277        put4byte(&pBt->pPage1->aData[32], 0);
  3278        put4byte(&pBt->pPage1->aData[36], 0);
  3279        put4byte(&pBt->pPage1->aData[28], nFin);
  3280        pBt->bDoTruncate = 1;
  3281        pBt->nPage = nFin;
  3282      }
  3283      if( rc!=SQLITE_OK ){
  3284        sqlite3PagerRollback(pPager);
  3285      }
  3286    }
  3287  
  3288    assert( nRef>=sqlite3PagerRefcount(pPager) );
  3289    return rc;
  3290  }
  3291  
  3292  #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
  3293  # define setChildPtrmaps(x) SQLITE_OK
  3294  #endif
  3295  
  3296  /*
  3297  ** This routine does the first phase of a two-phase commit.  This routine
  3298  ** causes a rollback journal to be created (if it does not already exist)
  3299  ** and populated with enough information so that if a power loss occurs
  3300  ** the database can be restored to its original state by playing back
  3301  ** the journal.  Then the contents of the journal are flushed out to
  3302  ** the disk.  After the journal is safely on oxide, the changes to the
  3303  ** database are written into the database file and flushed to oxide.
  3304  ** At the end of this call, the rollback journal still exists on the
  3305  ** disk and we are still holding all locks, so the transaction has not
  3306  ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
  3307  ** commit process.
  3308  **
  3309  ** This call is a no-op if no write-transaction is currently active on pBt.
  3310  **
  3311  ** Otherwise, sync the database file for the btree pBt. zMaster points to
  3312  ** the name of a master journal file that should be written into the
  3313  ** individual journal file, or is NULL, indicating no master journal file 
  3314  ** (single database transaction).
  3315  **
  3316  ** When this is called, the master journal should already have been
  3317  ** created, populated with this journal pointer and synced to disk.
  3318  **
  3319  ** Once this is routine has returned, the only thing required to commit
  3320  ** the write-transaction for this database file is to delete the journal.
  3321  */
  3322  int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster){
  3323    int rc = SQLITE_OK;
  3324    if( p->inTrans==TRANS_WRITE ){
  3325      BtShared *pBt = p->pBt;
  3326      sqlite3BtreeEnter(p);
  3327  #ifndef SQLITE_OMIT_AUTOVACUUM
  3328      if( pBt->autoVacuum ){
  3329        rc = autoVacuumCommit(pBt);
  3330        if( rc!=SQLITE_OK ){
  3331          sqlite3BtreeLeave(p);
  3332          return rc;
  3333        }
  3334      }
  3335      if( pBt->bDoTruncate ){
  3336        sqlite3PagerTruncateImage(pBt->pPager, pBt->nPage);
  3337      }
  3338  #endif
  3339      rc = sqlite3PagerCommitPhaseOne(pBt->pPager, zMaster, 0);
  3340      sqlite3BtreeLeave(p);
  3341    }
  3342    return rc;
  3343  }
  3344  
  3345  /*
  3346  ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
  3347  ** at the conclusion of a transaction.
  3348  */
  3349  static void btreeEndTransaction(Btree *p){
  3350    BtShared *pBt = p->pBt;
  3351    sqlite3 *db = p->db;
  3352    assert( sqlite3BtreeHoldsMutex(p) );
  3353  
  3354  #ifndef SQLITE_OMIT_AUTOVACUUM
  3355    pBt->bDoTruncate = 0;
  3356  #endif
  3357    if( p->inTrans>TRANS_NONE && db->nVdbeRead>1 ){
  3358      /* If there are other active statements that belong to this database
  3359      ** handle, downgrade to a read-only transaction. The other statements
  3360      ** may still be reading from the database.  */
  3361      downgradeAllSharedCacheTableLocks(p);
  3362      p->inTrans = TRANS_READ;
  3363    }else{
  3364      /* If the handle had any kind of transaction open, decrement the 
  3365      ** transaction count of the shared btree. If the transaction count 
  3366      ** reaches 0, set the shared state to TRANS_NONE. The unlockBtreeIfUnused()
  3367      ** call below will unlock the pager.  */
  3368      if( p->inTrans!=TRANS_NONE ){
  3369        clearAllSharedCacheTableLocks(p);
  3370        pBt->nTransaction--;
  3371        if( 0==pBt->nTransaction ){
  3372          pBt->inTransaction = TRANS_NONE;
  3373        }
  3374      }
  3375  
  3376      /* Set the current transaction state to TRANS_NONE and unlock the 
  3377      ** pager if this call closed the only read or write transaction.  */
  3378      p->inTrans = TRANS_NONE;
  3379      unlockBtreeIfUnused(pBt);
  3380    }
  3381  
  3382    btreeIntegrity(p);
  3383  }
  3384  
  3385  /*
  3386  ** Commit the transaction currently in progress.
  3387  **
  3388  ** This routine implements the second phase of a 2-phase commit.  The
  3389  ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
  3390  ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
  3391  ** routine did all the work of writing information out to disk and flushing the
  3392  ** contents so that they are written onto the disk platter.  All this
  3393  ** routine has to do is delete or truncate or zero the header in the
  3394  ** the rollback journal (which causes the transaction to commit) and
  3395  ** drop locks.
  3396  **
  3397  ** Normally, if an error occurs while the pager layer is attempting to 
  3398  ** finalize the underlying journal file, this function returns an error and
  3399  ** the upper layer will attempt a rollback. However, if the second argument
  3400  ** is non-zero then this b-tree transaction is part of a multi-file 
  3401  ** transaction. In this case, the transaction has already been committed 
  3402  ** (by deleting a master journal file) and the caller will ignore this 
  3403  ** functions return code. So, even if an error occurs in the pager layer,
  3404  ** reset the b-tree objects internal state to indicate that the write
  3405  ** transaction has been closed. This is quite safe, as the pager will have
  3406  ** transitioned to the error state.
  3407  **
  3408  ** This will release the write lock on the database file.  If there
  3409  ** are no active cursors, it also releases the read lock.
  3410  */
  3411  int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
  3412  
  3413    if( p->inTrans==TRANS_NONE ) return SQLITE_OK;
  3414    sqlite3BtreeEnter(p);
  3415    btreeIntegrity(p);
  3416  
  3417    /* If the handle has a write-transaction open, commit the shared-btrees 
  3418    ** transaction and set the shared state to TRANS_READ.
  3419    */
  3420    if( p->inTrans==TRANS_WRITE ){
  3421      int rc;
  3422      BtShared *pBt = p->pBt;
  3423      assert( pBt->inTransaction==TRANS_WRITE );
  3424      assert( pBt->nTransaction>0 );
  3425      rc = sqlite3PagerCommitPhaseTwo(pBt->pPager);
  3426      if( rc!=SQLITE_OK && bCleanup==0 ){
  3427        sqlite3BtreeLeave(p);
  3428        return rc;
  3429      }
  3430      pBt->inTransaction = TRANS_READ;
  3431      btreeClearHasContent(pBt);
  3432    }
  3433  
  3434    btreeEndTransaction(p);
  3435    sqlite3BtreeLeave(p);
  3436    return SQLITE_OK;
  3437  }
  3438  
  3439  /*
  3440  ** Do both phases of a commit.
  3441  */
  3442  int sqlite3BtreeCommit(Btree *p){
  3443    int rc;
  3444    sqlite3BtreeEnter(p);
  3445    rc = sqlite3BtreeCommitPhaseOne(p, 0);
  3446    if( rc==SQLITE_OK ){
  3447      rc = sqlite3BtreeCommitPhaseTwo(p, 0);
  3448    }
  3449    sqlite3BtreeLeave(p);
  3450    return rc;
  3451  }
  3452  
  3453  /*
  3454  ** This routine sets the state to CURSOR_FAULT and the error
  3455  ** code to errCode for every cursor on BtShared that pBtree
  3456  ** references.
  3457  **
  3458  ** Every cursor is tripped, including cursors that belong
  3459  ** to other database connections that happen to be sharing
  3460  ** the cache with pBtree.
  3461  **
  3462  ** This routine gets called when a rollback occurs.
  3463  ** All cursors using the same cache must be tripped
  3464  ** to prevent them from trying to use the btree after
  3465  ** the rollback.  The rollback may have deleted tables
  3466  ** or moved root pages, so it is not sufficient to
  3467  ** save the state of the cursor.  The cursor must be
  3468  ** invalidated.
  3469  */
  3470  void sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode){
  3471    BtCursor *p;
  3472    if( pBtree==0 ) return;
  3473    sqlite3BtreeEnter(pBtree);
  3474    for(p=pBtree->pBt->pCursor; p; p=p->pNext){
  3475      int i;
  3476      sqlite3BtreeClearCursor(p);
  3477      p->eState = CURSOR_FAULT;
  3478      p->skipNext = errCode;
  3479      for(i=0; i<=p->iPage; i++){
  3480        releasePage(p->apPage[i]);
  3481        p->apPage[i] = 0;
  3482      }
  3483    }
  3484    sqlite3BtreeLeave(pBtree);
  3485  }
  3486  
  3487  /*
  3488  ** Rollback the transaction in progress.  All cursors will be
  3489  ** invalided by this operation.  Any attempt to use a cursor
  3490  ** that was open at the beginning of this operation will result
  3491  ** in an error.
  3492  **
  3493  ** This will release the write lock on the database file.  If there
  3494  ** are no active cursors, it also releases the read lock.
  3495  */
  3496  int sqlite3BtreeRollback(Btree *p, int tripCode){
  3497    int rc;
  3498    BtShared *pBt = p->pBt;
  3499    MemPage *pPage1;
  3500  
  3501    sqlite3BtreeEnter(p);
  3502    if( tripCode==SQLITE_OK ){
  3503      rc = tripCode = saveAllCursors(pBt, 0, 0);
  3504    }else{
  3505      rc = SQLITE_OK;
  3506    }
  3507    if( tripCode ){
  3508      sqlite3BtreeTripAllCursors(p, tripCode);
  3509    }
  3510    btreeIntegrity(p);
  3511  
  3512    if( p->inTrans==TRANS_WRITE ){
  3513      int rc2;
  3514  
  3515      assert( TRANS_WRITE==pBt->inTransaction );
  3516      rc2 = sqlite3PagerRollback(pBt->pPager);
  3517      if( rc2!=SQLITE_OK ){
  3518        rc = rc2;
  3519      }
  3520  
  3521      /* The rollback may have destroyed the pPage1->aData value.  So
  3522      ** call btreeGetPage() on page 1 again to make
  3523      ** sure pPage1->aData is set correctly. */
  3524      if( btreeGetPage(pBt, 1, &pPage1, 0)==SQLITE_OK ){
  3525        int nPage = get4byte(28+(u8*)pPage1->aData);
  3526        testcase( nPage==0 );
  3527        if( nPage==0 ) sqlite3PagerPagecount(pBt->pPager, &nPage);
  3528        testcase( pBt->nPage!=nPage );
  3529        pBt->nPage = nPage;
  3530        releasePage(pPage1);
  3531      }
  3532      assert( countValidCursors(pBt, 1)==0 );
  3533      pBt->inTransaction = TRANS_READ;
  3534      btreeClearHasContent(pBt);
  3535    }
  3536  
  3537    btreeEndTransaction(p);
  3538    sqlite3BtreeLeave(p);
  3539    return rc;
  3540  }
  3541  
  3542  /*
  3543  ** Start a statement subtransaction. The subtransaction can be rolled
  3544  ** back independently of the main transaction. You must start a transaction 
  3545  ** before starting a subtransaction. The subtransaction is ended automatically 
  3546  ** if the main transaction commits or rolls back.
  3547  **
  3548  ** Statement subtransactions are used around individual SQL statements
  3549  ** that are contained within a BEGIN...COMMIT block.  If a constraint
  3550  ** error occurs within the statement, the effect of that one statement
  3551  ** can be rolled back without having to rollback the entire transaction.
  3552  **
  3553  ** A statement sub-transaction is implemented as an anonymous savepoint. The
  3554  ** value passed as the second parameter is the total number of savepoints,
  3555  ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
  3556  ** are no active savepoints and no other statement-transactions open,
  3557  ** iStatement is 1. This anonymous savepoint can be released or rolled back
  3558  ** using the sqlite3BtreeSavepoint() function.
  3559  */
  3560  int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
  3561    int rc;
  3562    BtShared *pBt = p->pBt;
  3563    sqlite3BtreeEnter(p);
  3564    assert( p->inTrans==TRANS_WRITE );
  3565    assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  3566    assert( iStatement>0 );
  3567    assert( iStatement>p->db->nSavepoint );
  3568    assert( pBt->inTransaction==TRANS_WRITE );
  3569    /* At the pager level, a statement transaction is a savepoint with
  3570    ** an index greater than all savepoints created explicitly using
  3571    ** SQL statements. It is illegal to open, release or rollback any
  3572    ** such savepoints while the statement transaction savepoint is active.
  3573    */
  3574    rc = sqlite3PagerOpenSavepoint(pBt->pPager, iStatement);
  3575    sqlite3BtreeLeave(p);
  3576    return rc;
  3577  }
  3578  
  3579  /*
  3580  ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
  3581  ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
  3582  ** savepoint identified by parameter iSavepoint, depending on the value 
  3583  ** of op.
  3584  **
  3585  ** Normally, iSavepoint is greater than or equal to zero. However, if op is
  3586  ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the 
  3587  ** contents of the entire transaction are rolled back. This is different
  3588  ** from a normal transaction rollback, as no locks are released and the
  3589  ** transaction remains open.
  3590  */
  3591  int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
  3592    int rc = SQLITE_OK;
  3593    if( p && p->inTrans==TRANS_WRITE ){
  3594      BtShared *pBt = p->pBt;
  3595      assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
  3596      assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
  3597      sqlite3BtreeEnter(p);
  3598      rc = sqlite3PagerSavepoint(pBt->pPager, op, iSavepoint);
  3599      if( rc==SQLITE_OK ){
  3600        if( iSavepoint<0 && (pBt->btsFlags & BTS_INITIALLY_EMPTY)!=0 ){
  3601          pBt->nPage = 0;
  3602        }
  3603        rc = newDatabase(pBt);
  3604        pBt->nPage = get4byte(28 + pBt->pPage1->aData);
  3605  
  3606        /* The database size was written into the offset 28 of the header
  3607        ** when the transaction started, so we know that the value at offset
  3608        ** 28 is nonzero. */
  3609        assert( pBt->nPage>0 );
  3610      }
  3611      sqlite3BtreeLeave(p);
  3612    }
  3613    return rc;
  3614  }
  3615  
  3616  /*
  3617  ** Create a new cursor for the BTree whose root is on the page
  3618  ** iTable. If a read-only cursor is requested, it is assumed that
  3619  ** the caller already has at least a read-only transaction open
  3620  ** on the database already. If a write-cursor is requested, then
  3621  ** the caller is assumed to have an open write transaction.
  3622  **
  3623  ** If wrFlag==0, then the cursor can only be used for reading.
  3624  ** If wrFlag==1, then the cursor can be used for reading or for
  3625  ** writing if other conditions for writing are also met.  These
  3626  ** are the conditions that must be met in order for writing to
  3627  ** be allowed:
  3628  **
  3629  ** 1:  The cursor must have been opened with wrFlag==1
  3630  **
  3631  ** 2:  Other database connections that share the same pager cache
  3632  **     but which are not in the READ_UNCOMMITTED state may not have
  3633  **     cursors open with wrFlag==0 on the same table.  Otherwise
  3634  **     the changes made by this write cursor would be visible to
  3635  **     the read cursors in the other database connection.
  3636  **
  3637  ** 3:  The database must be writable (not on read-only media)
  3638  **
  3639  ** 4:  There must be an active transaction.
  3640  **
  3641  ** No checking is done to make sure that page iTable really is the
  3642  ** root page of a b-tree.  If it is not, then the cursor acquired
  3643  ** will not work correctly.
  3644  **
  3645  ** It is assumed that the sqlite3BtreeCursorZero() has been called
  3646  ** on pCur to initialize the memory space prior to invoking this routine.
  3647  */
  3648  static int btreeCursor(
  3649    Btree *p,                              /* The btree */
  3650    int iTable,                            /* Root page of table to open */
  3651    int wrFlag,                            /* 1 to write. 0 read-only */
  3652    struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
  3653    BtCursor *pCur                         /* Space for new cursor */
  3654  ){
  3655    BtShared *pBt = p->pBt;                /* Shared b-tree handle */
  3656  
  3657    assert( sqlite3BtreeHoldsMutex(p) );
  3658    assert( wrFlag==0 || wrFlag==1 );
  3659  
  3660    /* The following assert statements verify that if this is a sharable 
  3661    ** b-tree database, the connection is holding the required table locks, 
  3662    ** and that no other connection has any open cursor that conflicts with 
  3663    ** this lock.  */
  3664    assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, wrFlag+1) );
  3665    assert( wrFlag==0 || !hasReadConflicts(p, iTable) );
  3666  
  3667    /* Assert that the caller has opened the required transaction. */
  3668    assert( p->inTrans>TRANS_NONE );
  3669    assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
  3670    assert( pBt->pPage1 && pBt->pPage1->aData );
  3671  
  3672    if( NEVER(wrFlag && (pBt->btsFlags & BTS_READ_ONLY)!=0) ){
  3673      return SQLITE_READONLY;
  3674    }
  3675    if( wrFlag ){
  3676      allocateTempSpace(pBt);
  3677      if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM;
  3678    }
  3679    if( iTable==1 && btreePagecount(pBt)==0 ){
  3680      assert( wrFlag==0 );
  3681      iTable = 0;
  3682    }
  3683  
  3684    /* Now that no other errors can occur, finish filling in the BtCursor
  3685    ** variables and link the cursor into the BtShared list.  */
  3686    pCur->pgnoRoot = (Pgno)iTable;
  3687    pCur->iPage = -1;
  3688    pCur->pKeyInfo = pKeyInfo;
  3689    pCur->pBtree = p;
  3690    pCur->pBt = pBt;
  3691    assert( wrFlag==0 || wrFlag==BTCF_WriteFlag );
  3692    pCur->curFlags = wrFlag;
  3693    pCur->pNext = pBt->pCursor;
  3694    if( pCur->pNext ){
  3695      pCur->pNext->pPrev = pCur;
  3696    }
  3697    pBt->pCursor = pCur;
  3698    pCur->eState = CURSOR_INVALID;
  3699    return SQLITE_OK;
  3700  }
  3701  int sqlite3BtreeCursor(
  3702    Btree *p,                                   /* The btree */
  3703    int iTable,                                 /* Root page of table to open */
  3704    int wrFlag,                                 /* 1 to write. 0 read-only */
  3705    struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
  3706    BtCursor *pCur                              /* Write new cursor here */
  3707  ){
  3708    int rc;
  3709    sqlite3BtreeEnter(p);
  3710    rc = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  3711    sqlite3BtreeLeave(p);
  3712    return rc;
  3713  }
  3714  
  3715  /*
  3716  ** Return the size of a BtCursor object in bytes.
  3717  **
  3718  ** This interfaces is needed so that users of cursors can preallocate
  3719  ** sufficient storage to hold a cursor.  The BtCursor object is opaque
  3720  ** to users so they cannot do the sizeof() themselves - they must call
  3721  ** this routine.
  3722  */
  3723  int sqlite3BtreeCursorSize(void){
  3724    return ROUND8(sizeof(BtCursor));
  3725  }
  3726  
  3727  /*
  3728  ** Initialize memory that will be converted into a BtCursor object.
  3729  **
  3730  ** The simple approach here would be to memset() the entire object
  3731  ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
  3732  ** do not need to be zeroed and they are large, so we can save a lot
  3733  ** of run-time by skipping the initialization of those elements.
  3734  */
  3735  void sqlite3BtreeCursorZero(BtCursor *p){
  3736    memset(p, 0, offsetof(BtCursor, iPage));
  3737  }
  3738  
  3739  /*
  3740  ** Close a cursor.  The read lock on the database file is released
  3741  ** when the last cursor is closed.
  3742  */
  3743  int sqlite3BtreeCloseCursor(BtCursor *pCur){
  3744    Btree *pBtree = pCur->pBtree;
  3745    if( pBtree ){
  3746      int i;
  3747      BtShared *pBt = pCur->pBt;
  3748      sqlite3BtreeEnter(pBtree);
  3749      sqlite3BtreeClearCursor(pCur);
  3750      if( pCur->pPrev ){
  3751        pCur->pPrev->pNext = pCur->pNext;
  3752      }else{
  3753        pBt->pCursor = pCur->pNext;
  3754      }
  3755      if( pCur->pNext ){
  3756        pCur->pNext->pPrev = pCur->pPrev;
  3757      }
  3758      for(i=0; i<=pCur->iPage; i++){
  3759        releasePage(pCur->apPage[i]);
  3760      }
  3761      unlockBtreeIfUnused(pBt);
  3762      sqlite3DbFree(pBtree->db, pCur->aOverflow);
  3763      /* sqlite3_free(pCur); */
  3764      sqlite3BtreeLeave(pBtree);
  3765    }
  3766    return SQLITE_OK;
  3767  }
  3768  
  3769  /*
  3770  ** Make sure the BtCursor* given in the argument has a valid
  3771  ** BtCursor.info structure.  If it is not already valid, call
  3772  ** btreeParseCell() to fill it in.
  3773  **
  3774  ** BtCursor.info is a cache of the information in the current cell.
  3775  ** Using this cache reduces the number of calls to btreeParseCell().
  3776  **
  3777  ** 2007-06-25:  There is a bug in some versions of MSVC that cause the
  3778  ** compiler to crash when getCellInfo() is implemented as a macro.
  3779  ** But there is a measureable speed advantage to using the macro on gcc
  3780  ** (when less compiler optimizations like -Os or -O0 are used and the
  3781  ** compiler is not doing aggressive inlining.)  So we use a real function
  3782  ** for MSVC and a macro for everything else.  Ticket #2457.
  3783  */
  3784  #ifndef NDEBUG
  3785    static void assertCellInfo(BtCursor *pCur){
  3786      CellInfo info;
  3787      int iPage = pCur->iPage;
  3788      memset(&info, 0, sizeof(info));
  3789      btreeParseCell(pCur->apPage[iPage], pCur->aiIdx[iPage], &info);
  3790      assert( CORRUPT_DB || memcmp(&info, &pCur->info, sizeof(info))==0 );
  3791    }
  3792  #else
  3793    #define assertCellInfo(x)
  3794  #endif
  3795  #ifdef _MSC_VER
  3796    /* Use a real function in MSVC to work around bugs in that compiler. */
  3797    static void getCellInfo(BtCursor *pCur){
  3798      if( pCur->info.nSize==0 ){
  3799        int iPage = pCur->iPage;
  3800        btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);
  3801        pCur->curFlags |= BTCF_ValidNKey;
  3802      }else{
  3803        assertCellInfo(pCur);
  3804      }
  3805    }
  3806  #else /* if not _MSC_VER */
  3807    /* Use a macro in all other compilers so that the function is inlined */
  3808  #define getCellInfo(pCur)                                                      \
  3809    if( pCur->info.nSize==0 ){                                                   \
  3810      int iPage = pCur->iPage;                                                   \
  3811      btreeParseCell(pCur->apPage[iPage],pCur->aiIdx[iPage],&pCur->info);        \
  3812      pCur->curFlags |= BTCF_ValidNKey;                                          \
  3813    }else{                                                                       \
  3814      assertCellInfo(pCur);                                                      \
  3815    }
  3816  #endif /* _MSC_VER */
  3817  
  3818  #ifndef NDEBUG  /* The next routine used only within assert() statements */
  3819  /*
  3820  ** Return true if the given BtCursor is valid.  A valid cursor is one
  3821  ** that is currently pointing to a row in a (non-empty) table.
  3822  ** This is a verification routine is used only within assert() statements.
  3823  */
  3824  int sqlite3BtreeCursorIsValid(BtCursor *pCur){
  3825    return pCur && pCur->eState==CURSOR_VALID;
  3826  }
  3827  #endif /* NDEBUG */
  3828  
  3829  /*
  3830  ** Set *pSize to the size of the buffer needed to hold the value of
  3831  ** the key for the current entry.  If the cursor is not pointing
  3832  ** to a valid entry, *pSize is set to 0. 
  3833  **
  3834  ** For a table with the INTKEY flag set, this routine returns the key
  3835  ** itself, not the number of bytes in the key.
  3836  **
  3837  ** The caller must position the cursor prior to invoking this routine.
  3838  ** 
  3839  ** This routine cannot fail.  It always returns SQLITE_OK.  
  3840  */
  3841  int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize){
  3842    assert( cursorHoldsMutex(pCur) );
  3843    assert( pCur->eState==CURSOR_INVALID || pCur->eState==CURSOR_VALID );
  3844    if( pCur->eState!=CURSOR_VALID ){
  3845      *pSize = 0;
  3846    }else{
  3847      getCellInfo(pCur);
  3848      *pSize = pCur->info.nKey;
  3849    }
  3850    return SQLITE_OK;
  3851  }
  3852  
  3853  /*
  3854  ** Set *pSize to the number of bytes of data in the entry the
  3855  ** cursor currently points to.
  3856  **
  3857  ** The caller must guarantee that the cursor is pointing to a non-NULL
  3858  ** valid entry.  In other words, the calling procedure must guarantee
  3859  ** that the cursor has Cursor.eState==CURSOR_VALID.
  3860  **
  3861  ** Failure is not possible.  This function always returns SQLITE_OK.
  3862  ** It might just as well be a procedure (returning void) but we continue
  3863  ** to return an integer result code for historical reasons.
  3864  */
  3865  int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize){
  3866    assert( cursorHoldsMutex(pCur) );
  3867    assert( pCur->eState==CURSOR_VALID );
  3868    assert( pCur->apPage[pCur->iPage]->intKeyLeaf==1 );
  3869    getCellInfo(pCur);
  3870    *pSize = pCur->info.nPayload;
  3871    return SQLITE_OK;
  3872  }
  3873  
  3874  /*
  3875  ** Given the page number of an overflow page in the database (parameter
  3876  ** ovfl), this function finds the page number of the next page in the 
  3877  ** linked list of overflow pages. If possible, it uses the auto-vacuum
  3878  ** pointer-map data instead of reading the content of page ovfl to do so. 
  3879  **
  3880  ** If an error occurs an SQLite error code is returned. Otherwise:
  3881  **
  3882  ** The page number of the next overflow page in the linked list is 
  3883  ** written to *pPgnoNext. If page ovfl is the last page in its linked 
  3884  ** list, *pPgnoNext is set to zero. 
  3885  **
  3886  ** If ppPage is not NULL, and a reference to the MemPage object corresponding
  3887  ** to page number pOvfl was obtained, then *ppPage is set to point to that
  3888  ** reference. It is the responsibility of the caller to call releasePage()
  3889  ** on *ppPage to free the reference. In no reference was obtained (because
  3890  ** the pointer-map was used to obtain the value for *pPgnoNext), then
  3891  ** *ppPage is set to zero.
  3892  */
  3893  static int getOverflowPage(
  3894    BtShared *pBt,               /* The database file */
  3895    Pgno ovfl,                   /* Current overflow page number */
  3896    MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
  3897    Pgno *pPgnoNext              /* OUT: Next overflow page number */
  3898  ){
  3899    Pgno next = 0;
  3900    MemPage *pPage = 0;
  3901    int rc = SQLITE_OK;
  3902  
  3903    assert( sqlite3_mutex_held(pBt->mutex) );
  3904    assert(pPgnoNext);
  3905  
  3906  #ifndef SQLITE_OMIT_AUTOVACUUM
  3907    /* Try to find the next page in the overflow list using the
  3908    ** autovacuum pointer-map pages. Guess that the next page in 
  3909    ** the overflow list is page number (ovfl+1). If that guess turns 
  3910    ** out to be wrong, fall back to loading the data of page 
  3911    ** number ovfl to determine the next page number.
  3912    */
  3913    if( pBt->autoVacuum ){
  3914      Pgno pgno;
  3915      Pgno iGuess = ovfl+1;
  3916      u8 eType;
  3917  
  3918      while( PTRMAP_ISPAGE(pBt, iGuess) || iGuess==PENDING_BYTE_PAGE(pBt) ){
  3919        iGuess++;
  3920      }
  3921  
  3922      if( iGuess<=btreePagecount(pBt) ){
  3923        rc = ptrmapGet(pBt, iGuess, &eType, &pgno);
  3924        if( rc==SQLITE_OK && eType==PTRMAP_OVERFLOW2 && pgno==ovfl ){
  3925          next = iGuess;
  3926          rc = SQLITE_DONE;
  3927        }
  3928      }
  3929    }
  3930  #endif
  3931  
  3932    assert( next==0 || rc==SQLITE_DONE );
  3933    if( rc==SQLITE_OK ){
  3934      rc = btreeGetPage(pBt, ovfl, &pPage, (ppPage==0) ? PAGER_GET_READONLY : 0);
  3935      assert( rc==SQLITE_OK || pPage==0 );
  3936      if( rc==SQLITE_OK ){
  3937        next = get4byte(pPage->aData);
  3938      }
  3939    }
  3940  
  3941    *pPgnoNext = next;
  3942    if( ppPage ){
  3943      *ppPage = pPage;
  3944    }else{
  3945      releasePage(pPage);
  3946    }
  3947    return (rc==SQLITE_DONE ? SQLITE_OK : rc);
  3948  }
  3949  
  3950  /*
  3951  ** Copy data from a buffer to a page, or from a page to a buffer.
  3952  **
  3953  ** pPayload is a pointer to data stored on database page pDbPage.
  3954  ** If argument eOp is false, then nByte bytes of data are copied
  3955  ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
  3956  ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
  3957  ** of data are copied from the buffer pBuf to pPayload.
  3958  **
  3959  ** SQLITE_OK is returned on success, otherwise an error code.
  3960  */
  3961  static int copyPayload(
  3962    void *pPayload,           /* Pointer to page data */
  3963    void *pBuf,               /* Pointer to buffer */
  3964    int nByte,                /* Number of bytes to copy */
  3965    int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
  3966    DbPage *pDbPage           /* Page containing pPayload */
  3967  ){
  3968    if( eOp ){
  3969      /* Copy data from buffer to page (a write operation) */
  3970      int rc = sqlite3PagerWrite(pDbPage);
  3971      if( rc!=SQLITE_OK ){
  3972        return rc;
  3973      }
  3974      memcpy(pPayload, pBuf, nByte);
  3975    }else{
  3976      /* Copy data from page to buffer (a read operation) */
  3977      memcpy(pBuf, pPayload, nByte);
  3978    }
  3979    return SQLITE_OK;
  3980  }
  3981  
  3982  /*
  3983  ** This function is used to read or overwrite payload information
  3984  ** for the entry that the pCur cursor is pointing to. The eOp
  3985  ** argument is interpreted as follows:
  3986  **
  3987  **   0: The operation is a read. Populate the overflow cache.
  3988  **   1: The operation is a write. Populate the overflow cache.
  3989  **   2: The operation is a read. Do not populate the overflow cache.
  3990  **
  3991  ** A total of "amt" bytes are read or written beginning at "offset".
  3992  ** Data is read to or from the buffer pBuf.
  3993  **
  3994  ** The content being read or written might appear on the main page
  3995  ** or be scattered out on multiple overflow pages.
  3996  **
  3997  ** If the current cursor entry uses one or more overflow pages and the
  3998  ** eOp argument is not 2, this function may allocate space for and lazily 
  3999  ** populates the overflow page-list cache array (BtCursor.aOverflow). 
  4000  ** Subsequent calls use this cache to make seeking to the supplied offset 
  4001  ** more efficient.
  4002  **
  4003  ** Once an overflow page-list cache has been allocated, it may be
  4004  ** invalidated if some other cursor writes to the same table, or if
  4005  ** the cursor is moved to a different row. Additionally, in auto-vacuum
  4006  ** mode, the following events may invalidate an overflow page-list cache.
  4007  **
  4008  **   * An incremental vacuum,
  4009  **   * A commit in auto_vacuum="full" mode,
  4010  **   * Creating a table (may require moving an overflow page).
  4011  */
  4012  static int accessPayload(
  4013    BtCursor *pCur,      /* Cursor pointing to entry to read from */
  4014    u32 offset,          /* Begin reading this far into payload */
  4015    u32 amt,             /* Read this many bytes */
  4016    unsigned char *pBuf, /* Write the bytes into this buffer */ 
  4017    int eOp              /* zero to read. non-zero to write. */
  4018  ){
  4019    unsigned char *aPayload;
  4020    int rc = SQLITE_OK;
  4021    int iIdx = 0;
  4022    MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  4023    BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
  4024  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4025    int bEnd;                                 /* True if reading to end of data */
  4026  #endif
  4027  
  4028    assert( pPage );
  4029    assert( pCur->eState==CURSOR_VALID );
  4030    assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  4031    assert( cursorHoldsMutex(pCur) );
  4032    assert( eOp!=2 || offset==0 );    /* Always start from beginning for eOp==2 */
  4033  
  4034    getCellInfo(pCur);
  4035    aPayload = pCur->info.pPayload;
  4036  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4037    bEnd = offset+amt==pCur->info.nPayload;
  4038  #endif
  4039    assert( offset+amt <= pCur->info.nPayload );
  4040  
  4041    if( &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize] ){
  4042      /* Trying to read or write past the end of the data is an error */
  4043      return SQLITE_CORRUPT_BKPT;
  4044    }
  4045  
  4046    /* Check if data must be read/written to/from the btree page itself. */
  4047    if( offset<pCur->info.nLocal ){
  4048      int a = amt;
  4049      if( a+offset>pCur->info.nLocal ){
  4050        a = pCur->info.nLocal - offset;
  4051      }
  4052      rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
  4053      offset = 0;
  4054      pBuf += a;
  4055      amt -= a;
  4056    }else{
  4057      offset -= pCur->info.nLocal;
  4058    }
  4059  
  4060    if( rc==SQLITE_OK && amt>0 ){
  4061      const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
  4062      Pgno nextPage;
  4063  
  4064      nextPage = get4byte(&aPayload[pCur->info.nLocal]);
  4065  
  4066      /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
  4067      ** Except, do not allocate aOverflow[] for eOp==2.
  4068      **
  4069      ** The aOverflow[] array is sized at one entry for each overflow page
  4070      ** in the overflow chain. The page number of the first overflow page is
  4071      ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
  4072      ** means "not yet known" (the cache is lazily populated).
  4073      */
  4074      if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
  4075        int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
  4076        if( nOvfl>pCur->nOvflAlloc ){
  4077          Pgno *aNew = (Pgno*)sqlite3DbRealloc(
  4078              pCur->pBtree->db, pCur->aOverflow, nOvfl*2*sizeof(Pgno)
  4079          );
  4080          if( aNew==0 ){
  4081            rc = SQLITE_NOMEM;
  4082          }else{
  4083            pCur->nOvflAlloc = nOvfl*2;
  4084            pCur->aOverflow = aNew;
  4085          }
  4086        }
  4087        if( rc==SQLITE_OK ){
  4088          memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
  4089          pCur->curFlags |= BTCF_ValidOvfl;
  4090        }
  4091      }
  4092  
  4093      /* If the overflow page-list cache has been allocated and the
  4094      ** entry for the first required overflow page is valid, skip
  4095      ** directly to it.
  4096      */
  4097      if( (pCur->curFlags & BTCF_ValidOvfl)!=0
  4098       && pCur->aOverflow[offset/ovflSize]
  4099      ){
  4100        iIdx = (offset/ovflSize);
  4101        nextPage = pCur->aOverflow[iIdx];
  4102        offset = (offset%ovflSize);
  4103      }
  4104  
  4105      for( ; rc==SQLITE_OK && amt>0 && nextPage; iIdx++){
  4106  
  4107        /* If required, populate the overflow page-list cache. */
  4108        if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
  4109          assert(!pCur->aOverflow[iIdx] || pCur->aOverflow[iIdx]==nextPage);
  4110          pCur->aOverflow[iIdx] = nextPage;
  4111        }
  4112  
  4113        if( offset>=ovflSize ){
  4114          /* The only reason to read this page is to obtain the page
  4115          ** number for the next page in the overflow chain. The page
  4116          ** data is not required. So first try to lookup the overflow
  4117          ** page-list cache, if any, then fall back to the getOverflowPage()
  4118          ** function.
  4119          **
  4120          ** Note that the aOverflow[] array must be allocated because eOp!=2
  4121          ** here.  If eOp==2, then offset==0 and this branch is never taken.
  4122          */
  4123          assert( eOp!=2 );
  4124          assert( pCur->curFlags & BTCF_ValidOvfl );
  4125          if( pCur->aOverflow[iIdx+1] ){
  4126            nextPage = pCur->aOverflow[iIdx+1];
  4127          }else{
  4128            rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
  4129          }
  4130          offset -= ovflSize;
  4131        }else{
  4132          /* Need to read this page properly. It contains some of the
  4133          ** range of data that is being read (eOp==0) or written (eOp!=0).
  4134          */
  4135  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4136          sqlite3_file *fd;
  4137  #endif
  4138          int a = amt;
  4139          if( a + offset > ovflSize ){
  4140            a = ovflSize - offset;
  4141          }
  4142  
  4143  #ifdef SQLITE_DIRECT_OVERFLOW_READ
  4144          /* If all the following are true:
  4145          **
  4146          **   1) this is a read operation, and 
  4147          **   2) data is required from the start of this overflow page, and
  4148          **   3) the database is file-backed, and
  4149          **   4) there is no open write-transaction, and
  4150          **   5) the database is not a WAL database,
  4151          **   6) all data from the page is being read.
  4152          **
  4153          ** then data can be read directly from the database file into the
  4154          ** output buffer, bypassing the page-cache altogether. This speeds
  4155          ** up loading large records that span many overflow pages.
  4156          */
  4157          if( (eOp&0x01)==0                                      /* (1) */
  4158           && offset==0                                          /* (2) */
  4159           && (bEnd || a==ovflSize)                              /* (6) */
  4160           && pBt->inTransaction==TRANS_READ                     /* (4) */
  4161           && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (3) */
  4162           && pBt->pPage1->aData[19]==0x01                       /* (5) */
  4163          ){
  4164            u8 aSave[4];
  4165            u8 *aWrite = &pBuf[-4];
4166 memcpy(aSave, aWrite, 4);
4167 rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1)); 4168 nextPage = get4byte(aWrite); 4169 memcpy(aWrite, aSave, 4); 4170 }else 4171 #endif 4172 4173 { 4174 DbPage *pDbPage; 4175 rc = sqlite3PagerAcquire(pBt->pPager, nextPage, &pDbPage, 4176 ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0) 4177 ); 4178 if( rc==SQLITE_OK ){ 4179 aPayload = sqlite3PagerGetData(pDbPage); 4180 nextPage = get4byte(aPayload); 4181 rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage); 4182 sqlite3PagerUnref(pDbPage); 4183 offset = 0; 4184 } 4185 } 4186 amt -= a; 4187 pBuf += a; 4188 } 4189 } 4190 } 4191 4192 if( rc==SQLITE_OK && amt>0 ){ 4193 return SQLITE_CORRUPT_BKPT; 4194 } 4195 return rc; 4196 } 4197 4198 /* 4199 ** Read part of the key associated with cursor pCur. Exactly 4200 ** "amt" bytes will be transferred into pBuf[]. The transfer 4201 ** begins at "offset". 4202 ** 4203 ** The caller must ensure that pCur is pointing to a valid row 4204 ** in the table. 4205 ** 4206 ** Return SQLITE_OK on success or an error code if anything goes 4207 ** wrong. An error is returned if "offset+amt" is larger than 4208 ** the available payload. 4209 */ 4210 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 4211 assert( cursorHoldsMutex(pCur) ); 4212 assert( pCur->eState==CURSOR_VALID ); 4213 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 4214 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4215 return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0); 4216 } 4217 4218 /* 4219 ** Read part of the data associated with cursor pCur. Exactly 4220 ** "amt" bytes will be transfered into pBuf[]. The transfer 4221 ** begins at "offset". 4222 ** 4223 ** Return SQLITE_OK on success or an error code if anything goes 4224 ** wrong. An error is returned if "offset+amt" is larger than 4225 ** the available payload. 
4226 */ 4227 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){ 4228 int rc; 4229 4230 #ifndef SQLITE_OMIT_INCRBLOB 4231 if ( pCur->eState==CURSOR_INVALID ){ 4232 return SQLITE_ABORT; 4233 } 4234 #endif 4235 4236 assert( cursorHoldsMutex(pCur) ); 4237 rc = restoreCursorPosition(pCur); 4238 if( rc==SQLITE_OK ){ 4239 assert( pCur->eState==CURSOR_VALID ); 4240 assert( pCur->iPage>=0 && pCur->apPage[pCur->iPage] ); 4241 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4242 rc = accessPayload(pCur, offset, amt, pBuf, 0); 4243 } 4244 return rc; 4245 } 4246 4247 /* 4248 ** Return a pointer to payload information from the entry that the 4249 ** pCur cursor is pointing to. The pointer is to the beginning of 4250 ** the key if index btrees (pPage->intKey==0) and is the data for 4251 ** table btrees (pPage->intKey==1). The number of bytes of available 4252 ** key/data is written into *pAmt. If *pAmt==0, then the value 4253 ** returned will not be a valid pointer. 4254 ** 4255 ** This routine is an optimization. It is common for the entire key 4256 ** and data to fit on the local page and for there to be no overflow 4257 ** pages. When that is so, this routine can be used to access the 4258 ** key and data without making a copy. If the key and/or data spills 4259 ** onto overflow pages, then accessPayload() must be used to reassemble 4260 ** the key/data and copy it into a preallocated buffer. 4261 ** 4262 ** The pointer returned by this routine looks directly into the cached 4263 ** page of the database. The data might change or move the next time 4264 ** any btree routine is called. 
4265 */ 4266 static const void *fetchPayload( 4267 BtCursor *pCur, /* Cursor pointing to entry to read from */ 4268 u32 *pAmt /* Write the number of available bytes here */ 4269 ){ 4270 assert( pCur!=0 && pCur->iPage>=0 && pCur->apPage[pCur->iPage]); 4271 assert( pCur->eState==CURSOR_VALID ); 4272 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4273 assert( cursorHoldsMutex(pCur) ); 4274 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4275 assert( pCur->info.nSize>0 ); 4276 *pAmt = pCur->info.nLocal; 4277 return (void*)pCur->info.pPayload; 4278 } 4279 4280 4281 /* 4282 ** For the entry that cursor pCur is point to, return as 4283 ** many bytes of the key or data as are available on the local 4284 ** b-tree page. Write the number of available bytes into *pAmt. 4285 ** 4286 ** The pointer returned is ephemeral. The key/data may move 4287 ** or be destroyed on the next call to any Btree routine, 4288 ** including calls from other threads against the same cache. 4289 ** Hence, a mutex on the BtShared should be held prior to calling 4290 ** this routine. 4291 ** 4292 ** These routines is used to get quick access to key and data 4293 ** in the common case where no overflow pages are used. 4294 */ 4295 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, u32 *pAmt){ 4296 return fetchPayload(pCur, pAmt); 4297 } 4298 const void *sqlite3BtreeDataFetch(BtCursor *pCur, u32 *pAmt){ 4299 return fetchPayload(pCur, pAmt); 4300 } 4301 4302 4303 /* 4304 ** Move the cursor down to a new child page. The newPgno argument is the 4305 ** page number of the child page to move to. 4306 ** 4307 ** This function returns SQLITE_CORRUPT if the page-header flags field of 4308 ** the new child page does not match the flags field of the parent (i.e. 4309 ** if an intkey page appears to be the parent of a non-intkey page, or 4310 ** vice-versa). 
4311 */ 4312 static int moveToChild(BtCursor *pCur, u32 newPgno){ 4313 int rc; 4314 int i = pCur->iPage; 4315 MemPage *pNewPage; 4316 BtShared *pBt = pCur->pBt; 4317 4318 assert( cursorHoldsMutex(pCur) ); 4319 assert( pCur->eState==CURSOR_VALID ); 4320 assert( pCur->iPage<BTCURSOR_MAX_DEPTH ); 4321 assert( pCur->iPage>=0 ); 4322 if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){ 4323 return SQLITE_CORRUPT_BKPT; 4324 } 4325 rc = getAndInitPage(pBt, newPgno, &pNewPage, 4326 (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0); 4327 if( rc ) return rc; 4328 pCur->apPage[i+1] = pNewPage; 4329 pCur->aiIdx[i+1] = 0; 4330 pCur->iPage++; 4331 4332 pCur->info.nSize = 0; 4333 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 4334 if( pNewPage->nCell<1 || pNewPage->intKey!=pCur->apPage[i]->intKey ){ 4335 return SQLITE_CORRUPT_BKPT; 4336 } 4337 return SQLITE_OK; 4338 } 4339 4340 #if 0 4341 /* 4342 ** Page pParent is an internal (non-leaf) tree page. This function 4343 ** asserts that page number iChild is the left-child if the iIdx'th 4344 ** cell in page pParent. Or, if iIdx is equal to the total number of 4345 ** cells in pParent, that page number iChild is the right-child of 4346 ** the page. 4347 */ 4348 static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){ 4349 assert( iIdx<=pParent->nCell ); 4350 if( iIdx==pParent->nCell ){ 4351 assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild ); 4352 }else{ 4353 assert( get4byte(findCell(pParent, iIdx))==iChild ); 4354 } 4355 } 4356 #else 4357 # define assertParentIndex(x,y,z) 4358 #endif 4359 4360 /* 4361 ** Move the cursor up to the parent page. 4362 ** 4363 ** pCur->idx is set to the cell index that contains the pointer 4364 ** to the page we are coming from. If we are coming from the 4365 ** right-most child page then pCur->idx is set to one more than 4366 ** the largest cell index. 
4367 */ 4368 static void moveToParent(BtCursor *pCur){ 4369 assert( cursorHoldsMutex(pCur) ); 4370 assert( pCur->eState==CURSOR_VALID ); 4371 assert( pCur->iPage>0 ); 4372 assert( pCur->apPage[pCur->iPage] ); 4373 4374 /* UPDATE: It is actually possible for the condition tested by the assert 4375 ** below to be untrue if the database file is corrupt. This can occur if 4376 ** one cursor has modified page pParent while a reference to it is held 4377 ** by a second cursor. Which can only happen if a single page is linked 4378 ** into more than one b-tree structure in a corrupt database. */ 4379 #if 0 4380 assertParentIndex( 4381 pCur->apPage[pCur->iPage-1], 4382 pCur->aiIdx[pCur->iPage-1], 4383 pCur->apPage[pCur->iPage]->pgno 4384 ); 4385 #endif 4386 testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell ); 4387 4388 releasePage(pCur->apPage[pCur->iPage]); 4389 pCur->iPage--; 4390 pCur->info.nSize = 0; 4391 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 4392 } 4393 4394 /* 4395 ** Move the cursor to point to the root page of its b-tree structure. 4396 ** 4397 ** If the table has a virtual root page, then the cursor is moved to point 4398 ** to the virtual root page instead of the actual root page. A table has a 4399 ** virtual root page when the actual root page contains no cells and a 4400 ** single child page. This can only happen with the table rooted at page 1. 4401 ** 4402 ** If the b-tree structure is empty, the cursor state is set to 4403 ** CURSOR_INVALID. Otherwise, the cursor is set to point to the first 4404 ** cell located on the root (or virtual root) page and the cursor state 4405 ** is set to CURSOR_VALID. 4406 ** 4407 ** If this function returns successfully, it may be assumed that the 4408 ** page-header flags indicate that the [virtual] root-page is the expected 4409 ** kind of b-tree page (i.e. 
if when opening the cursor the caller did not 4410 ** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D, 4411 ** indicating a table b-tree, or if the caller did specify a KeyInfo 4412 ** structure the flags byte is set to 0x02 or 0x0A, indicating an index 4413 ** b-tree). 4414 */ 4415 static int moveToRoot(BtCursor *pCur){ 4416 MemPage *pRoot; 4417 int rc = SQLITE_OK; 4418 4419 assert( cursorHoldsMutex(pCur) ); 4420 assert( CURSOR_INVALID < CURSOR_REQUIRESEEK ); 4421 assert( CURSOR_VALID < CURSOR_REQUIRESEEK ); 4422 assert( CURSOR_FAULT > CURSOR_REQUIRESEEK ); 4423 if( pCur->eState>=CURSOR_REQUIRESEEK ){ 4424 if( pCur->eState==CURSOR_FAULT ){ 4425 assert( pCur->skipNext!=SQLITE_OK ); 4426 return pCur->skipNext; 4427 } 4428 sqlite3BtreeClearCursor(pCur); 4429 } 4430 4431 if( pCur->iPage>=0 ){ 4432 while( pCur->iPage ) releasePage(pCur->apPage[pCur->iPage--]); 4433 }else if( pCur->pgnoRoot==0 ){ 4434 pCur->eState = CURSOR_INVALID; 4435 return SQLITE_OK; 4436 }else{ 4437 rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->apPage[0], 4438 (pCur->curFlags & BTCF_WriteFlag)==0 ? PAGER_GET_READONLY : 0); 4439 if( rc!=SQLITE_OK ){ 4440 pCur->eState = CURSOR_INVALID; 4441 return rc; 4442 } 4443 pCur->iPage = 0; 4444 } 4445 pRoot = pCur->apPage[0]; 4446 assert( pRoot->pgno==pCur->pgnoRoot ); 4447 4448 /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor 4449 ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is 4450 ** NULL, the caller expects a table b-tree. If this is not the case, 4451 ** return an SQLITE_CORRUPT error. 4452 ** 4453 ** Earlier versions of SQLite assumed that this test could not fail 4454 ** if the root page was already loaded when this function was called (i.e. 4455 ** if pCur->iPage>=0). But this is not so if the database is corrupted 4456 ** in such a way that page pRoot is linked into a second b-tree table 4457 ** (or the freelist). 
*/ 4458 assert( pRoot->intKey==1 || pRoot->intKey==0 ); 4459 if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){ 4460 return SQLITE_CORRUPT_BKPT; 4461 } 4462 4463 pCur->aiIdx[0] = 0; 4464 pCur->info.nSize = 0; 4465 pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl); 4466 4467 if( pRoot->nCell>0 ){ 4468 pCur->eState = CURSOR_VALID; 4469 }else if( !pRoot->leaf ){ 4470 Pgno subpage; 4471 if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT; 4472 subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]); 4473 pCur->eState = CURSOR_VALID; 4474 rc = moveToChild(pCur, subpage); 4475 }else{ 4476 pCur->eState = CURSOR_INVALID; 4477 } 4478 return rc; 4479 } 4480 4481 /* 4482 ** Move the cursor down to the left-most leaf entry beneath the 4483 ** entry to which it is currently pointing. 4484 ** 4485 ** The left-most leaf is the one with the smallest key - the first 4486 ** in ascending order. 4487 */ 4488 static int moveToLeftmost(BtCursor *pCur){ 4489 Pgno pgno; 4490 int rc = SQLITE_OK; 4491 MemPage *pPage; 4492 4493 assert( cursorHoldsMutex(pCur) ); 4494 assert( pCur->eState==CURSOR_VALID ); 4495 while( rc==SQLITE_OK && !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4496 assert( pCur->aiIdx[pCur->iPage]<pPage->nCell ); 4497 pgno = get4byte(findCell(pPage, pCur->aiIdx[pCur->iPage])); 4498 rc = moveToChild(pCur, pgno); 4499 } 4500 return rc; 4501 } 4502 4503 /* 4504 ** Move the cursor down to the right-most leaf entry beneath the 4505 ** page to which it is currently pointing. Notice the difference 4506 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 4507 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 4508 ** finds the right-most entry beneath the *page*. 4509 ** 4510 ** The right-most entry is the one with the largest key - the last 4511 ** key in ascending order. 
4512 */ 4513 static int moveToRightmost(BtCursor *pCur){ 4514 Pgno pgno; 4515 int rc = SQLITE_OK; 4516 MemPage *pPage = 0; 4517 4518 assert( cursorHoldsMutex(pCur) ); 4519 assert( pCur->eState==CURSOR_VALID ); 4520 while( !(pPage = pCur->apPage[pCur->iPage])->leaf ){ 4521 pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4522 pCur->aiIdx[pCur->iPage] = pPage->nCell; 4523 rc = moveToChild(pCur, pgno); 4524 if( rc ) return rc; 4525 } 4526 pCur->aiIdx[pCur->iPage] = pPage->nCell-1; 4527 assert( pCur->info.nSize==0 ); 4528 assert( (pCur->curFlags & BTCF_ValidNKey)==0 ); 4529 return SQLITE_OK; 4530 } 4531 4532 /* Move the cursor to the first entry in the table. Return SQLITE_OK 4533 ** on success. Set *pRes to 0 if the cursor actually points to something 4534 ** or set *pRes to 1 if the table is empty. 4535 */ 4536 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){ 4537 int rc; 4538 4539 assert( cursorHoldsMutex(pCur) ); 4540 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4541 rc = moveToRoot(pCur); 4542 if( rc==SQLITE_OK ){ 4543 if( pCur->eState==CURSOR_INVALID ){ 4544 assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 ); 4545 *pRes = 1; 4546 }else{ 4547 assert( pCur->apPage[pCur->iPage]->nCell>0 ); 4548 *pRes = 0; 4549 rc = moveToLeftmost(pCur); 4550 } 4551 } 4552 return rc; 4553 } 4554 4555 /* Move the cursor to the last entry in the table. Return SQLITE_OK 4556 ** on success. Set *pRes to 0 if the cursor actually points to something 4557 ** or set *pRes to 1 if the table is empty. 4558 */ 4559 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){ 4560 int rc; 4561 4562 assert( cursorHoldsMutex(pCur) ); 4563 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4564 4565 /* If the cursor already points to the last entry, this is a no-op. 
*/ 4566 if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){ 4567 #ifdef SQLITE_DEBUG 4568 /* This block serves to assert() that the cursor really does point 4569 ** to the last entry in the b-tree. */ 4570 int ii; 4571 for(ii=0; ii<pCur->iPage; ii++){ 4572 assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell ); 4573 } 4574 assert( pCur->aiIdx[pCur->iPage]==pCur->apPage[pCur->iPage]->nCell-1 ); 4575 assert( pCur->apPage[pCur->iPage]->leaf ); 4576 #endif 4577 return SQLITE_OK; 4578 } 4579 4580 rc = moveToRoot(pCur); 4581 if( rc==SQLITE_OK ){ 4582 if( CURSOR_INVALID==pCur->eState ){ 4583 assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 ); 4584 *pRes = 1; 4585 }else{ 4586 assert( pCur->eState==CURSOR_VALID ); 4587 *pRes = 0; 4588 rc = moveToRightmost(pCur); 4589 if( rc==SQLITE_OK ){ 4590 pCur->curFlags |= BTCF_AtLast; 4591 }else{ 4592 pCur->curFlags &= ~BTCF_AtLast; 4593 } 4594 4595 } 4596 } 4597 return rc; 4598 } 4599 4600 /* Move the cursor so that it points to an entry near the key 4601 ** specified by pIdxKey or intKey. Return a success code. 4602 ** 4603 ** For INTKEY tables, the intKey parameter is used. pIdxKey 4604 ** must be NULL. For index tables, pIdxKey is used and intKey 4605 ** is ignored. 4606 ** 4607 ** If an exact match is not found, then the cursor is always 4608 ** left pointing at a leaf page which would hold the entry if it 4609 ** were present. The cursor might point to an entry that comes 4610 ** before or after the key. 4611 ** 4612 ** An integer is written into *pRes which is the result of 4613 ** comparing the key with the entry to which the cursor is 4614 ** pointing. The meaning of the integer written into 4615 ** *pRes is as follows: 4616 ** 4617 ** *pRes<0 The cursor is left pointing at an entry that 4618 ** is smaller than intKey/pIdxKey or if the table is empty 4619 ** and the cursor is therefore left point to nothing. 
4620 ** 4621 ** *pRes==0 The cursor is left pointing at an entry that 4622 ** exactly matches intKey/pIdxKey. 4623 ** 4624 ** *pRes>0 The cursor is left pointing at an entry that 4625 ** is larger than intKey/pIdxKey. 4626 ** 4627 */ 4628 int sqlite3BtreeMovetoUnpacked( 4629 BtCursor *pCur, /* The cursor to be moved */ 4630 UnpackedRecord *pIdxKey, /* Unpacked index key */ 4631 i64 intKey, /* The table key */ 4632 int biasRight, /* If true, bias the search to the high end */ 4633 int *pRes /* Write search results here */ 4634 ){ 4635 int rc; 4636 RecordCompare xRecordCompare; 4637 4638 assert( cursorHoldsMutex(pCur) ); 4639 assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) ); 4640 assert( pRes ); 4641 assert( (pIdxKey==0)==(pCur->pKeyInfo==0) ); 4642 4643 /* If the cursor is already positioned at the point we are trying 4644 ** to move to, then just return without doing any work */ 4645 if( pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0 4646 && pCur->apPage[0]->intKey 4647 ){ 4648 if( pCur->info.nKey==intKey ){ 4649 *pRes = 0; 4650 return SQLITE_OK; 4651 } 4652 if( (pCur->curFlags & BTCF_AtLast)!=0 && pCur->info.nKey<intKey ){ 4653 *pRes = -1; 4654 return SQLITE_OK; 4655 } 4656 } 4657 4658 if( pIdxKey ){ 4659 xRecordCompare = sqlite3VdbeFindCompare(pIdxKey); 4660 pIdxKey->errCode = 0; 4661 assert( pIdxKey->default_rc==1 4662 || pIdxKey->default_rc==0 4663 || pIdxKey->default_rc==-1 4664 ); 4665 }else{ 4666 xRecordCompare = 0; /* All keys are integers */ 4667 } 4668 4669 rc = moveToRoot(pCur); 4670 if( rc ){ 4671 return rc; 4672 } 4673 assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage] ); 4674 assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->isInit ); 4675 assert( pCur->eState==CURSOR_INVALID || pCur->apPage[pCur->iPage]->nCell>0 ); 4676 if( pCur->eState==CURSOR_INVALID ){ 4677 *pRes = -1; 4678 assert( pCur->pgnoRoot==0 || pCur->apPage[pCur->iPage]->nCell==0 ); 4679 return SQLITE_OK; 4680 } 4681 assert( pCur->apPage[0]->intKey || 
pIdxKey ); 4682 for(;;){ 4683 int lwr, upr, idx, c; 4684 Pgno chldPg; 4685 MemPage *pPage = pCur->apPage[pCur->iPage]; 4686 u8 *pCell; /* Pointer to current cell in pPage */ 4687 4688 /* pPage->nCell must be greater than zero. If this is the root-page 4689 ** the cursor would have been INVALID above and this for(;;) loop 4690 ** not run. If this is not the root-page, then the moveToChild() routine 4691 ** would have already detected db corruption. Similarly, pPage must 4692 ** be the right kind (index or table) of b-tree page. Otherwise 4693 ** a moveToChild() or moveToRoot() call would have detected corruption. */ 4694 assert( pPage->nCell>0 ); 4695 assert( pPage->intKey==(pIdxKey==0) ); 4696 lwr = 0; 4697 upr = pPage->nCell-1; 4698 assert( biasRight==0 || biasRight==1 ); 4699 idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */ 4700 pCur->aiIdx[pCur->iPage] = (u16)idx; 4701 if( xRecordCompare==0 ){ 4702 for(;;){ 4703 i64 nCellKey; 4704 pCell = findCell(pPage, idx) + pPage->childPtrSize; 4705 if( pPage->intKeyLeaf ){ 4706 while( 0x80 <= *(pCell++) ){ 4707 if( pCell>=pPage->aDataEnd ) return SQLITE_CORRUPT_BKPT; 4708 } 4709 } 4710 getVarint(pCell, (u64*)&nCellKey); 4711 if( nCellKey<intKey ){ 4712 lwr = idx+1; 4713 if( lwr>upr ){ c = -1; break; } 4714 }else if( nCellKey>intKey ){ 4715 upr = idx-1; 4716 if( lwr>upr ){ c = +1; break; } 4717 }else{ 4718 assert( nCellKey==intKey ); 4719 pCur->curFlags |= BTCF_ValidNKey; 4720 pCur->info.nKey = nCellKey; 4721 pCur->aiIdx[pCur->iPage] = (u16)idx; 4722 if( !pPage->leaf ){ 4723 lwr = idx; 4724 goto moveto_next_layer; 4725 }else{ 4726 *pRes = 0; 4727 rc = SQLITE_OK; 4728 goto moveto_finish; 4729 } 4730 } 4731 assert( lwr+upr>=0 ); 4732 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2; */ 4733 } 4734 }else{ 4735 for(;;){ 4736 int nCell; 4737 pCell = findCell(pPage, idx) + pPage->childPtrSize; 4738 4739 /* The maximum supported page-size is 65536 bytes. 
This means that 4740 ** the maximum number of record bytes stored on an index B-Tree 4741 ** page is less than 16384 bytes and may be stored as a 2-byte 4742 ** varint. This information is used to attempt to avoid parsing 4743 ** the entire cell by checking for the cases where the record is 4744 ** stored entirely within the b-tree page by inspecting the first 4745 ** 2 bytes of the cell. 4746 */ 4747 nCell = pCell[0]; 4748 if( nCell<=pPage->max1bytePayload ){ 4749 /* This branch runs if the record-size field of the cell is a 4750 ** single byte varint and the record fits entirely on the main 4751 ** b-tree page. */ 4752 testcase( pCell+nCell+1==pPage->aDataEnd ); 4753 c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey); 4754 }else if( !(pCell[1] & 0x80) 4755 && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal 4756 ){ 4757 /* The record-size field is a 2 byte varint and the record 4758 ** fits entirely on the main b-tree page. */ 4759 testcase( pCell+nCell+2==pPage->aDataEnd ); 4760 c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey); 4761 }else{ 4762 /* The record flows over onto one or more overflow pages. In 4763 ** this case the whole cell needs to be parsed, a buffer allocated 4764 ** and accessPayload() used to retrieve the record into the 4765 ** buffer before VdbeRecordCompare() can be called. 
*/ 4766 void *pCellKey; 4767 u8 * const pCellBody = pCell - pPage->childPtrSize; 4768 btreeParseCellPtr(pPage, pCellBody, &pCur->info); 4769 nCell = (int)pCur->info.nKey; 4770 pCellKey = sqlite3Malloc( nCell ); 4771 if( pCellKey==0 ){ 4772 rc = SQLITE_NOMEM; 4773 goto moveto_finish; 4774 } 4775 pCur->aiIdx[pCur->iPage] = (u16)idx; 4776 rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2); 4777 if( rc ){ 4778 sqlite3_free(pCellKey); 4779 goto moveto_finish; 4780 } 4781 c = xRecordCompare(nCell, pCellKey, pIdxKey); 4782 sqlite3_free(pCellKey); 4783 } 4784 assert( 4785 (pIdxKey->errCode!=SQLITE_CORRUPT || c==0) 4786 && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed) 4787 ); 4788 if( c<0 ){ 4789 lwr = idx+1; 4790 }else if( c>0 ){ 4791 upr = idx-1; 4792 }else{ 4793 assert( c==0 ); 4794 *pRes = 0; 4795 rc = SQLITE_OK; 4796 pCur->aiIdx[pCur->iPage] = (u16)idx; 4797 if( pIdxKey->errCode ) rc = SQLITE_CORRUPT; 4798 goto moveto_finish; 4799 } 4800 if( lwr>upr ) break; 4801 assert( lwr+upr>=0 ); 4802 idx = (lwr+upr)>>1; /* idx = (lwr+upr)/2 */ 4803 } 4804 } 4805 assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) ); 4806 assert( pPage->isInit ); 4807 if( pPage->leaf ){ 4808 assert( pCur->aiIdx[pCur->iPage]<pCur->apPage[pCur->iPage]->nCell ); 4809 pCur->aiIdx[pCur->iPage] = (u16)idx; 4810 *pRes = c; 4811 rc = SQLITE_OK; 4812 goto moveto_finish; 4813 } 4814 moveto_next_layer: 4815 if( lwr>=pPage->nCell ){ 4816 chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]); 4817 }else{ 4818 chldPg = get4byte(findCell(pPage, lwr)); 4819 } 4820 pCur->aiIdx[pCur->iPage] = (u16)lwr; 4821 rc = moveToChild(pCur, chldPg); 4822 if( rc ) break; 4823 } 4824 moveto_finish: 4825 pCur->info.nSize = 0; 4826 pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl); 4827 return rc; 4828 } 4829 4830 4831 /* 4832 ** Return TRUE if the cursor is not pointing at an entry of the table. 
4833 ** 4834 ** TRUE will be returned after a call to sqlite3BtreeNext() moves 4835 ** past the last entry in the table or sqlite3BtreePrev() moves past 4836 ** the first entry. TRUE is also returned if the table is empty. 4837 */ 4838 int sqlite3BtreeEof(BtCursor *pCur){ 4839 /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries 4840 ** have been deleted? This API will need to change to return an error code 4841 ** as well as the boolean result value. 4842 */ 4843 return (CURSOR_VALID!=pCur->eState); 4844 } 4845 4846 /* 4847 ** Advance the cursor to the next entry in the database. If 4848 ** successful then set *pRes=0. If the cursor 4849 ** was already pointing to the last entry in the database before 4850 ** this routine was called, then set *pRes=1. 4851 ** 4852 ** The main entry point is sqlite3BtreeNext(). That routine is optimized 4853 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx 4854 ** to the next cell on the current page. The (slower) btreeNext() helper 4855 ** routine is called when it is necessary to move to a different page or 4856 ** to restore the cursor. 4857 ** 4858 ** The calling function will set *pRes to 0 or 1. The initial *pRes value 4859 ** will be 1 if the cursor being stepped corresponds to an SQL index and 4860 ** if this routine could have been skipped if that SQL index had been 4861 ** a unique index. Otherwise the caller will have set *pRes to zero. 4862 ** Zero is the common case. The btree implementation is free to use the 4863 ** initial *pRes value as a hint to improve performance, but the current 4864 ** SQLite btree implementation does not. (Note that the comdb2 btree 4865 ** implementation does use this hint, however.) 
**
** Both routines return SQLITE_OK, or the error code reported by
** restoreCursorPosition(), moveToChild() or moveToLeftmost().
*/
static SQLITE_NOINLINE int btreeNext(BtCursor *pCur, int *pRes){
  int rc;
  int idx;
  MemPage *pPage;

  assert( cursorHoldsMutex(pCur) );
  assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  assert( *pRes==0 );
  if( pCur->eState!=CURSOR_VALID ){
    /* The cursor is not on an entry: try to restore its position first */
    assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
    rc = restoreCursorPosition(pCur);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    if( CURSOR_INVALID==pCur->eState ){
      /* Still not on an entry after restoring: report end-of-table */
      *pRes = 1;
      return SQLITE_OK;
    }
    if( pCur->skipNext ){
      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
      pCur->eState = CURSOR_VALID;
      /* NOTE(review): a positive skipNext presumably means the restored
      ** position already is the "next" entry, so no further step is
      ** taken -- confirm against restoreCursorPosition(). */
      if( pCur->skipNext>0 ){
        pCur->skipNext = 0;
        return SQLITE_OK;
      }
      pCur->skipNext = 0;
    }
  }

  pPage = pCur->apPage[pCur->iPage];
  idx = ++pCur->aiIdx[pCur->iPage];
  assert( pPage->isInit );

  /* If the database file is corrupt, it is possible for the value of idx 
  ** to be invalid here. This can only occur if a second cursor modifies
  ** the page while cursor pCur is holding a reference to it. Which can
  ** only happen if the database is corrupt in such a way as to link the
  ** page into more than one b-tree structure. */
  testcase( idx>pPage->nCell );

  if( idx>=pPage->nCell ){
    /* Stepped past the last cell of the current page */
    if( !pPage->leaf ){
      /* Interior page: continue through the child pointer kept in the
      ** page header (offset 8), then go to that subtree's leftmost entry */
      rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
      if( rc ) return rc;
      return moveToLeftmost(pCur);
    }
    /* Leaf page: climb until an ancestor still has an unvisited cell.
    ** Reaching the root without finding one means the end of the table. */
    do{
      if( pCur->iPage==0 ){
        *pRes = 1;
        pCur->eState = CURSOR_INVALID;
        return SQLITE_OK;
      }
      moveToParent(pCur);
      pPage = pCur->apPage[pCur->iPage];
    }while( pCur->aiIdx[pCur->iPage]>=pPage->nCell );
    /* NOTE(review): for intKey trees the ancestor cell is apparently not
    ** an entry in its own right, so step once more; for other trees the
    ** cursor stays on the ancestor cell -- confirm. */
    if( pPage->intKey ){
      return sqlite3BtreeNext(pCur, pRes);
    }else{
      return SQLITE_OK;
    }
  }
  if( pPage->leaf ){
    return SQLITE_OK;
  }else{
    return moveToLeftmost(pCur);
  }
}
int sqlite3BtreeNext(BtCursor *pCur, int *pRes){
  MemPage *pPage;
  assert( cursorHoldsMutex(pCur) );
  assert( pRes!=0 );
  assert( *pRes==0 || *pRes==1 );
  assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  /* Moving the cursor invalidates any cached cell information */
  pCur->info.nSize = 0;
  pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
  *pRes = 0;
  if( pCur->eState!=CURSOR_VALID ) return btreeNext(pCur, pRes);
  pPage = pCur->apPage[pCur->iPage];
  if( (++pCur->aiIdx[pCur->iPage])>=pPage->nCell ){
    /* Off the end of this page.  Undo the increment, because btreeNext()
    ** performs its own increment, and let the slow path handle it. */
    pCur->aiIdx[pCur->iPage]--;
    return btreeNext(pCur, pRes);
  }
  if( pPage->leaf ){
    return SQLITE_OK;
  }else{
    return moveToLeftmost(pCur);
  }
}

/*
** Step the cursor back to the previous entry in the database.  If
** successful then set *pRes=0.  If the cursor
** was already pointing to the first entry in the database before
** this routine was called, then set *pRes=1.
**
** The main entry point is sqlite3BtreePrevious().  That routine is optimized
** for the common case of merely decrementing the cell counter BtCursor.aiIdx
** to the previous cell on the current page.
** The (slower) btreePrevious()
** helper routine is called when it is necessary to move to a different page
** or to restore the cursor.
**
** The calling function will set *pRes to 0 or 1.  The initial *pRes value
** will be 1 if the cursor being stepped corresponds to an SQL index and
** if this routine could have been skipped if that SQL index had been
** a unique index.  Otherwise the caller will have set *pRes to zero.
** Zero is the common case. The btree implementation is free to use the
** initial *pRes value as a hint to improve performance, but the current
** SQLite btree implementation does not.  (Note that the comdb2 btree
** implementation does use this hint, however.)
**
** Both routines return SQLITE_OK, or the error code reported by
** restoreCursorPosition(), moveToChild() or moveToRightmost().
*/
static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur, int *pRes){
  int rc;
  MemPage *pPage;

  assert( cursorHoldsMutex(pCur) );
  assert( pRes!=0 );
  assert( *pRes==0 );
  assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
  assert( pCur->info.nSize==0 );
  if( pCur->eState!=CURSOR_VALID ){
    /* The cursor is not on an entry: try to restore its position first */
    rc = restoreCursorPosition(pCur);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    if( CURSOR_INVALID==pCur->eState ){
      /* Still not on an entry after restoring: report start-of-table */
      *pRes = 1;
      return SQLITE_OK;
    }
    if( pCur->skipNext ){
      assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_SKIPNEXT );
      pCur->eState = CURSOR_VALID;
      /* NOTE(review): a negative skipNext presumably means the restored
      ** position already is the "previous" entry, so no further step is
      ** taken -- confirm against restoreCursorPosition(). */
      if( pCur->skipNext<0 ){
        pCur->skipNext = 0;
        return SQLITE_OK;
      }
      pCur->skipNext = 0;
    }
  }

  pPage = pCur->apPage[pCur->iPage];
  assert( pPage->isInit );
  if( !pPage->leaf ){
    /* On an interior cell: descend into that cell's child and take the
    ** rightmost entry of the subtree */
    int idx = pCur->aiIdx[pCur->iPage];
    rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
    if( rc ) return rc;
    rc = moveToRightmost(pCur);
  }else{
    /* On a leaf: climb while sitting on the first cell of a page.
    ** Reaching the root that way means there is no previous entry. */
    while( pCur->aiIdx[pCur->iPage]==0 ){
      if( pCur->iPage==0 ){
        pCur->eState = CURSOR_INVALID;
        *pRes = 1;
        return SQLITE_OK;
      }
      moveToParent(pCur);
    }
    assert( pCur->info.nSize==0 );
    assert( (pCur->curFlags & (BTCF_ValidNKey|BTCF_ValidOvfl))==0 );

    pCur->aiIdx[pCur->iPage]--;
    pPage = pCur->apPage[pCur->iPage];
    /* NOTE(review): an interior cell of an intKey tree is apparently not
    ** an entry in its own right, so step once more -- confirm. */
    if( pPage->intKey && !pPage->leaf ){
      rc = sqlite3BtreePrevious(pCur, pRes);
    }else{
      rc = SQLITE_OK;
    }
  }
  return rc;
}
int sqlite3BtreePrevious(BtCursor *pCur, int *pRes){
  assert( cursorHoldsMutex(pCur) );
  assert( pRes!=0 );
  assert( *pRes==0 || *pRes==1 );
  assert( pCur->skipNext==0 || pCur->eState!=CURSOR_VALID );
  *pRes = 0;
  /* Moving the cursor invalidates the at-last marker and cached cell info */
  pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
  pCur->info.nSize = 0;
  /* Take the slow path unless the cursor is valid, sits on a leaf page,
  ** and is not on the first cell of that page. */
  if( pCur->eState!=CURSOR_VALID
   || pCur->aiIdx[pCur->iPage]==0
   || pCur->apPage[pCur->iPage]->leaf==0
  ){
    return btreePrevious(pCur, pRes);
  }
  pCur->aiIdx[pCur->iPage]--;
  return SQLITE_OK;
}

/*
** Allocate a new page from the database file.
**
** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
** has already been called on the new page.)  The new page has also
** been referenced and the calling routine is responsible for calling
** sqlite3PagerUnref() on the new page when it is done.
**
** SQLITE_OK is returned on success.  Any other return value indicates
** an error.  *ppPage and *pPgno are undefined in the event of an error.
** Do not invoke sqlite3PagerUnref() on *ppPage if an error is returned.
**
** If the "nearby" parameter is not 0, then an effort is made to
** locate a page close to the page number "nearby".  This can be used in an
** attempt to keep related pages close to each other in the database file,
** which in turn can make database access faster.
5070 ** 5071 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists 5072 ** anywhere on the free-list, then it is guaranteed to be returned. If 5073 ** eMode is BTALLOC_LT then the page returned will be less than or equal 5074 ** to nearby if any such page exists. If eMode is BTALLOC_ANY then there 5075 ** are no restrictions on which page is returned. 5076 */ 5077 static int allocateBtreePage( 5078 BtShared *pBt, /* The btree */ 5079 MemPage **ppPage, /* Store pointer to the allocated page here */ 5080 Pgno *pPgno, /* Store the page number here */ 5081 Pgno nearby, /* Search for a page near this one */ 5082 u8 eMode /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */ 5083 ){ 5084 MemPage *pPage1; 5085 int rc; 5086 u32 n; /* Number of pages on the freelist */ 5087 u32 k; /* Number of leaves on the trunk of the freelist */ 5088 MemPage *pTrunk = 0; 5089 MemPage *pPrevTrunk = 0; 5090 Pgno mxPage; /* Total size of the database file */ 5091 5092 assert( sqlite3_mutex_held(pBt->mutex) ); 5093 assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) ); 5094 pPage1 = pBt->pPage1; 5095 mxPage = btreePagecount(pBt); 5096 n = get4byte(&pPage1->aData[36]); 5097 testcase( n==mxPage-1 ); 5098 if( n>=mxPage ){ 5099 return SQLITE_CORRUPT_BKPT; 5100 } 5101 if( n>0 ){ 5102 /* There are pages on the freelist. Reuse one of those pages. */ 5103 Pgno iTrunk; 5104 u8 searchList = 0; /* If the free-list must be searched for 'nearby' */ 5105 5106 /* If eMode==BTALLOC_EXACT and a query of the pointer-map 5107 ** shows that the page 'nearby' is somewhere on the free-list, then 5108 ** the entire-list will be searched for that page. 
5109 */ 5110 #ifndef SQLITE_OMIT_AUTOVACUUM 5111 if( eMode==BTALLOC_EXACT ){ 5112 if( nearby<=mxPage ){ 5113 u8 eType; 5114 assert( nearby>0 ); 5115 assert( pBt->autoVacuum ); 5116 rc = ptrmapGet(pBt, nearby, &eType, 0); 5117 if( rc ) return rc; 5118 if( eType==PTRMAP_FREEPAGE ){ 5119 searchList = 1; 5120 } 5121 } 5122 }else if( eMode==BTALLOC_LE ){ 5123 searchList = 1; 5124 } 5125 #endif 5126 5127 /* Decrement the free-list count by 1. Set iTrunk to the index of the 5128 ** first free-list trunk page. iPrevTrunk is initially 1. 5129 */ 5130 rc = sqlite3PagerWrite(pPage1->pDbPage); 5131 if( rc ) return rc; 5132 put4byte(&pPage1->aData[36], n-1); 5133 5134 /* The code within this loop is run only once if the 'searchList' variable 5135 ** is not true. Otherwise, it runs once for each trunk-page on the 5136 ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT) 5137 ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT) 5138 */ 5139 do { 5140 pPrevTrunk = pTrunk; 5141 if( pPrevTrunk ){ 5142 iTrunk = get4byte(&pPrevTrunk->aData[0]); 5143 }else{ 5144 iTrunk = get4byte(&pPage1->aData[32]); 5145 } 5146 testcase( iTrunk==mxPage ); 5147 if( iTrunk>mxPage ){ 5148 rc = SQLITE_CORRUPT_BKPT; 5149 }else{ 5150 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 5151 } 5152 if( rc ){ 5153 pTrunk = 0; 5154 goto end_allocate_page; 5155 } 5156 assert( pTrunk!=0 ); 5157 assert( pTrunk->aData!=0 ); 5158 5159 k = get4byte(&pTrunk->aData[4]); /* # of leaves on this trunk page */ 5160 if( k==0 && !searchList ){ 5161 /* The trunk has no leaves and the list is not being searched. 
5162 ** So extract the trunk page itself and use it as the newly 5163 ** allocated page */ 5164 assert( pPrevTrunk==0 ); 5165 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5166 if( rc ){ 5167 goto end_allocate_page; 5168 } 5169 *pPgno = iTrunk; 5170 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 5171 *ppPage = pTrunk; 5172 pTrunk = 0; 5173 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 5174 }else if( k>(u32)(pBt->usableSize/4 - 2) ){ 5175 /* Value of k is out of range. Database corruption */ 5176 rc = SQLITE_CORRUPT_BKPT; 5177 goto end_allocate_page; 5178 #ifndef SQLITE_OMIT_AUTOVACUUM 5179 }else if( searchList 5180 && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE)) 5181 ){ 5182 /* The list is being searched and this trunk page is the page 5183 ** to allocate, regardless of whether it has leaves. 5184 */ 5185 *pPgno = iTrunk; 5186 *ppPage = pTrunk; 5187 searchList = 0; 5188 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5189 if( rc ){ 5190 goto end_allocate_page; 5191 } 5192 if( k==0 ){ 5193 if( !pPrevTrunk ){ 5194 memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4); 5195 }else{ 5196 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 5197 if( rc!=SQLITE_OK ){ 5198 goto end_allocate_page; 5199 } 5200 memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4); 5201 } 5202 }else{ 5203 /* The trunk page is required by the caller but it contains 5204 ** pointers to free-list leaves. The first leaf becomes a trunk 5205 ** page in this case. 
5206 */ 5207 MemPage *pNewTrunk; 5208 Pgno iNewTrunk = get4byte(&pTrunk->aData[8]); 5209 if( iNewTrunk>mxPage ){ 5210 rc = SQLITE_CORRUPT_BKPT; 5211 goto end_allocate_page; 5212 } 5213 testcase( iNewTrunk==mxPage ); 5214 rc = btreeGetPage(pBt, iNewTrunk, &pNewTrunk, 0); 5215 if( rc!=SQLITE_OK ){ 5216 goto end_allocate_page; 5217 } 5218 rc = sqlite3PagerWrite(pNewTrunk->pDbPage); 5219 if( rc!=SQLITE_OK ){ 5220 releasePage(pNewTrunk); 5221 goto end_allocate_page; 5222 } 5223 memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4); 5224 put4byte(&pNewTrunk->aData[4], k-1); 5225 memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4); 5226 releasePage(pNewTrunk); 5227 if( !pPrevTrunk ){ 5228 assert( sqlite3PagerIswriteable(pPage1->pDbPage) ); 5229 put4byte(&pPage1->aData[32], iNewTrunk); 5230 }else{ 5231 rc = sqlite3PagerWrite(pPrevTrunk->pDbPage); 5232 if( rc ){ 5233 goto end_allocate_page; 5234 } 5235 put4byte(&pPrevTrunk->aData[0], iNewTrunk); 5236 } 5237 } 5238 pTrunk = 0; 5239 TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1)); 5240 #endif 5241 }else if( k>0 ){ 5242 /* Extract a leaf from the trunk */ 5243 u32 closest; 5244 Pgno iPage; 5245 unsigned char *aData = pTrunk->aData; 5246 if( nearby>0 ){ 5247 u32 i; 5248 closest = 0; 5249 if( eMode==BTALLOC_LE ){ 5250 for(i=0; i<k; i++){ 5251 iPage = get4byte(&aData[8+i*4]); 5252 if( iPage<=nearby ){ 5253 closest = i; 5254 break; 5255 } 5256 } 5257 }else{ 5258 int dist; 5259 dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby); 5260 for(i=1; i<k; i++){ 5261 int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby); 5262 if( d2<dist ){ 5263 closest = i; 5264 dist = d2; 5265 } 5266 } 5267 } 5268 }else{ 5269 closest = 0; 5270 } 5271 5272 iPage = get4byte(&aData[8+closest*4]); 5273 testcase( iPage==mxPage ); 5274 if( iPage>mxPage ){ 5275 rc = SQLITE_CORRUPT_BKPT; 5276 goto end_allocate_page; 5277 } 5278 testcase( iPage==mxPage ); 5279 if( !searchList 5280 || (iPage==nearby || (iPage<nearby && 
eMode==BTALLOC_LE)) 5281 ){ 5282 int noContent; 5283 *pPgno = iPage; 5284 TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d" 5285 ": %d more free pages\n", 5286 *pPgno, closest+1, k, pTrunk->pgno, n-1)); 5287 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5288 if( rc ) goto end_allocate_page; 5289 if( closest<k-1 ){ 5290 memcpy(&aData[8+closest*4], &aData[4+k*4], 4); 5291 } 5292 put4byte(&aData[4], k-1); 5293 noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0; 5294 rc = btreeGetPage(pBt, *pPgno, ppPage, noContent); 5295 if( rc==SQLITE_OK ){ 5296 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 5297 if( rc!=SQLITE_OK ){ 5298 releasePage(*ppPage); 5299 } 5300 } 5301 searchList = 0; 5302 } 5303 } 5304 releasePage(pPrevTrunk); 5305 pPrevTrunk = 0; 5306 }while( searchList ); 5307 }else{ 5308 /* There are no pages on the freelist, so append a new page to the 5309 ** database image. 5310 ** 5311 ** Normally, new pages allocated by this block can be requested from the 5312 ** pager layer with the 'no-content' flag set. This prevents the pager 5313 ** from trying to read the pages content from disk. However, if the 5314 ** current transaction has already run one or more incremental-vacuum 5315 ** steps, then the page we are about to allocate may contain content 5316 ** that is required in the event of a rollback. In this case, do 5317 ** not set the no-content flag. This causes the pager to load and journal 5318 ** the current page content before overwriting it. 5319 ** 5320 ** Note that the pager will not actually attempt to load or journal 5321 ** content for any page that really does lie past the end of the database 5322 ** file on disk. So the effects of disabling the no-content optimization 5323 ** here are confined to those pages that lie between the end of the 5324 ** database image and the end of the database file. 5325 */ 5326 int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? 
PAGER_GET_NOCONTENT:0; 5327 5328 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 5329 if( rc ) return rc; 5330 pBt->nPage++; 5331 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++; 5332 5333 #ifndef SQLITE_OMIT_AUTOVACUUM 5334 if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){ 5335 /* If *pPgno refers to a pointer-map page, allocate two new pages 5336 ** at the end of the file instead of one. The first allocated page 5337 ** becomes a new pointer-map page, the second is used by the caller. 5338 */ 5339 MemPage *pPg = 0; 5340 TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage)); 5341 assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) ); 5342 rc = btreeGetPage(pBt, pBt->nPage, &pPg, bNoContent); 5343 if( rc==SQLITE_OK ){ 5344 rc = sqlite3PagerWrite(pPg->pDbPage); 5345 releasePage(pPg); 5346 } 5347 if( rc ) return rc; 5348 pBt->nPage++; 5349 if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; } 5350 } 5351 #endif 5352 put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage); 5353 *pPgno = pBt->nPage; 5354 5355 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 5356 rc = btreeGetPage(pBt, *pPgno, ppPage, bNoContent); 5357 if( rc ) return rc; 5358 rc = sqlite3PagerWrite((*ppPage)->pDbPage); 5359 if( rc!=SQLITE_OK ){ 5360 releasePage(*ppPage); 5361 } 5362 TRACE(("ALLOCATE: %d from end of file\n", *pPgno)); 5363 } 5364 5365 assert( *pPgno!=PENDING_BYTE_PAGE(pBt) ); 5366 5367 end_allocate_page: 5368 releasePage(pTrunk); 5369 releasePage(pPrevTrunk); 5370 if( rc==SQLITE_OK ){ 5371 if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){ 5372 releasePage(*ppPage); 5373 *ppPage = 0; 5374 return SQLITE_CORRUPT_BKPT; 5375 } 5376 (*ppPage)->isInit = 0; 5377 }else{ 5378 *ppPage = 0; 5379 } 5380 assert( rc!=SQLITE_OK || sqlite3PagerIswriteable((*ppPage)->pDbPage) ); 5381 return rc; 5382 } 5383 5384 /* 5385 ** This function is used to add page iPage to the database file free-list. 5386 ** It is assumed that the page is not already a part of the free-list. 
5387 ** 5388 ** The value passed as the second argument to this function is optional. 5389 ** If the caller happens to have a pointer to the MemPage object 5390 ** corresponding to page iPage handy, it may pass it as the second value. 5391 ** Otherwise, it may pass NULL. 5392 ** 5393 ** If a pointer to a MemPage object is passed as the second argument, 5394 ** its reference count is not altered by this function. 5395 */ 5396 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){ 5397 MemPage *pTrunk = 0; /* Free-list trunk page */ 5398 Pgno iTrunk = 0; /* Page number of free-list trunk page */ 5399 MemPage *pPage1 = pBt->pPage1; /* Local reference to page 1 */ 5400 MemPage *pPage; /* Page being freed. May be NULL. */ 5401 int rc; /* Return Code */ 5402 int nFree; /* Initial number of pages on free-list */ 5403 5404 assert( sqlite3_mutex_held(pBt->mutex) ); 5405 assert( iPage>1 ); 5406 assert( !pMemPage || pMemPage->pgno==iPage ); 5407 5408 if( pMemPage ){ 5409 pPage = pMemPage; 5410 sqlite3PagerRef(pPage->pDbPage); 5411 }else{ 5412 pPage = btreePageLookup(pBt, iPage); 5413 } 5414 5415 /* Increment the free page count on pPage1 */ 5416 rc = sqlite3PagerWrite(pPage1->pDbPage); 5417 if( rc ) goto freepage_out; 5418 nFree = get4byte(&pPage1->aData[36]); 5419 put4byte(&pPage1->aData[36], nFree+1); 5420 5421 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 5422 /* If the secure_delete option is enabled, then 5423 ** always fully overwrite deleted information with zeros. 5424 */ 5425 if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) ) 5426 || ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0) 5427 ){ 5428 goto freepage_out; 5429 } 5430 memset(pPage->aData, 0, pPage->pBt->pageSize); 5431 } 5432 5433 /* If the database supports auto-vacuum, write an entry in the pointer-map 5434 ** to indicate that the page is free. 
5435 */ 5436 if( ISAUTOVACUUM ){ 5437 ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc); 5438 if( rc ) goto freepage_out; 5439 } 5440 5441 /* Now manipulate the actual database free-list structure. There are two 5442 ** possibilities. If the free-list is currently empty, or if the first 5443 ** trunk page in the free-list is full, then this page will become a 5444 ** new free-list trunk page. Otherwise, it will become a leaf of the 5445 ** first trunk page in the current free-list. This block tests if it 5446 ** is possible to add the page as a new free-list leaf. 5447 */ 5448 if( nFree!=0 ){ 5449 u32 nLeaf; /* Initial number of leaf cells on trunk page */ 5450 5451 iTrunk = get4byte(&pPage1->aData[32]); 5452 rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0); 5453 if( rc!=SQLITE_OK ){ 5454 goto freepage_out; 5455 } 5456 5457 nLeaf = get4byte(&pTrunk->aData[4]); 5458 assert( pBt->usableSize>32 ); 5459 if( nLeaf > (u32)pBt->usableSize/4 - 2 ){ 5460 rc = SQLITE_CORRUPT_BKPT; 5461 goto freepage_out; 5462 } 5463 if( nLeaf < (u32)pBt->usableSize/4 - 8 ){ 5464 /* In this case there is room on the trunk page to insert the page 5465 ** being freed as a new leaf. 5466 ** 5467 ** Note that the trunk page is not really full until it contains 5468 ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have 5469 ** coded. But due to a coding error in versions of SQLite prior to 5470 ** 3.6.0, databases with freelist trunk pages holding more than 5471 ** usableSize/4 - 8 entries will be reported as corrupt. In order 5472 ** to maintain backwards compatibility with older versions of SQLite, 5473 ** we will continue to restrict the number of entries to usableSize/4 - 8 5474 ** for now. At some point in the future (once everyone has upgraded 5475 ** to 3.6.0 or later) we should consider fixing the conditional above 5476 ** to read "usableSize/4-2" instead of "usableSize/4-8". 
5477 */ 5478 rc = sqlite3PagerWrite(pTrunk->pDbPage); 5479 if( rc==SQLITE_OK ){ 5480 put4byte(&pTrunk->aData[4], nLeaf+1); 5481 put4byte(&pTrunk->aData[8+nLeaf*4], iPage); 5482 if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){ 5483 sqlite3PagerDontWrite(pPage->pDbPage); 5484 } 5485 rc = btreeSetHasContent(pBt, iPage); 5486 } 5487 TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno)); 5488 goto freepage_out; 5489 } 5490 } 5491 5492 /* If control flows to this point, then it was not possible to add the 5493 ** the page being freed as a leaf page of the first trunk in the free-list. 5494 ** Possibly because the free-list is empty, or possibly because the 5495 ** first trunk in the free-list is full. Either way, the page being freed 5496 ** will become the new first trunk page in the free-list. 5497 */ 5498 if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){ 5499 goto freepage_out; 5500 } 5501 rc = sqlite3PagerWrite(pPage->pDbPage); 5502 if( rc!=SQLITE_OK ){ 5503 goto freepage_out; 5504 } 5505 put4byte(pPage->aData, iTrunk); 5506 put4byte(&pPage->aData[4], 0); 5507 put4byte(&pPage1->aData[32], iPage); 5508 TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk)); 5509 5510 freepage_out: 5511 if( pPage ){ 5512 pPage->isInit = 0; 5513 } 5514 releasePage(pPage); 5515 releasePage(pTrunk); 5516 return rc; 5517 } 5518 static void freePage(MemPage *pPage, int *pRC){ 5519 if( (*pRC)==SQLITE_OK ){ 5520 *pRC = freePage2(pPage->pBt, pPage, pPage->pgno); 5521 } 5522 } 5523 5524 /* 5525 ** Free any overflow pages associated with the given Cell. Write the 5526 ** local Cell size (the number of bytes on the original page, omitting 5527 ** overflow) into *pnSize. 
**
** Returns SQLITE_OK on success (including when the cell has no overflow
** pages).  SQLITE_CORRUPT is returned if the overflow pointer would lie
** outside the page, if an overflow page number is below 2 or past the end
** of the database, or if an overflow page has references other than the
** one held here.  Errors from getOverflowPage() and freePage2() are
** passed through.  *pnSize is written before any of these checks.
*/
static int clearCell(
  MemPage *pPage,          /* The page that contains the Cell */
  unsigned char *pCell,    /* First byte of the Cell */
  u16 *pnSize              /* Write the size of the Cell here */
){
  BtShared *pBt = pPage->pBt;
  CellInfo info;
  Pgno ovflPgno;
  int rc;
  int nOvfl;
  u32 ovflPageSize;

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  btreeParseCellPtr(pPage, pCell, &info);
  *pnSize = info.nSize;
  if( info.iOverflow==0 ){
    return SQLITE_OK;  /* No overflow pages. Return without doing anything */
  }
  if( pCell+info.iOverflow+3 > pPage->aData+pPage->maskPage ){
    return SQLITE_CORRUPT_BKPT;  /* Cell extends past end of page */
  }
  ovflPgno = get4byte(&pCell[info.iOverflow]);
  assert( pBt->usableSize > 4 );
  /* Each overflow page carries usableSize-4 bytes of payload; compute how
  ** many such pages the non-local part of the payload occupies. */
  ovflPageSize = pBt->usableSize - 4;
  nOvfl = (info.nPayload - info.nLocal + ovflPageSize - 1)/ovflPageSize;
  assert( ovflPgno==0 || nOvfl>0 );
  while( nOvfl-- ){
    Pgno iNext = 0;
    MemPage *pOvfl = 0;
    if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
      /* 0 is not a legal page number and page 1 cannot be an
      ** overflow page. Therefore if ovflPgno<2 or past the end of the
      ** file the database must be corrupt. */
      return SQLITE_CORRUPT_BKPT;
    }
    /* nOvfl has already been decremented, so it is zero on the final
    ** page.  The next-page pointer is only fetched while more pages
    ** remain in the chain. */
    if( nOvfl ){
      rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
      if( rc ) return rc;
    }

    if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
     && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
    ){
      /* There is no reason any cursor should have an outstanding reference
      ** to an overflow page belonging to a cell that is being deleted/updated.
      ** So if there exists more than one reference to this page, then it
      ** must not really be an overflow page and the database must be corrupt.
      ** It is helpful to detect this before calling freePage2(), as
      ** freePage2() may zero the page contents if secure-delete mode is
      ** enabled. If this 'overflow' page happens to be a page that the
      ** caller is iterating through or using in some other way, this
      ** can be problematic.
      */
      rc = SQLITE_CORRUPT_BKPT;
    }else{
      rc = freePage2(pBt, pOvfl, ovflPgno);
    }

    /* Drop the reference obtained above, whether or not freeing worked */
    if( pOvfl ){
      sqlite3PagerUnref(pOvfl->pDbPage);
    }
    if( rc ) return rc;
    ovflPgno = iNext;
  }
  return SQLITE_OK;
}

/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
5607 */ 5608 static int fillInCell( 5609 MemPage *pPage, /* The page that contains the cell */ 5610 unsigned char *pCell, /* Complete text of the cell */ 5611 const void *pKey, i64 nKey, /* The key */ 5612 const void *pData,int nData, /* The data */ 5613 int nZero, /* Extra zero bytes to append to pData */ 5614 int *pnSize /* Write cell size here */ 5615 ){ 5616 int nPayload; 5617 const u8 *pSrc; 5618 int nSrc, n, rc; 5619 int spaceLeft; 5620 MemPage *pOvfl = 0; 5621 MemPage *pToRelease = 0; 5622 unsigned char *pPrior; 5623 unsigned char *pPayload; 5624 BtShared *pBt = pPage->pBt; 5625 Pgno pgnoOvfl = 0; 5626 int nHeader; 5627 5628 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5629 5630 /* pPage is not necessarily writeable since pCell might be auxiliary 5631 ** buffer space that is separate from the pPage buffer area */ 5632 assert( pCell<pPage->aData || pCell>=&pPage->aData[pBt->pageSize] 5633 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5634 5635 /* Fill in the header. */ 5636 nHeader = pPage->childPtrSize; 5637 nPayload = nData + nZero; 5638 if( pPage->intKeyLeaf ){ 5639 nHeader += putVarint32(&pCell[nHeader], nPayload); 5640 }else{ 5641 assert( nData==0 ); 5642 assert( nZero==0 ); 5643 } 5644 nHeader += putVarint(&pCell[nHeader], *(u64*)&nKey); 5645 5646 /* Fill in the payload size */ 5647 if( pPage->intKey ){ 5648 pSrc = pData; 5649 nSrc = nData; 5650 nData = 0; 5651 }else{ 5652 if( NEVER(nKey>0x7fffffff || pKey==0) ){ 5653 return SQLITE_CORRUPT_BKPT; 5654 } 5655 nPayload = (int)nKey; 5656 pSrc = pKey; 5657 nSrc = (int)nKey; 5658 } 5659 if( nPayload<=pPage->maxLocal ){ 5660 n = nHeader + nPayload; 5661 testcase( n==3 ); 5662 testcase( n==4 ); 5663 if( n<4 ) n = 4; 5664 *pnSize = n; 5665 spaceLeft = nPayload; 5666 pPrior = pCell; 5667 }else{ 5668 int mn = pPage->minLocal; 5669 n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4); 5670 testcase( n==pPage->maxLocal ); 5671 testcase( n==pPage->maxLocal+1 ); 5672 if( n > pPage->maxLocal ) n = mn; 5673 
spaceLeft = n; 5674 *pnSize = n + nHeader + 4; 5675 pPrior = &pCell[nHeader+n]; 5676 } 5677 pPayload = &pCell[nHeader]; 5678 5679 /* At this point variables should be set as follows: 5680 ** 5681 ** nPayload Total payload size in bytes 5682 ** pPayload Begin writing payload here 5683 ** spaceLeft Space available at pPayload. If nPayload>spaceLeft, 5684 ** that means content must spill into overflow pages. 5685 ** *pnSize Size of the local cell (not counting overflow pages) 5686 ** pPrior Where to write the pgno of the first overflow page 5687 ** 5688 ** Use a call to btreeParseCellPtr() to verify that the values above 5689 ** were computed correctly. 5690 */ 5691 #if SQLITE_DEBUG 5692 { 5693 CellInfo info; 5694 btreeParseCellPtr(pPage, pCell, &info); 5695 assert( nHeader=(int)(info.pPayload - pCell) ); 5696 assert( info.nKey==nKey ); 5697 assert( *pnSize == info.nSize ); 5698 assert( spaceLeft == info.nLocal ); 5699 assert( pPrior == &pCell[info.iOverflow] ); 5700 } 5701 #endif 5702 5703 /* Write the payload into the local Cell and any extra into overflow pages */ 5704 while( nPayload>0 ){ 5705 if( spaceLeft==0 ){ 5706 #ifndef SQLITE_OMIT_AUTOVACUUM 5707 Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */ 5708 if( pBt->autoVacuum ){ 5709 do{ 5710 pgnoOvfl++; 5711 } while( 5712 PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt) 5713 ); 5714 } 5715 #endif 5716 rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0); 5717 #ifndef SQLITE_OMIT_AUTOVACUUM 5718 /* If the database supports auto-vacuum, and the second or subsequent 5719 ** overflow page is being allocated, add an entry to the pointer-map 5720 ** for that page now. 5721 ** 5722 ** If this is the first overflow page, then write a partial entry 5723 ** to the pointer-map. 
If we write nothing to this pointer-map slot, 5724 ** then the optimistic overflow chain processing in clearCell() 5725 ** may misinterpret the uninitialized values and delete the 5726 ** wrong pages from the database. 5727 */ 5728 if( pBt->autoVacuum && rc==SQLITE_OK ){ 5729 u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1); 5730 ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc); 5731 if( rc ){ 5732 releasePage(pOvfl); 5733 } 5734 } 5735 #endif 5736 if( rc ){ 5737 releasePage(pToRelease); 5738 return rc; 5739 } 5740 5741 /* If pToRelease is not zero than pPrior points into the data area 5742 ** of pToRelease. Make sure pToRelease is still writeable. */ 5743 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 5744 5745 /* If pPrior is part of the data area of pPage, then make sure pPage 5746 ** is still writeable */ 5747 assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize] 5748 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5749 5750 put4byte(pPrior, pgnoOvfl); 5751 releasePage(pToRelease); 5752 pToRelease = pOvfl; 5753 pPrior = pOvfl->aData; 5754 put4byte(pPrior, 0); 5755 pPayload = &pOvfl->aData[4]; 5756 spaceLeft = pBt->usableSize - 4; 5757 } 5758 n = nPayload; 5759 if( n>spaceLeft ) n = spaceLeft; 5760 5761 /* If pToRelease is not zero than pPayload points into the data area 5762 ** of pToRelease. Make sure pToRelease is still writeable. 
*/ 5763 assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) ); 5764 5765 /* If pPayload is part of the data area of pPage, then make sure pPage 5766 ** is still writeable */ 5767 assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize] 5768 || sqlite3PagerIswriteable(pPage->pDbPage) ); 5769 5770 if( nSrc>0 ){ 5771 if( n>nSrc ) n = nSrc; 5772 assert( pSrc ); 5773 memcpy(pPayload, pSrc, n); 5774 }else{ 5775 memset(pPayload, 0, n); 5776 } 5777 nPayload -= n; 5778 pPayload += n; 5779 pSrc += n; 5780 nSrc -= n; 5781 spaceLeft -= n; 5782 if( nSrc==0 ){ 5783 nSrc = nData; 5784 pSrc = pData; 5785 } 5786 } 5787 releasePage(pToRelease); 5788 return SQLITE_OK; 5789 } 5790 5791 /* 5792 ** Remove the i-th cell from pPage. This routine effects pPage only. 5793 ** The cell content is not freed or deallocated. It is assumed that 5794 ** the cell content has been copied someplace else. This routine just 5795 ** removes the reference to the cell from pPage. 5796 ** 5797 ** "sz" must be the number of bytes in the cell. 5798 */ 5799 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){ 5800 u32 pc; /* Offset to cell content of cell being deleted */ 5801 u8 *data; /* pPage->aData */ 5802 u8 *ptr; /* Used to move bytes around within data[] */ 5803 int rc; /* The return code */ 5804 int hdr; /* Beginning of the header. 0 most pages. 
100 page 1 */ 5805 5806 if( *pRC ) return; 5807 5808 assert( idx>=0 && idx<pPage->nCell ); 5809 assert( sz==cellSize(pPage, idx) ); 5810 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5811 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5812 data = pPage->aData; 5813 ptr = &pPage->aCellIdx[2*idx]; 5814 pc = get2byte(ptr); 5815 hdr = pPage->hdrOffset; 5816 testcase( pc==get2byte(&data[hdr+5]) ); 5817 testcase( pc+sz==pPage->pBt->usableSize ); 5818 if( pc < (u32)get2byte(&data[hdr+5]) || pc+sz > pPage->pBt->usableSize ){ 5819 *pRC = SQLITE_CORRUPT_BKPT; 5820 return; 5821 } 5822 rc = freeSpace(pPage, pc, sz); 5823 if( rc ){ 5824 *pRC = rc; 5825 return; 5826 } 5827 pPage->nCell--; 5828 memmove(ptr, ptr+2, 2*(pPage->nCell - idx)); 5829 put2byte(&data[hdr+3], pPage->nCell); 5830 pPage->nFree += 2; 5831 } 5832 5833 /* 5834 ** Insert a new cell on pPage at cell index "i". pCell points to the 5835 ** content of the cell. 5836 ** 5837 ** If the cell content will fit on the page, then put it there. If it 5838 ** will not fit, then make a copy of the cell content into pTemp if 5839 ** pTemp is not null. Regardless of pTemp, allocate a new entry 5840 ** in pPage->apOvfl[] and make it point to the cell content (either 5841 ** in pTemp or the original pCell) and also record its index. 5842 ** Allocating a new entry in pPage->aCell[] implies that 5843 ** pPage->nOverflow is incremented. 5844 ** 5845 ** If nSkip is non-zero, then do not copy the first nSkip bytes of the 5846 ** cell. The caller will overwrite them after this function returns. If 5847 ** nSkip is non-zero, then pCell may not point to an invalid memory location 5848 ** (but pCell+nSkip is always valid). 
5849 */ 5850 static void insertCell( 5851 MemPage *pPage, /* Page into which we are copying */ 5852 int i, /* New cell becomes the i-th cell of the page */ 5853 u8 *pCell, /* Content of the new cell */ 5854 int sz, /* Bytes of content in pCell */ 5855 u8 *pTemp, /* Temp storage space for pCell, if needed */ 5856 Pgno iChild, /* If non-zero, replace first 4 bytes with this value */ 5857 int *pRC /* Read and write return code from here */ 5858 ){ 5859 int idx = 0; /* Where to write new cell content in data[] */ 5860 int j; /* Loop counter */ 5861 int end; /* First byte past the last cell pointer in data[] */ 5862 int ins; /* Index in data[] where new cell pointer is inserted */ 5863 int cellOffset; /* Address of first cell pointer in data[] */ 5864 u8 *data; /* The content of the whole page */ 5865 int nSkip = (iChild ? 4 : 0); 5866 5867 if( *pRC ) return; 5868 5869 assert( i>=0 && i<=pPage->nCell+pPage->nOverflow ); 5870 assert( MX_CELL(pPage->pBt)<=10921 ); 5871 assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB ); 5872 assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) ); 5873 assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) ); 5874 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5875 /* The cell should normally be sized correctly. However, when moving a 5876 ** malformed cell from a leaf page to an interior page, if the cell size 5877 ** wanted to be less than 4 but got rounded up to 4 on the leaf, then size 5878 ** might be less than 8 (leaf-size + pointer) on the interior node. Hence 5879 ** the term after the || in the following assert(). 
*/ 5880 assert( sz==cellSizePtr(pPage, pCell) || (sz==8 && iChild>0) ); 5881 if( pPage->nOverflow || sz+2>pPage->nFree ){ 5882 if( pTemp ){ 5883 memcpy(pTemp+nSkip, pCell+nSkip, sz-nSkip); 5884 pCell = pTemp; 5885 } 5886 if( iChild ){ 5887 put4byte(pCell, iChild); 5888 } 5889 j = pPage->nOverflow++; 5890 assert( j<(int)(sizeof(pPage->apOvfl)/sizeof(pPage->apOvfl[0])) ); 5891 pPage->apOvfl[j] = pCell; 5892 pPage->aiOvfl[j] = (u16)i; 5893 }else{ 5894 int rc = sqlite3PagerWrite(pPage->pDbPage); 5895 if( rc!=SQLITE_OK ){ 5896 *pRC = rc; 5897 return; 5898 } 5899 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5900 data = pPage->aData; 5901 cellOffset = pPage->cellOffset; 5902 end = cellOffset + 2*pPage->nCell; 5903 ins = cellOffset + 2*i; 5904 rc = allocateSpace(pPage, sz, &idx); 5905 if( rc ){ *pRC = rc; return; } 5906 /* The allocateSpace() routine guarantees the following two properties 5907 ** if it returns success */ 5908 assert( idx >= end+2 ); 5909 assert( idx+sz <= (int)pPage->pBt->usableSize ); 5910 pPage->nCell++; 5911 pPage->nFree -= (u16)(2 + sz); 5912 memcpy(&data[idx+nSkip], pCell+nSkip, sz-nSkip); 5913 if( iChild ){ 5914 put4byte(&data[idx], iChild); 5915 } 5916 memmove(&data[ins+2], &data[ins], end-ins); 5917 put2byte(&data[ins], idx); 5918 put2byte(&data[pPage->hdrOffset+3], pPage->nCell); 5919 #ifndef SQLITE_OMIT_AUTOVACUUM 5920 if( pPage->pBt->autoVacuum ){ 5921 /* The cell may contain a pointer to an overflow page. If so, write 5922 ** the entry for the overflow page into the pointer map. 5923 */ 5924 ptrmapPutOvflPtr(pPage, pCell, pRC); 5925 } 5926 #endif 5927 } 5928 } 5929 5930 /* 5931 ** Add a list of cells to a page. The page should be initially empty. 5932 ** The cells are guaranteed to fit on the page. 
5933 */ 5934 static void assemblePage( 5935 MemPage *pPage, /* The page to be assembled */ 5936 int nCell, /* The number of cells to add to this page */ 5937 u8 **apCell, /* Pointers to cell bodies */ 5938 u16 *aSize /* Sizes of the cells */ 5939 ){ 5940 int i; /* Loop counter */ 5941 u8 *pCellptr; /* Address of next cell pointer */ 5942 int cellbody; /* Address of next cell body */ 5943 u8 * const data = pPage->aData; /* Pointer to data for pPage */ 5944 const int hdr = pPage->hdrOffset; /* Offset of header on pPage */ 5945 const int nUsable = pPage->pBt->usableSize; /* Usable size of page */ 5946 5947 assert( pPage->nOverflow==0 ); 5948 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 5949 assert( nCell>=0 && nCell<=(int)MX_CELL(pPage->pBt) 5950 && (int)MX_CELL(pPage->pBt)<=10921); 5951 assert( sqlite3PagerIswriteable(pPage->pDbPage) ); 5952 5953 /* Check that the page has just been zeroed by zeroPage() */ 5954 assert( pPage->nCell==0 ); 5955 assert( get2byteNotZero(&data[hdr+5])==nUsable ); 5956 5957 pCellptr = &pPage->aCellIdx[nCell*2]; 5958 cellbody = nUsable; 5959 for(i=nCell-1; i>=0; i--){ 5960 u16 sz = aSize[i]; 5961 pCellptr -= 2; 5962 cellbody -= sz; 5963 put2byte(pCellptr, cellbody); 5964 memcpy(&data[cellbody], apCell[i], sz); 5965 } 5966 put2byte(&data[hdr+3], nCell); 5967 put2byte(&data[hdr+5], cellbody); 5968 pPage->nFree -= (nCell*2 + nUsable - cellbody); 5969 pPage->nCell = (u16)nCell; 5970 } 5971 5972 /* 5973 ** The following parameters determine how many adjacent pages get involved 5974 ** in a balancing operation. NN is the number of neighbors on either side 5975 ** of the page that participate in the balancing operation. NB is the 5976 ** total number of pages that participate, including the target page and 5977 ** NN neighbors on either side. 5978 ** 5979 ** The minimum value of NN is 1 (of course). 
Increasing NN above 1 5980 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 5981 ** in exchange for a larger degradation in INSERT and UPDATE performance. 5982 ** The value of NN appears to give the best results overall. 5983 */ 5984 #define NN 1 /* Number of neighbors on either side of pPage */ 5985 #define NB (NN*2+1) /* Total pages involved in the balance */ 5986 5987 5988 #ifndef SQLITE_OMIT_QUICKBALANCE 5989 /* 5990 ** This version of balance() handles the common special case where 5991 ** a new entry is being inserted on the extreme right-end of the 5992 ** tree, in other words, when the new entry will become the largest 5993 ** entry in the tree. 5994 ** 5995 ** Instead of trying to balance the 3 right-most leaf pages, just add 5996 ** a new page to the right-hand side and put the one new entry in 5997 ** that page. This leaves the right side of the tree somewhat 5998 ** unbalanced. But odds are that we will be inserting new entries 5999 ** at the end soon afterwards so the nearly empty page will quickly 6000 ** fill up. On average. 6001 ** 6002 ** pPage is the leaf page which is the right-most page in the tree. 6003 ** pParent is its parent. pPage must have a single overflow entry 6004 ** which is also the right-most entry on the page. 6005 ** 6006 ** The pSpace buffer is used to store a temporary copy of the divider 6007 ** cell that will be inserted into pParent. Such a cell consists of a 4 6008 ** byte page number followed by a variable length integer. In other 6009 ** words, at most 13 bytes. Hence the pSpace buffer must be at 6010 ** least 13 bytes in size. 
6011 */ 6012 static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){ 6013 BtShared *const pBt = pPage->pBt; /* B-Tree Database */ 6014 MemPage *pNew; /* Newly allocated page */ 6015 int rc; /* Return Code */ 6016 Pgno pgnoNew; /* Page number of pNew */ 6017 6018 assert( sqlite3_mutex_held(pPage->pBt->mutex) ); 6019 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6020 assert( pPage->nOverflow==1 ); 6021 6022 /* This error condition is now caught prior to reaching this function */ 6023 if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT; 6024 6025 /* Allocate a new page. This page will become the right-sibling of 6026 ** pPage. Make the parent page writable, so that the new divider cell 6027 ** may be inserted. If both these operations are successful, proceed. 6028 */ 6029 rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0); 6030 6031 if( rc==SQLITE_OK ){ 6032 6033 u8 *pOut = &pSpace[4]; 6034 u8 *pCell = pPage->apOvfl[0]; 6035 u16 szCell = cellSizePtr(pPage, pCell); 6036 u8 *pStop; 6037 6038 assert( sqlite3PagerIswriteable(pNew->pDbPage) ); 6039 assert( pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) ); 6040 zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF); 6041 assemblePage(pNew, 1, &pCell, &szCell); 6042 6043 /* If this is an auto-vacuum database, update the pointer map 6044 ** with entries for the new page, and any pointer from the 6045 ** cell on the page to an overflow page. If either of these 6046 ** operations fails, the return code is set, but the contents 6047 ** of the parent page are still manipulated by thh code below. 6048 ** That is Ok, at this point the parent page is guaranteed to 6049 ** be marked as dirty. Returning an error code will cause a 6050 ** rollback, undoing any changes made to the parent page. 
6051 */ 6052 if( ISAUTOVACUUM ){ 6053 ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc); 6054 if( szCell>pNew->minLocal ){ 6055 ptrmapPutOvflPtr(pNew, pCell, &rc); 6056 } 6057 } 6058 6059 /* Create a divider cell to insert into pParent. The divider cell 6060 ** consists of a 4-byte page number (the page number of pPage) and 6061 ** a variable length key value (which must be the same value as the 6062 ** largest key on pPage). 6063 ** 6064 ** To find the largest key value on pPage, first find the right-most 6065 ** cell on pPage. The first two fields of this cell are the 6066 ** record-length (a variable length integer at most 32-bits in size) 6067 ** and the key value (a variable length integer, may have any value). 6068 ** The first of the while(...) loops below skips over the record-length 6069 ** field. The second while(...) loop copies the key value from the 6070 ** cell on pPage into the pSpace buffer. 6071 */ 6072 pCell = findCell(pPage, pPage->nCell-1); 6073 pStop = &pCell[9]; 6074 while( (*(pCell++)&0x80) && pCell<pStop ); 6075 pStop = &pCell[9]; 6076 while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop ); 6077 6078 /* Insert the new divider cell into pParent. */ 6079 insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace), 6080 0, pPage->pgno, &rc); 6081 6082 /* Set the right-child pointer of pParent to point to the new page. */ 6083 put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew); 6084 6085 /* Release the reference to the new page. */ 6086 releasePage(pNew); 6087 } 6088 6089 return rc; 6090 } 6091 #endif /* SQLITE_OMIT_QUICKBALANCE */ 6092 6093 #if 0 6094 /* 6095 ** This function does not contribute anything to the operation of SQLite. 6096 ** it is sometimes activated temporarily while debugging code responsible 6097 ** for setting pointer-map entries. 
6098 */ 6099 static int ptrmapCheckPages(MemPage **apPage, int nPage){ 6100 int i, j; 6101 for(i=0; i<nPage; i++){ 6102 Pgno n; 6103 u8 e; 6104 MemPage *pPage = apPage[i]; 6105 BtShared *pBt = pPage->pBt; 6106 assert( pPage->isInit ); 6107 6108 for(j=0; j<pPage->nCell; j++){ 6109 CellInfo info; 6110 u8 *z; 6111 6112 z = findCell(pPage, j); 6113 btreeParseCellPtr(pPage, z, &info); 6114 if( info.iOverflow ){ 6115 Pgno ovfl = get4byte(&z[info.iOverflow]); 6116 ptrmapGet(pBt, ovfl, &e, &n); 6117 assert( n==pPage->pgno && e==PTRMAP_OVERFLOW1 ); 6118 } 6119 if( !pPage->leaf ){ 6120 Pgno child = get4byte(z); 6121 ptrmapGet(pBt, child, &e, &n); 6122 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 6123 } 6124 } 6125 if( !pPage->leaf ){ 6126 Pgno child = get4byte(&pPage->aData[pPage->hdrOffset+8]); 6127 ptrmapGet(pBt, child, &e, &n); 6128 assert( n==pPage->pgno && e==PTRMAP_BTREE ); 6129 } 6130 } 6131 return 1; 6132 } 6133 #endif 6134 6135 /* 6136 ** This function is used to copy the contents of the b-tree node stored 6137 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then 6138 ** the pointer-map entries for each child page are updated so that the 6139 ** parent page stored in the pointer map is page pTo. If pFrom contained 6140 ** any cells with overflow page pointers, then the corresponding pointer 6141 ** map entries are also updated so that the parent page is page pTo. 6142 ** 6143 ** If pFrom is currently carrying any overflow cells (entries in the 6144 ** MemPage.apOvfl[] array), they are not copied to pTo. 6145 ** 6146 ** Before returning, page pTo is reinitialized using btreeInitPage(). 6147 ** 6148 ** The performance of this function is not critical. It is only used by 6149 ** the balance_shallower() and balance_deeper() procedures, neither of 6150 ** which are called often under normal circumstances. 
6151 */ 6152 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){ 6153 if( (*pRC)==SQLITE_OK ){ 6154 BtShared * const pBt = pFrom->pBt; 6155 u8 * const aFrom = pFrom->aData; 6156 u8 * const aTo = pTo->aData; 6157 int const iFromHdr = pFrom->hdrOffset; 6158 int const iToHdr = ((pTo->pgno==1) ? 100 : 0); 6159 int rc; 6160 int iData; 6161 6162 6163 assert( pFrom->isInit ); 6164 assert( pFrom->nFree>=iToHdr ); 6165 assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize ); 6166 6167 /* Copy the b-tree node content from page pFrom to page pTo. */ 6168 iData = get2byte(&aFrom[iFromHdr+5]); 6169 memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData); 6170 memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell); 6171 6172 /* Reinitialize page pTo so that the contents of the MemPage structure 6173 ** match the new data. The initialization of pTo can actually fail under 6174 ** fairly obscure circumstances, even though it is a copy of initialized 6175 ** page pFrom. 6176 */ 6177 pTo->isInit = 0; 6178 rc = btreeInitPage(pTo); 6179 if( rc!=SQLITE_OK ){ 6180 *pRC = rc; 6181 return; 6182 } 6183 6184 /* If this is an auto-vacuum database, update the pointer-map entries 6185 ** for any b-tree or overflow pages that pTo now contains the pointers to. 6186 */ 6187 if( ISAUTOVACUUM ){ 6188 *pRC = setChildPtrmaps(pTo); 6189 } 6190 } 6191 } 6192 6193 /* 6194 ** This routine redistributes cells on the iParentIdx'th child of pParent 6195 ** (hereafter "the page") and up to 2 siblings so that all pages have about the 6196 ** same amount of free space. Usually a single sibling on either side of the 6197 ** page are used in the balancing, though both siblings might come from one 6198 ** side if the page is the first or last child of its parent. If the page 6199 ** has fewer than 2 siblings (something which can only happen if the page 6200 ** is a root page or a child of a root page) then all available siblings 6201 ** participate in the balancing. 
6202 ** 6203 ** The number of siblings of the page might be increased or decreased by 6204 ** one or two in an effort to keep pages nearly full but not over full. 6205 ** 6206 ** Note that when this routine is called, some of the cells on the page 6207 ** might not actually be stored in MemPage.aData[]. This can happen 6208 ** if the page is overfull. This routine ensures that all cells allocated 6209 ** to the page and its siblings fit into MemPage.aData[] before returning. 6210 ** 6211 ** In the course of balancing the page and its siblings, cells may be 6212 ** inserted into or removed from the parent page (pParent). Doing so 6213 ** may cause the parent page to become overfull or underfull. If this 6214 ** happens, it is the responsibility of the caller to invoke the correct 6215 ** balancing routine to fix this problem (see the balance() routine). 6216 ** 6217 ** If this routine fails for any reason, it might leave the database 6218 ** in a corrupted state. So if this routine fails, the database should 6219 ** be rolled back. 6220 ** 6221 ** The third argument to this function, aOvflSpace, is a pointer to a 6222 ** buffer big enough to hold one page. If while inserting cells into the parent 6223 ** page (pParent) the parent page becomes overfull, this buffer is 6224 ** used to store the parent's overflow cells. Because this function inserts 6225 ** a maximum of four divider cells into the parent page, and the maximum 6226 ** size of a cell stored within an internal node is always less than 1/4 6227 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large 6228 ** enough for all overflow cells. 6229 ** 6230 ** If aOvflSpace is set to a null pointer, this function returns 6231 ** SQLITE_NOMEM. 
6232 */ 6233 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM) 6234 #pragma optimize("", off) 6235 #endif 6236 static int balance_nonroot( 6237 MemPage *pParent, /* Parent page of siblings being balanced */ 6238 int iParentIdx, /* Index of "the page" in pParent */ 6239 u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ 6240 int isRoot, /* True if pParent is a root-page */ 6241 int bBulk /* True if this call is part of a bulk load */ 6242 ){ 6243 BtShared *pBt; /* The whole database */ 6244 int nCell = 0; /* Number of cells in apCell[] */ 6245 int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ 6246 int nNew = 0; /* Number of pages in apNew[] */ 6247 int nOld; /* Number of pages in apOld[] */ 6248 int i, j, k; /* Loop counters */ 6249 int nxDiv; /* Next divider slot in pParent->aCell[] */ 6250 int rc = SQLITE_OK; /* The return code */ 6251 u16 leafCorrection; /* 4 if pPage is a leaf. 0 if not */ 6252 int leafData; /* True if pPage is a leaf of a LEAFDATA tree */ 6253 int usableSpace; /* Bytes in pPage beyond the header */ 6254 int pageFlags; /* Value of pPage->aData[0] */ 6255 int subtotal; /* Subtotal of bytes in cells on one page */ 6256 int iSpace1 = 0; /* First unused byte of aSpace1[] */ 6257 int iOvflSpace = 0; /* First unused byte of aOvflSpace[] */ 6258 int szScratch; /* Size of scratch memory requested */ 6259 MemPage *apOld[NB]; /* pPage and up to two siblings */ 6260 MemPage *apCopy[NB]; /* Private copies of apOld[] pages */ 6261 MemPage *apNew[NB+2]; /* pPage and up to NB siblings after balancing */ 6262 u8 *pRight; /* Location in parent of right-sibling pointer */ 6263 u8 *apDiv[NB-1]; /* Divider cells in pParent */ 6264 int cntNew[NB+2]; /* Index in aCell[] of cell after i-th page */ 6265 int szNew[NB+2]; /* Combined size of cells place on i-th page */ 6266 u8 **apCell = 0; /* All cells begin balanced */ 6267 u16 *szCell; /* Local size of all cells in apCell[] */ 6268 u8 *aSpace1; /* Space for copies of dividers 
cells */ 6269 Pgno pgno; /* Temp var to store a page number in */ 6270 6271 pBt = pParent->pBt; 6272 assert( sqlite3_mutex_held(pBt->mutex) ); 6273 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6274 6275 #if 0 6276 TRACE(("BALANCE: begin page %d child of %d\n", pPage->pgno, pParent->pgno)); 6277 #endif 6278 6279 /* At this point pParent may have at most one overflow cell. And if 6280 ** this overflow cell is present, it must be the cell with 6281 ** index iParentIdx. This scenario comes about when this function 6282 ** is called (indirectly) from sqlite3BtreeDelete(). 6283 */ 6284 assert( pParent->nOverflow==0 || pParent->nOverflow==1 ); 6285 assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx ); 6286 6287 if( !aOvflSpace ){ 6288 return SQLITE_NOMEM; 6289 } 6290 6291 /* Find the sibling pages to balance. Also locate the cells in pParent 6292 ** that divide the siblings. An attempt is made to find NN siblings on 6293 ** either side of pPage. More siblings are taken from one side, however, 6294 ** if there are fewer than NN siblings on the other side. If pParent 6295 ** has NB or fewer children then all children of pParent are taken. 6296 ** 6297 ** This loop also drops the divider cells from the parent page. This 6298 ** way, the remainder of the function does not have to deal with any 6299 ** overflow cells in the parent page, since if any existed they will 6300 ** have already been removed. 
6301 */ 6302 i = pParent->nOverflow + pParent->nCell; 6303 if( i<2 ){ 6304 nxDiv = 0; 6305 }else{ 6306 assert( bBulk==0 || bBulk==1 ); 6307 if( iParentIdx==0 ){ 6308 nxDiv = 0; 6309 }else if( iParentIdx==i ){ 6310 nxDiv = i-2+bBulk; 6311 }else{ 6312 assert( bBulk==0 ); 6313 nxDiv = iParentIdx-1; 6314 } 6315 i = 2-bBulk; 6316 } 6317 nOld = i+1; 6318 if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){ 6319 pRight = &pParent->aData[pParent->hdrOffset+8]; 6320 }else{ 6321 pRight = findCell(pParent, i+nxDiv-pParent->nOverflow); 6322 } 6323 pgno = get4byte(pRight); 6324 while( 1 ){ 6325 rc = getAndInitPage(pBt, pgno, &apOld[i], 0); 6326 if( rc ){ 6327 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 6328 goto balance_cleanup; 6329 } 6330 nMaxCells += 1+apOld[i]->nCell+apOld[i]->nOverflow; 6331 if( (i--)==0 ) break; 6332 6333 if( i+nxDiv==pParent->aiOvfl[0] && pParent->nOverflow ){ 6334 apDiv[i] = pParent->apOvfl[0]; 6335 pgno = get4byte(apDiv[i]); 6336 szNew[i] = cellSizePtr(pParent, apDiv[i]); 6337 pParent->nOverflow = 0; 6338 }else{ 6339 apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow); 6340 pgno = get4byte(apDiv[i]); 6341 szNew[i] = cellSizePtr(pParent, apDiv[i]); 6342 6343 /* Drop the cell from the parent page. apDiv[i] still points to 6344 ** the cell within the parent, even though it has been dropped. 6345 ** This is safe because dropping a cell only overwrites the first 6346 ** four bytes of it, and this function does not need the first 6347 ** four bytes of the divider cell. So the pointer is safe to use 6348 ** later on. 6349 ** 6350 ** But not if we are in secure-delete mode. In secure-delete mode, 6351 ** the dropCell() routine will overwrite the entire cell with zeroes. 6352 ** In this case, temporarily copy the cell into the aOvflSpace[] 6353 ** buffer. It will be copied out again as soon as the aSpace[] buffer 6354 ** is allocated. 
*/ 6355 if( pBt->btsFlags & BTS_SECURE_DELETE ){ 6356 int iOff; 6357 6358 iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData); 6359 if( (iOff+szNew[i])>(int)pBt->usableSize ){ 6360 rc = SQLITE_CORRUPT_BKPT; 6361 memset(apOld, 0, (i+1)*sizeof(MemPage*)); 6362 goto balance_cleanup; 6363 }else{ 6364 memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]); 6365 apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData]; 6366 } 6367 } 6368 dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc); 6369 } 6370 } 6371 6372 /* Make nMaxCells a multiple of 4 in order to preserve 8-byte 6373 ** alignment */ 6374 nMaxCells = (nMaxCells + 3)&~3; 6375 6376 /* 6377 ** Allocate space for memory structures 6378 */ 6379 k = pBt->pageSize + ROUND8(sizeof(MemPage)); 6380 szScratch = 6381 nMaxCells*sizeof(u8*) /* apCell */ 6382 + nMaxCells*sizeof(u16) /* szCell */ 6383 + pBt->pageSize /* aSpace1 */ 6384 + k*nOld; /* Page copies (apCopy) */ 6385 apCell = sqlite3ScratchMalloc( szScratch ); 6386 if( apCell==0 ){ 6387 rc = SQLITE_NOMEM; 6388 goto balance_cleanup; 6389 } 6390 szCell = (u16*)&apCell[nMaxCells]; 6391 aSpace1 = (u8*)&szCell[nMaxCells]; 6392 assert( EIGHT_BYTE_ALIGNMENT(aSpace1) ); 6393 6394 /* 6395 ** Load pointers to all cells on sibling pages and the divider cells 6396 ** into the local apCell[] array. Make copies of the divider cells 6397 ** into space obtained from aSpace1[] and remove the divider cells 6398 ** from pParent. 6399 ** 6400 ** If the siblings are on leaf pages, then the child pointers of the 6401 ** divider cells are stripped from the cells before they are copied 6402 ** into aSpace1[]. In this way, all cells in apCell[] are without 6403 ** child pointers. If siblings are not leaves, then all cell in 6404 ** apCell[] include child pointers. Either way, all cells in apCell[] 6405 ** are alike. 6406 ** 6407 ** leafCorrection: 4 if pPage is a leaf. 0 if pPage is not a leaf. 6408 ** leafData: 1 if pPage holds key+data and pParent holds only keys. 
6409 */ 6410 leafCorrection = apOld[0]->leaf*4; 6411 leafData = apOld[0]->intKeyLeaf; 6412 for(i=0; i<nOld; i++){ 6413 int limit; 6414 6415 /* Before doing anything else, take a copy of the i'th original sibling 6416 ** The rest of this function will use data from the copies rather 6417 ** that the original pages since the original pages will be in the 6418 ** process of being overwritten. */ 6419 MemPage *pOld = apCopy[i] = (MemPage*)&aSpace1[pBt->pageSize + k*i]; 6420 memcpy(pOld, apOld[i], sizeof(MemPage)); 6421 pOld->aData = (void*)&pOld[1]; 6422 memcpy(pOld->aData, apOld[i]->aData, pBt->pageSize); 6423 6424 limit = pOld->nCell+pOld->nOverflow; 6425 if( pOld->nOverflow>0 ){ 6426 for(j=0; j<limit; j++){ 6427 assert( nCell<nMaxCells ); 6428 apCell[nCell] = findOverflowCell(pOld, j); 6429 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 6430 nCell++; 6431 } 6432 }else{ 6433 u8 *aData = pOld->aData; 6434 u16 maskPage = pOld->maskPage; 6435 u16 cellOffset = pOld->cellOffset; 6436 for(j=0; j<limit; j++){ 6437 assert( nCell<nMaxCells ); 6438 apCell[nCell] = findCellv2(aData, maskPage, cellOffset, j); 6439 szCell[nCell] = cellSizePtr(pOld, apCell[nCell]); 6440 nCell++; 6441 } 6442 } 6443 if( i<nOld-1 && !leafData){ 6444 u16 sz = (u16)szNew[i]; 6445 u8 *pTemp; 6446 assert( nCell<nMaxCells ); 6447 szCell[nCell] = sz; 6448 pTemp = &aSpace1[iSpace1]; 6449 iSpace1 += sz; 6450 assert( sz<=pBt->maxLocal+23 ); 6451 assert( iSpace1 <= (int)pBt->pageSize ); 6452 memcpy(pTemp, apDiv[i], sz); 6453 apCell[nCell] = pTemp+leafCorrection; 6454 assert( leafCorrection==0 || leafCorrection==4 ); 6455 szCell[nCell] = szCell[nCell] - leafCorrection; 6456 if( !pOld->leaf ){ 6457 assert( leafCorrection==0 ); 6458 assert( pOld->hdrOffset==0 ); 6459 /* The right pointer of the child page pOld becomes the left 6460 ** pointer of the divider cell */ 6461 memcpy(apCell[nCell], &pOld->aData[8], 4); 6462 }else{ 6463 assert( leafCorrection==4 ); 6464 if( szCell[nCell]<4 ){ 6465 /* Do not allow any 
cells smaller than 4 bytes. */ 6466 szCell[nCell] = 4; 6467 } 6468 } 6469 nCell++; 6470 } 6471 } 6472 6473 /* 6474 ** Figure out the number of pages needed to hold all nCell cells. 6475 ** Store this number in "k". Also compute szNew[] which is the total 6476 ** size of all cells on the i-th page and cntNew[] which is the index 6477 ** in apCell[] of the cell that divides page i from page i+1. 6478 ** cntNew[k] should equal nCell. 6479 ** 6480 ** Values computed by this block: 6481 ** 6482 ** k: The total number of sibling pages 6483 ** szNew[i]: Spaced used on the i-th sibling page. 6484 ** cntNew[i]: Index in apCell[] and szCell[] for the first cell to 6485 ** the right of the i-th sibling page. 6486 ** usableSpace: Number of bytes of space available on each sibling. 6487 ** 6488 */ 6489 usableSpace = pBt->usableSize - 12 + leafCorrection; 6490 for(subtotal=k=i=0; i<nCell; i++){ 6491 assert( i<nMaxCells ); 6492 subtotal += szCell[i] + 2; 6493 if( subtotal > usableSpace ){ 6494 szNew[k] = subtotal - szCell[i]; 6495 cntNew[k] = i; 6496 if( leafData ){ i--; } 6497 subtotal = 0; 6498 k++; 6499 if( k>NB+1 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; } 6500 } 6501 } 6502 szNew[k] = subtotal; 6503 cntNew[k] = nCell; 6504 k++; 6505 6506 /* 6507 ** The packing computed by the previous block is biased toward the siblings 6508 ** on the left side. The left siblings are always nearly full, while the 6509 ** right-most sibling might be nearly empty. This block of code attempts 6510 ** to adjust the packing of siblings to get a better balance. 6511 ** 6512 ** This adjustment is more than an optimization. The packing above might 6513 ** be so out of balance as to be illegal. For example, the right-most 6514 ** sibling might be completely empty. This adjustment is not optional. 
6515 */ 6516 for(i=k-1; i>0; i--){ 6517 int szRight = szNew[i]; /* Size of sibling on the right */ 6518 int szLeft = szNew[i-1]; /* Size of sibling on the left */ 6519 int r; /* Index of right-most cell in left sibling */ 6520 int d; /* Index of first cell to the left of right sibling */ 6521 6522 r = cntNew[i-1] - 1; 6523 d = r + 1 - leafData; 6524 assert( d<nMaxCells ); 6525 assert( r<nMaxCells ); 6526 while( szRight==0 6527 || (!bBulk && szRight+szCell[d]+2<=szLeft-(szCell[r]+2)) 6528 ){ 6529 szRight += szCell[d] + 2; 6530 szLeft -= szCell[r] + 2; 6531 cntNew[i-1]--; 6532 r = cntNew[i-1] - 1; 6533 d = r + 1 - leafData; 6534 } 6535 szNew[i] = szRight; 6536 szNew[i-1] = szLeft; 6537 } 6538 6539 /* Either we found one or more cells (cntnew[0])>0) or pPage is 6540 ** a virtual root page. A virtual root page is when the real root 6541 ** page is page 1 and we are the only child of that page. 6542 ** 6543 ** UPDATE: The assert() below is not necessarily true if the database 6544 ** file is corrupt. The corruption will be detected and reported later 6545 ** in this procedure so there is no need to act upon it now. 6546 */ 6547 #if 0 6548 assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) ); 6549 #endif 6550 6551 TRACE(("BALANCE: old: %d %d %d ", 6552 apOld[0]->pgno, 6553 nOld>=2 ? apOld[1]->pgno : 0, 6554 nOld>=3 ? apOld[2]->pgno : 0 6555 )); 6556 6557 /* 6558 ** Allocate k new pages. Reuse old pages where possible. 6559 */ 6560 if( apOld[0]->pgno<=1 ){ 6561 rc = SQLITE_CORRUPT_BKPT; 6562 goto balance_cleanup; 6563 } 6564 pageFlags = apOld[0]->aData[0]; 6565 for(i=0; i<k; i++){ 6566 MemPage *pNew; 6567 if( i<nOld ){ 6568 pNew = apNew[i] = apOld[i]; 6569 apOld[i] = 0; 6570 rc = sqlite3PagerWrite(pNew->pDbPage); 6571 nNew++; 6572 if( rc ) goto balance_cleanup; 6573 }else{ 6574 assert( i>0 ); 6575 rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 
1 : pgno), 0); 6576 if( rc ) goto balance_cleanup; 6577 apNew[i] = pNew; 6578 nNew++; 6579 6580 /* Set the pointer-map entry for the new sibling page. */ 6581 if( ISAUTOVACUUM ){ 6582 ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); 6583 if( rc!=SQLITE_OK ){ 6584 goto balance_cleanup; 6585 } 6586 } 6587 } 6588 } 6589 6590 /* Free any old pages that were not reused as new pages. 6591 */ 6592 while( i<nOld ){ 6593 freePage(apOld[i], &rc); 6594 if( rc ) goto balance_cleanup; 6595 releasePage(apOld[i]); 6596 apOld[i] = 0; 6597 i++; 6598 } 6599 6600 /* 6601 ** Put the new pages in ascending order. This helps to 6602 ** keep entries in the disk file in order so that a scan 6603 ** of the table is a linear scan through the file. That 6604 ** in turn helps the operating system to deliver pages 6605 ** from the disk more rapidly. 6606 ** 6607 ** An O(n^2) insertion sort algorithm is used, but since 6608 ** n is never more than NB (a small constant), that should 6609 ** not be a problem. 6610 ** 6611 ** When NB==3, this one optimization makes the database 6612 ** about 25% faster for large insertions and deletions. 6613 */ 6614 for(i=0; i<k-1; i++){ 6615 int minV = apNew[i]->pgno; 6616 int minI = i; 6617 for(j=i+1; j<k; j++){ 6618 if( apNew[j]->pgno<(unsigned)minV ){ 6619 minI = j; 6620 minV = apNew[j]->pgno; 6621 } 6622 } 6623 if( minI>i ){ 6624 MemPage *pT; 6625 pT = apNew[i]; 6626 apNew[i] = apNew[minI]; 6627 apNew[minI] = pT; 6628 } 6629 } 6630 TRACE(("new: %d(%d) %d(%d) %d(%d) %d(%d) %d(%d)\n", 6631 apNew[0]->pgno, szNew[0], 6632 nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0, 6633 nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0, 6634 nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0, 6635 nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0)); 6636 6637 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6638 put4byte(pRight, apNew[nNew-1]->pgno); 6639 6640 /* 6641 ** Evenly distribute the data in apCell[] across the new pages. 
6642 ** Insert divider cells into pParent as necessary. 6643 */ 6644 j = 0; 6645 for(i=0; i<nNew; i++){ 6646 /* Assemble the new sibling page. */ 6647 MemPage *pNew = apNew[i]; 6648 assert( j<nMaxCells ); 6649 zeroPage(pNew, pageFlags); 6650 assemblePage(pNew, cntNew[i]-j, &apCell[j], &szCell[j]); 6651 assert( pNew->nCell>0 || (nNew==1 && cntNew[0]==0) ); 6652 assert( pNew->nOverflow==0 ); 6653 6654 j = cntNew[i]; 6655 6656 /* If the sibling page assembled above was not the right-most sibling, 6657 ** insert a divider cell into the parent page. 6658 */ 6659 assert( i<nNew-1 || j==nCell ); 6660 if( j<nCell ){ 6661 u8 *pCell; 6662 u8 *pTemp; 6663 int sz; 6664 6665 assert( j<nMaxCells ); 6666 pCell = apCell[j]; 6667 sz = szCell[j] + leafCorrection; 6668 pTemp = &aOvflSpace[iOvflSpace]; 6669 if( !pNew->leaf ){ 6670 memcpy(&pNew->aData[8], pCell, 4); 6671 }else if( leafData ){ 6672 /* If the tree is a leaf-data tree, and the siblings are leaves, 6673 ** then there is no divider cell in apCell[]. Instead, the divider 6674 ** cell consists of the integer key for the right-most cell of 6675 ** the sibling-page assembled above only. 6676 */ 6677 CellInfo info; 6678 j--; 6679 btreeParseCellPtr(pNew, apCell[j], &info); 6680 pCell = pTemp; 6681 sz = 4 + putVarint(&pCell[4], info.nKey); 6682 pTemp = 0; 6683 }else{ 6684 pCell -= 4; 6685 /* Obscure case for non-leaf-data trees: If the cell at pCell was 6686 ** previously stored on a leaf node, and its reported size was 4 6687 ** bytes, then it may actually be smaller than this 6688 ** (see btreeParseCellPtr(), 4 bytes is the minimum size of 6689 ** any cell). But it is important to pass the correct size to 6690 ** insertCell(), so reparse the cell now. 6691 ** 6692 ** Note that this can never happen in an SQLite data file, as all 6693 ** cells are at least 4 bytes. It only happens in b-trees used 6694 ** to evaluate "IN (SELECT ...)" and similar clauses. 
6695 */ 6696 if( szCell[j]==4 ){ 6697 assert(leafCorrection==4); 6698 sz = cellSizePtr(pParent, pCell); 6699 } 6700 } 6701 iOvflSpace += sz; 6702 assert( sz<=pBt->maxLocal+23 ); 6703 assert( iOvflSpace <= (int)pBt->pageSize ); 6704 insertCell(pParent, nxDiv, pCell, sz, pTemp, pNew->pgno, &rc); 6705 if( rc!=SQLITE_OK ) goto balance_cleanup; 6706 assert( sqlite3PagerIswriteable(pParent->pDbPage) ); 6707 6708 j++; 6709 nxDiv++; 6710 } 6711 } 6712 assert( j==nCell ); 6713 assert( nOld>0 ); 6714 assert( nNew>0 ); 6715 if( (pageFlags & PTF_LEAF)==0 ){ 6716 u8 *zChild = &apCopy[nOld-1]->aData[8]; 6717 memcpy(&apNew[nNew-1]->aData[8], zChild, 4); 6718 } 6719 6720 if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){ 6721 /* The root page of the b-tree now contains no cells. The only sibling 6722 ** page is the right-child of the parent. Copy the contents of the 6723 ** child page into the parent, decreasing the overall height of the 6724 ** b-tree structure by one. This is described as the "balance-shallower" 6725 ** sub-algorithm in some documentation. 6726 ** 6727 ** If this is an auto-vacuum database, the call to copyNodeContent() 6728 ** sets all pointer-map entries corresponding to database image pages 6729 ** for which the pointer is stored within the content being copied. 6730 ** 6731 ** The second assert below verifies that the child page is defragmented 6732 ** (it must be, as it was just reconstructed using assemblePage()). This 6733 ** is important if the parent page happens to be page 1 of the database 6734 ** image. */ 6735 assert( nNew==1 ); 6736 assert( apNew[0]->nFree == 6737 (get2byte(&apNew[0]->aData[5])-apNew[0]->cellOffset-apNew[0]->nCell*2) 6738 ); 6739 copyNodeContent(apNew[0], pParent, &rc); 6740 freePage(apNew[0], &rc); 6741 }else if( ISAUTOVACUUM ){ 6742 /* Fix the pointer-map entries for all the cells that were shifted around. 
6743 ** There are several different types of pointer-map entries that need to 6744 ** be dealt with by this routine. Some of these have been set already, but 6745 ** many have not. The following is a summary: 6746 ** 6747 ** 1) The entries associated with new sibling pages that were not 6748 ** siblings when this function was called. These have already 6749 ** been set. We don't need to worry about old siblings that were 6750 ** moved to the free-list - the freePage() code has taken care 6751 ** of those. 6752 ** 6753 ** 2) The pointer-map entries associated with the first overflow 6754 ** page in any overflow chains used by new divider cells. These 6755 ** have also already been taken care of by the insertCell() code. 6756 ** 6757 ** 3) If the sibling pages are not leaves, then the child pages of 6758 ** cells stored on the sibling pages may need to be updated. 6759 ** 6760 ** 4) If the sibling pages are not internal intkey nodes, then any 6761 ** overflow pages used by these cells may need to be updated 6762 ** (internal intkey nodes never contain pointers to overflow pages). 6763 ** 6764 ** 5) If the sibling pages are not leaves, then the pointer-map 6765 ** entries for the right-child pages of each sibling may need 6766 ** to be updated. 6767 ** 6768 ** Cases 1 and 2 are dealt with above by other code. The next 6769 ** block deals with cases 3 and 4 and the one after that, case 5. Since 6770 ** setting a pointer map entry is a relatively expensive operation, this 6771 ** code only sets pointer map entries for child or overflow pages that have 6772 ** actually moved between pages. */ 6773 MemPage *pNew = apNew[0]; 6774 MemPage *pOld = apCopy[0]; 6775 int nOverflow = pOld->nOverflow; 6776 int iNextOld = pOld->nCell + nOverflow; 6777 int iOverflow = (nOverflow ? 
pOld->aiOvfl[0] : -1); 6778 j = 0; /* Current 'old' sibling page */ 6779 k = 0; /* Current 'new' sibling page */ 6780 for(i=0; i<nCell; i++){ 6781 int isDivider = 0; 6782 while( i==iNextOld ){ 6783 /* Cell i is the cell immediately following the last cell on old 6784 ** sibling page j. If the siblings are not leaf pages of an 6785 ** intkey b-tree, then cell i was a divider cell. */ 6786 assert( j+1 < ArraySize(apCopy) ); 6787 assert( j+1 < nOld ); 6788 pOld = apCopy[++j]; 6789 iNextOld = i + !leafData + pOld->nCell + pOld->nOverflow; 6790 if( pOld->nOverflow ){ 6791 nOverflow = pOld->nOverflow; 6792 iOverflow = i + !leafData + pOld->aiOvfl[0]; 6793 } 6794 isDivider = !leafData; 6795 } 6796 6797 assert(nOverflow>0 || iOverflow<i ); 6798 assert(nOverflow<2 || pOld->aiOvfl[0]==pOld->aiOvfl[1]-1); 6799 assert(nOverflow<3 || pOld->aiOvfl[1]==pOld->aiOvfl[2]-1); 6800 if( i==iOverflow ){ 6801 isDivider = 1; 6802 if( (--nOverflow)>0 ){ 6803 iOverflow++; 6804 } 6805 } 6806 6807 if( i==cntNew[k] ){ 6808 /* Cell i is the cell immediately following the last cell on new 6809 ** sibling page k. If the siblings are not leaf pages of an 6810 ** intkey b-tree, then cell i is a divider cell. */ 6811 pNew = apNew[++k]; 6812 if( !leafData ) continue; 6813 } 6814 assert( j<nOld ); 6815 assert( k<nNew ); 6816 6817 /* If the cell was originally divider cell (and is not now) or 6818 ** an overflow cell, or if the cell was located on a different sibling 6819 ** page before the balancing, then the pointer map entries associated 6820 ** with any child or overflow pages need to be updated. 
*/ 6821 if( isDivider || pOld->pgno!=pNew->pgno ){ 6822 if( !leafCorrection ){ 6823 ptrmapPut(pBt, get4byte(apCell[i]), PTRMAP_BTREE, pNew->pgno, &rc); 6824 } 6825 if( szCell[i]>pNew->minLocal ){ 6826 ptrmapPutOvflPtr(pNew, apCell[i], &rc); 6827 } 6828 } 6829 } 6830 6831 if( !leafCorrection ){ 6832 for(i=0; i<nNew; i++){ 6833 u32 key = get4byte(&apNew[i]->aData[8]); 6834 ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc); 6835 } 6836 } 6837 6838 #if 0 6839 /* The ptrmapCheckPages() contains assert() statements that verify that 6840 ** all pointer map pages are set correctly. This is helpful while 6841 ** debugging. This is usually disabled because a corrupt database may 6842 ** cause an assert() statement to fail. */ 6843 ptrmapCheckPages(apNew, nNew); 6844 ptrmapCheckPages(&pParent, 1); 6845 #endif 6846 } 6847 6848 assert( pParent->isInit ); 6849 TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n", 6850 nOld, nNew, nCell)); 6851 6852 /* 6853 ** Cleanup before returning. 6854 */ 6855 balance_cleanup: 6856 sqlite3ScratchFree(apCell); 6857 for(i=0; i<nOld; i++){ 6858 releasePage(apOld[i]); 6859 } 6860 for(i=0; i<nNew; i++){ 6861 releasePage(apNew[i]); 6862 } 6863 6864 return rc; 6865 } 6866 #if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM) 6867 #pragma optimize("", on) 6868 #endif 6869 6870 6871 /* 6872 ** This function is called when the root page of a b-tree structure is 6873 ** overfull (has one or more overflow pages). 6874 ** 6875 ** A new child page is allocated and the contents of the current root 6876 ** page, including overflow cells, are copied into the child. The root 6877 ** page is then overwritten to make it an empty page with the right-child 6878 ** pointer pointing to the new page. 6879 ** 6880 ** Before returning, all pointer-map entries corresponding to pages 6881 ** that the new child-page now contains pointers to are updated. The 6882 ** entry corresponding to the new right-child pointer of the root 6883 ** page is also updated. 
6884 ** 6885 ** If successful, *ppChild is set to contain a reference to the child 6886 ** page and SQLITE_OK is returned. In this case the caller is required 6887 ** to call releasePage() on *ppChild exactly once. If an error occurs, 6888 ** an error code is returned and *ppChild is set to 0. 6889 */ 6890 static int balance_deeper(MemPage *pRoot, MemPage **ppChild){ 6891 int rc; /* Return value from subprocedures */ 6892 MemPage *pChild = 0; /* Pointer to a new child page */ 6893 Pgno pgnoChild = 0; /* Page number of the new child page */ 6894 BtShared *pBt = pRoot->pBt; /* The BTree */ 6895 6896 assert( pRoot->nOverflow>0 ); 6897 assert( sqlite3_mutex_held(pBt->mutex) ); 6898 6899 /* Make pRoot, the root page of the b-tree, writable. Allocate a new 6900 ** page that will become the new right-child of pPage. Copy the contents 6901 ** of the node stored on pRoot into the new child page. 6902 */ 6903 rc = sqlite3PagerWrite(pRoot->pDbPage); 6904 if( rc==SQLITE_OK ){ 6905 rc = allocateBtreePage(pBt,&pChild,&pgnoChild,pRoot->pgno,0); 6906 copyNodeContent(pRoot, pChild, &rc); 6907 if( ISAUTOVACUUM ){ 6908 ptrmapPut(pBt, pgnoChild, PTRMAP_BTREE, pRoot->pgno, &rc); 6909 } 6910 } 6911 if( rc ){ 6912 *ppChild = 0; 6913 releasePage(pChild); 6914 return rc; 6915 } 6916 assert( sqlite3PagerIswriteable(pChild->pDbPage) ); 6917 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 6918 assert( pChild->nCell==pRoot->nCell ); 6919 6920 TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pChild->pgno)); 6921 6922 /* Copy the overflow cells from pRoot to pChild */ 6923 memcpy(pChild->aiOvfl, pRoot->aiOvfl, 6924 pRoot->nOverflow*sizeof(pRoot->aiOvfl[0])); 6925 memcpy(pChild->apOvfl, pRoot->apOvfl, 6926 pRoot->nOverflow*sizeof(pRoot->apOvfl[0])); 6927 pChild->nOverflow = pRoot->nOverflow; 6928 6929 /* Zero the contents of pRoot. Then install pChild as the right-child. 
*/ 6930 zeroPage(pRoot, pChild->aData[0] & ~PTF_LEAF); 6931 put4byte(&pRoot->aData[pRoot->hdrOffset+8], pgnoChild); 6932 6933 *ppChild = pChild; 6934 return SQLITE_OK; 6935 } 6936 6937 /* 6938 ** The page that pCur currently points to has just been modified in 6939 ** some way. This function figures out if this modification means the 6940 ** tree needs to be balanced, and if so calls the appropriate balancing 6941 ** routine. Balancing routines are: 6942 ** 6943 ** balance_quick() 6944 ** balance_deeper() 6945 ** balance_nonroot() 6946 */ 6947 static int balance(BtCursor *pCur){ 6948 int rc = SQLITE_OK; 6949 const int nMin = pCur->pBt->usableSize * 2 / 3; 6950 u8 aBalanceQuickSpace[13]; 6951 u8 *pFree = 0; 6952 6953 TESTONLY( int balance_quick_called = 0 ); 6954 TESTONLY( int balance_deeper_called = 0 ); 6955 6956 do { 6957 int iPage = pCur->iPage; 6958 MemPage *pPage = pCur->apPage[iPage]; 6959 6960 if( iPage==0 ){ 6961 if( pPage->nOverflow ){ 6962 /* The root page of the b-tree is overfull. In this case call the 6963 ** balance_deeper() function to create a new child for the root-page 6964 ** and copy the current contents of the root-page to it. The 6965 ** next iteration of the do-loop will balance the child page. 
6966 */ 6967 assert( (balance_deeper_called++)==0 ); 6968 rc = balance_deeper(pPage, &pCur->apPage[1]); 6969 if( rc==SQLITE_OK ){ 6970 pCur->iPage = 1; 6971 pCur->aiIdx[0] = 0; 6972 pCur->aiIdx[1] = 0; 6973 assert( pCur->apPage[1]->nOverflow ); 6974 } 6975 }else{ 6976 break; 6977 } 6978 }else if( pPage->nOverflow==0 && pPage->nFree<=nMin ){ 6979 break; 6980 }else{ 6981 MemPage * const pParent = pCur->apPage[iPage-1]; 6982 int const iIdx = pCur->aiIdx[iPage-1]; 6983 6984 rc = sqlite3PagerWrite(pParent->pDbPage); 6985 if( rc==SQLITE_OK ){ 6986 #ifndef SQLITE_OMIT_QUICKBALANCE 6987 if( pPage->intKeyLeaf 6988 && pPage->nOverflow==1 6989 && pPage->aiOvfl[0]==pPage->nCell 6990 && pParent->pgno!=1 6991 && pParent->nCell==iIdx 6992 ){ 6993 /* Call balance_quick() to create a new sibling of pPage on which 6994 ** to store the overflow cell. balance_quick() inserts a new cell 6995 ** into pParent, which may cause pParent overflow. If this 6996 ** happens, the next iteration of the do-loop will balance pParent 6997 ** use either balance_nonroot() or balance_deeper(). Until this 6998 ** happens, the overflow cell is stored in the aBalanceQuickSpace[] 6999 ** buffer. 7000 ** 7001 ** The purpose of the following assert() is to check that only a 7002 ** single call to balance_quick() is made for each call to this 7003 ** function. If this were not verified, a subtle bug involving reuse 7004 ** of the aBalanceQuickSpace[] might sneak in. 7005 */ 7006 assert( (balance_quick_called++)==0 ); 7007 rc = balance_quick(pParent, pPage, aBalanceQuickSpace); 7008 }else 7009 #endif 7010 { 7011 /* In this case, call balance_nonroot() to redistribute cells 7012 ** between pPage and up to 2 of its sibling pages. This involves 7013 ** modifying the contents of pParent, which may cause pParent to 7014 ** become overfull or underfull. The next iteration of the do-loop 7015 ** will balance the parent page to correct this. 
7016 ** 7017 ** If the parent page becomes overfull, the overflow cell or cells 7018 ** are stored in the pSpace buffer allocated immediately below. 7019 ** A subsequent iteration of the do-loop will deal with this by 7020 ** calling balance_nonroot() (balance_deeper() may be called first, 7021 ** but it doesn't deal with overflow cells - just moves them to a 7022 ** different page). Once this subsequent call to balance_nonroot() 7023 ** has completed, it is safe to release the pSpace buffer used by 7024 ** the previous call, as the overflow cell data will have been 7025 ** copied either into the body of a database page or into the new 7026 ** pSpace buffer passed to the latter call to balance_nonroot(). 7027 */ 7028 u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); 7029 rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints); 7030 if( pFree ){ 7031 /* If pFree is not NULL, it points to the pSpace buffer used 7032 ** by a previous call to balance_nonroot(). Its contents are 7033 ** now stored either on real database pages or within the 7034 ** new pSpace buffer, so it may be safely freed here. */ 7035 sqlite3PageFree(pFree); 7036 } 7037 7038 /* The pSpace buffer will be freed after the next call to 7039 ** balance_nonroot(), or just before this function returns, whichever 7040 ** comes first. */ 7041 pFree = pSpace; 7042 } 7043 } 7044 7045 pPage->nOverflow = 0; 7046 7047 /* The next iteration of the do-loop balances the parent page. */ 7048 releasePage(pPage); 7049 pCur->iPage--; 7050 } 7051 }while( rc==SQLITE_OK ); 7052 7053 if( pFree ){ 7054 sqlite3PageFree(pFree); 7055 } 7056 return rc; 7057 } 7058 7059 7060 /* 7061 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 7062 ** and the data is given by (pData,nData). The cursor is used only to 7063 ** define what table the record should be inserted into. The cursor 7064 ** is left pointing at a random location. 
7065 ** 7066 ** For an INTKEY table, only the nKey value of the key is used. pKey is 7067 ** ignored. For a ZERODATA table, the pData and nData are both ignored. 7068 ** 7069 ** If the seekResult parameter is non-zero, then a successful call to 7070 ** MovetoUnpacked() to seek cursor pCur to (pKey, nKey) has already 7071 ** been performed. seekResult is the search result returned (a negative 7072 ** number if pCur points at an entry that is smaller than (pKey, nKey), or 7073 ** a positive value if pCur points at an entry that is larger than 7074 ** (pKey, nKey)). 7075 ** 7076 ** If the seekResult parameter is non-zero, then the caller guarantees that 7077 ** cursor pCur is pointing at the existing copy of a row that is to be 7078 ** overwritten. If the seekResult parameter is 0, then cursor pCur may 7079 ** point to any entry or to no entry at all and so this function has to seek 7080 ** the cursor before the new key can be inserted. 7081 */ 7082 int sqlite3BtreeInsert( 7083 BtCursor *pCur, /* Insert data into the table of this cursor */ 7084 const void *pKey, i64 nKey, /* The key of the new record */ 7085 const void *pData, int nData, /* The data of the new record */ 7086 int nZero, /* Number of extra 0 bytes to append to data */ 7087 int appendBias, /* True if this is likely an append */ 7088 int seekResult /* Result of prior MovetoUnpacked() call */ 7089 ){ 7090 int rc; 7091 int loc = seekResult; /* -1: before desired location +1: after */ 7092 int szNew = 0; 7093 int idx; 7094 MemPage *pPage; 7095 Btree *p = pCur->pBtree; 7096 BtShared *pBt = p->pBt; 7097 unsigned char *oldCell; 7098 unsigned char *newCell = 0; 7099 7100 if( pCur->eState==CURSOR_FAULT ){ 7101 assert( pCur->skipNext!=SQLITE_OK ); 7102 return pCur->skipNext; 7103 } 7104 7105 assert( cursorHoldsMutex(pCur) ); 7106 assert( (pCur->curFlags & BTCF_WriteFlag)!=0 7107 && pBt->inTransaction==TRANS_WRITE 7108 && (pBt->btsFlags & BTS_READ_ONLY)==0 ); 7109 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, 
pCur->pKeyInfo!=0, 2) ); 7110 7111 /* Assert that the caller has been consistent. If this cursor was opened 7112 ** expecting an index b-tree, then the caller should be inserting blob 7113 ** keys with no associated data. If the cursor was opened expecting an 7114 ** intkey table, the caller should be inserting integer keys with a 7115 ** blob of associated data. */ 7116 assert( (pKey==0)==(pCur->pKeyInfo==0) ); 7117 7118 /* Save the positions of any other cursors open on this table. 7119 ** 7120 ** In some cases, the call to btreeMoveto() below is a no-op. For 7121 ** example, when inserting data into a table with auto-generated integer 7122 ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the 7123 ** integer key to use. It then calls this function to actually insert the 7124 ** data into the intkey B-Tree. In this case btreeMoveto() recognizes 7125 ** that the cursor is already where it needs to be and returns without 7126 ** doing any work. To avoid thwarting these optimizations, it is important 7127 ** not to clear the cursor here. 
7128 */ 7129 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 7130 if( rc ) return rc; 7131 7132 if( pCur->pKeyInfo==0 ){ 7133 /* If this is an insert into a table b-tree, invalidate any incrblob 7134 ** cursors open on the row being replaced */ 7135 invalidateIncrblobCursors(p, nKey, 0); 7136 7137 /* If the cursor is currently on the last row and we are appending a 7138 ** new row onto the end, set the "loc" to avoid an unnecessary btreeMoveto() 7139 ** call */ 7140 if( (pCur->curFlags&BTCF_ValidNKey)!=0 && nKey>0 7141 && pCur->info.nKey==nKey-1 ){ 7142 loc = -1; 7143 } 7144 } 7145 7146 if( !loc ){ 7147 rc = btreeMoveto(pCur, pKey, nKey, appendBias, &loc); 7148 if( rc ) return rc; 7149 } 7150 assert( pCur->eState==CURSOR_VALID || (pCur->eState==CURSOR_INVALID && loc) ); 7151 7152 pPage = pCur->apPage[pCur->iPage]; 7153 assert( pPage->intKey || nKey>=0 ); 7154 assert( pPage->leaf || !pPage->intKey ); 7155 7156 TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n", 7157 pCur->pgnoRoot, nKey, nData, pPage->pgno, 7158 loc==0 ? 
"overwrite" : "new entry")); 7159 assert( pPage->isInit ); 7160 newCell = pBt->pTmpSpace; 7161 assert( newCell!=0 ); 7162 rc = fillInCell(pPage, newCell, pKey, nKey, pData, nData, nZero, &szNew); 7163 if( rc ) goto end_insert; 7164 assert( szNew==cellSizePtr(pPage, newCell) ); 7165 assert( szNew <= MX_CELL_SIZE(pBt) ); 7166 idx = pCur->aiIdx[pCur->iPage]; 7167 if( loc==0 ){ 7168 u16 szOld; 7169 assert( idx<pPage->nCell ); 7170 rc = sqlite3PagerWrite(pPage->pDbPage); 7171 if( rc ){ 7172 goto end_insert; 7173 } 7174 oldCell = findCell(pPage, idx); 7175 if( !pPage->leaf ){ 7176 memcpy(newCell, oldCell, 4); 7177 } 7178 rc = clearCell(pPage, oldCell, &szOld); 7179 dropCell(pPage, idx, szOld, &rc); 7180 if( rc ) goto end_insert; 7181 }else if( loc<0 && pPage->nCell>0 ){ 7182 assert( pPage->leaf ); 7183 idx = ++pCur->aiIdx[pCur->iPage]; 7184 }else{ 7185 assert( pPage->leaf ); 7186 } 7187 insertCell(pPage, idx, newCell, szNew, 0, 0, &rc); 7188 assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 ); 7189 7190 /* If no error has occurred and pPage has an overflow cell, call balance() 7191 ** to redistribute the cells within the tree. Since balance() may move 7192 ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey 7193 ** variables. 7194 ** 7195 ** Previous versions of SQLite called moveToRoot() to move the cursor 7196 ** back to the root page as balance() used to invalidate the contents 7197 ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that, 7198 ** set the cursor state to "invalid". This makes common insert operations 7199 ** slightly faster. 7200 ** 7201 ** There is a subtle but important optimization here too. When inserting 7202 ** multiple records into an intkey b-tree using a single cursor (as can 7203 ** happen while processing an "INSERT INTO ... SELECT" statement), it 7204 ** is advantageous to leave the cursor pointing to the last entry in 7205 ** the b-tree if possible. 
If the cursor is left pointing to the last 7206 ** entry in the table, and the next row inserted has an integer key 7207 ** larger than the largest existing key, it is possible to insert the 7208 ** row without seeking the cursor. This can be a big performance boost. 7209 */ 7210 pCur->info.nSize = 0; 7211 if( rc==SQLITE_OK && pPage->nOverflow ){ 7212 pCur->curFlags &= ~(BTCF_ValidNKey); 7213 rc = balance(pCur); 7214 7215 /* Must make sure nOverflow is reset to zero even if the balance() 7216 ** fails. Internal data structure corruption will result otherwise. 7217 ** Also, set the cursor state to invalid. This stops saveCursorPosition() 7218 ** from trying to save the current position of the cursor. */ 7219 pCur->apPage[pCur->iPage]->nOverflow = 0; 7220 pCur->eState = CURSOR_INVALID; 7221 } 7222 assert( pCur->apPage[pCur->iPage]->nOverflow==0 ); 7223 7224 end_insert: 7225 return rc; 7226 } 7227 7228 /* 7229 ** Delete the entry that the cursor is pointing to. The cursor 7230 ** is left pointing at an arbitrary location. 7231 */ 7232 int sqlite3BtreeDelete(BtCursor *pCur){ 7233 Btree *p = pCur->pBtree; 7234 BtShared *pBt = p->pBt; 7235 int rc; /* Return code */ 7236 MemPage *pPage; /* Page to delete cell from */ 7237 unsigned char *pCell; /* Pointer to cell to delete */ 7238 int iCellIdx; /* Index of cell to delete */ 7239 int iCellDepth; /* Depth of node containing pCell */ 7240 u16 szCell; /* Size of the cell being deleted */ 7241 7242 assert( cursorHoldsMutex(pCur) ); 7243 assert( pBt->inTransaction==TRANS_WRITE ); 7244 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 7245 assert( pCur->curFlags & BTCF_WriteFlag ); 7246 assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) ); 7247 assert( !hasReadConflicts(p, pCur->pgnoRoot) ); 7248 7249 if( NEVER(pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell) 7250 || NEVER(pCur->eState!=CURSOR_VALID) 7251 ){ 7252 return SQLITE_ERROR; /* Something has gone awry. 
*/ 7253 } 7254 7255 iCellDepth = pCur->iPage; 7256 iCellIdx = pCur->aiIdx[iCellDepth]; 7257 pPage = pCur->apPage[iCellDepth]; 7258 pCell = findCell(pPage, iCellIdx); 7259 7260 /* If the page containing the entry to delete is not a leaf page, move 7261 ** the cursor to the largest entry in the tree that is smaller than 7262 ** the entry being deleted. This cell will replace the cell being deleted 7263 ** from the internal node. The 'previous' entry is used for this instead 7264 ** of the 'next' entry, as the previous entry is always a part of the 7265 ** sub-tree headed by the child page of the cell being deleted. This makes 7266 ** balancing the tree following the delete operation easier. */ 7267 if( !pPage->leaf ){ 7268 int notUsed = 0; 7269 rc = sqlite3BtreePrevious(pCur, &notUsed); 7270 if( rc ) return rc; 7271 } 7272 7273 /* Save the positions of any other cursors open on this table before 7274 ** making any modifications. Make the page containing the entry to be 7275 ** deleted writable. Then free any overflow pages associated with the 7276 ** entry and finally remove the cell itself from within the page. 7277 */ 7278 rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur); 7279 if( rc ) return rc; 7280 7281 /* If this is a delete operation to remove a row from a table b-tree, 7282 ** invalidate any incrblob cursors open on the row being deleted. */ 7283 if( pCur->pKeyInfo==0 ){ 7284 invalidateIncrblobCursors(p, pCur->info.nKey, 0); 7285 } 7286 7287 rc = sqlite3PagerWrite(pPage->pDbPage); 7288 if( rc ) return rc; 7289 rc = clearCell(pPage, pCell, &szCell); 7290 dropCell(pPage, iCellIdx, szCell, &rc); 7291 if( rc ) return rc; 7292 7293 /* If the cell deleted was not located on a leaf page, then the cursor 7294 ** is currently pointing to the largest entry in the sub-tree headed 7295 ** by the child-page of the cell that was just deleted from an internal 7296 ** node. The cell from the leaf node needs to be moved to the internal 7297 ** node to replace the deleted cell. 
*/ 7298 if( !pPage->leaf ){ 7299 MemPage *pLeaf = pCur->apPage[pCur->iPage]; 7300 int nCell; 7301 Pgno n = pCur->apPage[iCellDepth+1]->pgno; 7302 unsigned char *pTmp; 7303 7304 pCell = findCell(pLeaf, pLeaf->nCell-1); 7305 nCell = cellSizePtr(pLeaf, pCell); 7306 assert( MX_CELL_SIZE(pBt) >= nCell ); 7307 pTmp = pBt->pTmpSpace; 7308 assert( pTmp!=0 ); 7309 rc = sqlite3PagerWrite(pLeaf->pDbPage); 7310 insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc); 7311 dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc); 7312 if( rc ) return rc; 7313 } 7314 7315 /* Balance the tree. If the entry deleted was located on a leaf page, 7316 ** then the cursor still points to that page. In this case the first 7317 ** call to balance() repairs the tree, and the if(...) condition is 7318 ** never true. 7319 ** 7320 ** Otherwise, if the entry deleted was on an internal node page, then 7321 ** pCur is pointing to the leaf page from which a cell was removed to 7322 ** replace the cell deleted from the internal node. This is slightly 7323 ** tricky as the leaf node may be underfull, and the internal node may 7324 ** be either under or overfull. In this case run the balancing algorithm 7325 ** on the leaf node first. If the balance proceeds far enough up the 7326 ** tree that we can be sure that any problem in the internal node has 7327 ** been corrected, so be it. Otherwise, after balancing the leaf node, 7328 ** walk the cursor up the tree to the internal node and balance it as 7329 ** well. */ 7330 rc = balance(pCur); 7331 if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){ 7332 while( pCur->iPage>iCellDepth ){ 7333 releasePage(pCur->apPage[pCur->iPage--]); 7334 } 7335 rc = balance(pCur); 7336 } 7337 7338 if( rc==SQLITE_OK ){ 7339 moveToRoot(pCur); 7340 } 7341 return rc; 7342 } 7343 7344 /* 7345 ** Create a new BTree table. Write into *piTable the page 7346 ** number for the root page of the new table. 7347 ** 7348 ** The type of type is determined by the flags parameter. 
Only the 7349 ** following values of flags are currently in use. Other values for 7350 ** flags might not work: 7351 ** 7352 ** BTREE_INTKEY|BTREE_LEAFDATA Used for SQL tables with rowid keys 7353 ** BTREE_ZERODATA Used for SQL indices 7354 */ 7355 static int btreeCreateTable(Btree *p, int *piTable, int createTabFlags){ 7356 BtShared *pBt = p->pBt; 7357 MemPage *pRoot; 7358 Pgno pgnoRoot; 7359 int rc; 7360 int ptfFlags; /* Page-type flage for the root page of new table */ 7361 7362 assert( sqlite3BtreeHoldsMutex(p) ); 7363 assert( pBt->inTransaction==TRANS_WRITE ); 7364 assert( (pBt->btsFlags & BTS_READ_ONLY)==0 ); 7365 7366 #ifdef SQLITE_OMIT_AUTOVACUUM 7367 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 7368 if( rc ){ 7369 return rc; 7370 } 7371 #else 7372 if( pBt->autoVacuum ){ 7373 Pgno pgnoMove; /* Move a page here to make room for the root-page */ 7374 MemPage *pPageMove; /* The page to move to. */ 7375 7376 /* Creating a new table may probably require moving an existing database 7377 ** to make room for the new tables root page. In case this page turns 7378 ** out to be an overflow page, delete all overflow page-map caches 7379 ** held by open cursors. 7380 */ 7381 invalidateAllOverflowCache(pBt); 7382 7383 /* Read the value of meta[3] from the database to determine where the 7384 ** root page of the new table should go. meta[3] is the largest root-page 7385 ** created so far, so the new root-page is (meta[3]+1). 7386 */ 7387 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot); 7388 pgnoRoot++; 7389 7390 /* The new root-page may not be allocated on a pointer-map page, or the 7391 ** PENDING_BYTE page. 7392 */ 7393 while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) || 7394 pgnoRoot==PENDING_BYTE_PAGE(pBt) ){ 7395 pgnoRoot++; 7396 } 7397 assert( pgnoRoot>=3 ); 7398 7399 /* Allocate a page. The page that currently resides at pgnoRoot will 7400 ** be moved to the allocated page (unless the allocated page happens 7401 ** to reside at pgnoRoot). 
7402 */ 7403 rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT); 7404 if( rc!=SQLITE_OK ){ 7405 return rc; 7406 } 7407 7408 if( pgnoMove!=pgnoRoot ){ 7409 /* pgnoRoot is the page that will be used for the root-page of 7410 ** the new table (assuming an error did not occur). But we were 7411 ** allocated pgnoMove. If required (i.e. if it was not allocated 7412 ** by extending the file), the current page at position pgnoMove 7413 ** is already journaled. 7414 */ 7415 u8 eType = 0; 7416 Pgno iPtrPage = 0; 7417 7418 /* Save the positions of any open cursors. This is required in 7419 ** case they are holding a reference to an xFetch reference 7420 ** corresponding to page pgnoRoot. */ 7421 rc = saveAllCursors(pBt, 0, 0); 7422 releasePage(pPageMove); 7423 if( rc!=SQLITE_OK ){ 7424 return rc; 7425 } 7426 7427 /* Move the page currently at pgnoRoot to pgnoMove. */ 7428 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 7429 if( rc!=SQLITE_OK ){ 7430 return rc; 7431 } 7432 rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage); 7433 if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){ 7434 rc = SQLITE_CORRUPT_BKPT; 7435 } 7436 if( rc!=SQLITE_OK ){ 7437 releasePage(pRoot); 7438 return rc; 7439 } 7440 assert( eType!=PTRMAP_ROOTPAGE ); 7441 assert( eType!=PTRMAP_FREEPAGE ); 7442 rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0); 7443 releasePage(pRoot); 7444 7445 /* Obtain the page at pgnoRoot */ 7446 if( rc!=SQLITE_OK ){ 7447 return rc; 7448 } 7449 rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0); 7450 if( rc!=SQLITE_OK ){ 7451 return rc; 7452 } 7453 rc = sqlite3PagerWrite(pRoot->pDbPage); 7454 if( rc!=SQLITE_OK ){ 7455 releasePage(pRoot); 7456 return rc; 7457 } 7458 }else{ 7459 pRoot = pPageMove; 7460 } 7461 7462 /* Update the pointer-map and meta-data with the new root-page number. 
*/ 7463 ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc); 7464 if( rc ){ 7465 releasePage(pRoot); 7466 return rc; 7467 } 7468 7469 /* When the new root page was allocated, page 1 was made writable in 7470 ** order either to increase the database filesize, or to decrement the 7471 ** freelist count. Hence, the sqlite3BtreeUpdateMeta() call cannot fail. 7472 */ 7473 assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) ); 7474 rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot); 7475 if( NEVER(rc) ){ 7476 releasePage(pRoot); 7477 return rc; 7478 } 7479 7480 }else{ 7481 rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0); 7482 if( rc ) return rc; 7483 } 7484 #endif 7485 assert( sqlite3PagerIswriteable(pRoot->pDbPage) ); 7486 if( createTabFlags & BTREE_INTKEY ){ 7487 ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF; 7488 }else{ 7489 ptfFlags = PTF_ZERODATA | PTF_LEAF; 7490 } 7491 zeroPage(pRoot, ptfFlags); 7492 sqlite3PagerUnref(pRoot->pDbPage); 7493 assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 ); 7494 *piTable = (int)pgnoRoot; 7495 return SQLITE_OK; 7496 } 7497 int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags){ 7498 int rc; 7499 sqlite3BtreeEnter(p); 7500 rc = btreeCreateTable(p, piTable, flags); 7501 sqlite3BtreeLeave(p); 7502 return rc; 7503 } 7504 7505 /* 7506 ** Erase the given database page and all its children. Return 7507 ** the page to the freelist. 
7508 */ 7509 static int clearDatabasePage( 7510 BtShared *pBt, /* The BTree that contains the table */ 7511 Pgno pgno, /* Page number to clear */ 7512 int freePageFlag, /* Deallocate page if true */ 7513 int *pnChange /* Add number of Cells freed to this counter */ 7514 ){ 7515 MemPage *pPage; 7516 int rc; 7517 unsigned char *pCell; 7518 int i; 7519 int hdr; 7520 u16 szCell; 7521 7522 assert( sqlite3_mutex_held(pBt->mutex) ); 7523 if( pgno>btreePagecount(pBt) ){ 7524 return SQLITE_CORRUPT_BKPT; 7525 } 7526 7527 rc = getAndInitPage(pBt, pgno, &pPage, 0); 7528 if( rc ) return rc; 7529 hdr = pPage->hdrOffset; 7530 for(i=0; i<pPage->nCell; i++){ 7531 pCell = findCell(pPage, i); 7532 if( !pPage->leaf ){ 7533 rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange); 7534 if( rc ) goto cleardatabasepage_out; 7535 } 7536 rc = clearCell(pPage, pCell, &szCell); 7537 if( rc ) goto cleardatabasepage_out; 7538 } 7539 if( !pPage->leaf ){ 7540 rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange); 7541 if( rc ) goto cleardatabasepage_out; 7542 }else if( pnChange ){ 7543 assert( pPage->intKey ); 7544 *pnChange += pPage->nCell; 7545 } 7546 if( freePageFlag ){ 7547 freePage(pPage, &rc); 7548 }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){ 7549 zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF); 7550 } 7551 7552 cleardatabasepage_out: 7553 releasePage(pPage); 7554 return rc; 7555 } 7556 7557 /* 7558 ** Delete all information from a single table in the database. iTable is 7559 ** the page number of the root of the table. After this routine returns, 7560 ** the root page is empty, but still exists. 7561 ** 7562 ** This routine will fail with SQLITE_LOCKED if there are any open 7563 ** read cursors on the table. Open write cursors are moved to the 7564 ** root of the table. 7565 ** 7566 ** If pnChange is not NULL, then table iTable must be an intkey table. 
The 7567 ** integer value pointed to by pnChange is incremented by the number of 7568 ** entries in the table. 7569 */ 7570 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){ 7571 int rc; 7572 BtShared *pBt = p->pBt; 7573 sqlite3BtreeEnter(p); 7574 assert( p->inTrans==TRANS_WRITE ); 7575 7576 rc = saveAllCursors(pBt, (Pgno)iTable, 0); 7577 7578 if( SQLITE_OK==rc ){ 7579 /* Invalidate all incrblob cursors open on table iTable (assuming iTable 7580 ** is the root of a table b-tree - if it is not, the following call is 7581 ** a no-op). */ 7582 invalidateIncrblobCursors(p, 0, 1); 7583 rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange); 7584 } 7585 sqlite3BtreeLeave(p); 7586 return rc; 7587 } 7588 7589 /* 7590 ** Delete all information from the single table that pCur is open on. 7591 ** 7592 ** This routine only work for pCur on an ephemeral table. 7593 */ 7594 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){ 7595 return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0); 7596 } 7597 7598 /* 7599 ** Erase all information in a table and add the root of the table to 7600 ** the freelist. Except, the root of the principle table (the one on 7601 ** page 1) is never added to the freelist. 7602 ** 7603 ** This routine will fail with SQLITE_LOCKED if there are any open 7604 ** cursors on the table. 7605 ** 7606 ** If AUTOVACUUM is enabled and the page at iTable is not the last 7607 ** root page in the database file, then the last root page 7608 ** in the database file is moved into the slot formerly occupied by 7609 ** iTable and that last slot formerly occupied by the last root page 7610 ** is added to the freelist instead of iTable. In this say, all 7611 ** root pages are kept at the beginning of the database file, which 7612 ** is necessary for AUTOVACUUM to work right. *piMoved is set to the 7613 ** page number that used to be the last root page in the file before 7614 ** the move. If no page gets moved, *piMoved is set to 0. 
7615 ** The last root page is recorded in meta[3] and the value of 7616 ** meta[3] is updated by this procedure. 7617 */ 7618 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){ 7619 int rc; 7620 MemPage *pPage = 0; 7621 BtShared *pBt = p->pBt; 7622 7623 assert( sqlite3BtreeHoldsMutex(p) ); 7624 assert( p->inTrans==TRANS_WRITE ); 7625 7626 /* It is illegal to drop a table if any cursors are open on the 7627 ** database. This is because in auto-vacuum mode the backend may 7628 ** need to move another root-page to fill a gap left by the deleted 7629 ** root page. If an open cursor was using this page a problem would 7630 ** occur. 7631 ** 7632 ** This error is caught long before control reaches this point. 7633 */ 7634 if( NEVER(pBt->pCursor) ){ 7635 sqlite3ConnectionBlocked(p->db, pBt->pCursor->pBtree->db); 7636 return SQLITE_LOCKED_SHAREDCACHE; 7637 } 7638 7639 rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0); 7640 if( rc ) return rc; 7641 rc = sqlite3BtreeClearTable(p, iTable, 0); 7642 if( rc ){ 7643 releasePage(pPage); 7644 return rc; 7645 } 7646 7647 *piMoved = 0; 7648 7649 if( iTable>1 ){ 7650 #ifdef SQLITE_OMIT_AUTOVACUUM 7651 freePage(pPage, &rc); 7652 releasePage(pPage); 7653 #else 7654 if( pBt->autoVacuum ){ 7655 Pgno maxRootPgno; 7656 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno); 7657 7658 if( iTable==maxRootPgno ){ 7659 /* If the table being dropped is the table with the largest root-page 7660 ** number in the database, put the root page on the free list. 7661 */ 7662 freePage(pPage, &rc); 7663 releasePage(pPage); 7664 if( rc!=SQLITE_OK ){ 7665 return rc; 7666 } 7667 }else{ 7668 /* The table being dropped does not have the largest root-page 7669 ** number in the database. So move the page that does into the 7670 ** gap left by the deleted root-page. 
7671 */ 7672 MemPage *pMove; 7673 releasePage(pPage); 7674 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7675 if( rc!=SQLITE_OK ){ 7676 return rc; 7677 } 7678 rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0); 7679 releasePage(pMove); 7680 if( rc!=SQLITE_OK ){ 7681 return rc; 7682 } 7683 pMove = 0; 7684 rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0); 7685 freePage(pMove, &rc); 7686 releasePage(pMove); 7687 if( rc!=SQLITE_OK ){ 7688 return rc; 7689 } 7690 *piMoved = maxRootPgno; 7691 } 7692 7693 /* Set the new 'max-root-page' value in the database header. This 7694 ** is the old value less one, less one more if that happens to 7695 ** be a root-page number, less one again if that is the 7696 ** PENDING_BYTE_PAGE. 7697 */ 7698 maxRootPgno--; 7699 while( maxRootPgno==PENDING_BYTE_PAGE(pBt) 7700 || PTRMAP_ISPAGE(pBt, maxRootPgno) ){ 7701 maxRootPgno--; 7702 } 7703 assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) ); 7704 7705 rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno); 7706 }else{ 7707 freePage(pPage, &rc); 7708 releasePage(pPage); 7709 } 7710 #endif 7711 }else{ 7712 /* If sqlite3BtreeDropTable was called on page 1. 7713 ** This really never should happen except in a corrupt 7714 ** database. 7715 */ 7716 zeroPage(pPage, PTF_INTKEY|PTF_LEAF ); 7717 releasePage(pPage); 7718 } 7719 return rc; 7720 } 7721 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){ 7722 int rc; 7723 sqlite3BtreeEnter(p); 7724 rc = btreeDropTable(p, iTable, piMoved); 7725 sqlite3BtreeLeave(p); 7726 return rc; 7727 } 7728 7729 7730 /* 7731 ** This function may only be called if the b-tree connection already 7732 ** has a read or write transaction open on the database. 7733 ** 7734 ** Read the meta-information out of a database file. Meta[0] 7735 ** is the number of free pages currently in the database. Meta[1] 7736 ** through meta[15] are available for use by higher layers. Meta[0] 7737 ** is read-only, the others are read/write. 
7738 ** 7739 ** The schema layer numbers meta values differently. At the schema 7740 ** layer (and the SetCookie and ReadCookie opcodes) the number of 7741 ** free pages is not visible. So Cookie[0] is the same as Meta[1]. 7742 */ 7743 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){ 7744 BtShared *pBt = p->pBt; 7745 7746 sqlite3BtreeEnter(p); 7747 assert( p->inTrans>TRANS_NONE ); 7748 assert( SQLITE_OK==querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK) ); 7749 assert( pBt->pPage1 ); 7750 assert( idx>=0 && idx<=15 ); 7751 7752 *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]); 7753 7754 /* If auto-vacuum is disabled in this build and this is an auto-vacuum 7755 ** database, mark the database as read-only. */ 7756 #ifdef SQLITE_OMIT_AUTOVACUUM 7757 if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){ 7758 pBt->btsFlags |= BTS_READ_ONLY; 7759 } 7760 #endif 7761 7762 sqlite3BtreeLeave(p); 7763 } 7764 7765 /* 7766 ** Write meta-information back into the database. Meta[0] is 7767 ** read-only and may not be written. 7768 */ 7769 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){ 7770 BtShared *pBt = p->pBt; 7771 unsigned char *pP1; 7772 int rc; 7773 assert( idx>=1 && idx<=15 ); 7774 sqlite3BtreeEnter(p); 7775 assert( p->inTrans==TRANS_WRITE ); 7776 assert( pBt->pPage1!=0 ); 7777 pP1 = pBt->pPage1->aData; 7778 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 7779 if( rc==SQLITE_OK ){ 7780 put4byte(&pP1[36 + idx*4], iMeta); 7781 #ifndef SQLITE_OMIT_AUTOVACUUM 7782 if( idx==BTREE_INCR_VACUUM ){ 7783 assert( pBt->autoVacuum || iMeta==0 ); 7784 assert( iMeta==0 || iMeta==1 ); 7785 pBt->incrVacuum = (u8)iMeta; 7786 } 7787 #endif 7788 } 7789 sqlite3BtreeLeave(p); 7790 return rc; 7791 } 7792 7793 #ifndef SQLITE_OMIT_BTREECOUNT 7794 /* 7795 ** The first argument, pCur, is a cursor opened on some b-tree. Count the 7796 ** number of entries in the b-tree and write the result to *pnEntry. 7797 ** 7798 ** SQLITE_OK is returned if the operation is successfully executed. 
7799 ** Otherwise, if an error is encountered (i.e. an IO error or database 7800 ** corruption) an SQLite error code is returned. 7801 */ 7802 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry){ 7803 i64 nEntry = 0; /* Value to return in *pnEntry */ 7804 int rc; /* Return code */ 7805 7806 if( pCur->pgnoRoot==0 ){ 7807 *pnEntry = 0; 7808 return SQLITE_OK; 7809 } 7810 rc = moveToRoot(pCur); 7811 7812 /* Unless an error occurs, the following loop runs one iteration for each 7813 ** page in the B-Tree structure (not including overflow pages). 7814 */ 7815 while( rc==SQLITE_OK ){ 7816 int iIdx; /* Index of child node in parent */ 7817 MemPage *pPage; /* Current page of the b-tree */ 7818 7819 /* If this is a leaf page or the tree is not an int-key tree, then 7820 ** this page contains countable entries. Increment the entry counter 7821 ** accordingly. 7822 */ 7823 pPage = pCur->apPage[pCur->iPage]; 7824 if( pPage->leaf || !pPage->intKey ){ 7825 nEntry += pPage->nCell; 7826 } 7827 7828 /* pPage is a leaf node. This loop navigates the cursor so that it 7829 ** points to the first interior cell that it points to the parent of 7830 ** the next page in the tree that has not yet been visited. The 7831 ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell 7832 ** of the page, or to the number of cells in the page if the next page 7833 ** to visit is the right-child of its parent. 7834 ** 7835 ** If all pages in the tree have been visited, return SQLITE_OK to the 7836 ** caller. 7837 */ 7838 if( pPage->leaf ){ 7839 do { 7840 if( pCur->iPage==0 ){ 7841 /* All pages of the b-tree have been visited. Return successfully. */ 7842 *pnEntry = nEntry; 7843 return SQLITE_OK; 7844 } 7845 moveToParent(pCur); 7846 }while ( pCur->aiIdx[pCur->iPage]>=pCur->apPage[pCur->iPage]->nCell ); 7847 7848 pCur->aiIdx[pCur->iPage]++; 7849 pPage = pCur->apPage[pCur->iPage]; 7850 } 7851 7852 /* Descend to the child node of the cell that the cursor currently 7853 ** points at. 
This is the right-child if (iIdx==pPage->nCell). 7854 */ 7855 iIdx = pCur->aiIdx[pCur->iPage]; 7856 if( iIdx==pPage->nCell ){ 7857 rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8])); 7858 }else{ 7859 rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx))); 7860 } 7861 } 7862 7863 /* An error has occurred. Return an error code. */ 7864 return rc; 7865 } 7866 #endif 7867 7868 /* 7869 ** Return the pager associated with a BTree. This routine is used for 7870 ** testing and debugging only. 7871 */ 7872 Pager *sqlite3BtreePager(Btree *p){ 7873 return p->pBt->pPager; 7874 } 7875 7876 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7877 /* 7878 ** Append a message to the error message string. 7879 */ 7880 static void checkAppendMsg( 7881 IntegrityCk *pCheck, 7882 const char *zFormat, 7883 ... 7884 ){ 7885 va_list ap; 7886 char zBuf[200]; 7887 if( !pCheck->mxErr ) return; 7888 pCheck->mxErr--; 7889 pCheck->nErr++; 7890 va_start(ap, zFormat); 7891 if( pCheck->errMsg.nChar ){ 7892 sqlite3StrAccumAppend(&pCheck->errMsg, "\n", 1); 7893 } 7894 if( pCheck->zPfx ){ 7895 sqlite3_snprintf(sizeof(zBuf), zBuf, pCheck->zPfx, pCheck->v1, pCheck->v2); 7896 sqlite3StrAccumAppendAll(&pCheck->errMsg, zBuf); 7897 } 7898 sqlite3VXPrintf(&pCheck->errMsg, 1, zFormat, ap); 7899 va_end(ap); 7900 if( pCheck->errMsg.accError==STRACCUM_NOMEM ){ 7901 pCheck->mallocFailed = 1; 7902 } 7903 } 7904 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 7905 7906 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 7907 7908 /* 7909 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that 7910 ** corresponds to page iPg is already set. 7911 */ 7912 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 7913 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); 7914 return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07))); 7915 } 7916 7917 /* 7918 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg. 
7919 */ 7920 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){ 7921 assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 ); 7922 pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07)); 7923 } 7924 7925 7926 /* 7927 ** Add 1 to the reference count for page iPage. If this is the second 7928 ** reference to the page, add an error message to pCheck->zErrMsg. 7929 ** Return 1 if there are 2 or more references to the page and 0 if 7930 ** if this is the first reference to the page. 7931 ** 7932 ** Also check that the page number is in bounds. 7933 */ 7934 static int checkRef(IntegrityCk *pCheck, Pgno iPage){ 7935 if( iPage==0 ) return 1; 7936 if( iPage>pCheck->nPage ){ 7937 checkAppendMsg(pCheck, "invalid page number %d", iPage); 7938 return 1; 7939 } 7940 if( getPageReferenced(pCheck, iPage) ){ 7941 checkAppendMsg(pCheck, "2nd reference to page %d", iPage); 7942 return 1; 7943 } 7944 setPageReferenced(pCheck, iPage); 7945 return 0; 7946 } 7947 7948 #ifndef SQLITE_OMIT_AUTOVACUUM 7949 /* 7950 ** Check that the entry in the pointer-map for page iChild maps to 7951 ** page iParent, pointer type ptrType. If not, append an error message 7952 ** to pCheck. 
7953 */ 7954 static void checkPtrmap( 7955 IntegrityCk *pCheck, /* Integrity check context */ 7956 Pgno iChild, /* Child page number */ 7957 u8 eType, /* Expected pointer map type */ 7958 Pgno iParent /* Expected pointer map parent page number */ 7959 ){ 7960 int rc; 7961 u8 ePtrmapType; 7962 Pgno iPtrmapParent; 7963 7964 rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent); 7965 if( rc!=SQLITE_OK ){ 7966 if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->mallocFailed = 1; 7967 checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild); 7968 return; 7969 } 7970 7971 if( ePtrmapType!=eType || iPtrmapParent!=iParent ){ 7972 checkAppendMsg(pCheck, 7973 "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)", 7974 iChild, eType, iParent, ePtrmapType, iPtrmapParent); 7975 } 7976 } 7977 #endif 7978 7979 /* 7980 ** Check the integrity of the freelist or of an overflow page list. 7981 ** Verify that the number of pages on the list is N. 7982 */ 7983 static void checkList( 7984 IntegrityCk *pCheck, /* Integrity checking context */ 7985 int isFreeList, /* True for a freelist. 
False for overflow page list */ 7986 int iPage, /* Page number for first page in the list */ 7987 int N /* Expected number of pages in the list */ 7988 ){ 7989 int i; 7990 int expected = N; 7991 int iFirst = iPage; 7992 while( N-- > 0 && pCheck->mxErr ){ 7993 DbPage *pOvflPage; 7994 unsigned char *pOvflData; 7995 if( iPage<1 ){ 7996 checkAppendMsg(pCheck, 7997 "%d of %d pages missing from overflow list starting at %d", 7998 N+1, expected, iFirst); 7999 break; 8000 } 8001 if( checkRef(pCheck, iPage) ) break; 8002 if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage) ){ 8003 checkAppendMsg(pCheck, "failed to get page %d", iPage); 8004 break; 8005 } 8006 pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage); 8007 if( isFreeList ){ 8008 int n = get4byte(&pOvflData[4]); 8009 #ifndef SQLITE_OMIT_AUTOVACUUM 8010 if( pCheck->pBt->autoVacuum ){ 8011 checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0); 8012 } 8013 #endif 8014 if( n>(int)pCheck->pBt->usableSize/4-2 ){ 8015 checkAppendMsg(pCheck, 8016 "freelist leaf count too big on page %d", iPage); 8017 N--; 8018 }else{ 8019 for(i=0; i<n; i++){ 8020 Pgno iFreePage = get4byte(&pOvflData[8+i*4]); 8021 #ifndef SQLITE_OMIT_AUTOVACUUM 8022 if( pCheck->pBt->autoVacuum ){ 8023 checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0); 8024 } 8025 #endif 8026 checkRef(pCheck, iFreePage); 8027 } 8028 N -= n; 8029 } 8030 } 8031 #ifndef SQLITE_OMIT_AUTOVACUUM 8032 else{ 8033 /* If this database supports auto-vacuum and iPage is not the last 8034 ** page in this overflow list, check that the pointer-map entry for 8035 ** the following page matches iPage. 
8036 */ 8037 if( pCheck->pBt->autoVacuum && N>0 ){ 8038 i = get4byte(pOvflData); 8039 checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage); 8040 } 8041 } 8042 #endif 8043 iPage = get4byte(pOvflData); 8044 sqlite3PagerUnref(pOvflPage); 8045 } 8046 } 8047 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 8048 8049 #ifndef SQLITE_OMIT_INTEGRITY_CHECK 8050 /* 8051 ** Do various sanity checks on a single page of a tree. Return 8052 ** the tree depth. Root pages return 0. Parents of root pages 8053 ** return 1, and so forth. 8054 ** 8055 ** These checks are done: 8056 ** 8057 ** 1. Make sure that cells and freeblocks do not overlap 8058 ** but combine to completely cover the page. 8059 ** NO 2. Make sure cell keys are in order. 8060 ** NO 3. Make sure no key is less than or equal to zLowerBound. 8061 ** NO 4. Make sure no key is greater than or equal to zUpperBound. 8062 ** 5. Check the integrity of overflow pages. 8063 ** 6. Recursively call checkTreePage on all children. 8064 ** 7. Verify that the depth of all children is the same. 8065 ** 8. Make sure this page is at least 33% full or else it is 8066 ** the root of the tree. 
*/
static int checkTreePage(
  IntegrityCk *pCheck,  /* Context for the sanity check */
  int iPage,            /* Page number of the page to check */
  i64 *pnParentMinKey,  /* Non-NULL when called for a left child (see body) */
  i64 *pnParentMaxKey   /* NULL for the left-most child and for tree roots */
){
  MemPage *pPage;
  int i, rc, depth, d2, pgno, cnt;
  int hdr, cellStart;
  int nCell;
  u8 *data;
  BtShared *pBt;
  int usableSize;
  char *hit = 0;
  i64 nMinKey = 0;
  i64 nMaxKey = 0;
  /* The message prefix is changed repeatedly below; remember the caller's
  ** prefix so that it can be restored before returning. */
  const char *saved_zPfx = pCheck->zPfx;
  int saved_v1 = pCheck->v1;
  int saved_v2 = pCheck->v2;

  /* Check that the page exists
  */
  pBt = pCheck->pBt;
  usableSize = pBt->usableSize;
  if( iPage==0 ) return 0;
  if( checkRef(pCheck, iPage) ) return 0;
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( (rc = btreeGetPage(pBt, (Pgno)iPage, &pPage, 0))!=0 ){
    checkAppendMsg(pCheck,
       "unable to get the page. error code=%d", rc);
    depth = -1;   /* So that depth+1, the return value, is zero */
    goto end_of_check;
  }

  /* Clear MemPage.isInit to make sure the corruption detection code in
  ** btreeInitPage() is executed.  */
  pPage->isInit = 0;
  if( (rc = btreeInitPage(pPage))!=0 ){
    assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
    checkAppendMsg(pCheck,
                   "btreeInitPage() returns error code %d", rc);
    releasePage(pPage);
    depth = -1;   /* So that depth+1, the return value, is zero */
    goto end_of_check;
  }

  /* Check out all the cells.
  */
  depth = 0;
  for(i=0; i<pPage->nCell && pCheck->mxErr; i++){
    u8 *pCell;
    u32 sz;
    CellInfo info;

    /* Check payload overflow pages
    */
    pCheck->zPfx = "On tree page %d cell %d: ";
    pCheck->v1 = iPage;
    pCheck->v2 = i;
    pCell = findCell(pPage,i);
    btreeParseCellPtr(pPage, pCell, &info);
    sz = info.nPayload;
    /* For intKey pages, check that the keys are in order.
    */
    if( pPage->intKey ){
      if( i==0 ){
        nMinKey = nMaxKey = info.nKey;
      }else if( info.nKey <= nMaxKey ){
        checkAppendMsg(pCheck,
           "Rowid %lld out of order (previous was %lld)", info.nKey, nMaxKey);
      }
      nMaxKey = info.nKey;
    }
    /* Walk the overflow chain only if part of the payload is stored off
    ** the page and the overflow page number lies within the usable area
    ** of this page.  nPage is the expected length of the chain. */
    if( (sz>info.nLocal) 
     && (&pCell[info.iOverflow]<=&pPage->aData[pBt->usableSize])
    ){
      int nPage = (sz - info.nLocal + usableSize - 5)/(usableSize - 4);
      Pgno pgnoOvfl = get4byte(&pCell[info.iOverflow]);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
      }
#endif
      checkList(pCheck, 0, pgnoOvfl, nPage);
    }

    /* Check sanity of left child page.
    */
    if( !pPage->leaf ){
      pgno = get4byte(pCell);
#ifndef SQLITE_OMIT_AUTOVACUUM
      if( pBt->autoVacuum ){
        checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
      }
#endif
      /* The child receives pointers to this page's running nMinKey and
      ** nMaxKey.  No upper-bound pointer is passed for the first cell. */
      d2 = checkTreePage(pCheck, pgno, &nMinKey, i==0?NULL:&nMaxKey);
      if( i>0 && d2!=depth ){
        checkAppendMsg(pCheck, "Child page depth differs");
      }
      depth = d2;
    }
  }

  /* The right-most child is checked too, but the depth it returns is not
  ** compared against the depth of the other children. */
  if( !pPage->leaf ){
    pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
    pCheck->zPfx = "On page %d at right child: ";
    pCheck->v1 = iPage;
#ifndef SQLITE_OMIT_AUTOVACUUM
    if( pBt->autoVacuum ){
      checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
    }
#endif
    checkTreePage(pCheck, pgno, NULL, !pPage->nCell?NULL:&nMaxKey);
  }
 
  /* For intKey leaf pages, check that the min/max keys are in order
  ** with any left/parent/right pages.
  */
  pCheck->zPfx = "Page %d: ";
  pCheck->v1 = iPage;
  if( pPage->leaf && pPage->intKey ){
    /* if we are a left child page */
    if( pnParentMinKey ){
      /* if we are the left most child page */
      if( !pnParentMaxKey ){
        if( nMaxKey > *pnParentMinKey ){
          checkAppendMsg(pCheck,
              "Rowid %lld out of order (max larger than parent min of %lld)",
              nMaxKey, *pnParentMinKey);
        }
      }else{
        if( nMinKey <= *pnParentMinKey ){
          checkAppendMsg(pCheck,
              "Rowid %lld out of order (min less than parent min of %lld)",
              nMinKey, *pnParentMinKey);
        }
        if( nMaxKey > *pnParentMaxKey ){
          checkAppendMsg(pCheck,
              "Rowid %lld out of order (max larger than parent max of %lld)",
              nMaxKey, *pnParentMaxKey);
        }
        /* Hand this page's largest key back to the caller, where it
        ** becomes the lower bound seen by the next sibling. */
        *pnParentMinKey = nMaxKey;
      }
    /* else if we're a right child page */
    } else if( pnParentMaxKey ){
      if( nMinKey <= *pnParentMaxKey ){
        checkAppendMsg(pCheck,
            "Rowid %lld out of order (min less than parent max of %lld)",
            nMinKey, *pnParentMaxKey);
      }
    }
  }

  /* Check for complete coverage of the page
  **
  ** hit[] holds one counter per byte of the page.  Every byte below the
  ** start of the cell content area begins at 1 and every byte from there
  ** on begins at 0.  Each cell and each freeblock then increments the
  ** bytes it occupies.  Afterwards a count of 0 is an unaccounted-for
  ** byte and a count above 1 is a byte claimed more than once.
  */
  data = pPage->aData;
  hdr = pPage->hdrOffset;
  hit = sqlite3PageMalloc( pBt->pageSize );
  pCheck->zPfx = 0;
  if( hit==0 ){
    pCheck->mallocFailed = 1;
  }else{
    int contentOffset = get2byteNotZero(&data[hdr+5]);
    assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
    memset(hit+contentOffset, 0, usableSize-contentOffset);
    memset(hit, 1, contentOffset);
    nCell = get2byte(&data[hdr+3]);
    cellStart = hdr + 12 - 4*pPage->leaf;
    for(i=0; i<nCell; i++){
      int pc = get2byte(&data[cellStart+i*2]);
      /* The default size is large enough to trip the bounds test below
      ** whenever the cell offset is too close to the end of the page for
      ** cellSizePtr() to be called. */
      u32 size = 65536;
      int j;
      if( pc<=usableSize-4 ){
        size = cellSizePtr(pPage, &data[pc]);
      }
      if( (int)(pc+size-1)>=usableSize ){
        pCheck->zPfx = 0;
        checkAppendMsg(pCheck,
            "Corruption detected in cell %d on page %d",i,iPage);
      }else{
        for(j=pc+size-1; j>=pc; j--) hit[j]++;
      }
    }
    /* Walk the freeblock list.  The offset of the first freeblock is read
    ** from header offset 1.  Within a freeblock, bytes 0..1 give the offset
    ** of the next freeblock (0 ends the list) and bytes 2..3 its size. */
    i = get2byte(&data[hdr+1]);
    while( i>0 ){
      int size, j;
      assert( i<=usableSize-4 );     /* Enforced by btreeInitPage() */
      size = get2byte(&data[i+2]);
      assert( i+size<=usableSize );  /* Enforced by btreeInitPage() */
      for(j=i+size-1; j>=i; j--) hit[j]++;
      j = get2byte(&data[i]);
      assert( j==0 || j>i+size );  /* Enforced by btreeInitPage() */
      assert( j<=usableSize-4 );   /* Enforced by btreeInitPage() */
      i = j;
    }
    /* Count the bytes that nothing claimed.  The scan stops at the first
    ** byte that is claimed more than once. */
    for(i=cnt=0; i<usableSize; i++){
      if( hit[i]==0 ){
        cnt++;
      }else if( hit[i]>1 ){
        checkAppendMsg(pCheck,
          "Multiple uses for byte %d of page %d", i, iPage);
        break;
      }
    }
    /* The unclaimed byte count must match the value at header offset 7 */
    if( cnt!=data[hdr+7] ){
      checkAppendMsg(pCheck,
          "Fragmentation of %d bytes reported as %d on page %d",
          cnt, data[hdr+7], iPage);
    }
  }
  sqlite3PageFree(hit);
  releasePage(pPage);

end_of_check:
  /* Restore the caller's message prefix */
  pCheck->zPfx = saved_zPfx;
  pCheck->v1 = saved_v1;
  pCheck->v2 = saved_v2;
  return depth+1;
}
#endif /* SQLITE_OMIT_INTEGRITY_CHECK */

#ifndef SQLITE_OMIT_INTEGRITY_CHECK
/*
** This routine does a complete check of the given BTree file.  aRoot[] is
** an array of page numbers where each page number is the root page of
** a table.  nRoot is the number of entries in aRoot.
**
** A read-only or read-write transaction must be opened before calling
** this function.
**
** Write the number of errors seen in *pnErr.  Except for some memory
** allocation errors, an error message held in memory obtained from
** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
** returned.  If a memory allocation error occurs, NULL is returned.
8303 */ 8304 char *sqlite3BtreeIntegrityCheck( 8305 Btree *p, /* The btree to be checked */ 8306 int *aRoot, /* An array of root pages numbers for individual trees */ 8307 int nRoot, /* Number of entries in aRoot[] */ 8308 int mxErr, /* Stop reporting errors after this many */ 8309 int *pnErr /* Write number of errors seen to this variable */ 8310 ){ 8311 Pgno i; 8312 int nRef; 8313 IntegrityCk sCheck; 8314 BtShared *pBt = p->pBt; 8315 char zErr[100]; 8316 8317 sqlite3BtreeEnter(p); 8318 assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE ); 8319 nRef = sqlite3PagerRefcount(pBt->pPager); 8320 sCheck.pBt = pBt; 8321 sCheck.pPager = pBt->pPager; 8322 sCheck.nPage = btreePagecount(sCheck.pBt); 8323 sCheck.mxErr = mxErr; 8324 sCheck.nErr = 0; 8325 sCheck.mallocFailed = 0; 8326 sCheck.zPfx = 0; 8327 sCheck.v1 = 0; 8328 sCheck.v2 = 0; 8329 *pnErr = 0; 8330 if( sCheck.nPage==0 ){ 8331 sqlite3BtreeLeave(p); 8332 return 0; 8333 } 8334 8335 sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1); 8336 if( !sCheck.aPgRef ){ 8337 *pnErr = 1; 8338 sqlite3BtreeLeave(p); 8339 return 0; 8340 } 8341 i = PENDING_BYTE_PAGE(pBt); 8342 if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i); 8343 sqlite3StrAccumInit(&sCheck.errMsg, zErr, sizeof(zErr), SQLITE_MAX_LENGTH); 8344 sCheck.errMsg.useMalloc = 2; 8345 8346 /* Check the integrity of the freelist 8347 */ 8348 sCheck.zPfx = "Main freelist: "; 8349 checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]), 8350 get4byte(&pBt->pPage1->aData[36])); 8351 sCheck.zPfx = 0; 8352 8353 /* Check all the tables. 
8354 */ 8355 for(i=0; (int)i<nRoot && sCheck.mxErr; i++){ 8356 if( aRoot[i]==0 ) continue; 8357 #ifndef SQLITE_OMIT_AUTOVACUUM 8358 if( pBt->autoVacuum && aRoot[i]>1 ){ 8359 checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0); 8360 } 8361 #endif 8362 sCheck.zPfx = "List of tree roots: "; 8363 checkTreePage(&sCheck, aRoot[i], NULL, NULL); 8364 sCheck.zPfx = 0; 8365 } 8366 8367 /* Make sure every page in the file is referenced 8368 */ 8369 for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){ 8370 #ifdef SQLITE_OMIT_AUTOVACUUM 8371 if( getPageReferenced(&sCheck, i)==0 ){ 8372 checkAppendMsg(&sCheck, "Page %d is never used", i); 8373 } 8374 #else 8375 /* If the database supports auto-vacuum, make sure no tables contain 8376 ** references to pointer-map pages. 8377 */ 8378 if( getPageReferenced(&sCheck, i)==0 && 8379 (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){ 8380 checkAppendMsg(&sCheck, "Page %d is never used", i); 8381 } 8382 if( getPageReferenced(&sCheck, i)!=0 && 8383 (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){ 8384 checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i); 8385 } 8386 #endif 8387 } 8388 8389 /* Make sure this analysis did not leave any unref() pages. 8390 ** This is an internal consistency check; an integrity check 8391 ** of the integrity check. 8392 */ 8393 if( NEVER(nRef != sqlite3PagerRefcount(pBt->pPager)) ){ 8394 checkAppendMsg(&sCheck, 8395 "Outstanding page count goes from %d to %d during this analysis", 8396 nRef, sqlite3PagerRefcount(pBt->pPager) 8397 ); 8398 } 8399 8400 /* Clean up and report errors. 
8401 */ 8402 sqlite3BtreeLeave(p); 8403 sqlite3_free(sCheck.aPgRef); 8404 if( sCheck.mallocFailed ){ 8405 sqlite3StrAccumReset(&sCheck.errMsg); 8406 *pnErr = sCheck.nErr+1; 8407 return 0; 8408 } 8409 *pnErr = sCheck.nErr; 8410 if( sCheck.nErr==0 ) sqlite3StrAccumReset(&sCheck.errMsg); 8411 return sqlite3StrAccumFinish(&sCheck.errMsg); 8412 } 8413 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */ 8414 8415 /* 8416 ** Return the full pathname of the underlying database file. Return 8417 ** an empty string if the database is in-memory or a TEMP database. 8418 ** 8419 ** The pager filename is invariant as long as the pager is 8420 ** open so it is safe to access without the BtShared mutex. 8421 */ 8422 const char *sqlite3BtreeGetFilename(Btree *p){ 8423 assert( p->pBt->pPager!=0 ); 8424 return sqlite3PagerFilename(p->pBt->pPager, 1); 8425 } 8426 8427 /* 8428 ** Return the pathname of the journal file for this database. The return 8429 ** value of this routine is the same regardless of whether the journal file 8430 ** has been created or not. 8431 ** 8432 ** The pager journal filename is invariant as long as the pager is 8433 ** open so it is safe to access without the BtShared mutex. 8434 */ 8435 const char *sqlite3BtreeGetJournalname(Btree *p){ 8436 assert( p->pBt->pPager!=0 ); 8437 return sqlite3PagerJournalname(p->pBt->pPager); 8438 } 8439 8440 /* 8441 ** Return non-zero if a transaction is active. 8442 */ 8443 int sqlite3BtreeIsInTrans(Btree *p){ 8444 assert( p==0 || sqlite3_mutex_held(p->db->mutex) ); 8445 return (p && (p->inTrans==TRANS_WRITE)); 8446 } 8447 8448 #ifndef SQLITE_OMIT_WAL 8449 /* 8450 ** Run a checkpoint on the Btree passed as the first argument. 8451 ** 8452 ** Return SQLITE_LOCKED if this or any other connection has an open 8453 ** transaction on the shared-cache the argument Btree is connected to. 8454 ** 8455 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART. 
8456 */ 8457 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){ 8458 int rc = SQLITE_OK; 8459 if( p ){ 8460 BtShared *pBt = p->pBt; 8461 sqlite3BtreeEnter(p); 8462 if( pBt->inTransaction!=TRANS_NONE ){ 8463 rc = SQLITE_LOCKED; 8464 }else{ 8465 rc = sqlite3PagerCheckpoint(pBt->pPager, eMode, pnLog, pnCkpt); 8466 } 8467 sqlite3BtreeLeave(p); 8468 } 8469 return rc; 8470 } 8471 #endif 8472 8473 /* 8474 ** Return non-zero if a read (or write) transaction is active. 8475 */ 8476 int sqlite3BtreeIsInReadTrans(Btree *p){ 8477 assert( p ); 8478 assert( sqlite3_mutex_held(p->db->mutex) ); 8479 return p->inTrans!=TRANS_NONE; 8480 } 8481 8482 int sqlite3BtreeIsInBackup(Btree *p){ 8483 assert( p ); 8484 assert( sqlite3_mutex_held(p->db->mutex) ); 8485 return p->nBackup!=0; 8486 } 8487 8488 /* 8489 ** This function returns a pointer to a blob of memory associated with 8490 ** a single shared-btree. The memory is used by client code for its own 8491 ** purposes (for example, to store a high-level schema associated with 8492 ** the shared-btree). The btree layer manages reference counting issues. 8493 ** 8494 ** The first time this is called on a shared-btree, nBytes bytes of memory 8495 ** are allocated, zeroed, and returned to the caller. For each subsequent 8496 ** call the nBytes parameter is ignored and a pointer to the same blob 8497 ** of memory returned. 8498 ** 8499 ** If the nBytes parameter is 0 and the blob of memory has not yet been 8500 ** allocated, a null pointer is returned. If the blob has already been 8501 ** allocated, it is returned as normal. 8502 ** 8503 ** Just before the shared-btree is closed, the function passed as the 8504 ** xFree argument when the memory allocation was made is invoked on the 8505 ** blob of allocated memory. The xFree function should not call sqlite3_free() 8506 ** on the memory, the btree layer does that. 
8507 */ 8508 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){ 8509 BtShared *pBt = p->pBt; 8510 sqlite3BtreeEnter(p); 8511 if( !pBt->pSchema && nBytes ){ 8512 pBt->pSchema = sqlite3DbMallocZero(0, nBytes); 8513 pBt->xFreeSchema = xFree; 8514 } 8515 sqlite3BtreeLeave(p); 8516 return pBt->pSchema; 8517 } 8518 8519 /* 8520 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared 8521 ** btree as the argument handle holds an exclusive lock on the 8522 ** sqlite_master table. Otherwise SQLITE_OK. 8523 */ 8524 int sqlite3BtreeSchemaLocked(Btree *p){ 8525 int rc; 8526 assert( sqlite3_mutex_held(p->db->mutex) ); 8527 sqlite3BtreeEnter(p); 8528 rc = querySharedCacheTableLock(p, MASTER_ROOT, READ_LOCK); 8529 assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE ); 8530 sqlite3BtreeLeave(p); 8531 return rc; 8532 } 8533 8534 8535 #ifndef SQLITE_OMIT_SHARED_CACHE 8536 /* 8537 ** Obtain a lock on the table whose root page is iTab. The 8538 ** lock is a write lock if isWritelock is true or a read lock 8539 ** if it is false. 8540 */ 8541 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){ 8542 int rc = SQLITE_OK; 8543 assert( p->inTrans!=TRANS_NONE ); 8544 if( p->sharable ){ 8545 u8 lockType = READ_LOCK + isWriteLock; 8546 assert( READ_LOCK+1==WRITE_LOCK ); 8547 assert( isWriteLock==0 || isWriteLock==1 ); 8548 8549 sqlite3BtreeEnter(p); 8550 rc = querySharedCacheTableLock(p, iTab, lockType); 8551 if( rc==SQLITE_OK ){ 8552 rc = setSharedCacheTableLock(p, iTab, lockType); 8553 } 8554 sqlite3BtreeLeave(p); 8555 } 8556 return rc; 8557 } 8558 #endif 8559 8560 #ifndef SQLITE_OMIT_INCRBLOB 8561 /* 8562 ** Argument pCsr must be a cursor opened for writing on an 8563 ** INTKEY table currently pointing at a valid table entry. 8564 ** This function modifies the data stored as part of that entry. 8565 ** 8566 ** Only the data content may only be modified, it is not possible to 8567 ** change the length of the data stored. 
If this function is called with 8568 ** parameters that attempt to write past the end of the existing data, 8569 ** no modifications are made and SQLITE_CORRUPT is returned. 8570 */ 8571 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){ 8572 int rc; 8573 assert( cursorHoldsMutex(pCsr) ); 8574 assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) ); 8575 assert( pCsr->curFlags & BTCF_Incrblob ); 8576 8577 rc = restoreCursorPosition(pCsr); 8578 if( rc!=SQLITE_OK ){ 8579 return rc; 8580 } 8581 assert( pCsr->eState!=CURSOR_REQUIRESEEK ); 8582 if( pCsr->eState!=CURSOR_VALID ){ 8583 return SQLITE_ABORT; 8584 } 8585 8586 /* Save the positions of all other cursors open on this table. This is 8587 ** required in case any of them are holding references to an xFetch 8588 ** version of the b-tree page modified by the accessPayload call below. 8589 ** 8590 ** Note that pCsr must be open on a INTKEY table and saveCursorPosition() 8591 ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence 8592 ** saveAllCursors can only return SQLITE_OK. 8593 */ 8594 VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr); 8595 assert( rc==SQLITE_OK ); 8596 8597 /* Check some assumptions: 8598 ** (a) the cursor is open for writing, 8599 ** (b) there is a read/write transaction open, 8600 ** (c) the connection holds a write-lock on the table (if required), 8601 ** (d) there are no conflicting read-locks, and 8602 ** (e) the cursor points at a valid row of an intKey table. 
8603 */ 8604 if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){ 8605 return SQLITE_READONLY; 8606 } 8607 assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0 8608 && pCsr->pBt->inTransaction==TRANS_WRITE ); 8609 assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) ); 8610 assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) ); 8611 assert( pCsr->apPage[pCsr->iPage]->intKey ); 8612 8613 return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1); 8614 } 8615 8616 /* 8617 ** Mark this cursor as an incremental blob cursor. 8618 */ 8619 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){ 8620 pCur->curFlags |= BTCF_Incrblob; 8621 } 8622 #endif 8623 8624 /* 8625 ** Set both the "read version" (single byte at byte offset 18) and 8626 ** "write version" (single byte at byte offset 19) fields in the database 8627 ** header to iVersion. 8628 */ 8629 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){ 8630 BtShared *pBt = pBtree->pBt; 8631 int rc; /* Return code */ 8632 8633 assert( iVersion==1 || iVersion==2 ); 8634 8635 /* If setting the version fields to 1, do not automatically open the 8636 ** WAL connection, even if the version fields are currently set to 2. 8637 */ 8638 pBt->btsFlags &= ~BTS_NO_WAL; 8639 if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL; 8640 8641 rc = sqlite3BtreeBeginTrans(pBtree, 0); 8642 if( rc==SQLITE_OK ){ 8643 u8 *aData = pBt->pPage1->aData; 8644 if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){ 8645 rc = sqlite3BtreeBeginTrans(pBtree, 2); 8646 if( rc==SQLITE_OK ){ 8647 rc = sqlite3PagerWrite(pBt->pPage1->pDbPage); 8648 if( rc==SQLITE_OK ){ 8649 aData[18] = (u8)iVersion; 8650 aData[19] = (u8)iVersion; 8651 } 8652 } 8653 } 8654 } 8655 8656 pBt->btsFlags &= ~BTS_NO_WAL; 8657 return rc; 8658 } 8659 8660 /* 8661 ** set the mask of hint flags for cursor pCsr. Currently the only valid 8662 ** values are 0 and BTREE_BULKLOAD. 
8663 */ 8664 void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){ 8665 assert( mask==BTREE_BULKLOAD || mask==0 ); 8666 pCsr->hints = mask; 8667 } 8668 8669 /* 8670 ** Return true if the given Btree is read-only. 8671 */ 8672 int sqlite3BtreeIsReadonly(Btree *p){ 8673 return (p->pBt->btsFlags & BTS_READ_ONLY)!=0; 8674 }