/* ** 2001 September 15 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** $Id: btree.c,v 1.97 2003/12/16 03:44:48 drh Exp $ ** ** This file implements a external (disk-based) database using BTrees. ** For a detailed discussion of BTrees, refer to ** ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: ** "Sorting And Searching", pages 473-480. Addison-Wesley ** Publishing Company, Reading, Massachusetts. ** ** The basic idea is that each page of the file contains N database ** entries and N+1 pointers to subpages. ** ** ---------------------------------------------------------------- ** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) | ** ---------------------------------------------------------------- ** ** All of the keys on the page that Ptr(0) points to have values less ** than Key(0). All of the keys on page Ptr(1) and its subpages have ** values greater than Key(0) and less than Key(1). All of the keys ** on Ptr(N+1) and its subpages have values greater than Key(N). And ** so forth. ** ** Finding a particular key requires reading O(log(M)) pages from the ** disk where M is the number of entries in the tree. ** ** In this implementation, a single file can hold one or more separate ** BTrees. Each BTree is identified by the index of its root page. The ** key and data for any entry are combined to form the "payload". Up to ** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the ** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes ** then surplus bytes are stored on overflow pages. The payload for an ** entry and the preceding pointer are combined to form a "Cell". Each ** page has a small header which contains the Ptr(N+1) pointer. ** ** The first page of the file contains a magic string used to verify that ** the file really is a valid BTree database, a pointer to a list of unused ** pages in the file, and some meta information. The root of the first ** BTree begins on page 2 of the file. (Pages are numbered beginning with ** 1, not 0.) Thus a minimum database contains 2 pages. */ #include "sqliteInt.h" #include "pager.h" #include "btree.h" #include /* Forward declarations */ static BtOps sqliteBtreeOps; static BtCursorOps sqliteBtreeCursorOps; /* ** Macros used for byteswapping. B is a pointer to the Btree ** structure. This is needed to access the Btree.needSwab boolean ** in order to tell if byte swapping is needed or not. ** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer. ** SWAB32 byteswaps a 32-bit integer. */ #define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X)) #define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X)) #define SWAB_ADD(B,X,A) \ if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); } /* ** The following global variable - available only if SQLITE_TEST is ** defined - is used to determine whether new databases are created in ** native byte order or in non-native byte order. Non-native byte order ** databases are created for testing purposes only. Under normal operation, ** only native byte-order databases should be created, but we should be ** able to read or write existing databases regardless of the byteorder. */ #ifdef SQLITE_TEST int btree_native_byte_order = 1; #else # define btree_native_byte_order 1 #endif /* ** Forward declarations of structures used only in this file. */ typedef struct PageOne PageOne; typedef struct MemPage MemPage; typedef struct PageHdr PageHdr; typedef struct Cell Cell; typedef struct CellHdr CellHdr; typedef struct FreeBlk FreeBlk; typedef struct OverflowPage OverflowPage; typedef struct FreelistInfo FreelistInfo; /* ** All structures on a database page are aligned to 4-byte boundries. ** This routine rounds up a number of bytes to the next multiple of 4. ** ** This might need to change for computer architectures that require ** and 8-byte alignment boundry for structures. */ #define ROUNDUP(X) ((X+3) & ~3) /* ** This is a magic string that appears at the beginning of every ** SQLite database in order to identify the file as a real database. */ static const char zMagicHeader[] = "** This file contains an SQLite 2.1 database **"; #define MAGIC_SIZE (sizeof(zMagicHeader)) /* ** This is a magic integer also used to test the integrity of the database ** file. This integer is used in addition to the string above so that ** if the file is written on a little-endian architecture and read ** on a big-endian architectures (or vice versa) we can detect the ** problem. ** ** The number used was obtained at random and has no special ** significance other than the fact that it represents a different ** integer on little-endian and big-endian machines. */ #define MAGIC 0xdae37528 /* ** The first page of the database file contains a magic header string ** to identify the file as an SQLite database file. It also contains ** a pointer to the first free page of the file. Page 2 contains the ** root of the principle BTree. The file might contain other BTrees ** rooted on pages above 2. ** ** The first page also contains SQLITE_N_BTREE_META integers that ** can be used by higher-level routines. ** ** Remember that pages are numbered beginning with 1. (See pager.c ** for additional information.) Page 0 does not exist and a page ** number of 0 is used to mean "no such page". */ struct PageOne { char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */ int iMagic; /* Integer to verify correct byte order */ Pgno freeList; /* First free page in a list of all free pages */ int nFree; /* Number of pages on the free list */ int aMeta[SQLITE_N_BTREE_META-1]; /* User defined integers */ }; /* ** Each database page has a header that is an instance of this ** structure. ** ** PageHdr.firstFree is 0 if there is no free space on this page. ** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a ** FreeBlk structure that describes the first block of free space. ** All free space is defined by a linked list of FreeBlk structures. ** ** Data is stored in a linked list of Cell structures. PageHdr.firstCell ** is the index into MemPage.u.aDisk[] of the first cell on the page. The ** Cells are kept in sorted order. ** ** A Cell contains all information about a database entry and a pointer ** to a child page that contains other entries less than itself. In ** other words, the i-th Cell contains both Ptr(i) and Key(i). The ** right-most pointer of the page is contained in PageHdr.rightChild. */ struct PageHdr { Pgno rightChild; /* Child page that comes after all cells on this page */ u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */ u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */ }; /* ** Entries on a page of the database are called "Cells". Each Cell ** has a header and data. This structure defines the header. The ** key and data (collectively the "payload") follow this header on ** the database page. ** ** A definition of the complete Cell structure is given below. The ** header for the cell must be defined first in order to do some ** of the sizing #defines that follow. */ struct CellHdr { Pgno leftChild; /* Child page that comes before this cell */ u16 nKey; /* Number of bytes in the key */ u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */ u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */ u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */ u16 nData; /* Number of bytes of data */ }; /* ** The key and data size are split into a lower 16-bit segment and an ** upper 8-bit segment in order to pack them together into a smaller ** space. The following macros reassembly a key or data size back ** into an integer. */ #define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536) #define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536) /* ** The minimum size of a complete Cell. The Cell must contain a header ** and at least 4 bytes of payload. */ #define MIN_CELL_SIZE (sizeof(CellHdr)+4) /* ** The maximum number of database entries that can be held in a single ** page of the database. */ #define MX_CELL ((SQLITE_PAGE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE) /* ** The amount of usable space on a single page of the BTree. This is the ** page size minus the overhead of the page header. */ #define USABLE_SPACE (SQLITE_PAGE_SIZE - sizeof(PageHdr)) /* ** The maximum amount of payload (in bytes) that can be stored locally for ** a database entry. If the entry contains more data than this, the ** extra goes onto overflow pages. ** ** This number is chosen so that at least 4 cells will fit on every page. */ #define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3) /* ** Data on a database page is stored as a linked list of Cell structures. ** Both the key and the data are stored in aPayload[]. The key always comes ** first. The aPayload[] field grows as necessary to hold the key and data, ** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and ** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the ** page number of the first overflow page. ** ** Though this structure is fixed in size, the Cell on the database ** page varies in size. Every cell has a CellHdr and at least 4 bytes ** of payload space. Additional payload bytes (up to the maximum of ** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as ** needed. */ struct Cell { CellHdr h; /* The cell header */ char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */ Pgno ovfl; /* The first overflow page */ }; /* ** Free space on a page is remembered using a linked list of the FreeBlk ** structures. Space on a database page is allocated in increments of ** at least 4 bytes and is always aligned to a 4-byte boundry. The ** linked list of FreeBlks is always kept in order by address. */ struct FreeBlk { u16 iSize; /* Number of bytes in this block of free space */ u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */ }; /* ** The number of bytes of payload that will fit on a single overflow page. */ #define OVERFLOW_SIZE (SQLITE_PAGE_SIZE-sizeof(Pgno)) /* ** When the key and data for a single entry in the BTree will not fit in ** the MX_LOCAL_PAYLOAD bytes of space available on the database page, ** then all extra bytes are written to a linked list of overflow pages. ** Each overflow page is an instance of the following structure. ** ** Unused pages in the database are also represented by instances of ** the OverflowPage structure. The PageOne.freeList field is the ** page number of the first page in a linked list of unused database ** pages. */ struct OverflowPage { Pgno iNext; char aPayload[OVERFLOW_SIZE]; }; /* ** The PageOne.freeList field points to a linked list of overflow pages ** hold information about free pages. The aPayload section of each ** overflow page contains an instance of the following structure. The ** aFree[] array holds the page number of nFree unused pages in the disk ** file. */ struct FreelistInfo { int nFree; Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)]; }; /* ** For every page in the database file, an instance of the following structure ** is stored in memory. The u.aDisk[] array contains the raw bits read from ** the disk. The rest is auxiliary information held in memory only. The ** auxiliary info is only valid for regular database pages - it is not ** used for overflow pages and pages on the freelist. ** ** Of particular interest in the auxiliary info is the apCell[] entry. Each ** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are ** put in this array so that they can be accessed in constant time, rather ** than in linear time which would be needed if we had to walk the linked ** list on every access. ** ** Note that apCell[] contains enough space to hold up to two more Cells ** than can possibly fit on one page. In the steady state, every apCell[] ** points to memory inside u.aDisk[]. But in the middle of an insert ** operation, some apCell[] entries may temporarily point to data space ** outside of u.aDisk[]. This is a transient situation that is quickly ** resolved. But while it is happening, it is possible for a database ** page to hold as many as two more cells than it might otherwise hold. ** The extra two entries in apCell[] are an allowance for this situation. ** ** The pParent field points back to the parent page. This allows us to ** walk up the BTree from any leaf to the root. Care must be taken to ** unref() the parent page pointer when this page is no longer referenced. ** The pageDestructor() routine handles that chore. */ struct MemPage { union { char aDisk[SQLITE_PAGE_SIZE]; /* Page data stored on disk */ PageHdr hdr; /* Overlay page header */ } u; u8 isInit; /* True if auxiliary data is initialized */ u8 idxShift; /* True if apCell[] indices have changed */ u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */ MemPage *pParent; /* The parent of this page. NULL for root */ int idxParent; /* Index in pParent->apCell[] of this node */ int nFree; /* Number of free bytes in u.aDisk[] */ int nCell; /* Number of entries on this page */ Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */ }; /* ** The in-memory image of a disk page has the auxiliary information appended ** to the end. EXTRA_SIZE is the number of bytes of space needed to hold ** that extra information. */ #define EXTRA_SIZE (sizeof(MemPage)-SQLITE_PAGE_SIZE) /* ** Everything we need to know about an open database */ struct Btree { BtOps *pOps; /* Function table */ Pager *pPager; /* The page cache */ BtCursor *pCursor; /* A list of all open cursors */ PageOne *page1; /* First page of the database */ u8 inTrans; /* True if a transaction is in progress */ u8 inCkpt; /* True if there is a checkpoint on the transaction */ u8 readOnly; /* True if the underlying file is readonly */ u8 needSwab; /* Need to byte-swapping */ }; typedef Btree Bt; /* ** A cursor is a pointer to a particular entry in the BTree. ** The entry is identified by its MemPage and the index in ** MemPage.apCell[] of the entry. */ struct BtCursor { BtCursorOps *pOps; /* Function table */ Btree *pBt; /* The Btree to which this cursor belongs */ BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */ BtCursor *pShared; /* Loop of cursors with the same root page */ Pgno pgnoRoot; /* The root page of this tree */ MemPage *pPage; /* Page that contains the entry */ int idx; /* Index of the entry in pPage->apCell[] */ u8 wrFlag; /* True if writable */ u8 eSkip; /* Determines if next step operation is a no-op */ u8 iMatch; /* compare result from last sqliteBtreeMoveto() */ }; /* ** Legal values for BtCursor.eSkip. */ #define SKIP_NONE 0 /* Always step the cursor */ #define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */ #define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */ #define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */ /* Forward declarations */ static int fileBtreeCloseCursor(BtCursor *pCur); /* ** Routines for byte swapping. */ u16 swab16(u16 x){ return ((x & 0xff)<<8) | ((x>>8)&0xff); } u32 swab32(u32 x){ return ((x & 0xff)<<24) | ((x & 0xff00)<<8) | ((x>>8) & 0xff00) | ((x>>24)&0xff); } /* ** Compute the total number of bytes that a Cell needs on the main ** database page. The number returned includes the Cell header, ** local payload storage, and the pointer to overflow pages (if ** applicable). Additional space allocated on overflow pages ** is NOT included in the value returned from this routine. */ static int cellSize(Btree *pBt, Cell *pCell){ int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); if( n>MX_LOCAL_PAYLOAD ){ n = MX_LOCAL_PAYLOAD + sizeof(Pgno); }else{ n = ROUNDUP(n); } n += sizeof(CellHdr); return n; } /* ** Defragment the page given. All Cells are moved to the ** beginning of the page and all free space is collected ** into one big FreeBlk at the end of the page. */ static void defragmentPage(Btree *pBt, MemPage *pPage){ int pc, i, n; FreeBlk *pFBlk; char newPage[SQLITE_PAGE_SIZE]; assert( sqlitepager_iswriteable(pPage) ); assert( pPage->isInit ); pc = sizeof(PageHdr); pPage->u.hdr.firstCell = SWAB16(pBt, pc); memcpy(newPage, pPage->u.aDisk, pc); for(i=0; inCell; i++){ Cell *pCell = pPage->apCell[i]; /* This routine should never be called on an overfull page. The ** following asserts verify that constraint. */ assert( Addr(pCell) > Addr(pPage) ); assert( Addr(pCell) < Addr(pPage) + SQLITE_PAGE_SIZE ); n = cellSize(pBt, pCell); pCell->h.iNext = SWAB16(pBt, pc + n); memcpy(&newPage[pc], pCell, n); pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc]; pc += n; } assert( pPage->nFree==SQLITE_PAGE_SIZE-pc ); memcpy(pPage->u.aDisk, newPage, pc); if( pPage->nCell>0 ){ pPage->apCell[pPage->nCell-1]->h.iNext = 0; } pFBlk = (FreeBlk*)&pPage->u.aDisk[pc]; pFBlk->iSize = SWAB16(pBt, SQLITE_PAGE_SIZE - pc); pFBlk->iNext = 0; pPage->u.hdr.firstFree = SWAB16(pBt, pc); memset(&pFBlk[1], 0, SQLITE_PAGE_SIZE - pc - sizeof(FreeBlk)); } /* ** Allocate nByte bytes of space on a page. nByte must be a ** multiple of 4. ** ** Return the index into pPage->u.aDisk[] of the first byte of ** the new allocation. Or return 0 if there is not enough free ** space on the page to satisfy the allocation request. ** ** If the page contains nBytes of free space but does not contain ** nBytes of contiguous free space, then this routine automatically ** calls defragementPage() to consolidate all free space before ** allocating the new chunk. */ static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){ FreeBlk *p; u16 *pIdx; int start; int iSize; #ifndef NDEBUG int cnt = 0; #endif assert( sqlitepager_iswriteable(pPage) ); assert( nByte==ROUNDUP(nByte) ); assert( pPage->isInit ); if( pPage->nFreeisOverfull ) return 0; pIdx = &pPage->u.hdr.firstFree; p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)]; while( (iSize = SWAB16(pBt, p->iSize))iNext==0 ){ defragmentPage(pBt, pPage); pIdx = &pPage->u.hdr.firstFree; }else{ pIdx = &p->iNext; } p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)]; } if( iSize==nByte ){ start = SWAB16(pBt, *pIdx); *pIdx = p->iNext; }else{ FreeBlk *pNew; start = SWAB16(pBt, *pIdx); pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte]; pNew->iNext = p->iNext; pNew->iSize = SWAB16(pBt, iSize - nByte); *pIdx = SWAB16(pBt, start + nByte); } pPage->nFree -= nByte; return start; } /* ** Return a section of the MemPage.u.aDisk[] to the freelist. ** The first byte of the new free block is pPage->u.aDisk[start] ** and the size of the block is "size" bytes. Size must be ** a multiple of 4. ** ** Most of the effort here is involved in coalesing adjacent ** free blocks into a single big free block. */ static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){ int end = start + size; u16 *pIdx, idx; FreeBlk *pFBlk; FreeBlk *pNew; FreeBlk *pNext; int iSize; assert( sqlitepager_iswriteable(pPage) ); assert( size == ROUNDUP(size) ); assert( start == ROUNDUP(start) ); assert( pPage->isInit ); pIdx = &pPage->u.hdr.firstFree; idx = SWAB16(pBt, *pIdx); while( idx!=0 && idxu.aDisk[idx]; iSize = SWAB16(pBt, pFBlk->iSize); if( idx + iSize == start ){ pFBlk->iSize = SWAB16(pBt, iSize + size); if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){ pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size]; if( pBt->needSwab ){ pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size); }else{ pFBlk->iSize += pNext->iSize; } pFBlk->iNext = pNext->iNext; } pPage->nFree += size; return; } pIdx = &pFBlk->iNext; idx = SWAB16(pBt, *pIdx); } pNew = (FreeBlk*)&pPage->u.aDisk[start]; if( idx != end ){ pNew->iSize = SWAB16(pBt, size); pNew->iNext = SWAB16(pBt, idx); }else{ pNext = (FreeBlk*)&pPage->u.aDisk[idx]; pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize)); pNew->iNext = pNext->iNext; } *pIdx = SWAB16(pBt, start); pPage->nFree += size; } /* ** Initialize the auxiliary information for a disk block. ** ** The pParent parameter must be a pointer to the MemPage which ** is the parent of the page being initialized. The root of the ** BTree (usually page 2) has no parent and so for that page, ** pParent==NULL. ** ** Return SQLITE_OK on success. If we see that the page does ** not contain a well-formed database page, then return ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not ** guarantee that the page is well-formed. It only shows that ** we failed to detect any corruption. */ static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){ int idx; /* An index into pPage->u.aDisk[] */ Cell *pCell; /* A pointer to a Cell in pPage->u.aDisk[] */ FreeBlk *pFBlk; /* A pointer to a free block in pPage->u.aDisk[] */ int sz; /* The size of a Cell in bytes */ int freeSpace; /* Amount of free space on the page */ if( pPage->pParent ){ assert( pPage->pParent==pParent ); return SQLITE_OK; } if( pParent ){ pPage->pParent = pParent; sqlitepager_ref(pParent); } if( pPage->isInit ) return SQLITE_OK; pPage->isInit = 1; pPage->nCell = 0; freeSpace = USABLE_SPACE; idx = SWAB16(pBt, pPage->u.hdr.firstCell); while( idx!=0 ){ if( idx>SQLITE_PAGE_SIZE-MIN_CELL_SIZE ) goto page_format_error; if( idxu.aDisk[idx]; sz = cellSize(pBt, pCell); if( idx+sz > SQLITE_PAGE_SIZE ) goto page_format_error; freeSpace -= sz; pPage->apCell[pPage->nCell++] = pCell; idx = SWAB16(pBt, pCell->h.iNext); } pPage->nFree = 0; idx = SWAB16(pBt, pPage->u.hdr.firstFree); while( idx!=0 ){ int iNext; if( idx>SQLITE_PAGE_SIZE-sizeof(FreeBlk) ) goto page_format_error; if( idxu.aDisk[idx]; pPage->nFree += SWAB16(pBt, pFBlk->iSize); iNext = SWAB16(pBt, pFBlk->iNext); if( iNext>0 && iNext <= idx ) goto page_format_error; idx = iNext; } if( pPage->nCell==0 && pPage->nFree==0 ){ /* As a special case, an uninitialized root page appears to be ** an empty database */ return SQLITE_OK; } if( pPage->nFree!=freeSpace ) goto page_format_error; return SQLITE_OK; page_format_error: return SQLITE_CORRUPT; } /* ** Set up a raw page so that it looks like a database page holding ** no entries. */ static void zeroPage(Btree *pBt, MemPage *pPage){ PageHdr *pHdr; FreeBlk *pFBlk; assert( sqlitepager_iswriteable(pPage) ); memset(pPage, 0, SQLITE_PAGE_SIZE); pHdr = &pPage->u.hdr; pHdr->firstCell = 0; pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr)); pFBlk = (FreeBlk*)&pHdr[1]; pFBlk->iNext = 0; pPage->nFree = SQLITE_PAGE_SIZE - sizeof(*pHdr); pFBlk->iSize = SWAB16(pBt, pPage->nFree); pPage->nCell = 0; pPage->isOverfull = 0; } /* ** This routine is called when the reference count for a page ** reaches zero. We need to unref the pParent pointer when that ** happens. */ static void pageDestructor(void *pData){ MemPage *pPage = (MemPage*)pData; if( pPage->pParent ){ MemPage *pParent = pPage->pParent; pPage->pParent = 0; sqlitepager_unref(pParent); } } /* ** Open a new database. ** ** Actually, this routine just sets up the internal data structures ** for accessing the database. We do not open the database file ** until the first page is loaded. ** ** zFilename is the name of the database file. If zFilename is NULL ** a new database with a random name is created. This randomly named ** database file will be deleted when sqliteBtreeClose() is called. */ int sqliteBtreeOpen( const char *zFilename, /* Name of the file containing the BTree database */ int omitJournal, /* if TRUE then do not journal this file */ int nCache, /* How many pages in the page cache */ Btree **ppBtree /* Pointer to new Btree object written here */ ){ Btree *pBt; int rc; /* ** The following asserts make sure that structures used by the btree are ** the right size. This is to guard against size changes that result ** when compiling on a different architecture. */ assert( sizeof(u32)==4 ); assert( sizeof(u16)==2 ); assert( sizeof(Pgno)==4 ); assert( sizeof(PageHdr)==8 ); assert( sizeof(CellHdr)==12 ); assert( sizeof(FreeBlk)==4 ); assert( sizeof(OverflowPage)==SQLITE_PAGE_SIZE ); assert( sizeof(FreelistInfo)==OVERFLOW_SIZE ); assert( sizeof(ptr)==sizeof(char*) ); assert( sizeof(uptr)==sizeof(ptr) ); pBt = sqliteMalloc( sizeof(*pBt) ); if( pBt==0 ){ *ppBtree = 0; return SQLITE_NOMEM; } if( nCache<10 ) nCache = 10; rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE, !omitJournal); if( rc!=SQLITE_OK ){ if( pBt->pPager ) sqlitepager_close(pBt->pPager); sqliteFree(pBt); *ppBtree = 0; return rc; } sqlitepager_set_destructor(pBt->pPager, pageDestructor); pBt->pCursor = 0; pBt->page1 = 0; pBt->readOnly = sqlitepager_isreadonly(pBt->pPager); pBt->pOps = &sqliteBtreeOps; *ppBtree = pBt; return SQLITE_OK; } /* ** Close an open database and invalidate all cursors. */ static int fileBtreeClose(Btree *pBt){ while( pBt->pCursor ){ fileBtreeCloseCursor(pBt->pCursor); } sqlitepager_close(pBt->pPager); sqliteFree(pBt); return SQLITE_OK; } /* ** Change the limit on the number of pages allowed in the cache. ** ** The maximum number of cache pages is set to the absolute ** value of mxPage. If mxPage is negative, the pager will ** operate asynchronously - it will not stop to do fsync()s ** to insure data is written to the disk surface before ** continuing. Transactions still work if synchronous is off, ** and the database cannot be corrupted if this program ** crashes. But if the operating system crashes or there is ** an abrupt power failure when synchronous is off, the database ** could be left in an inconsistent and unrecoverable state. ** Synchronous is on by default so database corruption is not ** normally a worry. */ static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){ sqlitepager_set_cachesize(pBt->pPager, mxPage); return SQLITE_OK; } /* ** Change the way data is synced to disk in order to increase or decrease ** how well the database resists damage due to OS crashes and power ** failures. Level 1 is the same as asynchronous (no syncs() occur and ** there is a high probability of damage) Level 2 is the default. There ** is a very low but non-zero probability of damage. Level 3 reduces the ** probability of damage to near zero but with a write performance reduction. */ static int fileBtreeSetSafetyLevel(Btree *pBt, int level){ sqlitepager_set_safety_level(pBt->pPager, level); return SQLITE_OK; } /* ** Get a reference to page1 of the database file. This will ** also acquire a readlock on that file. ** ** SQLITE_OK is returned on success. If the file is not a ** well-formed database file, then SQLITE_CORRUPT is returned. ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM ** is returned if we run out of memory. SQLITE_PROTOCOL is returned ** if there is a locking protocol violation. */ static int lockBtree(Btree *pBt){ int rc; if( pBt->page1 ) return SQLITE_OK; rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1); if( rc!=SQLITE_OK ) return rc; /* Do some checking to help insure the file we opened really is ** a valid database file. */ if( sqlitepager_pagecount(pBt->pPager)>0 ){ PageOne *pP1 = pBt->page1; if( strcmp(pP1->zMagic,zMagicHeader)!=0 || (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){ rc = SQLITE_CORRUPT; goto page1_init_failed; } pBt->needSwab = pP1->iMagic!=MAGIC; } return rc; page1_init_failed: sqlitepager_unref(pBt->page1); pBt->page1 = 0; return rc; } /* ** If there are no outstanding cursors and we are not in the middle ** of a transaction but there is a read lock on the database, then ** this routine unrefs the first page of the database file which ** has the effect of releasing the read lock. ** ** If there are any outstanding cursors, this routine is a no-op. ** ** If there is a transaction in progress, this routine is a no-op. */ static void unlockBtreeIfUnused(Btree *pBt){ if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){ sqlitepager_unref(pBt->page1); pBt->page1 = 0; pBt->inTrans = 0; pBt->inCkpt = 0; } } /* ** Create a new database by initializing the first two pages of the ** file. */ static int newDatabase(Btree *pBt){ MemPage *pRoot; PageOne *pP1; int rc; if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK; pP1 = pBt->page1; rc = sqlitepager_write(pBt->page1); if( rc ) return rc; rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot); if( rc ) return rc; rc = sqlitepager_write(pRoot); if( rc ){ sqlitepager_unref(pRoot); return rc; } strcpy(pP1->zMagic, zMagicHeader); if( btree_native_byte_order ){ pP1->iMagic = MAGIC; pBt->needSwab = 0; }else{ pP1->iMagic = swab32(MAGIC); pBt->needSwab = 1; } zeroPage(pBt, pRoot); sqlitepager_unref(pRoot); return SQLITE_OK; } /* ** Attempt to start a new transaction. ** ** A transaction must be started before attempting any changes ** to the database. None of the following routines will work ** unless a transaction is started first: ** ** sqliteBtreeCreateTable() ** sqliteBtreeCreateIndex() ** sqliteBtreeClearTable() ** sqliteBtreeDropTable() ** sqliteBtreeInsert() ** sqliteBtreeDelete() ** sqliteBtreeUpdateMeta() */ static int fileBtreeBeginTrans(Btree *pBt){ int rc; if( pBt->inTrans ) return SQLITE_ERROR; if( pBt->readOnly ) return SQLITE_READONLY; if( pBt->page1==0 ){ rc = lockBtree(pBt); if( rc!=SQLITE_OK ){ return rc; } } rc = sqlitepager_begin(pBt->page1); if( rc==SQLITE_OK ){ rc = newDatabase(pBt); } if( rc==SQLITE_OK ){ pBt->inTrans = 1; pBt->inCkpt = 0; }else{ unlockBtreeIfUnused(pBt); } return rc; } /* ** Commit the transaction currently in progress. ** ** This will release the write lock on the database file. If there ** are no active cursors, it also releases the read lock. */ static int fileBtreeCommit(Btree *pBt){ int rc; rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager); pBt->inTrans = 0; pBt->inCkpt = 0; unlockBtreeIfUnused(pBt); return rc; } /* ** Rollback the transaction in progress. All cursors will be ** invalided by this operation. Any attempt to use a cursor ** that was open at the beginning of this operation will result ** in an error. ** ** This will release the write lock on the database file. If there ** are no active cursors, it also releases the read lock. */ static int fileBtreeRollback(Btree *pBt){ int rc; BtCursor *pCur; if( pBt->inTrans==0 ) return SQLITE_OK; pBt->inTrans = 0; pBt->inCkpt = 0; rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager); for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ if( pCur->pPage && pCur->pPage->isInit==0 ){ sqlitepager_unref(pCur->pPage); pCur->pPage = 0; } } unlockBtreeIfUnused(pBt); return rc; } /* ** Set the checkpoint for the current transaction. The checkpoint serves ** as a sub-transaction that can be rolled back independently of the ** main transaction. You must start a transaction before starting a ** checkpoint. The checkpoint is ended automatically if the transaction ** commits or rolls back. ** ** Only one checkpoint may be active at a time. It is an error to try ** to start a new checkpoint if another checkpoint is already active. */ static int fileBtreeBeginCkpt(Btree *pBt){ int rc; if( !pBt->inTrans || pBt->inCkpt ){ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager); pBt->inCkpt = 1; return rc; } /* ** Commit a checkpoint to transaction currently in progress. If no ** checkpoint is active, this is a no-op. */ static int fileBtreeCommitCkpt(Btree *pBt){ int rc; if( pBt->inCkpt && !pBt->readOnly ){ rc = sqlitepager_ckpt_commit(pBt->pPager); }else{ rc = SQLITE_OK; } pBt->inCkpt = 0; return rc; } /* ** Rollback the checkpoint to the current transaction. If there ** is no active checkpoint or transaction, this routine is a no-op. ** ** All cursors will be invalided by this operation. Any attempt ** to use a cursor that was open at the beginning of this operation ** will result in an error. */ static int fileBtreeRollbackCkpt(Btree *pBt){ int rc; BtCursor *pCur; if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK; rc = sqlitepager_ckpt_rollback(pBt->pPager); for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ if( pCur->pPage && pCur->pPage->isInit==0 ){ sqlitepager_unref(pCur->pPage); pCur->pPage = 0; } } pBt->inCkpt = 0; return rc; } /* ** Create a new cursor for the BTree whose root is on the page ** iTable. The act of acquiring a cursor gets a read lock on ** the database file. ** ** If wrFlag==0, then the cursor can only be used for reading. ** If wrFlag==1, then the cursor can be used for reading or for ** writing if other conditions for writing are also met. These ** are the conditions that must be met in order for writing to ** be allowed: ** ** 1: The cursor must have been opened with wrFlag==1 ** ** 2: No other cursors may be open with wrFlag==0 on the same table ** ** 3: The database must be writable (not on read-only media) ** ** 4: There must be an active transaction. ** ** Condition 2 warrants further discussion. If any cursor is opened ** on a table with wrFlag==0, that prevents all other cursors from ** writing to that table. This is a kind of "read-lock". When a cursor ** is opened with wrFlag==0 it is guaranteed that the table will not ** change as long as the cursor is open. This allows the cursor to ** do a sequential scan of the table without having to worry about ** entries being inserted or deleted during the scan. Cursors should ** be opened with wrFlag==0 only if this read-lock property is needed. ** That is to say, cursors should be opened with wrFlag==0 only if they ** intend to use the sqliteBtreeNext() system call. All other cursors ** should be opened with wrFlag==1 even if they never really intend ** to write. ** ** No checking is done to make sure that page iTable really is the ** root page of a b-tree. If it is not, then the cursor acquired ** will not work correctly. */ static int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){ int rc; BtCursor *pCur, *pRing; if( pBt->page1==0 ){ rc = lockBtree(pBt); if( rc!=SQLITE_OK ){ *ppCur = 0; return rc; } } pCur = sqliteMalloc( sizeof(*pCur) ); if( pCur==0 ){ rc = SQLITE_NOMEM; goto create_cursor_exception; } pCur->pgnoRoot = (Pgno)iTable; rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage); if( rc!=SQLITE_OK ){ goto create_cursor_exception; } rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0); if( rc!=SQLITE_OK ){ goto create_cursor_exception; } pCur->pOps = &sqliteBtreeCursorOps; pCur->pBt = pBt; pCur->wrFlag = wrFlag; pCur->idx = 0; pCur->eSkip = SKIP_INVALID; pCur->pNext = pBt->pCursor; if( pCur->pNext ){ pCur->pNext->pPrev = pCur; } pCur->pPrev = 0; pRing = pBt->pCursor; while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; } if( pRing ){ pCur->pShared = pRing->pShared; pRing->pShared = pCur; }else{ pCur->pShared = pCur; } pBt->pCursor = pCur; *ppCur = pCur; return SQLITE_OK; create_cursor_exception: *ppCur = 0; if( pCur ){ if( pCur->pPage ) sqlitepager_unref(pCur->pPage); sqliteFree(pCur); } unlockBtreeIfUnused(pBt); return rc; } /* ** Close a cursor. The read lock on the database file is released ** when the last cursor is closed. */ static int fileBtreeCloseCursor(BtCursor *pCur){ Btree *pBt = pCur->pBt; if( pCur->pPrev ){ pCur->pPrev->pNext = pCur->pNext; }else{ pBt->pCursor = pCur->pNext; } if( pCur->pNext ){ pCur->pNext->pPrev = pCur->pPrev; } if( pCur->pPage ){ sqlitepager_unref(pCur->pPage); } if( pCur->pShared!=pCur ){ BtCursor *pRing = pCur->pShared; while( pRing->pShared!=pCur ){ pRing = pRing->pShared; } pRing->pShared = pCur->pShared; } unlockBtreeIfUnused(pBt); sqliteFree(pCur); return SQLITE_OK; } /* ** Make a temporary cursor by filling in the fields of pTempCur. ** The temporary cursor is not on the cursor list for the Btree. */ static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){ memcpy(pTempCur, pCur, sizeof(*pCur)); pTempCur->pNext = 0; pTempCur->pPrev = 0; if( pTempCur->pPage ){ sqlitepager_ref(pTempCur->pPage); } } /* ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() ** function above. */ static void releaseTempCursor(BtCursor *pCur){ if( pCur->pPage ){ sqlitepager_unref(pCur->pPage); } } /* ** Set *pSize to the number of bytes of key in the entry the ** cursor currently points to. Always return SQLITE_OK. ** Failure is not possible. If the cursor is not currently ** pointing to an entry (which can happen, for example, if ** the database is empty) then *pSize is set to 0. */ static int fileBtreeKeySize(BtCursor *pCur, int *pSize){ Cell *pCell; MemPage *pPage; pPage = pCur->pPage; assert( pPage!=0 ); if( pCur->idx >= pPage->nCell ){ *pSize = 0; }else{ pCell = pPage->apCell[pCur->idx]; *pSize = NKEY(pCur->pBt, pCell->h); } return SQLITE_OK; } /* ** Read payload information from the entry that the pCur cursor is ** pointing to. Begin reading the payload at "offset" and read ** a total of "amt" bytes. Put the result in zBuf. ** ** This routine does not make a distinction between key and data. ** It just reads bytes from the payload area. */ static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){ char *aPayload; Pgno nextPage; int rc; Btree *pBt = pCur->pBt; assert( pCur!=0 && pCur->pPage!=0 ); assert( pCur->idx>=0 && pCur->idxpPage->nCell ); aPayload = pCur->pPage->apCell[pCur->idx]->aPayload; if( offsetMX_LOCAL_PAYLOAD ){ a = MX_LOCAL_PAYLOAD - offset; } memcpy(zBuf, &aPayload[offset], a); if( a==amt ){ return SQLITE_OK; } offset = 0; zBuf += a; amt -= a; }else{ offset -= MX_LOCAL_PAYLOAD; } if( amt>0 ){ nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl); } while( amt>0 && nextPage ){ OverflowPage *pOvfl; rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl); if( rc!=0 ){ return rc; } nextPage = SWAB32(pBt, pOvfl->iNext); if( offset OVERFLOW_SIZE ){ a = OVERFLOW_SIZE - offset; } memcpy(zBuf, &pOvfl->aPayload[offset], a); offset = 0; amt -= a; zBuf += a; }else{ offset -= OVERFLOW_SIZE; } sqlitepager_unref(pOvfl); } if( amt>0 ){ return SQLITE_CORRUPT; } return SQLITE_OK; } /* ** Read part of the key associated with cursor pCur. A maximum ** of "amt" bytes will be transfered into zBuf[]. The transfer ** begins at "offset". The number of bytes actually read is ** returned. ** ** Change: It used to be that the amount returned will be smaller ** than the amount requested if there are not enough bytes in the key ** to satisfy the request. But now, it must be the case that there ** is enough data available to satisfy the request. If not, an exception ** is raised. The change was made in an effort to boost performance ** by eliminating unneeded tests. */ static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){ MemPage *pPage; assert( amt>=0 ); assert( offset>=0 ); assert( pCur->pPage!=0 ); pPage = pCur->pPage; if( pCur->idx >= pPage->nCell ){ return 0; } assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) ); getPayload(pCur, offset, amt, zBuf); return amt; } /* ** Set *pSize to the number of bytes of data in the entry the ** cursor currently points to. Always return SQLITE_OK. ** Failure is not possible. If the cursor is not currently ** pointing to an entry (which can happen, for example, if ** the database is empty) then *pSize is set to 0. */ static int fileBtreeDataSize(BtCursor *pCur, int *pSize){ Cell *pCell; MemPage *pPage; pPage = pCur->pPage; assert( pPage!=0 ); if( pCur->idx >= pPage->nCell ){ *pSize = 0; }else{ pCell = pPage->apCell[pCur->idx]; *pSize = NDATA(pCur->pBt, pCell->h); } return SQLITE_OK; } /* ** Read part of the data associated with cursor pCur. A maximum ** of "amt" bytes will be transfered into zBuf[]. The transfer ** begins at "offset". The number of bytes actually read is ** returned. The amount returned will be smaller than the ** amount requested if there are not enough bytes in the data ** to satisfy the request. */ static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){ Cell *pCell; MemPage *pPage; assert( amt>=0 ); assert( offset>=0 ); assert( pCur->pPage!=0 ); pPage = pCur->pPage; if( pCur->idx >= pPage->nCell ){ return 0; } pCell = pPage->apCell[pCur->idx]; assert( amt+offset <= NDATA(pCur->pBt, pCell->h) ); getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf); return amt; } /* ** Compare an external key against the key on the entry that pCur points to. ** ** The external key is pKey and is nKey bytes long. The last nIgnore bytes ** of the key associated with pCur are ignored, as if they do not exist. ** (The normal case is for nIgnore to be zero in which case the entire ** internal key is used in the comparison.) ** ** The comparison result is written to *pRes as follows: ** ** *pRes<0 This means pCur0 This means pCur>pKey ** ** When one key is an exact prefix of the other, the shorter key is ** considered less than the longer one. In order to be equal the ** keys must be exactly the same length. (The length of the pCur key ** is the actual key length minus nIgnore bytes.) */ static int fileBtreeKeyCompare( BtCursor *pCur, /* Pointer to entry to compare against */ const void *pKey, /* Key to compare against entry that pCur points to */ int nKey, /* Number of bytes in pKey */ int nIgnore, /* Ignore this many bytes at the end of pCur */ int *pResult /* Write the result here */ ){ Pgno nextPage; int n, c, rc, nLocal; Cell *pCell; Btree *pBt = pCur->pBt; const char *zKey = (const char*)pKey; assert( pCur->pPage ); assert( pCur->idx>=0 && pCur->idxpPage->nCell ); pCell = pCur->pPage->apCell[pCur->idx]; nLocal = NKEY(pBt, pCell->h) - nIgnore; if( nLocal<0 ) nLocal = 0; n = nKeyMX_LOCAL_PAYLOAD ){ n = MX_LOCAL_PAYLOAD; } c = memcmp(pCell->aPayload, zKey, n); if( c!=0 ){ *pResult = c; return SQLITE_OK; } zKey += n; nKey -= n; nLocal -= n; nextPage = SWAB32(pBt, pCell->ovfl); while( nKey>0 && nLocal>0 ){ OverflowPage *pOvfl; if( nextPage==0 ){ return SQLITE_CORRUPT; } rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl); if( rc ){ return rc; } nextPage = SWAB32(pBt, pOvfl->iNext); n = nKeyOVERFLOW_SIZE ){ n = OVERFLOW_SIZE; } c = memcmp(pOvfl->aPayload, zKey, n); sqlitepager_unref(pOvfl); if( c!=0 ){ *pResult = c; return SQLITE_OK; } nKey -= n; nLocal -= n; zKey += n; } if( c==0 ){ c = nLocal - nKey; } *pResult = c; return SQLITE_OK; } /* ** Move the cursor down to a new child page. The newPgno argument is the ** page number of the child page in the byte order of the disk image. */ static int moveToChild(BtCursor *pCur, int newPgno){ int rc; MemPage *pNewPage; Btree *pBt = pCur->pBt; newPgno = SWAB32(pBt, newPgno); rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage); if( rc ) return rc; rc = initPage(pBt, pNewPage, newPgno, pCur->pPage); if( rc ) return rc; assert( pCur->idx>=pCur->pPage->nCell || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) ); assert( pCur->idxpPage->nCell || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) ); pNewPage->idxParent = pCur->idx; pCur->pPage->idxShift = 0; sqlitepager_unref(pCur->pPage); pCur->pPage = pNewPage; pCur->idx = 0; if( pNewPage->nCell<1 ){ return SQLITE_CORRUPT; } return SQLITE_OK; } /* ** Move the cursor up to the parent page. ** ** pCur->idx is set to the cell index that contains the pointer ** to the page we are coming from. If we are coming from the ** right-most child page then pCur->idx is set to one more than ** the largest cell index. */ static void moveToParent(BtCursor *pCur){ Pgno oldPgno; MemPage *pParent; MemPage *pPage; int idxParent; pPage = pCur->pPage; assert( pPage!=0 ); pParent = pPage->pParent; assert( pParent!=0 ); idxParent = pPage->idxParent; sqlitepager_ref(pParent); sqlitepager_unref(pPage); pCur->pPage = pParent; assert( pParent->idxShift==0 ); if( pParent->idxShift==0 ){ pCur->idx = idxParent; #ifndef NDEBUG /* Verify that pCur->idx is the correct index to point back to the child ** page we just came from */ oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage)); if( pCur->idxnCell ){ assert( pParent->apCell[idxParent]->h.leftChild==oldPgno ); }else{ assert( pParent->u.hdr.rightChild==oldPgno ); } #endif }else{ /* The MemPage.idxShift flag indicates that cell indices might have ** changed since idxParent was set and hence idxParent might be out ** of date. So recompute the parent cell index by scanning all cells ** and locating the one that points to the child we just came from. */ int i; pCur->idx = pParent->nCell; oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage)); for(i=0; inCell; i++){ if( pParent->apCell[i]->h.leftChild==oldPgno ){ pCur->idx = i; break; } } } } /* ** Move the cursor to the root page */ static int moveToRoot(BtCursor *pCur){ MemPage *pNew; int rc; Btree *pBt = pCur->pBt; rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew); if( rc ) return rc; rc = initPage(pBt, pNew, pCur->pgnoRoot, 0); if( rc ) return rc; sqlitepager_unref(pCur->pPage); pCur->pPage = pNew; pCur->idx = 0; return SQLITE_OK; } /* ** Move the cursor down to the left-most leaf entry beneath the ** entry to which it is currently pointing. */ static int moveToLeftmost(BtCursor *pCur){ Pgno pgno; int rc; while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){ rc = moveToChild(pCur, pgno); if( rc ) return rc; } return SQLITE_OK; } /* ** Move the cursor down to the right-most leaf entry beneath the ** page to which it is currently pointing. Notice the difference ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() ** finds the left-most entry beneath the *entry* whereas moveToRightmost() ** finds the right-most entry beneath the *page*. */ static int moveToRightmost(BtCursor *pCur){ Pgno pgno; int rc; while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){ pCur->idx = pCur->pPage->nCell; rc = moveToChild(pCur, pgno); if( rc ) return rc; } pCur->idx = pCur->pPage->nCell - 1; return SQLITE_OK; } /* Move the cursor to the first entry in the table. Return SQLITE_OK ** on success. Set *pRes to 0 if the cursor actually points to something ** or set *pRes to 1 if the table is empty. */ static int fileBtreeFirst(BtCursor *pCur, int *pRes){ int rc; if( pCur->pPage==0 ) return SQLITE_ABORT; rc = moveToRoot(pCur); if( rc ) return rc; if( pCur->pPage->nCell==0 ){ *pRes = 1; return SQLITE_OK; } *pRes = 0; rc = moveToLeftmost(pCur); pCur->eSkip = SKIP_NONE; return rc; } /* Move the cursor to the last entry in the table. Return SQLITE_OK ** on success. Set *pRes to 0 if the cursor actually points to something ** or set *pRes to 1 if the table is empty. */ static int fileBtreeLast(BtCursor *pCur, int *pRes){ int rc; if( pCur->pPage==0 ) return SQLITE_ABORT; rc = moveToRoot(pCur); if( rc ) return rc; assert( pCur->pPage->isInit ); if( pCur->pPage->nCell==0 ){ *pRes = 1; return SQLITE_OK; } *pRes = 0; rc = moveToRightmost(pCur); pCur->eSkip = SKIP_NONE; return rc; } /* Move the cursor so that it points to an entry near pKey. ** Return a success code. ** ** If an exact match is not found, then the cursor is always ** left pointing at a leaf page which would hold the entry if it ** were present. The cursor might point to an entry that comes ** before or after the key. ** ** The result of comparing the key with the entry to which the ** cursor is left pointing is stored in pCur->iMatch. The same ** value is also written to *pRes if pRes!=NULL. The meaning of ** this value is as follows: ** ** *pRes<0 The cursor is left pointing at an entry that ** is smaller than pKey or if the table is empty ** and the cursor is therefore left point to nothing. ** ** *pRes==0 The cursor is left pointing at an entry that ** exactly matches pKey. ** ** *pRes>0 The cursor is left pointing at an entry that ** is larger than pKey. */ static int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){ int rc; if( pCur->pPage==0 ) return SQLITE_ABORT; pCur->eSkip = SKIP_NONE; rc = moveToRoot(pCur); if( rc ) return rc; for(;;){ int lwr, upr; Pgno chldPg; MemPage *pPage = pCur->pPage; int c = -1; /* pRes return if table is empty must be -1 */ lwr = 0; upr = pPage->nCell-1; while( lwr<=upr ){ pCur->idx = (lwr+upr)/2; rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c); if( rc ) return rc; if( c==0 ){ pCur->iMatch = c; if( pRes ) *pRes = 0; return SQLITE_OK; } if( c<0 ){ lwr = pCur->idx+1; }else{ upr = pCur->idx-1; } } assert( lwr==upr+1 ); assert( pPage->isInit ); if( lwr>=pPage->nCell ){ chldPg = pPage->u.hdr.rightChild; }else{ chldPg = pPage->apCell[lwr]->h.leftChild; } if( chldPg==0 ){ pCur->iMatch = c; if( pRes ) *pRes = c; return SQLITE_OK; } pCur->idx = lwr; rc = moveToChild(pCur, chldPg); if( rc ) return rc; } /* NOT REACHED */ } /* ** Advance the cursor to the next entry in the database. If ** successful then set *pRes=0. If the cursor ** was already pointing to the last entry in the database before ** this routine was called, then set *pRes=1. */ static int fileBtreeNext(BtCursor *pCur, int *pRes){ int rc; MemPage *pPage = pCur->pPage; assert( pRes!=0 ); if( pPage==0 ){ *pRes = 1; return SQLITE_ABORT; } assert( pPage->isInit ); assert( pCur->eSkip!=SKIP_INVALID ); if( pPage->nCell==0 ){ *pRes = 1; return SQLITE_OK; } assert( pCur->idxnCell ); if( pCur->eSkip==SKIP_NEXT ){ pCur->eSkip = SKIP_NONE; *pRes = 0; return SQLITE_OK; } pCur->eSkip = SKIP_NONE; pCur->idx++; if( pCur->idx>=pPage->nCell ){ if( pPage->u.hdr.rightChild ){ rc = moveToChild(pCur, pPage->u.hdr.rightChild); if( rc ) return rc; rc = moveToLeftmost(pCur); *pRes = 0; return rc; } do{ if( pPage->pParent==0 ){ *pRes = 1; return SQLITE_OK; } moveToParent(pCur); pPage = pCur->pPage; }while( pCur->idx>=pPage->nCell ); *pRes = 0; return SQLITE_OK; } *pRes = 0; if( pPage->u.hdr.rightChild==0 ){ return SQLITE_OK; } rc = moveToLeftmost(pCur); return rc; } /* ** Step the cursor to the back to the previous entry in the database. If ** successful then set *pRes=0. If the cursor ** was already pointing to the first entry in the database before ** this routine was called, then set *pRes=1. */ static int fileBtreePrevious(BtCursor *pCur, int *pRes){ int rc; Pgno pgno; MemPage *pPage; pPage = pCur->pPage; if( pPage==0 ){ *pRes = 1; return SQLITE_ABORT; } assert( pPage->isInit ); assert( pCur->eSkip!=SKIP_INVALID ); if( pPage->nCell==0 ){ *pRes = 1; return SQLITE_OK; } if( pCur->eSkip==SKIP_PREV ){ pCur->eSkip = SKIP_NONE; *pRes = 0; return SQLITE_OK; } pCur->eSkip = SKIP_NONE; assert( pCur->idx>=0 ); if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){ rc = moveToChild(pCur, pgno); if( rc ) return rc; rc = moveToRightmost(pCur); }else{ while( pCur->idx==0 ){ if( pPage->pParent==0 ){ if( pRes ) *pRes = 1; return SQLITE_OK; } moveToParent(pCur); pPage = pCur->pPage; } pCur->idx--; rc = SQLITE_OK; } *pRes = 0; return rc; } /* ** Allocate a new page from the database file. ** ** The new page is marked as dirty. (In other words, sqlitepager_write() ** has already been called on the new page.) The new page has also ** been referenced and the calling routine is responsible for calling ** sqlitepager_unref() on the new page when it is done. ** ** SQLITE_OK is returned on success. Any other return value indicates ** an error. *ppPage and *pPgno are undefined in the event of an error. ** Do not invoke sqlitepager_unref() on *ppPage if an error is returned. ** ** If the "nearby" parameter is not 0, then a (feeble) effort is made to ** locate a page close to the page number "nearby". This can be used in an ** attempt to keep related pages close to each other in the database file, ** which in turn can make database access faster. */ static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){ PageOne *pPage1 = pBt->page1; int rc; if( pPage1->freeList ){ OverflowPage *pOvfl; FreelistInfo *pInfo; rc = sqlitepager_write(pPage1); if( rc ) return rc; SWAB_ADD(pBt, pPage1->nFree, -1); rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList), (void**)&pOvfl); if( rc ) return rc; rc = sqlitepager_write(pOvfl); if( rc ){ sqlitepager_unref(pOvfl); return rc; } pInfo = (FreelistInfo*)pOvfl->aPayload; if( pInfo->nFree==0 ){ *pPgno = SWAB32(pBt, pPage1->freeList); pPage1->freeList = pOvfl->iNext; *ppPage = (MemPage*)pOvfl; }else{ int closest, n; n = SWAB32(pBt, pInfo->nFree); if( n>1 && nearby>0 ){ int i, dist; closest = 0; dist = SWAB32(pBt, pInfo->aFree[0]) - nearby; if( dist<0 ) dist = -dist; for(i=1; iaFree[i]) - nearby; if( d2<0 ) d2 = -d2; if( d2nFree, -1); *pPgno = SWAB32(pBt, pInfo->aFree[closest]); pInfo->aFree[closest] = pInfo->aFree[n-1]; rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage); sqlitepager_unref(pOvfl); if( rc==SQLITE_OK ){ sqlitepager_dont_rollback(*ppPage); rc = sqlitepager_write(*ppPage); } } }else{ *pPgno = sqlitepager_pagecount(pBt->pPager) + 1; rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage); if( rc ) return rc; rc = sqlitepager_write(*ppPage); } return rc; } /* ** Add a page of the database file to the freelist. Either pgno or ** pPage but not both may be 0. ** ** sqlitepager_unref() is NOT called for pPage. */ static int freePage(Btree *pBt, void *pPage, Pgno pgno){ PageOne *pPage1 = pBt->page1; OverflowPage *pOvfl = (OverflowPage*)pPage; int rc; int needUnref = 0; MemPage *pMemPage; if( pgno==0 ){ assert( pOvfl!=0 ); pgno = sqlitepager_pagenumber(pOvfl); } assert( pgno>2 ); assert( sqlitepager_pagenumber(pOvfl)==pgno ); pMemPage = (MemPage*)pPage; pMemPage->isInit = 0; if( pMemPage->pParent ){ sqlitepager_unref(pMemPage->pParent); pMemPage->pParent = 0; } rc = sqlitepager_write(pPage1); if( rc ){ return rc; } SWAB_ADD(pBt, pPage1->nFree, 1); if( pPage1->nFree!=0 && pPage1->freeList!=0 ){ OverflowPage *pFreeIdx; rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList), (void**)&pFreeIdx); if( rc==SQLITE_OK ){ FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload; int n = SWAB32(pBt, pInfo->nFree); if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){ rc = sqlitepager_write(pFreeIdx); if( rc==SQLITE_OK ){ pInfo->aFree[n] = SWAB32(pBt, pgno); SWAB_ADD(pBt, pInfo->nFree, 1); sqlitepager_unref(pFreeIdx); sqlitepager_dont_write(pBt->pPager, pgno); return rc; } } sqlitepager_unref(pFreeIdx); } } if( pOvfl==0 ){ assert( pgno>0 ); rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl); if( rc ) return rc; needUnref = 1; } rc = sqlitepager_write(pOvfl); if( rc ){ if( needUnref ) sqlitepager_unref(pOvfl); return rc; } pOvfl->iNext = pPage1->freeList; pPage1->freeList = SWAB32(pBt, pgno); memset(pOvfl->aPayload, 0, OVERFLOW_SIZE); if( needUnref ) rc = sqlitepager_unref(pOvfl); return rc; } /* ** Erase all the data out of a cell. This involves returning overflow ** pages back the freelist. */ static int clearCell(Btree *pBt, Cell *pCell){ Pager *pPager = pBt->pPager; OverflowPage *pOvfl; Pgno ovfl, nextOvfl; int rc; if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){ return SQLITE_OK; } ovfl = SWAB32(pBt, pCell->ovfl); pCell->ovfl = 0; while( ovfl ){ rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl); if( rc ) return rc; nextOvfl = SWAB32(pBt, pOvfl->iNext); rc = freePage(pBt, pOvfl, ovfl); if( rc ) return rc; sqlitepager_unref(pOvfl); ovfl = nextOvfl; } return SQLITE_OK; } /* ** Create a new cell from key and data. Overflow pages are allocated as ** necessary and linked to this cell. */ static int fillInCell( Btree *pBt, /* The whole Btree. Needed to allocate pages */ Cell *pCell, /* Populate this Cell structure */ const void *pKey, int nKey, /* The key */ const void *pData,int nData /* The data */ ){ OverflowPage *pOvfl, *pPrior; Pgno *pNext; int spaceLeft; int n, rc; int nPayload; const char *pPayload; char *pSpace; Pgno nearby = 0; pCell->h.leftChild = 0; pCell->h.nKey = SWAB16(pBt, nKey & 0xffff); pCell->h.nKeyHi = nKey >> 16; pCell->h.nData = SWAB16(pBt, nData & 0xffff); pCell->h.nDataHi = nData >> 16; pCell->h.iNext = 0; pNext = &pCell->ovfl; pSpace = pCell->aPayload; spaceLeft = MX_LOCAL_PAYLOAD; pPayload = pKey; pKey = 0; nPayload = nKey; pPrior = 0; while( nPayload>0 ){ if( spaceLeft==0 ){ rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby); if( rc ){ *pNext = 0; }else{ nearby = *pNext; } if( pPrior ) sqlitepager_unref(pPrior); if( rc ){ clearCell(pBt, pCell); return rc; } if( pBt->needSwab ) *pNext = swab32(*pNext); pPrior = pOvfl; spaceLeft = OVERFLOW_SIZE; pSpace = pOvfl->aPayload; pNext = &pOvfl->iNext; } n = nPayload; if( n>spaceLeft ) n = spaceLeft; memcpy(pSpace, pPayload, n); nPayload -= n; if( nPayload==0 && pData ){ pPayload = pData; nPayload = nData; pData = 0; }else{ pPayload += n; } spaceLeft -= n; pSpace += n; } *pNext = 0; if( pPrior ){ sqlitepager_unref(pPrior); } return SQLITE_OK; } /* ** Change the MemPage.pParent pointer on the page whose number is ** given in the second argument so that MemPage.pParent holds the ** pointer in the third argument. */ static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){ MemPage *pThis; if( pgno==0 ) return; assert( pPager!=0 ); pThis = sqlitepager_lookup(pPager, pgno); if( pThis && pThis->isInit ){ if( pThis->pParent!=pNewParent ){ if( pThis->pParent ) sqlitepager_unref(pThis->pParent); pThis->pParent = pNewParent; if( pNewParent ) sqlitepager_ref(pNewParent); } pThis->idxParent = idx; sqlitepager_unref(pThis); } } /* ** Reparent all children of the given page to be the given page. ** In other words, for every child of pPage, invoke reparentPage() ** to make sure that each child knows that pPage is its parent. ** ** This routine gets called after you memcpy() one page into ** another. */ static void reparentChildPages(Btree *pBt, MemPage *pPage){ int i; Pager *pPager = pBt->pPager; for(i=0; inCell; i++){ reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i); } reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i); pPage->idxShift = 0; } /* ** Remove the i-th cell from pPage. This routine effects pPage only. ** The cell content is not freed or deallocated. It is assumed that ** the cell content has been copied someplace else. This routine just ** removes the reference to the cell from pPage. ** ** "sz" must be the number of bytes in the cell. ** ** Do not bother maintaining the integrity of the linked list of Cells. ** Only the pPage->apCell[] array is important. The relinkCellList() ** routine will be called soon after this routine in order to rebuild ** the linked list. */ static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){ int j; assert( idx>=0 && idxnCell ); assert( sz==cellSize(pBt, pPage->apCell[idx]) ); assert( sqlitepager_iswriteable(pPage) ); freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz); for(j=idx; jnCell-1; j++){ pPage->apCell[j] = pPage->apCell[j+1]; } pPage->nCell--; pPage->idxShift = 1; } /* ** Insert a new cell on pPage at cell index "i". pCell points to the ** content of the cell. ** ** If the cell content will fit on the page, then put it there. If it ** will not fit, then just make pPage->apCell[i] point to the content ** and set pPage->isOverfull. ** ** Do not bother maintaining the integrity of the linked list of Cells. ** Only the pPage->apCell[] array is important. The relinkCellList() ** routine will be called soon after this routine in order to rebuild ** the linked list. */ static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){ int idx, j; assert( i>=0 && i<=pPage->nCell ); assert( sz==cellSize(pBt, pCell) ); assert( sqlitepager_iswriteable(pPage) ); idx = allocateSpace(pBt, pPage, sz); for(j=pPage->nCell; j>i; j--){ pPage->apCell[j] = pPage->apCell[j-1]; } pPage->nCell++; if( idx<=0 ){ pPage->isOverfull = 1; pPage->apCell[i] = pCell; }else{ memcpy(&pPage->u.aDisk[idx], pCell, sz); pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx]; } pPage->idxShift = 1; } /* ** Rebuild the linked list of cells on a page so that the cells ** occur in the order specified by the pPage->apCell[] array. ** Invoke this routine once to repair damage after one or more ** invocations of either insertCell() or dropCell(). */ static void relinkCellList(Btree *pBt, MemPage *pPage){ int i; u16 *pIdx; assert( sqlitepager_iswriteable(pPage) ); pIdx = &pPage->u.hdr.firstCell; for(i=0; inCell; i++){ int idx = Addr(pPage->apCell[i]) - Addr(pPage); assert( idx>0 && idxapCell[i]->h.iNext; } *pIdx = 0; } /* ** Make a copy of the contents of pFrom into pTo. The pFrom->apCell[] ** pointers that point into pFrom->u.aDisk[] must be adjusted to point ** into pTo->u.aDisk[] instead. But some pFrom->apCell[] entries might ** not point to pFrom->u.aDisk[]. Those are unchanged. */ static void copyPage(MemPage *pTo, MemPage *pFrom){ uptr from, to; int i; memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_PAGE_SIZE); pTo->pParent = 0; pTo->isInit = 1; pTo->nCell = pFrom->nCell; pTo->nFree = pFrom->nFree; pTo->isOverfull = pFrom->isOverfull; to = Addr(pTo); from = Addr(pFrom); for(i=0; inCell; i++){ uptr x = Addr(pFrom->apCell[i]); if( x>from && xapCell[i]) = x + to - from; }else{ pTo->apCell[i] = pFrom->apCell[i]; } } } /* ** The following parameters determine how many adjacent pages get involved ** in a balancing operation. NN is the number of neighbors on either side ** of the page that participate in the balancing operation. NB is the ** total number of pages that participate, including the target page and ** NN neighbors on either side. ** ** The minimum value of NN is 1 (of course). Increasing NN above 1 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance ** in exchange for a larger degradation in INSERT and UPDATE performance. ** The value of NN appears to give the best results overall. */ #define NN 1 /* Number of neighbors on either side of pPage */ #define NB (NN*2+1) /* Total pages involved in the balance */ /* ** This routine redistributes Cells on pPage and up to two siblings ** of pPage so that all pages have about the same amount of free space. ** Usually one sibling on either side of pPage is used in the balancing, ** though both siblings might come from one side if pPage is the first ** or last child of its parent. If pPage has fewer than two siblings ** (something which can only happen if pPage is the root page or a ** child of root) then all available siblings participate in the balancing. ** ** The number of siblings of pPage might be increased or decreased by ** one in an effort to keep pages between 66% and 100% full. The root page ** is special and is allowed to be less than 66% full. If pPage is ** the root page, then the depth of the tree might be increased ** or decreased by one, as necessary, to keep the root page from being ** overfull or empty. ** ** This routine calls relinkCellList() on its input page regardless of ** whether or not it does any real balancing. Client routines will typically ** invoke insertCell() or dropCell() before calling this routine, so we ** need to call relinkCellList() to clean up the mess that those other ** routines left behind. ** ** pCur is left pointing to the same cell as when this routine was called ** even if that cell gets moved to a different page. pCur may be NULL. ** Set the pCur parameter to NULL if you do not care about keeping track ** of a cell as that will save this routine the work of keeping track of it. ** ** Note that when this routine is called, some of the Cells on pPage ** might not actually be stored in pPage->u.aDisk[]. This can happen ** if the page is overfull. Part of the job of this routine is to ** make sure all Cells for pPage once again fit in pPage->u.aDisk[]. ** ** In the course of balancing the siblings of pPage, the parent of pPage ** might become overfull or underfull. If that happens, then this routine ** is called recursively on the parent. ** ** If this routine fails for any reason, it might leave the database ** in a corrupted state. So if this routine fails, the database should ** be rolled back. */ static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){ MemPage *pParent; /* The parent of pPage */ int nCell; /* Number of cells in apCell[] */ int nOld; /* Number of pages in apOld[] */ int nNew; /* Number of pages in apNew[] */ int nDiv; /* Number of cells in apDiv[] */ int i, j, k; /* Loop counters */ int idx; /* Index of pPage in pParent->apCell[] */ int nxDiv; /* Next divider slot in pParent->apCell[] */ int rc; /* The return code */ int iCur; /* apCell[iCur] is the cell of the cursor */ MemPage *pOldCurPage; /* The cursor originally points to this page */ int subtotal; /* Subtotal of bytes in cells on one page */ MemPage *extraUnref = 0; /* A page that needs to be unref-ed */ MemPage *apOld[NB]; /* pPage and up to two siblings */ Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */ Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */ int idxDiv[NB]; /* Indices of divider cells in pParent */ Cell *apDiv[NB]; /* Divider cells in pParent */ Cell aTemp[NB]; /* Temporary holding area for apDiv[] */ int cntNew[NB+1]; /* Index in apCell[] of cell after i-th page */ int szNew[NB+1]; /* Combined size of cells place on i-th page */ MemPage aOld[NB]; /* Temporary copies of pPage and its siblings */ Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */ int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */ /* ** Return without doing any work if pPage is neither overfull nor ** underfull. */ assert( sqlitepager_iswriteable(pPage) ); if( !pPage->isOverfull && pPage->nFreenCell>=2){ relinkCellList(pBt, pPage); return SQLITE_OK; } /* ** Find the parent of the page to be balanceed. ** If there is no parent, it means this page is the root page and ** special rules apply. */ pParent = pPage->pParent; if( pParent==0 ){ Pgno pgnoChild; MemPage *pChild; assert( pPage->isInit ); if( pPage->nCell==0 ){ if( pPage->u.hdr.rightChild ){ /* ** The root page is empty. Copy the one child page ** into the root page and return. This reduces the depth ** of the BTree by one. */ pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild); rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild); if( rc ) return rc; memcpy(pPage, pChild, SQLITE_PAGE_SIZE); pPage->isInit = 0; rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0); assert( rc==SQLITE_OK ); reparentChildPages(pBt, pPage); if( pCur && pCur->pPage==pChild ){ sqlitepager_unref(pChild); pCur->pPage = pPage; sqlitepager_ref(pPage); } freePage(pBt, pChild, pgnoChild); sqlitepager_unref(pChild); }else{ relinkCellList(pBt, pPage); } return SQLITE_OK; } if( !pPage->isOverfull ){ /* It is OK for the root page to be less than half full. */ relinkCellList(pBt, pPage); return SQLITE_OK; } /* ** If we get to here, it means the root page is overfull. ** When this happens, Create a new child page and copy the ** contents of the root into the child. Then make the root ** page an empty page with rightChild pointing to the new ** child. Then fall thru to the code below which will cause ** the overfull child page to be split. */ rc = sqlitepager_write(pPage); if( rc ) return rc; rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage)); if( rc ) return rc; assert( sqlitepager_iswriteable(pChild) ); copyPage(pChild, pPage); pChild->pParent = pPage; pChild->idxParent = 0; sqlitepager_ref(pPage); pChild->isOverfull = 1; if( pCur && pCur->pPage==pPage ){ sqlitepager_unref(pPage); pCur->pPage = pChild; }else{ extraUnref = pChild; } zeroPage(pBt, pPage); pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild); pParent = pPage; pPage = pChild; } rc = sqlitepager_write(pParent); if( rc ) return rc; assert( pParent->isInit ); /* ** Find the Cell in the parent page whose h.leftChild points back ** to pPage. The "idx" variable is the index of that cell. If pPage ** is the rightmost child of pParent then set idx to pParent->nCell */ if( pParent->idxShift ){ Pgno pgno, swabPgno; pgno = sqlitepager_pagenumber(pPage); swabPgno = SWAB32(pBt, pgno); for(idx=0; idxnCell; idx++){ if( pParent->apCell[idx]->h.leftChild==swabPgno ){ break; } } assert( idxnCell || pParent->u.hdr.rightChild==swabPgno ); }else{ idx = pPage->idxParent; } /* ** Initialize variables so that it will be safe to jump ** directly to balance_cleanup at any moment. */ nOld = nNew = 0; sqlitepager_ref(pParent); /* ** Find sibling pages to pPage and the Cells in pParent that divide ** the siblings. An attempt is made to find NN siblings on either ** side of pPage. More siblings are taken from one side, however, if ** pPage there are fewer than NN siblings on the other side. If pParent ** has NB or fewer children then all children of pParent are taken. */ nxDiv = idx - NN; if( nxDiv + NB > pParent->nCell ){ nxDiv = pParent->nCell - NB + 1; } if( nxDiv<0 ){ nxDiv = 0; } nDiv = 0; for(i=0, k=nxDiv; inCell ){ idxDiv[i] = k; apDiv[i] = pParent->apCell[k]; nDiv++; pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild); }else if( k==pParent->nCell ){ pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild); }else{ break; } rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]); if( rc ) goto balance_cleanup; rc = initPage(pBt, apOld[i], pgnoOld[i], pParent); if( rc ) goto balance_cleanup; apOld[i]->idxParent = k; nOld++; } /* ** Set iCur to be the index in apCell[] of the cell that the cursor ** is pointing to. We will need this later on in order to keep the ** cursor pointing at the same cell. If pCur points to a page that ** has no involvement with this rebalancing, then set iCur to a large ** number so that the iCur==j tests always fail in the main cell ** distribution loop below. */ if( pCur ){ iCur = 0; for(i=0; ipPage==apOld[i] ){ iCur += pCur->idx; break; } iCur += apOld[i]->nCell; if( ipPage==pParent && pCur->idx==idxDiv[i] ){ break; } iCur++; } pOldCurPage = pCur->pPage; } /* ** Make copies of the content of pPage and its siblings into aOld[]. ** The rest of this function will use data from the copies rather ** that the original pages since the original pages will be in the ** process of being overwritten. */ for(i=0; inCell; j++){ apCell[nCell] = pOld->apCell[j]; szCell[nCell] = cellSize(pBt, apCell[nCell]); nCell++; } if( ih.leftChild)==pgnoOld[i] ); apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild; nCell++; } } /* ** Figure out the number of pages needed to hold all nCell cells. ** Store this number in "k". Also compute szNew[] which is the total ** size of all cells on the i-th page and cntNew[] which is the index ** in apCell[] of the cell that divides path i from path i+1. ** cntNew[k] should equal nCell. ** ** This little patch of code is critical for keeping the tree ** balanced. */ for(subtotal=k=i=0; i USABLE_SPACE ){ szNew[k] = subtotal - szCell[i]; cntNew[k] = i; subtotal = 0; k++; } } szNew[k] = subtotal; cntNew[k] = nCell; k++; for(i=k-1; i>0; i--){ while( szNew[i]0 ); szNew[i] += szCell[cntNew[i-1]]; szNew[i-1] -= szCell[cntNew[i-1]-1]; } } assert( cntNew[0]>0 ); /* ** Allocate k new pages. Reuse old pages where possible. */ for(i=0; iisInit = 1; } /* Free any old pages that were not reused as new pages. */ while( ii ){ int t; MemPage *pT; t = pgnoNew[i]; pT = apNew[i]; pgnoNew[i] = pgnoNew[minI]; apNew[i] = apNew[minI]; pgnoNew[minI] = t; apNew[minI] = pT; } } /* ** Evenly distribute the data in apCell[] across the new pages. ** Insert divider cells into pParent as necessary. */ j = 0; for(i=0; inFree>=szCell[j] ); if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; } insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]); j++; } assert( pNew->nCell>0 ); assert( !pNew->isOverfull ); relinkCellList(pBt, pNew); if( iu.hdr.rightChild = apCell[j]->h.leftChild; apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]); if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; } insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]); j++; nxDiv++; } } assert( j==nCell ); apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild; if( nxDiv==pParent->nCell ){ pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]); }else{ pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]); } if( pCur ){ if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){ assert( pCur->pPage==pOldCurPage ); pCur->idx += nNew - nOld; }else{ assert( pOldCurPage!=0 ); sqlitepager_ref(pCur->pPage); sqlitepager_unref(pOldCurPage); } } /* ** Reparent children of all cells. */ for(i=0; ipPage==0 ){ pCur->pPage = pParent; pCur->idx = 0; }else{ sqlitepager_unref(pParent); } return rc; } /* ** This routine checks all cursors that point to the same table ** as pCur points to. If any of those cursors were opened with ** wrFlag==0 then this routine returns SQLITE_LOCKED. If all ** cursors point to the same table were opened with wrFlag==1 ** then this routine returns SQLITE_OK. ** ** In addition to checking for read-locks (where a read-lock ** means a cursor opened with wrFlag==0) this routine also moves ** all cursors other than pCur so that they are pointing to the ** first Cell on root page. This is necessary because an insert ** or delete might change the number of cells on a page or delete ** a page entirely and we do not want to leave any cursors ** pointing to non-existant pages or cells. */ static int checkReadLocks(BtCursor *pCur){ BtCursor *p; assert( pCur->wrFlag ); for(p=pCur->pShared; p!=pCur; p=p->pShared){ assert( p ); assert( p->pgnoRoot==pCur->pgnoRoot ); if( p->wrFlag==0 ) return SQLITE_LOCKED; if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){ moveToRoot(p); } } return SQLITE_OK; } /* ** Insert a new record into the BTree. The key is given by (pKey,nKey) ** and the data is given by (pData,nData). The cursor is used only to ** define what database the record should be inserted into. The cursor ** is left pointing at the new record. */ static int fileBtreeInsert( BtCursor *pCur, /* Insert data into the table of this cursor */ const void *pKey, int nKey, /* The key of the new record */ const void *pData, int nData /* The data of the new record */ ){ Cell newCell; int rc; int loc; int szNew; MemPage *pPage; Btree *pBt = pCur->pBt; if( pCur->pPage==0 ){ return SQLITE_ABORT; /* A rollback destroyed this cursor */ } if( !pBt->inTrans || nKey+nData==0 ){ /* Must start a transaction before doing an insert */ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } assert( !pBt->readOnly ); if( !pCur->wrFlag ){ return SQLITE_PERM; /* Cursor not open for writing */ } if( checkReadLocks(pCur) ){ return SQLITE_LOCKED; /* The table pCur points to has a read lock */ } rc = fileBtreeMoveto(pCur, pKey, nKey, &loc); if( rc ) return rc; pPage = pCur->pPage; assert( pPage->isInit ); rc = sqlitepager_write(pPage); if( rc ) return rc; rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData); if( rc ) return rc; szNew = cellSize(pBt, &newCell); if( loc==0 ){ newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild; rc = clearCell(pBt, pPage->apCell[pCur->idx]); if( rc ) return rc; dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx])); }else if( loc<0 && pPage->nCell>0 ){ assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */ pCur->idx++; }else{ assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */ } insertCell(pBt, pPage, pCur->idx, &newCell, szNew); rc = balance(pCur->pBt, pPage, pCur); /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */ /* fflush(stdout); */ pCur->eSkip = SKIP_INVALID; return rc; } /* ** Delete the entry that the cursor is pointing to. ** ** The cursor is left pointing at either the next or the previous ** entry. If the cursor is left pointing to the next entry, then ** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to ** sqliteBtreeNext() to be a no-op. That way, you can always call ** sqliteBtreeNext() after a delete and the cursor will be left ** pointing to the first entry after the deleted entry. Similarly, ** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to ** the entry prior to the deleted entry so that a subsequent call to ** sqliteBtreePrevious() will always leave the cursor pointing at the ** entry immediately before the one that was deleted. */ static int fileBtreeDelete(BtCursor *pCur){ MemPage *pPage = pCur->pPage; Cell *pCell; int rc; Pgno pgnoChild; Btree *pBt = pCur->pBt; assert( pPage->isInit ); if( pCur->pPage==0 ){ return SQLITE_ABORT; /* A rollback destroyed this cursor */ } if( !pBt->inTrans ){ /* Must start a transaction before doing a delete */ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } assert( !pBt->readOnly ); if( pCur->idx >= pPage->nCell ){ return SQLITE_ERROR; /* The cursor is not pointing to anything */ } if( !pCur->wrFlag ){ return SQLITE_PERM; /* Did not open this cursor for writing */ } if( checkReadLocks(pCur) ){ return SQLITE_LOCKED; /* The table pCur points to has a read lock */ } rc = sqlitepager_write(pPage); if( rc ) return rc; pCell = pPage->apCell[pCur->idx]; pgnoChild = SWAB32(pBt, pCell->h.leftChild); clearCell(pBt, pCell); if( pgnoChild ){ /* ** The entry we are about to delete is not a leaf so if we do not ** do something we will leave a hole on an internal page. ** We have to fill the hole by moving in a cell from a leaf. The ** next Cell after the one to be deleted is guaranteed to exist and ** to be a leaf so we can use it. */ BtCursor leafCur; Cell *pNext; int szNext; int notUsed; getTempCursor(pCur, &leafCur); rc = fileBtreeNext(&leafCur, ¬Used); if( rc!=SQLITE_OK ){ return SQLITE_CORRUPT; } rc = sqlitepager_write(leafCur.pPage); if( rc ) return rc; dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell)); pNext = leafCur.pPage->apCell[leafCur.idx]; szNext = cellSize(pBt, pNext); pNext->h.leftChild = SWAB32(pBt, pgnoChild); insertCell(pBt, pPage, pCur->idx, pNext, szNext); rc = balance(pBt, pPage, pCur); if( rc ) return rc; pCur->eSkip = SKIP_NEXT; dropCell(pBt, leafCur.pPage, leafCur.idx, szNext); rc = balance(pBt, leafCur.pPage, pCur); releaseTempCursor(&leafCur); }else{ dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell)); if( pCur->idx>=pPage->nCell ){ pCur->idx = pPage->nCell-1; if( pCur->idx<0 ){ pCur->idx = 0; pCur->eSkip = SKIP_NEXT; }else{ pCur->eSkip = SKIP_PREV; } }else{ pCur->eSkip = SKIP_NEXT; } rc = balance(pBt, pPage, pCur); } return rc; } /* ** Create a new BTree table. Write into *piTable the page ** number for the root page of the new table. ** ** In the current implementation, BTree tables and BTree indices are the ** the same. In the future, we may change this so that BTree tables ** are restricted to having a 4-byte integer key and arbitrary data and ** BTree indices are restricted to having an arbitrary key and no data. ** But for now, this routine also serves to create indices. */ static int fileBtreeCreateTable(Btree *pBt, int *piTable){ MemPage *pRoot; Pgno pgnoRoot; int rc; if( !pBt->inTrans ){ /* Must start a transaction first */ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } if( pBt->readOnly ){ return SQLITE_READONLY; } rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0); if( rc ) return rc; assert( sqlitepager_iswriteable(pRoot) ); zeroPage(pBt, pRoot); sqlitepager_unref(pRoot); *piTable = (int)pgnoRoot; return SQLITE_OK; } /* ** Erase the given database page and all its children. Return ** the page to the freelist. */ static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){ MemPage *pPage; int rc; Cell *pCell; int idx; rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage); if( rc ) return rc; rc = sqlitepager_write(pPage); if( rc ) return rc; rc = initPage(pBt, pPage, pgno, 0); if( rc ) return rc; idx = SWAB16(pBt, pPage->u.hdr.firstCell); while( idx>0 ){ pCell = (Cell*)&pPage->u.aDisk[idx]; idx = SWAB16(pBt, pCell->h.iNext); if( pCell->h.leftChild ){ rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1); if( rc ) return rc; } rc = clearCell(pBt, pCell); if( rc ) return rc; } if( pPage->u.hdr.rightChild ){ rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1); if( rc ) return rc; } if( freePageFlag ){ rc = freePage(pBt, pPage, pgno); }else{ zeroPage(pBt, pPage); } sqlitepager_unref(pPage); return rc; } /* ** Delete all information from a single table in the database. */ static int fileBtreeClearTable(Btree *pBt, int iTable){ int rc; BtCursor *pCur; if( !pBt->inTrans ){ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ if( pCur->pgnoRoot==(Pgno)iTable ){ if( pCur->wrFlag==0 ) return SQLITE_LOCKED; moveToRoot(pCur); } } rc = clearDatabasePage(pBt, (Pgno)iTable, 0); if( rc ){ fileBtreeRollback(pBt); } return rc; } /* ** Erase all information in a table and add the root of the table to ** the freelist. Except, the root of the principle table (the one on ** page 2) is never added to the freelist. */ static int fileBtreeDropTable(Btree *pBt, int iTable){ int rc; MemPage *pPage; BtCursor *pCur; if( !pBt->inTrans ){ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ if( pCur->pgnoRoot==(Pgno)iTable ){ return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */ } } rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage); if( rc ) return rc; rc = fileBtreeClearTable(pBt, iTable); if( rc ) return rc; if( iTable>2 ){ rc = freePage(pBt, pPage, iTable); }else{ zeroPage(pBt, pPage); } sqlitepager_unref(pPage); return rc; } #if 0 /* UNTESTED */ /* ** Copy all cell data from one database file into another. ** pages back the freelist. */ static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){ Pager *pFromPager = pBtFrom->pPager; OverflowPage *pOvfl; Pgno ovfl, nextOvfl; Pgno *pPrev; int rc = SQLITE_OK; MemPage *pNew, *pPrevPg; Pgno new; if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){ return SQLITE_OK; } pPrev = &pCell->ovfl; pPrevPg = 0; ovfl = SWAB32(pBtTo, pCell->ovfl); while( ovfl && rc==SQLITE_OK ){ rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl); if( rc ) return rc; nextOvfl = SWAB32(pBtFrom, pOvfl->iNext); rc = allocatePage(pBtTo, &pNew, &new, 0); if( rc==SQLITE_OK ){ rc = sqlitepager_write(pNew); if( rc==SQLITE_OK ){ memcpy(pNew, pOvfl, SQLITE_PAGE_SIZE); *pPrev = SWAB32(pBtTo, new); if( pPrevPg ){ sqlitepager_unref(pPrevPg); } pPrev = &pOvfl->iNext; pPrevPg = pNew; } } sqlitepager_unref(pOvfl); ovfl = nextOvfl; } if( pPrevPg ){ sqlitepager_unref(pPrevPg); } return rc; } #endif #if 0 /* UNTESTED */ /* ** Copy a page of data from one database over to another. */ static int copyDatabasePage( Btree *pBtFrom, Pgno pgnoFrom, Btree *pBtTo, Pgno *pTo ){ MemPage *pPageFrom, *pPage; Pgno to; int rc; Cell *pCell; int idx; rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom); if( rc ) return rc; rc = allocatePage(pBt, &pPage, pTo, 0); if( rc==SQLITE_OK ){ rc = sqlitepager_write(pPage); } if( rc==SQLITE_OK ){ memcpy(pPage, pPageFrom, SQLITE_PAGE_SIZE); idx = SWAB16(pBt, pPage->u.hdr.firstCell); while( idx>0 ){ pCell = (Cell*)&pPage->u.aDisk[idx]; idx = SWAB16(pBt, pCell->h.iNext); if( pCell->h.leftChild ){ Pgno newChld; rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild), pBtTo, &newChld); if( rc ) return rc; pCell->h.leftChild = SWAB32(pBtFrom, newChld); } rc = copyCell(pBtFrom, pBtTo, pCell); if( rc ) return rc; } if( pPage->u.hdr.rightChild ){ Pgno newChld; rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild), pBtTo, &newChld); if( rc ) return rc; pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild); } } sqlitepager_unref(pPage); return rc; } #endif /* ** Read the meta-information out of a database file. */ static int fileBtreeGetMeta(Btree *pBt, int *aMeta){ PageOne *pP1; int rc; int i; rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1); if( rc ) return rc; aMeta[0] = SWAB32(pBt, pP1->nFree); for(i=0; iaMeta)/sizeof(pP1->aMeta[0]); i++){ aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]); } sqlitepager_unref(pP1); return SQLITE_OK; } /* ** Write meta-information back into the database. */ static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){ PageOne *pP1; int rc, i; if( !pBt->inTrans ){ return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; } pP1 = pBt->page1; rc = sqlitepager_write(pP1); if( rc ) return rc; for(i=0; iaMeta)/sizeof(pP1->aMeta[0]); i++){ pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]); } return SQLITE_OK; } /****************************************************************************** ** The complete implementation of the BTree subsystem is above this line. ** All the code the follows is for testing and troubleshooting the BTree ** subsystem. None of the code that follows is used during normal operation. ******************************************************************************/ /* ** Print a disassembly of the given page on standard output. This routine ** is used for debugging and testing only. */ #ifdef SQLITE_TEST static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){ int rc; MemPage *pPage; int i, j; int nFree; u16 idx; char range[20]; unsigned char payload[20]; rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage); if( rc ){ return rc; } if( recursive ) printf("PAGE %d:\n", pgno); i = 0; idx = SWAB16(pBt, pPage->u.hdr.firstCell); while( idx>0 && idx<=SQLITE_PAGE_SIZE-MIN_CELL_SIZE ){ Cell *pCell = (Cell*)&pPage->u.aDisk[idx]; int sz = cellSize(pBt, pCell); sprintf(range,"%d..%d", idx, idx+sz-1); sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1; memcpy(payload, pCell->aPayload, sz); for(j=0; j0x7f ) payload[j] = '.'; } payload[sz] = 0; printf( "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n", i, range, (int)pCell->h.leftChild, NKEY(pBt, pCell->h), NDATA(pBt, pCell->h), payload ); if( pPage->isInit && pPage->apCell[i]!=pCell ){ printf("**** apCell[%d] does not match on prior entry ****\n", i); } i++; idx = SWAB16(pBt, pCell->h.iNext); } if( idx!=0 ){ printf("ERROR: next cell index out of range: %d\n", idx); } printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild)); nFree = 0; i = 0; idx = SWAB16(pBt, pPage->u.hdr.firstFree); while( idx>0 && idxu.aDisk[idx]; sprintf(range,"%d..%d", idx, idx+p->iSize-1); nFree += SWAB16(pBt, p->iSize); printf("freeblock %2d: i=%-10s size=%-4d total=%d\n", i, range, SWAB16(pBt, p->iSize), nFree); idx = SWAB16(pBt, p->iNext); i++; } if( idx!=0 ){ printf("ERROR: next freeblock index out of range: %d\n", idx); } if( recursive && pPage->u.hdr.rightChild!=0 ){ idx = SWAB16(pBt, pPage->u.hdr.firstCell); while( idx>0 && idxu.aDisk[idx]; fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1); idx = SWAB16(pBt, pCell->h.iNext); } fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1); } sqlitepager_unref(pPage); return SQLITE_OK; } #endif #ifdef SQLITE_TEST /* ** Fill aResult[] with information about the entry and page that the ** cursor is pointing to. ** ** aResult[0] = The page number ** aResult[1] = The entry number ** aResult[2] = Total number of entries on this page ** aResult[3] = Size of this entry ** aResult[4] = Number of free bytes on this page ** aResult[5] = Number of free blocks on the page ** aResult[6] = Page number of the left child of this entry ** aResult[7] = Page number of the right child for the whole page ** ** This routine is used for testing and debugging only. */ static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){ int cnt, idx; MemPage *pPage = pCur->pPage; Btree *pBt = pCur->pBt; aResult[0] = sqlitepager_pagenumber(pPage); aResult[1] = pCur->idx; aResult[2] = pPage->nCell; if( pCur->idx>=0 && pCur->idxnCell ){ aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]); aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild); }else{ aResult[3] = 0; aResult[6] = 0; } aResult[4] = pPage->nFree; cnt = 0; idx = SWAB16(pBt, pPage->u.hdr.firstFree); while( idx>0 && idxu.aDisk[idx])->iNext); } aResult[5] = cnt; aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild); return SQLITE_OK; } #endif #ifdef SQLITE_TEST /* ** Return the pager associated with a BTree. This routine is used for ** testing and debugging only. */ static Pager *fileBtreePager(Btree *pBt){ return pBt->pPager; } #endif /* ** This structure is passed around through all the sanity checking routines ** in order to keep track of some global state information. */ typedef struct IntegrityCk IntegrityCk; struct IntegrityCk { Btree *pBt; /* The tree being checked out */ Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */ int nPage; /* Number of pages in the database */ int *anRef; /* Number of times each page is referenced */ int nTreePage; /* Number of BTree pages */ int nByte; /* Number of bytes of data stored on BTree pages */ char *zErrMsg; /* An error message. NULL of no errors seen. */ }; /* ** Append a message to the error message string. */ static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){ if( pCheck->zErrMsg ){ char *zOld = pCheck->zErrMsg; pCheck->zErrMsg = 0; sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0); sqliteFree(zOld); }else{ sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0); } } /* ** Add 1 to the reference count for page iPage. If this is the second ** reference to the page, add an error message to pCheck->zErrMsg. ** Return 1 if there are 2 ore more references to the page and 0 if ** if this is the first reference to the page. ** ** Also check that the page number is in bounds. */ static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ if( iPage==0 ) return 1; if( iPage>pCheck->nPage || iPage<0 ){ char zBuf[100]; sprintf(zBuf, "invalid page number %d", iPage); checkAppendMsg(pCheck, zContext, zBuf); return 1; } if( pCheck->anRef[iPage]==1 ){ char zBuf[100]; sprintf(zBuf, "2nd reference to page %d", iPage); checkAppendMsg(pCheck, zContext, zBuf); return 1; } return (pCheck->anRef[iPage]++)>1; } /* ** Check the integrity of the freelist or of an overflow page list. ** Verify that the number of pages on the list is N. */ static void checkList( IntegrityCk *pCheck, /* Integrity checking context */ int isFreeList, /* True for a freelist. False for overflow page list */ int iPage, /* Page number for first page in the list */ int N, /* Expected number of pages in the list */ char *zContext /* Context for error messages */ ){ int i; char zMsg[100]; while( N-- > 0 ){ OverflowPage *pOvfl; if( iPage<1 ){ sprintf(zMsg, "%d pages missing from overflow list", N+1); checkAppendMsg(pCheck, zContext, zMsg); break; } if( checkRef(pCheck, iPage, zContext) ) break; if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){ sprintf(zMsg, "failed to get page %d", iPage); checkAppendMsg(pCheck, zContext, zMsg); break; } if( isFreeList ){ FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload; int n = SWAB32(pCheck->pBt, pInfo->nFree); for(i=0; ipBt, pInfo->aFree[i]), zContext); } N -= n; } iPage = SWAB32(pCheck->pBt, pOvfl->iNext); sqlitepager_unref(pOvfl); } } /* ** Return negative if zKey1zKey2. */ static int keyCompare( const char *zKey1, int nKey1, const char *zKey2, int nKey2 ){ int min = nKey1>nKey2 ? nKey2 : nKey1; int c = memcmp(zKey1, zKey2, min); if( c==0 ){ c = nKey1 - nKey2; } return c; } /* ** Do various sanity checks on a single page of a tree. Return ** the tree depth. Root pages return 0. Parents of root pages ** return 1, and so forth. ** ** These checks are done: ** ** 1. Make sure that cells and freeblocks do not overlap ** but combine to completely cover the page. ** 2. Make sure cell keys are in order. ** 3. Make sure no key is less than or equal to zLowerBound. ** 4. Make sure no key is greater than or equal to zUpperBound. ** 5. Check the integrity of overflow pages. ** 6. Recursively call checkTreePage on all children. ** 7. Verify that the depth of all children is the same. ** 8. Make sure this page is at least 33% full or else it is ** the root of the tree. */ static int checkTreePage( IntegrityCk *pCheck, /* Context for the sanity check */ int iPage, /* Page number of the page to check */ MemPage *pParent, /* Parent page */ char *zParentContext, /* Parent context */ char *zLowerBound, /* All keys should be greater than this, if not NULL */ int nLower, /* Number of characters in zLowerBound */ char *zUpperBound, /* All keys should be less than this, if not NULL */ int nUpper /* Number of characters in zUpperBound */ ){ MemPage *pPage; int i, rc, depth, d2, pgno; char *zKey1, *zKey2; int nKey1, nKey2; BtCursor cur; Btree *pBt; char zMsg[100]; char zContext[100]; char hit[SQLITE_PAGE_SIZE]; /* Check that the page exists */ cur.pBt = pBt = pCheck->pBt; if( iPage==0 ) return 0; if( checkRef(pCheck, iPage, zParentContext) ) return 0; sprintf(zContext, "On tree page %d: ", iPage); if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){ sprintf(zMsg, "unable to get the page. error code=%d", rc); checkAppendMsg(pCheck, zContext, zMsg); return 0; } if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){ sprintf(zMsg, "initPage() returns error code %d", rc); checkAppendMsg(pCheck, zContext, zMsg); sqlitepager_unref(pPage); return 0; } /* Check out all the cells. */ depth = 0; if( zLowerBound ){ zKey1 = sqliteMalloc( nLower+1 ); memcpy(zKey1, zLowerBound, nLower); zKey1[nLower] = 0; }else{ zKey1 = 0; } nKey1 = nLower; cur.pPage = pPage; for(i=0; inCell; i++){ Cell *pCell = pPage->apCell[i]; int sz; /* Check payload overflow pages */ nKey2 = NKEY(pBt, pCell->h); sz = nKey2 + NDATA(pBt, pCell->h); sprintf(zContext, "On page %d cell %d: ", iPage, i); if( sz>MX_LOCAL_PAYLOAD ){ int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE; checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext); } /* Check that keys are in the right order */ cur.idx = i; zKey2 = sqliteMallocRaw( nKey2+1 ); getPayload(&cur, 0, nKey2, zKey2); if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){ checkAppendMsg(pCheck, zContext, "Key is out of order"); } /* Check sanity of left child page. */ pgno = SWAB32(pBt, pCell->h.leftChild); d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2); if( i>0 && d2!=depth ){ checkAppendMsg(pCheck, zContext, "Child page depth differs"); } depth = d2; sqliteFree(zKey1); zKey1 = zKey2; nKey1 = nKey2; } pgno = SWAB32(pBt, pPage->u.hdr.rightChild); sprintf(zContext, "On page %d at right child: ", iPage); checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper); sqliteFree(zKey1); /* Check for complete coverage of the page */ memset(hit, 0, sizeof(hit)); memset(hit, 1, sizeof(PageHdr)); for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && iu.aDisk[i]; int j; for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++; i = SWAB16(pBt, pCell->h.iNext); } for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && iu.aDisk[i]; int j; for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++; i = SWAB16(pBt,pFBlk->iNext); } for(i=0; i1 ){ sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage); checkAppendMsg(pCheck, zMsg, 0); break; } } /* Check that free space is kept to a minimum */ #if 0 if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_PAGE_SIZE/4 ){ sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree, SQLITE_PAGE_SIZE/3); checkAppendMsg(pCheck, zContext, zMsg); } #endif /* Update freespace totals. */ pCheck->nTreePage++; pCheck->nByte += USABLE_SPACE - pPage->nFree; sqlitepager_unref(pPage); return depth; } /* ** This routine does a complete check of the given BTree file. aRoot[] is ** an array of pages numbers were each page number is the root page of ** a table. nRoot is the number of entries in aRoot. ** ** If everything checks out, this routine returns NULL. If something is ** amiss, an error message is written into memory obtained from malloc() ** and a pointer to that error message is returned. The calling function ** is responsible for freeing the error message when it is done. */ char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){ int i; int nRef; IntegrityCk sCheck; nRef = *sqlitepager_stats(pBt->pPager); if( lockBtree(pBt)!=SQLITE_OK ){ return sqliteStrDup("Unable to acquire a read lock on the database"); } sCheck.pBt = pBt; sCheck.pPager = pBt->pPager; sCheck.nPage = sqlitepager_pagecount(sCheck.pPager); if( sCheck.nPage==0 ){ unlockBtreeIfUnused(pBt); return 0; } sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); sCheck.anRef[1] = 1; for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } sCheck.zErrMsg = 0; /* Check the integrity of the freelist */ checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList), SWAB32(pBt, pBt->page1->nFree), "Main freelist: "); /* Check all the tables. */ for(i=0; ipPager) ){ char zBuf[100]; sprintf(zBuf, "Outstanding page count goes from %d to %d during this analysis", nRef, *sqlitepager_stats(pBt->pPager) ); checkAppendMsg(&sCheck, zBuf, 0); } /* Clean up and report errors. */ sqliteFree(sCheck.anRef); return sCheck.zErrMsg; } /* ** Return the full pathname of the underlying database file. */ static const char *fileBtreeGetFilename(Btree *pBt){ assert( pBt->pPager!=0 ); return sqlitepager_filename(pBt->pPager); } /* ** Copy the complete content of pBtFrom into pBtTo. A transaction ** must be active for both files. ** ** The size of file pBtFrom may be reduced by this operation. ** If anything goes wrong, the transaction on pBtFrom is rolled back. */ static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){ int rc = SQLITE_OK; Pgno i, nPage, nToPage; if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR; if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR; if( pBtTo->pCursor ) return SQLITE_BUSY; memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_PAGE_SIZE); rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1); nToPage = sqlitepager_pagecount(pBtTo->pPager); nPage = sqlitepager_pagecount(pBtFrom->pPager); for(i=2; rc==SQLITE_OK && i<=nPage; i++){ void *pPage; rc = sqlitepager_get(pBtFrom->pPager, i, &pPage); if( rc ) break; rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage); if( rc ) break; sqlitepager_unref(pPage); } for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){ void *pPage; rc = sqlitepager_get(pBtTo->pPager, i, &pPage); if( rc ) break; rc = sqlitepager_write(pPage); sqlitepager_unref(pPage); sqlitepager_dont_write(pBtTo->pPager, i); } if( !rc && nPagepPager, nPage); } if( rc ){ fileBtreeRollback(pBtTo); } return rc; } /* ** The following tables contain pointers to all of the interface ** routines for this implementation of the B*Tree backend. To ** substitute a different implemention of the backend, one has merely ** to provide pointers to alternative functions in similar tables. */ static BtOps sqliteBtreeOps = { fileBtreeClose, fileBtreeSetCacheSize, fileBtreeSetSafetyLevel, fileBtreeBeginTrans, fileBtreeCommit, fileBtreeRollback, fileBtreeBeginCkpt, fileBtreeCommitCkpt, fileBtreeRollbackCkpt, fileBtreeCreateTable, fileBtreeCreateTable, /* Really sqliteBtreeCreateIndex() */ fileBtreeDropTable, fileBtreeClearTable, fileBtreeCursor, fileBtreeGetMeta, fileBtreeUpdateMeta, fileBtreeIntegrityCheck, fileBtreeGetFilename, fileBtreeCopyFile, #ifdef SQLITE_TEST fileBtreePageDump, fileBtreePager #endif }; static BtCursorOps sqliteBtreeCursorOps = { fileBtreeMoveto, fileBtreeDelete, fileBtreeInsert, fileBtreeFirst, fileBtreeLast, fileBtreeNext, fileBtreePrevious, fileBtreeKeySize, fileBtreeKey, fileBtreeKeyCompare, fileBtreeDataSize, fileBtreeData, fileBtreeCloseCursor, #ifdef SQLITE_TEST fileBtreeCursorDump, #endif };