Index: src/analyze.c ================================================================== --- src/analyze.c +++ src/analyze.c @@ -174,11 +174,11 @@ ** because the OpenWrite opcode below will be needing it. */ sqlite3NestedParse(pParse, "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols ); aRoot[i] = pParse->regRoot; - aCreateTbl[i] = 1; + aCreateTbl[i] = OPFLAG_P2ISREG; }else{ /* The table already exists. If zWhere is not NULL, delete all entries ** associated with the table zWhere. If zWhere is NULL, delete the ** entire contents of the table. */ aRoot[i] = pStat->tnum; Index: src/btree.c ================================================================== --- src/btree.c +++ src/btree.c @@ -5924,11 +5924,12 @@ */ static int balance_nonroot( MemPage *pParent, /* Parent page of siblings being balanced */ int iParentIdx, /* Index of "the page" in pParent */ u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ - int isRoot /* True if pParent is a root-page */ + int isRoot, /* True if pParent is a root-page */ + int bBulk /* True if this call is part of a bulk load */ ){ BtShared *pBt; /* The whole database */ int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nNew = 0; /* Number of pages in apNew[] */ @@ -6255,11 +6256,11 @@ rc = sqlite3PagerWrite(pNew->pDbPage); nNew++; if( rc ) goto balance_cleanup; }else{ assert( i>0 ); - rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0); + rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; apNew[i] = pNew; nNew++; /* Set the pointer-map entry for the new sibling page. */ @@ -6705,11 +6706,11 @@ ** the previous call, as the overflow cell data will have been ** copied either into the body of a database page or into the new ** pSpace buffer passed to the latter call to balance_nonroot(). */ u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); - rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1); + rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints); if( pFree ){ /* If pFree is not NULL, it points to the pSpace buffer used ** by a previous call to balance_nonroot(). Its contents are ** now stored either on real database pages or within the ** new pSpace buffer, so it may be safely freed here. */ @@ -8292,5 +8293,15 @@ } pBt->btsFlags &= ~BTS_NO_WAL; return rc; } + +/* +** set the mask of hint flags for cursor pCsr. Currently the only valid +** values are 0 and BTREE_BULKLOAD. +*/ +void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){ + assert( mask==BTREE_BULKLOAD || mask==0 ); + pCsr->hints = mask; +} + Index: src/btree.h ================================================================== --- src/btree.h +++ src/btree.h @@ -133,10 +133,16 @@ #define BTREE_LARGEST_ROOT_PAGE 4 #define BTREE_TEXT_ENCODING 5 #define BTREE_USER_VERSION 6 #define BTREE_INCR_VACUUM 7 +/* +** Values that may be OR'd together to form the second argument of an +** sqlite3BtreeCursorHints() call. +*/ +#define BTREE_BULKLOAD 0x00000001 + int sqlite3BtreeCursor( Btree*, /* BTree containing table to open */ int iTable, /* Index of root page */ int wrFlag, /* 1 for writing. 0 for read-only */ struct KeyInfo*, /* First argument to compare function */ @@ -176,12 +182,12 @@ struct Pager *sqlite3BtreePager(Btree*); int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*); void sqlite3BtreeCacheOverflow(BtCursor *); void sqlite3BtreeClearCursor(BtCursor *); - int sqlite3BtreeSetVersion(Btree *pBt, int iVersion); +void sqlite3BtreeCursorHints(BtCursor *, unsigned int mask); #ifndef NDEBUG int sqlite3BtreeCursorIsValid(BtCursor*); #endif Index: src/btreeInt.h ================================================================== --- src/btreeInt.h +++ src/btreeInt.h @@ -508,10 +508,11 @@ u8 validNKey; /* True if info.nKey is valid */ u8 eState; /* One of the CURSOR_XXX constants (see below) */ #ifndef SQLITE_OMIT_INCRBLOB u8 isIncrblobHandle; /* True if this cursor is an incr. io handle */ #endif + u8 hints; /* As configured by CursorSetHints() */ i16 iPage; /* Index of current page in apPage */ u16 aiIdx[BTCURSOR_MAX_DEPTH]; /* Current index in apPage[i] */ MemPage *apPage[BTCURSOR_MAX_DEPTH]; /* Pages from root to current page */ }; Index: src/build.c ================================================================== --- src/build.c +++ src/build.c @@ -1579,11 +1579,11 @@ SelectDest dest; Table *pSelTab; assert(pParse->nTab==1); sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb); - sqlite3VdbeChangeP5(v, 1); + sqlite3VdbeChangeP5(v, OPFLAG_P2ISREG); pParse->nTab = 2; sqlite3SelectDestInit(&dest, SRT_Table, 1); sqlite3Select(pParse, pSelect, &dest); sqlite3VdbeAddOp1(v, OP_Close, 1); if( pParse->nErr==0 ){ @@ -2395,13 +2395,11 @@ sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb); } pKey = sqlite3IndexKeyinfo(pParse, pIndex); sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, (char *)pKey, P4_KEYINFO_HANDOFF); - if( memRootPage>=0 ){ - sqlite3VdbeChangeP5(v, 1); - } + sqlite3VdbeChangeP5(v, OPFLAG_BULKCSR|((memRootPage>=0)?OPFLAG_P2ISREG:0)); #ifndef SQLITE_OMIT_MERGE_SORT /* Open the sorter cursor if we are to use one. */ iSorter = pParse->nTab++; sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)pKey, P4_KEYINFO); Index: src/sqliteInt.h ================================================================== --- src/sqliteInt.h +++ src/sqliteInt.h @@ -2315,10 +2315,12 @@ #define OPFLAG_APPEND 0x08 /* This is likely to be an append */ #define OPFLAG_USESEEKRESULT 0x10 /* Try to avoid a seek in BtreeInsert() */ #define OPFLAG_CLEARCACHE 0x20 /* Clear pseudo-table cache in OP_Column */ #define OPFLAG_LENGTHARG 0x40 /* OP_Column only used for length() */ #define OPFLAG_TYPEOFARG 0x80 /* OP_Column only used for typeof() */ +#define OPFLAG_BULKCSR 0x01 /* OP_Open** used to open bulk cursor */ +#define OPFLAG_P2ISREG 0x02 /* P2 to OP_Open** is a register number */ /* * Each trigger present in the database schema is stored as an instance of * struct Trigger. * Index: src/test_vfs.c ================================================================== --- src/test_vfs.c +++ src/test_vfs.c @@ -359,11 +359,12 @@ TestvfsFd *pFd = tvfsGetFd(pFile); Testvfs *p = (Testvfs *)pFd->pVfs->pAppData; if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){ tvfsExecTcl(p, "xWrite", - Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 0 + Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, + Tcl_NewWideIntObj(iOfst) ); tvfsResultCode(p, &rc); } if( rc==SQLITE_OK && tvfsInjectFullerr(p) ){ Index: src/vdbe.c ================================================================== --- src/vdbe.c +++ src/vdbe.c @@ -3118,10 +3118,13 @@ int wrFlag; Btree *pX; VdbeCursor *pCur; Db *pDb; + assert( (pOp->p5&(OPFLAG_P2ISREG|OPFLAG_BULKCSR))==pOp->p5 ); + assert( pOp->opcode==OP_OpenWrite || pOp->p5==0 ); + if( p->expired ){ rc = SQLITE_ABORT; break; } @@ -3141,11 +3144,11 @@ p->minWriteFileFormat = pDb->pSchema->file_format; } }else{ wrFlag = 0; } - if( pOp->p5 ){ + if( pOp->p5 & OPFLAG_P2ISREG ){ assert( p2>0 ); assert( p2<=p->nMem ); pIn2 = &aMem[p2]; assert( memIsValid(pIn2) ); assert( (pIn2->flags & MEM_Int)!=0 ); @@ -3172,10 +3175,12 @@ if( pCur==0 ) goto no_mem; pCur->nullRow = 1; pCur->isOrdered = 1; rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor); pCur->pKeyInfo = pKeyInfo; + assert( OPFLAG_BULKCSR==BTREE_BULKLOAD ); + sqlite3BtreeCursorHints(pCur->pCursor, (pOp->p5 & OPFLAG_BULKCSR)); /* Since it performs no memory allocation or IO, the only value that ** sqlite3BtreeCursor() may return is SQLITE_OK. */ assert( rc==SQLITE_OK ); Index: src/vdbesort.c ================================================================== --- src/vdbesort.c +++ src/vdbesort.c @@ -20,10 +20,11 @@ #ifndef SQLITE_OMIT_MERGE_SORT typedef struct VdbeSorterIter VdbeSorterIter; typedef struct SorterRecord SorterRecord; +typedef struct FileWriter FileWriter; /* ** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES: ** ** As keys are added to the sorter, they are written to disk in a series @@ -117,10 +118,26 @@ int nAlloc; /* Bytes of space at aAlloc */ int nKey; /* Number of bytes in key */ sqlite3_file *pFile; /* File iterator is reading from */ u8 *aAlloc; /* Allocated space */ u8 *aKey; /* Pointer to current key */ + u8 *aBuffer; /* Current read buffer */ + int nBuffer; /* Size of read buffer in bytes */ +}; + +/* +** An instance of this structure is used to separate the stream of records +** being written to files by the merge-sort code into aligned, page-sized +** blocks. +*/ +struct FileWriter { + u8 *aBuffer; /* Pointer to write buffer */ + int nBuffer; /* Size of write buffer in bytes */ + int iBufStart; /* First byte of buffer to write */ + int iBufEnd; /* Last byte of buffer to write */ + i64 iWriteOff; /* Offset of start of buffer in file */ + sqlite3_file *pFile; /* File to write to */ }; /* ** A structure to store a single record. All in-memory records are connected ** together into a linked list headed at VdbeSorter.pRecord using the @@ -142,12 +159,127 @@ ** Free all memory belonging to the VdbeSorterIter object passed as the second ** argument. All structure fields are set to zero before returning. */ static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){ sqlite3DbFree(db, pIter->aAlloc); + sqlite3DbFree(db, pIter->aBuffer); memset(pIter, 0, sizeof(VdbeSorterIter)); } + +/* +** Read nByte bytes of data from the stream of data iterated by object p. +** If successful, set *ppOut to point to a buffer containing the data +** and return SQLITE_OK. Otherwise, if an error occurs, return an SQLite +** error code. +** +** The buffer indicated by *ppOut may only be considered valid until the +** next call to this function. +*/ +static int vdbeSorterIterRead( + sqlite3 *db, /* Database handle (for malloc) */ + VdbeSorterIter *p, /* Iterator */ + int nByte, /* Bytes of data to read */ + u8 **ppOut /* OUT: Pointer to buffer containing data */ +){ + int iBuf; /* Offset within buffer to read from */ + int nAvail; /* Bytes of data available in buffer */ + assert( p->aBuffer ); + + /* If there is no more data to be read from the buffer, read the next + ** p->nBuffer bytes of data from the file into it. Or, if there are less + ** than p->nBuffer bytes remaining in the PMA, read all remaining data. */ + iBuf = p->iReadOff % p->nBuffer; + if( iBuf==0 ){ + int nRead; /* Bytes to read from disk */ + int rc; /* sqlite3OsRead() return code */ + + /* Determine how many bytes of data to read. */ + nRead = p->iEof - p->iReadOff; + if( nRead>p->nBuffer ) nRead = p->nBuffer; + assert( nRead>0 ); + + /* Read data from the file. Return early if an error occurs. */ + rc = sqlite3OsRead(p->pFile, p->aBuffer, nRead, p->iReadOff); + assert( rc!=SQLITE_IOERR_SHORT_READ ); + if( rc!=SQLITE_OK ) return rc; + } + nAvail = p->nBuffer - iBuf; + + if( nByte<=nAvail ){ + /* The requested data is available in the in-memory buffer. In this + ** case there is no need to make a copy of the data, just return a + ** pointer into the buffer to the caller. */ + *ppOut = &p->aBuffer[iBuf]; + p->iReadOff += nByte; + }else{ + /* The requested data is not all available in the in-memory buffer. + ** In this case, allocate space at p->aAlloc[] to copy the requested + ** range into. Then return a copy of pointer p->aAlloc to the caller. */ + int nRem; /* Bytes remaining to copy */ + + /* Extend the p->aAlloc[] allocation if required. */ + if( p->nAllocnAlloc*2; + while( nByte>nNew ) nNew = nNew*2; + p->aAlloc = sqlite3DbReallocOrFree(db, p->aAlloc, nNew); + if( !p->aAlloc ) return SQLITE_NOMEM; + } + + /* Copy as much data as is available in the buffer into the start of + ** p->aAlloc[]. */ + memcpy(p->aAlloc, &p->aBuffer[iBuf], nAvail); + p->iReadOff += nAvail; + nRem = nByte - nAvail; + + /* The following loop copies up to p->nBuffer bytes per iteration into + ** the p->aAlloc[] buffer. */ + while( nRem>0 ){ + int rc; /* vdbeSorterIterRead() return code */ + int nCopy; /* Number of bytes to copy */ + u8 *aNext; /* Pointer to buffer to copy data from */ + + nCopy = nRem; + if( nRem>p->nBuffer ) nCopy = p->nBuffer; + rc = vdbeSorterIterRead(db, p, nCopy, &aNext); + if( rc!=SQLITE_OK ) return rc; + assert( aNext!=p->aAlloc ); + memcpy(&p->aAlloc[nByte - nRem], aNext, nCopy); + nRem -= nCopy; + } + + *ppOut = p->aAlloc; + } + + return SQLITE_OK; +} + +/* +** Read a varint from the stream of data accessed by p. Set *pnOut to +** the value read. +*/ +static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){ + int iBuf; + + iBuf = p->iReadOff % p->nBuffer; + if( iBuf && (p->nBuffer-iBuf)>=9 ){ + p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut); + }else{ + u8 aVarint[9]; + int i; + for(i=0; iiEof>=pIter->iReadOff ); - if( pIter->iEof-pIter->iReadOff>5 ){ - nRead = 5; - }else{ - nRead = (int)(pIter->iEof - pIter->iReadOff); - } - if( nRead<=0 ){ + u64 nRec = 0; /* Size of record in bytes */ + + if( pIter->iReadOff>=pIter->iEof ){ /* This is an EOF condition */ vdbeSorterIterZero(db, pIter); return SQLITE_OK; } - rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff); - if( rc==SQLITE_OK ){ - iOff = getVarint32(pIter->aAlloc, nRec); - if( (iOff+nRec)>nRead ){ - int nRead2; /* Number of extra bytes to read */ - if( (iOff+nRec)>pIter->nAlloc ){ - int nNew = pIter->nAlloc*2; - while( (iOff+nRec)>nNew ) nNew = nNew*2; - pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew); - if( !pIter->aAlloc ) return SQLITE_NOMEM; - pIter->nAlloc = nNew; - } - - nRead2 = iOff + nRec - nRead; - rc = sqlite3OsRead( - pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead - ); - } - } - - assert( rc!=SQLITE_OK || nRec>0 ); - pIter->iReadOff += iOff+nRec; - pIter->nKey = nRec; - pIter->aKey = &pIter->aAlloc[iOff]; - return rc; -} - -/* -** Write a single varint, value iVal, to file-descriptor pFile. Return -** SQLITE_OK if successful, or an SQLite error code if some error occurs. -** -** The value of *piOffset when this function is called is used as the byte -** offset in file pFile to write to. Before returning, *piOffset is -** incremented by the number of bytes written. -*/ -static int vdbeSorterWriteVarint( - sqlite3_file *pFile, /* File to write to */ - i64 iVal, /* Value to write as a varint */ - i64 *piOffset /* IN/OUT: Write offset in file pFile */ -){ - u8 aVarint[9]; /* Buffer large enough for a varint */ - int nVarint; /* Number of used bytes in varint */ - int rc; /* Result of write() call */ - - nVarint = sqlite3PutVarint(aVarint, iVal); - rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset); - *piOffset += nVarint; - - return rc; -} - -/* -** Read a single varint from file-descriptor pFile. Return SQLITE_OK if -** successful, or an SQLite error code if some error occurs. -** -** The value of *piOffset when this function is called is used as the -** byte offset in file pFile from whence to read the varint. If successful -** (i.e. if no IO error occurs), then *piOffset is set to the offset of -** the first byte past the end of the varint before returning. *piVal is -** set to the integer value read. If an error occurs, the final values of -** both *piOffset and *piVal are undefined. -*/ -static int vdbeSorterReadVarint( - sqlite3_file *pFile, /* File to read from */ - i64 *piOffset, /* IN/OUT: Read offset in pFile */ - i64 *piVal /* OUT: Value read from file */ -){ - u8 aVarint[9]; /* Buffer large enough for a varint */ - i64 iOff = *piOffset; /* Offset in file to read from */ - int rc; /* Return code */ - - rc = sqlite3OsRead(pFile, aVarint, 9, iOff); - if( rc==SQLITE_OK ){ - *piOffset += getVarint(aVarint, (u64 *)piVal); + rc = vdbeSorterIterVarint(db, pIter, &nRec); + if( rc==SQLITE_OK ){ + pIter->nKey = (int)nRec; + rc = vdbeSorterIterRead(db, pIter, nRec, &pIter->aKey); } return rc; } @@ -262,26 +316,51 @@ const VdbeSorter *pSorter, /* Sorter object */ i64 iStart, /* Start offset in pFile */ VdbeSorterIter *pIter, /* Iterator to populate */ i64 *pnByte /* IN/OUT: Increment this value by PMA size */ ){ - int rc; + int rc = SQLITE_OK; + int nBuf; + + nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt); assert( pSorter->iWriteOff>iStart ); assert( pIter->aAlloc==0 ); + assert( pIter->aBuffer==0 ); pIter->pFile = pSorter->pTemp1; pIter->iReadOff = iStart; pIter->nAlloc = 128; pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc); - if( !pIter->aAlloc ){ + pIter->nBuffer = nBuf; + pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf); + + if( !pIter->aBuffer ){ rc = SQLITE_NOMEM; }else{ - i64 nByte; /* Total size of PMA in bytes */ - rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte); - *pnByte += nByte; - pIter->iEof = pIter->iReadOff + nByte; + int iBuf; + + iBuf = iStart % nBuf; + if( iBuf ){ + int nRead = nBuf - iBuf; + if( (iStart + nRead) > pSorter->iWriteOff ){ + nRead = pSorter->iWriteOff - iStart; + } + rc = sqlite3OsRead( + pSorter->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart + ); + assert( rc!=SQLITE_IOERR_SHORT_READ ); + } + + if( rc==SQLITE_OK ){ + u64 nByte; /* Size of PMA in bytes */ + pIter->iEof = pSorter->iWriteOff; + rc = vdbeSorterIterVarint(db, pIter, &nByte); + pIter->iEof = pIter->iReadOff + nByte; + *pnByte += nByte; + } } + if( rc==SQLITE_OK ){ rc = vdbeSorterIterNext(db, pIter); } return rc; } @@ -529,10 +608,96 @@ sqlite3_free(aSlot); return SQLITE_OK; } +/* +** Initialize a file-writer object. +*/ +static int fileWriterInit( + sqlite3 *db, /* Database (for malloc) */ + sqlite3_file *pFile, /* File to write to */ + FileWriter *p, /* Object to populate */ + i64 iStart /* Offset of pFile to begin writing at */ +){ + int nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt); + + memset(p, 0, sizeof(FileWriter)); + p->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf); + if( !p->aBuffer ) return SQLITE_NOMEM; + + p->iBufEnd = p->iBufStart = (iStart % nBuf); + p->iWriteOff = iStart - p->iBufStart; + p->nBuffer = nBuf; + p->pFile = pFile; + return SQLITE_OK; +} + +/* +** Write nData bytes of data to the file-write object. Return SQLITE_OK +** if successful, or an SQLite error code if an error occurs. +*/ +static int fileWriterWrite(FileWriter *p, u8 *pData, int nData){ + int nRem = nData; + while( nRem>0 ){ + int nCopy = nRem; + if( nCopy>(p->nBuffer - p->iBufEnd) ){ + nCopy = p->nBuffer - p->iBufEnd; + } + + memcpy(&p->aBuffer[p->iBufEnd], &pData[nData-nRem], nCopy); + p->iBufEnd += nCopy; + if( p->iBufEnd==p->nBuffer ){ + int rc = sqlite3OsWrite(p->pFile, + &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, + p->iWriteOff + p->iBufStart + ); + if( rc!=SQLITE_OK ) return rc; + p->iBufStart = p->iBufEnd = 0; + p->iWriteOff += p->nBuffer; + } + assert( p->iBufEndnBuffer ); + + nRem -= nCopy; + } + + return SQLITE_OK; +} + +/* +** Flush any buffered data to disk and clean up the file-writer object. +** The results of using the file-writer after this call are undefined. +** Return SQLITE_OK if flushing the buffered data succeeds or is not +** required. Otherwise, return an SQLite error code. +** +** Before returning, set *piEof to the offset immediately following the +** last byte written to the file. +*/ +static int fileWriterFinish(sqlite3 *db, FileWriter *p, i64 *piEof){ + int rc = SQLITE_OK; + if( p->aBuffer && p->iBufEnd>p->iBufStart ){ + rc = sqlite3OsWrite(p->pFile, + &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, + p->iWriteOff + p->iBufStart + ); + } + *piEof = (p->iWriteOff + p->iBufEnd); + sqlite3DbFree(db, p->aBuffer); + memset(p, 0, sizeof(FileWriter)); + return rc; +} + +/* +** Write value iVal encoded as a varint to the file-write object. Return +** SQLITE_OK if successful, or an SQLite error code if an error occurs. +*/ +static int fileWriterWriteVarint(FileWriter *p, u64 iVal){ + int nByte; + u8 aByte[10]; + nByte = sqlite3PutVarint(aByte, iVal); + return fileWriterWrite(p, aByte, nByte); +} /* ** Write the current contents of the in-memory linked-list to a PMA. Return ** SQLITE_OK if successful, or an SQLite error code otherwise. ** @@ -545,11 +710,15 @@ ** Each record consists of a varint followed by a blob of data (the ** key). The varint is the number of bytes in the blob of data. */ static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){ int rc = SQLITE_OK; /* Return code */ + int rc2; /* fileWriterFinish return code */ VdbeSorter *pSorter = pCsr->pSorter; + FileWriter writer; + + memset(&writer, 0, sizeof(FileWriter)); if( pSorter->nInMemory==0 ){ assert( pSorter->pRecord==0 ); return rc; } @@ -563,44 +732,35 @@ assert( pSorter->iWriteOff==0 ); assert( pSorter->nPMA==0 ); } if( rc==SQLITE_OK ){ - i64 iOff = pSorter->iWriteOff; + rc = fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff); + } + + if( rc==SQLITE_OK ){ SorterRecord *p; SorterRecord *pNext = 0; - static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + pSorter->nPMA++; - rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff); + rc = fileWriterWriteVarint(&writer, pSorter->nInMemory); for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){ pNext = p->pNext; - rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff); - + rc = fileWriterWriteVarint(&writer, p->nVal); if( rc==SQLITE_OK ){ - rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff); - iOff += p->nVal; + rc = fileWriterWrite(&writer, p->pVal, p->nVal); } sqlite3DbFree(db, p); } - /* This assert verifies that unless an error has occurred, the size of - ** the PMA on disk is the same as the expected size stored in - ** pSorter->nInMemory. */ - assert( rc!=SQLITE_OK || pSorter->nInMemory==( - iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory) - )); - - pSorter->iWriteOff = iOff; - if( rc==SQLITE_OK ){ - /* Terminate each file with 8 extra bytes so that from any offset - ** in the file we can always read 9 bytes without a SHORT_READ error */ - rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff); - } pSorter->pRecord = p; } + + rc2 = fileWriterFinish(db, &writer, &pSorter->iWriteOff); + if( rc==SQLITE_OK ) rc = rc2; return rc; } /* @@ -640,12 +800,18 @@ */ if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && ( (pSorter->nInMemory>pSorter->mxPmaSize) || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull()) )){ +#ifdef SQLITE_DEBUG + i64 nExpect = pSorter->iWriteOff + + sqlite3VarintLen(pSorter->nInMemory) + + pSorter->nInMemory; +#endif rc = vdbeSorterListToPMA(db, pCsr); pSorter->nInMemory = 0; + assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) ); } return rc; } @@ -702,11 +868,11 @@ *pbEof = !pSorter->pRecord; assert( pSorter->aTree==0 ); return vdbeSorterSort(pCsr); } - /* Write the current b-tree to a PMA. Close the b-tree cursor. */ + /* Write the current in-memory list to a PMA. */ rc = vdbeSorterListToPMA(db, pCsr); if( rc!=SQLITE_OK ) return rc; /* Allocate space for aIter[] and aTree[]. */ nIter = pSorter->nPMA; @@ -724,11 +890,15 @@ for(iNew=0; rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNTnPMA; iNew++ ){ + int rc2; /* Return code from fileWriterFinish() */ + FileWriter writer; /* Object used to write to disk */ i64 nWrite; /* Number of bytes in new PMA */ + + memset(&writer, 0, sizeof(FileWriter)); /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1, ** initialize an iterator for each of them and break out of the loop. ** These iterators will be incrementally merged as the VDBE layer calls ** sqlite3VdbeSorterNext(). @@ -748,27 +918,34 @@ assert( iWrite2==0 ); rc = vdbeSorterOpenTempFile(db, &pTemp2); } if( rc==SQLITE_OK ){ - rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2); + rc = fileWriterInit(db, pTemp2, &writer, iWrite2); + } + if( rc==SQLITE_OK ){ + rc = fileWriterWriteVarint(&writer, nWrite); } if( rc==SQLITE_OK ){ int bEof = 0; while( rc==SQLITE_OK && bEof==0 ){ - int nToWrite; VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ]; assert( pIter->pFile ); - nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey); - rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2); - iWrite2 += nToWrite; + + rc = fileWriterWriteVarint(&writer, pIter->nKey); + if( rc==SQLITE_OK ){ + rc = fileWriterWrite(&writer, pIter->aKey, pIter->nKey); + } if( rc==SQLITE_OK ){ rc = sqlite3VdbeSorterNext(db, pCsr, &bEof); } } } + + rc2 = fileWriterFinish(db, &writer, &iWrite2); + if( rc==SQLITE_OK ) rc = rc2; } if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){ break; }else{ Index: test/index4.test ================================================================== --- test/index4.test +++ test/index4.test @@ -14,10 +14,34 @@ set testdir [file dirname $argv0] source $testdir/tester.tcl set testprefix index4 + +#proc str {n} { string range [string repeat [format %.06d. $n] 20] 0 101 } +#db func str str +#do_execsql_test 1.1 { +# BEGIN; +# CREATE TABLE t1(x); +# INSERT INTO t1 VALUES(str(1)); +# INSERT INTO t1 SELECT str(rowid + 1) FROM t1; -- 2 +# INSERT INTO t1 SELECT str(rowid + 2) FROM t1; -- 4 +# INSERT INTO t1 SELECT str(rowid + 4) FROM t1; -- 8 +# INSERT INTO t1 SELECT str(rowid + 8) FROM t1; -- 16 +# INSERT INTO t1 SELECT str(rowid + 16) FROM t1; -- 32 +# INSERT INTO t1 SELECT str(rowid + 32) FROM t1; -- 64 +# INSERT INTO t1 SELECT str(rowid + 64) FROM t1; -- 128 +# INSERT INTO t1 SELECT str(rowid + 128) FROM t1; -- 256 +# INSERT INTO t1 SELECT str(rowid + 256) FROM t1; -- 512 +# INSERT INTO t1 SELECT str(rowid + 512) FROM t1; -- 1024 +# INSERT INTO t1 SELECT str(rowid + 1024) FROM t1; -- 2048 +# INSERT INTO t1 SELECT str(rowid + 2048) FROM t1; -- 4096 +# INSERT INTO t1 SELECT str(rowid + 4096) FROM t1; -- 8192 +# INSERT INTO t1 SELECT str(rowid + 8192) FROM t1; -- 16384 +# INSERT INTO t1 SELECT str(rowid + 16384) FROM t1; -- 32768 +# COMMIT; +#} do_execsql_test 1.1 { BEGIN; CREATE TABLE t1(x); INSERT INTO t1 VALUES(randomblob(102)); ADDED test/index5.test Index: test/index5.test ================================================================== --- /dev/null +++ test/index5.test @@ -0,0 +1,75 @@ +# 2012 August 6 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. +# +#*********************************************************************** +# + + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +set ::testprefix index5 + +do_test 1.1 { + execsql { + PRAGMA page_size = 1024; + CREATE TABLE t1(x); + BEGIN; + } + for {set i 0} {$i < 100000} {incr i} { + execsql { INSERT INTO t1 VALUES(randstr(100,100)) } + } + execsql COMMIT + execsql { + CREATE INDEX i1 ON t1(x); + DROP INDEX I1; + PRAGMA main.page_size; + } +} {1024} + +db close +testvfs tvfs +tvfs filter xWrite +tvfs script write_cb +proc write_cb {xCall file handle iOfst} { + if {[file tail $file]=="test.db"} { + lappend ::write_list [expr $iOfst/1024] + } + puts "$xCall $file $args" +} + +do_test 1.2 { + sqlite3 db test.db -vfs tvfs + set ::write_list [list] + execsql { CREATE INDEX i1 ON t1(x) } +} {} + +do_test 1.3 { + set nForward 0 + set nBackward 0 + set nNoncont 0 + set iPrev [lindex $::write_list 0] + for {set i 1} {$i < [llength $::write_list]} {incr i} { + set iNext [lindex $::write_list $i] + if {$iNext==($iPrev+1)} { + incr nForward + } elseif {$iNext==($iPrev-1)} { + incr nBackward + } else { + incr nNoncont + } + set iPrev $iNext + } + + expr {$nForward > $nBackward} +} {1} +db close +tvfs delete + +finish_test +