/ Check-in [e1e9cb08]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge the sorter-coalesce-writes branch into the trunk. This improves CREATE INDEX performance on some platforms.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:e1e9cb08b011e67b767091e42225f22ec862fa64
User & Date: dan 2012-08-06 19:28:20
Context
2012-08-06
22:29
Modify VSIX package generation tool to put the PDB files in the Debug directory. check-in: 9d072083 user: mistachkin tags: trunk
19:28
Merge the sorter-coalesce-writes branch into the trunk. This improves CREATE INDEX performance on some platforms. check-in: e1e9cb08 user: dan tags: trunk
19:12
Fix a crash that could follow an OOM condition. Closed-Leaf check-in: 2e5741f7 user: dan tags: sorter-coalesce-writes
10:51
Update description strings in the VSIX package. check-in: 541e9310 user: mistachkin tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/analyze.c.

   172    172         ** side-effect of the CREATE TABLE statement is to leave the rootpage 
   173    173         ** of the new table in register pParse->regRoot. This is important 
   174    174         ** because the OpenWrite opcode below will be needing it. */
   175    175         sqlite3NestedParse(pParse,
   176    176             "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols
   177    177         );
   178    178         aRoot[i] = pParse->regRoot;
   179         -      aCreateTbl[i] = 1;
          179  +      aCreateTbl[i] = OPFLAG_P2ISREG;
   180    180       }else{
   181    181         /* The table already exists. If zWhere is not NULL, delete all entries 
   182    182         ** associated with the table zWhere. If zWhere is NULL, delete the
   183    183         ** entire contents of the table. */
   184    184         aRoot[i] = pStat->tnum;
   185    185         sqlite3TableLock(pParse, iDb, aRoot[i], 1, zTab);
   186    186         if( zWhere ){

Changes to src/btree.c.

  5922   5922   ** If aOvflSpace is set to a null pointer, this function returns 
  5923   5923   ** SQLITE_NOMEM.
  5924   5924   */
  5925   5925   static int balance_nonroot(
  5926   5926     MemPage *pParent,               /* Parent page of siblings being balanced */
  5927   5927     int iParentIdx,                 /* Index of "the page" in pParent */
  5928   5928     u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
  5929         -  int isRoot                      /* True if pParent is a root-page */
         5929  +  int isRoot,                     /* True if pParent is a root-page */
         5930  +  int bBulk                       /* True if this call is part of a bulk load */
  5930   5931   ){
  5931   5932     BtShared *pBt;               /* The whole database */
  5932   5933     int nCell = 0;               /* Number of cells in apCell[] */
  5933   5934     int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  5934   5935     int nNew = 0;                /* Number of pages in apNew[] */
  5935   5936     int nOld;                    /* Number of pages in apOld[] */
  5936   5937     int i, j, k;                 /* Loop counters */
................................................................................
  6253   6254         pNew = apNew[i] = apOld[i];
  6254   6255         apOld[i] = 0;
  6255   6256         rc = sqlite3PagerWrite(pNew->pDbPage);
  6256   6257         nNew++;
  6257   6258         if( rc ) goto balance_cleanup;
  6258   6259       }else{
  6259   6260         assert( i>0 );
  6260         -      rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
         6261  +      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
  6261   6262         if( rc ) goto balance_cleanup;
  6262   6263         apNew[i] = pNew;
  6263   6264         nNew++;
  6264   6265   
  6265   6266         /* Set the pointer-map entry for the new sibling page. */
  6266   6267         if( ISAUTOVACUUM ){
  6267   6268           ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
................................................................................
  6703   6704             ** different page). Once this subsequent call to balance_nonroot() 
  6704   6705             ** has completed, it is safe to release the pSpace buffer used by
  6705   6706             ** the previous call, as the overflow cell data will have been 
  6706   6707             ** copied either into the body of a database page or into the new
  6707   6708             ** pSpace buffer passed to the latter call to balance_nonroot().
  6708   6709             */
  6709   6710             u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
  6710         -          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
         6711  +          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);
  6711   6712             if( pFree ){
  6712   6713               /* If pFree is not NULL, it points to the pSpace buffer used 
  6713   6714               ** by a previous call to balance_nonroot(). Its contents are
  6714   6715               ** now stored either on real database pages or within the 
  6715   6716               ** new pSpace buffer, so it may be safely freed here. */
  6716   6717               sqlite3PageFree(pFree);
  6717   6718             }
................................................................................
  8290   8291         }
  8291   8292       }
  8292   8293     }
  8293   8294   
  8294   8295     pBt->btsFlags &= ~BTS_NO_WAL;
  8295   8296     return rc;
  8296   8297   }
         8298  +
         8299  +/*
         8300  +** set the mask of hint flags for cursor pCsr. Currently the only valid
         8301  +** values are 0 and BTREE_BULKLOAD.
         8302  +*/
         8303  +void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
         8304  +  assert( mask==BTREE_BULKLOAD || mask==0 );
         8305  +  pCsr->hints = mask;
         8306  +}
         8307  +

Changes to src/btree.h.

   131    131   #define BTREE_FILE_FORMAT         2
   132    132   #define BTREE_DEFAULT_CACHE_SIZE  3
   133    133   #define BTREE_LARGEST_ROOT_PAGE   4
   134    134   #define BTREE_TEXT_ENCODING       5
   135    135   #define BTREE_USER_VERSION        6
   136    136   #define BTREE_INCR_VACUUM         7
   137    137   
          138  +/*
          139  +** Values that may be OR'd together to form the second argument of an
          140  +** sqlite3BtreeCursorHints() call.
          141  +*/
          142  +#define BTREE_BULKLOAD 0x00000001
          143  +
   138    144   int sqlite3BtreeCursor(
   139    145     Btree*,                              /* BTree containing table to open */
   140    146     int iTable,                          /* Index of root page */
   141    147     int wrFlag,                          /* 1 for writing.  0 for read-only */
   142    148     struct KeyInfo*,                     /* First argument to compare function */
   143    149     BtCursor *pCursor                    /* Space to write cursor structure */
   144    150   );
................................................................................
   174    180   
   175    181   char *sqlite3BtreeIntegrityCheck(Btree*, int *aRoot, int nRoot, int, int*);
   176    182   struct Pager *sqlite3BtreePager(Btree*);
   177    183   
   178    184   int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
   179    185   void sqlite3BtreeCacheOverflow(BtCursor *);
   180    186   void sqlite3BtreeClearCursor(BtCursor *);
   181         -
   182    187   int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);
          188  +void sqlite3BtreeCursorHints(BtCursor *, unsigned int mask);
   183    189   
   184    190   #ifndef NDEBUG
   185    191   int sqlite3BtreeCursorIsValid(BtCursor*);
   186    192   #endif
   187    193   
   188    194   #ifndef SQLITE_OMIT_BTREECOUNT
   189    195   int sqlite3BtreeCount(BtCursor *, i64 *);

Changes to src/btreeInt.h.

   506    506     u8 wrFlag;                /* True if writable */
   507    507     u8 atLast;                /* Cursor pointing to the last entry */
   508    508     u8 validNKey;             /* True if info.nKey is valid */
   509    509     u8 eState;                /* One of the CURSOR_XXX constants (see below) */
   510    510   #ifndef SQLITE_OMIT_INCRBLOB
   511    511     u8 isIncrblobHandle;      /* True if this cursor is an incr. io handle */
   512    512   #endif
          513  +  u8 hints;                             /* As configured by CursorSetHints() */
   513    514     i16 iPage;                            /* Index of current page in apPage */
   514    515     u16 aiIdx[BTCURSOR_MAX_DEPTH];        /* Current index in apPage[i] */
   515    516     MemPage *apPage[BTCURSOR_MAX_DEPTH];  /* Pages from root to current page */
   516    517   };
   517    518   
   518    519   /*
   519    520   ** Potential values for BtCursor.eState.

Changes to src/build.c.

  1577   1577       */
  1578   1578       if( pSelect ){
  1579   1579         SelectDest dest;
  1580   1580         Table *pSelTab;
  1581   1581   
  1582   1582         assert(pParse->nTab==1);
  1583   1583         sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb);
  1584         -      sqlite3VdbeChangeP5(v, 1);
         1584  +      sqlite3VdbeChangeP5(v, OPFLAG_P2ISREG);
  1585   1585         pParse->nTab = 2;
  1586   1586         sqlite3SelectDestInit(&dest, SRT_Table, 1);
  1587   1587         sqlite3Select(pParse, pSelect, &dest);
  1588   1588         sqlite3VdbeAddOp1(v, OP_Close, 1);
  1589   1589         if( pParse->nErr==0 ){
  1590   1590           pSelTab = sqlite3ResultSetOfSelect(pParse, pSelect);
  1591   1591           if( pSelTab==0 ) return;
................................................................................
  2393   2393     }else{
  2394   2394       tnum = pIndex->tnum;
  2395   2395       sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb);
  2396   2396     }
  2397   2397     pKey = sqlite3IndexKeyinfo(pParse, pIndex);
  2398   2398     sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, 
  2399   2399                       (char *)pKey, P4_KEYINFO_HANDOFF);
  2400         -  if( memRootPage>=0 ){
  2401         -    sqlite3VdbeChangeP5(v, 1);
  2402         -  }
         2400  +  sqlite3VdbeChangeP5(v, OPFLAG_BULKCSR|((memRootPage>=0)?OPFLAG_P2ISREG:0));
  2403   2401   
  2404   2402   #ifndef SQLITE_OMIT_MERGE_SORT
  2405   2403     /* Open the sorter cursor if we are to use one. */
  2406   2404     iSorter = pParse->nTab++;
  2407   2405     sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)pKey, P4_KEYINFO);
  2408   2406   #else
  2409   2407     iSorter = iTab;

Changes to src/sqliteInt.h.

  2313   2313   #define OPFLAG_LASTROWID     0x02    /* Set to update db->lastRowid */
  2314   2314   #define OPFLAG_ISUPDATE      0x04    /* This OP_Insert is an sql UPDATE */
  2315   2315   #define OPFLAG_APPEND        0x08    /* This is likely to be an append */
  2316   2316   #define OPFLAG_USESEEKRESULT 0x10    /* Try to avoid a seek in BtreeInsert() */
  2317   2317   #define OPFLAG_CLEARCACHE    0x20    /* Clear pseudo-table cache in OP_Column */
  2318   2318   #define OPFLAG_LENGTHARG     0x40    /* OP_Column only used for length() */
  2319   2319   #define OPFLAG_TYPEOFARG     0x80    /* OP_Column only used for typeof() */
         2320  +#define OPFLAG_BULKCSR       0x01    /* OP_Open** used to open bulk cursor */
         2321  +#define OPFLAG_P2ISREG       0x02    /* P2 to OP_Open** is a register number */
  2320   2322   
  2321   2323   /*
  2322   2324    * Each trigger present in the database schema is stored as an instance of
  2323   2325    * struct Trigger. 
  2324   2326    *
  2325   2327    * Pointers to instances of struct Trigger are stored in two ways.
  2326   2328    * 1. In the "trigHash" hash table (part of the sqlite3* that represents the 

Changes to src/test_vfs.c.

   357    357   ){
   358    358     int rc = SQLITE_OK;
   359    359     TestvfsFd *pFd = tvfsGetFd(pFile);
   360    360     Testvfs *p = (Testvfs *)pFd->pVfs->pAppData;
   361    361   
   362    362     if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){
   363    363       tvfsExecTcl(p, "xWrite", 
   364         -        Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 0
          364  +        Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 
          365  +        Tcl_NewWideIntObj(iOfst)
   365    366       );
   366    367       tvfsResultCode(p, &rc);
   367    368     }
   368    369   
   369    370     if( rc==SQLITE_OK && tvfsInjectFullerr(p) ){
   370    371       rc = SQLITE_FULL;
   371    372     }

Changes to src/vdbe.c.

  3116   3116     int p2;
  3117   3117     int iDb;
  3118   3118     int wrFlag;
  3119   3119     Btree *pX;
  3120   3120     VdbeCursor *pCur;
  3121   3121     Db *pDb;
  3122   3122   
         3123  +  assert( (pOp->p5&(OPFLAG_P2ISREG|OPFLAG_BULKCSR))==pOp->p5 );
         3124  +  assert( pOp->opcode==OP_OpenWrite || pOp->p5==0 );
         3125  +
  3123   3126     if( p->expired ){
  3124   3127       rc = SQLITE_ABORT;
  3125   3128       break;
  3126   3129     }
  3127   3130   
  3128   3131     nField = 0;
  3129   3132     pKeyInfo = 0;
................................................................................
  3139   3142       assert( sqlite3SchemaMutexHeld(db, iDb, 0) );
  3140   3143       if( pDb->pSchema->file_format < p->minWriteFileFormat ){
  3141   3144         p->minWriteFileFormat = pDb->pSchema->file_format;
  3142   3145       }
  3143   3146     }else{
  3144   3147       wrFlag = 0;
  3145   3148     }
  3146         -  if( pOp->p5 ){
         3149  +  if( pOp->p5 & OPFLAG_P2ISREG ){
  3147   3150       assert( p2>0 );
  3148   3151       assert( p2<=p->nMem );
  3149   3152       pIn2 = &aMem[p2];
  3150   3153       assert( memIsValid(pIn2) );
  3151   3154       assert( (pIn2->flags & MEM_Int)!=0 );
  3152   3155       sqlite3VdbeMemIntegerify(pIn2);
  3153   3156       p2 = (int)pIn2->u.i;
................................................................................
  3170   3173     assert( pOp->p1>=0 );
  3171   3174     pCur = allocateCursor(p, pOp->p1, nField, iDb, 1);
  3172   3175     if( pCur==0 ) goto no_mem;
  3173   3176     pCur->nullRow = 1;
  3174   3177     pCur->isOrdered = 1;
  3175   3178     rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor);
  3176   3179     pCur->pKeyInfo = pKeyInfo;
         3180  +  assert( OPFLAG_BULKCSR==BTREE_BULKLOAD );
         3181  +  sqlite3BtreeCursorHints(pCur->pCursor, (pOp->p5 & OPFLAG_BULKCSR));
  3177   3182   
  3178   3183     /* Since it performs no memory allocation or IO, the only value that
  3179   3184     ** sqlite3BtreeCursor() may return is SQLITE_OK. */
  3180   3185     assert( rc==SQLITE_OK );
  3181   3186   
  3182   3187     /* Set the VdbeCursor.isTable and isIndex variables. Previous versions of
  3183   3188     ** SQLite used to check if the root-page flags were sane at this point

Changes to src/vdbesort.c.

    18     18   #include "sqliteInt.h"
    19     19   #include "vdbeInt.h"
    20     20   
    21     21   #ifndef SQLITE_OMIT_MERGE_SORT
    22     22   
    23     23   typedef struct VdbeSorterIter VdbeSorterIter;
    24     24   typedef struct SorterRecord SorterRecord;
           25  +typedef struct FileWriter FileWriter;
    25     26   
    26     27   /*
    27     28   ** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
    28     29   **
    29     30   ** As keys are added to the sorter, they are written to disk in a series
    30     31   ** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
    31     32   ** the same as the cache-size allowed for temporary databases. In order
................................................................................
   115    116     i64 iReadOff;                   /* Current read offset */
   116    117     i64 iEof;                       /* 1 byte past EOF for this iterator */
   117    118     int nAlloc;                     /* Bytes of space at aAlloc */
   118    119     int nKey;                       /* Number of bytes in key */
   119    120     sqlite3_file *pFile;            /* File iterator is reading from */
   120    121     u8 *aAlloc;                     /* Allocated space */
   121    122     u8 *aKey;                       /* Pointer to current key */
          123  +  u8 *aBuffer;                    /* Current read buffer */
          124  +  int nBuffer;                    /* Size of read buffer in bytes */
          125  +};
          126  +
          127  +/*
          128  +** An instance of this structure is used to separate the stream of records
          129  +** being written to files by the merge-sort code into aligned, page-sized
          130  +** blocks.
          131  +*/
          132  +struct FileWriter {
          133  +  u8 *aBuffer;                    /* Pointer to write buffer */
          134  +  int nBuffer;                    /* Size of write buffer in bytes */
          135  +  int iBufStart;                  /* First byte of buffer to write */
          136  +  int iBufEnd;                    /* Last byte of buffer to write */
          137  +  i64 iWriteOff;                  /* Offset of start of buffer in file */
          138  +  sqlite3_file *pFile;            /* File to write to */
   122    139   };
   123    140   
   124    141   /*
   125    142   ** A structure to store a single record. All in-memory records are connected
   126    143   ** together into a linked list headed at VdbeSorter.pRecord using the 
   127    144   ** SorterRecord.pNext pointer.
   128    145   */
................................................................................
   140    157   
   141    158   /*
   142    159   ** Free all memory belonging to the VdbeSorterIter object passed as the second
   143    160   ** argument. All structure fields are set to zero before returning.
   144    161   */
   145    162   static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
   146    163     sqlite3DbFree(db, pIter->aAlloc);
          164  +  sqlite3DbFree(db, pIter->aBuffer);
   147    165     memset(pIter, 0, sizeof(VdbeSorterIter));
   148    166   }
          167  +
          168  +/*
          169  +** Read nByte bytes of data from the stream of data iterated by object p.
          170  +** If successful, set *ppOut to point to a buffer containing the data
          171  +** and return SQLITE_OK. Otherwise, if an error occurs, return an SQLite
          172  +** error code.
          173  +**
          174  +** The buffer indicated by *ppOut may only be considered valid until the
          175  +** next call to this function.
          176  +*/
          177  +static int vdbeSorterIterRead(
          178  +  sqlite3 *db,                    /* Database handle (for malloc) */
          179  +  VdbeSorterIter *p,              /* Iterator */
          180  +  int nByte,                      /* Bytes of data to read */
          181  +  u8 **ppOut                      /* OUT: Pointer to buffer containing data */
          182  +){
          183  +  int iBuf;                       /* Offset within buffer to read from */
          184  +  int nAvail;                     /* Bytes of data available in buffer */
          185  +  assert( p->aBuffer );
          186  +
          187  +  /* If there is no more data to be read from the buffer, read the next 
          188  +  ** p->nBuffer bytes of data from the file into it. Or, if there are less
          189  +  ** than p->nBuffer bytes remaining in the PMA, read all remaining data.  */
          190  +  iBuf = p->iReadOff % p->nBuffer;
          191  +  if( iBuf==0 ){
          192  +    int nRead;                    /* Bytes to read from disk */
          193  +    int rc;                       /* sqlite3OsRead() return code */
          194  +
          195  +    /* Determine how many bytes of data to read. */
          196  +    nRead = p->iEof - p->iReadOff;
          197  +    if( nRead>p->nBuffer ) nRead = p->nBuffer;
          198  +    assert( nRead>0 );
          199  +
          200  +    /* Read data from the file. Return early if an error occurs. */
          201  +    rc = sqlite3OsRead(p->pFile, p->aBuffer, nRead, p->iReadOff);
          202  +    assert( rc!=SQLITE_IOERR_SHORT_READ );
          203  +    if( rc!=SQLITE_OK ) return rc;
          204  +  }
          205  +  nAvail = p->nBuffer - iBuf; 
          206  +
          207  +  if( nByte<=nAvail ){
          208  +    /* The requested data is available in the in-memory buffer. In this
          209  +    ** case there is no need to make a copy of the data, just return a 
          210  +    ** pointer into the buffer to the caller.  */
          211  +    *ppOut = &p->aBuffer[iBuf];
          212  +    p->iReadOff += nByte;
          213  +  }else{
          214  +    /* The requested data is not all available in the in-memory buffer.
          215  +    ** In this case, allocate space at p->aAlloc[] to copy the requested
          216  +    ** range into. Then return a copy of pointer p->aAlloc to the caller.  */
          217  +    int nRem;                     /* Bytes remaining to copy */
          218  +
          219  +    /* Extend the p->aAlloc[] allocation if required. */
          220  +    if( p->nAlloc<nByte ){
          221  +      int nNew = p->nAlloc*2;
          222  +      while( nByte>nNew ) nNew = nNew*2;
          223  +      p->aAlloc = sqlite3DbReallocOrFree(db, p->aAlloc, nNew);
          224  +      if( !p->aAlloc ) return SQLITE_NOMEM;
          225  +      p->nAlloc = nNew;
          226  +    }
          227  +
          228  +    /* Copy as much data as is available in the buffer into the start of
          229  +    ** p->aAlloc[].  */
          230  +    memcpy(p->aAlloc, &p->aBuffer[iBuf], nAvail);
          231  +    p->iReadOff += nAvail;
          232  +    nRem = nByte - nAvail;
          233  +
          234  +    /* The following loop copies up to p->nBuffer bytes per iteration into
          235  +    ** the p->aAlloc[] buffer.  */
          236  +    while( nRem>0 ){
          237  +      int rc;                     /* vdbeSorterIterRead() return code */
          238  +      int nCopy;                  /* Number of bytes to copy */
          239  +      u8 *aNext;                  /* Pointer to buffer to copy data from */
          240  +
          241  +      nCopy = nRem;
          242  +      if( nRem>p->nBuffer ) nCopy = p->nBuffer;
          243  +      rc = vdbeSorterIterRead(db, p, nCopy, &aNext);
          244  +      if( rc!=SQLITE_OK ) return rc;
          245  +      assert( aNext!=p->aAlloc );
          246  +      memcpy(&p->aAlloc[nByte - nRem], aNext, nCopy);
          247  +      nRem -= nCopy;
          248  +    }
          249  +
          250  +    *ppOut = p->aAlloc;
          251  +  }
          252  +
          253  +  return SQLITE_OK;
          254  +}
          255  +
          256  +/*
          257  +** Read a varint from the stream of data accessed by p. Set *pnOut to
          258  +** the value read.
          259  +*/
          260  +static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){
          261  +  int iBuf;
          262  +
          263  +  iBuf = p->iReadOff % p->nBuffer;
          264  +  if( iBuf && (p->nBuffer-iBuf)>=9 ){
          265  +    p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
          266  +  }else{
          267  +    u8 aVarint[9];
          268  +    int i;
          269  +    for(i=0; i<sizeof(aVarint); i++){
          270  +      u8 *a;
          271  +      int rc = vdbeSorterIterRead(db, p, 1, &a);
          272  +      if( rc ) return rc;
          273  +      aVarint[i] = *a;
          274  +      if( (aVarint[i] & 0x80)==0 ) break;
          275  +    }
          276  +    sqlite3GetVarint(aVarint, pnOut);
          277  +  }
          278  +
          279  +  return SQLITE_OK;
          280  +}
          281  +
   149    282   
   150    283   /*
   151    284   ** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
   152    285   ** no error occurs, or an SQLite error code if one does.
   153    286   */
   154    287   static int vdbeSorterIterNext(
   155    288     sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
   156    289     VdbeSorterIter *pIter           /* Iterator to advance */
   157    290   ){
   158    291     int rc;                         /* Return Code */
   159         -  int nRead;                      /* Number of bytes read */
   160         -  int nRec = 0;                   /* Size of record in bytes */
   161         -  int iOff = 0;                   /* Size of serialized size varint in bytes */
          292  +  u64 nRec = 0;                   /* Size of record in bytes */
   162    293   
   163         -  assert( pIter->iEof>=pIter->iReadOff );
   164         -  if( pIter->iEof-pIter->iReadOff>5 ){
   165         -    nRead = 5;
   166         -  }else{
   167         -    nRead = (int)(pIter->iEof - pIter->iReadOff);
   168         -  }
   169         -  if( nRead<=0 ){
          294  +  if( pIter->iReadOff>=pIter->iEof ){
   170    295       /* This is an EOF condition */
   171    296       vdbeSorterIterZero(db, pIter);
   172    297       return SQLITE_OK;
   173    298     }
   174    299   
   175         -  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
          300  +  rc = vdbeSorterIterVarint(db, pIter, &nRec);
   176    301     if( rc==SQLITE_OK ){
   177         -    iOff = getVarint32(pIter->aAlloc, nRec);
   178         -    if( (iOff+nRec)>nRead ){
   179         -      int nRead2;                   /* Number of extra bytes to read */
   180         -      if( (iOff+nRec)>pIter->nAlloc ){
   181         -        int nNew = pIter->nAlloc*2;
   182         -        while( (iOff+nRec)>nNew ) nNew = nNew*2;
   183         -        pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
   184         -        if( !pIter->aAlloc ) return SQLITE_NOMEM;
   185         -        pIter->nAlloc = nNew;
   186         -      }
   187         -  
   188         -      nRead2 = iOff + nRec - nRead;
   189         -      rc = sqlite3OsRead(
   190         -          pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
   191         -      );
   192         -    }
   193         -  }
   194         -
   195         -  assert( rc!=SQLITE_OK || nRec>0 );
   196         -  pIter->iReadOff += iOff+nRec;
   197         -  pIter->nKey = nRec;
   198         -  pIter->aKey = &pIter->aAlloc[iOff];
   199         -  return rc;
   200         -}
   201         -
   202         -/*
   203         -** Write a single varint, value iVal, to file-descriptor pFile. Return
   204         -** SQLITE_OK if successful, or an SQLite error code if some error occurs.
   205         -**
   206         -** The value of *piOffset when this function is called is used as the byte
   207         -** offset in file pFile to write to. Before returning, *piOffset is 
   208         -** incremented by the number of bytes written.
   209         -*/
   210         -static int vdbeSorterWriteVarint(
   211         -  sqlite3_file *pFile,            /* File to write to */
   212         -  i64 iVal,                       /* Value to write as a varint */
   213         -  i64 *piOffset                   /* IN/OUT: Write offset in file pFile */
   214         -){
   215         -  u8 aVarint[9];                  /* Buffer large enough for a varint */
   216         -  int nVarint;                    /* Number of used bytes in varint */
   217         -  int rc;                         /* Result of write() call */
   218         -
   219         -  nVarint = sqlite3PutVarint(aVarint, iVal);
   220         -  rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
   221         -  *piOffset += nVarint;
   222         -
   223         -  return rc;
   224         -}
   225         -
   226         -/*
   227         -** Read a single varint from file-descriptor pFile. Return SQLITE_OK if
   228         -** successful, or an SQLite error code if some error occurs.
   229         -**
   230         -** The value of *piOffset when this function is called is used as the
   231         -** byte offset in file pFile from whence to read the varint. If successful
   232         -** (i.e. if no IO error occurs), then *piOffset is set to the offset of
   233         -** the first byte past the end of the varint before returning. *piVal is
   234         -** set to the integer value read. If an error occurs, the final values of
   235         -** both *piOffset and *piVal are undefined.
   236         -*/
   237         -static int vdbeSorterReadVarint(
   238         -  sqlite3_file *pFile,            /* File to read from */
   239         -  i64 *piOffset,                  /* IN/OUT: Read offset in pFile */
   240         -  i64 *piVal                      /* OUT: Value read from file */
   241         -){
   242         -  u8 aVarint[9];                  /* Buffer large enough for a varint */
   243         -  i64 iOff = *piOffset;           /* Offset in file to read from */
   244         -  int rc;                         /* Return code */
   245         -
   246         -  rc = sqlite3OsRead(pFile, aVarint, 9, iOff);
   247         -  if( rc==SQLITE_OK ){
   248         -    *piOffset += getVarint(aVarint, (u64 *)piVal);
          302  +    pIter->nKey = (int)nRec;
          303  +    rc = vdbeSorterIterRead(db, pIter, nRec, &pIter->aKey);
   249    304     }
   250    305   
   251    306     return rc;
   252    307   }
   253    308   
   254    309   /*
   255    310   ** Initialize iterator pIter to scan through the PMA stored in file pFile
................................................................................
   260    315   static int vdbeSorterIterInit(
   261    316     sqlite3 *db,                    /* Database handle */
   262    317     const VdbeSorter *pSorter,      /* Sorter object */
   263    318     i64 iStart,                     /* Start offset in pFile */
   264    319     VdbeSorterIter *pIter,          /* Iterator to populate */
   265    320     i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
   266    321   ){
   267         -  int rc;
          322  +  int rc = SQLITE_OK;
          323  +  int nBuf;
          324  +
          325  +  nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
   268    326   
   269    327     assert( pSorter->iWriteOff>iStart );
   270    328     assert( pIter->aAlloc==0 );
          329  +  assert( pIter->aBuffer==0 );
   271    330     pIter->pFile = pSorter->pTemp1;
   272    331     pIter->iReadOff = iStart;
   273    332     pIter->nAlloc = 128;
   274    333     pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
   275         -  if( !pIter->aAlloc ){
          334  +  pIter->nBuffer = nBuf;
          335  +  pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
          336  +
          337  +  if( !pIter->aBuffer ){
   276    338       rc = SQLITE_NOMEM;
   277    339     }else{
   278         -    i64 nByte;                         /* Total size of PMA in bytes */
   279         -    rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte);
   280         -    *pnByte += nByte;
   281         -    pIter->iEof = pIter->iReadOff + nByte;
          340  +    int iBuf;
          341  +
          342  +    iBuf = iStart % nBuf;
          343  +    if( iBuf ){
          344  +      int nRead = nBuf - iBuf;
          345  +      if( (iStart + nRead) > pSorter->iWriteOff ){
          346  +        nRead = pSorter->iWriteOff - iStart;
          347  +      }
          348  +      rc = sqlite3OsRead(
          349  +          pSorter->pTemp1, &pIter->aBuffer[iBuf], nRead, iStart
          350  +      );
          351  +      assert( rc!=SQLITE_IOERR_SHORT_READ );
          352  +    }
          353  +
          354  +    if( rc==SQLITE_OK ){
          355  +      u64 nByte;                       /* Size of PMA in bytes */
          356  +      pIter->iEof = pSorter->iWriteOff;
          357  +      rc = vdbeSorterIterVarint(db, pIter, &nByte);
          358  +      pIter->iEof = pIter->iReadOff + nByte;
          359  +      *pnByte += nByte;
          360  +    }
   282    361     }
          362  +
   283    363     if( rc==SQLITE_OK ){
   284    364       rc = vdbeSorterIterNext(db, pIter);
   285    365     }
   286    366     return rc;
   287    367   }
   288    368   
   289    369   
................................................................................
   527    607     }
   528    608     pSorter->pRecord = p;
   529    609   
   530    610     sqlite3_free(aSlot);
   531    611     return SQLITE_OK;
   532    612   }
   533    613   
          614  +/*
          615  +** Initialize a file-writer object.
          616  +*/
          617  +static int fileWriterInit(
          618  +  sqlite3 *db,                    /* Database (for malloc) */
          619  +  sqlite3_file *pFile,            /* File to write to */
          620  +  FileWriter *p,                  /* Object to populate */
          621  +  i64 iStart                      /* Offset of pFile to begin writing at */
          622  +){
          623  +  int nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
          624  +
          625  +  memset(p, 0, sizeof(FileWriter));
          626  +  p->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
          627  +  if( !p->aBuffer ) return SQLITE_NOMEM;
          628  +
          629  +  p->iBufEnd = p->iBufStart = (iStart % nBuf);
          630  +  p->iWriteOff = iStart - p->iBufStart;
          631  +  p->nBuffer = nBuf;
          632  +  p->pFile = pFile;
          633  +  return SQLITE_OK;
          634  +}
          635  +
          636  +/*
          637  +** Write nData bytes of data to the file-write object. Return SQLITE_OK
          638  +** if successful, or an SQLite error code if an error occurs.
          639  +*/
          640  +static int fileWriterWrite(FileWriter *p, u8 *pData, int nData){
          641  +  int nRem = nData;
          642  +  while( nRem>0 ){
          643  +    int nCopy = nRem;
          644  +    if( nCopy>(p->nBuffer - p->iBufEnd) ){
          645  +      nCopy = p->nBuffer - p->iBufEnd;
          646  +    }
          647  +
          648  +    memcpy(&p->aBuffer[p->iBufEnd], &pData[nData-nRem], nCopy);
          649  +    p->iBufEnd += nCopy;
          650  +    if( p->iBufEnd==p->nBuffer ){
          651  +      int rc = sqlite3OsWrite(p->pFile, 
          652  +          &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
          653  +          p->iWriteOff + p->iBufStart
          654  +      );
          655  +      if( rc!=SQLITE_OK ) return rc;
          656  +      p->iBufStart = p->iBufEnd = 0;
          657  +      p->iWriteOff += p->nBuffer;
          658  +    }
          659  +    assert( p->iBufEnd<p->nBuffer );
          660  +
          661  +    nRem -= nCopy;
          662  +  }
          663  +
          664  +  return SQLITE_OK;
          665  +}
          666  +
          667  +/*
          668  +** Flush any buffered data to disk and clean up the file-writer object.
          669  +** The results of using the file-writer after this call are undefined.
          670  +** Return SQLITE_OK if flushing the buffered data succeeds or is not 
          671  +** required. Otherwise, return an SQLite error code.
          672  +**
          673  +** Before returning, set *piEof to the offset immediately following the
          674  +** last byte written to the file.
          675  +*/
          676  +static int fileWriterFinish(sqlite3 *db, FileWriter *p, i64 *piEof){
          677  +  int rc = SQLITE_OK;
          678  +  if( p->aBuffer && p->iBufEnd>p->iBufStart ){
          679  +    rc = sqlite3OsWrite(p->pFile, 
          680  +        &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
          681  +        p->iWriteOff + p->iBufStart
          682  +    );
          683  +  }
          684  +  *piEof = (p->iWriteOff + p->iBufEnd);
          685  +  sqlite3DbFree(db, p->aBuffer);
          686  +  memset(p, 0, sizeof(FileWriter));
          687  +  return rc;
          688  +}
          689  +
          690  +/*
          691  +** Write value iVal encoded as a varint to the file-write object. Return 
          692  +** SQLITE_OK if successful, or an SQLite error code if an error occurs.
          693  +*/
          694  +static int fileWriterWriteVarint(FileWriter *p, u64 iVal){
          695  +  int nByte; 
          696  +  u8 aByte[10];
          697  +  nByte = sqlite3PutVarint(aByte, iVal);
          698  +  return fileWriterWrite(p, aByte, nByte);
          699  +}
   534    700   
   535    701   /*
   536    702   ** Write the current contents of the in-memory linked-list to a PMA. Return
   537    703   ** SQLITE_OK if successful, or an SQLite error code otherwise.
   538    704   **
   539    705   ** The format of a PMA is:
   540    706   **
................................................................................
   543    709   **
   544    710   **     * One or more records packed end-to-end in order of ascending keys. 
   545    711   **       Each record consists of a varint followed by a blob of data (the 
   546    712   **       key). The varint is the number of bytes in the blob of data.
   547    713   */
   548    714   static int vdbeSorterListToPMA(sqlite3 *db, const VdbeCursor *pCsr){
   549    715     int rc = SQLITE_OK;             /* Return code */
          716  +  int rc2;                        /* fileWriterFinish return code */
   550    717     VdbeSorter *pSorter = pCsr->pSorter;
          718  +  FileWriter writer;
          719  +
          720  +  memset(&writer, 0, sizeof(FileWriter));
   551    721   
   552    722     if( pSorter->nInMemory==0 ){
   553    723       assert( pSorter->pRecord==0 );
   554    724       return rc;
   555    725     }
   556    726   
   557    727     rc = vdbeSorterSort(pCsr);
................................................................................
   561    731       rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
   562    732       assert( rc!=SQLITE_OK || pSorter->pTemp1 );
   563    733       assert( pSorter->iWriteOff==0 );
   564    734       assert( pSorter->nPMA==0 );
   565    735     }
   566    736   
   567    737     if( rc==SQLITE_OK ){
   568         -    i64 iOff = pSorter->iWriteOff;
          738  +    rc = fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
          739  +  }
          740  +
          741  +  if( rc==SQLITE_OK ){
   569    742       SorterRecord *p;
   570    743       SorterRecord *pNext = 0;
   571         -    static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
          744  +
   572    745   
   573    746       pSorter->nPMA++;
   574         -    rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff);
          747  +    rc = fileWriterWriteVarint(&writer, pSorter->nInMemory);
   575    748       for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
   576    749         pNext = p->pNext;
   577         -      rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff);
   578         -
          750  +      rc = fileWriterWriteVarint(&writer, p->nVal);
   579    751         if( rc==SQLITE_OK ){
   580         -        rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff);
   581         -        iOff += p->nVal;
          752  +        rc = fileWriterWrite(&writer, p->pVal, p->nVal);
   582    753         }
   583    754   
   584    755         sqlite3DbFree(db, p);
   585    756       }
   586    757   
   587         -    /* This assert verifies that unless an error has occurred, the size of 
   588         -    ** the PMA on disk is the same as the expected size stored in
   589         -    ** pSorter->nInMemory. */ 
   590         -    assert( rc!=SQLITE_OK || pSorter->nInMemory==(
   591         -          iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory)
   592         -    ));
   593         -
   594         -    pSorter->iWriteOff = iOff;
   595         -    if( rc==SQLITE_OK ){
   596         -      /* Terminate each file with 8 extra bytes so that from any offset
   597         -      ** in the file we can always read 9 bytes without a SHORT_READ error */
   598         -      rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff);
   599         -    }
   600    758       pSorter->pRecord = p;
   601    759     }
          760  +
          761  +  rc2 = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
          762  +  if( rc==SQLITE_OK ) rc = rc2;
   602    763   
   603    764     return rc;
   604    765   }
   605    766   
   606    767   /*
   607    768   ** Add a record to the sorter.
   608    769   */
................................................................................
   638    799     **   * The total memory allocated for the in-memory list is greater 
   639    800     **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
   640    801     */
   641    802     if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
   642    803           (pSorter->nInMemory>pSorter->mxPmaSize)
   643    804        || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
   644    805     )){
          806  +#ifdef SQLITE_DEBUG
          807  +    i64 nExpect = pSorter->iWriteOff
          808  +                + sqlite3VarintLen(pSorter->nInMemory)
          809  +                + pSorter->nInMemory;
          810  +#endif
   645    811       rc = vdbeSorterListToPMA(db, pCsr);
   646    812       pSorter->nInMemory = 0;
          813  +    assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
   647    814     }
   648    815   
   649    816     return rc;
   650    817   }
   651    818   
   652    819   /*
   653    820   ** Helper function for sqlite3VdbeSorterRewind(). 
................................................................................
   700    867     ** from the in-memory list.  */
   701    868     if( pSorter->nPMA==0 ){
   702    869       *pbEof = !pSorter->pRecord;
   703    870       assert( pSorter->aTree==0 );
   704    871       return vdbeSorterSort(pCsr);
   705    872     }
   706    873   
   707         -  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
          874  +  /* Write the current in-memory list to a PMA. */
   708    875     rc = vdbeSorterListToPMA(db, pCsr);
   709    876     if( rc!=SQLITE_OK ) return rc;
   710    877   
   711    878     /* Allocate space for aIter[] and aTree[]. */
   712    879     nIter = pSorter->nPMA;
   713    880     if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
   714    881     assert( nIter>0 );
................................................................................
   722    889     do {
   723    890       int iNew;                     /* Index of new, merged, PMA */
   724    891   
   725    892       for(iNew=0; 
   726    893           rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA; 
   727    894           iNew++
   728    895       ){
          896  +      int rc2;                    /* Return code from fileWriterFinish() */
          897  +      FileWriter writer;          /* Object used to write to disk */
   729    898         i64 nWrite;                 /* Number of bytes in new PMA */
          899  +
          900  +      memset(&writer, 0, sizeof(FileWriter));
   730    901   
   731    902         /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
   732    903         ** initialize an iterator for each of them and break out of the loop.
   733    904         ** These iterators will be incrementally merged as the VDBE layer calls
   734    905         ** sqlite3VdbeSorterNext().
   735    906         **
   736    907         ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
................................................................................
   746    917         /* Open the second temp file, if it is not already open. */
   747    918         if( pTemp2==0 ){
   748    919           assert( iWrite2==0 );
   749    920           rc = vdbeSorterOpenTempFile(db, &pTemp2);
   750    921         }
   751    922   
   752    923         if( rc==SQLITE_OK ){
   753         -        rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
          924  +        rc = fileWriterInit(db, pTemp2, &writer, iWrite2);
          925  +      }
          926  +      if( rc==SQLITE_OK ){
          927  +        rc = fileWriterWriteVarint(&writer, nWrite);
   754    928         }
   755    929   
   756    930         if( rc==SQLITE_OK ){
   757    931           int bEof = 0;
   758    932           while( rc==SQLITE_OK && bEof==0 ){
   759         -          int nToWrite;
   760    933             VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
   761    934             assert( pIter->pFile );
   762         -          nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey);
   763         -          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2);
   764         -          iWrite2 += nToWrite;
          935  +
          936  +          rc = fileWriterWriteVarint(&writer, pIter->nKey);
          937  +          if( rc==SQLITE_OK ){
          938  +            rc = fileWriterWrite(&writer, pIter->aKey, pIter->nKey);
          939  +          }
   765    940             if( rc==SQLITE_OK ){
   766    941               rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
   767    942             }
   768    943           }
   769    944         }
          945  +
          946  +      rc2 = fileWriterFinish(db, &writer, &iWrite2);
          947  +      if( rc==SQLITE_OK ) rc = rc2;
   770    948       }
   771    949   
   772    950       if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
   773    951         break;
   774    952       }else{
   775    953         sqlite3_file *pTmp = pSorter->pTemp1;
   776    954         pSorter->nPMA = iNew;

Added test/index5.test.

            1  +# 2012 August 6
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +#
           12  +
           13  +
           14  +set testdir [file dirname $argv0]
           15  +source $testdir/tester.tcl
           16  +set ::testprefix index5
           17  +
           18  +do_test 1.1 {
           19  +  execsql {
           20  +    PRAGMA page_size = 1024;
           21  +    CREATE TABLE t1(x);
           22  +    BEGIN;
           23  +  }
           24  +  for {set i 0} {$i < 100000} {incr i} {
           25  +    execsql { INSERT INTO t1 VALUES(randstr(100,100)) }
           26  +  }
           27  +  execsql COMMIT
           28  +  execsql { 
           29  +    CREATE INDEX i1 ON t1(x);
           30  +    DROP INDEX I1;
           31  +    PRAGMA main.page_size;
           32  +  }
           33  +} {1024}
           34  +
           35  +db close
           36  +testvfs tvfs
           37  +tvfs filter xWrite
           38  +tvfs script write_cb
           39  +proc write_cb {xCall file handle iOfst} {
           40  +  if {[file tail $file]=="test.db"} {
           41  +    lappend ::write_list [expr $iOfst/1024]
           42  +  }
           43  +  puts "$xCall $file $args"
           44  +}
           45  +
           46  +do_test 1.2 {
           47  +  sqlite3 db test.db -vfs tvfs
           48  +  set ::write_list [list]
           49  +  execsql { CREATE INDEX i1 ON t1(x) }
           50  +} {}
           51  +
           52  +do_test 1.3 {
           53  +  set nForward 0
           54  +  set nBackward 0
           55  +  set nNoncont 0
           56  +  set iPrev [lindex $::write_list 0]
           57  +  for {set i 1} {$i < [llength $::write_list]} {incr i} {
           58  +    set iNext [lindex $::write_list $i]
           59  +    if {$iNext==($iPrev+1)} {
           60  +      incr nForward
           61  +    } elseif {$iNext==($iPrev-1)} { 
           62  +      incr nBackward 
           63  +    } else {
           64  +      incr nNoncont
           65  +    }
           66  +    set iPrev $iNext
           67  +  }
           68  +
           69  +  expr {$nForward > $nBackward}
           70  +} {1}
           71  +db close
           72  +tvfs delete
           73  +
           74  +finish_test
           75  +