/ Check-in [d045f8b2]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:When reusing pages as part of creating a new index, allocate the leaves from each free-list trunk page in ascending order, instead of trying to maximize localization for each individual allocation. This increases the chance that pages will be written to disk in ascending order by a large CREATE INDEX statement, improving overall performance.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | sorter-coalesce-writes
Files: files | file ages | folders
SHA1: d045f8b2d44e388d8c4549ff02d4ca7eff4e2038
User & Date: dan 2012-08-06 18:50:11
Context
2012-08-06
19:12
Fix a crash that could follow an OOM condition. Closed-Leaf check-in: 2e5741f7 user: dan tags: sorter-coalesce-writes
18:50
When reusing pages as part of creating a new index, allocate the leaves from each free-list trunk page in ascending order, instead of trying to maximize localization for each individual allocation. This increases the chance that pages will be written to disk in ascending order by a large CREATE INDEX statement, improving overall performance. check-in: d045f8b2 user: dan tags: sorter-coalesce-writes
18:10
Update sorter-coalesce-writes branch with latest trunk changes. check-in: 214f8cda user: dan tags: sorter-coalesce-writes
Changes
Hide Diffs Side-by-Side Diffs Show Whitespace Changes Patch

Changes to src/analyze.c.

   172    172         ** side-effect of the CREATE TABLE statement is to leave the rootpage 
   173    173         ** of the new table in register pParse->regRoot. This is important 
   174    174         ** because the OpenWrite opcode below will be needing it. */
   175    175         sqlite3NestedParse(pParse,
   176    176             "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols
   177    177         );
   178    178         aRoot[i] = pParse->regRoot;
   179         -      aCreateTbl[i] = 1;
          179  +      aCreateTbl[i] = OPFLAG_P2ISREG;
   180    180       }else{
   181    181         /* The table already exists. If zWhere is not NULL, delete all entries 
   182    182         ** associated with the table zWhere. If zWhere is NULL, delete the
   183    183         ** entire contents of the table. */
   184    184         aRoot[i] = pStat->tnum;
   185    185         sqlite3TableLock(pParse, iDb, aRoot[i], 1, zTab);
   186    186         if( zWhere ){

Changes to src/btree.c.

  5922   5922   ** If aOvflSpace is set to a null pointer, this function returns 
  5923   5923   ** SQLITE_NOMEM.
  5924   5924   */
  5925   5925   static int balance_nonroot(
  5926   5926     MemPage *pParent,               /* Parent page of siblings being balanced */
  5927   5927     int iParentIdx,                 /* Index of "the page" in pParent */
  5928   5928     u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
  5929         -  int isRoot                      /* True if pParent is a root-page */
         5929  +  int isRoot,                     /* True if pParent is a root-page */
         5930  +  int bBulk                       /* True if this call is part of a bulk load */
  5930   5931   ){
  5931   5932     BtShared *pBt;               /* The whole database */
  5932   5933     int nCell = 0;               /* Number of cells in apCell[] */
  5933   5934     int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
  5934   5935     int nNew = 0;                /* Number of pages in apNew[] */
  5935   5936     int nOld;                    /* Number of pages in apOld[] */
  5936   5937     int i, j, k;                 /* Loop counters */
................................................................................
  6253   6254         pNew = apNew[i] = apOld[i];
  6254   6255         apOld[i] = 0;
  6255   6256         rc = sqlite3PagerWrite(pNew->pDbPage);
  6256   6257         nNew++;
  6257   6258         if( rc ) goto balance_cleanup;
  6258   6259       }else{
  6259   6260         assert( i>0 );
  6260         -      rc = allocateBtreePage(pBt, &pNew, &pgno, pgno, 0);
         6261  +      rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
  6261   6262         if( rc ) goto balance_cleanup;
  6262   6263         apNew[i] = pNew;
  6263   6264         nNew++;
  6264   6265   
  6265   6266         /* Set the pointer-map entry for the new sibling page. */
  6266   6267         if( ISAUTOVACUUM ){
  6267   6268           ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
................................................................................
  6703   6704             ** different page). Once this subsequent call to balance_nonroot() 
  6704   6705             ** has completed, it is safe to release the pSpace buffer used by
  6705   6706             ** the previous call, as the overflow cell data will have been 
  6706   6707             ** copied either into the body of a database page or into the new
  6707   6708             ** pSpace buffer passed to the latter call to balance_nonroot().
  6708   6709             */
  6709   6710             u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
  6710         -          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1);
         6711  +          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints);
  6711   6712             if( pFree ){
  6712   6713               /* If pFree is not NULL, it points to the pSpace buffer used 
  6713   6714               ** by a previous call to balance_nonroot(). Its contents are
  6714   6715               ** now stored either on real database pages or within the 
  6715   6716               ** new pSpace buffer, so it may be safely freed here. */
  6716   6717               sqlite3PageFree(pFree);
  6717   6718             }
................................................................................
  8290   8291         }
  8291   8292       }
  8292   8293     }
  8293   8294   
  8294   8295     pBt->btsFlags &= ~BTS_NO_WAL;
  8295   8296     return rc;
  8296   8297   }
         8298  +
         8299  +/*
         8300  +** set the mask of hint flags for cursor pCsr. Currently the only valid
         8301  +** values are 0 and BTREE_BULKLOAD.
         8302  +*/
         8303  +void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){
         8304  +  assert( mask==BTREE_BULKLOAD || mask==0 );
         8305  +  pCsr->hints = mask;
         8306  +}
         8307  +

Changes to src/btree.h.

   131    131   #define BTREE_FILE_FORMAT         2
   132    132   #define BTREE_DEFAULT_CACHE_SIZE  3
   133    133   #define BTREE_LARGEST_ROOT_PAGE   4
   134    134   #define BTREE_TEXT_ENCODING       5
   135    135   #define BTREE_USER_VERSION        6
   136    136   #define BTREE_INCR_VACUUM         7
   137    137   
          138  +/*
          139  +** Values that may be OR'd together to form the second argument of an
          140  +** sqlite3BtreeCursorHints() call.
          141  +*/
          142  +#define BTREE_BULKLOAD 0x00000001
          143  +
   138    144   int sqlite3BtreeCursor(
   139    145     Btree*,                              /* BTree containing table to open */
   140    146     int iTable,                          /* Index of root page */
   141    147     int wrFlag,                          /* 1 for writing.  0 for read-only */
   142    148     struct KeyInfo*,                     /* First argument to compare function */
   143    149     BtCursor *pCursor                    /* Space to write cursor structure */
   144    150   );
................................................................................
   174    180   
   175    181   char *sqlite3BtreeIntegrityCheck(Btree*, int *aRoot, int nRoot, int, int*);
   176    182   struct Pager *sqlite3BtreePager(Btree*);
   177    183   
   178    184   int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
   179    185   void sqlite3BtreeCacheOverflow(BtCursor *);
   180    186   void sqlite3BtreeClearCursor(BtCursor *);
   181         -
   182    187   int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);
          188  +void sqlite3BtreeCursorHints(BtCursor *, unsigned int mask);
   183    189   
   184    190   #ifndef NDEBUG
   185    191   int sqlite3BtreeCursorIsValid(BtCursor*);
   186    192   #endif
   187    193   
   188    194   #ifndef SQLITE_OMIT_BTREECOUNT
   189    195   int sqlite3BtreeCount(BtCursor *, i64 *);

Changes to src/btreeInt.h.

   506    506     u8 wrFlag;                /* True if writable */
   507    507     u8 atLast;                /* Cursor pointing to the last entry */
   508    508     u8 validNKey;             /* True if info.nKey is valid */
   509    509     u8 eState;                /* One of the CURSOR_XXX constants (see below) */
   510    510   #ifndef SQLITE_OMIT_INCRBLOB
   511    511     u8 isIncrblobHandle;      /* True if this cursor is an incr. io handle */
   512    512   #endif
          513  +  u8 hints;                             /* As configured by CursorSetHints() */
   513    514     i16 iPage;                            /* Index of current page in apPage */
   514    515     u16 aiIdx[BTCURSOR_MAX_DEPTH];        /* Current index in apPage[i] */
   515    516     MemPage *apPage[BTCURSOR_MAX_DEPTH];  /* Pages from root to current page */
   516    517   };
   517    518   
   518    519   /*
   519    520   ** Potential values for BtCursor.eState.

Changes to src/build.c.

  1577   1577       */
  1578   1578       if( pSelect ){
  1579   1579         SelectDest dest;
  1580   1580         Table *pSelTab;
  1581   1581   
  1582   1582         assert(pParse->nTab==1);
  1583   1583         sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb);
  1584         -      sqlite3VdbeChangeP5(v, 1);
         1584  +      sqlite3VdbeChangeP5(v, OPFLAG_P2ISREG);
  1585   1585         pParse->nTab = 2;
  1586   1586         sqlite3SelectDestInit(&dest, SRT_Table, 1);
  1587   1587         sqlite3Select(pParse, pSelect, &dest);
  1588   1588         sqlite3VdbeAddOp1(v, OP_Close, 1);
  1589   1589         if( pParse->nErr==0 ){
  1590   1590           pSelTab = sqlite3ResultSetOfSelect(pParse, pSelect);
  1591   1591           if( pSelTab==0 ) return;
................................................................................
  2393   2393     }else{
  2394   2394       tnum = pIndex->tnum;
  2395   2395       sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb);
  2396   2396     }
  2397   2397     pKey = sqlite3IndexKeyinfo(pParse, pIndex);
  2398   2398     sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, 
  2399   2399                       (char *)pKey, P4_KEYINFO_HANDOFF);
  2400         -  if( memRootPage>=0 ){
  2401         -    sqlite3VdbeChangeP5(v, 1);
  2402         -  }
         2400  +  sqlite3VdbeChangeP5(v, OPFLAG_BULKCSR|((memRootPage>=0)?OPFLAG_P2ISREG:0));
  2403   2401   
  2404   2402   #ifndef SQLITE_OMIT_MERGE_SORT
  2405   2403     /* Open the sorter cursor if we are to use one. */
  2406   2404     iSorter = pParse->nTab++;
  2407   2405     sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)pKey, P4_KEYINFO);
  2408   2406   #else
  2409   2407     iSorter = iTab;

Changes to src/sqliteInt.h.

  2313   2313   #define OPFLAG_LASTROWID     0x02    /* Set to update db->lastRowid */
  2314   2314   #define OPFLAG_ISUPDATE      0x04    /* This OP_Insert is an sql UPDATE */
  2315   2315   #define OPFLAG_APPEND        0x08    /* This is likely to be an append */
  2316   2316   #define OPFLAG_USESEEKRESULT 0x10    /* Try to avoid a seek in BtreeInsert() */
  2317   2317   #define OPFLAG_CLEARCACHE    0x20    /* Clear pseudo-table cache in OP_Column */
  2318   2318   #define OPFLAG_LENGTHARG     0x40    /* OP_Column only used for length() */
  2319   2319   #define OPFLAG_TYPEOFARG     0x80    /* OP_Column only used for typeof() */
         2320  +#define OPFLAG_BULKCSR       0x01    /* OP_Open** used to open bulk cursor */
         2321  +#define OPFLAG_P2ISREG       0x02    /* P2 to OP_Open** is a register number */
  2320   2322   
  2321   2323   /*
  2322   2324    * Each trigger present in the database schema is stored as an instance of
  2323   2325    * struct Trigger. 
  2324   2326    *
  2325   2327    * Pointers to instances of struct Trigger are stored in two ways.
  2326   2328    * 1. In the "trigHash" hash table (part of the sqlite3* that represents the 

Changes to src/test_vfs.c.

   357    357   ){
   358    358     int rc = SQLITE_OK;
   359    359     TestvfsFd *pFd = tvfsGetFd(pFile);
   360    360     Testvfs *p = (Testvfs *)pFd->pVfs->pAppData;
   361    361   
   362    362     if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){
   363    363       tvfsExecTcl(p, "xWrite", 
   364         -        Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 0
          364  +        Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, 
          365  +        Tcl_NewWideIntObj(iOfst)
   365    366       );
   366    367       tvfsResultCode(p, &rc);
   367    368     }
   368    369   
   369    370     if( rc==SQLITE_OK && tvfsInjectFullerr(p) ){
   370    371       rc = SQLITE_FULL;
   371    372     }

Changes to src/vdbe.c.

  3116   3116     int p2;
  3117   3117     int iDb;
  3118   3118     int wrFlag;
  3119   3119     Btree *pX;
  3120   3120     VdbeCursor *pCur;
  3121   3121     Db *pDb;
  3122   3122   
         3123  +  assert( (pOp->p5&(OPFLAG_P2ISREG|OPFLAG_BULKCSR))==pOp->p5 );
         3124  +  assert( pOp->opcode==OP_OpenWrite || pOp->p5==0 );
         3125  +
  3123   3126     if( p->expired ){
  3124   3127       rc = SQLITE_ABORT;
  3125   3128       break;
  3126   3129     }
  3127   3130   
  3128   3131     nField = 0;
  3129   3132     pKeyInfo = 0;
................................................................................
  3139   3142       assert( sqlite3SchemaMutexHeld(db, iDb, 0) );
  3140   3143       if( pDb->pSchema->file_format < p->minWriteFileFormat ){
  3141   3144         p->minWriteFileFormat = pDb->pSchema->file_format;
  3142   3145       }
  3143   3146     }else{
  3144   3147       wrFlag = 0;
  3145   3148     }
  3146         -  if( pOp->p5 ){
         3149  +  if( pOp->p5 & OPFLAG_P2ISREG ){
  3147   3150       assert( p2>0 );
  3148   3151       assert( p2<=p->nMem );
  3149   3152       pIn2 = &aMem[p2];
  3150   3153       assert( memIsValid(pIn2) );
  3151   3154       assert( (pIn2->flags & MEM_Int)!=0 );
  3152   3155       sqlite3VdbeMemIntegerify(pIn2);
  3153   3156       p2 = (int)pIn2->u.i;
................................................................................
  3170   3173     assert( pOp->p1>=0 );
  3171   3174     pCur = allocateCursor(p, pOp->p1, nField, iDb, 1);
  3172   3175     if( pCur==0 ) goto no_mem;
  3173   3176     pCur->nullRow = 1;
  3174   3177     pCur->isOrdered = 1;
  3175   3178     rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor);
  3176   3179     pCur->pKeyInfo = pKeyInfo;
         3180  +  assert( OPFLAG_BULKCSR==BTREE_BULKLOAD );
         3181  +  sqlite3BtreeCursorHints(pCur->pCursor, (pOp->p5 & OPFLAG_BULKCSR));
  3177   3182   
  3178   3183     /* Since it performs no memory allocation or IO, the only value that
  3179   3184     ** sqlite3BtreeCursor() may return is SQLITE_OK. */
  3180   3185     assert( rc==SQLITE_OK );
  3181   3186   
  3182   3187     /* Set the VdbeCursor.isTable and isIndex variables. Previous versions of
  3183   3188     ** SQLite used to check if the root-page flags were sane at this point

Added test/index5.test.

            1  +# 2012 August 6
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +#
           12  +
           13  +
           14  +set testdir [file dirname $argv0]
           15  +source $testdir/tester.tcl
           16  +set ::testprefix index5
           17  +
           18  +do_test 1.1 {
           19  +  execsql {
           20  +    PRAGMA page_size = 1024;
           21  +    CREATE TABLE t1(x);
           22  +    BEGIN;
           23  +  }
           24  +  for {set i 0} {$i < 100000} {incr i} {
           25  +    execsql { INSERT INTO t1 VALUES(randstr(100,100)) }
           26  +  }
           27  +  execsql COMMIT
           28  +  execsql { 
           29  +    CREATE INDEX i1 ON t1(x);
           30  +    DROP INDEX I1;
           31  +    PRAGMA main.page_size;
           32  +  }
           33  +} {1024}
           34  +
           35  +db close
           36  +testvfs tvfs
           37  +tvfs filter xWrite
           38  +tvfs script write_cb
           39  +proc write_cb {xCall file handle iOfst} {
           40  +  if {[file tail $file]=="test.db"} {
           41  +    lappend ::write_list [expr $iOfst/1024]
           42  +  }
           43  +  puts "$xCall $file $args"
           44  +}
           45  +
           46  +do_test 1.2 {
           47  +  sqlite3 db test.db -vfs tvfs
           48  +  set ::write_list [list]
           49  +  execsql { CREATE INDEX i1 ON t1(x) }
           50  +} {}
           51  +
           52  +do_test 1.3 {
           53  +  set nForward 0
           54  +  set nBackward 0
           55  +  set nNoncont 0
           56  +  set iPrev [lindex $::write_list 0]
           57  +  for {set i 1} {$i < [llength $::write_list]} {incr i} {
           58  +    set iNext [lindex $::write_list $i]
           59  +    if {$iNext==($iPrev+1)} {
           60  +      incr nForward
           61  +    } elseif {$iNext==($iPrev-1)} { 
           62  +      incr nBackward 
           63  +    } else {
           64  +      incr nNoncont
           65  +    }
           66  +    set iPrev $iNext
           67  +  }
           68  +
           69  +  expr {$nForward > $nBackward}
           70  +} {1}
           71  +db close
           72  +tvfs delete
           73  +
           74  +finish_test
           75  +