Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | When reusing pages as part of creating a new index, allocate the leaves from each free-list trunk page in ascending order, instead of trying to maximize localization for each individual allocation. This increases the chance that pages will be written to disk in ascending order by a large CREATE INDEX statement, improving overall performance. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | sorter-coalesce-writes |
Files: | files | file ages | folders |
SHA1: |
d045f8b2d44e388d8c4549ff02d4ca7e |
User & Date: | dan 2012-08-06 18:50:11.492 |
Context
2012-08-06
| ||
19:12 | Fix a crash that could follow an OOM condition. (Closed-Leaf check-in: 2e5741f774 user: dan tags: sorter-coalesce-writes) | |
18:50 | When reusing pages as part of creating a new index, allocate the leaves from each free-list trunk page in ascending order, instead of trying to maximize localization for each individual allocation. This increases the chance that pages will be written to disk in ascending order by a large CREATE INDEX statement, improving overall performance. (check-in: d045f8b2d4 user: dan tags: sorter-coalesce-writes) | |
18:10 | Update sorter-coalesce-writes branch with latest trunk changes. (check-in: 214f8cda17 user: dan tags: sorter-coalesce-writes) | |
Changes
Changes to src/analyze.c.
︙ | ︙ | |||
172 173 174 175 176 177 178 | ** side-effect of the CREATE TABLE statement is to leave the rootpage ** of the new table in register pParse->regRoot. This is important ** because the OpenWrite opcode below will be needing it. */ sqlite3NestedParse(pParse, "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols ); aRoot[i] = pParse->regRoot; | | | 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 | ** side-effect of the CREATE TABLE statement is to leave the rootpage ** of the new table in register pParse->regRoot. This is important ** because the OpenWrite opcode below will be needing it. */ sqlite3NestedParse(pParse, "CREATE TABLE %Q.%s(%s)", pDb->zName, zTab, aTable[i].zCols ); aRoot[i] = pParse->regRoot; aCreateTbl[i] = OPFLAG_P2ISREG; }else{ /* The table already exists. If zWhere is not NULL, delete all entries ** associated with the table zWhere. If zWhere is NULL, delete the ** entire contents of the table. */ aRoot[i] = pStat->tnum; sqlite3TableLock(pParse, iDb, aRoot[i], 1, zTab); if( zWhere ){ |
︙ | ︙ |
Changes to src/btree.c.
︙ | ︙ | |||
5922 5923 5924 5925 5926 5927 5928 | ** If aOvflSpace is set to a null pointer, this function returns ** SQLITE_NOMEM. */ static int balance_nonroot( MemPage *pParent, /* Parent page of siblings being balanced */ int iParentIdx, /* Index of "the page" in pParent */ u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ | | > | 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 | ** If aOvflSpace is set to a null pointer, this function returns ** SQLITE_NOMEM. */ static int balance_nonroot( MemPage *pParent, /* Parent page of siblings being balanced */ int iParentIdx, /* Index of "the page" in pParent */ u8 *aOvflSpace, /* page-size bytes of space for parent ovfl */ int isRoot, /* True if pParent is a root-page */ int bBulk /* True if this call is part of a bulk load */ ){ BtShared *pBt; /* The whole database */ int nCell = 0; /* Number of cells in apCell[] */ int nMaxCells = 0; /* Allocated size of apCell, szCell, aFrom. */ int nNew = 0; /* Number of pages in apNew[] */ int nOld; /* Number of pages in apOld[] */ int i, j, k; /* Loop counters */ |
︙ | ︙ | |||
6253 6254 6255 6256 6257 6258 6259 | pNew = apNew[i] = apOld[i]; apOld[i] = 0; rc = sqlite3PagerWrite(pNew->pDbPage); nNew++; if( rc ) goto balance_cleanup; }else{ assert( i>0 ); | | | 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 | pNew = apNew[i] = apOld[i]; apOld[i] = 0; rc = sqlite3PagerWrite(pNew->pDbPage); nNew++; if( rc ) goto balance_cleanup; }else{ assert( i>0 ); rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0); if( rc ) goto balance_cleanup; apNew[i] = pNew; nNew++; /* Set the pointer-map entry for the new sibling page. */ if( ISAUTOVACUUM ){ ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc); |
︙ | ︙ | |||
6703 6704 6705 6706 6707 6708 6709 | ** different page). Once this subsequent call to balance_nonroot() ** has completed, it is safe to release the pSpace buffer used by ** the previous call, as the overflow cell data will have been ** copied either into the body of a database page or into the new ** pSpace buffer passed to the latter call to balance_nonroot(). */ u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); | | | 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 | ** different page). Once this subsequent call to balance_nonroot() ** has completed, it is safe to release the pSpace buffer used by ** the previous call, as the overflow cell data will have been ** copied either into the body of a database page or into the new ** pSpace buffer passed to the latter call to balance_nonroot(). */ u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize); rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1, pCur->hints); if( pFree ){ /* If pFree is not NULL, it points to the pSpace buffer used ** by a previous call to balance_nonroot(). Its contents are ** now stored either on real database pages or within the ** new pSpace buffer, so it may be safely freed here. */ sqlite3PageFree(pFree); } |
︙ | ︙ | |||
8290 8291 8292 8293 8294 8295 8296 | } } } pBt->btsFlags &= ~BTS_NO_WAL; return rc; } | > > > > > > > > > > | 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 | } } } pBt->btsFlags &= ~BTS_NO_WAL; return rc; } /* ** set the mask of hint flags for cursor pCsr. Currently the only valid ** values are 0 and BTREE_BULKLOAD. */ void sqlite3BtreeCursorHints(BtCursor *pCsr, unsigned int mask){ assert( mask==BTREE_BULKLOAD || mask==0 ); pCsr->hints = mask; } |
Changes to src/btree.h.
︙ | ︙ | |||
131 132 133 134 135 136 137 138 139 140 141 142 143 144 | #define BTREE_FILE_FORMAT 2 #define BTREE_DEFAULT_CACHE_SIZE 3 #define BTREE_LARGEST_ROOT_PAGE 4 #define BTREE_TEXT_ENCODING 5 #define BTREE_USER_VERSION 6 #define BTREE_INCR_VACUUM 7 int sqlite3BtreeCursor( Btree*, /* BTree containing table to open */ int iTable, /* Index of root page */ int wrFlag, /* 1 for writing. 0 for read-only */ struct KeyInfo*, /* First argument to compare function */ BtCursor *pCursor /* Space to write cursor structure */ ); | > > > > > > | 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 | #define BTREE_FILE_FORMAT 2 #define BTREE_DEFAULT_CACHE_SIZE 3 #define BTREE_LARGEST_ROOT_PAGE 4 #define BTREE_TEXT_ENCODING 5 #define BTREE_USER_VERSION 6 #define BTREE_INCR_VACUUM 7 /* ** Values that may be OR'd together to form the second argument of an ** sqlite3BtreeCursorHints() call. */ #define BTREE_BULKLOAD 0x00000001 int sqlite3BtreeCursor( Btree*, /* BTree containing table to open */ int iTable, /* Index of root page */ int wrFlag, /* 1 for writing. 0 for read-only */ struct KeyInfo*, /* First argument to compare function */ BtCursor *pCursor /* Space to write cursor structure */ ); |
︙ | ︙ | |||
174 175 176 177 178 179 180 | char *sqlite3BtreeIntegrityCheck(Btree*, int *aRoot, int nRoot, int, int*); struct Pager *sqlite3BtreePager(Btree*); int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*); void sqlite3BtreeCacheOverflow(BtCursor *); void sqlite3BtreeClearCursor(BtCursor *); | < > | 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | char *sqlite3BtreeIntegrityCheck(Btree*, int *aRoot, int nRoot, int, int*); struct Pager *sqlite3BtreePager(Btree*); int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*); void sqlite3BtreeCacheOverflow(BtCursor *); void sqlite3BtreeClearCursor(BtCursor *); int sqlite3BtreeSetVersion(Btree *pBt, int iVersion); void sqlite3BtreeCursorHints(BtCursor *, unsigned int mask); #ifndef NDEBUG int sqlite3BtreeCursorIsValid(BtCursor*); #endif #ifndef SQLITE_OMIT_BTREECOUNT int sqlite3BtreeCount(BtCursor *, i64 *); |
︙ | ︙ |
Changes to src/btreeInt.h.
︙ | ︙ | |||
506 507 508 509 510 511 512 513 514 515 516 517 518 519 | u8 wrFlag; /* True if writable */ u8 atLast; /* Cursor pointing to the last entry */ u8 validNKey; /* True if info.nKey is valid */ u8 eState; /* One of the CURSOR_XXX constants (see below) */ #ifndef SQLITE_OMIT_INCRBLOB u8 isIncrblobHandle; /* True if this cursor is an incr. io handle */ #endif i16 iPage; /* Index of current page in apPage */ u16 aiIdx[BTCURSOR_MAX_DEPTH]; /* Current index in apPage[i] */ MemPage *apPage[BTCURSOR_MAX_DEPTH]; /* Pages from root to current page */ }; /* ** Potential values for BtCursor.eState. | > | 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 | u8 wrFlag; /* True if writable */ u8 atLast; /* Cursor pointing to the last entry */ u8 validNKey; /* True if info.nKey is valid */ u8 eState; /* One of the CURSOR_XXX constants (see below) */ #ifndef SQLITE_OMIT_INCRBLOB u8 isIncrblobHandle; /* True if this cursor is an incr. io handle */ #endif u8 hints; /* As configured by CursorSetHints() */ i16 iPage; /* Index of current page in apPage */ u16 aiIdx[BTCURSOR_MAX_DEPTH]; /* Current index in apPage[i] */ MemPage *apPage[BTCURSOR_MAX_DEPTH]; /* Pages from root to current page */ }; /* ** Potential values for BtCursor.eState. |
︙ | ︙ |
Changes to src/build.c.
︙ | ︙ | |||
1577 1578 1579 1580 1581 1582 1583 | */ if( pSelect ){ SelectDest dest; Table *pSelTab; assert(pParse->nTab==1); sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb); | | | 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 | */ if( pSelect ){ SelectDest dest; Table *pSelTab; assert(pParse->nTab==1); sqlite3VdbeAddOp3(v, OP_OpenWrite, 1, pParse->regRoot, iDb); sqlite3VdbeChangeP5(v, OPFLAG_P2ISREG); pParse->nTab = 2; sqlite3SelectDestInit(&dest, SRT_Table, 1); sqlite3Select(pParse, pSelect, &dest); sqlite3VdbeAddOp1(v, OP_Close, 1); if( pParse->nErr==0 ){ pSelTab = sqlite3ResultSetOfSelect(pParse, pSelect); if( pSelTab==0 ) return; |
︙ | ︙ | |||
2393 2394 2395 2396 2397 2398 2399 | }else{ tnum = pIndex->tnum; sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb); } pKey = sqlite3IndexKeyinfo(pParse, pIndex); sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, (char *)pKey, P4_KEYINFO_HANDOFF); | < | < | 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 | }else{ tnum = pIndex->tnum; sqlite3VdbeAddOp2(v, OP_Clear, tnum, iDb); } pKey = sqlite3IndexKeyinfo(pParse, pIndex); sqlite3VdbeAddOp4(v, OP_OpenWrite, iIdx, tnum, iDb, (char *)pKey, P4_KEYINFO_HANDOFF); sqlite3VdbeChangeP5(v, OPFLAG_BULKCSR|((memRootPage>=0)?OPFLAG_P2ISREG:0)); #ifndef SQLITE_OMIT_MERGE_SORT /* Open the sorter cursor if we are to use one. */ iSorter = pParse->nTab++; sqlite3VdbeAddOp4(v, OP_SorterOpen, iSorter, 0, 0, (char*)pKey, P4_KEYINFO); #else iSorter = iTab; |
︙ | ︙ |
Changes to src/sqliteInt.h.
︙ | ︙ | |||
2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 | #define OPFLAG_LASTROWID 0x02 /* Set to update db->lastRowid */ #define OPFLAG_ISUPDATE 0x04 /* This OP_Insert is an sql UPDATE */ #define OPFLAG_APPEND 0x08 /* This is likely to be an append */ #define OPFLAG_USESEEKRESULT 0x10 /* Try to avoid a seek in BtreeInsert() */ #define OPFLAG_CLEARCACHE 0x20 /* Clear pseudo-table cache in OP_Column */ #define OPFLAG_LENGTHARG 0x40 /* OP_Column only used for length() */ #define OPFLAG_TYPEOFARG 0x80 /* OP_Column only used for typeof() */ /* * Each trigger present in the database schema is stored as an instance of * struct Trigger. * * Pointers to instances of struct Trigger are stored in two ways. * 1. In the "trigHash" hash table (part of the sqlite3* that represents the | > > | 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 | #define OPFLAG_LASTROWID 0x02 /* Set to update db->lastRowid */ #define OPFLAG_ISUPDATE 0x04 /* This OP_Insert is an sql UPDATE */ #define OPFLAG_APPEND 0x08 /* This is likely to be an append */ #define OPFLAG_USESEEKRESULT 0x10 /* Try to avoid a seek in BtreeInsert() */ #define OPFLAG_CLEARCACHE 0x20 /* Clear pseudo-table cache in OP_Column */ #define OPFLAG_LENGTHARG 0x40 /* OP_Column only used for length() */ #define OPFLAG_TYPEOFARG 0x80 /* OP_Column only used for typeof() */ #define OPFLAG_BULKCSR 0x01 /* OP_Open** used to open bulk cursor */ #define OPFLAG_P2ISREG 0x02 /* P2 to OP_Open** is a register number */ /* * Each trigger present in the database schema is stored as an instance of * struct Trigger. * * Pointers to instances of struct Trigger are stored in two ways. * 1. In the "trigHash" hash table (part of the sqlite3* that represents the |
︙ | ︙ |
Changes to src/test_vfs.c.
︙ | ︙ | |||
357 358 359 360 361 362 363 | ){ int rc = SQLITE_OK; TestvfsFd *pFd = tvfsGetFd(pFile); Testvfs *p = (Testvfs *)pFd->pVfs->pAppData; if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){ tvfsExecTcl(p, "xWrite", | | > | 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 | ){ int rc = SQLITE_OK; TestvfsFd *pFd = tvfsGetFd(pFile); Testvfs *p = (Testvfs *)pFd->pVfs->pAppData; if( p->pScript && p->mask&TESTVFS_WRITE_MASK ){ tvfsExecTcl(p, "xWrite", Tcl_NewStringObj(pFd->zFilename, -1), pFd->pShmId, Tcl_NewWideIntObj(iOfst) ); tvfsResultCode(p, &rc); } if( rc==SQLITE_OK && tvfsInjectFullerr(p) ){ rc = SQLITE_FULL; } |
︙ | ︙ |
Changes to src/vdbe.c.
︙ | ︙ | |||
3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 | int p2; int iDb; int wrFlag; Btree *pX; VdbeCursor *pCur; Db *pDb; if( p->expired ){ rc = SQLITE_ABORT; break; } nField = 0; pKeyInfo = 0; | > > > | 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 | int p2; int iDb; int wrFlag; Btree *pX; VdbeCursor *pCur; Db *pDb; assert( (pOp->p5&(OPFLAG_P2ISREG|OPFLAG_BULKCSR))==pOp->p5 ); assert( pOp->opcode==OP_OpenWrite || pOp->p5==0 ); if( p->expired ){ rc = SQLITE_ABORT; break; } nField = 0; pKeyInfo = 0; |
︙ | ︙ | |||
3139 3140 3141 3142 3143 3144 3145 | assert( sqlite3SchemaMutexHeld(db, iDb, 0) ); if( pDb->pSchema->file_format < p->minWriteFileFormat ){ p->minWriteFileFormat = pDb->pSchema->file_format; } }else{ wrFlag = 0; } | | | 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 | assert( sqlite3SchemaMutexHeld(db, iDb, 0) ); if( pDb->pSchema->file_format < p->minWriteFileFormat ){ p->minWriteFileFormat = pDb->pSchema->file_format; } }else{ wrFlag = 0; } if( pOp->p5 & OPFLAG_P2ISREG ){ assert( p2>0 ); assert( p2<=p->nMem ); pIn2 = &aMem[p2]; assert( memIsValid(pIn2) ); assert( (pIn2->flags & MEM_Int)!=0 ); sqlite3VdbeMemIntegerify(pIn2); p2 = (int)pIn2->u.i; |
︙ | ︙ | |||
3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 | assert( pOp->p1>=0 ); pCur = allocateCursor(p, pOp->p1, nField, iDb, 1); if( pCur==0 ) goto no_mem; pCur->nullRow = 1; pCur->isOrdered = 1; rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor); pCur->pKeyInfo = pKeyInfo; /* Since it performs no memory allocation or IO, the only value that ** sqlite3BtreeCursor() may return is SQLITE_OK. */ assert( rc==SQLITE_OK ); /* Set the VdbeCursor.isTable and isIndex variables. Previous versions of ** SQLite used to check if the root-page flags were sane at this point | > > | 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 | assert( pOp->p1>=0 ); pCur = allocateCursor(p, pOp->p1, nField, iDb, 1); if( pCur==0 ) goto no_mem; pCur->nullRow = 1; pCur->isOrdered = 1; rc = sqlite3BtreeCursor(pX, p2, wrFlag, pKeyInfo, pCur->pCursor); pCur->pKeyInfo = pKeyInfo; assert( OPFLAG_BULKCSR==BTREE_BULKLOAD ); sqlite3BtreeCursorHints(pCur->pCursor, (pOp->p5 & OPFLAG_BULKCSR)); /* Since it performs no memory allocation or IO, the only value that ** sqlite3BtreeCursor() may return is SQLITE_OK. */ assert( rc==SQLITE_OK ); /* Set the VdbeCursor.isTable and isIndex variables. Previous versions of ** SQLite used to check if the root-page flags were sane at this point |
︙ | ︙ |
Added test/index5.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | # 2012 August 6 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #*********************************************************************** # set testdir [file dirname $argv0] source $testdir/tester.tcl set ::testprefix index5 do_test 1.1 { execsql { PRAGMA page_size = 1024; CREATE TABLE t1(x); BEGIN; } for {set i 0} {$i < 100000} {incr i} { execsql { INSERT INTO t1 VALUES(randstr(100,100)) } } execsql COMMIT execsql { CREATE INDEX i1 ON t1(x); DROP INDEX I1; PRAGMA main.page_size; } } {1024} db close testvfs tvfs tvfs filter xWrite tvfs script write_cb proc write_cb {xCall file handle iOfst} { if {[file tail $file]=="test.db"} { lappend ::write_list [expr $iOfst/1024] } puts "$xCall $file $args" } do_test 1.2 { sqlite3 db test.db -vfs tvfs set ::write_list [list] execsql { CREATE INDEX i1 ON t1(x) } } {} do_test 1.3 { set nForward 0 set nBackward 0 set nNoncont 0 set iPrev [lindex $::write_list 0] for {set i 1} {$i < [llength $::write_list]} {incr i} { set iNext [lindex $::write_list $i] if {$iNext==($iPrev+1)} { incr nForward } elseif {$iNext==($iPrev-1)} { incr nBackward } else { incr nNoncont } set iPrev $iNext } expr {$nForward > $nBackward} } {1} db close tvfs delete finish_test |