Index: src/analyze.c ================================================================== --- src/analyze.c +++ src/analyze.c @@ -265,10 +265,11 @@ typedef struct Stat4Accum Stat4Accum; typedef struct Stat4Sample Stat4Sample; struct Stat4Sample { tRowcnt *anEq; /* sqlite_stat4.nEq */ tRowcnt *anDLt; /* sqlite_stat4.nDLt */ + tRowcnt *amxEq; /* Maximum length run of equal values */ #ifdef SQLITE_ENABLE_STAT3_OR_STAT4 tRowcnt *anLt; /* sqlite_stat4.nLt */ union { i64 iRowid; /* Rowid in main table of the key */ u8 *aRowid; /* Key for WITHOUT ROWID tables */ @@ -416,10 +417,11 @@ assert( nKeyCol>0 ); /* Allocate the space required for the Stat4Accum object */ n = sizeof(*p) + sizeof(tRowcnt)*nColUp /* Stat4Accum.anEq */ + + sizeof(tRowcnt)*nColUp /* Stat4Accum.amxEq */ + sizeof(tRowcnt)*nColUp /* Stat4Accum.anDLt */ #ifdef SQLITE_ENABLE_STAT3_OR_STAT4 + sizeof(tRowcnt)*nColUp /* Stat4Accum.anLt */ + sizeof(Stat4Sample)*(nCol+mxSample) /* Stat4Accum.aBest[], a[] */ + sizeof(tRowcnt)*3*nColUp*(nCol+mxSample) @@ -434,22 +436,23 @@ p->db = db; p->nRow = 0; p->nCol = nCol; p->nKeyCol = nKeyCol; - p->current.anDLt = (tRowcnt*)&p[1]; - p->current.anEq = &p->current.anDLt[nColUp]; + p->current.anEq = (tRowcnt*)&p[1]; + p->current.amxEq = &p->current.anEq[nColUp]; + p->current.anDLt = &p->current.amxEq[nColUp]; #ifdef SQLITE_ENABLE_STAT3_OR_STAT4 { u8 *pSpace; /* Allocated space not yet assigned */ - int i; /* Used to iterate through p->aSample[] */ + int i; /* Loop counter */ p->iGet = -1; p->mxSample = mxSample; p->nPSample = (tRowcnt)(sqlite3_value_int64(argv[2])/(mxSample/3+1) + 1); - p->current.anLt = &p->current.anEq[nColUp]; + p->current.anLt = &p->current.anDLt[nColUp]; p->iPrn = 0x689e962d*(u32)nCol ^ 0xd0944565*(u32)sqlite3_value_int(argv[2]); /* Set up the Stat4Accum.a[] and aBest[] arrays */ p->a = (struct Stat4Sample*)&p->current.anLt[nColUp]; p->aBest = &p->a[mxSample]; @@ -719,18 +722,22 @@ assert( p->nCol>0 ); assert( iChngnCol ); if( p->nRow==0 ){ /* This is the first call to this function. Do initialization. */ - for(i=0; inCol; i++) p->current.anEq[i] = 1; + for(i=0; inCol; i++){ + p->current.anEq[i] = 1; + p->current.amxEq[i] = 1; + } }else{ /* Second and subsequent calls get processed here */ samplePushPrevious(p, iChng); /* Update anDLt[], anLt[] and anEq[] to reflect the values that apply ** to the current row of the index. */ for(i=0; icurrent.amxEq[i]==p->current.anEq[i] ) p->current.amxEq[i]++; p->current.anEq[i]++; } for(i=iChng; inCol; i++){ p->current.anDLt[i]++; #ifdef SQLITE_ENABLE_STAT3_OR_STAT4 @@ -819,51 +826,77 @@ if( eCall==STAT_GET_STAT1 ) #else assert( argc==1 ); #endif { - /* Return the value to store in the "stat" column of the sqlite_stat1 + /* Return a value for the "stat" column of the sqlite_stat1 ** table for this index. ** ** The value is a string composed of a list of integers describing - ** the index. The first integer in the list is the total number of - ** entries in the index. There is one additional integer in the list - ** for each indexed column. This additional integer is an estimate of - ** the number of rows matched by a stabbing query on the index using - ** a key with the corresponding number of fields. In other words, - ** if the index is on columns (a,b) and the sqlite_stat1 value is - ** "100 10 2", then SQLite estimates that: - ** - ** * the index contains 100 rows, - ** * "WHERE a=?" matches 10 rows, and - ** * "WHERE a=? AND b=?" matches 2 rows. - ** - ** If D is the count of distinct values and K is the total number of - ** rows, then each estimate is computed as: - ** - ** I = (K+D-1)/D + ** the index. The first integer is the (estimated) total number + ** entries in the index. There is one additional integer for each + ** column in the index. The first added integer is an estimate of + ** the number of rows that match a single key in the first column of + ** the index. The second added integer is an estimate on of the number + ** of rows that match a single key consisting of the first two columns + ** of the index. And so forth. + ** + ** For example, for an index on columns (a,b), if the sqlite_stat1.stat + ** values is "100 10 2", that means there are about 100 rows in the + ** index, and that a query against a=$key1 will match about 10 rows + ** and a query against "a=$key1 AND b=$key2" will match about 2 rows. + ** + ** Let V be the average number of rows that match a key, and let M + ** be the most number of rows that match the key for any possible value + ** of that key. The estimate is computed as: + ** + ** E = (2*V + M)/3 + ** + ** Consider two indexes. Index X has with 100 values of exactly 0 and + ** 100 singleton values between 1 and 100. Index Y has 200 values + ** evenly distributed between 1 and 20. If only the average (V) is + ** used in the estimate, X would have "200 2" and Y would have "200 10" + ** and so the planner would think X is the more selective index. And + ** X often would be more selective. But when searching for 0, index X + ** would perform badly. To avoid this problem, the M is added into the + ** estimate so that the stat for X is "200 34" and Y is still "200 10". + ** In this way, Y is the preferred index (all else being equal) and + ** the pathological case is avoided. + ** + ** For deciding whether or not to do a skip-scan, we want to know the + ** average number of rows (V) with the same key, not the mixed estimate + ** E shown above. Usually E will be close enough. However, if E is + ** large but V is small, that could trick the query planner into thinking + ** that a skip-scan might work well on this index. To avoid that, the + ** "noskipscan" flag is added in cases where the divergence between E + ** and V might mislead the query planner. */ char *z; int i; + int noSkipScan = 0; - char *zRet = sqlite3MallocZero( (p->nKeyCol+1)*25 ); + char *zRet = sqlite3MallocZero( (p->nKeyCol+2)*25 ); if( zRet==0 ){ sqlite3_result_error_nomem(context); return; } sqlite3_snprintf(24, zRet, "%llu", (u64)p->nRow); z = zRet + sqlite3Strlen30(zRet); for(i=0; inKeyCol; i++){ - u64 nDistinct = p->current.anDLt[i] + 1; - u64 iVal = (p->nRow + nDistinct - 1) / nDistinct; + u64 nDistinct = p->current.anDLt[i]; + u64 iMx = p->current.amxEq[i]; /* M: Most rows per key */ + u64 iAvg = (p->nRow+nDistinct)/(nDistinct+1); /* V: Average per key */ + u64 iVal = (iMx+iAvg*2)/3; /* E: The estimate */ sqlite3_snprintf(24, z, " %llu", iVal); z += sqlite3Strlen30(z); assert( p->current.anEq[i] ); + if( iVal>=WHERE_SKIPSCAN_ONSET && iAvg<(WHERE_SKIPSCAN_ONSET*2/3) ){ + noSkipScan = 1; + } } - assert( z[0]=='\0' && z>zRet ); - + if( noSkipScan ) sqlite3_snprintf(14, z, " noskipscan"); sqlite3_result_text(context, zRet, -1, sqlite3_free); } #ifdef SQLITE_ENABLE_STAT3_OR_STAT4 else if( eCall==STAT_GET_ROWID ){ if( p->iGet<0 ){ @@ -1569,56 +1602,49 @@ ** Populate the pIdx->aAvgEq[] array based on the samples currently ** stored in pIdx->aSample[]. */ static void initAvgEq(Index *pIdx){ if( pIdx ){ - IndexSample *aSample = pIdx->aSample; - IndexSample *pFinal = &aSample[pIdx->nSample-1]; - int iCol; - int nCol = 1; - if( pIdx->nSampleCol>1 ){ + int nSample = pIdx->nSample; /* Number of samples */ + int nCol = pIdx->nSampleCol; /* Number of columns sampled */ + IndexSample *aSample = pIdx->aSample; /* The samples */ + IndexSample *pFinal = &aSample[nSample-1]; /* The last sample */ + int iCol; /* Loop counter of columns */ + if( nCol>1 ){ /* If this is stat4 data, then calculate aAvgEq[] values for all ** sample columns except the last. The last is always set to 1, as ** once the trailing PK fields are considered all index keys are ** unique. */ - nCol = pIdx->nSampleCol-1; + nCol--; pIdx->aAvgEq[nCol] = 1; } for(iCol=0; iColnSample; int i; /* Used to iterate through samples */ + u32 nDSample = 0; /* Number of distinct samples less than pFinal*/ tRowcnt sumEq = 0; /* Sum of the nEq values */ - tRowcnt avgEq = 0; + tRowcnt avgEq; /* Average repeats for unsampled entries */ tRowcnt nRow; /* Number of rows in index */ - i64 nSum100 = 0; /* Number of terms contributing to sumEq */ - i64 nDist100; /* Number of distinct values in index */ + tRowcnt prevLt = 0; /* Number of values less than previous sample */ + tRowcnt thisLt; /* Number of values less than current sample */ if( !pIdx->aiRowEst || iCol>=pIdx->nKeyCol || pIdx->aiRowEst[iCol+1]==0 ){ - nRow = pFinal->anLt[iCol]; - nDist100 = (i64)100 * pFinal->anDLt[iCol]; - nSample--; + nRow = pFinal->anLt[iCol] + pFinal->anEq[iCol]; }else{ nRow = pIdx->aiRowEst[0]; - nDist100 = ((i64)100 * pIdx->aiRowEst[0]) / pIdx->aiRowEst[iCol+1]; } pIdx->nRowEst0 = nRow; - - /* Set nSum to the number of distinct (iCol+1) field prefixes that - ** occur in the stat4 table for this index. Set sumEq to the sum of - ** the nEq values for column iCol for the same set (adding the value - ** only once where there exist duplicate prefixes). */ - for(i=0; inSample-1) - || aSample[i].anDLt[iCol]!=aSample[i+1].anDLt[iCol] - ){ + for(i=0; (thisLt = aSample[i].anLt[iCol])anLt[iCol]; i++){ + if( i==0 || thisLt!=prevLt ){ sumEq += aSample[i].anEq[iCol]; - nSum100 += 100; + nDSample++; } + prevLt = thisLt; } - - if( nDist100>nSum100 ){ - avgEq = ((i64)100 * (nRow - sumEq))/(nDist100 - nSum100); + if( pFinal->anDLt[iCol] > nDSample ){ + avgEq = (pFinal->anLt[iCol] - sumEq)/(pFinal->anDLt[iCol] - nDSample); + }else{ + avgEq = 1; } if( avgEq==0 ) avgEq = 1; pIdx->aAvgEq[iCol] = avgEq; } } Index: src/sqliteInt.h ================================================================== --- src/sqliteInt.h +++ src/sqliteInt.h @@ -2419,10 +2419,19 @@ #define JT_LEFT 0x0008 /* Left outer join */ #define JT_RIGHT 0x0010 /* Right outer join */ #define JT_OUTER 0x0020 /* The "OUTER" keyword is present */ #define JT_ERROR 0x0040 /* unknown or unsupported join type */ +/* +** TUNING: The skip-scan optimization is only profitable if the average +** number of repeats of an entry in the index is greater than or equal to +** WHERE_SKIPSCAN_ONSET. If the average numbe of repeats is less +** than WHERE_SKIPSCAN_ONSET, then it is faster to do a full table +** scan. +*/ +#define WHERE_SKIPSCAN_ONSET 18 +#define WHERE_SKIPSCAN_ONSET_LOG 42 /* ** Flags appropriate for the wctrlFlags parameter of sqlite3WhereBegin() ** and the WhereInfo.wctrlFlags member. */ Index: src/where.c ================================================================== --- src/where.c +++ src/where.c @@ -2410,15 +2410,15 @@ ** is almost always quicker than an index seek (even though if the index ** contains fewer than 2^17 rows we assume otherwise in other parts of ** the code). And, even if it is not, it should not be too much slower. ** On the other hand, the extra seeks could end up being significantly ** more expensive. */ - assert( 42==sqlite3LogEst(18) ); + assert( WHERE_SKIPSCAN_ONSET_LOG==sqlite3LogEst(WHERE_SKIPSCAN_ONSET) ); if( saved_nEq==saved_nSkip && saved_nEq+1nKeyCol && pProbe->noSkipScan==0 - && pProbe->aiRowLogEst[saved_nEq+1]>=42 /* TUNING: Minimum for skip-scan */ + && pProbe->aiRowLogEst[saved_nEq+1]>=WHERE_SKIPSCAN_ONSET_LOG && (rc = whereLoopResize(db, pNew, pNew->nLTerm+1))==SQLITE_OK ){ LogEst nIter; pNew->u.btree.nEq++; pNew->nSkip++; Index: test/skipscan2.test ================================================================== --- test/skipscan2.test +++ test/skipscan2.test @@ -46,26 +46,20 @@ INSERT INTO people VALUES('Patrick','teacher',180); INSERT INTO people VALUES('Quiana','student',182); INSERT INTO people VALUES('Robert','student',159); INSERT INTO people VALUES('Sally','student',166); INSERT INTO people VALUES('Tom','student',171); - INSERT INTO people VALUES('Ursula','student',170); - INSERT INTO people VALUES('Vance','student',179); - INSERT INTO people VALUES('Willma','student',175); - INSERT INTO people VALUES('Xavier','teacher',185); - INSERT INTO people VALUES('Yvonne','student',149); - INSERT INTO people VALUES('Zach','student',170); } # Without ANALYZE, a skip-scan is not used # do_execsql_test skipscan2-1.3 { - SELECT name FROM people WHERE height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + SELECT name FROM people WHERE height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-1.3eqp { EXPLAIN QUERY PLAN - SELECT name FROM people WHERE height>=180 ORDER BY +name; + SELECT name FROM people WHERE height<=161 ORDER BY +name; } {~/*INDEX people_idx1 */} # Now do an ANALYZE. A skip-scan can be used after ANALYZE. # do_execsql_test skipscan2-1.4 { @@ -77,75 +71,49 @@ UPDATE sqlite_stat1 SET stat='10000 1' WHERE idx='sqlite_autoindex_people_1'; ANALYZE sqlite_master; } db cache flush do_execsql_test skipscan2-1.5 { - SELECT name FROM people WHERE height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + SELECT name FROM people WHERE height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-1.5eqp { EXPLAIN QUERY PLAN - SELECT name FROM people WHERE height>=180 ORDER BY +name; + SELECT name FROM people WHERE height<=161 ORDER BY +name; } {/*INDEX people_idx1 */} # Same answer with other formulations of the same query # do_execsql_test skipscan2-1.6 { SELECT name FROM people WHERE role IN (SELECT DISTINCT role FROM people) - AND height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + AND height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-1.7 { - SELECT name FROM people WHERE role='teacher' AND height>=180 + SELECT name FROM people WHERE role='teacher' AND height<=161 UNION ALL - SELECT name FROM people WHERE role='student' AND height>=180 + SELECT name FROM people WHERE role='student' AND height<=161 ORDER BY 1; -} {David Jack Patrick Quiana Xavier} +} {Alice Bob Cindy Emily Megan Olivia Robert} -# Add 8 more people, bringing the total to 34. Then the number of -# duplicates in the left-column of the index will be 17 and -# skip-scan should not be used after an (unfudged) ANALYZE. +# Add more people so that the estimated number of rows per key is +# equal to the skip-scan threshold of 18. Then run an (unfudged) +# ANALYZE. skip-scan should work. # do_execsql_test skipscan2-1.8 { - INSERT INTO people VALUES('Angie','student',166); - INSERT INTO people VALUES('Brad','student',176); - INSERT INTO people VALUES('Claire','student',168); - INSERT INTO people VALUES('Donald','student',162); - INSERT INTO people VALUES('Elaine','student',177); - INSERT INTO people VALUES('Frazier','student',159); - INSERT INTO people VALUES('Grace','student',179); - INSERT INTO people VALUES('Horace','student',166); + INSERT INTO people VALUES('Ursula','student',170); ANALYZE; SELECT stat FROM sqlite_stat1 WHERE idx='people_idx1'; -} {{34 17 2}} +} {{21 13 2}} db cache flush do_execsql_test skipscan2-1.9 { - SELECT name FROM people WHERE height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + SELECT name FROM people WHERE height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-1.9eqp { EXPLAIN QUERY PLAN - SELECT name FROM people WHERE height>=180 ORDER BY +name; + SELECT name FROM people WHERE height<=161 ORDER BY +name; } {~/*INDEX people_idx1 */} -# Add 2 more people, bringing the total to 36. Then the number of -# duplicates in the left-column of the index will be 18 and -# skip-scan will be used after an (unfudged) ANALYZE. -# -do_execsql_test skipscan2-1.10 { - INSERT INTO people VALUES('Ingrad','student',155); - INSERT INTO people VALUES('Jacob','student',179); - ANALYZE; - SELECT stat FROM sqlite_stat1 WHERE idx='people_idx1'; -} {{36 18 2}} -db cache flush -do_execsql_test skipscan2-1.11 { - SELECT name FROM people WHERE height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} -do_execsql_test skipscan2-1.11eqp { - EXPLAIN QUERY PLAN - SELECT name FROM people WHERE height>=180 ORDER BY +name; -} {/*INDEX people_idx1 */} - # Repeat using a WITHOUT ROWID table. # do_execsql_test skipscan2-2.1 { CREATE TABLE peoplew( @@ -156,37 +124,37 @@ ) WITHOUT ROWID; CREATE INDEX peoplew_idx1 ON peoplew(role, height); INSERT INTO peoplew(name,role,height) SELECT name, role, height FROM people; ALTER TABLE people RENAME TO old_people; - SELECT name FROM peoplew WHERE height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + SELECT name FROM peoplew WHERE height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-2.2 { SELECT name FROM peoplew WHERE role IN (SELECT DISTINCT role FROM peoplew) - AND height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + AND height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-2.2 { - SELECT name FROM peoplew WHERE role='teacher' AND height>=180 + SELECT name FROM peoplew WHERE role='teacher' AND height<=161 UNION ALL - SELECT name FROM peoplew WHERE role='student' AND height>=180 + SELECT name FROM peoplew WHERE role='student' AND height<=161 ORDER BY 1; -} {David Jack Patrick Quiana Xavier} +} {Alice Bob Cindy Emily Megan Olivia Robert} # Now do an ANALYZE. A skip-scan can be used after ANALYZE. # do_execsql_test skipscan2-2.4 { ANALYZE; } db cache flush do_execsql_test skipscan2-2.5 { - SELECT name FROM peoplew WHERE height>=180 ORDER BY +name; -} {David Jack Patrick Quiana Xavier} + SELECT name FROM peoplew WHERE height<=161 ORDER BY +name; +} {Alice Bob Cindy Emily Megan Olivia Robert} do_execsql_test skipscan2-2.5eqp { EXPLAIN QUERY PLAN - SELECT name FROM peoplew WHERE height>=180 ORDER BY +name; -} {/*INDEX peoplew_idx1 */} + SELECT name FROM peoplew WHERE height<=161 ORDER BY +name; +} {/*INDEX peoplew_idx1*/} # A skip-scan on a PK index of a WITHOUT ROWID table. # do_execsql_test skipscan2-3.1 { CREATE TABLE t3(a, b, c, PRIMARY KEY(a, b)) WITHOUT ROWID;