Index: src/where.c
==================================================================
--- src/where.c
+++ src/where.c
@@ -17,44 +17,10 @@
** indices, you might also think of this module as the "query optimizer".
*/
#include "sqliteInt.h"
/*
** The following parameter define the relative cost of various
** search operations. These parameters are used to estimate query
** plan costs and to select the query plan with the lowest estimated
** cost.
**
** Let the cost of moving from one row in a table or index to the next
** or previous row be SEEK_COST.
**
** Let the base10 logarithm of the number of rows in a table or
** index be L. The estLog() function below will estimate this
** numbmer given the number of rows in the table.
**
** The cost of doing a lookup of an index will be IDX_LKUP_COST*L.
**
** The cost of doing a lookup on a table is TBL_LKUP_COST*L.
**
** The cost of sorting a result set of N rows is assumed to be
** N*log10(N)*SORT_COST.
*/
#if defined(SEEK_COST)
 /* Assume that IDX_LKUP_COST, TBL_LKUP_COST, and SORT_COST are also defined */
#elif !defined(SQLITE_OMIT_FLOATING_POINT)
# define SEEK_COST 1.0
# define IDX_LKUP_COST 1.0
# define TBL_LKUP_COST 0.1
# define SORT_COST 3.0
#else
# define SEEK_COST 10
# define IDX_LKUP_COST 10
# define TBL_LKUP_COST 1
# define SORT_COST 30
#endif

/*
** Trace output macros
*/
#if defined(SQLITE_TEST) || defined(SQLITE_DEBUG)
int sqlite3WhereTrace = 0;
@@ -2769,12 +2735,11 @@
*/
for(; pProbe; pIdx=pProbe=pProbe->pNext){
const unsigned int * const aiRowEst = pProbe->aiRowEst;
double cost; /* Cost of using pProbe */
double nRow; /* Estimated number of rows in result set */
 double nTabSrch; /* Est number of table searches */
 double nIdxSrch; /* Est number of index searches */
+ double log10N; /* base10 logarithm of nRow (inexact) */
int rev; /* True to scan in reverse order */
int wsFlags = 0;
Bitmask used = 0;
/* The following variables are populated based on the properties of
@@ -2965,60 +2930,75 @@
whereInScanEst(pParse, pProbe, pFirstTerm->pExpr->x.pList, &nRow);
}
}
#endif /* SQLITE_ENABLE_STAT2 */
 /* Adjust the number of rows and the cost downward to reflect rows
+ /* Adjust the number of output rows and downward to reflect rows
** that are excluded by range constraints.
*/
nRow = (nRow * (double)estBound) / (double)100;
if( nRow<1 ) nRow = 1;
 /* Assume constant cost to advance from one row to the next and
 ** logarithmic cost to do a binary search. Hence, the initial cost
 ** is the number of output rows plus log2(tablesize) times the
 ** number of binary searches.
 **
 ** Because fanout on tables is so much higher than the fanout on
 ** indices (because table btrees contain only integer keys in nonleaf
 ** nodes) we weight the cost of a table binary search as 1/10th the
 ** cost of an index binary search.
 */
 if( pIdx ){
 if( bLookup ){
 /* For an index lookup followed by a table lookup:
 ** nInMul index searches to find the start of each index range
 ** + nRow steps through the index
 ** + nRow table searches to lookup the table entry using the rowid
 */
 nIdxSrch = nInMul;
 nTabSrch = nRow;
 }else{
 /* For a covering index:
 ** nInMul index searches to find the initial entry
 ** + nRow steps through the index
 */
 nIdxSrch = nInMul;
 nTabSrch = 0;
 }
+ /* Experiments run on real SQLite databases show that the time needed
+ ** to do a binary search to locate a row in a table or index is roughly
+ ** log10(N) times the time to move from one row to the next row within
+ ** a table or index. The actual times can vary, with the size of
+ ** records being an important factor. Both moves and searches are
+ ** slower with larger records, presumably because fewer records fit
+ ** on one page and hence more pages have to be fetched.
+ **
+ ** The ANALYZE command and the sqlite_stat1 and sqlite_stat2 tables do
+ ** not give us data on the relative sizes of table and index records.
+ ** So this computation assumes table records are about twice as big
+ ** as index records
+ */
+ if( (wsFlags & WHERE_NOT_FULLSCAN)==0 ){
+ /* The cost of a full table scan is a number of move operations equal
+ ** to the number of rows in the table.
+ **
+ ** We add an additional 4x penalty to full table scans. This causes
+ ** the cost function to err on the side of choosing an index over
+ ** choosing a full scan. This 4x fullscan penalty is an arguable
+ ** decision and one which we expect to revisit in the future. But
+ ** it seems to be working well enough at the moment.
+ */
+ cost = aiRowEst[0]*4;
}else{
 /* For a rowid primary key lookup:
 ** nInMult table searches to find the initial entry for each range
 ** + nRow steps through the table
 */
 nIdxSrch = 0;
 nTabSrch = nInMul;
 }
 cost = nRow + (nIdxSrch*IDX_LKUP_COST + nTabSrch*TBL_LKUP_COST)
 *estLog(aiRowEst[0])/SEEK_COST;

 /* Add in the estimated cost of sorting the result. This cost is expanded
 ** by a fudge factor of 3.0 to account for the fact that a sorting step
 ** involves a write and is thus more expensive than a lookup step.
+ log10N = estLog(aiRowEst[0]);
+ cost = nRow;
+ if( pIdx ){
+ if( bLookup ){
+ /* For an index lookup followed by a table lookup:
+ ** nInMul index searches to find the start of each index range
+ ** + nRow steps through the index
+ ** + nRow table searches to lookup the table entry using the rowid
+ */
+ cost += (nInMul + nRow)*log10N;
+ }else{
+ /* For a covering index:
+ ** nInMul index searches to find the initial entry
+ ** + nRow steps through the index
+ */
+ cost += nInMul*log10N;
+ }
+ }else{
+ /* For a rowid primary key lookup:
+ ** nInMult table searches to find the initial entry for each range
+ ** + nRow steps through the table
+ */
+ cost += nInMul*log10N;
+ }
+ }
+
+ /* Add in the estimated cost of sorting the result. Actual experimental
+ ** measurements of sorting performance in SQLite show that sorting time
+ ** adds C*N*log10(N) to the cost, where N is the number of rows to be
+ ** sorted and C is a factor between 1.95 and 4.3. We will split the
+ ** difference and select C of 3.0.
*/
if( bSort ){
 cost += nRow*estLog(nRow)*SORT_COST/SEEK_COST;
+ cost += nRow*estLog(nRow)*3;
}
/**** Cost of using this index has now been computed ****/
/* If there are additional constraints on this table that cannot
@@ -3080,15 +3060,14 @@
}
WHERETRACE((
"%s(%s): nEq=%d nInMul=%d estBound=%d bSort=%d bLookup=%d wsFlags=0x%x\n"
 " notReady=0x%llx nTSrch=%.1f nISrch=%.1f nRow=%.1f\n"
 " estLog=%.1f cost=%.1f used=0x%llx\n",
+ " notReady=0x%llx log10N=%.1f nRow=%.1f cost=%.1f used=0x%llx\n",
pSrc->pTab->zName, (pIdx ? pIdx->zName : "ipk"),
nEq, nInMul, estBound, bSort, bLookup, wsFlags,
 notReady, nTabSrch, nIdxSrch, nRow, estLog(aiRowEst[0]), cost, used
+ notReady, log10N, nRow, cost, used
));
/* If this index is the best we have seen so far, then record this
** index and its cost in the pCost structure.
*/
Index: test/analyze5.test
==================================================================
--- test/analyze5.test
+++ test/analyze5.test
@@ -100,12 +100,12 @@
16 {z>=100 AND z<=1} t1z 50
17 {z>=100 AND z<=0} t1z 400
18 {z>=100 AND z<0} t1z 50
19 {z>=100 AND z<=1} t1z 700
20 {z>=100 AND z<2} t1z 700
 21 {z>=100 AND z<=2} {} 111
 22 {z>=100 AND z<3} {} 111
+ 21 {z>=100 AND z<=2} t1z 900
+ 22 {z>=100 AND z<3} t1z 900
31 {z>=0.0 AND z<=0.0} t1z 400
32 {z>=1.0 AND z<=1.0} t1z 300
33 {z>=2.0 AND z<=2.0} t1z 200
34 {z>=3.0 AND z<=3.0} t1z 100
@@ -123,12 +123,12 @@
46 {z>=100 AND z<=1.0} t1z 50
47 {z>=100 AND z<=0.0} t1z 400
48 {z>=100 AND z<0.0} t1z 50
49 {z>=100 AND z<=1.0} t1z 700
50 {z>=100 AND z<2.0} t1z 700
 51 {z>=100 AND z<=2.0} {} 111
 52 {z>=100 AND z<3.0} {} 111
+ 51 {z>=100 AND z<=2.0} t1z 900
+ 52 {z>=100 AND z<3.0} t1z 900
101 {z=1} t1z 50
102 {z=0} t1z 400
103 {z=1} t1z 300
104 {z=2} t1z 200
@@ -149,11 +149,11 @@
204 {z IN (2)} t1z 200
205 {z IN (3)} t1z 100
206 {z IN (4)} t1z 50
207 {z IN (0.5)} t1z 50
208 {z IN (0,1)} t1z 700
 209 {z IN (0,1,2)} {} 100
+ 209 {z IN (0,1,2)} t1z 900
210 {z IN (0,1,2,3)} {} 100
211 {z IN (0,1,2,3,4,5)} {} 100
212 {z IN (1,2)} t1z 500
213 {z IN (2,3)} t1z 300
214 {z=3 OR z=2} t1z 300