SQLite

Check-in [878da276eb]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Refactor the cost function in the query planner. Give extra cost (thus reduce likelihood of selection) to full table scans.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 878da276ebf643b716ddd650d4d0ca3595fe5bf2
User & Date: drh 2011-02-10 00:08:47.701
Context
2011-02-10
17:46
Prevent a segfault when automatic indices try to use a column with an unknown collating function. Ticket [77aa3b1e6592582e38605d36]. This check-in also removes some stray \r characters unrelated to the problem. (check-in: f01030a0df user: drh tags: trunk)
00:08
Refactor the cost function in the query planner. Give extra cost (thus reduce likelihood of selection) to full table scans. (check-in: 878da276eb user: drh tags: trunk)
2011-02-09
19:55
Make sure code *compiles* with each OMIT and ENABLE option. Mostly changes to test modules. (check-in: 7cc515edc9 user: shaneh tags: trunk)
Changes
Unified Diff Show Whitespace Changes Patch
configure became executable.

whitespace changes only

install-sh became executable.

whitespace changes only

Changes to src/where.c.
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
** rows.  Indices are selected and used to speed the search when doing
** so is applicable.  Because this module is responsible for selecting
** indices, you might also think of this module as the "query optimizer".
*/
#include "sqliteInt.h"


/*
** The following parameter define the relative cost of various
** search operations.  These parameters are used to estimate query
** plan costs and to select the query plan with the lowest estimated
** cost.
**
** Let the cost of moving from one row in a table or index to the next
** or previous row be SEEK_COST.
**
** Let the base-10 logarithm of the number of rows in a table or
** index be L.  The estLog() function below will estimate this
** numbmer given the number of rows in the table.
**
** The cost of doing a lookup of an index will be IDX_LKUP_COST*L.
**
** The cost of doing a lookup on a table is TBL_LKUP_COST*L.
**
** The cost of sorting a result set of N rows is assumed to be
** N*log10(N)*SORT_COST.
*/
#if defined(SEEK_COST)
  /* Assume that IDX_LKUP_COST, TBL_LKUP_COST, and SORT_COST are also defined */
#elif !defined(SQLITE_OMIT_FLOATING_POINT)
#  define SEEK_COST     1.0
#  define IDX_LKUP_COST 1.0
#  define TBL_LKUP_COST 0.1
#  define SORT_COST     3.0
#else
#  define SEEK_COST     10
#  define IDX_LKUP_COST 10
#  define TBL_LKUP_COST 1
#  define SORT_COST     30
#endif

/*
** Trace output macros
*/
#if defined(SQLITE_TEST) || defined(SQLITE_DEBUG)
int sqlite3WhereTrace = 0;
#endif
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







15
16
17
18
19
20
21


































22
23
24
25
26
27
28
** rows.  Indices are selected and used to speed the search when doing
** so is applicable.  Because this module is responsible for selecting
** indices, you might also think of this module as the "query optimizer".
*/
#include "sqliteInt.h"




































/*
** Trace output macros
*/
#if defined(SQLITE_TEST) || defined(SQLITE_DEBUG)
int sqlite3WhereTrace = 0;
#endif
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
2767
2768
2769
2770
2771
2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782

  /* Loop over all indices looking for the best one to use
  */
  for(; pProbe; pIdx=pProbe=pProbe->pNext){
    const unsigned int * const aiRowEst = pProbe->aiRowEst;
    double cost;                /* Cost of using pProbe */
    double nRow;                /* Estimated number of rows in result set */
    double nTabSrch;            /* Est number of table searches */
    double nIdxSrch;            /* Est number of index searches */
    int rev;                    /* True to scan in reverse order */
    int wsFlags = 0;
    Bitmask used = 0;

    /* The following variables are populated based on the properties of
    ** index being evaluated. They are then used to determine the expected
    ** cost and number of rows returned.







|
<







2733
2734
2735
2736
2737
2738
2739
2740

2741
2742
2743
2744
2745
2746
2747

  /* Loop over all indices looking for the best one to use
  */
  for(; pProbe; pIdx=pProbe=pProbe->pNext){
    const unsigned int * const aiRowEst = pProbe->aiRowEst;
    double cost;                /* Cost of using pProbe */
    double nRow;                /* Estimated number of rows in result set */
    double log10N;              /* base-10 logarithm of nRow (inexact) */

    int rev;                    /* True to scan in reverse order */
    int wsFlags = 0;
    Bitmask used = 0;

    /* The following variables are populated based on the properties of
    ** index being evaluated. They are then used to determine the expected
    ** cost and number of rows returned.
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978












2979
2980
2981
2982
2983

2984



2985




2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
3011
3012
3013

3014


3015
3016
3017
3018
3019
3020
3021
3022
3023
3024
3025
3026
        whereEqualScanEst(pParse, pProbe, pFirstTerm->pExpr->pRight, &nRow);
      }else if( pFirstTerm->eOperator==WO_IN && bInEst==0 ){
        whereInScanEst(pParse, pProbe, pFirstTerm->pExpr->x.pList, &nRow);
      }
    }
#endif /* SQLITE_ENABLE_STAT2 */

    /* Adjust the number of rows and the cost downward to reflect rows
    ** that are excluded by range constraints.
    */
    nRow = (nRow * (double)estBound) / (double)100;
    if( nRow<1 ) nRow = 1;

    /* Assume constant cost to advance from one row to the next and
    ** logarithmic cost to do a binary search.  Hence, the initial cost
    ** is the number of output rows plus log2(table-size) times the












    ** number of binary searches.
    **
    ** Because fan-out on tables is so much higher than the fan-out on
    ** indices (because table btrees contain only integer keys in non-leaf
    ** nodes) we weight the cost of a table binary search as 1/10th the

    ** cost of an index binary search.



    */




    if( pIdx ){
      if( bLookup ){
        /* For an index lookup followed by a table lookup:
        **    nInMul index searches to find the start of each index range
        **  + nRow steps through the index
        **  + nRow table searches to lookup the table entry using the rowid
        */
        nIdxSrch = nInMul;
        nTabSrch = nRow;
      }else{
        /* For a covering index:
        **     nInMul index searches to find the initial entry 
        **   + nRow steps through the index
        */
        nIdxSrch = nInMul;
        nTabSrch = 0;
      }
    }else{
      /* For a rowid primary key lookup:
      **    nInMult table searches to find the initial entry for each range
      **  + nRow steps through the table
      */
      nIdxSrch = 0;
      nTabSrch = nInMul;
    }
    cost = nRow + (nIdxSrch*IDX_LKUP_COST + nTabSrch*TBL_LKUP_COST)
                      *estLog(aiRowEst[0])/SEEK_COST;


    /* Add in the estimated cost of sorting the result.  This cost is expanded


    ** by a fudge factor of 3.0 to account for the fact that a sorting step 
    ** involves a write and is thus more expensive than a lookup step.
    */
    if( bSort ){
      cost += nRow*estLog(nRow)*SORT_COST/SEEK_COST;
    }

    /**** Cost of using this index has now been computed ****/

    /* If there are additional constraints on this table that cannot
    ** be used with the current index, but which might lower the number
    ** of output rows, adjust the nRow value accordingly.  This only 







|





|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
|

<
<
<
>
|
>
>
>

>
>
>
>







|
<





|
<






<
|

<
<
|
>
|
>
>
|
|


|







2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957



2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975

2976
2977
2978
2979
2980
2981

2982
2983
2984
2985
2986
2987

2988
2989


2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
        whereEqualScanEst(pParse, pProbe, pFirstTerm->pExpr->pRight, &nRow);
      }else if( pFirstTerm->eOperator==WO_IN && bInEst==0 ){
        whereInScanEst(pParse, pProbe, pFirstTerm->pExpr->x.pList, &nRow);
      }
    }
#endif /* SQLITE_ENABLE_STAT2 */

    /* Adjust the number of output rows and downward to reflect rows
    ** that are excluded by range constraints.
    */
    nRow = (nRow * (double)estBound) / (double)100;
    if( nRow<1 ) nRow = 1;

    /* Experiments run on real SQLite databases show that the time needed
    ** to do a binary search to locate a row in a table or index is roughly
    ** log10(N) times the time to move from one row to the next row within
    ** a table or index.  The actual times can vary, with the size of
    ** records being an important factor.  Both moves and searches are
    ** slower with larger records, presumably because fewer records fit
    ** on one page and hence more pages have to be fetched.
    **
    ** The ANALYZE command and the sqlite_stat1 and sqlite_stat2 tables do
    ** not give us data on the relative sizes of table and index records.
    ** So this computation assumes table records are about twice as big
    ** as index records
    */
    if( (wsFlags & WHERE_NOT_FULLSCAN)==0 ){
      /* The cost of a full table scan is a number of move operations equal
      ** to the number of rows in the table.
    **



      ** We add an additional 4x penalty to full table scans.  This causes
      ** the cost function to err on the side of choosing an index over
      ** choosing a full scan.  This 4x full-scan penalty is an arguable
      ** decision and one which we expect to revisit in the future.  But
      ** it seems to be working well enough at the moment.
    */
      cost = aiRowEst[0]*4;
    }else{
      log10N = estLog(aiRowEst[0]);
      cost = nRow;
    if( pIdx ){
      if( bLookup ){
        /* For an index lookup followed by a table lookup:
        **    nInMul index searches to find the start of each index range
        **  + nRow steps through the index
        **  + nRow table searches to lookup the table entry using the rowid
        */
          cost += (nInMul + nRow)*log10N;

      }else{
        /* For a covering index:
        **     nInMul index searches to find the initial entry 
        **   + nRow steps through the index
        */
          cost += nInMul*log10N;

      }
    }else{
      /* For a rowid primary key lookup:
      **    nInMult table searches to find the initial entry for each range
      **  + nRow steps through the table
      */

        cost += nInMul*log10N;
    }


    }

    /* Add in the estimated cost of sorting the result.  Actual experimental
    ** measurements of sorting performance in SQLite show that sorting time
    ** adds C*N*log10(N) to the cost, where N is the number of rows to be 
    ** sorted and C is a factor between 1.95 and 4.3.  We will split the
    ** difference and select C of 3.0.
    */
    if( bSort ){
      cost += nRow*estLog(nRow)*3;
    }

    /**** Cost of using this index has now been computed ****/

    /* If there are additional constraints on this table that cannot
    ** be used with the current index, but which might lower the number
    ** of output rows, adjust the nRow value accordingly.  This only 
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
      }
      if( nRow<2 ) nRow = 2;
    }


    WHERETRACE((
      "%s(%s): nEq=%d nInMul=%d estBound=%d bSort=%d bLookup=%d wsFlags=0x%x\n"
      "         notReady=0x%llx nTSrch=%.1f nISrch=%.1f nRow=%.1f\n"
      "         estLog=%.1f cost=%.1f used=0x%llx\n",
      pSrc->pTab->zName, (pIdx ? pIdx->zName : "ipk"), 
      nEq, nInMul, estBound, bSort, bLookup, wsFlags,
      notReady, nTabSrch, nIdxSrch, nRow, estLog(aiRowEst[0]), cost, used
    ));

    /* If this index is the best we have seen so far, then record this
    ** index and its cost in the pCost structure.
    */
    if( (!pIdx || wsFlags)
     && (cost<pCost->rCost || (cost<=pCost->rCost && nRow<pCost->plan.nRow))







<
|


|







3058
3059
3060
3061
3062
3063
3064

3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
      }
      if( nRow<2 ) nRow = 2;
    }


    WHERETRACE((
      "%s(%s): nEq=%d nInMul=%d estBound=%d bSort=%d bLookup=%d wsFlags=0x%x\n"

      "         notReady=0x%llx log10N=%.1f nRow=%.1f cost=%.1f used=0x%llx\n",
      pSrc->pTab->zName, (pIdx ? pIdx->zName : "ipk"), 
      nEq, nInMul, estBound, bSort, bLookup, wsFlags,
      notReady, log10N, nRow, cost, used
    ));

    /* If this index is the best we have seen so far, then record this
    ** index and its cost in the pCost structure.
    */
    if( (!pIdx || wsFlags)
     && (cost<pCost->rCost || (cost<=pCost->rCost && nRow<pCost->plan.nRow))
Changes to test/analyze5.test.
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
   14  {z>3 AND z<100}       t1z   50
   15  {z>=4 AND z<100}      t1z   50
   16  {z>=-100 AND z<=-1}   t1z   50
   17  {z>=-100 AND z<=0}    t1z  400
   18  {z>=-100 AND z<0}     t1z   50
   19  {z>=-100 AND z<=1}    t1z  700
   20  {z>=-100 AND z<2}     t1z  700
   21  {z>=-100 AND z<=2}    {}   111
   22  {z>=-100 AND z<3}     {}   111
  
   31  {z>=0.0 AND z<=0.0}   t1z  400
   32  {z>=1.0 AND z<=1.0}   t1z  300
   33  {z>=2.0 AND z<=2.0}   t1z  200
   34  {z>=3.0 AND z<=3.0}   t1z  100
   35  {z>=4.0 AND z<=4.0}   t1z   50
   36  {z>=-1.0 AND z<=-1.0} t1z   50







|
|







98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
   14  {z>3 AND z<100}       t1z   50
   15  {z>=4 AND z<100}      t1z   50
   16  {z>=-100 AND z<=-1}   t1z   50
   17  {z>=-100 AND z<=0}    t1z  400
   18  {z>=-100 AND z<0}     t1z   50
   19  {z>=-100 AND z<=1}    t1z  700
   20  {z>=-100 AND z<2}     t1z  700
   21  {z>=-100 AND z<=2}    t1z  900
   22  {z>=-100 AND z<3}     t1z  900
  
   31  {z>=0.0 AND z<=0.0}   t1z  400
   32  {z>=1.0 AND z<=1.0}   t1z  300
   33  {z>=2.0 AND z<=2.0}   t1z  200
   34  {z>=3.0 AND z<=3.0}   t1z  100
   35  {z>=4.0 AND z<=4.0}   t1z   50
   36  {z>=-1.0 AND z<=-1.0} t1z   50
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
   44  {z>3.2 AND z<100}     t1z   50
   45  {z>=4.0 AND z<100}    t1z   50
   46  {z>=-100 AND z<=-1.0} t1z   50
   47  {z>=-100 AND z<=0.0}  t1z  400
   48  {z>=-100 AND z<0.0}   t1z   50
   49  {z>=-100 AND z<=1.0}  t1z  700
   50  {z>=-100 AND z<2.0}   t1z  700
   51  {z>=-100 AND z<=2.0}  {}   111
   52  {z>=-100 AND z<3.0}   {}   111
  
  101  {z=-1}                t1z   50
  102  {z=0}                 t1z  400
  103  {z=1}                 t1z  300
  104  {z=2}                 t1z  200
  105  {z=3}                 t1z  100
  106  {z=4}                 t1z   50







|
|







121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
   44  {z>3.2 AND z<100}     t1z   50
   45  {z>=4.0 AND z<100}    t1z   50
   46  {z>=-100 AND z<=-1.0} t1z   50
   47  {z>=-100 AND z<=0.0}  t1z  400
   48  {z>=-100 AND z<0.0}   t1z   50
   49  {z>=-100 AND z<=1.0}  t1z  700
   50  {z>=-100 AND z<2.0}   t1z  700
   51  {z>=-100 AND z<=2.0}  t1z  900
   52  {z>=-100 AND z<3.0}   t1z  900
  
  101  {z=-1}                t1z   50
  102  {z=0}                 t1z  400
  103  {z=1}                 t1z  300
  104  {z=2}                 t1z  200
  105  {z=3}                 t1z  100
  106  {z=4}                 t1z   50
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  202  {z IN (0)}            t1z  400
  203  {z IN (1)}            t1z  300
  204  {z IN (2)}            t1z  200
  205  {z IN (3)}            t1z  100
  206  {z IN (4)}            t1z   50
  207  {z IN (0.5)}          t1z   50
  208  {z IN (0,1)}          t1z  700
  209  {z IN (0,1,2)}        {}   100
  210  {z IN (0,1,2,3)}      {}   100
  211  {z IN (0,1,2,3,4,5)}  {}   100
  212  {z IN (1,2)}          t1z  500
  213  {z IN (2,3)}          t1z  300
  214  {z=3 OR z=2}          t1z  300
  215  {z IN (-1,3)}         t1z  150
  216  {z=-1 OR z=3}         t1z  150







|







147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  202  {z IN (0)}            t1z  400
  203  {z IN (1)}            t1z  300
  204  {z IN (2)}            t1z  200
  205  {z IN (3)}            t1z  100
  206  {z IN (4)}            t1z   50
  207  {z IN (0.5)}          t1z   50
  208  {z IN (0,1)}          t1z  700
  209  {z IN (0,1,2)}        t1z  900
  210  {z IN (0,1,2,3)}      {}   100
  211  {z IN (0,1,2,3,4,5)}  {}   100
  212  {z IN (1,2)}          t1z  500
  213  {z IN (2,3)}          t1z  300
  214  {z=3 OR z=2}          t1z  300
  215  {z IN (-1,3)}         t1z  150
  216  {z=-1 OR z=3}         t1z  150