SQLite

Check-in [2d36be5d9a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change the query planner to do a better job of estimating the number rows selected by a BETWEEN operator using STAT4 when both upper and lower bounds are contained within the same sample.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 2d36be5d9a1cdd4fd2d54fc4eeece32a81cbacc1
User & Date: drh 2014-11-05 19:26:12.741
Context
2014-11-05
21:21
Fix harmless compiler warnings in the new balance_nonroot() routine. (check-in: 83a1e5db92 user: drh tags: trunk)
19:26
Change the query planner to do a better job of estimating the number rows selected by a BETWEEN operator using STAT4 when both upper and lower bounds are contained within the same sample. (check-in: 2d36be5d9a user: drh tags: trunk)
15:57
Make sure that NULL results from OP_Column are fully and completely NULL and do not have the MEM_Ephem bit set. Fix for ticket [094d39a4c95ee4]. (check-in: 42705fd7d8 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/where.c.
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909

1910
1911
1912
1913
1914
1915
1916
1917
1918
    }
  }

  return pParse->nErr;
}
#endif /* !defined(SQLITE_OMIT_VIRTUALTABLE) */


#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
/*
** Estimate the location of a particular key among all keys in an
** index.  Store the results in aStat as follows:
**
**    aStat[0]      Est. number of rows less than pVal
**    aStat[1]      Est. number of rows equal to pVal
**
** Return SQLITE_OK on success.

*/
static void whereKeyStats(
  Parse *pParse,              /* Database connection */
  Index *pIdx,                /* Index to consider domain of */
  UnpackedRecord *pRec,       /* Vector of values to consider */
  int roundUp,                /* Round up if true.  Round down if false */
  tRowcnt *aStat              /* OUT: stats written here */
){
  IndexSample *aSample = pIdx->aSample;







<








|
>

|







1893
1894
1895
1896
1897
1898
1899

1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
    }
  }

  return pParse->nErr;
}
#endif /* !defined(SQLITE_OMIT_VIRTUALTABLE) */


#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
/*
** Estimate the location of a particular key among all keys in an
** index.  Store the results in aStat as follows:
**
**    aStat[0]      Est. number of rows less than pVal
**    aStat[1]      Est. number of rows equal to pVal
**
** Return the index of the sample that is the smallest sample that
** is greater than or equal to pRec.
*/
static int whereKeyStats(
  Parse *pParse,              /* Database connection */
  Index *pIdx,                /* Index to consider domain of */
  UnpackedRecord *pRec,       /* Vector of values to consider */
  int roundUp,                /* Round up if true.  Round down if false */
  tRowcnt *aStat              /* OUT: stats written here */
){
  IndexSample *aSample = pIdx->aSample;
1986
1987
1988
1989
1990
1991
1992

1993
1994
1995
1996
1997
1998
1999
    if( roundUp ){
      iGap = (iGap*2)/3;
    }else{
      iGap = iGap/3;
    }
    aStat[0] = iLower + iGap;
  }

}
#endif /* SQLITE_ENABLE_STAT3_OR_STAT4 */

/*
** If it is not NULL, pTerm is a term that provides an upper or lower
** bound on a range scan. Without considering pTerm, it is estimated 
** that the scan will visit nNew rows. This function returns the number







>







1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
    if( roundUp ){
      iGap = (iGap*2)/3;
    }else{
      iGap = iGap/3;
    }
    aStat[0] = iLower + iGap;
  }
  return i;
}
#endif /* SQLITE_ENABLE_STAT3_OR_STAT4 */

/*
** If it is not NULL, pTerm is a term that provides an upper or lower
** bound on a range scan. Without considering pTerm, it is estimated 
** that the scan will visit nNew rows. This function returns the number
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
**                    |_____|   |_____|
**                       |         |
**                     pLower    pUpper
**
** If either of the upper or lower bound is not present, then NULL is passed in
** place of the corresponding WhereTerm.
**
** The value in (pBuilder->pNew->u.btree.nEq) is the index of the index
** column subject to the range constraint. Or, equivalently, the number of
** equality constraints optimized by the proposed index scan. For example,
** assuming index p is on t1(a, b), and the SQL query is:
**
**   ... FROM t1 WHERE a = ? AND b > ? AND b < ? ...
**
** then nEq is set to 1 (as the range restricted column, b, is the second 
** left-most column of the index). Or, if the query is:
**
**   ... FROM t1 WHERE a > ? AND a < ? ...
**
** then nEq is set to 0.
**
** When this function is called, *pnOut is set to the sqlite3LogEst() of the
** number of rows that the index scan is expected to visit without 
** considering the range constraints. If nEq is 0, this is the number of 
** rows in the index. Assuming no error occurs, *pnOut is adjusted (reduced)
** to account for the range constraints pLower and pUpper.
** 
** In the absence of sqlite_stat4 ANALYZE data, or if such data cannot be
** used, a single range inequality reduces the search space by a factor of 4. 
** and a pair of constraints (x>? AND x<?) reduces the expected number of
** rows visited by a factor of 64.







|















|







2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
**                    |_____|   |_____|
**                       |         |
**                     pLower    pUpper
**
** If either of the upper or lower bound is not present, then NULL is passed in
** place of the corresponding WhereTerm.
**
** The value in (pBuilder->pNew->u.btree.nEq) is the number of the index
** column subject to the range constraint. Or, equivalently, the number of
** equality constraints optimized by the proposed index scan. For example,
** assuming index p is on t1(a, b), and the SQL query is:
**
**   ... FROM t1 WHERE a = ? AND b > ? AND b < ? ...
**
** then nEq is set to 1 (as the range restricted column, b, is the second 
** left-most column of the index). Or, if the query is:
**
**   ... FROM t1 WHERE a > ? AND a < ? ...
**
** then nEq is set to 0.
**
** When this function is called, *pnOut is set to the sqlite3LogEst() of the
** number of rows that the index scan is expected to visit without 
** considering the range constraints. If nEq is 0, then *pnOut is the number of 
** rows in the index. Assuming no error occurs, *pnOut is adjusted (reduced)
** to account for the range constraints pLower and pUpper.
** 
** In the absence of sqlite_stat4 ANALYZE data, or if such data cannot be
** used, a single range inequality reduces the search space by a factor of 4. 
** and a pair of constraints (x>? AND x<?) reduces the expected number of
** rows visited by a factor of 64.
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206


2207
2208
2209


2210
2211
2212
2213
2214
2215
2216
  int nOut = pLoop->nOut;
  LogEst nNew;

#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  Index *p = pLoop->u.btree.pIndex;
  int nEq = pLoop->u.btree.nEq;

  if( p->nSample>0
   && nEq<p->nSampleCol
  ){
    if( nEq==pBuilder->nRecValid ){
      UnpackedRecord *pRec = pBuilder->pRec;
      tRowcnt a[2];
      u8 aff;

      /* Variable iLower will be set to the estimate of the number of rows in 
      ** the index that are less than the lower bound of the range query. The
      ** lower bound being the concatenation of $P and $L, where $P is the
      ** key-prefix formed by the nEq values matched against the nEq left-most
      ** columns of the index, and $L is the value in pLower.
      **
      ** Or, if pLower is NULL or $L cannot be extracted from it (because it
      ** is not a simple variable or literal value), the lower bound of the
      ** range is $P. Due to a quirk in the way whereKeyStats() works, even
      ** if $L is available, whereKeyStats() is called for both ($P) and 
      ** ($P:$L) and the larger of the two returned values used.
      **
      ** Similarly, iUpper is to be set to the estimate of the number of rows
      ** less than the upper bound of the range query. Where the upper bound
      ** is either ($P) or ($P:$U). Again, even if $U is available, both values
      ** of iUpper are requested of whereKeyStats() and the smaller used.


      */
      tRowcnt iLower;
      tRowcnt iUpper;



      if( pRec ){
        testcase( pRec->nField!=pBuilder->nRecValid );
        pRec->nField = pBuilder->nRecValid;
      }
      if( nEq==p->nKeyCol ){
        aff = SQLITE_AFF_INTEGER;







|
<
<















|





>
>

|
|
>
>







2177
2178
2179
2180
2181
2182
2183
2184


2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
  int nOut = pLoop->nOut;
  LogEst nNew;

#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  Index *p = pLoop->u.btree.pIndex;
  int nEq = pLoop->u.btree.nEq;

  if( p->nSample>0 && nEq<p->nSampleCol ){


    if( nEq==pBuilder->nRecValid ){
      UnpackedRecord *pRec = pBuilder->pRec;
      tRowcnt a[2];
      u8 aff;

      /* Variable iLower will be set to the estimate of the number of rows in 
      ** the index that are less than the lower bound of the range query. The
      ** lower bound being the concatenation of $P and $L, where $P is the
      ** key-prefix formed by the nEq values matched against the nEq left-most
      ** columns of the index, and $L is the value in pLower.
      **
      ** Or, if pLower is NULL or $L cannot be extracted from it (because it
      ** is not a simple variable or literal value), the lower bound of the
      ** range is $P. Due to a quirk in the way whereKeyStats() works, even
      ** if $L is available, whereKeyStats() is called for both ($P) and 
      ** ($P:$L) and the larger of the two returned values is used.
      **
      ** Similarly, iUpper is to be set to the estimate of the number of rows
      ** less than the upper bound of the range query. Where the upper bound
      ** is either ($P) or ($P:$U). Again, even if $U is available, both values
      ** of iUpper are requested of whereKeyStats() and the smaller used.
      **
      ** The number of rows between the two bounds is then just iUpper-iLower.
      */
      tRowcnt iLower;     /* Rows less than the lower bound */
      tRowcnt iUpper;     /* Rows less than the upper bound */
      int iLwrIdx = -2;   /* aSample[] for the lower bound */
      int iUprIdx = -1;   /* aSample[] for the upper bound */

      if( pRec ){
        testcase( pRec->nField!=pBuilder->nRecValid );
        pRec->nField = pBuilder->nRecValid;
      }
      if( nEq==p->nKeyCol ){
        aff = SQLITE_AFF_INTEGER;
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273





2274
2275
2276
2277
2278
2279
2280
      /* If possible, improve on the iLower estimate using ($P:$L). */
      if( pLower ){
        int bOk;                    /* True if value is extracted from pExpr */
        Expr *pExpr = pLower->pExpr->pRight;
        rc = sqlite3Stat4ProbeSetValue(pParse, p, &pRec, pExpr, aff, nEq, &bOk);
        if( rc==SQLITE_OK && bOk ){
          tRowcnt iNew;
          whereKeyStats(pParse, p, pRec, 0, a);
          iNew = a[0] + ((pLower->eOperator & (WO_GT|WO_LE)) ? a[1] : 0);
          if( iNew>iLower ) iLower = iNew;
          nOut--;
          pLower = 0;
        }
      }

      /* If possible, improve on the iUpper estimate using ($P:$U). */
      if( pUpper ){
        int bOk;                    /* True if value is extracted from pExpr */
        Expr *pExpr = pUpper->pExpr->pRight;
        rc = sqlite3Stat4ProbeSetValue(pParse, p, &pRec, pExpr, aff, nEq, &bOk);
        if( rc==SQLITE_OK && bOk ){
          tRowcnt iNew;
          whereKeyStats(pParse, p, pRec, 1, a);
          iNew = a[0] + ((pUpper->eOperator & (WO_GT|WO_LE)) ? a[1] : 0);
          if( iNew<iUpper ) iUpper = iNew;
          nOut--;
          pUpper = 0;
        }
      }

      pBuilder->pRec = pRec;
      if( rc==SQLITE_OK ){
        if( iUpper>iLower ){
          nNew = sqlite3LogEst(iUpper - iLower);





        }else{
          nNew = 10;        assert( 10==sqlite3LogEst(2) );
        }
        if( nNew<nOut ){
          nOut = nNew;
        }
        WHERETRACE(0x10, ("STAT4 range scan: %u..%u  est=%d\n",







|














|











>
>
>
>
>







2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
      /* If possible, improve on the iLower estimate using ($P:$L). */
      if( pLower ){
        int bOk;                    /* True if value is extracted from pExpr */
        Expr *pExpr = pLower->pExpr->pRight;
        rc = sqlite3Stat4ProbeSetValue(pParse, p, &pRec, pExpr, aff, nEq, &bOk);
        if( rc==SQLITE_OK && bOk ){
          tRowcnt iNew;
          iLwrIdx = whereKeyStats(pParse, p, pRec, 0, a);
          iNew = a[0] + ((pLower->eOperator & (WO_GT|WO_LE)) ? a[1] : 0);
          if( iNew>iLower ) iLower = iNew;
          nOut--;
          pLower = 0;
        }
      }

      /* If possible, improve on the iUpper estimate using ($P:$U). */
      if( pUpper ){
        int bOk;                    /* True if value is extracted from pExpr */
        Expr *pExpr = pUpper->pExpr->pRight;
        rc = sqlite3Stat4ProbeSetValue(pParse, p, &pRec, pExpr, aff, nEq, &bOk);
        if( rc==SQLITE_OK && bOk ){
          tRowcnt iNew;
          iUprIdx = whereKeyStats(pParse, p, pRec, 1, a);
          iNew = a[0] + ((pUpper->eOperator & (WO_GT|WO_LE)) ? a[1] : 0);
          if( iNew<iUpper ) iUpper = iNew;
          nOut--;
          pUpper = 0;
        }
      }

      pBuilder->pRec = pRec;
      if( rc==SQLITE_OK ){
        if( iUpper>iLower ){
          nNew = sqlite3LogEst(iUpper - iLower);
          /* TUNING:  If both iUpper and iLower are derived from the same
          ** sample, then assume they are 4x more selective.  This brings
          ** the estimated selectivity more in line with what it would be
          ** if estimated without the use of STAT3/4 tables. */
          if( iLwrIdx==iUprIdx ) nNew -= 20;  assert( 20==sqlite3LogEst(4) );
        }else{
          nNew = 10;        assert( 10==sqlite3LogEst(2) );
        }
        if( nNew<nOut ){
          nOut = nNew;
        }
        WHERETRACE(0x10, ("STAT4 range scan: %u..%u  est=%d\n",
Changes to test/analyze8.test.
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1b (b>? AND b<?)}}

# There are many more values of c between 0 and 100000 than there are
# between 800000 and 900000.  So t1c is more selective for the latter
# range.
# 
# Test 3.2 is a little unstable. It depends on the planner estimating
# that (b BETWEEN 50 AND 54) will match more rows than (c BETWEEN
# 800000 AND 900000). Which is a pretty close call (50 vs. 32), so
# the planner could get it wrong with an unlucky set of samples. This
# case happens to work, but others ("b BETWEEN 40 AND 44" for example) 
# will fail.
#
do_execsql_test 3.0 {
  SELECT count(*) FROM t1 WHERE b BETWEEN 50 AND 54;
  SELECT count(*) FROM t1 WHERE c BETWEEN 0 AND 100000;
  SELECT count(*) FROM t1 WHERE c BETWEEN 800000 AND 900000;
} {50 376 32}
do_test 3.1 {
  eqp {SELECT * FROM t1 WHERE b BETWEEN 50 AND 54 AND c BETWEEN 0 AND 100000}
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1b (b>? AND b<?)}}
do_test 3.2 {
  eqp {SELECT * FROM t1
       WHERE b BETWEEN 50 AND 54 AND c BETWEEN 800000 AND 900000}
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1c (c>? AND c<?)}}
do_test 3.3 {
  eqp {SELECT * FROM t1 WHERE a=100 AND c BETWEEN 0 AND 100000}
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1a (a=?)}}
do_test 3.4 {
  eqp {SELECT * FROM t1
       WHERE a=100 AND c BETWEEN 800000 AND 900000}







|






|




|



|







82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1b (b>? AND b<?)}}

# There are many more values of c between 0 and 100000 than there are
# between 800000 and 900000.  So t1c is more selective for the latter
# range.
# 
# Test 3.2 is a little unstable. It depends on the planner estimating
# that (b BETWEEN 30 AND 34) will match more rows than (c BETWEEN
# 800000 AND 900000). Which is a pretty close call (50 vs. 32), so
# the planner could get it wrong with an unlucky set of samples. This
# case happens to work, but others ("b BETWEEN 40 AND 44" for example) 
# will fail.
#
do_execsql_test 3.0 {
  SELECT count(*) FROM t1 WHERE b BETWEEN 30 AND 34;
  SELECT count(*) FROM t1 WHERE c BETWEEN 0 AND 100000;
  SELECT count(*) FROM t1 WHERE c BETWEEN 800000 AND 900000;
} {50 376 32}
do_test 3.1 {
  eqp {SELECT * FROM t1 WHERE b BETWEEN 30 AND 34 AND c BETWEEN 0 AND 100000}
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1b (b>? AND b<?)}}
do_test 3.2 {
  eqp {SELECT * FROM t1
       WHERE b BETWEEN 30 AND 34 AND c BETWEEN 800000 AND 900000}
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1c (c>? AND c<?)}}
do_test 3.3 {
  eqp {SELECT * FROM t1 WHERE a=100 AND c BETWEEN 0 AND 100000}
} {0 0 0 {SEARCH TABLE t1 USING INDEX t1a (a=?)}}
do_test 3.4 {
  eqp {SELECT * FROM t1
       WHERE a=100 AND c BETWEEN 800000 AND 900000}