/ Check-in [3e0590de]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:When estimating the number of rows visited by a range scan for which the keys consist of more than one field, consider prefixes of stat4 samples as well as the full samples. This generates more accurate estimates.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 3e0590dee0e68cc1599858757c650a7378026170
User & Date: dan 2015-03-16 16:28:43
References
2017-05-24
04:32
Fix a problem in STAT4 equality estimation for multi-column indexes introduced by check-in [3e0590dee0e68cc1599]. check-in: 19dad0a7 user: drh tags: branch-3.8.9
04:18
Fix a problem in STAT4 equality estimation for multi-column indexes introduced by check-in [3e0590dee0e68cc1599]. check-in: cfb0d9e0 user: drh tags: trunk
Context
2015-03-16
16:44
When deleting the master journal to commit a multi-database transaction, do not sync the directory if PRAGMA synchronous=OFF for all participating database files. check-in: 018d7671 user: drh tags: trunk
16:28
When estimating the number of rows visited by a range scan for which the keys consist of more than one field, consider prefixes of stat4 samples as well as the full samples. This generates more accurate estimates. check-in: 3e0590de user: dan tags: trunk
13:48
Use #ifdef to omit code that is only used for STAT3 and STAT4. check-in: f2c9c5b5 user: drh tags: trunk
09:21
Another test case for the planner change on this branch. Closed-Leaf check-in: f2207a06 user: dan tags: stat4-change
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/where.c.

1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938



1939
1940
1941
1942
1943
1944
1945
1946
1947
1948


1949
1950
1951
1952


1953
1954
1955
1956
1957
1958
1959
1960
















































1961



1962













1963
1964

1965




1966
1967

1968
1969

1970
1971
1972
1973
1974

1975
1976
1977


1978
1979

1980

1981
1982



1983
1984
1985
1986












1987
1988
1989


1990
1991
1992
1993
1994
1995
1996


1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020

2021



2022
2023
2024
2025
2026
2027
2028
#endif /* !defined(SQLITE_OMIT_VIRTUALTABLE) */

#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
/*
** Estimate the location of a particular key among all keys in an
** index.  Store the results in aStat as follows:
**
**    aStat[0]      Est. number of rows less than pVal
**    aStat[1]      Est. number of rows equal to pVal
**
** Return the index of the sample that is the smallest sample that
** is greater than or equal to pRec.



*/
static int whereKeyStats(
  Parse *pParse,              /* Database connection */
  Index *pIdx,                /* Index to consider domain of */
  UnpackedRecord *pRec,       /* Vector of values to consider */
  int roundUp,                /* Round up if true.  Round down if false */
  tRowcnt *aStat              /* OUT: stats written here */
){
  IndexSample *aSample = pIdx->aSample;
  int iCol;                   /* Index of required stats in anEq[] etc. */


  int iMin = 0;               /* Smallest sample not yet tested */
  int i = pIdx->nSample;      /* Smallest sample larger than or equal to pRec */
  int iTest;                  /* Next sample to test */
  int res;                    /* Result of comparison operation */



#ifndef SQLITE_DEBUG
  UNUSED_PARAMETER( pParse );
#endif
  assert( pRec!=0 );
  iCol = pRec->nField - 1;
  assert( pIdx->nSample>0 );
  assert( pRec->nField>0 && iCol<pIdx->nSampleCol );
















































  do{



    iTest = (iMin+i)/2;













    res = sqlite3VdbeRecordCompare(aSample[iTest].n, aSample[iTest].p, pRec);
    if( res<0 ){

      iMin = iTest+1;




    }else{
      i = iTest;

    }
  }while( res && iMin<i );


#ifdef SQLITE_DEBUG
  /* The following assert statements check that the binary search code
  ** above found the right answer. This block serves no purpose other
  ** than to invoke the asserts.  */

  if( res==0 ){
    /* If (res==0) is true, then sample $i must be equal to pRec */
    assert( i<pIdx->nSample );


    assert( 0==sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)
         || pParse->db->mallocFailed );

  }else{

    /* Otherwise, pRec must be smaller than sample $i and larger than
    ** sample ($i-1).  */



    assert( i==pIdx->nSample 
         || sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)>0
         || pParse->db->mallocFailed );
    assert( i==0












         || sqlite3VdbeRecordCompare(aSample[i-1].n, aSample[i-1].p, pRec)<0
         || pParse->db->mallocFailed );
  }


#endif /* ifdef SQLITE_DEBUG */

  /* At this point, aSample[i] is the first sample that is greater than
  ** or equal to pVal.  Or if i==pIdx->nSample, then all samples are less
  ** than pVal.  If aSample[i]==pVal, then res==0.
  */
  if( res==0 ){


    aStat[0] = aSample[i].anLt[iCol];
    aStat[1] = aSample[i].anEq[iCol];
  }else{
    tRowcnt iLower, iUpper, iGap;
    if( i==0 ){
      iLower = 0;
      iUpper = aSample[0].anLt[iCol];
    }else{
      i64 nRow0 = sqlite3LogEstToInt(pIdx->aiRowLogEst[0]);
      iUpper = i>=pIdx->nSample ? nRow0 : aSample[i].anLt[iCol];
      iLower = aSample[i-1].anEq[iCol] + aSample[i-1].anLt[iCol];
    }
    aStat[1] = pIdx->aAvgEq[iCol];
    if( iLower>=iUpper ){
      iGap = 0;
    }else{
      iGap = iUpper - iLower;
    }
    if( roundUp ){
      iGap = (iGap*2)/3;
    }else{
      iGap = iGap/3;
    }
    aStat[0] = iLower + iGap;

  }



  return i;
}
#endif /* SQLITE_ENABLE_STAT3_OR_STAT4 */

/*
** If it is not NULL, pTerm is a term that provides an upper or lower
** bound on a range scan. Without considering pTerm, it is estimated 







|
|


|
>
>
>










>
>

<


>
>





<

|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
|

>

>
>
>
>

|
>

|
>





>
|
|
|
>
>
|
|
>
|
>
|
<
>
>
>
|
|
|
<
>
>
>
>
>
>
>
>
>
>
>
>
|
|
|
>
>


<
<
<
<

>
>



|
|
|
|
|
|
|
|
|
|











>

>
>
>







1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954

1955
1956
1957
1958
1959
1960
1961
1962
1963

1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062

2063
2064
2065
2066
2067
2068

2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087




2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
#endif /* !defined(SQLITE_OMIT_VIRTUALTABLE) */

#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
/*
** Estimate the location of a particular key among all keys in an
** index.  Store the results in aStat as follows:
**
**    aStat[0]      Est. number of rows less than pRec
**    aStat[1]      Est. number of rows equal to pRec
**
** Return the index of the sample that is the smallest sample that
** is greater than or equal to pRec. Note that this index is not an index
** into the aSample[] array - it is an index into a virtual set of samples
** based on the contents of aSample[] and the number of fields in record 
** pRec. 
*/
static int whereKeyStats(
  Parse *pParse,              /* Database connection */
  Index *pIdx,                /* Index to consider domain of */
  UnpackedRecord *pRec,       /* Vector of values to consider */
  int roundUp,                /* Round up if true.  Round down if false */
  tRowcnt *aStat              /* OUT: stats written here */
){
  IndexSample *aSample = pIdx->aSample;
  int iCol;                   /* Index of required stats in anEq[] etc. */
  int i;                      /* Index of first sample >= pRec */
  int iSample;                /* Smallest sample larger than or equal to pRec */
  int iMin = 0;               /* Smallest sample not yet tested */

  int iTest;                  /* Next sample to test */
  int res;                    /* Result of comparison operation */
  int nField;                 /* Number of fields in pRec */
  tRowcnt iLower = 0;         /* anLt[] + anEq[] of largest sample pRec is > */

#ifndef SQLITE_DEBUG
  UNUSED_PARAMETER( pParse );
#endif
  assert( pRec!=0 );

  assert( pIdx->nSample>0 );
  assert( pRec->nField>0 && pRec->nField<=pIdx->nSampleCol );

  /* Do a binary search to find the first sample greater than or equal
  ** to pRec. If pRec contains a single field, the set of samples to search
  ** is simply the aSample[] array. If the samples in aSample[] contain more
  ** than one fields, all fields following the first are ignored.
  **
  ** If pRec contains N fields, where N is more than one, then as well as the
  ** samples in aSample[] (truncated to N fields), the search also has to
  ** consider prefixes of those samples. For example, if the set of samples
  ** in aSample is:
  **
  **     aSample[0] = (a, 5) 
  **     aSample[1] = (a, 10) 
  **     aSample[2] = (b, 5) 
  **     aSample[3] = (c, 100) 
  **     aSample[4] = (c, 105)
  **
  ** Then the search space should ideally be the samples above and the 
  ** unique prefixes [a], [b] and [c]. But since that is hard to organize, 
  ** the code actually searches this set:
  **
  **     0: (a) 
  **     1: (a, 5) 
  **     2: (a, 10) 
  **     3: (a, 10) 
  **     4: (b) 
  **     5: (b, 5) 
  **     6: (c) 
  **     7: (c, 100) 
  **     8: (c, 105)
  **     9: (c, 105)
  **
  ** For each sample in the aSample[] array, N samples are present in the
  ** effective sample array. In the above, samples 0 and 1 are based on 
  ** sample aSample[0]. Samples 2 and 3 on aSample[1] etc.
  **
  ** Often, sample i of each block of N effective samples has (i+1) fields.
  ** Except, each sample may be extended to ensure that it is greater than or
  ** equal to the previous sample in the array. For example, in the above, 
  ** sample 2 is the first sample of a block of N samples, so at first it 
  ** appears that it should be 1 field in size. However, that would make it 
  ** smaller than sample 1, so the binary search would not work. As a result, 
  ** it is extended to two fields. The duplicates that this creates do not 
  ** cause any problems.
  */
  nField = pRec->nField;
  iCol = 0;
  iSample = pIdx->nSample * nField;
  do{
    int iSamp;                    /* Index in aSample[] of test sample */
    int n;                        /* Number of fields in test sample */

    iTest = (iMin+iSample)/2;
    iSamp = iTest / nField;
    if( iSamp>0 ){
      /* The proposed effective sample is a prefix of sample aSample[iSamp].
      ** Specifically, the shortest prefix of at least (1 + iTest%nField) 
      ** fields that is greater than the previous effective sample.  */
      for(n=(iTest % nField) + 1; n<nField; n++){
        if( aSample[iSamp-1].anLt[n-1]!=aSample[iSamp].anLt[n-1] ) break;
      }
    }else{
      n = iTest + 1;
    }

    pRec->nField = n;
    res = sqlite3VdbeRecordCompare(aSample[iSamp].n, aSample[iSamp].p, pRec);
    if( res<0 ){
      iLower = aSample[iSamp].anLt[n-1] + aSample[iSamp].anEq[n-1];
      iMin = iTest+1;
    }else if( res==0 && n<nField ){
      iLower = aSample[iSamp].anLt[n-1];
      iMin = iTest+1;
      res = -1;
    }else{
      iSample = iTest;
      iCol = n-1;
    }
  }while( res && iMin<iSample );
  i = iSample / nField;

#ifdef SQLITE_DEBUG
  /* The following assert statements check that the binary search code
  ** above found the right answer. This block serves no purpose other
  ** than to invoke the asserts.  */
  if( pParse->db->mallocFailed==0 ){
    if( res==0 ){
      /* If (res==0) is true, then pRec must be equal to sample i. */
      assert( i<pIdx->nSample );
      assert( iCol==nField-1 );
      pRec->nField = nField;
      assert( 0==sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec) 
           || pParse->db->mallocFailed 
      );
    }else{
      /* Unless i==pIdx->nSample, indicating that pRec is larger than
      ** all samples in the aSample[] array, pRec must be smaller than the

      ** (iCol+1) field prefix of sample i.  */
      assert( i<=pIdx->nSample && i>=0 );
      pRec->nField = iCol+1;
      assert( i==pIdx->nSample 
           || sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)>0
           || pParse->db->mallocFailed );


      /* if i==0 and iCol==0, then record pRec is smaller than all samples
      ** in the aSample[] array. Otherwise, if (iCol>0) then pRec must
      ** be greater than or equal to the (iCol) field prefix of sample i.
      ** If (i>0), then pRec must also be greater than sample (i-1).  */
      if( iCol>0 ){
        pRec->nField = iCol;
        assert( sqlite3VdbeRecordCompare(aSample[i].n, aSample[i].p, pRec)<=0
             || pParse->db->mallocFailed );
      }
      if( i>0 ){
        pRec->nField = nField;
        assert( sqlite3VdbeRecordCompare(aSample[i-1].n, aSample[i-1].p, pRec)<0
             || pParse->db->mallocFailed );
      }
    }
  }
#endif /* ifdef SQLITE_DEBUG */





  if( res==0 ){
    /* Record pRec is equal to sample i */
    assert( iCol==nField-1 );
    aStat[0] = aSample[i].anLt[iCol];
    aStat[1] = aSample[i].anEq[iCol];
  }else{
    /* At this point, the (iCol+1) field prefix of aSample[i] is the first 
    ** sample that is greater than pRec. Or, if i==pIdx->nSample then pRec
    ** is larger than all samples in the array. */
    tRowcnt iUpper, iGap;
    if( i>=pIdx->nSample ){
      iUpper = sqlite3LogEstToInt(pIdx->aiRowLogEst[0]);
    }else{
      iUpper = aSample[i].anLt[iCol];
    }

    if( iLower>=iUpper ){
      iGap = 0;
    }else{
      iGap = iUpper - iLower;
    }
    if( roundUp ){
      iGap = (iGap*2)/3;
    }else{
      iGap = iGap/3;
    }
    aStat[0] = iLower + iGap;
    aStat[1] = pIdx->aAvgEq[iCol];
  }

  /* Restore the pRec->nField value before returning.  */
  pRec->nField = nField;
  return i;
}
#endif /* SQLITE_ENABLE_STAT3_OR_STAT4 */

/*
** If it is not NULL, pTerm is a term that provides an upper or lower
** bound on a range scan. Without considering pTerm, it is estimated 

Changes to test/analyze9.test.

1129
1130
1131
1132
1133
1134
1135
1136













































































































1137



  }
  do_eqp_test 25.4.2 { 
    SELECT * FROM t6 WHERE a < 20 AND (b BETWEEN ? AND 60)
  } {
    0 0 0 {SEARCH TABLE t6 USING INDEX bb (b>? AND b<?)}
  }
}














































































































finish_test











>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
  }
  do_eqp_test 25.4.2 { 
    SELECT * FROM t6 WHERE a < 20 AND (b BETWEEN ? AND 60)
  } {
    0 0 0 {SEARCH TABLE t6 USING INDEX bb (b>? AND b<?)}
  }
}

#-------------------------------------------------------------------------
# Check that a problem in they way stat4 data is used has been 
# resolved (see below).
#
reset_db
do_test 26.1.1 {
  db transaction {
    execsql { 
      CREATE TABLE t1(x, y, z);
      CREATE INDEX t1xy ON t1(x, y);
      CREATE INDEX t1z ON t1(z);
    }
    for {set i 0} {$i < 10000} {incr i} {
      execsql { INSERT INTO t1(x, y) VALUES($i, $i) }
    }
    for {set i 0} {$i < 10} {incr i} {
      execsql {
        WITH cnt(x) AS (SELECT 1 UNION ALL SELECT x+1 FROM cnt WHERE x<100)
        INSERT INTO t1(x, y) SELECT 10000+$i, x FROM cnt;
        INSERT INTO t1(x, y) SELECT 10000+$i, 100;
      }
    }
    execsql {
      UPDATE t1 SET z = rowid / 20;
      ANALYZE;
    }
  }
} {}

do_execsql_test 26.1.2 {
  SELECT count(*) FROM t1 WHERE x = 10000 AND y < 50;
} {49}
do_execsql_test 26.1.3 {
  SELECT count(*) FROM t1 WHERE z = 444;
} {20}

# The analyzer knows that any (z=?) expression matches 20 rows. So it
# will use index "t1z" if the estimate of hits for (x=10000 AND y<50)
# is greater than 20 rows.
#
# And it should be. The analyzer has a stat4 sample as follows:
#
#   sample=(x=10000, y=100) nLt=(10000 10099)
#
# There should be no other samples that start with (x=10000). So it knows 
# that (x=10000 AND y<50) must match somewhere between 0 and 99 rows, but
# know more than that. Guessing less than 20 is therefore unreasonable.
#
# At one point though, due to a problem in whereKeyStats(), the planner was
# estimating that (x=10000 AND y<50) would match only 2 rows.
#
do_eqp_test 26.1.4 {
  SELECT * FROM t1 WHERE x = 10000 AND y < 50 AND z = 444;
} {
  0 0 0 {SEARCH TABLE t1 USING INDEX t1z (z=?)}
}


# This test - 26.2.* - tests that another manifestation of the same problem
# is no longer present in the library. Assuming:
# 
#   CREATE INDEX t1xy ON t1(x, y)
#
# and that have samples for index t1xy as follows:
#
#
#   sample=('A', 70)        nEq=(100, 2)        nLt=(900, 970)
#   sample=('B', 70)        nEq=(100, 2)        nLt=(1000, 1070)    
#
# the planner should estimate that (x = 'B' AND y > 25) matches 76 rows
# (70 * 2/3 + 30). Before, due to the problem, the planner was estimating 
# that this matched 100 rows.
# 
reset_db
do_execsql_test 26.2.1 {
  BEGIN;
    CREATE TABLE t1(x, y, z);
    CREATE INDEX i1 ON t1(x, y);
    CREATE INDEX i2 ON t1(z);
  
    WITH 
    cnt(y) AS (SELECT 0 UNION ALL SELECT y+1 FROM cnt WHERE y<99),
    letters(x) AS (
      SELECT 'A' UNION SELECT 'B' UNION SELECT 'C' UNION SELECT 'D'
    )
    INSERT INTO t1(x, y) SELECT x, y FROM letters, cnt;
  
    WITH
    letters(x) AS (
      SELECT 'A' UNION SELECT 'B' UNION SELECT 'C' UNION SELECT 'D'
    )
    INSERT INTO t1(x, y) SELECT x, 70 FROM letters;
  
    WITH
    cnt(i) AS (SELECT 0 UNION ALL SELECT i+1 FROM cnt WHERE i<9999)
    INSERT INTO t1(x, y) SELECT i, i FROM cnt;
  
    UPDATE t1 SET z = (rowid / 95);
    ANALYZE;
  COMMIT;
}

do_eqp_test 26.2.2 {
  SELECT * FROM t1 WHERE x='B' AND y>25 AND z=?;
} {
  0 0 0 {SEARCH TABLE t1 USING INDEX i1 (x=? AND y>?)}
}


finish_test