SQLite

Check-in [0c438e813c]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improvements to cost estimation for evaluating the IN operator. Ticket #3757. (CVS 6403)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 0c438e813c411e8f9e92d6c7405fccb7a36e110a
User & Date: drh 2009-03-29 00:13:03.000
Context
2009-03-29
00:15
Turn off the debugging macros in where.c - left on by mistake in the previous check-in. (CVS 6404) (check-in: b601a57582 user: drh tags: trunk)
00:13
Improvements to cost estimation for evaluating the IN operator. Ticket #3757. (CVS 6403) (check-in: 0c438e813c user: drh tags: trunk)
2009-03-28
23:47
Previous commit ((6401)) did not quite fix the problem. This should work better. (CVS 6402) (check-in: 2e7d3cc9f0 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/where.c.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
** This module contains C code that generates VDBE code used to process
** the WHERE clause of SQL statements.  This module is responsible for
** generating the code that loops through a table looking for applicable
** rows.  Indices are selected and used to speed the search when doing
** so is applicable.  Because this module is responsible for selecting
** indices, you might also think of this module as the "query optimizer".
**
** $Id: where.c,v 1.377 2009/03/25 16:51:43 drh Exp $
*/
#include "sqliteInt.h"

/*
** Trace output macros
*/
#if defined(SQLITE_TEST) || defined(SQLITE_DEBUG)
int sqlite3WhereTrace = 0;
#endif
#if 0
# define WHERETRACE(X)  if(sqlite3WhereTrace) sqlite3DebugPrintf X
#else
# define WHERETRACE(X)
#endif

/* Forward reference
*/







|









|







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
** This module contains C code that generates VDBE code used to process
** the WHERE clause of SQL statements.  This module is responsible for
** generating the code that loops through a table looking for applicable
** rows.  Indices are selected and used to speed the search when doing
** so is applicable.  Because this module is responsible for selecting
** indices, you might also think of this module as the "query optimizer".
**
** $Id: where.c,v 1.378 2009/03/29 00:13:03 drh Exp $
*/
#include "sqliteInt.h"

/*
** Trace output macros
*/
#if defined(SQLITE_TEST) || defined(SQLITE_DEBUG)
int sqlite3WhereTrace = 0;
#endif
#if 1
# define WHERETRACE(X)  if(sqlite3WhereTrace) sqlite3DebugPrintf X
#else
# define WHERETRACE(X)
#endif

/* Forward reference
*/
1922
1923
1924
1925
1926
1927
1928
1929

1930
1931
1932
1933
1934





1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946

1947
1948
1949
1950
1951
1952





1953


1954
1955
1956
1957
1958
1959

1960
1961

1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978

1979
1980
1981
1982
1983
1984
1985

  /* Look at each index.
  */
  if( pSrc->pIndex ){
    pProbe = pSrc->pIndex;
  }
  for(; pProbe; pProbe=(pSrc->pIndex ? 0 : pProbe->pNext)){
    double inMultiplier = 1;


    WHERETRACE(("... index %s:\n", pProbe->zName));

    /* Count the number of columns in the index that are satisfied
    ** by x=EXPR constraints or x IN (...) constraints.





    */
    wsFlags = 0;
    for(i=0; i<pProbe->nColumn; i++){
      int j = pProbe->aiColumn[i];
      pTerm = findTerm(pWC, iCur, j, notReady, eqTermMask, pProbe);
      if( pTerm==0 ) break;
      wsFlags |= WHERE_COLUMN_EQ;
      if( pTerm->eOperator & WO_IN ){
        Expr *pExpr = pTerm->pExpr;
        wsFlags |= WHERE_COLUMN_IN;
        if( ExprHasProperty(pExpr, EP_xIsSelect) ){
          inMultiplier *= 25;

        }else if( pExpr->x.pList ){
          inMultiplier *= pExpr->x.pList->nExpr + 1;
        }
      }
    }
    nRow = pProbe->aiRowEst[i] * inMultiplier;





    cost = nRow * estLog(inMultiplier);


    nEq = i;
    if( pProbe->onError!=OE_None && (wsFlags & WHERE_COLUMN_IN)==0
         && nEq==pProbe->nColumn ){
      wsFlags |= WHERE_UNIQUE;
    }
    WHERETRACE(("...... nEq=%d inMult=%.9g cost=%.9g\n",nEq,inMultiplier,cost));


    /* Look for range constraints

    */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE|WO_GT|WO_GE, pProbe);
      if( pTerm ){
        wsFlags |= WHERE_COLUMN_RANGE;
        if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){
          wsFlags |= WHERE_TOP_LIMIT;
          cost /= 3;
          nRow /= 3;
        }
        if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){
          wsFlags |= WHERE_BTM_LIMIT;
          cost /= 3;
          nRow /= 3;
        }
        WHERETRACE(("...... range reduces cost to %.9g\n", cost));

      }
    }

    /* Add the additional cost of sorting if that is a factor.
    */
    if( pOrderBy ){
      if( (wsFlags & WHERE_COLUMN_IN)==0 &&







|
>




|
>
>
>
>
>












>






>
>
>
>
>
|
>
>





|
>

|
>
















|
>







1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002

  /* Look at each index.
  */
  if( pSrc->pIndex ){
    pProbe = pSrc->pIndex;
  }
  for(; pProbe; pProbe=(pSrc->pIndex ? 0 : pProbe->pNext)){
    double inMultiplier = 1;  /* Number of equality look-ups needed */
    int inMultIsEst = 0;      /* True if inMultiplier is an estimate */

    WHERETRACE(("... index %s:\n", pProbe->zName));

    /* Count the number of columns in the index that are satisfied
    ** by x=EXPR constraints or x IN (...) constraints.  For a term
    ** of the form x=EXPR we only have to do a single binary search.
    ** But for x IN (...) we have to do a number of binary searched
    ** equal to the number of entries on the RHS of the IN operator.
    ** The inMultipler variable with try to estimate the number of
    ** binary searches needed.
    */
    wsFlags = 0;
    for(i=0; i<pProbe->nColumn; i++){
      int j = pProbe->aiColumn[i];
      pTerm = findTerm(pWC, iCur, j, notReady, eqTermMask, pProbe);
      if( pTerm==0 ) break;
      wsFlags |= WHERE_COLUMN_EQ;
      if( pTerm->eOperator & WO_IN ){
        Expr *pExpr = pTerm->pExpr;
        wsFlags |= WHERE_COLUMN_IN;
        if( ExprHasProperty(pExpr, EP_xIsSelect) ){
          inMultiplier *= 25;
          inMultIsEst = 1;
        }else if( pExpr->x.pList ){
          inMultiplier *= pExpr->x.pList->nExpr + 1;
        }
      }
    }
    nRow = pProbe->aiRowEst[i] * inMultiplier;
    /* If inMultiplier is an estimate and that estimate results in an
    ** nRow it that is more than half number of rows in the table,
    ** then reduce inMultipler */
    if( inMultIsEst && nRow*2 > pProbe->aiRowEst[0] ){
      nRow = pProbe->aiRowEst[0]/2;
      inMultiplier = nRow/pProbe->aiRowEst[i];
    }
    cost = nRow + inMultiplier*estLog(pProbe->aiRowEst[0]);
    nEq = i;
    if( pProbe->onError!=OE_None && (wsFlags & WHERE_COLUMN_IN)==0
         && nEq==pProbe->nColumn ){
      wsFlags |= WHERE_UNIQUE;
    }
    WHERETRACE(("...... nEq=%d inMult=%.9g nRow=%.9g cost=%.9g\n",
                nEq, inMultiplier, nRow, cost));

    /* Look for range constraints.  Assume that each range constraint
    ** makes the search space 1/3rd smaller.
    */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE|WO_GT|WO_GE, pProbe);
      if( pTerm ){
        wsFlags |= WHERE_COLUMN_RANGE;
        if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){
          wsFlags |= WHERE_TOP_LIMIT;
          cost /= 3;
          nRow /= 3;
        }
        if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){
          wsFlags |= WHERE_BTM_LIMIT;
          cost /= 3;
          nRow /= 3;
        }
        WHERETRACE(("...... range reduces nRow to %.9g and cost to %.9g\n",
                    nRow, cost));
      }
    }

    /* Add the additional cost of sorting if that is a factor.
    */
    if( pOrderBy ){
      if( (wsFlags & WHERE_COLUMN_IN)==0 &&
Added test/tkt3757.test.
























































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# 2009 March 28
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Ticket #3757:  The cost functions on the query optimizer for the
# IN operator can be improved.
#
# $Id: tkt3757.test,v 1.1 2009/03/29 00:13:04 drh Exp $

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# Evaluate SQL.  Return the result set followed by the
# and the number of full-scan steps.
#
proc count_steps {sql} {
  set r [db eval $sql]
  lappend r scan [db status step] sort [db status sort]
}

# Construct tables
#
do_test tkt3757-1.1 {
  db eval {
     CREATE TABLE t1(x INTEGER, y INTEGER, z TEXT);
     CREATE INDEX t1i1 ON t1(y,z);
     INSERT INTO t1 VALUES(1,2,'three');
     CREATE TABLE t2(a INTEGER, b TEXT);
     INSERT INTO t2 VALUES(2, 'two');
     ANALYZE;
     SELECT * FROM sqlite_stat1;
  }
} {t1 t1i1 {1 1 1}}

# Modify statistics in order to make the optimizer then that:
#
#   (1)  Table T1 has about 250K entries
#   (2)  There are only about 5 distinct values of T1.
#
# Then run a query with "t1.y IN (SELECT ..)" in the WHERE clause.
# Make sure the index is used.
#
do_test tkt3757-1.2 {
  db eval {
    DELETE FROM sqlite_stat1;
    INSERT INTO sqlite_stat1 VALUES('t1','t1i1','250000 50000 30');
  }
  count_steps {
    SELECT * FROM t1 WHERE y IN (SELECT a FROM t2)
  }
} {1 2 three scan 0 sort 0}

finish_test