SQLite

Check-in [c82cb9c028]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Adjustments to the result row estimator for the IN operator so that it gives the same estimates as the equivalent OR operator. Test cases for the same.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | stat2-enhancement
Files: files | file ages | folders
SHA1: c82cb9c028b3ba5463ae50c30196dbf157a7a305
User & Date: drh 2011-01-21 18:18:13.960
Context
2011-01-22
00:10
Add the ability to use indices for constraints of the form "x IS NOT NULL" when sqlite_stat2 is available and most entries for column x are NULL. (check-in: 5d5bddd290 user: drh tags: stat2-enhancement)
2011-01-21
18:18
Adjustments to the result row estimator for the IN operator so that it gives the same estimates as the equivalent OR operator. Test cases for the same. (check-in: c82cb9c028 user: drh tags: stat2-enhancement)
16:27
Make use of histogram data to make better estimates for the number of rows that will be returned from "x IN (v1,v2,v3,...)" constraints. (check-in: fd3977a27a user: drh tags: stat2-enhancement)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/where.c.
113
114
115
116
117
118
119

120
121
122
123
124
125
126
#define TERM_DYNAMIC    0x01   /* Need to call sqlite3ExprDelete(db, pExpr) */
#define TERM_VIRTUAL    0x02   /* Added by the optimizer.  Do not code */
#define TERM_CODED      0x04   /* This term is already coded */
#define TERM_COPIED     0x08   /* Has a child */
#define TERM_ORINFO     0x10   /* Need to free the WhereTerm.u.pOrInfo object */
#define TERM_ANDINFO    0x20   /* Need to free the WhereTerm.u.pAndInfo obj */
#define TERM_OR_OK      0x40   /* Used during OR-clause processing */


/*
** An instance of the following structure holds all information about a
** WHERE clause.  Mostly this is a container for one or more WhereTerms.
*/
struct WhereClause {
  Parse *pParse;           /* The parser context */







>







113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#define TERM_DYNAMIC    0x01   /* Need to call sqlite3ExprDelete(db, pExpr) */
#define TERM_VIRTUAL    0x02   /* Added by the optimizer.  Do not code */
#define TERM_CODED      0x04   /* This term is already coded */
#define TERM_COPIED     0x08   /* Has a child */
#define TERM_ORINFO     0x10   /* Need to free the WhereTerm.u.pOrInfo object */
#define TERM_ANDINFO    0x20   /* Need to free the WhereTerm.u.pAndInfo obj */
#define TERM_OR_OK      0x40   /* Used during OR-clause processing */
#define TERM_NOHELP     0x80   /* This term does not reduce the search space */

/*
** An instance of the following structure holds all information about a
** WHERE clause.  Mostly this is a container for one or more WhereTerms.
*/
struct WhereClause {
  Parse *pParse;           /* The parser context */
1056
1057
1058
1059
1060
1061
1062

1063
1064
1065
1066
1067
1068
1069
        exprAnalyze(pSrc, pWC, idxNew);
        pTerm = &pWC->a[idxTerm];
        pWC->a[idxNew].iParent = idxTerm;
        pTerm->nChild = 1;
      }else{
        sqlite3ExprListDelete(db, pList);
      }

      pTerm->eOperator = 0;  /* case 1 trumps case 2 */
    }
  }
}
#endif /* !SQLITE_OMIT_OR_OPTIMIZATION && !SQLITE_OMIT_SUBQUERY */









>







1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
        exprAnalyze(pSrc, pWC, idxNew);
        pTerm = &pWC->a[idxTerm];
        pWC->a[idxNew].iParent = idxTerm;
        pTerm->nChild = 1;
      }else{
        sqlite3ExprListDelete(db, pList);
      }
      pTerm->wtFlags |= TERM_NOHELP;
      pTerm->eOperator = 0;  /* case 1 trumps case 2 */
    }
  }
}
#endif /* !SQLITE_OMIT_OR_OPTIMIZATION && !SQLITE_OMIT_SUBQUERY */


2519
2520
2521
2522
2523
2524
2525
2526
2527


2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552

2553
2554
2555
2556
2557

2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575

2576
2577






2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
  return rc;
}
#endif /* defined(SQLITE_ENABLE_STAT2) */

#ifdef SQLITE_ENABLE_STAT2
/*
** Estimate the number of rows that will be returned based on
** an IN constraint "x IN (V1,V2,V3,...)" where the right-hand side
** of the IN operator is a list of values.


**
** Write the estimated row count into *pnRow and return SQLITE_OK. 
** If unable to make an estimate, leave *pnRow unchanged and return
** non-zero.
**
** This routine can fail if it is unable to load a collating sequence
** required for string comparison, or if unable to allocate memory
** for a UTF conversion required for comparison.  The error is stored
** in the pParse structure.
*/
int whereInScanEst(
  Parse *pParse,       /* Parsing & code generating context */
  Index *p,            /* The index whose left-most column is pTerm */
  ExprList *pList,     /* The value list on the RHS of "x IN (v1,v2,v3,...)" */
  double *pnRow        /* Write the revised row estimate here */
){
  sqlite3_value *pVal = 0;  /* One value from list */
  int iLower, iUpper;       /* Range of histogram regions containing pRhs */
  u8 aff;                   /* Column affinity */
  int rc;                   /* Subfunction return code */
  double nRowEst;           /* New estimate of the number of rows */
  int nRegion = 0;          /* Number of histogram regions spanned */
  int nSingle = 0;          /* Count of values contained within one region */
  int nNotFound = 0;        /* Count of values that are not constants */
  int i;                            /* Loop counter */

  u8 aHit[SQLITE_INDEX_SAMPLES+1];  /* Histogram regions that are spanned */

  assert( p->aSample!=0 );
  aff = p->pTable->aCol[p->aiColumn[0]].affinity;
  memset(aHit, 0, sizeof(aHit));

  for(i=0; i<pList->nExpr; i++){
    sqlite3ValueFree(pVal);
    rc = valueFromExpr(pParse, pList->a[i].pExpr, aff, &pVal);
    if( rc ) break;
    if( pVal==0 ){
      nNotFound++;
      continue;
    }
    rc = whereRangeRegion(pParse, p, pVal, 0, &iLower);
    if( rc ) break;
    rc = whereRangeRegion(pParse, p, pVal, 1, &iUpper);
    if( rc ) break;
    if( iLower>=iUpper ){
      nSingle++;
    }
    assert( iLower>=0 && iUpper<=SQLITE_INDEX_SAMPLES );
    while( iLower<=iUpper ) aHit[iLower++] = 1;
  }

  if( rc==SQLITE_OK ){
    for(i=nRegion=0; i<ArraySize(aHit); i++) nRegion += aHit[i];






    nRowEst = nRegion*p->aiRowEst[0]/(SQLITE_INDEX_SAMPLES+1)
               + nNotFound*p->aiRowEst[1];
    if( nRowEst > p->aiRowEst[0] ) nRowEst = p->aiRowEst[0];
    *pnRow = nRowEst;
    WHERETRACE(("IN row estimate: nRegion=%d, nSingle=%d, nNotFound=%d\n",
                 nRegion, nSingle, nNotFound));
  }
  sqlite3ValueFree(pVal);
  return rc;
}
#endif /* defined(SQLITE_ENABLE_STAT2) */









|
|
>
>



















|

|
|

|
>
|



|
>




|








|
|
|
|
|
>

|
>
>
>
>
>
>
|



|
|







2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
  return rc;
}
#endif /* defined(SQLITE_ENABLE_STAT2) */

#ifdef SQLITE_ENABLE_STAT2
/*
** Estimate the number of rows that will be returned based on
** an IN constraint where the right-hand side of the IN operator
** is a list of values.  Example:
**
**        WHERE x IN (1,2,3,4)
**
** Write the estimated row count into *pnRow and return SQLITE_OK. 
** If unable to make an estimate, leave *pnRow unchanged and return
** non-zero.
**
** This routine can fail if it is unable to load a collating sequence
** required for string comparison, or if unable to allocate memory
** for a UTF conversion required for comparison.  The error is stored
** in the pParse structure.
*/
int whereInScanEst(
  Parse *pParse,       /* Parsing & code generating context */
  Index *p,            /* The index whose left-most column is pTerm */
  ExprList *pList,     /* The value list on the RHS of "x IN (v1,v2,v3,...)" */
  double *pnRow        /* Write the revised row estimate here */
){
  sqlite3_value *pVal = 0;  /* One value from list */
  int iLower, iUpper;       /* Range of histogram regions containing pRhs */
  u8 aff;                   /* Column affinity */
  int rc = SQLITE_OK;       /* Subfunction return code */
  double nRowEst;           /* New estimate of the number of rows */
  int nSpan = 0;            /* Number of histogram regions spanned */
  int nSingle = 0;          /* Histogram regions hit by a single value */
  int nNotFound = 0;        /* Count of values that are not constants */
  int i;                               /* Loop counter */
  u8 aSpan[SQLITE_INDEX_SAMPLES+1];    /* Histogram regions that are spanned */
  u8 aSingle[SQLITE_INDEX_SAMPLES+1];  /* Histogram regions hit once */

  assert( p->aSample!=0 );
  aff = p->pTable->aCol[p->aiColumn[0]].affinity;
  memset(aSpan, 0, sizeof(aSpan));
  memset(aSingle, 0, sizeof(aSingle));
  for(i=0; i<pList->nExpr; i++){
    sqlite3ValueFree(pVal);
    rc = valueFromExpr(pParse, pList->a[i].pExpr, aff, &pVal);
    if( rc ) break;
    if( pVal==0 || sqlite3_value_type(pVal)==SQLITE_NULL ){
      nNotFound++;
      continue;
    }
    rc = whereRangeRegion(pParse, p, pVal, 0, &iLower);
    if( rc ) break;
    rc = whereRangeRegion(pParse, p, pVal, 1, &iUpper);
    if( rc ) break;
    if( iLower>=iUpper ){
      aSingle[iLower] = 1;
    }else{
      assert( iLower>=0 && iUpper<=SQLITE_INDEX_SAMPLES );
      while( iLower<iUpper ) aSpan[iLower++] = 1;
    }
  }
  if( rc==SQLITE_OK ){
    for(i=nSpan=0; i<=SQLITE_INDEX_SAMPLES; i++){
      if( aSpan[i] ){
        nSpan++;
      }else if( aSingle[i] ){
        nSingle++;
      }
    }
    nRowEst = (nSpan*2+nSingle)*p->aiRowEst[0]/(2*SQLITE_INDEX_SAMPLES)
               + nNotFound*p->aiRowEst[1];
    if( nRowEst > p->aiRowEst[0] ) nRowEst = p->aiRowEst[0];
    *pnRow = nRowEst;
    WHERETRACE(("IN row estimate: nSpan=%d, nSingle=%d, nNotFound=%d, est=%g\n",
                 nSpan, nSingle, nNotFound, nRowEst));
  }
  sqlite3ValueFree(pVal);
  return rc;
}
#endif /* defined(SQLITE_ENABLE_STAT2) */


2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
      int k;                       /* Loop counter */
      int nSkipEq = nEq;           /* Number of == constraints to skip */
      int nSkipRange = nBound;     /* Number of < constraints to skip */
      Bitmask thisTab;             /* Bitmap for pSrc */

      thisTab = getMask(pWC->pMaskSet, iCur);
      for(pTerm=pWC->a, k=pWC->nTerm; nRow>2 && k; k--, pTerm++){
        if( pTerm->wtFlags & TERM_VIRTUAL ) continue;
        if( (pTerm->prereqAll & notValid)!=thisTab ) continue;
        if( pTerm->eOperator & (WO_EQ|WO_IN|WO_ISNULL) ){
          if( nSkipEq ){
            /* Ignore the first nEq equality matches since the index
            ** has already accounted for these */
            nSkipEq--;
          }else{
            /* Assume each additional equality match reduces the result
            ** set size by a factor of 10 */
            nRow /= 10;
          }
        }else if( pTerm->eOperator & (WO_LT|WO_LE|WO_GT|WO_GE) ){
          if( nSkipRange ){
            /* Ignore the first nBound range constraints since the index
            ** has already accounted for these */
            nSkipRange--;
          }else{
            /* Assume each additional range constraint reduces the result
            ** set size by a factor of 3 */
            nRow /= 3;
          }







|













|







2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
2956
2957
2958
2959
2960
      int k;                       /* Loop counter */
      int nSkipEq = nEq;           /* Number of == constraints to skip */
      int nSkipRange = nBound;     /* Number of < constraints to skip */
      Bitmask thisTab;             /* Bitmap for pSrc */

      thisTab = getMask(pWC->pMaskSet, iCur);
      for(pTerm=pWC->a, k=pWC->nTerm; nRow>2 && k; k--, pTerm++){
        if( pTerm->wtFlags & (TERM_VIRTUAL|TERM_NOHELP) ) continue;
        if( (pTerm->prereqAll & notValid)!=thisTab ) continue;
        if( pTerm->eOperator & (WO_EQ|WO_IN|WO_ISNULL) ){
          if( nSkipEq ){
            /* Ignore the first nEq equality matches since the index
            ** has already accounted for these */
            nSkipEq--;
          }else{
            /* Assume each additional equality match reduces the result
            ** set size by a factor of 10 */
            nRow /= 10;
          }
        }else if( pTerm->eOperator & (WO_LT|WO_LE|WO_GT|WO_GE) ){
          if( nSkipRange ){
            /* Ignore the first nSkipRange range constraints since the index
            ** has already accounted for these */
            nSkipRange--;
          }else{
            /* Assume each additional range constraint reduces the result
            ** set size by a factor of 3 */
            nRow /= 3;
          }
Changes to test/analyze5.test.
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
































140
141
142
143
144
145
146
} {
  do_test analyze5-1.$testid {
    eqp "SELECT * FROM t1 WHERE $where"
  } [format {0 0 0 {SEARCH TABLE t1 USING INDEX t1z (z>? AND z<?) (~%d rows)}} \
       $rows]
}
foreach {testid where rows} {
  101  {z=-1}           50
  102  {z=0}            400
  103  {z=1}            300
  104  {z=2}            200
  105  {z=3}            100
  106  {z=4}             50
  107  {z=-10.0}         50
  108  {z=0.0}          400
  109  {z=1.0}          300
  110  {z=2.0}          200
  111  {z=3.0}          100
  112  {z=4.0}           50
  113  {z=1.5}           50
  114  {z=2.5}           50
} {
  do_test analyze5-1.$testid {
    eqp "SELECT * FROM t1 WHERE $where"
  } [format {0 0 0 {SEARCH TABLE t1 USING INDEX t1z (z=?) (~%d rows)}} $rows]
}

































# For the t1.y column, most entries are known to be zero.  So do a 
# full table scan for y=0 but use the index for any other constraint on
# y.
#
do_test analyze5-201 {
  eqp {SELECT * FROM t1 WHERE y=0}







|


















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
} {
  do_test analyze5-1.$testid {
    eqp "SELECT * FROM t1 WHERE $where"
  } [format {0 0 0 {SEARCH TABLE t1 USING INDEX t1z (z>? AND z<?) (~%d rows)}} \
       $rows]
}
foreach {testid where rows} {
  101  {z=-1}            50
  102  {z=0}            400
  103  {z=1}            300
  104  {z=2}            200
  105  {z=3}            100
  106  {z=4}             50
  107  {z=-10.0}         50
  108  {z=0.0}          400
  109  {z=1.0}          300
  110  {z=2.0}          200
  111  {z=3.0}          100
  112  {z=4.0}           50
  113  {z=1.5}           50
  114  {z=2.5}           50
} {
  do_test analyze5-1.$testid {
    eqp "SELECT * FROM t1 WHERE $where"
  } [format {0 0 0 {SEARCH TABLE t1 USING INDEX t1z (z=?) (~%d rows)}} $rows]
}

# for the next sequence of tests a value of rows<=0 means a full-table scan
# is used.
#
#set sqlite_where_trace 1
foreach {testid where rows} {
  201  {z IN (-1)}            50
  202  {z IN (0)}            400
  203  {z IN (1)}            300
  204  {z IN (2)}            200
  205  {z IN (3)}            100
  206  {z IN (4)}             50
  207  {z IN (0.5)}           50
  208  {z IN (0,1)}          700
  209  {z IN (0,1,2)}        900
  210  {z IN (0,1,2,3)}        0
  211  {z IN (0,1,2,3,4,5)}    0
  212  {z IN (1,2)}          500
  213  {z IN (2,3)}          300
  214  {z=3 OR z=2}          300
  215  {z IN (-1,3)}         150
  216  {z=-1 OR z=3}         150
} {
  if {$rows<=0} {
    set ans {SCAN TABLE t1 (~100 rows)}
  } else {
    set ans [format {SEARCH TABLE t1 USING INDEX t1z (z=?) (~%d rows)} $rows]
  }
  do_test analyze5-1.$testid {
    lindex [eqp "SELECT * FROM t1 WHERE $where"] 3
  } $ans
}

# For the t1.y column, most entries are known to be zero.  So do a 
# full table scan for y=0 but use the index for any other constraint on
# y.
#
do_test analyze5-201 {
  eqp {SELECT * FROM t1 WHERE y=0}