SQLite

Check-in [6bfc5c69eb]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Use histogram data to improve the row-count estimates on equality constraints.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | stat2-enhancement
Files: files | file ages | folders
SHA1: 6bfc5c69eb22938972bbf4e60179952dc215f770
User & Date: drh 2011-01-20 16:52:09.439
Context
2011-01-20
20:36
Update ANALYZE test cases to check out the use of histograms for equality constraints. (check-in: c7b59afaf0 user: drh tags: stat2-enhancement)
16:52
Use histogram data to improve the row-count estimates on equality constraints. (check-in: 6bfc5c69eb user: drh tags: stat2-enhancement)
02:56
The first of a planned series of enhancements to the query planner that enable it to make better use of sqlite_stat2 histograms when the table has many repeated values. (check-in: 2cd374cd23 user: drh tags: stat2-enhancement)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/where.c.
2457
2458
2459
2460
2461
2462
2463













































2464
2465
2466
2467
2468
2469
2470
    *piEst = 11;
  }else{
    *piEst = 33;
  }
  return rc;
}















































/*
** Find the query plan for accessing a particular table.  Write the
** best query plan and its cost into the WhereCost object supplied as the
** last parameter.
**
** The lowest cost plan wins.  The cost is an estimate of the amount of







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
    *piEst = 11;
  }else{
    *piEst = 33;
  }
  return rc;
}

#ifdef SQLITE_ENABLE_STAT2
/*
** Estimate the number of rows that will be returned based on
** an equality constraint x=VALUE and where that VALUE occurs in
** the histogram data.  This only works when x is the left-most
** column of an index and sqlite_stat2 histogram data is available
** for that index.
**
** Write the estimated row count into *pnRow.  If unable to make
** an estimate, leave *pnRow unchanged.
*/
void whereEqScanEst(
  Parse *pParse,       /* Parsing & code generating context */
  Index *p,            /* The index whose left-most column is pTerm */
  WhereTerm *pTerm,    /* The x=VALUE constraint */
  double *pnRow        /* Write the revised row estimate here */
){
  sqlite3_value *pRhs = 0;  /* VALUE on right-hand side of pTerm */
  int iLower, iUpper;       /* Range of histogram regions containing pRhs */
  u8 aff;                   /* Column affinity */
  int rc;                   /* Subfunction return code */
  double nRowEst;           /* New estimate of the number of rows */

  assert( p->aSample!=0 );
  assert( pTerm->eOperator==WO_EQ );
  aff = p->pTable->aCol[p->aiColumn[0]].affinity;
  rc = valueFromExpr(pParse, pTerm->pExpr->pRight, aff, &pRhs);
  if( rc ) goto whereEqScanEst_cancel;
  rc = whereRangeRegion(pParse, p, pRhs, 0, &iLower);
  if( rc ) goto whereEqScanEst_cancel;
  rc = whereRangeRegion(pParse, p, pRhs, 1, &iUpper);
  if( rc ) goto whereEqScanEst_cancel;
  if( iLower>=iUpper ){
    nRowEst = p->aiRowEst[0]/(SQLITE_INDEX_SAMPLES*2);
    if( nRowEst<*pnRow ) *pnRow = nRowEst;
  }else{
    nRowEst = (iUpper-iLower)*p->aiRowEst[0]/SQLITE_INDEX_SAMPLES;
    *pnRow = nRowEst;
  }

whereEqScanEst_cancel:
  sqlite3ValueFree(pRhs);
}
#endif /* defined(SQLITE_ENABLE_STAT2) */


/*
** Find the query plan for accessing a particular table.  Write the
** best query plan and its cost into the WhereCost object supplied as the
** last parameter.
**
** The lowest cost plan wins.  The cost is an estimate of the amount of
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630



2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649





2650
2651
2652
2653
2654
2655
2656
    **             SELECT a, b    FROM tbl WHERE a = 1;
    **             SELECT a, b, c FROM tbl WHERE a = 1;
    */
    int nEq;
    int bInEst = 0;
    int nInMul = 1;
    int estBound = 100;
    int nBound = 0;             /* Number of range constraints seen */
    int bSort = 0;
    int bLookup = 0;
    WhereTerm *pTerm;           /* A single term of the WHERE clause */




    /* Determine the values of nEq and nInMul */
    for(nEq=0; nEq<pProbe->nColumn; nEq++){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, eqTermMask, pIdx);
      if( pTerm==0 ) break;
      wsFlags |= (WHERE_COLUMN_EQ|WHERE_ROWID_EQ);
      if( pTerm->eOperator & WO_IN ){
        Expr *pExpr = pTerm->pExpr;
        wsFlags |= WHERE_COLUMN_IN;
        if( ExprHasProperty(pExpr, EP_xIsSelect) ){
          nInMul *= 25;
          bInEst = 1;
        }else if( ALWAYS(pExpr->x.pList) ){
          nInMul *= pExpr->x.pList->nExpr + 1;
        }
      }else if( pTerm->eOperator & WO_ISNULL ){
        wsFlags |= WHERE_COLUMN_NULL;
      }





      used |= pTerm->prereqRight;
    }

    /* Determine the value of estBound. */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
      if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE|WO_GT|WO_GE, pIdx) ){







|


|
>
>
>



















>
>
>
>
>







2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
    **             SELECT a, b    FROM tbl WHERE a = 1;
    **             SELECT a, b, c FROM tbl WHERE a = 1;
    */
    int nEq;
    int bInEst = 0;
    int nInMul = 1;
    int estBound = 100;
    int nBound = 0;               /* Number of range constraints seen */
    int bSort = 0;
    int bLookup = 0;
    WhereTerm *pTerm;             /* A single term of the WHERE clause */
#ifdef SQLITE_ENABLE_STAT2
    WhereTerm *pFirstEqTerm = 0;  /* First WO_EQ term */
#endif

    /* Determine the values of nEq and nInMul */
    for(nEq=0; nEq<pProbe->nColumn; nEq++){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, eqTermMask, pIdx);
      if( pTerm==0 ) break;
      wsFlags |= (WHERE_COLUMN_EQ|WHERE_ROWID_EQ);
      if( pTerm->eOperator & WO_IN ){
        Expr *pExpr = pTerm->pExpr;
        wsFlags |= WHERE_COLUMN_IN;
        if( ExprHasProperty(pExpr, EP_xIsSelect) ){
          nInMul *= 25;
          bInEst = 1;
        }else if( ALWAYS(pExpr->x.pList) ){
          nInMul *= pExpr->x.pList->nExpr + 1;
        }
      }else if( pTerm->eOperator & WO_ISNULL ){
        wsFlags |= WHERE_COLUMN_NULL;
      }
#ifdef SQLITE_ENABLE_STAT2
      else if( nEq==0 && pProbe->aSample ){
        pFirstEqTerm = pTerm;
      }
#endif
      used |= pTerm->prereqRight;
    }

    /* Determine the value of estBound. */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
      if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE|WO_GT|WO_GE, pIdx) ){
2719
2720
2721
2722
2723
2724
2725











2726
2727
2728
2729
2730
2731
2732
    */
    nRow = (double)(aiRowEst[nEq] * nInMul);
    if( bInEst && nRow*2>aiRowEst[0] ){
      nRow = aiRowEst[0]/2;
      nInMul = (int)(nRow / aiRowEst[nEq]);
    }












    /* Assume constant cost to access a row and logarithmic cost to
    ** do a binary search.  Hence, the initial cost is the number of output
    ** rows plus log2(table-size) times the number of binary searches.
    */
    cost = nRow + nInMul*estLog(aiRowEst[0]);

    /* Adjust the number of rows and the cost downward to reflect rows







>
>
>
>
>
>
>
>
>
>
>







2772
2773
2774
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
    */
    nRow = (double)(aiRowEst[nEq] * nInMul);
    if( bInEst && nRow*2>aiRowEst[0] ){
      nRow = aiRowEst[0]/2;
      nInMul = (int)(nRow / aiRowEst[nEq]);
    }

#ifdef SQLITE_ENABLE_STAT2
    /* If the constraint is of the form x=VALUE and histogram
    ** data is available for column x, then it might be possible
    ** to get a better estimate on the number of rows based on
    ** VALUE and how common that value is according to the histogram.
    */
    if( nRow>(double)1 && nEq==1 && pFirstEqTerm!=0 ){
      whereEqScanEst(pParse, pProbe, pFirstEqTerm, &nRow);
    }
#endif /* SQLITE_ENABLE_STAT2 */

    /* Assume constant cost to access a row and logarithmic cost to
    ** do a binary search.  Hence, the initial cost is the number of output
    ** rows plus log2(table-size) times the number of binary searches.
    */
    cost = nRow + nInMul*estLog(aiRowEst[0]);

    /* Adjust the number of rows and the cost downward to reflect rows
Changes to test/analyze5.test.
210
211
212
213
214
215
216








217
218
219
220
221
222
223
 16  {y>=0 AND y<=''}                 50
 17  {y>=0 AND y<='alpha'}           400
 18  {y>=0 AND y<'alpha'}             50
 19  {y>=0 AND y<='bravo'}           700
 20  {y>=0 AND y<'charlie'}          700
 21  {y>=0 AND y<='charlie'}         900
 22  {y>=0 AND y<'delta'}            900








} {
  do_test analyze5-3.$testid {
    eqp "SELECT * FROM t1 WHERE $where"
  } [format {0 0 0 {SEARCH TABLE t1 USING INDEX t1y (y>? AND y<?) (~%d rows)}} \
       $rows]
}








>
>
>
>
>
>
>
>







210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
 16  {y>=0 AND y<=''}                 50
 17  {y>=0 AND y<='alpha'}           400
 18  {y>=0 AND y<'alpha'}             50
 19  {y>=0 AND y<='bravo'}           700
 20  {y>=0 AND y<'charlie'}          700
 21  {y>=0 AND y<='charlie'}         900
 22  {y>=0 AND y<'delta'}            900
 23  {y>'alpha' AND y<x'00'}         600
 24  {y>='bravo' AND y<x'00'}        600
 25  {y>'bravo' AND y<x'00'}         300
 26  {y>='charlie' AND y<x'00'}      300
 27  {y>'charlie' AND y<x'00'}       100
 28  {y>='delta' AND y<x'00'}        100
 29  {y>'delta' AND y<x'00'}          50
 30  {y>='echo' AND y<x'00'}          50
} {
  do_test analyze5-3.$testid {
    eqp "SELECT * FROM t1 WHERE $where"
  } [format {0 0 0 {SEARCH TABLE t1 USING INDEX t1y (y>? AND y<?) (~%d rows)}} \
       $rows]
}