/ Check-in [fd3977a2]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Make use of histogram data to make better estimates for the number of rows that will be returned from "x IN (v1,v2,v3,...)" constraints.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | stat2-enhancement
Files: files | file ages | folders
SHA1:fd3977a27ae68e694df12a4713e55515c1e87c5d
User & Date: drh 2011-01-21 16:27:18
Context
2011-01-21
18:18
Adjustments to the result row estimator for the IN operator so that it gives the same estimates as the equivalent OR operator. Test cases for the same. check-in: c82cb9c0 user: drh tags: stat2-enhancement
16:27
Make use of histogram data to make better estimates for the number of rows that will be returned from "x IN (v1,v2,v3,...)" constraints. check-in: fd3977a2 user: drh tags: stat2-enhancement
14:37
Add the ability to use indices when a range contraint is bounded on the lower end by NULL. check-in: f73a167b user: drh tags: stat2-enhancement
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/where.c.

2471
2472
2473
2474
2475
2476
2477
2478
2479

2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502

2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517




































































2518
2519
2520
2521
2522
2523
2524
....
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
....
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
....
2792
2793
2794
2795
2796
2797
2798
2799



2800

2801
2802
2803
2804
2805
2806
2807
/*
** Estimate the number of rows that will be returned based on
** an equality constraint x=VALUE and where that VALUE occurs in
** the histogram data.  This only works when x is the left-most
** column of an index and sqlite_stat2 histogram data is available
** for that index.
**
** Write the estimated row count into *pnRow.  If unable to make
** an estimate, leave *pnRow unchanged.

**
** This routine can fail if it is unable to load a collating sequence
** required for string comparison, or if unable to allocate memory
** for a UTF conversion required for comparison.  The error is stored
** in the pParse structure.
*/
void whereEqScanEst(
  Parse *pParse,       /* Parsing & code generating context */
  Index *p,            /* The index whose left-most column is pTerm */
  WhereTerm *pTerm,    /* The x=VALUE constraint */
  double *pnRow        /* Write the revised row estimate here */
){
  sqlite3_value *pRhs = 0;  /* VALUE on right-hand side of pTerm */
  int iLower, iUpper;       /* Range of histogram regions containing pRhs */
  u8 aff;                   /* Column affinity */
  int rc;                   /* Subfunction return code */
  double nRowEst;           /* New estimate of the number of rows */

  assert( p->aSample!=0 );
  assert( pTerm->eOperator==WO_EQ );
  aff = p->pTable->aCol[p->aiColumn[0]].affinity;
  rc = valueFromExpr(pParse, pTerm->pExpr->pRight, aff, &pRhs);
  if( rc ) goto whereEqScanEst_cancel;

  rc = whereRangeRegion(pParse, p, pRhs, 0, &iLower);
  if( rc ) goto whereEqScanEst_cancel;
  rc = whereRangeRegion(pParse, p, pRhs, 1, &iUpper);
  if( rc ) goto whereEqScanEst_cancel;
  WHERETRACE(("equality scan regions: %d..%d\n", iLower, iUpper));
  if( iLower>=iUpper ){
    nRowEst = p->aiRowEst[0]/(SQLITE_INDEX_SAMPLES*2);
    if( nRowEst<*pnRow ) *pnRow = nRowEst;
  }else{
    nRowEst = (iUpper-iLower)*p->aiRowEst[0]/SQLITE_INDEX_SAMPLES;
    *pnRow = nRowEst;
  }

whereEqScanEst_cancel:
  sqlite3ValueFree(pRhs);




































































}
#endif /* defined(SQLITE_ENABLE_STAT2) */


/*
** Find the query plan for accessing a particular table.  Write the
** best query plan and its cost into the WhereCost object supplied as the
................................................................................
    int nInMul = 1;
    int estBound = 100;
    int nBound = 0;               /* Number of range constraints seen */
    int bSort = 0;
    int bLookup = 0;
    WhereTerm *pTerm;             /* A single term of the WHERE clause */
#ifdef SQLITE_ENABLE_STAT2
    WhereTerm *pFirstEqTerm = 0;  /* First WO_EQ term */
#endif

    /* Determine the values of nEq and nInMul */
    for(nEq=0; nEq<pProbe->nColumn; nEq++){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, eqTermMask, pIdx);
      if( pTerm==0 ) break;
................................................................................
          /* "x IN (value, value, ...)" */
          nInMul *= pExpr->x.pList->nExpr + 1;
        }
      }else if( pTerm->eOperator & WO_ISNULL ){
        wsFlags |= WHERE_COLUMN_NULL;
      }
#ifdef SQLITE_ENABLE_STAT2
      else if( nEq==0 && pProbe->aSample ){
        pFirstEqTerm = pTerm;
      }
#endif
      used |= pTerm->prereqRight;
    }

    /* Determine the value of estBound. */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
................................................................................

#ifdef SQLITE_ENABLE_STAT2
    /* If the constraint is of the form x=VALUE and histogram
    ** data is available for column x, then it might be possible
    ** to get a better estimate on the number of rows based on
    ** VALUE and how common that value is according to the histogram.
    */
    if( nRow>(double)1 && nEq==1 && pFirstEqTerm!=0 ){



      whereEqScanEst(pParse, pProbe, pFirstEqTerm, &nRow);

    }
#endif /* SQLITE_ENABLE_STAT2 */

    /* Assume constant cost to access a row and logarithmic cost to
    ** do a binary search.  Hence, the initial cost is the number of output
    ** rows plus log2(table-size) times the number of binary searches.
    */







|
|
>






|


|









<

|
|
>

|

|









|

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|







 







|
<
<







 







|
>
>
>
|
>







2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499

2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
....
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
....
2775
2776
2777
2778
2779
2780
2781
2782


2783
2784
2785
2786
2787
2788
2789
....
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
2872
2873
2874
2875
2876
2877
2878
/*
** Estimate the number of rows that will be returned based on
** an equality constraint x=VALUE and where that VALUE occurs in
** the histogram data.  This only works when x is the left-most
** column of an index and sqlite_stat2 histogram data is available
** for that index.
**
** Write the estimated row count into *pnRow and return SQLITE_OK. 
** If unable to make an estimate, leave *pnRow unchanged and return
** non-zero.
**
** This routine can fail if it is unable to load a collating sequence
** required for string comparison, or if unable to allocate memory
** for a UTF conversion required for comparison.  The error is stored
** in the pParse structure.
*/
int whereEqualScanEst(
  Parse *pParse,       /* Parsing & code generating context */
  Index *p,            /* The index whose left-most column is pTerm */
  Expr *pExpr,         /* Expression for VALUE in the x=VALUE constraint */
  double *pnRow        /* Write the revised row estimate here */
){
  sqlite3_value *pRhs = 0;  /* VALUE on right-hand side of pTerm */
  int iLower, iUpper;       /* Range of histogram regions containing pRhs */
  u8 aff;                   /* Column affinity */
  int rc;                   /* Subfunction return code */
  double nRowEst;           /* New estimate of the number of rows */

  assert( p->aSample!=0 );

  aff = p->pTable->aCol[p->aiColumn[0]].affinity;
  rc = valueFromExpr(pParse, pExpr, aff, &pRhs);
  if( rc ) goto whereEqualScanEst_cancel;
  if( pRhs==0 ) return SQLITE_NOTFOUND;
  rc = whereRangeRegion(pParse, p, pRhs, 0, &iLower);
  if( rc ) goto whereEqualScanEst_cancel;
  rc = whereRangeRegion(pParse, p, pRhs, 1, &iUpper);
  if( rc ) goto whereEqualScanEst_cancel;
  WHERETRACE(("equality scan regions: %d..%d\n", iLower, iUpper));
  if( iLower>=iUpper ){
    nRowEst = p->aiRowEst[0]/(SQLITE_INDEX_SAMPLES*2);
    if( nRowEst<*pnRow ) *pnRow = nRowEst;
  }else{
    nRowEst = (iUpper-iLower)*p->aiRowEst[0]/SQLITE_INDEX_SAMPLES;
    *pnRow = nRowEst;
  }

whereEqualScanEst_cancel:
  sqlite3ValueFree(pRhs);
  return rc;
}
#endif /* defined(SQLITE_ENABLE_STAT2) */

#ifdef SQLITE_ENABLE_STAT2
/*
** Estimate the number of rows that will be returned based on
** an IN constraint "x IN (V1,V2,V3,...)" where the right-hand side
** of the IN operator is a list of values.
**
** Write the estimated row count into *pnRow and return SQLITE_OK. 
** If unable to make an estimate, leave *pnRow unchanged and return
** non-zero.
**
** This routine can fail if it is unable to load a collating sequence
** required for string comparison, or if unable to allocate memory
** for a UTF conversion required for comparison.  The error is stored
** in the pParse structure.
*/
int whereInScanEst(
  Parse *pParse,       /* Parsing & code generating context */
  Index *p,            /* The index whose left-most column is pTerm */
  ExprList *pList,     /* The value list on the RHS of "x IN (v1,v2,v3,...)" */
  double *pnRow        /* Write the revised row estimate here */
){
  sqlite3_value *pVal = 0;  /* One value from list */
  int iLower, iUpper;       /* Range of histogram regions containing pRhs */
  u8 aff;                   /* Column affinity */
  int rc;                   /* Subfunction return code */
  double nRowEst;           /* New estimate of the number of rows */
  int nRegion = 0;          /* Number of histogram regions spanned */
  int nSingle = 0;          /* Count of values contained within one region */
  int nNotFound = 0;        /* Count of values that are not constants */
  int i;                            /* Loop counter */
  u8 aHit[SQLITE_INDEX_SAMPLES+1];  /* Histogram regions that are spanned */

  assert( p->aSample!=0 );
  aff = p->pTable->aCol[p->aiColumn[0]].affinity;
  memset(aHit, 0, sizeof(aHit));
  for(i=0; i<pList->nExpr; i++){
    sqlite3ValueFree(pVal);
    rc = valueFromExpr(pParse, pList->a[i].pExpr, aff, &pVal);
    if( rc ) break;
    if( pVal==0 ){
      nNotFound++;
      continue;
    }
    rc = whereRangeRegion(pParse, p, pVal, 0, &iLower);
    if( rc ) break;
    rc = whereRangeRegion(pParse, p, pVal, 1, &iUpper);
    if( rc ) break;
    if( iLower>=iUpper ){
      nSingle++;
    }
    assert( iLower>=0 && iUpper<=SQLITE_INDEX_SAMPLES );
    while( iLower<=iUpper ) aHit[iLower++] = 1;
  }
  if( rc==SQLITE_OK ){
    for(i=nRegion=0; i<ArraySize(aHit); i++) nRegion += aHit[i];
    nRowEst = nRegion*p->aiRowEst[0]/(SQLITE_INDEX_SAMPLES+1)
               + nNotFound*p->aiRowEst[1];
    if( nRowEst > p->aiRowEst[0] ) nRowEst = p->aiRowEst[0];
    *pnRow = nRowEst;
    WHERETRACE(("IN row estimate: nRegion=%d, nSingle=%d, nNotFound=%d\n",
                 nRegion, nSingle, nNotFound));
  }
  sqlite3ValueFree(pVal);
  return rc;
}
#endif /* defined(SQLITE_ENABLE_STAT2) */


/*
** Find the query plan for accessing a particular table.  Write the
** best query plan and its cost into the WhereCost object supplied as the
................................................................................
    int nInMul = 1;
    int estBound = 100;
    int nBound = 0;               /* Number of range constraints seen */
    int bSort = 0;
    int bLookup = 0;
    WhereTerm *pTerm;             /* A single term of the WHERE clause */
#ifdef SQLITE_ENABLE_STAT2
    WhereTerm *pFirstTerm = 0;    /* First term matching the index */
#endif

    /* Determine the values of nEq and nInMul */
    for(nEq=0; nEq<pProbe->nColumn; nEq++){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, eqTermMask, pIdx);
      if( pTerm==0 ) break;
................................................................................
          /* "x IN (value, value, ...)" */
          nInMul *= pExpr->x.pList->nExpr + 1;
        }
      }else if( pTerm->eOperator & WO_ISNULL ){
        wsFlags |= WHERE_COLUMN_NULL;
      }
#ifdef SQLITE_ENABLE_STAT2
      if( nEq==0 && pProbe->aSample ) pFirstTerm = pTerm;


#endif
      used |= pTerm->prereqRight;
    }

    /* Determine the value of estBound. */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
................................................................................

#ifdef SQLITE_ENABLE_STAT2
    /* If the constraint is of the form x=VALUE and histogram
    ** data is available for column x, then it might be possible
    ** to get a better estimate on the number of rows based on
    ** VALUE and how common that value is according to the histogram.
    */
    if( nRow>(double)1 && nEq==1 && pFirstTerm!=0 ){
      if( pFirstTerm->eOperator==WO_EQ ){
        whereEqualScanEst(pParse, pProbe, pFirstTerm->pExpr->pRight, &nRow);
      }else if( pFirstTerm->eOperator==WO_IN && bInEst==0 ){
        whereInScanEst(pParse, pProbe, pFirstTerm->pExpr->x.pList, &nRow);
      }
    }
#endif /* SQLITE_ENABLE_STAT2 */

    /* Assume constant cost to access a row and logarithmic cost to
    ** do a binary search.  Hence, the initial cost is the number of output
    ** rows plus log2(table-size) times the number of binary searches.
    */