SQLite

Check-in [ee3a08e353]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:More work on the new optimizer. Fewer tests fail now. (CVS 2565)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ee3a08e353f563c36e904479393fcb56f96ee975
User & Date: drh 2005-07-27 20:41:44.000
Context
2005-07-28
16:51
The new optimizer now passes all regression tests. (CVS 2566) (check-in: a212128433 user: drh tags: trunk)
2005-07-27
20:41
More work on the new optimizer. Fewer tests fail now. (CVS 2565) (check-in: ee3a08e353 user: drh tags: trunk)
2005-07-23
22:59
A new optimizer that breaks a lot of tests. But none of them critically, I think. Nevertheless, there is a lot of work ahead to stabilize the code. (CVS 2564) (check-in: 86ce56ccea user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/build.c.
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
**     CREATE INDEX
**     DROP INDEX
**     creating ID lists
**     BEGIN TRANSACTION
**     COMMIT
**     ROLLBACK
**
** $Id: build.c,v 1.336 2005/07/23 22:59:56 drh Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>

/*
** This routine is called when a new SQL statement is beginning to
** be parsed.  Initialize the pParse structure as needed.







|







18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
**     CREATE INDEX
**     DROP INDEX
**     creating ID lists
**     BEGIN TRANSACTION
**     COMMIT
**     ROLLBACK
**
** $Id: build.c,v 1.337 2005/07/27 20:41:44 drh Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>

/*
** This routine is called when a new SQL statement is beginning to
** be parsed.  Initialize the pParse structure as needed.
2386
2387
2388
2389
2390
2391
2392














2393
2394

2395


2396














2397



2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
  sqliteFree(zName);
  return;
}

/*
** Fill the Index.aiRowEst[] array with default information - information
** to be used when we have no ANALYZE command to run.














*/
void sqlite3DefaultRowEst(Index *pIdx){

  int i;


  int n = pIdx->nColumn;














  int j = 1000000;



  int f = (1000000-1-100*(pIdx->onError==OE_None))/n;
  for(i=0; i<=n; i++, j-=f){
    assert( j>0 );
    pIdx->aiRowEst[i] = j;
  }
}

/*
** This routine will drop an existing named index.  This routine
** implements the DROP INDEX statement.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>


>

>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
|
<
<
|







2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432


2433
2434
2435
2436
2437
2438
2439
2440
  sqliteFree(zName);
  return;
}

/*
** Fill the Index.aiRowEst[] array with default information - information
** to be used when we have no ANALYZE command to run.
**
** aiRowEst[0] is suppose to contain the number of elements in the index.
** Since we do not know, guess 1 million.  aiRowEst[1] is an estimate of the
** number of rows in the table that match any particular value of the
** first column of the index.  aiRowEst[2] is an estimate of the number
** of rows that match any particular combiniation of the first 2 columns
** of the index.  And so forth.  It must always be the case that
*
**           aiRowEst[N]<=aiRowEst[N-1]
**           aiRowEst[N]>=1
**
** Apart from that, we have little to go on besides intuition as to
** how aiRowEst[] should be initialized.  The numbers generated here
** are based on typical values found in actual indices.
*/
void sqlite3DefaultRowEst(Index *pIdx){
  int *a = pIdx->aiRowEst;
  int i;
  assert( a!=0 );
  a[0] = 1000000;
  switch( pIdx->nColumn ){
    case 1: {
      a[1] = 20;
      break;
    }
    case 2: {
      a[1] = 350;
      a[2] = 20;
      break;
    }
    default: {
      a[1] = 1250;
      a[2] = 350;
      a[3] = 20;
      for(i=pIdx->nColumn; i>=4; i--){
        a[i] = 10;
      }
    }
  }
  if( pIdx->onError!=OE_None ){


    a[pIdx->nColumn] = 1;
  }
}

/*
** This routine will drop an existing named index.  This routine
** implements the DROP INDEX statement.
*/
Changes to src/where.c.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
** This module contains C code that generates VDBE code used to process
** the WHERE clause of SQL statements.  This module is reponsible for
** generating the code that loops through a table looking for applicable
** rows.  Indices are selected and used to speed the search when doing
** so is applicable.  Because this module is responsible for selecting
** indices, you might also think of this module as the "query optimizer".
**
** $Id: where.c,v 1.152 2005/07/23 22:59:56 drh Exp $
*/
#include "sqliteInt.h"

/*
** The number of bits in a Bitmask.  "BMS" means "BitMask Size".
*/
#define BMS  (sizeof(Bitmask)*8)







|







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
** This module contains C code that generates VDBE code used to process
** the WHERE clause of SQL statements.  This module is reponsible for
** generating the code that loops through a table looking for applicable
** rows.  Indices are selected and used to speed the search when doing
** so is applicable.  Because this module is responsible for selecting
** indices, you might also think of this module as the "query optimizer".
**
** $Id: where.c,v 1.153 2005/07/27 20:41:44 drh Exp $
*/
#include "sqliteInt.h"

/*
** The number of bits in a Bitmask.  "BMS" means "BitMask Size".
*/
#define BMS  (sizeof(Bitmask)*8)
539
540
541
542
543
544
545








546
547
548
549
550
551
552
  int nTerm;                   /* Number of ORDER BY terms */
  struct ExprList_item *pTerm; /* A term of the ORDER BY clause */
  sqlite3 *db = pParse->db;

  assert( pOrderBy!=0 );
  nTerm = pOrderBy->nExpr;
  assert( nTerm>0 );









  /* Match terms of the ORDER BY clause against columns of
  ** the index.
  */
  for(i=j=0, pTerm=pOrderBy->a; j<nTerm && i<pIdx->nColumn; i++){
    Expr *pExpr;       /* The expression of the ORDER BY pTerm */
    CollSeq *pColl;    /* The collating sequence of pExpr */







>
>
>
>
>
>
>
>







539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
  int nTerm;                   /* Number of ORDER BY terms */
  struct ExprList_item *pTerm; /* A term of the ORDER BY clause */
  sqlite3 *db = pParse->db;

  assert( pOrderBy!=0 );
  nTerm = pOrderBy->nExpr;
  assert( nTerm>0 );

  /* A UNIQUE index that is fully specified is always a sorting
  ** index.
  */
  if( pIdx->onError!=OE_None && nEqCol==pIdx->nColumn ){
    *pbRev = 0;
    return 1;
  }

  /* Match terms of the ORDER BY clause against columns of
  ** the index.
  */
  for(i=j=0, pTerm=pOrderBy->a; j<nTerm && i<pIdx->nColumn; i++){
    Expr *pExpr;       /* The expression of the ORDER BY pTerm */
    CollSeq *pColl;    /* The collating sequence of pExpr */
614
615
616
617
618
619
620



















621
622
623
624
625
626
627
  p = pOrderBy->a[0].pExpr;
  if( p->op==TK_COLUMN && p->iTable==base && p->iColumn==-1 ){
    *pbRev = pOrderBy->a[0].sortOrder;
    return 1;
  }
  return 0;
}




















/*
** Find the best index for accessing a particular table.  Return a pointer
** to the index, flags that describe how the index should be used, the
** number of equality constraints and the "cost" for this index.
**
** The lowest cost index wins.  The cost is an estimate of the amount of







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
  p = pOrderBy->a[0].pExpr;
  if( p->op==TK_COLUMN && p->iTable==base && p->iColumn==-1 ){
    *pbRev = pOrderBy->a[0].sortOrder;
    return 1;
  }
  return 0;
}

/*
** Prepare a crude estimate of the logorithm of the input value.
** The results need not be exact.  This is only used for estimating
** the total cost of performing operatings with O(logN) or O(NlogN)
** complexity.  Because N is just a guess, it is no great tragedy if
** logN is a little off.
**
** We can assume N>=1.0;
*/
static double estLog(double N){
  double logN = 1.0;
  double x = 10.0;
  while( N>x ){
    logN = logN+1.0;
    x *= 10;
  }
  return logN;
}

/*
** Find the best index for accessing a particular table.  Return a pointer
** to the index, flags that describe how the index should be used, the
** number of equality constraints and the "cost" for this index.
**
** The lowest cost index wins.  The cost is an estimate of the amount of
664
665
666
667
668
669
670


671
672
673
674
675
676


677

678



679
680
681
682
683
684

685
686
687
688




689
690
691
692
693
694
695
696
697
698
699
700
701
702
703




704
705
706
707
708
709


710

711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
  /* Check for a rowid=EXPR or rowid IN (...) constraints
  */
  pTerm = findTerm(pWC, iCur, -1, notReady, WO_EQ|WO_IN, 0);
  if( pTerm ){
    *ppIndex = 0;
    bestFlags = WHERE_ROWID_EQ;
    if( pTerm->operator & WO_EQ ){


      *pFlags = WHERE_ROWID_EQ;
      *pnEq = 1;
      if( pOrderBy ) *pFlags |= WHERE_ORDERBY;
      TRACE(("... best is rowid\n"));
      return 0.0;
    }else if( pTerm->operator & WO_LIST ){


      lowestCost = pTerm->pExpr->pList->nExpr;

    }else{



      lowestCost = 100.0;
    }
    TRACE(("... rowid IN cost: %g\n", lowestCost));
  }

  /* Check for constraints on a range of rowids or a full table scan.

  */
  pProbe = pSrc->pTab->pIndex;
  cost = pProbe ? pProbe->aiRowEst[0] : 100000.0;
  TRACE(("... base cost: %g\n", cost));




  pTerm = findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE|WO_GT|WO_GE, 0);
  if( pTerm ){
    flags = WHERE_ROWID_RANGE;
    if( findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE, 0) ){
      flags |= WHERE_TOP_LIMIT;
      cost *= 0.25;  /* Guess that rowid<EXPR eliminates 75% of the search */
    }
    if( findTerm(pWC, iCur, -1, notReady, WO_GT|WO_GE, 0) ){
      flags |= WHERE_BTM_LIMIT;
      cost *= 0.25;  /* Guess that rowid>EXPR eliminates 75% of the search */
    }
    TRACE(("... rowid range cost: %g\n", cost));
  }else{
    flags = 0;
  }




  if( pOrderBy && sortableByRowid(iCur, pOrderBy, &rev) ){
    flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE;
    cost *= 0.5;
    if( rev ){
      flags |= WHERE_REVERSE;
    }


    TRACE(("... order by reduces cost to %g\n", cost));

  }
  if( cost<lowestCost ){
    lowestCost = cost;
    bestFlags = flags;
  }

  /* Look at each index.
  */
  for(; pProbe; pProbe=pProbe->pNext){
    int i;                       /* Loop counter */
    double inMultiplier = 2.0;   /* Includes built-in index lookup penalty */

    TRACE(("... index %s:\n", pProbe->zName));

    /* Count the number of columns in the index that are satisfied
    ** by x=EXPR constraints or x IN (...) constraints.
    */
    flags = 0;







>
>






>
>

>

>
>
>
|




|
>


|
|
>
>
>
>


<


|



|

|



>
>
>
>
|
|
<
|
|
|
>
>
|
>










|







691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730

731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748

749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
  /* Check for a rowid=EXPR or rowid IN (...) constraints
  */
  pTerm = findTerm(pWC, iCur, -1, notReady, WO_EQ|WO_IN, 0);
  if( pTerm ){
    *ppIndex = 0;
    bestFlags = WHERE_ROWID_EQ;
    if( pTerm->operator & WO_EQ ){
      /* Rowid== is always the best pick.  Look no further.  Because only
      ** a single row is generated, output is always in sorted order */
      *pFlags = WHERE_ROWID_EQ;
      *pnEq = 1;
      if( pOrderBy ) *pFlags |= WHERE_ORDERBY;
      TRACE(("... best is rowid\n"));
      return 0.0;
    }else if( pTerm->operator & WO_LIST ){
      /* Rowid IN (LIST): cost is NlogN where N is the number of list
      ** elements.  */
      lowestCost = pTerm->pExpr->pList->nExpr;
      lowestCost *= estLog(lowestCost);
    }else{
      /* Rowid IN (SELECT): cost is NlogN where N is the number of rows
      ** in the result of the inner select.  We have no way to estimate
      ** that value so make a wild guess. */
      lowestCost = 200.0;
    }
    TRACE(("... rowid IN cost: %g\n", lowestCost));
  }

  /* Estimate the cost of a table scan.  If we do not know how many
  ** entries are in the table, use 1 million as a guess.
  */
  pProbe = pSrc->pTab->pIndex;
  cost = pProbe ? pProbe->aiRowEst[0] : 1000000.0;
  TRACE(("... table scan base cost: %g\n", cost));
  flags = WHERE_ROWID_RANGE;

  /* Check for constraints on a range of rowids in a table scan.
  */
  pTerm = findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE|WO_GT|WO_GE, 0);
  if( pTerm ){

    if( findTerm(pWC, iCur, -1, notReady, WO_LT|WO_LE, 0) ){
      flags |= WHERE_TOP_LIMIT;
      cost *= 0.333;  /* Guess that rowid<EXPR eliminates two-thirds or rows */
    }
    if( findTerm(pWC, iCur, -1, notReady, WO_GT|WO_GE, 0) ){
      flags |= WHERE_BTM_LIMIT;
      cost *= 0.333;  /* Guess that rowid>EXPR eliminates two-thirds of rows */
    }
    TRACE(("... rowid range reduces cost to %g\n", cost));
  }else{
    flags = 0;
  }

  /* If the table scan does not satisfy the ORDER BY clause, increase
  ** the cost by NlogN to cover the expense of sorting. */
  if( pOrderBy ){
    if( sortableByRowid(iCur, pOrderBy, &rev) ){
      flags |= WHERE_ORDERBY|WHERE_ROWID_RANGE;

      if( rev ){
        flags |= WHERE_REVERSE;
      }
    }else{
      cost += cost*estLog(cost);
      TRACE(("... sorting increases cost to %g\n", cost));
    }
  }
  if( cost<lowestCost ){
    lowestCost = cost;
    bestFlags = flags;
  }

  /* Look at each index.
  */
  for(; pProbe; pProbe=pProbe->pNext){
    int i;                       /* Loop counter */
    double inMultiplier = 1.0;

    TRACE(("... index %s:\n", pProbe->zName));

    /* Count the number of columns in the index that are satisfied
    ** by x=EXPR constraints or x IN (...) constraints.
    */
    flags = 0;
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768

769
770
771
772
773
774
775
776
777
778


779

780
781
782
783
784
785
786
        if( pTerm->operator & WO_SELECT ){
          inMultiplier *= 100.0;
        }else if( pTerm->operator & WO_LIST ){
          inMultiplier *= pTerm->pExpr->pList->nExpr + 1.0;
        }
      }
    }
    cost = pProbe->aiRowEst[i] * inMultiplier;
    nEq = i;
    TRACE(("...... nEq=%d inMult=%g cost=%g\n", nEq, inMultiplier, cost));

    /* Look for range constraints
    */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE|WO_GT|WO_GE, pProbe);
      if( pTerm ){
        flags = WHERE_COLUMN_RANGE;
        if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){
          flags |= WHERE_TOP_LIMIT;
          cost *= 0.5;
        }
        if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){
          flags |= WHERE_BTM_LIMIT;
          cost *= 0.5;
        }
        TRACE(("...... range reduces cost to %g\n", cost));
      }
    }

    /* Reduce the cost substantially if this index can be used to satisfy
    ** the ORDER BY clause
    */

    if( pOrderBy && (flags & WHERE_COLUMN_IN)==0 &&
        isSortingIndex(pParse, pProbe, pSrc->pTab, iCur, pOrderBy, nEq, &rev) ){
      if( flags==0 ){
        flags = WHERE_COLUMN_RANGE;
      }
      flags |= WHERE_ORDERBY;
      cost *= 0.5;
      if( rev ){
        flags |= WHERE_REVERSE;
      }


      TRACE(("...... orderby reduces cost to %g\n", cost));

    }

    /* Check to see if we can get away with using just the index without
    ** ever reading the table.  If that is the case, then halve the
    ** cost of this index.
    */
    if( flags && pSrc->colUsed < (((Bitmask)1)<<(BMS-1)) ){







|












|



|





<
|

>
|

|
|
|
|
<
|
|
|
>
>
|
>







781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810

811
812
813
814
815
816
817
818
819

820
821
822
823
824
825
826
827
828
829
830
831
832
833
        if( pTerm->operator & WO_SELECT ){
          inMultiplier *= 100.0;
        }else if( pTerm->operator & WO_LIST ){
          inMultiplier *= pTerm->pExpr->pList->nExpr + 1.0;
        }
      }
    }
    cost = pProbe->aiRowEst[i] * inMultiplier * estLog(inMultiplier);
    nEq = i;
    TRACE(("...... nEq=%d inMult=%g cost=%g\n", nEq, inMultiplier, cost));

    /* Look for range constraints
    */
    if( nEq<pProbe->nColumn ){
      int j = pProbe->aiColumn[nEq];
      pTerm = findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE|WO_GT|WO_GE, pProbe);
      if( pTerm ){
        flags = WHERE_COLUMN_RANGE;
        if( findTerm(pWC, iCur, j, notReady, WO_LT|WO_LE, pProbe) ){
          flags |= WHERE_TOP_LIMIT;
          cost *= 0.333;
        }
        if( findTerm(pWC, iCur, j, notReady, WO_GT|WO_GE, pProbe) ){
          flags |= WHERE_BTM_LIMIT;
          cost *= 0.333;
        }
        TRACE(("...... range reduces cost to %g\n", cost));
      }
    }


    /* Add the additional cost of sorting if that is a factor.
    */
    if( pOrderBy ){
      if( (flags & WHERE_COLUMN_IN)==0 &&
        isSortingIndex(pParse, pProbe, pSrc->pTab, iCur, pOrderBy, nEq, &rev) ){
        if( flags==0 ){
          flags = WHERE_COLUMN_RANGE;
        }
        flags |= WHERE_ORDERBY;

        if( rev ){
          flags |= WHERE_REVERSE;
        }
      }else{
        cost += cost*estLog(cost);
        TRACE(("...... orderby reduces cost to %g\n", cost));
      }
    }

    /* Check to see if we can get away with using just the index without
    ** ever reading the table.  If that is the case, then halve the
    ** cost of this index.
    */
    if( flags && pSrc->colUsed < (((Bitmask)1)<<(BMS-1)) ){