/ Check-in [7337293c]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improvements and tests for detection of redundant DISTINCT qualifiers.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1:7337293c87fb563604dd6ad284f2d1e30c938b4c
User & Date: dan 2011-07-01 14:21:38
Context
2011-07-01
18:26
Improve use of indexes to optimize DISTINCT queries. check-in: 6c202ea0 user: dan tags: experimental
14:21
Improvements and tests for detection of redundant DISTINCT qualifiers. check-in: 7337293c user: dan tags: experimental
2011-06-30
20:17
Experimental changes to improve optimization of DISTINCT queries. check-in: f7ba0219 user: dan tags: experimental
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/select.c.

3844
3845
3846
3847
3848
3849
3850
3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
























3874
3875
3876
3877
3878
3879
3880
....
3931
3932
3933
3934
3935
3936
3937




3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
....
3966
3967
3968
3969
3970
3971
3972


3973
3974
3975
3976
3977
3978
3979
    }
    rc = multiSelect(pParse, p, pDest);
    explainSetInteger(pParse->iSelectId, iRestoreSelectId);
    return rc;
  }
#endif

  /* If possible, rewrite the query to use GROUP BY instead of DISTINCT.
  ** GROUP BY might use an index, DISTINCT never does.
  */
#if 0
  assert( p->pGroupBy==0 || (p->selFlags & SF_Aggregate)!=0 );
  if( (p->selFlags & (SF_Distinct|SF_Aggregate))==SF_Distinct ){
    p->pGroupBy = sqlite3ExprListDup(db, p->pEList, 0);
    pGroupBy = p->pGroupBy;
    p->selFlags &= ~SF_Distinct;
  }
#endif

  /* If there is both a GROUP BY and an ORDER BY clause and they are
  ** identical, then disable the ORDER BY clause since the GROUP BY
  ** will cause elements to come out in the correct order.  This is
  ** an optimization - the correct answer should result regardless.
  ** Use the SQLITE_GroupByOrder flag with SQLITE_TESTCTRL_OPTIMIZER
  ** to disable this optimization for testing purposes.
  */
  if( sqlite3ExprListCompare(p->pGroupBy, pOrderBy)==0
         && (db->flags & SQLITE_GroupByOrder)==0 ){
    pOrderBy = 0;
  }

























  /* If there is an ORDER BY clause, then this sorting
  ** index might end up being unused if the data can be 
  ** extracted in pre-sorted order.  If that is the case, then the
  ** OP_OpenEphemeral instruction will be changed to an OP_Noop once
  ** we figure out that the sorting index is not needed.  The addrSortIndex
  ** variable is used to facilitate that change.
................................................................................
    */
    if( addrSortIndex>=0 && pOrderBy==0 ){
      sqlite3VdbeChangeToNoop(v, addrSortIndex, 1);
      p->addrOpenEphm[2] = -1;
    }

    if( pWInfo->eDistinct ){




      assert( isDistinct );
      assert( pWInfo->eDistinct==WHERE_DISTINCT_ORDERED 
           || pWInfo->eDistinct==WHERE_DISTINCT_UNIQUE 
      );
      distinct = -1;
      if( pWInfo->eDistinct==WHERE_DISTINCT_ORDERED ){
        int iJump;
        int iExpr;
        int iFlag = ++pParse->nMem;
        int iBase = pParse->nMem+1;
        int iBase2 = iBase + pEList->nExpr;
        pParse->nMem += (pEList->nExpr*2);
        VdbeOp *pOp;

        /* Change the OP_OpenEphemeral coded earlier to an OP_Integer. The
        ** OP_Integer initializes the "first row" flag.  */
        pOp = sqlite3VdbeGetOp(v, addrDistinctIndex);
        pOp->opcode = OP_Integer;
        pOp->p1 = 1;
        pOp->p2 = iFlag;

        sqlite3ExprCodeExprList(pParse, pEList, iBase, 1);
        iJump = sqlite3VdbeCurrentAddr(v) + 1 + pEList->nExpr + 1 + 1;
        sqlite3VdbeAddOp2(v, OP_If, iFlag, iJump-1);
................................................................................
          sqlite3VdbeChangeP5(v, SQLITE_NULLEQ);
        }
        sqlite3VdbeAddOp2(v, OP_Goto, 0, pWInfo->iContinue);

        sqlite3VdbeAddOp2(v, OP_Integer, 0, iFlag);
        assert( sqlite3VdbeCurrentAddr(v)==iJump );
        sqlite3VdbeAddOp3(v, OP_Move, iBase, iBase2, pEList->nExpr);


      }
    }

    /* Use the standard inner loop. */
    selectInnerLoop(pParse, p, pEList, 0, 0, pOrderBy, distinct, pDest,
                    pWInfo->iContinue, pWInfo->iBreak);








<
<
<
<
<
<
<
<
<
<
<
<











>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>












<



<







 







>
>







3844
3845
3846
3847
3848
3849
3850












3851
3852
3853
3854
3855
3856
3857
3858
3859
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
....
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
3964
3965

3966
3967
3968

3969
3970
3971
3972
3973
3974
3975
....
3980
3981
3982
3983
3984
3985
3986
3987
3988
3989
3990
3991
3992
3993
3994
3995
    }
    rc = multiSelect(pParse, p, pDest);
    explainSetInteger(pParse->iSelectId, iRestoreSelectId);
    return rc;
  }
#endif













  /* If there is both a GROUP BY and an ORDER BY clause and they are
  ** identical, then disable the ORDER BY clause since the GROUP BY
  ** will cause elements to come out in the correct order.  This is
  ** an optimization - the correct answer should result regardless.
  ** Use the SQLITE_GroupByOrder flag with SQLITE_TESTCTRL_OPTIMIZER
  ** to disable this optimization for testing purposes.
  */
  if( sqlite3ExprListCompare(p->pGroupBy, pOrderBy)==0
         && (db->flags & SQLITE_GroupByOrder)==0 ){
    pOrderBy = 0;
  }

  /* If the query is DISTINCT with an ORDER BY but is not an aggregate, and 
  ** if the select-list is the same as the ORDER BY list, then this query
  ** can be rewritten as a GROUP BY. In other words, this:
  **
  **     SELECT DISTINCT xyz FROM ... ORDER BY xyz
  **
  ** is transformed to:
  **
  **     SELECT xyz FROM ... GROUP BY xyz
  **
  ** The second form is preferred as a single index (or temp-table) may be 
  ** used for both the ORDER BY and DISTINCT processing. As originally 
  ** written the query must use a temp-table for at least one of the ORDER 
  ** BY and DISTINCT, and an index or separate temp-table for the other.
  */
  if( (p->selFlags & (SF_Distinct|SF_Aggregate))==SF_Distinct 
   && sqlite3ExprListCompare(pOrderBy, p->pEList)==0
  ){
    p->selFlags &= ~SF_Distinct;
    p->pGroupBy = sqlite3ExprListDup(db, p->pEList, 0);
    pGroupBy = p->pGroupBy;
    pOrderBy = 0;
  }

  /* If there is an ORDER BY clause, then this sorting
  ** index might end up being unused if the data can be 
  ** extracted in pre-sorted order.  If that is the case, then the
  ** OP_OpenEphemeral instruction will be changed to an OP_Noop once
  ** we figure out that the sorting index is not needed.  The addrSortIndex
  ** variable is used to facilitate that change.
................................................................................
    */
    if( addrSortIndex>=0 && pOrderBy==0 ){
      sqlite3VdbeChangeToNoop(v, addrSortIndex, 1);
      p->addrOpenEphm[2] = -1;
    }

    if( pWInfo->eDistinct ){
      VdbeOp *pOp;                /* No longer required OpenEphemeral instr. */
     
      pOp = sqlite3VdbeGetOp(v, addrDistinctIndex);

      assert( isDistinct );
      assert( pWInfo->eDistinct==WHERE_DISTINCT_ORDERED 
           || pWInfo->eDistinct==WHERE_DISTINCT_UNIQUE 
      );
      distinct = -1;
      if( pWInfo->eDistinct==WHERE_DISTINCT_ORDERED ){
        int iJump;
        int iExpr;
        int iFlag = ++pParse->nMem;
        int iBase = pParse->nMem+1;
        int iBase2 = iBase + pEList->nExpr;
        pParse->nMem += (pEList->nExpr*2);


        /* Change the OP_OpenEphemeral coded earlier to an OP_Integer. The
        ** OP_Integer initializes the "first row" flag.  */

        pOp->opcode = OP_Integer;
        pOp->p1 = 1;
        pOp->p2 = iFlag;

        sqlite3ExprCodeExprList(pParse, pEList, iBase, 1);
        iJump = sqlite3VdbeCurrentAddr(v) + 1 + pEList->nExpr + 1 + 1;
        sqlite3VdbeAddOp2(v, OP_If, iFlag, iJump-1);
................................................................................
          sqlite3VdbeChangeP5(v, SQLITE_NULLEQ);
        }
        sqlite3VdbeAddOp2(v, OP_Goto, 0, pWInfo->iContinue);

        sqlite3VdbeAddOp2(v, OP_Integer, 0, iFlag);
        assert( sqlite3VdbeCurrentAddr(v)==iJump );
        sqlite3VdbeAddOp3(v, OP_Move, iBase, iBase2, pEList->nExpr);
      }else{
        pOp->opcode = OP_Noop;
      }
    }

    /* Use the standard inner loop. */
    selectInnerLoop(pParse, p, pEList, 0, 0, pOrderBy, distinct, pDest,
                    pWInfo->iContinue, pWInfo->iBreak);

Changes to src/where.c.

4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
....
4339
4340
4341
4342
4343
4344
4345

4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
  Parse *pParse,
  SrcList *pTabList,
  WhereClause *pWC,
  ExprList *pDistinct
){
  Table *pTab;
  Index *pIdx;
  int i;
  int iBase;

  /* If there is more than one table or sub-select in the FROM clause of
  ** this query, then it will not be possible to show that the DISTINCT 
  ** clause is redundant. */
  if( pTabList->nSrc!=1 ) return 0;
  iBase = pTabList->a[0].iCursor;
  pTab = pTabList->a[0].pTab;

  /* Check if all the expressions in the ExprList are of type TK_COLUMN and
  ** on the same table. If this is not the case, return early, since it will 
  ** not be possible to prove that the DISTINCT qualifier is redundant. 
  ** If any of the expressions is an IPK column, then return true.
  */
  for(i=0; i<pDistinct->nExpr; i++){
    Expr *p = pDistinct->a[i].pExpr;
    if( p->op!=TK_COLUMN || p->iTable!=iBase ) return 0;
    if( p->iColumn<0 ) return 1;
  }

  /* Loop through all indices on the table, checking each to see if it makes
  ** the DISTINCT qualifier redundant. It does so if:
  **
  **   1. The index is itself UNIQUE, and
  **
................................................................................
      int iCol = pIdx->aiColumn[i];
      const char *zColl = pIdx->azColl[i];

      if( 0==findTerm(pWC, iBase, iCol, ~(Bitmask)0, WO_EQ, pIdx) ){
        int j;
        for(j=0; j<pDistinct->nExpr; j++){
          Expr *p = pDistinct->a[j].pExpr;

          if( p->iColumn==iCol ){
            CollSeq *pColl = sqlite3ExprCollSeq(pParse, p);
            const char *zEColl = (pColl ? pColl : pParse->db->pDfltColl)->zName;
            if( 0==sqlite3StrICmp(zColl, zEColl) ) break;
          }
        }
        if( j==pDistinct->nExpr ) break;
      }
    }
    if( i==pIdx->nColumn ){
      /* This index implies that the DISTINCT qualifier is redundant. */







|









<
<
<
|
<


|
|







 







>
|

<
|







4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314



4315

4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
....
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344

4345
4346
4347
4348
4349
4350
4351
4352
  Parse *pParse,
  SrcList *pTabList,
  WhereClause *pWC,
  ExprList *pDistinct
){
  Table *pTab;
  Index *pIdx;
  int i;                          
  int iBase;

  /* If there is more than one table or sub-select in the FROM clause of
  ** this query, then it will not be possible to show that the DISTINCT 
  ** clause is redundant. */
  if( pTabList->nSrc!=1 ) return 0;
  iBase = pTabList->a[0].iCursor;
  pTab = pTabList->a[0].pTab;




  /* If any of the expressions is an IPK column, then return true. */

  for(i=0; i<pDistinct->nExpr; i++){
    Expr *p = pDistinct->a[i].pExpr;
    assert( p->op!=TK_COLUMN || p->iTable==iBase );
    if( p->op==TK_COLUMN && p->iColumn<0 ) return 1;
  }

  /* Loop through all indices on the table, checking each to see if it makes
  ** the DISTINCT qualifier redundant. It does so if:
  **
  **   1. The index is itself UNIQUE, and
  **
................................................................................
      int iCol = pIdx->aiColumn[i];
      const char *zColl = pIdx->azColl[i];

      if( 0==findTerm(pWC, iBase, iCol, ~(Bitmask)0, WO_EQ, pIdx) ){
        int j;
        for(j=0; j<pDistinct->nExpr; j++){
          Expr *p = pDistinct->a[j].pExpr;
          assert( p->op!=TK_COLUMN || p->iTable==iBase );
          if( p->op==TK_COLUMN && p->iColumn==iCol ){
            CollSeq *pColl = sqlite3ExprCollSeq(pParse, p);

            if( pColl && 0==sqlite3StrICmp(zColl, pColl->zName) ) break;
          }
        }
        if( j==pDistinct->nExpr ) break;
      }
    }
    if( i==pIdx->nColumn ){
      /* This index implies that the DISTINCT qualifier is redundant. */

Added test/distinct.test.







































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# 2011 July 1
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is the DISTINCT modifier.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set testprefix distinct


proc is_distinct_noop {sql} {
  set sql1 $sql
  set sql2 [string map {DISTINCT ""} $sql]

  set program1 [list]
  set program2 [list]
  db eval "EXPLAIN $sql1" {
    if {$opcode != "Noop"} { lappend program1 $opcode }
  }
  db eval "EXPLAIN $sql2" {
    if {$opcode != "Noop"} { lappend program2 $opcode }
  }

  return [expr {$program1==$program2}]
}

proc do_distinct_noop_test {tn sql} {
  uplevel [list do_test $tn [list is_distinct_noop $sql] 1]
}
proc do_distinct_not_noop_test {tn sql} {
  uplevel [list do_test $tn [list is_distinct_noop $sql] 0]
}


#-------------------------------------------------------------------------
# The following tests - distinct-1.* - check that the planner correctly 
# detects cases where a UNIQUE index means that a DISTINCT clause is 
# redundant. Currently the planner only detects such cases when there
# is a single table in the FROM clause.
#
do_execsql_test 1.0 {
  CREATE TABLE t1(a, b, c, d);
  CREATE UNIQUE INDEX i1 ON t1(b, c);
  CREATE UNIQUE INDEX i2 ON t1(d COLLATE nocase);

  CREATE TABLE t2(x INTEGER PRIMARY KEY, y);

  CREATE TABLE t3(c1 PRIMARY KEY, c2);
  CREATE INDEX i3 ON t3(c2);
}
foreach {tn noop sql} {

  1   1   "SELECT DISTINCT b, c FROM t1"
  2   1   "SELECT DISTINCT c FROM t1 WHERE b = ?"
  3   1   "SELECT DISTINCT rowid FROM t1"
  4   1   "SELECT DISTINCT rowid, a FROM t1"
  5   1   "SELECT DISTINCT x FROM t2"
  6   1   "SELECT DISTINCT * FROM t2"
  7   1   "SELECT DISTINCT * FROM (SELECT * FROM t2)"

  8   1   "SELECT DISTINCT * FROM t1"

  8   0   "SELECT DISTINCT a, b FROM t1"

  9   0   "SELECT DISTINCT c FROM t1 WHERE b IN (1,2)"
  10  0   "SELECT DISTINCT c FROM t1"
  11  0   "SELECT DISTINCT b FROM t1"

  12  0   "SELECT DISTINCT a, d FROM t1"
  13  0   "SELECT DISTINCT a, b, c COLLATE nocase FROM t1"
  14  1   "SELECT DISTINCT a, d COLLATE nocase FROM t1"
  15  0   "SELECT DISTINCT a, d COLLATE binary FROM t1"
  16  1   "SELECT DISTINCT a, b, c COLLATE binary FROM t1"

  16  0   "SELECT DISTINCT t1.rowid FROM t1, t2"
  17  0   { /* Technically, it would be possible to detect that DISTINCT
            ** is a no-op in cases like the following. But SQLite does not
            ** do so. */
            SELECT DISTINCT t1.rowid FROM t1, t2 WHERE t1.rowid=t2.rowid }

  18  1   "SELECT DISTINCT c1, c2 FROM t3"
  19  1   "SELECT DISTINCT c1 FROM t3"
  20  1   "SELECT DISTINCT * FROM t3"
  21  0   "SELECT DISTINCT c2 FROM t3"

  22  0   "SELECT DISTINCT * FROM (SELECT 1, 2, 3 UNION SELECT 4, 5, 6)"
  23  1   "SELECT DISTINCT rowid FROM (SELECT 1, 2, 3 UNION SELECT 4, 5, 6)"

  24  0   "SELECT DISTINCT rowid/2 FROM t1"
  25  1   "SELECT DISTINCT rowid/2, rowid FROM t1"
  26  1   "SELECT DISTINCT rowid/2, b FROM t1 WHERE c = ?"
} {
  if {$noop} {
    do_distinct_noop_test 1.$tn $sql
  } else {
    do_distinct_not_noop_test 1.$tn $sql
  }
}






finish_test