SQLite

Check-in [6c202ea024]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improve use of indexes to optimize DISTINCT queries.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | experimental
Files: files | file ages | folders
SHA1: 6c202ea0247ff509f696eee3839975a88ed26cf2
User & Date: dan 2011-07-01 18:26:40.769
Context
2011-07-01
18:43
Merge latest trunk changes with experimental branch. (check-in: e56be74eab user: dan tags: experimental)
18:26
Improve use of indexes to optimize DISTINCT queries. (check-in: 6c202ea024 user: dan tags: experimental)
14:21
Improvements and tests for detection of redundant DISTINCT qualifiers. (check-in: 7337293c87 user: dan tags: experimental)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/where.c.
1394
1395
1396
1397
1398
1399
1400






















































































































































1401
1402
1403
1404
1405
1406
1407
    if( (exprTableUsage(pMaskSet, pList->a[iFirst++].pExpr)&allowed)!=0 ){
      return 1;
    }
  }
  return 0;
}
























































































































































/*
** This routine decides if pIdx can be used to satisfy the ORDER BY
** clause.  If it can, it returns 1.  If pIdx cannot satisfy the
** ORDER BY clause, this routine returns 0.
**
** pOrderBy is an ORDER BY clause from a SELECT statement.  pTab is the







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
    if( (exprTableUsage(pMaskSet, pList->a[iFirst++].pExpr)&allowed)!=0 ){
      return 1;
    }
  }
  return 0;
}

/*
** This function searches the expression list passed as the second argument
** for an expression of type TK_COLUMN that refers to the same column and
** uses the same collation sequence as the iCol'th column of index pIdx.
** Argument iBase is the cursor number used for the table that pIdx refers
** to.
**
** If such an expression is found, its index in pList->a[] is returned. If
** no expression is found, -1 is returned.
*/
static int findIndexCol(
  Parse *pParse,                  /* Parse context */
  ExprList *pList,                /* Expression list to search */
  int iBase,                      /* Cursor for table associated with pIdx */
  Index *pIdx,                    /* Index to match column of */
  int iCol                        /* Column of index to match */
){
  int i;
  const char *zColl = pIdx->azColl[iCol];

  for(i=0; i<pList->nExpr; i++){
    Expr *p = pList->a[i].pExpr;
    if( pIdx->aiColumn[iCol]==p->iColumn && iBase==p->iTable ){
      CollSeq *pColl = sqlite3ExprCollSeq(pParse, p);
      if( pColl && 0==sqlite3StrICmp(pColl->zName, zColl) ){
        return i;
      }
    }
  }

  return -1;
}

/*
** This routine determines if pIdx can be used to assist in processing a
** DISTINCT qualifier. In other words, it tests whether or not using this
** index for the outer loop guarantees that rows with equal values for
** all expressions in the pDistinct list are delivered grouped together.
**
** For example, the query 
**
**   SELECT DISTINCT a, b, c FROM tbl WHERE a = ?
**
** can benefit from any index on columns "b" and "c".
*/
static int isDistinctIndex(
  Parse *pParse,                  /* Parsing context */
  WhereClause *pWC,               /* The WHERE clause */
  Index *pIdx,                    /* The index being considered */
  int base,                       /* Cursor number for the table pIdx is on */
  ExprList *pDistinct,            /* The DISTINCT expressions */
  int nEqCol                      /* Number of index columns with == */
){
  Bitmask mask = 0;               /* Mask of unaccounted for pDistinct exprs */
  int i;                          /* Iterator variable */

  if( pIdx->zName==0 || pDistinct==0 || pDistinct->nExpr>=BMS ) return 0;

  /* Loop through all the expressions in the distinct list. If any of them
  ** are not simple column references, return early. Otherwise, test if the
  ** WHERE clause contains a "col=X" clause. If it does, the expression
  ** can be ignored. If it does not, and the column does not belong to the
  ** same table as index pIdx, return early. Finally, if there is no
  ** matching "col=X" expression and the column is on the same table as pIdx,
  ** set the corresponding bit in variable mask.
  */
  for(i=0; i<pDistinct->nExpr; i++){
    WhereTerm *pTerm;
    Expr *p = pDistinct->a[i].pExpr;
    if( p->op!=TK_COLUMN ) return 0;
    pTerm = findTerm(pWC, p->iTable, p->iColumn, ~(Bitmask)0, WO_EQ, 0);
    if( pTerm ){
      Expr *pX = pTerm->pExpr;
      CollSeq *p1 = sqlite3BinaryCompareCollSeq(pParse, pX->pLeft, pX->pRight);
      CollSeq *p2 = sqlite3ExprCollSeq(pParse, p);
      if( p1==p2 ) continue;
    }
    if( p->iTable!=base ) return 0;
    mask |= (((Bitmask)1) << i);
  }

  for(i=nEqCol; mask && i<pIdx->nColumn; i++){
    int iExpr = findIndexCol(pParse, pDistinct, base, pIdx, i);
    if( iExpr<0 ) break;
    mask &= ~(((Bitmask)1) << iExpr);
  }

  return (mask==0);
}


/*
** Return true if the DISTINCT expression-list passed as the third argument
** is redundant. A DISTINCT list is redundant if the database contains a
** UNIQUE index that guarantees that the result of the query will be distinct
** anyway.
*/
static int isDistinctRedundant(
  Parse *pParse,
  SrcList *pTabList,
  WhereClause *pWC,
  ExprList *pDistinct
){
  Table *pTab;
  Index *pIdx;
  int i;                          
  int iBase;

  /* If there is more than one table or sub-select in the FROM clause of
  ** this query, then it will not be possible to show that the DISTINCT 
  ** clause is redundant. */
  if( pTabList->nSrc!=1 ) return 0;
  iBase = pTabList->a[0].iCursor;
  pTab = pTabList->a[0].pTab;

  /* If any of the expressions is an IPK column, then return true. */
  for(i=0; i<pDistinct->nExpr; i++){
    Expr *p = pDistinct->a[i].pExpr;
    assert( p->op!=TK_COLUMN || p->iTable==iBase );
    if( p->op==TK_COLUMN && p->iColumn<0 ) return 1;
  }

  /* Loop through all indices on the table, checking each to see if it makes
  ** the DISTINCT qualifier redundant. It does so if:
  **
  **   1. The index is itself UNIQUE, and
  **
  **   2. All of the columns in the index are either part of the pDistinct
  **      list, or else the WHERE clause contains a term of the form "col=X",
  **      where X is a constant value. The collation sequences of the
  **      comparison and select-list expressions must match those of the index.
  */
  for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){
    if( pIdx->onError==OE_None ) continue;
    for(i=0; i<pIdx->nColumn; i++){
      int iCol = pIdx->aiColumn[i];
      if( 0==findTerm(pWC, iBase, iCol, ~(Bitmask)0, WO_EQ, pIdx) 
       && 0>findIndexCol(pParse, pDistinct, iBase, pIdx, i)
      ){
        break;
      }
    }
    if( i==pIdx->nColumn ){
      /* This index implies that the DISTINCT qualifier is redundant. */
      return 1;
    }
  }

  return 0;
}

/*
** This routine decides if pIdx can be used to satisfy the ORDER BY
** clause.  If it can, it returns 1.  If pIdx cannot satisfy the
** ORDER BY clause, this routine returns 0.
**
** pOrderBy is an ORDER BY clause from a SELECT statement.  pTab is the
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
      wsFlags |= WHERE_ROWID_RANGE|WHERE_COLUMN_RANGE|WHERE_ORDERBY;
      wsFlags |= (rev ? WHERE_REVERSE : 0);
    }

    /* If there is a DISTINCT qualifier and this index will scan rows in
    ** order of the DISTINCT expressions, clear bDist and set the appropriate
    ** flags in wsFlags. */
    if( isSortingIndex(
          pParse, pWC->pMaskSet, pProbe, iCur, pDistinct, nEq, wsFlags, 0)
    ){
      bDist = 0;
      wsFlags |= WHERE_ROWID_RANGE|WHERE_COLUMN_RANGE|WHERE_DISTINCT;
    }

    /* If currently calculating the cost of using an index (not the IPK
    ** index), determine if all required column data may be obtained without 
    ** using the main table (i.e. if the index is a covering







<
|
<







3056
3057
3058
3059
3060
3061
3062

3063

3064
3065
3066
3067
3068
3069
3070
      wsFlags |= WHERE_ROWID_RANGE|WHERE_COLUMN_RANGE|WHERE_ORDERBY;
      wsFlags |= (rev ? WHERE_REVERSE : 0);
    }

    /* If there is a DISTINCT qualifier and this index will scan rows in
    ** order of the DISTINCT expressions, clear bDist and set the appropriate
    ** flags in wsFlags. */

    if( isDistinctIndex(pParse, pWC, pProbe, iCur, pDistinct, nEq) ){

      bDist = 0;
      wsFlags |= WHERE_ROWID_RANGE|WHERE_COLUMN_RANGE|WHERE_DISTINCT;
    }

    /* If currently calculating the cost of using an index (not the IPK
    ** index), determine if all required column data may be obtained without 
    ** using the main table (i.e. if the index is a covering
4284
4285
4286
4287
4288
4289
4290
4291
4292
4293
4294
4295
4296
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
4313
4314
4315
4316
4317
4318
4319
4320
4321
4322
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
4354
4355
4356
4357
4358
4359
4360
4361
4362
4363
4364
4365
4366
      }
    }
    whereClauseClear(pWInfo->pWC);
    sqlite3DbFree(db, pWInfo);
  }
}

/*
** Return true if the DISTINCT expression-list passed as the third argument
** is redundant. A DISTINCT list is redundant if the database contains a
** UNIQUE index that guarantees that the result of the query will be distinct
** anyway.
*/
static int whereDistinctRedundant(
  Parse *pParse,
  SrcList *pTabList,
  WhereClause *pWC,
  ExprList *pDistinct
){
  Table *pTab;
  Index *pIdx;
  int i;                          
  int iBase;

  /* If there is more than one table or sub-select in the FROM clause of
  ** this query, then it will not be possible to show that the DISTINCT 
  ** clause is redundant. */
  if( pTabList->nSrc!=1 ) return 0;
  iBase = pTabList->a[0].iCursor;
  pTab = pTabList->a[0].pTab;

  /* If any of the expressions is an IPK column, then return true. */
  for(i=0; i<pDistinct->nExpr; i++){
    Expr *p = pDistinct->a[i].pExpr;
    assert( p->op!=TK_COLUMN || p->iTable==iBase );
    if( p->op==TK_COLUMN && p->iColumn<0 ) return 1;
  }

  /* Loop through all indices on the table, checking each to see if it makes
  ** the DISTINCT qualifier redundant. It does so if:
  **
  **   1. The index is itself UNIQUE, and
  **
  **   2. All of the columns in the index are either part of the pDistinct
  **      list, or else the WHERE clause contains a term of the form "col=X",
  **      where X is a constant value. The collation sequences of the
  **      comparison and select-list expressions must match those of the index.
  */
  for(pIdx=pTab->pIndex; pIdx; pIdx=pIdx->pNext){
    if( pIdx->onError==OE_None ) continue;
    for(i=0; i<pIdx->nColumn; i++){
      int iCol = pIdx->aiColumn[i];
      const char *zColl = pIdx->azColl[i];

      if( 0==findTerm(pWC, iBase, iCol, ~(Bitmask)0, WO_EQ, pIdx) ){
        int j;
        for(j=0; j<pDistinct->nExpr; j++){
          Expr *p = pDistinct->a[j].pExpr;
          assert( p->op!=TK_COLUMN || p->iTable==iBase );
          if( p->op==TK_COLUMN && p->iColumn==iCol ){
            CollSeq *pColl = sqlite3ExprCollSeq(pParse, p);
            if( pColl && 0==sqlite3StrICmp(zColl, pColl->zName) ) break;
          }
        }
        if( j==pDistinct->nExpr ) break;
      }
    }
    if( i==pIdx->nColumn ){
      /* This index implies that the DISTINCT qualifier is redundant. */
      return 1;
    }
  }

  return 0;
}


/*
** Generate the beginning of the loop used for WHERE clause processing.
** The return value is a pointer to an opaque structure that contains
** information needed to terminate the loop.  Later, the calling routine
** should invoke sqlite3WhereEnd() with the return value of this function
** in order to complete the WHERE clause processing.







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







4432
4433
4434
4435
4436
4437
4438





































































4439
4440
4441
4442
4443
4444
4445
      }
    }
    whereClauseClear(pWInfo->pWC);
    sqlite3DbFree(db, pWInfo);
  }
}







































































/*
** Generate the beginning of the loop used for WHERE clause processing.
** The return value is a pointer to an opaque structure that contains
** information needed to terminate the loop.  Later, the calling routine
** should invoke sqlite3WhereEnd() with the return value of this function
** in order to complete the WHERE clause processing.
4579
4580
4581
4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
    goto whereBeginError;
  }

  /* Check if the DISTINCT qualifier, if there is one, is redundant. 
  ** If it is, then set pDistinct to NULL and WhereInfo.eDistinct to
  ** WHERE_DISTINCT_UNIQUE to tell the caller to ignore the DISTINCT.
  */
  if( pDistinct && whereDistinctRedundant(pParse, pTabList, pWC, pDistinct) ){
    pDistinct = 0;
    pWInfo->eDistinct = WHERE_DISTINCT_UNIQUE;
  }

  /* Chose the best index to use for each table in the FROM clause.
  **
  ** This loop fills in the following fields:







|







4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
    goto whereBeginError;
  }

  /* Check if the DISTINCT qualifier, if there is one, is redundant. 
  ** If it is, then set pDistinct to NULL and WhereInfo.eDistinct to
  ** WHERE_DISTINCT_UNIQUE to tell the caller to ignore the DISTINCT.
  */
  if( pDistinct && isDistinctRedundant(pParse, pTabList, pWC, pDistinct) ){
    pDistinct = 0;
    pWInfo->eDistinct = WHERE_DISTINCT_UNIQUE;
  }

  /* Chose the best index to use for each table in the FROM clause.
  **
  ** This loop fills in the following fields:
Changes to test/distinct.test.
37
38
39
40
41
42
43

















44
45
46
47
48
49
50
proc do_distinct_noop_test {tn sql} {
  uplevel [list do_test $tn [list is_distinct_noop $sql] 1]
}
proc do_distinct_not_noop_test {tn sql} {
  uplevel [list do_test $tn [list is_distinct_noop $sql] 0]
}



















#-------------------------------------------------------------------------
# The following tests - distinct-1.* - check that the planner correctly 
# detects cases where a UNIQUE index means that a DISTINCT clause is 
# redundant. Currently the planner only detects such cases when there
# is a single table in the FROM clause.
#







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
proc do_distinct_noop_test {tn sql} {
  uplevel [list do_test $tn [list is_distinct_noop $sql] 1]
}
proc do_distinct_not_noop_test {tn sql} {
  uplevel [list do_test $tn [list is_distinct_noop $sql] 0]
}

proc do_temptables_test {tn sql temptables} {
  uplevel [list do_test $tn [subst -novar {
    set ret ""
    db eval "EXPLAIN [set sql]" {
      if {$opcode == "OpenEphemeral"} { 
        if {$p5 != "10" && $p5!="00"} { error "p5 = $p5" }
        if {$p5 == "10"} {
          lappend ret hash
        } else {
          lappend ret btree
        }
      }
    }
    set ret
  }] $temptables]
}


#-------------------------------------------------------------------------
# The following tests - distinct-1.* - check that the planner correctly 
# detects cases where a UNIQUE index means that a DISTINCT clause is 
# redundant. Currently the planner only detects such cases when there
# is a single table in the FROM clause.
#
103
104
105
106
107
108
109































110
111
112
113
114
115
  if {$noop} {
    do_distinct_noop_test 1.$tn $sql
  } else {
    do_distinct_not_noop_test 1.$tn $sql
  }
}





































finish_test







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>






120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
  if {$noop} {
    do_distinct_noop_test 1.$tn $sql
  } else {
    do_distinct_not_noop_test 1.$tn $sql
  }
}

#-------------------------------------------------------------------------
# The following tests - distinct-2.* - test cases where an index is
# used to deliver results in order of the DISTINCT expressions. 
#
drop_all_tables
do_execsql_test 2.0 {
  CREATE TABLE t1(a, b, c);

  CREATE INDEX i1 ON t1(a, b);
  CREATE INDEX i2 ON t1(b COLLATE nocase, c COLLATE nocase);

  INSERT INTO t1 VALUES('a', 'b', 'c');
  INSERT INTO t1 VALUES('A', 'B', 'C');
  INSERT INTO t1 VALUES('a', 'b', 'c');
  INSERT INTO t1 VALUES('A', 'B', 'C');
}

foreach {tn sql temptables res} {
  1   "a, b FROM t1"                                       {}      {A B a b}
  2   "b, a FROM t1"                                       {}      {B A b a}
  3   "a, b, c FROM t1"                                    {hash}  {a b c A B C}
  4   "a, b, c FROM t1 ORDER BY a, b, c"                   {btree} {A B C a b c}
  5   "b FROM t1 WHERE a = 'a'"                            {}      {b}
  6   "b FROM t1"                                          {hash}  {b B}
  7   "a FROM t1"                                          {}      {A a}
  8   "b COLLATE nocase FROM t1"                           {}      {b}
  9   "b COLLATE nocase FROM t1 ORDER BY b COLLATE nocase" {}      {B}
} {
  do_execsql_test    2.$tn.1 "SELECT DISTINCT $sql" $res
  do_temptables_test 2.$tn.2 "SELECT DISTINCT $sql" $temptables
}





finish_test
Changes to test/tester.tcl.
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#-------------------------------------------------------------------------
# The commands provided by the code in this file to help with creating 
# test cases are as follows:
#
# Commands to manipulate the db and the file-system at a high level:
#
#      copy_file              FROM TO
#      drop_all_table         ?DB?
#      forcedelete            FILENAME
#
# Test the capability of the SQLite version built into the interpreter to
# determine if a specific test can be run:
#
#      ifcapable              EXPR
#







|







16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#-------------------------------------------------------------------------
# The commands provided by the code in this file to help with creating 
# test cases are as follows:
#
# Commands to manipulate the db and the file-system at a high level:
#
#      copy_file              FROM TO
#      drop_all_tables        ?DB?
#      forcedelete            FILENAME
#
# Test the capability of the SQLite version built into the interpreter to
# determine if a specific test can be run:
#
#      ifcapable              EXPR
#