/ Check-in [3aff9a9c]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improve the accuracy of the estimates used when searching an index for values not present in any stat4 samples.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 3aff9a9cac7aa994dfdaa0ab5c23ae73a1e820f0
User & Date: dan 2014-10-06 14:37:48
Context
2014-10-06
18:33
Remove unreachable branches in decodeIntArray() when compiling without STAT3 or STAT4. check-in: 80e1baa5 user: drh tags: trunk
14:37
Improve the accuracy of the estimates used when searching an index for values not present in any stat4 samples. check-in: 3aff9a9c user: dan tags: trunk
12:41
Fix a harmless compiler warning inside an assert() in FTS4. check-in: 418f3c9a user: drh tags: trunk
2014-10-04
10:22
Add a test to show that the change on this branch is effective. Closed-Leaf check-in: fc619be0 user: dan tags: stat4-avgeq
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/analyze.c.

1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
....
1512
1513
1514
1515
1516
1517
1518









1519
1520
1521
1522
1523
1524
1525
1526
1527
....
1572
1573
1574
1575
1576
1577
1578

1579
1580
1581
1582






1583




1584

1585
1586
1587
1588
1589

1590
1591

1592
1593
1594
1595
1596
1597



1598
1599
1600
1601
1602
1603
1604
....
1841
1842
1843
1844
1845
1846
1847





1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
    while( (c=z[0])>='0' && c<='9' ){
      v = v*10 + c - '0';
      z++;
    }
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
    if( aOut ){
      aOut[i] = v;
    }else
#else
    assert( aOut==0 );
    UNUSED_PARAMETER(aOut);
#endif
    {
      aLog[i] = sqlite3LogEst(v);
    }
    if( *z==' ' ) z++;
  }
#ifndef SQLITE_ENABLE_STAT3_OR_STAT4
  assert( pIndex!=0 );
#else
................................................................................
    pIndex = sqlite3PrimaryKeyIndex(pTable);
  }else{
    pIndex = sqlite3FindIndex(pInfo->db, argv[1], pInfo->zDatabase);
  }
  z = argv[2];

  if( pIndex ){









    pIndex->bUnordered = 0;
    decodeIntArray((char*)z, pIndex->nKeyCol+1, 0, pIndex->aiRowLogEst, pIndex);
    if( pIndex->pPartIdxWhere==0 ) pTable->nRowLogEst = pIndex->aiRowLogEst[0];
  }else{
    Index fakeIdx;
    fakeIdx.szIdxRow = pTable->szTabRow;
#ifdef SQLITE_ENABLE_COSTMULT
    fakeIdx.pTable = pTable;
#endif
................................................................................
      ** sample columns except the last. The last is always set to 1, as
      ** once the trailing PK fields are considered all index keys are
      ** unique.  */
      nCol = pIdx->nSampleCol-1;
      pIdx->aAvgEq[nCol] = 1;
    }
    for(iCol=0; iCol<nCol; iCol++){

      int i;                    /* Used to iterate through samples */
      tRowcnt sumEq = 0;        /* Sum of the nEq values */
      tRowcnt nSum = 0;         /* Number of terms contributing to sumEq */
      tRowcnt avgEq = 0;






      tRowcnt nDLt = pFinal->anDLt[iCol];






      /* Set nSum to the number of distinct (iCol+1) field prefixes that
      ** occur in the stat4 table for this index before pFinal. Set
      ** sumEq to the sum of the nEq values for column iCol for the same
      ** set (adding the value only once where there exist duplicate 
      ** prefixes).  */

      for(i=0; i<(pIdx->nSample-1); i++){
        if( aSample[i].anDLt[iCol]!=aSample[i+1].anDLt[iCol] ){

          sumEq += aSample[i].anEq[iCol];
          nSum++;
        }
      }
      if( nDLt>nSum ){
        avgEq = (pFinal->anLt[iCol] - sumEq)/(nDLt - nSum);



      }
      if( avgEq==0 ) avgEq = 1;
      pIdx->aAvgEq[iCol] = avgEq;
    }
  }
}

................................................................................
  /* Load the statistics from the sqlite_stat4 table. */
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  if( rc==SQLITE_OK ){
    int lookasideEnabled = db->lookaside.bEnabled;
    db->lookaside.bEnabled = 0;
    rc = loadStat4(db, sInfo.zDatabase);
    db->lookaside.bEnabled = lookasideEnabled;





  }
#endif

  if( rc==SQLITE_NOMEM ){
    db->mallocFailed = 1;
  }
  return rc;
}


#endif /* SQLITE_OMIT_ANALYZE */







|




|







 







>
>
>
>
>
>
>
>
>

|







 







>


<

>
>
>
>
>
>
|
>
>
>
>
|
>

|
|
|
<
>
|
|
>

|


<
<
>
>
>







 







>
>
>
>
>











1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
....
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
....
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590

1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608

1609
1610
1611
1612
1613
1614
1615
1616


1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
....
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
    while( (c=z[0])>='0' && c<='9' ){
      v = v*10 + c - '0';
      z++;
    }
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
    if( aOut ){
      aOut[i] = v;
    }
#else
    assert( aOut==0 );
    UNUSED_PARAMETER(aOut);
#endif
    if( aLog ){
      aLog[i] = sqlite3LogEst(v);
    }
    if( *z==' ' ) z++;
  }
#ifndef SQLITE_ENABLE_STAT3_OR_STAT4
  assert( pIndex!=0 );
#else
................................................................................
    pIndex = sqlite3PrimaryKeyIndex(pTable);
  }else{
    pIndex = sqlite3FindIndex(pInfo->db, argv[1], pInfo->zDatabase);
  }
  z = argv[2];

  if( pIndex ){
    int nCol = pIndex->nKeyCol+1;
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
    tRowcnt * const aiRowEst = pIndex->aiRowEst = (tRowcnt*)sqlite3MallocZero(
        sizeof(tRowcnt) * nCol
    );
    if( aiRowEst==0 ) pInfo->db->mallocFailed = 1;
#else
    tRowcnt * const aiRowEst = 0;
#endif
    pIndex->bUnordered = 0;
    decodeIntArray((char*)z, nCol, aiRowEst, pIndex->aiRowLogEst, pIndex);
    if( pIndex->pPartIdxWhere==0 ) pTable->nRowLogEst = pIndex->aiRowLogEst[0];
  }else{
    Index fakeIdx;
    fakeIdx.szIdxRow = pTable->szTabRow;
#ifdef SQLITE_ENABLE_COSTMULT
    fakeIdx.pTable = pTable;
#endif
................................................................................
      ** sample columns except the last. The last is always set to 1, as
      ** once the trailing PK fields are considered all index keys are
      ** unique.  */
      nCol = pIdx->nSampleCol-1;
      pIdx->aAvgEq[nCol] = 1;
    }
    for(iCol=0; iCol<nCol; iCol++){
      int nSample = pIdx->nSample;
      int i;                    /* Used to iterate through samples */
      tRowcnt sumEq = 0;        /* Sum of the nEq values */

      tRowcnt avgEq = 0;
      tRowcnt nRow;             /* Number of rows in index */
      i64 nSum100 = 0;          /* Number of terms contributing to sumEq */
      i64 nDist100;             /* Number of distinct values in index */

      if( pIdx->aiRowEst==0 || pIdx->aiRowEst[iCol+1]==0 ){
        nRow = pFinal->anLt[iCol];
        nDist100 = (i64)100 * pFinal->anDLt[iCol];
        nSample--;
      }else{
        nRow = pIdx->aiRowEst[0];
        nDist100 = ((i64)100 * pIdx->aiRowEst[0]) / pIdx->aiRowEst[iCol+1];
      }

      /* Set nSum to the number of distinct (iCol+1) field prefixes that
      ** occur in the stat4 table for this index. Set sumEq to the sum of 
      ** the nEq values for column iCol for the same set (adding the value 
      ** only once where there exist duplicate prefixes).  */

      for(i=0; i<nSample; i++){
        if( i==(pIdx->nSample-1)
         || aSample[i].anDLt[iCol]!=aSample[i+1].anDLt[iCol] 
        ){
          sumEq += aSample[i].anEq[iCol];
          nSum100 += 100;
        }
      }



      if( nDist100>nSum100 ){
        avgEq = ((i64)100 * (nRow - sumEq))/(nDist100 - nSum100);
      }
      if( avgEq==0 ) avgEq = 1;
      pIdx->aAvgEq[iCol] = avgEq;
    }
  }
}

................................................................................
  /* Load the statistics from the sqlite_stat4 table. */
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  if( rc==SQLITE_OK ){
    int lookasideEnabled = db->lookaside.bEnabled;
    db->lookaside.bEnabled = 0;
    rc = loadStat4(db, sInfo.zDatabase);
    db->lookaside.bEnabled = lookasideEnabled;
  }
  for(i=sqliteHashFirst(&db->aDb[iDb].pSchema->idxHash);i;i=sqliteHashNext(i)){
    Index *pIdx = sqliteHashData(i);
    sqlite3_free(pIdx->aiRowEst);
    pIdx->aiRowEst = 0;
  }
#endif

  if( rc==SQLITE_NOMEM ){
    db->mallocFailed = 1;
  }
  return rc;
}


#endif /* SQLITE_OMIT_ANALYZE */

Changes to src/build.c.

431
432
433
434
435
436
437



438
439
440
441
442
443
444
#ifndef SQLITE_OMIT_ANALYZE
  sqlite3DeleteIndexSamples(db, p);
#endif
  if( db==0 || db->pnBytesFreed==0 ) sqlite3KeyInfoUnref(p->pKeyInfo);
  sqlite3ExprDelete(db, p->pPartIdxWhere);
  sqlite3DbFree(db, p->zColAff);
  if( p->isResized ) sqlite3DbFree(db, p->azColl);



  sqlite3DbFree(db, p);
}

/*
** For the index called zIdxName which is found in the database iDb,
** unlike that index from its Table then remove the index from
** the index hash table and free all memory structures associated







>
>
>







431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
#ifndef SQLITE_OMIT_ANALYZE
  sqlite3DeleteIndexSamples(db, p);
#endif
  if( db==0 || db->pnBytesFreed==0 ) sqlite3KeyInfoUnref(p->pKeyInfo);
  sqlite3ExprDelete(db, p->pPartIdxWhere);
  sqlite3DbFree(db, p->zColAff);
  if( p->isResized ) sqlite3DbFree(db, p->azColl);
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  sqlite3_free(p->aiRowEst);
#endif
  sqlite3DbFree(db, p);
}

/*
** For the index called zIdxName which is found in the database iDb,
** unlike that index from its Table then remove the index from
** the index hash table and free all memory structures associated

Changes to src/sqliteInt.h.

1797
1798
1799
1800
1801
1802
1803

1804
1805
1806
1807
1808
1809
1810
  unsigned isResized:1;    /* True if resizeIndexObject() has been called */
  unsigned isCovering:1;   /* True if this is a covering index */
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  int nSample;             /* Number of elements in aSample[] */
  int nSampleCol;          /* Size of IndexSample.anEq[] and so on */
  tRowcnt *aAvgEq;         /* Average nEq values for keys not in aSample */
  IndexSample *aSample;    /* Samples of the left-most key */

#endif
};

/*
** Allowed values for Index.idxType
*/
#define SQLITE_IDXTYPE_APPDEF      0   /* Created using CREATE INDEX */







>







1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
  unsigned isResized:1;    /* True if resizeIndexObject() has been called */
  unsigned isCovering:1;   /* True if this is a covering index */
#ifdef SQLITE_ENABLE_STAT3_OR_STAT4
  int nSample;             /* Number of elements in aSample[] */
  int nSampleCol;          /* Size of IndexSample.anEq[] and so on */
  tRowcnt *aAvgEq;         /* Average nEq values for keys not in aSample */
  IndexSample *aSample;    /* Samples of the left-most key */
  tRowcnt *aiRowEst;       /* Non-logarithmic stat1 data for this table */
#endif
};

/*
** Allowed values for Index.idxType
*/
#define SQLITE_IDXTYPE_APPDEF      0   /* Created using CREATE INDEX */

Added test/analyzeD.test.











































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# 2005 July 22
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
# This file implements regression tests for SQLite library.
# This file implements tests for the ANALYZE command.
#
# $Id: analyze.test,v 1.9 2008/08/11 18:44:58 drh Exp $

set testdir [file dirname $argv0]
source $testdir/tester.tcl
set ::testprefix analyzeD

ifcapable {!stat4} {
  finish_test
  return
}


# Set up a table with the following properties:
#
#    * Contains 1000 rows.
#    * Column a contains even integers between 0 and 18, inclusive (so that
#      a=? for any such integer matches 100 rows).
#    * Column b contains integers between 0 and 9, inclusive.
#    * Column c contains integers between 0 and 199, inclusive (so that
#      for any such integer, c=? matches 5 rows).
#    * Then add 7 rows with a new value for "a" - 3001. The stat4 table will
#      not contain any samples with a=3001.
#
do_execsql_test 1.0 {
  CREATE TABLE t1(a, b, c);
}
do_test 1.1 {
  for {set i 1} {$i < 1000} {incr i} {
    set c [expr $i % 200]
    execsql { INSERT INTO t1(a, b, c) VALUES( 2*($i/100), $i%10, $c ) }
  }

  execsql {
    INSERT INTO t1 VALUES(3001, 3001, 3001);
    INSERT INTO t1 VALUES(3001, 3001, 3002);
    INSERT INTO t1 VALUES(3001, 3001, 3003);
    INSERT INTO t1 VALUES(3001, 3001, 3004);
    INSERT INTO t1 VALUES(3001, 3001, 3005);
    INSERT INTO t1 VALUES(3001, 3001, 3006);
    INSERT INTO t1 VALUES(3001, 3001, 3007);

    CREATE INDEX t1_ab ON t1(a, b);
    CREATE INDEX t1_c ON t1(c);

    ANALYZE;
  }
} {}

# With full ANALYZE data, SQLite sees that c=150 (5 rows) is better than
# a=3001 (7 rows).
#
do_eqp_test 1.2 {
  SELECT * FROM t1 WHERE a=3001 AND c=150;
} {
  0 0 0 {SEARCH TABLE t1 USING INDEX t1_c (c=?)}
}

do_test 1.3 {
  execsql { DELETE FROM sqlite_stat1 }
  db close
  sqlite3 db test.db
} {}

# Without stat1, because 3001 is larger than all samples in the stat4
# table, SQLite things that a=3001 matches just 1 row. So it (incorrectly)
# chooses it over the c=150 index (5 rows). Even with stat1 data, things
# worked this way before commit [e6f7f97dbc].
#
do_eqp_test 1.4 {
  SELECT * FROM t1 WHERE a=3001 AND c=150;
} {
  0 0 0 {SEARCH TABLE t1 USING INDEX t1_ab (a=?)}
}

do_test 1.5 {
  execsql { 
    UPDATE t1 SET a=13 WHERE a = 3001;
    ANALYZE;
  }
} {}

do_eqp_test 1.6 {
  SELECT * FROM t1 WHERE a=13 AND c=150;
} {
  0 0 0 {SEARCH TABLE t1 USING INDEX t1_c (c=?)}
}

do_test 1.7 {
  execsql { DELETE FROM sqlite_stat1 }
  db close
  sqlite3 db test.db
} {}

# Same test as 1.4, except this time the 7 rows that match the a=? condition 
# do not feature larger values than all rows in the stat4 table. So SQLite
# gets this right, even without stat1 data.
do_eqp_test 1.8 {
  SELECT * FROM t1 WHERE a=13 AND c=150;
} {
  0 0 0 {SEARCH TABLE t1 USING INDEX t1_c (c=?)}
}

finish_test