Index: src/analyze.c
==================================================================
--- src/analyze.c
+++ src/analyze.c
@@ -265,10 +265,11 @@
 typedef struct Stat4Accum Stat4Accum;
 typedef struct Stat4Sample Stat4Sample;
 struct Stat4Sample {
   tRowcnt *anEq;                  /* sqlite_stat4.nEq */
   tRowcnt *anDLt;                 /* sqlite_stat4.nDLt */
+  tRowcnt *amxEq;                 /* Maximum length run of equal values */
 #ifdef SQLITE_ENABLE_STAT3_OR_STAT4
   tRowcnt *anLt;                  /* sqlite_stat4.nLt */
   union {
     i64 iRowid;                     /* Rowid in main table of the key */
     u8 *aRowid;                     /* Key for WITHOUT ROWID tables */
@@ -416,10 +417,11 @@
   assert( nKeyCol>0 );
 
   /* Allocate the space required for the Stat4Accum object */
   n = sizeof(*p) 
     + sizeof(tRowcnt)*nColUp                  /* Stat4Accum.anEq */
+    + sizeof(tRowcnt)*nColUp                  /* Stat4Accum.amxEq */
     + sizeof(tRowcnt)*nColUp                  /* Stat4Accum.anDLt */
 #ifdef SQLITE_ENABLE_STAT3_OR_STAT4
     + sizeof(tRowcnt)*nColUp                  /* Stat4Accum.anLt */
     + sizeof(Stat4Sample)*(nCol+mxSample)     /* Stat4Accum.aBest[], a[] */
     + sizeof(tRowcnt)*3*nColUp*(nCol+mxSample)
@@ -434,22 +436,23 @@
 
   p->db = db;
   p->nRow = 0;
   p->nCol = nCol;
   p->nKeyCol = nKeyCol;
-  p->current.anDLt = (tRowcnt*)&p[1];
-  p->current.anEq = &p->current.anDLt[nColUp];
+  p->current.anEq = (tRowcnt*)&p[1];
+  p->current.amxEq = &p->current.anEq[nColUp];
+  p->current.anDLt = &p->current.amxEq[nColUp];
 
 #ifdef SQLITE_ENABLE_STAT3_OR_STAT4
   {
     u8 *pSpace;                     /* Allocated space not yet assigned */
-    int i;                          /* Used to iterate through p->aSample[] */
+    int i;                          /* Loop counter */
 
     p->iGet = -1;
     p->mxSample = mxSample;
     p->nPSample = (tRowcnt)(sqlite3_value_int64(argv[2])/(mxSample/3+1) + 1);
-    p->current.anLt = &p->current.anEq[nColUp];
+    p->current.anLt = &p->current.anDLt[nColUp];
     p->iPrn = 0x689e962d*(u32)nCol ^ 0xd0944565*(u32)sqlite3_value_int(argv[2]);
   
     /* Set up the Stat4Accum.a[] and aBest[] arrays */
     p->a = (struct Stat4Sample*)&p->current.anLt[nColUp];
     p->aBest = &p->a[mxSample];
@@ -719,18 +722,22 @@
   assert( p->nCol>0 );
   assert( iChng<p->nCol );
 
   if( p->nRow==0 ){
     /* This is the first call to this function. Do initialization. */
-    for(i=0; i<p->nCol; i++) p->current.anEq[i] = 1;
+    for(i=0; i<p->nCol; i++){
+      p->current.anEq[i] = 1;
+      p->current.amxEq[i] = 1;
+    }
   }else{
     /* Second and subsequent calls get processed here */
     samplePushPrevious(p, iChng);
 
     /* Update anDLt[], anLt[] and anEq[] to reflect the values that apply
     ** to the current row of the index. */
     for(i=0; i<iChng; i++){
+      if( p->current.amxEq[i]==p->current.anEq[i] ) p->current.amxEq[i]++;
       p->current.anEq[i]++;
     }
     for(i=iChng; i<p->nCol; i++){
       p->current.anDLt[i]++;
 #ifdef SQLITE_ENABLE_STAT3_OR_STAT4
@@ -819,51 +826,77 @@
   if( eCall==STAT_GET_STAT1 )
 #else
   assert( argc==1 );
 #endif
   {
-    /* Return the value to store in the "stat" column of the sqlite_stat1
+    /* Return a value for the "stat" column of the sqlite_stat1
     ** table for this index.
     **
     ** The value is a string composed of a list of integers describing 
-    ** the index. The first integer in the list is the total number of 
-    ** entries in the index. There is one additional integer in the list 
-    ** for each indexed column. This additional integer is an estimate of
-    ** the number of rows matched by a stabbing query on the index using
-    ** a key with the corresponding number of fields. In other words,
-    ** if the index is on columns (a,b) and the sqlite_stat1 value is 
-    ** "100 10 2", then SQLite estimates that:
-    **
-    **   * the index contains 100 rows,
-    **   * "WHERE a=?" matches 10 rows, and
-    **   * "WHERE a=? AND b=?" matches 2 rows.
-    **
-    ** If D is the count of distinct values and K is the total number of 
-    ** rows, then each estimate is computed as:
-    **
-    **        I = (K+D-1)/D
+    ** the index. The first integer is the (estimated) total number
+    ** entries in the index. There is one additional integer for each
+    ** column in the index.  The first added integer is an estimate of
+    ** the number of rows that match a single key in the first column of
+    ** the index.  The second added integer is an estimate on of the number
+    ** of rows that match a single key consisting of the first two columns
+    ** of the index.  And so forth.
+    **
+    ** For example, for an index on columns (a,b), if the sqlite_stat1.stat
+    ** values is "100 10 2", that means there are about 100 rows in the
+    ** index, and that a query against a=$key1 will match about 10 rows
+    ** and a query against "a=$key1 AND b=$key2" will match about 2 rows.
+    **
+    ** Let V be the average number of rows that match a key, and let M
+    ** be the most number of rows that match the key for any possible value
+    ** of that key.  The estimate is computed as:
+    **
+    **     E = (2*V + M)/3
+    **
+    ** Consider two indexes.  Index X has with 100 values of exactly 0 and 
+    ** 100 singleton values between 1 and 100.  Index Y has 200 values
+    ** evenly distributed between 1 and 20.  If only the average (V) is
+    ** used in the estimate, X would have "200 2" and Y would have "200 10"
+    ** and so the planner would think X is the more selective index.  And
+    ** X often would be more selective.  But when searching for 0, index X
+    ** would perform badly.  To avoid this problem, the M is added into the
+    ** estimate so that the stat for X is "200 34" and Y is still "200 10".
+    ** In this way, Y is the preferred index (all else being equal) and
+    ** the pathological case is avoided.
+    **
+    ** For deciding whether or not to do a skip-scan, we want to know the
+    ** average number of rows (V) with the same key, not the mixed estimate
+    ** E shown above.  Usually E will be close enough.  However, if E is
+    ** large but V is small, that could trick the query planner into thinking
+    ** that a skip-scan might work well on this index.  To avoid that, the
+    ** "noskipscan" flag is added in cases where the divergence between E
+    ** and V might mislead the query planner.
     */
     char *z;
     int i;
+    int noSkipScan = 0;
 
-    char *zRet = sqlite3MallocZero( (p->nKeyCol+1)*25 );
+    char *zRet = sqlite3MallocZero( (p->nKeyCol+2)*25 );
     if( zRet==0 ){
       sqlite3_result_error_nomem(context);
       return;
     }
 
     sqlite3_snprintf(24, zRet, "%llu", (u64)p->nRow);
     z = zRet + sqlite3Strlen30(zRet);
     for(i=0; i<p->nKeyCol; i++){
-      u64 nDistinct = p->current.anDLt[i] + 1;
-      u64 iVal = (p->nRow + nDistinct - 1) / nDistinct;
+      u64 nDistinct = p->current.anDLt[i];
+      u64 iMx = p->current.amxEq[i];                /* M: Most rows per key */
+      u64 iAvg = (p->nRow+nDistinct)/(nDistinct+1); /* V: Average per key */
+      u64 iVal = (iMx+iAvg*2)/3;                    /* E: The estimate */
       sqlite3_snprintf(24, z, " %llu", iVal);
       z += sqlite3Strlen30(z);
       assert( p->current.anEq[i] );
+      if( iVal>=WHERE_SKIPSCAN_ONSET && iAvg<(WHERE_SKIPSCAN_ONSET*2/3) ){
+        noSkipScan = 1;
+      }
     }
-    assert( z[0]=='\0' && z>zRet );
-
+    if( noSkipScan ) sqlite3_snprintf(14, z, " noskipscan");
     sqlite3_result_text(context, zRet, -1, sqlite3_free);
   }
 #ifdef SQLITE_ENABLE_STAT3_OR_STAT4
   else if( eCall==STAT_GET_ROWID ){
     if( p->iGet<0 ){
@@ -1569,56 +1602,49 @@
 ** Populate the pIdx->aAvgEq[] array based on the samples currently
 ** stored in pIdx->aSample[]. 
 */
 static void initAvgEq(Index *pIdx){
   if( pIdx ){
-    IndexSample *aSample = pIdx->aSample;
-    IndexSample *pFinal = &aSample[pIdx->nSample-1];
-    int iCol;
-    int nCol = 1;
-    if( pIdx->nSampleCol>1 ){
+    int nSample = pIdx->nSample;                /* Number of samples */
+    int nCol = pIdx->nSampleCol;                /* Number of columns sampled */
+    IndexSample *aSample = pIdx->aSample;       /* The samples */
+    IndexSample *pFinal = &aSample[nSample-1];  /* The last sample */
+    int iCol;                                   /* Loop counter of columns */
+    if( nCol>1 ){
       /* If this is stat4 data, then calculate aAvgEq[] values for all
       ** sample columns except the last. The last is always set to 1, as
       ** once the trailing PK fields are considered all index keys are
       ** unique.  */
-      nCol = pIdx->nSampleCol-1;
+      nCol--;
       pIdx->aAvgEq[nCol] = 1;
     }
     for(iCol=0; iCol<nCol; iCol++){
-      int nSample = pIdx->nSample;
       int i;                    /* Used to iterate through samples */
+      u32 nDSample = 0;         /* Number of distinct samples less than pFinal*/
       tRowcnt sumEq = 0;        /* Sum of the nEq values */
-      tRowcnt avgEq = 0;
+      tRowcnt avgEq;            /* Average repeats for unsampled entries */
       tRowcnt nRow;             /* Number of rows in index */
-      i64 nSum100 = 0;          /* Number of terms contributing to sumEq */
-      i64 nDist100;             /* Number of distinct values in index */
+      tRowcnt prevLt = 0;       /* Number of values less than previous sample */
+      tRowcnt thisLt;           /* Number of values less than current sample */
 
       if( !pIdx->aiRowEst || iCol>=pIdx->nKeyCol || pIdx->aiRowEst[iCol+1]==0 ){
-        nRow = pFinal->anLt[iCol];
-        nDist100 = (i64)100 * pFinal->anDLt[iCol];
-        nSample--;
+        nRow = pFinal->anLt[iCol] + pFinal->anEq[iCol];
       }else{
         nRow = pIdx->aiRowEst[0];
-        nDist100 = ((i64)100 * pIdx->aiRowEst[0]) / pIdx->aiRowEst[iCol+1];
       }
       pIdx->nRowEst0 = nRow;
-
-      /* Set nSum to the number of distinct (iCol+1) field prefixes that
-      ** occur in the stat4 table for this index. Set sumEq to the sum of 
-      ** the nEq values for column iCol for the same set (adding the value 
-      ** only once where there exist duplicate prefixes).  */
-      for(i=0; i<nSample; i++){
-        if( i==(pIdx->nSample-1)
-         || aSample[i].anDLt[iCol]!=aSample[i+1].anDLt[iCol] 
-        ){
+      for(i=0; (thisLt = aSample[i].anLt[iCol])<pFinal->anLt[iCol]; i++){
+        if( i==0 || thisLt!=prevLt ){
           sumEq += aSample[i].anEq[iCol];
-          nSum100 += 100;
+          nDSample++;
         }
+        prevLt = thisLt;
       }
-
-      if( nDist100>nSum100 ){
-        avgEq = ((i64)100 * (nRow - sumEq))/(nDist100 - nSum100);
+      if( pFinal->anDLt[iCol] > nDSample ){
+        avgEq = (pFinal->anLt[iCol] - sumEq)/(pFinal->anDLt[iCol] - nDSample);
+      }else{
+        avgEq = 1;
       }
       if( avgEq==0 ) avgEq = 1;
       pIdx->aAvgEq[iCol] = avgEq;
     }
   }

Index: src/sqliteInt.h
==================================================================
--- src/sqliteInt.h
+++ src/sqliteInt.h
@@ -2419,10 +2419,19 @@
 #define JT_LEFT      0x0008    /* Left outer join */
 #define JT_RIGHT     0x0010    /* Right outer join */
 #define JT_OUTER     0x0020    /* The "OUTER" keyword is present */
 #define JT_ERROR     0x0040    /* unknown or unsupported join type */
 
+/*
+** TUNING: The skip-scan optimization is only profitable if the average 
+** number of repeats of an entry in the index is greater than or equal to
+** WHERE_SKIPSCAN_ONSET.  If the average numbe of repeats is less
+** than WHERE_SKIPSCAN_ONSET, then it is faster to do a full table
+** scan.
+*/
+#define WHERE_SKIPSCAN_ONSET         18
+#define WHERE_SKIPSCAN_ONSET_LOG     42
 
 /*
 ** Flags appropriate for the wctrlFlags parameter of sqlite3WhereBegin()
 ** and the WhereInfo.wctrlFlags member.
 */

Index: src/where.c
==================================================================
--- src/where.c
+++ src/where.c
@@ -2410,15 +2410,15 @@
   ** is almost always quicker than an index seek (even though if the index
   ** contains fewer than 2^17 rows we assume otherwise in other parts of
   ** the code). And, even if it is not, it should not be too much slower. 
   ** On the other hand, the extra seeks could end up being significantly
   ** more expensive.  */
-  assert( 42==sqlite3LogEst(18) );
+  assert( WHERE_SKIPSCAN_ONSET_LOG==sqlite3LogEst(WHERE_SKIPSCAN_ONSET) );
   if( saved_nEq==saved_nSkip
    && saved_nEq+1<pProbe->nKeyCol
    && pProbe->noSkipScan==0
-   && pProbe->aiRowLogEst[saved_nEq+1]>=42  /* TUNING: Minimum for skip-scan */
+   && pProbe->aiRowLogEst[saved_nEq+1]>=WHERE_SKIPSCAN_ONSET_LOG
    && (rc = whereLoopResize(db, pNew, pNew->nLTerm+1))==SQLITE_OK
   ){
     LogEst nIter;
     pNew->u.btree.nEq++;
     pNew->nSkip++;

Index: test/skipscan2.test
==================================================================
--- test/skipscan2.test
+++ test/skipscan2.test
@@ -46,26 +46,20 @@
   INSERT INTO people VALUES('Patrick','teacher',180);
   INSERT INTO people VALUES('Quiana','student',182);
   INSERT INTO people VALUES('Robert','student',159);
   INSERT INTO people VALUES('Sally','student',166);
   INSERT INTO people VALUES('Tom','student',171);
-  INSERT INTO people VALUES('Ursula','student',170);
-  INSERT INTO people VALUES('Vance','student',179);
-  INSERT INTO people VALUES('Willma','student',175);
-  INSERT INTO people VALUES('Xavier','teacher',185);
-  INSERT INTO people VALUES('Yvonne','student',149);
-  INSERT INTO people VALUES('Zach','student',170);
 }
 
 # Without ANALYZE, a skip-scan is not used
 #
 do_execsql_test skipscan2-1.3 {
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+  SELECT name FROM people WHERE height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-1.3eqp {
   EXPLAIN QUERY PLAN
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
+  SELECT name FROM people WHERE height<=161 ORDER BY +name;
 } {~/*INDEX people_idx1 */}
 
 # Now do an ANALYZE.  A skip-scan can be used after ANALYZE.
 #
 do_execsql_test skipscan2-1.4 {
@@ -77,75 +71,49 @@
   UPDATE sqlite_stat1 SET stat='10000 1' WHERE idx='sqlite_autoindex_people_1';
   ANALYZE sqlite_master;
 }
 db cache flush
 do_execsql_test skipscan2-1.5 {
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+  SELECT name FROM people WHERE height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-1.5eqp {
   EXPLAIN QUERY PLAN
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
+  SELECT name FROM people WHERE height<=161 ORDER BY +name;
 } {/*INDEX people_idx1 */}
 
 # Same answer with other formulations of the same query
 #
 do_execsql_test skipscan2-1.6 {
   SELECT name FROM people
    WHERE role IN (SELECT DISTINCT role FROM people)
-     AND height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+     AND height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-1.7 {
-  SELECT name FROM people WHERE role='teacher' AND height>=180
+  SELECT name FROM people WHERE role='teacher' AND height<=161
   UNION ALL
-  SELECT name FROM people WHERE role='student' AND height>=180
+  SELECT name FROM people WHERE role='student' AND height<=161
   ORDER BY 1;
-} {David Jack Patrick Quiana Xavier}
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 
-# Add 8 more people, bringing the total to 34.  Then the number of
-# duplicates in the left-column of the index will be 17 and 
-# skip-scan should not be used after an (unfudged) ANALYZE.
+# Add more people so that the estimated number of rows per key is
+# equal to the skip-scan threshold of 18.  Then run an (unfudged)
+# ANALYZE.  skip-scan should work.
 #
 do_execsql_test skipscan2-1.8 {
-  INSERT INTO people VALUES('Angie','student',166);
-  INSERT INTO people VALUES('Brad','student',176);
-  INSERT INTO people VALUES('Claire','student',168);
-  INSERT INTO people VALUES('Donald','student',162);
-  INSERT INTO people VALUES('Elaine','student',177);
-  INSERT INTO people VALUES('Frazier','student',159);
-  INSERT INTO people VALUES('Grace','student',179);
-  INSERT INTO people VALUES('Horace','student',166);
+  INSERT INTO people VALUES('Ursula','student',170);
   ANALYZE;
   SELECT stat FROM sqlite_stat1 WHERE idx='people_idx1';
-} {{34 17 2}}
+} {{21 13 2}}
 db cache flush
 do_execsql_test skipscan2-1.9 {
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+  SELECT name FROM people WHERE height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-1.9eqp {
   EXPLAIN QUERY PLAN
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
+  SELECT name FROM people WHERE height<=161 ORDER BY +name;
 } {~/*INDEX people_idx1 */}
 
-# Add 2 more people, bringing the total to 36.  Then the number of
-# duplicates in the left-column of the index will be 18 and 
-# skip-scan will be used after an (unfudged) ANALYZE.
-#
-do_execsql_test skipscan2-1.10 {
-  INSERT INTO people VALUES('Ingrad','student',155);
-  INSERT INTO people VALUES('Jacob','student',179);
-  ANALYZE;
-  SELECT stat FROM sqlite_stat1 WHERE idx='people_idx1';
-} {{36 18 2}}
-db cache flush
-do_execsql_test skipscan2-1.11 {
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
-do_execsql_test skipscan2-1.11eqp {
-  EXPLAIN QUERY PLAN
-  SELECT name FROM people WHERE height>=180 ORDER BY +name;
-} {/*INDEX people_idx1 */}
-
 
 # Repeat using a WITHOUT ROWID table.
 #
 do_execsql_test skipscan2-2.1 {
   CREATE TABLE peoplew(
@@ -156,37 +124,37 @@
   ) WITHOUT ROWID;
   CREATE INDEX peoplew_idx1 ON peoplew(role, height);
   INSERT INTO peoplew(name,role,height)
      SELECT name, role, height FROM  people;
   ALTER TABLE people RENAME TO old_people;
-  SELECT name FROM peoplew WHERE height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+  SELECT name FROM peoplew WHERE height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-2.2 {
   SELECT name FROM peoplew
    WHERE role IN (SELECT DISTINCT role FROM peoplew)
-     AND height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+     AND height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-2.2 {
-  SELECT name FROM peoplew WHERE role='teacher' AND height>=180
+  SELECT name FROM peoplew WHERE role='teacher' AND height<=161
   UNION ALL
-  SELECT name FROM peoplew WHERE role='student' AND height>=180
+  SELECT name FROM peoplew WHERE role='student' AND height<=161
   ORDER BY 1;
-} {David Jack Patrick Quiana Xavier}
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 
 # Now do an ANALYZE.  A skip-scan can be used after ANALYZE.
 #
 do_execsql_test skipscan2-2.4 {
   ANALYZE;
 }
 db cache flush
 do_execsql_test skipscan2-2.5 {
-  SELECT name FROM peoplew WHERE height>=180 ORDER BY +name;
-} {David Jack Patrick Quiana Xavier}
+  SELECT name FROM peoplew WHERE height<=161 ORDER BY +name;
+} {Alice Bob Cindy Emily Megan Olivia Robert}
 do_execsql_test skipscan2-2.5eqp {
   EXPLAIN QUERY PLAN
-  SELECT name FROM peoplew WHERE height>=180 ORDER BY +name;
-} {/*INDEX peoplew_idx1 */}
+  SELECT name FROM peoplew WHERE height<=161 ORDER BY +name;
+} {/*INDEX peoplew_idx1*/}
 
 # A skip-scan on a PK index of a WITHOUT ROWID table.
 #
 do_execsql_test skipscan2-3.1 {
   CREATE TABLE t3(a, b, c, PRIMARY KEY(a, b)) WITHOUT ROWID;