/ Check-in [91daea7d]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add a fix and tests for the FTS deferred token logic.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts3-changes
Files: files | file ages | folders
SHA1: 91daea7d2ec41f014fb30c6371aae447cc07f287
User & Date: dan 2011-06-28 11:58:09
Context
2011-06-28
14:16
Merge the fts3-changes branch back into the trunk. check-in: b9477eb0 user: dan tags: trunk
11:58
Add a fix and tests for the FTS deferred token logic. Closed-Leaf check-in: 91daea7d user: dan tags: fts3-changes
09:51
Merge latest trunk changes with fts3-changes branch. check-in: 22668647 user: dan tags: fts3-changes
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3.c.

  3860   3860     Fts3Cursor *pCsr,               /* FTS Cursor handle */
  3861   3861     Fts3Expr *pRoot,                /* Consider tokens with this root node */
  3862   3862     Fts3TokenAndCost *aTC,          /* Array of expression tokens and costs */
  3863   3863     int nTC                         /* Number of entries in aTC[] */
  3864   3864   ){
  3865   3865     Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  3866   3866     int nDocSize = 0;               /* Number of pages per doc loaded */
  3867         -  int nDocEst = 0;                /* Est. docs if all other tokens deferred */
  3868   3867     int rc = SQLITE_OK;             /* Return code */
  3869   3868     int ii;                         /* Iterator variable for various purposes */
  3870   3869     int nOvfl = 0;                  /* Total overflow pages used by doclists */
  3871   3870     int nToken = 0;                 /* Total number of tokens in cluster */
         3871  +
         3872  +  int nMinEst = 0;                /* The minimum count for any phrase so far. */
         3873  +  int nLoad4 = 1;                 /* (Phrases that will be loaded)^4. */
  3872   3874   
  3873   3875     /* Count the tokens in this AND/NEAR cluster. If none of the doclists
  3874   3876     ** associated with the tokens spill onto overflow pages, or if there is
  3875   3877     ** only 1 token, exit early. No tokens to defer in this case. */
  3876   3878     for(ii=0; ii<nTC; ii++){
  3877   3879       if( aTC[ii].pRoot==pRoot ){
  3878   3880         nOvfl += aTC[ii].nOvfl;
................................................................................
  3881   3883     }
  3882   3884     if( nOvfl==0 || nToken<2 ) return SQLITE_OK;
  3883   3885   
  3884   3886     /* Obtain the average docsize (in pages). */
  3885   3887     rc = fts3EvalAverageDocsize(pCsr, &nDocSize);
  3886   3888     assert( rc!=SQLITE_OK || nDocSize>0 );
  3887   3889   
         3890  +
         3891  +  /* Iterate through all tokens in this AND/NEAR cluster, in ascending order 
         3892  +  ** of the number of overflow pages that will be loaded by the pager layer 
         3893  +  ** to retrieve the entire doclist for the token from the full-text index.
         3894  +  ** Load the doclists for tokens that are either:
         3895  +  **
         3896  +  **   a. The cheapest token in the entire query (i.e. the one visited by the
         3897  +  **      first iteration of this loop), or
         3898  +  **
         3899  +  **   b. Part of a multi-token phrase.
         3900  +  **
         3901  +  ** After each token doclist is loaded, merge it with the others from the
         3902  +  ** same phrase and count the number of documents that the merged doclist
         3903  +  ** contains. Set variable "nMinEst" to the smallest number of documents in 
         3904  +  ** any phrase doclist for which 1 or more token doclists have been loaded.
         3905  +  ** Let nOther be the number of other phrases for which it is certain that
         3906  +  ** one or more tokens will not be deferred.
         3907  +  **
         3908  +  ** Then, for each token, defer it if loading the doclist would result in
         3909  +  ** loading N or more overflow pages into memory, where N is computed as:
         3910  +  **
         3911  +  **    (nMinEst + 4^nOther - 1) / (4^nOther)
         3912  +  */
  3888   3913     for(ii=0; ii<nToken && rc==SQLITE_OK; ii++){
  3889   3914       int iTC;                      /* Used to iterate through aTC[] array. */
  3890   3915       Fts3TokenAndCost *pTC = 0;    /* Set to cheapest remaining token. */
  3891   3916   
  3892   3917       /* Set pTC to point to the cheapest remaining token. */
  3893   3918       for(iTC=0; iTC<nTC; iTC++){
  3894   3919         if( aTC[iTC].pToken && aTC[iTC].pRoot==pRoot 
................................................................................
  3895   3920          && (!pTC || aTC[iTC].nOvfl<pTC->nOvfl) 
  3896   3921         ){
  3897   3922           pTC = &aTC[iTC];
  3898   3923         }
  3899   3924       }
  3900   3925       assert( pTC );
  3901   3926   
  3902         -    /* Determine if token pTC should be deferred. If not, update nDocEst. 
  3903         -    **
  3904         -    ** TODO: If there are performance regressions involving deferred tokens,
  3905         -    ** this (the logic that selects the tokens to be deferred) is probably
  3906         -    ** the bit that needs to change.
  3907         -    */
  3908         -
  3909         -    if( ii && pTC->nOvfl>=(nDocEst*nDocSize) ){
         3927  +    if( ii && pTC->nOvfl>=((nMinEst+(nLoad4/4)-1)/(nLoad4/4))*nDocSize ){
  3910   3928         /* The number of overflow pages to load for this (and therefore all
  3911   3929         ** subsequent) tokens is greater than the estimated number of pages 
  3912   3930         ** that will be loaded if all subsequent tokens are deferred.
  3913   3931         */
  3914   3932         Fts3PhraseToken *pToken = pTC->pToken;
  3915   3933         rc = sqlite3Fts3DeferToken(pCsr, pToken, pTC->iCol);
  3916   3934         fts3SegReaderCursorFree(pToken->pSegcsr);
  3917   3935         pToken->pSegcsr = 0;
  3918         -    }else if( ii==0 || pTC->pPhrase->nToken>1 ){
  3919         -      /* Either this is the cheapest token in the entire query, or it is
  3920         -      ** part of a multi-token phrase. Either way, the entire doclist will
  3921         -      ** (eventually) be loaded into memory. It may as well be now. */
  3922         -      Fts3PhraseToken *pToken = pTC->pToken;
  3923         -      int nList = 0;
  3924         -      char *pList = 0;
  3925         -      rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList);
  3926         -      assert( rc==SQLITE_OK || pList==0 );
  3927         -      if( rc==SQLITE_OK ){
  3928         -        fts3EvalPhraseMergeToken(pTab, pTC->pPhrase, pTC->iToken,pList,nList);
  3929         -        nDocEst = fts3DoclistCountDocids(
  3930         -            pTC->pPhrase->doclist.aAll, pTC->pPhrase->doclist.nAll
  3931         -        );
  3932         -      }
  3933         -    }else {
  3934         -      /* This token will not be deferred. And it will not be loaded into
  3935         -      ** memory at this point either. So assume that it filters out 75% of
  3936         -      ** the currently estimated number of documents. */
  3937         -      nDocEst = 1 + (nDocEst/4);
         3936  +    }else{
         3937  +      nLoad4 = nLoad4*4;
         3938  +      if( ii==0 || pTC->pPhrase->nToken>1 ){
         3939  +        /* Either this is the cheapest token in the entire query, or it is
         3940  +        ** part of a multi-token phrase. Either way, the entire doclist will
         3941  +        ** (eventually) be loaded into memory. It may as well be now. */
         3942  +        Fts3PhraseToken *pToken = pTC->pToken;
         3943  +        int nList = 0;
         3944  +        char *pList = 0;
         3945  +        rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList);
         3946  +        assert( rc==SQLITE_OK || pList==0 );
         3947  +        if( rc==SQLITE_OK ){
         3948  +          int nCount;
         3949  +          fts3EvalPhraseMergeToken(pTab, pTC->pPhrase, pTC->iToken,pList,nList);
         3950  +          nCount = fts3DoclistCountDocids(
         3951  +              pTC->pPhrase->doclist.aAll, pTC->pPhrase->doclist.nAll
         3952  +          );
         3953  +          if( ii==0 || nCount<nMinEst ) nMinEst = nCount;
         3954  +        }
         3955  +      }
  3938   3956       }
  3939   3957       pTC->pToken = 0;
  3940   3958     }
  3941   3959   
  3942   3960     return rc;
  3943   3961   }
  3944   3962   

Changes to test/fts3auto.test.

   103    103   
   104    104     do_execsql_test $tn$title.4 "
   105    105       SELECT docid, mit(matchinfo($tbl, 'x')) FROM $tbl 
   106    106       WHERE $tbl MATCH '$match' ORDER BY docid ASC
   107    107     " $matchinfo_asc
   108    108   }
   109    109   
   110         -#    fts3_make_deferrable TABLE TOKEN
          110  +#    fts3_make_deferrable TABLE TOKEN ?NROW?
   111    111   #
   112         -proc fts3_make_deferrable {tbl token} {
          112  +proc fts3_make_deferrable {tbl token {nRow 0}} {
   113    113   
   114    114     set stmt [sqlite3_prepare db "SELECT * FROM $tbl" -1 dummy]
   115    115     set name [sqlite3_column_name $stmt 0]
   116    116     sqlite3_finalize $stmt
   117    117   
   118         -  set nRow [db one "SELECT count(*) FROM $tbl"]
          118  +  if {$nRow==0} {
          119  +    set nRow [db one "SELECT count(*) FROM $tbl"]
          120  +  }
   119    121     set pgsz [db one "PRAGMA page_size"]
   120    122     execsql BEGIN
   121    123     for {set i 0} {$i < ($nRow * $pgsz * 1.2)/100} {incr i} {
   122    124       set doc [string repeat "$token " 100]
   123    125       execsql "INSERT INTO $tbl ($name) VALUES(\$doc)"
   124    126     }
   125    127     execsql "INSERT INTO $tbl ($name) VALUES('aaaaaaa ${token}aaaaa')"
................................................................................
   648    650     do_fts3query_test 6.$tn.2 t1 {b:G AND c:I}
   649    651     do_fts3query_test 6.$tn.3 t1 {b:G NEAR c:I}
   650    652     do_fts3query_test 6.$tn.4 t1 {a:C OR b:G OR c:K OR d:C}
   651    653     do_fts3query_test 6.$tn.5 t1 {a:G OR b:G}
   652    654   
   653    655     catchsql { COMMIT }
   654    656   }
          657  +
          658  +foreach {tn create} {
          659  +  1    "fts4(x)"
          660  +  2    "fts4(x, order=DESC)"
          661  +} {
          662  +  execsql [subst {
          663  +    DROP TABLE IF EXISTS t1;
          664  +    CREATE VIRTUAL TABLE t1 USING $create;
          665  +  }]
          666  +
          667  +  foreach {x} {
          668  +    "F E N O T K X V A X I E X A P G Q V H U"
          669  +    "R V A E T C V Q N I E L O N U G J K L U"
          670  +    "U Y I G W M V F J L X I D C H F P J Q B"
          671  +    "S G D Z X R P G S S Y B K A S G A I L L"
          672  +    "L S I C H T Z S R Q P R N K J X L F M J"
          673  +    "C C C D P X B Z C M A D A C X S B T X V"
          674  +    "W Y J M D R G V R K B X S A W R I T N C"
          675  +    "P K L W T M S P O Y Y V V O E H Q A I R"
          676  +    "C D Y I C Z F H J C O Y A Q F L S B D K"
          677  +    "P G S C Y C Y V I M B D S Z D D Y W I E"
          678  +    "Z K Z U E E S F Y X T U A L W O U J C Q"
          679  +    "P A T Z S W L P L Q V Y Y I P W U X S S"
          680  +    "I U I H U O F Z F R H R F T N D X A G M"
          681  +    "N A B M S H K X S O Y D T X S B R Y H Z"
          682  +    "L U D A S K I L S V Z J P U B E B Y H M"
          683  +  } {
          684  +    execsql { INSERT INTO t1 VALUES($x) }
          685  +  }
          686  +
          687  +  # Add extra documents to the database such that token "B" will be considered
          688  +  # deferrable if considering the other tokens means that 2 or fewer documents
          689  +  # will be loaded into memory.
          690  +  #
          691  +  fts3_make_deferrable t1 B 2
          692  +
          693  +  # B is not deferred in either of the first two tests below, since filtering
          694  +  # on "M" or "D" returns 10 documents or so. But filtering on "M * D" only
          695  +  # returns 2, so B is deferred in this case.
          696  +  #
          697  +  do_fts3query_test 7.$tn.1             t1 {"M B"}
          698  +  do_fts3query_test 7.$tn.2             t1 {"B D"}
          699  +  do_fts3query_test 7.$tn.3 -deferred B t1 {"M B D"}
          700  +}
   655    701   
   656    702   set sqlite_fts3_enable_parentheses $sfep
   657    703   finish_test
   658    704