SQLite

Check-in [91daea7d2e]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add a fix and tests for the FTS deferred token logic.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts3-changes
Files: files | file ages | folders
SHA1: 91daea7d2ec41f014fb30c6371aae447cc07f287
User & Date: dan 2011-06-28 11:58:09.194
Context
2011-06-28
14:16
Merge the fts3-changes branch back into the trunk. (check-in: b9477eb056 user: dan tags: trunk)
11:58
Add a fix and tests for the FTS deferred token logic. (Closed-Leaf check-in: 91daea7d2e user: dan tags: fts3-changes)
09:51
Merge latest trunk changes with fts3-changes branch. (check-in: 226686475c user: dan tags: fts3-changes)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts3/fts3.c.
3860
3861
3862
3863
3864
3865
3866
3867
3868
3869
3870
3871



3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887























3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917


3918
3919
3920
3921
3922
3923
3924
3925
3926
3927

3928
3929
3930
3931

3932
3933
3934
3935
3936
3937

3938
3939
3940
3941
3942
3943
3944
  Fts3Cursor *pCsr,               /* FTS Cursor handle */
  Fts3Expr *pRoot,                /* Consider tokens with this root node */
  Fts3TokenAndCost *aTC,          /* Array of expression tokens and costs */
  int nTC                         /* Number of entries in aTC[] */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int nDocSize = 0;               /* Number of pages per doc loaded */
  int nDocEst = 0;                /* Est. docs if all other tokens deferred */
  int rc = SQLITE_OK;             /* Return code */
  int ii;                         /* Iterator variable for various purposes */
  int nOvfl = 0;                  /* Total overflow pages used by doclists */
  int nToken = 0;                 /* Total number of tokens in cluster */




  /* Count the tokens in this AND/NEAR cluster. If none of the doclists
  ** associated with the tokens spill onto overflow pages, or if there is
  ** only 1 token, exit early. No tokens to defer in this case. */
  for(ii=0; ii<nTC; ii++){
    if( aTC[ii].pRoot==pRoot ){
      nOvfl += aTC[ii].nOvfl;
      nToken++;
    }
  }
  if( nOvfl==0 || nToken<2 ) return SQLITE_OK;

  /* Obtain the average docsize (in pages). */
  rc = fts3EvalAverageDocsize(pCsr, &nDocSize);
  assert( rc!=SQLITE_OK || nDocSize>0 );
























  for(ii=0; ii<nToken && rc==SQLITE_OK; ii++){
    int iTC;                      /* Used to iterate through aTC[] array. */
    Fts3TokenAndCost *pTC = 0;    /* Set to cheapest remaining token. */

    /* Set pTC to point to the cheapest remaining token. */
    for(iTC=0; iTC<nTC; iTC++){
      if( aTC[iTC].pToken && aTC[iTC].pRoot==pRoot 
       && (!pTC || aTC[iTC].nOvfl<pTC->nOvfl) 
      ){
        pTC = &aTC[iTC];
      }
    }
    assert( pTC );

    /* Determine if token pTC should be deferred. If not, update nDocEst. 
    **
    ** TODO: If there are performance regressions involving deferred tokens,
    ** this (the logic that selects the tokens to be deferred) is probably
    ** the bit that needs to change.
    */

    if( ii && pTC->nOvfl>=(nDocEst*nDocSize) ){
      /* The number of overflow pages to load for this (and therefore all
      ** subsequent) tokens is greater than the estimated number of pages 
      ** that will be loaded if all subsequent tokens are deferred.
      */
      Fts3PhraseToken *pToken = pTC->pToken;
      rc = sqlite3Fts3DeferToken(pCsr, pToken, pTC->iCol);
      fts3SegReaderCursorFree(pToken->pSegcsr);
      pToken->pSegcsr = 0;


    }else if( ii==0 || pTC->pPhrase->nToken>1 ){
      /* Either this is the cheapest token in the entire query, or it is
      ** part of a multi-token phrase. Either way, the entire doclist will
      ** (eventually) be loaded into memory. It may as well be now. */
      Fts3PhraseToken *pToken = pTC->pToken;
      int nList = 0;
      char *pList = 0;
      rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList);
      assert( rc==SQLITE_OK || pList==0 );
      if( rc==SQLITE_OK ){

        fts3EvalPhraseMergeToken(pTab, pTC->pPhrase, pTC->iToken,pList,nList);
        nDocEst = fts3DoclistCountDocids(
            pTC->pPhrase->doclist.aAll, pTC->pPhrase->doclist.nAll
        );

      }
    }else {
      /* This token will not be deferred. And it will not be loaded into
      ** memory at this point either. So assume that it filters out 75% of
      ** the currently estimated number of documents. */
      nDocEst = 1 + (nDocEst/4);

    }
    pTC->pToken = 0;
  }

  return rc;
}








<




>
>
>
















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>














<
<
<
<
<
<
<
|








>
>
|
|
|
|
|
|
|
|
|
|
>
|
|
|
|
>
|
<
<
<
<
<
>







3860
3861
3862
3863
3864
3865
3866

3867
3868
3869
3870
3871
3872
3873
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
3908
3909
3910
3911
3912
3913
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926







3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954





3955
3956
3957
3958
3959
3960
3961
3962
  Fts3Cursor *pCsr,               /* FTS Cursor handle */
  Fts3Expr *pRoot,                /* Consider tokens with this root node */
  Fts3TokenAndCost *aTC,          /* Array of expression tokens and costs */
  int nTC                         /* Number of entries in aTC[] */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int nDocSize = 0;               /* Number of pages per doc loaded */

  int rc = SQLITE_OK;             /* Return code */
  int ii;                         /* Iterator variable for various purposes */
  int nOvfl = 0;                  /* Total overflow pages used by doclists */
  int nToken = 0;                 /* Total number of tokens in cluster */

  int nMinEst = 0;                /* The minimum count for any phrase so far. */
  int nLoad4 = 1;                 /* (Phrases that will be loaded)^4. */

  /* Count the tokens in this AND/NEAR cluster. If none of the doclists
  ** associated with the tokens spill onto overflow pages, or if there is
  ** only 1 token, exit early. No tokens to defer in this case. */
  for(ii=0; ii<nTC; ii++){
    if( aTC[ii].pRoot==pRoot ){
      nOvfl += aTC[ii].nOvfl;
      nToken++;
    }
  }
  if( nOvfl==0 || nToken<2 ) return SQLITE_OK;

  /* Obtain the average docsize (in pages). */
  rc = fts3EvalAverageDocsize(pCsr, &nDocSize);
  assert( rc!=SQLITE_OK || nDocSize>0 );


  /* Iterate through all tokens in this AND/NEAR cluster, in ascending order 
  ** of the number of overflow pages that will be loaded by the pager layer 
  ** to retrieve the entire doclist for the token from the full-text index.
  ** Load the doclists for tokens that are either:
  **
  **   a. The cheapest token in the entire query (i.e. the one visited by the
  **      first iteration of this loop), or
  **
  **   b. Part of a multi-token phrase.
  **
  ** After each token doclist is loaded, merge it with the others from the
  ** same phrase and count the number of documents that the merged doclist
  ** contains. Set variable "nMinEst" to the smallest number of documents in 
  ** any phrase doclist for which 1 or more token doclists have been loaded.
  ** Let nOther be the number of other phrases for which it is certain that
  ** one or more tokens will not be deferred.
  **
  ** Then, for each token, defer it if loading the doclist would result in
  ** loading N or more overflow pages into memory, where N is computed as:
  **
  **    (nMinEst + 4^nOther - 1) / (4^nOther)
  */
  for(ii=0; ii<nToken && rc==SQLITE_OK; ii++){
    int iTC;                      /* Used to iterate through aTC[] array. */
    Fts3TokenAndCost *pTC = 0;    /* Set to cheapest remaining token. */

    /* Set pTC to point to the cheapest remaining token. */
    for(iTC=0; iTC<nTC; iTC++){
      if( aTC[iTC].pToken && aTC[iTC].pRoot==pRoot 
       && (!pTC || aTC[iTC].nOvfl<pTC->nOvfl) 
      ){
        pTC = &aTC[iTC];
      }
    }
    assert( pTC );








    if( ii && pTC->nOvfl>=((nMinEst+(nLoad4/4)-1)/(nLoad4/4))*nDocSize ){
      /* The number of overflow pages to load for this (and therefore all
      ** subsequent) tokens is greater than the estimated number of pages 
      ** that will be loaded if all subsequent tokens are deferred.
      */
      Fts3PhraseToken *pToken = pTC->pToken;
      rc = sqlite3Fts3DeferToken(pCsr, pToken, pTC->iCol);
      fts3SegReaderCursorFree(pToken->pSegcsr);
      pToken->pSegcsr = 0;
    }else{
      nLoad4 = nLoad4*4;
      if( ii==0 || pTC->pPhrase->nToken>1 ){
        /* Either this is the cheapest token in the entire query, or it is
        ** part of a multi-token phrase. Either way, the entire doclist will
        ** (eventually) be loaded into memory. It may as well be now. */
        Fts3PhraseToken *pToken = pTC->pToken;
        int nList = 0;
        char *pList = 0;
        rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList);
        assert( rc==SQLITE_OK || pList==0 );
        if( rc==SQLITE_OK ){
          int nCount;
          fts3EvalPhraseMergeToken(pTab, pTC->pPhrase, pTC->iToken,pList,nList);
          nCount = fts3DoclistCountDocids(
              pTC->pPhrase->doclist.aAll, pTC->pPhrase->doclist.nAll
          );
          if( ii==0 || nCount<nMinEst ) nMinEst = nCount;
        }





      }
    }
    pTC->pToken = 0;
  }

  return rc;
}

Changes to test/fts3auto.test.
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

118

119
120
121
122
123
124
125

  do_execsql_test $tn$title.4 "
    SELECT docid, mit(matchinfo($tbl, 'x')) FROM $tbl 
    WHERE $tbl MATCH '$match' ORDER BY docid ASC
  " $matchinfo_asc
}

#    fts3_make_deferrable TABLE TOKEN
#
proc fts3_make_deferrable {tbl token} {

  set stmt [sqlite3_prepare db "SELECT * FROM $tbl" -1 dummy]
  set name [sqlite3_column_name $stmt 0]
  sqlite3_finalize $stmt


  set nRow [db one "SELECT count(*) FROM $tbl"]

  set pgsz [db one "PRAGMA page_size"]
  execsql BEGIN
  for {set i 0} {$i < ($nRow * $pgsz * 1.2)/100} {incr i} {
    set doc [string repeat "$token " 100]
    execsql "INSERT INTO $tbl ($name) VALUES(\$doc)"
  }
  execsql "INSERT INTO $tbl ($name) VALUES('aaaaaaa ${token}aaaaa')"







|

|





>
|
>







103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

  do_execsql_test $tn$title.4 "
    SELECT docid, mit(matchinfo($tbl, 'x')) FROM $tbl 
    WHERE $tbl MATCH '$match' ORDER BY docid ASC
  " $matchinfo_asc
}

#    fts3_make_deferrable TABLE TOKEN ?NROW?
#
proc fts3_make_deferrable {tbl token {nRow 0}} {

  set stmt [sqlite3_prepare db "SELECT * FROM $tbl" -1 dummy]
  set name [sqlite3_column_name $stmt 0]
  sqlite3_finalize $stmt

  if {$nRow==0} {
    set nRow [db one "SELECT count(*) FROM $tbl"]
  }
  set pgsz [db one "PRAGMA page_size"]
  execsql BEGIN
  for {set i 0} {$i < ($nRow * $pgsz * 1.2)/100} {incr i} {
    set doc [string repeat "$token " 100]
    execsql "INSERT INTO $tbl ($name) VALUES(\$doc)"
  }
  execsql "INSERT INTO $tbl ($name) VALUES('aaaaaaa ${token}aaaaa')"
648
649
650
651
652
653
654
655












































656
657
658
  do_fts3query_test 6.$tn.2 t1 {b:G AND c:I}
  do_fts3query_test 6.$tn.3 t1 {b:G NEAR c:I}
  do_fts3query_test 6.$tn.4 t1 {a:C OR b:G OR c:K OR d:C}
  do_fts3query_test 6.$tn.5 t1 {a:G OR b:G}

  catchsql { COMMIT }
}













































set sqlite_fts3_enable_parentheses $sfep
finish_test









>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
  do_fts3query_test 6.$tn.2 t1 {b:G AND c:I}
  do_fts3query_test 6.$tn.3 t1 {b:G NEAR c:I}
  do_fts3query_test 6.$tn.4 t1 {a:C OR b:G OR c:K OR d:C}
  do_fts3query_test 6.$tn.5 t1 {a:G OR b:G}

  catchsql { COMMIT }
}

foreach {tn create} {
  1    "fts4(x)"
  2    "fts4(x, order=DESC)"
} {
  execsql [subst {
    DROP TABLE IF EXISTS t1;
    CREATE VIRTUAL TABLE t1 USING $create;
  }]

  foreach {x} {
    "F E N O T K X V A X I E X A P G Q V H U"
    "R V A E T C V Q N I E L O N U G J K L U"
    "U Y I G W M V F J L X I D C H F P J Q B"
    "S G D Z X R P G S S Y B K A S G A I L L"
    "L S I C H T Z S R Q P R N K J X L F M J"
    "C C C D P X B Z C M A D A C X S B T X V"
    "W Y J M D R G V R K B X S A W R I T N C"
    "P K L W T M S P O Y Y V V O E H Q A I R"
    "C D Y I C Z F H J C O Y A Q F L S B D K"
    "P G S C Y C Y V I M B D S Z D D Y W I E"
    "Z K Z U E E S F Y X T U A L W O U J C Q"
    "P A T Z S W L P L Q V Y Y I P W U X S S"
    "I U I H U O F Z F R H R F T N D X A G M"
    "N A B M S H K X S O Y D T X S B R Y H Z"
    "L U D A S K I L S V Z J P U B E B Y H M"
  } {
    execsql { INSERT INTO t1 VALUES($x) }
  }

  # Add extra documents to the database such that token "B" will be considered
  # deferrable if considering the other tokens means that 2 or fewer documents
  # will be loaded into memory.
  #
  fts3_make_deferrable t1 B 2

  # B is not deferred in either of the first two tests below, since filtering
  # on "M" or "D" returns 10 documents or so. But filtering on "M * D" only
  # returns 2, so B is deferred in this case.
  #
  do_fts3query_test 7.$tn.1             t1 {"M B"}
  do_fts3query_test 7.$tn.2             t1 {"B D"}
  do_fts3query_test 7.$tn.3 -deferred B t1 {"M B D"}
}

set sqlite_fts3_enable_parentheses $sfep
finish_test