Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add a fix and tests for the FTS deferred token logic. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | fts3-changes |
Files: | files | file ages | folders |
SHA1: |
91daea7d2ec41f014fb30c6371aae447 |
User & Date: | dan 2011-06-28 11:58:09.194 |
Context
2011-06-28
| ||
14:16 | Merge the fts3-changes branch back into the trunk. (check-in: b9477eb056 user: dan tags: trunk) | |
11:58 | Add a fix and tests for the FTS deferred token logic. (Closed-Leaf check-in: 91daea7d2e user: dan tags: fts3-changes) | |
09:51 | Merge latest trunk changes with fts3-changes branch. (check-in: 226686475c user: dan tags: fts3-changes) | |
Changes
Changes to ext/fts3/fts3.c.
︙ | ︙ | |||
3860 3861 3862 3863 3864 3865 3866 | Fts3Cursor *pCsr, /* FTS Cursor handle */ Fts3Expr *pRoot, /* Consider tokens with this root node */ Fts3TokenAndCost *aTC, /* Array of expression tokens and costs */ int nTC /* Number of entries in aTC[] */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int nDocSize = 0; /* Number of pages per doc loaded */ | < > > > > > > > > > > > > > > > > > > > > > > > > > > < < < < < < < | > > | | | | | | | | | | > | | | | > | < < < < < > | 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 | Fts3Cursor *pCsr, /* FTS Cursor handle */ Fts3Expr *pRoot, /* Consider tokens with this root node */ Fts3TokenAndCost *aTC, /* Array of expression tokens and costs */ int nTC /* Number of entries in aTC[] */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int nDocSize = 0; /* Number of pages per doc loaded */ int rc = SQLITE_OK; /* Return code */ int ii; /* Iterator variable for various purposes */ int nOvfl = 0; /* Total overflow pages used by doclists */ int nToken = 0; /* Total number of tokens in cluster */ int nMinEst = 0; /* The minimum count for any phrase so far. */ int nLoad4 = 1; /* (Phrases that will be loaded)^4. */ /* Count the tokens in this AND/NEAR cluster. If none of the doclists ** associated with the tokens spill onto overflow pages, or if there is ** only 1 token, exit early. No tokens to defer in this case. */ for(ii=0; ii<nTC; ii++){ if( aTC[ii].pRoot==pRoot ){ nOvfl += aTC[ii].nOvfl; nToken++; } } if( nOvfl==0 || nToken<2 ) return SQLITE_OK; /* Obtain the average docsize (in pages). */ rc = fts3EvalAverageDocsize(pCsr, &nDocSize); assert( rc!=SQLITE_OK || nDocSize>0 ); /* Iterate through all tokens in this AND/NEAR cluster, in ascending order ** of the number of overflow pages that will be loaded by the pager layer ** to retrieve the entire doclist for the token from the full-text index. ** Load the doclists for tokens that are either: ** ** a. The cheapest token in the entire query (i.e. the one visited by the ** first iteration of this loop), or ** ** b. Part of a multi-token phrase. ** ** After each token doclist is loaded, merge it with the others from the ** same phrase and count the number of documents that the merged doclist ** contains. Set variable "nMinEst" to the smallest number of documents in ** any phrase doclist for which 1 or more token doclists have been loaded. ** Let nOther be the number of other phrases for which it is certain that ** one or more tokens will not be deferred. ** ** Then, for each token, defer it if loading the doclist would result in ** loading N or more overflow pages into memory, where N is computed as: ** ** (nMinEst + 4^nOther - 1) / (4^nOther) */ for(ii=0; ii<nToken && rc==SQLITE_OK; ii++){ int iTC; /* Used to iterate through aTC[] array. */ Fts3TokenAndCost *pTC = 0; /* Set to cheapest remaining token. */ /* Set pTC to point to the cheapest remaining token. */ for(iTC=0; iTC<nTC; iTC++){ if( aTC[iTC].pToken && aTC[iTC].pRoot==pRoot && (!pTC || aTC[iTC].nOvfl<pTC->nOvfl) ){ pTC = &aTC[iTC]; } } assert( pTC ); if( ii && pTC->nOvfl>=((nMinEst+(nLoad4/4)-1)/(nLoad4/4))*nDocSize ){ /* The number of overflow pages to load for this (and therefore all ** subsequent) tokens is greater than the estimated number of pages ** that will be loaded if all subsequent tokens are deferred. */ Fts3PhraseToken *pToken = pTC->pToken; rc = sqlite3Fts3DeferToken(pCsr, pToken, pTC->iCol); fts3SegReaderCursorFree(pToken->pSegcsr); pToken->pSegcsr = 0; }else{ nLoad4 = nLoad4*4; if( ii==0 || pTC->pPhrase->nToken>1 ){ /* Either this is the cheapest token in the entire query, or it is ** part of a multi-token phrase. Either way, the entire doclist will ** (eventually) be loaded into memory. It may as well be now. */ Fts3PhraseToken *pToken = pTC->pToken; int nList = 0; char *pList = 0; rc = fts3TermSelect(pTab, pToken, pTC->iCol, &nList, &pList); assert( rc==SQLITE_OK || pList==0 ); if( rc==SQLITE_OK ){ int nCount; fts3EvalPhraseMergeToken(pTab, pTC->pPhrase, pTC->iToken,pList,nList); nCount = fts3DoclistCountDocids( pTC->pPhrase->doclist.aAll, pTC->pPhrase->doclist.nAll ); if( ii==0 || nCount<nMinEst ) nMinEst = nCount; } } } pTC->pToken = 0; } return rc; } |
︙ | ︙ |
Changes to test/fts3auto.test.
︙ | ︙ | |||
103 104 105 106 107 108 109 | do_execsql_test $tn$title.4 " SELECT docid, mit(matchinfo($tbl, 'x')) FROM $tbl WHERE $tbl MATCH '$match' ORDER BY docid ASC " $matchinfo_asc } | | | > | > | 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | do_execsql_test $tn$title.4 " SELECT docid, mit(matchinfo($tbl, 'x')) FROM $tbl WHERE $tbl MATCH '$match' ORDER BY docid ASC " $matchinfo_asc } # fts3_make_deferrable TABLE TOKEN ?NROW? # proc fts3_make_deferrable {tbl token {nRow 0}} { set stmt [sqlite3_prepare db "SELECT * FROM $tbl" -1 dummy] set name [sqlite3_column_name $stmt 0] sqlite3_finalize $stmt if {$nRow==0} { set nRow [db one "SELECT count(*) FROM $tbl"] } set pgsz [db one "PRAGMA page_size"] execsql BEGIN for {set i 0} {$i < ($nRow * $pgsz * 1.2)/100} {incr i} { set doc [string repeat "$token " 100] execsql "INSERT INTO $tbl ($name) VALUES(\$doc)" } execsql "INSERT INTO $tbl ($name) VALUES('aaaaaaa ${token}aaaaa')" |
︙ | ︙ | |||
648 649 650 651 652 653 654 655 656 657 658 | do_fts3query_test 6.$tn.2 t1 {b:G AND c:I} do_fts3query_test 6.$tn.3 t1 {b:G NEAR c:I} do_fts3query_test 6.$tn.4 t1 {a:C OR b:G OR c:K OR d:C} do_fts3query_test 6.$tn.5 t1 {a:G OR b:G} catchsql { COMMIT } } set sqlite_fts3_enable_parentheses $sfep finish_test | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 | do_fts3query_test 6.$tn.2 t1 {b:G AND c:I} do_fts3query_test 6.$tn.3 t1 {b:G NEAR c:I} do_fts3query_test 6.$tn.4 t1 {a:C OR b:G OR c:K OR d:C} do_fts3query_test 6.$tn.5 t1 {a:G OR b:G} catchsql { COMMIT } } foreach {tn create} { 1 "fts4(x)" 2 "fts4(x, order=DESC)" } { execsql [subst { DROP TABLE IF EXISTS t1; CREATE VIRTUAL TABLE t1 USING $create; }] foreach {x} { "F E N O T K X V A X I E X A P G Q V H U" "R V A E T C V Q N I E L O N U G J K L U" "U Y I G W M V F J L X I D C H F P J Q B" "S G D Z X R P G S S Y B K A S G A I L L" "L S I C H T Z S R Q P R N K J X L F M J" "C C C D P X B Z C M A D A C X S B T X V" "W Y J M D R G V R K B X S A W R I T N C" "P K L W T M S P O Y Y V V O E H Q A I R" "C D Y I C Z F H J C O Y A Q F L S B D K" "P G S C Y C Y V I M B D S Z D D Y W I E" "Z K Z U E E S F Y X T U A L W O U J C Q" "P A T Z S W L P L Q V Y Y I P W U X S S" "I U I H U O F Z F R H R F T N D X A G M" "N A B M S H K X S O Y D T X S B R Y H Z" "L U D A S K I L S V Z J P U B E B Y H M" } { execsql { INSERT INTO t1 VALUES($x) } } # Add extra documents to the database such that token "B" will be considered # deferrable if considering the other tokens means that 2 or fewer documents # will be loaded into memory. # fts3_make_deferrable t1 B 2 # B is not deferred in either of the first two tests below, since filtering # on "M" or "D" returns 10 documents or so. But filtering on "M * D" only # returns 2, so B is deferred in this case. # do_fts3query_test 7.$tn.1 t1 {"M B"} do_fts3query_test 7.$tn.2 t1 {"B D"} do_fts3query_test 7.$tn.3 -deferred B t1 {"M B D"} } set sqlite_fts3_enable_parentheses $sfep finish_test |