Many hyperlinks are disabled. Use anonymous login to enable hyperlinks.
Overview
Comment: | Add a test for an fts5 tokenizer that supports synonyms by adding multiple entries to the fts index. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | fts5-incompatible |
Files: | files | file ages | folders |
SHA1: | 98d07d16cab92f1e7001afbe370df3ec |
User & Date: | dan 2015-08-29 18:46:12.456 |
Context
2015-08-31
| ||
20:06 | Begin changes to allow synonym support by adding multiple terms to a query (an alternative to adding multiple terms to the FTS index). (check-in: ad7feaed4c user: dan tags: fts5-incompatible) | |
2015-08-29
| ||
18:46 | Add a test for an fts5 tokenizer that supports synonyms by adding multiple entries to the fts index. (check-in: 98d07d16ca user: dan tags: fts5-incompatible) | |
15:44 | Another change to the fts5 tokenizer API. (check-in: fc71868496 user: dan tags: fts5-incompatible) | |
Changes
Changes to ext/fts5/fts5_storage.c.
︙ | ︙ | |||
363 364 365 366 367 368 369 | const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ int iEnd /* End offset of token */ ){ Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext; Fts5Index *pIdx = pCtx->pStorage->pIndex; | | | 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 | const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ int iEnd /* End offset of token */ ){ Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext; Fts5Index *pIdx = pCtx->pStorage->pIndex; if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){ pCtx->szCol++; } return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken); } /* ** If a row with rowid iDel is present in the %_content table, add the |
︙ | ︙ | |||
848 849 850 851 852 853 854 | int tflags, const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ int iEnd /* End offset of token */ ){ Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext; | | | 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 | int tflags, const char *pToken, /* Buffer containing token */ int nToken, /* Size of token in bytes */ int iStart, /* Start offset of token */ int iEnd /* End offset of token */ ){ Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext; if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){ pCtx->szCol++; } pCtx->cksum ^= sqlite3Fts5IndexCksum( pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken ); return SQLITE_OK; } |
︙ | ︙ |
Changes to ext/fts5/fts5_tcl.c.
︙ | ︙ | |||
688 689 690 691 692 693 694 | /************************************************************************* ** Start of tokenizer wrapper. */ typedef struct F5tTokenizerContext F5tTokenizerContext; typedef struct F5tTokenizerCb F5tTokenizerCb; typedef struct F5tTokenizerModule F5tTokenizerModule; | | > > > > > > | 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 | /************************************************************************* ** Start of tokenizer wrapper. */ typedef struct F5tTokenizerContext F5tTokenizerContext; typedef struct F5tTokenizerCb F5tTokenizerCb; typedef struct F5tTokenizerModule F5tTokenizerModule; typedef struct F5tTokenizerInstance F5tTokenizerInstance; struct F5tTokenizerContext { void *pCtx; int (*xToken)(void*, int, const char*, int, int, int); }; struct F5tTokenizerModule { Tcl_Interp *interp; Tcl_Obj *pScript; F5tTokenizerContext *pContext; }; struct F5tTokenizerInstance { Tcl_Interp *interp; Tcl_Obj *pScript; F5tTokenizerContext *pContext; }; static int f5tTokenizerCreate( void *pCtx, const char **azArg, |
︙ | ︙ | |||
757 758 759 760 761 762 763 764 765 766 767 768 769 | int (*xToken)(void*, int, const char*, int, int, int) ){ F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p; void *pOldCtx; int (*xOldToken)(void*, int, const char*, int, int, int); Tcl_Obj *pEval; int rc; pOldCtx = pInst->pContext->pCtx; xOldToken = pInst->pContext->xToken; pEval = Tcl_DuplicateObj(pInst->pScript); Tcl_IncrRefCount(pEval); | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | < < | < | > > > > > > | > > | > > | > > > > > | < | < | | | < | | | 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 | int (*xToken)(void*, int, const char*, int, int, int) ){ F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p; void *pOldCtx; int (*xOldToken)(void*, int, const char*, int, int, int); Tcl_Obj *pEval; int rc; const char *zFlags; pOldCtx = pInst->pContext->pCtx; xOldToken = pInst->pContext->xToken; pInst->pContext->pCtx = pCtx; pInst->pContext->xToken = xToken; assert( flags==FTS5_TOKENIZE_DOCUMENT || flags==FTS5_TOKENIZE_AUX || flags==FTS5_TOKENIZE_QUERY || flags==(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX) ); pEval = Tcl_DuplicateObj(pInst->pScript); Tcl_IncrRefCount(pEval); switch( flags ){ case FTS5_TOKENIZE_DOCUMENT: zFlags = "document"; break; case FTS5_TOKENIZE_AUX: zFlags = "aux"; break; case FTS5_TOKENIZE_QUERY: zFlags = "query"; break; case (FTS5_TOKENIZE_PREFIX | FTS5_TOKENIZE_QUERY): zFlags = "prefixquery"; break; default: assert( 0 ); zFlags = "invalid"; break; } Tcl_ListObjAppendElement(pInst->interp, pEval, Tcl_NewStringObj(zFlags, -1)); Tcl_ListObjAppendElement(pInst->interp, pEval, 
Tcl_NewStringObj(pText,nText)); rc = Tcl_EvalObjEx(pInst->interp, pEval, TCL_GLOBAL_ONLY); Tcl_DecrRefCount(pEval); pInst->pContext->pCtx = pOldCtx; pInst->pContext->xToken = xOldToken; return rc; } /* ** sqlite3_fts5_token ?-colocated? TEXT START END */ static int f5tTokenizerReturn( void * clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ F5tTokenizerContext *p = (F5tTokenizerContext*)clientData; int iStart; int iEnd; int nToken; int tflags = 0; char *zToken; int rc; if( objc==5 ){ int nArg; char *zArg = Tcl_GetStringFromObj(objv[1], &nArg); if( nArg<=10 && nArg>=2 && memcmp("-colocated", zArg, nArg)==0 ){ tflags |= FTS5_TOKEN_COLOCATED; }else{ goto usage; } }else if( objc!=4 ){ goto usage; } zToken = Tcl_GetStringFromObj(objv[objc-3], &nToken); if( Tcl_GetIntFromObj(interp, objv[objc-2], &iStart) || Tcl_GetIntFromObj(interp, objv[objc-1], &iEnd) ){ return TCL_ERROR; } if( p->xToken==0 ){ Tcl_AppendResult(interp, "sqlite3_fts5_token may only be used by tokenizer callback", 0 ); return TCL_ERROR; } rc = p->xToken(p->pCtx, tflags, zToken, nToken, iStart, iEnd); Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE); return TCL_OK; usage: Tcl_WrongNumArgs(interp, 1, objv, "?-colocated? TEXT START END"); return TCL_ERROR; } static void f5tDelTokenizer(void *pCtx){ F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx; Tcl_DecrRefCount(pMod->pScript); ckfree((char *)pMod); } |
︙ | ︙ |
Added ext/fts5/test/fts5synonym.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | # 2014 Dec 20 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #*********************************************************************** # # Tests focusing on custom tokenizers that support synonyms. # source [file join [file dirname [info script]] fts5_common.tcl] set testprefix fts5synonym # If SQLITE_ENABLE_FTS5 is defined, omit this file. 
ifcapable !fts5 { finish_test return } proc gobble_whitespace {textvar} { upvar $textvar t regexp {([ ]*)(.*)} $t -> space t return [string length $space] } proc gobble_text {textvar wordvar} { upvar $textvar t upvar $wordvar w regexp {([^ ]*)(.*)} $t -> w t return [string length $w] } proc do_tokenize_split {text} { set token "" set ret [list] set iOff [gobble_whitespace text] while {[set nToken [gobble_text text word]]} { lappend ret $word $iOff [expr $iOff+$nToken] incr iOff $nToken incr iOff [gobble_whitespace text] } set ret } proc tcl_tokenize {tflags text} { foreach {w iStart iEnd} [do_tokenize_split $text] { sqlite3_fts5_token $w $iStart $iEnd } } proc tcl_create {args} { return "tcl_tokenize" } sqlite3_fts5_create_tokenizer db tcl tcl_create #------------------------------------------------------------------------- # Warm body test for the code in fts5_tcl.c. # do_execsql_test 1.0 { CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl); INSERT INTO ft VALUES('abc def ghi'); INSERT INTO ft VALUES('jkl mno pqr'); SELECT rowid, x FROM ft WHERE ft MATCH 'def'; SELECT x, rowid FROM ft WHERE ft MATCH 'pqr'; } {1 {abc def ghi} {jkl mno pqr} 2} #------------------------------------------------------------------------- # Test a tokenizer that supports synonyms by adding extra entries to the # FTS index. 
# foreach S { {zero 0} {one 1} {two 2} {three 3 iii} {four 4} {five 5} {six 6} {seven 7} {eight 8} {nine 9} } { foreach s $S { set o [list] foreach x $S {if {$x!=$s} {lappend o $x}} set ::syn($s) $o } } proc tcl_tokenize {tflags text} { foreach {w iStart iEnd} [do_tokenize_split $text] { sqlite3_fts5_token $w $iStart $iEnd if {$tflags=="document" && [info exists ::syn($w)]} { foreach s $::syn($w) { sqlite3_fts5_token -colo $s $iStart $iEnd } } } } reset_db sqlite3_fts5_create_tokenizer db tcl tcl_create do_execsql_test 2.0 { CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl); INSERT INTO ft VALUES('one two three'); INSERT INTO ft VALUES('four five six'); INSERT INTO ft VALUES('eight nine ten'); } {} foreach {tn expr res} { 1 "3" 1 2 "eight OR 8 OR 5" {2 3} 3 "10" {} 4 "1*" {1} } { do_execsql_test 2.1.$tn { SELECT rowid FROM ft WHERE ft MATCH $expr } $res } #------------------------------------------------------------------------- # Test some broken tokenizers: # # 3.1.*: A tokenizer that declares the very first token to be colocated. # # 3.2.*: A tokenizer that reports two identical tokens at the same position. # This is allowed. 
# reset_db sqlite3_fts5_create_tokenizer db tcl tcl_create proc tcl_tokenize {tflags text} { set bColo 1 foreach {w iStart iEnd} [do_tokenize_split $text] { if {$bColo} { sqlite3_fts5_token -colo $w $iStart $iEnd set bColo 0 } { sqlite3_fts5_token $w $iStart $iEnd } } } do_execsql_test 3.1.0 { CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl); INSERT INTO ft VALUES('one two three'); CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row); SELECT * FROM vv; } { one 1 1 three 1 1 two 1 1 } do_execsql_test 3.1.1 { INSERT INTO ft(ft) VALUES('integrity-check'); } {} proc tcl_tokenize {tflags text} { foreach {w iStart iEnd} [do_tokenize_split $text] { sqlite3_fts5_token $w $iStart $iEnd } } do_execsql_test 3.1.2 { SELECT rowid FROM ft WHERE ft MATCH 'one two three' } {1} reset_db sqlite3_fts5_create_tokenizer db tcl tcl_create proc tcl_tokenize {tflags text} { foreach {w iStart iEnd} [do_tokenize_split $text] { sqlite3_fts5_token $w $iStart $iEnd sqlite3_fts5_token -colo $w $iStart $iEnd } } do_execsql_test 3.2.0 { CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl); INSERT INTO ft VALUES('one one two three'); CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row); SELECT * FROM vv; } { one 1 4 three 1 2 two 1 2 } do_execsql_test 3.2.1 { SELECT rowid FROM ft WHERE ft MATCH 'one two three'; SELECT rowid FROM ft WHERE ft MATCH 'one + one + two + three'; } {1 1} do_execsql_test 3.2.2 { SELECT rowid FROM ft WHERE ft MATCH 'one two two three'; SELECT rowid FROM ft WHERE ft MATCH 'one + two + two + three'; } {1} finish_test |