SQLite

Check-in [98d07d16ca]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add a test for an fts5 tokenizer that supports synonyms by adding multiple entries to the fts index.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5-incompatible
Files: files | file ages | folders
SHA1: 98d07d16cab92f1e7001afbe370df3ec6343fc1f
User & Date: dan 2015-08-29 18:46:12.456
Context
2015-08-31
20:06
Begin changes to allow synonym support by adding multiple terms to a query (an alternative to adding multiple terms to the FTS index). (check-in: ad7feaed4c user: dan tags: fts5-incompatible)
2015-08-29
18:46
Add a test for an fts5 tokenizer that supports synonyms by adding multiple entries to the fts index. (check-in: 98d07d16ca user: dan tags: fts5-incompatible)
15:44
Another change to the fts5 tokenizer API. (check-in: fc71868496 user: dan tags: fts5-incompatible)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts5/fts5_storage.c.
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */
){
  Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
  Fts5Index *pIdx = pCtx->pStorage->pIndex;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
    pCtx->szCol++;
  }
  return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken);
}

/*
** If a row with rowid iDel is present in the %_content table, add the







|







363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */
){
  Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
  Fts5Index *pIdx = pCtx->pStorage->pIndex;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){
    pCtx->szCol++;
  }
  return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken);
}

/*
** If a row with rowid iDel is present in the %_content table, add the
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */
){
  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
    pCtx->szCol++;
  }
  pCtx->cksum ^= sqlite3Fts5IndexCksum(
      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
  );
  return SQLITE_OK;
}







|







848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */
){
  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){
    pCtx->szCol++;
  }
  pCtx->cksum ^= sqlite3Fts5IndexCksum(
      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
  );
  return SQLITE_OK;
}
Changes to ext/fts5/fts5_tcl.c.
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703






704
705
706
707
708
709
710
/*************************************************************************
** Start of tokenizer wrapper.
*/

/* Forward declarations for the structure types used by the tokenizer
** wrapper code below. */
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
/* Note: F5tTokenizerInstance is declared here as another name for
** struct F5tTokenizerModule - the two typedefs refer to the same type. */
typedef struct F5tTokenizerModule F5tTokenizerInstance;

/*
** The token callback that the "sqlite3_fts5_token" Tcl command forwards
** tokens to, together with the context pointer passed as its first
** argument. The tokenize wrapper saves, overwrites and later restores
** both fields around the evaluation of the Tcl tokenizer script.
*/
struct F5tTokenizerContext {
  /* Passed as the first argument to xToken() */
  void *pCtx;
  /* Token callback. f5tTokenizerReturn() reports an error if this is 0. */
  int (*xToken)(void*, int, const char*, int, int, int);
};

/*
** State for a Tcl-implemented tokenizer. Because F5tTokenizerInstance is
** a typedef for this same structure, the tokenize wrapper also accesses
** these fields through an F5tTokenizerInstance pointer.
**
**   interp:   Interpreter used to evaluate pScript.
**   pScript:  Tcl script object. Its reference count is decremented by
**             f5tDelTokenizer().
**   pContext: Shared context holding the current xToken callback.
*/
struct F5tTokenizerModule {
  Tcl_Interp *interp;






  Tcl_Obj *pScript;
  F5tTokenizerContext *pContext;
};

static int f5tTokenizerCreate(
  void *pCtx, 
  const char **azArg, 







|








>
>
>
>
>
>







688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
/*************************************************************************
** Start of tokenizer wrapper.
*/

/* Forward declarations for the structure types used by the tokenizer
** wrapper code below. Module and instance are distinct struct types. */
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
typedef struct F5tTokenizerInstance F5tTokenizerInstance;

/*
** The token callback that the "sqlite3_fts5_token" Tcl command forwards
** tokens to, together with the context pointer passed as its first
** argument. The tokenize wrapper installs its own pCtx/xToken here before
** evaluating the Tcl tokenizer script and restores the previous values
** afterwards.
*/
struct F5tTokenizerContext {
  /* Passed as the first argument to xToken() */
  void *pCtx;
  /* Token callback. f5tTokenizerReturn() reports an error if this is 0. */
  int (*xToken)(void*, int, const char*, int, int, int);
};

/*
** State for a registered Tcl tokenizer module. An object of this type is
** released by f5tDelTokenizer(), which decrements the reference count of
** pScript before freeing the structure.
*/
struct F5tTokenizerModule {
  Tcl_Interp *interp;             /* Tcl interpreter associated with module */
  Tcl_Obj *pScript;               /* Script object (ref-counted) */
  F5tTokenizerContext *pContext;  /* Shared xToken callback context */
};

/*
** State for a single tokenizer instance. The tokenize wrapper duplicates
** pScript, appends the flags keyword and the text to be tokenized to the
** copy, and evaluates the result in interp.
*/
struct F5tTokenizerInstance {
  Tcl_Interp *interp;             /* Interpreter in which pScript is run */
  Tcl_Obj *pScript;               /* Command prefix of the tokenizer script */
  F5tTokenizerContext *pContext;  /* Shared xToken callback context */
};

static int f5tTokenizerCreate(
  void *pCtx, 
  const char **azArg, 
757
758
759
760
761
762
763

764
765
766
767









768
769



















770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795

796
797
798





799


800


801




802
803

804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
  int (*xToken)(void*, int, const char*, int, int, int)
){
  F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
  void *pOldCtx;
  int (*xOldToken)(void*, int, const char*, int, int, int);
  Tcl_Obj *pEval;
  int rc;


  pOldCtx = pInst->pContext->pCtx;
  xOldToken = pInst->pContext->xToken;










  pEval = Tcl_DuplicateObj(pInst->pScript);
  Tcl_IncrRefCount(pEval);



















  rc = Tcl_ListObjAppendElement(
      pInst->interp, pEval, Tcl_NewStringObj(pText, nText)
  );
  if( rc==TCL_OK ){
    rc = Tcl_EvalObjEx(pInst->interp, pEval, TCL_GLOBAL_ONLY);
  }
  Tcl_DecrRefCount(pEval);

  pInst->pContext->pCtx = pOldCtx;
  pInst->pContext->xToken = xOldToken;
  return rc;
}

/*
** sqlite3_fts5_token TEXT START END
**
** Implementation of the Tcl command used by a Tcl tokenizer script to
** hand a single token back to FTS5. TEXT is the token text and START and
** END are the integer offsets passed through to the xToken callback.
**
** The command may only be used while a tokenizer callback is active,
** i.e. while the xToken pointer of the F5tTokenizerContext passed as
** clientData is non-zero. Otherwise TCL_ERROR is returned with an
** explanatory message.
**
** After xToken() has been invoked, the interpreter result is set to the
** symbolic name of its return code (as returned by sqlite3ErrName()) and
** TCL_OK is returned, whatever that return code was.
*/
static int f5tTokenizerReturn(
  void * clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  F5tTokenizerContext *p = (F5tTokenizerContext*)clientData;
  int iStart;
  int iEnd;
  int nToken;
  char *zToken;
  int rc;

  assert( p );
  if( objc!=4 ){
    Tcl_WrongNumArgs(interp, 1, objv, "TEXT START END");
    return TCL_ERROR;
  }

  if( p->xToken==0 ){
    /* Tcl_AppendResult() is variadic: the argument list must be terminated
    ** by a null pointer of type (char*), not by a plain integer 0. */
    Tcl_AppendResult(interp, 
        "sqlite3_fts5_token may only be used by tokenizer callback", (char*)0
    );
    return TCL_ERROR;
  }

  zToken = Tcl_GetStringFromObj(objv[1], &nToken);
  if( Tcl_GetIntFromObj(interp, objv[2], &iStart) 
   || Tcl_GetIntFromObj(interp, objv[3], &iEnd) 
  ){
    return TCL_ERROR;
  }

  rc = p->xToken(p->pCtx, 0, zToken, nToken, iStart, iEnd);
  Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
  return TCL_OK;
}

static void f5tDelTokenizer(void *pCtx){
  F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
  Tcl_DecrRefCount(pMod->pScript);
  ckfree((char *)pMod);
}







>




>
>
>
>
>
>
>
>
>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|
<
<
|
<








|











>



>
>
>
>
>
|
>
>
|
>
>
|
>
>
>
>


>







|
<
|
<
|
|
|
<
|
|







763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806


807

808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858

859

860
861
862

863
864
865
866
867
868
869
870
871
  int (*xToken)(void*, int, const char*, int, int, int)
){
  F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
  void *pOldCtx;
  int (*xOldToken)(void*, int, const char*, int, int, int);
  Tcl_Obj *pEval;
  int rc;
  const char *zFlags;

  pOldCtx = pInst->pContext->pCtx;
  xOldToken = pInst->pContext->xToken;

  pInst->pContext->pCtx = pCtx;
  pInst->pContext->xToken = xToken;

  assert( 
      flags==FTS5_TOKENIZE_DOCUMENT
   || flags==FTS5_TOKENIZE_AUX
   || flags==FTS5_TOKENIZE_QUERY
   || flags==(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX)
  );
  pEval = Tcl_DuplicateObj(pInst->pScript);
  Tcl_IncrRefCount(pEval);
  switch( flags ){
    case FTS5_TOKENIZE_DOCUMENT:
      zFlags = "document";
      break;
    case FTS5_TOKENIZE_AUX:
      zFlags = "aux";
      break;
    case FTS5_TOKENIZE_QUERY:
      zFlags = "query";
      break;
    case (FTS5_TOKENIZE_PREFIX | FTS5_TOKENIZE_QUERY):
      zFlags = "prefixquery";
      break;
    default:
      assert( 0 );
      zFlags = "invalid";
      break;
  }

  Tcl_ListObjAppendElement(pInst->interp, pEval, Tcl_NewStringObj(zFlags, -1));
  Tcl_ListObjAppendElement(pInst->interp, pEval, Tcl_NewStringObj(pText,nText));


  rc = Tcl_EvalObjEx(pInst->interp, pEval, TCL_GLOBAL_ONLY);

  Tcl_DecrRefCount(pEval);

  pInst->pContext->pCtx = pOldCtx;
  pInst->pContext->xToken = xOldToken;
  return rc;
}

/*
** sqlite3_fts5_token ?-colocated? TEXT START END
**
** Implementation of the Tcl command used by a Tcl tokenizer script to
** hand a single token back to FTS5. TEXT is the token text and START and
** END are the integer offsets passed through to the xToken callback.
**
** If the optional first argument is present it must be "-colocated" or an
** abbreviation of it that is at least two characters long (e.g. "-colo").
** In that case the FTS5_TOKEN_COLOCATED flag is passed to xToken().
** Any other argument count or option produces the usage error.
**
** The command may only be used while a tokenizer callback is active,
** i.e. while the xToken pointer of the F5tTokenizerContext passed as
** clientData is non-zero. Otherwise TCL_ERROR is returned with an
** explanatory message.
**
** After xToken() has been invoked, the interpreter result is set to the
** symbolic name of its return code (as returned by sqlite3ErrName()) and
** TCL_OK is returned, whatever that return code was.
*/
static int f5tTokenizerReturn(
  void * clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  F5tTokenizerContext *p = (F5tTokenizerContext*)clientData;
  int iStart;
  int iEnd;
  int nToken;
  int tflags = 0;
  char *zToken;
  int rc;

  if( objc==5 ){
    int nArg;
    char *zArg = Tcl_GetStringFromObj(objv[1], &nArg);
    /* Accept any prefix of "-colocated" between 2 and 10 bytes long. */
    if( nArg<=10 && nArg>=2 && memcmp("-colocated", zArg, nArg)==0 ){
      tflags |= FTS5_TOKEN_COLOCATED;
    }else{
      goto usage;
    }
  }else if( objc!=4 ){
    goto usage;
  }

  /* TEXT, START and END are always the last three arguments. */
  zToken = Tcl_GetStringFromObj(objv[objc-3], &nToken);
  if( Tcl_GetIntFromObj(interp, objv[objc-2], &iStart) 
   || Tcl_GetIntFromObj(interp, objv[objc-1], &iEnd) 
  ){
    return TCL_ERROR;
  }

  if( p->xToken==0 ){
    /* Tcl_AppendResult() is variadic: the argument list must be terminated
    ** by a null pointer of type (char*), not by a plain integer 0. */
    Tcl_AppendResult(interp, 
        "sqlite3_fts5_token may only be used by tokenizer callback", (char*)0
    );
    return TCL_ERROR;
  }

  rc = p->xToken(p->pCtx, tflags, zToken, nToken, iStart, iEnd);
  Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
  return TCL_OK;

 usage:
  Tcl_WrongNumArgs(interp, 1, objv, "?-colocated? TEXT START END");
  return TCL_ERROR;
}

static void f5tDelTokenizer(void *pCtx){
  F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
  Tcl_DecrRefCount(pMod->pScript);
  ckfree((char *)pMod);
}
Added ext/fts5/test/fts5synonym.test.












































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on custom tokenizers that support synonyms.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5synonym

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}


# Remove any leading space characters (U+0020 only) from the variable
# named by $textvar in the caller's scope. Return the number of characters
# removed.
proc gobble_whitespace {textvar} {
  upvar 1 $textvar buf
  regexp {([ ]*)(.*)} $buf -> blanks buf
  string length $blanks
}

# Remove the leading run of non-space characters from the variable named
# by $textvar in the caller's scope and store it in the variable named by
# $wordvar. Return the length of that run (0 if the text is empty or
# begins with a space).
proc gobble_text {textvar wordvar} {
  upvar 1 $textvar buf $wordvar tok
  regexp {([^ ]*)(.*)} $buf -> tok buf
  string length $tok
}

# Split $text into space-separated words. Return a flat list containing
# three elements for each word:
#
#     WORD START END
#
# where START and END are the offsets of the first character of the word
# and of the first character following it. Offsets are counted in
# characters, as computed by [string length].
proc do_tokenize_split {text} {
  set ret [list]
  set iOff [gobble_whitespace text]
  while {[set nToken [gobble_text text word]]} {
    # Braced so that the expression is substituted only once.
    lappend ret $word $iOff [expr {$iOff+$nToken}]
    incr iOff $nToken
    incr iOff [gobble_whitespace text]
  }

  return $ret
}

# Tokenizer callback: report each space-separated word of $text to FTS5
# via [sqlite3_fts5_token]. The $tflags argument is ignored.
proc tcl_tokenize {tflags text} {
  set lToken [do_tokenize_split $text]
  foreach {tok iFrom iTo} $lToken {
    sqlite3_fts5_token $tok $iFrom $iTo
  }
}

# Tokenizer "create" callback. Whatever arguments it is passed, it returns
# the name of the command that performs the tokenization.
proc tcl_create {args} {
  set xTokenize "tcl_tokenize"
  return $xTokenize
}

# Register a tokenizer named "tcl" with database handle [db], using
# [tcl_create] as the script that creates tokenizer instances.
sqlite3_fts5_create_tokenizer db tcl tcl_create

#-------------------------------------------------------------------------
# Warm body test for the code in fts5_tcl.c.
#
# Two rows are inserted using the Tcl tokenizer. The first query returns
# (rowid, x) for the row matching 'def', the second returns (x, rowid) for
# the row matching 'pqr'.
#
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
  INSERT INTO ft VALUES('abc def ghi');
  INSERT INTO ft VALUES('jkl mno pqr');
  SELECT rowid, x FROM ft WHERE ft MATCH 'def';
  SELECT x, rowid FROM ft WHERE ft MATCH 'pqr';
} {1 {abc def ghi} {jkl mno pqr} 2}

#-------------------------------------------------------------------------
# Test a tokenizer that supports synonyms by adding extra entries to the
# FTS index.
#
# Populate the global array ::syn. Each element of the outer list is a
# group of mutually synonymous terms. For each term s in a group,
# ::syn(s) is set to the list of the other terms in the same group (for
# example, ::syn(three) is {3 iii} and ::syn(0) is {zero}).
foreach S {
  {zero 0}
  {one 1}
  {two 2}
  {three 3 iii}
  {four 4}
  {five 5}
  {six 6}
  {seven 7}
  {eight 8}
  {nine 9}
} {
  foreach s $S {
    # Collect every member of the group other than $s itself.
    set o [list]
    foreach x $S {if {$x!=$s} {lappend o $x}}
    set ::syn($s) $o
  }
}

# Tokenizer callback with synonym support. Each word of $text is reported
# via [sqlite3_fts5_token]. In addition, when $tflags is "document" and
# the word has an entry in ::syn, each of its synonyms is reported as a
# colocated token with the same start and end offsets as the word.
proc tcl_tokenize {tflags text} {
  set bDocument [expr {$tflags=="document"}]
  foreach {tok iFrom iTo} [do_tokenize_split $text] {
    sqlite3_fts5_token $tok $iFrom $iTo
    if {!$bDocument || ![info exists ::syn($tok)]} continue
    foreach alt $::syn($tok) {
      sqlite3_fts5_token -colo $alt $iFrom $iTo
    }
  }
}
# Start over with a fresh database and register the "tcl" tokenizer on it
# (presumably [reset_db] discards the previous database - see tester.tcl).
reset_db
sqlite3_fts5_create_tokenizer db tcl tcl_create

# Three documents. 'ten' has no entry in ::syn, so no synonyms are added
# to the index for it.
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
  INSERT INTO ft VALUES('one two three');
  INSERT INTO ft VALUES('four five six');
  INSERT INTO ft VALUES('eight nine ten');
} {}

# Each triple is: test number, MATCH expression, expected rowids. Because
# synonyms are only added when tokenizing documents, these queries hit
# the extra index entries (e.g. "3" finds the row containing 'three').
foreach {tn expr res} {
  1 "3" 1
  2 "eight OR 8 OR 5" {2 3}
  3 "10" {}
  4 "1*" {1}
} {
  do_execsql_test 2.1.$tn {
    SELECT rowid FROM ft WHERE ft MATCH $expr
  } $res
}

#-------------------------------------------------------------------------
# Test some broken tokenizers:
#
#   3.1.*: A tokenizer that declares the very first token to be colocated.
#
#   3.2.*: A tokenizer that reports two identical tokens at the same position.
#          This is allowed.
#
# Fresh database for the 3.1.* tests; the "tcl" tokenizer is registered
# again on the new database handle.
reset_db
sqlite3_fts5_create_tokenizer db tcl tcl_create
# Deliberately broken tokenizer callback: the very first token of each
# text is reported as colocated (-colo), all following tokens are reported
# normally.
proc tcl_tokenize {tflags text} {
  set iTok 0
  foreach {tok iFrom iTo} [do_tokenize_split $text] {
    if {$iTok==0} {
      sqlite3_fts5_token -colo $tok $iFrom $iTo
    } else {
      sqlite3_fts5_token $tok $iFrom $iTo
    }
    incr iTok
  }
}
# Insert a document whose first token is (incorrectly) flagged as
# colocated, then dump the vocabulary through an fts5vocab table. Each of
# the three terms is expected to be listed with the values 1 and 1.
do_execsql_test 3.1.0 {
  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
  INSERT INTO ft VALUES('one two three');
  CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row);
  SELECT * FROM vv;
} {
  one 1 1   three 1 1   two 1 1
}

# The integrity-check must succeed even though the first token was
# reported as colocated.
do_execsql_test 3.1.1 {
  INSERT INTO ft(ft) VALUES('integrity-check');
} {}

# Restore the well-behaved tokenizer callback: report each space-separated
# word of $text via [sqlite3_fts5_token]. The $tflags argument is ignored.
proc tcl_tokenize {tflags text} {
  set lToken [do_tokenize_split $text]
  foreach {tok iFrom iTo} $lToken {
    sqlite3_fts5_token $tok $iFrom $iTo
  }
}

# With the well-behaved tokenizer now used for the query, all three terms
# of the document inserted by test 3.1.0 must still be found in row 1.
do_execsql_test 3.1.2 {
  SELECT rowid FROM ft WHERE ft MATCH 'one two three'
} {1}

# Fresh database for the 3.2.* tests.
reset_db
sqlite3_fts5_create_tokenizer db tcl tcl_create
# Tokenizer callback that reports every word twice at the same position:
# once as a normal token and once more as a colocated (-colo) token with
# identical text and offsets.
proc tcl_tokenize {tflags text} {
  set lToken [do_tokenize_split $text]
  foreach {tok iFrom iTo} $lToken {
    sqlite3_fts5_token $tok $iFrom $iTo
    sqlite3_fts5_token -colo $tok $iFrom $iTo
  }
}
# Every token of the document is reported twice, so the vocabulary shows
# doubled counts: 'one' occurs twice in the text and is listed with 4,
# 'two' and 'three' occur once each and are listed with 2.
do_execsql_test 3.2.0 {
  CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
  INSERT INTO ft VALUES('one one two three');
  CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row);
  SELECT * FROM vv;
} {
  one 1 4   three 1 2   two 1 2
}
# Both the plain query and the phrase query that mirrors the document
# text match row 1.
do_execsql_test 3.2.1 {
  SELECT rowid FROM ft WHERE ft MATCH 'one two three';
  SELECT rowid FROM ft WHERE ft MATCH 'one + one + two + three';
} {1 1}
# The first query matches row 1. The phrase query does not correspond to
# the document text and returns no rows, hence the single result.
do_execsql_test 3.2.2 {
  SELECT rowid FROM ft WHERE ft MATCH 'one two two three';
  SELECT rowid FROM ft WHERE ft MATCH 'one + two + two + three';
} {1}

finish_test