/ Check-in [d281cb89]
Login
SQLite training in Houston TX on 2019-11-05 (details)
Part of the 2019 Tcl Conference

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix a bug in merging FTS language tables for languages other than language 0.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-languageid
Files: files | file ages | folders
SHA1: d281cb8984c911a4c0cce2ec299e1351d8e580e4
User & Date: dan 2012-03-02 11:48:50
Context
2012-03-02
12:26
Fix the FTS 'optimize' command on multi-lingual databases. check-in: 65fa6937 user: dan tags: fts4-languageid
11:48
Fix a bug in merging FTS language tables for languages other than language 0. check-in: d281cb89 user: dan tags: fts4-languageid
2012-03-01
19:44
Add the "languageid=" option to fts4. This code is still largely untested and alsmost certainly buggy. check-in: bea257f7 user: dan tags: fts4-languageid
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_write.c.

   484    484             getAbsoluteLevel(p, iLangid, iIndex, FTS3_SEGDIR_MAXLEVEL-1)
   485    485         );
   486    486       }
   487    487     }else{
   488    488       /* "SELECT * FROM %_segdir WHERE level = ? ORDER BY ..." */
   489    489       rc = fts3SqlStmt(p, SQL_SELECT_LEVEL, &pStmt, 0);
   490    490       if( rc==SQLITE_OK ){ 
   491         -      sqlite3_bind_int(pStmt, 1, iLevel+iIndex*FTS3_SEGDIR_MAXLEVEL);
          491  +      sqlite3_bind_int(pStmt, 1, getAbsoluteLevel(p, iLangid, iIndex, iLevel));
   492    492       }
   493    493     }
   494    494     *ppStmt = pStmt;
   495    495     return rc;
   496    496   }
   497    497   
   498    498   

Changes to test/fts4langid.test.

    20     20   ifcapable !fts3 {
    21     21     finish_test
    22     22     return
    23     23   }
    24     24   
    25     25   set ::testprefix fts4langid
    26     26   
           27  +#---------------------------------------------------------------------------
           28  +# Test plan:
           29  +#
           30  +#   1.* - Warm-body tests created for specific purposes during development.
           31  +#         Passing these doesn't really prove much.
           32  +#
           33  +#   2.* - Test that FTS queries only ever return rows associated with
           34  +#         the requested language.
           35  +#
           36  +#   3.* - Test that the 'optimize' and 'rebuild' commands work correctly.
           37  +#
           38  +#   4.* - Test that if one is provided, the tokenizer xLanguage method
           39  +#         is called to configure the tokenizer before tokenizing query
           40  +#         or document text.
           41  +#
           42  +#   5.* - Test the fts4aux table when the associated FTS4 table contains
           43  +#         multiple languages.
           44  +#
           45  +#   6.* - Tests with content= tables. Both where there is a real 
           46  +#         underlying content table and where there is not.
           47  +#
    27     48   
    28     49   
    29     50   do_execsql_test 1.1 {
    30     51     CREATE VIRTUAL TABLE t1 USING fts4(a, b, languageid=lang_id);
    31     52   }
    32     53   
    33     54   do_execsql_test 1.2 {
................................................................................
    69     90   do_execsql_test 1.16 {
    70     91     SELECT rowid FROM t1 WHERE t1 MATCH 'b' AND lang_id = 1;
    71     92   } {2}
    72     93   
    73     94   do_catchsql_test 1.17 {
    74     95     INSERT INTO t1(content, lang_id) VALUES('123', -1);
    75     96   } {1 {constraint failed}}
           97  +
           98  +do_execsql_test 1.18 {
           99  +  DROP TABLE t1;
          100  +  CREATE VIRTUAL TABLE t1 USING fts4(languageid=lang_id);
          101  +  INSERT INTO t1(content, lang_id) VALUES('A', 13);
          102  +  INSERT INTO t1(content, lang_id) VALUES('B', 13);
          103  +  INSERT INTO t1(content, lang_id) VALUES('C', 13);
          104  +  INSERT INTO t1(content, lang_id) VALUES('D', 13);
          105  +  INSERT INTO t1(content, lang_id) VALUES('E', 13);
          106  +  INSERT INTO t1(content, lang_id) VALUES('F', 13);
          107  +  INSERT INTO t1(content, lang_id) VALUES('G', 13);
          108  +  INSERT INTO t1(content, lang_id) VALUES('H', 13);
          109  +  INSERT INTO t1(content, lang_id) VALUES('I', 13);
          110  +  INSERT INTO t1(content, lang_id) VALUES('J', 13);
          111  +  INSERT INTO t1(content, lang_id) VALUES('K', 13);
          112  +  INSERT INTO t1(content, lang_id) VALUES('L', 13);
          113  +  INSERT INTO t1(content, lang_id) VALUES('M', 13);
          114  +  INSERT INTO t1(content, lang_id) VALUES('N', 13);
          115  +  INSERT INTO t1(content, lang_id) VALUES('O', 13);
          116  +  INSERT INTO t1(content, lang_id) VALUES('P', 13);
          117  +  INSERT INTO t1(content, lang_id) VALUES('Q', 13);
          118  +  INSERT INTO t1(content, lang_id) VALUES('R', 13);
          119  +  INSERT INTO t1(content, lang_id) VALUES('S', 13);
          120  +  SELECT rowid FROM t1 WHERE t1 MATCH 'A';
          121  +} {}
          122  +
          123  +
          124  +#-------------------------------------------------------------------------
          125  +# Test cases 2.*
          126  +#
          127  +
          128  +proc build_multilingual_db_1 {db} {
          129  +  $db eval { CREATE VIRTUAL TABLE t2 USING fts4(x, y, languageid=l) }
          130  +
          131  +  set xwords [list zero one two three four five six seven eight nine ten]
          132  +  set ywords [list alpha beta gamma delta epsilon zeta eta theta iota kappa]
          133  +
          134  +  for {set i 0} {$i < 1000} {incr i} {
          135  +    set iLangid [expr $i%9]
          136  +    set x ""
          137  +    set y ""
          138  +
          139  +    set x [list]
          140  +    lappend x [lindex $xwords [expr ($i / 1000) % 10]]
          141  +    lappend x [lindex $xwords [expr ($i / 100)  % 10]]
          142  +    lappend x [lindex $xwords [expr ($i / 10)   % 10]]
          143  +    lappend x [lindex $xwords [expr ($i / 1)   % 10]]
          144  +
          145  +    set y [list]
          146  +    lappend y [lindex $ywords [expr ($i / 1000) % 10]]
          147  +    lappend y [lindex $ywords [expr ($i / 100)  % 10]]
          148  +    lappend y [lindex $ywords [expr ($i / 10)   % 10]]
          149  +    lappend y [lindex $ywords [expr ($i / 1)   % 10]]
          150  +
          151  +    $db eval { INSERT INTO t2(docid, x, y, l) VALUES($i, $x, $y, $iLangid) }
          152  +  }
          153  +}
          154  +
          155  +proc rowid_list_set_langid {langid} {
          156  +  set ::rowid_list_langid $langid
          157  +}
          158  +proc rowid_list {pattern} {
          159  +  set langid $::rowid_list_langid
          160  +  set res [list]
          161  +  db eval {SELECT docid, x, y FROM t2 WHERE l = $langid ORDER BY docid ASC} {
          162  +    if {[string match "*$pattern*" $x] || [string match "*$pattern*" $y]} {
          163  +      lappend res $docid
          164  +    }
          165  +  }
          166  +  return $res
          167  +}
          168  +
          169  +proc or_merge_list {list1 list2} {
          170  +  set res [list]
          171  +
          172  +  set i1 0
          173  +  set i2 0
          174  +
          175  +  set n1 [llength $list1]
          176  +  set n2 [llength $list2]
          177  +
          178  +  while {$i1 < $n1 && $i2 < $n2} {
          179  +    set e1 [lindex $list1 $i1]
          180  +    set e2 [lindex $list2 $i2]
          181  +
          182  +    if {$e1==$e2} {
          183  +      lappend res $e1
          184  +      incr i1
          185  +      incr i2
          186  +    } elseif {$e1 < $e2} {
          187  +      lappend res $e1
          188  +      incr i1
          189  +    } else {
          190  +      lappend res $e2
          191  +      incr i2
          192  +    }
          193  +  }
          194  +
          195  +  concat $res [lrange $list1 $i1 end] [lrange $list2 $i2 end]
          196  +}
          197  +
          198  +proc or_merge_lists {args} {
          199  +  set res [lindex $args 0]
          200  +  for {set i 1} {$i < [llength $args]} {incr i} {
          201  +    set res [or_merge_list $res [lindex $args $i]]
          202  +  }
          203  +  set res
          204  +}
          205  +
          206  +proc and_merge_list {list1 list2} {
          207  +  foreach i $list2 { set a($i) 1 }
          208  +  set res [list]
          209  +  foreach i $list1 {
          210  +    if {[info exists a($i)]} {lappend res $i}
          211  +  }
          212  +  set res
          213  +}
          214  +
          215  +
          216  +proc and_merge_lists {args} {
          217  +  set res [lindex $args 0]
          218  +  for {set i 1} {$i < [llength $args]} {incr i} {
          219  +    set res [and_merge_list $res [lindex $args $i]]
          220  +  }
          221  +  set res
          222  +}
          223  +
          224  +proc filter_list {list langid} {
          225  +  set res [list]
          226  +  foreach i $list {
          227  +    if {($i % 9) == $langid} {lappend res $i}
          228  +  }
          229  +  set res
          230  +}
          231  +
          232  +do_test 2.0 { 
          233  +  reset_db
          234  +  build_multilingual_db_1 db
          235  +} {}
          236  +
          237  +proc do_test_2.1 {tn query res_script} {
          238  +  for {set langid 0} {$langid < 10} {incr langid} {
          239  +    rowid_list_set_langid $langid
          240  +    set res [eval $res_script]
          241  +
          242  +    set actual [
          243  +      execsql {SELECT docid FROM t2 WHERE t2 MATCH $query AND l = $langid}
          244  +    ]
          245  +    do_test 2.1.$tn.$langid [list set {} $actual] $res
          246  +  }
          247  +}
          248  +
          249  +do_test_2.1 1  {delta}          { rowid_list delta }
          250  +do_test_2.1 2  {"zero one two"} { rowid_list "zero one two" }
          251  +do_test_2.1 3  {zero one two} {
          252  +  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
          253  +}
          254  +do_test_2.1 4  {"zero one" OR "one two"} {
          255  +  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
          256  +}
    76    257   
    77    258   finish_test
    78    259