/ Check-in [790f76a5]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Have the FTS unicode61 tokenizer strip out diacritics when tokenizing text. This can be disabled by specifying the tokenizer option "remove_diacritics=0".
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 790f76a5898dad1a955d40edddf11f7b0fec0ccd
User & Date: dan 2012-06-06 19:30:38
Context
2012-06-06
19:51
Disable FTS unicode61 by default. It is enabled by specifying compile time option SQLITE_ENABLE_FTS4_UNICODE61. check-in: eccd6b65 user: dan tags: trunk
19:30
Have the FTS unicode61 tokenizer strip out diacritics when tokenizing text. This can be disabled by specifying the tokenizer option "remove_diacritics=0". check-in: 790f76a5 user: dan tags: trunk
19:01
Avoid resetting the shared-cache schema when one of the connections using the shared cache closes. Delay resetting the schema until the last connection closes. check-in: 635e3a76 user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3Int.h.

   538    538   int sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol, char **); 
   539    539   int sqlite3Fts3MsrOvfl(Fts3Cursor *, Fts3MultiSegReader *, int *);
   540    540   int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);
   541    541   
   542    542   int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
   543    543   
   544    544   /* fts3_unicode2.c (functions generated by parsing unicode text files) */
   545         -int sqlite3FtsUnicodeTolower(int);
          545  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
          546  +int sqlite3FtsUnicodeFold(int, int);
   546    547   int sqlite3FtsUnicodeIsalnum(int);
          548  +int sqlite3FtsUnicodeIsdiacritic(int);
          549  +#endif
   547    550   
   548    551   #endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
   549    552   #endif /* _FTSINT_H */

Changes to ext/fts3/fts3_unicode.c.

    78     78   #endif /* ifndef SQLITE_AMALGAMATION */
    79     79   
    80     80   typedef struct unicode_tokenizer unicode_tokenizer;
    81     81   typedef struct unicode_cursor unicode_cursor;
    82     82   
    83     83   struct unicode_tokenizer {
    84     84     sqlite3_tokenizer base;
           85  +  int bRemoveDiacritic;
    85     86   };
    86     87   
    87     88   struct unicode_cursor {
    88     89     sqlite3_tokenizer_cursor base;
    89     90     const unsigned char *aInput;    /* Input text being tokenized */
    90     91     int nInput;                     /* Size of aInput[] in bytes */
    91     92     int iOff;                       /* Current offset within aInput[] */
................................................................................
    99    100   */
   100    101   static int unicodeCreate(
   101    102     int nArg,                       /* Size of array argv[] */
   102    103     const char * const *azArg,      /* Tokenizer creation arguments */
   103    104     sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
   104    105   ){
   105    106     unicode_tokenizer *pNew;        /* New tokenizer object */
          107  +  int i;
   106    108     pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
   107    109     if( pNew==NULL ){
   108    110       return SQLITE_NOMEM;
   109    111     }
   110    112     memset(pNew, 0, sizeof(unicode_tokenizer));
          113  +  pNew->bRemoveDiacritic = 1;
          114  +
          115  +  for(i=0; i<nArg; i++){
          116  +    const char *z = azArg[i];
          117  +    int n = strlen(z);
          118  +
          119  +    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
          120  +      pNew->bRemoveDiacritic = 1;
          121  +    }
          122  +    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
          123  +      pNew->bRemoveDiacritic = 0;
          124  +    }
          125  +    else{
          126  +      /* Unrecognized argument */
          127  +      return SQLITE_ERROR;
          128  +    }
          129  +  }
          130  +
   111    131     *pp = &pNew->base;
   112    132     return SQLITE_OK;
   113    133   }
   114    134   
   115    135   /*
   116    136   ** Destroy a tokenizer allocated by unicodeCreate().
   117    137   */
................................................................................
   193    213       if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
   194    214       zStart = z;
   195    215     }
   196    216     if( zStart>=zTerm ) return SQLITE_DONE;
   197    217   
   198    218     zOut = pCsr->zToken;
   199    219     do {
          220  +    int iOut;
          221  +
   200    222       /* Grow the output buffer if required. */
   201    223       if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
   202    224         char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
   203    225         if( !zNew ) return SQLITE_NOMEM;
   204    226         zOut = &zNew[zOut - pCsr->zToken];
   205    227         pCsr->zToken = zNew;
   206    228         pCsr->nAlloc += 64;
   207    229       }
   208    230   
   209    231       /* Write the folded case of the last character read to the output */
   210    232       zEnd = z;
   211         -    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
          233  +    iOut = sqlite3FtsUnicodeFold(iCode, 
          234  +        ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic
          235  +    );
          236  +    if( iOut ){
          237  +      WRITE_UTF8(zOut, iOut);
          238  +    }
   212    239   
   213    240       /* If the cursor is not at EOF, read the next character */
   214    241       if( z>=zTerm ) break;
   215    242       READ_UTF8(z, zTerm, iCode);
   216         -  }while( sqlite3FtsUnicodeIsalnum(iCode) );
          243  +  }while( sqlite3FtsUnicodeIsalnum(iCode) 
          244  +       || sqlite3FtsUnicodeIsdiacritic(iCode)
          245  +  );
   217    246   
   218    247     /* Set the output variables and return. */
   219    248     pCsr->iOff = (z - pCsr->aInput);
   220    249     *paToken = pCsr->zToken;
   221    250     *pnToken = zOut - pCsr->zToken;
   222    251     *piStart = (zStart - pCsr->aInput);
   223    252     *piEnd = (zEnd - pCsr->aInput);

Changes to ext/fts3/fts3_unicode2.c.

   147    147       assert( aEntry[0]<key );
   148    148       assert( key>=aEntry[iRes] );
   149    149       return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
   150    150     }
   151    151     return 1;
   152    152   }
   153    153   
          154  +
          155  +/*
          156  +** If the argument is a codepoint corresponding to a lowercase letter
          157  +** in the ASCII range with a diacritic added, return the codepoint
          158  +** of the ASCII letter only. For example, if passed 235 - "LATIN
          159  +** SMALL LETTER E WITH DIAERESIS" - return 101 ("LATIN SMALL LETTER
          160  +** E"). The results of passing a codepoint that corresponds to an
          161  +** uppercase letter are undefined.
          162  +*/
          163  +static int remove_diacritic(int c){
          164  +  unsigned short aDia[] = {
          165  +        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
          166  +     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
          167  +     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
          168  +     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
          169  +     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
          170  +     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
          171  +     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
          172  +     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
          173  +    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
          174  +    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
          175  +    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
          176  +    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
          177  +    62924, 63050, 63082, 63274, 63390, 
          178  +  };
          179  +  char aChar[] = {
          180  +    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
          181  +    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
          182  +    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
          183  +    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
          184  +    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
          185  +    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
          186  +    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
          187  +    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
          188  +    'e',  'i',  'o',  'u',  'y',  
          189  +  };
          190  +
          191  +  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
          192  +  int iRes = 0;
          193  +  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
          194  +  int iLo = 0;
          195  +  while( iHi>=iLo ){
          196  +    int iTest = (iHi + iLo) / 2;
          197  +    if( key >= aDia[iTest] ){
          198  +      iRes = iTest;
          199  +      iLo = iTest+1;
          200  +    }else{
          201  +      iHi = iTest-1;
          202  +    }
          203  +  }
          204  +  assert( key>=aDia[iRes] );
          205  +  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
          206  +};
          207  +
          208  +
          209  +/*
          210  +** Return true if the argument interpreted as a unicode codepoint
          211  +** is a diacritical modifier character.
          212  +*/
          213  +int sqlite3FtsUnicodeIsdiacritic(int c){
          214  +  unsigned int mask0 = 0x08029FDF;
          215  +  unsigned int mask1 = 0x000361F8;
          216  +  if( c<768 || c>817 ) return 0;
          217  +  return (c < 768+32) ?
          218  +      (mask0 & (1 << (c-768))) :
          219  +      (mask1 & (1 << (c-768-32)));
          220  +}
          221  +
   154    222   
   155    223   /*
   156    224   ** Interpret the argument as a unicode codepoint. If the codepoint
   157    225   ** is an upper case character that has a lower case equivalent,
   158    226   ** return the codepoint corresponding to the lower case version.
   159    227   ** Otherwise, return a copy of the argument.
   160    228   **
   161    229   ** The results are undefined if the value passed to this function
   162    230   ** is less than zero.
   163    231   */
   164         -int sqlite3FtsUnicodeTolower(int c){
          232  +int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
   165    233     /* Each entry in the following array defines a rule for folding a range
   166    234     ** of codepoints to lower case. The rule applies to a range of nRange
   167    235     ** codepoints starting at codepoint iCode.
   168    236     **
   169    237     ** If the least significant bit in flags is clear, then the rule applies
   170    238     ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
   171    239     ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
   280    348       if( iRes>=0 ){
   281    349         const struct TableEntry *p = &aEntry[iRes];
   282    350         if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
   283    351           ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   284    352           assert( ret>0 );
   285    353         }
   286    354       }
          355  +
          356  +    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
   287    357     }
   288    358     
   289    359     else if( c>=66560 && c<66600 ){
   290    360       ret = c + 40;
   291    361     }
   292    362   
   293    363     return ret;
   294    364   }
   295    365   #endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
   296    366   #endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */

Changes to ext/fts3/unicode/mkunicode.tcl.

     1      1   
            2  +#
            3  +# Parameter $zName must be a path to the file UnicodeData.txt. This command
            4  +# reads the file and returns a list of mappings required to remove all
            5  +# diacritical marks from a unicode string. Each mapping is itself a list
            6  +# consisting of two elements - the unicode codepoint and the single ASCII
            7  +# character that it should be replaced with, or an empty string if the 
            8  +# codepoint should simply be removed from the input. Examples:
            9  +#
           10  +#   { 224 a  }     (replace codepoint 224 with "a")
           11  +#   { 769 "" }     (remove codepoint 769 from input)
           12  +#
           13  +# Mappings are only returned for non-upper case codepoints. It is assumed
           14  +# that the input has already been folded to lower case.
           15  +#
           16  +proc rd_load_unicodedata_text {zName} {
           17  +  global tl_lookup_table
           18  +
           19  +  set fd [open $zName]
           20  +  set lField {
           21  +    code
           22  +    character_name
           23  +    general_category
           24  +    canonical_combining_classes
           25  +    bidirectional_category
           26  +    character_decomposition_mapping
           27  +    decimal_digit_value
           28  +    digit_value
           29  +    numeric_value
           30  +    mirrored
           31  +    unicode_1_name
           32  +    iso10646_comment_field
           33  +    uppercase_mapping
           34  +    lowercase_mapping
           35  +    titlecase_mapping
           36  +  }
           37  +  set lRet [list]
           38  +
           39  +  while { ![eof $fd] } {
           40  +    set line [gets $fd]
           41  +    if {$line == ""} continue
           42  +
           43  +    set fields [split $line ";"]
           44  +    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
           45  +    foreach $lField $fields {}
           46  +    if { [llength $character_decomposition_mapping]!=2
           47  +      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
           48  +    } {
           49  +      continue
           50  +    }
           51  +
           52  +    set iCode  [expr "0x$code"]
           53  +    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
           54  +    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]
           55  +
           56  +    if {[info exists tl_lookup_table($iCode)]} continue
           57  +
           58  +    if { ($iAscii >= 97 && $iAscii <= 122)
           59  +      || ($iAscii >= 65 && $iAscii <= 90)
           60  +    } {
           61  +      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
           62  +      set dia($iDia) 1
           63  +    }
           64  +  }
           65  +
           66  +  foreach d [array names dia] {
           67  +    lappend lRet [list $d ""]
           68  +  }
           69  +  set lRet [lsort -integer -index 0 $lRet]
           70  +
           71  +  close $fd
           72  +  set lRet
           73  +}
           74  +
           75  +
           76  +proc print_rd {map} {
           77  +  global tl_lookup_table
           78  +  set aChar [list]
           79  +  set lRange [list]
           80  +
           81  +  set nRange 1
           82  +  set iFirst  [lindex $map 0 0]
           83  +  set cPrev   [lindex $map 0 1]
           84  +
           85  +  foreach m [lrange $map 1 end] {
           86  +    foreach {i c} $m {}
           87  +
           88  +    if {$cPrev == $c} {
           89  +      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
           90  +        if {[info exists tl_lookup_table($j)]==0} break
           91  +      }
           92  +
           93  +      if {$j==$i} {
           94  +        set nNew [expr {(1 + $i - $iFirst)}]
           95  +        if {$nNew<=8} {
           96  +          set nRange $nNew
           97  +          continue
           98  +        }
           99  +      }
          100  +    }
          101  +
          102  +    lappend lRange [list $iFirst $nRange]
          103  +    lappend aChar  $cPrev
          104  +
          105  +    set iFirst $i
          106  +    set cPrev  $c
          107  +    set nRange 1
          108  +  }
          109  +  lappend lRange [list $iFirst $nRange]
          110  +  lappend aChar $cPrev
          111  +
          112  +  puts "/*"
          113  +  puts "** If the argument is a codepoint corresponding to a lowercase letter"
          114  +  puts "** in the ASCII range with a diacritic added, return the codepoint"
          115  +  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
          116  +  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
          117  +  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
          118  +  puts "** uppercase letter are undefined."
          119  +  puts "*/"
          120  +  puts "static int remove_diacritic(int c)\{"
          121  +  puts "  unsigned short aDia\[\] = \{"
          122  +  puts -nonewline "        0, "
          123  +  set i 1
          124  +  foreach r $lRange {
          125  +    foreach {iCode nRange} $r {}
          126  +    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
          127  +    incr i
          128  +
          129  +    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
          130  +    puts -nonewline ", "
          131  +  }
          132  +  puts ""
          133  +  puts "  \};"
          134  +  puts "  char aChar\[\] = \{"
          135  +  puts -nonewline "    '\\0', "
          136  +  set i 1
          137  +  foreach c $aChar {
          138  +    set str "'$c',  "
          139  +    if {$c == ""} { set str "'\\0', " }
          140  +
          141  +    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
          142  +    incr i
          143  +    puts -nonewline "$str"
          144  +  }
          145  +  puts ""
          146  +  puts "  \};"
          147  +  puts {
          148  +  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
          149  +  int iRes = 0;
          150  +  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
          151  +  int iLo = 0;
          152  +  while( iHi>=iLo ){
          153  +    int iTest = (iHi + iLo) / 2;
          154  +    if( key >= aDia[iTest] ){
          155  +      iRes = iTest;
          156  +      iLo = iTest+1;
          157  +    }else{
          158  +      iHi = iTest-1;
          159  +    }
          160  +  }
          161  +  assert( key>=aDia[iRes] );
          162  +  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
          163  +  puts "\};"
          164  +}
          165  +
          166  +proc print_isdiacritic {zFunc map} {
          167  +
          168  +  set lCode [list]
          169  +  foreach m $map {
          170  +    foreach {code char} $m {}
          171  +    if {$code && $char == ""} { lappend lCode $code }
          172  +  }
          173  +  set lCode [lsort -integer $lCode]
          174  +  set iFirst [lindex $lCode 0]
          175  +  set iLast [lindex $lCode end]
          176  +
          177  +  set i1 0
          178  +  set i2 0
          179  +
          180  +  foreach c $lCode {
          181  +    set i [expr $c - $iFirst]
          182  +    if {$i < 32} {
          183  +      set i1 [expr {$i1 | (1<<$i)}]
          184  +    } else {
          185  +      set i2 [expr {$i2 | (1<<($i-32))}]
          186  +    }
          187  +  }
          188  +
          189  +  puts "/*"
          190  +  puts "** Return true if the argument interpreted as a unicode codepoint" 
          191  +  puts "** is a diacritical modifier character."
          192  +  puts "*/"
          193  +  puts "int ${zFunc}\(int c)\{"
          194  +  puts "  unsigned int mask0 = [format "0x%08X" $i1];"
          195  +  puts "  unsigned int mask1 = [format "0x%08X" $i2];"
          196  +
          197  +  puts "  if( c<$iFirst || c>$iLast ) return 0;"
          198  +  puts "  return (c < $iFirst+32) ?"
          199  +  puts "      (mask0 & (1 << (c-$iFirst))) :"
          200  +  puts "      (mask1 & (1 << (c-$iFirst-32)));"
          201  +  puts "\}"
          202  +}
          203  +
          204  +
          205  +#-------------------------------------------------------------------------
     2    206   
     3    207   # Parameter $zName must be a path to the file UnicodeData.txt. This command
     4    208   # reads the file and returns a list of codepoints (integers). The list
     5    209   # contains all codepoints in the UnicodeData.txt assigned to any "General
     6    210   # Category" that is not a "Letter" or "Number".
     7    211   #
     8    212   proc an_load_unicodedata_text {zName} {
................................................................................
   389    593       incr i
   390    594     }
   391    595     puts ""
   392    596     puts "  \};"
   393    597   
   394    598   }
   395    599   
   396         -proc print_tolower {zFunc} {
          600  +proc print_fold {zFunc} {
   397    601   
   398    602     set lRecord [tl_create_records]
   399    603   
   400    604     set lHigh [list]
   401    605     puts "/*"
   402    606     puts "** Interpret the argument as a unicode codepoint. If the codepoint"
   403    607     puts "** is an upper case character that has a lower case equivalent,"
   404    608     puts "** return the codepoint corresponding to the lower case version."
   405    609     puts "** Otherwise, return a copy of the argument."
   406    610     puts "**"
   407    611     puts "** The results are undefined if the value passed to this function"
   408    612     puts "** is less than zero."
   409    613     puts "*/"
   410         -  puts "int ${zFunc}\(int c)\{"
          614  +  puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"
   411    615   
   412    616     set liOff [tl_generate_ioff_table $lRecord]
   413    617     tl_print_table_header
   414    618     foreach entry $lRecord { 
   415    619       if {[tl_print_table_entry toggle $entry $liOff]} { 
   416    620         lappend lHigh $entry 
   417    621       } 
................................................................................
   447    651       if( iRes>=0 ){
   448    652         const struct TableEntry *p = &aEntry[iRes];
   449    653         if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
   450    654           ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
   451    655           assert( ret>0 );
   452    656         }
   453    657       }
          658  +
          659  +    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
   454    660     }
   455    661     }
   456    662   
   457    663     foreach entry $lHigh {
   458    664       tl_print_if_entry $entry
   459    665     }
   460    666   
   461    667     puts ""
   462    668     puts "  return ret;"
   463    669     puts "\}"
   464    670   }
   465    671   
   466         -proc print_tolower_test {zFunc} {
          672  +proc print_fold_test {zFunc mappings} {
   467    673     global tl_lookup_table
   468    674   
   469         -  puts "static int tolower_test(int *piCode)\{"
          675  +  foreach m $mappings {
          676  +    set c [lindex $m 1]
          677  +    if {$c == ""} {
          678  +      set extra([lindex $m 0]) 0
          679  +    } else {
          680  +      scan $c %c i
          681  +      set extra([lindex $m 0]) $i
          682  +    }
          683  +  }
          684  +
          685  +  puts "static int fold_test(int *piCode)\{"
   470    686     puts -nonewline "  static int aLookup\[\] = \{"
   471    687     for {set i 0} {$i < 70000} {incr i} {
          688  +
   472    689       set expected $i
   473    690       catch { set expected $tl_lookup_table($i) }
   474         -    if {($i % 8)==0}  { puts "" ; puts -nonewline "    " }
   475         -    puts -nonewline "$expected, "
          691  +    set expected2 $expected
          692  +    catch { set expected2 $extra($expected2) }
          693  +
          694  +    if {($i % 4)==0}  { puts "" ; puts -nonewline "    " }
          695  +    puts -nonewline "$expected, $expected2, "
   476    696     }
   477    697     puts "  \};"
   478    698     puts "  int i;"
   479    699     puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
   480         -  puts "    if( ${zFunc}\(i)!=aLookup\[i\] )\{"
   481         -  puts "      *piCode = i;"
          700  +  puts "    int iCode = (i/2);"
          701  +  puts "    int bFlag = i & 0x0001;"
          702  +  puts "    if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
          703  +  puts "      *piCode = iCode;"
   482    704     puts "      return 1;"
   483    705     puts "    \}"
   484    706     puts "  \}"
   485    707     puts "  return 0;"
   486    708     puts "\}"
   487    709   }
   488    710   
................................................................................
   520    742     puts ""
   521    743     puts "int main(int argc, char **argv)\{"
   522    744     puts "  int r1, r2;"
   523    745     puts "  int code;"
   524    746     puts "  r1 = isalnum_test(&code);"
   525    747     puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
   526    748     puts "  else printf(\"isalnum(): test passed\\n\");"
   527         -  puts "  r2 = tolower_test(&code);"
   528         -  puts "  if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);"
   529         -  puts "  else printf(\"tolower(): test passed\\n\");"
          749  +  puts "  r2 = fold_test(&code);"
          750  +  puts "  if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
          751  +  puts "  else printf(\"fold(): test passed\\n\");"
   530    752     puts "  return (r1 || r2);"
   531    753     puts "\}"
   532    754   }
   533    755   
   534    756   # Process the command line arguments. Exit early if they are not to
   535    757   # our liking.
   536    758   #
................................................................................
   540    762     exit 1
   541    763   }
   542    764   if {[llength $argv]!=2 && [llength $argv]!=3} usage
   543    765   if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
   544    766   set unicodedata.txt [lindex $argv end]
   545    767   set casefolding.txt [lindex $argv end-1]
   546    768   set generate_test_code [expr {[llength $argv]==3}]
          769  +
          770  +print_fileheader
   547    771   
   548    772   # Print the isalnum() function to stdout.
   549    773   #
   550         -print_fileheader
   551    774   set lRange [an_load_separator_ranges]
   552    775   print_isalnum sqlite3FtsUnicodeIsalnum $lRange
   553    776   
   554    777   # Leave a gap between the two generated C functions.
   555    778   #
   556    779   puts ""
   557    780   puts ""
   558    781   
   559         -# Print the tolower() function to stdout.
   560         -#
          782  +# Load the fold data. This is used by the [rd_XXX] commands
          783  +# as well as [print_fold].
   561    784   tl_load_casefolding_txt ${casefolding.txt}
   562         -print_tolower sqlite3FtsUnicodeTolower
          785  +
          786  +set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
          787  +print_rd $mappings
          788  +puts ""
          789  +puts ""
          790  +print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings
          791  +puts ""
          792  +puts ""
          793  +
          794  +# Print the fold() function to stdout.
          795  +#
          796  +print_fold sqlite3FtsUnicodeFold
   563    797   
   564    798   # Print the test routines and main() function to stdout, if -test 
   565    799   # was specified.
   566    800   #
   567    801   if {$::generate_test_code} {
   568    802     print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
   569         -  print_tolower_test sqlite3FtsUnicodeTolower 
          803  +  print_fold_test sqlite3FtsUnicodeFold $mappings
   570    804     print_test_main 
   571    805   }
   572    806   
   573    807   puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
   574    808   puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"

Changes to test/fts4unicode.test.

    15     15   set testdir [file dirname $argv0]
    16     16   source $testdir/tester.tcl
    17     17   ifcapable !fts3_unicode { finish_test ; return }
    18     18   set ::testprefix fts4unicode
    19     19   
    20     20   proc do_unicode_token_test {tn input res} {
    21     21     set input [string map {' ''} $input]
           22  +  uplevel [list do_execsql_test $tn "
           23  +    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
           24  +  " [list [list {*}$res]]]
           25  +}
           26  +
           27  +proc do_unicode_token_test2 {tn input res} {
           28  +  set input [string map {' ''} $input]
    22     29     uplevel [list do_execsql_test $tn "
    23     30       SELECT fts3_tokenizer_test('unicode61', '$input');
    24     31     " [list [list {*}$res]]]
    25     32   }
    26     33   
    27     34   do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
    28     35   do_unicode_token_test 1.1 {  } {0   1   2  }
................................................................................
    35     42   
    36     43   do_unicode_token_test 1.6 "The quick brown fox" {
    37     44     0 the The 1 quick quick 2 brown brown 3 fox fox
    38     45   }
    39     46   do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
    40     47     0 the The 1 quick quick 2 brown brown 3 fox fox
    41     48   }
           49  +
           50  +do_unicode_token_test2 1.8  {a B c D} {0 a a 1 b B 2 c c 3 d D}
           51  +do_unicode_token_test2 1.9  {  } {0 a  1 o  2 u }
           52  +do_unicode_token_test2 1.10 {xx xx xx} {0 xax xx 1 xox xx 2 xux xx}
           53  +
           54  +# Check that diacritics are removed if remove_diacritics=1 is specified.
           55  +# And that they do not break tokens.
           56  +do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"
    42     57   
    43     58   #-------------------------------------------------------------------------
    44     59   #
    45     60   set docs [list {
    46     61     Enhance the INSERT syntax to allow multiple rows to be inserted via the
    47     62     VALUES clause.
    48     63   } {