/ Check-in [06177f3f]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5 and FTS3/4.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256:06177f3f114b5d804b84c27ac843740282e2176fdf0f7a999feda0e1b624adec
User & Date: dan 2018-12-03 16:14:49
Context
2018-12-03
17:40
Remove the unused sqlite3Fts5UnicodeNCat() function. check-in: 7149dacf user: drh tags: trunk
16:14
Add the "remove_diacritics=2" option to the unicode61 tokenizer in both FTS5 and FTS3/4. check-in: 06177f3f user: dan tags: trunk
14:58
Update the autoconf makefile for MSVC. check-in: 675aba1f user: mistachkin tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode.c.

78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
...
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240



241
242
243
244
245
246
247
...
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;
  int bRemoveDiacritic;
  int nException;
  int *aiException;
};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
................................................................................
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  int rc = SQLITE_OK;

  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;

  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);

    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->bRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;



    }
    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
    }
    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
    }
................................................................................
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);







|







 







|






|


|
>
>
>







 







|







78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
...
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
...
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;
  int eRemoveDiacritic;
  int nException;
  int *aiException;
};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
................................................................................
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  int rc = SQLITE_OK;

  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->eRemoveDiacritic = 1;

  for(i=0; rc==SQLITE_OK && i<nArg; i++){
    const char *z = azArg[i];
    int n = (int)strlen(z);

    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->eRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->eRemoveDiacritic = 0;
    }
    else if( n==19 && memcmp("remove_diacritics=2", z, 19)==0 ){
      pNew->eRemoveDiacritic = 2;
    }
    else if( n>=11 && memcmp("tokenchars=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 1, &z[11], n-11);
    }
    else if( n>=11 && memcmp("separators=", z, 11)==0 ){
      rc = unicodeAddExceptions(pNew, 0, &z[11], n-11);
    }
................................................................................
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold((int)iCode, p->eRemoveDiacritic);
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);

Changes to ext/fts3/fts3_unicode2.c.

155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170

171
172
173
174


175
176


177
178
179
180
181
182
183
184
185
186
187















188
189
190
191
192
193
194
...
197
198
199
200
201
202
203

204
205
206
207
208
209
210
211
...
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
...
347
348
349
350
351
352
353

354

355
356
357
358
359
360
361
362
363
364
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 

     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 


    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
    62924, 63050, 63082, 63274, 63390, 


  };
  char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
    'e',  'i',  'o',  'u',  'y',  















  };

  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
................................................................................
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}


/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
................................................................................
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }


    if( bRemoveDiacritic ) ret = remove_diacritic(ret);

  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */







|





|
|
|
>
|
<
|
|
>
>
|
<
>
>


<
<
<
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
|







 







|







 







>
|
>










155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

173
174
175
176
177

178
179
180
181



182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
...
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
...
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
...
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c, int bComplex){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896, 
     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106, 
     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344, 
     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198, 
     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468, 

    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 
    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 
    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 
    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 
    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 

    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 
    63182, 63242, 63274, 63310, 63368, 63390, 
  };
  char aChar[] = {



    '\0',      'a'|0x00,  'c'|0x00,  'e'|0x00,  'i'|0x00,  'n'|0x00,  
    'o'|0x00,  'u'|0x00,  'y'|0x00,  'y'|0x00,  'a'|0x00,  'c'|0x00,  
    'd'|0x00,  'e'|0x00,  'e'|0x00,  'g'|0x00,  'h'|0x00,  'i'|0x00,  
    'j'|0x00,  'k'|0x00,  'l'|0x00,  'n'|0x00,  'o'|0x00,  'r'|0x00,  
    's'|0x00,  't'|0x00,  'u'|0x00,  'u'|0x00,  'w'|0x00,  'y'|0x00,  
    'z'|0x00,  'o'|0x00,  'u'|0x00,  'a'|0x00,  'i'|0x00,  'o'|0x00,  
    'u'|0x00,  'u'|0x80,  'a'|0x80,  'g'|0x00,  'k'|0x00,  'o'|0x00,  
    'o'|0x80,  'j'|0x00,  'g'|0x00,  'n'|0x00,  'a'|0x80,  'a'|0x00,  
    'e'|0x00,  'i'|0x00,  'o'|0x00,  'r'|0x00,  'u'|0x00,  's'|0x00,  
    't'|0x00,  'h'|0x00,  'a'|0x00,  'e'|0x00,  'o'|0x80,  'o'|0x00,  
    'o'|0x80,  'y'|0x00,  '\0',      '\0',      '\0',      '\0',      
    '\0',      '\0',      '\0',      '\0',      'a'|0x00,  'b'|0x00,  
    'c'|0x80,  'd'|0x00,  'd'|0x00,  'e'|0x80,  'e'|0x00,  'e'|0x80,  
    'f'|0x00,  'g'|0x00,  'h'|0x00,  'h'|0x00,  'i'|0x00,  'i'|0x80,  
    'k'|0x00,  'l'|0x00,  'l'|0x80,  'l'|0x00,  'm'|0x00,  'n'|0x00,  
    'o'|0x80,  'p'|0x00,  'r'|0x00,  'r'|0x80,  'r'|0x00,  's'|0x00,  
    's'|0x80,  't'|0x00,  'u'|0x00,  'u'|0x80,  'v'|0x00,  'w'|0x00,  
    'w'|0x00,  'x'|0x00,  'y'|0x00,  'z'|0x00,  'h'|0x00,  't'|0x00,  
    'w'|0x00,  'y'|0x00,  'a'|0x00,  'a'|0x80,  'a'|0x80,  'a'|0x80,  
    'e'|0x00,  'e'|0x80,  'e'|0x80,  'i'|0x00,  'o'|0x00,  'o'|0x80,  
    'o'|0x80,  'o'|0x80,  'u'|0x00,  'u'|0x80,  'u'|0x80,  'y'|0x00,  
  };

  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
................................................................................
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}


/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
................................................................................
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int eRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( eRemoveDiacritic ){
      ret = remove_diacritic(ret, eRemoveDiacritic==2);
    }
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */

Changes to ext/fts3/unicode/mkunicode.tcl.

5
6
7
8
9
10
11

12
13
14
15
16
17
18
19
20
21
22
23
..
25
26
27
28
29
30
31

32
33
34

35
36
37
38

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
..
56
57
58
59
60
61
62
63
64
65

66



67
68
69
70
71
72
73
74
75
76
..
83
84
85
86
87
88
89

90
91
92
93
94
95
96
97
98

99
100
101
102
103
104
105
...
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
...
512
513
514
515
516
517
518

519

520
521
522
523
524
525
526
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
  set iFirst  [lindex $map 0 0]
  set cPrev   [lindex $map 0 1]


  foreach m [lrange $map 1 end] {
    foreach {i c} $m {}

    if {$cPrev == $c} {
      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
        if {[info exists tl_lookup_table($j)]==0} break
      }

      if {$j==$i} {
        set nNew [expr {(1 + $i - $iFirst)}]
        if {$nNew<=8} {
................................................................................
          continue
        }
      }
    }

    lappend lRange [list $iFirst $nRange]
    lappend aChar  $cPrev


    set iFirst $i
    set cPrev  $c

    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev


  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int ${::remove_diacritic}(int c)\{"
  puts "  unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i
................................................................................

    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
    puts -nonewline ", "
  }
  puts ""
  puts "  \};"
  puts "  char aChar\[\] = \{"
  puts -nonewline "    '\\0', "
  set i 1
  foreach c $aChar {

    set str "'$c',  "



    if {$c == ""} { set str "'\\0', " }

    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts "  \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
................................................................................
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
  puts "\}"
}

proc print_isdiacritic {zFunc map} {

  set lCode [list]
  foreach m $map {
    foreach {code char} $m {}

    if {$code && $char == ""} { lappend lCode $code }
  }
  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]

  set i1 0
................................................................................
  puts "** is an upper case character that has a lower case equivalent,"
  puts "** return the codepoint corresponding to the lower case version."
  puts "** Otherwise, return a copy of the argument."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"

  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
................................................................................
    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }


    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);

  }
  }]

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }








>


|

|







 







>



>




>









|







 







|

|
>
|
>
>
>
|

|







 







>
|







|
>







 







|







 







>
|
>







5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
..
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
..
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
..
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
...
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
...
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
  set iFirst  [lindex $map 0 0]
  set cPrev   [lindex $map 0 1]
  set fPrev   [lindex $map 0 2]

  foreach m [lrange $map 1 end] {
    foreach {i c f} $m {}

    if {$cPrev == $c && $fPrev==$f} {
      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
        if {[info exists tl_lookup_table($j)]==0} break
      }

      if {$j==$i} {
        set nNew [expr {(1 + $i - $iFirst)}]
        if {$nNew<=8} {
................................................................................
          continue
        }
      }
    }

    lappend lRange [list $iFirst $nRange]
    lappend aChar  $cPrev
    lappend aFlag  $fPrev

    set iFirst $i
    set cPrev  $c
    set fPrev  $f
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev
  lappend aFlag $fPrev

  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int ${::remove_diacritic}(int c, int bComplex)\{"
  puts "  unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i
................................................................................

    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
    puts -nonewline ", "
  }
  puts ""
  puts "  \};"
  puts "  char aChar\[\] = \{"
  puts -nonewline "    '\\0',      "
  set i 1
  foreach c $aChar f $aFlag {
    if { $f } {
      set str "'$c'|0x80,  "
    } else {
      set str "'$c'|0x00,  "
    }
    if {$c == ""} { set str "'\\0',      " }

    if {($i % 6)==0} {puts "" ; puts -nonewline "    " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts "  \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
................................................................................
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);}
  puts "\}"
}

proc print_isdiacritic {zFunc map} {

  set lCode [list]
  foreach m $map {
    foreach {code char flag} $m {}
    if {$flag} continue
    if {$code && $char == ""} { lappend lCode $code }
  }
  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]

  set i1 0
................................................................................
  puts "** is an upper case character that has a lower case equivalent,"
  puts "** return the codepoint corresponding to the lower case version."
  puts "** Otherwise, return a copy of the argument."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c, int eRemoveDiacritic)\{"

  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
................................................................................
    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( eRemoveDiacritic ){
      ret = ${::remove_diacritic}(ret, eRemoveDiacritic==2);
    }
  }
  }]

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

Changes to ext/fts3/unicode/parseunicode.tcl.

3
4
5
6
7
8
9
10
11
12
13
14












15
16
17
18
19
20
21
..
49
50
51
52
53
54
55


56








57
58
59
60
61

62
63
64
65
66
67
68
69
70
71
72
73
74
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the 
# codepoint should simply be removed from the input. Examples:
#
#   { 224 a  }     (replace codepoint 224 to "a")
#   { 769 "" }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.












#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  set fd [open $zName]
  set lField {
    code
................................................................................
      continue
    }

    set iCode  [expr "0x$code"]
    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]



    if {[info exists tl_lookup_table($iCode)]} continue









    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]

      set dia($iDia) 1
    }
  }

  foreach d [array names dia] {
    lappend lRet [list $d ""]
  }
  set lRet [lsort -integer -index 0 $lRet]

  close $fd
  set lRet
}








|
|



>
>
>
>
>
>
>
>
>
>
>
>







 







>
>

>
>
>
>
>
>
>
>




|
>





|







3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
..
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the 
# codepoint should simply be removed from the input. Examples:
#
#   { 224 a  0 }     (replace codepoint 224 to "a")
#   { 769 "" 0 }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
# The third value in the list is always either 0 or 1. 0 if the 
# UnicodeData.txt file maps the codepoint to a single ASCII character and
# a diacritic, or 1 if the mapping is indirect. For example, consider the 
# two entries:
#
# 1ECD;LATIN SMALL LETTER O WITH DOT BELOW;Ll;0;L;006F 0323;;;;N;;;1ECC;;1ECC
# 1ED9;LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW;Ll;0;L;1ECD 0302;;;;N;;;1ED8;;1ED8
#
# The first codepoint is a direct mapping (as 006F is ASCII and 0323 is a 
# diacritic). The second is an indirect mapping, as it maps to the
# first codepoint plus 0302 (a diacritic).
#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  set fd [open $zName]
  set lField {
    code
................................................................................
      continue
    }

    set iCode  [expr "0x$code"]
    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]

    # Filter out upper-case characters, as they will be mapped to their
    # lower-case equivalents before this data is used.
    if {[info exists tl_lookup_table($iCode)]} continue

    # Check if this is an indirect mapping. If so, set bIndirect to true
    # and change $iAscii to the indirectly mappped ASCII character.
    set bIndirect 0
    if {[info exists dia($iDia)] && [info exists mapping($iAscii)]} {
      set iAscii $mapping($iAscii)
      set bIndirect 1
    }

    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]] $bIndirect]
      set mapping($iCode) $iAscii
      set dia($iDia) 1
    }
  }

  foreach d [array names dia] {
    lappend lRet [list $d "" 0]
  }
  set lRet [lsort -integer -index 0 $lRet]

  close $fd
  set lRet
}

Changes to ext/fts5/fts5_tokenize.c.

230
231
232
233
234
235
236
237
238
239
240
241
242





243
244
245
246
247
248
249
...
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
...
378
379
380
381
382
383
384
385
386
387
388





389
390
391
392
393
394
395
...
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;
  int *aiException;

  unsigned char aCategory[32];    /* True for token char categories */
};






static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
  const char *z,                  /* Characters to treat as exceptions */
  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
){
  int rc = SQLITE_OK;
................................................................................
  }else{
    p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
    if( p ){
      const char *zCat = "L* N* Co";
      int i;
      memset(p, 0, sizeof(Unicode61Tokenizer));

      p->bRemoveDiacritic = 1;
      p->nFold = 64;
      p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
      if( p->aFold==0 ){
        rc = SQLITE_NOMEM;
      }

      /* Search for a "categories" argument */
................................................................................
      if( rc==SQLITE_OK ){
        rc = unicodeSetCategories(p, zCat);
      }

      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
          if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
            rc = SQLITE_ERROR;
          }
          p->bRemoveDiacritic = (zArg[0]=='1');





        }else
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 0);
        }else
................................................................................

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break; 







|





>
>
>
>
>







 







|







 







|

|
|
>
>
>
>
>







 







|







230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
...
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
...
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
...
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int eRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;
  int *aiException;

  unsigned char aCategory[32];    /* True for token char categories */
};

/* Values for eRemoveDiacritic (must match internals of fts5_unicode2.c) */
#define FTS5_REMOVE_DIACRITICS_NONE    0
#define FTS5_REMOVE_DIACRITICS_SIMPLE  1
#define FTS5_REMOVE_DIACRITICS_COMPLEX 2

static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
  const char *z,                  /* Characters to treat as exceptions */
  int bTokenChars                 /* 1 for 'tokenchars', 0 for 'separators' */
){
  int rc = SQLITE_OK;
................................................................................
  }else{
    p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
    if( p ){
      const char *zCat = "L* N* Co";
      int i;
      memset(p, 0, sizeof(Unicode61Tokenizer));

      p->eRemoveDiacritic = FTS5_REMOVE_DIACRITICS_SIMPLE;
      p->nFold = 64;
      p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
      if( p->aFold==0 ){
        rc = SQLITE_NOMEM;
      }

      /* Search for a "categories" argument */
................................................................................
      if( rc==SQLITE_OK ){
        rc = unicodeSetCategories(p, zCat);
      }

      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
          if( (zArg[0]!='0' && zArg[0]!='1' && zArg[0]!='2') || zArg[1] ){
            rc = SQLITE_ERROR;
          }else{
            p->eRemoveDiacritic = (zArg[0] - '0');
            assert( p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_NONE
                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_SIMPLE
                 || p->eRemoveDiacritic==FTS5_REMOVE_DIACRITICS_COMPLEX
            );
          }
        }else
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 0);
        }else
................................................................................

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->eRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break; 

Changes to ext/fts5/fts5_unicode2.c.

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39

40
41
42
43


44
45


46
47
48
49
50
51
52
53
54
55
56















57
58
59
60
61
62
63
..
66
67
68
69
70
71
72

73
74
75
76
77
78
79
80
..
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
...
216
217
218
219
220
221
222

223

224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
...
752
753
754
755
756
757
758
759
760
761
762
763
764
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int fts5_remove_diacritic(int c){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 

     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 


    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
    62924, 63050, 63082, 63274, 63390, 


  };
  char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
    'e',  'i',  'o',  'u',  'y',  















  };

  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
................................................................................
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );

  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
}


/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
................................................................................
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3Fts5UnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }


    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);

  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}


#if 0
int sqlite3Fts5UnicodeNCat(void) { 
  return 32;
}
#endif

int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ 
  aArray[0] = 1;
  switch( zCat[0] ){
    case 'C':
          switch( zCat[1] ){
            case 'c': aArray[1] = 1; break;
................................................................................
void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
  int i = 0;
  int iTbl = 0;
  while( i<128 ){
    int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
    int n = (aFts5UnicodeData[iTbl] >> 5) + i;
    for(; i<128 && i<n; i++){
      aAscii[i] = (u8)bToken;
    }
    iTbl++;
  }
}








|





|
|
|
>
|
<
|
|
>
>
|
<
>
>


<
<
<
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
|







 







|







 







>
|
>










<



<







 







|





24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

42
43
44
45
46

47
48
49
50



51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
..
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
...
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
...
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251

252
253
254

255
256
257
258
259
260
261
...
768
769
770
771
772
773
774
775
776
777
778
779
780
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int fts5_remove_diacritic(int c, int bComplex){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3766,  3832,  3896, 
     3912,  3928,  3944,  3968,  4008,  4040,  4056,  4106, 
     4138,  4170,  4202,  4234,  4266,  4296,  4312,  4344, 
     4408,  4424,  4442,  4472,  4488,  4504,  6148,  6198, 
     6264,  6280,  6360,  6429,  6505,  6529, 61448, 61468, 

    61512, 61534, 61592, 61610, 61642, 61672, 61688, 61704, 
    61726, 61784, 61800, 61816, 61836, 61880, 61896, 61914, 
    61948, 61998, 62062, 62122, 62154, 62184, 62200, 62218, 
    62252, 62302, 62364, 62410, 62442, 62478, 62536, 62554, 
    62584, 62604, 62640, 62648, 62656, 62664, 62730, 62766, 

    62830, 62890, 62924, 62974, 63032, 63050, 63082, 63118, 
    63182, 63242, 63274, 63310, 63368, 63390, 
  };
  char aChar[] = {



    '\0',      'a'|0x00,  'c'|0x00,  'e'|0x00,  'i'|0x00,  'n'|0x00,  
    'o'|0x00,  'u'|0x00,  'y'|0x00,  'y'|0x00,  'a'|0x00,  'c'|0x00,  
    'd'|0x00,  'e'|0x00,  'e'|0x00,  'g'|0x00,  'h'|0x00,  'i'|0x00,  
    'j'|0x00,  'k'|0x00,  'l'|0x00,  'n'|0x00,  'o'|0x00,  'r'|0x00,  
    's'|0x00,  't'|0x00,  'u'|0x00,  'u'|0x00,  'w'|0x00,  'y'|0x00,  
    'z'|0x00,  'o'|0x00,  'u'|0x00,  'a'|0x00,  'i'|0x00,  'o'|0x00,  
    'u'|0x00,  'u'|0x80,  'a'|0x80,  'g'|0x00,  'k'|0x00,  'o'|0x00,  
    'o'|0x80,  'j'|0x00,  'g'|0x00,  'n'|0x00,  'a'|0x80,  'a'|0x00,  
    'e'|0x00,  'i'|0x00,  'o'|0x00,  'r'|0x00,  'u'|0x00,  's'|0x00,  
    't'|0x00,  'h'|0x00,  'a'|0x00,  'e'|0x00,  'o'|0x80,  'o'|0x00,  
    'o'|0x80,  'y'|0x00,  '\0',      '\0',      '\0',      '\0',      
    '\0',      '\0',      '\0',      '\0',      'a'|0x00,  'b'|0x00,  
    'c'|0x80,  'd'|0x00,  'd'|0x00,  'e'|0x80,  'e'|0x00,  'e'|0x80,  
    'f'|0x00,  'g'|0x00,  'h'|0x00,  'h'|0x00,  'i'|0x00,  'i'|0x80,  
    'k'|0x00,  'l'|0x00,  'l'|0x80,  'l'|0x00,  'm'|0x00,  'n'|0x00,  
    'o'|0x80,  'p'|0x00,  'r'|0x00,  'r'|0x80,  'r'|0x00,  's'|0x00,  
    's'|0x80,  't'|0x00,  'u'|0x00,  'u'|0x80,  'v'|0x00,  'w'|0x00,  
    'w'|0x00,  'x'|0x00,  'y'|0x00,  'z'|0x00,  'h'|0x00,  't'|0x00,  
    'w'|0x00,  'y'|0x00,  'a'|0x00,  'a'|0x80,  'a'|0x80,  'a'|0x80,  
    'e'|0x00,  'e'|0x80,  'e'|0x80,  'i'|0x00,  'o'|0x00,  'o'|0x80,  
    'o'|0x80,  'o'|0x80,  'u'|0x00,  'u'|0x80,  'u'|0x80,  'y'|0x00,  
  };

  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
................................................................................
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  if( bComplex==0 && (aChar[iRes] & 0x80) ) return c;
  return (c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : ((int)aChar[iRes] & 0x7F);
}


/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
................................................................................
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3Fts5UnicodeFold(int c, int eRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
    assert( iRes>=0 && c>=aEntry[iRes].iCode );
    p = &aEntry[iRes];
    if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
      ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
      assert( ret>0 );
    }

    if( eRemoveDiacritic ){
      ret = fts5_remove_diacritic(ret, eRemoveDiacritic==2);
    }
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}



int sqlite3Fts5UnicodeNCat(void) { 
  return 32;
}


int sqlite3Fts5UnicodeCatParse(const char *zCat, u8 *aArray){ 
  aArray[0] = 1;
  switch( zCat[0] ){
    case 'C':
          switch( zCat[1] ){
            case 'c': aArray[1] = 1; break;
................................................................................
void sqlite3Fts5UnicodeAscii(u8 *aArray, u8 *aAscii){
  int i = 0;
  int iTbl = 0;
  while( i<128 ){
    int bToken = aArray[ aFts5UnicodeData[iTbl] & 0x1F ];
    int n = (aFts5UnicodeData[iTbl] >> 5) + i;
    for(; i<128 && i<n; i++){
      aAscii[i] = bToken;
    }
    iTbl++;
  }
}

Changes to ext/fts5/test/fts5tokenizer.test.

185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 2'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}







|







185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 tokenchars');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.2 {
  CREATE VIRTUAL TABLE a3 USING fts5(x, y, tokenize = 'unicode61 a b');
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.3 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 3'
  );
} {1 {error in tokenizer constructor}}
do_catchsql_test 6.4 {
  CREATE VIRTUAL TABLE a3 USING fts5(
    x, y, tokenize = 'unicode61 remove_diacritics 10'
  );
} {1 {error in tokenizer constructor}}

Added ext/fts5/test/fts5umlaut.test.



































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# 2014 June 17
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5umlaut

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}

do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(
      x, 
      tokenize="unicode61 remove_diacritics 2"
  );
}

foreach {tn q res1 res2} {
  1 "Hà Nội"                  0 1
  2 "Hà Noi"                  1 1
  3 "Ha Noi"                  1 1
  4 "Ha N\u1ed9i"             0 1
  5 "Ha N\u006fi"             1 1
  6 "Ha N\u006f\u0302i"       1 1
  7 "Ha N\u006f\u0323\u0302i" 1 1
} {
  do_execsql_test 1.$tn.1 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t1($q)
  } $res1
  do_execsql_test 1.$tn.2 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t1('Ha Noi')
  } $res1

  do_execsql_test 1.$tn.2 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t2($q)
  } $res2
  do_execsql_test 1.$tn.2 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t2('Ha Noi')
  } $res2
}

finish_test

Changes to ext/fts5/test/fts5unicode3.test.

32
33
34
35
36
37
38
39
40
41
42
43
44
45

46
47
48
49
50

51
52
53
54
55
56
57
58
59
60
61
62
63
..
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96










97
98
99
100
101
102
103

tl_load_casefolding_txt $CF
foreach x [an_load_unicodedata_text $UD] {
  set aNotAlnum($x) 1
}

foreach {y} [rd_load_unicodedata_text $UD] {
  foreach {code ascii} $y {}
  if {$ascii==""} {
    set int 0
  } else {
    binary scan $ascii c int
  }
  set aDiacritic($code) $int

}

proc tcl_fold {i {bRemoveDiacritic 0}} {
  global tl_lookup_table
  global aDiacritic


  if {[info exists tl_lookup_table($i)]} {
    set i $tl_lookup_table($i)
  }
  if {$bRemoveDiacritic && [info exists aDiacritic($i)]} {
    set i $aDiacritic($i)
  }
  expr $i
}
db func tcl_fold tcl_fold

proc tcl_isalnum {i} {
  global aNotAlnum
................................................................................
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
} {0 {}}

do_execsql_test 1.2 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii 
  WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
} {0 {}}











do_execsql_test 1.3 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )







|





|
>





>




|
|







 







|








>
>
>
>
>
>
>
>
>
>







32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
..
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115

tl_load_casefolding_txt $CF
foreach x [an_load_unicodedata_text $UD] {
  set aNotAlnum($x) 1
}

foreach {y} [rd_load_unicodedata_text $UD] {
  foreach {code ascii f} $y {}
  if {$ascii==""} {
    set int 0
  } else {
    binary scan $ascii c int
  }
  set aDiacritic($code,$f) $int
  if {$f==0} { set aDiacritic($code,1) $int }
}

proc tcl_fold {i {bRemoveDiacritic 0}} {
  global tl_lookup_table
  global aDiacritic
  set f [expr $bRemoveDiacritic==2]

  if {[info exists tl_lookup_table($i)]} {
    set i $tl_lookup_table($i)
  }
  if {$bRemoveDiacritic && [info exists aDiacritic($i,$f)]} {
    set i $aDiacritic($i,$f)
  }
  expr $i
}
db func tcl_fold tcl_fold

proc tcl_isalnum {i} {
  global aNotAlnum
................................................................................
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii WHERE fts5_fold(i)!=CAST(tcl_fold(i) AS int);
} {0 {}}

do_execsql_test 1.2.1 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii 
  WHERE fts5_fold(i,1)!=CAST(tcl_fold(i,1) AS int);
} {0 {}}

do_execsql_test 1.2.2 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )
  SELECT count(*), min(i) FROM ii 
  WHERE fts5_fold(i,2)!=CAST(tcl_fold(i,2) AS int);
} {0 {}}

do_execsql_test 1.3 {
  WITH ii(i) AS (
    SELECT -1
    UNION ALL
    SELECT i+1 FROM ii WHERE i<100000
  )

Added test/fts4umlaut.test.



































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# 2018 December 3
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
set testprefix fts4umlaut

ifcapable !fts3 {
  finish_test
  return
}

do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts4(
      x, 
      tokenize=unicode61 "remove_diacritics=2"
  );
}

foreach {tn q res1 res2} {
  1 "Hà Nội"                  0 1
  2 "Hà Noi"                  1 1
  3 "Ha Noi"                  1 1
  4 "Ha N\u1ed9i"             0 1
  5 "Ha N\u006fi"             1 1
  6 "Ha N\u006f\u0302i"       1 1
  7 "Ha N\u006f\u0323\u0302i" 1 1
} {
  do_execsql_test 1.$tn.1 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t1 WHERE t1 MATCH $q
  } $res1
  do_execsql_test 1.$tn.2 {
    DELETE FROM t1;
    INSERT INTO t1(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t1 WHERE t1 MATCH 'Ha Noi'
  } $res1

  do_execsql_test 1.$tn.2 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, 'Ha Noi');
    SELECT count(*) FROM t2 WHERE t2 MATCH $q
  } $res2
  do_execsql_test 1.$tn.2 {
    DELETE FROM t2;
    INSERT INTO t2(rowid, x) VALUES (1, $q);
    SELECT count(*) FROM t2 WHERE t2 MATCH 'Ha Noi'
  } $res2
}

finish_test