/ Check-in [790f76a5]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Have the FTS unicode61 strip out diacritics when tokenizing text. This can be disabled by specifying the tokenizer option "remove_diacritics=0".
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:790f76a5898dad1a955d40edddf11f7b0fec0ccd
User & Date: dan 2012-06-06 19:30:38
Context
2012-06-06
19:51
Disable FTS unicode61 by default. It is enabled by specifying compile time option SQLITE_ENABLE_FTS4_UNICODE61. check-in: eccd6b65 user: dan tags: trunk
19:30
Have the FTS unicode61 strip out diacritics when tokenizing text. This can be disabled by specifying the tokenizer option "remove_diacritics=0". check-in: 790f76a5 user: dan tags: trunk
19:01
Avoid resetting the shared-cache schema when on of the connections using the shared cache closes. Delay resetting the schema until the last connection closes. check-in: 635e3a76 user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3Int.h.

538
539
540
541
542
543
544

545
546


547
548
549
int sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol, char **); 
int sqlite3Fts3MsrOvfl(Fts3Cursor *, Fts3MultiSegReader *, int *);
int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);

int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);

/* fts3_unicode2.c (functions generated by parsing unicode text files) */

int sqlite3FtsUnicodeTolower(int);
int sqlite3FtsUnicodeIsalnum(int);



#endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
#endif /* _FTSINT_H */







>
|

>
>



538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
int sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol, char **); 
int sqlite3Fts3MsrOvfl(Fts3Cursor *, Fts3MultiSegReader *, int *);
int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);

int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);

/* fts3_unicode2.c (functions generated by parsing unicode text files) */
#ifndef SQLITE_DISABLE_FTS3_UNICODE
int sqlite3FtsUnicodeFold(int, int);
int sqlite3FtsUnicodeIsalnum(int);
int sqlite3FtsUnicodeIsdiacritic(int);
#endif

#endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
#endif /* _FTSINT_H */

Changes to ext/fts3/fts3_unicode.c.

78
79
80
81
82
83
84

85
86
87
88
89
90
91
..
99
100
101
102
103
104
105

106
107
108
109
110


















111
112
113
114
115
116
117
...
193
194
195
196
197
198
199


200
201
202
203
204
205
206
207
208
209
210




211

212
213
214
215
216


217
218
219
220
221
222
223
#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;

};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
................................................................................
*/
static int unicodeCreate(
  int nArg,                       /* Size of array argv[] */
  const char * const *azArg,      /* Tokenizer creation arguments */
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
  unicode_tokenizer *pNew;        /* New tokenizer object */

  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ){
    return SQLITE_NOMEM;
  }
  memset(pNew, 0, sizeof(unicode_tokenizer));


















  *pp = &pNew->base;
  return SQLITE_OK;
}

/*
** Destroy a tokenizer allocated by unicodeCreate().
*/
................................................................................
    if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;

  zOut = pCsr->zToken;
  do {


    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;




    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));


    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
  }while( sqlite3FtsUnicodeIsalnum(iCode) );



  /* Set the output variables and return. */
  pCsr->iOff = (z - pCsr->aInput);
  *paToken = pCsr->zToken;
  *pnToken = zOut - pCsr->zToken;
  *piStart = (zStart - pCsr->aInput);
  *piEnd = (zEnd - pCsr->aInput);







>







 







>





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>











>
>
>
>
|
>




|
>
>







78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
...
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
...
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#endif /* ifndef SQLITE_AMALGAMATION */

typedef struct unicode_tokenizer unicode_tokenizer;
typedef struct unicode_cursor unicode_cursor;

struct unicode_tokenizer {
  sqlite3_tokenizer base;
  int bRemoveDiacritic;
};

struct unicode_cursor {
  sqlite3_tokenizer_cursor base;
  const unsigned char *aInput;    /* Input text being tokenized */
  int nInput;                     /* Size of aInput[] in bytes */
  int iOff;                       /* Current offset within aInput[] */
................................................................................
*/
static int unicodeCreate(
  int nArg,                       /* Size of array argv[] */
  const char * const *azArg,      /* Tokenizer creation arguments */
  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
){
  unicode_tokenizer *pNew;        /* New tokenizer object */
  int i;
  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
  if( pNew==NULL ){
    return SQLITE_NOMEM;
  }
  memset(pNew, 0, sizeof(unicode_tokenizer));
  pNew->bRemoveDiacritic = 1;

  for(i=0; i<nArg; i++){
    const char *z = azArg[i];
    int n = strlen(z);

    if( n==19 && memcmp("remove_diacritics=1", z, 19)==0 ){
      pNew->bRemoveDiacritic = 1;
    }
    else if( n==19 && memcmp("remove_diacritics=0", z, 19)==0 ){
      pNew->bRemoveDiacritic = 0;
    }
    else{
      /* Unrecognized argument */
      return SQLITE_ERROR;
    }
  }

  *pp = &pNew->base;
  return SQLITE_OK;
}

/*
** Destroy a tokenizer allocated by unicodeCreate().
*/
................................................................................
    if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
    zStart = z;
  }
  if( zStart>=zTerm ) return SQLITE_DONE;

  zOut = pCsr->zToken;
  do {
    int iOut;

    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    iOut = sqlite3FtsUnicodeFold(iCode, 
        ((unicode_tokenizer *)pCsr->base.pTokenizer)->bRemoveDiacritic
    );
    if( iOut ){
      WRITE_UTF8(zOut, iOut);
    }

    /* If the cursor is not at EOF, read the next character */
    if( z>=zTerm ) break;
    READ_UTF8(z, zTerm, iCode);
  }while( sqlite3FtsUnicodeIsalnum(iCode) 
       || sqlite3FtsUnicodeIsdiacritic(iCode)
  );

  /* Set the output variables and return. */
  pCsr->iOff = (z - pCsr->aInput);
  *paToken = pCsr->zToken;
  *pnToken = zOut - pCsr->zToken;
  *piStart = (zStart - pCsr->aInput);
  *piEnd = (zEnd - pCsr->aInput);

Changes to ext/fts3/fts3_unicode2.c.

147
148
149
150
151
152
153




































































154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
...
280
281
282
283
284
285
286


287
288
289
290
291
292
293
294
295
296
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}






































































/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeTolower(int c){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }


  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










|







 







>
>










147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
...
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
    assert( aEntry[0]<key );
    assert( key>=aEntry[iRes] );
    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}


/*
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
     4266,  4296,  4312,  4344,  4408,  4424,  4472,  4504, 
     6148,  6198,  6264,  6280,  6360,  6429,  6505,  6529, 
    61448, 61468, 61534, 61592, 61642, 61688, 61704, 61726, 
    61784, 61800, 61836, 61880, 61914, 61948, 61998, 62122, 
    62154, 62200, 62218, 62302, 62364, 62442, 62478, 62536, 
    62554, 62584, 62604, 62640, 62648, 62656, 62664, 62730, 
    62924, 63050, 63082, 63274, 63390, 
  };
  char aChar[] = {
    '\0', 'a',  'c',  'e',  'i',  'n',  'o',  'u',  'y',  'y',  'a',  'c',  
    'd',  'e',  'e',  'g',  'h',  'i',  'j',  'k',  'l',  'n',  'o',  'r',  
    's',  't',  'u',  'u',  'w',  'y',  'z',  'o',  'u',  'a',  'i',  'o',  
    'u',  'g',  'k',  'o',  'j',  'g',  'n',  'a',  'e',  'i',  'o',  'r',  
    'u',  's',  't',  'h',  'a',  'e',  'o',  'y',  '\0', '\0', '\0', '\0', 
    '\0', '\0', '\0', '\0', 'a',  'b',  'd',  'd',  'e',  'f',  'g',  'h',  
    'h',  'i',  'k',  'l',  'l',  'm',  'n',  'p',  'r',  'r',  's',  't',  
    'u',  'v',  'w',  'w',  'x',  'y',  'z',  'h',  't',  'w',  'y',  'a',  
    'e',  'i',  'o',  'u',  'y',  
  };

  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
};


/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
int sqlite3FtsUnicodeIsdiacritic(int c){
  unsigned int mask0 = 0x08029FDF;
  unsigned int mask1 = 0x000361F8;
  if( c<768 || c>817 ) return 0;
  return (c < 768+32) ?
      (mask0 & (1 << (c-768))) :
      (mask1 & (1 << (c-768-32)));
}


/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If the least significant bit in flags is clear, then the rule applies
  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
  ** need to be folded). Or, if it is set, then the rule only applies to
................................................................................
    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */
#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */

Changes to ext/fts3/unicode/mkunicode.tcl.

1












































































































































































































2
3
4
5
6
7
8
...
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
...
447
448
449
450
451
452
453


454
455
456
457
458
459
460
461
462
463
464
465
466
467
468










469
470
471

472
473



474
475
476
477
478
479


480
481
482
483
484
485
486
487
488
...
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
...
540
541
542
543
544
545
546


547
548
549
550
551
552
553
554
555
556
557
558












559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574














































































































































































































# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
................................................................................
    incr i
  }
  puts ""
  puts "  \};"

}

proc print_tolower {zFunc} {

  set lRecord [tl_create_records]

  set lHigh [list]
  puts "/*"
  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
  puts "** is an upper case character that has a lower case equivalent,"
  puts "** return the codepoint corresponding to the lower case version."
  puts "** Otherwise, return a copy of the argument."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"

  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
................................................................................
    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }


  }
  }

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

  puts ""
  puts "  return ret;"
  puts "\}"
}

proc print_tolower_test {zFunc} {
  global tl_lookup_table











  puts "static int tolower_test(int *piCode)\{"
  puts -nonewline "  static int aLookup\[\] = \{"
  for {set i 0} {$i < 70000} {incr i} {

    set expected $i
    catch { set expected $tl_lookup_table($i) }



    if {($i % 8)==0}  { puts "" ; puts -nonewline "    " }
    puts -nonewline "$expected, "
  }
  puts "  \};"
  puts "  int i;"
  puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"


  puts "    if( ${zFunc}\(i)!=aLookup\[i\] )\{"
  puts "      *piCode = i;"
  puts "      return 1;"
  puts "    \}"
  puts "  \}"
  puts "  return 0;"
  puts "\}"
}

................................................................................
  puts ""
  puts "int main(int argc, char **argv)\{"
  puts "  int r1, r2;"
  puts "  int code;"
  puts "  r1 = isalnum_test(&code);"
  puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
  puts "  else printf(\"isalnum(): test passed\\n\");"
  puts "  r2 = tolower_test(&code);"
  puts "  if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);"
  puts "  else printf(\"tolower(): test passed\\n\");"
  puts "  return (r1 || r2);"
  puts "\}"
}

# Proces the command line arguments. Exit early if they are not to
# our liking.
#
................................................................................
  exit 1
}
if {[llength $argv]!=2 && [llength $argv]!=3} usage
if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]
set generate_test_code [expr {[llength $argv]==3}]



# Print the isalnum() function to stdout.
#
print_fileheader
set lRange [an_load_separator_ranges]
print_isalnum sqlite3FtsUnicodeIsalnum $lRange

# Leave a gap between the two generated C functions.
#
puts ""
puts ""













# Print the tolower() function to stdout.
#
tl_load_casefolding_txt ${casefolding.txt}
print_tolower sqlite3FtsUnicodeTolower

# Print the test routines and main() function to stdout, if -test 
# was specified.
#
if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
  print_tolower_test sqlite3FtsUnicodeTolower 
  print_test_main 
}

puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







|













|







 







>
>












|


>
>
>
>
>
>
>
>
>
>
|


>


>
>
>
|
|




>
>
|
|







 







|
|
|







 







>
>



<








>
>
>
>
>
>
>
>
>
>
>
>
|

<
|






|





1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
...
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
...
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
...
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
...
762
763
764
765
766
767
768
769
770
771
772
773

774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795

796
797
798
799
800
801
802
803
804
805
806
807
808

#
# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of mappings required to remove all
# diacritical marks from a unicode string. Each mapping is itself a list
# consisting of two elements - the unicode codepoint and the single ASCII
# character that it should be replaced with, or an empty string if the 
# codepoint should simply be removed from the input. Examples:
#
#   { 224 a  }     (replace codepoint 224 to "a")
#   { 769 "" }     (remove codepoint 769 from input)
#
# Mappings are only returned for non-upper case codepoints. It is assumed
# that the input has already been folded to lower case.
#
proc rd_load_unicodedata_text {zName} {
  global tl_lookup_table

  set fd [open $zName]
  set lField {
    code
    character_name
    general_category
    canonical_combining_classes
    bidirectional_category
    character_decomposition_mapping
    decimal_digit_value
    digit_value
    numeric_value
    mirrored
    unicode_1_name
    iso10646_comment_field
    uppercase_mapping
    lowercase_mapping
    titlecase_mapping
  }
  set lRet [list]

  while { ![eof $fd] } {
    set line [gets $fd]
    if {$line == ""} continue

    set fields [split $line ";"]
    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
    foreach $lField $fields {}
    if { [llength $character_decomposition_mapping]!=2
      || [string is xdigit [lindex $character_decomposition_mapping 0]]==0
    } {
      continue
    }

    set iCode  [expr "0x$code"]
    set iAscii [expr "0x[lindex $character_decomposition_mapping 0]"]
    set iDia   [expr "0x[lindex $character_decomposition_mapping 1]"]

    if {[info exists tl_lookup_table($iCode)]} continue

    if { ($iAscii >= 97 && $iAscii <= 122)
      || ($iAscii >= 65 && $iAscii <= 90)
    } {
      lappend lRet [list $iCode [string tolower [format %c $iAscii]]]
      set dia($iDia) 1
    }
  }

  foreach d [array names dia] {
    lappend lRet [list $d ""]
  }
  set lRet [lsort -integer -index 0 $lRet]

  close $fd
  set lRet
}


proc print_rd {map} {
  global tl_lookup_table
  set aChar [list]
  set lRange [list]

  set nRange 1
  set iFirst  [lindex $map 0 0]
  set cPrev   [lindex $map 0 1]

  foreach m [lrange $map 1 end] {
    foreach {i c} $m {}

    if {$cPrev == $c} {
      for {set j [expr $iFirst+$nRange]} {$j<$i} {incr j} {
        if {[info exists tl_lookup_table($j)]==0} break
      }

      if {$j==$i} {
        set nNew [expr {(1 + $i - $iFirst)}]
        if {$nNew<=8} {
          set nRange $nNew
          continue
        }
      }
    }

    lappend lRange [list $iFirst $nRange]
    lappend aChar  $cPrev

    set iFirst $i
    set cPrev  $c
    set nRange 1
  }
  lappend lRange [list $iFirst $nRange]
  lappend aChar $cPrev

  puts "/*"
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int remove_diacritic(int c)\{"
  puts "  unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i

    puts -nonewline [format "%5d" [expr ($iCode<<3) + $nRange-1]]
    puts -nonewline ", "
  }
  puts ""
  puts "  \};"
  puts "  char aChar\[\] = \{"
  puts -nonewline "    '\\0', "
  set i 1
  foreach c $aChar {
    set str "'$c',  "
    if {$c == ""} { set str "'\\0', " }

    if {($i % 12)==0} {puts "" ; puts -nonewline "    " }
    incr i
    puts -nonewline "$str"
  }
  puts ""
  puts "  \};"
  puts {
  unsigned int key = (((unsigned int)c)<<3) | 0x00000007;
  int iRes = 0;
  int iHi = sizeof(aDia)/sizeof(aDia[0]) - 1;
  int iLo = 0;
  while( iHi>=iLo ){
    int iTest = (iHi + iLo) / 2;
    if( key >= aDia[iTest] ){
      iRes = iTest;
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);}
  puts "\};"
}

proc print_isdiacritic {zFunc map} {

  set lCode [list]
  foreach m $map {
    foreach {code char} $m {}
    if {$code && $char == ""} { lappend lCode $code }
  }
  set lCode [lsort -integer $lCode]
  set iFirst [lindex $lCode 0]
  set iLast [lindex $lCode end]

  set i1 0
  set i2 0

  foreach c $lCode {
    set i [expr $c - $iFirst]
    if {$i < 32} {
      set i1 [expr {$i1 | (1<<$i)}]
    } else {
      set i2 [expr {$i2 | (1<<($i-32))}]
    }
  }

  puts "/*"
  puts "** Return true if the argument interpreted as a unicode codepoint" 
  puts "** is a diacritical modifier character."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  puts "  unsigned int mask0 = [format "0x%08X" $i1];"
  puts "  unsigned int mask1 = [format "0x%08X" $i2];"

  puts "  if( c<$iFirst || c>$iLast ) return 0;"
  puts "  return (c < $iFirst+32) ?"
  puts "      (mask0 & (1 << (c-$iFirst))) :"
  puts "      (mask1 & (1 << (c-$iFirst-32)));"
  puts "\}"
}


#-------------------------------------------------------------------------

# Parameter $zName must be a path to the file UnicodeData.txt. This command
# reads the file and returns a list of codepoints (integers). The list
# contains all codepoints in the UnicodeData.txt assigned to any "General
# Category" that is not a "Letter" or "Number".
#
proc an_load_unicodedata_text {zName} {
................................................................................
    incr i
  }
  puts ""
  puts "  \};"

}

proc print_fold {zFunc} {

  set lRecord [tl_create_records]

  set lHigh [list]
  puts "/*"
  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
  puts "** is an upper case character that has a lower case equivalent,"
  puts "** return the codepoint corresponding to the lower case version."
  puts "** Otherwise, return a copy of the argument."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c, int bRemoveDiacritic)\{"

  set liOff [tl_generate_ioff_table $lRecord]
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
................................................................................
    if( iRes>=0 ){
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  }

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

  puts ""
  puts "  return ret;"
  puts "\}"
}

proc print_fold_test {zFunc mappings} {
  global tl_lookup_table

  foreach m $mappings {
    set c [lindex $m 1]
    if {$c == ""} {
      set extra([lindex $m 0]) 0
    } else {
      scan $c %c i
      set extra([lindex $m 0]) $i
    }
  }

  puts "static int fold_test(int *piCode)\{"
  puts -nonewline "  static int aLookup\[\] = \{"
  for {set i 0} {$i < 70000} {incr i} {

    set expected $i
    catch { set expected $tl_lookup_table($i) }
    set expected2 $expected
    catch { set expected2 $extra($expected2) }

    if {($i % 4)==0}  { puts "" ; puts -nonewline "    " }
    puts -nonewline "$expected, $expected2, "
  }
  puts "  \};"
  puts "  int i;"
  puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
  puts "    int iCode = (i/2);"
  puts "    int bFlag = i & 0x0001;"
  puts "    if( ${zFunc}\(iCode, bFlag)!=aLookup\[i\] )\{"
  puts "      *piCode = iCode;"
  puts "      return 1;"
  puts "    \}"
  puts "  \}"
  puts "  return 0;"
  puts "\}"
}

................................................................................
  puts ""
  puts "int main(int argc, char **argv)\{"
  puts "  int r1, r2;"
  puts "  int code;"
  puts "  r1 = isalnum_test(&code);"
  puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
  puts "  else printf(\"isalnum(): test passed\\n\");"
  puts "  r2 = fold_test(&code);"
  puts "  if( r2 ) printf(\"fold(): Problem with code %d\\n\",code);"
  puts "  else printf(\"fold(): test passed\\n\");"
  puts "  return (r1 || r2);"
  puts "\}"
}

# Proces the command line arguments. Exit early if they are not to
# our liking.
#
................................................................................
  exit 1
}
if {[llength $argv]!=2 && [llength $argv]!=3} usage
if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]
set generate_test_code [expr {[llength $argv]==3}]

print_fileheader

# Print the isalnum() function to stdout.
#

set lRange [an_load_separator_ranges]
print_isalnum sqlite3FtsUnicodeIsalnum $lRange

# Leave a gap between the two generated C functions.
#
puts ""
puts ""

# Load the fold data. This is used by the [rd_XXX] commands
# as well as [print_fold].
tl_load_casefolding_txt ${casefolding.txt}

set mappings [rd_load_unicodedata_text ${unicodedata.txt}]
print_rd $mappings
puts ""
puts ""
print_isdiacritic sqlite3FtsUnicodeIsdiacritic $mappings
puts ""
puts ""

# Print the fold() function to stdout.
#

print_fold sqlite3FtsUnicodeFold

# Print the test routines and main() function to stdout, if -test 
# was specified.
#
if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
  print_fold_test sqlite3FtsUnicodeFold $mappings
  print_test_main 
}

puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */"
puts "#endif /* !defined(SQLITE_DISABLE_FTS3_UNICODE) */"

Changes to test/fts4unicode.test.

15
16
17
18
19
20
21







22
23
24
25
26
27
28
..
35
36
37
38
39
40
41








42
43
44
45
46
47
48
set testdir [file dirname $argv0]
source $testdir/tester.tcl
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]







  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test 1.1 {  } {0   1   2  }
................................................................................

do_unicode_token_test 1.6 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}









#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {







>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>







15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
..
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
set testdir [file dirname $argv0]
source $testdir/tester.tcl
ifcapable !fts3_unicode { finish_test ; return }
set ::testprefix fts4unicode

proc do_unicode_token_test {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', 'remove_diacritics=0', '$input');
  " [list [list {*}$res]]]
}

proc do_unicode_token_test2 {tn input res} {
  set input [string map {' ''} $input]
  uplevel [list do_execsql_test $tn "
    SELECT fts3_tokenizer_test('unicode61', '$input');
  " [list [list {*}$res]]]
}

do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test 1.1 {  } {0   1   2  }
................................................................................

do_unicode_token_test 1.6 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

do_unicode_token_test2 1.8  {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.9  {  } {0 a  1 o  2 u }
do_unicode_token_test2 1.10 {xx xx xx} {0 xax xx 1 xox xx 2 xux xx}

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {