/ Check-in [cf7b25d4]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1:cf7b25d47687635a04f4347d45f135c686b9d758
User & Date: dan 2012-05-25 19:50:12
Context
2012-05-26
14:54
Change the name of the "unicode" tokenizer to "unicode61" to emphasize that the case folding and separator-character identification routines are based on unicode version 6.1. check-in: 8f3e60aa user: dan tags: fts4-unicode
2012-05-25
19:50
Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range. check-in: cf7b25d4 user: dan tags: fts4-unicode
18:48
Fix comments in generated file fts3_unicode2.c. check-in: 3dc567ef user: dan tags: fts4-unicode
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode.c.

197
198
199
200
201
202
203

204
205
206
207
208
209
210
  do {
    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;

    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));

    /* If the cursor is not at EOF, read the next character */







>







197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
  do {
    /* Grow the output buffer if required. */
    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
      if( !zNew ) return SQLITE_NOMEM;
      zOut = &zNew[zOut - pCsr->zToken];
      pCsr->zToken = zNew;
      pCsr->nAlloc += 64;
    }

    /* Write the folded case of the last character read to the output */
    zEnd = z;
    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));

    /* If the cursor is not at EOF, read the next character */

Changes to ext/fts3/fts3_unicode2.c.

117
118
119
120
121
122
123



124


125
126
127
128
129
130
131
132
...
232
233
234
235
236
237
238


239
240
241
242
243
244
245
246
    0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
    0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
    0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
    0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
    0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
    0x43FFF401,
  };






  if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
................................................................................
  };

  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );



  if( c<65536 ){
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);







>
>
>

>
>
|







 







>
>
|







117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
...
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
    0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
    0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
    0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
    0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
    0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
    0x43FFF401,
  };
  static const unsigned int aAscii[4] = {
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( c<128 ){
    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
................................................................................
  };

  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);

Changes to ext/fts3/unicode/mkunicode.tcl.

100
101
102
103
104
105
106





















107
108
109
110
111
112
113
114
115
116
117

118


119
120
121
122
123
124
125
126
...
361
362
363
364
365
366
367


368
369
370
371
372
373
374
375
    if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
    puts -nonewline " $u32,"
    incr i
  }
  puts ""
  puts "  \};"
}






















proc print_isalnum {zFunc lRange} {
  puts "/*"
  puts "** Return true if the argument corresponds to a unicode codepoint"
  puts "** classified as either a letter or a number. Otherwise false."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange

  puts {


  if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
................................................................................
  tl_print_table_footer toggle
  puts {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );



  if( c<65536 ){
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>











>

>
>
|







 







>
>
|







100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
...
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
    if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
    puts -nonewline " $u32,"
    incr i
  }
  puts ""
  puts "  \};"
}

proc an_print_ascii_bitmap {lRange} {
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} {
      if {$i<=127} { set a($i) 1 }
    }
  }

  set aAscii [list 0 0 0 0]
  foreach key [array names a] {
    set idx [expr $key >> 5]
    lset aAscii $idx [expr [lindex $aAscii $idx] | (1 << ($key&0x001F))]
  }

  puts "  static const unsigned int aAscii\[4\] = \{"
  puts -nonewline "   "
  foreach v $aAscii { puts -nonewline [format " 0x%08X," $v] }
  puts ""
  puts "  \};"
}

proc print_isalnum {zFunc lRange} {
  puts "/*"
  puts "** Return true if the argument corresponds to a unicode codepoint"
  puts "** classified as either a letter or a number. Otherwise false."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange
  an_print_ascii_bitmap $lRange
  puts {
  if( c<128 ){
    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
................................................................................
  tl_print_table_footer toggle
  puts {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
  }else if( c<65536 ){
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    int iRes = -1;

    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      int cmp = (c - aEntry[iTest].iCode);