Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Fix comments in generated file fts3_unicode2.c. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | fts4-unicode |
Files: | files | file ages | folders |
SHA1: |
3dc567ef4702d9a63d78d11ff705cb7f |
User & Date: | dan 2012-05-25 18:48:48.456 |
Context
2012-05-25
| ||
19:50 | Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range. (check-in: cf7b25d476 user: dan tags: fts4-unicode) | |
18:48 | Fix comments in generated file fts3_unicode2.c. (check-in: 3dc567ef47 user: dan tags: fts4-unicode) | |
17:50 | Add an experimental tokenizer to fts4 - "unicode". This tokenizer works in the same way as the "simple" tokenizer except that it understands unicode "simple case folding" and recognizes all characters not classified as "Letters" or "Numbers" by unicode as token separators. (check-in: 0c13570ec7 user: dan tags: fts4-unicode) | |
Changes
Changes to ext/fts3/fts3_unicode2.c.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 | /* ** DO NOT EDIT THIS MACHINE GENERATED FILE. */ #include <assert.h> int sqlite3FtsUnicodeIsalnum(int c){ const static unsigned int aEntry[] = { 0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07, 0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01, 0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401, 0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01, 0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01, 0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802, | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | /* ** 2012 May 25 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** */ /* ** DO NOT EDIT THIS MACHINE GENERATED FILE. */ #include <assert.h> /* ** Return true if the argument corresponds to a unicode codepoint ** classified as either a letter or a number. Otherwise false. ** ** The results are undefined if the value passed to this function ** is less than zero. */ int sqlite3FtsUnicodeIsalnum(int c){ /* Each unsigned integer in the following array corresponds to a contiguous ** range of unicode codepoints that are not either letters or numbers (i.e. ** codepoints for which this function should return 0). ** ** The most significant 22 bits in each 32-bit value contain the first ** codepoint in the range. The least significant 10 bits are used to store ** the size of the range (always at least 1). In other words, the value ** ((C<<10) + N) represents a range of N codepoints starting with codepoint ** C. 
It is not possible to represent a range larger than 1023 codepoints ** using this format. */ const static unsigned int aEntry[] = { 0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07, 0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01, 0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401, 0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01, 0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01, 0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802, |
︙ | ︙ | |||
109 110 111 112 113 114 115 116 117 118 119 120 121 122 | assert( key>=aEntry[iRes] ); return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); } return 1; } int sqlite3FtsUnicodeTolower(int c){ /* Each entry in the following array defines a rule for folding a range ** of codepoints to lower case. The rule applies to a range of nRange ** codepoints starting at codepoint iCode. ** ** If bFlag is clear, then all the codepoints in the range are upper ** case and require folding. Or, if bFlag is set, then only every second | > > > > > > > > > | 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | assert( key>=aEntry[iRes] ); return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); } return 1; } /* ** Interpret the argument as a unicode codepoint. If the codepoint ** is an upper case character that has a lower case equivalent, ** return the codepoint corresponding to the lower case version. ** Otherwise, return a copy of the argument. ** ** The results are undefined if the value passed to this function ** is less than zero. */ int sqlite3FtsUnicodeTolower(int c){ /* Each entry in the following array defines a rule for folding a range ** of codepoints to lower case. The rule applies to a range of nRange ** codepoints starting at codepoint iCode. ** ** If bFlag is clear, then all the codepoints in the range are upper ** case and require folding. Or, if bFlag is set, then only every second |
︙ | ︙ |
Changes to ext/fts3/unicode/mkunicode.tcl.
︙ | ︙ | |||
73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | foreach {iFirst nRange} $range {} if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} if {$nRange > $nRangeMax} {set nRangeMax $nRange} } if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} puts -nonewline " const static unsigned int aEntry\[\] = \{" set i 0 foreach range $lRange { foreach {iFirst nRange} $range {} set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] if {($i % 5)==0} {puts "" ; puts -nonewline " "} puts -nonewline " $u32," incr i } puts "" puts " \};" } proc print_isalnum {zFunc lRange} { puts "int ${zFunc}\(int c)\{" an_print_range_array $lRange puts { if( c<(1<<22) ){ unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; int iRes; int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; | > > > > > > > > > > > > > > > > > > > > > | 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | foreach {iFirst nRange} $range {} if {$iFirst > $iFirstMax} {set iFirstMax $iFirst} if {$nRange > $nRangeMax} {set nRangeMax $nRange} } if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"} if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"} puts -nonewline " " puts [string trim { /* Each unsigned integer in the following array corresponds to a contiguous ** range of unicode codepoints that are not either letters or numbers (i.e. ** codepoints for which this function should return 0). ** ** The most significant 22 bits in each 32-bit value contain the first ** codepoint in the range. The least significant 10 bits are used to store ** the size of the range (always at least 1). In other words, the value ** ((C<<10) + N) represents a range of N codepoints starting with codepoint ** C. 
It is not possible to represent a range larger than 1023 codepoints ** using this format. */ }] puts -nonewline " const static unsigned int aEntry\[\] = \{" set i 0 foreach range $lRange { foreach {iFirst nRange} $range {} set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]] if {($i % 5)==0} {puts "" ; puts -nonewline " "} puts -nonewline " $u32," incr i } puts "" puts " \};" } proc print_isalnum {zFunc lRange} { puts "/*" puts "** Return true if the argument corresponds to a unicode codepoint" puts "** classified as either a letter or a number. Otherwise false." puts "**" puts "** The results are undefined if the value passed to this function" puts "** is less than zero." puts "*/" puts "int ${zFunc}\(int c)\{" an_print_range_array $lRange puts { if( c<(1<<22) ){ unsigned int key = (((unsigned int)c)<<10) | 0x000003FF; int iRes; int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1; |
︙ | ︙ | |||
317 318 319 320 321 322 323 324 325 326 327 328 329 330 | } proc print_tolower {zFunc} { set lRecord [tl_create_records] set lHigh [list] puts "int ${zFunc}\(int c)\{" tl_print_table_header foreach entry $lRecord { if {[tl_print_table_entry toggle $entry]} { lappend lHigh $entry } } | > > > > > > > > > | 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 | } proc print_tolower {zFunc} { set lRecord [tl_create_records] set lHigh [list] puts "/*" puts "** Interpret the argument as a unicode codepoint. If the codepoint" puts "** is an upper case character that has a lower case equivalent," puts "** return the codepoint corresponding to the lower case version." puts "** Otherwise, return a copy of the argument." puts "**" puts "** The results are undefined if the value passed to this function" puts "** is less than zero." puts "*/" puts "int ${zFunc}\(int c)\{" tl_print_table_header foreach entry $lRecord { if {[tl_print_table_entry toggle $entry]} { lappend lHigh $entry } } |
︙ | ︙ | |||
393 394 395 396 397 398 399 400 401 402 403 404 405 406 | puts " return 0;" puts "\}" } proc print_fileheader {} { puts [string trim { /* ** DO NOT EDIT THIS MACHINE GENERATED FILE. */ }] puts "" puts "#include <assert.h>" puts "" | > > > > > > > > > > > > > | 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 | puts " return 0;" puts "\}" } proc print_fileheader {} { puts [string trim { /* ** 2012 May 25 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ****************************************************************************** */ /* ** DO NOT EDIT THIS MACHINE GENERATED FILE. */ }] puts "" puts "#include <assert.h>" puts "" |
︙ | ︙ | |||
457 458 459 460 461 462 463 464 | # was specified. # if {$::generate_test_code} { print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange print_tolower_test sqlite3FtsUnicodeTolower print_test_main } | < < | 500 501 502 503 504 505 506 507 | # was specified. # if {$::generate_test_code} { print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange print_tolower_test sqlite3FtsUnicodeTolower print_test_main } |