Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Up until now the fts4 "unicode61" tokenizer has treated all private use codepoints except the first and last of each of the three ranges as alphanumeric (eligible to be part of tokens). This commit fixes this so that all private use codepoints are considered alphanumeric. In other words, it fixes the handling of codepoints 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000 and 0x10FFFD. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
6cfd9af5250029c0d275be027b4208c4 |
User & Date: | dan 2013-06-05 16:17:21.916 |
Context
2013-06-07
| ||
22:12 | Improve manual cleaning step performed by the multi-platform build tool for MSVC. (check-in: d5bc1fe1c4 user: mistachkin tags: trunk) | |
2013-06-05
| ||
16:17 | Up until now the fts4 "unicode61" tokenizer has treated all private use codepoints except the first and last of each of the three ranges as alphanumeric (eligible to be part of tokens). This commit fixes this so that all private use codepoints are considered alphanumeric. In other words, it fixes the handling of codepoints 0xE000, 0xF8FF, 0xF0000, 0xFFFFD, 0x100000 and 0x10FFFD. (check-in: 6cfd9af525 user: dan tags: trunk) | |
2013-06-03
| ||
20:39 | Fix a typo in a collating function inside the e_reindex.test script. (check-in: 4d74fccf02 user: drh tags: trunk) | |
Changes
Changes to ext/fts3/fts3_unicode2.c.
︙ | ︙ | |||
97 98 99 100 101 102 103 | 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, 0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012, 0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, | | | | | | | | | | | | | | | | | | | | | | < | 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 | 0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804, 0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012, 0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004, 0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002, 0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803, 0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07, 0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02, 0x037FFC01, 0x03EC7801, 0x03ECA401, 0x03EEC810, 0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023, 0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807, 0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405, 0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E, 0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01, 0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01, 0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016, 0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004, 0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004, 0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5, 0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01, 0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401, 0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064, 0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F, 0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009, 0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014, 0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001, 0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018, 0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401, 0x38008060, 0x380400F0, }; static const unsigned int aAscii[4] = { 0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001, }; if( c<128 ){ return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 ); |
︙ | ︙ |
Changes to ext/fts3/unicode/mkunicode.tcl.
︙ | ︙ | |||
235 236 237 238 239 240 241 | if {$line == ""} continue set fields [split $line ";"] if {[llength $fields] != [llength $lField]} { error "parse error: $line" } foreach $lField $fields {} set iCode [expr "0x$code"] | > | > > | 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | if {$line == ""} continue set fields [split $line ";"] if {[llength $fields] != [llength $lField]} { error "parse error: $line" } foreach $lField $fields {} set iCode [expr "0x$code"] set bAlnum [expr { [lsearch {L N} [string range $general_category 0 0]] >= 0 || $general_category=="Co" }] if { !$bAlnum } { lappend lRet $iCode } } close $fd set lRet } |
︙ | ︙ | |||
356 357 358 359 360 361 362 | iLo = iTest+1; }else{ iHi = iTest-1; } } assert( aEntry[0]<key ); assert( key>=aEntry[iRes] ); | | | 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 | iLo = iTest+1; }else{ iHi = iTest-1; } } assert( aEntry[0]<key ); assert( key>=aEntry[iRes] ); return (((unsigned int)c) >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF))); } return 1;} puts "\}" } proc print_test_isalnum {zFunc lRange} { foreach range $lRange { |
︙ | ︙ | |||
725 726 727 728 729 730 731 | */ /* ** DO NOT EDIT THIS MACHINE GENERATED FILE. */ }] puts "" | | | 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 | */ /* ** DO NOT EDIT THIS MACHINE GENERATED FILE. */ }] puts "" puts "#if defined(SQLITE_ENABLE_FTS4_UNICODE61)" puts "#if defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4)" puts "" puts "#include <assert.h>" puts "" } proc print_test_main {} { |
︙ | ︙ | |||
801 802 803 804 805 806 807 | if {$::generate_test_code} { print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange print_fold_test sqlite3FtsUnicodeFold $mappings print_test_main } puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" | | | 804 805 806 807 808 809 810 811 | if {$::generate_test_code} { print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange print_fold_test sqlite3FtsUnicodeFold $mappings print_test_main } puts "#endif /* defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_ENABLE_FTS4) */" puts "#endif /* !defined(SQLITE_ENABLE_FTS4_UNICODE61) */" |
Changes to test/fts4unicode.test.
︙ | ︙ | |||
379 380 381 382 383 384 385 386 387 388 | do_isspace_test 6.$T.19 $T {32 160 5760 6158} do_isspace_test 6.$T.19 $T {8192 8193 8194 8195} do_isspace_test 6.$T.19 $T {8196 8197 8198 8199} do_isspace_test 6.$T.19 $T {8200 8201 8202 8239} do_isspace_test 6.$T.19 $T {8287 12288} } finish_test | > > > > > > > > > > > > > > > > > > | 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 | do_isspace_test 6.$T.19 $T {32 160 5760 6158} do_isspace_test 6.$T.19 $T {8192 8193 8194 8195} do_isspace_test 6.$T.19 $T {8196 8197 8198 8199} do_isspace_test 6.$T.19 $T {8200 8201 8202 8239} do_isspace_test 6.$T.19 $T {8287 12288} } #------------------------------------------------------------------------- # Test that the private use ranges are treated as alphanumeric. # breakpoint foreach {tn1 c} { 1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff } { foreach {tn2 config res} { 1 "" "0 hello*world hello*world" 2 "separators=*" "0 hello hello 1 world world" } { set config [string map [list * $c] $config] set input [string map [list * $c] "hello*world"] set output [string map [list * $c] $res] do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output } } finish_test |