/ Check-in [8f3e60aa]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Change the name of the "unicode" tokenizer to "unicode61" to emphasize that the case folding and separator-character identification routines are based on Unicode version 6.1.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1: 8f3e60aa2253f21bcee5d03982cfdd7f16c00060
User & Date: dan 2012-05-26 14:54:50
Context
2012-05-26
15:44
Add fault-injection tests that use the unicode61 tokenizer. Fix a problem revealed by the same. check-in: ed28c48a user: dan tags: fts4-unicode
14:54
Change the name of the "unicode" tokenizer to "unicode61" to emphasize that the case folding and separator-character identification routines are based on unicode version 6.1. check-in: 8f3e60aa user: dan tags: fts4-unicode
2012-05-25
19:50
Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range. check-in: cf7b25d4 user: dan tags: fts4-unicode
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/README.tokenizers.

     7      7     statement:
     8      8   
     9      9       CREATE VIRTUAL TABLE <table-name> USING fts3(
    10     10         <columns ...> [, tokenize <tokenizer-name> [<tokenizer-args>]]
    11     11       );
    12     12   
    13     13     The built-in tokenizers (valid values to pass as <tokenizer name>) are
    14         -  "simple" and "porter".
           14  +  "simple", "porter" and "unicode61".
    15     15   
    16     16     <tokenizer-args> should consist of zero or more white-space separated
    17     17     arguments to pass to the selected tokenizer implementation. The 
    18     18     interpretation of the arguments, if any, depends on the individual 
    19     19     tokenizer.
    20     20   
    21     21   2. Custom Tokenizers

Changes to ext/fts3/fts3.c.

  3597   3597       sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
  3598   3598     }
  3599   3599   
  3600   3600     /* Load the built-in tokenizers into the hash table */
  3601   3601     if( rc==SQLITE_OK ){
  3602   3602       if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
  3603   3603        || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter) 
  3604         -     || sqlite3Fts3HashInsert(pHash, "unicode", 8, (void *)pUnicode) 
         3604  +     || sqlite3Fts3HashInsert(pHash, "unicode61", 10, (void *)pUnicode) 
  3605   3605   #ifdef SQLITE_ENABLE_ICU
  3606   3606        || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
  3607   3607   #endif
  3608   3608       ){
  3609   3609         rc = SQLITE_NOMEM;
  3610   3610       }
  3611   3611     }

Changes to test/fts4unicode.test.

    16     16   source $testdir/tester.tcl
    17     17   ifcapable !fts3 { finish_test ; return }
    18     18   set ::testprefix fts4unicode
    19     19   
    20     20   proc do_unicode_token_test {tn input res} {
    21     21     set input [string map {' ''} $input]
    22     22     uplevel [list do_execsql_test $tn "
    23         -    SELECT fts3_tokenizer_test('unicode', '$input');
           23  +    SELECT fts3_tokenizer_test('unicode61', '$input');
    24     24     " [list [list {*}$res]]]
    25     25   }
    26     26   
    27     27   do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
    28     28   do_unicode_token_test 1.1 {  } {0   1   2  }
    29     29   do_unicode_token_test 1.2 {xx xx xx} {0 xx xx 1 xx xx 2 xx xx}
    30     30   
................................................................................
    35     35   
    36     36   do_unicode_token_test 1.6 "The quick brown fox" {
    37     37     0 the The 1 quick quick 2 brown brown 3 fox fox
    38     38   }
    39     39   do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
    40     40     0 the The 1 quick quick 2 brown brown 3 fox fox
    41     41   }
           42  +
           43  +#-------------------------------------------------------------------------
           44  +#
           45  +set docs [list {
           46  +  Enhance the INSERT syntax to allow multiple rows to be inserted via the
           47  +  VALUES clause.
           48  +} {
           49  +  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
           50  +} {
           51  +  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
           52  +} {
           53  +  Added the sqlite3_db_readonly() interface.
           54  +} {
           55  +  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
           56  +  ability to add new PRAGMA statements or to override built-in PRAGMAs.  
           57  +} {
           58  +  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
           59  +  the same row that contains the maximum x value.
           60  +} {
           61  +  Added support for the FTS4 languageid option.
           62  +} {
           63  +  Documented support for the FTS4 content option. This feature has actually
           64  +  been in the code since version 3.7.9 but is only now considered to be
           65  +  officially supported.  
           66  +} {
           67  +  Pending statements no longer block ROLLBACK. Instead, the pending statement
           68  +  will return SQLITE_ABORT upon next access after the ROLLBACK.  
           69  +} {
           70  +  Improvements to the handling of CSV inputs in the command-line shell
           71  +} {
           72  +  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
           73  +  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
           74  +  connected by OR.  
           75  +}]
           76  +
           77  +set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
           78  +set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
           79  +set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
           80  +set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
           81  +set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
           82  +set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
           83  +set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
           84  +set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
           85  +set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS
           86  +foreach k [array names map] {
           87  +  lappend mappings [string toupper $k] [lindex $map($k) 0] 
           88  +  lappend mappings $k [lindex $map($k) 1]
           89  +}
           90  +proc mapdoc {doc} { 
           91  +  set doc [regsub -all {[[:space:]]+} $doc " "]
           92  +  string map $::mappings [string trim $doc] 
           93  +}
           94  +
           95  +do_test 2.0 {
           96  +  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
           97  +  foreach doc $docs {
           98  +    set d [mapdoc $doc]
           99  +    execsql { INSERT INTO t2 VALUES($d) }
          100  +  }
          101  +} {}
          102  +
          103  +do_test 2.1 {
          104  +  set q [mapdoc "row"]
          105  +  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
          106  +} [list [mapdoc {
          107  +  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
          108  +  the same row that contains the maximum x value.
          109  +}]]
          110  +
          111  +foreach {tn query snippet} {
          112  +  2 "row" {
          113  +     ...returns the value of y on the same [row] that contains 
          114  +     the maximum x value.
          115  +  }
          116  +  3 "ROW" {
          117  +     ...returns the value of y on the same [row] that contains 
          118  +     the maximum x value.
          119  +  }
          120  +  4 "rollback" {
          121  +     ...[ROLLBACK]. Instead, the pending statement
          122  +     will return SQLITE_ABORT upon next access after the [ROLLBACK].
          123  +  }
          124  +  5 "rOllback" {
          125  +     ...[ROLLBACK]. Instead, the pending statement
          126  +     will return SQLITE_ABORT upon next access after the [ROLLBACK].
          127  +  }
          128  +  6 "lang*" {
          129  +     Added support for the FTS4 [languageid] option.
          130  +  }
          131  +} {
          132  +  do_test 2.$tn {
          133  +    set q [mapdoc $query]
          134  +    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
          135  +  } [list [mapdoc $snippet]]
          136  +}
    42    137   
    43    138   finish_test
    44    139