/ Check-in [3dc567ef]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix comments in generated file fts3_unicode2.c.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1:3dc567ef4702d9a63d78d11ff705cb7f7359f7a6
User & Date: dan 2012-05-25 18:48:48
Context
2012-05-25
19:50
Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range. check-in: cf7b25d4 user: dan tags: fts4-unicode
18:48
Fix comments in generated file fts3_unicode2.c. check-in: 3dc567ef user: dan tags: fts4-unicode
17:50
Add an experimental tokenizer to fts4 - "unicode". This tokenizer works in the same way as the "simple" tokenizer except that it understands unicode "simple case folding" and recognizes all characters not classified as "Letters" or "Numbers" by unicode as token separators. check-in: 0c13570e user: dan tags: fts4-unicode
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode2.c.

            1  +/*
            2  +** 2012 May 25
            3  +**
            4  +** The author disclaims copyright to this source code.  In place of
            5  +** a legal notice, here is a blessing:
            6  +**
            7  +**    May you do good and not evil.
            8  +**    May you find forgiveness for yourself and forgive others.
            9  +**    May you share freely, never taking more than you give.
           10  +**
           11  +******************************************************************************
           12  +*/
           13  +
     1     14   /*
     2     15   ** DO NOT EDIT THIS MACHINE GENERATED FILE.
     3     16   */
     4     17   
     5     18   #include <assert.h>
     6     19   
           20  +/*
           21  +** Return true if the argument corresponds to a unicode codepoint
           22  +** classified as either a letter or a number. Otherwise false.
           23  +**
           24  +** The results are undefined if the value passed to this function
           25  +** is less than zero.
           26  +*/
     7     27   int sqlite3FtsUnicodeIsalnum(int c){
           28  +  /* Each unsigned integer in the following array corresponds to a contiguous
           29  +  ** range of unicode codepoints that are neither letters nor numbers (i.e.
           30  +  ** codepoints for which this function should return 0).
           31  +  **
           32  +  ** The most significant 22 bits in each 32-bit value contain the first 
           33  +  ** codepoint in the range. The least significant 10 bits are used to store
           34  +  ** the size of the range (always at least 1). In other words, the value 
           35  +  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint 
           36  +  ** C. It is not possible to represent a range larger than 1023 codepoints 
           37  +  ** using this format.
           38  +  */
     8     39     const static unsigned int aEntry[] = {
     9     40       0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    10     41       0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    11     42       0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    12     43       0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    13     44       0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    14     45       0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
................................................................................
   109    140       assert( key>=aEntry[iRes] );
   110    141       return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
   111    142     }
   112    143     return 1;
   113    144   }
   114    145   
   115    146   
          147  +/*
          148  +** Interpret the argument as a unicode codepoint. If the codepoint
          149  +** is an upper case character that has a lower case equivalent,
          150  +** return the codepoint corresponding to the lower case version.
          151  +** Otherwise, return a copy of the argument.
          152  +**
          153  +** The results are undefined if the value passed to this function
          154  +** is less than zero.
          155  +*/
   116    156   int sqlite3FtsUnicodeTolower(int c){
   117    157     /* Each entry in the following array defines a rule for folding a range
   118    158     ** of codepoints to lower case. The rule applies to a range of nRange
   119    159     ** codepoints starting at codepoint iCode.
   120    160     **
   121    161     ** If bFlag is clear, then all the codepoints in the range are upper
   122    162     ** case and require folding. Or, if bFlag is set, then only every second

Changes to ext/fts3/unicode/mkunicode.tcl.

    73     73       foreach {iFirst nRange} $range {}
    74     74       if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    75     75       if {$nRange > $nRangeMax} {set nRangeMax $nRange}
    76     76     }
    77     77     if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
    78     78     if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
    79     79   
           80  +  puts -nonewline "  "
           81  +  puts [string trim {
           82  +  /* Each unsigned integer in the following array corresponds to a contiguous
           83  +  ** range of unicode codepoints that are neither letters nor numbers (i.e.
           84  +  ** codepoints for which this function should return 0).
           85  +  **
           86  +  ** The most significant 22 bits in each 32-bit value contain the first 
           87  +  ** codepoint in the range. The least significant 10 bits are used to store
           88  +  ** the size of the range (always at least 1). In other words, the value 
           89  +  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint 
           90  +  ** C. It is not possible to represent a range larger than 1023 codepoints 
           91  +  ** using this format.
           92  +  */
           93  +  }]
    80     94     puts -nonewline "  const static unsigned int aEntry\[\] = \{"
    81     95     set i 0
    82     96     foreach range $lRange {
    83     97       foreach {iFirst nRange} $range {}
    84     98       set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]
    85     99   
    86    100       if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
................................................................................
    88    102       incr i
    89    103     }
    90    104     puts ""
    91    105     puts "  \};"
    92    106   }
    93    107   
    94    108   proc print_isalnum {zFunc lRange} {
          109  +  puts "/*"
          110  +  puts "** Return true if the argument corresponds to a unicode codepoint"
          111  +  puts "** classified as either a letter or a number. Otherwise false."
          112  +  puts "**"
          113  +  puts "** The results are undefined if the value passed to this function"
          114  +  puts "** is less than zero."
          115  +  puts "*/"
    95    116     puts "int ${zFunc}\(int c)\{"
    96    117     an_print_range_array $lRange
    97    118     puts {
    98    119     if( c<(1<<22) ){
    99    120       unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
   100    121       int iRes;
   101    122       int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
................................................................................
   317    338   }
   318    339   
   319    340   proc print_tolower {zFunc} {
   320    341   
   321    342     set lRecord [tl_create_records]
   322    343   
   323    344     set lHigh [list]
          345  +  puts "/*"
          346  +  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
          347  +  puts "** is an upper case character that has a lower case equivalent,"
          348  +  puts "** return the codepoint corresponding to the lower case version."
          349  +  puts "** Otherwise, return a copy of the argument."
          350  +  puts "**"
          351  +  puts "** The results are undefined if the value passed to this function"
          352  +  puts "** is less than zero."
          353  +  puts "*/"
   324    354     puts "int ${zFunc}\(int c)\{"
   325    355     tl_print_table_header
   326    356     foreach entry $lRecord { 
   327    357       if {[tl_print_table_entry toggle $entry]} { 
   328    358         lappend lHigh $entry 
   329    359       } 
   330    360     }
................................................................................
   393    423     puts "  return 0;"
   394    424     puts "\}"
   395    425   }
   396    426   
   397    427   
   398    428   proc print_fileheader {} {
   399    429     puts [string trim {
          430  +/*
          431  +** 2012 May 25
          432  +**
          433  +** The author disclaims copyright to this source code.  In place of
          434  +** a legal notice, here is a blessing:
          435  +**
          436  +**    May you do good and not evil.
          437  +**    May you find forgiveness for yourself and forgive others.
          438  +**    May you share freely, never taking more than you give.
          439  +**
          440  +******************************************************************************
          441  +*/
          442  +
   400    443   /*
   401    444   ** DO NOT EDIT THIS MACHINE GENERATED FILE.
   402    445   */
   403    446     }]
   404    447     puts ""
   405    448     puts "#include <assert.h>"
   406    449     puts ""
................................................................................
   457    500   # was specified.
   458    501   #
   459    502   if {$::generate_test_code} {
   460    503     print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
   461    504     print_tolower_test sqlite3FtsUnicodeTolower 
   462    505     print_test_main 
   463    506   }
   464         -
   465         -
   466    507