/ Check-in [3dc567ef]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix comments in generated file fts3_unicode2.c.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts4-unicode
Files: files | file ages | folders
SHA1: 3dc567ef4702d9a63d78d11ff705cb7f7359f7a6
User & Date: dan 2012-05-25 18:48:48
Context
2012-05-25
19:50
Add special fast paths to sqlite3FtsUnicodeTolower() and Isalnum() for codepoints in the ASCII range. check-in: cf7b25d4 user: dan tags: fts4-unicode
18:48
Fix comments in generated file fts3_unicode2.c. check-in: 3dc567ef user: dan tags: fts4-unicode
17:50
Add an experimental tokenizer to fts4 - "unicode". This tokenizer works in the same way except that it understands unicode "simple case folding" and recognizes all characters not classified as "Letters" or "Numbers" by unicode as token separators. check-in: 0c13570e user: dan tags: fts4-unicode
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts3/fts3_unicode2.c.














1
2
3
4
5
6







7











8
9
10
11
12
13
14
...
109
110
111
112
113
114
115









116
117
118
119
120
121
122













/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/

#include <assert.h>








int sqlite3FtsUnicodeIsalnum(int c){











  const static unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
................................................................................
    assert( key>=aEntry[iRes] );
    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}











int sqlite3FtsUnicodeTolower(int c){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If bFlag is clear, then all the codepoints in the range are upper
  ** case and require folding. Or, if bFlag is set, then only every second
>
>
>
>
>
>
>
>
>
>
>
>
>






>
>
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
...
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
/*
** 2012 May 25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/

#include <assert.h>

/*
** Return true if the argument corresponds to a unicode codepoint
** classified as either a letter or a number. Otherwise false.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeIsalnum(int c){
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first 
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value 
  ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
  ** C. It is not possible to represent a range larger than 1023 codepoints 
  ** using this format.
  */
  const static unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
................................................................................
    assert( key>=aEntry[iRes] );
    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
  }
  return 1;
}


/*
** Interpret the argument as a unicode codepoint. If the codepoint
** is an upper case character that has a lower case equivalent,
** return the codepoint corresponding to the lower case version.
** Otherwise, return a copy of the argument.
**
** The results are undefined if the value passed to this function
** is less than zero.
*/
int sqlite3FtsUnicodeTolower(int c){
  /* Each entry in the following array defines a rule for folding a range
  ** of codepoints to lower case. The rule applies to a range of nRange
  ** codepoints starting at codepoint iCode.
  **
  ** If bFlag is clear, then all the codepoints in the range are upper
  ** case and require folding. Or, if bFlag is set, then only every second

Changes to ext/fts3/unicode/mkunicode.tcl.

73
74
75
76
77
78
79














80
81
82
83
84
85
86
..
88
89
90
91
92
93
94







95
96
97
98
99
100
101
...
317
318
319
320
321
322
323









324
325
326
327
328
329
330
...
393
394
395
396
397
398
399













400
401
402
403
404
405
406
...
457
458
459
460
461
462
463
464
465
466
    foreach {iFirst nRange} $range {}
    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
  }
  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}















  puts -nonewline "  const static unsigned int aEntry\[\] = \{"
  set i 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]

    if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
................................................................................
    incr i
  }
  puts ""
  puts "  \};"
}

proc print_isalnum {zFunc lRange} {







  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange
  puts {
  if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
................................................................................
}

proc print_tolower {zFunc} {

  set lRecord [tl_create_records]

  set lHigh [list]









  puts "int ${zFunc}\(int c)\{"
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry]} { 
      lappend lHigh $entry 
    } 
  }
................................................................................
  puts "  return 0;"
  puts "\}"
}


proc print_fileheader {} {
  puts [string trim {













/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
  }]
  puts ""
  puts "#include <assert.h>"
  puts ""
................................................................................
# was specified.
#
if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
  print_tolower_test sqlite3FtsUnicodeTolower 
  print_test_main 
}










>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>







 








<
<
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
...
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
...
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
...
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
...
500
501
502
503
504
505
506
507


    foreach {iFirst nRange} $range {}
    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
  }
  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}

  puts -nonewline "  "
  puts [string trim {
  /* Each unsigned integer in the following array corresponds to a contiguous
  ** range of unicode codepoints that are not either letters or numbers (i.e.
  ** codepoints for which this function should return 0).
  **
  ** The most significant 22 bits in each 32-bit value contain the first 
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value 
  ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
  ** C. It is not possible to represent a range larger than 1023 codepoints 
  ** using this format.
  */
  }]
  puts -nonewline "  const static unsigned int aEntry\[\] = \{"
  set i 0
  foreach range $lRange {
    foreach {iFirst nRange} $range {}
    set u32 [format "0x%08X" [expr ($iFirst<<10) + $nRange]]

    if {($i % 5)==0} {puts "" ; puts -nonewline "   "}
................................................................................
    incr i
  }
  puts ""
  puts "  \};"
}

proc print_isalnum {zFunc lRange} {
  puts "/*"
  puts "** Return true if the argument corresponds to a unicode codepoint"
  puts "** classified as either a letter or a number. Otherwise false."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  an_print_range_array $lRange
  puts {
  if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
................................................................................
}

proc print_tolower {zFunc} {

  set lRecord [tl_create_records]

  set lHigh [list]
  puts "/*"
  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
  puts "** is an upper case character that has a lower case equivalent,"
  puts "** return the codepoint corresponding to the lower case version."
  puts "** Otherwise, return a copy of the argument."
  puts "**"
  puts "** The results are undefined if the value passed to this function"
  puts "** is less than zero."
  puts "*/"
  puts "int ${zFunc}\(int c)\{"
  tl_print_table_header
  foreach entry $lRecord { 
    if {[tl_print_table_entry toggle $entry]} { 
      lappend lHigh $entry 
    } 
  }
................................................................................
  puts "  return 0;"
  puts "\}"
}


proc print_fileheader {} {
  puts [string trim {
/*
** 2012 May 25
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

/*
** DO NOT EDIT THIS MACHINE GENERATED FILE.
*/
  }]
  puts ""
  puts "#include <assert.h>"
  puts ""
................................................................................
# was specified.
#
if {$::generate_test_code} {
  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
  print_tolower_test sqlite3FtsUnicodeTolower 
  print_test_main 
}