/ Check-in [9f7a6ae8]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add a test case to check that the fts5 unicode61 tokenizer is dealing with codepoints greater than 65535 correctly.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 9f7a6ae878cd17ff4de7c55e654406773e0ea2b9fe1c4e2a9fc2b0da84d059a4
User & Date: dan 2018-05-09 16:32:00
Context
2018-05-11
15:10
Make sure the open_db() routine in the CLI does not invoke access() with a NULL filename. check-in: 20a8c611 user: drh tags: trunk
2018-05-09
16:32
Add a test case to check that the fts5 unicode61 tokenizer is dealing with codepoints greater than 65535 correctly. check-in: 9f7a6ae8 user: dan tags: trunk
15:17
Add 14 new interfaces to the loadable extension mechanism. check-in: 0e809cdc user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/test/fts5unicode.test.

37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58



59























60
  tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
  tokenize_test 1.$tn.3 $t {} {}
}

#-------------------------------------------------------------------------
# Check that "unicode61" really is the default tokenizer.
#

# Create three fts5 tables: t1 with no tokenize= option, t2 explicitly using
# unicode61 and t3 using ascii, then insert the same three-character value
# into each. The script is wrapped in double quotes (not braces) so that Tcl
# performs the \xNN backslash substitutions before the SQL is executed.
do_execsql_test 2.0 "
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
  CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
  INSERT INTO t1 VALUES('\xC0\xC8\xCC');
  INSERT INTO t2 VALUES('\xC0\xC8\xCC');
  INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"
# Query each table with \xE0\xE8\xEC, the lower-case forms of the inserted
# \xC0\xC8\xCC. The expected result {t1 t2} means that t1 behaves like the
# explicit unicode61 table t2, and that the ascii table t3 returns no row.
do_execsql_test 2.1 "
  SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
  SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
  SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
" {t1 t2}




























finish_test







<














>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

37
38
39
40
41
42
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
  tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
  tokenize_test 1.$tn.3 $t {} {}
}

#-------------------------------------------------------------------------
# Check that "unicode61" really is the default tokenizer.
#

# Create three fts5 tables: t1 with no tokenize= option, t2 explicitly using
# unicode61 and t3 using ascii, then insert the same three-character value
# into each. The script is wrapped in double quotes (not braces) so that Tcl
# performs the \xNN backslash substitutions before the SQL is executed.
do_execsql_test 2.0 "
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
  CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
  INSERT INTO t1 VALUES('\xC0\xC8\xCC');
  INSERT INTO t2 VALUES('\xC0\xC8\xCC');
  INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"
# Query each table with \xE0\xE8\xEC, the lower-case forms of the inserted
# \xC0\xC8\xCC. The expected result {t1 t2} means that t1 behaves like the
# explicit unicode61 table t2, and that the ascii table t3 returns no row.
do_execsql_test 2.1 "
  SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
  SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
  SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
" {t1 t2}

#-------------------------------------------------------------------------
# Check that codepoints that require 4 bytes to store in utf-8 (those that
# require 17 or more bits to store) are handled correctly by the tokenizer.
#

# Fetch four codepoints above U+FFFF as Tcl strings by evaluating char() in
# SQL. The trailing comments record the Unicode general category of each one.
set A [db one {SELECT char(0x1F75E)}]    ;# Type So
set B [db one {SELECT char(0x1F5FD)}]    ;# Type So
set C [db one {SELECT char(0x2F802)}]    ;# Type Lo
set D [db one {SELECT char(0x2F808)}]    ;# Type Lo

# Create an fts5 table whose unicode61 tokenizer is configured with $C as an
# extra separator and $A as an extra token character, together with an
# fts5vocab "row" table over it, then insert one row made of the four
# characters back to back. The script is in double quotes so that Tcl
# substitutes $A..$D into the SQL text.
do_execsql_test 3.0 "
  CREATE VIRTUAL TABLE xyz USING fts5(x,
    tokenize = \"unicode61 separators '$C' tokenchars '$A'\"
  );
  CREATE VIRTUAL TABLE xyz_v USING fts5vocab(xyz, row);

  INSERT INTO xyz VALUES('$A$B$C$D');
"

# The vocab table is expected to contain exactly two terms, $A and $D, each
# followed by the values 1 1 (presumably the fts5vocab doc and cnt columns;
# confirm). $B and $C are expected not to appear as terms.
do_execsql_test 3.1 {
  SELECT * FROM xyz_v;
} [list $A 1 1 $D 1 1]
  




finish_test