SQLite

Check-in [b79ced3e0a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add some tests for the fts2 icu tokenizer. (CVS 4117)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: b79ced3e0a26b0db13613073c847c2d2ba7e174e
User & Date: danielk1977 2007-06-25 11:24:39.000
Context
2007-06-25
12:05
Add a test that calls fts2_tokenizer() with an argument set via C code. (CVS 4118) (check-in: fbcf2d75cd user: danielk1977 tags: trunk)
11:24
Add some tests for the fts2 icu tokenizer. (CVS 4117) (check-in: b79ced3e0a user: danielk1977 tags: trunk)
09:52
Add some documentation for user-defined fts2 tokenizers. (CVS 4116) (check-in: 5a9eee8658 user: danielk1977 tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts2/fts2_tokenizer.c.
113
114
115
116
117
118
119


120
121
122
123
124
125
126
127
128
129
130
131
132
133




134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
  const char *zErr = 0;

  const char *zName;
  int nName;
  const char *zInput;
  int nInput;



  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;

  Tcl_Obj *pRet;

  assert( argc==2 );

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[1]);
  zInput = (const char *)sqlite3_value_text(argv[1]);





  pHash = (fts2Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);

  if( !p ){
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(0, 0, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;







>
>








|



|
|
>
>
>
>














|







113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  const char *zErr = 0;

  const char *zName;
  int nName;
  const char *zInput;
  int nInput;

  const char *zArg = 0;

  const char *zToken;
  int nToken;
  int iStart;
  int iEnd;
  int iPos;

  Tcl_Obj *pRet;

  assert( argc==2 || argc==3 );

  nName = sqlite3_value_bytes(argv[0]);
  zName = (const char *)sqlite3_value_text(argv[0]);
  nInput = sqlite3_value_bytes(argv[argc-1]);
  zInput = (const char *)sqlite3_value_text(argv[argc-1]);

  if( argc==3 ){
    zArg = (const char *)sqlite3_value_text(argv[1]);
  }

  pHash = (fts2Hash *)sqlite3_user_data(context);
  p = (sqlite3_tokenizer_module *)sqlite3Fts2HashFind(pHash, zName, nName+1);

  if( !p ){
    char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    sqlite3_result_error(context, zErr, -1);
    sqlite3_free(zErr);
    return;
  }

  pRet = Tcl_NewObj();
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
217
218
219
220
221
222
223

224
225
226
227
228
229
230
  }
#endif

  if( (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
   || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))

#endif
  );

  sqlite3_free(zTest);
  return rc;
}








>







223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
  }
#endif

  if( (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
#ifdef SQLITE_TEST
   || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
   || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
#endif
  );

  sqlite3_free(zTest);
  return rc;
}

Changes to test/fts2token.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25













26
27
28
29
30
31
32
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The focus 
# of this script is testing the pluggable tokeniser feature of the 
# FTS2 module.
#
# $Id: fts2token.test,v 1.1 2007/06/22 15:21:16 danielk1977 Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is defined, omit this file.
ifcapable !fts2 {
  finish_test
  return
}














#--------------------------------------------------------------------------
# Test cases fts2token-1.* are the warm-body test for the SQL scalar
# function fts2_tokenizer(). The procedure is as follows:
#
#   1: Verify that there is no such fts2 tokenizer as 'blah'.
#







|










>
>
>
>
>
>
>
>
>
>
>
>
>







8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library. The focus 
# of this script is testing the pluggable tokeniser feature of the 
# FTS2 module.
#
# $Id: fts2token.test,v 1.2 2007/06/25 11:24:39 danielk1977 Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is defined, omit this file.
ifcapable !fts2 {
  finish_test
  return
}

proc escape_string {str} {
  set out ""
  foreach char [split $str ""] {
    scan $char %c i
    if {$i<=127} {
      append out $char
    } else {
      append out [format {\x%.4x} $i]
    }
  }
  set out
}

#--------------------------------------------------------------------------
# Test cases fts2token-1.* are the warm-body test for the SQL scalar
# function fts2_tokenizer(). The procedure is as follows:
#
#   1: Verify that there is no such fts2 tokenizer as 'blah'.
#
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106



















































107
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
do_test fts2token-3.2 {
  execsql {
    SELECT fts2_tokenizer_test('porter', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}

ifcapable icu {
  do_test fts2token-3.3 {
    execsql {
      SELECT fts2_tokenizer_test('icu', 'I don''t see how');
    }
  } {{0 i I 1 don't don't 2 see see 3 how how}}
}




















































finish_test







<








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

104
105
106
107
108
109
110

111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}
do_test fts2token-3.2 {
  execsql {
    SELECT fts2_tokenizer_test('porter', 'I don''t see how');
  }
} {{0 i I 1 don don 2 t t 3 see see 4 how how}}

ifcapable icu {
  do_test fts2token-3.3 {
    execsql {
      SELECT fts2_tokenizer_test('icu', 'I don''t see how');
    }
  } {{0 i I 1 don't don't 2 see see 3 how how}}
}

#--------------------------------------------------------------------------
# Test cases fts2token-4.* test the ICU tokenizer. In practice, this
# tokenizer only has two modes - "thai" and "everybody else". Some other
# Asian languages (Lao, Khmer etc.) require the same special treatment as 
# Thai, but ICU doesn't support them yet.
#
ifcapable icu {

  proc do_icu_test {name locale input output} {
    set ::out [db eval { SELECT fts2_tokenizer_test('icu', $locale, $input) }]
    do_test $name {
      lindex $::out 0
    } $output
  }
  
  do_icu_test fts2token-4.1 en_US  {}   {}
  do_icu_test fts2token-4.2 en_US {Test cases fts2} [list \
    0 test Test 1 cases cases 2 fts2 fts2
  ]

  # The following test shows that ICU is smart enough to recognise
  # Thai chararacters, even when the locale is set to English/United 
  # States.
  #
  set input "\u0e2d\u0e30\u0e44\u0e23\u0e19\u0e30\u0e04\u0e23\u0e31\u0e1a"
  set output    "0 \u0e2d\u0e30\u0e44\u0e23 \u0e2d\u0e30\u0e44\u0e23 "
  append output "1 \u0e19\u0e30 \u0e19\u0e30 "
  append output "2 \u0e04\u0e23\u0e31\u0e1a \u0e04\u0e23\u0e31\u0e1a"

  do_icu_test fts2token-4.3 th_TH  $input $output
  do_icu_test fts2token-4.4 en_US  $input $output

  # ICU handles an unknown locale by falling back to the default.
  # So this is not an error.
  do_icu_test fts2token-4.5 MiddleOfTheOcean  $input $output

  set    longtoken "AReallyReallyLongTokenOneThatWillSurelyRequire"
  append longtoken "AReallocInTheIcuTokenizerCode"

  set    input "short tokens then "
  append input $longtoken
  set    output "0 short short "
  append output "1 tokens tokens "
  append output "2 then then "
  append output "3 [string tolower $longtoken] $longtoken"

  do_icu_test fts2token-4.6 MiddleOfTheOcean  $input $output
  do_icu_test fts2token-4.7 th_TH  $input $output
  do_icu_test fts2token-4.8 en_US  $input $output
}

finish_test