1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
|
-
-
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
/*
** 2012 April 10
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This module implements the spellfix1 VIRTUAL TABLE that can be used
** to search a large vocabulary for close matches. For example, this
** virtual table can be used to suggest corrections to misspelled words.
** Or, it could be used with FTS4 to do full-text search using potentially
** misspelled words. See the separate documentation files (spellfix1.wiki
** and editdist3.wiki) for details.
**
** Create an instance of the virtual table this way:
**
** CREATE VIRTUAL TABLE demo USING spellfix1;
**
** The "spellfix1" term is the name of this module. The "demo" is the
** name of the virtual table you will be creating. The table is initially
** empty. You have to populate it with your vocabulary. Suppose you
** have a list of words in a table named "big_vocabulary". Then do this:
**
** INSERT INTO demo(word) SELECT word FROM big_vocabulary;
**
** If you intend to use this virtual table in cooperation with an FTS4
** table (for spelling correction of search terms) then you can extract
** the vocabulary using an fts4aux table:
**
** INSERT INTO demo(word) SELECT term FROM search_aux WHERE col='*';
**
** You can also provide the virtual table with a "rank" for each word.
** The "rank" is an estimate of how common the word is. Larger numbers
** mean the word is more common. If you omit the rank when populating
** the table, then a rank of 1 is assumed. But if you have rank
** information, you can supply it and the virtual table will show a
** slight preference for selecting more commonly used terms. To
** populate the rank from an fts4aux table "search_aux" do something
** like this:
**
** INSERT INTO demo(word,rank)
** SELECT term, documents FROM search_aux WHERE col='*';
**
** To query the virtual table, include a MATCH operator in the WHERE
** clause. For example:
**
** SELECT word FROM demo WHERE word MATCH 'kennasaw';
**
** Using a dataset of American place names (derived from
** http://geonames.usgs.gov/domestic/download_data.htm) the query above
** returns 20 results beginning with:
**
** kennesaw
** kenosha
** kenesaw
** kenaga
** keanak
**
** If you append the character '*' to the end of the pattern, then
** a prefix search is performed. For example:
**
** SELECT word FROM demo WHERE word MATCH 'kennes*';
**
** Yields 20 results beginning with:
**
** kennesaw
** kennestone
** kenneson
** kenneys
** keanes
** keenes
**
** The virtual table actually has a unique rowid with six columns plus four
** extra hidden columns. The columns are as follows:
**
** rowid A unique integer number associated with each
** vocabulary item in the table. This can be used
** as a foreign key on other tables in the database.
**
** word The text of the word that matches the pattern.
** Both word and pattern can contain Unicode characters
** and can be mixed case.
**
** rank This is the rank of the word, as specified in the
** original INSERT statement.
**
** distance This is an edit distance or Levenshtein distance going
** from the pattern to the word.
**
** langid This is the language-id of the word. All queries are
** against a single language-id, which defaults to 0.
** For any given query this value is the same on all rows.
**
** score The score is a combination of rank and distance. The
** idea is that a lower score is better. The virtual table
** attempts to find words with the lowest score and
** by default (unless overridden by ORDER BY) returns
** results in order of increasing score.
**
** matchlen For prefix queries, the number of characters in the prefix
** of the returned value (word) that matched the query term.
** For non-prefix queries, the number of characters in the
** returned value.
**
** top (HIDDEN) For any query, this value is the same on all
** rows. It is an integer which is the maximum number of
** rows that will be output. The actual number of rows
** output might be less than this number, but it will never
** be greater. The default value for top is 20, but that
** can be changed for each query by including a term of
** the form "top=N" in the WHERE clause of the query.
**
** scope (HIDDEN) For any query, this value is the same on all
** rows. The scope is a measure of how widely the virtual
** table looks for matching words. Smaller values of
** scope cause a broader search. The scope is normally
** chosen automatically and is capped at 4. Applications
** can change the scope by including a term of the form
** "scope=N" in the WHERE clause of the query. Increasing
** the scope will make the query run faster, but will reduce
** the possible corrections.
**
** srchcnt (HIDDEN) For any query, this value is the same on all
** rows. This value is an integer which is the number of
** words examined using the edit-distance algorithm to
** find the top matches that are ultimately displayed. This
** value is for diagnostic use only.
**
** soundslike (HIDDEN) When inserting vocabulary entries, this field
** can be set to a spelling that matches what the word
** sounds like. See the DEALING WITH UNUSUAL AND DIFFICULT
** SPELLINGS section below for details.
**
** See the separate documentation files (spellfix1.wiki and editdist3.wiki)
** for details.
**
** When inserting into or updating the virtual table, only the rowid, word,
** rank, and langid may be changed. Any attempt to set or modify the values
** of distance, score, top, scope, or srchcnt is silently ignored.
**
** ALGORITHM
**
** A shadow table named "%_vocab" (where the % is replaced by the name of
** the virtual table; Ex: "demo_vocab" for the "demo" virtual table) is
** constructed with these columns:
**
** id The unique id (INTEGER PRIMARY KEY)
**
** rank The rank of word.
**
** langid The language id for this entry.
**
** word The original UTF8 text of the vocabulary word
**
** k1 The word transliterated into lower-case ASCII.
** There is a standard table of mappings from non-ASCII
** characters into ASCII. Examples: "æ" -> "ae",
** "þ" -> "th", "ß" -> "ss", "á" -> "a", ... The
** accessory function spellfix1_translit(X) will do
** the non-ASCII to ASCII mapping. The built-in lower(X)
** function will convert to lower-case. Thus:
** k1 = lower(spellfix1_translit(word)).
**
** k2 This field holds a phonetic code derived from k1. Letters
** that have similar sounds are mapped into the same symbol.
** For example, all vowels and vowel clusters become the
** single symbol "A". And the letters "p", "b", "f", and
** "v" all become "B". All nasal sounds are represented
** as "N". And so forth. The mapping is based on
** ideas found in Soundex, Metaphone, and other
** long-standing phonetic matching systems. This key can
** be generated by the function spellfix1_phonehash(X).
** Hence: k2 = spellfix1_phonehash(k1)
**
** There is also a function for computing the Wagner edit distance or the
** Levenshtein distance between a pattern and a word. This function
** is exposed as spellfix1_editdist(X,Y). The edit distance function
** returns the "cost" of converting X into Y. Some transformations
** cost more than others. Changing one vowel into a different vowel,
** for example is relatively cheap, as is doubling a consonant, or
** omitting the second character of a double-consonant. Other transformations
** are more expensive. The idea is that the edit distance function returns
** a low cost for words that are similar and a higher cost for words
** that are further apart. In this implementation, the maximum cost
** of any single-character edit (delete, insert, or substitute) is 100,
** with lower costs for some edits (such as transforming vowels).
**
** The "score" for a comparison is the edit distance between the pattern
** and the word, adjusted down by the base-2 logarithm of the word rank.
** For example, a match with distance 100 but rank 1000 would have a
** score of 122 (= 100 - log2(1000) + 32) whereas a match with distance
** 100 with a rank of 1 would have a score of 131 (100 - log2(1) + 32).
** (NB: The constant 32 is added to each score to keep it from going
** negative in case the edit distance is zero.) In this way, frequently
** used words get a slightly lower cost which tends to move them toward
** the top of the list of alternative spellings.
**
** A straightforward implementation of a spelling corrector would be
** to compare the search term against every word in the vocabulary
** and select the 20 with the lowest scores. However, there will
** typically be hundreds of thousands or millions of words in the
** vocabulary, and so this approach is not fast enough.
**
** Suppose the term that is being spell-corrected is X. To limit
** the search space, X is converted to a k2-like key using the
** equivalent of:
**
** key = spellfix1_phonehash(lower(spellfix1_translit(X)))
**
** This key is then limited to "scope" characters. The default scope
** value is 4, but an alternative scope can be specified using the
** "scope=N" term in the WHERE clause. After the key has been truncated,
** the edit distance is run against every term in the vocabulary that
** has a k2 value that begins with the abbreviated key.
**
** For example, suppose the input word is "Paskagula". The phonetic
** key is "BACACALA" which is then truncated to 4 characters "BACA".
** The edit distance is then run on the 4980 entries (out of
** 272,597 entries total) of the vocabulary whose k2 values begin with
** BACA, yielding "Pascagoula" as the best match.
**
** Only terms of the vocabulary with a matching langid are searched.
** Hence, the same table can contain entries from multiple languages
** and only the requested language will be used. The default langid
** is 0.
**
** DEALING WITH UNUSUAL AND DIFFICULT SPELLINGS
**
** The algorithm above works quite well for most cases, but there are
** exceptions. These exceptions can be dealt with by making additional
** entries in the virtual table using the "soundslike" column.
**
** For example, many words of Greek origin begin with letters "ps" where
** the "p" is silent. Ex: psalm, pseudonym, psoriasis, psyche. In
** another example, many Scottish surnames can be spelled with an
** initial "Mac" or "Mc". Thus, "MacKay" and "McKay" are both pronounced
** the same.
**
** Accommodation can be made for words that are not spelled as they
** sound by making additional entries into the virtual table for the
** same word, but adding an alternative spelling in the "soundslike"
** column. For example, the canonical entry for "psalm" would be this:
**
** INSERT INTO demo(word) VALUES('psalm');
**
** To enhance the ability to correct the spelling of "salm" into
** "psalm", make an additional entry like this:
**
** INSERT INTO demo(word,soundslike) VALUES('psalm','salm');
**
** It is ok to make multiple entries for the same word as long as
** each entry has a different soundslike value. Note that if no
** soundslike value is specified, the soundslike defaults to the word
** itself.
**
** Listed below are some cases where it might make sense to add additional
** soundslike entries. The specific entries will depend on the application
** and the target language.
**
** * Silent "p" in words beginning with "ps": psalm, psyche
**
** * Silent "p" in words beginning with "pn": pneumonia, pneumatic
**
** * Silent "p" in words beginning with "pt": pterodactyl, ptolemaic
**
** * Silent "d" in words beginning with "dj": djinn, Djikarta
**
** * Silent "k" in words beginning with "kn": knight, Knuthson
**
** * Silent "g" in words beginning with "gn": gnarly, gnome, gnat
**
** * "Mac" versus "Mc" beginning Scottish surnames
**
** * "Tch" sounds in Slavic words: Tchaikovsky vs. Chaykovsky
**
** * The letter "j" pronounced like "h" in Spanish: LaJolla
**
** * Words beginning with "wr" versus "r": write vs. rite
**
** * Miscellaneous problem words such as "debt", "tsetse",
** "Nguyen", "Van Nuyes".
*/
#if SQLITE_CORE
# include "sqliteInt.h"
#else
# include <string.h>
# include <stdio.h>
# include <stdlib.h>
|
︙ | | |
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
|
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
-
+
|
** 4 'D' Alveolar stops: D T
** 5 'H' Letter H at the beginning of a word
** 6 'L' Glide: L
** 7 'R' Semivowel: R
** 8 'M' Nasals: M N
** 9 'W' Letter W at the beginning of a word
** 10 'Y' Letter Y at the beginning of a word.
** 11 '9' A digit: 0 1 2 3 4 5 6 7 8 9
** 11 '9' Digits: 0 1 2 3 4 5 6 7 8 9
** 12 ' ' White space
** 13 '?' Other.
*/
#define CCLASS_SILENT 0
#define CCLASS_VOWEL 1
#define CCLASS_B 2
#define CCLASS_C 3
|
︙ | | |
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
195
196
197
198
199
200
201
202
203
204
205
206
207
208
|
-
|
case 'g':
case 'k': {
if( zIn[1]=='n' ){ zIn++; nIn--; }
break;
}
}
}
if( zIn[0]=='k' && zIn[1]=='n' ){ zIn++, nIn--; }
for(i=0; i<nIn; i++){
unsigned char c = zIn[i];
if( i+1<nIn ){
if( c=='w' && zIn[i+1]=='r' ) continue;
if( c=='d' && (zIn[i+1]=='j' || zIn[i+1]=='g') ) continue;
if( i+2<nIn ){
if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue;
|
︙ | | |
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
|
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
|
-
+
|
/* differ only in case */
return 0;
}
classFrom = characterClass(cPrev, cFrom);
classTo = characterClass(cPrev, cTo);
if( classFrom==classTo ){
/* Same character class */
return classFrom=='A' ? 25 : 40;
return 40;
}
if( classFrom>=CCLASS_B && classFrom<=CCLASS_Y
&& classTo>=CCLASS_B && classTo<=CCLASS_Y ){
/* Convert from one consonant to another, but in a different class */
return 75;
}
/* Any other substitution */
|
︙ | | |
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
|
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
|
-
+
|
**
** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
** of zB that matched the pattern in zA. If zA does not end with a '*',
** then this value is always the number of bytes in zB (i.e. strlen(zB)).
** If zA does end in a '*', then it is the number of bytes in the prefix
** of zB that was deemed to match zA.
*/
static int editdist1(const char *zA, const char *zB, int iLangId, int *pnMatch){
static int editdist1(const char *zA, const char *zB, int *pnMatch){
int nA, nB; /* Number of characters in zA[] and zB[] */
int xA, xB; /* Loop counters for zA[] and zB[] */
char cA, cB; /* Current character of zA and zB */
char cAprev, cBprev; /* Previous character of zA and zB */
char cAnext, cBnext; /* Next character in zA and zB */
int d; /* North-west cost value */
int dc = 0; /* North-west character value */
|
︙ | | |
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
|
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
|
-
+
-
+
|
#if 0
printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
#endif
/* Verify input strings and measure their lengths */
for(nA=0; zA[nA]; nA++){
if( zA[nA]>127 ) return -2;
if( zA[nA]&0x80 ) return -2;
}
for(nB=0; zB[nB]; nB++){
if( zB[nB]>127 ) return -2;
if( zB[nB]&0x80 ) return -2;
}
/* Special processing if either string is empty */
if( nA==0 ){
cBprev = dc;
for(xB=res=0; (cB = zB[xB])!=0; xB++){
res += insertOrDeleteCost(cBprev, cB, zB[xB+1])/FINAL_INS_COST_DIV;
|
︙ | | |
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
|
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
|
+
+
-
+
-
-
-
+
|
if( m[xB]<res ){
res = m[xB];
if( pnMatch ) *pnMatch = xB+nMatch;
}
}
}else{
res = m[nB];
/* In the current implementation, pnMatch is always NULL if zA does
** not end in "*" */
if( pnMatch ) *pnMatch = -1;
assert( pnMatch==0 );
}
sqlite3_free(toFree);
return res;
}
/*
** Function: editdist(A,B)
** editdist(A,B,langid)
**
** Return the cost of transforming string A into string B. Both strings
** must be pure ASCII text. If A ends with '*' then it is assumed to be
** a prefix of B and extra characters on the end of B have minimal additional
** cost.
*/
static void editdistSqlFunc(
sqlite3_context *context,
int argc,
sqlite3_value **argv
){
int langid = argc==2 ? 0 : sqlite3_value_int(argv[2]);
int res = editdist1(
(const char*)sqlite3_value_text(argv[0]),
(const char*)sqlite3_value_text(argv[1]),
langid, 0);
0);
if( res<0 ){
if( res==(-3) ){
sqlite3_result_error_nomem(context);
}else if( res==(-2) ){
sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
}else{
sqlite3_result_error(context, "NULL input to editdist()", -1);
|
︙ | | |
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
|
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
|
-
+
-
+
-
+
+
+
-
+
-
-
+
-
+
+
-
+
|
*/
static int editDist3ConfigLoad(
EditDist3Config *p, /* The edit distance configuration to load */
sqlite3 *db, /* Load from this database */
const char *zTable /* Name of the table from which to load */
){
sqlite3_stmt *pStmt;
int rc;
int rc, rc2;
char *zSql;
int iLangPrev = -9999;
EditDist3Lang *pLang;
zSql = sqlite3_mprintf("SELECT iLang, cFrom, cTo, iCost"
" FROM \"%w\" WHERE iLang>=0 ORDER BY iLang", zTable);
if( zSql==0 ) return SQLITE_NOMEM;
rc = sqlite3_prepare(db, zSql, -1, &pStmt, 0);
sqlite3_free(zSql);
if( rc ) return rc;
editDist3ConfigClear(p);
while( sqlite3_step(pStmt)==SQLITE_ROW ){
int iLang = sqlite3_column_int(pStmt, 0);
const char *zFrom = (const char*)sqlite3_column_text(pStmt, 1);
int nFrom = sqlite3_column_bytes(pStmt, 1);
int nFrom = zFrom ? sqlite3_column_bytes(pStmt, 1) : 0;
const char *zTo = (const char*)sqlite3_column_text(pStmt, 2);
int nTo = sqlite3_column_bytes(pStmt, 2);
int nTo = zTo ? sqlite3_column_bytes(pStmt, 2) : 0;
int iCost = sqlite3_column_int(pStmt, 3);
assert( zFrom!=0 || nFrom==0 );
assert( zTo!=0 || nTo==0 );
if( nFrom>100 || nFrom<0 || nTo>100 || nTo<0 ) continue;
if( nFrom>100 || nTo>100 ) continue;
if( iCost<0 ) continue;
if( iLang!=iLangPrev ){
EditDist3Lang *pNew;
p->nLang++;
pNew = sqlite3_realloc(p->a, p->nLang*sizeof(p->a[0]));
pNew = sqlite3_realloc(p->a, (p->nLang+1)*sizeof(p->a[0]));
if( pNew==0 ){ rc = SQLITE_NOMEM; break; }
p->a = pNew;
pLang = &p->a[p->nLang-1];
pLang = &p->a[p->nLang];
p->nLang++;
pLang->iLang = iLang;
pLang->iInsCost = 100;
pLang->iDelCost = 100;
pLang->iSubCost = 200;
pLang->iSubCost = 150;
pLang->pCost = 0;
iLangPrev = iLang;
}
if( nFrom==1 && zFrom[0]=='?' && nTo==0 ){
pLang->iDelCost = iCost;
}else if( nFrom==0 && nTo==1 && zTo[0]=='?' ){
pLang->iInsCost = iCost;
|
︙ | | |
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
|
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
|
-
+
+
|
pCost->iCost = iCost;
memcpy(pCost->a, zFrom, nFrom);
memcpy(pCost->a + nFrom, zTo, nTo);
pCost->pNext = pLang->pCost;
pLang->pCost = pCost;
}
}
sqlite3_finalize(pStmt);
rc2 = sqlite3_finalize(pStmt);
if( rc==SQLITE_OK ) rc = rc2;
return rc;
}
/*
** Return the length (in bytes) of a utf-8 character. Or return a maximum
** of N.
*/
|
︙ | | |
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
|
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
|
-
+
|
}
/*
** Return TRUE (non-zero) if the From side of the given cost matches
** the given string.
*/
static int matchFrom(EditDist3Cost *p, const char *z, int n){
if( p->nFrom>n ) return 0;
assert( p->nFrom<=n );
if( memcmp(p->a, z, p->nFrom)!=0 ) return 0;
return 1;
}
/*
** Return TRUE (non-zero) if the next FROM character and the next TO
** character are the same.
|
︙ | | |
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
|
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
|
+
+
|
const char *z,
int n
){
EditDist3FromString *pStr;
EditDist3Cost *p;
int i;
if( z==0 ) return 0;
if( n<0 ) n = (int)strlen(z);
pStr = sqlite3_malloc( sizeof(*pStr) + sizeof(pStr->a[0])*n + n + 1 );
if( pStr==0 ) return 0;
pStr->a = (EditDist3From*)&pStr[1];
memset(pStr->a, 0, sizeof(pStr->a[0])*n);
pStr->n = n;
pStr->z = (char*)&pStr->a[n];
memcpy(pStr->z, z, n+1);
if( n && z[n-1]=='*' ){
pStr->isPrefix = 1;
n--;
pStr->n--;
|
︙ | | |
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
|
846
847
848
849
850
851
852
853
854
855
856
857
858
859
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
pStr = 0;
break;
}
}
return pStr;
}
#if 0 /* No longer used */
/*
** Count the bytes at the start of z1 and z2 that are identical, then
** trim the count so that it does not end in the middle of a multi-byte
** UTF8 character of z1.  Both inputs must be zero-terminated.
*/
static int editDist3PrefixLen(const char *z1, const char *z2){
  int nCommon;

  /* Advance over bytes that are equal in both strings. */
  for(nCommon=0; z1[nCommon]!=0 && z1[nCommon]==z2[nCommon]; nCommon++){}

  /* If the first differing byte of z1 is a UTF8 continuation byte
  ** (10xxxxxx) the match stopped part way through a character, so back
  ** up until that is no longer the case. */
  while( nCommon>0 && (z1[nCommon]&0xc0)==0x80 ){
    nCommon--;
  }
  return nCommon;
}
/*
** Count the bytes at the end of z1 (length n1) and z2 (length n2) that
** are identical, then trim the count so that the suffix does not begin
** on a UTF8 continuation byte of z1.  The return value is the suffix
** length in bytes.
*/
static int editDist3SuffixLen(const char *z1, int n1, const char *z2, int n2){
  const int nTotal = n1;   /* Original length of z1 */

  /* Walk backwards while the trailing bytes agree. */
  while( n1>0 && n2>0 ){
    if( z1[n1-1]!=z2[n2-1] ) break;
    n1--;
    n2--;
  }

  /* Move forward again past any continuation bytes (10xxxxxx) so that
  ** the suffix starts at the beginning of a character. */
  for(; n1<nTotal && (z1[n1]&0xc0)==0x80; n1++, n2++){}

  return nTotal - n1;
}
#endif /* 0 */
/*
** Update entry m[i] such that it is the minimum of its current value
** and m[j]+iCost.
**
** If the iCost is 1,000,000 or greater, then consider the cost to be
** infinite and skip the update.
*/
|
︙ | | |
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
|
894
895
896
897
898
899
900
901
902
903
904
905
906
907
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
|
EditDist3FromString f = *pFrom;
EditDist3To *a2;
unsigned int *m;
int szRow;
EditDist3Cost *p;
int res;
#if 0
/* Remove common prefix and suffix */
n = editDist3PrefixLen(f.z, z2);
if( f.n==n2 && n2==n ) return 0; /* Identical strings */
f.n -= n;
f.z += n;
f.a += n;
n2 -= n;
z2 += n;
if( f.isPrefix==0 ){
n = editDist3SuffixLen(f.z, f.n, z2, n2);
f.n -= n;
n2 -= n;
}
#endif
/* allocate the Wagner matrix and the aTo[] array for the TO string */
n = (f.n+1)*(n2+1);
n = (n+1)&~1;
m = sqlite3_malloc( n*sizeof(m[0]) + sizeof(a2[0])*n2 );
if( m==0 ) return -1; /* Out of memory */
a2 = (EditDist3To*)&m[n];
memset(a2, 0, sizeof(a2[0])*n2);
|
︙ | | |
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
|
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
|
-
+
|
if( matchTo(p, z2+i2, n2-i2) ){
updateCost(m, cxd+p->nFrom+szRow*p->nTo, cxd, p->iCost);
}
}
}
}
#if 0
#if 0 /* Enable for debugging */
printf(" ^");
for(i1=0; i1<f.n; i1++) printf(" %c-%2x", f.z[i1], f.z[i1]&0xff);
printf("\n ^:");
for(i1=0; i1<szRow; i1++){
int v = m[i1];
if( v>9999 ) printf(" ****");
else printf(" %4d", v);
|
︙ | | |
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
|
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
|
+
+
+
-
+
+
|
pFrom = editDist3FromStringNew(pLang, zA, nA);
if( pFrom==0 ){
sqlite3_result_error_nomem(context);
return;
}
dist = editDist3Core(pFrom, zB, nB, pLang, 0);
editDist3FromStringDelete(pFrom);
if( dist==(-1) ){
sqlite3_result_error_nomem(context);
}else{
sqlite3_result_int(context, dist);
sqlite3_result_int(context, dist);
}
}
}
/*
** Register the editDist3 function with SQLite
*/
static int editDist3Install(sqlite3 *db){
|
︙ | | |
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
|
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
|
+
+
-
+
|
/*
** Return the value of the first UTF-8 character in the string.
*/
static int utf8Read(const unsigned char *z, int n, int *pSize){
int c, i;
/* All callers to this routine (in the current implementation)
** always have n>0. */
if( n==0 ){
if( NEVER(n==0) ){
c = i = 0;
}else{
c = z[0];
i = 1;
if( c>=0xc0 ){
c = sqlite3Utf8Trans1[c-0xc0];
while( i<n && (z[i] & 0xc0)==0x80 ){
|
︙ | | |
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
|
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
|
-
+
-
-
+
+
|
** The returned string might contain more characters than the input.
**
** Space to hold the returned string comes from sqlite3_malloc() and
** should be freed by the caller.
*/
static unsigned char *transliterate(const unsigned char *zIn, int nIn){
unsigned char *zOut = sqlite3_malloc( nIn*4 + 1 );
int i, c, sz, nOut;
int c, sz, nOut;
if( zOut==0 ) return 0;
i = nOut = 0;
while( i<nIn ){
nOut = 0;
while( nIn>0 ){
c = utf8Read(zIn, nIn, &sz);
zIn += sz;
nIn -= sz;
if( c<=127 ){
zOut[nOut++] = c;
}else{
int xTop, xBtm, x;
|
︙ | | |
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
|
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
|
}
sqlite3_result_int(context, res);
}
/* End transliterate
******************************************************************************
******************************************************************************
** Begin Pollock & Zamora SPEEDCOP style keying functions.
*/
/*
** The Pollock & Zamora skeleton function.  Move all consonants to the
** front and all vowels to the end, removing duplicates.  Except if the
** first letter is a vowel then it remains as the first letter.
**
** Only the lower-case ASCII letters 'a'..'z' of zIn[] are considered;
** every other byte is skipped.  At most one byte is written to zOut[]
** for each input letter, followed by a zero terminator.
*/
static void pollockSkeletonKey(const char *zIn, char *zOut){
  int i, j;
  unsigned char c;
  char seen[26];
  static const unsigned char isVowel[] = { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
                     0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 };

  memset(seen, 0, sizeof(seen));

  /* Pass 1: emit the first letter whatever it is, and after that every
  ** consonant not yet seen.  A vowel is skipped only once something has
  ** been emitted (j>0), so a leading vowel stays in first position. */
  for(i=j=0; (c = (unsigned char)zIn[i])!=0; i++){
    if( c<'a' || c>'z' ) continue;
    if( j>0 && isVowel[c-'a'] ) continue;
    if( seen[c-'a'] ) continue;
    seen[c-'a'] = 1;
    zOut[j++] = c;
  }

  /* Pass 2: append each vowel not already emitted, in input order. */
  for(i=0; (c = (unsigned char)zIn[i])!=0; i++){
    if( c<'a' || c>'z' ) continue;
    if( seen[c-'a'] ) continue;
    if( !isVowel[c-'a'] ) continue;
    seen[c-'a'] = 1;
    zOut[j++] = c;
  }
  zOut[j] = 0;
}
/*
** Function:    pollock_skeleton(X)
**
** SQL wrapper around pollockSkeletonKey().  X is expected to be a string
** of lower-case letters.  The key is built in a buffer obtained from
** sqlite3_malloc() and handed to SQLite with sqlite3_free as destructor.
** If the text of X is NULL no result is set; an allocation failure is
** reported with sqlite3_result_error_nomem().
*/
static void pollockSkeletonSqlFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  const char *zWord = (const char*)sqlite3_value_text(argv[0]);
  int nWord = sqlite3_value_bytes(argv[0]);
  char *zKey;

  if( zWord==0 ) return;

  /* The key never holds more letters than the input, so nWord+1 bytes
  ** are enough for the key and its terminator. */
  zKey = sqlite3_malloc( nWord + 1 );
  if( zKey==0 ){
    sqlite3_result_error_nomem(context);
    return;
  }
  pollockSkeletonKey(zWord, zKey);
  sqlite3_result_text(context, (char*)zKey, -1, sqlite3_free);
}
/*
** The Pollock & Zamora omission key.
**
** The output begins with each distinct consonant of zIn[], arranged in
** this fixed order:
**
**           jkqxzvwybfmgpdhclntsr
**
** and ends with each distinct vowel of zIn[] in the order of first
** appearance.  Bytes outside 'a'..'z' are ignored.  zOut[] is always
** zero-terminated.
*/
static void pollockOmissionKey(const char *zIn, char *zOut){
  static const unsigned char isVowel[] = { 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
                     0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0 };
  static const unsigned char constOrder[] = "jkqxzvwybfmgpdhclntsr";
  char seen[26];             /* seen[x] set once letter 'a'+x is handled */
  const unsigned char *z;    /* Cursor over the string being scanned */
  int nOut = 0;              /* Bytes written to zOut[] so far */
  int idx;                   /* Letter index, 0 for 'a' through 25 for 'z' */

  memset(seen, 0, sizeof(seen));

  /* Step 1: record which consonants occur in the input. */
  for(z=(const unsigned char*)zIn; *z; z++){
    if( *z>='a' && *z<='z' ){
      idx = *z - 'a';
      if( !isVowel[idx] ) seen[idx] = 1;
    }
  }

  /* Step 2: emit the recorded consonants in the fixed order. */
  for(z=constOrder; *z; z++){
    if( seen[*z - 'a'] ) zOut[nOut++] = (char)*z;
  }

  /* Step 3: emit each vowel once, in input order. */
  for(z=(const unsigned char*)zIn; *z; z++){
    if( *z<'a' || *z>'z' ) continue;
    idx = *z - 'a';
    if( isVowel[idx] && !seen[idx] ){
      seen[idx] = 1;
      zOut[nOut++] = (char)*z;
    }
  }
  zOut[nOut] = 0;
}
/*
** Function:    pollock_omission(X)
**
** SQL wrapper around pollockOmissionKey().  X is expected to be a string
** of lower-case letters.  The key is built in a buffer obtained from
** sqlite3_malloc() and handed to SQLite with sqlite3_free as destructor.
** If the text of X is NULL no result is set; an allocation failure is
** reported with sqlite3_result_error_nomem().
*/
static void pollockOmissionSqlFunc(
  sqlite3_context *context,
  int argc,
  sqlite3_value **argv
){
  const char *zWord = (const char*)sqlite3_value_text(argv[0]);
  int nWord = sqlite3_value_bytes(argv[0]);
  char *zKey;

  if( zWord==0 ) return;

  /* Each input letter contributes at most one output byte, so nWord+1
  ** bytes are enough for the key and its terminator. */
  zKey = sqlite3_malloc( nWord + 1 );
  if( zKey==0 ){
    sqlite3_result_error_nomem(context);
    return;
  }
  pollockOmissionKey(zWord, zKey);
  sqlite3_result_text(context, (char*)zKey, -1, sqlite3_free);
}
/* End SPEEDCOP keying functions
******************************************************************************
******************************************************************************
** Begin spellfix1 virtual table.
*/
/* Maximum length of a phonehash used for querying the shadow table */
#define SPELLFIX_MX_HASH 8
/* Maximum number of hash strings to examine per query */
#define SPELLFIX_MX_RUN 8
#define SPELLFIX_MX_RUN 1
typedef struct spellfix1_vtab spellfix1_vtab;
typedef struct spellfix1_cursor spellfix1_cursor;
/* Fuzzy-search virtual table object */
struct spellfix1_vtab {
sqlite3_vtab base; /* Base class - must be first */
|
︙ | | |
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
|
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
|
-
+
+
|
struct spellfix1_cursor {
sqlite3_vtab_cursor base; /* Base class - must be first */
spellfix1_vtab *pVTab; /* The table to which this cursor belongs */
char *zPattern; /* rhs of MATCH clause */
int nRow; /* Number of rows of content */
int nAlloc; /* Number of allocated rows */
int iRow; /* Current row of content */
int iLang; /* Value of the lang= constraint */
int iLang; /* Value of the langid= constraint */
int iTop; /* Value of the top= constraint */
int iScope; /* Value of the scope= constraint */
int nSearch; /* Number of vocabulary items checked */
sqlite3_stmt *pFullScan; /* Shadow query for a full table scan */
struct spellfix1_row { /* For each row of content */
sqlite3_int64 iRowid; /* Rowid for this row */
char *zWord; /* Text for this row */
int iRank; /* Rank for this row */
int iDistance; /* Distance from pattern for this row */
int iScore; /* Score for sorting */
int iMatchlen; /* Value of matchlen column (or -1) */
|
︙ | | |
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
|
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
|
+
+
-
+
|
char *zOut;
int i, j;
char c;
while( isspace(zIn[0]) ) zIn++;
zOut = sqlite3_mprintf("%s", zIn);
if( zOut==0 ) return 0;
i = (int)strlen(zOut);
#if 0 /* The parser will never leave spaces at the end */
while( i>0 && isspace(zOut[i-1]) ){ i--; }
#endif
zOut[i] = 0;
c = zOut[0];
if( c=='\'' || c=='"' ){
for(i=1, j=0; zOut[i]; i++){
for(i=1, j=0; ALWAYS(zOut[i]); i++){
zOut[j++] = zOut[i];
if( zOut[i]==c ){
if( zOut[i+1]==c ){
i++;
}else{
zOut[j-1] = 0;
break;
|
︙ | | |
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
|
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
|
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
|
const char *zModule = argv[0];
const char *zDbName = argv[1];
const char *zTableName = argv[2];
int nDbName;
int rc = SQLITE_OK;
int i;
if( argc<3 ){
*pzErr = sqlite3_mprintf(
"%s: wrong number of CREATE VIRTUAL TABLE arguments", argv[0]
);
rc = SQLITE_ERROR;
}else{
nDbName = strlen(zDbName);
pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
if( pNew==0 ){
rc = SQLITE_NOMEM;
}else{
memset(pNew, 0, sizeof(*pNew));
pNew->zDbName = (char*)&pNew[1];
memcpy(pNew->zDbName, zDbName, nDbName+1);
pNew->zTableName = sqlite3_mprintf("%s", zTableName);
pNew->db = db;
if( pNew->zTableName==0 ){
rc = SQLITE_NOMEM;
}else{
rc = sqlite3_declare_vtab(db,
"CREATE TABLE x(word,rank,distance,langid, "
"score, matchlen, phonehash, "
"top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
"soundslike HIDDEN, command HIDDEN)"
);
nDbName = strlen(zDbName);
pNew = sqlite3_malloc( sizeof(*pNew) + nDbName + 1);
if( pNew==0 ){
rc = SQLITE_NOMEM;
}else{
memset(pNew, 0, sizeof(*pNew));
pNew->zDbName = (char*)&pNew[1];
memcpy(pNew->zDbName, zDbName, nDbName+1);
pNew->zTableName = sqlite3_mprintf("%s", zTableName);
pNew->db = db;
if( pNew->zTableName==0 ){
rc = SQLITE_NOMEM;
}else{
rc = sqlite3_declare_vtab(db,
"CREATE TABLE x(word,rank,distance,langid, "
"score, matchlen, phonehash HIDDEN, "
"top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
"soundslike HIDDEN, command HIDDEN)"
);
#define SPELLFIX_COL_WORD 0
#define SPELLFIX_COL_RANK 1
#define SPELLFIX_COL_DISTANCE 2
#define SPELLFIX_COL_LANGID 3
#define SPELLFIX_COL_SCORE 4
#define SPELLFIX_COL_MATCHLEN 5
#define SPELLFIX_COL_PHONEHASH 6
#define SPELLFIX_COL_TOP 7
#define SPELLFIX_COL_SCOPE 8
#define SPELLFIX_COL_SRCHCNT 9
#define SPELLFIX_COL_SOUNDSLIKE 10
#define SPELLFIX_COL_COMMAND 11
}
if( rc==SQLITE_OK && isCreate ){
sqlite3_uint64 r;
spellfix1DbExec(&rc, db,
"CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
" id INTEGER PRIMARY KEY,\n"
" rank INT,\n"
" langid INT,\n"
" word TEXT,\n"
" k1 TEXT,\n"
" k2 TEXT\n"
");\n",
zDbName, zTableName
);
sqlite3_randomness(sizeof(r), &r);
spellfix1DbExec(&rc, db,
"CREATE INDEX IF NOT EXISTS \"%w\".\"%w_index_%llx\" "
"ON \"%w_vocab\"(langid,k2);",
zDbName, zModule, r, zTableName
);
}
for(i=3; rc==SQLITE_OK && i<argc; i++){
if( memcmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
continue;
}
rc = SQLITE_ERROR;
}
}
}
*ppVTab = (sqlite3_vtab *)pNew;
}
if( rc==SQLITE_OK && isCreate ){
sqlite3_uint64 r;
spellfix1DbExec(&rc, db,
"CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
" id INTEGER PRIMARY KEY,\n"
" rank INT,\n"
" langid INT,\n"
" word TEXT,\n"
" k1 TEXT,\n"
" k2 TEXT\n"
");\n",
zDbName, zTableName
);
sqlite3_randomness(sizeof(r), &r);
spellfix1DbExec(&rc, db,
"CREATE INDEX IF NOT EXISTS \"%w\".\"%w_index_%llx\" "
"ON \"%w_vocab\"(langid,k2);",
zDbName, zModule, r, zTableName
);
}
for(i=3; rc==SQLITE_OK && i<argc; i++){
if( memcmp(argv[i],"edit_cost_table=",16)==0 && pNew->zCostTable==0 ){
pNew->zCostTable = spellfix1Dequote(&argv[i][16]);
if( pNew->zCostTable==0 ) rc = SQLITE_NOMEM;
continue;
}
*pzErr = sqlite3_mprintf("bad argument to spellfix1(): \"%s\"", argv[i]);
rc = SQLITE_ERROR;
}
}
if( rc && pNew ){
*ppVTab = 0;
spellfix1Uninit(0, &pNew->base);
}else{
*ppVTab = (sqlite3_vtab *)pNew;
}
return rc;
}
/*
** The xConnect and xCreate methods
*/
static int spellfix1Connect(
|
︙ | | |
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
|
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
|
+
+
+
+
|
int i;
for(i=0; i<pCur->nRow; i++){
sqlite3_free(pCur->a[i].zWord);
}
pCur->nRow = 0;
pCur->iRow = 0;
pCur->nSearch = 0;
if( pCur->pFullScan ){
sqlite3_finalize(pCur->pFullScan);
pCur->pFullScan = 0;
}
}
/*
** Resize the cursor to hold up to N rows of content
*/
static void spellfix1ResizeCursor(spellfix1_cursor *pCur, int N){
struct spellfix1_row *aNew;
|
︙ | | |
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
|
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
|
-
+
|
iDistTerm = i;
}
}
if( iPlan&1 ){
int idx = 2;
pIdxInfo->idxNum = iPlan;
if( pIdxInfo->nOrderBy==1
&& pIdxInfo->aOrderBy[0].iColumn==4
&& pIdxInfo->aOrderBy[0].iColumn==SPELLFIX_COL_SCORE
&& pIdxInfo->aOrderBy[0].desc==0
){
pIdxInfo->orderByConsumed = 1; /* Default order by iScore */
}
if( iPlan&2 ){
pIdxInfo->aConstraintUsage[iLangTerm].argvIndex = idx++;
pIdxInfo->aConstraintUsage[iLangTerm].omit = 1;
|
︙ | | |
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
|
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
|
+
-
|
int iScope = p->iScope;
spellfix1_cursor *pCur = p->pCur;
sqlite3_stmt *pStmt = p->pStmt;
char zHash1[SPELLFIX_MX_HASH];
char zHash2[SPELLFIX_MX_HASH];
char *zClass;
int nClass;
int rc;
if( pCur->a==0 || p->rc ) return; /* Prior memory allocation failure */
if( p->nRun>=SPELLFIX_MX_RUN ) return;
zClass = (char*)phoneticHash((unsigned char*)zQuery, nQuery);
if( zClass==0 ){
p->rc = SQLITE_NOMEM;
return;
}
nClass = strlen(zClass);
if( nClass>SPELLFIX_MX_HASH-2 ){
|
︙ | | |
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
2728
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
|
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
|
+
+
+
-
-
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
-
+
+
-
+
|
}
memcpy(zHash1, zClass, iScope);
sqlite3_free(zClass);
zHash1[iScope] = 0;
memcpy(zHash2, zHash1, iScope);
zHash2[iScope] = 'Z';
zHash2[iScope+1] = 0;
#if SPELLFIX_MX_RUN>1
for(i=0; i<p->nRun; i++){
if( strcmp(p->azPrior[i], zHash1)==0 ) return;
}
#endif
assert( p->nRun<SPELLFIX_MX_RUN );
memcpy(p->azPrior[p->nRun++], zHash1, iScope+1);
sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC);
sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC);
if( sqlite3_bind_text(pStmt, 1, zHash1, -1, SQLITE_STATIC)==SQLITE_NOMEM
|| sqlite3_bind_text(pStmt, 2, zHash2, -1, SQLITE_STATIC)==SQLITE_NOMEM
){
p->rc = SQLITE_NOMEM;
return;
}
#if SPELLFIX_MX_RUN>1
for(i=0; i<pCur->nRow; i++){
if( pCur->a[i].iScore>iWorst ){
iWorst = pCur->a[i].iScore;
idxWorst = i;
}
}
#endif
while( sqlite3_step(pStmt)==SQLITE_ROW ){
int iMatchlen = -1;
iRank = sqlite3_column_int(pStmt, 2);
if( p->pMatchStr3 ){
int nWord = sqlite3_column_bytes(pStmt, 1);
zWord = (const char*)sqlite3_column_text(pStmt, 1);
iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
}else{
zK1 = (const char*)sqlite3_column_text(pStmt, 3);
if( zK1==0 ) continue;
iDist = editdist1(p->zPattern, zK1, pCur->iLang, 0);
iDist = editdist1(p->zPattern, zK1, 0);
}
if( iDist<0 ){
p->rc = SQLITE_NOMEM;
break;
}
pCur->nSearch++;
iScore = spellfix1Score(iDist,iRank);
if( p->iMaxDist>=0 ){
if( iDist>p->iMaxDist ) continue;
if( pCur->nRow>=pCur->nAlloc-1 ){
spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
if( pCur->a==0 ) break;
}
idx = pCur->nRow;
}else if( pCur->nRow<pCur->nAlloc ){
idx = pCur->nRow;
}else if( iScore<iWorst ){
idx = idxWorst;
sqlite3_free(pCur->a[idx].zWord);
}else{
continue;
}
pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
if( pCur->a[idx].zWord==0 ){
p->rc = SQLITE_NOMEM;
break;
}
pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
pCur->a[idx].iRank = iRank;
pCur->a[idx].iDistance = iDist;
pCur->a[idx].iScore = iScore;
pCur->a[idx].iMatchlen = iMatchlen;
memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
if( pCur->nRow==pCur->nAlloc ){
iWorst = pCur->a[0].iScore;
idxWorst = 0;
for(i=1; i<pCur->nRow; i++){
iScore = pCur->a[i].iScore;
if( iWorst<iScore ){
iWorst = iScore;
idxWorst = i;
}
}
}
}
sqlite3_reset(pStmt);
rc = sqlite3_reset(pStmt);
if( rc ) p->rc = rc;
}
/*
** This version of the xFilter method works if the MATCH term is present
** and we are doing a scan.
*/
static int spellfix1FilterForMatch(
spellfix1_cursor *pCur,
int idxNum,
int argc,
sqlite3_value **argv
){
const unsigned char *zMatchThis; /* RHS of the MATCH operator */
EditDist3FromString *pMatchStr3 = 0; /* zMatchThis as an editdist string */
char *zPattern; /* Transliteration of zMatchThis */
int nPattern; /* Length of zPattern */
int iLimit = 20; /* Max number of rows of output */
int iScope = 3; /* Use this many characters of zClass */
int iLang = 0; /* Language code */
char *zSql; /* SQL of shadow table query */
sqlite3_stmt *pStmt; /* Shadow table query */
sqlite3_stmt *pStmt = 0; /* Shadow table query */
int rc; /* Result code */
int idx = 1; /* Next available filter parameter */
spellfix1_vtab *p = pCur->pVTab; /* The virtual table that owns pCur */
MatchQuery x; /* For passing info to RunQuery() */
/* Load the cost table if we have not already done so */
if( p->zCostTable!=0 && p->pConfig3==0 ){
|
︙ | | |
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
2871
|
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
|
+
+
+
+
-
+
+
+
+
+
+
+
+
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
-
-
+
+
+
|
spellfix1ResetCursor(pCur);
spellfix1ResizeCursor(pCur, iLimit);
zMatchThis = sqlite3_value_text(argv[0]);
if( zMatchThis==0 ) return SQLITE_OK;
if( p->pConfig3 ){
x.pLang = editDist3FindLang(p->pConfig3, iLang);
pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
if( pMatchStr3==0 ){
x.rc = SQLITE_NOMEM;
goto filter_exit;
}
}else{
x.pLang = 0;
}
zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
sqlite3_free(pCur->zPattern);
pCur->zPattern = zPattern;
if( zPattern==0 ) return SQLITE_NOMEM;
if( zPattern==0 ){
x.rc = SQLITE_NOMEM;
goto filter_exit;
}
nPattern = strlen(zPattern);
if( zPattern[nPattern-1]=='*' ) nPattern--;
zSql = sqlite3_mprintf(
"SELECT id, word, rank, k1"
" FROM \"%w\".\"%w_vocab\""
" WHERE langid=%d AND k2>=?1 AND k2<?2",
p->zDbName, p->zTableName, iLang
);
if( zSql==0 ){
x.rc = SQLITE_NOMEM;
pStmt = 0;
goto filter_exit;
}
rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
sqlite3_free(zSql);
pCur->iLang = iLang;
x.pCur = pCur;
x.pStmt = pStmt;
x.zPattern = zPattern;
x.nPattern = nPattern;
x.pMatchStr3 = pMatchStr3;
x.iLang = iLang;
x.rc = rc;
x.pConfig3 = p->pConfig3;
if( x.rc==SQLITE_OK ){
spellfix1RunQuery(&x, zPattern, nPattern);
}
#if 0
/* Convert "ght" to "t" in the original pattern and try again */
if( x.rc==SQLITE_OK ){
int i, j; /* Loop counters */
char zQuery[50]; /* Space for alternative query string */
for(i=j=0; i<nPattern && i<sizeof(zQuery)-1; i++){
char c = zPattern[i];
if( c=='g' && i<nPattern-2 && zPattern[i+1]=='h' && zPattern[i+2]=='t' ){
i += 2;
c= 't';
}
zQuery[j++] = c;
}
zQuery[j] = 0;
if( j<i ){
spellfix1RunQuery(&x, zQuery, j);
}
}
#endif
if( pCur->a ){
qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
pCur->iTop = iLimit;
pCur->iScope = iScope;
}else{
x.rc = SQLITE_NOMEM;
}
filter_exit:
sqlite3_finalize(pStmt);
editDist3FromStringDelete(pMatchStr3);
return pCur->a ? x.rc : SQLITE_NOMEM;
return x.rc;
}
/*
** This version of xFilter handles a full-table scan case.
**
** The cursor is reset, then a SELECT over the "<table>_vocab" shadow
** table is prepared into pCur->pFullScan and stepped once so that the
** first row (if any) is ready to be read.
**
** On success with a row available, pCur->iRow is set to -1 while
** pCur->nRow stays 0, so that the iRow>=nRow end-of-file test is false.
** With no rows, iRow stays 0 and the cursor is immediately at EOF.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the SQL text cannot be
** allocated, or the error code from sqlite3_prepare_v2()/sqlite3_step().
** The idxNum, argc and argv parameters are not used.
*/
static int spellfix1FilterForFullScan(
  spellfix1_cursor *pCur,
  int idxNum,
  int argc,
  sqlite3_value **argv
){
  int rc;
  char *zSql;
  spellfix1_vtab *pVTab = pCur->pVTab;
  spellfix1ResetCursor(pCur);
  zSql = sqlite3_mprintf(
     "SELECT word, rank, NULL, langid, id FROM \"%w\".\"%w_vocab\"",
     pVTab->zDbName, pVTab->zTableName);
  if( zSql==0 ) return SQLITE_NOMEM;
  rc = sqlite3_prepare_v2(pVTab->db, zSql, -1, &pCur->pFullScan, 0);
  sqlite3_free(zSql);
  pCur->nRow = pCur->iRow = 0;
  if( rc==SQLITE_OK ){
    rc = sqlite3_step(pCur->pFullScan);
    /* A row is waiting: mark the cursor as "before nRow" so it is not EOF */
    if( rc==SQLITE_ROW ){ pCur->iRow = -1; rc = SQLITE_OK; }
    /* An empty shadow table is not an error */
    if( rc==SQLITE_DONE ){ rc = SQLITE_OK; }
  }else{
    /* Prepare failed: leave the cursor at EOF and report the error */
    pCur->iRow = 0;
  }
  return rc;
}
/*
** Called to "rewind" a cursor back to the beginning so that
** it starts its output over again. Always called at least once
** prior to any spellfix1Column, spellfix1Rowid, or spellfix1Eof call.
|
︙ | | |
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
|
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
|
-
+
+
+
+
+
+
+
+
-
+
+
+
+
+
+
+
+
+
+
+
+
+
|
/*
** Advance a cursor to its next row of output.
**
** Nothing happens if the cursor is already at end-of-file
** (iRow>=nRow).  Otherwise:
**
**   *  For a full-table scan (pCur->pFullScan is set) the shadow-table
**      statement is stepped.  If it does not produce another row the
**      cursor is moved to end-of-file by setting iRow to nRow.
**
**   *  For all other cursors the row index is incremented by one.
**
** The cursor is advanced exactly once per call.  Returns SQLITE_OK,
** or the error code from sqlite3_step() if stepping the full-scan
** statement fails for a reason other than running out of rows.
*/
static int spellfix1Next(sqlite3_vtab_cursor *cur){
  spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
  int rc = SQLITE_OK;
  if( pCur->iRow < pCur->nRow ){
    if( pCur->pFullScan ){
      rc = sqlite3_step(pCur->pFullScan);
      if( rc!=SQLITE_ROW ) pCur->iRow = pCur->nRow;
      /* Running out of rows is the normal way for a scan to end */
      if( rc==SQLITE_ROW || rc==SQLITE_DONE ) rc = SQLITE_OK;
    }else{
      pCur->iRow++;
    }
  }
  return rc;
}
/*
** End-of-file test for a cursor.
**
** Returns 1 once the current row index has reached or passed the row
** count (iRow>=nRow), and 0 while it is still below it.
*/
static int spellfix1Eof(sqlite3_vtab_cursor *cur){
  const spellfix1_cursor *pCursor = (const spellfix1_cursor *)cur;
  if( pCursor->iRow < pCursor->nRow ){
    return 0;
  }
  return 1;
}
/*
** Return columns from the current row.
*/
static int spellfix1Column(sqlite3_vtab_cursor *cur, sqlite3_context *ctx, int i){
static int spellfix1Column(
sqlite3_vtab_cursor *cur,
sqlite3_context *ctx,
int i
){
spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
if( pCur->pFullScan ){
if( i<=SPELLFIX_COL_LANGID ){
sqlite3_result_value(ctx, sqlite3_column_value(pCur->pFullScan, i));
}else{
sqlite3_result_null(ctx);
}
return SQLITE_OK;
}
switch( i ){
case SPELLFIX_COL_WORD: {
sqlite3_result_text(ctx, pCur->a[pCur->iRow].zWord, -1, SQLITE_STATIC);
break;
}
case SPELLFIX_COL_RANK: {
sqlite3_result_int(ctx, pCur->a[pCur->iRow].iRank);
|
︙ | | |
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
|
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
|
-
+
|
int nWord = strlen(zWord);
if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
char *zTranslit;
int res;
zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
if( !zTranslit ) return SQLITE_NOMEM;
res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen);
res = editdist1(pCur->zPattern, zTranslit, &iMatchlen);
sqlite3_free(zTranslit);
if( res<0 ) return SQLITE_NOMEM;
iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
}else{
iMatchlen = utf8Charlen(zWord, nWord);
}
}
|
︙ | | |
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
|
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
|
+
+
+
-
+
+
|
}
/*
** The rowid.
**
** Writes the rowid of the cursor's current row into *pRowid and returns
** SQLITE_OK.  For a full-table scan the value comes from column 4 of the
** pFullScan statement; otherwise it is the iRowid stored for the current
** entry of the cursor's row array.
*/
static int spellfix1Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid){
  spellfix1_cursor *pCur = (spellfix1_cursor*)cur;
  if( pCur->pFullScan ){
    *pRowid = sqlite3_column_int64(pCur->pFullScan, 4);
  }else{
    *pRowid = pCur->a[pCur->iRow].iRowid;
  }
  return SQLITE_OK;
}
/*
** The xUpdate() method.
*/
static int spellfix1Update(
|
︙ | | |
3062
3063
3064
3065
3066
3067
3068
3069
3070
3071
3072
3073
3074
3075
3076
3077
|
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
|
-
-
+
+
|
iRank, iLang, zWord, zK1, zK2
);
*pRowid = sqlite3_last_insert_rowid(db);
}else{
rowid = sqlite3_value_int64(argv[0]);
newRowid = *pRowid = sqlite3_value_int64(argv[1]);
spellfix1DbExec(&rc, db,
"UPDATE \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, lang=%d,"
" word=%Q, rank=%d, k1=%Q, k2=%Q WHERE id=%lld",
"UPDATE \"%w\".\"%w_vocab\" SET id=%lld, rank=%d, langid=%d,"
" word=%Q, k1=%Q, k2=%Q WHERE id=%lld",
p->zDbName, p->zTableName, newRowid, iRank, iLang,
zWord, zK1, zK2, rowid
);
}
sqlite3_free(zK1);
sqlite3_free(zK2);
}
|
︙ | | |
3092
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
|
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
|
+
+
|
spellfix1DbExec(&rc, db,
"ALTER TABLE \"%w\".\"%w_vocab\" RENAME TO \"%w_vocab\"",
p->zDbName, p->zTableName, zNewName
);
if( rc==SQLITE_OK ){
sqlite3_free(p->zTableName);
p->zTableName = zNewName;
}else{
sqlite3_free(zNewName);
}
return rc;
}
/*
** A virtual table module that provides fuzzy search.
|
︙ | | |
3133
3134
3135
3136
3137
3138
3139
3140
3141
3142
3143
3144
3145
3146
3147
3148
3149
3150
3151
3152
3153
3154
3155
3156
|
2775
2776
2777
2778
2779
2780
2781
2782
2783
2784
2785
2786
2787
2788
2789
2790
2791
2792
|
-
-
-
-
-
-
|
static int spellfix1Register(sqlite3 *db){
int nErr = 0;
int i;
nErr += sqlite3_create_function(db, "spellfix1_translit", 1, SQLITE_UTF8, 0,
transliterateSqlFunc, 0, 0);
nErr += sqlite3_create_function(db, "spellfix1_editdist", 2, SQLITE_UTF8, 0,
editdistSqlFunc, 0, 0);
nErr += sqlite3_create_function(db, "spellfix1_editdist", 3, SQLITE_UTF8, 0,
editdistSqlFunc, 0, 0);
nErr += sqlite3_create_function(db, "spellfix1_phonehash", 1, SQLITE_UTF8, 0,
phoneticHashSqlFunc, 0, 0);
nErr += sqlite3_create_function(db, "spellfix1_scriptcode", 1, SQLITE_UTF8, 0,
scriptCodeSqlFunc, 0, 0);
nErr += sqlite3_create_function(db, "pollock_skeleton", 1, SQLITE_UTF8, 0,
pollockSkeletonSqlFunc, 0, 0);
nErr += sqlite3_create_function(db, "pollock_omission", 1, SQLITE_UTF8, 0,
pollockOmissionSqlFunc, 0, 0);
nErr += sqlite3_create_module(db, "spellfix1", &spellfix1Module, 0);
nErr += editDist3Install(db);
/* Verify sanity of the translit[] table */
for(i=0; i<sizeof(translit)/sizeof(translit[0])-1; i++){
assert( translit[i].cFrom<translit[i+1].cFrom );
}
|
︙ | | |