Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch spellfix-matchlen Excluding Merge-Ins
This is equivalent to a diff from 4353e40b to f96d4e7b
2012-07-16
| ||
23:13 | Merge the spellfix1 changes for supporting matchlen into trunk. (check-in: 6f167adf user: drh tags: trunk) | |
22:16 | Fix compiler warnings about unused code in spellfix. Fix the editDist3Core() routine to return the matchlen in characters instead of bytes. (Closed-Leaf check-in: f96d4e7b user: drh tags: spellfix-matchlen) | |
14:52 | Fix a bug in the phonetic-hash routine in spellfix1: Even if the first character of a word is deemed to be "silent", do not apply the special handling intended for the first character of each word to the second. (check-in: 6333b42d user: dan tags: spellfix-matchlen) | |
10:25 | Merge trunk changes. (check-in: 90df64ab user: dan tags: spellfix-matchlen) | |
10:06 | If a specific database is nominated as part of a "PRAGMA integrity_check" or "PRAGMA quick_check" command, search for problems in the nominated database only. i.e. "PRAGMA main.quick_check" now only scans the main database, not all attached databases as before. (check-in: 4353e40b user: dan tags: trunk) | |
2012-07-13
| ||
16:15 | Update test_spellfix.c with latest changes. (check-in: cba2a658 user: dan tags: trunk) | |
Changes to src/test8.c.
︙ | |||
1377 1378 1379 1380 1381 1382 1383 | 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 | - - | */ static int register_spellfix_module( ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ |
︙ |
Changes to src/test_spellfix.c.
︙ | |||
96 97 98 99 100 101 102 103 104 105 106 107 108 109 | 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | + + + + + | ** For any given query this value is the same on all rows. ** ** score The score is a combination of rank and distance. The ** idea is that a lower score is better. The virtual table ** attempts to find words with the lowest score and ** by default (unless overridden by ORDER BY) returns ** results in order of increasing score. ** ** matchlen For prefix queries, the number of characters in the prefix ** of the returned value (word) that matched the query term. ** For non-prefix queries, the number of characters in the ** returned value. ** ** top (HIDDEN) For any query, this value is the same on all ** rows. It is an integer which is the maximum number of ** rows that will be output. The actual number of rows ** output might be less than this number, but it will never ** be greater. The default value for top is 20, but that ** can be changed for each query by including a term of |
︙ | |||
470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 | 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 | + - - | if( i+2<nIn ){ if( c=='t' && zIn[i+1]=='c' && zIn[i+2]=='h' ) continue; } } c = aClass[c&0x7f]; if( c==CCLASS_SPACE ) continue; if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue; aClass = midClass; if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){ continue; /* No vowels beside L or R */ } if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){ nOut--; /* No vowels beside L or R */ } cPrev = c; if( c==CCLASS_SILENT ) continue; cPrevX = c; |
︙ | |||
601 602 603 604 605 606 607 608 | 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 | + + + + + + - + + - + + | ** ** Smaller numbers mean a closer match. ** ** Negative values indicate an error: ** -1 One of the inputs is NULL ** -2 Non-ASCII characters on input ** -3 Unable to allocate memory ** ** If pnMatch is not NULL, then *pnMatch is set to the number of bytes ** of zB that matched the pattern in zA. If zA does not end with a '*', ** then this value is always the number of bytes in zB (i.e. strlen(zB)). ** If zA does end in a '*', then it is the number of bytes in the prefix ** of zB that was deemed to match zA. */ |
︙ | |||
733 734 735 736 737 738 739 | 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 | - + + + + + | cAprev = cA; } /* Free the wagner matrix and return the result */ if( cA=='*' ){ res = m[1]; for(xB=1; xB<=nB; xB++){ |
︙ | |||
760 761 762 763 764 765 766 | 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 | - + | int argc, sqlite3_value **argv ){ int langid = argc==2 ? 0 : sqlite3_value_int(argv[2]); int res = editdist1( (const char*)sqlite3_value_text(argv[0]), (const char*)sqlite3_value_text(argv[1]), |
︙ | |||
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 | 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 | + | pStr = 0; break; } } return pStr; } #if 0 /* No longer used */ /* ** Return the number of bytes in the common prefix of two UTF8 strings. ** Only complete characters are considered. */ static int editDist3PrefixLen(const char *z1, const char *z2){ int n = 0; while( z1[n] && z1[n]==z2[n] ){ n++; } |
︙ | |||
1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 | 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 | + | */ static int editDist3SuffixLen(const char *z1, int n1, const char *z2, int n2){ int origN1 = n1; while( n1>0 && n2>0 && z1[n1-1]==z2[n2-1] ){ n1--; n2--; } while( n1<origN1 && (z1[n1]&0xc0)==0x80 ){ n1++; n2++; } return origN1 - n1; } #endif /* 0 */ /* ** Update entry m[i] such that it is the minimum of its current value ** and m[j]+iCost. ** ** If the iCost is 1,000,000 or greater, then consider the cost to be ** infinite and skip the update. |
︙ | |||
1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 | 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 | + + + + + + + - + + | if( b<m[i] ) m[i] = b; } } /* Compute the edit distance between two strings. ** ** If an error occurs, return a negative number which is the error code. ** ** If pnMatch is not NULL, then *pnMatch is set to the number of characters ** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does ** not contain the pattern for a prefix-search, then this is always the number ** of characters in z2. If pFrom does contain a prefix search pattern, then ** it is the number of characters in the prefix of z2 that was deemed to ** match pFrom. */ static int editDist3Core( EditDist3FromString *pFrom, /* The FROM string */ const char *z2, /* The TO string */ int n2, /* Length of the TO string */ |
︙ | |||
1277 1278 1279 1280 1281 1282 1283 1284 | 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 | + - + + - - + + + + + + + + + + + | } printf("\n"); } #endif /* Free memory allocations and return the result */ res = (int)m[szRow*(n2+1)-1]; n = n2; if( f.isPrefix ){ |
︙ | |||
1340 1341 1342 1343 1344 1345 1346 | 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 | - + | int dist; pFrom = editDist3FromStringNew(pLang, zA, nA); if( pFrom==0 ){ sqlite3_result_error_nomem(context); return; } |
︙ | |||
1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 | 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 | + + + + + + + + + + + + + + + | c = (c<<6) + (0x3f & z[i++]); } } } *pSize = i; return c; } /* ** Return the number of characters in the utf-8 string in the nIn byte ** buffer pointed to by zIn. */ static int utf8Charlen(const char *zIn, int nIn){ int i; int nChar = 0; for(i=0; i<nIn; nChar++){ int sz; utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz); i += sz; } return nChar; } /* ** Table of translations from unicode characters into ASCII. */ static const struct { unsigned short int cFrom; unsigned char cTo0, cTo1; |
︙ | |||
1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 | 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | } if( c ) zOut[nOut++] = '?'; } } zOut[nOut] = 0; return zOut; } /* ** Return the number of characters in the shortest prefix of the input ** string that transliterates to an ASCII string nTrans bytes or longer. ** Or, if the transliteration of the input string is less than nTrans ** bytes in size, return the number of characters in the input string. */ static int translen_to_charlen(const char *zIn, int nIn, int nTrans){ int i, c, sz, nOut; int nChar; i = nOut = 0; for(nChar=0; i<nIn && nOut<nTrans; nChar++){ c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz); i += sz; nOut++; if( c>=128 ){ int xTop, xBtm, x; xTop = sizeof(translit)/sizeof(translit[0]) - 1; xBtm = 0; while( xTop>=xBtm ){ x = (xTop + xBtm)/2; if( translit[x].cFrom==c ){ if( translit[x].cTo1 ) nOut++; if( c==0x0429 || c== 0x0449 ) nOut += 2; break; }else if( translit[x].cFrom>c ){ xTop = x-1; }else{ xBtm = x+1; } } } } return nChar; } /* ** spellfix1_translit(X) ** ** Convert a string that contains non-ASCII Roman characters into ** pure ASCII. */ |
︙ | |||
2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 | 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 | + + | EditDist3Config *pConfig3; /* Parsed edit distance costs */ }; /* Fuzzy-search cursor object */ struct spellfix1_cursor { sqlite3_vtab_cursor base; /* Base class - must be first */ spellfix1_vtab *pVTab; /* The table to which this cursor belongs */ char *zPattern; /* rhs of MATCH clause */ int nRow; /* Number of rows of content */ int nAlloc; /* Number of allocated rows */ int iRow; /* Current row of content */ int iLang; /* Value of the lang= constraint */ int iTop; /* Value of the top= constraint */ int iScope; /* Value of the scope= constraint */ int nSearch; /* Number of vocabulary items checked */ struct spellfix1_row { /* For each row of content */ sqlite3_int64 iRowid; /* Rowid for this row */ char *zWord; /* Text for this row */ int iRank; /* Rank for this row */ int iDistance; /* Distance from pattern for this row */ int iScore; /* Score for sorting */ int iMatchlen; /* Value of matchlen column (or -1) */ char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */ } *a; }; /* ** Construct one or more SQL statements from the format string given ** and then evaluate those statements. The success code is written |
︙ | |||
2196 2197 2198 2199 2200 2201 2202 | 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 | - + | /* ** xConnect/xCreate method for the spellfix1 module. Arguments are: ** ** argv[0] -> module name ("spellfix1") ** argv[1] -> database name ** argv[2] -> table name |
︙ | |||
2234 2235 2236 2237 2238 2239 2240 | 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 | - - - + + + + + - - - - - - + + + + + + | memcpy(pNew->zDbName, zDbName, nDbName+1); pNew->zTableName = sqlite3_mprintf("%s", zTableName); pNew->db = db; if( pNew->zTableName==0 ){ rc = SQLITE_NOMEM; }else{ rc = sqlite3_declare_vtab(db, |
︙ | |||
2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 | 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 | + | /* ** Close a fuzzy-search cursor. */ static int spellfix1Close(sqlite3_vtab_cursor *cur){ spellfix1_cursor *pCur = (spellfix1_cursor *)cur; spellfix1ResetCursor(pCur); spellfix1ResizeCursor(pCur, 0); sqlite3_free(pCur->zPattern); sqlite3_free(pCur); return SQLITE_OK; } /* ** Search for terms of these forms: ** |
︙ | |||
2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 | 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 | + - + - + | for(i=0; i<pCur->nRow; i++){ if( pCur->a[i].iScore>iWorst ){ iWorst = pCur->a[i].iScore; idxWorst = i; } } while( sqlite3_step(pStmt)==SQLITE_ROW ){ int iMatchlen = -1; iRank = sqlite3_column_int(pStmt, 2); if( p->pMatchStr3 ){ int nWord = sqlite3_column_bytes(pStmt, 1); zWord = (const char*)sqlite3_column_text(pStmt, 1); |
︙ | |||
2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 | 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 | + | continue; } pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1)); pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0); pCur->a[idx].iRank = iRank; pCur->a[idx].iDistance = iDist; pCur->a[idx].iScore = iScore; pCur->a[idx].iMatchlen = iMatchlen; memcpy(pCur->a[idx].zHash, zHash1, iScope+1); if( pCur->nRow<pCur->nAlloc ) pCur->nRow++; if( pCur->nRow==pCur->nAlloc ){ iWorst = pCur->a[0].iScore; idxWorst = 0; for(i=1; i<pCur->nRow; i++){ iScore = pCur->a[i].iScore; |
︙ | |||
2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 | 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 | + + | if( p->pConfig3 ){ x.pLang = editDist3FindLang(p->pConfig3, iLang); pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1); }else{ x.pLang = 0; } zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0])); sqlite3_free(pCur->zPattern); pCur->zPattern = zPattern; if( zPattern==0 ) return SQLITE_NOMEM; nPattern = strlen(zPattern); if( zPattern[nPattern-1]=='*' ) nPattern--; zSql = sqlite3_mprintf( "SELECT id, word, rank, k1" " FROM \"%w\".\"%w_vocab\"" " WHERE langid=%d AND k2>=?1 AND k2<?2", |
︙ | |||
2742 2743 2744 2745 2746 2747 2748 | 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 | - | if( pCur->a ){ qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare); pCur->iTop = iLimit; pCur->iScope = iScope; } sqlite3_finalize(pStmt); |
︙ | |||
2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 | 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 | + + + + + + + + + + + + + + + + + + + + + + + + | case SPELLFIX_COL_LANGID: { sqlite3_result_int(ctx, pCur->iLang); break; } case SPELLFIX_COL_SCORE: { sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore); break; } case SPELLFIX_COL_MATCHLEN: { int iMatchlen = pCur->a[pCur->iRow].iMatchlen; if( iMatchlen<0 ){ int nPattern = strlen(pCur->zPattern); char *zWord = pCur->a[pCur->iRow].zWord; int nWord = strlen(zWord); if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){ char *zTranslit; int res; zTranslit = (char *)transliterate((unsigned char *)zWord, nWord); if( !zTranslit ) return SQLITE_NOMEM; res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen); sqlite3_free(zTranslit); if( res<0 ) return SQLITE_NOMEM; iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen); }else{ iMatchlen = utf8Charlen(zWord, nWord); } } sqlite3_result_int(ctx, iMatchlen); break; } case SPELLFIX_COL_PHONEHASH: { sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC); break; } case SPELLFIX_COL_TOP: { sqlite3_result_int(ctx, pCur->iTop); |
︙ |
Added test/spellfix.test.