Index: src/test8.c ================================================================== --- src/test8.c +++ src/test8.c @@ -1379,12 +1379,10 @@ ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ - static sqlite3_module aMod[3]; - int iMod; sqlite3 *db; if( objc!=2 ){ Tcl_WrongNumArgs(interp, 1, objv, "DB"); return TCL_ERROR; Index: src/test_spellfix.c ================================================================== --- src/test_spellfix.c +++ src/test_spellfix.c @@ -98,10 +98,15 @@ ** score The score is a combination of rank and distance. The ** idea is that a lower score is better. The virtual table ** attempts to find words with the lowest score and ** by default (unless overridden by ORDER BY) returns ** results in order of increasing score. +** +** matchlen For prefix queries, the number of characters in the prefix +** of the returned value (word) that matched the query term. +** For non-prefix queries, the number of characters in the +** returned value. ** ** top (HIDDEN) For any query, this value is the same on all ** rows. It is an integer which is the maximum number of ** rows that will be output. 
The actually number of rows ** output might be less than this number, but it will never @@ -472,21 +477,20 @@ } } c = aClass[c&0x7f]; if( c==CCLASS_SPACE ) continue; if( c==CCLASS_OTHER && cPrev!=CCLASS_DIGIT ) continue; + aClass = midClass; if( c==CCLASS_VOWEL && (cPrevX==CCLASS_R || cPrevX==CCLASS_L) ){ continue; /* No vowels beside L or R */ } if( (c==CCLASS_R || c==CCLASS_L) && cPrevX==CCLASS_VOWEL ){ nOut--; /* No vowels beside L or R */ } cPrev = c; if( c==CCLASS_SILENT ) continue; cPrevX = c; - if( c==CCLASS_SPACE ) continue; - aClass = midClass; c = className[c]; if( c!=zOut[nOut-1] ) zOut[nOut++] = c; } zOut[nOut] = 0; return zOut; @@ -603,12 +607,18 @@ ** ** Negative values indicate an error: ** -1 One of the inputs is NULL ** -2 Non-ASCII characters on input ** -3 Unable to allocate memory +** +** If pnMatch is not NULL, then *pnMatch is set to the number of bytes +** of zB that matched the pattern in zA. If zA does not end with a '*', +** then this value is always the number of bytes in zB (i.e. strlen(zB)). +** If zA does end in a '*', then it is the number of bytes in the prefix +** of zB that was deemed to match zA. 
*/ -static int editdist1(const char *zA, const char *zB, int iLangId){ +static int editdist1(const char *zA, const char *zB, int iLangId, int *pnMatch){ int nA, nB; /* Number of characters in zA[] and zB[] */ int xA, xB; /* Loop counters for zA[] and zB[] */ char cA, cB; /* Current character of zA and zB */ char cAprev, cBprev; /* Previous character of zA and zB */ char cAnext, cBnext; /* Next character in zA and zB */ @@ -617,16 +627,18 @@ int res; /* Final result */ int *m; /* The cost matrix */ char *cx; /* Corresponding character values */ int *toFree = 0; /* Malloced space */ int mStack[60+15]; /* Stack space to use if not too much is needed */ + int nMatch = 0; /* Early out if either input is NULL */ if( zA==0 || zB==0 ) return -1; /* Skip any common prefix */ - while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; } + while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; } + if( pnMatch ) *pnMatch = nMatch; if( zA[0]==0 && zB[0]==0 ) return 0; #if 0 printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' '); #endif @@ -735,14 +747,18 @@ /* Free the wagner matrix and return the result */ if( cA=='*' ){ res = m[1]; for(xB=1; xB<=nB; xB++){ - if( m[xB]0 && n2>0 && z1[n1-1]==z2[n2-1] ){ n1--; n2--; } while( n1=128 ){ + int xTop, xBtm, x; + xTop = sizeof(translit)/sizeof(translit[0]) - 1; + xBtm = 0; + while( xTop>=xBtm ){ + x = (xTop + xBtm)/2; + if( translit[x].cFrom==c ){ + if( translit[x].cTo1 ) nOut++; + if( c==0x0429 || c== 0x0449 ) nOut += 2; + break; + }else if( translit[x].cFrom>c ){ + xTop = x-1; + }else{ + xBtm = x+1; + } + } + } + } + + return nChar; +} + /* ** spellfix1_translit(X) ** ** Convert a string that contains non-ASCII Roman characters into @@ -2090,10 +2181,11 @@ /* Fuzzy-search cursor object */ struct spellfix1_cursor { sqlite3_vtab_cursor base; /* Base class - must be first */ spellfix1_vtab *pVTab; /* The table to which this cursor belongs */ + char *zPattern; /* rhs of MATCH clause */ int nRow; /* Number of rows of content */ 
int nAlloc; /* Number of allocated rows */ int iRow; /* Current row of content */ int iLang; /* Value of the lang= constraint */ int iTop; /* Value of the top= constraint */ @@ -2103,10 +2195,11 @@ sqlite3_int64 iRowid; /* Rowid for this row */ char *zWord; /* Text for this row */ int iRank; /* Rank for this row */ int iDistance; /* Distance from pattern for this row */ int iScore; /* Score for sorting */ + int iMatchlen; /* Value of matchlen column (or -1) */ char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */ } *a; }; /* @@ -2198,11 +2291,11 @@ ** xConnect/xCreate method for the spellfix1 module. Arguments are: ** ** argv[0] -> module name ("spellfix1") ** argv[1] -> database name ** argv[2] -> table name -** argv[3].. -> optional arguments (currently ignored) +** argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter) */ static int spellfix1Init( int isCreate, sqlite3 *db, void *pAux, @@ -2236,25 +2329,27 @@ pNew->db = db; if( pNew->zTableName==0 ){ rc = SQLITE_NOMEM; }else{ rc = sqlite3_declare_vtab(db, - "CREATE TABLE x(word,rank,distance,langid," - "score, phonehash,top HIDDEN,scope HIDDEN,srchcnt HIDDEN," - "soundslike HIDDEN,command HIDDEN)" + "CREATE TABLE x(word,rank,distance,langid, " + "score, matchlen, phonehash, " + "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, " + "soundslike HIDDEN, command HIDDEN)" ); #define SPELLFIX_COL_WORD 0 #define SPELLFIX_COL_RANK 1 #define SPELLFIX_COL_DISTANCE 2 #define SPELLFIX_COL_LANGID 3 #define SPELLFIX_COL_SCORE 4 -#define SPELLFIX_COL_PHONEHASH 5 -#define SPELLFIX_COL_TOP 6 -#define SPELLFIX_COL_SCOPE 7 -#define SPELLFIX_COL_SRCHCNT 8 -#define SPELLFIX_COL_SOUNDSLIKE 9 -#define SPELLFIX_COL_COMMAND 10 +#define SPELLFIX_COL_MATCHLEN 5 +#define SPELLFIX_COL_PHONEHASH 6 +#define SPELLFIX_COL_TOP 7 +#define SPELLFIX_COL_SCOPE 8 +#define SPELLFIX_COL_SRCHCNT 9 +#define SPELLFIX_COL_SOUNDSLIKE 10 +#define SPELLFIX_COL_COMMAND 11 } if( rc==SQLITE_OK && isCreate ){ sqlite3_uint64 r; 
spellfix1DbExec(&rc, db, "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n" @@ -2348,10 +2443,11 @@ */ static int spellfix1Close(sqlite3_vtab_cursor *cur){ spellfix1_cursor *pCur = (spellfix1_cursor *)cur; spellfix1ResetCursor(pCur); spellfix1ResizeCursor(pCur, 0); + sqlite3_free(pCur->zPattern); sqlite3_free(pCur); return SQLITE_OK; } /* @@ -2581,19 +2677,20 @@ iWorst = pCur->a[i].iScore; idxWorst = i; } } while( sqlite3_step(pStmt)==SQLITE_ROW ){ + int iMatchlen = -1; iRank = sqlite3_column_int(pStmt, 2); if( p->pMatchStr3 ){ int nWord = sqlite3_column_bytes(pStmt, 1); zWord = (const char*)sqlite3_column_text(pStmt, 1); - iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang); + iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen); }else{ zK1 = (const char*)sqlite3_column_text(pStmt, 3); if( zK1==0 ) continue; - iDist = editdist1(p->zPattern, zK1, pCur->iLang); + iDist = editdist1(p->zPattern, zK1, pCur->iLang, 0); } pCur->nSearch++; iScore = spellfix1Score(iDist,iRank); if( p->iMaxDist>=0 ){ if( iDist>p->iMaxDist ) continue; @@ -2613,10 +2710,11 @@ pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1)); pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0); pCur->a[idx].iRank = iRank; pCur->a[idx].iDistance = iDist; pCur->a[idx].iScore = iScore; + pCur->a[idx].iMatchlen = iMatchlen; memcpy(pCur->a[idx].zHash, zHash1, iScope+1); if( pCur->nRownAlloc ) pCur->nRow++; if( pCur->nRow==pCur->nAlloc ){ iWorst = pCur->a[0].iScore; idxWorst = 0; @@ -2694,10 +2792,12 @@ pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1); }else{ x.pLang = 0; } zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0])); + sqlite3_free(pCur->zPattern); + pCur->zPattern = zPattern; if( zPattern==0 ) return SQLITE_NOMEM; nPattern = strlen(zPattern); if( zPattern[nPattern-1]=='*' ) nPattern--; zSql = sqlite3_mprintf( "SELECT id, word, rank, k1" @@ -2744,11 +2844,10 @@ qsort(pCur->a, pCur->nRow, 
sizeof(pCur->a[0]), spellfix1RowCompare); pCur->iTop = iLimit; pCur->iScope = iScope; } sqlite3_finalize(pStmt); - sqlite3_free(zPattern); editDist3FromStringDelete(pMatchStr3); return pCur->a ? x.rc : SQLITE_NOMEM; } /* @@ -2827,10 +2926,34 @@ break; } case SPELLFIX_COL_SCORE: { sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore); break; + } + case SPELLFIX_COL_MATCHLEN: { + int iMatchlen = pCur->a[pCur->iRow].iMatchlen; + if( iMatchlen<0 ){ + int nPattern = strlen(pCur->zPattern); + char *zWord = pCur->a[pCur->iRow].zWord; + int nWord = strlen(zWord); + + if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){ + char *zTranslit; + int res; + zTranslit = (char *)transliterate((unsigned char *)zWord, nWord); + if( !zTranslit ) return SQLITE_NOMEM; + res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen); + sqlite3_free(zTranslit); + if( res<0 ) return SQLITE_NOMEM; + iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen); + }else{ + iMatchlen = utf8Charlen(zWord, nWord); + } + } + + sqlite3_result_int(ctx, iMatchlen); + break; } case SPELLFIX_COL_PHONEHASH: { sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC); break; } ADDED test/spellfix.test Index: test/spellfix.test ================================================================== --- /dev/null +++ test/spellfix.test @@ -0,0 +1,147 @@ +# 2012 July 12 +# +# The author disclaims copyright to this source code. In place of +# a legal notice, here is a blessing: +# +# May you do good and not evil. +# May you find forgiveness for yourself and forgive others. +# May you share freely, never taking more than you give. 
+# +#*********************************************************************** +# + +set testdir [file dirname $argv0] +source $testdir/tester.tcl +set testprefix spellfix + +register_spellfix_module db + +set vocab { +rabbi rabbit rabbits rabble rabid rabies raccoon raccoons race raced racer +racers races racetrack racial racially racing rack racked racket racketeer +racketeering racketeers rackets racking racks radar radars radial radially +radian radiance radiant radiantly radiate radiated radiates radiating radiation +radiations radiator radiators radical radically radicals radices radii radio +radioactive radioastronomy radioed radiography radioing radiology radios radish +radishes radium radius radix radon raft rafter rafters rafts rag rage raged +rages ragged raggedly raggedness raging rags ragweed raid raided raider raiders +raiding raids rail railed railer railers railing railroad railroaded railroader +railroaders railroading railroads rails railway railways raiment rain rainbow +raincoat raincoats raindrop raindrops rained rainfall rainier rainiest raining +rains rainstorm rainy raise raised raiser raisers raises raisin raising rake +raked rakes raking rallied rallies rally rallying ram ramble rambler rambles +rambling ramblings ramification ramifications ramp rampage rampant rampart +ramps ramrod rams ran ranch ranched rancher ranchers ranches ranching rancid +random randomization randomize randomized randomizes randomly randomness randy +rang range ranged rangeland ranger rangers ranges ranging rangy rank ranked +ranker rankers rankest ranking rankings rankle rankly rankness ranks ransack +ransacked ransacking ransacks ransom ransomer ransoming ransoms rant ranted +ranter ranters ranting rants rap rapacious rape raped raper rapes rapid +rapidity rapidly rapids rapier raping rapport rapprochement raps rapt raptly +rapture raptures rapturous rare rarely rareness rarer rarest rarity rascal +rascally rascals rash rasher rashly rashness rasp raspberry rasped 
rasping +rasps raster rat rate rated rater raters rates rather ratification ratified +ratifies ratify ratifying rating ratings ratio ration rational rationale +rationales rationalities rationality rationalization rationalizations +rationalize rationalized rationalizes rationalizing rationally rationals +rationing rations ratios rats rattle rattled rattler rattlers rattles +rattlesnake rattlesnakes rattling raucous ravage ravaged ravager ravagers +ravages ravaging rave raved raven ravening ravenous ravenously ravens raves +ravine ravines raving ravings raw rawer rawest rawly rawness ray rays raze +razor razors re reabbreviate reabbreviated reabbreviates reabbreviating reach +reachability reachable reachably reached reacher reaches reaching reacquired +react reacted reacting reaction reactionaries reactionary reactions reactivate +reactivated reactivates reactivating reactivation reactive reactively +reactivity reactor reactors reacts read readability readable reader readers +readied readier readies readiest readily readiness reading readings readjusted +readout readouts reads ready readying real realest realign realigned realigning +realigns realism realist realistic realistically realists realities reality +} + +do_test 1.1 { + execsql { CREATE VIRTUAL TABLE t1 USING spellfix1 } + foreach word $vocab { + execsql { INSERT INTO t1(word) VALUES($word) } + } +} {} + +foreach {tn word res} { + 1 raxpi* {rasping 5 rasped 5 raspberry 6 rasp 4 rasps 4} + 2 ril* {rail 4 railway 4 railing 4 rails 4 railways 4} + 3 rilis* {realist 6 realistic 6 realistically 6 realists 6 realism 6} + 4 reail* {realities 3 reality 3 real 3 realest 3 realist 3} + 5 ras* {rasp 3 rash 3 rasped 3 rasping 3 rasps 3} + 6 realistss* {realists 8 realigns 8 realistic 9 realistically 9 realest 7} + 7 realistss {realists 8 realist 7 realigns 8 realistic 9 realest 7} + 8 rllation* {realities 9 reality 7 rallied 7 railed 4} + 9 renstom* {rainstorm 8 ransomer 6 ransom 6 ransoming 6 ransoms 6} +} { + 
do_execsql_test 1.2.$tn { + SELECT word, matchlen FROM t1 WHERE word MATCH $word LIMIT 5 + } $res +} + + +do_execsql_test 2.1 { + CREATE VIRTUAL TABLE t2 USING spellfix1; + INSERT INTO t2 (word, soundslike) VALUES('school', 'skuul'); + INSERT INTO t2 (word, soundslike) VALUES('psalm', 'sarm'); + SELECT word, matchlen FROM t2 WHERE word MATCH 'sar*' LIMIT 5; +} {psalm 4} + +do_execsql_test 2.2 { + SELECT word, matchlen FROM t2 WHERE word MATCH 'skol*' LIMIT 5; +} {school 6} + +set vocab { +kangaroo kanji kappa karate keel keeled keeling keels keen keener keenest +keenly keenness keep keeper keepers keeping keeps ken kennel kennels kept +kerchief kerchiefs kern kernel kernels kerosene ketchup kettle +kettles key keyboard keyboards keyed keyhole keying keynote keypad keypads keys +keystroke keystrokes keyword keywords kick kicked kicker kickers kicking +kickoff kicks kid kidded kiddie kidding kidnap kidnapper kidnappers kidnapping +kidnappings kidnaps kidney kidneys kids kill killed killer killers killing +killingly killings killjoy kills kilobit kilobits kiloblock kilobyte kilobytes +kilogram kilograms kilohertz kilohm kilojoule kilometer kilometers kiloton +kilovolt kilowatt kiloword kimono kin kind kinder kindergarten kindest +kindhearted kindle kindled kindles kindling kindly kindness kindred kinds +kinetic king kingdom kingdoms kingly kingpin kings kink kinky kinship kinsman +kiosk kiss kissed kisser kissers kisses kissing kit kitchen kitchenette +kitchens kite kited kites kiting kits kitten kittenish kittens kitty klaxon +kludge kludges klystron knack knapsack knapsacks knave knaves knead kneads knee +kneecap kneed kneeing kneel kneeled kneeling kneels knees knell knells knelt +knew knife knifed knifes knifing knight knighted knighthood knighting knightly +knights knit knits knives knob knobs knock knockdown knocked knocker knockers +knocking knockout knocks knoll knolls knot knots knotted knotting know knowable +knower knowhow knowing knowingly knowledge 
knowledgeable known knows knuckle +knuckled knuckles koala kosher kudo +} + +do_execsql_test 3.1 { + CREATE TABLE costs(iLang, cFrom, cTo, iCost); + INSERT INTO costs VALUES(0, 'a', 'e', 1); + INSERT INTO costs VALUES(0, 'e', 'i', 1); + INSERT INTO costs VALUES(0, 'i', 'o', 1); + INSERT INTO costs VALUES(0, 'o', 'u', 1); + INSERT INTO costs VALUES(0, 'u', 'a', 1); + CREATE VIRTUAL TABLE t3 USING spellfix1(edit_cost_table=costs); +} + +do_test 3.2 { + foreach w $vocab { + execsql { INSERT INTO t3(word) VALUES($w) } + } +} {} + +breakpoint +foreach {tn word res} { + 1 kos* {kosher 3 kiosk 4 kudo 2 kappa 1 keypad 1} + 2 kellj* {killjoy 5 killed 4 killingly 4 kill 4 killer 4} + 3 kellj {kill 4 kills 5 killjoy 7 keel 4 killed 6} +} { + do_execsql_test 3.3.$tn { + SELECT word, matchlen FROM t3 WHERE word MATCH $word LIMIT 5 + } $res +} + +finish_test