/ Check-in [4a582c4d]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Add the "matchlen" column to the spellfix virtual table.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | spellfix-matchlen
Files: files | file ages | folders
SHA1: 4a582c4d30c4d42caf007f9f3ae7604188f55c76
User & Date: dan 2012-07-12 19:43:54
Context
2012-07-13
11:09
Fix bug in spellfix1 xUpdate() method introduced by the previous commit. Closed-Leaf check-in: b31aafa5 user: dan tags: spellfix-matchlen
2012-07-12
19:43
Add the "matchlen" column to the spellfix virtual table. check-in: 4a582c4d user: dan tags: spellfix-matchlen
2012-06-30
22:22
Setup the necessary library paths for cross-compilation with MSVC. check-in: 7fac56ed user: mistachkin tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/test8.c.

  1365   1365     rc = sqlite3_declare_vtab(db, Tcl_GetString(objv[2]));
  1366   1366     if( rc!=SQLITE_OK ){
  1367   1367       Tcl_SetResult(interp, (char *)sqlite3_errmsg(db), TCL_VOLATILE);
  1368   1368       return TCL_ERROR;
  1369   1369     }
  1370   1370     return TCL_OK;
  1371   1371   }
         1372  +
         1373  +#include "test_spellfix.c"
         1374  +
         1375  +/*
         1376  +** Register the spellfix virtual table module.
         1377  +*/
         1378  +static int register_spellfix_module(
         1379  +  ClientData clientData, 
         1380  +  Tcl_Interp *interp,   
         1381  +  int objc,             
         1382  +  Tcl_Obj *CONST objv[]
         1383  +){
         1384  +  static sqlite3_module aMod[3];
         1385  +  int iMod;
         1386  +  sqlite3 *db;
         1387  +
         1388  +  if( objc!=2 ){
         1389  +    Tcl_WrongNumArgs(interp, 1, objv, "DB");
         1390  +    return TCL_ERROR;
         1391  +  }
         1392  +  if( getDbPointer(interp, Tcl_GetString(objv[1]), &db) ) return TCL_ERROR;
         1393  +
         1394  +  sqlite3Spellfix1Register(db);
         1395  +  return TCL_OK;
         1396  +}
  1372   1397   
  1373   1398   #endif /* ifndef SQLITE_OMIT_VIRTUALTABLE */
  1374   1399   
  1375   1400   /*
  1376   1401   ** Register commands with the TCL interpreter.
  1377   1402   */
  1378   1403   int Sqlitetest8_Init(Tcl_Interp *interp){
................................................................................
  1379   1404   #ifndef SQLITE_OMIT_VIRTUALTABLE
  1380   1405     static struct {
  1381   1406        char *zName;
  1382   1407        Tcl_ObjCmdProc *xProc;
  1383   1408        void *clientData;
  1384   1409     } aObjCmd[] = {
  1385   1410        { "register_echo_module",   register_echo_module, 0 },
         1411  +     { "register_spellfix_module",   register_spellfix_module, 0 },
  1386   1412        { "sqlite3_declare_vtab",   declare_vtab, 0 },
  1387   1413     };
  1388   1414     int i;
  1389   1415     for(i=0; i<sizeof(aObjCmd)/sizeof(aObjCmd[0]); i++){
  1390   1416       Tcl_CreateObjCommand(interp, aObjCmd[i].zName, 
  1391   1417           aObjCmd[i].xProc, aObjCmd[i].clientData, 0);
  1392   1418     }
  1393   1419   #endif
  1394   1420     return TCL_OK;
  1395   1421   }

Changes to src/test_spellfix.c.

   496    496   ** Smaller numbers mean a closer match.
   497    497   **
   498    498   ** Negative values indicate an error:
   499    499   **    -1  One of the inputs is NULL
   500    500   **    -2  Non-ASCII characters on input
   501    501   **    -3  Unable to allocate memory 
   502    502   */
   503         -static int editdist(const char *zA, const char *zB){
          503  +static int editdist(const char *zA, const char *zB, int *pnMatch){
   504    504     int nA, nB;            /* Number of characters in zA[] and zB[] */
   505    505     int xA, xB;            /* Loop counters for zA[] and zB[] */
   506    506     char cA, cB;           /* Current character of zA and zB */
   507    507     char cAprev, cBprev;   /* Previous character of zA and zB */
   508    508     int d;                 /* North-west cost value */
   509    509     int dc = 0;            /* North-west character value */
   510    510     int res;               /* Final result */
   511    511     int *m;                /* The cost matrix */
   512    512     char *cx;              /* Corresponding character values */
   513    513     int *toFree = 0;       /* Malloced space */
   514    514     int mStack[60+15];     /* Stack space to use if not too much is needed */
          515  +  int nMatch = 0;
   515    516   
   516    517     /* Early out if either input is NULL */
   517    518     if( zA==0 || zB==0 ) return -1;
   518    519   
   519    520     /* Skip any common prefix */
   520         -  while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; }
          521  +  while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
          522  +  if( pnMatch ) *pnMatch = nMatch;
   521    523     if( zA[0]==0 && zB[0]==0 ) return 0;
   522    524   
   523    525   #if 0
   524    526     printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
   525    527   #endif
   526    528   
   527    529     /* Verify input strings and measure their lengths */
................................................................................
   620    622         cx[xB] = ncx;
   621    623         cBprev = cB;
   622    624       }
   623    625       cAprev = cA;
   624    626     }
   625    627   
   626    628     /* Free the wagner matrix and return the result */
   627         -  if( cA=='*' && nB>nA ){
   628         -    res = m[nA];
   629         -    for(xB=nA+1; xB<=nB; xB++){
   630         -      if( m[xB]<res ) res = m[xB];
          629  +  if( cA=='*' && nB>=nA ){
          630  +    res = m[0];
          631  +    for(xB=1; xB<=nB; xB++){
          632  +      if( m[xB]<res ){
          633  +        res = m[xB];
          634  +        if( pnMatch ) *pnMatch = nMatch + xB;
          635  +      }
   631    636       }
   632    637     }else{
   633    638       res = m[nB];
          639  +    if( pnMatch ) *pnMatch = -1;
   634    640     }
   635    641     sqlite3_free(toFree);
   636    642     return res;
   637    643   }
   638    644   
   639    645   /*
   640    646   ** Function:    editdist(A,B)
................................................................................
   646    652   */
   647    653   static void editdistSqlFunc(
   648    654     sqlite3_context *context,
   649    655     int argc,
   650    656     sqlite3_value **argv
   651    657   ){
   652    658     int res = editdist((const char*)sqlite3_value_text(argv[0]),
   653         -                    (const char*)sqlite3_value_text(argv[1]));
          659  +                    (const char*)sqlite3_value_text(argv[1]), 0);
   654    660     if( res<0 ){
   655    661       if( res==(-3) ){
   656    662         sqlite3_result_error_nomem(context);
   657    663       }else if( res==(-2) ){
   658    664         sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
   659    665       }else{
   660    666         sqlite3_result_error(context, "NULL input to editdist()", -1);
   661    667       }
   662    668     }else{ 
   663    669       sqlite3_result_int(context, res);
   664    670     }
   665    671   }
   666    672   
   667         -#if !SQLITE_CORE
          673  +#if !SQLITE_AMALGAMATION
   668    674   /*
   669    675   ** This lookup table is used to help decode the first byte of
   670    676   ** a multi-byte UTF8 character.
   671    677   */
   672    678   static const unsigned char sqlite3Utf8Trans1[] = {
   673    679     0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
   674    680     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
................................................................................
  1144   1150         }
  1145   1151         if( c ) zOut[nOut++] = '?';
  1146   1152       }
  1147   1153     }
  1148   1154     zOut[nOut] = 0;
  1149   1155     return zOut;
  1150   1156   }
         1157  +
         1158  +/*
         1159  +** Return the number of characters in the shortest prefix of the input 
         1160  +** string that transliterates to an ASCII string nTrans bytes or longer.
         1161  +** Or, if the transliteration of the input string is less than nTrans
         1162  +** bytes in size, return the number of characters in the input string.
         1163  +*/
         1164  +static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
         1165  +  int i, c, sz, nOut;
         1166  +  int nChar;
         1167  +
         1168  +  i = nOut = 0;
         1169  +  for(nChar=0; i<nIn && nOut<nTrans; nChar++){
         1170  +    c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
         1171  +    i += sz;
         1172  +
         1173  +    nOut++;
         1174  +    if( c>=128 ){
         1175  +      int xTop, xBtm, x;
         1176  +      xTop = sizeof(translit)/sizeof(translit[0]) - 1;
         1177  +      xBtm = 0;
         1178  +      while( xTop>=xBtm ){
         1179  +        x = (xTop + xBtm)/2;
         1180  +        if( translit[x].cFrom==c ){
         1181  +          if( translit[x].cTo1 ) nOut++;
         1182  +          if( c==0x0429 || c== 0x0449 ) nOut += 2;
         1183  +          break;
         1184  +        }else if( translit[x].cFrom>c ){
         1185  +          xTop = x-1;
         1186  +        }else{
         1187  +          xBtm = x+1;
         1188  +        }
         1189  +      }
         1190  +    }
         1191  +  }
         1192  +
         1193  +  return nChar;
         1194  +}
         1195  +
         1196  +/*
         1197  +** Return the number of characters in the utf-8 string in the nIn byte
         1198  +** buffer pointed to by zIn.
         1199  +*/
         1200  +static int utf8_charlen(const char *zIn, int nIn){
         1201  +  int i;
         1202  +  int nChar = 0;
         1203  +  for(i=0; i<nIn; nChar++){
         1204  +    int sz;
         1205  +    utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
         1206  +    i += sz;
         1207  +  }
         1208  +
         1209  +  return nChar;
         1210  +}
  1151   1211   
  1152   1212   /*
  1153   1213   **    spellfix1_translit(X)
  1154   1214   **
  1155   1215   ** Convert a string that contains non-ASCII Roman characters into 
  1156   1216   ** pure ASCII.
  1157   1217   */
................................................................................
  1250   1310     int nSearch;                 /* Number of vocabulary items checked */
  1251   1311     struct spellfix1_row {         /* For each row of content */
  1252   1312       sqlite3_int64 iRowid;         /* Rowid for this row */
  1253   1313       char *zWord;                  /* Text for this row */
  1254   1314       int iRank;                    /* Rank for this row */
  1255   1315       int iDistance;                /* Distance from pattern for this row */
  1256   1316       int iScore;                   /* Score for sorting */
         1317  +    int iMatchlen;                /* Length of prefix match */
  1257   1318     } *a; 
  1258   1319   };
  1259   1320   
  1260   1321   /*
  1261   1322   ** Construct one or more SQL statements from the format string given
  1262   1323   ** and then evaluate those statements. The success code is written
  1263   1324   ** into *pRc.
................................................................................
  1348   1409         pNew->zTableName = sqlite3_mprintf("%s", zTableName);
  1349   1410         pNew->db = db;
  1350   1411         if( pNew->zTableName==0 ){
  1351   1412           rc = SQLITE_NOMEM;
  1352   1413         }else{
  1353   1414           rc = sqlite3_declare_vtab(db, 
  1354   1415                "CREATE TABLE x(word,rank,distance,langid,"
  1355         -             "score,top HIDDEN,scope HIDDEN,srchcnt HIDDEN,"
         1416  +             "score,matchlen,top HIDDEN,scope HIDDEN,srchcnt HIDDEN,"
  1356   1417                "soundslike HIDDEN)"
  1357   1418           );
  1358   1419         }
  1359   1420         if( rc==SQLITE_OK && isCreate ){
  1360   1421           sqlite3_uint64 r;
  1361   1422           spellfix1DbExec(&rc, db,
  1362   1423              "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
................................................................................
  1615   1676        "  FROM \"%w\".\"%w_vocab\""
  1616   1677        " WHERE langid=%d AND k2 GLOB '%q*'",
  1617   1678        p->zDbName, p->zTableName, iLang, zClass
  1618   1679     );
  1619   1680     rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
  1620   1681     sqlite3_free(zSql);
  1621   1682     if( rc==SQLITE_OK ){
         1683  +    int nK1;
  1622   1684       const char *zK1;
  1623   1685       int iDist;
  1624   1686       int iRank;
  1625   1687       int iScore;
         1688  +    int iMatchlen = 0;
  1626   1689       int iWorst = 999999999;
  1627   1690       int idx;
  1628   1691       int idxWorst;
  1629   1692       int i;
  1630   1693   
  1631   1694       while( sqlite3_step(pStmt)==SQLITE_ROW ){
  1632   1695         zK1 = (const char*)sqlite3_column_text(pStmt, 3);
  1633   1696         if( zK1==0 ) continue;
  1634   1697         pCur->nSearch++;
  1635   1698         iRank = sqlite3_column_int(pStmt, 2);
  1636         -      iDist = editdist(zPattern, zK1);
         1699  +      iDist = editdist(zPattern, zK1, &iMatchlen);
  1637   1700         iScore = spellfix1Score(iDist,iRank);
         1701  +      nK1 = sqlite3_column_bytes(pStmt, 3);
         1702  +      if( iMatchlen>0 ){
         1703  +        iMatchlen = translen_to_charlen(zK1, nK1, iMatchlen);
         1704  +      }else if( iMatchlen<0 ){
         1705  +        iMatchlen = utf8_charlen(zK1, nK1);
         1706  +      }
  1638   1707         if( pCur->nRow<pCur->nAlloc ){
  1639   1708           idx = pCur->nRow;
  1640   1709         }else if( iScore<iWorst ){
  1641   1710           idx = idxWorst;
  1642   1711           sqlite3_free(pCur->a[idx].zWord);
  1643   1712         }else{
  1644   1713           continue;
  1645   1714         }
  1646   1715         pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
  1647   1716         pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
  1648   1717         pCur->a[idx].iRank = iRank;
  1649   1718         pCur->a[idx].iDistance = iDist;
  1650   1719         pCur->a[idx].iScore = iScore;
         1720  +      pCur->a[idx].iMatchlen = iMatchlen;
  1651   1721         if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
  1652   1722         if( pCur->nRow==pCur->nAlloc ){
  1653   1723           iWorst = pCur->a[0].iScore;
  1654   1724           idxWorst = 0;
  1655   1725           for(i=1; i<pCur->nRow; i++){
  1656   1726             iScore = pCur->a[i].iScore;
  1657   1727             if( iWorst<iScore ){
................................................................................
  1746   1816         break;
  1747   1817       }
  1748   1818       case 4: {
  1749   1819         sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
  1750   1820         break;
  1751   1821       }
  1752   1822       case 5: {
  1753         -      sqlite3_result_int(ctx, pCur->iTop);
         1823  +      sqlite3_result_int(ctx, pCur->a[pCur->iRow].iMatchlen);
  1754   1824         break;
  1755   1825       }
  1756   1826       case 6: {
  1757         -      sqlite3_result_int(ctx, pCur->iScope);
         1827  +      sqlite3_result_int(ctx, pCur->iTop);
  1758   1828         break;
  1759   1829       }
  1760   1830       case 7: {
         1831  +      sqlite3_result_int(ctx, pCur->iScope);
         1832  +      break;
         1833  +    }
         1834  +    case 8: {
  1761   1835         sqlite3_result_int(ctx, pCur->nSearch);
  1762   1836         break;
  1763   1837       }
  1764   1838       default: {
  1765   1839         sqlite3_result_null(ctx);
  1766   1840         break;
  1767   1841       }

Added test/spellfix.test.

            1  +# 2012 July 12
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +#
           12  +
           13  +set testdir [file dirname $argv0]
           14  +source $testdir/tester.tcl
           15  +set testprefix spellfix
           16  +
           17  +register_spellfix_module db
           18  +
           19  +set vocab {
           20  +rabbi rabbit rabbits rabble rabid rabies raccoon raccoons race raced racer
           21  +racers races racetrack racial racially racing rack racked racket racketeer
           22  +racketeering racketeers rackets racking racks radar radars radial radially
           23  +radian radiance radiant radiantly radiate radiated radiates radiating radiation
           24  +radiations radiator radiators radical radically radicals radices radii radio
           25  +radioactive radioastronomy radioed radiography radioing radiology radios radish
           26  +radishes radium radius radix radon raft rafter rafters rafts rag rage raged
           27  +rages ragged raggedly raggedness raging rags ragweed raid raided raider raiders
           28  +raiding raids rail railed railer railers railing railroad railroaded railroader
           29  +railroaders railroading railroads rails railway railways raiment rain rainbow
           30  +raincoat raincoats raindrop raindrops rained rainfall rainier rainiest raining
           31  +rains rainstorm rainy raise raised raiser raisers raises raisin raising rake
           32  +raked rakes raking rallied rallies rally rallying ram ramble rambler rambles
           33  +rambling ramblings ramification ramifications ramp rampage rampant rampart
           34  +ramps ramrod rams ran ranch ranched rancher ranchers ranches ranching rancid
           35  +random randomization randomize randomized randomizes randomly randomness randy
           36  +rang range ranged rangeland ranger rangers ranges ranging rangy rank ranked
           37  +ranker rankers rankest ranking rankings rankle rankly rankness ranks ransack
           38  +ransacked ransacking ransacks ransom ransomer ransoming ransoms rant ranted
           39  +ranter ranters ranting rants rap rapacious rape raped raper rapes rapid
           40  +rapidity rapidly rapids rapier raping rapport rapprochement raps rapt raptly
           41  +rapture raptures rapturous rare rarely rareness rarer rarest rarity rascal
           42  +rascally rascals rash rasher rashly rashness rasp raspberry rasped rasping
           43  +rasps raster rat rate rated rater raters rates rather ratification ratified
           44  +ratifies ratify ratifying rating ratings ratio ration rational rationale
           45  +rationales rationalities rationality rationalization rationalizations
           46  +rationalize rationalized rationalizes rationalizing rationally rationals
           47  +rationing rations ratios rats rattle rattled rattler rattlers rattles
           48  +rattlesnake rattlesnakes rattling raucous ravage ravaged ravager ravagers
           49  +ravages ravaging rave raved raven ravening ravenous ravenously ravens raves
           50  +ravine ravines raving ravings raw rawer rawest rawly rawness ray rays raze
           51  +razor razors re reabbreviate reabbreviated reabbreviates reabbreviating reach
           52  +reachability reachable reachably reached reacher reaches reaching reacquired
           53  +react reacted reacting reaction reactionaries reactionary reactions reactivate
           54  +reactivated reactivates reactivating reactivation reactive reactively
           55  +reactivity reactor reactors reacts read readability readable reader readers
           56  +readied readier readies readiest readily readiness reading readings readjusted
           57  +readout readouts reads ready readying real realest realign realigned realigning
           58  +realigns realism realist realistic realistically realists realities reality
           59  +}
           60  +
           61  +do_test 1.1 {
           62  +  execsql { CREATE VIRTUAL TABLE t1 USING spellfix1 }
           63  +  foreach word $vocab {
           64  +    execsql { INSERT INTO t1(word) VALUES($word) }
           65  +  }
           66  +} {}
           67  +
           68  +foreach {tn word res} {
           69  +  1   laxpi*     {rasping 5 rasped 5 raspberry 6 rasp 4 rasps 5}
           70  +  2   ril*       {rally 3 rallies 3 rallied 3 rallying 3 rawly 4}
           71  +  3   rilis*     {realist 6 realistic 6 realistically 6 realists 6 realism 6}
           72  +  4   reail*     {reality 4 real 4 realities 4 realest 4 realist 4}
           73  +  5   ras*       {rashness 3 rascal 3 rasher 3 rash 3 rascally 3}
           74  +  6   realistss* {realistically 7 realists 8 realigns 8 realistic 9 realest 7}
           75  +  7   realistss  {realists 8 realist 7 realigns 8 realistic 9 realest 7}
           76  +  8   lllation*  {
           77  +      rationale 6 ration 6 rationally 6 rationalizing 6 rationality 6
           78  +  }
           79  +  9   renstom*  {rainstorm 8 ransoming 6 ransomer 6 ransom 6 ransacks 6}
           80  +} {
           81  +  do_execsql_test 1.2.$tn {
           82  +    SELECT word, matchlen FROM t1 WHERE word MATCH $word LIMIT 5
           83  +  } $res
           84  +}
           85  +
           86  + 
           87  +finish_test