/ Check-in [f24b9d87]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the "matchlen" column to the spellfix1 virtual table.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | spellfix-matchlen
Files: files | file ages | folders
SHA1:f24b9d87f6b0e8b4d26669d5c1191f9280ba14a3
User & Date: dan 2012-07-13 19:26:34
Context
2012-07-16
10:25
Merge trunk changes. check-in: 90df64ab user: dan tags: spellfix-matchlen
2012-07-13
19:26
Add the "matchlen" column to the spellfix1 virtual table. check-in: f24b9d87 user: dan tags: spellfix-matchlen
16:15
Update test_spellfix.c with latest changes. check-in: cba2a658 user: dan tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/test_spellfix.c.

96
97
98
99
100
101
102





103
104
105
106
107
108
109
...
601
602
603
604
605
606
607






608
609
610
611
612
613
614
615
616
617
618
619
620
621

622
623
624
625
626
627

628
629
630
631
632
633
634
...
733
734
735
736
737
738
739
740



741
742
743

744
745
746
747
748
749
750
...
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
....
1138
1139
1140
1141
1142
1143
1144







1145
1146
1147
1148
1149
1150

1151
1152
1153
1154
1155
1156
1157
....
1278
1279
1280
1281
1282
1283
1284

1285
1286

1287

1288



1289
1290
1291
1292
1293
1294
1295
....
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
....
1413
1414
1415
1416
1417
1418
1419















1420
1421
1422
1423
1424
1425
1426
....
1863
1864
1865
1866
1867
1868
1869







































1870
1871
1872
1873
1874
1875
1876
....
2088
2089
2090
2091
2092
2093
2094

2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107

2108
2109
2110
2111
2112
2113
2114
....
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
....
2234
2235
2236
2237
2238
2239
2240
2241
2242

2243
2244
2245
2246
2247
2248
2249

2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
....
2346
2347
2348
2349
2350
2351
2352

2353
2354
2355
2356
2357
2358
2359
....
2579
2580
2581
2582
2583
2584
2585

2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
....
2611
2612
2613
2614
2615
2616
2617

2618
2619
2620
2621
2622
2623
2624
....
2692
2693
2694
2695
2696
2697
2698


2699
2700
2701
2702
2703
2704
2705
....
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
....
2825
2826
2827
2828
2829
2830
2831
























2832
2833
2834
2835
2836
2837
2838
**                  For any given query this value is the same on all rows.
**
**    score         The score is a combination of rank and distance.  The
**                  idea is that a lower score is better.  The virtual table
**                  attempts to find words with the lowest score and 
**                  by default (unless overridden by ORDER BY) returns
**                  results in order of increasing score.





**
**    top           (HIDDEN)  For any query, this value is the same on all
**                  rows.  It is an integer which is the maximum number of
**                  rows that will be output.  The actually number of rows
**                  output might be less than this number, but it will never
**                  be greater.  The default value for top is 20, but that
**                  can be changed for each query by including a term of
................................................................................
**
** Smaller numbers mean a closer match.
**
** Negative values indicate an error:
**    -1  One of the inputs is NULL
**    -2  Non-ASCII characters on input
**    -3  Unable to allocate memory 






*/
static int editdist1(const char *zA, const char *zB, int iLangId){
  int nA, nB;            /* Number of characters in zA[] and zB[] */
  int xA, xB;            /* Loop counters for zA[] and zB[] */
  char cA, cB;           /* Current character of zA and zB */
  char cAprev, cBprev;   /* Previous character of zA and zB */
  char cAnext, cBnext;   /* Next character in zA and zB */
  int d;                 /* North-west cost value */
  int dc = 0;            /* North-west character value */
  int res;               /* Final result */
  int *m;                /* The cost matrix */
  char *cx;              /* Corresponding character values */
  int *toFree = 0;       /* Malloced space */
  int mStack[60+15];     /* Stack space to use if not too much is needed */


  /* Early out if either input is NULL */
  if( zA==0 || zB==0 ) return -1;

  /* Skip any common prefix */
  while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; }

  if( zA[0]==0 && zB[0]==0 ) return 0;

#if 0
  printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
#endif

  /* Verify input strings and measure their lengths */
................................................................................
    cAprev = cA;
  }

  /* Free the wagner matrix and return the result */
  if( cA=='*' ){
    res = m[1];
    for(xB=1; xB<=nB; xB++){
      if( m[xB]<res ) res = m[xB];



    }
  }else{
    res = m[nB];

  }
  sqlite3_free(toFree);
  return res;
}

/*
** Function:    editdist(A,B)
................................................................................
  int argc,
  sqlite3_value **argv
){
  int langid = argc==2 ? 0 : sqlite3_value_int(argv[2]);
  int res = editdist1(
                    (const char*)sqlite3_value_text(argv[0]),
                    (const char*)sqlite3_value_text(argv[1]),
                    langid);
  if( res<0 ){
    if( res==(-3) ){
      sqlite3_result_error_nomem(context);
    }else if( res==(-2) ){
      sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
    }else{
      sqlite3_result_error(context, "NULL input to editdist()", -1);
................................................................................
    if( b<m[i] ) m[i] = b;
  }
}

/* Compute the edit distance between two strings.
**
** If an error occurs, return a negative number which is the error code.







*/
static int editDist3Core(
  EditDist3FromString *pFrom,  /* The FROM string */
  const char *z2,              /* The TO string */
  int n2,                      /* Length of the TO string */
  const EditDist3Lang *pLang   /* Edit weights for a particular language ID */

){
  int k, n;
  int i1, b1;
  int i2, b2;
  EditDist3FromString f = *pFrom;
  EditDist3To *a2;
  unsigned int *m;
................................................................................
    printf("\n");
  }
#endif

  /* Free memory allocations and return the result */
  res = (int)m[szRow*(n2+1)-1];
  if( f.isPrefix ){

    for(i2=f.n; i2<n2; i2++){
      int b = m[szRow*i2-1];

      if( b<res ) res = b;

    }



  }

editDist3Abort:
  for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
  sqlite3_free(m);
  return res;
}
................................................................................
    int dist;

    pFrom = editDist3FromStringNew(pLang, zA, nA);
    if( pFrom==0 ){
      sqlite3_result_error_nomem(context);
      return;
    }
    dist = editDist3Core(pFrom, zB, nB, pLang);
    editDist3FromStringDelete(pFrom);
    sqlite3_result_int(context, dist);
  } 
}

/*
** Register the editDist3 function with SQLite
................................................................................
        c = (c<<6) + (0x3f & z[i++]);
      }
    }
  }
  *pSize = i;
  return c;
}
















/*
** Table of translations from unicode characters into ASCII.
*/
static const struct {
 unsigned short int cFrom;
 unsigned char cTo0, cTo1;
................................................................................
      }
      if( c ) zOut[nOut++] = '?';
    }
  }
  zOut[nOut] = 0;
  return zOut;
}








































/*
**    spellfix1_translit(X)
**
** Convert a string that contains non-ASCII Roman characters into 
** pure ASCII.
*/
................................................................................
  EditDist3Config *pConfig3; /* Parsed edit distance costs */
};

/* Fuzzy-search cursor object */
struct spellfix1_cursor {
  sqlite3_vtab_cursor base;    /* Base class - must be first */
  spellfix1_vtab *pVTab;       /* The table to which this cursor belongs */

  int nRow;                    /* Number of rows of content */
  int nAlloc;                  /* Number of allocated rows */
  int iRow;                    /* Current row of content */
  int iLang;                   /* Value of the lang= constraint */
  int iTop;                    /* Value of the top= constraint */
  int iScope;                  /* Value of the scope= constraint */
  int nSearch;                 /* Number of vocabulary items checked */
  struct spellfix1_row {       /* For each row of content */
    sqlite3_int64 iRowid;         /* Rowid for this row */
    char *zWord;                  /* Text for this row */
    int iRank;                    /* Rank for this row */
    int iDistance;                /* Distance from pattern for this row */
    int iScore;                   /* Score for sorting */

    char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
  } *a; 
};

/*
** Construct one or more SQL statements from the format string given
** and then evaluate those statements. The success code is written
................................................................................

/*
** xConnect/xCreate method for the spellfix1 module. Arguments are:
**
**   argv[0]   -> module name  ("spellfix1")
**   argv[1]   -> database name
**   argv[2]   -> table name
**   argv[3].. -> optional arguments (currently ignored)
*/
static int spellfix1Init(
  int isCreate,
  sqlite3 *db,
  void *pAux,
  int argc, const char *const*argv,
  sqlite3_vtab **ppVTab,
................................................................................
      memcpy(pNew->zDbName, zDbName, nDbName+1);
      pNew->zTableName = sqlite3_mprintf("%s", zTableName);
      pNew->db = db;
      if( pNew->zTableName==0 ){
        rc = SQLITE_NOMEM;
      }else{
        rc = sqlite3_declare_vtab(db, 
             "CREATE TABLE x(word,rank,distance,langid,"
             "score, phonehash,top HIDDEN,scope HIDDEN,srchcnt HIDDEN,"

             "soundslike HIDDEN,command HIDDEN)"
        );
#define SPELLFIX_COL_WORD            0
#define SPELLFIX_COL_RANK            1
#define SPELLFIX_COL_DISTANCE        2
#define SPELLFIX_COL_LANGID          3
#define SPELLFIX_COL_SCORE           4

#define SPELLFIX_COL_PHONEHASH       5
#define SPELLFIX_COL_TOP             6
#define SPELLFIX_COL_SCOPE           7
#define SPELLFIX_COL_SRCHCNT         8
#define SPELLFIX_COL_SOUNDSLIKE      9
#define SPELLFIX_COL_COMMAND        10
      }
      if( rc==SQLITE_OK && isCreate ){
        sqlite3_uint64 r;
        spellfix1DbExec(&rc, db,
           "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
           "  id INTEGER PRIMARY KEY,\n"
           "  rank INT,\n"
................................................................................
/*
** Close a fuzzy-search cursor.
*/
static int spellfix1Close(sqlite3_vtab_cursor *cur){
  spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
  spellfix1ResetCursor(pCur);
  spellfix1ResizeCursor(pCur, 0);

  sqlite3_free(pCur);
  return SQLITE_OK;
}

/*
** Search for terms of these forms:
**
................................................................................
  for(i=0; i<pCur->nRow; i++){
    if( pCur->a[i].iScore>iWorst ){
      iWorst = pCur->a[i].iScore;
      idxWorst = i;
    }
  }
  while( sqlite3_step(pStmt)==SQLITE_ROW ){

    iRank = sqlite3_column_int(pStmt, 2);
    if( p->pMatchStr3 ){
      int nWord = sqlite3_column_bytes(pStmt, 1);
      zWord = (const char*)sqlite3_column_text(pStmt, 1);
      iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang);
    }else{
      zK1 = (const char*)sqlite3_column_text(pStmt, 3);
      if( zK1==0 ) continue;
      iDist = editdist1(p->zPattern, zK1, pCur->iLang);
    }
    pCur->nSearch++;
    iScore = spellfix1Score(iDist,iRank);
    if( p->iMaxDist>=0 ){
      if( iDist>p->iMaxDist ) continue;
      if( pCur->nRow>=pCur->nAlloc-1 ){
        spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
................................................................................
      continue;
    }
    pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
    pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
    pCur->a[idx].iRank = iRank;
    pCur->a[idx].iDistance = iDist;
    pCur->a[idx].iScore = iScore;

    memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
    if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
    if( pCur->nRow==pCur->nAlloc ){
      iWorst = pCur->a[0].iScore;
      idxWorst = 0;
      for(i=1; i<pCur->nRow; i++){
        iScore = pCur->a[i].iScore;
................................................................................
  if( p->pConfig3 ){
    x.pLang = editDist3FindLang(p->pConfig3, iLang);
    pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
  }else{
    x.pLang = 0;
  }
  zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));


  if( zPattern==0 ) return SQLITE_NOMEM;
  nPattern = strlen(zPattern);
  if( zPattern[nPattern-1]=='*' ) nPattern--;
  zSql = sqlite3_mprintf(
     "SELECT id, word, rank, k1"
     "  FROM \"%w\".\"%w_vocab\""
     " WHERE langid=%d AND k2>=?1 AND k2<?2",
................................................................................

  if( pCur->a ){
    qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
    pCur->iTop = iLimit;
    pCur->iScope = iScope;
  }
  sqlite3_finalize(pStmt);
  sqlite3_free(zPattern);
  editDist3FromStringDelete(pMatchStr3);
  return pCur->a ? x.rc : SQLITE_NOMEM;
}

/*
** This version of xFilter handles a full-table scan case
*/
................................................................................
    case SPELLFIX_COL_LANGID: {
      sqlite3_result_int(ctx, pCur->iLang);
      break;
    }
    case SPELLFIX_COL_SCORE: {
      sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
      break;
























    }
    case SPELLFIX_COL_PHONEHASH: {
      sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
      break;
    }
    case SPELLFIX_COL_TOP: {
      sqlite3_result_int(ctx, pCur->iTop);







>
>
>
>
>







 







>
>
>
>
>
>

|












>





|
>







 







|
>
>
>



>







 







|







 







>
>
>
>
>
>
>





|
>







 







>
|

>
|
>
|
>
>
>







 







|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>













>







 







|







 







|
|
>
|






>
|
|
|
|
|
|







 







>







 







>




|



|







 







>







 







>
>







 







<







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
...
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
...
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
...
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
....
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
....
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
....
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
....
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
....
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
....
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
....
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
....
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
....
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
....
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
....
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
....
2784
2785
2786
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
....
2836
2837
2838
2839
2840
2841
2842

2843
2844
2845
2846
2847
2848
2849
....
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949
2950
2951
2952
2953
2954
2955
**                  For any given query this value is the same on all rows.
**
**    score         The score is a combination of rank and distance.  The
**                  idea is that a lower score is better.  The virtual table
**                  attempts to find words with the lowest score and 
**                  by default (unless overridden by ORDER BY) returns
**                  results in order of increasing score.
**
**    matchlen      For prefix queries, the number of characters in the prefix
**                  of the returned value (word) that matched the query term.
**                  For non-prefix queries, the number of characters in the 
**                  returned value.
**
**    top           (HIDDEN)  For any query, this value is the same on all
**                  rows.  It is an integer which is the maximum number of
**                  rows that will be output.  The actually number of rows
**                  output might be less than this number, but it will never
**                  be greater.  The default value for top is 20, but that
**                  can be changed for each query by including a term of
................................................................................
**
** Smaller numbers mean a closer match.
**
** Negative values indicate an error:
**    -1  One of the inputs is NULL
**    -2  Non-ASCII characters on input
**    -3  Unable to allocate memory 
**
** If pnMatch is not NULL, then *pnMatch is set to the number of bytes
** of zB that matched the pattern in zA. If zA does not end with a '*',
** then this value is always the number of bytes in zB (i.e. strlen(zB)).
** If zA does end in a '*', then it is the number of bytes in the prefix
** of zB that was deemed to match zA.
*/
static int editdist1(const char *zA, const char *zB, int iLangId, int *pnMatch){
  int nA, nB;            /* Number of characters in zA[] and zB[] */
  int xA, xB;            /* Loop counters for zA[] and zB[] */
  char cA, cB;           /* Current character of zA and zB */
  char cAprev, cBprev;   /* Previous character of zA and zB */
  char cAnext, cBnext;   /* Next character in zA and zB */
  int d;                 /* North-west cost value */
  int dc = 0;            /* North-west character value */
  int res;               /* Final result */
  int *m;                /* The cost matrix */
  char *cx;              /* Corresponding character values */
  int *toFree = 0;       /* Malloced space */
  int mStack[60+15];     /* Stack space to use if not too much is needed */
  int nMatch = 0;

  /* Early out if either input is NULL */
  if( zA==0 || zB==0 ) return -1;

  /* Skip any common prefix */
  while( zA[0] && zA[0]==zB[0] ){ dc = zA[0]; zA++; zB++; nMatch++; }
  if( pnMatch ) *pnMatch = nMatch;
  if( zA[0]==0 && zB[0]==0 ) return 0;

#if 0
  printf("A=\"%s\" B=\"%s\" dc=%c\n", zA, zB, dc?dc:' ');
#endif

  /* Verify input strings and measure their lengths */
................................................................................
    cAprev = cA;
  }

  /* Free the wagner matrix and return the result */
  if( cA=='*' ){
    res = m[1];
    for(xB=1; xB<=nB; xB++){
      if( m[xB]<res ){
        res = m[xB];
        if( pnMatch ) *pnMatch = xB+nMatch;
      }
    }
  }else{
    res = m[nB];
    if( pnMatch ) *pnMatch = -1;
  }
  sqlite3_free(toFree);
  return res;
}

/*
** Function:    editdist(A,B)
................................................................................
  int argc,
  sqlite3_value **argv
){
  int langid = argc==2 ? 0 : sqlite3_value_int(argv[2]);
  int res = editdist1(
                    (const char*)sqlite3_value_text(argv[0]),
                    (const char*)sqlite3_value_text(argv[1]),
                    langid, 0);
  if( res<0 ){
    if( res==(-3) ){
      sqlite3_result_error_nomem(context);
    }else if( res==(-2) ){
      sqlite3_result_error(context, "non-ASCII input to editdist()", -1);
    }else{
      sqlite3_result_error(context, "NULL input to editdist()", -1);
................................................................................
    if( b<m[i] ) m[i] = b;
  }
}

/* Compute the edit distance between two strings.
**
** If an error occurs, return a negative number which is the error code.
**
** If pnMatch is not NULL, then *pnMatch is set to the number of characters
** (not bytes) in z2 that matched the search pattern in *pFrom. If pFrom does
** not contain the pattern for a prefix-search, then this is always the number
** of characters in z2. If pFrom does contain a prefix search pattern, then
** it is the number of characters in the prefix of z2 that was deemed to 
** match pFrom.
*/
static int editDist3Core(
  EditDist3FromString *pFrom,  /* The FROM string */
  const char *z2,              /* The TO string */
  int n2,                      /* Length of the TO string */
  const EditDist3Lang *pLang,  /* Edit weights for a particular language ID */
  int *pnMatch                 /* OUT: Characters in matched prefix */
){
  int k, n;
  int i1, b1;
  int i2, b2;
  EditDist3FromString f = *pFrom;
  EditDist3To *a2;
  unsigned int *m;
................................................................................
    printf("\n");
  }
#endif

  /* Free memory allocations and return the result */
  res = (int)m[szRow*(n2+1)-1];
  if( f.isPrefix ){
    *pnMatch = n2;
    for(i2=1; i2<=n2; i2++){
      int b = m[szRow*i2-1];
      if( b<=res ){ 
        res = b;
        if( pnMatch ) *pnMatch = i2-1;
      }
    }
  }else if( pnMatch ){
    *pnMatch = n2;
  }

editDist3Abort:
  for(i2=0; i2<n2; i2++) sqlite3_free(a2[i2].apIns);
  sqlite3_free(m);
  return res;
}
................................................................................
    int dist;

    pFrom = editDist3FromStringNew(pLang, zA, nA);
    if( pFrom==0 ){
      sqlite3_result_error_nomem(context);
      return;
    }
    dist = editDist3Core(pFrom, zB, nB, pLang, 0);
    editDist3FromStringDelete(pFrom);
    sqlite3_result_int(context, dist);
  } 
}

/*
** Register the editDist3 function with SQLite
................................................................................
        c = (c<<6) + (0x3f & z[i++]);
      }
    }
  }
  *pSize = i;
  return c;
}

/*
** Return the number of characters in the utf-8 string in the nIn byte
** buffer pointed to by zIn.
*/
static int utf8Charlen(const char *zIn, int nIn){
  int i;
  int nChar = 0;
  for(i=0; i<nIn; nChar++){
    int sz;
    utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
    i += sz;
  }
  return nChar;
}

/*
** Table of translations from unicode characters into ASCII.
*/
static const struct {
 unsigned short int cFrom;
 unsigned char cTo0, cTo1;
................................................................................
      }
      if( c ) zOut[nOut++] = '?';
    }
  }
  zOut[nOut] = 0;
  return zOut;
}

/*
** Return the number of characters in the shortest prefix of the input
** string that transliterates to an ASCII string nTrans bytes or longer.
** Or, if the transliteration of the input string is less than nTrans
** bytes in size, return the number of characters in the input string.
*/
static int translen_to_charlen(const char *zIn, int nIn, int nTrans){
  int i, c, sz, nOut;
  int nChar;

  i = nOut = 0;
  for(nChar=0; i<nIn && nOut<nTrans; nChar++){
    c = utf8Read((const unsigned char *)&zIn[i], nIn-i, &sz);
    i += sz;

    nOut++;
    if( c>=128 ){
      int xTop, xBtm, x;
      xTop = sizeof(translit)/sizeof(translit[0]) - 1;
      xBtm = 0;
      while( xTop>=xBtm ){
        x = (xTop + xBtm)/2;
        if( translit[x].cFrom==c ){
          if( translit[x].cTo1 ) nOut++;
          if( c==0x0429 || c== 0x0449 ) nOut += 2;
          break;
        }else if( translit[x].cFrom>c ){
          xTop = x-1;
        }else{
          xBtm = x+1;
        }
      }
    }
  }

  return nChar;
}


/*
**    spellfix1_translit(X)
**
** Convert a string that contains non-ASCII Roman characters into 
** pure ASCII.
*/
................................................................................
  EditDist3Config *pConfig3; /* Parsed edit distance costs */
};

/* Fuzzy-search cursor object */
struct spellfix1_cursor {
  sqlite3_vtab_cursor base;    /* Base class - must be first */
  spellfix1_vtab *pVTab;       /* The table to which this cursor belongs */
  char *zPattern;              /* rhs of MATCH clause */
  int nRow;                    /* Number of rows of content */
  int nAlloc;                  /* Number of allocated rows */
  int iRow;                    /* Current row of content */
  int iLang;                   /* Value of the lang= constraint */
  int iTop;                    /* Value of the top= constraint */
  int iScope;                  /* Value of the scope= constraint */
  int nSearch;                 /* Number of vocabulary items checked */
  struct spellfix1_row {       /* For each row of content */
    sqlite3_int64 iRowid;         /* Rowid for this row */
    char *zWord;                  /* Text for this row */
    int iRank;                    /* Rank for this row */
    int iDistance;                /* Distance from pattern for this row */
    int iScore;                   /* Score for sorting */
    int iMatchlen;                /* Value of matchlen column (or -1) */
    char zHash[SPELLFIX_MX_HASH]; /* the phonehash used for this match */
  } *a; 
};

/*
** Construct one or more SQL statements from the format string given
** and then evaluate those statements. The success code is written
................................................................................

/*
** xConnect/xCreate method for the spellfix1 module. Arguments are:
**
**   argv[0]   -> module name  ("spellfix1")
**   argv[1]   -> database name
**   argv[2]   -> table name
**   argv[3].. -> optional arguments (i.e. "edit_cost_table" parameter)
*/
static int spellfix1Init(
  int isCreate,
  sqlite3 *db,
  void *pAux,
  int argc, const char *const*argv,
  sqlite3_vtab **ppVTab,
................................................................................
      memcpy(pNew->zDbName, zDbName, nDbName+1);
      pNew->zTableName = sqlite3_mprintf("%s", zTableName);
      pNew->db = db;
      if( pNew->zTableName==0 ){
        rc = SQLITE_NOMEM;
      }else{
        rc = sqlite3_declare_vtab(db, 
             "CREATE TABLE x(word,rank,distance,langid, "
             "score, matchlen, phonehash, "
             "top HIDDEN, scope HIDDEN, srchcnt HIDDEN, "
             "soundslike HIDDEN, command HIDDEN)"
        );
#define SPELLFIX_COL_WORD            0
#define SPELLFIX_COL_RANK            1
#define SPELLFIX_COL_DISTANCE        2
#define SPELLFIX_COL_LANGID          3
#define SPELLFIX_COL_SCORE           4
#define SPELLFIX_COL_MATCHLEN        5
#define SPELLFIX_COL_PHONEHASH       6
#define SPELLFIX_COL_TOP             7
#define SPELLFIX_COL_SCOPE           8
#define SPELLFIX_COL_SRCHCNT         9
#define SPELLFIX_COL_SOUNDSLIKE     10
#define SPELLFIX_COL_COMMAND        11
      }
      if( rc==SQLITE_OK && isCreate ){
        sqlite3_uint64 r;
        spellfix1DbExec(&rc, db,
           "CREATE TABLE IF NOT EXISTS \"%w\".\"%w_vocab\"(\n"
           "  id INTEGER PRIMARY KEY,\n"
           "  rank INT,\n"
................................................................................
/*
** Close a fuzzy-search cursor.
*/
static int spellfix1Close(sqlite3_vtab_cursor *cur){
  spellfix1_cursor *pCur = (spellfix1_cursor *)cur;
  spellfix1ResetCursor(pCur);
  spellfix1ResizeCursor(pCur, 0);
  sqlite3_free(pCur->zPattern);
  sqlite3_free(pCur);
  return SQLITE_OK;
}

/*
** Search for terms of these forms:
**
................................................................................
  for(i=0; i<pCur->nRow; i++){
    if( pCur->a[i].iScore>iWorst ){
      iWorst = pCur->a[i].iScore;
      idxWorst = i;
    }
  }
  while( sqlite3_step(pStmt)==SQLITE_ROW ){
    int iMatchlen = -1;
    iRank = sqlite3_column_int(pStmt, 2);
    if( p->pMatchStr3 ){
      int nWord = sqlite3_column_bytes(pStmt, 1);
      zWord = (const char*)sqlite3_column_text(pStmt, 1);
      iDist = editDist3Core(p->pMatchStr3, zWord, nWord, p->pLang, &iMatchlen);
    }else{
      zK1 = (const char*)sqlite3_column_text(pStmt, 3);
      if( zK1==0 ) continue;
      iDist = editdist1(p->zPattern, zK1, pCur->iLang, 0);
    }
    pCur->nSearch++;
    iScore = spellfix1Score(iDist,iRank);
    if( p->iMaxDist>=0 ){
      if( iDist>p->iMaxDist ) continue;
      if( pCur->nRow>=pCur->nAlloc-1 ){
        spellfix1ResizeCursor(pCur, pCur->nAlloc*2 + 10);
................................................................................
      continue;
    }
    pCur->a[idx].zWord = sqlite3_mprintf("%s", sqlite3_column_text(pStmt, 1));
    pCur->a[idx].iRowid = sqlite3_column_int64(pStmt, 0);
    pCur->a[idx].iRank = iRank;
    pCur->a[idx].iDistance = iDist;
    pCur->a[idx].iScore = iScore;
    pCur->a[idx].iMatchlen = iMatchlen;
    memcpy(pCur->a[idx].zHash, zHash1, iScope+1);
    if( pCur->nRow<pCur->nAlloc ) pCur->nRow++;
    if( pCur->nRow==pCur->nAlloc ){
      iWorst = pCur->a[0].iScore;
      idxWorst = 0;
      for(i=1; i<pCur->nRow; i++){
        iScore = pCur->a[i].iScore;
................................................................................
  if( p->pConfig3 ){
    x.pLang = editDist3FindLang(p->pConfig3, iLang);
    pMatchStr3 = editDist3FromStringNew(x.pLang, (const char*)zMatchThis, -1);
  }else{
    x.pLang = 0;
  }
  zPattern = (char*)transliterate(zMatchThis, sqlite3_value_bytes(argv[0]));
  sqlite3_free(pCur->zPattern);
  pCur->zPattern = zPattern;
  if( zPattern==0 ) return SQLITE_NOMEM;
  nPattern = strlen(zPattern);
  if( zPattern[nPattern-1]=='*' ) nPattern--;
  zSql = sqlite3_mprintf(
     "SELECT id, word, rank, k1"
     "  FROM \"%w\".\"%w_vocab\""
     " WHERE langid=%d AND k2>=?1 AND k2<?2",
................................................................................

  if( pCur->a ){
    qsort(pCur->a, pCur->nRow, sizeof(pCur->a[0]), spellfix1RowCompare);
    pCur->iTop = iLimit;
    pCur->iScope = iScope;
  }
  sqlite3_finalize(pStmt);

  editDist3FromStringDelete(pMatchStr3);
  return pCur->a ? x.rc : SQLITE_NOMEM;
}

/*
** This version of xFilter handles a full-table scan case
*/
................................................................................
    case SPELLFIX_COL_LANGID: {
      sqlite3_result_int(ctx, pCur->iLang);
      break;
    }
    case SPELLFIX_COL_SCORE: {
      sqlite3_result_int(ctx, pCur->a[pCur->iRow].iScore);
      break;
    }
    case SPELLFIX_COL_MATCHLEN: {
      int iMatchlen = pCur->a[pCur->iRow].iMatchlen;
      if( iMatchlen<0 ){
        int nPattern = strlen(pCur->zPattern);
        char *zWord = pCur->a[pCur->iRow].zWord;
        int nWord = strlen(zWord);

        if( nPattern>0 && pCur->zPattern[nPattern-1]=='*' ){
          char *zTranslit;
          int res;
          zTranslit = (char *)transliterate((unsigned char *)zWord, nWord);
          if( !zTranslit ) return SQLITE_NOMEM;
          res = editdist1(pCur->zPattern, zTranslit, pCur->iLang, &iMatchlen);
          sqlite3_free(zTranslit);
          if( res<0 ) return SQLITE_NOMEM;
          iMatchlen = translen_to_charlen(zWord, nWord, iMatchlen);
        }else{
          iMatchlen = utf8Charlen(zWord, nWord);
        }
      }

      sqlite3_result_int(ctx, iMatchlen);
      break;
    }
    case SPELLFIX_COL_PHONEHASH: {
      sqlite3_result_text(ctx, pCur->a[pCur->iRow].zHash, -1, SQLITE_STATIC);
      break;
    }
    case SPELLFIX_COL_TOP: {
      sqlite3_result_int(ctx, pCur->iTop);

Added test/spellfix.test.







































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# 2012 July 12
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
set testprefix spellfix

register_spellfix_module db

set vocab {
rabbi rabbit rabbits rabble rabid rabies raccoon raccoons race raced racer
racers races racetrack racial racially racing rack racked racket racketeer
racketeering racketeers rackets racking racks radar radars radial radially
radian radiance radiant radiantly radiate radiated radiates radiating radiation
radiations radiator radiators radical radically radicals radices radii radio
radioactive radioastronomy radioed radiography radioing radiology radios radish
radishes radium radius radix radon raft rafter rafters rafts rag rage raged
rages ragged raggedly raggedness raging rags ragweed raid raided raider raiders
raiding raids rail railed railer railers railing railroad railroaded railroader
railroaders railroading railroads rails railway railways raiment rain rainbow
raincoat raincoats raindrop raindrops rained rainfall rainier rainiest raining
rains rainstorm rainy raise raised raiser raisers raises raisin raising rake
raked rakes raking rallied rallies rally rallying ram ramble rambler rambles
rambling ramblings ramification ramifications ramp rampage rampant rampart
ramps ramrod rams ran ranch ranched rancher ranchers ranches ranching rancid
random randomization randomize randomized randomizes randomly randomness randy
rang range ranged rangeland ranger rangers ranges ranging rangy rank ranked
ranker rankers rankest ranking rankings rankle rankly rankness ranks ransack
ransacked ransacking ransacks ransom ransomer ransoming ransoms rant ranted
ranter ranters ranting rants rap rapacious rape raped raper rapes rapid
rapidity rapidly rapids rapier raping rapport rapprochement raps rapt raptly
rapture raptures rapturous rare rarely rareness rarer rarest rarity rascal
rascally rascals rash rasher rashly rashness rasp raspberry rasped rasping
rasps raster rat rate rated rater raters rates rather ratification ratified
ratifies ratify ratifying rating ratings ratio ration rational rationale
rationales rationalities rationality rationalization rationalizations
rationalize rationalized rationalizes rationalizing rationally rationals
rationing rations ratios rats rattle rattled rattler rattlers rattles
rattlesnake rattlesnakes rattling raucous ravage ravaged ravager ravagers
ravages ravaging rave raved raven ravening ravenous ravenously ravens raves
ravine ravines raving ravings raw rawer rawest rawly rawness ray rays raze
razor razors re reabbreviate reabbreviated reabbreviates reabbreviating reach
reachability reachable reachably reached reacher reaches reaching reacquired
react reacted reacting reaction reactionaries reactionary reactions reactivate
reactivated reactivates reactivating reactivation reactive reactively
reactivity reactor reactors reacts read readability readable reader readers
readied readier readies readiest readily readiness reading readings readjusted
readout readouts reads ready readying real realest realign realigned realigning
realigns realism realist realistic realistically realists realities reality
}

do_test 1.1 {
  execsql { CREATE VIRTUAL TABLE t1 USING spellfix1 }
  foreach word $vocab {
    execsql { INSERT INTO t1(word) VALUES($word) }
  }
} {}

foreach {tn word res} {
  1   raxpi*     {rasping 5 rasped 5 raspberry 6 rasp 4 rasps 4}
  2   ril*       {rail 4 railway 4 railing 4 rails 4 railways 4}
  3   rilis*     {realist 6 realistic 6 realistically 6 realists 6 realism 6}
  4   reail*     {realities 3 reality 3 real 3 realest 3 realist 3}
  5   ras*       {rasp 3 rash 3 rasped 3 rasping 3 rasps 3}
  6   realistss* {realists 8 realigns 8 realistic 9 realistically 9 realest 7}
  7   realistss  {realists 8 realist 7 realigns 8 realistic 9 realest 7}
  8   rllation*  {realities 9 reality 7 rallied 7 railed 4}
  9   renstom*   {rainstorm 8 ransomer 6 ransom 6 ransoming 6 ransoms 6}
} {
  do_execsql_test 1.2.$tn {
    SELECT word, matchlen FROM t1 WHERE word MATCH $word LIMIT 5
  } $res
}


do_execsql_test 2.1 {
  CREATE VIRTUAL TABLE t2 USING spellfix1;
  INSERT INTO t2 (word, soundslike) VALUES('school', 'skuul');
  INSERT INTO t2 (word, soundslike) VALUES('psalm', 'sarm');
  SELECT word, matchlen FROM t2 WHERE word MATCH 'sar*' LIMIT 5;
} {psalm 4}

do_execsql_test 2.2 {
  SELECT word, matchlen FROM t2 WHERE word MATCH 'skol*' LIMIT 5;
} {school 6}

set vocab {
kangaroo kanji kappa karate keel keeled keeling keels keen keener keenest
keenly keenness keep keeper keepers keeping keeps ken kennel kennels kept
kerchief kerchiefs kern kernel kernels kerosene ketchup kettle
kettles key keyboard keyboards keyed keyhole keying keynote keypad keypads keys
keystroke keystrokes keyword keywords kick kicked kicker kickers kicking
kickoff kicks kid kidded kiddie kidding kidnap kidnapper kidnappers kidnapping
kidnappings kidnaps kidney kidneys kids kill killed killer killers killing
killingly killings killjoy kills kilobit kilobits kiloblock kilobyte kilobytes
kilogram kilograms kilohertz kilohm kilojoule kilometer kilometers kiloton
kilovolt kilowatt kiloword kimono kin kind kinder kindergarten kindest
kindhearted kindle kindled kindles kindling kindly kindness kindred kinds
kinetic king kingdom kingdoms kingly kingpin kings kink kinky kinship kinsman
kiosk kiss kissed kisser kissers kisses kissing kit kitchen kitchenette
kitchens kite kited kites kiting kits kitten kittenish kittens kitty klaxon
kludge kludges klystron knack knapsack knapsacks knave knaves knead kneads knee
kneecap kneed kneeing kneel kneeled kneeling kneels knees knell knells knelt
knew knife knifed knifes knifing knight knighted knighthood knighting knightly
knights knit knits knives knob knobs knock knockdown knocked knocker knockers
knocking knockout knocks knoll knolls knot knots knotted knotting know knowable
knower knowhow knowing knowingly knowledge knowledgeable known knows knuckle
knuckled knuckles koala kosher kudo
}

do_execsql_test 3.1 {
  CREATE TABLE costs(iLang, cFrom, cTo, iCost);
  INSERT INTO costs VALUES(0, 'a', 'e', 1);
  INSERT INTO costs VALUES(0, 'e', 'i', 1);
  INSERT INTO costs VALUES(0, 'i', 'o', 1);
  INSERT INTO costs VALUES(0, 'o', 'u', 1);
  INSERT INTO costs VALUES(0, 'u', 'a', 1);
  CREATE VIRTUAL TABLE t3 USING spellfix1(edit_cost_table=costs);
}

do_test 3.2 {
  foreach w $vocab {
    execsql { INSERT INTO t3(word) VALUES($w) }
  }
} {}

breakpoint
foreach {tn word res} {
  1   kos*     {kosher 3 kiosk 4 kudo 2 kappa 1 keypad 1}
  2   kellj*   {killjoy 5 killed 4 killingly 4 kill 4 killer 4}
  3   kellj    {kill 4 kills 5 killjoy 7 keel 4 killed 6}
} {
  do_execsql_test 1.2.$tn {
    SELECT word, matchlen FROM t3 WHERE word MATCH $word LIMIT 5
  } $res
} 

finish_test