SQLite

Check-in [f22dbccad9]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii".
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: f22dbccad9499624880ddd48df1b07fb42b1ad66
User & Date: dan 2015-01-12 17:58:04.627
Context
2015-01-13
17:25
Fix prefix indexes so that they work in characters, not bytes. (check-in: af8d43a4a0 user: dan tags: fts5)
2015-01-12
17:58
Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii". (check-in: f22dbccad9 user: dan tags: fts5)
2015-01-10
20:34
Fix some documentation issues in fts5. (check-in: 512e1bdb40 user: dan tags: fts5)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts5/fts5.c.
68
69
70
71
72
73
74

75
76
77
78
79
80
81
*/
struct Fts5Global {
  /* Keep "api" as the first member: code in this file casts an fts5_api*
  ** straight back to an Fts5Global*. */
  fts5_api api;                   /* User visible part of object (see fts5.h) */
  sqlite3 *db;                    /* Associated database connection */ 
  i64 iNextId;                    /* Used to allocate unique cursor ids */
  Fts5Auxiliary *pAux;            /* First in list of all aux. functions */
  /* Newly registered tokenizer modules are pushed onto the front of this
  ** list, so pTok is always the most recently registered module. */
  Fts5TokenizerModule *pTok;      /* First in list of all tokenizer modules */

  Fts5Cursor *pCsr;               /* First in list of all open cursors */
};

/*
** Each auxiliary function registered with the FTS5 module is represented
** by an object of the following type. All such objects are stored as part
** of the Fts5Global.pAux list.







>







68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
*/
struct Fts5Global {
  /* Keep "api" as the first member: code in this file casts an fts5_api*
  ** straight back to an Fts5Global*. */
  fts5_api api;                   /* User visible part of object (see fts5.h) */
  sqlite3 *db;                    /* Associated database connection */ 
  i64 iNextId;                    /* Used to allocate unique cursor ids */
  Fts5Auxiliary *pAux;            /* First in list of all aux. functions */
  /* Newly registered tokenizer modules are pushed onto the front of this
  ** list, so pTok is always the most recently registered module. */
  Fts5TokenizerModule *pTok;      /* First in list of all tokenizer modules */
  /* Set when a module is registered while the list is empty, i.e. to the
  ** first module ever registered. Used when no tokenizer name is given. */
  Fts5TokenizerModule *pDfltTok;  /* Default tokenizer module */
  Fts5Cursor *pCsr;               /* First in list of all open cursors */
};

/*
** Each auxiliary function registered with the FTS5 module is represented
** by an object of the following type. All such objects are stored as part
** of the Fts5Global.pAux list.
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
}


static int fts5FindRankFunction(Fts5Cursor *pCsr){
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  Fts5Config *pConfig = pTab->pConfig;
  int rc = SQLITE_OK;
  Fts5Auxiliary *pAux;
  const char *zRank = pCsr->zRank;
  const char *zRankArgs = pCsr->zRankArgs;

  if( zRankArgs ){
    char *zSql = sqlite3_mprintf("SELECT %s", zRankArgs);
    if( zSql==0 ){
      rc = SQLITE_NOMEM;







|







768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
}


static int fts5FindRankFunction(Fts5Cursor *pCsr){
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  Fts5Config *pConfig = pTab->pConfig;
  int rc = SQLITE_OK;
  Fts5Auxiliary *pAux = 0;
  const char *zRank = pCsr->zRank;
  const char *zRankArgs = pCsr->zRankArgs;

  if( zRankArgs ){
    char *zSql = sqlite3_mprintf("SELECT %s", zRankArgs);
    if( zSql==0 ){
      rc = SQLITE_NOMEM;
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
      }
    }
  }
  return rc;
}

/*
** Format an error message printf-style and store it in p->base.zErrMsg.
** The assert() documents the expectation that no message is already set
** when this is called. The string is produced by sqlite3_vmprintf().
*/
static void fts5SetVtabError(Fts5Table *p, const char *zFormat, ...){
  va_list ap;                     /* ... printf arguments */
  va_start(ap, zFormat);
  assert( p->base.zErrMsg==0 );
  p->base.zErrMsg = sqlite3_vmprintf(zFormat, ap);
  va_end(ap);
}








<







1025
1026
1027
1028
1029
1030
1031

1032
1033
1034
1035
1036
1037
1038
      }
    }
  }
  return rc;
}

/*
** Store a printf-style formatted message in the zErrMsg slot of the
** virtual table object. No message may already be set (see the assert).
*/
static void fts5SetVtabError(Fts5Table *p, const char *zFormat, ...){
  va_list args;                   /* Variable arguments following zFormat */

  assert( p->base.zErrMsg==0 );
  va_start(args, zFormat);
  p->base.zErrMsg = sqlite3_vmprintf(zFormat, args);
  va_end(args);
}

1792
1793
1794
1795
1796
1797
1798



1799
1800
1801
1802
1803
1804
1805
    pNew->zName = (char*)&pNew[1];
    strcpy(pNew->zName, zName);
    pNew->pUserData = pUserData;
    pNew->x = *pTokenizer;
    pNew->xDestroy = xDestroy;
    pNew->pNext = pGlobal->pTok;
    pGlobal->pTok = pNew;



  }else{
    rc = SQLITE_NOMEM;
  }

  return rc;
}








>
>
>







1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
    pNew->zName = (char*)&pNew[1];
    strcpy(pNew->zName, zName);
    pNew->pUserData = pUserData;
    pNew->x = *pTokenizer;
    pNew->xDestroy = xDestroy;
    pNew->pNext = pGlobal->pTok;
    pGlobal->pTok = pNew;
    if( pNew->pNext==0 ){
      pGlobal->pDfltTok = pNew;
    }
  }else{
    rc = SQLITE_NOMEM;
  }

  return rc;
}

1813
1814
1815
1816
1817
1818
1819



1820
1821

1822
1823
1824
1825
1826
1827
1828
  void **ppUserData,
  fts5_tokenizer *pTokenizer      /* Populate this object */
){
  Fts5Global *pGlobal = (Fts5Global*)pApi;
  int rc = SQLITE_OK;
  Fts5TokenizerModule *pTok;




  for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){
    if( sqlite3_stricmp(zName, pTok->zName)==0 ) break;

  }

  if( pTok ){
    *pTokenizer = pTok->x;
    *ppUserData = pTok->pUserData;
  }else{
    memset(pTokenizer, 0, sizeof(fts5_tokenizer));







>
>
>
|
|
>







1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
  void **ppUserData,
  fts5_tokenizer *pTokenizer      /* Populate this object */
){
  Fts5Global *pGlobal = (Fts5Global*)pApi;
  int rc = SQLITE_OK;
  Fts5TokenizerModule *pTok;

  if( zName==0 ){
    pTok = pGlobal->pDfltTok;
  }else{
    for(pTok=pGlobal->pTok; pTok; pTok=pTok->pNext){
      if( sqlite3_stricmp(zName, pTok->zName)==0 ) break;
    }
  }

  if( pTok ){
    *pTokenizer = pTok->x;
    *ppUserData = pTok->pUserData;
  }else{
    memset(pTokenizer, 0, sizeof(fts5_tokenizer));
1837
1838
1839
1840
1841
1842
1843

1844
1845
1846
1847
1848
1849
1850
1851
1852
  const char **azArg,
  int nArg,
  Fts5Tokenizer **ppTok,
  fts5_tokenizer **ppTokApi
){
  Fts5TokenizerModule *pMod = 0;
  int rc = SQLITE_OK;

  if( nArg==0 ){
    pMod = pGlobal->pTok;
  }else{
    for(pMod=pGlobal->pTok; pMod; pMod=pMod->pNext){
      if( sqlite3_stricmp(azArg[0], pMod->zName)==0 ) break;
    }
  }

  if( pMod==0 ){







>

|







1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
  const char **azArg,
  int nArg,
  Fts5Tokenizer **ppTok,
  fts5_tokenizer **ppTokApi
){
  Fts5TokenizerModule *pMod = 0;
  int rc = SQLITE_OK;

  if( nArg==0 ){
    pMod = pGlobal->pDfltTok;
  }else{
    for(pMod=pGlobal->pTok; pMod; pMod=pMod->pNext){
      if( sqlite3_stricmp(azArg[0], pMod->zName)==0 ) break;
    }
  }

  if( pMod==0 ){
Changes to ext/fts5/fts5_tokenize.c.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
*/

#include "fts5.h"
#include <string.h>
#include <assert.h>

/**************************************************************************
** Start of simple tokenizer implementation.
*/

/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters. 
**
** Entry i is 1 if ASCII codepoint i is a token character by default
** ('0'..'9', 'A'..'Z', 'a'..'z') and 0 if it is a separator. The table is
** a read-only template that is copied into each new tokenizer object.
*/
static const unsigned char aSimpleTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};

/*
** State of one "simple" tokenizer instance.
*/
typedef struct SimpleTokenizer SimpleTokenizer;
struct SimpleTokenizer {
  /* aTokenChar[c] is non-zero if ASCII codepoint c is a token character and
  ** zero if it is a separator. Starts as a copy of aSimpleTokenChar[] and is
  ** then adjusted by the "tokenchars" and "separators" options. */
  unsigned char aTokenChar[128];
};

/*
** Apply one "tokenchars" or "separators" option to tokenizer p.
**
** Every ASCII byte of the nul-terminated string zArg has its entry in
** p->aTokenChar[] overwritten with bTokenChars. Bytes with the high bit
** set are ignored.
*/
static void fts5SimpleAddExceptions(
  SimpleTokenizer *p, 
  const char *zArg, 
  int bTokenChars
){
  const unsigned char eClass = (unsigned char)bTokenChars;
  const char *zIter;
  for(zIter=zArg; *zIter; zIter++){
    if( *zIter & 0x80 ) continue;           /* Not an ASCII byte */
    p->aTokenChar[(int)*zIter] = eClass;
  }
}

/*
** Create a "simple" tokenizer.
**
** azArg[0..nArg-1] holds option/value pairs, so nArg must be even. The
** recognised option names, matched with sqlite3_stricmp(), are
** "tokenchars" and "separators". pCtx is not used.
**
** On success SQLITE_OK is returned and *ppOut points to the new tokenizer.
** On failure an error code is returned (SQLITE_ERROR for an odd nArg or an
** unknown option, SQLITE_NOMEM if the allocation fails) and *ppOut is set
** to NULL. Any partially built object is freed here, so an error return
** never leaves an allocation behind for the caller.
*/
static int fts5SimpleCreate(
  void *pCtx, 
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  int rc = SQLITE_OK;
  SimpleTokenizer *p = 0;
  if( nArg%2 ){
    rc = SQLITE_ERROR;
  }else{
    p = sqlite3_malloc(sizeof(SimpleTokenizer));
    if( p==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;
      memset(p, 0, sizeof(SimpleTokenizer));
      /* Start from the default ASCII alphanumeric classification */
      memcpy(p->aTokenChar, aSimpleTokenChar, sizeof(aSimpleTokenChar));
      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          fts5SimpleAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          fts5SimpleAddExceptions(p, zArg, 0);
        }else{
          rc = SQLITE_ERROR;
        }
      }
      /* Do not hand a half-configured object back along with an error */
      if( rc!=SQLITE_OK ){
        sqlite3_free(p);
        p = 0;
      }
    }
  }

  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

/*
** Destroy a "simple" tokenizer by releasing its allocation with
** sqlite3_free().
*/
static void fts5SimpleDelete(Fts5Tokenizer *p){
  SimpleTokenizer *pSimple = (SimpleTokenizer*)p;
  sqlite3_free(pSimple);
}


/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void simpleFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    const char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      aOut[iByte] = (char)(ch + 32);
    }else{
      aOut[iByte] = ch;
    }
    iByte++;
  }
}

/*
** Tokenize some text using the simple tokenizer.
*/
static int fts5SimpleTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  SimpleTokenizer *p = (SimpleTokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  int ie;
  int is = 0;

  char aFold[64];
  int nFold = sizeof(aFold);
  char *pFold = aFold;
  unsigned char *a = p->aTokenChar;

  while( is<nText && rc==SQLITE_OK ){
    int nByte;

    /* Skip any leading divider characters. */
    while( is<nText && ((pText[is]&0x80) || a[(int)pText[is]]==0) ){
      is++;
    }
    if( is==nText ) break;

    /* Count the token characters */
    ie = is+1;
    while( ie<nText && ((pText[ie]&0x80)==0 && a[(int)pText[ie]] ) ){
      ie++;
    }

    /* Fold to lower case */
    nByte = ie-is;
    if( nByte>nFold ){
      if( pFold!=aFold ) sqlite3_free(pFold);
      pFold = sqlite3_malloc(nByte*2);
      if( pFold==0 ){
        rc = SQLITE_NOMEM;
        break;
      }
      nFold = nByte*2;
    }
    simpleFold(pFold, &pText[is], nByte);

    /* Invoke the token callback */
    rc = xToken(pCtx, pFold, nByte, is, ie);
    is = ie+1;
  }
  
  if( pFold!=aFold ) sqlite3_free(pFold);







|






|










|
|



|
|












|

|





|



|




|
|



|


|












|

|




|









|

|





|













|






|














|







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
*/

#include "fts5.h"
#include <string.h>
#include <assert.h>

/**************************************************************************
** Start of ascii tokenizer implementation.
*/

/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters. 
**
** Entry i is 1 if ASCII codepoint i is a token character by default
** ('0'..'9', 'A'..'Z', 'a'..'z') and 0 if it is a separator. The table is
** a read-only template that is copied into each new tokenizer object.
*/
static const unsigned char aAsciiTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};

/*
** State of one "ascii" tokenizer instance.
*/
typedef struct AsciiTokenizer AsciiTokenizer;
struct AsciiTokenizer {
  /* aTokenChar[c] is non-zero if ASCII codepoint c is a token character and
  ** zero if it is a separator. Starts as a copy of aAsciiTokenChar[] and is
  ** then adjusted by the "tokenchars" and "separators" options. */
  unsigned char aTokenChar[128];
};

/*
** Apply one "tokenchars" or "separators" option to tokenizer p.
**
** Every ASCII byte of the nul-terminated string zArg has its entry in
** p->aTokenChar[] overwritten with bTokenChars. Bytes with the high bit
** set are ignored.
*/
static void fts5AsciiAddExceptions(
  AsciiTokenizer *p, 
  const char *zArg, 
  int bTokenChars
){
  const unsigned char eClass = (unsigned char)bTokenChars;
  const char *zIter;
  for(zIter=zArg; *zIter; zIter++){
    if( *zIter & 0x80 ) continue;           /* Not an ASCII byte */
    p->aTokenChar[(int)*zIter] = eClass;
  }
}

/*
** Create an "ascii" tokenizer.
**
** azArg[0..nArg-1] holds option/value pairs, so nArg must be even. The
** recognised option names, matched with sqlite3_stricmp(), are
** "tokenchars" and "separators". pCtx is not used.
**
** On success SQLITE_OK is returned and *ppOut points to the new tokenizer.
** On failure an error code is returned (SQLITE_ERROR for an odd nArg or an
** unknown option, SQLITE_NOMEM if the allocation fails) and *ppOut is set
** to NULL. Any partially built object is freed here, so an error return
** never leaves an allocation behind for the caller.
*/
static int fts5AsciiCreate(
  void *pCtx, 
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  int rc = SQLITE_OK;
  AsciiTokenizer *p = 0;
  if( nArg%2 ){
    rc = SQLITE_ERROR;
  }else{
    p = sqlite3_malloc(sizeof(AsciiTokenizer));
    if( p==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;
      memset(p, 0, sizeof(AsciiTokenizer));
      /* Start from the default ASCII alphanumeric classification */
      memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          fts5AsciiAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          fts5AsciiAddExceptions(p, zArg, 0);
        }else{
          rc = SQLITE_ERROR;
        }
      }
      /* Do not hand a half-configured object back along with an error */
      if( rc!=SQLITE_OK ){
        sqlite3_free(p);
        p = 0;
      }
    }
  }

  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

/*
** Destroy an "ascii" tokenizer by releasing its allocation with
** sqlite3_free().
*/
static void fts5AsciiDelete(Fts5Tokenizer *p){
  AsciiTokenizer *pAscii = (AsciiTokenizer*)p;
  sqlite3_free(pAscii);
}


/*
** Copy nByte bytes from aIn to aOut, converting the ASCII upper case
** letters 'A'..'Z' to lower case. All other bytes are copied unchanged.
*/
static void asciiFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    const char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      aOut[iByte] = (char)(ch + 32);
    }else{
      aOut[iByte] = ch;
    }
    iByte++;
  }
}

/*
** Tokenize some text using the ascii tokenizer.
*/
static int fts5AsciiTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  int ie;
  int is = 0;

  char aFold[64];
  int nFold = sizeof(aFold);
  char *pFold = aFold;
  unsigned char *a = p->aTokenChar;

  while( is<nText && rc==SQLITE_OK ){
    int nByte;

    /* Skip any leading divider characters. */
    while( is<nText && ((pText[is]&0x80)==0 && a[(int)pText[is]]==0) ){
      is++;
    }
    if( is==nText ) break;

    /* Count the token characters */
    ie = is+1;
    while( ie<nText && ((pText[ie]&0x80) || a[(int)pText[ie]] ) ){
      ie++;
    }

    /* Fold to lower case */
    nByte = ie-is;
    if( nByte>nFold ){
      if( pFold!=aFold ) sqlite3_free(pFold);
      pFold = sqlite3_malloc(nByte*2);
      if( pFold==0 ){
        rc = SQLITE_NOMEM;
        break;
      }
      nFold = nByte*2;
    }
    asciiFold(pFold, &pText[is], nByte);

    /* Invoke the token callback */
    rc = xToken(pCtx, pFold, nByte, is, ie);
    is = ie+1;
  }
  
  if( pFold!=aFold ) sqlite3_free(pFold);
201
202
203
204
205
206
207

208
209
210
211
212
213
214
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }


#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \







>







201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
      c = (c<<6) + (0x3f & *(zIn++));                      \
    }                                                      \
    if( c<0x80                                             \
        || (c&0xFFFFF800)==0xD800                          \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
  }


#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (unsigned char)(c&0xFF);                 \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + (unsigned char)((c>>6)&0x1F);     \
226
227
228
229
230
231
232



233
234
235
236
237
238
239
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */

/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {



  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  /* Codepoints named by the "tokenchars"/"separators" options whose
  ** classification differs from sqlite3Fts5UnicodeIsalnum(). New entries
  ** are inserted so that the array stays in ascending order. */
  int *aiException;
};

static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */







>
>
>







227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
  }                                                    \
}

#endif /* ifndef SQLITE_AMALGAMATION */

/*
** State of one "unicode61" tokenizer instance.
*/
typedef struct Unicode61Tokenizer Unicode61Tokenizer;
struct Unicode61Tokenizer {
  /* aTokenChar[c] is non-zero if ASCII codepoint c is a token character.
  ** Codepoints below 128 are classified with this table only. */
  unsigned char aTokenChar[128];  /* ASCII range token characters */
  char *aFold;                    /* Buffer to fold text into */
  int nFold;                      /* Size of aFold[] in bytes */
  int bRemoveDiacritic;           /* True if remove_diacritics=1 is set */
  int nException;                 /* Number of entries in aiException[] */
  /* Non-ASCII codepoints named by the "tokenchars"/"separators" options
  ** whose classification differs from sqlite3Fts5UnicodeIsalnum(). New
  ** entries are inserted so that the array stays in ascending order. */
  int *aiException;
};

static int fts5UnicodeAddExceptions(
  Unicode61Tokenizer *p,          /* Tokenizer object */
250
251
252
253
254
255
256



257
258
259
260
261
262
263
264
265
266
267

268
269
270
271
272
273
274
      int nNew = p->nException;
      const unsigned char *zCsr = (const unsigned char*)z;
      const unsigned char *zTerm = (const unsigned char*)&z[n];
      while( zCsr<zTerm ){
        int iCode;
        int bToken;
        READ_UTF8(zCsr, zTerm, iCode);



        bToken = sqlite3Fts5UnicodeIsalnum(iCode);
        assert( (bToken==0 || bToken==1) ); 
        assert( (bTokenChars==0 || bTokenChars==1) );
        if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
          int i;
          for(i=0; i<nNew; i++){
            if( aNew[i]>iCode ) break;
          }
          memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
          aNew[i] = iCode;
          nNew++;

        }
      }
      p->aiException = aNew;
      p->nException = nNew;
    }else{
      rc = SQLITE_NOMEM;
    }







>
>
>
|
|
|
|
|
|
|
|
|
|
|
>







254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
      int nNew = p->nException;
      const unsigned char *zCsr = (const unsigned char*)z;
      const unsigned char *zTerm = (const unsigned char*)&z[n];
      while( zCsr<zTerm ){
        int iCode;
        int bToken;
        READ_UTF8(zCsr, zTerm, iCode);
        if( iCode<128 ){
          p->aTokenChar[iCode] = bTokenChars;
        }else{
          bToken = sqlite3Fts5UnicodeIsalnum(iCode);
          assert( (bToken==0 || bToken==1) ); 
          assert( (bTokenChars==0 || bTokenChars==1) );
          if( bToken!=bTokenChars && sqlite3Fts5UnicodeIsdiacritic(iCode)==0 ){
            int i;
            for(i=0; i<nNew; i++){
              if( aNew[i]>iCode ) break;
            }
            memmove(&aNew[i+1], &aNew[i], (nNew-i)*sizeof(int));
            aNew[i] = iCode;
            nNew++;
          }
        }
      }
      p->aiException = aNew;
      p->nException = nNew;
    }else{
      rc = SQLITE_NOMEM;
    }
296
297
298
299
300
301
302













303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321

322





323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341




342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381

382
383
384
385






386


387
388
389






390
391
392
393
394
395



396
397

398

399

400
401



402
403

404
405
406
407
408
409
410
411
412
413
414




415
416


417



418
419


420









421
422
423
424


425
426
427


428

429
430

431
432
433
434
435
436
437
        iHi = iTest-1;
      }
    }
  }

  return 0;
}














/*
** Create a "unicode61" tokenizer.
**
** azArg[0..nArg-1] holds option/value pairs, so nArg must be even. The
** recognised option names, matched with sqlite3_stricmp(), are:
**
**   remove_diacritics   value must be exactly "0" or "1" (default 1)
**   tokenchars          extra characters to treat as token characters
**   separators          extra characters to treat as separators
**
** pCtx is not used.
**
** On success SQLITE_OK is returned and *ppOut points to the new tokenizer.
** On any failure an error code is returned, everything allocated so far is
** freed and *ppOut is set to NULL. *ppOut is written on every path,
** including the odd-nArg error.
*/
static int fts5UnicodeCreate(
  void *pCtx, 
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  int rc = SQLITE_OK;             /* Return code */
  Unicode61Tokenizer *p = 0;      /* New tokenizer object */ 

  if( nArg%2 ){
    rc = SQLITE_ERROR;
  }else{
    p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
    if( p ){
      int i;
      memset(p, 0, sizeof(Unicode61Tokenizer));
      p->bRemoveDiacritic = 1;
      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
          if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
            rc = SQLITE_ERROR;
          }
          p->bRemoveDiacritic = (zArg[0]=='1');
        }else
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 0);
        }else{
          rc = SQLITE_ERROR;
        }
      }
      /* Release the partially configured object rather than returning it
      ** together with an error code. */
      if( rc!=SQLITE_OK ){
        sqlite3_free(p->aiException);
        sqlite3_free(p);
        p = 0;
      }
    }else{
      rc = SQLITE_NOMEM;
    }
  }
  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

/*
** Delete a "unicode61" tokenizer: free the exception array and then the
** object itself. Passing a NULL handle is a harmless no-op.
*/
static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
  if( pTok ){
    Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTok;
    sqlite3_free(p->aiException);
    sqlite3_free(p);
  }
  return;
}

/*
** Decide whether codepoint iCode is a token character (as opposed to a
** separator) for tokenizer p. The answer is the built-in classification
** from sqlite3Fts5UnicodeIsalnum(), inverted if iCode is listed as an
** exception for this tokenizer. The result is 0 or 1.
*/
static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
  int bBuiltin = sqlite3Fts5UnicodeIsalnum(iCode);
  assert( (bBuiltin & 0xFFFFFFFE)==0 );
  return bBuiltin ^ fts5UnicodeIsException(p, iCode);
}

/*
** Tokenize some text using a unicode61 tokenizer.
**
** The nText bytes at pText are decoded one UTF-8 character at a time.
** Consecutive token characters are folded with sqlite3Fts5UnicodeFold()
** into a temporary buffer, and xToken() is invoked once per token with the
** folded text and the byte offsets of the token within pText. A diacritic
** that follows at least one token character is kept as part of the token.
**
** Returns SQLITE_OK, SQLITE_NOMEM if the buffer cannot be grown, or the
** first non-zero value returned by xToken().
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  const unsigned char *zInput = (const unsigned char*)pText;
  const unsigned char *zTerm = &zInput[nText];
  const unsigned char *z = zInput;
  int rc = SQLITE_OK;

  int nBuf = 0;                   /* Allocated size of zBuf in bytes */
  unsigned char *zBuf = 0;        /* Buffer holding the folded token */
  unsigned char *zOut = 0;        /* Next byte to write within zBuf */

  /* Start of the token currently being accumulated. This must live outside
  ** the loop: it is assigned on the iteration that begins a token and read
  ** on the later iteration that ends it. */
  const unsigned char *zStart = zInput;

  while( rc==SQLITE_OK && z<zTerm ){
    int iCode;
    int bAlnum;
    const unsigned char *zCode;

    /* An empty buffer means the next token, if any, starts here */
    if( zOut==zBuf ) zStart = z;
    zCode = z;
    READ_UTF8(z, zTerm, iCode);
    bAlnum = fts5UnicodeIsAlnum(p, iCode);
    if( bAlnum==0 && zOut>zBuf ){
      /* Mid-token, a diacritic counts as part of the token */
      bAlnum = sqlite3Fts5UnicodeIsdiacritic(iCode);
    }

    if( bAlnum ){
      int iOut;

      /* Grow the output buffer if required */
      while( (zOut-zBuf)+4>=nBuf ){
        unsigned char *zNew;
        nBuf = (nBuf ? nBuf*2 : 128);
        zNew = sqlite3_realloc(zBuf, nBuf);
        if( zNew==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_finished;
        }else{
          zOut = &zNew[zOut-zBuf];
          zBuf = zNew;
        }
      }

      /* Write the new character to it. Nothing is written if the folded
      ** codepoint is 0. */
      iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
      if( iOut ) WRITE_UTF8(zOut, iOut);
    }

    /* Emit the token once a separator or the end of input is reached */
    if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){
      int ie = (bAlnum ? z : zCode) - zInput;
      rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie);
      zOut = zBuf;
    }
  }

 tokenize_finished:
  sqlite3_free(zBuf);

  return rc;
}

/**************************************************************************
** Start of porter stemmer implementation.
*/








>
>
>
>
>
>
>
>
>
>
>
>
>



















>

>
>
>
>
>



















>
>
>
>






<
<
<
<
<
<
<
<
<
<










<
<
<







<
<
<

>
|
|
|

>
>
>
>
>
>
|
>
>
|
|
|
>
>
>
>
>
>
|
<
<
<
|
|
>
>
>
|
|
>
|
>
|
>
|
<
>
>
>

|
>
|
<
<
|
|

|
<
<
<

>
>
>
>


>
>
|
>
>
>
|
|
>
>
|
>
>
>
>
>
>
>
>
>
|
<
<
<
>
>
|

|
>
>
|
>
|
<
>







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378










379
380
381
382
383
384
385
386
387
388



389
390
391
392
393
394
395



396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420



421
422
423
424
425
426
427
428
429
430
431
432
433

434
435
436
437
438
439
440


441
442
443
444



445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472



473
474
475
476
477
478
479
480
481
482

483
484
485
486
487
488
489
490
        iHi = iTest-1;
      }
    }
  }

  return 0;
}

/*
** Destroy a "unicode61" tokenizer, releasing the exception array, the
** fold buffer and finally the object itself. A NULL handle is ignored.
*/
static void fts5UnicodeDelete(Fts5Tokenizer *pTok){
  Unicode61Tokenizer *pUni = (Unicode61Tokenizer*)pTok;
  if( pUni==0 ) return;
  sqlite3_free(pUni->aiException);
  sqlite3_free(pUni->aFold);
  sqlite3_free(pUni);
}

/*
** Create a "unicode61" tokenizer.
**
** azArg[0..nArg-1] holds option/value pairs, so nArg must be even. The
** recognised option names, matched with sqlite3_stricmp(), are:
**
**   remove_diacritics   value must be exactly "0" or "1" (default 1)
**   tokenchars          extra characters to treat as token characters
**   separators          extra characters to treat as separators
**
** pCtx is not used. The new object starts with the default ASCII
** classification table and a 64 byte fold buffer.
**
** On success SQLITE_OK is returned and *ppOut points to the new tokenizer.
** On any failure an error code is returned, the partially built object is
** deleted and *ppOut is set to NULL. *ppOut is written on every path,
** including the odd-nArg error.
*/
static int fts5UnicodeCreate(
  void *pCtx, 
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  int rc = SQLITE_OK;             /* Return code */
  Unicode61Tokenizer *p = 0;      /* New tokenizer object */ 

  if( nArg%2 ){
    rc = SQLITE_ERROR;
  }else{
    p = (Unicode61Tokenizer*)sqlite3_malloc(sizeof(Unicode61Tokenizer));
    if( p ){
      int i;
      memset(p, 0, sizeof(Unicode61Tokenizer));
      memcpy(p->aTokenChar, aAsciiTokenChar, sizeof(aAsciiTokenChar));
      p->bRemoveDiacritic = 1;
      p->nFold = 64;
      p->aFold = sqlite3_malloc(p->nFold * sizeof(char));
      if( p->aFold==0 ){
        rc = SQLITE_NOMEM;
      }
      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "remove_diacritics") ){
          if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1] ){
            rc = SQLITE_ERROR;
          }
          p->bRemoveDiacritic = (zArg[0]=='1');
        }else
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          rc = fts5UnicodeAddExceptions(p, zArg, 0);
        }else{
          rc = SQLITE_ERROR;
        }
      }
    }else{
      rc = SQLITE_NOMEM;
    }
    if( rc!=SQLITE_OK ){
      /* fts5UnicodeDelete() accepts NULL, so this also covers the case
      ** where the initial allocation failed. */
      fts5UnicodeDelete((Fts5Tokenizer*)p);
      p = 0;
    }
  }
  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}











/*
** Decide whether codepoint iCode is a token character (as opposed to a
** separator) for tokenizer p. The answer is the built-in classification
** from sqlite3Fts5UnicodeIsalnum(), inverted if iCode is listed as an
** exception for this tokenizer. The result is 0 or 1.
*/
static int fts5UnicodeIsAlnum(Unicode61Tokenizer *p, int iCode){
  int bBuiltin = sqlite3Fts5UnicodeIsalnum(iCode);
  assert( (bBuiltin & 0xFFFFFFFE)==0 );
  return bBuiltin ^ fts5UnicodeIsException(p, iCode);
}




/*
** Tokenize some text using a unicode61 tokenizer.
**
** The nText bytes at pText are scanned for runs of token characters.
** ASCII bytes are classified with p->aTokenChar[] and folded inline;
** bytes with the high bit set are decoded with READ_UTF8 and classified
** and folded through the unicode helper functions. Each token is folded
** into the tokenizer's own buffer (p->aFold, grown by doubling as needed)
** and passed to xToken() along with its byte offsets within pText.
**
** Returns SQLITE_OK, or SQLITE_NOMEM if the fold buffer cannot be grown,
** or the value returned by xToken() if that is neither SQLITE_OK nor
** SQLITE_DONE. An SQLITE_DONE return from xToken() stops tokenization and
** is reported to the caller as SQLITE_OK.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;

  /* Each iteration of this loop gobbles up a contiguous run of separators,
  ** then the next token.  */
  while( rc==SQLITE_OK ){
    int iCode;                    /* non-ASCII codepoint read from input */
    char *zOut = aFold;
    int is;
    int ie;

    /* Skip any separator characters. */
    while( 1 ){
      if( zCsr>=zTerm ) goto tokenize_done;
      if( *zCsr & 0x80 ) {
        /* A character outside of the ascii range. Skip past it if it is
        ** a separator character. Or break out of the loop if it is not. */
        is = zCsr - (unsigned char*)pText;
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p, iCode) ){
          goto non_ascii_tokenchar;
        }
      }else{
        if( a[*zCsr] ){
          is = zCsr - (unsigned char*)pText;
          goto ascii_tokenchar;
        }
        zCsr++;
      }
    }

    /* Run through the tokenchars. Fold them into the output buffer along
    ** the way.  */
    while( zCsr<zTerm ){

      /* Grow the output buffer so that there is sufficient space to fit the
      ** largest possible utf-8 character.  */
      if( (zOut-aFold)+6>nFold ){
        aFold = sqlite3_malloc(nFold*2);
        if( aFold==0 ){
          rc = SQLITE_NOMEM;
          goto tokenize_done;
        }
        /* Rebase the write cursor onto the new buffer before the old one
        ** is freed. Otherwise zOut is left pointing into freed memory. */
        zOut = &aFold[zOut - p->aFold];
        memcpy(aFold, p->aFold, nFold);
        sqlite3_free(p->aFold);
        p->aFold = aFold;
        p->nFold = nFold = nFold*2;
      }

      if( *zCsr & 0x80 ){
        /* An non-ascii-range character. Fold it into the output buffer if
        ** it is a token character, or break out of the loop if it is not. */
        READ_UTF8(zCsr, zTerm, iCode);
        if( fts5UnicodeIsAlnum(p,iCode)||sqlite3Fts5UnicodeIsdiacritic(iCode) ){
 non_ascii_tokenchar:
          iCode = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
          if( iCode ) WRITE_UTF8(zOut, iCode);
        }else{
          break;
        }
      }else if( a[*zCsr]==0 ){
        /* An ascii-range separator character. End of token. */
        break; 
      }else{
 ascii_tokenchar:
        if( *zCsr>='A' && *zCsr<='Z' ){
          *zOut++ = *zCsr + 32;
        }else{
          *zOut++ = *zCsr;
        }
        zCsr++;
      }
      /* Only reached after a token character has been consumed, so ie is
      ** always the offset just past the last character of the token. */
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
  }
  
 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}

/**************************************************************************
** Start of porter stemmer implementation.
*/

471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
  int rc = SQLITE_OK;
  PorterTokenizer *pRet;
  void *pUserdata = 0;

  pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
  if( pRet ){
    memset(pRet, 0, sizeof(PorterTokenizer));
    rc = pApi->xFindTokenizer(pApi, "simple", &pUserdata, &pRet->tokenizer);
  }else{
    rc = SQLITE_NOMEM;
  }
  if( rc==SQLITE_OK ){
    rc = pRet->tokenizer.xCreate(pUserdata, 0, 0, &pRet->pTokenizer);
  }








|







524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
  int rc = SQLITE_OK;
  PorterTokenizer *pRet;
  void *pUserdata = 0;

  pRet = (PorterTokenizer*)sqlite3_malloc(sizeof(PorterTokenizer));
  if( pRet ){
    memset(pRet, 0, sizeof(PorterTokenizer));
    rc = pApi->xFindTokenizer(pApi, "ascii", &pUserdata, &pRet->tokenizer);
  }else{
    rc = SQLITE_NOMEM;
  }
  if( rc==SQLITE_OK ){
    rc = pRet->tokenizer.xCreate(pUserdata, 0, 0, &pRet->pTokenizer);
  }

785
786
787
788
789
790
791
792
793
794

795
796
797
798
799
800
801
** Register all built-in tokenizers with FTS5.
*/
int sqlite3Fts5TokenizerInit(fts5_api *pApi){
  struct BuiltinTokenizer {
    const char *zName;
    fts5_tokenizer x;
  } aBuiltin[] = {
    { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
    { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
    { "simple",    {fts5SimpleCreate, fts5SimpleDelete, fts5SimpleTokenize }}

  };
  
  int rc = SQLITE_OK;             /* Return code */
  int i;                          /* To iterate through builtin functions */

  for(i=0; rc==SQLITE_OK && i<sizeof(aBuiltin)/sizeof(aBuiltin[0]); i++){
    rc = pApi->xCreateTokenizer(pApi,







<

|
>







838
839
840
841
842
843
844

845
846
847
848
849
850
851
852
853
854
** Register all built-in tokenizers with FTS5.
*/
int sqlite3Fts5TokenizerInit(fts5_api *pApi){
  struct BuiltinTokenizer {
    const char *zName;
    fts5_tokenizer x;
  } aBuiltin[] = {

    { "unicode61", {fts5UnicodeCreate, fts5UnicodeDelete, fts5UnicodeTokenize}},
    { "ascii",     {fts5AsciiCreate, fts5AsciiDelete, fts5AsciiTokenize }},
    { "porter",    {fts5PorterCreate, fts5PorterDelete, fts5PorterTokenize }},
  };
  
  int rc = SQLITE_OK;             /* Return code */
  int i;                          /* To iterate through builtin functions */

  for(i=0; rc==SQLITE_OK && i<sizeof(aBuiltin)/sizeof(aBuiltin[0]); i++){
    rc = pApi->xCreateTokenizer(pApi,
Changes to ext/fts5/test/fts5near.test.
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    DELETE FROM t1;
    INSERT INTO t1 VALUES('$doc');
    SELECT count(*) FROM t1 WHERE t1 MATCH '$near';
  " $res]
}

# Create the table used by the NEAR tests below. The "tokenchars ." option
# asks the "simple" tokenizer to treat '.' as a token character.
execsql { 
  CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'simple tokenchars .') 
}

# In the document used below, "a" and "b" have three "." entries between
# them. The expected counts say NEAR(a b, N) matches for N = 5, 4 and 3 but
# not for N = 2.
do_near_test 1.1 ". . a . . . b . ." { NEAR(a b, 5) } 1
do_near_test 1.2 ". . a . . . b . ." { NEAR(a b, 4) } 1
do_near_test 1.3 ". . a . . . b . ." { NEAR(a b, 3) } 1
do_near_test 1.4 ". . a . . . b . ." { NEAR(a b, 2) } 0








|







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
    DELETE FROM t1;
    INSERT INTO t1 VALUES('$doc');
    SELECT count(*) FROM t1 WHERE t1 MATCH '$near';
  " $res]
}

# Create the table used by the NEAR tests below. The "tokenchars ." option
# asks the "ascii" tokenizer to treat '.' as a token character.
execsql { 
  CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'ascii tokenchars .') 
}

# In the document used below, "a" and "b" have three "." entries between
# them. The expected counts say NEAR(a b, N) matches for N = 5, 4 and 3 but
# not for N = 2.
do_near_test 1.1 ". . a . . . b . ." { NEAR(a b, 5) } 1
do_near_test 1.2 ". . a . . . b . ." { NEAR(a b, 4) } 1
do_near_test 1.3 ". . a . . . b . ." { NEAR(a b, 3) } 1
do_near_test 1.4 ". . a . . . b . ." { NEAR(a b, 2) } 0

Changes to ext/fts5/test/fts5tokenizer.test.
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
  DROP TABLE ft1;
}
# A table can be created with the "porter" tokenizer named on its own, and
# then dropped again.
do_execsql_test 1.3 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  DROP TABLE ft1;
}
# Same check, with "porter" followed by an explicit "simple" argument.
do_execsql_test 1.4 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter simple');
  DROP TABLE ft1;
}

# Create ft1 with the porter tokenizer (unquoted form) and insert one row.
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}







|







29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
  DROP TABLE ft1;
}
# A table can be created with the "porter" tokenizer named on its own, and
# then dropped again.
do_execsql_test 1.3 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter');
  DROP TABLE ft1;
}
# Same check, with "porter" followed by an explicit "ascii" argument.
do_execsql_test 1.4 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize = 'porter ascii');
  DROP TABLE ft1;
}

# Create ft1 with the porter tokenizer (unquoted form) and insert one row.
do_execsql_test 2.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  INSERT INTO ft1 VALUES('embedded databases');
}
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# The column specification "x y" is expected to be rejected: the CREATE
# statement must fail with the message 'parse error in "x y"'.
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {parse error in "x y"}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
foreach {tn tokenizer} {1 simple 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }
  foreach {tn2 token res} {







|







71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# The column specification "x y" is expected to be rejected: the CREATE
# statement must fail with the message 'parse error in "x y"'.
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {parse error in "x y"}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
foreach {tn tokenizer} {1 ascii 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }
  foreach {tn2 token res} {
Changes to ext/fts5/test/fts5unicode.test.
21
22
23
24
25
26
27
28
29
30
31
32
33




















34
35
36
    foreach {z s e} [sqlite3_fts5_tokenize db {$tokenizer} {$input}] {
      lappend ret [set z]
    }
    set ret
  }] [list {*}$output]]
}

# Run the same basic tokenization checks against both the "simple" and
# "unicode61" tokenizers: upper-case input is expected back in lower case,
# '.' and ',' are expected to act as separators, and empty input is expected
# to produce no tokens.
foreach {tn t} {1 simple 2 unicode61} {
  tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
  tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
  tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
  tokenize_test 1.$tn.3 $t {} {}
}





















finish_test








|





>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
    foreach {z s e} [sqlite3_fts5_tokenize db {$tokenizer} {$input}] {
      lappend ret [set z]
    }
    set ret
  }] [list {*}$output]]
}

# Run the same basic tokenization checks against both the "ascii" and
# "unicode61" tokenizers: upper-case input is expected back in lower case,
# '.' and ',' are expected to act as separators, and empty input is expected
# to produce no tokens.
foreach {tn t} {1 ascii 2 unicode61} {
  tokenize_test 1.$tn.0 $t {A B C D} {a b c d}
  tokenize_test 1.$tn.1 $t {May you share freely,} {may you share freely}
  tokenize_test 1.$tn.2 $t {..May...you.shAre.freely} {may you share freely}
  tokenize_test 1.$tn.3 $t {} {}
}

#-------------------------------------------------------------------------
# Check that "unicode61" really is the default tokenizer.
#
# t1 names no tokenizer, t2 names unicode61 explicitly and t3 names ascii.
# Each table receives one row holding the upper-case characters U+00C0,
# U+00C8 and U+00CC. The SQL is passed in double quotes so that Tcl expands
# the \x escapes before the statements are run.
#

do_execsql_test 2.0 "
  CREATE VIRTUAL TABLE t1 USING fts5(x);
  CREATE VIRTUAL TABLE t2 USING fts5(x, tokenize = unicode61);
  CREATE VIRTUAL TABLE t3 USING fts5(x, tokenize = ascii);
  INSERT INTO t1 VALUES('\xC0\xC8\xCC');
  INSERT INTO t2 VALUES('\xC0\xC8\xCC');
  INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"

# Query each table with the lower-case forms U+00E0, U+00E8 and U+00EC.
# Only t1 and t2 are expected to match, i.e. the table created without a
# tokenize option behaves like the unicode61 table and unlike the ascii one.
do_execsql_test 2.1 "
  SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
  SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
  SELECT 't3' FROM t3 WHERE t3 MATCH '\xE0\xE8\xEC';
" {t1 t2}


finish_test