SQLite

Check-in [fc71868496]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Another change to the fts5 tokenizer API.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5-incompatible
Files: files | file ages | folders
SHA1: fc71868496f45f9c7a79ed2bf2d164a7c4718ce1
User & Date: dan 2015-08-29 15:44:27.938
Context
2015-08-29
18:46
Add a test for an fts5 tokenizer that supports synonyms by adding multiple entries to the fts index. (check-in: 98d07d16ca user: dan tags: fts5-incompatible)
15:44
Another change to the fts5 tokenizer API. (check-in: fc71868496 user: dan tags: fts5-incompatible)
2015-08-28
19:56
Change the fts5 tokenizer API to allow more than one token to occupy a single position within a document. (check-in: 90b85b42f2 user: dan tags: fts5-incompatible)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts5/fts5.h.
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
  int (*xColumnCount)(Fts5Context*);
  int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow);
  int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);

  int (*xTokenize)(Fts5Context*, 
    const char *pText, int nText, /* Text to tokenize */
    void *pCtx,                   /* Context passed to xToken() */
    int (*xToken)(void*, const char*, int, int, int, int)       /* Callback */
  );

  int (*xPhraseCount)(Fts5Context*);
  int (*xPhraseSize)(Fts5Context*, int iPhrase);

  int (*xInstCount)(Fts5Context*, int *pnInst);
  int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);







|







213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
  int (*xColumnCount)(Fts5Context*);
  int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow);
  int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);

  int (*xTokenize)(Fts5Context*, 
    const char *pText, int nText, /* Text to tokenize */
    void *pCtx,                   /* Context passed to xToken() */
    int (*xToken)(void*, int, const char*, int, int, int)       /* Callback */
  );

  int (*xPhraseCount)(Fts5Context*);
  int (*xPhraseSize)(Fts5Context*, int iPhrase);

  int (*xInstCount)(Fts5Context*, int *pnInst);
  int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);
309
310
311
312
313
314
315

316
317
318
319
320
321
322
323
324

325
326
327
328
329




330
331
332
333
334
335
336
  void (*xDelete)(Fts5Tokenizer*);
  int (*xTokenize)(Fts5Tokenizer*, 
      void *pCtx,
      int flags,
      const char *pText, int nText, 
      int (*xToken)(
        void *pCtx,         /* Copy of 2nd argument to xTokenize() */

        const char *pToken, /* Pointer to buffer containing token */
        int nToken,         /* Size of token in bytes */
        int iStart,         /* Byte offset of token within input text */
        int iEnd,           /* Byte offset of end of token within input text */
        int iPos            /* Number of tokens before this one in input text */
      )
  );
};


#define FTS5_TOKENIZE_QUERY     0x0001
#define FTS5_TOKENIZE_PREFIX    0x0002
#define FTS5_TOKENIZE_DOCUMENT  0x0004
#define FTS5_TOKENIZE_AUX       0x0008





/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/

/*************************************************************************
** FTS5 EXTENSION REGISTRATION API
*/







>



|
<




>





>
>
>
>







309
310
311
312
313
314
315
316
317
318
319
320

321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
  void (*xDelete)(Fts5Tokenizer*);
  int (*xTokenize)(Fts5Tokenizer*, 
      void *pCtx,
      int flags,
      const char *pText, int nText, 
      int (*xToken)(
        void *pCtx,         /* Copy of 2nd argument to xTokenize() */
        int tflags,         /* Mask of FTS5_TOKEN_* flags */
        const char *pToken, /* Pointer to buffer containing token */
        int nToken,         /* Size of token in bytes */
        int iStart,         /* Byte offset of token within input text */
        int iEnd            /* Byte offset of end of token within input text */

      )
  );
};

/* Flags that may be passed as the third argument to xTokenize() */
#define FTS5_TOKENIZE_QUERY     0x0001
#define FTS5_TOKENIZE_PREFIX    0x0002
#define FTS5_TOKENIZE_DOCUMENT  0x0004
#define FTS5_TOKENIZE_AUX       0x0008

/* Flags that may be passed by the tokenizer implementation back to FTS5
** as the third argument to the supplied xToken callback. */
#define FTS5_TOKEN_COLOCATED    0x0001      /* Same position as prev. token */

/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/

/*************************************************************************
** FTS5 EXTENSION REGISTRATION API
*/
Changes to ext/fts5/fts5Int.h.
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig);

int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  int flags,                      /* FTS5_TOKENIZE_* flags */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int, int)    /* Callback */
);

void sqlite3Fts5Dequote(char *z);

/* Load the contents of the %_config table */
int sqlite3Fts5ConfigLoad(Fts5Config*, int);








|







165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig);

int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  int flags,                      /* FTS5_TOKENIZE_* flags */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, int, const char*, int, int, int)    /* Callback */
);

void sqlite3Fts5Dequote(char *z);

/* Load the contents of the %_config table */
int sqlite3Fts5ConfigLoad(Fts5Config*, int);

Changes to ext/fts5/fts5_aux.c.
144
145
146
147
148
149
150

151
152
153
154
155
156
157
158

159
160
161
162
163
164
165
166
167
168
}

/*
** Tokenizer callback used by implementation of highlight() function.
*/
static int fts5HighlightCb(
  void *pContext,                 /* Pointer to HighlightContext object */

  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStartOff,                  /* Start offset of token */
  int iEndOff,                    /* End offset of token */
  int iPos
){
  HighlightContext *p = (HighlightContext*)pContext;
  int rc = SQLITE_OK;


  if( iPos<p->iPos ) return SQLITE_OK;
  p->iPos = iPos+1;

  if( p->iRangeEnd>0 ){
    if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
    if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
  }

  if( iPos==p->iter.iStart ){







>



|
<



>

|
|







144
145
146
147
148
149
150
151
152
153
154
155

156
157
158
159
160
161
162
163
164
165
166
167
168
169
}

/*
** Tokenizer callback used by implementation of highlight() function.
*/
static int fts5HighlightCb(
  void *pContext,                 /* Pointer to HighlightContext object */
  int tflags,                     /* Mask of FTS5_TOKEN_* flags */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStartOff,                  /* Start offset of token */
  int iEndOff                     /* End offset of token */

){
  HighlightContext *p = (HighlightContext*)pContext;
  int rc = SQLITE_OK;
  int iPos;

  if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
  iPos = p->iPos++;

  if( p->iRangeEnd>0 ){
    if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
    if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
  }

  if( iPos==p->iter.iStart ){
Changes to ext/fts5/fts5_config.c.
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
** to be an SQLite error code and returned to the caller.
*/
int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  int flags,                      /* FTS5_TOKENIZE_* flags */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int, int)    /* Callback */
){
  if( pText==0 ) return SQLITE_OK;
  return pConfig->pTokApi->xTokenize(
      pConfig->pTok, pCtx, flags, pText, nText, xToken
  );
}








|







644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
** to be an SQLite error code and returned to the caller.
*/
int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  int flags,                      /* FTS5_TOKENIZE_* flags */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, int, const char*, int, int, int)    /* Callback */
){
  if( pText==0 ) return SQLITE_OK;
  return pConfig->pTokApi->xTokenize(
      pConfig->pTok, pCtx, flags, pText, nText, xToken
  );
}

Changes to ext/fts5/fts5_expr.c.
1334
1335
1336
1337
1338
1339
1340

1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351


1352
1353
1354
1355
1356
1357
1358
};

/*
** Callback for tokenizing terms used by ParseTerm().
*/
static int fts5ParseTokenize(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */

  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos
){
  int rc = SQLITE_OK;
  const int SZALLOC = 8;
  TokenCtx *pCtx = (TokenCtx*)pContext;
  Fts5ExprPhrase *pPhrase = pCtx->pPhrase;
  Fts5ExprTerm *pTerm;



  if( pPhrase==0 || (pPhrase->nTerm % SZALLOC)==0 ){
    Fts5ExprPhrase *pNew;
    int nNew = SZALLOC + (pPhrase ? pPhrase->nTerm : 0);

    pNew = (Fts5ExprPhrase*)sqlite3_realloc(pPhrase, 
        sizeof(Fts5ExprPhrase) + sizeof(Fts5ExprTerm) * nNew







>



|
<






>
>







1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345

1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
};

/*
** Callback for tokenizing terms used by ParseTerm().
*/
static int fts5ParseTokenize(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  int tflags,                     /* Mask of FTS5_TOKEN_* flags */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */

){
  int rc = SQLITE_OK;
  const int SZALLOC = 8;
  TokenCtx *pCtx = (TokenCtx*)pContext;
  Fts5ExprPhrase *pPhrase = pCtx->pPhrase;
  Fts5ExprTerm *pTerm;

  if( tflags & FTS5_TOKEN_COLOCATED ) return rc;

  if( pPhrase==0 || (pPhrase->nTerm % SZALLOC)==0 ){
    Fts5ExprPhrase *pNew;
    int nNew = SZALLOC + (pPhrase ? pPhrase->nTerm : 0);

    pNew = (Fts5ExprPhrase*)sqlite3_realloc(pPhrase, 
        sizeof(Fts5ExprPhrase) + sizeof(Fts5ExprTerm) * nNew
Changes to ext/fts5/fts5_main.c.
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
  return sqlite3Fts5StorageRowCount(pTab->pStorage, pnRow);
}

static int fts5ApiTokenize(
  Fts5Context *pCtx, 
  const char *pText, int nText, 
  void *pUserData,
  int (*xToken)(void*, const char*, int, int, int, int)
){
  Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  return sqlite3Fts5Tokenize(
      pTab->pConfig, FTS5_TOKENIZE_AUX, pText, nText, pUserData, xToken
  );
}







|







1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
  return sqlite3Fts5StorageRowCount(pTab->pStorage, pnRow);
}

static int fts5ApiTokenize(
  Fts5Context *pCtx, 
  const char *pText, int nText, 
  void *pUserData,
  int (*xToken)(void*, int, const char*, int, int, int)
){
  Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  return sqlite3Fts5Tokenize(
      pTab->pConfig, FTS5_TOKENIZE_AUX, pText, nText, pUserData, xToken
  );
}
1653
1654
1655
1656
1657
1658
1659

1660
1661
1662
1663
1664
1665
1666

1667

1668
1669
1670
1671
1672
1673
1674
    }
  }
  return rc;
}

static int fts5ColumnSizeCb(
  void *pContext,                 /* Pointer to int */

  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos
){
  int *pCnt = (int*)pContext;

  *pCnt = iPos+1;

  return SQLITE_OK;
}

static int fts5ApiColumnSize(Fts5Context *pCtx, int iCol, int *pnToken){
  Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  Fts5Config *pConfig = pTab->pConfig;







>



|
<


>
|
>







1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664

1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
    }
  }
  return rc;
}

static int fts5ColumnSizeCb(
  void *pContext,                 /* Pointer to int */
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */

){
  int *pCnt = (int*)pContext;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
    (*pCnt)++;
  }
  return SQLITE_OK;
}

static int fts5ApiColumnSize(Fts5Context *pCtx, int iCol, int *pnToken){
  Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  Fts5Config *pConfig = pTab->pConfig;
Changes to ext/fts5/fts5_storage.c.
355
356
357
358
359
360
361

362
363
364
365
366
367
368
369
370
371

372
373
374
375
376
377
378
379
};

/*
** Tokenization callback used when inserting tokens into the FTS index.
*/
static int fts5StorageInsertCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */

  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos
){
  Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
  Fts5Index *pIdx = pCtx->pStorage->pIndex;
  assert( iPos+1>=pCtx->szCol );
  pCtx->szCol = iPos+1;

  return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, iPos, pToken, nToken);
}

/*
** If a row with rowid iDel is present in the %_content table, add the
** delete-markers to the FTS index necessary to delete it. Do not actually
** remove the %_content row at this time though.
*/







>



|
<



|
|
>
|







355
356
357
358
359
360
361
362
363
364
365
366

367
368
369
370
371
372
373
374
375
376
377
378
379
380
};

/*
** Tokenization callback used when inserting tokens into the FTS index.
*/
static int fts5StorageInsertCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */

){
  Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
  Fts5Index *pIdx = pCtx->pStorage->pIndex;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
    pCtx->szCol++;
  }
  return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken);
}

/*
** If a row with rowid iDel is present in the %_content table, add the
** delete-markers to the FTS index necessary to delete it. Do not actually
** remove the %_content row at this time though.
*/
840
841
842
843
844
845
846

847
848
849
850
851
852
853
854

855

856
857
858
859
860
861
862
863
864
};

/*
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */

  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos
){
  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
  assert( iPos+1>=pCtx->szCol );

  pCtx->szCol = iPos+1;

  pCtx->cksum ^= sqlite3Fts5IndexCksum(
      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, iPos, pToken, nToken
  );
  return SQLITE_OK;
}

/*
** Check that the contents of the FTS index match that of the %_content
** table. Return SQLITE_OK if they do, or SQLITE_CORRUPT if not. Return







>



|
<


<
>
|
>

|







841
842
843
844
845
846
847
848
849
850
851
852

853
854

855
856
857
858
859
860
861
862
863
864
865
866
};

/*
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */

){
  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;

  if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
    pCtx->szCol++;
  }
  pCtx->cksum ^= sqlite3Fts5IndexCksum(
      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
  );
  return SQLITE_OK;
}

/*
** Check that the contents of the FTS index match that of the %_content
** table. Return SQLITE_OK if they do, or SQLITE_CORRUPT if not. Return
Changes to ext/fts5/fts5_tcl.c.
137
138
139
140
141
142
143

144
145
146
147
148
149
150
151
152
typedef struct F5tAuxData F5tAuxData;
struct F5tAuxData {
  Tcl_Obj *pObj;
};

static int xTokenizeCb(
  void *pCtx, 

  const char *zToken, int nToken, 
  int iStart, int iEnd, int iPos
){
  F5tFunction *p = (F5tFunction*)pCtx;
  Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript);
  int rc;

  Tcl_IncrRefCount(pEval);
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewStringObj(zToken, nToken));







>

|







137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
typedef struct F5tAuxData F5tAuxData;
struct F5tAuxData {
  Tcl_Obj *pObj;
};

static int xTokenizeCb(
  void *pCtx, 
  int tflags,
  const char *zToken, int nToken, 
  int iStart, int iEnd
){
  F5tFunction *p = (F5tFunction*)pCtx;
  Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript);
  int rc;

  Tcl_IncrRefCount(pEval);
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewStringObj(zToken, nToken));
580
581
582
583
584
585
586

587
588
589
590
591
592
593
594
595
  Tcl_Obj *pRet;
  int bSubst;
  const char *zInput;
};

static int xTokenizeCb2(
  void *pCtx, 

  const char *zToken, int nToken, 
  int iStart, int iEnd, int iPos
){
  F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
  if( p->bSubst ){
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
    Tcl_ListObjAppendElement(
        0, p->pRet, Tcl_NewStringObj(&p->zInput[iStart], iEnd-iStart)
    );







>

|







581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
  Tcl_Obj *pRet;
  int bSubst;
  const char *zInput;
};

static int xTokenizeCb2(
  void *pCtx, 
  int tflags,
  const char *zToken, int nToken, 
  int iStart, int iEnd
){
  F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
  if( p->bSubst ){
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
    Tcl_ListObjAppendElement(
        0, p->pRet, Tcl_NewStringObj(&p->zInput[iStart], iEnd-iStart)
    );
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
typedef struct F5tTokenizerModule F5tTokenizerInstance;

struct F5tTokenizerContext {
  void *pCtx;
  int (*xToken)(void*, const char*, int, int, int);
};

struct F5tTokenizerModule {
  Tcl_Interp *interp;
  Tcl_Obj *pScript;
  F5tTokenizerContext *pContext;
};







|







692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
typedef struct F5tTokenizerModule F5tTokenizerInstance;

struct F5tTokenizerContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
};

struct F5tTokenizerModule {
  Tcl_Interp *interp;
  Tcl_Obj *pScript;
  F5tTokenizerContext *pContext;
};
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
}

static int f5tTokenizerTokenize(
  Fts5Tokenizer *p, 
  void *pCtx,
  int flags,
  const char *pText, int nText, 
  int (*xToken)(void*, const char*, int, int, int, int)
){
  F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
  void *pOldCtx;
  int (*xOldToken)(void*, const char*, int, int, int);
  Tcl_Obj *pEval;
  int rc;

  pOldCtx = pInst->pContext->pCtx;
  xOldToken = pInst->pContext->xToken;

  pEval = Tcl_DuplicateObj(pInst->pScript);







|



|







750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
}

static int f5tTokenizerTokenize(
  Fts5Tokenizer *p, 
  void *pCtx,
  int flags,
  const char *pText, int nText, 
  int (*xToken)(void*, int, const char*, int, int, int)
){
  F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
  void *pOldCtx;
  int (*xOldToken)(void*, int, const char*, int, int, int);
  Tcl_Obj *pEval;
  int rc;

  pOldCtx = pInst->pContext->pCtx;
  xOldToken = pInst->pContext->xToken;

  pEval = Tcl_DuplicateObj(pInst->pScript);
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
  zToken = Tcl_GetStringFromObj(objv[1], &nToken);
  if( Tcl_GetIntFromObj(interp, objv[2], &iStart) 
   || Tcl_GetIntFromObj(interp, objv[3], &iEnd) 
  ){
    return TCL_ERROR;
  }

  rc = p->xToken(p->pCtx, zToken, nToken, iStart, iEnd);
  Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
  return TCL_OK;
}

static void f5tDelTokenizer(void *pCtx){
  F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
  Tcl_DecrRefCount(pMod->pScript);







|







811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
  zToken = Tcl_GetStringFromObj(objv[1], &nToken);
  if( Tcl_GetIntFromObj(interp, objv[2], &iStart) 
   || Tcl_GetIntFromObj(interp, objv[3], &iEnd) 
  ){
    return TCL_ERROR;
  }

  rc = p->xToken(p->pCtx, 0, zToken, nToken, iStart, iEnd);
  Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
  return TCL_OK;
}

static void f5tDelTokenizer(void *pCtx){
  F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
  Tcl_DecrRefCount(pMod->pScript);
Changes to ext/fts5/fts5_tokenize.c.
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
** Tokenize some text using the ascii tokenizer.
*/
static int fts5AsciiTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
  AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  int ie;
  int is = 0;
  int iPos = 0;

  char aFold[64];
  int nFold = sizeof(aFold);
  char *pFold = aFold;
  unsigned char *a = p->aTokenChar;

  while( is<nText && rc==SQLITE_OK ){







|





<







114
115
116
117
118
119
120
121
122
123
124
125
126

127
128
129
130
131
132
133
** Tokenize some text using the ascii tokenizer.
*/
static int fts5AsciiTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  int ie;
  int is = 0;


  char aFold[64];
  int nFold = sizeof(aFold);
  char *pFold = aFold;
  unsigned char *a = p->aTokenChar;

  while( is<nText && rc==SQLITE_OK ){
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
        break;
      }
      nFold = nByte*2;
    }
    asciiFold(pFold, &pText[is], nByte);

    /* Invoke the token callback */
    rc = xToken(pCtx, pFold, nByte, is, ie, iPos++);
    is = ie+1;
  }
  
  if( pFold!=aFold ) sqlite3_free(pFold);
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}







|







155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
        break;
      }
      nFold = nByte*2;
    }
    asciiFold(pFold, &pText[is], nByte);

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, pFold, nByte, is, ie);
    is = ie+1;
  }
  
  if( pFold!=aFold ) sqlite3_free(pFold);
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
}

static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;
  int iPos = 0;

  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;







|




<







384
385
386
387
388
389
390
391
392
393
394
395

396
397
398
399
400
401
402
}

static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  int rc = SQLITE_OK;
  unsigned char *a = p->aTokenChar;


  unsigned char *zTerm = (unsigned char*)&pText[nText];
  unsigned char *zCsr = (unsigned char *)pText;

  /* Output buffer */
  char *aFold = p->aFold;
  int nFold = p->nFold;
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
        }
        zCsr++;
      }
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, aFold, zOut-aFold, is, ie, iPos++);
  }
  
 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}








|







473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
        }
        zCsr++;
      }
      ie = zCsr - (unsigned char*)pText;
    }

    /* Invoke the token callback */
    rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie); 
  }
  
 tokenize_done:
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}

553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
  *ppOut = (Fts5Tokenizer*)pRet;
  return rc;
}

typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, const char*, int, int, int, int);
  char *aBuf;
};

typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;
  int nSuffix;







|







551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
  *ppOut = (Fts5Tokenizer*)pRet;
  return rc;
}

typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
  char *aBuf;
};

typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;
  int nSuffix;
1118
1119
1120
1121
1122
1123
1124

1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
      *pnBuf = nBuf-1;
    }
  }
}

static int fts5PorterCb(
  void *pCtx, 

  const char *pToken, 
  int nToken, 
  int iStart, 
  int iEnd,
  int iPos
){
  PorterContext *p = (PorterContext*)pCtx;

  char *aBuf;
  int nBuf;

  if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;







>



|
<







1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127

1128
1129
1130
1131
1132
1133
1134
      *pnBuf = nBuf-1;
    }
  }
}

static int fts5PorterCb(
  void *pCtx, 
  int tflags,
  const char *pToken, 
  int nToken, 
  int iStart, 
  int iEnd

){
  PorterContext *p = (PorterContext*)pCtx;

  char *aBuf;
  int nBuf;

  if( nToken>FTS5_PORTER_MAX_TOKEN || nToken<3 ) goto pass_through;
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
  /* Step 5b. */
  if( nBuf>1 && aBuf[nBuf-1]=='l' 
   && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) 
  ){
    nBuf--;
  }

  return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd, iPos);

 pass_through:
  return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd, iPos);
}

/*
** Tokenize using the porter tokenizer.
*/
static int fts5PorterTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
  PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
  PorterContext sCtx;
  sCtx.xToken = xToken;
  sCtx.pCtx = pCtx;
  sCtx.aBuf = p->aBuf;
  return p->tokenizer.xTokenize(







|


|










|







1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
  /* Step 5b. */
  if( nBuf>1 && aBuf[nBuf-1]=='l' 
   && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) 
  ){
    nBuf--;
  }

  return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);

 pass_through:
  return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
}

/*
** Tokenize using the porter tokenizer.
*/
static int fts5PorterTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  int flags,
  const char *pText, int nText,
  int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
  PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
  PorterContext sCtx;
  sCtx.xToken = xToken;
  sCtx.pCtx = pCtx;
  sCtx.aBuf = p->aBuf;
  return p->tokenizer.xTokenize(