SQLite

Check-in [65f0262fb8]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Remove the iPos parameter from the tokenizer callback. Fix the "tokenchars" and "separators" options on the simple tokenizer.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: 65f0262fb82dbfd9f80233ac7c3108e2f2716c0a
User & Date: dan 2015-01-06 19:08:26.571
Context
2015-01-07
17:11
Add the 'rebuild' and 'delete-all' commands. (check-in: 0cb2fed525 user: dan tags: fts5)
2015-01-06
19:08
Remove the iPos parameter from the tokenizer callback. Fix the "tokenchars" and "separators" options on the simple tokenizer. (check-in: 65f0262fb8 user: dan tags: fts5)
14:38
Further fixes and test cases related to external content tables. (check-in: ce6a899baf user: dan tags: fts5)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts5/fts5.c.
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
    }else{
      i64 iDel = sqlite3_value_int64(apVal[0]);  /* Rowid to delete */
      rc = sqlite3Fts5StorageDelete(pTab->pStorage, iDel);
    }
  }else if( nArg>1 ){
    sqlite3_value *pCmd = apVal[2 + pConfig->nCol];
    if( SQLITE_NULL!=sqlite3_value_type(pCmd) ){
      const char *z = sqlite3_value_text(pCmd);
      if( pConfig->eContent!=FTS5_CONTENT_NORMAL 
       && 0==sqlite3_stricmp("delete", z) 
      ){
        return fts5SpecialDelete(pTab, apVal, pRowid);
      }else{
        return fts5SpecialInsert(pTab, pCmd, apVal[2 + pConfig->nCol + 1]);
      }







|







1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
    }else{
      i64 iDel = sqlite3_value_int64(apVal[0]);  /* Rowid to delete */
      rc = sqlite3Fts5StorageDelete(pTab->pStorage, iDel);
    }
  }else if( nArg>1 ){
    sqlite3_value *pCmd = apVal[2 + pConfig->nCol];
    if( SQLITE_NULL!=sqlite3_value_type(pCmd) ){
      const char *z = (const char*)sqlite3_value_text(pCmd);
      if( pConfig->eContent!=FTS5_CONTENT_NORMAL 
       && 0==sqlite3_stricmp("delete", z) 
      ){
        return fts5SpecialDelete(pTab, apVal, pRowid);
      }else{
        return fts5SpecialInsert(pTab, pCmd, apVal[2 + pConfig->nCol + 1]);
      }
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
  return sqlite3Fts5StorageRowCount(pTab->pStorage, pnRow);
}

static int fts5ApiTokenize(
  Fts5Context *pCtx, 
  const char *pText, int nText, 
  void *pUserData,
  int (*xToken)(void*, const char*, int, int, int, int)
){
  Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
  Fts5Table *pTab = (Fts5Table*)(pCsr->base.pVtab);
  return sqlite3Fts5Tokenize(pTab->pConfig, pText, nText, pUserData, xToken);
}

static int fts5ApiPhraseCount(Fts5Context *pCtx){







|







1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
  return sqlite3Fts5StorageRowCount(pTab->pStorage, pnRow);
}

/*
** Tokenize the nText byte buffer pText on behalf of an auxiliary function.
** The context handle pCtx is cast to an Fts5Cursor; the configuration of
** the virtual table that cursor belongs to is passed to
** sqlite3Fts5Tokenize() together with the caller's pUserData and xToken
** callback. The value returned by sqlite3Fts5Tokenize() is returned
** unchanged.
*/
static int fts5ApiTokenize(
  Fts5Context *pCtx, 
  const char *pText, int nText, 
  void *pUserData,
  int (*xToken)(void*, const char*, int, int, int)
){
  Fts5Table *pTab = (Fts5Table*)(((Fts5Cursor*)pCtx)->base.pVtab);
  Fts5Config *pConfig = pTab->pConfig;
  return sqlite3Fts5Tokenize(pConfig, pText, nText, pUserData, xToken);
}

static int fts5ApiPhraseCount(Fts5Context *pCtx){
Changes to ext/fts5/fts5.h.
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
  int (*xColumnCount)(Fts5Context*);
  int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow);
  int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);

  int (*xTokenize)(Fts5Context*, 
    const char *pText, int nText, /* Text to tokenize */
    void *pCtx,                   /* Context passed to xToken() */
    int (*xToken)(void*, const char*, int, int, int, int)    /* Callback */
  );

  int (*xPhraseCount)(Fts5Context*);
  int (*xPhraseSize)(Fts5Context*, int iPhrase);

  int (*xInstCount)(Fts5Context*, int *pnInst);
  int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);







|







191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
  int (*xColumnCount)(Fts5Context*);
  int (*xRowCount)(Fts5Context*, sqlite3_int64 *pnRow);
  int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);

  int (*xTokenize)(Fts5Context*, 
    const char *pText, int nText, /* Text to tokenize */
    void *pCtx,                   /* Context passed to xToken() */
    int (*xToken)(void*, const char*, int, int, int)       /* Callback */
  );

  int (*xPhraseCount)(Fts5Context*);
  int (*xPhraseSize)(Fts5Context*, int iPhrase);

  int (*xInstCount)(Fts5Context*, int *pnInst);
  int (*xInst)(Fts5Context*, int iIdx, int *piPhrase, int *piCol, int *piOff);
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
      void *pCtx,
      const char *pText, int nText, 
      int (*xToken)(
        void *pCtx,         /* Copy of 2nd argument to xTokenize() */
        const char *pToken, /* Pointer to buffer containing token */
        int nToken,         /* Size of token in bytes */
        int iStart,         /* Byte offset of token within input text */
        int iEnd,           /* Byte offset of end of token within input text */
        int iPos            /* Position of token in input (first token is 0) */
      )
  );
};

/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/







|
<







287
288
289
290
291
292
293
294

295
296
297
298
299
300
301
      void *pCtx,
      const char *pText, int nText, 
      int (*xToken)(
        void *pCtx,         /* Copy of 2nd argument to xTokenize() */
        const char *pToken, /* Pointer to buffer containing token */
        int nToken,         /* Size of token in bytes */
        int iStart,         /* Byte offset of token within input text */
        int iEnd            /* Byte offset of end of token within input text */

      )
  );
};

/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/
Changes to ext/fts5/fts5Int.h.
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig);

int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int, int)    /* Callback */
);

void sqlite3Fts5Dequote(char *z);

/* Load the contents of the %_config table */
int sqlite3Fts5ConfigLoad(Fts5Config*, int);








|







103
104
105
106
107
108
109
110
111
112
113
114
115
116
117

int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig);

int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int)    /* Callback */
);

void sqlite3Fts5Dequote(char *z);

/* Load the contents of the %_config table */
int sqlite3Fts5ConfigLoad(Fts5Config*, int);

Changes to ext/fts5/fts5_aux.c.
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
  int nInst;                      /* Total number of phrase instances */

  /* Output variables */
  int iStart;                     /* First token in coalesced phrase instance */
  int iEnd;                       /* Last token in coalesced phrase instance */
};

/*
** Return non-zero if the iterator is at EOF, or zero otherwise.
*/
static int fts5CInstIterEof(CInstIter *pIter){
  return (pIter->iStart < 0);
}

/*
** Advance the iterator to the next coalesced phrase instance. Return
** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
*/
static int fts5CInstIterNext(CInstIter *pIter){
  int rc = SQLITE_OK;
  pIter->iStart = -1;







<
<
<
<
<
<
<







42
43
44
45
46
47
48







49
50
51
52
53
54
55
  int nInst;                      /* Total number of phrase instances */

  /* Output variables */
  int iStart;                     /* First token in coalesced phrase instance */
  int iEnd;                       /* Last token in coalesced phrase instance */
};








/*
** Advance the iterator to the next coalesced phrase instance. Return
** an SQLite error code if an error occurs, or SQLITE_OK otherwise.
*/
static int fts5CInstIterNext(CInstIter *pIter){
  int rc = SQLITE_OK;
  pIter->iStart = -1;
113
114
115
116
117
118
119

120
121
122
123
124
125
126

/*************************************************************************
** Start of highlight() implementation.
*/
typedef struct HighlightContext HighlightContext;
struct HighlightContext {
  CInstIter iter;                 /* Coalesced Instance Iterator */

  int iRangeStart;                /* First token to include */
  int iRangeEnd;                  /* If non-zero, last token to include */
  const char *zOpen;              /* Opening highlight */
  const char *zClose;             /* Closing highlight */
  const char *zIn;                /* Input text */
  int nIn;                        /* Size of input text in bytes */
  int iOff;                       /* Current offset within zIn[] */







>







106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

/*************************************************************************
** Start of highlight() implementation.
*/
typedef struct HighlightContext HighlightContext;
struct HighlightContext {
  CInstIter iter;                 /* Coalesced Instance Iterator */
  int iPos;                       /* Current token offset in zIn[] */
  int iRangeStart;                /* First token to include */
  int iRangeEnd;                  /* If non-zero, last token to include */
  const char *zOpen;              /* Opening highlight */
  const char *zClose;             /* Closing highlight */
  const char *zIn;                /* Input text */
  int nIn;                        /* Size of input text in bytes */
  int iOff;                       /* Current offset within zIn[] */
152
153
154
155
156
157
158
159
160
161
162
163

164
165
166
167
168
169
170
** Tokenizer callback used by implementation of highlight() function.
*/
static int fts5HighlightCb(
  void *pContext,                 /* Pointer to HighlightContext object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStartOff,                  /* Start offset of token */
  int iEndOff,                    /* End offset of token */
  int iPos                        /* Position offset of token */
){
  HighlightContext *p = (HighlightContext*)pContext;
  int rc = SQLITE_OK;


  if( p->iRangeEnd>0 ){
    if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
    if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
  }

  if( iPos==p->iter.iStart ){







|
<



>







146
147
148
149
150
151
152
153

154
155
156
157
158
159
160
161
162
163
164
** Tokenizer callback used by implementation of highlight() function.
*/
static int fts5HighlightCb(
  void *pContext,                 /* Pointer to HighlightContext object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStartOff,                  /* Start offset of token */
  int iEndOff                     /* End offset of token */

){
  HighlightContext *p = (HighlightContext*)pContext;
  int rc = SQLITE_OK;
  int iPos = p->iPos++;

  if( p->iRangeEnd>0 ){
    if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
    if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
  }

  if( iPos==p->iter.iStart ){
Changes to ext/fts5/fts5_buffer.c.
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
}

int sqlite3Fts5Get32(const u8 *aBuf){
  return (aBuf[0] << 24) + (aBuf[1] << 16) + (aBuf[2] << 8) + aBuf[3];
}

void sqlite3Fts5BufferAppend32(int *pRc, Fts5Buffer *pBuf, int iVal){
  char *a;
  if( sqlite3Fts5BufferGrow(pRc, pBuf, 4) ) return;
  sqlite3Fts5Put32(&pBuf->p[pBuf->n], iVal);
  pBuf->n += 4;
}

/*
** Append buffer nData/pData to buffer pBuf. If an OOM error occurs, set 







<







54
55
56
57
58
59
60

61
62
63
64
65
66
67
}

/*
** Read four bytes starting at aBuf[0] and return them combined into a
** single integer, with aBuf[0] as the most significant byte and aBuf[3]
** as the least significant.
**
** Each byte is widened to unsigned int before it is shifted. Left as a
** narrow type (assuming u8 is an 8-bit unsigned type - confirm against its
** typedef), the operand is promoted to signed int, and "aBuf[0] << 24"
** with the top bit of aBuf[0] set would shift a 1 into the sign bit,
** which is undefined behaviour. The unsigned total is converted to int
** only at the end.
*/
int sqlite3Fts5Get32(const u8 *aBuf){
  unsigned int v = ((unsigned int)aBuf[0] << 24)
                 + ((unsigned int)aBuf[1] << 16)
                 + ((unsigned int)aBuf[2] << 8)
                 +  (unsigned int)aBuf[3];
  return (int)v;
}

/*
** Append the value iVal to buffer pBuf as four bytes written by
** sqlite3Fts5Put32(), then advance pBuf->n by 4. If
** sqlite3Fts5BufferGrow() returns non-zero nothing is appended.
** NOTE(review): presumably a non-zero return means an error was stored
** in *pRc - confirm against sqlite3Fts5BufferGrow().
*/
void sqlite3Fts5BufferAppend32(int *pRc, Fts5Buffer *pBuf, int iVal){
  if( 0==sqlite3Fts5BufferGrow(pRc, pBuf, 4) ){
    sqlite3Fts5Put32(&pBuf->p[pBuf->n], iVal);
    pBuf->n += 4;
  }
}

/*
** Append buffer nData/pData to buffer pBuf. If an OOM error occurs, set 
Changes to ext/fts5/fts5_config.c.
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
      rc = SQLITE_ERROR;
    }else{
      pConfig->zContentRowid = fts5EscapeName(&rc, zArg);
    }
    return rc;
  }

  *pzErr = sqlite3_mprintf("unrecognized directive: \"%s\"", zCmd);
  return SQLITE_ERROR;
}

/*
** Allocate an instance of the default tokenizer ("simple") at 
** Fts5Config.pTokenizer. Return SQLITE_OK if successful, or an SQLite error
** code if an error occurs.







|







360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
      rc = SQLITE_ERROR;
    }else{
      pConfig->zContentRowid = fts5EscapeName(&rc, zArg);
    }
    return rc;
  }

  *pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
  return SQLITE_ERROR;
}

/*
** Allocate an instance of the default tokenizer ("simple") at 
** Fts5Config.pTokenizer. Return SQLITE_OK if successful, or an SQLite error
** code if an error occurs.
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
** because the callback returned another non-zero value, it is assumed
** to be an SQLite error code and returned to the caller.
*/
int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int, int)    /* Callback */
){
  return pConfig->pTokApi->xTokenize(pConfig->pTok, pCtx, pText, nText, xToken);
}

/*
** Argument pIn points to the first character in what is expected to be
** a comma-separated list of SQL literals followed by a ')' character.







|







584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
** because the callback returned another non-zero value, it is assumed
** to be an SQLite error code and returned to the caller.
*/
int sqlite3Fts5Tokenize(
  Fts5Config *pConfig,            /* FTS5 Configuration object */
  const char *pText, int nText,   /* Text to tokenize */
  void *pCtx,                     /* Context passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int)    /* Callback */
){
  /* Hand the text to the tokenizer instance stored in the configuration.
  ** Note the argument order: the tokenizer method takes the callback
  ** context before the text, whereas this function takes it after. */
  Fts5Tokenizer *pTok = pConfig->pTok;
  int rc = pConfig->pTokApi->xTokenize(pTok, pCtx, pText, nText, xToken);
  return rc;
}

/*
** Argument pIn points to the first character in what is expected to be
** a comma-separated list of SQL literals followed by a ')' character.
Changes to ext/fts5/fts5_expr.c.
453
454
455
456
457
458
459

460
461
462

463
464
465
466
467
468
469
  memset(p, 0, sizeof(Fts5LookaheadReader));
  p->a = a;
  p->n = n;
  fts5LookaheadReaderNext(p);
  return fts5LookaheadReaderNext(p);
}


static int fts5LookaheadReaderEof(Fts5LookaheadReader *p){
  return (p->iPos==FTS5_LOOKAHEAD_EOF);
}


typedef struct Fts5NearTrimmer Fts5NearTrimmer;
struct Fts5NearTrimmer {
  Fts5LookaheadReader reader;     /* Input iterator */
  Fts5PoslistWriter writer;       /* Writer context */
  Fts5Buffer *pOut;               /* Output poslist */
};







>



>







453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
  memset(p, 0, sizeof(Fts5LookaheadReader));
  p->a = a;
  p->n = n;
  fts5LookaheadReaderNext(p);
  return fts5LookaheadReaderNext(p);
}

/*
** Return non-zero if the lookahead reader's current position is the
** FTS5_LOOKAHEAD_EOF marker, or zero otherwise.
**
** This function is excluded from the build by the "#if 0" below.
*/
#if 0
static int fts5LookaheadReaderEof(Fts5LookaheadReader *p){
  return (p->iPos==FTS5_LOOKAHEAD_EOF);
}
#endif

typedef struct Fts5NearTrimmer Fts5NearTrimmer;
struct Fts5NearTrimmer {
  Fts5LookaheadReader reader;     /* Input iterator */
  Fts5PoslistWriter writer;       /* Writer context */
  Fts5Buffer *pOut;               /* Output poslist */
};
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
** Callback for tokenizing terms used by ParseTerm().
*/
static int fts5ParseTokenize(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos                        /* Position offset of token */
){
  int rc = SQLITE_OK;
  const int SZALLOC = 8;
  TokenCtx *pCtx = (TokenCtx*)pContext;
  Fts5ExprPhrase *pPhrase = pCtx->pPhrase;
  Fts5ExprTerm *pTerm;








|
<







1139
1140
1141
1142
1143
1144
1145
1146

1147
1148
1149
1150
1151
1152
1153
** Callback for tokenizing terms used by ParseTerm().
*/
static int fts5ParseTokenize(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */

){
  int rc = SQLITE_OK;
  const int SZALLOC = 8;
  TokenCtx *pCtx = (TokenCtx*)pContext;
  Fts5ExprPhrase *pPhrase = pCtx->pPhrase;
  Fts5ExprTerm *pTerm;

Changes to ext/fts5/fts5_index.c.
838
839
840
841
842
843
844

845
846
847
848
849
850

851
852
853
854
855
856
857
}

/*
** Close the sqlite3_blob handle used to read records from the %_data table.
** And discard any cached reads. This function is called at the end of
** a read transaction or when any sub-transaction is rolled back.
*/

static void fts5DataReset(Fts5Index *p){
  if( p->pReader ){
    sqlite3_blob_close(p->pReader);
    p->pReader = 0;
  }
}


/*
** Remove all records associated with segment iSegid in index iIdx.
*/
static void fts5DataRemoveSegment(Fts5Index *p, int iIdx, int iSegid){
  i64 iFirst = FTS5_SEGMENT_ROWID(iIdx, iSegid, 0, 0);
  i64 iLast = FTS5_SEGMENT_ROWID(iIdx, iSegid+1, 0, 0)-1;







>






>







838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
}

/*
** Close the sqlite3_blob handle used to read records from the %_data table.
** And discard any cached reads. This function is called at the end of
** a read transaction or when any sub-transaction is rolled back.
*/
#if 0
static void fts5DataReset(Fts5Index *p){
  if( p->pReader ){
    sqlite3_blob_close(p->pReader);
    p->pReader = 0;
  }
}
#endif

/*
** Remove all records associated with segment iSegid in index iIdx.
*/
static void fts5DataRemoveSegment(Fts5Index *p, int iIdx, int iSegid){
  i64 iFirst = FTS5_SEGMENT_ROWID(iIdx, iSegid, 0, 0);
  i64 iLast = FTS5_SEGMENT_ROWID(iIdx, iSegid+1, 0, 0)-1;
Changes to ext/fts5/fts5_storage.c.
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
** Tokenization callback used when inserting tokens into the FTS index.
*/
static int fts5StorageInsertCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos                        /* Position offset of token */
){
  Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
  Fts5Index *pIdx = pCtx->pStorage->pIndex;
  pCtx->szCol = iPos+1;
  return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, iPos, pToken, nToken);
}

/*
** If a row with rowid iDel is present in the %_content table, add the
** delete-markers to the FTS index necessary to delete it. Do not actually
** remove the %_content row at this time though.







|
<



|







278
279
280
281
282
283
284
285

286
287
288
289
290
291
292
293
294
295
296
** Tokenization callback used when inserting tokens into the FTS index.
*/
static int fts5StorageInsertCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token (unused) */
  int iEnd                        /* End offset of token (unused) */
){
  Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
  int iPos;

  /* The tokenizer no longer supplies a token position. Instead, szCol
  ** counts the tokens seen so far; its value before the increment is
  ** the position passed on for this token. */
  iPos = pCtx->szCol;
  pCtx->szCol = iPos + 1;

  return sqlite3Fts5IndexWrite(
      pCtx->pStorage->pIndex, pCtx->iCol, iPos, pToken, nToken
  );
}

/*
** If a row with rowid iDel is present in the %_content table, add the
** delete-markers to the FTS index necessary to delete it. Do not actually
** remove the %_content row at this time though.
308
309
310
311
312
313
314

315
316
317
318
319
320
321
    if( sqlite3_step(pSeek)==SQLITE_ROW ){
      int iCol;
      Fts5InsertCtx ctx;
      ctx.pStorage = p;
      ctx.iCol = -1;
      rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iDel);
      for(iCol=1; rc==SQLITE_OK && iCol<=pConfig->nCol; iCol++){

        rc = sqlite3Fts5Tokenize(pConfig, 
            (const char*)sqlite3_column_text(pSeek, iCol),
            sqlite3_column_bytes(pSeek, iCol),
            (void*)&ctx,
            fts5StorageInsertCallback
        );
        p->aTotalSize[iCol-1] -= (i64)ctx.szCol;







>







307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
    if( sqlite3_step(pSeek)==SQLITE_ROW ){
      int iCol;
      Fts5InsertCtx ctx;
      ctx.pStorage = p;
      ctx.iCol = -1;
      rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iDel);
      for(iCol=1; rc==SQLITE_OK && iCol<=pConfig->nCol; iCol++){
        ctx.szCol = 0;
        rc = sqlite3Fts5Tokenize(pConfig, 
            (const char*)sqlite3_column_text(pSeek, iCol),
            sqlite3_column_bytes(pSeek, iCol),
            (void*)&ctx,
            fts5StorageInsertCallback
        );
        p->aTotalSize[iCol-1] -= (i64)ctx.szCol;
470
471
472
473
474
475
476

477
478
479
480
481
482
483
    int iCol;
    Fts5InsertCtx ctx;
    ctx.pStorage = p;
    ctx.iCol = -1;

    rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iDel);
    for(iCol=0; rc==SQLITE_OK && iCol<pConfig->nCol; iCol++){

      rc = sqlite3Fts5Tokenize(pConfig, 
        (const char*)sqlite3_value_text(apVal[iCol]),
        sqlite3_value_bytes(apVal[iCol]),
        (void*)&ctx,
        fts5StorageInsertCallback
      );
      p->aTotalSize[iCol] -= (i64)ctx.szCol;







>







470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
    int iCol;
    Fts5InsertCtx ctx;
    ctx.pStorage = p;
    ctx.iCol = -1;

    rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iDel);
    for(iCol=0; rc==SQLITE_OK && iCol<pConfig->nCol; iCol++){
      ctx.szCol = 0;
      rc = sqlite3Fts5Tokenize(pConfig, 
        (const char*)sqlite3_value_text(apVal[iCol]),
        sqlite3_value_bytes(apVal[iCol]),
        (void*)&ctx,
        fts5StorageInsertCallback
      );
      p->aTotalSize[iCol] -= (i64)ctx.szCol;
647
648
649
650
651
652
653
654
655
656
657

658
659
660
661
662
663
664
665
666
667
668
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd,                       /* End offset of token */
  int iPos                        /* Position offset of token */
){
  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;

  pCtx->cksum ^= sqlite3Fts5IndexCksum(
      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, iPos, pToken, nToken
  );
  pCtx->szCol = iPos+1;
  return SQLITE_OK;
}

/*
** Check that the contents of the FTS index match that of the %_content
** table. Return SQLITE_OK if they do, or SQLITE_CORRUPT if not. Return
** some other SQLite error code if an error occurs while attempting to







|
<


>



<







648
649
650
651
652
653
654
655

656
657
658
659
660
661

662
663
664
665
666
667
668
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */

){
  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
  int iPos = pCtx->szCol++;
  pCtx->cksum ^= sqlite3Fts5IndexCksum(
      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, iPos, pToken, nToken
  );

  return SQLITE_OK;
}

/*
** Check that the contents of the FTS index match that of the %_content
** table. Return SQLITE_OK if they do, or SQLITE_CORRUPT if not. Return
** some other SQLite error code if an error occurs while attempting to
691
692
693
694
695
696
697

698
699
700
701
702
703
704
    while( SQLITE_ROW==sqlite3_step(pScan) ){
      int i;
      ctx.iRowid = sqlite3_column_int64(pScan, 0);
      ctx.szCol = 0;
      rc = sqlite3Fts5StorageDocsize(p, ctx.iRowid, aColSize);
      for(i=0; rc==SQLITE_OK && i<pConfig->nCol; i++){
        ctx.iCol = i;

        rc = sqlite3Fts5Tokenize(
            pConfig, 
            (const char*)sqlite3_column_text(pScan, i+1),
            sqlite3_column_bytes(pScan, i+1),
            (void*)&ctx,
            fts5StorageIntegrityCallback
        );







>







691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
    while( SQLITE_ROW==sqlite3_step(pScan) ){
      int i;
      ctx.iRowid = sqlite3_column_int64(pScan, 0);
      ctx.szCol = 0;
      rc = sqlite3Fts5StorageDocsize(p, ctx.iRowid, aColSize);
      for(i=0; rc==SQLITE_OK && i<pConfig->nCol; i++){
        ctx.iCol = i;
        ctx.szCol = 0;
        rc = sqlite3Fts5Tokenize(
            pConfig, 
            (const char*)sqlite3_column_text(pScan, i+1),
            sqlite3_column_bytes(pScan, i+1),
            (void*)&ctx,
            fts5StorageIntegrityCallback
        );
Changes to ext/fts5/fts5_tcl.c.
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
struct F5tAuxData {
  Tcl_Obj *pObj;
};

static int xTokenizeCb(
  void *pCtx, 
  const char *zToken, int nToken, 
  int iStart, int iEnd, int iPos
){
  F5tFunction *p = (F5tFunction*)pCtx;
  Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript);
  int rc;

  Tcl_IncrRefCount(pEval);
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewStringObj(zToken, nToken));
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewIntObj(iStart));
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewIntObj(iEnd));
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewIntObj(iPos));

  rc = Tcl_EvalObjEx(p->interp, pEval, 0);
  Tcl_DecrRefCount(pEval);

  return rc;
}








|









<







108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124

125
126
127
128
129
130
131
struct F5tAuxData {
  Tcl_Obj *pObj;
};

static int xTokenizeCb(
  void *pCtx, 
  const char *zToken, int nToken, 
  int iStart, int iEnd
){
  F5tFunction *p = (F5tFunction*)pCtx;
  Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript);
  int rc;

  Tcl_IncrRefCount(pEval);
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewStringObj(zToken, nToken));
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewIntObj(iStart));
  Tcl_ListObjAppendElement(p->interp, pEval, Tcl_NewIntObj(iEnd));


  rc = Tcl_EvalObjEx(p->interp, pEval, 0);
  Tcl_DecrRefCount(pEval);

  return rc;
}

524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
  int bSubst;
  const char *zInput;
};

static int xTokenizeCb2(
  void *pCtx, 
  const char *zToken, int nToken, 
  int iStart, int iEnd, int iPos
){
  F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
  if( p->bSubst ){
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
    Tcl_ListObjAppendElement(
        0, p->pRet, Tcl_NewStringObj(&p->zInput[iStart], iEnd-iStart)
    );
  }else{
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iStart));
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iEnd));
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iPos));
  }
  return SQLITE_OK;
}


/*
**      sqlite3_fts5_tokenize DB TOKENIZER TEXT







|



<








<







523
524
525
526
527
528
529
530
531
532
533

534
535
536
537
538
539
540
541

542
543
544
545
546
547
548
  int bSubst;
  const char *zInput;
};

static int xTokenizeCb2(
  void *pCtx, 
  const char *zToken, int nToken, 
  int iStart, int iEnd
){
  F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
  if( p->bSubst ){

    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
    Tcl_ListObjAppendElement(
        0, p->pRet, Tcl_NewStringObj(&p->zInput[iStart], iEnd-iStart)
    );
  }else{
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewStringObj(zToken, nToken));
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iStart));
    Tcl_ListObjAppendElement(0, p->pRet, Tcl_NewIntObj(iEnd));

  }
  return SQLITE_OK;
}


/*
**      sqlite3_fts5_tokenize DB TOKENIZER TEXT
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
typedef struct F5tTokenizerModule F5tTokenizerInstance;

struct F5tTokenizerContext {
  void *pCtx;
  int (*xToken)(void*, const char*, int, int, int, int);
};

struct F5tTokenizerModule {
  Tcl_Interp *interp;
  Tcl_Obj *pScript;
  F5tTokenizerContext *pContext;
};







|







630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
/* A tokenizer instance uses the same structure as a tokenizer module. */
typedef struct F5tTokenizerModule F5tTokenizerInstance;

/*
** The token callback, and its context pointer, that the
** sqlite3_fts5_token Tcl command forwards tokens to. That command reports
** an error when xToken is 0.
*/
struct F5tTokenizerContext {
  void *pCtx;                     /* First argument passed to xToken() */
  int (*xToken)(void*, const char*, int, int, int);   /* Token callback */
};

/*
** A tokenizer implemented as a Tcl script.
*/
struct F5tTokenizerModule {
  Tcl_Interp *interp;             /* Tcl interpreter */
  Tcl_Obj *pScript;               /* Script duplicated by the tokenize method */
  F5tTokenizerContext *pContext;  /* Holds the current xToken and its context */
};
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
  ckfree(pInst);
}

static int f5tTokenizerTokenize(
  Fts5Tokenizer *p, 
  void *pCtx,
  const char *pText, int nText, 
  int (*xToken)(void*, const char*, int, int, int, int)
){
  F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
  void *pOldCtx;
  int (*xOldToken)(void*, const char*, int, int, int, int);
  Tcl_Obj *pEval;
  int rc;

  pOldCtx = pInst->pContext->pCtx;
  xOldToken = pInst->pContext->xToken;

  pEval = Tcl_DuplicateObj(pInst->pScript);







|



|







686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
  ckfree(pInst);
}

static int f5tTokenizerTokenize(
  Fts5Tokenizer *p, 
  void *pCtx,
  const char *pText, int nText, 
  int (*xToken)(void*, const char*, int, int, int)
){
  F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
  void *pOldCtx;
  int (*xOldToken)(void*, const char*, int, int, int);
  Tcl_Obj *pEval;
  int rc;

  pOldCtx = pInst->pContext->pCtx;
  xOldToken = pInst->pContext->xToken;

  pEval = Tcl_DuplicateObj(pInst->pScript);
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  F5tTokenizerContext *p = (F5tTokenizerContext*)clientData;
  int iStart;
  int iEnd;
  int iPos;
  int nToken;
  char *zToken;
  int rc;

  assert( p );
  if( objc!=5 ){
    Tcl_WrongNumArgs(interp, 1, objv, "TEXT START END POS");
    return TCL_ERROR;
  }
  if( p->xToken==0 ){
    Tcl_AppendResult(interp, 
        "sqlite3_fts5_token may only be used by tokenizer callback", 0
    );
    return TCL_ERROR;
  }

  zToken = Tcl_GetStringFromObj(objv[1], &nToken);
  if( Tcl_GetIntFromObj(interp, objv[2], &iStart) 
   || Tcl_GetIntFromObj(interp, objv[3], &iEnd) 
   || Tcl_GetIntFromObj(interp, objv[4], &iPos) 
  ){
    return TCL_ERROR;
  }

  rc = p->xToken(p->pCtx, zToken, nToken, iStart, iEnd, iPos);
  Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
  return TCL_OK;
}

static void f5tDelTokenizer(void *pCtx){
  F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
  Tcl_DecrRefCount(pMod->pScript);







<





|
|












<




|







726
727
728
729
730
731
732

733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751

752
753
754
755
756
757
758
759
760
761
762
763
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  F5tTokenizerContext *p = (F5tTokenizerContext*)clientData;
  int iStart;
  int iEnd;

  int nToken;
  char *zToken;
  int rc;

  assert( p );
  if( objc!=4 ){
    Tcl_WrongNumArgs(interp, 1, objv, "TEXT START END");
    return TCL_ERROR;
  }
  if( p->xToken==0 ){
    Tcl_AppendResult(interp, 
        "sqlite3_fts5_token may only be used by tokenizer callback", 0
    );
    return TCL_ERROR;
  }

  zToken = Tcl_GetStringFromObj(objv[1], &nToken);
  if( Tcl_GetIntFromObj(interp, objv[2], &iStart) 
   || Tcl_GetIntFromObj(interp, objv[3], &iEnd) 

  ){
    return TCL_ERROR;
  }

  rc = p->xToken(p->pCtx, zToken, nToken, iStart, iEnd);
  Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
  return TCL_OK;
}

static void f5tDelTokenizer(void *pCtx){
  F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
  Tcl_DecrRefCount(pMod->pScript);
Changes to ext/fts5/fts5_tokenize.c.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54































































55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74

75
76
77
78
79
80
81
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
*/

#include "fts5.h"
#include <string.h>
#include <assert.h>

/**************************************************************************
** Start of unicode61 tokenizer implementation.
*/

/*
** Create a "simple" tokenizer.
*/
static int fts5SimpleCreate(
  void *pCtx, 
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  *ppOut = 0;
  return SQLITE_OK;
}

/*
** Delete a "simple" tokenizer.
*/
static void fts5SimpleDelete(Fts5Tokenizer *p){
  return;
}

/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters. 
*/
static unsigned char aSimpleTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};

































































static void simpleFold(char *aOut, const char *aIn, int nByte){
  int i;
  for(i=0; i<nByte; i++){
    char c = aIn[i];
    if( c>='A' && c<='Z' ) c += 32;
    aOut[i] = c;
  }
}

/*
** Tokenize some text using the simple tokenizer.
*/
static int fts5SimpleTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){

  int rc = SQLITE_OK;
  int ie;
  int is = 0;
  int iPos = 0;

  char aFold[64];
  int nFold = sizeof(aFold);
  char *pFold = aFold;


  while( is<nText && rc==SQLITE_OK ){
    int nByte;

    /* Skip any leading divider characters. */
    while( is<nText && ((pText[is]&0x80) || aSimpleTokenChar[pText[is]]==0 ) ){
      is++;
    }
    if( is==nText ) break;

    /* Count the token characters */
    ie = is+1;
    while( ie<nText && ((pText[ie]&0x80)==0 && aSimpleTokenChar[pText[ie]] ) ){
      ie++;
    }

    /* Fold to lower case */
    nByte = ie-is;
    if( nByte>nFold ){
      if( pFold!=aFold ) sqlite3_free(pFold);
      pFold = sqlite3_malloc(nByte*2);
      if( pFold==0 ){
        rc = SQLITE_NOMEM;
        break;
      }
      nFold = nByte*2;
    }
    simpleFold(pFold, &pText[is], nByte);

    /* Invoke the token callback */
    rc = xToken(pCtx, pFold, nByte, is, ie, iPos);
    iPos++;
    is = ie+1;
  }
  
  if( pFold!=aFold ) sqlite3_free(pFold);
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}







|

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


















|

>



<




>





|






|

















|
<







12
13
14
15
16
17
18
19
20



















21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

159
160
161
162
163
164
165
*/

#include "fts5.h"
#include <string.h>
#include <assert.h>

/**************************************************************************
** Start of simple tokenizer implementation.
*/




















/*
** For tokenizers with no "unicode" modifier, the set of token characters
** is the same as the set of ASCII range alphanumeric characters. 
**
** The table is indexed by ASCII code point (0x00..0x7F). An entry is 1
** for the digits '0'..'9' and the letters 'A'..'Z' and 'a'..'z', and 0
** for every other code point. fts5SimpleCreate() copies this table into
** each new tokenizer as its default classification.
*/
static unsigned char aSimpleTokenChar[128] = {
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00..0x0F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10..0x1F */
  0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20..0x2F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 0, 0, 0, 0, 0, 0,   /* 0x30..0x3F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40..0x4F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x50..0x5F */
  0, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60..0x6F */
  1, 1, 1, 1, 1, 1, 1, 1,   1, 1, 1, 0, 0, 0, 0, 0,   /* 0x70..0x7F */
};

/*
** A "simple" tokenizer instance. aTokenChar[] is indexed by ASCII code
** point; a non-zero entry means the byte is part of a token, zero means
** it is a separator.
*/
typedef struct SimpleTokenizer SimpleTokenizer;
struct SimpleTokenizer {
  unsigned char aTokenChar[128];
};

/*
** Reclassify every ASCII character found in the nul-terminated string
** zArg. Each such character has its aTokenChar[] entry overwritten with
** bTokenChars (cast to unsigned char). Bytes with the high bit set are
** ignored.
*/
static void fts5SimpleAddExceptions(
  SimpleTokenizer *p, 
  const char *zArg, 
  int bTokenChars
){
  const unsigned char eClass = (unsigned char)bTokenChars;
  const char *z;
  for(z=zArg; *z; z++){
    if( *z & 0x80 ) continue;       /* Not an ASCII byte: leave table alone */
    p->aTokenChar[(int)*z] = eClass;
  }
}

/*
** Create a "simple" tokenizer.
**
** The azArg[] array holds nArg strings interpreted as (name, value) pairs.
** Option names are compared case-insensitively. Recognized names are:
**
**   "tokenchars"  - each ASCII character in the value becomes a token
**                   character.
**   "separators"  - each ASCII character in the value becomes a separator.
**
** Pairs are applied in order on top of the default aSimpleTokenChar[]
** classification. The pCtx argument is not used.
**
** On success SQLITE_OK is returned and *ppOut is set to the new tokenizer.
** SQLITE_ERROR is returned if nArg is odd or an option name is not
** recognized, and SQLITE_NOMEM if the allocation fails. On any failure
** *ppOut is set to NULL and nothing remains allocated.
*/
static int fts5SimpleCreate(
  void *pCtx, 
  const char **azArg, int nArg,
  Fts5Tokenizer **ppOut
){
  int rc = SQLITE_OK;
  SimpleTokenizer *p = 0;
  if( nArg%2 ){
    rc = SQLITE_ERROR;
  }else{
    p = sqlite3_malloc(sizeof(SimpleTokenizer));
    if( p==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;
      memset(p, 0, sizeof(SimpleTokenizer));
      memcpy(p->aTokenChar, aSimpleTokenChar, sizeof(aSimpleTokenChar));
      for(i=0; rc==SQLITE_OK && i<nArg; i+=2){
        const char *zArg = azArg[i+1];
        if( 0==sqlite3_stricmp(azArg[i], "tokenchars") ){
          fts5SimpleAddExceptions(p, zArg, 1);
        }else
        if( 0==sqlite3_stricmp(azArg[i], "separators") ){
          fts5SimpleAddExceptions(p, zArg, 0);
        }else{
          rc = SQLITE_ERROR;
        }
      }

      /* An unrecognized option leaves a fully allocated object behind.
      ** Release it here so that an error code is never accompanied by a
      ** live tokenizer handle. */
      if( rc!=SQLITE_OK ){
        sqlite3_free(p);
        p = 0;
      }
    }
  }

  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

/*
** Release a tokenizer object obtained from fts5SimpleCreate().
*/
static void fts5SimpleDelete(Fts5Tokenizer *p){
  SimpleTokenizer *pSimple = (SimpleTokenizer*)p;
  sqlite3_free(pSimple);
}


/*
** Copy nByte bytes from aIn to aOut, converting each ASCII upper-case
** letter 'A'..'Z' to lower-case by adding 32. All other bytes are copied
** unchanged. Nothing is written if nByte is zero or negative.
*/
static void simpleFold(char *aOut, const char *aIn, int nByte){
  int iByte = 0;
  while( iByte<nByte ){
    char ch = aIn[iByte];
    if( ch>='A' && ch<='Z' ){
      ch = (char)(ch + 32);
    }
    aOut[iByte] = ch;
    iByte++;
  }
}

/*
** Tokenize some text using the simple tokenizer.
**
** The nText bytes at pText are scanned from left to right. A token is a
** maximal run of bytes that are ASCII (high bit clear) and flagged in the
** tokenizer's aTokenChar[] table. Every other byte is a separator. Each
** token is folded to lower-case with simpleFold() and handed to xToken()
** together with its start offset and the offset one past its last byte.
**
** Scanning stops as soon as xToken() returns anything other than
** SQLITE_OK. SQLITE_DONE from the callback is reported to the caller as
** SQLITE_OK; any other value is returned unchanged. SQLITE_NOMEM is
** returned if the fold buffer cannot be enlarged.
*/
static int fts5SimpleTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  SimpleTokenizer *pSimple = (SimpleTokenizer*)pTokenizer;
  const unsigned char *aIsToken = pSimple->aTokenChar;
  char aStatic[64];               /* Fold buffer used for short tokens */
  char *zFold = aStatic;          /* Current fold buffer */
  int nAlloc = sizeof(aStatic);   /* Size of zFold in bytes */
  int iTokStart = 0;              /* Offset of first byte of current token */
  int rc = SQLITE_OK;

  while( rc==SQLITE_OK && iTokStart<nText ){
    int iTokEnd;                  /* Offset one past last byte of token */
    int nTok;                     /* Size of token in bytes */

    /* Advance past any separators to the first byte of the next token. */
    for(; iTokStart<nText; iTokStart++){
      char c = pText[iTokStart];
      if( (c & 0x80)==0 && aIsToken[(int)c] ) break;
    }
    if( iTokStart==nText ) break;

    /* Find the end of the token. */
    for(iTokEnd=iTokStart+1; iTokEnd<nText; iTokEnd++){
      char c = pText[iTokEnd];
      if( (c & 0x80) || aIsToken[(int)c]==0 ) break;
    }
    nTok = iTokEnd - iTokStart;

    /* Make sure the fold buffer is large enough. It is grown to twice the
    ** size of the token so that slightly larger tokens do not force
    ** another allocation. */
    if( nTok>nAlloc ){
      if( zFold!=aStatic ) sqlite3_free(zFold);
      zFold = sqlite3_malloc(nTok*2);
      if( zFold==0 ){
        rc = SQLITE_NOMEM;
        break;
      }
      nAlloc = nTok*2;
    }

    /* Fold to lower case and invoke the token callback. */
    simpleFold(zFold, &pText[iTokStart], nTok);
    rc = xToken(pCtx, zFold, nTok, iTokStart, iTokEnd);

    /* The byte at iTokEnd (if any) is a separator, so step over it. */
    iTokStart = iTokEnd+1;
  }

  if( zFold!=aStatic ) sqlite3_free(zFold);
  return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
}
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
/*
** Tokenize some text using a unicode61 tokenizer.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  const unsigned char *zInput = (const unsigned char*)pText;
  const unsigned char *zTerm = &zInput[nText];
  const unsigned char *z = zInput;
  int rc = SQLITE_OK;
  int nBuf = 0;
  unsigned char *zBuf = 0;
  unsigned char *zOut = 0;
  int iPos = 0;

  while( rc==SQLITE_OK && z<zTerm ){
    int iCode;
    int bAlnum;
    const unsigned char *zStart;
    const unsigned char *zCode;








|









<







368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384

385
386
387
388
389
390
391
/*
** Tokenize some text using a unicode61 tokenizer.
*/
static int fts5UnicodeTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
  const unsigned char *zInput = (const unsigned char*)pText;
  const unsigned char *zTerm = &zInput[nText];
  const unsigned char *z = zInput;
  int rc = SQLITE_OK;
  int nBuf = 0;
  unsigned char *zBuf = 0;
  unsigned char *zOut = 0;


  while( rc==SQLITE_OK && z<zTerm ){
    int iCode;
    int bAlnum;
    const unsigned char *zStart;
    const unsigned char *zCode;

374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
      /* Write the new character to it */
      iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
      if( iOut ) WRITE_UTF8(zOut, iOut);
    }

    if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){
      int ie = (bAlnum ? z : zCode) - zInput;
      rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie, iPos);
      zOut = zBuf;
      iPos++;
    }
  }

 tokenize_finished:
  sqlite3_free(zBuf);
  return rc;
}

/**************************************************************************
** Start of porter2 stemmer implementation.
*/

/* Any tokens larger than this (in bytes) are passed through without
** stemming. */
#define FTS5_PORTER_MAX_TOKEN 64

typedef struct PorterTokenizer PorterTokenizer;







|

<









|







417
418
419
420
421
422
423
424
425

426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
      /* Write the new character to it */
      iOut = sqlite3Fts5UnicodeFold(iCode, p->bRemoveDiacritic);
      if( iOut ) WRITE_UTF8(zOut, iOut);
    }

    if( zOut>zBuf && (bAlnum==0 || z>=zTerm) ){
      int ie = (bAlnum ? z : zCode) - zInput;
      rc = xToken(pCtx, (const char*)zBuf, zOut-zBuf, zStart-zInput, ie);
      zOut = zBuf;

    }
  }

 tokenize_finished:
  sqlite3_free(zBuf);
  return rc;
}

/**************************************************************************
** Start of porter stemmer implementation.
*/

/* Any tokens larger than this (in bytes) are passed through without
** stemming. */
#define FTS5_PORTER_MAX_TOKEN 64

typedef struct PorterTokenizer PorterTokenizer;
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
  *ppOut = (Fts5Tokenizer*)pRet;
  return rc;
}

typedef struct PorterContext PorterContext;
struct PorterContext {
  void *pCtx;
  int (*xToken)(void*, const char*, int, int, int, int);
  char *aBuf;
};

typedef struct PorterRule PorterRule;
struct PorterRule {
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
};

static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;


  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }







|
















<







490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513

514
515
516
517
518
519
520
  *ppOut = (Fts5Tokenizer*)pRet;
  return rc;
}

/*
** Context object populated by fts5PorterTokenize(). fts5PorterCb() casts
** its pCtx argument to a pointer to this structure.
*/
typedef struct PorterContext PorterContext;
struct PorterContext {
  /* Context pointer passed as the first argument to xToken() */
  void *pCtx;
  /* Token callback supplied by the caller of fts5PorterTokenize() */
  int (*xToken)(void*, const char*, int, int, int);
  /* Copied from PorterTokenizer.aBuf; presumably scratch space used while
  ** stemming -- confirm against fts5PorterCb() */
  char *aBuf;
};

/*
** A single suffix rule. fts5PorterApply() walks an array of these, ending
** at the first entry with zSuffix==0, and stops at the first rule whose
** zSuffix matches the tail of the buffer.
*/
typedef struct PorterRule PorterRule;
struct PorterRule {
  /* Suffix to look for at the end of the buffer; NULL terminates an array */
  const char *zSuffix;
  /* Length of zSuffix in bytes (asserted equal to strlen(zSuffix)) */
  int nSuffix;
  /* Optional test on the stem; may be 0. NOTE(review): how the result is
  ** interpreted is not visible here -- see fts5PorterApply() */
  int (*xCond)(char *zStem, int nStem);
  /* Presumably the text that replaces zSuffix when the rule is applied --
  ** confirm against fts5PorterApply() */
  const char *zOutput;
  /* Length of zOutput in bytes (asserted equal to strlen(zOutput)) */
  int nOutput;
};

static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;


  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
    assert( strlen(p->zOutput)==p->nOutput );
    if( nBuf<p->nSuffix ) continue;
    if( 0==memcmp(&aBuf[nBuf - p->nSuffix], p->zSuffix, p->nSuffix) ) break;
  }
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
}

static int fts5PorterCb(
  void *pCtx, 
  const char *pToken, 
  int nToken, 
  int iStart, 
  int iEnd, 
  int iPos
){
  PorterContext *p = (PorterContext*)pCtx;

  PorterRule aStep1A[] = {
    { "sses", 4,  0, "ss", 2 },
    { "ies",  3,  0, "i",  1  },
    { "ss",   2,  0, "ss", 2 },







|
<







614
615
616
617
618
619
620
621

622
623
624
625
626
627
628
}

static int fts5PorterCb(
  void *pCtx, 
  const char *pToken, 
  int nToken, 
  int iStart, 
  int iEnd

){
  PorterContext *p = (PorterContext*)pCtx;

  PorterRule aStep1A[] = {
    { "sses", 4,  0, "ss", 2 },
    { "ies",  3,  0, "i",  1  },
    { "ss",   2,  0, "ss", 2 },
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
  /* Step 5b. */
  if( nBuf>1 && aBuf[nBuf-1]=='l' 
   && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) 
  ){
    nBuf--;
  }

  return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd, iPos);

 pass_through:
  return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd, iPos);
}

/*
** Tokenize using the porter tokenizer.
*/
static int fts5PorterTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd, int iPos)
){
  PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
  PorterContext sCtx;
  sCtx.xToken = xToken;
  sCtx.pCtx = pCtx;
  sCtx.aBuf = p->aBuf;
  return p->tokenizer.xTokenize(







|


|









|







752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
  /* Step 5b. */
  if( nBuf>1 && aBuf[nBuf-1]=='l' 
   && aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1) 
  ){
    nBuf--;
  }

  return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd);

 pass_through:
  return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd);
}

/*
** Tokenize using the porter tokenizer.
*/
static int fts5PorterTokenize(
  Fts5Tokenizer *pTokenizer,
  void *pCtx,
  const char *pText, int nText,
  int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
){
  PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
  PorterContext sCtx;
  sCtx.xToken = xToken;
  sCtx.pCtx = pCtx;
  sCtx.aBuf = p->aBuf;
  return p->tokenizer.xTokenize(
Name change from test/fts5_common.tcl to ext/fts5/test/fts5_common.tcl.
1
2
3
4
5
6
7
8
9
10
11
12


13

14
15
16
17
18
19
20
# 2014 Dec 19
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#







proc fts5_test_poslist {cmd} {
  set res [list]
  for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
    lappend res [string map {{ } .} [$cmd xInst $i]]
  }












>
>
|
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 2014 Dec 19
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl


proc fts5_test_poslist {cmd} {
  set res [list]
  for {set i 0} {$i < [$cmd xInstCount]} {incr i} {
    lappend res [string map {{ } .} [$cmd xInst $i]]
  }
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
  set res [list]
  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    lappend res [$cmd xColumnTotalSize $i]
  }
  set res
}

proc test_append_token {varname token iStart iEnd iPos} {
  upvar $varname var
  lappend var $token
}
proc fts5_test_tokenize {cmd} {
  set res [list]
  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    set tokens [list]







|







44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
  set res [list]
  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    lappend res [$cmd xColumnTotalSize $i]
  }
  set res
}

# Append $token to the list held in the caller's variable named $varname.
# The iStart and iEnd arguments are accepted but not used.
proc test_append_token {varname token iStart iEnd} {
  upvar $varname var
  lappend var $token
}
proc fts5_test_tokenize {cmd} {
  set res [list]
  for {set i 0} {$i < [$cmd xColumnCount]} {incr i} {
    set tokens [list]
Changes to ext/fts5/test/fts5aa.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5aa

# If SQLITE_ENABLE_FTS3 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







8
9
10
11
12
13
14

15


16
17
18
19
20
21
22
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5aa

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ab.test.
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ab

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







9
10
11
12
13
14
15

16


17
18
19
20
21
22
23
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ab

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ac.test.
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ac

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







9
10
11
12
13
14
15

16


17
18
19
20
21
22
23
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ac

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ad.test.
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ad

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







9
10
11
12
13
14
15

16


17
18
19
20
21
22
23
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ad

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ae.test.
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ae

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







9
10
11
12
13
14
15

16


17
18
19
20
21
22
23
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ae

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5af.test.
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
# 
# More specifically, the tests in this file focus on the built-in 
# snippet() function.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5af

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







11
12
13
14
15
16
17

18


19
20
21
22
23
24
25
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
# 
# More specifically, the tests in this file focus on the built-in 
# snippet() function.
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5af

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ag.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ag

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







8
9
10
11
12
13
14

15


16
17
18
19
20
21
22
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ag

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ah.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ah

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







8
9
10
11
12
13
14

15


16
17
18
19
20
21
22
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ah

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ai.test.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
# Specifically, it tests transactions and savepoints
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ai

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







10
11
12
13
14
15
16

17


18
19
20
21
22
23
24
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
# Specifically, it tests transactions and savepoints
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ai

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5aj.test.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# focus of this script is testing the FTS5 module.
#
# Specifically, this tests that, provided the amount of data remains 
# constant, the FTS index does not grow indefinitely as rows are inserted 
# and deleted,
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5aj

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







12
13
14
15
16
17
18

19


20
21
22
23
24
25
26
# focus of this script is testing the FTS5 module.
#
# Specifically, this tests that, provided the amount of data remains 
# constant, the FTS index does not grow indefinitely as rows are inserted 
# and deleted,
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5aj

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5ak.test.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
# Specifically, the auxiliary function "highlight".
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ak

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







10
11
12
13
14
15
16

17


18
19
20
21
22
23
24
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
# Specifically, the auxiliary function "highlight".
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ak

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5al.test.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
# Specifically, this function tests the %_config table.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5al

# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}







<
|
<
<







10
11
12
13
14
15
16

17


18
19
20
21
22
23
24
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#
# Specifically, this function tests the %_config table.
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5al

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5auxdata.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 xSetAuxdata() and xGetAuxdata() APIs.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5auxdata

do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE f1 USING fts5(a, b);
  INSERT INTO f1(rowid, a, b) VALUES(1, 'a', 'b1');
  INSERT INTO f1(rowid, a, b) VALUES(2, 'a', 'b2');
  INSERT INTO f1(rowid, a, b) VALUES(3, 'a', 'b3');







<
|
<
<







8
9
10
11
12
13
14

15


16
17
18
19
20
21
22
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 xSetAuxdata() and xGetAuxdata() APIs.
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5auxdata

do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE f1 USING fts5(a, b);
  INSERT INTO f1(rowid, a, b) VALUES(1, 'a', 'b1');
  INSERT INTO f1(rowid, a, b) VALUES(2, 'a', 'b2');
  INSERT INTO f1(rowid, a, b) VALUES(3, 'a', 'b3');
Changes to ext/fts5/test/fts5content.test.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5content

#-------------------------------------------------------------------------
# Contentless tables
#
do_execsql_test 1.1 {
  CREATE VIRTUAL TABLE f1 USING fts5(a, b, content='');













<
|
<
<







1
2
3
4
5
6
7
8
9
10
11
12
13

14


15
16
17
18
19
20
21
# 2014 Dec 20
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5content

#-------------------------------------------------------------------------
# Contentless tables
#
do_execsql_test 1.1 {
  CREATE VIRTUAL TABLE f1 USING fts5(a, b, content='');
Changes to ext/fts5/test/fts5ea.test.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# 2014 June 17
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5ea

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}












<
|
<
<







1
2
3
4
5
6
7
8
9
10
11
12

13


14
15
16
17
18
19
20
# 2014 June 17
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5ea

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}
Changes to ext/fts5/test/fts5fault1.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
source $testdir/malloc_common.tcl
set testprefix fts5fault1

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return







<
|
<
<







8
9
10
11
12
13
14

15


16
17
18
19
20
21
22
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#


source [file join [file dirname [info script]] fts5_common.tcl]


source $testdir/malloc_common.tcl
set testprefix fts5fault1

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
Changes to ext/fts5/test/fts5porter.test.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#***********************************************************************
#
# Tests focusing on the fts5 porter stemmer implementation.
#
#   http://tartarus.org/martin/PorterStemmer/
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5porter

set test_vocab {
  a               a               aaron           aaron          
  abaissiez       abaissiez       abandon         abandon        
  abandoned       abandon         abase           abas           
  abash           abash           abate           abat           







<
|
<
<







10
11
12
13
14
15
16

17


18
19
20
21
22
23
24
#***********************************************************************
#
# Tests focusing on the fts5 porter stemmer implementation.
#
#   http://tartarus.org/martin/PorterStemmer/
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5porter

set test_vocab {
  a               a               aaron           aaron          
  abaissiez       abaissiez       abandon         abandon        
  abandoned       abandon         abase           abas           
  abash           abash           abate           abat           
Changes to ext/fts5/test/fts5tokenizer.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 tokenizers
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5tokenizer



do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  DROP TABLE ft1;
}
do_execsql_test 1.1 {







<
|
<
<

<







8
9
10
11
12
13
14

15


16

17
18
19
20
21
22
23
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 tokenizers
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5tokenizer



do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE ft1 USING fts5(x, tokenize=porter);
  DROP TABLE ft1;
}
do_execsql_test 1.1 {
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79





















80
81
82
}
# Single-term and two-term MATCH queries against ft1 (created and populated
# earlier in the file). Each query is expected to return only rowid 1.
do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
do_execsql_test 2.3 { 
  SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding' 
} 1


# Tokenizer constructor stub used by the tests below. It stores the argument
# list it was called with in the global variable targs and then always fails
# with the error message "failed".
proc tcl_create {args} {
  global targs
  set targs $args
  error "failed"
}
# Register tcl_create as the constructor of a tokenizer named "tcl".
sqlite3_fts5_create_tokenizer db tcl tcl_create

# Each directive names the "tcl" tokenizer using a different argument syntax.
# CREATE VIRTUAL TABLE must fail with the constructor error, and ::targs must
# then hold the argument list given in the third column.
foreach {tn directive expected} {
  1 {tokenize='tcl a b c'}             {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}       {g h i}
  4 {tokenize = tcl}                   {}
} {
  do_catchsql_test 3.$tn.1 "
    CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  " {1 {error in tokenizer constructor}}
  do_test 3.$tn.2 { set ::targs } $expected
}


# Malformed fts5 arguments must be rejected with a parse error whose message
# quotes the offending argument text.
do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} {1 {parse error in "tokenize = tcl abc"}}
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {parse error in "x y"}}






















finish_test








<


















<






>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



43
44
45
46
47
48
49

50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67

68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
}
# Single-term and two-term MATCH queries against ft1 (created and populated
# earlier in the file). Each query is expected to return only rowid 1.
do_execsql_test 2.1 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'embedding' } 1
do_execsql_test 2.2 { SELECT rowid FROM ft1 WHERE ft1 MATCH 'database' } 1
do_execsql_test 2.3 { 
  SELECT rowid FROM ft1 WHERE ft1 MATCH 'database embedding' 
} 1


# Tokenizer constructor stub used by the tests below. It stores the argument
# list it was called with in the global variable targs and then always fails
# with the error message "failed".
proc tcl_create {args} {
  global targs
  set targs $args
  error "failed"
}
# Register tcl_create as the constructor of a tokenizer named "tcl".
sqlite3_fts5_create_tokenizer db tcl tcl_create

# Each directive names the "tcl" tokenizer using a different argument syntax.
# CREATE VIRTUAL TABLE must fail with the constructor error, and ::targs must
# then hold the argument list given in the third column.
foreach {tn directive expected} {
  1 {tokenize='tcl a b c'}             {a b c}
  2 {tokenize='tcl ''d'' ''e'' ''f'''} {d e f}
  3 {tokenize="tcl 'g' 'h' 'i'"}       {g h i}
  4 {tokenize = tcl}                   {}
} {
  do_catchsql_test 3.$tn.1 "
    CREATE VIRTUAL TABLE ft2 USING fts5(x, $directive)
  " {1 {error in tokenizer constructor}}
  do_test 3.$tn.2 { set ::targs } $expected
}


# Malformed fts5 arguments must be rejected with a parse error whose message
# quotes the offending argument text.
do_catchsql_test 4.1 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x, tokenize = tcl abc);
} {1 {parse error in "tokenize = tcl abc"}}
do_catchsql_test 4.2 {
  CREATE VIRTUAL TABLE ft2 USING fts5(x y)
} {1 {parse error in "x y"}}

#-------------------------------------------------------------------------
# Test the "separators" and "tokenchars" options a bit.
#
# Both the simple and unicode61 tokenizers are configured with ',', '.' and
# ':' as token characters and 'x', 'y' and 'z' as separators. One row is
# inserted and then queried with a series of quoted phrases.
#
foreach {tn tokenizer} {1 simple 2 unicode61} {
  reset_db
  set T "$tokenizer tokenchars ',.:' separators 'xyz'"
  execsql "CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = \"$T\")"
  do_execsql_test 5.$tn.1 {
    INSERT INTO t1 VALUES('abcxdefyghizjkl.mno,pqr:stu/vwx+yz');
  }
  # $res is the expected list of matching rowids. The pieces delimited by
  # x/y/z (abc, def, ghi) and the piece "vw" are expected to match. The run
  # joined by the extra token characters matches only as a whole
  # (jkl.mno,pqr:stu), not by its parts.
  foreach {tn2 token res} {
    1 abc 1     2 def 1     3 ghi 1    4 jkl {}
    5 mno {}    6 pqr {}    7 stu {}   8 jkl.mno,pqr:stu 1
    9 vw  1
  } {
    do_execsql_test 5.$tn.2.$tn2 "
      SELECT rowid FROM t1 WHERE t1 MATCH '\"$token\"'
    " $res
  }
}

finish_test

Changes to ext/fts5/test/fts5unicode.test.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 tokenizers
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5unicode

# Run test case $tn: tokenize $input with $tokenizer and check that the list
# made of the first value of every group of four values returned by
# sqlite3_fts5_tokenize equals $output.
#
# The test script is built with [subst -nocommands] so that $tokenizer and
# $input are expanded here, while the bracketed commands are left for do_test
# to evaluate later. [set z] is used instead of $z so that the loop variable
# is not substituted when the script is built.
proc tokenize_test {tn tokenizer input output} {
  uplevel [list do_test $tn [subst -nocommands {
    set ret {}
    foreach {z s e p} [sqlite3_fts5_tokenize db {$tokenizer} {$input}] {
      lappend ret [set z]
    }
    set ret
  }] [list {*}$output]]
}

foreach {tn t} {1 simple 2 unicode61} {







<
|
<
<





|







8
9
10
11
12
13
14

15


16
17
18
19
20
21
22
23
24
25
26
27
28
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Tests focusing on the fts5 tokenizers
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5unicode

# Run test case $tn: tokenize $input with $tokenizer and check that the list
# made of the first value of every group of three values returned by
# sqlite3_fts5_tokenize equals $output.
#
# The test script is built with [subst -nocommands] so that $tokenizer and
# $input are expanded here, while the bracketed commands are left for do_test
# to evaluate later. [set z] is used instead of $z so that the loop variable
# is not substituted when the script is built.
proc tokenize_test {tn tokenizer input output} {
  uplevel [list do_test $tn [subst -nocommands {
    set ret {}
    foreach {z s e} [sqlite3_fts5_tokenize db {$tokenizer} {$input}] {
      lappend ret [set z]
    }
    set ret
  }] [list {*}$output]]
}

foreach {tn t} {1 simple 2 unicode61} {
Changes to ext/fts5/test/fts5unicode2.test.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#
# This is a modified copy of FTS4 test file "fts4_unicode.test".
#

if {![info exists testdir]} {
  set testdir [file join [file dirname [info script]] .. .. .. test]
}
source $testdir/tester.tcl
set testprefix fts5unicode2

# Run test case $tn in the caller's scope: tokenize $input with the unicode61
# tokenizer configured with "remove_diacritics 0" (using the -subst form of
# sqlite3_fts5_tokenize) and compare the result with the list $res.
proc do_unicode_token_test {tn input res} {
  set script [list \
    sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input]
  set expected [list {*}$res]
  uplevel [list do_test $tn $script $expected]
}







<
|
<
<







10
11
12
13
14
15
16

17


18
19
20
21
22
23
24
#*************************************************************************
#
# The tests in this file focus on testing the "unicode" FTS tokenizer.
#
# This is a modified copy of FTS4 test file "fts4_unicode.test".
#


source [file join [file dirname [info script]] fts5_common.tcl]


set testprefix fts5unicode2

# Run test case $tn in the caller's scope: tokenize $input with the unicode61
# tokenizer configured with "remove_diacritics 0" (using the -subst form of
# sqlite3_fts5_tokenize) and compare the result with the list $res.
proc do_unicode_token_test {tn input res} {
  set script [list \
    sqlite3_fts5_tokenize -subst db "unicode61 remove_diacritics 0" $input]
  set expected [list {*}$res]
  uplevel [list do_test $tn $script $expected]
}
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
  set input [lindex $args end-1]
  set res [lindex $args end]
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db $tokenizer $input
  ] [list {*}$res]]
}

# Case-folding tests. Each expected result is a flat list with three entries
# per token: a counter starting at 0, the token as reported by the tokenizer,
# and the corresponding text as it appears in the input.
do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "0 \uE4 \uC4 1 \uF6 \uD6 2 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "0 x\uE4x x\uC4x 1 x\uF6x x\uD6x 2 x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "0 \uDF \u1E9E"

# Tokens are split on ASCII spaces (1.5) and on the non-alphanumeric
# codepoints 0x00BF, 0x224E and 0x2263 (1.6) alike.
do_unicode_token_test 1.5 "The quick brown fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  0 the The 1 quick quick 2 brown brown 3 fox fox
}

# The do_unicode_token_test2 variants expect diacritics to be stripped from
# the reported tokens.
do_unicode_token_test2 1.7  {a B c D} {0 a a 1 b B 2 c c 3 d D}
do_unicode_token_test2 1.8  "\uC4 \uD6 \uDC" "0 a \uC4 1 o \uD6 2 u \uDC"

do_unicode_token_test2 1.9  "x\uC4x x\uD6x x\uDCx" \
    "0 xax x\uC4x 1 xox x\uD6x 2 xux x\uDCx"

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "0 xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "0 \u01c6 \u01c5"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {







|


|


|


|
|


|


|


|
|


|



|


|







34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
  set input [lindex $args end-1]
  set res [lindex $args end]
  uplevel [list do_test $tn [list \
    sqlite3_fts5_tokenize -subst db $tokenizer $input
  ] [list {*}$res]]
}

# Case-folding tests. Each expected result is a flat list with two entries
# per token: the token as reported by the tokenizer, followed by the
# corresponding text as it appears in the input.
do_unicode_token_test 1.0 {a B c D} {a a b B c c d D}

do_unicode_token_test 1.1 "\uC4 \uD6 \uDC" \
    "\uE4 \uC4 \uF6 \uD6 \uFC \uDC"

do_unicode_token_test 1.2 "x\uC4x x\uD6x x\uDCx" \
    "x\uE4x x\uC4x x\uF6x x\uD6x x\uFCx x\uDCx"

# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
do_unicode_token_test 1.3 "\uDF" "\uDF \uDF"
do_unicode_token_test 1.4 "\u1E9E" "\uDF \u1E9E"

# Tokens are split on ASCII spaces (1.5) and on the non-alphanumeric
# codepoints 0x00BF, 0x224E and 0x2263 (1.6) alike.
do_unicode_token_test 1.5 "The quick brown fox" {
  the The quick quick brown brown fox fox
}
do_unicode_token_test 1.6 "The\u00bfquick\u224ebrown\u2263fox" {
  the The quick quick brown brown fox fox
}

# The do_unicode_token_test2 variants expect diacritics to be stripped from
# the reported tokens.
do_unicode_token_test2 1.7  {a B c D} {a a b B c c d D}
do_unicode_token_test2 1.8  "\uC4 \uD6 \uDC" "a \uC4 o \uD6 u \uDC"

do_unicode_token_test2 1.9  "x\uC4x x\uD6x x\uDCx" \
    "xax x\uC4x xox x\uD6x xux x\uDCx"

# Check that diacritics are removed if remove_diacritics=1 is specified.
# And that they do not break tokens.
do_unicode_token_test2 1.10 "xx\u0301xx" "xxxx xx\u301xx"

# Title-case mappings work
do_unicode_token_test 1.11 "\u01c5" "\u01c6 \u01c5"

#-------------------------------------------------------------------------
#
set docs [list {
  Enhance the INSERT syntax to allow multiple rows to be inserted via the
  VALUES clause.
} {
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350

#-------------------------------------------------------------------------

# NOTE(review): "breakpoint" looks like a leftover debugging hook. Confirm
# that it is a no-op in the test harness, or remove it.
breakpoint
# 5.1: with an empty tokenchars list, '_' still splits tokens.
do_unicode_token_test3 5.1 {tokenchars {}} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3 sqlite3 
  1 reset reset 
  2 sqlite3 sqlite3 
  3 column column 
  4 int int
}

# 5.2: with '_' listed in tokenchars the identifiers stay whole.
do_unicode_token_test3 5.2 {tokenchars _} {
  sqlite3_reset sqlite3_column_int
} {
  0 sqlite3_reset sqlite3_reset 
  1 sqlite3_column_int sqlite3_column_int
}

# 5.3: x, y and z configured as separators split the input into four tokens.
do_unicode_token_test3 5.3 {separators xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotian Laotian
  1 horse horse
  2 runs runs
  3 fast fast
}

# 5.4: listing x, y and z as tokenchars leaves the input as a single token.
do_unicode_token_test3 5.4 {tokenchars xyz} {
  Laotianxhorseyrunszfast
} {
  0 laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

# 5.5: tokenchars and separators may be specified together.
do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  0 sqlite3_reset sqlite3_reset 
  1 sqlite3_column_int sqlite3_column_int
  2 honda_phantom honda_phantom
}

# 5.6: a non-ASCII codepoint may be used as a separator.
do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
  0 abc abc 1 def def
}

# 5.7: non-ASCII tokenchars and separators combined; '.' still splits tokens.
do_unicode_token_test3 5.7                             \
  "tokenchars \u2444\u2445"                            \
  "separators \u05D0\u05D1\u05D2"                      \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list                                                \
    0 \u2444fre\u2445sh \u2444fre\u2445sh              \
    1 water water                                      \
    2 fish fish                                        \
    3 \u2445timer \u2445timer                          \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars. In 5.8 and 5.9 the combining mark
# 0x0301 is expected to be stripped from the reported tokens whichever
# option names it.
do_unicode_token_test3 5.8 "separators \u0301" \
  "hello\u0301world \u0301helloworld"          \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars \u0301" \
  "hello\u0301world \u0301helloworld"          \
  "0 helloworld hello\u0301world 1 helloworld helloworld"

# Same inputs with "remove_diacritics 0": the mark inside the first word is
# now expected to be kept in the reported token. The second expected token is
# still plain "helloworld", so the leading mark is not made part of it.
do_unicode_token_test3 5.10 "separators \u0301" \
  "remove_diacritics 0"                        \
  "hello\u0301world \u0301helloworld"          \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars \u0301" \
  "remove_diacritics 0"                         \
  "hello\u0301world \u0301helloworld"           \
  "0 hello\u0301world hello\u0301world 1 helloworld helloworld"


#-------------------------------------------------------------------------

# Tokenize $txt with $tokenizer using the -subst form of sqlite3_fts5_tokenize
# and return a list holding the middle value of each group of three values
# in its output.
proc do_tokenize {tokenizer txt} {
  set tokens {}
  foreach {pos token text} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
    lappend tokens $token
  }
  return $tokens
}

# Argument $lCodepoint must be a list of codepoints (integers) that 
# correspond to whitespace characters. This command creates a string







|
|
|
|
|





|
|





|
|
|
|





|





|
|
|



|







|
|
|
|






|



|




|




|
<





|







256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

334
335
336
337
338
339
340
341
342
343
344
345
346

#-------------------------------------------------------------------------

# NOTE(review): "breakpoint" looks like a leftover debugging hook. Confirm
# that it is a no-op in the test harness, or remove it.
breakpoint
# 5.1: with an empty tokenchars list, '_' still splits tokens.
do_unicode_token_test3 5.1 {tokenchars {}} {
  sqlite3_reset sqlite3_column_int
} {
  sqlite3 sqlite3 
  reset reset 
  sqlite3 sqlite3 
  column column 
  int int
}

# 5.2: with '_' listed in tokenchars the identifiers stay whole.
do_unicode_token_test3 5.2 {tokenchars _} {
  sqlite3_reset sqlite3_column_int
} {
  sqlite3_reset sqlite3_reset 
  sqlite3_column_int sqlite3_column_int
}

# 5.3: x, y and z configured as separators split the input into four tokens.
do_unicode_token_test3 5.3 {separators xyz} {
  Laotianxhorseyrunszfast
} {
  laotian Laotian
  horse horse
  runs runs
  fast fast
}

# 5.4: listing x, y and z as tokenchars leaves the input as a single token.
do_unicode_token_test3 5.4 {tokenchars xyz} {
  Laotianxhorseyrunszfast
} {
  laotianxhorseyrunszfast Laotianxhorseyrunszfast
}

# 5.5: tokenchars and separators may be specified together.
do_unicode_token_test3 5.5 {tokenchars _} {separators zyx} {
  sqlite3_resetxsqlite3_column_intyhonda_phantom
} {
  sqlite3_reset sqlite3_reset 
  sqlite3_column_int sqlite3_column_int
  honda_phantom honda_phantom
}

# 5.6: a non-ASCII codepoint may be used as a separator.
do_unicode_token_test3 5.6 "separators \u05D1" "abc\u05D1def" {
  abc abc def def
}

# 5.7: non-ASCII tokenchars and separators combined; '.' still splits tokens.
do_unicode_token_test3 5.7                             \
  "tokenchars \u2444\u2445"                            \
  "separators \u05D0\u05D1\u05D2"                      \
  "\u2444fre\u2445sh\u05D0water\u05D2fish.\u2445timer" \
  [list                                                \
    \u2444fre\u2445sh \u2444fre\u2445sh              \
    water water                                      \
    fish fish                                        \
    \u2445timer \u2445timer                          \
  ]

# Check that it is not possible to add a standalone diacritic codepoint
# to either separators or tokenchars. In 5.8 and 5.9 the combining mark
# 0x0301 is expected to be stripped from the reported tokens whichever
# option names it.
do_unicode_token_test3 5.8 "separators \u0301" \
  "hello\u0301world \u0301helloworld"          \
  "helloworld hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.9 "tokenchars \u0301" \
  "hello\u0301world \u0301helloworld"          \
  "helloworld hello\u0301world helloworld helloworld"

# Same inputs with "remove_diacritics 0": the mark inside the first word is
# now expected to be kept in the reported token. The second expected token is
# still plain "helloworld", so the leading mark is not made part of it.
do_unicode_token_test3 5.10 "separators \u0301" \
  "remove_diacritics 0"                        \
  "hello\u0301world \u0301helloworld"          \
  "hello\u0301world hello\u0301world helloworld helloworld"

do_unicode_token_test3 5.11 "tokenchars \u0301" \
  "remove_diacritics 0"                         \
  "hello\u0301world \u0301helloworld"           \
  "hello\u0301world hello\u0301world helloworld helloworld"


#-------------------------------------------------------------------------

# Tokenize $txt with $tokenizer using the -subst form of sqlite3_fts5_tokenize
# and return a list holding the first value of each pair of values in its
# output.
proc do_tokenize {tokenizer txt} {
  set tokens {}
  foreach {token text} [sqlite3_fts5_tokenize -subst db $tokenizer $txt] {
    lappend tokens $token
  }
  return $tokens
}

# Argument $lCodepoint must be a list of codepoints (integers) that 
# correspond to whitespace characters. This command creates a string
386
387
388
389
390
391
392

393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409

  do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
  do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T   {8287 12288}
}


#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
# Five codepoints between 0xE000 and 0xF8FF are tried in turn.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  # Case 1: no tokenizer options; the codepoint is expected to stay inside a
  # single token. Case 2: the codepoint is listed as a separator and is
  # expected to split "hello" from "world".
  foreach {tn2 config res} {
    1 ""             "0 hello*world hello*world"
    2 "separators *" "0 hello hello 1 world world"
  } {
    # Replace every '*' placeholder with the codepoint under test.
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}







>








|
|







382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406

  do_isspace_test 6.$T.19 $T   {32 160 5760 6158}
  do_isspace_test 6.$T.20 $T   {8192 8193 8194 8195}
  do_isspace_test 6.$T.21 $T   {8196 8197 8198 8199}
  do_isspace_test 6.$T.22 $T   {8200 8201 8202 8239}
  do_isspace_test 6.$T.23 $T   {8287 12288}
}


#-------------------------------------------------------------------------
# Test that the private use ranges are treated as alphanumeric.
#
# Five codepoints between 0xE000 and 0xF8FF are tried in turn.
#
foreach {tn1 c} {
  1 \ue000 2 \ue001 3 \uf000 4 \uf8fe 5 \uf8ff
} {
  # Case 1: no tokenizer options; the codepoint is expected to stay inside a
  # single token. Case 2: the codepoint is listed as a separator and is
  # expected to split "hello" from "world".
  foreach {tn2 config res} {
    1 ""             "hello*world hello*world"
    2 "separators *" "hello hello world world"
  } {
    # Replace every '*' placeholder with the codepoint under test.
    set config [string map [list * $c] $config]
    set input  [string map [list * $c] "hello*world"]
    set output [string map [list * $c] $res]
    do_unicode_token_test3 7.$tn1.$tn2 {*}$config $input $output
  }
}
Changes to test/tester.tcl.
1917
1918
1919
1920
1921
1922
1923
1924
# few test cases that deliberately corrupt database files should rescind 
# this setting by invoking "database_can_be_corrupt"
#
database_never_corrupt

source $testdir/thread_common.tcl
source $testdir/malloc_common.tcl
source $testdir/fts5_common.tcl







<
1917
1918
1919
1920
1921
1922
1923

# few test cases that deliberately corrupt database files should rescind 
# this setting by invoking "database_can_be_corrupt"
#
database_never_corrupt

source $testdir/thread_common.tcl
source $testdir/malloc_common.tcl