SQLite

Check-in [f8e9c445dd]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the xLanguageid method to sqlite3_fts3_tokenizer versions 1 and greater.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts4-languageid
Files: files | file ages | folders
SHA1: f8e9c445dd358c40e5a7bf3756b9f291909dbea7
User & Date: dan 2012-03-03 18:46:41.456
Context
2012-03-05
15:33
Merge the fts4-languageid branch with the trunk. (check-in: 99a9073b5e user: dan tags: trunk)
2012-03-03
18:46
Add the xLanguageid method to sqlite3_fts3_tokenizer versions 1 and greater. (Closed-Leaf check-in: f8e9c445dd user: dan tags: fts4-languageid)
2012-03-02
19:53
Fix problems with combining content= and languageid= in a single fts4 table. (check-in: 22491e7bc3 user: dan tags: fts4-languageid)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts3/fts3.c.
2954
2955
2956
2957
2958
2959
2960



2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
    int iCol = idxNum-FTS3_FULLTEXT_SEARCH;
    const char *zQuery = (const char *)sqlite3_value_text(apVal[0]);

    if( zQuery==0 && sqlite3_value_type(apVal[0])!=SQLITE_NULL ){
      return SQLITE_NOMEM;
    }




    rc = sqlite3Fts3ExprParse(p->pTokenizer, p->azColumn, p->bHasStat, 
        p->nColumn, iCol, zQuery, -1, &pCsr->pExpr
    );
    if( rc!=SQLITE_OK ){
      if( rc==SQLITE_ERROR ){
        static const char *zErr = "malformed MATCH expression: [%s]";
        p->base.zErrMsg = sqlite3_mprintf(zErr, zQuery);
      }
      return rc;
    }

    pCsr->iLangid = 0;
    if( nVal==2 ) pCsr->iLangid = sqlite3_value_int(apVal[1]);

    rc = sqlite3Fts3ReadLock(p);
    if( rc!=SQLITE_OK ) return rc;

    rc = fts3EvalStart(pCsr);

    sqlite3Fts3SegmentsClose(p);
    if( rc!=SQLITE_OK ) return rc;







>
>
>
|
|









<
<
<







2954
2955
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974



2975
2976
2977
2978
2979
2980
2981
    int iCol = idxNum-FTS3_FULLTEXT_SEARCH;
    const char *zQuery = (const char *)sqlite3_value_text(apVal[0]);

    if( zQuery==0 && sqlite3_value_type(apVal[0])!=SQLITE_NULL ){
      return SQLITE_NOMEM;
    }

    pCsr->iLangid = 0;
    if( nVal==2 ) pCsr->iLangid = sqlite3_value_int(apVal[1]);

    rc = sqlite3Fts3ExprParse(p->pTokenizer, pCsr->iLangid,
        p->azColumn, p->bHasStat, p->nColumn, iCol, zQuery, -1, &pCsr->pExpr
    );
    if( rc!=SQLITE_OK ){
      if( rc==SQLITE_ERROR ){
        static const char *zErr = "malformed MATCH expression: [%s]";
        p->base.zErrMsg = sqlite3_mprintf(zErr, zQuery);
      }
      return rc;
    }




    rc = sqlite3Fts3ReadLock(p);
    if( rc!=SQLITE_OK ) return rc;

    rc = fts3EvalStart(pCsr);

    sqlite3Fts3SegmentsClose(p);
    if( rc!=SQLITE_OK ) return rc;
Changes to ext/fts3/fts3Int.h.
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508




509
510
511
512
513
514
515
void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*);
void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *,
  const char *, const char *, int, int
);
void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *);

/* fts3_expr.c */
int sqlite3Fts3ExprParse(sqlite3_tokenizer *, 
  char **, int, int, int, const char *, int, Fts3Expr **
);
void sqlite3Fts3ExprFree(Fts3Expr *);
#ifdef SQLITE_TEST
int sqlite3Fts3ExprInitTestInterface(sqlite3 *db);
int sqlite3Fts3InitTerm(sqlite3 *db);
#endif





/* fts3_aux.c */
int sqlite3Fts3InitAux(sqlite3 *db);

void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *);

int sqlite3Fts3MsrIncrStart(







|







>
>
>
>







494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*);
void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *,
  const char *, const char *, int, int
);
void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *);

/* fts3_expr.c */
int sqlite3Fts3ExprParse(sqlite3_tokenizer *, int,
  char **, int, int, int, const char *, int, Fts3Expr **
);
void sqlite3Fts3ExprFree(Fts3Expr *);
#ifdef SQLITE_TEST
int sqlite3Fts3ExprInitTestInterface(sqlite3 *db);
int sqlite3Fts3InitTerm(sqlite3 *db);
#endif

int sqlite3Fts3OpenTokenizer(sqlite3_tokenizer *, int, const char *, int,
  sqlite3_tokenizer_cursor **
);

/* fts3_aux.c */
int sqlite3Fts3InitAux(sqlite3 *db);

void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *);

int sqlite3Fts3MsrIncrStart(
Changes to ext/fts3/fts3_expr.c.
88
89
90
91
92
93
94

95
96
97
98
99
100
101
**   FTSQUERY_PHRASE with a unary "-" attached to it. i.e. "mysql" in the
**   FTS3 query "sqlite -mysql". Otherwise, ParseContext.isNot is set to
**   zero.
*/
typedef struct ParseContext ParseContext;
struct ParseContext {
  sqlite3_tokenizer *pTokenizer;      /* Tokenizer module */

  const char **azCol;                 /* Array of column names for fts3 table */
  int bFts4;                          /* True to allow FTS4-only syntax */
  int nCol;                           /* Number of entries in azCol[] */
  int iDefaultCol;                    /* Default column to query */
  int isNot;                          /* True if getNextNode() sees a unary - */
  sqlite3_context *pCtx;              /* Write error message here */
  int nNest;                          /* Number of nested brackets */







>







88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
**   FTSQUERY_PHRASE with a unary "-" attached to it. i.e. "mysql" in the
**   FTS3 query "sqlite -mysql". Otherwise, ParseContext.isNot is set to
**   zero.
*/
typedef struct ParseContext ParseContext;
struct ParseContext {
  sqlite3_tokenizer *pTokenizer;      /* Tokenizer module */
  int iLangid;                        /* Language id used with tokenizer */
  const char **azCol;                 /* Array of column names for fts3 table */
  int bFts4;                          /* True to allow FTS4-only syntax */
  int nCol;                           /* Number of entries in azCol[] */
  int iDefaultCol;                    /* Default column to query */
  int isNot;                          /* True if getNextNode() sees a unary - */
  sqlite3_context *pCtx;              /* Write error message here */
  int nNest;                          /* Number of nested brackets */
123
124
125
126
127
128
129



























130
131
132
133
134
135
136
*/
static void *fts3MallocZero(int nByte){
  void *pRet = sqlite3_malloc(nByte);
  if( pRet ) memset(pRet, 0, nByte);
  return pRet;
}





























/*
** Extract the next token from buffer z (length n) using the tokenizer
** and other information (column names etc.) in pParse. Create an Fts3Expr
** structure of type FTSQUERY_PHRASE containing a phrase consisting of this
** single token and set *ppExpr to point to it. If the end of the buffer is
** reached before a token is found, set *ppExpr to zero. It is the







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
*/
static void *fts3MallocZero(int nByte){
  void *pRet = sqlite3_malloc(nByte);
  if( pRet ) memset(pRet, 0, nByte);
  return pRet;
}

int sqlite3Fts3OpenTokenizer(
  sqlite3_tokenizer *pTokenizer,
  int iLangid,
  const char *z,
  int n,
  sqlite3_tokenizer_cursor **ppCsr
){
  sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCsr = 0;
  int rc;

  rc = pModule->xOpen(pTokenizer, z, n, &pCsr);
  assert( rc==SQLITE_OK || pCsr==0 );
  if( rc==SQLITE_OK ){
    pCsr->pTokenizer = pTokenizer;
    if( pModule->iVersion>=1 ){
      rc = pModule->xLanguageid(pCsr, iLangid);
      if( rc!=SQLITE_OK ){
        pModule->xClose(pCsr);
        pCsr = 0;
      }
    }
  }
  *ppCsr = pCsr;
  return rc;
}


/*
** Extract the next token from buffer z (length n) using the tokenizer
** and other information (column names etc.) in pParse. Create an Fts3Expr
** structure of type FTSQUERY_PHRASE containing a phrase consisting of this
** single token and set *ppExpr to point to it. If the end of the buffer is
** reached before a token is found, set *ppExpr to zero. It is the
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
  sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
  sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
  int rc;
  sqlite3_tokenizer_cursor *pCursor;
  Fts3Expr *pRet = 0;
  int nConsumed = 0;

  rc = pModule->xOpen(pTokenizer, z, n, &pCursor);
  if( rc==SQLITE_OK ){
    const char *zToken;
    int nToken, iStart, iEnd, iPosition;
    int nByte;                               /* total space to allocate */

    pCursor->pTokenizer = pTokenizer;
    rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition);

    if( rc==SQLITE_OK ){
      nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken;
      pRet = (Fts3Expr *)fts3MallocZero(nByte);
      if( !pRet ){
        rc = SQLITE_NOMEM;
      }else{
        pRet->eType = FTSQUERY_PHRASE;







|





<

<







178
179
180
181
182
183
184
185
186
187
188
189
190

191

192
193
194
195
196
197
198
  sqlite3_tokenizer *pTokenizer = pParse->pTokenizer;
  sqlite3_tokenizer_module const *pModule = pTokenizer->pModule;
  int rc;
  sqlite3_tokenizer_cursor *pCursor;
  Fts3Expr *pRet = 0;
  int nConsumed = 0;

  rc = sqlite3Fts3OpenTokenizer(pTokenizer, pParse->iLangid, z, n, &pCursor);
  if( rc==SQLITE_OK ){
    const char *zToken;
    int nToken, iStart, iEnd, iPosition;
    int nByte;                               /* total space to allocate */


    rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition);

    if( rc==SQLITE_OK ){
      nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken;
      pRet = (Fts3Expr *)fts3MallocZero(nByte);
      if( !pRet ){
        rc = SQLITE_NOMEM;
      }else{
        pRet->eType = FTSQUERY_PHRASE;
264
265
266
267
268
269
270

271
272
273
274
275
276
277
278
279
280
281
  **
  **   Buffer zTemp: Contains copies of all tokens.
  **
  ** The second pass, in the block that begins "if( rc==SQLITE_DONE )" below,
  ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase
  ** structures.
  */

  rc = pModule->xOpen(pTokenizer, zInput, nInput, &pCursor);
  if( rc==SQLITE_OK ){
    int ii;
    pCursor->pTokenizer = pTokenizer;
    for(ii=0; rc==SQLITE_OK; ii++){
      const char *zByte;
      int nByte, iBegin, iEnd, iPos;
      rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos);
      if( rc==SQLITE_OK ){
        Fts3PhraseToken *pToken;








>
|


<







290
291
292
293
294
295
296
297
298
299
300

301
302
303
304
305
306
307
  **
  **   Buffer zTemp: Contains copies of all tokens.
  **
  ** The second pass, in the block that begins "if( rc==SQLITE_DONE )" below,
  ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase
  ** structures.
  */
  rc = sqlite3Fts3OpenTokenizer(
      pTokenizer, pParse->iLangid, zInput, nInput, &pCursor);
  if( rc==SQLITE_OK ){
    int ii;

    for(ii=0; rc==SQLITE_OK; ii++){
      const char *zByte;
      int nByte, iBegin, iEnd, iPos;
      rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos);
      if( rc==SQLITE_OK ){
        Fts3PhraseToken *pToken;

741
742
743
744
745
746
747

748
749
750
751
752
753
754
755
756
757


758

759
760
761
762
763
764
765
766
767
768
769
** that appears on the left-hand-side of the MATCH operator (the default
** column to match against for tokens for which a column name is not explicitly
** specified as part of the query string), or -1 if tokens may by default
** match any table column.
*/
int sqlite3Fts3ExprParse(
  sqlite3_tokenizer *pTokenizer,      /* Tokenizer module */

  char **azCol,                       /* Array of column names for fts3 table */
  int bFts4,                          /* True to allow FTS4-only syntax */
  int nCol,                           /* Number of entries in azCol[] */
  int iDefaultCol,                    /* Default column to query */
  const char *z, int n,               /* Text of MATCH query */
  Fts3Expr **ppExpr                   /* OUT: Parsed query structure */
){
  int nParsed;
  int rc;
  ParseContext sParse;


  sParse.pTokenizer = pTokenizer;

  sParse.azCol = (const char **)azCol;
  sParse.nCol = nCol;
  sParse.iDefaultCol = iDefaultCol;
  sParse.nNest = 0;
  sParse.bFts4 = bFts4;
  if( z==0 ){
    *ppExpr = 0;
    return SQLITE_OK;
  }
  if( n<0 ){
    n = (int)strlen(z);







>










>
>

>



<







767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791

792
793
794
795
796
797
798
** that appears on the left-hand-side of the MATCH operator (the default
** column to match against for tokens for which a column name is not explicitly
** specified as part of the query string), or -1 if tokens may by default
** match any table column.
*/
int sqlite3Fts3ExprParse(
  sqlite3_tokenizer *pTokenizer,      /* Tokenizer module */
  int iLangid,                        /* Language id for tokenizer */
  char **azCol,                       /* Array of column names for fts3 table */
  int bFts4,                          /* True to allow FTS4-only syntax */
  int nCol,                           /* Number of entries in azCol[] */
  int iDefaultCol,                    /* Default column to query */
  const char *z, int n,               /* Text of MATCH query */
  Fts3Expr **ppExpr                   /* OUT: Parsed query structure */
){
  int nParsed;
  int rc;
  ParseContext sParse;

  memset(&sParse, 0, sizeof(ParseContext));
  sParse.pTokenizer = pTokenizer;
  sParse.iLangid = iLangid;
  sParse.azCol = (const char **)azCol;
  sParse.nCol = nCol;
  sParse.iDefaultCol = iDefaultCol;

  sParse.bFts4 = bFts4;
  if( z==0 ){
    *ppExpr = 0;
    return SQLITE_OK;
  }
  if( n<0 ){
    n = (int)strlen(z);
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
    goto exprtest_out;
  }
  for(ii=0; ii<nCol; ii++){
    azCol[ii] = (char *)sqlite3_value_text(argv[ii+2]);
  }

  rc = sqlite3Fts3ExprParse(
      pTokenizer, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr
  );
  if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){
    sqlite3_result_error(context, "Error parsing expression", -1);
  }else if( rc==SQLITE_NOMEM || !(zBuf = exprToString(pExpr, 0)) ){
    sqlite3_result_error_nomem(context);
  }else{
    sqlite3_result_text(context, zBuf, -1, SQLITE_TRANSIENT);







|







975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
    goto exprtest_out;
  }
  for(ii=0; ii<nCol; ii++){
    azCol[ii] = (char *)sqlite3_value_text(argv[ii+2]);
  }

  rc = sqlite3Fts3ExprParse(
      pTokenizer, 0, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr
  );
  if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){
    sqlite3_result_error(context, "Error parsing expression", -1);
  }else if( rc==SQLITE_NOMEM || !(zBuf = exprToString(pExpr, 0)) ){
    sqlite3_result_error_nomem(context);
  }else{
    sqlite3_result_text(context, zBuf, -1, SQLITE_TRANSIENT);
Changes to ext/fts3/fts3_snippet.c.
528
529
530
531
532
533
534

535
536
537
538
539
540
541
** This is done as part of extracting the snippet text, not when selecting
** the snippet. Snippet selection is done based on doclists only, so there
** is no way for fts3BestSnippet() to know whether or not the document 
** actually contains terms that follow the final highlighted term. 
*/
static int fts3SnippetShift(
  Fts3Table *pTab,                /* FTS3 table snippet comes from */

  int nSnippet,                   /* Number of tokens desired for snippet */
  const char *zDoc,               /* Document text to extract snippet from */
  int nDoc,                       /* Size of buffer zDoc in bytes */
  int *piPos,                     /* IN/OUT: First token of snippet */
  u64 *pHlmask                    /* IN/OUT: Mask of tokens to highlight */
){
  u64 hlmask = *pHlmask;          /* Local copy of initial highlight-mask */







>







528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
** This is done as part of extracting the snippet text, not when selecting
** the snippet. Snippet selection is done based on doclists only, so there
** is no way for fts3BestSnippet() to know whether or not the document 
** actually contains terms that follow the final highlighted term. 
*/
static int fts3SnippetShift(
  Fts3Table *pTab,                /* FTS3 table snippet comes from */
  int iLangid,                    /* Language id to use in tokenizing */
  int nSnippet,                   /* Number of tokens desired for snippet */
  const char *zDoc,               /* Document text to extract snippet from */
  int nDoc,                       /* Size of buffer zDoc in bytes */
  int *piPos,                     /* IN/OUT: First token of snippet */
  u64 *pHlmask                    /* IN/OUT: Mask of tokens to highlight */
){
  u64 hlmask = *pHlmask;          /* Local copy of initial highlight-mask */
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
      sqlite3_tokenizer_module *pMod;
      sqlite3_tokenizer_cursor *pC;
      pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;

      /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
      ** or more tokens in zDoc/nDoc.
      */
      rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      pC->pTokenizer = pTab->pTokenizer;
      while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
        const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
        rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
      }
      pMod->xClose(pC);
      if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }








|



<







564
565
566
567
568
569
570
571
572
573
574

575
576
577
578
579
580
581
      sqlite3_tokenizer_module *pMod;
      sqlite3_tokenizer_cursor *pC;
      pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;

      /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired)
      ** or more tokens in zDoc/nDoc.
      */
      rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){
        const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3;
        rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent);
      }
      pMod->xClose(pC);
      if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; }

627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
    }
    return SQLITE_OK;
  }
  nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);

  /* Open a token cursor on the document. */
  pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  pC->pTokenizer = pTab->pTokenizer;

  while( rc==SQLITE_OK ){
    int iBegin;                   /* Offset in zDoc of start of token */
    int iFin;                     /* Offset in zDoc of end of token */
    int isHighlight;              /* True for highlighted terms */

    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);







|



<







627
628
629
630
631
632
633
634
635
636
637

638
639
640
641
642
643
644
    }
    return SQLITE_OK;
  }
  nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol);

  /* Open a token cursor on the document. */
  pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule;
  rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC);
  if( rc!=SQLITE_OK ){
    return rc;
  }


  while( rc==SQLITE_OK ){
    int iBegin;                   /* Offset in zDoc of start of token */
    int iFin;                     /* Offset in zDoc of end of token */
    int isHighlight;              /* True for highlighted terms */

    rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent);
653
654
655
656
657
658
659
660


661
662
663
664
665
666
667
      }
      break;
    }
    if( iCurrent<iPos ){ continue; }

    if( !isShiftDone ){
      int n = nDoc - iBegin;
      rc = fts3SnippetShift(pTab, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask);


      isShiftDone = 1;

      /* Now that the shift has been done, check if the initial "..." are
      ** required. They are required if (a) this is not the first fragment,
      ** or (b) this fragment does not begin at position 0 of its column. 
      */
      if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){







|
>
>







652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
      }
      break;
    }
    if( iCurrent<iPos ){ continue; }

    if( !isShiftDone ){
      int n = nDoc - iBegin;
      rc = fts3SnippetShift(
          pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask
      );
      isShiftDone = 1;

      /* Now that the shift has been done, check if the initial "..." are
      ** required. They are required if (a) this is not the first fragment,
      ** or (b) this fragment does not begin at position 0 of its column. 
      */
      if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){
1386
1387
1388
1389
1390
1391
1392

1393

1394
1395
1396
1397
1398
1399
1400
1401
1402
        continue;
      }
      rc = SQLITE_NOMEM;
      goto offsets_out;
    }

    /* Initialize a tokenizer iterator to iterate through column iCol. */

    rc = pMod->xOpen(pTab->pTokenizer, zDoc, nDoc, &pC);

    if( rc!=SQLITE_OK ) goto offsets_out;
    pC->pTokenizer = pTab->pTokenizer;

    rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
    while( rc==SQLITE_OK ){
      int i;                      /* Used to loop through terms */
      int iMinPos = 0x7FFFFFFF;   /* Position of next token */
      TermOffset *pTerm = 0;      /* TermOffset associated with next token */








>
|
>

<







1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397

1398
1399
1400
1401
1402
1403
1404
        continue;
      }
      rc = SQLITE_NOMEM;
      goto offsets_out;
    }

    /* Initialize a tokenizer iterator to iterate through column iCol. */
    rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid,
        zDoc, nDoc, &pC
    );
    if( rc!=SQLITE_OK ) goto offsets_out;


    rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent);
    while( rc==SQLITE_OK ){
      int i;                      /* Used to loop through terms */
      int iMinPos = 0x7FFFFFFF;   /* Position of next token */
      TermOffset *pTerm = 0;      /* TermOffset associated with next token */

Changes to ext/fts3/fts3_test.c.
9
10
11
12
13
14
15



16
17
18
19
20
21
22
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test if a document
** matches an FTS NEAR expression.



*/

#include <tcl.h>
#include <string.h>
#include <assert.h>

#ifdef SQLITE_TEST







>
>
>







9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
**    May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file is not part of the production FTS code. It is only used for
** testing. It contains a Tcl command that can be used to test if a document
** matches an FTS NEAR expression.
**
** As of March 2012, it also contains a version 1 tokenizer used for testing
** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly.
*/

#include <tcl.h>
#include <string.h>
#include <assert.h>

#ifdef SQLITE_TEST
310
311
312
313
314
315
316

































































































































































































317
318
319
320
321



322
323
324

  Tcl_SetObjResult(interp, pRet);
  Tcl_DecrRefCount(pRet);
#endif
  return TCL_OK;
}


































































































































































































int Sqlitetestfts3_Init(Tcl_Interp *interp){
  Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
  Tcl_CreateObjCommand(interp, 
      "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
  );



  return TCL_OK;
}
#endif                  /* ifdef SQLITE_TEST */







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





>
>
>



313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523

  Tcl_SetObjResult(interp, pRet);
  Tcl_DecrRefCount(pRet);
#endif
  return TCL_OK;
}

/**************************************************************************
** Beginning of test tokenizer code.
**
** For language 0, this tokenizer is similar to the default 'simple' 
** tokenizer. For other languages L, the following:
**
**   * Odd numbered languages are case-sensitive. Even numbered 
**     languages are not.
**
**   * Language ids 100 or greater are considered an error.
**
** The implementation assumes that the input contains only ASCII characters
** (i.e. those that may be encoded in UTF-8 using a single byte).
*/
typedef struct test_tokenizer {
  sqlite3_tokenizer base;
} test_tokenizer;

typedef struct test_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *aInput;          /* Input being tokenized */
  int nInput;                  /* Size of the input in bytes */
  int iInput;                  /* Current offset in aInput */
  int iToken;                  /* Index of next token to be returned */
  char *aBuffer;               /* Buffer containing current token */
  int nBuffer;                 /* Number of bytes allocated at pToken */
  int iLangid;                 /* Configured language id */
} test_tokenizer_cursor;

static int testTokenizerCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  test_tokenizer *pNew;

  pNew = sqlite3_malloc(sizeof(test_tokenizer));
  if( !pNew ) return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(test_tokenizer));

  *ppTokenizer = (sqlite3_tokenizer *)pNew;
  return SQLITE_OK;
}

static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){
  test_tokenizer *p = (test_tokenizer *)pTokenizer;
  sqlite3_free(p);
  return SQLITE_OK;
}

static int testTokenizerOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  int rc = SQLITE_OK;                    /* Return code */
  test_tokenizer_cursor *pCsr;           /* New cursor object */

  UNUSED_PARAMETER(pTokenizer);

  pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor));
  if( pCsr==0 ){
    rc = SQLITE_NOMEM;
  }else{
    memset(pCsr, 0, sizeof(test_tokenizer_cursor));
    pCsr->aInput = pInput;
    if( nBytes<0 ){
      pCsr->nInput = strlen(pInput);
    }else{
      pCsr->nInput = nBytes;
    }
  }

  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
  return rc;
}

static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  sqlite3_free(pCsr->aBuffer);
  sqlite3_free(pCsr);
  return SQLITE_OK;
}

static int testIsTokenChar(char c){
  return (c>='a' && c<='z') || (c>='A' && c<='Z');
}
static int testTolower(char c){
  char ret = c;
  if( ret>='A' && ret<='Z') ret = ret - ('A'-'a');
  return ret;
}

static int testTokenizerNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by testTokenizerOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  int rc = SQLITE_OK;
  const char *p;
  const char *pEnd;

  p = &pCsr->aInput[pCsr->iInput];
  pEnd = &pCsr->aInput[pCsr->nInput];

  /* Skip past any white-space */
  assert( p<=pEnd );
  while( p<pEnd && testIsTokenChar(*p)==0 ) p++;

  if( p==pEnd ){
    rc = SQLITE_DONE;
  }else{
    /* Advance to the end of the token */
    const char *pToken = p;
    int nToken;
    while( p<pEnd && testIsTokenChar(*p) ) p++;
    nToken = p-pToken;

    /* Copy the token into the buffer */
    if( nToken>pCsr->nBuffer ){
      sqlite3_free(pCsr->aBuffer);
      pCsr->aBuffer = sqlite3_malloc(nToken);
    }
    if( pCsr->aBuffer==0 ){
      rc = SQLITE_NOMEM;
    }else{
      int i;

      if( pCsr->iLangid & 0x00000001 ){
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i];
      }else{
        for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]);
      }
      pCsr->iToken++;
      pCsr->iInput = p - pCsr->aInput;

      *ppToken = pCsr->aBuffer;
      *pnBytes = nToken;
      *piStartOffset = pToken - pCsr->aInput;
      *piEndOffset = p - pCsr->aInput;
      *piPosition = pCsr->iToken;
    }
  }

  return rc;
}

static int testTokenizerLanguage(
  sqlite3_tokenizer_cursor *pCursor,
  int iLangid
){
  int rc = SQLITE_OK;
  test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor;
  pCsr->iLangid = iLangid;
  if( pCsr->iLangid>=100 ){
    rc = SQLITE_ERROR;
  }
  return rc;
}

static int fts3_test_tokenizer_cmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *CONST objv[]
){
  static const sqlite3_tokenizer_module testTokenizerModule = {
    1,
    testTokenizerCreate,
    testTokenizerDestroy,
    testTokenizerOpen,
    testTokenizerClose,
    testTokenizerNext,
    testTokenizerLanguage
  };
  const sqlite3_tokenizer_module *pPtr = &testTokenizerModule;
  if( objc!=1 ){
    Tcl_WrongNumArgs(interp, 1, objv, "");
    return TCL_ERROR;
  }
  Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(
    (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *)
  ));
  return TCL_OK;
}

/* 
** End of tokenizer code.
**************************************************************************/ 

int Sqlitetestfts3_Init(Tcl_Interp *interp){
  Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0);
  Tcl_CreateObjCommand(interp, 
      "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0
  );
  Tcl_CreateObjCommand(
      interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0
  );
  return TCL_OK;
}
#endif                  /* ifdef SQLITE_TEST */
Changes to ext/fts3/fts3_tokenizer.c.
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }
  pCsr->pTokenizer = pTokenizer;

  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));







|



<







284
285
286
287
288
289
290
291
292
293
294

295
296
297
298
299
300
301
  Tcl_IncrRefCount(pRet);

  if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
    zErr = "error in xCreate()";
    goto finish;
  }
  pTokenizer->pModule = p;
  if( sqlite3Fts3OpenTokenizer(pTokenizer, 0, zInput, nInput, &pCsr) ){
    zErr = "error in xOpen()";
    goto finish;
  }


  while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
    zToken = &zInput[iStart];
    nToken = iEnd-iStart;
    Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
Changes to ext/fts3/fts3_tokenizer.h.
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if







|







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;

struct sqlite3_tokenizer_module {

  /*
  ** Structure version. Should always be set to 0 or 1.
  */
  int iVersion;

  /*
  ** Create a new tokenizer. The values in the argv[] array are the
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  ** TABLE statement that created the fts3 table. For example, if
129
130
131
132
133
134
135









136
137
138
139
140
141
142
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );









};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};








>
>
>
>
>
>
>
>
>







129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
  int (*xNext)(
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
    int *piPosition      /* OUT: Number of tokens returned before this one */
  );

  /***********************************************************************
  ** Methods below this point are only available if iVersion>=1.
  */

  /* 
  ** Configure the language id of a tokenizer cursor.
  */
  int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
};

struct sqlite3_tokenizer {
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
  /* Tokenizer implementations will typically add additional fields */
};

Changes to ext/fts3/fts3_write.c.
653
654
655
656
657
658
659

660
661
662
663
664
665
666
** pending-terms hash-table. The docid used is that currently stored in
** p->iPrevDocid, and the column is specified by argument iCol.
**
** If successful, SQLITE_OK is returned. Otherwise, an SQLite error code.
*/
static int fts3PendingTermsAdd(
  Fts3Table *p,                   /* Table into which text will be inserted */

  const char *zText,              /* Text of document to be inserted */
  int iCol,                       /* Column into which text is being inserted */
  u32 *pnWord                     /* OUT: Number of tokens inserted */
){
  int rc;
  int iStart;
  int iEnd;







>







653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
** pending-terms hash-table. The docid used is that currently stored in
** p->iPrevDocid, and the column is specified by argument iCol.
**
** If successful, SQLITE_OK is returned. Otherwise, an SQLite error code.
*/
static int fts3PendingTermsAdd(
  Fts3Table *p,                   /* Table into which text will be inserted */
  int iLangid,                    /* Language id to use */
  const char *zText,              /* Text of document to be inserted */
  int iCol,                       /* Column into which text is being inserted */
  u32 *pnWord                     /* OUT: Number of tokens inserted */
){
  int rc;
  int iStart;
  int iEnd;
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
  ** zText==0. In this case, add zero token entries to the hash table and 
  ** return early. */
  if( zText==0 ){
    *pnWord = 0;
    return SQLITE_OK;
  }

  rc = pModule->xOpen(pTokenizer, zText, -1, &pCsr);
  if( rc!=SQLITE_OK ){
    return rc;
  }
  pCsr->pTokenizer = pTokenizer;

  xNext = pModule->xNext;
  while( SQLITE_OK==rc
      && SQLITE_OK==(rc = xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos))
  ){
    int i;
    if( iPos>=nWord ) nWord = iPos+1;







|



<







683
684
685
686
687
688
689
690
691
692
693

694
695
696
697
698
699
700
  ** zText==0. In this case, add zero token entries to the hash table and 
  ** return early. */
  if( zText==0 ){
    *pnWord = 0;
    return SQLITE_OK;
  }

  rc = sqlite3Fts3OpenTokenizer(pTokenizer, iLangid, zText, -1, &pCsr);
  if( rc!=SQLITE_OK ){
    return rc;
  }


  xNext = pModule->xNext;
  while( SQLITE_OK==rc
      && SQLITE_OK==(rc = xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos))
  ){
    int i;
    if( iPos>=nWord ) nWord = iPos+1;
779
780
781
782
783
784
785
786





787
788
789
790
791
792
793
794
795
796
797
** This function is called by the xUpdate() method as part of an INSERT
** operation. It adds entries for each term in the new record to the
** pendingTerms hash table.
**
** Argument apVal is the same as the similarly named argument passed to
** fts3InsertData(). Parameter iDocid is the docid of the new row.
*/
static int fts3InsertTerms(Fts3Table *p, sqlite3_value **apVal, u32 *aSz){





  int i;                          /* Iterator variable */
  for(i=2; i<p->nColumn+2; i++){
    const char *zText = (const char *)sqlite3_value_text(apVal[i]);
    int rc = fts3PendingTermsAdd(p, zText, i-2, &aSz[i-2]);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    aSz[p->nColumn] += sqlite3_value_bytes(apVal[i]);
  }
  return SQLITE_OK;
}







|
>
>
>
>
>



|







779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
** This function is called by the xUpdate() method as part of an INSERT
** operation. It adds entries for each term in the new record to the
** pendingTerms hash table.
**
** Argument apVal is the same as the similarly named argument passed to
** fts3InsertData(). Parameter iDocid is the docid of the new row.
*/
static int fts3InsertTerms(
  Fts3Table *p, 
  int iLangid, 
  sqlite3_value **apVal, 
  u32 *aSz
){
  int i;                          /* Iterator variable */
  for(i=2; i<p->nColumn+2; i++){
    const char *zText = (const char *)sqlite3_value_text(apVal[i]);
    int rc = fts3PendingTermsAdd(p, iLangid, zText, i-2, &aSz[i-2]);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    aSz[p->nColumn] += sqlite3_value_bytes(apVal[i]);
  }
  return SQLITE_OK;
}
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
  sqlite3_stmt *pSelect;

  if( *pRC ) return;
  rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pSelect, &pRowid);
  if( rc==SQLITE_OK ){
    if( SQLITE_ROW==sqlite3_step(pSelect) ){
      int i;
      rc = fts3PendingTermsDocid(p, 
          langidFromSelect(p, pSelect), 
          sqlite3_column_int64(pSelect, 0)
      );
      for(i=1; rc==SQLITE_OK && i<=p->nColumn; i++){
        const char *zText = (const char *)sqlite3_column_text(pSelect, i);
        rc = fts3PendingTermsAdd(p, zText, -1, &aSz[i-1]);
        aSz[p->nColumn] += sqlite3_column_bytes(pSelect, i);
      }
      if( rc!=SQLITE_OK ){
        sqlite3_reset(pSelect);
        *pRC = rc;
        return;
      }







<
|
|
<


|







934
935
936
937
938
939
940

941
942

943
944
945
946
947
948
949
950
951
952
  sqlite3_stmt *pSelect;

  if( *pRC ) return;
  rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pSelect, &pRowid);
  if( rc==SQLITE_OK ){
    if( SQLITE_ROW==sqlite3_step(pSelect) ){
      int i;

      int iLangid = langidFromSelect(p, pSelect);
      rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pSelect, 0));

      for(i=1; rc==SQLITE_OK && i<=p->nColumn; i++){
        const char *zText = (const char *)sqlite3_column_text(pSelect, i);
        rc = fts3PendingTermsAdd(p, iLangid, zText, -1, &aSz[i-1]);
        aSz[p->nColumn] += sqlite3_column_bytes(pSelect, i);
      }
      if( rc!=SQLITE_OK ){
        sqlite3_reset(pSelect);
        *pRC = rc;
        return;
      }
3098
3099
3100
3101
3102
3103
3104

3105
3106
3107
3108
3109
3110
3111
3112
3113
3114
3115
3116
3117
3118
        aSzIns = &aSz[p->nColumn+1];
        aSzDel = &aSzIns[p->nColumn+1];
      }
    }

    while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){
      int iCol;

      rc = fts3PendingTermsDocid(p, 
          langidFromSelect(p, pStmt), sqlite3_column_int64(pStmt, 0)
      );
      aSz[p->nColumn] = 0;
      for(iCol=0; rc==SQLITE_OK && iCol<p->nColumn; iCol++){
        const char *z = (const char *) sqlite3_column_text(pStmt, iCol+1);
        rc = fts3PendingTermsAdd(p, z, iCol, &aSz[iCol]);
        aSz[p->nColumn] += sqlite3_column_bytes(pStmt, iCol+1);
      }
      if( p->bHasDocsize ){
        fts3InsertDocsize(&rc, p, aSz);
      }
      if( rc!=SQLITE_OK ){
        sqlite3_finalize(pStmt);







>
|
<
<



|







3101
3102
3103
3104
3105
3106
3107
3108
3109


3110
3111
3112
3113
3114
3115
3116
3117
3118
3119
3120
        aSzIns = &aSz[p->nColumn+1];
        aSzDel = &aSzIns[p->nColumn+1];
      }
    }

    while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){
      int iCol;
      int iLangid = langidFromSelect(p, pStmt);
      rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pStmt, 0));


      aSz[p->nColumn] = 0;
      for(iCol=0; rc==SQLITE_OK && iCol<p->nColumn; iCol++){
        const char *z = (const char *) sqlite3_column_text(pStmt, iCol+1);
        rc = fts3PendingTermsAdd(p, iLangid, z, iCol, &aSz[iCol]);
        aSz[p->nColumn] += sqlite3_column_bytes(pStmt, iCol+1);
      }
      if( p->bHasDocsize ){
        fts3InsertDocsize(&rc, p, aSz);
      }
      if( rc!=SQLITE_OK ){
        sqlite3_finalize(pStmt);
3223
3224
3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
    assert( pCsr->isRequireSeek==0 );
    iDocid = sqlite3_column_int64(pCsr->pStmt, 0);
  
    for(i=0; i<p->nColumn && rc==SQLITE_OK; i++){
      const char *zText = (const char *)sqlite3_column_text(pCsr->pStmt, i+1);
      sqlite3_tokenizer_cursor *pTC = 0;
  
      rc = pModule->xOpen(pT, zText, -1, &pTC);
      while( rc==SQLITE_OK ){
        char const *zToken;       /* Buffer containing token */
        int nToken;               /* Number of bytes in token */
        int iDum1, iDum2;         /* Dummy variables */
        int iPos;                 /* Position of token in zText */
  
        pTC->pTokenizer = pT;
        rc = pModule->xNext(pTC, &zToken, &nToken, &iDum1, &iDum2, &iPos);
        for(pDef=pCsr->pDeferred; pDef && rc==SQLITE_OK; pDef=pDef->pNext){
          Fts3PhraseToken *pPT = pDef->pToken;
          if( (pDef->iCol>=p->nColumn || pDef->iCol==i)
           && (pPT->bFirst==0 || iPos==0)
           && (pPT->n==nToken || (pPT->isPrefix && pPT->n<nToken))
           && (0==memcmp(zToken, pPT->z, pPT->n))







|






<







3225
3226
3227
3228
3229
3230
3231
3232
3233
3234
3235
3236
3237
3238

3239
3240
3241
3242
3243
3244
3245
    assert( pCsr->isRequireSeek==0 );
    iDocid = sqlite3_column_int64(pCsr->pStmt, 0);
  
    for(i=0; i<p->nColumn && rc==SQLITE_OK; i++){
      const char *zText = (const char *)sqlite3_column_text(pCsr->pStmt, i+1);
      sqlite3_tokenizer_cursor *pTC = 0;
  
      rc = sqlite3Fts3OpenTokenizer(pT, pCsr->iLangid, zText, -1, &pTC);
      while( rc==SQLITE_OK ){
        char const *zToken;       /* Buffer containing token */
        int nToken;               /* Number of bytes in token */
        int iDum1, iDum2;         /* Dummy variables */
        int iPos;                 /* Position of token in zText */
  

        rc = pModule->xNext(pTC, &zToken, &nToken, &iDum1, &iDum2, &iPos);
        for(pDef=pCsr->pDeferred; pDef && rc==SQLITE_OK; pDef=pDef->pNext){
          Fts3PhraseToken *pPT = pDef->pToken;
          if( (pDef->iCol>=p->nColumn || pDef->iCol==i)
           && (pPT->bFirst==0 || iPos==0)
           && (pPT->n==nToken || (pPT->isPrefix && pPT->n<nToken))
           && (0==memcmp(zToken, pPT->z, pPT->n))
3463
3464
3465
3466
3467
3468
3469

3470
3471
3472
3473
3474
3475
3476
3477
3478
3479
3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
3491
    assert( sqlite3_value_type(apVal[0])==SQLITE_INTEGER );
    rc = fts3DeleteByRowid(p, apVal[0], &nChng, aSzDel);
    isRemove = 1;
  }
  
  /* If this is an INSERT or UPDATE operation, insert the new record. */
  if( nArg>1 && rc==SQLITE_OK ){

    if( bInsertDone==0 ){
      rc = fts3InsertData(p, apVal, pRowid);
      if( rc==SQLITE_CONSTRAINT && p->zContentTbl==0 ){
        rc = FTS_CORRUPT_VTAB;
      }
    }
    if( rc==SQLITE_OK && (!isRemove || *pRowid!=p->iPrevDocid ) ){
      rc = fts3PendingTermsDocid(p, 
          sqlite3_value_int(apVal[2 + p->nColumn + 2]),
          *pRowid
      );
    }
    if( rc==SQLITE_OK ){
      assert( p->iPrevDocid==*pRowid );
      rc = fts3InsertTerms(p, apVal, aSzIns);
    }
    if( p->bHasDocsize ){
      fts3InsertDocsize(&rc, p, aSzIns);
    }
    nChng++;
  }








>







|
<
<
<



|







3464
3465
3466
3467
3468
3469
3470
3471
3472
3473
3474
3475
3476
3477
3478
3479



3480
3481
3482
3483
3484
3485
3486
3487
3488
3489
3490
    assert( sqlite3_value_type(apVal[0])==SQLITE_INTEGER );
    rc = fts3DeleteByRowid(p, apVal[0], &nChng, aSzDel);
    isRemove = 1;
  }
  
  /* If this is an INSERT or UPDATE operation, insert the new record. */
  if( nArg>1 && rc==SQLITE_OK ){
    int iLangid = sqlite3_value_int(apVal[2 + p->nColumn + 2]);
    if( bInsertDone==0 ){
      rc = fts3InsertData(p, apVal, pRowid);
      if( rc==SQLITE_CONSTRAINT && p->zContentTbl==0 ){
        rc = FTS_CORRUPT_VTAB;
      }
    }
    if( rc==SQLITE_OK && (!isRemove || *pRowid!=p->iPrevDocid ) ){
      rc = fts3PendingTermsDocid(p, iLangid, *pRowid);



    }
    if( rc==SQLITE_OK ){
      assert( p->iPrevDocid==*pRowid );
      rc = fts3InsertTerms(p, iLangid, apVal, aSzIns);
    }
    if( p->bHasDocsize ){
      fts3InsertDocsize(&rc, p, aSzIns);
    }
    nChng++;
  }

Changes to test/fts4langid.test.
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#   2.2.* - Same as 2.1.*, after an 'optimize' command.
#
#   2.3.* - Same as 2.1.*, after a 'rebuild' command.
#
#   3.* - Tests with content= tables. Both where there is a real 
#         underlying content table and where there is not.
#
#
#   4.* - Test that if one is provided, the tokenizer xLanguage method
#         is called to configure the tokenizer before tokenizing query
#         or document text.
#
#   5.* - Test the fts4aux table when the associated FTS4 table contains
#         multiple languages.
#







<







36
37
38
39
40
41
42

43
44
45
46
47
48
49
#   2.2.* - Same as 2.1.*, after an 'optimize' command.
#
#   2.3.* - Same as 2.1.*, after a 'rebuild' command.
#
#   3.* - Tests with content= tables. Both where there is a real 
#         underlying content table and where there is not.
#

#   4.* - Test that if one is provided, the tokenizer xLanguage method
#         is called to configure the tokenizer before tokenizing query
#         or document text.
#
#   5.* - Test the fts4aux table when the associated FTS4 table contains
#         multiple languages.
#
338
339
340
341
342
343
344


345






346







347


















348





do_test_query1 3.3.3 {zero one two} {
  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
}
do_test_query1 3.3.4 {"zero one" OR "one two"} {
  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
}


















finish_test































>
>
|
>
>
>
>
>
>
|
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
do_test_query1 3.3.3 {zero one two} {
  and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two]
}
do_test_query1 3.3.4 {"zero one" OR "one two"} {
  or_merge_lists [rowid_list "zero one"] [rowid_list "one two"]
}

#-------------------------------------------------------------------------
# Test cases 4.*
#
proc build_multilingual_db_2 {db} {
  $db eval {
    CREATE VIRTUAL TABLE t4 USING fts4(
        tokenize=testtokenizer, 
        languageid=lid
    );
  }
  for {set i 0} {$i < 50} {incr i} {
    execsql { 
      INSERT INTO t4(docid, content, lid) VALUES($i, 'The Quick Brown Fox', $i) 
    }
  }
}

do_test 4.1.0 {
  reset_db
  set ptr [fts3_test_tokenizer]
  execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) }
  build_multilingual_db_2 db
} {}
do_execsql_test 4.1.1 {
  SELECT docid FROM t4 WHERE t4 MATCH 'quick';
} {0}
do_execsql_test 4.1.2 {
  SELECT docid FROM t4 WHERE t4 MATCH 'quick' AND lid=1;
} {}
do_execsql_test 4.1.3 {
  SELECT docid FROM t4 WHERE t4 MATCH 'Quick' AND lid=1;
} {1}
for {set i 0} {$i < 50} {incr i} {
  do_execsql_test 4.1.4.$i {
    SELECT count(*) FROM t4 WHERE t4 MATCH 'fox' AND lid=$i;
  } [expr 0==($i%2)]
}
do_catchsql_test 4.1.5 {
  INSERT INTO t4(content, lid) VALUES('hello world', 101)
} {1 {SQL logic error or missing database}}

finish_test
Changes to test/permutations.test.
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
  fts3defer.test fts3defer2.test fts3e.test fts3expr.test fts3expr2.test 
  fts3near.test fts3query.test fts3shared.test fts3snippet.test 
  fts3sort.test
  fts3fault.test fts3malloc.test fts3matchinfo.test
  fts3aux1.test fts3comp1.test fts3auto.test
  fts4aa.test fts4content.test
  fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test
  fts3corrupt2.test
  fts3first.test
}


lappend ::testsuitelist xxx
#-------------------------------------------------------------------------
# Define the coverage related test suites:
#







|
<







180
181
182
183
184
185
186
187

188
189
190
191
192
193
194
  fts3defer.test fts3defer2.test fts3e.test fts3expr.test fts3expr2.test 
  fts3near.test fts3query.test fts3shared.test fts3snippet.test 
  fts3sort.test
  fts3fault.test fts3malloc.test fts3matchinfo.test
  fts3aux1.test fts3comp1.test fts3auto.test
  fts4aa.test fts4content.test
  fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test
  fts3corrupt2.test fts3first.test fts4langid.test

}


lappend ::testsuitelist xxx
#-------------------------------------------------------------------------
# Define the coverage related test suites:
#