Index: ext/fts5/extract_api_docs.tcl
==================================================================
--- ext/fts5/extract_api_docs.tcl
+++ ext/fts5/extract_api_docs.tcl
@@ -106,17 +106,19 @@
regexp {[*][*](.*)} $line -> line
if {[regexp {^ ?x.*:} $line]} {
append res "
\n"
continue
}
+ if {[regexp {SYNONYM SUPPORT} $line]} {
+ set line "
Synonym Support
"
+ }
if {[string trim $line] == ""} {
append res "\n"
} else {
append res "$line\n"
}
}
- append res "\n"
set res
}
proc get_api_docs {data} {
@@ -206,10 +208,14 @@
output [get_fts5_struct $data "typedef struct fts5_api" "^\};"]
}
fts5_tokenizer {
output [get_fts5_struct $data "typedef struct Fts5Tokenizer" "^\};"]
+ output [get_fts5_struct $data \
+ "Flags that may be passed as the third argument to xTokenize()" \
+ "#define FTS5_TOKEN_COLOCATED"
+ ]
}
fts5_extension {
output [get_fts5_struct $data "typedef.*Fts5ExtensionApi" "^.;"]
}
Index: ext/fts5/fts5.h
==================================================================
--- ext/fts5/fts5.h
+++ ext/fts5/fts5.h
@@ -215,11 +215,11 @@
int (*xColumnTotalSize)(Fts5Context*, int iCol, sqlite3_int64 *pnToken);
int (*xTokenize)(Fts5Context*,
const char *pText, int nText, /* Text to tokenize */
void *pCtx, /* Context passed to xToken() */
- int (*xToken)(void*, const char*, int, int, int) /* Callback */
+ int (*xToken)(void*, int, const char*, int, int, int) /* Callback */
);
int (*xPhraseCount)(Fts5Context*);
int (*xPhraseSize)(Fts5Context*, int iPhrase);
@@ -276,21 +276,49 @@
** allocated using xCreate(). Fts5 guarantees that this function will
** be invoked exactly once for each successful call to xCreate().
**
** xTokenize:
** This function is expected to tokenize the nText byte string indicated
-** by argument pText. pText may not be nul-terminated. The first argument
-** passed to this function is a pointer to an Fts5Tokenizer object returned
-** by an earlier call to xCreate().
+** by argument pText. pText may or may not be nul-terminated. The first
+** argument passed to this function is a pointer to an Fts5Tokenizer object
+** returned by an earlier call to xCreate().
+**
+** The third argument indicates the reason that FTS5 is requesting
+** tokenization of the supplied text. This is always one of the following
+** four values:
+**
+**
- FTS5_TOKENIZE_DOCUMENT - A document is being inserted into
+** or removed from the FTS table. The tokenizer is being invoked to
+** determine the set of tokens to add to (or delete from) the
+** FTS index.
+**
+**
- FTS5_TOKENIZE_QUERY - A MATCH query is being executed
+** against the FTS index. The tokenizer is being called to tokenize
+** a bareword or quoted string specified as part of the query.
+**
+**
- (FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX) - Same as
+** FTS5_TOKENIZE_QUERY, except that the bareword or quoted string is
+** followed by a "*" character, indicating that the last token
+** returned by the tokenizer will be treated as a token prefix.
+**
+**
- FTS5_TOKENIZE_AUX - The tokenizer is being invoked to
+** satisfy an fts5_api.xTokenize() request made by an auxiliary
+** function. Or an fts5_api.xColumnSize() request made by the same
+** on a columnsize=0 database.
+**
**
** For each token in the input string, the supplied callback xToken() must
** be invoked. The first argument to it should be a copy of the pointer
-** passed as the second argument to xTokenize(). The next two arguments
-** are a pointer to a buffer containing the token text, and the size of
-** the token in bytes. The 4th and 5th arguments are the byte offsets of
-** the first byte of and first byte immediately following the text from
+** passed as the second argument to xTokenize(). The third and fourth
+** arguments are a pointer to a buffer containing the token text, and the
+** size of the token in bytes. The 5th and 6th arguments are the byte offsets
+** of the first byte of and first byte immediately following the text from
** which the token is derived within the input.
+**
+** The second argument passed to the xToken() callback ("tflags") should
+** normally be set to 0. The exception is if the tokenizer supports
+** synonyms. In this case see the discussion below for details.
**
** FTS5 assumes the xToken() callback is invoked for each token in the
** order that they occur within the input text.
**
** If an xToken() callback returns any value other than SQLITE_OK, then
@@ -299,39 +327,157 @@
** input buffer is exhausted, xTokenize() should return SQLITE_OK. Finally,
** if an error occurs with the xTokenize() implementation itself, it
** may abandon the tokenization and return any error code other than
** SQLITE_OK or SQLITE_DONE.
**
+** SYNONYM SUPPORT
+**
+** Custom tokenizers may also support synonyms. Consider a case in which a
+** user wishes to query for a phrase such as "first place". Using the
+** built-in tokenizers, the FTS5 query 'first + place' will match instances
+** of "first place" within the document set, but not alternative forms
+** such as "1st place". In some applications, it would be better to match
+** all instances of "first place" or "1st place" regardless of which form
+** the user specified in the MATCH query text.
+**
+** There are several ways to approach this in FTS5:
+**
+** - By mapping all synonyms to a single token. In this case, using
+** the above example, this means that the tokenizer returns the
+** same token for inputs "first" and "1st". Say that token is in
+** fact "first", so that when the user inserts the document "I won
+** 1st place" entries are added to the index for tokens "i", "won",
+** "first" and "place". If the user then queries for '1st + place',
+** the tokenizer substitutes "first" for "1st" and the query works
+** as expected.
+**
+**
- By querying the index for all synonyms of each query term separately.
+** In this case, when tokenizing query text, the tokenizer may
+** provide multiple synonyms for a single term within the document.
+** FTS5 then queries the index for each synonym individually. For
+** example, faced with the query:
+**
+**
+** ... MATCH 'first place'
+**
+** the tokenizer offers both "1st" and "first" as synonyms for the
+** first token in the MATCH query and FTS5 effectively runs a query
+** similar to:
+**
+**
+** ... MATCH '(first OR 1st) place'
+**
+** except that, for the purposes of auxiliary functions, the query
+** still appears to contain just two phrases - "(first OR 1st)"
+** being treated as a single phrase.
+**
+**
- By adding multiple synonyms for a single term to the FTS index.
+** Using this method, when tokenizing document text, the tokenizer
+** provides multiple synonyms for each token. So that when a
+** document such as "I won first place" is tokenized, entries are
+** added to the FTS index for "i", "won", "first", "1st" and
+** "place".
+**
+** This way, even if the tokenizer does not provide synonyms
+** when tokenizing query text (it should not - to do so would be
+** inefficient), it doesn't matter if the user queries for
+** 'first + place' or '1st + place', as there are entries in the
+** FTS index corresponding to both forms of the first token.
+**
+**
+** Whether it is parsing document or query text, any call to xToken that
+** specifies a tflags argument with the FTS5_TOKEN_COLOCATED bit
+** is considered to supply a synonym for the previous token. For example,
+** when parsing the document "I won first place", a tokenizer that supports
+** synonyms would call xToken() 5 times, as follows:
+**
+**
+** xToken(pCtx, 0, "i", 1, 0, 1);
+** xToken(pCtx, 0, "won", 3, 2, 5);
+** xToken(pCtx, 0, "first", 5, 6, 11);
+** xToken(pCtx, FTS5_TOKEN_COLOCATED, "1st", 3, 6, 11);
+** xToken(pCtx, 0, "place", 5, 12, 17);
+**
+**
+** It is an error to specify the FTS5_TOKEN_COLOCATED flag the first time
+** xToken() is called. Multiple synonyms may be specified for a single token
+** by making multiple calls to xToken(FTS5_TOKEN_COLOCATED) in sequence.
+** There is no limit to the number of synonyms that may be provided for a
+** single token.
+**
+** In many cases, method (1) above is the best approach. It does not add
+** extra data to the FTS index or require FTS5 to query for multiple terms,
+** so it is efficient in terms of disk space and query speed. However, it
+** does not support prefix queries very well. If, as suggested above, the
+** token "first" is substituted for "1st" by the tokenizer, then the query:
+**
+**
+** ... MATCH '1s*'
+**
+** will not match documents that contain the token "1st" (as the tokenizer
+** will probably not map "1s" to any prefix of "first").
+**
+** For full prefix support, method (3) may be preferred. In this case,
+** because the index contains entries for both "first" and "1st", prefix
+** queries such as 'fi*' or '1s*' will match correctly. However, because
+** extra entries are added to the FTS index, this method uses more space
+** within the database.
+**
+** Method (2) offers a midpoint between (1) and (3). Using this method,
+** a query such as '1s*' will match documents that contain the literal
+** token "1st", but not "first" (assuming the tokenizer is not able to
+** provide synonyms for prefixes). However, a non-prefix query like '1st'
+** will match against "1st" and "first". This method does not require
+** extra disk space, as no extra entries are added to the FTS index.
+** On the other hand, it may require more CPU cycles to run MATCH queries,
+** as separate queries of the FTS index are required for each synonym.
+**
+** When using methods (2) or (3), it is important that the tokenizer only
+** provide synonyms when tokenizing document text (method (3)) or query
+** text (method (2)), not both. Doing so will not cause any errors, but is
+** inefficient.
*/
typedef struct Fts5Tokenizer Fts5Tokenizer;
typedef struct fts5_tokenizer fts5_tokenizer;
struct fts5_tokenizer {
int (*xCreate)(void*, const char **azArg, int nArg, Fts5Tokenizer **ppOut);
void (*xDelete)(Fts5Tokenizer*);
int (*xTokenize)(Fts5Tokenizer*,
void *pCtx,
+ int flags, /* Mask of FTS5_TOKENIZE_* flags */
const char *pText, int nText,
int (*xToken)(
void *pCtx, /* Copy of 2nd argument to xTokenize() */
+ int tflags, /* Mask of FTS5_TOKEN_* flags */
const char *pToken, /* Pointer to buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Byte offset of token within input text */
int iEnd /* Byte offset of end of token within input text */
)
);
};
+/* Flags that may be passed as the third argument to xTokenize() */
+#define FTS5_TOKENIZE_QUERY 0x0001
+#define FTS5_TOKENIZE_PREFIX 0x0002
+#define FTS5_TOKENIZE_DOCUMENT 0x0004
+#define FTS5_TOKENIZE_AUX 0x0008
+
+/* Flags that may be passed by the tokenizer implementation back to FTS5
+** as the second argument to the supplied xToken callback. */
+#define FTS5_TOKEN_COLOCATED 0x0001 /* Same position as prev. token */
+
/*
** END OF CUSTOM TOKENIZERS
*************************************************************************/
/*************************************************************************
** FTS5 EXTENSION REGISTRATION API
*/
typedef struct fts5_api fts5_api;
struct fts5_api {
- int iVersion; /* Currently always set to 1 */
+ int iVersion; /* Currently always set to 2 */
/* Create a new tokenizer */
int (*xCreateTokenizer)(
fts5_api *pApi,
const char *zName,
Index: ext/fts5/fts5Int.h
==================================================================
--- ext/fts5/fts5Int.h
+++ ext/fts5/fts5Int.h
@@ -164,13 +164,14 @@
int sqlite3Fts5ConfigDeclareVtab(Fts5Config *pConfig);
int sqlite3Fts5Tokenize(
Fts5Config *pConfig, /* FTS5 Configuration object */
+ int flags, /* FTS5_TOKENIZE_* flags */
const char *pText, int nText, /* Text to tokenize */
void *pCtx, /* Context passed to xToken() */
- int (*xToken)(void*, const char*, int, int, int) /* Callback */
+ int (*xToken)(void*, int, const char*, int, int, int) /* Callback */
);
void sqlite3Fts5Dequote(char *z);
/* Load the contents of the %_config table */
@@ -232,12 +233,14 @@
int iCol; /* If (iCol>=0), this column only */
const u8 *a; /* Position list to iterate through */
int n; /* Size of buffer at a[] in bytes */
int i; /* Current offset in a[] */
+ u8 bFlag; /* For client use (any custom purpose) */
+
/* Output variables */
- int bEof; /* Set to true at EOF */
+ u8 bEof; /* Set to true at EOF */
i64 iPos; /* (iCol<<32) + iPos */
};
int sqlite3Fts5PoslistReaderInit(
int iCol, /* If (iCol>=0), this column only */
const u8 *a, int n, /* Poslist buffer to iterate through */
@@ -379,13 +382,13 @@
*/
int sqlite3Fts5IndexErrcode(Fts5Index*);
void sqlite3Fts5IndexReset(Fts5Index*);
/*
-** Get or set the "averages" record.
+** Get or set the "averages" values.
*/
-int sqlite3Fts5IndexGetAverages(Fts5Index *p, Fts5Buffer *pBuf);
+int sqlite3Fts5IndexGetAverages(Fts5Index *p, i64 *pnRow, i64 *anSize);
int sqlite3Fts5IndexSetAverages(Fts5Index *p, const u8*, int);
/*
** Functions called by the storage module as part of integrity-check.
*/
@@ -594,11 +597,11 @@
int sqlite3Fts5ExprPhraseCount(Fts5Expr*);
int sqlite3Fts5ExprPhraseSize(Fts5Expr*, int iPhrase);
int sqlite3Fts5ExprPoslist(Fts5Expr*, int, const u8 **);
-int sqlite3Fts5ExprPhraseExpr(Fts5Config*, Fts5Expr*, int, Fts5Expr**);
+int sqlite3Fts5ExprClonePhrase(Fts5Config*, Fts5Expr*, int, Fts5Expr**);
/*******************************************
** The fts5_expr.c API above this point is used by the other hand-written
** C code in this module. The interfaces below this point are called by
** the parser code in fts5parse.y. */
Index: ext/fts5/fts5_aux.c
==================================================================
--- ext/fts5/fts5_aux.c
+++ ext/fts5/fts5_aux.c
@@ -146,18 +146,22 @@
/*
** Tokenizer callback used by implementation of highlight() function.
*/
static int fts5HighlightCb(
void *pContext, /* Pointer to HighlightContext object */
+ int tflags, /* Mask of FTS5_TOKEN_* flags */
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStartOff, /* Start offset of token */
int iEndOff /* End offset of token */
){
HighlightContext *p = (HighlightContext*)pContext;
int rc = SQLITE_OK;
- int iPos = p->iPos++;
+ int iPos;
+
+ if( tflags & FTS5_TOKEN_COLOCATED ) return SQLITE_OK;
+ iPos = p->iPos++;
if( p->iRangeEnd>0 ){
if( iPos<p->iRangeStart || iPos>p->iRangeEnd ) return SQLITE_OK;
if( p->iRangeStart && iPos==p->iRangeStart ) p->iOff = iStartOff;
}
Index: ext/fts5/fts5_config.c
==================================================================
--- ext/fts5/fts5_config.c
+++ ext/fts5/fts5_config.c
@@ -643,16 +643,19 @@
** because the callback returned another non-zero value, it is assumed
** to be an SQLite error code and returned to the caller.
*/
int sqlite3Fts5Tokenize(
Fts5Config *pConfig, /* FTS5 Configuration object */
+ int flags, /* FTS5_TOKENIZE_* flags */
const char *pText, int nText, /* Text to tokenize */
void *pCtx, /* Context passed to xToken() */
- int (*xToken)(void*, const char*, int, int, int) /* Callback */
+ int (*xToken)(void*, int, const char*, int, int, int) /* Callback */
){
if( pText==0 ) return SQLITE_OK;
- return pConfig->pTokApi->xTokenize(pConfig->pTok, pCtx, pText, nText, xToken);
+ return pConfig->pTokApi->xTokenize(
+ pConfig->pTok, pCtx, flags, pText, nText, xToken
+ );
}
/*
** Argument pIn points to the first character in what is expected to be
** a comma-separated list of SQL literals followed by a ')' character.
Index: ext/fts5/fts5_expr.c
==================================================================
--- ext/fts5/fts5_expr.c
+++ ext/fts5/fts5_expr.c
@@ -20,10 +20,12 @@
/*
** All token types in the generated fts5parse.h file are greater than 0.
*/
#define FTS5_EOF 0
+#define FTS5_LARGEST_INT64 (0xffffffff|(((i64)0x7fffffff)<<32))
+
typedef struct Fts5ExprTerm Fts5ExprTerm;
/*
** Functions generated by lemon from fts5parse.y.
*/
@@ -71,10 +73,11 @@
*/
struct Fts5ExprTerm {
int bPrefix; /* True for a prefix term */
char *zTerm; /* nul-terminated term */
Fts5IndexIter *pIter; /* Iterator for this term */
+ Fts5ExprTerm *pSynonym; /* Pointer to first in list of synonyms */
};
/*
** A phrase. One or more terms that must appear in a contiguous sequence
** within a document for it to match.
@@ -179,10 +182,14 @@
break;
}
default: {
const char *z2;
+ if( sqlite3Fts5IsBareword(z[0])==0 ){
+ sqlite3Fts5ParseError(pParse, "fts5: syntax error near \"%.1s\"", z);
+ return FTS5_EOF;
+ }
tok = FTS5_STRING;
for(z2=&z[1]; sqlite3Fts5IsBareword(*z2); z2++);
pToken->n = (z2 - z);
if( pToken->n==2 && memcmp(pToken->p, "OR", 2)==0 ) tok = FTS5_OR;
if( pToken->n==3 && memcmp(pToken->p, "NOT", 3)==0 ) tok = FTS5_NOT;
@@ -242,83 +249,10 @@
sqlite3_free(sParse.apPhrase);
*pzErr = sParse.zErr;
return sParse.rc;
}
-/*
-** Create a new FTS5 expression by cloning phrase iPhrase of the
-** expression passed as the second argument.
-*/
-int sqlite3Fts5ExprPhraseExpr(
- Fts5Config *pConfig,
- Fts5Expr *pExpr,
- int iPhrase,
- Fts5Expr **ppNew
-){
- int rc = SQLITE_OK; /* Return code */
- Fts5ExprPhrase *pOrig; /* The phrase extracted from pExpr */
- Fts5ExprPhrase *pCopy; /* Copy of pOrig */
- Fts5Expr *pNew = 0; /* Expression to return via *ppNew */
-
- pOrig = pExpr->apExprPhrase[iPhrase];
- pCopy = (Fts5ExprPhrase*)sqlite3Fts5MallocZero(&rc,
- sizeof(Fts5ExprPhrase) + sizeof(Fts5ExprTerm) * pOrig->nTerm
- );
- if( pCopy ){
- int i; /* Used to iterate through phrase terms */
- Fts5ExprPhrase **apPhrase;
- Fts5ExprNode *pNode;
- Fts5ExprNearset *pNear;
-
- pNew = (Fts5Expr*)sqlite3Fts5MallocZero(&rc, sizeof(Fts5Expr));
- apPhrase = (Fts5ExprPhrase**)sqlite3Fts5MallocZero(&rc,
- sizeof(Fts5ExprPhrase*)
- );
- pNode = (Fts5ExprNode*)sqlite3Fts5MallocZero(&rc, sizeof(Fts5ExprNode));
- pNear = (Fts5ExprNearset*)sqlite3Fts5MallocZero(&rc,
- sizeof(Fts5ExprNearset) + sizeof(Fts5ExprPhrase*)
- );
-
- for(i=0; inTerm; i++){
- pCopy->aTerm[i].zTerm = sqlite3Fts5Strndup(&rc, pOrig->aTerm[i].zTerm,-1);
- pCopy->aTerm[i].bPrefix = pOrig->aTerm[i].bPrefix;
- }
-
- if( rc==SQLITE_OK ){
- /* All the allocations succeeded. Put the expression object together. */
- pNew->pIndex = pExpr->pIndex;
- pNew->pRoot = pNode;
- pNew->nPhrase = 1;
- pNew->apExprPhrase = apPhrase;
- pNew->apExprPhrase[0] = pCopy;
-
- pNode->eType = (pOrig->nTerm==1 ? FTS5_TERM : FTS5_STRING);
- pNode->pNear = pNear;
-
- pNear->nPhrase = 1;
- pNear->apPhrase[0] = pCopy;
-
- pCopy->nTerm = pOrig->nTerm;
- pCopy->pNode = pNode;
- }else{
- /* At least one allocation failed. Free them all. */
- for(i=0; inTerm; i++){
- sqlite3_free(pCopy->aTerm[i].zTerm);
- }
- sqlite3_free(pCopy);
- sqlite3_free(pNear);
- sqlite3_free(pNode);
- sqlite3_free(apPhrase);
- sqlite3_free(pNew);
- pNew = 0;
- }
- }
-
- *ppNew = pNew;
- return rc;
-}
-
/*
** Free the expression node object passed as the only argument.
*/
void sqlite3Fts5ParseNodeFree(Fts5ExprNode *p){
if( p ){
@@ -347,10 +281,119 @@
for(i=0; i<pColset->nCol; i++){
if( pColset->aiCol[i]==iCol ) return 1;
}
return 0;
}
+
+/*
+** Argument pTerm must be a synonym iterator. Return the current rowid
+** that it points to.
+*/
+static i64 fts5ExprSynonymRowid(Fts5ExprTerm *pTerm, int bDesc, int *pbEof){
+ i64 iRet = 0;
+ int bRetValid = 0;
+ Fts5ExprTerm *p;
+
+ assert( pTerm->pSynonym );
+ assert( bDesc==0 || bDesc==1 );
+ for(p=pTerm; p; p=p->pSynonym){
+ if( 0==sqlite3Fts5IterEof(p->pIter) ){
+ i64 iRowid = sqlite3Fts5IterRowid(p->pIter);
+ if( bRetValid==0 || (bDesc!=(iRowidpSynonym );
+ for(p=pTerm; p; p=p->pSynonym){
+ Fts5IndexIter *pIter = p->pIter;
+ if( sqlite3Fts5IterEof(pIter)==0 && sqlite3Fts5IterRowid(pIter)==iRowid ){
+ const u8 *a;
+ int n;
+ i64 dummy;
+ rc = sqlite3Fts5IterPoslist(pIter, &a, &n, &dummy);
+ if( rc!=SQLITE_OK ) goto synonym_poslist_out;
+ if( nIter==nAlloc ){
+ int nByte = sizeof(Fts5PoslistReader) * nAlloc * 2;
+ Fts5PoslistReader *aNew = (Fts5PoslistReader*)sqlite3_malloc(nByte);
+ if( aNew==0 ){
+ rc = SQLITE_NOMEM;
+ goto synonym_poslist_out;
+ }
+ memcpy(aNew, aIter, sizeof(Fts5PoslistReader) * nIter);
+ nAlloc = nAlloc*2;
+ if( aIter!=aStatic ) sqlite3_free(aIter);
+ aIter = aNew;
+ }
+ sqlite3Fts5PoslistReaderInit(-1, a, n, &aIter[nIter]);
+ assert( aIter[nIter].bEof==0 );
+ nIter++;
+ }
+ }
+
+ assert( *pbDel==0 );
+ if( nIter==1 ){
+ *pa = (u8*)aIter[0].a;
+ *pn = aIter[0].n;
+ }else{
+ Fts5PoslistWriter writer = {0};
+ Fts5Buffer buf = {0,0,0};
+ i64 iPrev = -1;
+ while( 1 ){
+ int i;
+ i64 iMin = FTS5_LARGEST_INT64;
+ for(i=0; inTerm>(sizeof(aStatic) / sizeof(aStatic[0])) ){
int nByte = sizeof(Fts5PoslistReader) * pPhrase->nTerm;
aIter = (Fts5PoslistReader*)sqlite3_malloc(nByte);
if( !aIter ) return SQLITE_NOMEM;
}
+ memset(aIter, 0, sizeof(Fts5PoslistReader) * pPhrase->nTerm);
/* Initialize a term iterator for each term in the phrase */
for(i=0; i<pPhrase->nTerm; i++){
+ Fts5ExprTerm *pTerm = &pPhrase->aTerm[i];
i64 dummy;
- int n;
- const u8 *a;
- rc = sqlite3Fts5IterPoslist(pPhrase->aTerm[i].pIter, &a, &n, &dummy);
- if( rc || sqlite3Fts5PoslistReaderInit(iCol, a, n, &aIter[i]) ){
- goto ismatch_out;
+ int n = 0;
+ int bFlag = 0;
+ const u8 *a = 0;
+ if( pTerm->pSynonym ){
+ rc = fts5ExprSynonymPoslist(pTerm, pNode->iRowid, &bFlag, (u8**)&a, &n);
+ }else{
+ rc = sqlite3Fts5IterPoslist(pTerm->pIter, &a, &n, &dummy);
}
+ if( rc!=SQLITE_OK ) goto ismatch_out;
+ sqlite3Fts5PoslistReaderInit(iCol, a, n, &aIter[i]);
+ aIter[i].bFlag = bFlag;
+ if( aIter[i].bEof ) goto ismatch_out;
}
while( 1 ){
int bMatch;
i64 iPos = aIter[0].iPos;
@@ -429,10 +480,13 @@
}
}
ismatch_out:
*pbMatch = (pPhrase->poslist.n>0);
+ for(i=0; i<pPhrase->nTerm; i++){
+ if( aIter[i].bFlag ) sqlite3_free((u8*)aIter[i].a);
+ }
if( aIter!=aStatic ) sqlite3_free(aIter);
return rc;
}
typedef struct Fts5LookaheadReader Fts5LookaheadReader;
@@ -596,21 +650,59 @@
Fts5Expr *pExpr, /* Expression pPhrase belongs to */
Fts5ExprNode *pNode, /* FTS5_STRING or FTS5_TERM node */
int bFromValid,
i64 iFrom
){
- Fts5IndexIter *pIter = pNode->pNear->apPhrase[0]->aTerm[0].pIter;
+ Fts5ExprTerm *pTerm = &pNode->pNear->apPhrase[0]->aTerm[0];
int rc;
- assert( Fts5NodeIsString(pNode) );
- if( bFromValid ){
- rc = sqlite3Fts5IterNextFrom(pIter, iFrom);
+ if( pTerm->pSynonym ){
+ int bEof = 1;
+ Fts5ExprTerm *p;
+
+ /* Find the firstest rowid any synonym points to. */
+ i64 iRowid = fts5ExprSynonymRowid(pTerm, pExpr->bDesc, 0);
+
+ /* Advance each iterator that currently points to iRowid. Or, if iFrom
+ ** is valid - each iterator that points to a rowid before iFrom. */
+ for(p=pTerm; p; p=p->pSynonym){
+ if( sqlite3Fts5IterEof(p->pIter)==0 ){
+ i64 ii = sqlite3Fts5IterRowid(p->pIter);
+ if( ii==iRowid
+ || (bFromValid && ii!=iFrom && (ii>iFrom)==pExpr->bDesc)
+ ){
+ if( bFromValid ){
+ rc = sqlite3Fts5IterNextFrom(p->pIter, iFrom);
+ }else{
+ rc = sqlite3Fts5IterNext(p->pIter);
+ }
+ if( rc!=SQLITE_OK ) break;
+ if( sqlite3Fts5IterEof(p->pIter)==0 ){
+ bEof = 0;
+ }
+ }else{
+ bEof = 0;
+ }
+ }
+ }
+
+ /* Set the EOF flag if either all synonym iterators are at EOF or an
+ ** error has occurred. */
+ pNode->bEof = (rc || bEof);
}else{
- rc = sqlite3Fts5IterNext(pIter);
+ Fts5IndexIter *pIter = pTerm->pIter;
+
+ assert( Fts5NodeIsString(pNode) );
+ if( bFromValid ){
+ rc = sqlite3Fts5IterNextFrom(pIter, iFrom);
+ }else{
+ rc = sqlite3Fts5IterNext(pIter);
+ }
+
+ pNode->bEof = (rc || sqlite3Fts5IterEof(pIter));
}
- pNode->bEof = (rc || sqlite3Fts5IterEof(pIter));
return rc;
}
/*
** Advance iterator pIter until it points to a value equal to or laster
@@ -644,10 +736,39 @@
}
*piLast = iRowid;
return 0;
}
+
+static int fts5ExprSynonymAdvanceto(
+ Fts5ExprTerm *pTerm, /* Term iterator to advance */
+ int bDesc, /* True if iterator is "rowid DESC" */
+ i64 *piLast, /* IN/OUT: Lastest rowid seen so far */
+ int *pRc /* OUT: Error code */
+){
+ int rc = SQLITE_OK;
+ i64 iLast = *piLast;
+ Fts5ExprTerm *p;
+ int bEof = 0;
+
+ for(p=pTerm; rc==SQLITE_OK && p; p=p->pSynonym){
+ if( sqlite3Fts5IterEof(p->pIter)==0 ){
+ i64 iRowid = sqlite3Fts5IterRowid(p->pIter);
+ if( (bDesc==0 && iLast>iRowid) || (bDesc && iLastpIter, iLast);
+ }
+ }
+ }
+
+ if( rc!=SQLITE_OK ){
+ *pRc = rc;
+ bEof = 1;
+ }else{
+ *piLast = fts5ExprSynonymRowid(pTerm, bDesc, &bEof);
+ }
+ return bEof;
+}
/*
** IN/OUT parameter (*pa) points to a position list n bytes in size. If
** the position list contains entries for column iCol, then (*pa) is set
** to point to the sub-position-list for that column and the number of
@@ -715,13 +836,13 @@
/* Check that each phrase in the nearset matches the current row.
** Populate the pPhrase->poslist buffers at the same time. If any
** phrase is not a match, break out of the loop early. */
for(i=0; rc==SQLITE_OK && i<pNear->nPhrase; i++){
Fts5ExprPhrase *pPhrase = pNear->apPhrase[i];
- if( pPhrase->nTerm>1 || pNear->pColset ){
+ if( pPhrase->nTerm>1 || pPhrase->aTerm[0].pSynonym || pNear->pColset ){
int bMatch = 0;
- rc = fts5ExprPhraseIsMatch(pExpr, pNear->pColset, pPhrase, &bMatch);
+ rc = fts5ExprPhraseIsMatch(pNode, pNear->pColset, pPhrase, &bMatch);
if( bMatch==0 ) break;
}else{
rc = sqlite3Fts5IterPoslistBuffer(
pPhrase->aTerm[0].pIter, &pPhrase->poslist
);
@@ -753,10 +874,11 @@
int nPos;
int rc;
assert( pNode->eType==FTS5_TERM );
assert( pNear->nPhrase==1 && pPhrase->nTerm==1 );
+ assert( pPhrase->aTerm[0].pSynonym==0 );
rc = sqlite3Fts5IterPoslist(pIter, &pPos, &nPos, &pNode->iRowid);
/* If the term may match any column, then this must be a match.
** Return immediately in this case. Otherwise, try to find the
@@ -799,73 +921,103 @@
Fts5ExprPhrase *pLeft = pNear->apPhrase[0];
int rc = SQLITE_OK;
i64 iLast; /* Lastest rowid any iterator points to */
int i, j; /* Phrase and token index, respectively */
int bMatch; /* True if all terms are at the same rowid */
+ const int bDesc = pExpr->bDesc;
- assert( pNear->nPhrase>1 || pNear->apPhrase[0]->nTerm>1 );
+ /* Check that this node should not be FTS5_TERM */
+ assert( pNear->nPhrase>1
+ || pNear->apPhrase[0]->nTerm>1
+ || pNear->apPhrase[0]->aTerm[0].pSynonym
+ );
/* Initialize iLast, the "lastest" rowid any iterator points to. If the
** iterator skips through rowids in the default ascending order, this means
** the maximum rowid. Or, if the iterator is "ORDER BY rowid DESC", then it
** means the minimum rowid. */
- iLast = sqlite3Fts5IterRowid(pLeft->aTerm[0].pIter);
+ if( pLeft->aTerm[0].pSynonym ){
+ iLast = fts5ExprSynonymRowid(&pLeft->aTerm[0], bDesc, 0);
+ }else{
+ iLast = sqlite3Fts5IterRowid(pLeft->aTerm[0].pIter);
+ }
do {
bMatch = 1;
for(i=0; i<pNear->nPhrase; i++){
Fts5ExprPhrase *pPhrase = pNear->apPhrase[i];
for(j=0; j<pPhrase->nTerm; j++){
- Fts5IndexIter *pIter = pPhrase->aTerm[j].pIter;
- i64 iRowid = sqlite3Fts5IterRowid(pIter);
- if( iRowid!=iLast ) bMatch = 0;
- if( fts5ExprAdvanceto(pIter, pExpr->bDesc, &iLast,&rc,&pNode->bEof) ){
- return rc;
+ Fts5ExprTerm *pTerm = &pPhrase->aTerm[j];
+ if( pTerm->pSynonym ){
+ Fts5ExprTerm *p;
+ int bEof = 1;
+ i64 iRowid = fts5ExprSynonymRowid(pTerm, bDesc, 0);
+ if( iRowid==iLast ) continue;
+ bMatch = 0;
+ if( fts5ExprSynonymAdvanceto(pTerm, bDesc, &iLast, &rc) ){
+ pNode->bEof = 1;
+ return rc;
+ }
+ }else{
+ Fts5IndexIter *pIter = pPhrase->aTerm[j].pIter;
+ i64 iRowid = sqlite3Fts5IterRowid(pIter);
+ if( iRowid==iLast ) continue;
+ bMatch = 0;
+ if( fts5ExprAdvanceto(pIter, bDesc, &iLast, &rc, &pNode->bEof) ){
+ return rc;
+ }
}
}
}
}while( bMatch==0 );
- pNode->bNomatch = (0==fts5ExprNearTest(&rc, pExpr, pNode));
pNode->iRowid = iLast;
+ pNode->bNomatch = (0==fts5ExprNearTest(&rc, pExpr, pNode));
return rc;
}
/*
** Initialize all term iterators in the pNear object. If any term is found
-** to match no documents at all, set *pbEof to true and return immediately,
-** without initializing any further iterators.
+** to match no documents at all, return immediately without initializing any
+** further iterators.
*/
static int fts5ExprNearInitAll(
Fts5Expr *pExpr,
Fts5ExprNode *pNode
){
Fts5ExprNearset *pNear = pNode->pNear;
- Fts5ExprTerm *pTerm;
- Fts5ExprPhrase *pPhrase;
int i, j;
int rc = SQLITE_OK;
for(i=0; rc==SQLITE_OK && i<pNear->nPhrase; i++){
- pPhrase = pNear->apPhrase[i];
+ Fts5ExprPhrase *pPhrase = pNear->apPhrase[i];
for(j=0; j<pPhrase->nTerm; j++){
- pTerm = &pPhrase->aTerm[j];
- if( pTerm->pIter ){
- sqlite3Fts5IterClose(pTerm->pIter);
- pTerm->pIter = 0;
- }
- rc = sqlite3Fts5IndexQuery(
- pExpr->pIndex, pTerm->zTerm, strlen(pTerm->zTerm),
- (pTerm->bPrefix ? FTS5INDEX_QUERY_PREFIX : 0) |
- (pExpr->bDesc ? FTS5INDEX_QUERY_DESC : 0),
- &pTerm->pIter
- );
- assert( rc==SQLITE_OK || pTerm->pIter==0 );
- if( pTerm->pIter==0 || sqlite3Fts5IterEof(pTerm->pIter) ){
+ Fts5ExprTerm *pTerm = &pPhrase->aTerm[j];
+ Fts5ExprTerm *p;
+ int bEof = 1;
+
+ for(p=pTerm; p && rc==SQLITE_OK; p=p->pSynonym){
+ if( p->pIter ){
+ sqlite3Fts5IterClose(p->pIter);
+ p->pIter = 0;
+ }
+ rc = sqlite3Fts5IndexQuery(
+ pExpr->pIndex, p->zTerm, strlen(p->zTerm),
+ (pTerm->bPrefix ? FTS5INDEX_QUERY_PREFIX : 0) |
+ (pExpr->bDesc ? FTS5INDEX_QUERY_DESC : 0),
+ &p->pIter
+ );
+ assert( rc==SQLITE_OK || p->pIter==0 );
+ if( p->pIter && 0==sqlite3Fts5IterEof(p->pIter) ){
+ bEof = 0;
+ }
+ }
+
+ if( bEof ){
pNode->bEof = 1;
- break;
+ return rc;
}
}
}
return rc;
@@ -1027,14 +1179,21 @@
rc = fts5ExprNearAdvanceFirst(pExpr, pNode, bFromValid, iFrom);
break;
};
case FTS5_TERM: {
- rc = fts5ExprNearAdvanceFirst(pExpr, pNode, bFromValid, iFrom);
- if( pNode->bEof==0 ){
+ Fts5IndexIter *pIter = pNode->pNear->apPhrase[0]->aTerm[0].pIter;
+ if( bFromValid ){
+ rc = sqlite3Fts5IterNextFrom(pIter, iFrom);
+ }else{
+ rc = sqlite3Fts5IterNext(pIter);
+ }
+ if( rc==SQLITE_OK && sqlite3Fts5IterEof(pIter)==0 ){
assert( rc==SQLITE_OK );
rc = fts5ExprTokenTest(pExpr, pNode);
+ }else{
+ pNode->bEof = 1;
}
return rc;
};
case FTS5_AND: {
@@ -1264,14 +1423,20 @@
*/
static void fts5ExprPhraseFree(Fts5ExprPhrase *pPhrase){
if( pPhrase ){
int i;
for(i=0; i<pPhrase->nTerm; i++){
+ Fts5ExprTerm *pSyn;
+ Fts5ExprTerm *pNext;
Fts5ExprTerm *pTerm = &pPhrase->aTerm[i];
sqlite3_free(pTerm->zTerm);
- if( pTerm->pIter ){
- sqlite3Fts5IterClose(pTerm->pIter);
+ sqlite3Fts5IterClose(pTerm->pIter);
+
+ for(pSyn=pTerm->pSynonym; pSyn; pSyn=pNext){
+ pNext = pSyn->pSynonym;
+ sqlite3Fts5IterClose(pSyn->pIter);
+ sqlite3_free(pSyn);
}
}
if( pPhrase->poslist.nSpace>0 ) fts5BufferFree(&pPhrase->poslist);
sqlite3_free(pPhrase);
}
@@ -1329,45 +1494,72 @@
}
typedef struct TokenCtx TokenCtx;
struct TokenCtx {
Fts5ExprPhrase *pPhrase;
+ int rc;
};
/*
** Callback for tokenizing terms used by ParseTerm().
*/
static int fts5ParseTokenize(
void *pContext, /* Pointer to Fts5InsertCtx object */
+ int tflags, /* Mask of FTS5_TOKEN_* flags */
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
- int iStart, /* Start offset of token */
- int iEnd /* End offset of token */
+ int iUnused1, /* Start offset of token */
+ int iUnused2 /* End offset of token */
){
int rc = SQLITE_OK;
const int SZALLOC = 8;
TokenCtx *pCtx = (TokenCtx*)pContext;
Fts5ExprPhrase *pPhrase = pCtx->pPhrase;
- Fts5ExprTerm *pTerm;
-
- if( pPhrase==0 || (pPhrase->nTerm % SZALLOC)==0 ){
- Fts5ExprPhrase *pNew;
- int nNew = SZALLOC + (pPhrase ? pPhrase->nTerm : 0);
-
- pNew = (Fts5ExprPhrase*)sqlite3_realloc(pPhrase,
- sizeof(Fts5ExprPhrase) + sizeof(Fts5ExprTerm) * nNew
- );
- if( pNew==0 ) return SQLITE_NOMEM;
- if( pPhrase==0 ) memset(pNew, 0, sizeof(Fts5ExprPhrase));
- pCtx->pPhrase = pPhrase = pNew;
- pNew->nTerm = nNew - SZALLOC;
- }
-
- pTerm = &pPhrase->aTerm[pPhrase->nTerm++];
- memset(pTerm, 0, sizeof(Fts5ExprTerm));
- pTerm->zTerm = sqlite3Fts5Strndup(&rc, pToken, nToken);
-
+
+ /* If an error has already occurred, this is a no-op */
+ if( pCtx->rc!=SQLITE_OK ) return pCtx->rc;
+
+ assert( pPhrase==0 || pPhrase->nTerm>0 );
+ if( pPhrase && (tflags & FTS5_TOKEN_COLOCATED) ){
+ Fts5ExprTerm *pSyn;
+ int nByte = sizeof(Fts5ExprTerm) + nToken+1;
+ pSyn = (Fts5ExprTerm*)sqlite3_malloc(nByte);
+ if( pSyn==0 ){
+ rc = SQLITE_NOMEM;
+ }else{
+ memset(pSyn, 0, nByte);
+ pSyn->zTerm = (char*)&pSyn[1];
+ memcpy(pSyn->zTerm, pToken, nToken);
+ pSyn->pSynonym = pPhrase->aTerm[pPhrase->nTerm-1].pSynonym;
+ pPhrase->aTerm[pPhrase->nTerm-1].pSynonym = pSyn;
+ }
+ }else{
+ Fts5ExprTerm *pTerm;
+ if( pPhrase==0 || (pPhrase->nTerm % SZALLOC)==0 ){
+ Fts5ExprPhrase *pNew;
+ int nNew = SZALLOC + (pPhrase ? pPhrase->nTerm : 0);
+
+ pNew = (Fts5ExprPhrase*)sqlite3_realloc(pPhrase,
+ sizeof(Fts5ExprPhrase) + sizeof(Fts5ExprTerm) * nNew
+ );
+ if( pNew==0 ){
+ rc = SQLITE_NOMEM;
+ }else{
+ if( pPhrase==0 ) memset(pNew, 0, sizeof(Fts5ExprPhrase));
+ pCtx->pPhrase = pPhrase = pNew;
+ pNew->nTerm = nNew - SZALLOC;
+ }
+ }
+
+ if( rc==SQLITE_OK ){
+ pTerm = &pPhrase->aTerm[pPhrase->nTerm++];
+ memset(pTerm, 0, sizeof(Fts5ExprTerm));
+ pTerm->zTerm = sqlite3Fts5Strndup(&rc, pToken, nToken);
+ }
+ }
+
+ pCtx->rc = rc;
return rc;
}
/*
@@ -1415,15 +1607,18 @@
memset(&sCtx, 0, sizeof(TokenCtx));
sCtx.pPhrase = pAppend;
rc = fts5ParseStringFromToken(pToken, &z);
if( rc==SQLITE_OK ){
+ /* The text is always tokenized as a query. When bPrefix is set, also
+ ** pass FTS5_TOKENIZE_PREFIX so the tokenizer receives the
+ ** (QUERY | PREFIX) combination rather than plain QUERY. */
+ int flags = FTS5_TOKENIZE_QUERY | (bPrefix ? FTS5_TOKENIZE_PREFIX : 0);
+ int n;
sqlite3Fts5Dequote(z);
- rc = sqlite3Fts5Tokenize(pConfig, z, strlen(z), &sCtx, fts5ParseTokenize);
+ n = (int)strlen(z);
+ rc = sqlite3Fts5Tokenize(pConfig, flags, z, n, &sCtx, fts5ParseTokenize);
}
sqlite3_free(z);
- if( rc ){
+ if( rc || (rc = sCtx.rc) ){
pParse->rc = rc;
fts5ExprPhraseFree(sCtx.pPhrase);
sCtx.pPhrase = 0;
}else if( sCtx.pPhrase ){
@@ -1447,10 +1642,87 @@
sCtx.pPhrase->aTerm[sCtx.pPhrase->nTerm-1].bPrefix = bPrefix;
}
return sCtx.pPhrase;
}
+
+/*
+** Create a new FTS5 expression by cloning phrase iPhrase of the
+** expression passed as the second argument.
+**
+** The clone is built by passing each term of the original phrase, followed
+** by each entry on that term's pSynonym list, to fts5ParseTokenize().
+** The term itself is passed with no flags; its synonyms are passed with
+** FTS5_TOKEN_COLOCATED so that they are attached to the term just added
+** instead of being appended as new terms. The bPrefix flag of each term
+** is copied across.
+**
+** If successful, SQLITE_OK is returned and *ppNew is set to point to the
+** new expression. Otherwise, an SQLite error code is returned and *ppNew
+** is set to NULL.
+*/
+int sqlite3Fts5ExprClonePhrase(
+  Fts5Config *pConfig,
+  Fts5Expr *pExpr,
+  int iPhrase,
+  Fts5Expr **ppNew
+){
+  int rc = SQLITE_OK;             /* Return code */
+  Fts5ExprPhrase *pOrig;          /* The phrase extracted from pExpr */
+  int i;                          /* Used to iterate through phrase terms */
+  Fts5Expr *pNew = 0;             /* Expression to return via *ppNew */
+  TokenCtx sCtx = {0,0};          /* Context object for fts5ParseTokenize */
+
+  pOrig = pExpr->apExprPhrase[iPhrase];
+
+  /* Allocate the expression object and the fixed-size structures that
+  ** hang off it. Each step is skipped if an earlier one failed. */
+  pNew = (Fts5Expr*)sqlite3Fts5MallocZero(&rc, sizeof(Fts5Expr));
+  if( rc==SQLITE_OK ){
+    pNew->apExprPhrase = (Fts5ExprPhrase**)sqlite3Fts5MallocZero(&rc,
+        sizeof(Fts5ExprPhrase*));
+  }
+  if( rc==SQLITE_OK ){
+    pNew->pRoot = (Fts5ExprNode*)sqlite3Fts5MallocZero(&rc,
+        sizeof(Fts5ExprNode));
+  }
+  if( rc==SQLITE_OK ){
+    pNew->pRoot->pNear = (Fts5ExprNearset*)sqlite3Fts5MallocZero(&rc,
+        sizeof(Fts5ExprNearset) + sizeof(Fts5ExprPhrase*));
+  }
+
+  /* Rebuild the phrase in sCtx.pPhrase, one term (plus synonyms) at a
+  ** time. */
+  for(i=0; rc==SQLITE_OK && i<pOrig->nTerm; i++){
+    int tflags = 0;
+    Fts5ExprTerm *p;
+    for(p=&pOrig->aTerm[i]; p && rc==SQLITE_OK; p=p->pSynonym){
+      const char *zTerm = p->zTerm;
+      rc = fts5ParseTokenize(
+          (void*)&sCtx, tflags, zTerm, (int)strlen(zTerm), 0, 0
+      );
+      tflags = FTS5_TOKEN_COLOCATED;
+    }
+    if( rc==SQLITE_OK ){
+      sCtx.pPhrase->aTerm[i].bPrefix = pOrig->aTerm[i].bPrefix;
+    }
+  }
+
+  if( rc==SQLITE_OK && sCtx.pPhrase==0 ){
+    /* pOrig contains no terms, so fts5ParseTokenize() was never invoked
+    ** and no phrase object exists yet. Allocate an empty one so that the
+    ** code below does not dereference a NULL pointer. */
+    sCtx.pPhrase = (Fts5ExprPhrase*)sqlite3Fts5MallocZero(&rc,
+        sizeof(Fts5ExprPhrase));
+  }
+
+  if( rc==SQLITE_OK ){
+    /* All the allocations succeeded. Put the expression object together. */
+    pNew->pIndex = pExpr->pIndex;
+    pNew->nPhrase = 1;
+    pNew->apExprPhrase[0] = sCtx.pPhrase;
+    pNew->pRoot->pNear->apPhrase[0] = sCtx.pPhrase;
+    pNew->pRoot->pNear->nPhrase = 1;
+    sCtx.pPhrase->pNode = pNew->pRoot;
+
+    /* A node is an FTS5_TERM only if it consists of exactly one term that
+    ** has no synonyms. Anything else is an FTS5_STRING. */
+    if( pOrig->nTerm==1 && pOrig->aTerm[0].pSynonym==0 ){
+      pNew->pRoot->eType = FTS5_TERM;
+    }else{
+      pNew->pRoot->eType = FTS5_STRING;
+    }
+  }else{
+    /* The phrase has not been linked into pNew at this point, so it must
+    ** be freed separately. */
+    sqlite3Fts5ExprFree(pNew);
+    fts5ExprPhraseFree(sCtx.pPhrase);
+    pNew = 0;
+  }
+
+  *ppNew = pNew;
+  return rc;
+}
+
/*
** Token pTok has appeared in a MATCH expression where the NEAR operator
** is expected. If token pTok does not contain "NEAR", store an error
** in the pParse object.
@@ -1628,11 +1900,14 @@
if( eType==FTS5_STRING ){
int iPhrase;
for(iPhrase=0; iPhrase<pNear->nPhrase; iPhrase++){
pNear->apPhrase[iPhrase]->pNode = pRet;
}
- if( pNear->nPhrase==1 && pNear->apPhrase[0]->nTerm==1 ){
+ if( pNear->nPhrase==1
+ && pNear->apPhrase[0]->nTerm==1
+ && pNear->apPhrase[0]->aTerm[0].pSynonym==0
+ ){
pRet->eType = FTS5_TERM;
}
}else{
fts5ExprAddChildren(pRet, pLeft);
fts5ExprAddChildren(pRet, pRight);
@@ -1648,20 +1923,32 @@
}
return pRet;
}
static char *fts5ExprTermPrint(Fts5ExprTerm *pTerm){
- char *zQuoted = sqlite3_malloc(strlen(pTerm->zTerm) * 2 + 3 + 2);
+ int nByte = 0;
+ Fts5ExprTerm *p;
+ char *zQuoted;
+
+ /* Determine the maximum amount of space required. Each entry in the
+ ** synonym chain is sized from its own text: 2 bytes per input byte (every
+ ** byte could be a '"' that must be doubled), 2 enclosing quotes, 1 '|'
+ ** separator, and slack that covers the trailing " *" and nul-terminator. */
+ for(p=pTerm; p; p=p->pSynonym){
+ nByte += (int)strlen(p->zTerm) * 2 + 3 + 2;
+ }
+ zQuoted = sqlite3_malloc(nByte);
+
if( zQuoted ){
int i = 0;
- char *zIn = pTerm->zTerm;
- zQuoted[i++] = '"';
- while( *zIn ){
- if( *zIn=='"' ) zQuoted[i++] = '"';
- zQuoted[i++] = *zIn++;
+ for(p=pTerm; p; p=p->pSynonym){
+ char *zIn = p->zTerm;
+ zQuoted[i++] = '"';
+ while( *zIn ){
+ if( *zIn=='"' ) zQuoted[i++] = '"';
+ zQuoted[i++] = *zIn++;
+ }
+ zQuoted[i++] = '"';
+ if( p->pSynonym ) zQuoted[i++] = '|';
}
- zQuoted[i++] = '"';
if( pTerm->bPrefix ){
zQuoted[i++] = ' ';
zQuoted[i++] = '*';
}
zQuoted[i++] = '\0';
Index: ext/fts5/fts5_index.c
==================================================================
--- ext/fts5/fts5_index.c
+++ ext/fts5/fts5_index.c
@@ -291,11 +291,10 @@
typedef struct Fts5Data Fts5Data;
typedef struct Fts5DlidxIter Fts5DlidxIter;
typedef struct Fts5DlidxLvl Fts5DlidxLvl;
typedef struct Fts5DlidxWriter Fts5DlidxWriter;
-typedef struct Fts5NodeIter Fts5NodeIter;
typedef struct Fts5PageWriter Fts5PageWriter;
typedef struct Fts5SegIter Fts5SegIter;
typedef struct Fts5DoclistIter Fts5DoclistIter;
typedef struct Fts5SegWriter Fts5SegWriter;
typedef struct Fts5Structure Fts5Structure;
@@ -524,28 +523,10 @@
Fts5CResult *aFirst; /* Current merge state (see above) */
Fts5SegIter aSeg[1]; /* Array of segment iterators */
};
-/*
-** Object for iterating through the conents of a single internal node in
-** memory.
-*/
-struct Fts5NodeIter {
- /* Internal. Set and managed by fts5NodeIterXXX() functions. Except,
- ** the EOF test for the iterator is (Fts5NodeIter.aData==0). */
- const u8 *aData;
- int nData;
- int iOff;
-
- /* Output variables */
- Fts5Buffer term;
- int nEmpty;
- int iChild;
- int bDlidx;
-};
-
/*
** An instance of the following type is used to iterate through the contents
** of a doclist-index record.
**
** pData:
@@ -571,27 +552,10 @@
int nLvl;
int iSegid;
Fts5DlidxLvl aLvl[1];
};
-
-
-/*
-** The first argument passed to this macro is a pointer to an Fts5Buffer
-** object.
-*/
-#define fts5BufferSize(pBuf,n) { \
- if( pBuf->nSpacep, n); \
- if( pNew==0 ){ \
- sqlite3_free(pBuf->p); \
- } \
- pBuf->nSpace = n; \
- pBuf->p = pNew; \
- } \
-}
-
static void fts5PutU16(u8 *aOut, u16 iVal){
aOut[0] = (iVal>>8);
aOut[1] = (iVal&0xFF);
}
@@ -615,19 +579,20 @@
** Return -ve if pLeft is smaller than pRight, 0 if they are equal or
** +ve if pRight is smaller than pLeft. In other words:
**
** res = *pLeft - *pRight
*/
+#ifdef SQLITE_DEBUG
static int fts5BufferCompareBlob(
Fts5Buffer *pLeft, /* Left hand side of comparison */
const u8 *pRight, int nRight /* Right hand side of comparison */
){
int nCmp = MIN(pLeft->n, nRight);
int res = memcmp(pLeft->p, pRight, nCmp);
return (res==0 ? (pLeft->n - nRight) : res);
}
-
+#endif
/*
** Compare the contents of the two buffers using memcmp(). If one buffer
** is a prefix of the other, it is considered the lesser.
**
@@ -663,15 +628,18 @@
p->pReader = 0;
sqlite3_blob_close(pReader);
}
}
-static Fts5Data *fts5DataReadOrBuffer(
- Fts5Index *p,
- Fts5Buffer *pBuf,
- i64 iRowid
-){
+
+/*
+** Retrieve a record from the %_data table.
+**
+** If an error occurs, NULL is returned and an error left in the
+** Fts5Index object.
+*/
+static Fts5Data *fts5DataRead(Fts5Index *p, i64 iRowid){
Fts5Data *pRet = 0;
if( p->rc==SQLITE_OK ){
int rc = SQLITE_OK;
if( p->pReader ){
@@ -687,12 +655,12 @@
fts5CloseReader(p);
}
if( rc==SQLITE_ABORT ) rc = SQLITE_OK;
}
- /* If the blob handle is not yet open, open and seek it. Otherwise, use
- ** the blob_reopen() API to reseek the existing blob handle. */
+ /* If the blob handle is not open at this point, open it and seek
+ ** to the requested entry. */
if( p->pReader==0 && rc==SQLITE_OK ){
Fts5Config *pConfig = p->pConfig;
rc = sqlite3_blob_open(pConfig->db,
pConfig->zDb, p->zDataTbl, "block", iRowid, 0, &p->pReader
);
@@ -706,26 +674,17 @@
if( rc==SQLITE_ERROR ) rc = FTS5_CORRUPT;
if( rc==SQLITE_OK ){
u8 *aOut = 0; /* Read blob data into this buffer */
int nByte = sqlite3_blob_bytes(p->pReader);
- if( pBuf ){
- fts5BufferSize(pBuf, MAX(nByte, p->pConfig->pgsz) + 20);
- pBuf->n = nByte;
- aOut = pBuf->p;
- if( aOut==0 ){
- rc = SQLITE_NOMEM;
- }
- }else{
- int nSpace = nByte + FTS5_DATA_PADDING;
- pRet = (Fts5Data*)sqlite3_malloc(nSpace+sizeof(Fts5Data));
- if( pRet ){
- pRet->n = nByte;
- aOut = pRet->p = (u8*)&pRet[1];
- }else{
- rc = SQLITE_NOMEM;
- }
+ int nAlloc = sizeof(Fts5Data) + nByte + FTS5_DATA_PADDING;
+ pRet = (Fts5Data*)sqlite3_malloc(nAlloc);
+ if( pRet ){
+ pRet->n = nByte;
+ aOut = pRet->p = (u8*)&pRet[1];
+ }else{
+ rc = SQLITE_NOMEM;
}
if( rc==SQLITE_OK ){
rc = sqlite3_blob_read(p->pReader, aOut, nByte, 0);
}
@@ -736,37 +695,14 @@
}
p->rc = rc;
p->nRead++;
}
- return pRet;
-}
-
-/*
-** Retrieve a record from the %_data table.
-**
-** If an error occurs, NULL is returned and an error left in the
-** Fts5Index object.
-*/
-static Fts5Data *fts5DataRead(Fts5Index *p, i64 iRowid){
- Fts5Data *pRet = fts5DataReadOrBuffer(p, 0, iRowid);
assert( (pRet==0)==(p->rc!=SQLITE_OK) );
return pRet;
}
-/*
-** Read a record from the %_data table into the buffer supplied as the
-** second argument.
-**
-** If an error occurs, an error is left in the Fts5Index object. If an
-** error has already occurred when this function is called, it is a
-** no-op.
-*/
-static void fts5DataBuffer(Fts5Index *p, Fts5Buffer *pBuf, i64 iRowid){
- (void)fts5DataReadOrBuffer(p, pBuf, iRowid);
-}
-
/*
** Release a reference to data record returned by an earlier call to
** fts5DataRead().
*/
static void fts5DataRelease(Fts5Data *pData){
@@ -1031,23 +967,22 @@
*/
static Fts5Structure *fts5StructureRead(Fts5Index *p){
Fts5Config *pConfig = p->pConfig;
Fts5Structure *pRet = 0; /* Object to return */
int iCookie; /* Configuration cookie */
+ Fts5Data *pData;
Fts5Buffer buf = {0, 0, 0};
- fts5DataBuffer(p, &buf, FTS5_STRUCTURE_ROWID);
- if( buf.p==0 ) return 0;
- assert( buf.nSpace>=(buf.n + FTS5_DATA_ZERO_PADDING) );
- memset(&buf.p[buf.n], 0, FTS5_DATA_ZERO_PADDING);
- p->rc = fts5StructureDecode(buf.p, buf.n, &iCookie, &pRet);
-
+ pData = fts5DataRead(p, FTS5_STRUCTURE_ROWID);
+ if( p->rc ) return 0;
+ memset(&pData->p[pData->n], 0, FTS5_DATA_PADDING);
+ p->rc = fts5StructureDecode(pData->p, pData->n, &iCookie, &pRet);
if( p->rc==SQLITE_OK && pConfig->iCookie!=iCookie ){
p->rc = sqlite3Fts5ConfigLoad(pConfig, iCookie);
}
- fts5BufferFree(&buf);
+ fts5DataRelease(pData);
if( p->rc!=SQLITE_OK ){
fts5StructureRelease(pRet);
pRet = 0;
}
return pRet;
@@ -1226,66 +1161,10 @@
fts5StructurePromoteTo(p, iPromote, szPromote, pStruct);
}
}
-/*
-** If the pIter->iOff offset currently points to an entry indicating one
-** or more term-less nodes, advance past it and set pIter->nEmpty to
-** the number of empty child nodes.
-*/
-static void fts5NodeIterGobbleNEmpty(Fts5NodeIter *pIter){
- if( pIter->iOffnData && 0==(pIter->aData[pIter->iOff] & 0xfe) ){
- pIter->bDlidx = pIter->aData[pIter->iOff] & 0x01;
- pIter->iOff++;
- pIter->iOff += fts5GetVarint32(&pIter->aData[pIter->iOff], pIter->nEmpty);
- }else{
- pIter->nEmpty = 0;
- pIter->bDlidx = 0;
- }
-}
-
-/*
-** Advance to the next entry within the node.
-*/
-static void fts5NodeIterNext(int *pRc, Fts5NodeIter *pIter){
- if( pIter->iOff>=pIter->nData ){
- pIter->aData = 0;
- pIter->iChild += pIter->nEmpty;
- }else{
- int nPre, nNew;
- pIter->iOff += fts5GetVarint32(&pIter->aData[pIter->iOff], nPre);
- pIter->iOff += fts5GetVarint32(&pIter->aData[pIter->iOff], nNew);
- pIter->term.n = nPre-2;
- fts5BufferAppendBlob(pRc, &pIter->term, nNew, pIter->aData+pIter->iOff);
- pIter->iOff += nNew;
- pIter->iChild += (1 + pIter->nEmpty);
- fts5NodeIterGobbleNEmpty(pIter);
- if( *pRc ) pIter->aData = 0;
- }
-}
-
-
-/*
-** Initialize the iterator object pIter to iterate through the internal
-** segment node in pData.
-*/
-static void fts5NodeIterInit(const u8 *aData, int nData, Fts5NodeIter *pIter){
- memset(pIter, 0, sizeof(*pIter));
- pIter->aData = aData;
- pIter->nData = nData;
- pIter->iOff = fts5GetVarint32(aData, pIter->iChild);
- fts5NodeIterGobbleNEmpty(pIter);
-}
-
-/*
-** Free any memory allocated by the iterator object.
-*/
-static void fts5NodeIterFree(Fts5NodeIter *pIter){
- fts5BufferFree(&pIter->term);
-}
-
/*
** Advance the iterator passed as the only argument. If the end of the
** doclist-index page is reached, return non-zero.
*/
static int fts5DlidxLvlNext(Fts5DlidxLvl *pLvl){
@@ -2039,123 +1918,10 @@
}
pIter->pDlidx = fts5DlidxIterInit(p, bRev, iSeg, pIter->iTermLeafPgno);
}
-#ifdef SQLITE_DEBUG
-static void fts5AssertNodeSeekOk(
- Fts5Buffer *pNode,
- const u8 *pTerm, int nTerm, /* Term to search for */
- int iExpectPg,
- int bExpectDlidx
-){
- int bDlidx;
- int iPg;
- int rc = SQLITE_OK;
- Fts5NodeIter node;
-
- fts5NodeIterInit(pNode->p, pNode->n, &node);
- assert( node.term.n==0 );
- iPg = node.iChild;
- bDlidx = node.bDlidx;
- for(fts5NodeIterNext(&rc, &node);
- node.aData && fts5BufferCompareBlob(&node.term, pTerm, nTerm)<=0;
- fts5NodeIterNext(&rc, &node)
- ){
- iPg = node.iChild;
- bDlidx = node.bDlidx;
- }
- fts5NodeIterFree(&node);
-
- assert( rc!=SQLITE_OK || iPg==iExpectPg );
- assert( rc!=SQLITE_OK || bDlidx==bExpectDlidx );
-}
-#else
-#define fts5AssertNodeSeekOk(v,w,x,y,z)
-#endif
-
-/*
-** Argument pNode is an internal b-tree node. This function searches
-** within the node for the largest term that is smaller than or equal
-** to (pTerm/nTerm).
-**
-** It returns the associated page number. Or, if (pTerm/nTerm) is smaller
-** than all terms within the node, the leftmost child page number.
-**
-** Before returning, (*pbDlidx) is set to true if the last term on the
-** returned child page number has a doclist-index. Or left as is otherwise.
-*/
-static int fts5NodeSeek(
- Fts5Buffer *pNode, /* Node to search */
- const u8 *pTerm, int nTerm, /* Term to search for */
- int *pbDlidx /* OUT: True if dlidx flag is set */
-){
- int iPg;
- u8 *pPtr = pNode->p;
- u8 *pEnd = &pPtr[pNode->n];
- int nMatch = 0; /* Number of bytes of pTerm already matched */
-
- assert( *pbDlidx==0 );
-
- pPtr += fts5GetVarint32(pPtr, iPg);
- while( pPtr=pEnd ) break;
- }
-
- /* Read the next "term" pointer. Set nKeep to the number of bytes to
- ** keep from the previous term, and nNew to the number of bytes of
- ** new data that will be appended to it. */
- nKeep = (int)*pPtr++;
- nNew = (int)*pPtr++;
- if( (nKeep | nNew) & 0x0080 ){
- pPtr -= 2;
- pPtr += fts5GetVarint32(pPtr, nKeep);
- pPtr += fts5GetVarint32(pPtr, nNew);
- }
- nKeep -= 2;
-
- /* Compare (pTerm/nTerm) to the current term on the node (the one described
- ** by nKeep/nNew). If the node term is larger, break out of the while()
- ** loop.
- **
- ** Otherwise, if (pTerm/nTerm) is larger or the two terms are equal,
- ** leave variable nMatch set to the size of the largest prefix common to
- ** both terms in bytes. */
- if( nKeep==nMatch ){
- int nTst = MIN(nNew, nTerm-nMatch);
- int i;
- for(i=0; i pTerm[nMatch]) ) break;
- }else if( nKeeprc==SQLITE_OK ){
+ do{
if( bMove ) fts5SegIterNext(p, pIter, 0);
if( pIter->pLeaf==0 ) break;
if( bRev==0 && pIter->iRowid>=iMatch ) break;
if( bRev!=0 && pIter->iRowid<=iMatch ) break;
bMove = 1;
- }
+ }while( p->rc==SQLITE_OK );
}
/*
** Free the iterator object passed as the second argument.
@@ -4457,17 +4223,13 @@
** function populates it with the initial structure objects for each index,
** and the initial version of the "averages" record (a zero-byte blob).
*/
int sqlite3Fts5IndexReinit(Fts5Index *p){
Fts5Structure s;
-
- assert( p->rc==SQLITE_OK );
- p->rc = sqlite3Fts5IndexSetAverages(p, (const u8*)"", 0);
-
memset(&s, 0, sizeof(Fts5Structure));
+ fts5DataWrite(p, FTS5_AVERAGES_ROWID, (const u8*)"", 0);
fts5StructureWrite(p, &s);
-
return fts5IndexReturn(p);
}
/*
** Open a new Fts5Index handle. If the bCreate argument is true, create
@@ -4785,17 +4547,32 @@
fts5CloseReader(pIndex);
}
}
/*
-** Read the "averages" record into the buffer supplied as the second
-** argument. Return SQLITE_OK if successful, or an SQLite error code
-** if an error occurs.
+** Read and decode the "averages" record from the database.
+**
+** Parameter anSize must point to an array of size nCol, where nCol is
+** the number of user defined columns in the FTS table.
+**
+** *pnRow and every entry of anSize[] are zeroed first. If the record is
+** not empty, its first varint is stored in *pnRow and each following
+** varint in the next entry of anSize[], stopping at the end of the
+** record or after nCol entries, whichever comes first.
+**
+** The return value is that of fts5IndexReturn().
*/
-int sqlite3Fts5IndexGetAverages(Fts5Index *p, Fts5Buffer *pBuf){
- assert( p->rc==SQLITE_OK );
- fts5DataReadOrBuffer(p, pBuf, FTS5_AVERAGES_ROWID);
+int sqlite3Fts5IndexGetAverages(Fts5Index *p, i64 *pnRow, i64 *anSize){
+ int nCol = p->pConfig->nCol;
+ Fts5Data *pData;
+
+ *pnRow = 0;
+ memset(anSize, 0, sizeof(i64) * nCol);
+ pData = fts5DataRead(p, FTS5_AVERAGES_ROWID);
+ if( p->rc==SQLITE_OK && pData->n ){
+ int i = 0;
+ int iCol;
+ i += fts5GetVarint(&pData->p[i], (u64*)pnRow);
+ for(iCol=0; i<pData->n && iCol<nCol; iCol++){
+ i += fts5GetVarint(&pData->p[i], (u64*)&anSize[iCol]);
+ }
+ }
+
+ fts5DataRelease(pData);
return fts5IndexReturn(p);
}
/*
** Replace the current "averages" record with the contents of the buffer
@@ -5485,77 +5262,57 @@
/* todo */
}else{
fts5DecodeStructure(&rc, &s, a, n);
}
}else{
-
Fts5Buffer term;
+ int iTermOff = 0;
+ int iRowidOff = 0;
+ int iOff;
+ int nKeep = 0;
+
memset(&term, 0, sizeof(Fts5Buffer));
- if( iHeight==0 ){
- int iTermOff = 0;
- int iRowidOff = 0;
- int iOff;
- int nKeep = 0;
-
- if( n>=4 ){
- iRowidOff = fts5GetU16(&a[0]);
- iTermOff = fts5GetU16(&a[2]);
- }else{
- sqlite3Fts5BufferSet(&rc, &s, 8, (const u8*)"corrupt");
- goto decode_out;
- }
-
- if( iRowidOff ){
- iOff = iRowidOff;
- }else if( iTermOff ){
- iOff = iTermOff;
- }else{
- iOff = n;
- }
- fts5DecodePoslist(&rc, &s, &a[4], iOff-4);
-
- assert( iRowidOff==0 || iOff==iRowidOff );
- if( iRowidOff ){
- iOff += fts5DecodeDoclist(&rc, &s, &a[iOff], n-iOff);
- }
-
- assert( iTermOff==0 || iOff==iTermOff );
- while( iOff=4 ){
+ iRowidOff = fts5GetU16(&a[0]);
+ iTermOff = fts5GetU16(&a[2]);
+ }else{
+ sqlite3Fts5BufferSet(&rc, &s, 8, (const u8*)"corrupt");
+ goto decode_out;
+ }
+
+ if( iRowidOff ){
+ iOff = iRowidOff;
+ }else if( iTermOff ){
+ iOff = iTermOff;
+ }else{
+ iOff = n;
+ }
+ fts5DecodePoslist(&rc, &s, &a[4], iOff-4);
+
+ assert( iRowidOff==0 || iOff==iRowidOff );
+ if( iRowidOff ){
+ iOff += fts5DecodeDoclist(&rc, &s, &a[iOff], n-iOff);
+ }
+
+ assert( iTermOff==0 || iOff==iTermOff );
+ while( iOffbase.pVtab);
- return sqlite3Fts5Tokenize(pTab->pConfig, pText, nText, pUserData, xToken);
+ return sqlite3Fts5Tokenize(
+ pTab->pConfig, FTS5_TOKENIZE_AUX, pText, nText, pUserData, xToken
+ );
}
static int fts5ApiPhraseCount(Fts5Context *pCtx){
Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
return sqlite3Fts5ExprPhraseCount(pCsr->pExpr);
@@ -1653,17 +1655,20 @@
return rc;
}
static int fts5ColumnSizeCb(
void *pContext, /* Pointer to int */
+ int tflags,
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
){
int *pCnt = (int*)pContext;
- *pCnt = *pCnt + 1;
+ if( (tflags & FTS5_TOKEN_COLOCATED)==0 ){
+ (*pCnt)++;
+ }
return SQLITE_OK;
}
static int fts5ApiColumnSize(Fts5Context *pCtx, int iCol, int *pnToken){
Fts5Cursor *pCsr = (Fts5Cursor*)pCtx;
@@ -1689,11 +1694,13 @@
const char *z; int n;
void *p = (void*)(&pCsr->aColumnSize[i]);
pCsr->aColumnSize[i] = 0;
rc = fts5ApiColumnText(pCtx, i, &z, &n);
if( rc==SQLITE_OK ){
- rc = sqlite3Fts5Tokenize(pConfig, z, n, p, fts5ColumnSizeCb);
+ rc = sqlite3Fts5Tokenize(
+ pConfig, FTS5_TOKENIZE_AUX, z, n, p, fts5ColumnSizeCb
+ );
}
}
}
}
CsrFlagClear(pCsr, FTS5CSR_REQUIRE_DOCSIZE);
@@ -1851,11 +1858,11 @@
Fts5Config *pConf = pTab->pConfig;
pNew->ePlan = FTS5_PLAN_MATCH;
pNew->iFirstRowid = SMALLEST_INT64;
pNew->iLastRowid = LARGEST_INT64;
pNew->base.pVtab = (sqlite3_vtab*)pTab;
- rc = sqlite3Fts5ExprPhraseExpr(pConf, pCsr->pExpr, iPhrase, &pNew->pExpr);
+ rc = sqlite3Fts5ExprClonePhrase(pConf, pCsr->pExpr, iPhrase, &pNew->pExpr);
}
if( rc==SQLITE_OK ){
for(rc = fts5CursorFirst(pTab, pNew, 0);
rc==SQLITE_OK && CsrFlagTest(pNew, FTS5CSR_EOF)==0;
@@ -2342,11 +2349,11 @@
rc = SQLITE_NOMEM;
}else{
void *p = (void*)pGlobal;
memset(pGlobal, 0, sizeof(Fts5Global));
pGlobal->db = db;
- pGlobal->api.iVersion = 1;
+ pGlobal->api.iVersion = 2;
pGlobal->api.xCreateFunction = fts5CreateAux;
pGlobal->api.xCreateTokenizer = fts5CreateTokenizer;
pGlobal->api.xFindTokenizer = fts5FindTokenizer;
rc = sqlite3_create_module_v2(db, "fts5", &fts5Mod, p, fts5ModuleDestroy);
if( rc==SQLITE_OK ) rc = sqlite3Fts5IndexInit(db);
Index: ext/fts5/fts5_storage.c
==================================================================
--- ext/fts5/fts5_storage.c
+++ ext/fts5/fts5_storage.c
@@ -357,19 +357,22 @@
/*
** Tokenization callback used when inserting tokens into the FTS index.
*/
static int fts5StorageInsertCallback(
void *pContext, /* Pointer to Fts5InsertCtx object */
+ int tflags,
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
){
Fts5InsertCtx *pCtx = (Fts5InsertCtx*)pContext;
Fts5Index *pIdx = pCtx->pStorage->pIndex;
- int iPos = pCtx->szCol++;
- return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, iPos, pToken, nToken);
+ if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){
+ pCtx->szCol++;
+ }
+ return sqlite3Fts5IndexWrite(pIdx, pCtx->iCol, pCtx->szCol-1, pToken, nToken);
}
/*
** If a row with rowid iDel is present in the %_content table, add the
** delete-markers to the FTS index necessary to delete it. Do not actually
@@ -392,10 +395,11 @@
rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iDel);
for(iCol=1; rc==SQLITE_OK && iCol<=pConfig->nCol; iCol++){
if( pConfig->abUnindexed[iCol-1] ) continue;
ctx.szCol = 0;
rc = sqlite3Fts5Tokenize(pConfig,
+ FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pSeek, iCol),
sqlite3_column_bytes(pSeek, iCol),
(void*)&ctx,
fts5StorageInsertCallback
);
@@ -449,26 +453,11 @@
** occurs.
*/
static int fts5StorageLoadTotals(Fts5Storage *p, int bCache){
int rc = SQLITE_OK;
if( p->bTotalsValid==0 ){
- int nCol = p->pConfig->nCol;
- Fts5Buffer buf;
- memset(&buf, 0, sizeof(buf));
-
- memset(p->aTotalSize, 0, sizeof(i64) * nCol);
- p->nTotalRow = 0;
- rc = sqlite3Fts5IndexGetAverages(p->pIndex, &buf);
- if( rc==SQLITE_OK && buf.n ){
- int i = 0;
- int iCol;
- i += fts5GetVarint(&buf.p[i], (u64*)&p->nTotalRow);
- for(iCol=0; iaTotalSize[iCol]);
- }
- }
- sqlite3_free(buf.p);
+ rc = sqlite3Fts5IndexGetAverages(p->pIndex, &p->nTotalRow, p->aTotalSize);
p->bTotalsValid = bCache;
}
return rc;
}
@@ -563,10 +552,11 @@
rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iDel);
for(iCol=0; rc==SQLITE_OK && iCol<pConfig->nCol; iCol++){
if( pConfig->abUnindexed[iCol] ) continue;
ctx.szCol = 0;
rc = sqlite3Fts5Tokenize(pConfig,
+ FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_value_text(apVal[iCol]),
sqlite3_value_bytes(apVal[iCol]),
(void*)&ctx,
fts5StorageInsertCallback
);
@@ -652,10 +642,11 @@
rc = sqlite3Fts5IndexBeginWrite(p->pIndex, iRowid);
for(ctx.iCol=0; rc==SQLITE_OK && ctx.iCol<pConfig->nCol; ctx.iCol++){
ctx.szCol = 0;
if( pConfig->abUnindexed[ctx.iCol]==0 ){
rc = sqlite3Fts5Tokenize(pConfig,
+ FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pScan, ctx.iCol+1),
sqlite3_column_bytes(pScan, ctx.iCol+1),
(void*)&ctx,
fts5StorageInsertCallback
);
@@ -769,10 +760,11 @@
}
for(ctx.iCol=0; rc==SQLITE_OK && ctx.iCol<pConfig->nCol; ctx.iCol++){
ctx.szCol = 0;
if( pConfig->abUnindexed[ctx.iCol]==0 ){
rc = sqlite3Fts5Tokenize(pConfig,
+ FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_value_text(apVal[ctx.iCol+2]),
sqlite3_value_bytes(apVal[ctx.iCol+2]),
(void*)&ctx,
fts5StorageInsertCallback
);
@@ -836,19 +828,22 @@
/*
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
void *pContext, /* Pointer to Fts5InsertCtx object */
+ int tflags,
const char *pToken, /* Buffer containing token */
int nToken, /* Size of token in bytes */
int iStart, /* Start offset of token */
int iEnd /* End offset of token */
){
Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
- int iPos = pCtx->szCol++;
+ if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){
+ pCtx->szCol++;
+ }
pCtx->cksum ^= sqlite3Fts5IndexCksum(
- pCtx->pConfig, pCtx->iRowid, pCtx->iCol, iPos, pToken, nToken
+ pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
);
return SQLITE_OK;
}
/*
@@ -879,23 +874,27 @@
int rc2;
while( SQLITE_ROW==sqlite3_step(pScan) ){
int i;
ctx.iRowid = sqlite3_column_int64(pScan, 0);
ctx.szCol = 0;
- rc = sqlite3Fts5StorageDocsize(p, ctx.iRowid, aColSize);
+ if( pConfig->bColumnsize ){
+ rc = sqlite3Fts5StorageDocsize(p, ctx.iRowid, aColSize);
+ }
for(i=0; rc==SQLITE_OK && i<pConfig->nCol; i++){
if( pConfig->abUnindexed[i] ) continue;
ctx.iCol = i;
ctx.szCol = 0;
- rc = sqlite3Fts5Tokenize(
- pConfig,
+ rc = sqlite3Fts5Tokenize(pConfig,
+ FTS5_TOKENIZE_DOCUMENT,
(const char*)sqlite3_column_text(pScan, i+1),
sqlite3_column_bytes(pScan, i+1),
(void*)&ctx,
fts5StorageIntegrityCallback
);
- if( ctx.szCol!=aColSize[i] ) rc = FTS5_CORRUPT;
+ if( pConfig->bColumnsize && ctx.szCol!=aColSize[i] ){
+ rc = FTS5_CORRUPT;
+ }
aTotalSize[i] += ctx.szCol;
}
if( rc!=SQLITE_OK ) break;
}
rc2 = sqlite3_reset(pScan);
@@ -916,11 +915,11 @@
if( rc==SQLITE_OK && pConfig->eContent==FTS5_CONTENT_NORMAL ){
i64 nRow;
rc = fts5StorageCount(p, "content", &nRow);
if( rc==SQLITE_OK && nRow!=p->nTotalRow ) rc = FTS5_CORRUPT;
}
- if( rc==SQLITE_OK ){
+ if( rc==SQLITE_OK && pConfig->bColumnsize ){
i64 nRow;
rc = fts5StorageCount(p, "docsize", &nRow);
if( rc==SQLITE_OK && nRow!=p->nTotalRow ) rc = FTS5_CORRUPT;
}
@@ -1000,13 +999,16 @@
**
** An SQLite error code is returned if an error occurs, or SQLITE_OK
** otherwise.
*/
int sqlite3Fts5StorageDocsize(Fts5Storage *p, i64 iRowid, int *aCol){
- int nCol = p->pConfig->nCol;
- sqlite3_stmt *pLookup = 0;
- int rc = fts5StorageGetStmt(p, FTS5_STMT_LOOKUP_DOCSIZE, &pLookup, 0);
+ int nCol = p->pConfig->nCol; /* Number of user columns in table */
+ sqlite3_stmt *pLookup = 0; /* Statement to query %_docsize */
+ int rc; /* Return Code */
+
+ assert( p->pConfig->bColumnsize );
+ rc = fts5StorageGetStmt(p, FTS5_STMT_LOOKUP_DOCSIZE, &pLookup, 0);
if( rc==SQLITE_OK ){
int bCorrupt = 1;
sqlite3_bind_int64(pLookup, 1, iRowid);
if( SQLITE_ROW==sqlite3_step(pLookup) ){
const u8 *aBlob = sqlite3_column_blob(pLookup, 0);
Index: ext/fts5/fts5_tcl.c
==================================================================
--- ext/fts5/fts5_tcl.c
+++ ext/fts5/fts5_tcl.c
@@ -139,10 +139,11 @@
Tcl_Obj *pObj;
};
static int xTokenizeCb(
void *pCtx,
+ int tflags,
const char *zToken, int nToken,
int iStart, int iEnd
){
F5tFunction *p = (F5tFunction*)pCtx;
Tcl_Obj *pEval = Tcl_DuplicateObj(p->pScript);
@@ -582,10 +583,11 @@
const char *zInput;
};
static int xTokenizeCb2(
void *pCtx,
+ int tflags,
const char *zToken, int nToken,
int iStart, int iEnd
){
F5tTokenizeCtx *p = (F5tTokenizeCtx*)pCtx;
if( p->bSubst ){
@@ -664,11 +666,13 @@
pRet = Tcl_NewObj();
Tcl_IncrRefCount(pRet);
ctx.bSubst = (objc==5);
ctx.pRet = pRet;
ctx.zInput = zText;
- rc = tokenizer.xTokenize(pTok, (void*)&ctx, zText, nText, xTokenizeCb2);
+ rc = tokenizer.xTokenize(
+ pTok, (void*)&ctx, FTS5_TOKENIZE_DOCUMENT, zText, nText, xTokenizeCb2
+ );
tokenizer.xDelete(pTok);
if( rc!=SQLITE_OK ){
Tcl_AppendResult(interp, "error in tokenizer.xTokenize()", 0);
Tcl_DecrRefCount(pRet);
return TCL_ERROR;
@@ -686,19 +690,25 @@
*/
typedef struct F5tTokenizerContext F5tTokenizerContext;
typedef struct F5tTokenizerCb F5tTokenizerCb;
typedef struct F5tTokenizerModule F5tTokenizerModule;
-typedef struct F5tTokenizerModule F5tTokenizerInstance;
+typedef struct F5tTokenizerInstance F5tTokenizerInstance;
struct F5tTokenizerContext {
void *pCtx;
- int (*xToken)(void*, const char*, int, int, int);
+ int (*xToken)(void*, int, const char*, int, int, int);
};
struct F5tTokenizerModule {
Tcl_Interp *interp;
+ Tcl_Obj *pScript;
+ F5tTokenizerContext *pContext;
+};
+
+struct F5tTokenizerInstance {
+ Tcl_Interp *interp;
Tcl_Obj *pScript;
F5tTokenizerContext *pContext;
};
static int f5tTokenizerCreate(
@@ -746,39 +756,66 @@
}
static int f5tTokenizerTokenize(
Fts5Tokenizer *p,
void *pCtx,
+ int flags,
const char *pText, int nText,
- int (*xToken)(void*, const char*, int, int, int)
+ int (*xToken)(void*, int, const char*, int, int, int)
){
F5tTokenizerInstance *pInst = (F5tTokenizerInstance*)p;
void *pOldCtx;
- int (*xOldToken)(void*, const char*, int, int, int);
+ int (*xOldToken)(void*, int, const char*, int, int, int);
Tcl_Obj *pEval;
int rc;
+ const char *zFlags;
pOldCtx = pInst->pContext->pCtx;
xOldToken = pInst->pContext->xToken;
+ pInst->pContext->pCtx = pCtx;
+ pInst->pContext->xToken = xToken;
+
+ assert(
+ flags==FTS5_TOKENIZE_DOCUMENT
+ || flags==FTS5_TOKENIZE_AUX
+ || flags==FTS5_TOKENIZE_QUERY
+ || flags==(FTS5_TOKENIZE_QUERY | FTS5_TOKENIZE_PREFIX)
+ );
pEval = Tcl_DuplicateObj(pInst->pScript);
Tcl_IncrRefCount(pEval);
- rc = Tcl_ListObjAppendElement(
- pInst->interp, pEval, Tcl_NewStringObj(pText, nText)
- );
- if( rc==TCL_OK ){
- rc = Tcl_EvalObjEx(pInst->interp, pEval, TCL_GLOBAL_ONLY);
+ switch( flags ){
+ case FTS5_TOKENIZE_DOCUMENT:
+ zFlags = "document";
+ break;
+ case FTS5_TOKENIZE_AUX:
+ zFlags = "aux";
+ break;
+ case FTS5_TOKENIZE_QUERY:
+ zFlags = "query";
+ break;
+ case (FTS5_TOKENIZE_PREFIX | FTS5_TOKENIZE_QUERY):
+ zFlags = "prefixquery";
+ break;
+ default:
+ assert( 0 );
+ zFlags = "invalid";
+ break;
}
+
+ Tcl_ListObjAppendElement(pInst->interp, pEval, Tcl_NewStringObj(zFlags, -1));
+ Tcl_ListObjAppendElement(pInst->interp, pEval, Tcl_NewStringObj(pText,nText));
+ rc = Tcl_EvalObjEx(pInst->interp, pEval, TCL_GLOBAL_ONLY);
Tcl_DecrRefCount(pEval);
pInst->pContext->pCtx = pOldCtx;
pInst->pContext->xToken = xOldToken;
return rc;
}
/*
-** sqlite3_fts5_token TEXT START END POS
+** sqlite3_fts5_token ?-colocated? TEXT START END
*/
static int f5tTokenizerReturn(
void * clientData,
Tcl_Interp *interp,
int objc,
@@ -786,35 +823,47 @@
){
F5tTokenizerContext *p = (F5tTokenizerContext*)clientData;
int iStart;
int iEnd;
int nToken;
+ int tflags = 0;
char *zToken;
int rc;
- assert( p );
- if( objc!=4 ){
- Tcl_WrongNumArgs(interp, 1, objv, "TEXT START END");
+ if( objc==5 ){
+ int nArg;
+ char *zArg = Tcl_GetStringFromObj(objv[1], &nArg);
+ if( nArg<=10 && nArg>=2 && memcmp("-colocated", zArg, nArg)==0 ){
+ tflags |= FTS5_TOKEN_COLOCATED;
+ }else{
+ goto usage;
+ }
+ }else if( objc!=4 ){
+ goto usage;
+ }
+
+ zToken = Tcl_GetStringFromObj(objv[objc-3], &nToken);
+ if( Tcl_GetIntFromObj(interp, objv[objc-2], &iStart)
+ || Tcl_GetIntFromObj(interp, objv[objc-1], &iEnd)
+ ){
return TCL_ERROR;
}
+
if( p->xToken==0 ){
Tcl_AppendResult(interp,
"sqlite3_fts5_token may only be used by tokenizer callback", 0
);
return TCL_ERROR;
}
- zToken = Tcl_GetStringFromObj(objv[1], &nToken);
- if( Tcl_GetIntFromObj(interp, objv[2], &iStart)
- || Tcl_GetIntFromObj(interp, objv[3], &iEnd)
- ){
- return TCL_ERROR;
- }
-
- rc = p->xToken(p->pCtx, zToken, nToken, iStart, iEnd);
+ rc = p->xToken(p->pCtx, tflags, zToken, nToken, iStart, iEnd);
Tcl_SetResult(interp, (char*)sqlite3ErrName(rc), TCL_VOLATILE);
return TCL_OK;
+
+ usage:
+ Tcl_WrongNumArgs(interp, 1, objv, "?-colocated? TEXT START END");
+ return TCL_ERROR;
}
static void f5tDelTokenizer(void *pCtx){
F5tTokenizerModule *pMod = (F5tTokenizerModule*)pCtx;
Tcl_DecrRefCount(pMod->pScript);
Index: ext/fts5/fts5_test_mi.c
==================================================================
--- ext/fts5/fts5_test_mi.c
+++ ext/fts5/fts5_test_mi.c
@@ -350,11 +350,11 @@
int nVal, /* Number of values in apVal[] array */
sqlite3_value **apVal /* Array of trailing arguments */
){
const char *zArg;
Fts5MatchinfoCtx *p;
- int rc;
+ int rc = SQLITE_OK;
if( nVal>0 ){
zArg = (const char*)sqlite3_value_text(apVal[0]);
}else{
zArg = "pcx";
@@ -361,15 +361,20 @@
}
p = (Fts5MatchinfoCtx*)pApi->xGetAuxdata(pFts, 0);
if( p==0 || sqlite3_stricmp(zArg, p->zArg) ){
p = fts5MatchinfoNew(pApi, pFts, pCtx, zArg);
- pApi->xSetAuxdata(pFts, p, sqlite3_free);
- if( p==0 ) return;
+ if( p==0 ){
+ rc = SQLITE_NOMEM;
+ }else{
+ rc = pApi->xSetAuxdata(pFts, p, sqlite3_free);
+ }
}
- rc = fts5MatchinfoIter(pApi, pFts, p, fts5MatchinfoLocalCb);
+ if( rc==SQLITE_OK ){
+ rc = fts5MatchinfoIter(pApi, pFts, p, fts5MatchinfoLocalCb);
+ }
if( rc!=SQLITE_OK ){
sqlite3_result_error_code(pCtx, rc);
}else{
/* No errors has occured, so return a copy of the array of integers. */
int nByte = p->nRet * sizeof(u32);
Index: ext/fts5/fts5_tokenize.c
==================================================================
--- ext/fts5/fts5_tokenize.c
+++ ext/fts5/fts5_tokenize.c
@@ -114,12 +114,13 @@
** Tokenize some text using the ascii tokenizer.
*/
static int fts5AsciiTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
+ int flags,
const char *pText, int nText,
- int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
+ int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
AsciiTokenizer *p = (AsciiTokenizer*)pTokenizer;
int rc = SQLITE_OK;
int ie;
int is = 0;
@@ -156,11 +157,11 @@
nFold = nByte*2;
}
asciiFold(pFold, &pText[is], nByte);
/* Invoke the token callback */
- rc = xToken(pCtx, pFold, nByte, is, ie);
+ rc = xToken(pCtx, 0, pFold, nByte, is, ie);
is = ie+1;
}
if( pFold!=aFold ) sqlite3_free(pFold);
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
@@ -383,12 +384,13 @@
}
static int fts5UnicodeTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
+ int flags,
const char *pText, int nText,
- int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
+ int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
Unicode61Tokenizer *p = (Unicode61Tokenizer*)pTokenizer;
int rc = SQLITE_OK;
unsigned char *a = p->aTokenChar;
@@ -473,11 +475,11 @@
}
ie = zCsr - (unsigned char*)pText;
}
/* Invoke the token callback */
- rc = xToken(pCtx, aFold, zOut-aFold, is, ie);
+ rc = xToken(pCtx, 0, aFold, zOut-aFold, is, ie);
}
tokenize_done:
if( rc==SQLITE_DONE ) rc = SQLITE_OK;
return rc;
@@ -551,11 +553,11 @@
}
typedef struct PorterContext PorterContext;
struct PorterContext {
void *pCtx;
- int (*xToken)(void*, const char*, int, int, int);
+ int (*xToken)(void*, int, const char*, int, int, int);
char *aBuf;
};
typedef struct PorterRule PorterRule;
struct PorterRule {
@@ -1116,10 +1118,11 @@
}
}
static int fts5PorterCb(
void *pCtx,
+ int tflags,
const char *pToken,
int nToken,
int iStart,
int iEnd
){
@@ -1173,32 +1176,33 @@
&& aBuf[nBuf-2]=='l' && fts5Porter_MGt1(aBuf, nBuf-1)
){
nBuf--;
}
- return p->xToken(p->pCtx, aBuf, nBuf, iStart, iEnd);
+ return p->xToken(p->pCtx, tflags, aBuf, nBuf, iStart, iEnd);
pass_through:
- return p->xToken(p->pCtx, pToken, nToken, iStart, iEnd);
+ return p->xToken(p->pCtx, tflags, pToken, nToken, iStart, iEnd);
}
/*
** Tokenize using the porter tokenizer.
*/
static int fts5PorterTokenize(
Fts5Tokenizer *pTokenizer,
void *pCtx,
+ int flags,
const char *pText, int nText,
- int (*xToken)(void*, const char*, int nToken, int iStart, int iEnd)
+ int (*xToken)(void*, int, const char*, int nToken, int iStart, int iEnd)
){
PorterTokenizer *p = (PorterTokenizer*)pTokenizer;
PorterContext sCtx;
sCtx.xToken = xToken;
sCtx.pCtx = pCtx;
sCtx.aBuf = p->aBuf;
return p->tokenizer.xTokenize(
- p->pTokenizer, (void*)&sCtx, pText, nText, fts5PorterCb
+ p->pTokenizer, (void*)&sCtx, flags, pText, nText, fts5PorterCb
);
}
/*
** Register all built-in tokenizers with FTS5.
@@ -1223,9 +1227,9 @@
&aBuiltin[i].x,
0
);
}
- return SQLITE_OK;
+ return rc;
}
Index: ext/fts5/test/fts5_common.tcl
==================================================================
--- ext/fts5/test/fts5_common.tcl
+++ ext/fts5/test/fts5_common.tcl
@@ -292,6 +292,39 @@
}
proc NOT {a b} {
if {[llength $b]>0} { return [list] }
return $a
}
+
+#-------------------------------------------------------------------------
+# This command is similar to [split], except that it also provides the
+# start and end offsets of each token. For example:
+#
+# [fts5_tokenize_split "abc d ef"] -> {abc 0 3 d 4 5 ef 6 8}
+#
+
+proc gobble_whitespace {textvar} {
+ upvar $textvar t
+ regexp {([ ]*)(.*)} $t -> space t
+ return [string length $space]
+}
+
+proc gobble_text {textvar wordvar} {
+ upvar $textvar t
+ upvar $wordvar w
+ regexp {([^ ]*)(.*)} $t -> w t
+ return [string length $w]
+}
+
+proc fts5_tokenize_split {text} {
+  # Returns a flat list of {token iStart iEnd} triples, one per run of
+  # non-space characters in $text. iEnd is the offset just past the token.
+  set ret [list]
+  set iOff [gobble_whitespace text]
+  while {[set nToken [gobble_text text word]]} {
+    lappend ret $word $iOff [expr {$iOff+$nToken}]
+    incr iOff $nToken
+    incr iOff [gobble_whitespace text]
+  }
+  set ret
+}
Index: ext/fts5/test/fts5aa.test
==================================================================
--- ext/fts5/test/fts5aa.test
+++ ext/fts5/test/fts5aa.test
@@ -341,11 +341,11 @@
do_execsql_test 13.5 {
SELECT rowid FROM t1 WHERE t1 MATCH 'o';
} {1}
do_execsql_test 13.6 {
- SELECT rowid FROM t1 WHERE t1 MATCH '.';
+ SELECT rowid FROM t1 WHERE t1 MATCH '""';
} {}
#-------------------------------------------------------------------------
#
reset_db
@@ -503,9 +503,39 @@
SELECT t1.rowid, t2.rowid FROM t1, t2 WHERE t2 MATCH t1.a AND t1.rowid = t2.c
} {1 1}
do_execsql_test 18.3 {
SELECT t1.rowid, t2.rowid FROM t2, t1 WHERE t2 MATCH t1.a AND t1.rowid = t2.c
} {1 1}
+
+#--------------------------------------------------------------------
+# fts5 table in the temp schema.
+#
+reset_db
+do_execsql_test 19.0 {
+ CREATE VIRTUAL TABLE temp.t1 USING fts5(x);
+ INSERT INTO t1 VALUES('x y z');
+ INSERT INTO t1 VALUES('w x 1');
+ SELECT rowid FROM t1 WHERE t1 MATCH 'x';
+} {1 2}
+
+#--------------------------------------------------------------------
+# Test that 6 and 7 byte varints can be read.
+#
+reset_db
+do_execsql_test 20.0 {
+ CREATE VIRTUAL TABLE temp.tmp USING fts5(x);
+}
+set ::ids [list \
+ 0 [expr 1<<36] [expr 2<<36] [expr 1<<43] [expr 2<<43]
+]
+do_test 20.1 {
+ foreach id $::ids {
+ execsql { INSERT INTO tmp(rowid, x) VALUES($id, 'x y z') }
+ }
+ execsql { SELECT rowid FROM tmp WHERE tmp MATCH 'y' }
+} $::ids
+
+
finish_test
Index: ext/fts5/test/fts5columnsize.test
==================================================================
--- ext/fts5/test/fts5columnsize.test
+++ ext/fts5/test/fts5columnsize.test
@@ -132,7 +132,20 @@
SELECT rowid, fts5_test_columnsize(t4) FROM t4 WHERE t4 MATCH 'a'
} {
1 {-1 0 -1} 2 {-1 0 -1}
}
+#-------------------------------------------------------------------------
+# Test the integrity-check
+#
+do_execsql_test 4.1.1 {
+  CREATE VIRTUAL TABLE t5 USING fts5(x, columnsize=0);
+  INSERT INTO t5 VALUES('1 2 3 4');
+  INSERT INTO t5 VALUES('2 4 6 8');
+}
+
+# Run the integrity-check against the columnsize=0 table created above.
+do_execsql_test 4.1.2 {
+  INSERT INTO t5(t5) VALUES('integrity-check');
+}
finish_test
Index: ext/fts5/test/fts5ea.test
==================================================================
--- ext/fts5/test/fts5ea.test
+++ ext/fts5/test/fts5ea.test
@@ -85,9 +85,15 @@
#
do_execsql_test 4.0 {
SELECT fts5_expr('a AND """"', 'x', 'tokenize="unicode61 tokenchars ''""''"');
} {{"a" AND """"}}
+#-------------------------------------------------------------------------
+# Test that an unsupported operator character is reported as a syntax error.
+#
+do_catchsql_test 5.0 {
+ SELECT fts5_expr('abc | def');
+} {1 {fts5: syntax error near "|"}}
finish_test
Index: ext/fts5/test/fts5eb.test
==================================================================
--- ext/fts5/test/fts5eb.test
+++ ext/fts5/test/fts5eb.test
@@ -28,22 +28,22 @@
set ::se_expr $expr
do_execsql_test $tn {SELECT fts5_expr($se_expr)} [list $res]
}
foreach {tn expr res} {
- 1 {abc} {"abc"}
- 2 {abc .} {"abc"}
- 3 {.} {}
- 4 {abc OR .} {"abc"}
- 5 {abc NOT .} {"abc"}
- 6 {abc AND .} {"abc"}
- 7 {. OR abc} {"abc"}
- 8 {. NOT abc} {"abc"}
- 9 {. AND abc} {"abc"}
- 10 {abc + . + def} {"abc" + "def"}
- 11 {abc . def} {"abc" AND "def"}
- 12 {r+e OR w} {"r" + "e" OR "w"}
+ 1 {abc} {"abc"}
+ 2 {abc ""} {"abc"}
+ 3 {""} {}
+ 4 {abc OR ""} {"abc"}
+ 5 {abc NOT ""} {"abc"}
+ 6 {abc AND ""} {"abc"}
+ 7 {"" OR abc} {"abc"}
+ 8 {"" NOT abc} {"abc"}
+ 9 {"" AND abc} {"abc"}
+ 10 {abc + "" + def} {"abc" + "def"}
+ 11 {abc "" def} {"abc" AND "def"}
+ 12 {r+e OR w} {"r" + "e" OR "w"}
} {
do_execsql_test 1.$tn {SELECT fts5_expr($expr)} [list $res]
}
do_catchsql_test 2.1 {
Index: ext/fts5/test/fts5fault6.test
==================================================================
--- ext/fts5/test/fts5fault6.test
+++ ext/fts5/test/fts5fault6.test
@@ -19,10 +19,11 @@
# If SQLITE_ENABLE_FTS5 is defined, omit this file.
ifcapable !fts5 {
finish_test
return
}
+
#-------------------------------------------------------------------------
# OOM while rebuilding an FTS5 table.
#
do_execsql_test 1.0 {
@@ -146,7 +147,151 @@
}
} -test {
faultsim_test_result {0 {}}
}
+#-------------------------------------------------------------------------
+#
+# 5.2.* OOM while running a query that includes synonyms and matchinfo().
+#
+# 5.3.* OOM while running a query that returns a row containing instances
+# of more than 4 synonyms for a single term.
+#
+proc mit {blob} {
+ set scan(littleEndian) i*
+ set scan(bigEndian) I*
+ binary scan $blob $scan($::tcl_platform(byteOrder)) r
+ return $r
+}
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ if {$tflags=="query" && [string length $w]==1} {
+ for {set i 2} {$i < 7} {incr i} {
+ sqlite3_fts5_token -colo [string repeat $w $i] $iStart $iEnd
+ }
+ }
+ }
+}
+proc tcl_create {args} { return "tcl_tokenize" }
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+db func mit mit
+sqlite3_fts5_register_matchinfo db
+do_test 5.0 {
+ execsql { CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl) }
+ execsql { INSERT INTO t1(t1, rank) VALUES('pgsz', 32) }
+ foreach {rowid text} {
+ 1 {aaaa cc b aaaaa cc aa}
+ 2 {aa aa bb a bbb}
+ 3 {bb aaaaa aaaaa b aaaa aaaaa}
+ 4 {aa a b aaaa aa}
+ 5 {aa b ccc aaaaa cc}
+ 6 {aa aaaaa bbbb cc aaa}
+ 7 {aaaaa aa aa ccccc bb}
+ 8 {ccc bbbbb ccccc bbb c}
+ 9 {cccccc bbbb a aaa cccc c}
+
+ 20 {ddd f ddd eeeee fff ffff eeee ddd fff eeeee dddddd eeee}
+ 21 {fffff eee dddd fffff dd ee ee eeeee eee eeeeee ee dd e}
+ 22 {fffff d eeee dddd fffff dddddd ffff ddddd eeeee ee eee dddd ddddd}
+ 23 {ddddd fff ddd eeeee ffff eeee ddd ff ff ffffff eeeeee dddd ffffff}
+ 24 {eee dd ee dddd dddd eeeeee e eee fff ffff}
+ 25 {ddddd ffffff dddddd fff ddd ddddd ddd f eeee fff dddd f}
+ 26 {f ffff fff fff eeeeee dddd d dddddd ddddd eee ff eeeee}
+ 27 {eee fff dddddd eeeee eeeee dddd ddddd ffff f eeeee eee dddddd ddddd d}
+ 28 {dd ddddd d ddd d fff d dddd ee dddd ee ddd dddddd dddddd}
+ 29 {eeee dddd ee dddd eeee dddd dd fffff f ddd eeeee ddd ee}
+ 30 {ff ffffff eeeeee eeeee eee ffffff ff ffff f fffff eeeee}
+ 31 {fffff eeeeee dddd eeee eeee eeeeee eee fffff d ddddd ffffff ffff dddddd}
+ 32 {dddddd fffff ee eeeeee eeee ee fff dddd fff eeee ffffff eeeeee ffffff}
+ 33 {ddddd eeee dd ffff dddddd fff eeee ddddd ffff eeee ddd}
+ 34 {ee dddd ddddd dddddd eeee eeeeee f dd ee dddddd ffffff}
+ 35 {ee dddd dd eeeeee ddddd eee d eeeeee dddddd eee dddd fffff}
+ 36 {eee ffffff ffffff e fffff eeeee ff dddddd dddddd fff}
+ 37 {eeeee fffff dddddd dddd ffffff fff f dd ee dd dd eeeee}
+ 38 {eeeeee ee d ff eeeeee eeeeee eee eeeee ee ffffff dddd eeee dddddd ee}
+ 39 {eeeeee ddd fffff e dddd ee eee eee ffffff ee f d dddd}
+ 40 {ffffff dddddd eee ee ffffff eee eeee ddddd ee eeeeee f}
+ 41 {ddd ddd fff fffff ee fffff f fff ddddd fffff}
+ 42 {dddd ee ff d f ffffff fff ffffff ff dd dddddd f eeee}
+ 43 {d dd fff fffff d f fff e dddd ee ee}
+ 44 {ff ffff eee ddd d dd ffff dddd d eeee d eeeeee}
+ 45 {eeee f eeeee ee e ffff f ddd e fff}
+ 46 {ffff d ffff eeee ffff eeeee f ffff ddddd eee}
+ 47 {dd dd dddddd ddddd fffff dddddd ddd ddddd eeeeee ffff eeee eee ee}
+ 48 {ffff ffff e dddd ffffff dd dd dddd f fffff}
+ 49 {ffffff d dddddd ffff eeeee f ffff ffff d dd fffff eeeee}
+
+ 50 {x e}
+ } {
+ execsql { INSERT INTO t1(rowid, a) VALUES($rowid, $text) }
+ }
+} {}
+
+set res [list {*}{
+ 1 {3 24 8 2 12 6}
+ 5 {2 24 8 2 12 6}
+ 6 {3 24 8 1 12 6}
+ 7 {3 24 8 1 12 6}
+ 9 {2 24 8 3 12 6}
+}]
+do_execsql_test 5.1.1 {
+ SELECT rowid, mit(matchinfo(t1, 'x')) FROM t1 WHERE t1 MATCH 'a AND c'
+} $res
+do_execsql_test 5.1.2 {
+ SELECT count(*) FROM t1 WHERE t1 MATCH 'd e f'
+} 29
+
+faultsim_save_and_close
+do_faultsim_test 5.2 -faults oom* -prep {
+ faultsim_restore_and_reopen
+ sqlite3_fts5_create_tokenizer db tcl tcl_create
+ sqlite3_fts5_register_matchinfo db
+ db func mit mit
+} -body {
+ db eval {
+ SELECT rowid, mit(matchinfo(t1, 'x')) FROM t1 WHERE t1 MATCH 'a AND c'
+ }
+} -test {
+ faultsim_test_result [list 0 $::res]
+}
+
+do_faultsim_test 5.3 -faults oom* -prep {
+ faultsim_restore_and_reopen
+ sqlite3_fts5_create_tokenizer db tcl tcl_create
+} -body {
+ db eval {
+ SELECT count(*) FROM t1 WHERE t1 MATCH 'd AND e AND f'
+ }
+} -test {
+ faultsim_test_result {0 29}
+}
+
+do_faultsim_test 5.4 -faults oom* -prep {
+ faultsim_restore_and_reopen
+ sqlite3_fts5_create_tokenizer db tcl tcl_create
+} -body {
+ db eval {
+ SELECT count(*) FROM t1 WHERE t1 MATCH 'x + e'
+ }
+} -test {
+ faultsim_test_result {0 1}
+}
+
+#-------------------------------------------------------------------------
+# 6.* OOM while loading the fts5 extension into a freshly opened handle.
+catch { db close }
+do_faultsim_test 6 -faults oom* -prep {
+  sqlite_orig db test.db
+  sqlite3_db_config_lookaside db 0 0 0
+} -body {
+  load_static_extension db fts5
+} -test {
+  faultsim_test_result {0 {}} {1 {initialization of fts5 failed: }}
+  if {$testrc==0} {
+    db eval { CREATE VIRTUAL TABLE temp.t1 USING fts5(x) }
+  }
+  db close
+}
finish_test
ADDED ext/fts5/test/fts5fault7.test
Index: ext/fts5/test/fts5fault7.test
==================================================================
--- /dev/null
+++ ext/fts5/test/fts5fault7.test
@@ -0,0 +1,45 @@
+# 2015 September 3
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#*************************************************************************
+#
+# This file is focused on OOM errors.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+source $testdir/malloc_common.tcl
+set testprefix fts5fault7
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+  finish_test
+  return
+}
+
+#-------------------------------------------------------------------------
+# Test fault-injection on a query that uses xColumnSize() on columnsize=0
+# table.
+#
+do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE t1 USING fts5(x, columnsize=0);
+ INSERT INTO t1 VALUES('a b c d e f g');
+ INSERT INTO t1 VALUES('a b c d');
+ INSERT INTO t1 VALUES('a b c d e f g h i j');
+}
+
+
+fts5_aux_test_functions db
+do_faultsim_test 1 -faults oom* -body {
+ execsql { SELECT fts5_test_columnsize(t1) FROM t1 WHERE t1 MATCH 'b' }
+} -test {
+ faultsim_test_result {0 {7 4 10}} {1 SQLITE_NOMEM}
+}
+
+finish_test
+
Index: ext/fts5/test/fts5matchinfo.test
==================================================================
--- ext/fts5/test/fts5matchinfo.test
+++ ext/fts5/test/fts5matchinfo.test
@@ -353,14 +353,14 @@
} {1 1 one 2 2 two 3 3 three}
#---------------------------------------------------------------------------
# Test the 'y' matchinfo flag
#
-set sqlite_fts3_enable_parentheses 1
reset_db
+sqlite3_fts5_register_matchinfo db
do_execsql_test 11.0 {
- CREATE VIRTUAL TABLE tt USING fts3(x, y);
+ CREATE VIRTUAL TABLE tt USING fts5(x, y);
INSERT INTO tt VALUES('c d a c d d', 'e a g b d a'); -- 1
INSERT INTO tt VALUES('c c g a e b', 'c g d g e c'); -- 2
INSERT INTO tt VALUES('b e f d e g', 'b a c b c g'); -- 3
INSERT INTO tt VALUES('a c f f g d', 'd b f d e g'); -- 4
INSERT INTO tt VALUES('g a c f c f', 'd g g b c c'); -- 5
@@ -430,28 +430,26 @@
do_execsql_test 11.1.$tn.2 {
SELECT rowid, mit(matchinfo(tt, 'b')) FROM tt WHERE tt MATCH $expr
} $r2
}
-set sqlite_fts3_enable_parentheses 0
#---------------------------------------------------------------------------
# Test the 'b' matchinfo flag
#
-set sqlite_fts3_enable_parentheses 1
reset_db
+sqlite3_fts5_register_matchinfo db
db func mit mit
do_test 12.0 {
set cols [list]
for {set i 0} {$i < 50} {incr i} { lappend cols "c$i" }
- execsql "CREATE VIRTUAL TABLE tt USING fts3([join $cols ,])"
+ execsql "CREATE VIRTUAL TABLE tt USING fts5([join $cols ,])"
} {}
do_execsql_test 12.1 {
INSERT INTO tt (rowid, c4, c45) VALUES(1, 'abc', 'abc');
SELECT mit(matchinfo(tt, 'b')) FROM tt WHERE tt MATCH 'abc';
} [list [list [expr 1<<4] [expr 1<<(45-32)]]]
-set sqlite_fts3_enable_parentheses 0
finish_test
ADDED ext/fts5/test/fts5synonym.test
Index: ext/fts5/test/fts5synonym.test
==================================================================
--- /dev/null
+++ ext/fts5/test/fts5synonym.test
@@ -0,0 +1,460 @@
+# 2014 Dec 20
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests focusing on custom tokenizers that support synonyms.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5synonym
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+
+foreach S {
+ {zero 0}
+ {one 1 i}
+ {two 2 ii}
+ {three 3 iii}
+ {four 4 iv}
+ {five 5 v}
+ {six 6 vi}
+ {seven 7 vii}
+ {eight 8 viii}
+ {nine 9 ix}
+} {
+ foreach s $S {
+ set o [list]
+ foreach x $S {if {$x!=$s} {lappend o $x}}
+ set ::syn($s) $o
+ }
+}
+
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ }
+}
+
+proc tcl_create {args} {
+ return "tcl_tokenize"
+}
+
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+
+#-------------------------------------------------------------------------
+# Warm body test for the code in fts5_tcl.c.
+#
+do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
+ INSERT INTO ft VALUES('abc def ghi');
+ INSERT INTO ft VALUES('jkl mno pqr');
+ SELECT rowid, x FROM ft WHERE ft MATCH 'def';
+ SELECT x, rowid FROM ft WHERE ft MATCH 'pqr';
+} {1 {abc def ghi} {jkl mno pqr} 2}
+
+#-------------------------------------------------------------------------
+# Test a tokenizer that supports synonyms by adding extra entries to the
+# FTS index.
+#
+
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ if {$tflags=="document" && [info exists ::syn($w)]} {
+ foreach s $::syn($w) {
+ sqlite3_fts5_token -colo $s $iStart $iEnd
+ }
+ }
+ }
+}
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+
+do_execsql_test 2.0 {
+ CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
+ INSERT INTO ft VALUES('one two three');
+ INSERT INTO ft VALUES('four five six');
+ INSERT INTO ft VALUES('eight nine ten');
+} {}
+
+foreach {tn expr res} {
+ 1 "3" 1
+ 2 "eight OR 8 OR 5" {2 3}
+ 3 "10" {}
+ 4 "1*" {1}
+ 5 "1 + 2" {1}
+} {
+ do_execsql_test 2.1.$tn {
+ SELECT rowid FROM ft WHERE ft MATCH $expr
+ } $res
+}
+
+#-------------------------------------------------------------------------
+# Test some broken tokenizers:
+#
+# 3.1.*: A tokenizer that declares the very first token to be colocated.
+#
+# 3.2.*: A tokenizer that reports two identical tokens at the same position.
+# This is allowed.
+#
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+proc tcl_tokenize {tflags text} {
+ set bColo 1
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ if {$bColo} {
+ sqlite3_fts5_token -colo $w $iStart $iEnd
+ set bColo 0
+ } {
+ sqlite3_fts5_token $w $iStart $iEnd
+ }
+ }
+}
+do_execsql_test 3.1.0 {
+ CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
+ INSERT INTO ft VALUES('one two three');
+ CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row);
+ SELECT * FROM vv;
+} {
+ one 1 1 three 1 1 two 1 1
+}
+
+do_execsql_test 3.1.1 {
+ INSERT INTO ft(ft) VALUES('integrity-check');
+} {}
+
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ }
+}
+
+do_execsql_test 3.1.2 {
+ SELECT rowid FROM ft WHERE ft MATCH 'one two three'
+} {1}
+
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ sqlite3_fts5_token -colo $w $iStart $iEnd
+ }
+}
+do_execsql_test 3.2.0 {
+ CREATE VIRTUAL TABLE ft USING fts5(x, tokenize = tcl);
+ INSERT INTO ft VALUES('one one two three');
+ CREATE VIRTUAL TABLE vv USING fts5vocab(ft, row);
+ SELECT * FROM vv;
+} {
+ one 1 4 three 1 2 two 1 2
+}
+do_execsql_test 3.2.1 {
+ SELECT rowid FROM ft WHERE ft MATCH 'one';
+} {1}
+do_execsql_test 3.2.2 {
+ SELECT rowid FROM ft WHERE ft MATCH 'one two three';
+} {1}
+do_execsql_test 3.2.3 {
+ SELECT rowid FROM ft WHERE ft MATCH 'one + one + two + three';
+} {1}
+do_execsql_test 3.2.4 {
+ SELECT rowid FROM ft WHERE ft MATCH 'one two two three';
+} {1}
+do_execsql_test 3.2.5 {
+ SELECT rowid FROM ft WHERE ft MATCH 'one + two + two + three';
+} {}
+
+#-------------------------------------------------------------------------
+# Check that expressions with synonyms can be parsed and executed.
+#
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ if {$tflags=="query" && [info exists ::syn($w)]} {
+ foreach s $::syn($w) {
+ sqlite3_fts5_token -colo $s $iStart $iEnd
+ }
+ }
+ }
+}
+
+foreach {tn expr res} {
+ 1 {abc} {"abc"}
+ 2 {one} {"one"|"i"|"1"}
+ 3 {3} {"3"|"iii"|"three"}
+ 4 {3*} {"3"|"iii"|"three" *}
+} {
+ do_execsql_test 4.1.$tn {SELECT fts5_expr($expr, 'tokenize=tcl')} [list $res]
+}
+
+do_execsql_test 4.2.1 {
+ CREATE VIRTUAL TABLE xx USING fts5(x, tokenize=tcl);
+ INSERT INTO xx VALUES('one two');
+ INSERT INTO xx VALUES('three four');
+}
+
+do_execsql_test 4.2.2 {
+ SELECT rowid FROM xx WHERE xx MATCH '2'
+} {1}
+
+do_execsql_test 4.2.3 {
+ SELECT rowid FROM xx WHERE xx MATCH '3'
+} {2}
+
+do_test 5.0 {
+ execsql {
+ CREATE VIRTUAL TABLE t1 USING fts5(a, b, tokenize=tcl)
+ }
+ foreach {rowid a b} {
+ 1 {four v 4 i three} {1 3 five five 4 one}
+ 2 {5 1 3 4 i} {2 2 v two 4}
+ 3 {5 i 5 2 four 4 1} {iii ii five two 1}
+ 4 {ii four 4 one 5 three five} {one 5 1 iii 4 3}
+ 5 {three i v i four 4 1} {ii five five five iii}
+ 6 {4 2 ii two 2 iii} {three 1 four 4 iv 1 iv}
+ 7 {ii ii two three 2 5} {iii i ii iii iii one one}
+ 8 {2 ii i two 3 three 2} {two iv v iii 3 five}
+ 9 {i 2 iv 3 five four v} {iii 4 three i three ii 1}
+ } {
+ execsql { INSERT INTO t1(rowid, a, b) VALUES($rowid, $a, $b) }
+ }
+} {}
+
+
+foreach {tn q res} {
+ 1 {one} {
+ 1 {four v 4 [i] three} {[1] 3 five five 4 [one]}
+ 2 {5 [1] 3 4 [i]} {2 2 v two 4}
+ 3 {5 [i] 5 2 four 4 [1]} {iii ii five two [1]}
+ 4 {ii four 4 [one] 5 three five} {[one] 5 [1] iii 4 3}
+ 5 {three [i] v [i] four 4 [1]} {ii five five five iii}
+ 6 {4 2 ii two 2 iii} {three [1] four 4 iv [1] iv}
+ 7 {ii ii two three 2 5} {iii [i] ii iii iii [one] [one]}
+ 8 {2 ii [i] two 3 three 2} {two iv v iii 3 five}
+ 9 {[i] 2 iv 3 five four v} {iii 4 three [i] three ii [1]}
+ }
+ 2 {five four} {
+ 1 {[four] [v] [4] i three} {1 3 [five] [five] [4] one}
+ 2 {[5] 1 3 [4] i} {2 2 [v] two [4]}
+ 3 {[5] i [5] 2 [four] [4] 1} {iii ii [five] two 1}
+ 4 {ii [four] [4] one [5] three [five]} {one [5] 1 iii [4] 3}
+ 5 {three i [v] i [four] [4] 1} {ii [five] [five] [five] iii}
+ 8 {2 ii i two 3 three 2} {two [iv] [v] iii 3 [five]}
+ 9 {i 2 [iv] 3 [five] [four] [v]} {iii [4] three i three ii 1}
+ }
+ 3 {one OR two OR iii OR 4 OR v} {
+ 1 {[four] [v] [4] [i] [three]} {[1] [3] [five] [five] [4] [one]}
+ 2 {[5] [1] [3] [4] [i]} {[2] [2] [v] [two] [4]}
+ 3 {[5] [i] [5] [2] [four] [4] [1]} {[iii] [ii] [five] [two] [1]}
+ 4 {[ii] [four] [4] [one] [5] [three] [five]} {[one] [5] [1] [iii] [4] [3]}
+ 5 {[three] [i] [v] [i] [four] [4] [1]} {[ii] [five] [five] [five] [iii]}
+ 6 {[4] [2] [ii] [two] [2] [iii]} {[three] [1] [four] [4] [iv] [1] [iv]}
+ 7 {[ii] [ii] [two] [three] [2] [5]} {[iii] [i] [ii] [iii] [iii] [one] [one]}
+ 8 {[2] [ii] [i] [two] [3] [three] [2]} {[two] [iv] [v] [iii] [3] [five]}
+ 9 {[i] [2] [iv] [3] [five] [four] [v]} {[iii] [4] [three] [i] [three] [ii] [1]}
+ }
+
+ 4 {5 + 1} {
+ 2 {[5 1] 3 4 i} {2 2 v two 4}
+ 3 {[5 i] 5 2 four 4 1} {iii ii five two 1}
+ 4 {ii four 4 one 5 three five} {one [5 1] iii 4 3}
+ 5 {three i [v i] four 4 1} {ii five five five iii}
+ }
+
+ 5 {one + two + three} {
+ 7 {ii ii two three 2 5} {iii [i ii iii] iii one one}
+ 8 {2 ii [i two 3] three 2} {two iv v iii 3 five}
+ }
+
+ 6 {"v v"} {
+ 1 {four v 4 i three} {1 3 [five five] 4 one}
+ 5 {three i v i four 4 1} {ii [five five five] iii}
+ }
+} {
+ do_execsql_test 5.1.$tn {
+ SELECT rowid, highlight(t1, 0, '[', ']'), highlight(t1, 1, '[', ']')
+ FROM t1 WHERE t1 MATCH $q
+ } $res
+}
+
+# Test that the xQueryPhrase() API works with synonyms.
+#
+proc mit {blob} {
+ set scan(littleEndian) i*
+ set scan(bigEndian) I*
+ binary scan $blob $scan($::tcl_platform(byteOrder)) r
+ return $r
+}
+db func mit mit
+sqlite3_fts5_register_matchinfo db
+
+foreach {tn q res} {
+ 1 {one} {
+ 1 {1 11 7 2 12 6} 2 {2 11 7 0 12 6}
+ 3 {2 11 7 1 12 6} 4 {1 11 7 2 12 6}
+ 5 {3 11 7 0 12 6} 6 {0 11 7 2 12 6}
+ 7 {0 11 7 3 12 6} 8 {1 11 7 0 12 6}
+ 9 {1 11 7 2 12 6}
+ }
+} {
+ do_execsql_test 5.2.$tn {
+ SELECT rowid, mit(matchinfo(t1, 'x')) FROM t1 WHERE t1 MATCH $q
+ } $res
+}
+
+
+#-------------------------------------------------------------------------
+# Test terms with more than 4 synonyms.
+#
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ if {$tflags=="query" && [string length $w]==1} {
+ for {set i 2} {$i<=10} {incr i} {
+ sqlite3_fts5_token -colo [string repeat $w $i] $iStart $iEnd
+ }
+ }
+ }
+}
+
+do_execsql_test 6.0.1 {
+ CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize=tcl);
+ INSERT INTO t1 VALUES('yy xx qq');
+ INSERT INTO t1 VALUES('yy xx xx');
+}
+do_execsql_test 6.0.2 {
+ SELECT * FROM t1 WHERE t1 MATCH 'NEAR(y q)';
+} {{yy xx qq}}
+
+do_test 6.0.3 {
+ execsql {
+ CREATE VIRTUAL TABLE t2 USING fts5(a, b, tokenize=tcl)
+ }
+ foreach {rowid a b} {
+ 1 {yyyy vvvvv qq oo yyyyyy vvvv eee} {ffff uu r qq aaaa}
+ 2 {ww oooooo bbbbb ssssss mm} {ffffff yy iiii rr s ccc qqqqq}
+ 3 {zzzz llll gggggg cccc uu} {hhhhhh aaaa ppppp rr ee jjjj}
+ 4 {r f i rrrrrr ww hhh} {aa yyy t x aaaaa ii}
+ 5 {fffff mm vvvv ooo ffffff kkkk tttt} {cccccc bb e zzz d n}
+ 6 {iii dddd hh qqqq ddd ooo} {ttt d c b aaaaaa qqqq}
+ 7 {jjjj rrrr v zzzzz u tt t} {ppppp pp dddd mm hhh uuu}
+ 8 {gggg rrrrrr kkkk vvvv gggg jjjjjj b} {dddddd jj r w cccc wwwwww ss}
+ 9 {kkkkk qqq oooo e tttttt mmm} {e ss qqqqqq hhhh llllll gg}
+ } {
+ execsql { INSERT INTO t2(rowid, a, b) VALUES($rowid, $a, $b) }
+ }
+} {}
+
+foreach {tn q res} {
+ 1 {a} {
+ 1 {yyyy vvvvv qq oo yyyyyy vvvv eee} {ffff uu r qq [aaaa]}
+ 3 {zzzz llll gggggg cccc uu} {hhhhhh [aaaa] ppppp rr ee jjjj}
+ 4 {r f i rrrrrr ww hhh} {[aa] yyy t x [aaaaa] ii}
+ 6 {iii dddd hh qqqq ddd ooo} {ttt d c b [aaaaaa] qqqq}
+ }
+
+ 2 {a AND q} {
+ 1 {yyyy vvvvv [qq] oo yyyyyy vvvv eee} {ffff uu r [qq] [aaaa]}
+ 6 {iii dddd hh [qqqq] ddd ooo} {ttt d c b [aaaaaa] [qqqq]}
+ }
+
+ 3 {o OR (q AND a)} {
+ 1 {yyyy vvvvv [qq] [oo] yyyyyy vvvv eee} {ffff uu r [qq] [aaaa]}
+ 2 {ww [oooooo] bbbbb ssssss mm} {ffffff yy iiii rr s ccc qqqqq}
+ 5 {fffff mm vvvv [ooo] ffffff kkkk tttt} {cccccc bb e zzz d n}
+ 6 {iii dddd hh [qqqq] ddd [ooo]} {ttt d c b [aaaaaa] [qqqq]}
+ 9 {kkkkk qqq [oooo] e tttttt mmm} {e ss qqqqqq hhhh llllll gg}
+ }
+
+ 4 {NEAR(q y, 20)} {
+ 1 {[yyyy] vvvvv [qq] oo [yyyyyy] vvvv eee} {ffff uu r qq aaaa}
+ 2 {ww oooooo bbbbb ssssss mm} {ffffff [yy] iiii rr s ccc [qqqqq]}
+ }
+} {
+ do_execsql_test 6.1.$tn.asc {
+ SELECT rowid, highlight(t2, 0, '[', ']'), highlight(t2, 1, '[', ']')
+ FROM t2 WHERE t2 MATCH $q
+ } $res
+
+ set res2 [list]
+ foreach {rowid a b} $res {
+ set res2 [concat [list $rowid $a $b] $res2]
+ }
+
+ do_execsql_test 6.1.$tn.desc {
+ SELECT rowid, highlight(t2, 0, '[', ']'), highlight(t2, 1, '[', ']')
+ FROM t2 WHERE t2 MATCH $q ORDER BY rowid DESC
+ } $res2
+}
+
+do_execsql_test 6.2.1 {
+  INSERT INTO t2(rowid, a, b) VALUES(13,
+      'x xx xxx xxxx xxxxx xxxxxx xxxxxxx', 'y yy yyy yyyy yyyyy yyyyyy yyyyyyy'
+  );
+  SELECT rowid, highlight(t2, 0, '<', '>'), highlight(t2, 1, '(', ')')
+  FROM t2 WHERE t2 MATCH 'x OR y'
+} {
+  1 {<yyyy> vvvvv qq oo <yyyyyy> vvvv eee} {ffff uu r qq aaaa}
+  2 {ww oooooo bbbbb ssssss mm} {ffffff (yy) iiii rr s ccc qqqqq}
+  4 {r f i rrrrrr ww hhh} {aa (yyy) t (x) aaaaa ii}
+  13 {<x> <xx> <xxx> <xxxx> <xxxxx> <xxxxxx> <xxxxxxx>}
+     {(y) (yy) (yyy) (yyyy) (yyyyy) (yyyyyy) (yyyyyyy)}
+}
+
+#-------------------------------------------------------------------------
+# Test that the xColumnSize() API is not confused by colocated tokens.
+#
+reset_db
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+fts5_aux_test_functions db
+proc tcl_tokenize {tflags text} {
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ if {[string length $w]==1} {
+ for {set i 2} {$i<=10} {incr i} {
+ sqlite3_fts5_token -colo [string repeat $w $i] $iStart $iEnd
+ }
+ }
+ }
+}
+
+do_execsql_test 7.0.1 {
+ CREATE VIRTUAL TABLE t1 USING fts5(a, b, columnsize=1, tokenize=tcl);
+ INSERT INTO t1 VALUES('0 2 3', '4 5 6 7');
+ INSERT INTO t1 VALUES('8 9', '0 0 0 0 0 0 0 0 0 0');
+ SELECT fts5_test_columnsize(t1) FROM t1 WHERE t1 MATCH '000 AND 00 AND 0';
+} {{3 4} {2 10}}
+
+do_execsql_test 7.0.2 {
+ INSERT INTO t1(t1) VALUES('integrity-check');
+}
+
+do_execsql_test 7.1.1 {
+ CREATE VIRTUAL TABLE t2 USING fts5(a, b, columnsize=0, tokenize=tcl);
+ INSERT INTO t2 VALUES('0 2 3', '4 5 6 7');
+ INSERT INTO t2 VALUES('8 9', '0 0 0 0 0 0 0 0 0 0');
+ SELECT fts5_test_columnsize(t2) FROM t2 WHERE t2 MATCH '000 AND 00 AND 0';
+} {{3 4} {2 10}}
+
+do_execsql_test 7.1.2 {
+ INSERT INTO t2(t2) VALUES('integrity-check');
+}
+
+finish_test
+
Index: main.mk
==================================================================
--- main.mk
+++ main.mk
@@ -45,10 +45,11 @@
# This is how we compile
#
TCCX = $(TCC) $(OPTS) -I. -I$(TOP)/src -I$(TOP)
TCCX += -I$(TOP)/ext/rtree -I$(TOP)/ext/icu -I$(TOP)/ext/fts3
TCCX += -I$(TOP)/ext/async -I$(TOP)/ext/userauth
+TCCX += -I$(TOP)/ext/fts5
# Object files for the SQLite library.
#
LIBOBJ+= vdbe.o parse.o \
alter.o analyze.o attach.o auth.o \
@@ -227,10 +228,33 @@
SRC += \
$(TOP)/ext/rbu/sqlite3rbu.c \
$(TOP)/ext/rbu/sqlite3rbu.h
+
+# FTS5 things
+#
+FTS5_HDR = \
+ $(TOP)/ext/fts5/fts5.h \
+ $(TOP)/ext/fts5/fts5Int.h \
+ fts5parse.h
+
+FTS5_SRC = \
+   $(TOP)/ext/fts5/fts5_aux.c \
+   $(TOP)/ext/fts5/fts5_buffer.c \
+   $(TOP)/ext/fts5/fts5_main.c \
+   $(TOP)/ext/fts5/fts5_config.c \
+   $(TOP)/ext/fts5/fts5_expr.c \
+   $(TOP)/ext/fts5/fts5_hash.c \
+   $(TOP)/ext/fts5/fts5_index.c \
+   fts5parse.c \
+   $(TOP)/ext/fts5/fts5_storage.c \
+   $(TOP)/ext/fts5/fts5_tokenize.c \
+   $(TOP)/ext/fts5/fts5_unicode2.c \
+   $(TOP)/ext/fts5/fts5_varint.c \
+   $(TOP)/ext/fts5/fts5_vocab.c
+
# Generated source code files
#
SRC += \
keywordhash.h \
@@ -634,40 +658,20 @@
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_write.c
rtree.o: $(TOP)/ext/rtree/rtree.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/rtree/rtree.c
-# FTS5 things
-#
-FTS5_SRC = \
- $(TOP)/ext/fts5/fts5.h \
- $(TOP)/ext/fts5/fts5Int.h \
- $(TOP)/ext/fts5/fts5_aux.c \
- $(TOP)/ext/fts5/fts5_buffer.c \
- $(TOP)/ext/fts5/fts5_main.c \
- $(TOP)/ext/fts5/fts5_config.c \
- $(TOP)/ext/fts5/fts5_expr.c \
- $(TOP)/ext/fts5/fts5_hash.c \
- $(TOP)/ext/fts5/fts5_index.c \
- fts5parse.c fts5parse.h \
- $(TOP)/ext/fts5/fts5_storage.c \
- $(TOP)/ext/fts5/fts5_tokenize.c \
- $(TOP)/ext/fts5/fts5_unicode2.c \
- $(TOP)/ext/fts5/fts5_varint.c \
- $(TOP)/ext/fts5/fts5_vocab.c \
-
fts5parse.c: $(TOP)/ext/fts5/fts5parse.y lemon
cp $(TOP)/ext/fts5/fts5parse.y .
rm -f fts5parse.h
./lemon $(OPTS) fts5parse.y
fts5parse.h: fts5parse.c
-fts5.c: $(FTS5_SRC)
+fts5.c: $(FTS5_SRC) $(FTS5_HDR)
tclsh $(TOP)/ext/fts5/tool/mkfts5c.tcl
cp $(TOP)/ext/fts5/fts5.h .
-
userauth.o: $(TOP)/ext/userauth/userauth.c $(HDR) $(EXTHDR)
$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/userauth/userauth.c
sqlite3rbu.o: $(TOP)/ext/rbu/sqlite3rbu.c $(HDR) $(EXTHDR)