/*
** 2009 Oct 23
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
******************************************************************************
*/

#include "fts3Int.h"
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)

/* memset(), memcpy(), strlen() and strcmp() are used below. */
#include <string.h>
/* assert() is used below. */
#include <assert.h>

/*
** Characters that may appear in the second argument to matchinfo(). The
** trailing comment on each line gives the number of 32-bit values that the
** directive contributes to the matchinfo() output (see fts3MatchinfoSize()).
*/
#define FTS3_MATCHINFO_NPHRASE   'p'        /* 1 value */
#define FTS3_MATCHINFO_NCOL      'c'        /* 1 value */
#define FTS3_MATCHINFO_NDOC      'n'        /* 1 value */
#define FTS3_MATCHINFO_AVGLENGTH 'a'        /* nCol values */
#define FTS3_MATCHINFO_LENGTH    'l'        /* nCol values */
#define FTS3_MATCHINFO_LCS       's'        /* nCol values */
#define FTS3_MATCHINFO_HITS      'x'        /* 3*nCol*nPhrase values */

/*
** The default value for the second argument to matchinfo().
*/
#define FTS3_MATCHINFO_DEFAULT   "pcx"


/*
** Used as an fts3ExprIterate() context by fts3ExprLoadDoclists(). The
** callback accumulates the number of matchable phrases and the total
** number of tokens in those phrases.
*/
typedef struct LoadDoclistCtx LoadDoclistCtx;
struct LoadDoclistCtx {
  Fts3Cursor *pCsr;               /* FTS3 Cursor */
  int nPhrase;                    /* Number of phrases seen so far */
  int nToken;                     /* Number of tokens seen so far */
};

/*
** The following types are used as part of the implementation of the
** fts3BestSnippet() routine.
*/
typedef struct SnippetIter SnippetIter;
typedef struct SnippetPhrase SnippetPhrase;
typedef struct SnippetFragment SnippetFragment;

/*
** Iterator over the candidate snippets of a single column. It is set up by
** fts3BestSnippet() (which starts with iCurrent==-1) and stepped by
** fts3SnippetNextCandidate().
*/
struct SnippetIter {
  Fts3Cursor *pCsr;               /* Cursor snippet is being generated from */
  int iCol;                       /* Extract snippet from this column */
  int nSnippet;                   /* Requested snippet length (in tokens) */
  int nPhrase;                    /* Number of phrases in query */
  SnippetPhrase *aPhrase;         /* Array of size nPhrase */
  int iCurrent;                   /* First token of current snippet */
};

/*
** Per-phrase state belonging to a SnippetIter. Two independent cursors are
** kept on the same position list: the "head" cursor and the "tail" cursor.
** A NULL pHead/pTail means the corresponding cursor has run off the end of
** the list (or that the phrase has no hits in the column at all).
*/
struct SnippetPhrase {
  int nToken;                     /* Number of tokens in phrase */
  char *pList;                    /* Pointer to start of phrase position list */
  int iHead;                      /* Next value in position list */
  char *pHead;                    /* Position list data following iHead */
  int iTail;                      /* Next value in trailing position list */
  char *pTail;                    /* Position list data following iTail */
};

/*
** Description of one selected snippet fragment. Both masks are bitmasks:
** bit i of "covered" corresponds to phrase i of the query, bit k of
** "hlmask" to the k'th token of the fragment.
*/
struct SnippetFragment {
  int iCol;                       /* Column snippet is extracted from */
  int iPos;                       /* Index of first token in snippet */
  u64 covered;                    /* Mask of query phrases covered */
  u64 hlmask;                     /* Mask of snippet terms to highlight */
};

/*
** This type is used as an fts3ExprIterate() context object while
** accumulating the data returned by the matchinfo() function.
**
** Note that fts3MatchinfoValues() advances aMatchinfo past the values
** written for each format character as it goes, so within the callbacks
** aMatchinfo points at the start of the current directive's output.
*/
typedef struct MatchInfo MatchInfo;
struct MatchInfo {
  Fts3Cursor *pCursor;            /* FTS3 Cursor */
  int nCol;                       /* Number of columns in table */
  int nPhrase;                    /* Number of matchable phrases in query */
  sqlite3_int64 nDoc;             /* Number of docs in database */
  u32 *aMatchinfo;                /* Pre-allocated buffer */
};

/*
** The snippet() and offsets() functions both return text values. An instance
** of the following structure is used to accumulate those values while the
** functions are running. See fts3StringAppend() for details.
*/
typedef struct StrBuffer StrBuffer;
struct StrBuffer {
  char *z;                        /* Pointer to buffer containing string */
  int n;                          /* Length of z in bytes (excl. nul-term) */
  int nAlloc;                     /* Allocated size of buffer z in bytes */
};


/*
** This function is used to help iterate through a position-list.
A position ** list is a list of unique integers, sorted from smallest to largest. Each ** element of the list is represented by an FTS3 varint that takes the value ** of the difference between the current element and the previous one plus ** two. For example, to store the position-list: ** ** 4 9 113 ** ** the three varints: ** ** 6 7 106 ** ** are encoded. ** ** When this function is called, *pp points to the start of an element of ** the list. *piPos contains the value of the previous entry in the list. ** After it returns, *piPos contains the value of the next element of the ** list and *pp is advanced to the following varint. */ static void fts3GetDeltaPosition(char **pp, int *piPos){ int iVal; *pp += sqlite3Fts3GetVarint32(*pp, &iVal); *piPos += (iVal-2); } /* ** Helper function for fts3ExprIterate() (see below). */ static int fts3ExprIterate2( Fts3Expr *pExpr, /* Expression to iterate phrases of */ int *piPhrase, /* Pointer to phrase counter */ int (*x)(Fts3Expr*,int,void*), /* Callback function to invoke for phrases */ void *pCtx /* Second argument to pass to callback */ ){ int rc; /* Return code */ int eType = pExpr->eType; /* Type of expression node pExpr */ if( eType!=FTSQUERY_PHRASE ){ assert( pExpr->pLeft && pExpr->pRight ); rc = fts3ExprIterate2(pExpr->pLeft, piPhrase, x, pCtx); if( rc==SQLITE_OK && eType!=FTSQUERY_NOT ){ rc = fts3ExprIterate2(pExpr->pRight, piPhrase, x, pCtx); } }else{ rc = x(pExpr, *piPhrase, pCtx); (*piPhrase)++; } return rc; } /* ** Iterate through all phrase nodes in an FTS3 query, except those that ** are part of a sub-tree that is the right-hand-side of a NOT operator. ** For each phrase node found, the supplied callback function is invoked. ** ** If the callback function returns anything other than SQLITE_OK, ** the iteration is abandoned and the error code returned immediately. ** Otherwise, SQLITE_OK is returned after a callback has been made for ** all eligible phrase nodes. 
*/
static int fts3ExprIterate(
  Fts3Expr *pExpr,                /* Expression to iterate phrases of */
  int (*x)(Fts3Expr*,int,void*),  /* Callback function to invoke for phrases */
  void *pCtx                      /* Second argument to pass to callback */
){
  int nVisited = 0;               /* Phrase numbers are handed out from 0 */
  return fts3ExprIterate2(pExpr, &nVisited, x, pCtx);
}

/*
** This is the fts3ExprIterate() callback used by fts3ExprLoadDoclists().
** The context is a LoadDoclistCtx. Each invocation adds one to its phrase
** count and adds the number of tokens in the visited phrase to its token
** count. The callback cannot fail: it always returns SQLITE_OK.
*/
static int fts3ExprLoadDoclistsCb(Fts3Expr *pExpr, int iPhrase, void *ctx){
  LoadDoclistCtx *pLoad = (LoadDoclistCtx *)ctx;

  UNUSED_PARAMETER(iPhrase);

  pLoad->nPhrase += 1;
  pLoad->nToken += pExpr->pPhrase->nToken;
  return SQLITE_OK;
}

/*
** Load the doclists for each phrase in the query associated with FTS3 cursor
** pCsr.
**
** If pnPhrase is not NULL, then *pnPhrase is set to the number of matchable
** phrases in the expression (all phrases except those directly or
** indirectly descended from the right-hand-side of a NOT operator). If
** pnToken is not NULL, then it is set to the number of tokens in all
** matchable phrases of the expression.
*/ static int fts3ExprLoadDoclists( Fts3Cursor *pCsr, /* Fts3 cursor for current query */ int *pnPhrase, /* OUT: Number of phrases in query */ int *pnToken /* OUT: Number of tokens in query */ ){ int rc; /* Return Code */ LoadDoclistCtx sCtx = {0,0,0}; /* Context for fts3ExprIterate() */ sCtx.pCsr = pCsr; rc = fts3ExprIterate(pCsr->pExpr, fts3ExprLoadDoclistsCb, (void *)&sCtx); if( pnPhrase ) *pnPhrase = sCtx.nPhrase; if( pnToken ) *pnToken = sCtx.nToken; return rc; } static int fts3ExprPhraseCountCb(Fts3Expr *pExpr, int iPhrase, void *ctx){ (*(int *)ctx)++; UNUSED_PARAMETER(pExpr); UNUSED_PARAMETER(iPhrase); return SQLITE_OK; } static int fts3ExprPhraseCount(Fts3Expr *pExpr){ int nPhrase = 0; (void)fts3ExprIterate(pExpr, fts3ExprPhraseCountCb, (void *)&nPhrase); return nPhrase; } /* ** Advance the position list iterator specified by the first two ** arguments so that it points to the first element with a value greater ** than or equal to parameter iNext. */ static void fts3SnippetAdvance(char **ppIter, int *piIter, int iNext){ char *pIter = *ppIter; if( pIter ){ int iIter = *piIter; while( iIteriCurrent<0 ){ /* The SnippetIter object has just been initialized. The first snippet ** candidate always starts at offset 0 (even if this candidate has a ** score of 0.0). */ pIter->iCurrent = 0; /* Advance the 'head' iterator of each phrase to the first offset that ** is greater than or equal to (iNext+nSnippet). 
*/ for(i=0; inPhrase; i++){ SnippetPhrase *pPhrase = &pIter->aPhrase[i]; fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, pIter->nSnippet); } }else{ int iStart; int iEnd = 0x7FFFFFFF; for(i=0; inPhrase; i++){ SnippetPhrase *pPhrase = &pIter->aPhrase[i]; if( pPhrase->pHead && pPhrase->iHeadiHead; } } if( iEnd==0x7FFFFFFF ){ return 1; } pIter->iCurrent = iStart = iEnd - pIter->nSnippet + 1; for(i=0; inPhrase; i++){ SnippetPhrase *pPhrase = &pIter->aPhrase[i]; fts3SnippetAdvance(&pPhrase->pHead, &pPhrase->iHead, iEnd+1); fts3SnippetAdvance(&pPhrase->pTail, &pPhrase->iTail, iStart); } } return 0; } /* ** Retrieve information about the current candidate snippet of snippet ** iterator pIter. */ static void fts3SnippetDetails( SnippetIter *pIter, /* Snippet iterator */ u64 mCovered, /* Bitmask of phrases already covered */ int *piToken, /* OUT: First token of proposed snippet */ int *piScore, /* OUT: "Score" for this snippet */ u64 *pmCover, /* OUT: Bitmask of phrases covered */ u64 *pmHighlight /* OUT: Bitmask of terms to highlight */ ){ int iStart = pIter->iCurrent; /* First token of snippet */ int iScore = 0; /* Score of this snippet */ int i; /* Loop counter */ u64 mCover = 0; /* Mask of phrases covered by this snippet */ u64 mHighlight = 0; /* Mask of tokens to highlight in snippet */ for(i=0; inPhrase; i++){ SnippetPhrase *pPhrase = &pIter->aPhrase[i]; if( pPhrase->pTail ){ char *pCsr = pPhrase->pTail; int iCsr = pPhrase->iTail; while( iCsr<(iStart+pIter->nSnippet) ){ int j; u64 mPhrase = (u64)1 << i; u64 mPos = (u64)1 << (iCsr - iStart); assert( iCsr>=iStart ); if( (mCover|mCovered)&mPhrase ){ iScore++; }else{ iScore += 1000; } mCover |= mPhrase; for(j=0; jnToken; j++){ mHighlight |= (mPos>>j); } if( 0==(*pCsr & 0x0FE) ) break; fts3GetDeltaPosition(&pCsr, &iCsr); } } } /* Set the output variables before returning. 
*/ *piToken = iStart; *piScore = iScore; *pmCover = mCover; *pmHighlight = mHighlight; } /* ** This function is an fts3ExprIterate() callback used by fts3BestSnippet(). ** Each invocation populates an element of the SnippetIter.aPhrase[] array. */ static int fts3SnippetFindPositions(Fts3Expr *pExpr, int iPhrase, void *ctx){ SnippetIter *p = (SnippetIter *)ctx; SnippetPhrase *pPhrase = &p->aPhrase[iPhrase]; char *pCsr; int rc; pPhrase->nToken = pExpr->pPhrase->nToken; rc = sqlite3Fts3EvalPhrasePoslist(p->pCsr, pExpr, p->iCol, &pCsr); assert( rc==SQLITE_OK || pCsr==0 ); if( pCsr ){ int iFirst = 0; pPhrase->pList = pCsr; fts3GetDeltaPosition(&pCsr, &iFirst); assert( iFirst>=0 ); pPhrase->pHead = pCsr; pPhrase->pTail = pCsr; pPhrase->iHead = iFirst; pPhrase->iTail = iFirst; }else{ assert( rc!=SQLITE_OK || ( pPhrase->pList==0 && pPhrase->pHead==0 && pPhrase->pTail==0 )); } return rc; } /* ** Select the fragment of text consisting of nFragment contiguous tokens ** from column iCol that represent the "best" snippet. The best snippet ** is the snippet with the highest score, where scores are calculated ** by adding: ** ** (a) +1 point for each occurence of a matchable phrase in the snippet. ** ** (b) +1000 points for the first occurence of each matchable phrase in ** the snippet for which the corresponding mCovered bit is not set. ** ** The selected snippet parameters are stored in structure *pFragment before ** returning. The score of the selected snippet is stored in *piScore ** before returning. 
*/
static int fts3BestSnippet(
  int nSnippet,                   /* Desired snippet length */
  Fts3Cursor *pCsr,               /* Cursor to create snippet for */
  int iCol,                       /* Index of column to create snippet from */
  u64 mCovered,                   /* Mask of phrases already covered */
  u64 *pmSeen,                    /* IN/OUT: Mask of phrases seen */
  SnippetFragment *pFragment,     /* OUT: Best snippet found */
  int *piScore                    /* OUT: Score of snippet pFragment */
){
  int rc;                         /* Return Code */
  int nList;                      /* Number of phrases in expression */
  SnippetIter sIter;              /* Iterates through snippet candidates */
  int nByte;                      /* Number of bytes of space to allocate */
  int iBestScore = -1;            /* Best snippet score found so far */
  int i;                          /* Loop counter */

  memset(&sIter, 0, sizeof(sIter));

  /* Iterate through the phrases in the expression to count them. */
  rc = fts3ExprLoadDoclists(pCsr, &nList, 0);
  if( rc!=SQLITE_OK ){
    return rc;
  }

  /* Now that it is known how many phrases there are, allocate and zero
  ** the required space using malloc().
  */
  nByte = sizeof(SnippetPhrase) * nList;
  sIter.aPhrase = (SnippetPhrase *)sqlite3_malloc(nByte);
  if( !sIter.aPhrase ){
    return SQLITE_NOMEM;
  }
  memset(sIter.aPhrase, 0, nByte);

  /* Initialize the contents of the SnippetIter object. Then iterate through
  ** the set of phrases in the expression to populate the aPhrase[] array.
  ** An error from the callback is reported to the caller rather than being
  ** discarded, so that a snippet is never built from partial data.
  */
  sIter.pCsr = pCsr;
  sIter.iCol = iCol;
  sIter.nSnippet = nSnippet;
  sIter.nPhrase = nList;
  sIter.iCurrent = -1;
  rc = fts3ExprIterate(pCsr->pExpr, fts3SnippetFindPositions, (void *)&sIter);

  if( rc==SQLITE_OK ){
    /* Set the *pmSeen output variable. A phrase counts as "seen" if it has
    ** at least one hit in column iCol.
    */
    for(i=0; i<nList; i++){
      if( sIter.aPhrase[i].pHead ){
        *pmSeen |= (u64)1 << i;
      }
    }

    /* Loop through all candidate snippets. Store the best snippet in
    ** *pFragment. Store its associated 'score' in iBestScore.
    */
    pFragment->iCol = iCol;
    while( !fts3SnippetNextCandidate(&sIter) ){
      int iPos;
      int iScore;
      u64 mCover;
      u64 mHighlight;
      fts3SnippetDetails(
          &sIter, mCovered, &iPos, &iScore, &mCover, &mHighlight
      );
      assert( iScore>=0 );
      if( iScore>iBestScore ){
        pFragment->iPos = iPos;
        pFragment->hlmask = mHighlight;
        pFragment->covered = mCover;
        iBestScore = iScore;
      }
    }
  }

  sqlite3_free(sIter.aPhrase);
  *piScore = iBestScore;
  return rc;
}


/*
** Append a string to the string-buffer passed as the first argument.
**
** If nAppend is negative, then the length of the string zAppend is
** determined using strlen().
**
** Return SQLITE_OK if successful, or SQLITE_NOMEM if the buffer could not
** be enlarged. In the latter case the buffer is left unmodified. The
** buffer is always nul-terminated after a successful call.
*/
static int fts3StringAppend(
  StrBuffer *pStr,                /* Buffer to append to */
  const char *zAppend,            /* Pointer to data to append to buffer */
  int nAppend                     /* Size of zAppend in bytes (or -1) */
){
  if( nAppend<0 ){
    nAppend = (int)strlen(zAppend);
  }

  /* If there is insufficient space allocated at StrBuffer.z, use realloc()
  ** to grow the buffer so that it is big enough to accommodate the
  ** appended data.
  */
  if( pStr->n+nAppend+1>=pStr->nAlloc ){
    int nAlloc = pStr->nAlloc+nAppend+100;
    char *zNew = sqlite3_realloc(pStr->z, nAlloc);
    if( !zNew ){
      return SQLITE_NOMEM;
    }
    pStr->z = zNew;
    pStr->nAlloc = nAlloc;
  }

  /* Append the data to the string buffer. */
  memcpy(&pStr->z[pStr->n], zAppend, nAppend);
  pStr->n += nAppend;
  pStr->z[pStr->n] = '\0';

  return SQLITE_OK;
}

/*
** The fts3BestSnippet() function often selects snippets that end with a
** query term. That is, the final term of the snippet is always a term
** that requires highlighting. For example, if 'X' is a highlighted term
** and '.' is a non-highlighted term, BestSnippet() may select:
**
**     ........X.....X
**
** This function "shifts" the beginning of the snippet forward in the
** document so that there are approximately the same number of
** non-highlighted terms to the right of the final highlighted term as there
** are to the left of the first highlighted term. For example, to this:
**
**     ....X.....X....
** ** This is done as part of extracting the snippet text, not when selecting ** the snippet. Snippet selection is done based on doclists only, so there ** is no way for fts3BestSnippet() to know whether or not the document ** actually contains terms that follow the final highlighted term. */ static int fts3SnippetShift( Fts3Table *pTab, /* FTS3 table snippet comes from */ int iLangid, /* Language id to use in tokenizing */ int nSnippet, /* Number of tokens desired for snippet */ const char *zDoc, /* Document text to extract snippet from */ int nDoc, /* Size of buffer zDoc in bytes */ int *piPos, /* IN/OUT: First token of snippet */ u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */ ){ u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */ if( hlmask ){ int nLeft; /* Tokens to the left of first highlight */ int nRight; /* Tokens to the right of last highlight */ int nDesired; /* Ideal number of tokens to shift forward */ for(nLeft=0; !(hlmask & ((u64)1 << nLeft)); nLeft++); for(nRight=0; !(hlmask & ((u64)1 << (nSnippet-1-nRight))); nRight++); nDesired = (nLeft-nRight)/2; /* Ideally, the start of the snippet should be pushed forward in the ** document nDesired tokens. This block checks if there are actually ** nDesired tokens to the right of the snippet. If so, *piPos and ** *pHlMask are updated to shift the snippet nDesired tokens to the ** right. Otherwise, the snippet is shifted by the number of tokens ** available. */ if( nDesired>0 ){ int nShift; /* Number of tokens to shift snippet by */ int iCurrent = 0; /* Token counter */ int rc; /* Return Code */ sqlite3_tokenizer_module *pMod; sqlite3_tokenizer_cursor *pC; pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) ** or more tokens in zDoc/nDoc. 
*/ rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC); if( rc!=SQLITE_OK ){ return rc; } while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){ const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3; rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); } pMod->xClose(pC); if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; } nShift = (rc==SQLITE_DONE)+iCurrent-nSnippet; assert( nShift<=nDesired ); if( nShift>0 ){ *piPos += nShift; *pHlmask = hlmask >> nShift; } } } return SQLITE_OK; } /* ** Extract the snippet text for fragment pFragment from cursor pCsr and ** append it to string buffer pOut. */ static int fts3SnippetText( Fts3Cursor *pCsr, /* FTS3 Cursor */ SnippetFragment *pFragment, /* Snippet to extract */ int iFragment, /* Fragment number */ int isLast, /* True for final fragment in snippet */ int nSnippet, /* Number of tokens in extracted snippet */ const char *zOpen, /* String inserted before highlighted term */ const char *zClose, /* String inserted after highlighted term */ const char *zEllipsis, /* String inserted between snippets */ StrBuffer *pOut /* Write output here */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int rc; /* Return code */ const char *zDoc; /* Document text to extract snippet from */ int nDoc; /* Size of zDoc in bytes */ int iCurrent = 0; /* Current token number of document */ int iEnd = 0; /* Byte offset of end of current token */ int isShiftDone = 0; /* True after snippet is shifted */ int iPos = pFragment->iPos; /* First token of snippet */ u64 hlmask = pFragment->hlmask; /* Highlight-mask for snippet */ int iCol = pFragment->iCol+1; /* Query column to extract text from */ sqlite3_tokenizer_module *pMod; /* Tokenizer module methods object */ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor open on zDoc/nDoc */ const char *ZDUMMY; /* Dummy argument used with tokenizer */ int DUMMY1; /* Dummy argument used with tokenizer */ zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol); if( zDoc==0 
){ if( sqlite3_column_type(pCsr->pStmt, iCol)!=SQLITE_NULL ){ return SQLITE_NOMEM; } return SQLITE_OK; } nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol); /* Open a token cursor on the document. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC); if( rc!=SQLITE_OK ){ return rc; } while( rc==SQLITE_OK ){ int iBegin; /* Offset in zDoc of start of token */ int iFin; /* Offset in zDoc of end of token */ int isHighlight; /* True for highlighted terms */ rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); if( rc!=SQLITE_OK ){ if( rc==SQLITE_DONE ){ /* Special case - the last token of the snippet is also the last token ** of the column. Append any punctuation that occurred between the end ** of the previous token and the end of the document to the output. ** Then break out of the loop. */ rc = fts3StringAppend(pOut, &zDoc[iEnd], -1); } break; } if( iCurrentiLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask ); isShiftDone = 1; /* Now that the shift has been done, check if the initial "..." are ** required. They are required if (a) this is not the first fragment, ** or (b) this fragment does not begin at position 0 of its column. */ if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){ rc = fts3StringAppend(pOut, zEllipsis, -1); } if( rc!=SQLITE_OK || iCurrent=(iPos+nSnippet) ){ if( isLast ){ rc = fts3StringAppend(pOut, zEllipsis, -1); } break; } /* Set isHighlight to true if this term should be highlighted. 
*/ isHighlight = (hlmask & ((u64)1 << (iCurrent-iPos)))!=0; if( iCurrent>iPos ) rc = fts3StringAppend(pOut, &zDoc[iEnd], iBegin-iEnd); if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zOpen, -1); if( rc==SQLITE_OK ) rc = fts3StringAppend(pOut, &zDoc[iBegin], iFin-iBegin); if( rc==SQLITE_OK && isHighlight ) rc = fts3StringAppend(pOut, zClose, -1); iEnd = iFin; } pMod->xClose(pC); return rc; } /* ** This function is used to count the entries in a column-list (a ** delta-encoded list of term offsets within a single column of a single ** row). When this function is called, *ppCollist should point to the ** beginning of the first varint in the column-list (the varint that ** contains the position of the first matching term in the column data). ** Before returning, *ppCollist is set to point to the first byte after ** the last varint in the column-list (either the 0x00 signifying the end ** of the position-list, or the 0x01 that precedes the column number of ** the next column in the position-list). ** ** The number of elements in the column-list is returned. */ static int fts3ColumnlistCount(char **ppCollist){ char *pEnd = *ppCollist; char c = 0; int nEntry = 0; /* A column-list is terminated by either a 0x01 or 0x00. */ while( 0xFE & (*pEnd | c) ){ c = *pEnd++ & 0x80; if( !c ) nEntry++; } *ppCollist = pEnd; return nEntry; } /* ** fts3ExprIterate() callback used to collect the "global" matchinfo stats ** for a single query. ** ** fts3ExprIterate() callback to load the 'global' elements of a ** FTS3_MATCHINFO_HITS matchinfo array. The global stats are those elements ** of the matchinfo array that are constant for all rows returned by the ** current query. ** ** Argument pCtx is actually a pointer to a struct of type MatchInfo. 
This ** function populates Matchinfo.aMatchinfo[] as follows: ** ** for(iCol=0; iColpCursor, pExpr, &p->aMatchinfo[3*iPhrase*p->nCol] ); } /* ** fts3ExprIterate() callback used to collect the "local" part of the ** FTS3_MATCHINFO_HITS array. The local stats are those elements of the ** array that are different for each row returned by the query. */ static int fts3ExprLocalHitsCb( Fts3Expr *pExpr, /* Phrase expression node */ int iPhrase, /* Phrase number */ void *pCtx /* Pointer to MatchInfo structure */ ){ int rc = SQLITE_OK; MatchInfo *p = (MatchInfo *)pCtx; int iStart = iPhrase * p->nCol * 3; int i; for(i=0; inCol && rc==SQLITE_OK; i++){ char *pCsr; rc = sqlite3Fts3EvalPhrasePoslist(p->pCursor, pExpr, i, &pCsr); if( pCsr ){ p->aMatchinfo[iStart+i*3] = fts3ColumnlistCount(&pCsr); }else{ p->aMatchinfo[iStart+i*3] = 0; } } return rc; } static int fts3MatchinfoCheck( Fts3Table *pTab, char cArg, char **pzErr ){ if( (cArg==FTS3_MATCHINFO_NPHRASE) || (cArg==FTS3_MATCHINFO_NCOL) || (cArg==FTS3_MATCHINFO_NDOC && pTab->bFts4) || (cArg==FTS3_MATCHINFO_AVGLENGTH && pTab->bFts4) || (cArg==FTS3_MATCHINFO_LENGTH && pTab->bHasDocsize) || (cArg==FTS3_MATCHINFO_LCS) || (cArg==FTS3_MATCHINFO_HITS) ){ return SQLITE_OK; } *pzErr = sqlite3_mprintf("unrecognized matchinfo request: %c", cArg); return SQLITE_ERROR; } static int fts3MatchinfoSize(MatchInfo *pInfo, char cArg){ int nVal; /* Number of integers output by cArg */ switch( cArg ){ case FTS3_MATCHINFO_NDOC: case FTS3_MATCHINFO_NPHRASE: case FTS3_MATCHINFO_NCOL: nVal = 1; break; case FTS3_MATCHINFO_AVGLENGTH: case FTS3_MATCHINFO_LENGTH: case FTS3_MATCHINFO_LCS: nVal = pInfo->nCol; break; default: assert( cArg==FTS3_MATCHINFO_HITS ); nVal = pInfo->nCol * pInfo->nPhrase * 3; break; } return nVal; } static int fts3MatchinfoSelectDoctotal( Fts3Table *pTab, sqlite3_stmt **ppStmt, sqlite3_int64 *pnDoc, const char **paLen ){ sqlite3_stmt *pStmt; const char *a; sqlite3_int64 nDoc; if( !*ppStmt ){ int rc = sqlite3Fts3SelectDoctotal(pTab, 
ppStmt); if( rc!=SQLITE_OK ) return rc; } pStmt = *ppStmt; assert( sqlite3_data_count(pStmt)==1 ); a = sqlite3_column_blob(pStmt, 0); a += sqlite3Fts3GetVarint(a, &nDoc); if( nDoc==0 ) return FTS_CORRUPT_VTAB; *pnDoc = (u32)nDoc; if( paLen ) *paLen = a; return SQLITE_OK; } /* ** An instance of the following structure is used to store state while ** iterating through a multi-column position-list corresponding to the ** hits for a single phrase on a single row in order to calculate the ** values for a matchinfo() FTS3_MATCHINFO_LCS request. */ typedef struct LcsIterator LcsIterator; struct LcsIterator { Fts3Expr *pExpr; /* Pointer to phrase expression */ int iPosOffset; /* Tokens count up to end of this phrase */ char *pRead; /* Cursor used to iterate through aDoclist */ int iPos; /* Current position */ }; /* ** If LcsIterator.iCol is set to the following value, the iterator has ** finished iterating through all offsets for all columns. */ #define LCS_ITERATOR_FINISHED 0x7FFFFFFF; static int fts3MatchinfoLcsCb( Fts3Expr *pExpr, /* Phrase expression node */ int iPhrase, /* Phrase number (numbered from zero) */ void *pCtx /* Pointer to MatchInfo structure */ ){ LcsIterator *aIter = (LcsIterator *)pCtx; aIter[iPhrase].pExpr = pExpr; return SQLITE_OK; } /* ** Advance the iterator passed as an argument to the next position. Return ** 1 if the iterator is at EOF or if it now points to the start of the ** position list for the next column. */ static int fts3LcsIteratorAdvance(LcsIterator *pIter){ char *pRead = pIter->pRead; sqlite3_int64 iRead; int rc = 0; pRead += sqlite3Fts3GetVarint(pRead, &iRead); if( iRead==0 || iRead==1 ){ pRead = 0; rc = 1; }else{ pIter->iPos += (int)(iRead-2); } pIter->pRead = pRead; return rc; } /* ** This function implements the FTS3_MATCHINFO_LCS matchinfo() flag. ** ** If the call is successful, the longest-common-substring lengths for each ** column are written into the first nCol elements of the pInfo->aMatchinfo[] ** array before returning. 
SQLITE_OK is returned in this case.
**
** Otherwise, if an error occurs, an SQLite error code is returned and the
** data written to the first nCol elements of pInfo->aMatchinfo[] is
** undefined.
*/
static int fts3MatchinfoLcs(Fts3Cursor *pCsr, MatchInfo *pInfo){
  LcsIterator *aIter;
  int i;
  int iCol;
  int nToken = 0;
  int rc = SQLITE_OK;

  /* Allocate and populate the array of LcsIterator objects. The array
  ** contains one element for each matchable phrase in the query.
  **/
  aIter = sqlite3_malloc(sizeof(LcsIterator) * pCsr->nPhrase);
  if( !aIter ) return SQLITE_NOMEM;
  memset(aIter, 0, sizeof(LcsIterator) * pCsr->nPhrase);
  (void)fts3ExprIterate(pCsr->pExpr, fts3MatchinfoLcsCb, (void*)aIter);

  /* Positions are normalized by subtracting the number of query tokens up
  ** to and including the phrase. Consecutive phrases that appear back to
  ** back in the document then share the same normalized position.
  */
  for(i=0; i<pInfo->nPhrase; i++){
    LcsIterator *pIter = &aIter[i];
    nToken -= pIter->pExpr->pPhrase->nToken;
    pIter->iPosOffset = nToken;
  }

  for(iCol=0; iCol<pInfo->nCol; iCol++){
    int nLcs = 0;                 /* LCS value for this column */
    int nLive = 0;                /* Number of iterators in aIter not at EOF */

    for(i=0; i<pInfo->nPhrase; i++){
      LcsIterator *pIt = &aIter[i];
      rc = sqlite3Fts3EvalPhrasePoslist(pCsr, pIt->pExpr, iCol, &pIt->pRead);
      if( rc!=SQLITE_OK ) goto matchinfo_lcs_out;
      if( pIt->pRead ){
        pIt->iPos = pIt->iPosOffset;
        fts3LcsIteratorAdvance(pIt);
        if( pIt->pRead==0 ){
          /* A position list with no positions in it. Counting this
          ** iterator as live would end with a NULL iterator being
          ** advanced below.
          */
          rc = FTS_CORRUPT_VTAB;
          goto matchinfo_lcs_out;
        }
        nLive++;
      }
    }

    while( nLive>0 ){
      LcsIterator *pAdv = 0;      /* The iterator to advance by one position */
      int nThisLcs = 0;           /* LCS for the current iterator positions */

      for(i=0; i<pInfo->nPhrase; i++){
        LcsIterator *pIter = &aIter[i];
        if( pIter->pRead==0 ){
          /* This iterator is already at EOF for this column. */
          nThisLcs = 0;
        }else{
          /* The live iterator with the smallest position is advanced. */
          if( pAdv==0 || pIter->iPos<pAdv->iPos ){
            pAdv = pIter;
          }
          /* pIter[-1] is only read when nThisLcs>0, which implies i>0. */
          if( nThisLcs==0 || pIter->iPos==pIter[-1].iPos ){
            nThisLcs++;
          }else{
            nThisLcs = 1;
          }
          if( nThisLcs>nLcs ) nLcs = nThisLcs;
        }
      }
      if( fts3LcsIteratorAdvance(pAdv) ) nLive--;
    }

    pInfo->aMatchinfo[iCol] = nLcs;
  }

 matchinfo_lcs_out:
  /* Release the iterator array on both the success and the error paths. */
  sqlite3_free(aIter);
  return rc;
}

/*
** Populate the buffer pInfo->aMatchinfo[] with an array of integers to
** be returned by the matchinfo() function.
Argument zArg contains the ** format string passed as the second argument to matchinfo (or the ** default value "pcx" if no second argument was specified). The format ** string has already been validated and the pInfo->aMatchinfo[] array ** is guaranteed to be large enough for the output. ** ** If bGlobal is true, then populate all fields of the matchinfo() output. ** If it is false, then assume that those fields that do not change between ** rows (i.e. FTS3_MATCHINFO_NPHRASE, NCOL, NDOC, AVGLENGTH and part of HITS) ** have already been populated. ** ** Return SQLITE_OK if successful, or an SQLite error code if an error ** occurs. If a value other than SQLITE_OK is returned, the state the ** pInfo->aMatchinfo[] buffer is left in is undefined. */ static int fts3MatchinfoValues( Fts3Cursor *pCsr, /* FTS3 cursor object */ int bGlobal, /* True to grab the global stats */ MatchInfo *pInfo, /* Matchinfo context object */ const char *zArg /* Matchinfo format string */ ){ int rc = SQLITE_OK; int i; Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; sqlite3_stmt *pSelect = 0; for(i=0; rc==SQLITE_OK && zArg[i]; i++){ switch( zArg[i] ){ case FTS3_MATCHINFO_NPHRASE: if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nPhrase; break; case FTS3_MATCHINFO_NCOL: if( bGlobal ) pInfo->aMatchinfo[0] = pInfo->nCol; break; case FTS3_MATCHINFO_NDOC: if( bGlobal ){ sqlite3_int64 nDoc = 0; rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, 0); pInfo->aMatchinfo[0] = (u32)nDoc; } break; case FTS3_MATCHINFO_AVGLENGTH: if( bGlobal ){ sqlite3_int64 nDoc; /* Number of rows in table */ const char *a; /* Aggregate column length array */ rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &nDoc, &a); if( rc==SQLITE_OK ){ int iCol; for(iCol=0; iColnCol; iCol++){ u32 iVal; sqlite3_int64 nToken; a += sqlite3Fts3GetVarint(a, &nToken); iVal = (u32)(((u32)(nToken&0xffffffff)+nDoc/2)/nDoc); pInfo->aMatchinfo[iCol] = iVal; } } } break; case FTS3_MATCHINFO_LENGTH: { sqlite3_stmt *pSelectDocsize = 0; rc = 
sqlite3Fts3SelectDocsize(pTab, pCsr->iPrevId, &pSelectDocsize);
        if( rc==SQLITE_OK ){
          int iCol;
          const char *a = sqlite3_column_blob(pSelectDocsize, 0);
          /* NOTE(review): the loop condition below reads "iColnCol", which is
          ** not valid C. Text between a '<' and the following '>' appears to
          ** have been stripped from this copy of the file (presumably a
          ** comparison of iCol against an nCol field). Restore it from a
          ** pristine copy before building. */
          for(iCol=0; iColnCol; iCol++){
            sqlite3_int64 nToken;
            /* Each column's value is decoded from the blob as a varint and
            ** stored, truncated to 32 bits, in the output array. */
            a += sqlite3Fts3GetVarint(a, &nToken);
            pInfo->aMatchinfo[iCol] = (u32)nToken;
          }
        }
        sqlite3_reset(pSelectDocsize);
        break;
      }

      case FTS3_MATCHINFO_LCS:
        rc = fts3ExprLoadDoclists(pCsr, 0, 0);
        if( rc==SQLITE_OK ){
          rc = fts3MatchinfoLcs(pCsr, pInfo);
        }
        break;

      default: {
        Fts3Expr *pExpr;
        assert( zArg[i]==FTS3_MATCHINFO_HITS );
        pExpr = pCsr->pExpr;
        rc = fts3ExprLoadDoclists(pCsr, 0, 0);
        if( rc!=SQLITE_OK ) break;
        /* The global-hits pass runs only when the caller asked for global
        ** statistics; the local-hits pass below runs on every call. */
        if( bGlobal ){
          if( pCsr->pDeferred ){
            rc = fts3MatchinfoSelectDoctotal(pTab, &pSelect, &pInfo->nDoc, 0);
            if( rc!=SQLITE_OK ) break;
          }
          rc = fts3ExprIterate(pExpr, fts3ExprGlobalHitsCb,(void*)pInfo);
          if( rc!=SQLITE_OK ) break;
        }
        (void)fts3ExprIterate(pExpr, fts3ExprLocalHitsCb,(void*)pInfo);
        break;
      }
    }

    /* Advance the output pointer past the values written for zArg[i]. */
    pInfo->aMatchinfo += fts3MatchinfoSize(pInfo, zArg[i]);
  }

  sqlite3_reset(pSelect);
  return rc;
}

/*
** Populate pCsr->aMatchinfo[] with data for the current row. The
** 'matchinfo' data is an array of 32-bit unsigned integers (C type u32).
**
** The array and a copy of the format string zArg share one allocation:
** pCsr->zMatchinfo points just past the last u32 element of
** pCsr->aMatchinfo. If a cached array exists but was built for a different
** format string, it is discarded and rebuilt.
**
** Returns SQLITE_OK on success, SQLITE_NOMEM if the array cannot be
** allocated, or whatever error code fts3MatchinfoValues() returns.
*/
static int fts3GetMatchinfo(
  Fts3Cursor *pCsr,               /* FTS3 Cursor object */
  const char *zArg                /* Second argument to matchinfo() function */
){
  MatchInfo sInfo;
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int rc = SQLITE_OK;
  int bGlobal = 0;                /* Collect 'global' stats as well as local */

  memset(&sInfo, 0, sizeof(MatchInfo));
  sInfo.pCursor = pCsr;
  sInfo.nCol = pTab->nColumn;

  /* If there is cached matchinfo() data, but the format string for the
  ** cache does not match the format string for this request, discard
  ** the cached data. Only aMatchinfo is freed: zMatchinfo points into the
  ** same allocation, so it is merely cleared. */
  if( pCsr->zMatchinfo && strcmp(pCsr->zMatchinfo, zArg) ){
    assert( pCsr->aMatchinfo );
    sqlite3_free(pCsr->aMatchinfo);
    pCsr->zMatchinfo = 0;
    pCsr->aMatchinfo = 0;
  }

  /* If Fts3Cursor.aMatchinfo[] is NULL, then this is the first time the
  ** matchinfo function has been called for this query. In this case
  ** allocate the array used to accumulate the matchinfo data and
  ** initialize those elements that are constant for every row. */
  if( pCsr->aMatchinfo==0 ){
    int nMatchinfo = 0;           /* Number of u32 elements in match-info */
    int nArg;                     /* Bytes in zArg */
    int i;                        /* Used to iterate through zArg */

    /* Determine the number of phrases in the query */
    pCsr->nPhrase = fts3ExprPhraseCount(pCsr->pExpr);
    sInfo.nPhrase = pCsr->nPhrase;

    /* Determine the number of integers in the buffer returned by this call. */
    for(i=0; zArg[i]; i++){
      nMatchinfo += fts3MatchinfoSize(&sInfo, zArg[i]);
    }

    /* Allocate space for Fts3Cursor.aMatchinfo[] and Fts3Cursor.zMatchinfo.
    ** The extra nArg+1 bytes hold the nul-terminated copy of zArg. */
    nArg = (int)strlen(zArg);
    pCsr->aMatchinfo = (u32 *)sqlite3_malloc(sizeof(u32)*nMatchinfo + nArg + 1);
    if( !pCsr->aMatchinfo ) return SQLITE_NOMEM;

    pCsr->zMatchinfo = (char *)&pCsr->aMatchinfo[nMatchinfo];
    pCsr->nMatchinfo = nMatchinfo;
    memcpy(pCsr->zMatchinfo, zArg, nArg+1);
    memset(pCsr->aMatchinfo, 0, sizeof(u32)*nMatchinfo);
    pCsr->isMatchinfoNeeded = 1;
    /* A freshly allocated array also needs the 'global' values filled in. */
    bGlobal = 1;
  }

  sInfo.aMatchinfo = pCsr->aMatchinfo;
  sInfo.nPhrase = pCsr->nPhrase;
  /* Values are recomputed only while the flag is set. It is cleared here;
  ** NOTE(review): presumably it is set again elsewhere when the cursor moves
  ** to another row - not visible in this part of the file. */
  if( pCsr->isMatchinfoNeeded ){
    rc = fts3MatchinfoValues(pCsr, bGlobal, &sInfo, zArg);
    pCsr->isMatchinfoNeeded = 0;
  }

  return rc;
}

/*
** Implementation of snippet() function.
**
** If the cursor has no query expression (pCsr->pExpr is NULL) the result
** is set to an empty string.
**
** A non-negative nToken is divided (rounding up) between the fragments
** being tried; a negative nToken requests fragments of -nToken tokens each.
*/
void sqlite3Fts3Snippet(
  sqlite3_context *pCtx,          /* SQLite function call context */
  Fts3Cursor *pCsr,               /* Cursor object */
  const char *zStart,             /* Snippet start text */
  const char *zEnd,               /* Snippet end text */
  const char *zEllipsis,          /* Snippet ellipsis text */
  int iCol,                       /* Extract snippet from this column */
  int nToken                      /* Approximate number of tokens in snippet */
){
  Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab;
  int rc = SQLITE_OK;
  int i;
  StrBuffer res = {0, 0, 0};

  /* The returned text includes up to four fragments of text extracted from
  ** the data in the current row. The first iteration of the for(...) loop
  ** below attempts to locate a single fragment of text nToken tokens in
  ** size that contains at least one instance of all phrases in the query
  ** expression that appear in the current row. If such a fragment of text
  ** cannot be found, the second iteration of the loop attempts to locate
  ** a pair of fragments, and so on.
  */
  int nSnippet = 0;               /* Number of fragments in this snippet */
  SnippetFragment aSnippet[4];    /* Maximum of 4 fragments per snippet */
  int nFToken = -1;               /* Number of tokens in each fragment */

  if( !pCsr->pExpr ){
    sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC);
    return;
  }

  for(nSnippet=1; 1; nSnippet++){

    int iSnip;                    /* Loop counter 0..nSnippet-1 */
    u64 mCovered = 0;             /* Bitmask of phrases covered by snippet */
    u64 mSeen = 0;                /* Bitmask of phrases seen by BestSnippet() */

    if( nToken>=0 ){
      nFToken = (nToken+nSnippet-1) / nSnippet;
    }else{
      nFToken = -1 * nToken;
    }

    /* NOTE(review): the next statement is damaged. Text between a '<' and
    ** the following '>' appears to have been stripped, fusing the header of
    ** the per-fragment loop (iSnip) with the header of the per-column loop
    ** (iRead). The declarations of iBestScore, iRead and pFragment used
    ** below are missing as a result. Restore from a pristine copy. */
    for(iSnip=0; iSnipnColumn; iRead++){
        SnippetFragment sF = {0, 0, 0, 0};
        int iS;
        /* A non-negative iCol restricts the search to that one column. */
        if( iCol>=0 && iRead!=iCol ) continue;

        /* Find the best snippet of nFToken tokens in column iRead. */
        rc = fts3BestSnippet(nFToken, pCsr, iRead, mCovered, &mSeen, &sF, &iS);
        if( rc!=SQLITE_OK ){
          goto snippet_out;
        }
        if( iS>iBestScore ){
          *pFragment = sF;
          iBestScore = iS;
        }
      }

      mCovered |= pFragment->covered;
    }

    /* If all query phrases seen by fts3BestSnippet() are present in at least
    ** one of the nSnippet snippet fragments, break out of the loop.
    */
    assert( (mCovered&mSeen)==mCovered );
    if( mSeen==mCovered || nSnippet==SizeofArray(aSnippet) ) break;
  }

  assert( nFToken>0 );

  /* NOTE(review): the next statement is damaged in the same way. A large
  ** span is missing here: the rest of sqlite3Fts3Snippet() (including the
  ** snippet_out label targeted by the goto above) and the beginning of a
  ** following function that uses p, pExpr, pList, nTerm, iTerm, iPos and rc.
  ** Nothing from this point to the closing brace can be relied upon as
  ** written. Restore from a pristine copy. */
  for(i=0; ipCsr, pExpr, p->iCol, &pList);
  nTerm = pExpr->pPhrase->nToken;
  if( pList ){
    fts3GetDeltaPosition(&pList, &iPos);
    assert( iPos>=0 );
  }

  /* NOTE(review): damaged loop header - the condition and the start of the
  ** body (presumably the declaration of pT) are missing. */
  for(iTerm=0; iTermaTerm[p->iTerm++];
    pT->iOff = nTerm-iTerm-1;
    pT->pList = pList;
    pT->iPos = iPos;
  }

  return rc;
}

/*
** Implementation of offsets() function.
*/ void sqlite3Fts3Offsets( sqlite3_context *pCtx, /* SQLite function call context */ Fts3Cursor *pCsr /* Cursor object */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; sqlite3_tokenizer_module const *pMod = pTab->pTokenizer->pModule; const char *ZDUMMY; /* Dummy argument used with xNext() */ int NDUMMY; /* Dummy argument used with xNext() */ int rc; /* Return Code */ int nToken; /* Number of tokens in query */ int iCol; /* Column currently being processed */ StrBuffer res = {0, 0, 0}; /* Result string */ TermOffsetCtx sCtx; /* Context for fts3ExprTermOffsetInit() */ if( !pCsr->pExpr ){ sqlite3_result_text(pCtx, "", 0, SQLITE_STATIC); return; } memset(&sCtx, 0, sizeof(sCtx)); assert( pCsr->isRequireSeek==0 ); /* Count the number of terms in the query */ rc = fts3ExprLoadDoclists(pCsr, 0, &nToken); if( rc!=SQLITE_OK ) goto offsets_out; /* Allocate the array of TermOffset iterators. */ sCtx.aTerm = (TermOffset *)sqlite3_malloc(sizeof(TermOffset)*nToken); if( 0==sCtx.aTerm ){ rc = SQLITE_NOMEM; goto offsets_out; } sCtx.iDocid = pCsr->iPrevId; sCtx.pCsr = pCsr; /* Loop through the table columns, appending offset information to ** string-buffer res for each column. */ for(iCol=0; iColnColumn; iCol++){ sqlite3_tokenizer_cursor *pC; /* Tokenizer cursor */ int iStart; int iEnd; int iCurrent; const char *zDoc; int nDoc; /* Initialize the contents of sCtx.aTerm[] for column iCol. There is ** no way that this operation can fail, so the return code from ** fts3ExprIterate() can be discarded. */ sCtx.iCol = iCol; sCtx.iTerm = 0; (void)fts3ExprIterate(pCsr->pExpr, fts3ExprTermOffsetInit, (void *)&sCtx); /* Retreive the text stored in column iCol. If an SQL NULL is stored ** in column iCol, jump immediately to the next iteration of the loop. ** If an OOM occurs while retrieving the data (this can happen if SQLite ** needs to transform the data from utf-16 to utf-8), return SQLITE_NOMEM ** to the caller. 
*/ zDoc = (const char *)sqlite3_column_text(pCsr->pStmt, iCol+1); nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol+1); if( zDoc==0 ){ if( sqlite3_column_type(pCsr->pStmt, iCol+1)==SQLITE_NULL ){ continue; } rc = SQLITE_NOMEM; goto offsets_out; } /* Initialize a tokenizer iterator to iterate through column iCol. */ rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc, nDoc, &pC ); if( rc!=SQLITE_OK ) goto offsets_out; rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); while( rc==SQLITE_OK ){ int i; /* Used to loop through terms */ int iMinPos = 0x7FFFFFFF; /* Position of next token */ TermOffset *pTerm = 0; /* TermOffset associated with next token */ for(i=0; ipList && (pT->iPos-pT->iOff)iPos-pT->iOff; pTerm = pT; } } if( !pTerm ){ /* All offsets for this column have been gathered. */ rc = SQLITE_DONE; }else{ assert( iCurrent<=iMinPos ); if( 0==(0xFE&*pTerm->pList) ){ pTerm->pList = 0; }else{ fts3GetDeltaPosition(&pTerm->pList, &pTerm->iPos); } while( rc==SQLITE_OK && iCurrentxNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); } if( rc==SQLITE_OK ){ char aBuffer[64]; sqlite3_snprintf(sizeof(aBuffer), aBuffer, "%d %d %d %d ", iCol, pTerm-sCtx.aTerm, iStart, iEnd-iStart ); rc = fts3StringAppend(&res, aBuffer, -1); }else if( rc==SQLITE_DONE && pTab->zContentTbl==0 ){ rc = FTS_CORRUPT_VTAB; } } } if( rc==SQLITE_DONE ){ rc = SQLITE_OK; } pMod->xClose(pC); if( rc!=SQLITE_OK ) goto offsets_out; } offsets_out: sqlite3_free(sCtx.aTerm); assert( rc!=SQLITE_DONE ); sqlite3Fts3SegmentsClose(pTab); if( rc!=SQLITE_OK ){ sqlite3_result_error_code(pCtx, rc); sqlite3_free(res.z); }else{ sqlite3_result_text(pCtx, res.z, res.n-1, sqlite3_free); } return; } /* ** Implementation of matchinfo() function. 
*/ void sqlite3Fts3Matchinfo( sqlite3_context *pContext, /* Function call context */ Fts3Cursor *pCsr, /* FTS3 table cursor */ const char *zArg /* Second arg to matchinfo() function */ ){ Fts3Table *pTab = (Fts3Table *)pCsr->base.pVtab; int rc; int i; const char *zFormat; if( zArg ){ for(i=0; zArg[i]; i++){ char *zErr = 0; if( fts3MatchinfoCheck(pTab, zArg[i], &zErr) ){ sqlite3_result_error(pContext, zErr, -1); sqlite3_free(zErr); return; } } zFormat = zArg; }else{ zFormat = FTS3_MATCHINFO_DEFAULT; } if( !pCsr->pExpr ){ sqlite3_result_blob(pContext, "", 0, SQLITE_STATIC); return; } /* Retrieve matchinfo() data. */ rc = fts3GetMatchinfo(pCsr, zFormat); sqlite3Fts3SegmentsClose(pTab); if( rc!=SQLITE_OK ){ sqlite3_result_error_code(pContext, rc); }else{ int n = pCsr->nMatchinfo * sizeof(u32); sqlite3_result_blob(pContext, pCsr->aMatchinfo, n, SQLITE_TRANSIENT); } } #endif