Index: src/fts5.c ================================================================== --- src/fts5.c +++ src/fts5.c @@ -14,12 +14,18 @@ #include "sqliteInt.h" #include "vdbeInt.h" /* ** Stream numbers must be lower than this. +** +** For optimization purposes, it is assumed that a given tokenizer uses +** a set of contiguous stream numbers starting with 0. And that most +** tokens belong to stream 0. +** +** The hard limit is 63 (due to the format of "row size" records). */ -#define SQLITE4_FTS5_NSTREAM 60 +#define SQLITE4_FTS5_NSTREAM 32 /* ** Records stored within the index: ** ** Row size record: @@ -54,14 +60,20 @@ ** varint is the total number of rows in the table. The subsequent ** varints make up a "row size" record containing the total number of ** tokens for each S/C combination in all rows of the table. ** ** FTS index records: -** ** The FTS index records implement the following mapping: ** ** (token, document-pk) -> (list of instances) +** +** The key for each index record is in the same format as the keys for +** regular text indexes. An 0x24 byte, followed by the utf-8 representation +** of the token, followed by 0x00, followed by the PK blob for the table +** row. +** +** TODO: Describe value format. */ /* ** Default distance value for NEAR operators. */ @@ -135,10 +147,11 @@ typedef struct Fts5List Fts5List; typedef struct Fts5Parser Fts5Parser; typedef struct Fts5ParserToken Fts5ParserToken; typedef struct Fts5Phrase Fts5Phrase; typedef struct Fts5Prefix Fts5Prefix; +typedef struct Fts5Size Fts5Size; typedef struct Fts5Str Fts5Str; typedef struct Fts5Token Fts5Token; struct Fts5ParserToken { @@ -238,17 +251,31 @@ int nKeyAlloc; /* Bytes allocated at aKey[] */ KVCursor *pCsr; /* Cursor used to retrive values */ Mem *aMem; /* Array of column values */ - /* Array of nPhrase*nCol integers. See sqlite4_mi_row_count() for details. 
*/ + Fts5Size *pSz; /* Local size data */ + Fts5Size *pGlobal; /* Global size data */ + i64 nGlobal; /* Total number of rows in table */ int *anRow; + +#if 1 i64 *aGlobal; /* Size of each column of current row (in tokens). */ int bSzValid; int *aSz; +#endif +}; + +/* +** A deserialized 'size record' (see above). +*/ +struct Fts5Size { + int nCol; /* Number of columns in indexed table */ + int nStream; /* Number of streams */ + i64 *aSz; /* Token count for each C/S */ }; /* ** This type is used when reading (decoding) an instance-list. */ @@ -258,11 +285,11 @@ int nList; int iList; /* The current entry */ int iCol; - int iWeight; + int iStream; int iOff; }; /* ** Return true for EOF, or false if the next entry is valid. @@ -277,11 +304,11 @@ if( (iVal & 0x03)==0x01 ){ p->iCol = (iVal>>2); p->iOff = 0; } else if( (iVal & 0x03)==0x03 ){ - p->iWeight = (iVal>>2); + p->iStream = (iVal>>2); } else{ p->iOff += (iVal>>1); bRet = 0; } @@ -299,11 +326,11 @@ } static void fts5InstanceListAppend( InstanceList *p, /* Instance list to append to */ int iCol, /* Column of new entry */ - int iWeight, /* Weight of new entry */ + int iStream, /* Weight of new entry */ int iOff /* Offset of new entry */ ){ assert( iCol>=p->iCol ); assert( iCol>p->iCol || iOff>=p->iOff ); @@ -311,13 +338,13 @@ p->iList += putVarint32(&p->aList[p->iList], (iCol<<2)|0x01); p->iCol = iCol; p->iOff = 0; } - if( iWeight!=p->iWeight ){ - p->iList += putVarint32(&p->aList[p->iList], (iWeight<<2)|0x03); - p->iWeight = iWeight; + if( iStream!=p->iStream ){ + p->iList += putVarint32(&p->aList[p->iList], (iStream<<2)|0x03); + p->iStream = iStream; } p->iList += putVarint32(&p->aList[p->iList], (iOff-p->iOff)<<1); p->iOff = iOff; @@ -506,11 +533,11 @@ /* ** Callback for fts5CountTokens(). 
*/ static int fts5CountTokensCb( void *pCtx, - int iWeight, + int iStream, int iOff, const char *z, int n, int iSrc, int nSrc ){ (*((int *)pCtx))++; @@ -541,11 +568,11 @@ Fts5Str *pStr; }; static int fts5AppendTokensCb( void *pCtx, - int iWeight, + int iStream, int iOff, const char *z, int n, int iSrc, int nSrc ){ struct AppendTokensCtx *p = (struct AppendTokensCtx *)pCtx; @@ -1157,17 +1184,19 @@ typedef struct TokenizeCtx TokenizeCtx; typedef struct TokenizeTerm TokenizeTerm; struct TokenizeCtx { int rc; int iCol; + int nCol; /* Number of columns in table */ sqlite4 *db; int nMax; - int *aSz; /* Number of tokens in each column */ + i64 *aSz; /* Number of tokens in each column/stream */ + int nStream; /* Number of streams in document */ Hash hash; }; struct TokenizeTerm { - int iWeight; /* Weight of previous entry */ + int iStream; /* Weight of previous entry */ int iCol; /* Column containing previous entry */ int iOff; /* Token offset of previous entry */ int nToken; /* Size of token in bytes */ int nData; /* Bytes of data in value */ int nAlloc; /* Bytes of data allocated */ @@ -1193,23 +1222,35 @@ return pTerm; } static int fts5TokenizeCb( void *pCtx, - int iWeight, + int iStream, int iOff, const char *zToken, int nToken, int iSrc, int nSrc ){ TokenizeCtx *p = (TokenizeCtx *)pCtx; + sqlite4 *db = p->db; TokenizeTerm *pTerm = 0; TokenizeTerm *pOrig = 0; + /* TODO: Error here if iStream is out of range */ + if( nToken>p->nMax ) p->nMax = nToken; - p->aSz[p->iCol]++; + + if( iStream>=p->nStream ){ + int nOld = p->nStream; + int nNew = 4; + while( nNew<=iStream ) nNew = nNew*2; + p->aSz = (i64*)sqlite4DbReallocOrFree(db, p->aSz, nNew*p->nCol*sizeof(i64)); + if( p->aSz==0 ) goto tokenize_cb_out; + memset(&p->aSz[p->nStream * p->nCol], 0, (nNew-nOld)*p->nCol*sizeof(i64)); + } + p->aSz[iStream*p->nCol + p->iCol]++; pTerm = (TokenizeTerm *)sqlite4HashFind(&p->hash, zToken, nToken); if( pTerm==0 ){ /* Size the initial allocation so that it fits in the lookaside buffer */ 
int nAlloc = sizeof(TokenizeTerm) + nToken + 32; @@ -1228,14 +1269,14 @@ if( pTerm==0 ) goto tokenize_cb_out; } } pOrig = pTerm; - if( iWeight!=pTerm->iWeight ){ - pTerm = fts5TokenizeAppendInt(p, pTerm, (iWeight << 2) | 0x00000003); + if( iStream!=pTerm->iStream ){ + pTerm = fts5TokenizeAppendInt(p, pTerm, (iStream << 2) | 0x00000003); if( !pTerm ) goto tokenize_cb_out; - pTerm->iWeight = iWeight; + pTerm->iStream = iStream; } if( pTerm && p->iCol!=pTerm->iCol ){ pTerm = fts5TokenizeAppendInt(p, pTerm, (p->iCol << 2) | 0x00000001); if( !pTerm ) goto tokenize_cb_out; @@ -1257,95 +1298,128 @@ } return 0; } -static int fts5LoadGlobal(sqlite4 *db, Fts5Info *pInfo, i64 *aVal){ - int rc; - int nVal = pInfo->nCol + 1; - u8 aKey[10]; /* Global record key */ - int nKey; /* Bytes in key aKey */ +static int fts5LoadSizeRecord( + sqlite4 *db, /* Database handle */ + u8 *aKey, int nKey, /* KVStore key */ + int nMinStream, /* Space for at least this many streams */ + Fts5Info *pInfo, /* Info record */ + i64 *pnRow, /* non-NULL when reading global record */ + Fts5Size **ppSz /* OUT: Loaded size record */ +){ + Fts5Size *pSz = 0; /* Size object */ KVCursor *pCsr = 0; /* Cursor used to read global record */ - - nKey = putVarint32(aKey, pInfo->iRoot); - aKey[nKey++] = 0x00; + int rc; rc = sqlite4KVStoreOpenCursor(db->aDb[pInfo->iDb].pKV, &pCsr); if( rc==SQLITE4_OK ){ rc = sqlite4KVCursorSeek(pCsr, aKey, nKey, 0); if( rc==SQLITE4_NOTFOUND ){ - rc = SQLITE4_OK; - memset(aVal, 0, sizeof(i64)*nVal); + rc = SQLITE4_CORRUPT_BKPT; }else if( rc==SQLITE4_OK ){ const u8 *aData = 0; int nData = 0; rc = sqlite4KVCursorData(pCsr, 0, -1, &aData, &nData); if( rc==SQLITE4_OK ){ - int i; int iOff = 0; - for(i=0; inCol * nAlloc + ); + if( pSz==0 ){ + rc = SQLITE4_NOMEM; + }else{ + int iCol = 0; + pSz->nCol = pInfo->nCol; + pSz->nStream = nAlloc; + while( iOffaSz[iCol*nAlloc]; + for(i=0; i=0 ){ + iOff += sqlite4PutVarint(&a[iOff], nRow); + } + iOff += sqlite4PutVarint(&a[iOff], pSz->nStream); + 
for(iCol=0; iColnCol; iCol++){ + int i; + for(i=0; inStream; i++){ + iOff += sqlite4PutVarint(&a[iOff], pSz->aSz[iCol*pSz->nCol+i]); + } + } + + return sqlite4KVStoreReplace(p, aKey, nKey, a, iOff); +} static int fts5CsrLoadGlobal(Fts5Cursor *pCsr){ int rc = SQLITE4_OK; - if( pCsr->aGlobal==0 ){ - int nByte = sizeof(i64) * (pCsr->pInfo->nCol + 1); - pCsr->aGlobal = (i64 *)sqlite4DbMallocZero(pCsr->db, nByte); - if( pCsr->aGlobal==0 ){ - rc = SQLITE4_NOMEM; - }else{ - rc = fts5LoadGlobal(pCsr->db, pCsr->pInfo, pCsr->aGlobal); - } + if( pCsr->pGlobal==0 ){ + int nKey; + u8 aKey[10]; + nKey = putVarint32(aKey, pCsr->pInfo->iRoot); + aKey[nKey++] = 0x00; + rc = fts5LoadSizeRecord( + pCsr->db, aKey, nKey, 0, pCsr->pInfo, &pCsr->nGlobal, &pCsr->pGlobal + ); } return rc; } static int fts5CsrLoadSz(Fts5Cursor *pCsr){ - sqlite4 *db = pCsr->db; - Fts5Info *pInfo = pCsr->pInfo; - int nVal = pInfo->nCol; - int rc; - u8 *aKey; - int nKey = 0; - int nPk = pCsr->pExpr->pRoot->nPk; - KVCursor *pKVCsr = 0; /* Cursor used to read global record */ - - aKey = (u8 *)sqlite4DbMallocZero(db, 10 + nPk); - if( !aKey ) return SQLITE4_NOMEM; - - nKey = putVarint32(aKey, pInfo->iRoot); - aKey[nKey++] = 0x00; - memcpy(&aKey[nKey], pCsr->pExpr->pRoot->aPk, nPk); - nKey += nPk; - - rc = sqlite4KVStoreOpenCursor(db->aDb[pInfo->iDb].pKV, &pKVCsr); - if( rc==SQLITE4_OK ){ - rc = sqlite4KVCursorSeek(pKVCsr, aKey, nKey, 0); - if( rc==SQLITE4_NOTFOUND ){ - rc = SQLITE4_CORRUPT_BKPT; - }else if( rc==SQLITE4_OK ){ - const u8 *aData = 0; - int nData = 0; - rc = sqlite4KVCursorData(pKVCsr, 0, -1, &aData, &nData); - if( rc==SQLITE4_OK ){ - int i; - int iOff = 0; - for(i=0; iaSz[i]); - } - } - pCsr->bSzValid = 1; - } - sqlite4KVCursorClose(pKVCsr); + int rc = SQLITE4_OK; + if( pCsr->pSz==0 ){ + sqlite4 *db = pCsr->db; + Fts5Info *pInfo = pCsr->pInfo; + u8 *aKey; + int nKey = 0; + int nPk = pCsr->pExpr->pRoot->nPk; + + aKey = (u8 *)sqlite4DbMallocZero(db, 10 + nPk); + if( !aKey ) return SQLITE4_NOMEM; + + 
nKey = putVarint32(aKey, pInfo->iRoot); + aKey[nKey++] = 0x00; + memcpy(&aKey[nKey], pCsr->pExpr->pRoot->aPk, nPk); + nKey += nPk; + + rc = fts5LoadSizeRecord(pCsr->db, aKey, nKey, 0, pInfo, 0, &pCsr->pSz); + sqlite4DbFree(db, aKey); } return rc; } @@ -1363,35 +1437,34 @@ ){ int i; int rc = SQLITE4_OK; KVStore *pStore; TokenizeCtx sCtx; - u8 *aKey = 0; - int nKey = 0; int nTnum = 0; u32 dummy = 0; + + u8 *aSpace = 0; + int nSpace = 0; const u8 *pPK; int nPK; HashElem *pElem; pStore = db->aDb[pInfo->iDb].pKV; - sCtx.rc = SQLITE4_OK; + + memset(&sCtx, 0, sizeof(sCtx)); sCtx.db = db; - sCtx.nMax = 0; + sCtx.nCol = pInfo->nCol; sqlite4HashInit(db->pEnv, &sCtx.hash, 1); pPK = (const u8 *)sqlite4_value_blob(pKey); nPK = sqlite4_value_bytes(pKey); nTnum = getVarint32(pPK, dummy); nPK -= nTnum; pPK += nTnum; - sCtx.aSz = (int *)sqlite4DbMallocZero(db, pInfo->nCol * sizeof(int)); - if( sCtx.aSz==0 ) rc = SQLITE4_NOMEM; - for(i=0; rc==SQLITE4_OK && inCol; i++){ sqlite4_value *pArg = (sqlite4_value *)(&aArg[i]); if( pArg->flags & MEM_Str ){ const char *zText; int nText; @@ -1403,19 +1476,33 @@ &sCtx, pInfo->p, zText, nText, fts5TokenizeCb ); } } - nKey = sqlite4VarintLen(pInfo->iRoot)+2+sCtx.nMax+nPK + 10*(pInfo->nCol+1); - aKey = sqlite4DbMallocRaw(db, nKey); - if( aKey==0 ) rc = SQLITE4_NOMEM; + /* Allocate enough space to serialize all the stuff that needs to + ** be inserted into the database. Specifically: + ** + ** * Space for index record keys, + ** * space for the size record and key for this document, and + ** * space for the updated global size record for the document set. + ** + ** To make it easier, the below allocates enough space to simultaneously + ** store the largest index record key and the largest possible global + ** size record. 
+ */ + nSpace = (sqlite4VarintLen(pInfo->iRoot) + 2 + sCtx.nMax + nPK) + + (9 * (2 + pInfo->nCol * sCtx.nStream)); + aSpace = sqlite4DbMallocRaw(db, nSpace); + if( aSpace==0 ) rc = SQLITE4_NOMEM; for(pElem=sqliteHashFirst(&sCtx.hash); pElem; pElem=sqliteHashNext(pElem)){ TokenizeTerm *pTerm = (TokenizeTerm *)sqliteHashData(pElem); if( rc==SQLITE4_OK ){ int nToken = sqliteHashKeysize(pElem); char *zToken = (char *)sqliteHashKey(pElem); + u8 *aKey = aSpace; + int nKey; nKey = putVarint32(aKey, pInfo->iRoot); aKey[nKey++] = 0x24; memcpy(&aKey[nKey], zToken, nToken); nKey += nToken; @@ -1434,53 +1521,61 @@ } } sqlite4DbFree(db, pTerm); } - /* Write the "sizes" record into the db */ + /* Write the size record into the db */ if( rc==SQLITE4_OK ){ + u8 *aKey = aSpace; + int nKey; + nKey = putVarint32(aKey, pInfo->iRoot); aKey[nKey++] = 0x00; memcpy(&aKey[nKey], pPK, nPK); nKey += nPK; - if( bDel ){ - rc = sqlite4KVStoreReplace(pStore, aKey, nKey, 0, -1); + if( bDel==0 ){ + Fts5Size sSz; + sSz.nCol = pInfo->nCol; + sSz.nStream = sCtx.nStream; + sSz.aSz = sCtx.aSz; + rc = fts5StoreSizeRecord(pStore, aKey, nKey, &sSz, -1, &aKey[nKey]); }else{ - u8 *aData = &aKey[nKey]; - int nData = 0; - for(i=0; inCol; i++){ - nData += putVarint32(&aData[nData], sCtx.aSz[i]); - } - rc = sqlite4KVStoreReplace(pStore, aKey, nKey, aData, nData); + rc = sqlite4KVStoreReplace(pStore, aKey, nKey, 0, -1); } } /* Update the global record */ if( rc==SQLITE4_OK ){ - i64 *aGlobal = (i64 *)aKey; - u8 *aData = (u8 *)&aGlobal[pInfo->nCol+1]; - int nData = 0; - - rc = fts5LoadGlobal(db, pInfo, aGlobal); - if( rc==SQLITE4_OK ){ - u8 aDbKey[10]; - int nDbKey; - nDbKey = putVarint32(aDbKey, pInfo->iRoot); - aDbKey[nDbKey++] = 0x00; - - nData += sqlite4PutVarint(&aData[nData], aGlobal[0] + (bDel?-1:1)); - for(i=0; inCol; i++){ - i64 iNew = aGlobal[i+1] + (i64)sCtx.aSz[i] * (bDel?-1:1); - nData += sqlite4PutVarint(&aData[nData], iNew); - } - - rc = sqlite4KVStoreReplace(pStore, aDbKey, nDbKey, aData, nData); - 
} - } - - sqlite4DbFree(db, aKey); + Fts5Size *pSz; /* Deserialized global size record */ + i64 nRow; /* Number of rows in indexed table */ + u8 *aKey = aSpace; /* Space to format the global record key */ + int nKey; /* Size of global record key in bytes */ + + nKey = putVarint32(aKey, pInfo->iRoot); + aKey[nKey++] = 0x00; + rc = fts5LoadSizeRecord(db, aKey, nKey, sCtx.nStream, pInfo, &nRow, &pSz); + assert( rc!=SQLITE4_OK || pSz->nStream>=sCtx.nStream ); + + if( rc==SQLITE4_OK ){ + int iCol; + for(iCol=0; iColnCol; iCol++){ + int iStr; + i64 *aIn = &sCtx.aSz[iCol * sCtx.nStream]; + i64 *aOut = &pSz->aSz[iCol * pSz->nStream]; + for(iStr=0; iStrpPK, p->nPK, - (const u8 *)zToken, nToken, iWeight, p->iCol, iOff + (const u8 *)zToken, nToken, iStream, p->iCol, iOff ); p->cksum = (p->cksum ^ cksum); return 0; } @@ -1865,11 +1960,11 @@ pAdv = &in1; }else{ pAdv = &in2; } - fts5InstanceListAppend(&out, pAdv->iCol, pAdv->iWeight, pAdv->iOff); + fts5InstanceListAppend(&out, pAdv->iCol, pAdv->iStream, pAdv->iOff); fts5InstanceListNext(pAdv); } if( bFree ){ sqlite4DbFree(db, p1->aData); @@ -2176,11 +2271,11 @@ int bMatch = fts5TokenAdvanceToMatch(&aIn[i], &aIn[0], i, &bEof); if( bMatch==0 || bEof ) break; } if( i==pStr->nToken && (iCol<0 || aIn[0].iCol==iCol) ){ /* Record a match here */ - fts5InstanceListAppend(&out, aIn[0].iCol, aIn[0].iWeight, aIn[0].iOff); + fts5InstanceListAppend(&out, aIn[0].iCol, aIn[0].iStream, aIn[0].iOff); } bEof = fts5InstanceListNext(&aIn[0]); } pStr->nList = out.iList; @@ -2222,19 +2317,19 @@ if( fts5IsNear(&near, &in, nTrail) || fts5IsNear(&in, &near, nLead) ){ /* The current position is a match. Append an entry to the output ** and advance the input cursor. 
*/ - fts5InstanceListAppend(&out, in.iCol, in.iWeight, in.iOff); + fts5InstanceListAppend(&out, in.iCol, in.iStream, in.iOff); bEof = fts5InstanceListNext(&in); }else{ if( near.iColbSzValid = 0; + sqlite4DbFree(pCsr->db, pCsr->pSz); + pCsr->pSz = 0; return fts5ExprAdvance(pCsr->db, pCsr->pExpr->pRoot, 0); } int sqlite4Fts5Open( sqlite4 *db, /* Database handle */ @@ -2506,73 +2602,111 @@ *paKey = pCsr->aKey; *pnKey = nReq; return SQLITE4_OK; } -int sqlite4_mi_column_count(sqlite4_context *pCtx, int *pnCol){ - int rc = SQLITE4_OK; - if( pCtx->pFts ){ - *pnCol = pCtx->pFts->pInfo->nCol; - }else{ - rc = SQLITE4_MISUSE; - } - return rc; -} - -int sqlite4_mi_column_size(sqlite4_context *pCtx, int iCol, int *pnToken){ - int rc = SQLITE4_OK; - Fts5Cursor *pCsr = pCtx->pFts; - - if( pCsr==0 ){ - rc = SQLITE4_MISUSE; - }else if( iCol>=pCsr->pInfo->nCol ){ - rc = SQLITE4_ERROR; - }else{ - if( pCsr->aSz==0 ){ - pCsr->aSz = (int *)sqlite4DbMallocZero( - pCsr->db, sizeof(int)*pCsr->pInfo->nCol - ); - if( pCsr->aSz==0 ) rc = SQLITE4_NOMEM; - } - if( rc==SQLITE4_OK && pCsr->bSzValid==0 ){ - rc = fts5CsrLoadSz(pCsr); - } - if( rc==SQLITE4_OK ){ - assert( pCsr->bSzValid ); - if( iCol>=0 ){ - *pnToken = pCsr->aSz[iCol]; - }else{ - int i; - int nToken = 0; - for(i=0; ipInfo->nCol; i++){ - nToken += pCsr->aSz[i]; - } - *pnToken = nToken; - } - } - } - return rc; -} +int sqlite4_mi_column_count(sqlite4_context *pCtx, int *pn){ + int rc = SQLITE4_OK; + if( pCtx->pFts ){ + *pn = pCtx->pFts->pInfo->nCol; + }else{ + rc = SQLITE4_MISUSE; + } + return rc; +} + +int sqlite4_mi_phrase_count(sqlite4_context *pCtx, int *pn){ + int rc = SQLITE4_OK; + if( pCtx->pFts ){ + *pn = pCtx->pFts->pExpr->nPhrase; + }else{ + rc = SQLITE4_MISUSE; + } + return rc; +} + +int sqlite4_mi_stream_count(sqlite4_context *pCtx, int *pn){ + int rc = SQLITE4_OK; + Fts5Cursor *pCsr = pCtx->pFts; + if( pCsr ){ + rc = fts5CsrLoadGlobal(pCtx->pFts); + if( rc==SQLITE4_OK ) *pn = pCsr->pGlobal->nStream; + }else{ + rc = 
SQLITE4_MISUSE; + } + return rc; +} + +static int fts5GetSize(Fts5Size *pSz, int iC, int iS){ + int nToken = 0; + int i; + + if( iC<0 && iS<0 ){ + int nFin = pSz->nCol * pSz->nStream; + for(i=0; iaSz[i]; + }else if( iC<0 ){ + for(i=0; inCol; i++) nToken += pSz->aSz[i*pSz->nStream + iS]; + }else if( iS<0 ){ + for(i=0; inStream; i++) nToken += pSz->aSz[pSz->nStream*iC + iS]; + }else if( iCnCol && iSnStream ){ + nToken = pSz->aSz[iC * pSz->nStream + iS]; + } + + return nToken; +} + +int sqlite4_mi_size(sqlite4_context *pCtx, int iC, int iS, int *pn){ + int rc = SQLITE4_OK; + Fts5Cursor *pCsr = pCtx->pFts; + + if( pCsr==0 ){ + rc = SQLITE4_MISUSE; + }else{ + rc = fts5CsrLoadSz(pCsr); + if( rc==SQLITE4_OK ){ + *pn = fts5GetSize(pCsr->pSz, iC, iS); + } + } + return rc; +} + +int sqlite4_mi_total_size(sqlite4_context *pCtx, int iC, int iS, int *pn){ + int rc = SQLITE4_OK; + Fts5Cursor *pCsr = pCtx->pFts; + + if( pCsr==0 ){ + rc = SQLITE4_MISUSE; + }else{ + rc = fts5CsrLoadGlobal(pCsr); + if( rc==SQLITE4_OK ){ + *pn = fts5GetSize(pCsr->pGlobal, iC, iS); + } + } + return rc; +} + +int sqlite4_mi_total_rows(sqlite4_context *pCtx, int *pn){ + int rc = SQLITE4_OK; + Fts5Cursor *pCsr = pCtx->pFts; + if( pCsr==0 ){ + rc = SQLITE4_MISUSE; + }else{ + rc = fts5CsrLoadGlobal(pCsr); + if( rc==SQLITE4_OK ) *pn = pCsr->nGlobal; + } + return rc; +} + int sqlite4_mi_column_value( sqlite4_context *pCtx, int iCol, sqlite4_value **ppVal ){ int rc = SQLITE4_OK; if( pCtx->pFts ){ }else{ - rc = SQLITE4_MISUSE; - } - return rc; -} - -int sqlite4_mi_phrase_count(sqlite4_context *pCtx, int *pnPhrase){ - int rc = SQLITE4_OK; - if( pCtx->pFts ){ - *pnPhrase = pCtx->pFts->pExpr->nPhrase; - }else{ rc = SQLITE4_MISUSE; } return rc; } @@ -2592,28 +2726,29 @@ return pRet; } int sqlite4_mi_match_count( sqlite4_context *pCtx, - int iCol, + int iC, + int iS, int iPhrase, int *pnMatch ){ int rc = SQLITE4_OK; Fts5Cursor *pCsr = pCtx->pFts; if( pCsr ){ int nMatch = 0; Fts5Str *pStr; - int iCopy = iCol; + int 
iCopy = iPhrase; InstanceList sList; pStr = fts5FindStr(pCsr->pExpr->pRoot, &iCopy); assert( pStr ); fts5InstanceListInit(pStr->aList, pStr->nList, &sList); while( 0==fts5InstanceListNext(&sList) ){ - if( iCol<0 || sList.iCol==iCol ) nMatch++; + if( (iC<0 || sList.iCol==iC) && (iS<0 || sList.iStream==iS) ) nMatch++; } *pnMatch = nMatch; }else{ rc = SQLITE4_MISUSE; } @@ -2639,52 +2774,30 @@ int *pnRelevant ){ return SQLITE4_OK; } -int sqlite4_mi_total_size(sqlite4_context *pCtx, int iCol, int *pnToken){ - int rc = SQLITE4_OK; - if( pCtx->pFts ){ - Fts5Cursor *pCsr = pCtx->pFts; - int nCol = pCsr->pInfo->nCol; - - if( iCol>=nCol ){ - rc = SQLITE4_ERROR; - }else{ - rc = fts5CsrLoadGlobal(pCsr); - if( rc==SQLITE4_OK ){ - if( iCol<0 ){ - int i; - int nToken = 0; - for(i=0; iaGlobal[i+1]; - } - *pnToken = nToken; - }else{ - *pnToken = pCsr->aGlobal[iCol+1]; - } - } - } - }else{ - rc = SQLITE4_MISUSE; - } - return rc; -} - -static void fts5StrLoadRowcounts(Fts5Str *pStr, int *anRow){ +static void fts5StrLoadRowcounts(Fts5Str *pStr, int nStream, int *anRow){ + u32 mask = 0; + int iPrevCol = 0; InstanceList sList; fts5InstanceListInit(pStr->aList, pStr->nList, &sList); while( 0==fts5InstanceListNext(&sList) ){ - anRow[sList.iCol]++; + if( sList.iCol!=iPrevCol ) mask = 0; + if( (mask & (1<pPhrase; rc = fts5ExprAdvance(db, pNode, 1); while( rc==SQLITE4_OK ){ + int nIncr = pInfo->nCol * nStream; /* Values for each Fts5Str */ int i; for(i=0; inStr; i++){ - fts5StrLoadRowcounts(&pPhrase->aStr[i], &anRow[i*pInfo->nCol]); + fts5StrLoadRowcounts(&pPhrase->aStr[i], nStream, &anRow[i*nIncr]); } rc = fts5ExprAdvance(db, pNode, 0); } - *panRow = &anRow[pInfo->nCol * pPhrase->nStr]; + *panRow = &anRow[pInfo->nCol * nStream * pPhrase->nStr]; } if( rc==SQLITE4_OK ){ - rc = fts5ExprLoadRowcounts(db, pInfo, pNode->pLeft, panRow); + rc = fts5ExprLoadRowcounts(db, pInfo, nStream, pNode->pLeft, panRow); } if( rc==SQLITE4_OK ){ - rc = fts5ExprLoadRowcounts(db, pInfo, pNode->pLeft, panRow); + rc 
= fts5ExprLoadRowcounts(db, pInfo, nStream, pNode->pRight, panRow); } } return rc; } @@ -2718,30 +2832,30 @@ static int fts5CsrLoadRowcounts(Fts5Cursor *pCsr){ int rc = SQLITE4_OK; if( pCsr->anRow==0 ){ + int nStream = pCsr->pGlobal->nStream; sqlite4 *db = pCsr->db; Fts5Expr *pCopy; Fts5Expr *pExpr = pCsr->pExpr; Fts5Info *pInfo = pCsr->pInfo; int *anRow; pCsr->anRow = anRow = (int *)sqlite4DbMallocZero(db, - pExpr->nPhrase * pInfo->nCol * sizeof(int) + pExpr->nPhrase * pInfo->nCol * pCsr->pGlobal->nStream * sizeof(int) ); if( !anRow ) return SQLITE4_NOMEM; rc = fts5ParseExpression(db, pInfo->pTokenizer, pInfo->p, pInfo->iRoot, pInfo->azCol, pInfo->nCol, pCsr->zExpr, &pCopy, 0 ); if( rc==SQLITE4_OK ){ rc = fts5OpenExprCursors(db, pInfo, pExpr->pRoot); } - if( rc==SQLITE4_OK ){ - rc = fts5ExprLoadRowcounts(db, pInfo, pCopy->pRoot, &anRow); + rc = fts5ExprLoadRowcounts(db, pInfo, nStream, pCopy->pRoot, &anRow); } fts5ExpressionFree(db, pCopy); } @@ -2748,50 +2862,43 @@ return rc; } int sqlite4_mi_row_count( sqlite4_context *pCtx, /* Context object passed to mi function */ - int iCol, /* Specific column (or -1) */ - int iPhrase, /* Specific phrase (or -1) */ - int *pnRow /* Total number of rows */ + int iC, /* Specific column (or -ve for all columns) */ + int iS, /* Specific stream (or -ve for all streams) */ + int iP, /* Specific phrase */ + int *pn /* Total number of rows containing C/S/P */ ){ int rc = SQLITE4_OK; - if( pCtx->pFts ){ - Fts5Cursor *pCsr = pCtx->pFts; - Fts5Expr *pExpr = pCsr->pExpr; - int nCol = pCsr->pInfo->nCol; - int nPhrase = pExpr->nPhrase; - - if( iCol>=nCol || iPhrase>=nPhrase ){ - rc = SQLITE4_ERROR; - } - - else if( iPhrase>=0 ){ - int iIdx = iPhrase * pCsr->pInfo->nCol; - - rc = fts5CsrLoadRowcounts(pCsr); - if( rc==SQLITE4_OK ){ - if( iCol>0 ){ - *pnRow = pCsr->anRow[iIdx + iCol]; - }else{ - int i; - int nRow = 0; - for(i=0; ipInfo->nCol; i++){ - nRow += pCsr->anRow[iIdx + i]; - } - *pnRow = nRow; - } - } - }else{ - /* Total number of 
rows in table... */ - rc = fts5CsrLoadGlobal(pCsr); - if( rc==SQLITE4_OK ){ - *pnRow = (int)pCsr->aGlobal[0]; - } - } - }else{ - rc = SQLITE4_MISUSE; + Fts5Cursor *pCsr = pCtx->pFts; + if( pCsr==0 ){ + rc = SQLITE4_MISUSE; + }else{ + rc = fts5CsrLoadGlobal(pCsr); + if( rc==SQLITE4_OK ) rc = fts5CsrLoadRowcounts(pCsr); + + if( rc==SQLITE4_OK ){ + int i; + int nRow = 0; + int nStream = pCsr->pGlobal->nStream; + int nCol = pCsr->pInfo->nCol; + int *aRow = &pCsr->anRow[iP * nStream * nCol]; + + if( iC<0 && iS<0 ){ + int nFin = nCol * nStream; + for(i=0; idb = db; p->nPhrase = nPhrase; p->aIdf = (double *)&p[1]; /* Determine the IDF weight for each phrase in the query. */ - rc = sqlite4_mi_row_count(pCtx, -1, -1, &N); + rc = sqlite4_mi_total_rows(pCtx, &N); for(i=0; rc==SQLITE4_OK && iaIdf[i] = log((0.5 + N - ni) / (0.5 + ni)); } } /* Determine the average document length */ if( rc==SQLITE4_OK ){ int nTotal; - rc = sqlite4_mi_total_size(pCtx, -1, &nTotal); + rc = sqlite4_mi_total_size(pCtx, -1, -1, &nTotal); if( rc==SQLITE4_OK ){ p->avgdl = (double)nTotal / (double)N; } } } @@ -131,12 +131,12 @@ double prank; /* Contribution to rank of this phrase */ /* Set variable tf to the total number of occurrences of phrase iPhrase ** in this row (within any column). And dl to the number of tokens in ** the current row (again, in any column). */ - rc = sqlite4_mi_match_count(pCtx, -1, i, &tf); - if( rc==SQLITE4_OK ) rc = sqlite4_mi_column_size(pCtx, -1, &dl); + rc = sqlite4_mi_match_count(pCtx, -1, -1, i, &tf); + if( rc==SQLITE4_OK ) rc = sqlite4_mi_size(pCtx, -1, -1, &dl); /* Calculate the normalized document length */ L = (double)dl / p->avgdl; /* Calculate the contribution to the rank made by this phrase. Then Index: src/sqlite.h.in ================================================================== --- src/sqlite.h.in +++ src/sqlite.h.in @@ -4421,71 +4421,75 @@ ** Special functions that may be called from within matchinfo UDFs. 
All ** return an SQLite error code - SQLITE4_OK if successful, or some other ** error code otherwise. ** ** sqlite4_mi_column_count(): -** Set *pnCol to the number of columns in the queried table. +** Set *pn to the number of columns in the queried table. +** +** sqlite4_mi_phrase_count(): +** Set *pn to the number of phrases in the query. +** +** sqlite4_mi_stream_count(): +** Set *pn to the number of streams in the FTS index. +** +** sqlite4_mi_size(): +** Set *pn to the number of tokens belonging to stream iS in the value +** stored in column iC of the current row. +** +** Either or both of iS and iC may be negative. If iC is negative, then the +** output value is the total number of tokens for the specified stream (or +** streams) across all table columns. Similarly, if iS is negative, the +** output value is the total number of tokens in the specified column or +** columns, regardless of stream. +** +** sqlite4_mi_total_size(): +** Similar to sqlite4_mi_size(), except the output parameter is set to +** the total number of tokens belonging to the specified column(s) +** and stream(s) in all rows of the table, not just the current row. +** +** sqlite4_mi_total_rows(): +** Set *pn to the total number of rows in the indexed table. +** +** sqlite4_mi_row_count(): +** Set the output parameter to the total number of rows in the table that +** contain at least one instance of the phrase identified by parameter +** iP in the column(s) and stream(s) identified by parameters iC and iS. +** +** sqlite4_mi_match_count(): +** Set the output parameter to the total number of occurrences of phrase +** iP in the current row that belong to the column(s) and stream(s) +** identified by parameters iC and iS. +** +** Parameter iP may also be negative. In this case, the output value is +** set to the total number of occurrences of all query phrases in the +** current row, subject to the constraints imposed by iC and iS. 
** -** sqlite4_mi_column_size(): -** Set *pnToken to the number of tokens in the value stored in column iCol -** of the current row. +** sqlite4_mi_match_detail(): +** This function may be used to iterate through all matches in the +** current row in order of occurrence. ** ** sqlite4_mi_column_value(): ** Set *ppVal to point to an sqlite4_value object containing the value ** read from column iCol of the current row. This object is valid until ** the function callback returns. -** -** sqlite4_mi_phrase_count(): -** Set *pnPhrase to the number of phrases in the query. -** -** sqlite4_mi_match_count(): -** Set *pn to the number of occurences of phrase iPhrase in column iCol of -** the current row. -** -** sqlite4_mi_total_match_count(): -** Set *pnMatch to the total number of occurrences of phrase iPhrase -** in column iCol of all rows in the indexed table. Set *pnDoc to the -** number of rows that contain at least one match for phrase iPhrase in -** column iCol. -** -** sqlite4_mi_match_offset(): -** Set *piOff to the token offset of the iMatch'th instance of phrase -** iPhrase in column iCol of the current row. If any parameter is out -** of range (i.e. too large) it is not an error. In this case *piOff is -** set to -1 before returning. -** -** sqlite4_mi_total_size(): -** Set *pnToken to the total number of tokens in column iCol of all rows -** in the indexed table. -** -** sqlite4_mi_row_count(): -** If parameter iPhrase is negative, this function sets the output -** parameter to the total number of documents in the collection (rows -** in the indexed table). -** -** Otherwise, if iPhrase is not negative, then the output is set to the -** total number of rows that contain at least one instance of phrase iPhrase -** in column iCol, or in any column if iCol is negative. 
-** -** If parameter iPhrase is equal to or greater than the number of phrases -** in the current query, or if iCol is equal to or greater than the number -** of columns in the indexed table, SQLITE4_MISUSE is returned. The value -** of the output parameter is undefined in this case. -*/ - -int sqlite4_mi_column_count(sqlite4_context *, int *pnCol); -int sqlite4_mi_phrase_count(sqlite4_context *, int *pnPhrase); - -int sqlite4_mi_column_size(sqlite4_context *, int iCol, int *pnToken); -int sqlite4_mi_match_count(sqlite4_context *, int iCol, int iPhrase, int *pn); -int sqlite4_mi_total_size(sqlite4_context *, int iCol, int *pnToken); -int sqlite4_mi_row_count(sqlite4_context *, int iCol, int iPhrase, int *pnRow); - -int sqlite4_mi_column_value(sqlite4_context *, int iCol, sqlite4_value **ppVal); -int sqlite4_mi_match_detail(sqlite4_context *, - int iCol, int iPhrase, int iMatch, int *piOff, int *piWeight -); +*/ +int sqlite4_mi_column_count(sqlite4_context *, int *pn); +int sqlite4_mi_phrase_count(sqlite4_context *, int *pn); +int sqlite4_mi_stream_count(sqlite4_context *, int *pn); + +int sqlite4_mi_total_size(sqlite4_context *, int iC, int iS, int *pn); +int sqlite4_mi_total_rows(sqlite4_context *, int *pn); + +int sqlite4_mi_row_count(sqlite4_context *, int iC, int iS, int iP, int *pn); + +int sqlite4_mi_size(sqlite4_context *, int iC, int iS, int *pn); +int sqlite4_mi_match_count(sqlite4_context *, int iC, int iS, int iP, int *pn); +int sqlite4_mi_match_detail( + sqlite4_context *, int iMatch, int *piOff, int *piC, int *piS, int *piP +); +int sqlite4_mi_column_value(sqlite4_context *, int iCol, sqlite4_value **ppVal); + /* ** Undo the hack that converts floating point types to integer for ** builds on processors without floating point support.