Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Modify loadSegmentLeavesInt() to correctly handle prefix searching. The new function docListUnion() is used to accumulate a union of the hits for the matching terms, which will be merged across segments using docListMerge(). (CVS 3891) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | 72c796307338c2751a91c30f6fb16989 |
User & Date: | shess 2007-05-01 17:14:59.000 |
Context
2007-05-01
| ||
17:49 | First approximation of incremental blob IO API. (CVS 3892) (check-in: c444836e7b user: danielk1977 tags: trunk) | |
17:14 | Modify loadSegmentLeavesInt() to correctly handle prefix searching. The new function docListUnion() is used to accumulate a union of the hits for the matching terms, which will be merged across segments using docListMerge(). (CVS 3891) (check-in: 72c7963073 user: shess tags: trunk) | |
16:59 | The pager takes the sector size to be the larger of the sector size reported by sqlite3OsSectorSize() and the page size. (CVS 3890) (check-in: e5e6af55cc user: drh tags: trunk) | |
Changes
Changes to ext/fts2/fts2.c.
︙ | ︙ | |||
704 705 706 707 708 709 710 711 712 713 714 715 716 717 | /*******************************************************************/ /* DLWriter is used to write doclist data to a DataBuffer. DLWriter ** always appends to the buffer and does not own it. ** ** dlwInit - initialize to write a given type doclistto a buffer. ** dlwDestroy - clear the writer's memory. Does not free buffer. ** dlwAppend - append raw doclist data to buffer. ** dlwAdd - construct doclist element and append to buffer. ** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter). */ typedef struct DLWriter { DocListType iType; DataBuffer *b; sqlite_int64 iPrevDocid; | > | 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 | /*******************************************************************/ /* DLWriter is used to write doclist data to a DataBuffer. DLWriter ** always appends to the buffer and does not own it. ** ** dlwInit - initialize to write a given type doclistto a buffer. ** dlwDestroy - clear the writer's memory. Does not free buffer. ** dlwAppend - append raw doclist data to buffer. ** dlwCopy - copy next doclist from reader to writer. ** dlwAdd - construct doclist element and append to buffer. ** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter). */ typedef struct DLWriter { DocListType iType; DataBuffer *b; sqlite_int64 iPrevDocid; |
︙ | ︙ | |||
766 767 768 769 770 771 772 773 774 775 776 777 778 779 | if( nFirstOld<nData ){ dataBufferAppend2(pWriter->b, c, nFirstNew, pData+nFirstOld, nData-nFirstOld); }else{ dataBufferAppend(pWriter->b, c, nFirstNew); } pWriter->iPrevDocid = iLastDocid; } static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n = putVarint(c, iDocid-pWriter->iPrevDocid); assert( pWriter->iPrevDocid<iDocid ); assert( pWriter->iType==DL_DOCIDS ); | > > > > | 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 | if( nFirstOld<nData ){ dataBufferAppend2(pWriter->b, c, nFirstNew, pData+nFirstOld, nData-nFirstOld); }else{ dataBufferAppend(pWriter->b, c, nFirstNew); } pWriter->iPrevDocid = iLastDocid; } static void dlwCopy(DLWriter *pWriter, DLReader *pReader){ dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader), dlrDocid(pReader), dlrDocid(pReader)); } static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n = putVarint(c, iDocid-pWriter->iPrevDocid); assert( pWriter->iPrevDocid<iDocid ); assert( pWriter->iType==DL_DOCIDS ); |
︙ | ︙ | |||
882 883 884 885 886 887 888 889 890 891 892 893 894 895 | /* PLWriter is used in constructing a document's position list. As a ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op. ** PLWriter writes to the associated DLWriter's buffer. ** ** plwInit - init for writing a document's poslist. ** plwDestroy - clear a writer. ** plwAdd - append position and offset information. ** plwTerminate - add any necessary doclist terminator. ** ** Calling plwAdd() after plwTerminate() may result in a corrupt ** doclist. */ /* TODO(shess) Until we've written the second item, we can cache the ** first item's information. Then we'd have three states: | > | 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 | /* PLWriter is used in constructing a document's position list. As a ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op. ** PLWriter writes to the associated DLWriter's buffer. ** ** plwInit - init for writing a document's poslist. ** plwDestroy - clear a writer. ** plwAdd - append position and offset information. ** plwCopy - copy next position's data from reader to writer. ** plwTerminate - add any necessary doclist terminator. ** ** Calling plwAdd() after plwTerminate() may result in a corrupt ** doclist. */ /* TODO(shess) Until we've written the second item, we can cache the ** first item's information. Then we'd have three states: |
︙ | ︙ | |||
940 941 942 943 944 945 946 947 948 949 950 951 952 953 | assert( iStartOffset>=pWriter->iOffset ); n += putVarint(c+n, iStartOffset-pWriter->iOffset); pWriter->iOffset = iStartOffset; assert( iEndOffset>=iStartOffset ); n += putVarint(c+n, iEndOffset-iStartOffset); } dataBufferAppend(pWriter->dlw->b, c, n); } static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n; pWriter->dlw = dlw; | > > > > | 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 | assert( iStartOffset>=pWriter->iOffset ); n += putVarint(c+n, iStartOffset-pWriter->iOffset); pWriter->iOffset = iStartOffset; assert( iEndOffset>=iStartOffset ); n += putVarint(c+n, iEndOffset-iStartOffset); } dataBufferAppend(pWriter->dlw->b, c, n); } static void plwCopy(PLWriter *pWriter, PLReader *pReader){ plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader), plrStartOffset(pReader), plrEndOffset(pReader)); } static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n; pWriter->dlw = dlw; |
︙ | ︙ | |||
1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 | } } /* Copy over any remaining elements. */ if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid); dlwDestroy(&writer); } /* pLeft and pRight are DLReaders positioned to the same docid. ** ** If there are no instances in pLeft or pRight where the position ** of pLeft is one less than the position of pRight, then this ** routine adds nothing to pOut. ** ** If there are one or more instances where positions from pLeft ** are exactly one less than positions from pRight, then add a new ** document record to pOut. If pOut wants to hold positions, then ** include the positions from pRight that are one more than a ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1. */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > | 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 | } } /* Copy over any remaining elements. */ if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid); dlwDestroy(&writer); } /* Helper function for posListUnion(). 
Compares the current position ** between left and right, returning as standard C idiom of <0 if ** left<right, >0 if left>right, and 0 if left==right. "End" always ** compares greater. */ static int posListCmp(PLReader *pLeft, PLReader *pRight){ assert( pLeft->iType==pRight->iType ); if( pLeft->iType==DL_DOCIDS ) return 0; if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1; if( plrAtEnd(pRight) ) return -1; if( plrColumn(pLeft)<plrColumn(pRight) ) return -1; if( plrColumn(pLeft)>plrColumn(pRight) ) return 1; if( plrPosition(pLeft)<plrPosition(pRight) ) return -1; if( plrPosition(pLeft)>plrPosition(pRight) ) return 1; if( pLeft->iType==DL_POSITIONS ) return 0; if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1; if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1; if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1; if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1; return 0; } /* Write the union of position lists in pLeft and pRight to pOut. ** "Union" in this case meaning "All unique position tuples". Should ** work with any doclist type, though both inputs and the output ** should be the same type. */ static void posListUnion(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){ PLReader left, right; PLWriter writer; assert( dlrDocid(pLeft)==dlrDocid(pRight) ); assert( pLeft->iType==pRight->iType ); assert( pLeft->iType==pOut->iType ); plrInit(&left, pLeft); plrInit(&right, pRight); plwInit(&writer, pOut, dlrDocid(pLeft)); while( !plrAtEnd(&left) || !plrAtEnd(&right) ){ int c = posListCmp(&left, &right); if( c<0 ){ plwCopy(&writer, &left); plrStep(&left); }else if( c>0 ){ plwCopy(&writer, &right); plrStep(&right); }else{ plwCopy(&writer, &left); plrStep(&left); plrStep(&right); } } plwTerminate(&writer); plwDestroy(&writer); plrDestroy(&left); plrDestroy(&right); } /* Write the union of doclists in pLeft and pRight to pOut. For ** docids in common between the inputs, the union of the position ** lists is written. 
Inputs and outputs are always type DL_DEFAULT. */ static void docListUnion( const char *pLeft, int nLeft, const char *pRight, int nRight, DataBuffer *pOut /* Write the combined doclist here */ ){ DLReader left, right; DLWriter writer; if( nLeft==0 ){ dataBufferAppend(pOut, pRight, nRight); return; } if( nRight==0 ){ dataBufferAppend(pOut, pLeft, nLeft); return; } dlrInit(&left, DL_DEFAULT, pLeft, nLeft); dlrInit(&right, DL_DEFAULT, pRight, nRight); dlwInit(&writer, DL_DEFAULT, pOut); while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){ if( dlrAtEnd(&right) ){ dlwCopy(&writer, &left); dlrStep(&left); }else if( dlrAtEnd(&left) ){ dlwCopy(&writer, &right); dlrStep(&right); }else if( dlrDocid(&left)<dlrDocid(&right) ){ dlwCopy(&writer, &left); dlrStep(&left); }else if( dlrDocid(&left)>dlrDocid(&right) ){ dlwCopy(&writer, &right); dlrStep(&right); }else{ posListUnion(&left, &right, &writer); dlrStep(&left); dlrStep(&right); } } dlrDestroy(&left); dlrDestroy(&right); dlwDestroy(&writer); } /* pLeft and pRight are DLReaders positioned to the same docid. ** ** If there are no instances in pLeft or pRight where the position ** of pLeft is one less than the position of pRight, then this ** routine adds nothing to pOut. ** ** If there are one or more instances where positions from pLeft ** are exactly one less than positions from pRight, then add a new ** document record to pOut. If pOut wants to hold positions, then ** include the positions from pRight that are one more than a ** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1. */ static void posListPhraseMerge(DLReader *pLeft, DLReader *pRight, DLWriter *pOut){ PLReader left, right; PLWriter writer; int match = 0; assert( dlrDocid(pLeft)==dlrDocid(pRight) ); assert( pOut->iType!=DL_POSITIONS_OFFSETS ); |
︙ | ︙ | |||
1298 1299 1300 1301 1302 1303 1304 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ if( dlrDocid(&left)<dlrDocid(&right) ){ dlrStep(&left); }else if( dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); }else{ | | | 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ if( dlrDocid(&left)<dlrDocid(&right) ){ dlrStep(&left); }else if( dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); }else{ posListPhraseMerge(&left, &right, &writer); dlrStep(&left); dlrStep(&right); } } dlrDestroy(&left); dlrDestroy(&right); |
︙ | ︙ | |||
4753 4754 4755 4756 4757 4758 4759 | dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix); pReader->pData += n+nSuffix; pReader->nData -= n+nSuffix; } } | | > > | > | 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 | dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix); pReader->pData += n+nSuffix; pReader->nData -= n+nSuffix; } } /* strcmp-style comparison of pReader's current term against pTerm. ** If isPrefix, equality means equal through nTerm bytes. */ static int leafReaderTermCmp(LeafReader *pReader, const char *pTerm, int nTerm, int isPrefix){ int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm; if( n==0 ){ if( pReader->term.nData>0 ) return -1; if(nTerm>0 ) return 1; return 0; } c = memcmp(pReader->term.pData, pTerm, n); if( c!=0 ) return c; if( isPrefix && n==nTerm ) return 0; return pReader->term.nData - nTerm; } /****************************************************************/ /* LeavesReader wraps LeafReader to allow iterating over the entire ** leaf layer of the tree. |
︙ | ︙ | |||
4912 4913 4914 4915 4916 4917 4918 | if( leavesReaderAtEnd(lr1) ){ if( leavesReaderAtEnd(lr2) ) return 0; return 1; } if( leavesReaderAtEnd(lr2) ) return -1; return leafReaderTermCmp(&lr1->leafReader, | | > | 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 | if( leavesReaderAtEnd(lr1) ){ if( leavesReaderAtEnd(lr2) ) return 0; return 1; } if( leavesReaderAtEnd(lr2) ) return -1; return leafReaderTermCmp(&lr1->leafReader, leavesReaderTerm(lr2), leavesReaderTermBytes(lr2), 0); } /* Similar to leavesReaderTermCmp(), with additional ordering by idx ** so that older segments sort before newer segments. */ static int leavesReaderCmp(LeavesReader *lr1, LeavesReader *lr2){ int c = leavesReaderTermCmp(lr1, lr2); |
︙ | ︙ | |||
5101 5102 5103 5104 5105 5106 5107 | } /* Scan pReader for pTerm/nTerm, and merge the term's doclist over ** *out (any doclists with duplicate docids overwrite those in *out). ** Internal function for loadSegmentLeaf(). */ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, | | > > | | | > > > > > > > > > | > | | > | | > | | 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 | } /* Scan pReader for pTerm/nTerm, and merge the term's doclist over ** *out (any doclists with duplicate docids overwrite those in *out). ** Internal function for loadSegmentLeaf(). */ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ assert( nTerm>0 ); /* Process while the prefix matches. */ while( !leavesReaderAtEnd(pReader) ){ /* TODO(shess) Really want leavesReaderTermCmp(), but that name is ** already taken to compare the terms of two LeavesReaders. Think ** on a better name. [Meanwhile, break encapsulation rather than ** use a confusing name.] */ int rc; int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix); if( c==0 ){ const char *pData = leavesReaderData(pReader); int nData = leavesReaderDataBytes(pReader); if( out->nData==0 ){ dataBufferReplace(out, pData, nData); }else{ DataBuffer result; dataBufferInit(&result, out->nData+nData); docListUnion(out->pData, out->nData, pData, nData, &result); dataBufferDestroy(out); *out = result; /* TODO(shess) Rather than destroy out, we could retain it for ** later reuse. */ } } if( c>0 ) break; /* Past any possible matches. 
*/ rc = leavesReaderStep(v, pReader); if( rc!=SQLITE_OK ) return rc; } return SQLITE_OK; } /* Call loadSegmentLeavesInt() with pData/nData as input. */ static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ LeavesReader reader; int rc; assert( nData>1 ); assert( *pData=='\0' ); rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader); if( rc!=SQLITE_OK ) return rc; rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); leavesReaderReset(&reader); leavesReaderDestroy(&reader); return rc; } /* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to ** iEndLeaf (inclusive) as input, and merge the resulting doclist into ** out. */ static int loadSegmentLeaves(fulltext_vtab *v, sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ int rc; LeavesReader reader; assert( iStartLeaf<=iEndLeaf ); rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader); if( rc!=SQLITE_OK ) return rc; rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out); leavesReaderReset(&reader); leavesReaderDestroy(&reader); return rc; } /* Taking pData/nData as an interior node, find the sequence of child ** nodes which could include pTerm/nTerm/isPrefix. Note that the |
︙ | ︙ | |||
5254 5255 5256 5257 5258 5259 5260 | */ static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData, sqlite_int64 iLeavesEnd, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ /* Special case where root is a leaf. */ if( *pData=='\0' ){ | < | | 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 | */ static int loadSegmentInt(fulltext_vtab *v, const char *pData, int nData, sqlite_int64 iLeavesEnd, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ /* Special case where root is a leaf. */ if( *pData=='\0' ){ return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out); }else{ int rc; sqlite_int64 iStartChild, iEndChild; /* Process pData as an interior node, then loop down the tree ** until we find the set of leaf nodes to scan for the term. */ |
︙ | ︙ | |||
5286 5287 5288 5289 5290 5291 5292 | assert( iNextStart<=iNextEnd ); iStartChild = iNextStart; iEndChild = iNextEnd; } assert( iStartChild<=iLeavesEnd ); assert( iEndChild<=iLeavesEnd ); | < | > < < < < | 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 | assert( iNextStart<=iNextEnd ); iStartChild = iNextStart; iEndChild = iNextEnd; } assert( iStartChild<=iLeavesEnd ); assert( iEndChild<=iLeavesEnd ); return loadSegmentLeaves(v, iStartChild, iEndChild, pTerm, nTerm, isPrefix, out); } } /* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then ** merge its doclist over *out (any duplicate doclists read from the ** segment rooted at pData will overwrite those in *out). */ static int loadSegment(fulltext_vtab *v, const char *pData, int nData, sqlite_int64 iLeavesEnd, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ DataBuffer result; int rc; |
︙ | ︙ |