Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Change prefix search from O(N*M) to O(NlogM). The previous code linearly merged the doclists, so as the accumulated list got large, things got slow (the M term, a fucntion of the number of documents in the index). This change does pairwise merges until a single doclist remains. A test search of 't*' against a database of RFC text improves from 1m16s to 4.75s. (CVS 4599) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
feef1b15d645d638b4a05742f214b044 |
User & Date: | shess 2007-12-07 23:47:53.000 |
References
2010-07-19
| ||
11:16 | Re-introduce the prefix-search optimization of [feef1b15d6], which was lost in a reorganization of FTS3 code. (check-in: d692434b49 user: dan tags: trunk) | |
Context
2007-12-08
| ||
17:55 | Fix a bug in the debugging printf logic. (CVS 4600) (check-in: 1d6a9f5faf user: drh tags: trunk) | |
2007-12-07
| ||
23:47 | Change prefix search from O(N*M) to O(NlogM). The previous code linearly merged the doclists, so as the accumulated list got large, things got slow (the M term, a fucntion of the number of documents in the index). This change does pairwise merges until a single doclist remains. A test search of 't*' against a database of RFC text improves from 1m16s to 4.75s. (CVS 4599) (check-in: feef1b15d6 user: shess tags: trunk) | |
18:55 | In shared-cache mode, make sure the busy hander invoked is the busy handler associated with the database connection that caused the lock contention in the first place. (CVS 4598) (check-in: c9eb65912f user: drh tags: trunk) | |
Changes
Changes to ext/fts3/fts3.c.
︙ | ︙ | |||
433 434 435 436 437 438 439 440 441 442 443 444 445 446 | /* DataBuffer is used to collect data into a buffer in piecemeal ** fashion. It implements the usual distinction between amount of ** data currently stored (nData) and buffer capacity (nCapacity). ** ** dataBufferInit - create a buffer with given initial capacity. ** dataBufferReset - forget buffer's data, retaining capacity. ** dataBufferDestroy - free buffer's data. ** dataBufferExpand - expand capacity without adding data. ** dataBufferAppend - append data. ** dataBufferAppend2 - append two pieces of data at once. ** dataBufferReplace - replace buffer's data. */ typedef struct DataBuffer { char *pData; /* Pointer to malloc'ed buffer. */ | > | 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 | /* DataBuffer is used to collect data into a buffer in piecemeal ** fashion. It implements the usual distinction between amount of ** data currently stored (nData) and buffer capacity (nCapacity). ** ** dataBufferInit - create a buffer with given initial capacity. ** dataBufferReset - forget buffer's data, retaining capacity. ** dataBufferDestroy - free buffer's data. ** dataBufferSwap - swap contents of two buffers. ** dataBufferExpand - expand capacity without adding data. ** dataBufferAppend - append data. ** dataBufferAppend2 - append two pieces of data at once. ** dataBufferReplace - replace buffer's data. */ typedef struct DataBuffer { char *pData; /* Pointer to malloc'ed buffer. */ |
︙ | ︙ | |||
456 457 458 459 460 461 462 463 464 465 466 467 468 469 | } static void dataBufferReset(DataBuffer *pBuffer){ pBuffer->nData = 0; } static void dataBufferDestroy(DataBuffer *pBuffer){ if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData); SCRAMBLE(pBuffer); } static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){ assert( nAddCapacity>0 ); /* TODO(shess) Consider expanding more aggressively. Note that the ** underlying malloc implementation may take care of such things for ** us already. */ | > > > > > | 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 | } static void dataBufferReset(DataBuffer *pBuffer){ pBuffer->nData = 0; } static void dataBufferDestroy(DataBuffer *pBuffer){ if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData); SCRAMBLE(pBuffer); } static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){ DataBuffer tmp = *pBuffer1; *pBuffer1 = *pBuffer2; *pBuffer2 = tmp; } static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){ assert( nAddCapacity>0 ); /* TODO(shess) Consider expanding more aggressively. Note that the ** underlying malloc implementation may take care of such things for ** us already. */ |
︙ | ︙ | |||
5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 | err: for(i=0; i<MERGE_COUNT; i++){ leavesReaderDestroy(&lrs[i]); } leafWriterDestroy(&writer); return rc; } /* Scan pReader for pTerm/nTerm, and merge the term's doclist over ** *out (any doclists with duplicate docids overwrite those in *out). ** Internal function for loadSegmentLeaf(). */ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ assert( nTerm>0 ); | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < | > < > | | > | > > | > > > | > | > > > > > > | > > > > > > > > | > | > > | > > > > > > > | > > > > | > > > | > > > > > > > > > > > > > | > > > > > > > > > > > | > > > | > > > | | | | 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 | err: for(i=0; i<MERGE_COUNT; i++){ leavesReaderDestroy(&lrs[i]); } leafWriterDestroy(&writer); return rc; } /* Accumulate the union of *acc and *pData into *acc. */ static void docListAccumulateUnion(DataBuffer *acc, const char *pData, int nData) { DataBuffer tmp = *acc; dataBufferInit(acc, tmp.nData+nData); docListUnion(tmp.pData, tmp.nData, pData, nData, acc); dataBufferDestroy(&tmp); } /* TODO(shess) It might be interesting to explore different merge ** strategies, here. For instance, since this is a sorted merge, we ** could easily merge many doclists in parallel. With some ** comprehension of the storage format, we could merge all of the ** doclists within a leaf node directly from the leaf node's storage. ** It may be worthwhile to merge smaller doclists before larger ** doclists, since they can be traversed more quickly - but the ** results may have less overlap, making them more expensive in a ** different way. */ /* Scan pReader for pTerm/nTerm, and merge the term's doclist over ** *out (any doclists with duplicate docids overwrite those in *out). ** Internal function for loadSegmentLeaf(). */ static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ /* doclist data is accumulated into pBuffers similar to how one does ** increment in binary arithmetic. If index 0 is empty, the data is ** stored there. If there is data there, it is merged and the ** results carried into position 1, with further merge-and-carry ** until an empty position is found. */ DataBuffer *pBuffers = NULL; int nBuffers = 0, nMaxBuffers = 0, rc; assert( nTerm>0 ); for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader); rc=leavesReaderStep(v, pReader)){ /* TODO(shess) Really want leavesReaderTermCmp(), but that name is ** already taken to compare the terms of two LeavesReaders. Think ** on a better name. [Meanwhile, break encapsulation rather than ** use a confusing name.] */ int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix); if( c>0 ) break; /* Past any possible matches. */ if( c==0 ){ const char *pData = leavesReaderData(pReader); int iBuffer, nData = leavesReaderDataBytes(pReader); /* Find the first empty buffer. */ for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){ if( 0==pBuffers[iBuffer].nData ) break; } /* Out of buffers, add an empty one. */ if( iBuffer==nBuffers ){ if( nBuffers==nMaxBuffers ){ DataBuffer *p; nMaxBuffers += 20; /* Manual realloc so we can handle NULL appropriately. */ p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers)); if( p==NULL ){ rc = SQLITE_NOMEM; break; } if( nBuffers>0 ){ assert(pBuffers!=NULL); memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers)); sqlite3_free(pBuffers); } pBuffers = p; } dataBufferInit(&(pBuffers[nBuffers]), 0); nBuffers++; } /* At this point, must have an empty at iBuffer. */ assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0); /* If empty was first buffer, no need for merge logic. */ if( iBuffer==0 ){ dataBufferReplace(&(pBuffers[0]), pData, nData); }else{ /* pAcc is the empty buffer the merged data will end up in. */ DataBuffer *pAcc = &(pBuffers[iBuffer]); DataBuffer *p = &(pBuffers[0]); /* Handle position 0 specially to avoid need to prime pAcc ** with pData/nData. */ dataBufferSwap(p, pAcc); docListAccumulateUnion(pAcc, pData, nData); /* Accumulate remaining doclists into pAcc. */ for(++p; p<pAcc; ++p){ docListAccumulateUnion(pAcc, p->pData, p->nData); /* dataBufferReset() could allow a large doclist to blow up ** our memory requirements. */ if( p->nCapacity<1024 ){ dataBufferReset(p); }else{ dataBufferDestroy(p); dataBufferInit(p, 0); } } } } } /* Union all the doclists together into *out. */ /* TODO(shess) What if *out is big? Sigh. */ if( rc==SQLITE_OK && nBuffers>0 ){ int iBuffer; for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){ if( pBuffers[iBuffer].nData>0 ){ if( out->nData==0 ){ dataBufferSwap(out, &(pBuffers[iBuffer])); }else{ docListAccumulateUnion(out, pBuffers[iBuffer].pData, pBuffers[iBuffer].nData); } } } } while( nBuffers-- ){ dataBufferDestroy(&(pBuffers[nBuffers])); } if( pBuffers!=NULL ) sqlite3_free(pBuffers); return rc; } /* Call loadSegmentLeavesInt() with pData/nData as input. */ static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData, const char *pTerm, int nTerm, int isPrefix, DataBuffer *out){ LeavesReader reader; |
︙ | ︙ |