SQLite

Check-in [feef1b15d6]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change prefix search from O(N*M) to O(NlogM). The previous code linearly merged the doclists, so as the accumulated list got large, things got slow (the M term, a fucntion of the number of documents in the index). This change does pairwise merges until a single doclist remains. A test search of 't*' against a database of RFC text improves from 1m16s to 4.75s. (CVS 4599)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: feef1b15d645d638b4a05742f214b0445fa7e176
User & Date: shess 2007-12-07 23:47:53.000
References
2010-07-19
11:16
Re-introduce the prefix-search optimization of [feef1b15d6], which was lost in a reorganization of FTS3 code. (check-in: d692434b49 user: dan tags: trunk)
Context
2007-12-08
17:55
Fix a bug in the debugging printf logic. (CVS 4600) (check-in: 1d6a9f5faf user: drh tags: trunk)
2007-12-07
23:47
Change prefix search from O(N*M) to O(NlogM). The previous code linearly merged the doclists, so as the accumulated list got large, things got slow (the M term, a fucntion of the number of documents in the index). This change does pairwise merges until a single doclist remains. A test search of 't*' against a database of RFC text improves from 1m16s to 4.75s. (CVS 4599) (check-in: feef1b15d6 user: shess tags: trunk)
18:55
In shared-cache mode, make sure the busy hander invoked is the busy handler associated with the database connection that caused the lock contention in the first place. (CVS 4598) (check-in: c9eb65912f user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts3/fts3.c.
433
434
435
436
437
438
439

440
441
442
443
444
445
446
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.

** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */







>







433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
/* DataBuffer is used to collect data into a buffer in piecemeal
** fashion.  It implements the usual distinction between amount of
** data currently stored (nData) and buffer capacity (nCapacity).
**
** dataBufferInit - create a buffer with given initial capacity.
** dataBufferReset - forget buffer's data, retaining capacity.
** dataBufferDestroy - free buffer's data.
** dataBufferSwap - swap contents of two buffers.
** dataBufferExpand - expand capacity without adding data.
** dataBufferAppend - append data.
** dataBufferAppend2 - append two pieces of data at once.
** dataBufferReplace - replace buffer's data.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
456
457
458
459
460
461
462





463
464
465
466
467
468
469
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
  SCRAMBLE(pBuffer);





}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */







>
>
>
>
>







457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
}
static void dataBufferReset(DataBuffer *pBuffer){
  pBuffer->nData = 0;
}
static void dataBufferDestroy(DataBuffer *pBuffer){
  if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
  SCRAMBLE(pBuffer);
}
static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
  DataBuffer tmp = *pBuffer1;
  *pBuffer1 = *pBuffer2;
  *pBuffer2 = tmp;
}
static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
  assert( nAddCapacity>0 );
  /* TODO(shess) Consider expanding more aggressively.  Note that the
  ** underlying malloc implementation may take care of such things for
  ** us already.
  */
5550
5551
5552
5553
5554
5555
5556




















5557
5558
5559
5560
5561
5562
5563
5564









5565
5566
5567
5568

5569
5570
5571
5572
5573
5574
5575

5576
5577
5578
5579

5580


5581



5582

5583






5584








5585

5586


5587







5588


5589


5590



5591













5592











5593



5594



5595
5596
5597
5598
5599
5600
5601
5602
5603
5604
 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}





















/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){









  assert( nTerm>0 );

  /* Process while the prefix matches. */
  while( !leavesReaderAtEnd(pReader) ){

    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */
    int rc;
    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);

    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int nData = leavesReaderDataBytes(pReader);
      if( out->nData==0 ){

        dataBufferReplace(out, pData, nData);


      }else{



        DataBuffer result;

        dataBufferInit(&result, out->nData+nData);






        docListUnion(out->pData, out->nData, pData, nData, &result);








        dataBufferDestroy(out);

        *out = result;


        /* TODO(shess) Rather than destroy out, we could retain it for







        ** later reuse.


        */


      }



    }













    if( c>0 ) break;      /* Past any possible matches. */















    rc = leavesReaderStep(v, pReader);



    if( rc!=SQLITE_OK ) return rc;
  }
  return SQLITE_OK;
}

/* Call loadSegmentLeavesInt() with pData/nData as input. */
static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
                           const char *pTerm, int nTerm, int isPrefix,
                           DataBuffer *out){
  LeavesReader reader;







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>








>
>
>
>
>
>
>
>
>


<
|
>





<

>


|
|
>
|
>
>
|
>
>
>
|
>
|
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
|
>
|
>
>
|
>
>
>
>
>
>
>
|
>
>

>
>
|
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
|
>
>
>
|
|
|







5556
5557
5558
5559
5560
5561
5562
5563
5564
5565
5566
5567
5568
5569
5570
5571
5572
5573
5574
5575
5576
5577
5578
5579
5580
5581
5582
5583
5584
5585
5586
5587
5588
5589
5590
5591
5592
5593
5594
5595
5596
5597
5598
5599
5600
5601

5602
5603
5604
5605
5606
5607
5608

5609
5610
5611
5612
5613
5614
5615
5616
5617
5618
5619
5620
5621
5622
5623
5624
5625
5626
5627
5628
5629
5630
5631
5632
5633
5634
5635
5636
5637
5638
5639
5640
5641
5642
5643
5644
5645
5646
5647
5648
5649
5650
5651
5652
5653
5654
5655
5656
5657
5658
5659
5660
5661
5662
5663
5664
5665
5666
5667
5668
5669
5670
5671
5672
5673
5674
5675
5676
5677
5678
5679
5680
5681
5682
5683
5684
5685
5686
5687
5688
5689
5690
5691
5692
5693
5694
5695
5696
5697
5698
5699
5700
5701
5702
5703
5704
5705
5706
5707
 err:
  for(i=0; i<MERGE_COUNT; i++){
    leavesReaderDestroy(&lrs[i]);
  }
  leafWriterDestroy(&writer);
  return rc;
}

/* Accumulate the union of *acc and *pData into *acc. */
static void docListAccumulateUnion(DataBuffer *acc,
                                   const char *pData, int nData) {
  DataBuffer tmp = *acc;
  dataBufferInit(acc, tmp.nData+nData);
  docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
  dataBufferDestroy(&tmp);
}

/* TODO(shess) It might be interesting to explore different merge
** strategies, here.  For instance, since this is a sorted merge, we
** could easily merge many doclists in parallel.  With some
** comprehension of the storage format, we could merge all of the
** doclists within a leaf node directly from the leaf node's storage.
** It may be worthwhile to merge smaller doclists before larger
** doclists, since they can be traversed more quickly - but the
** results may have less overlap, making them more expensive in a
** different way.
*/

/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
** *out (any doclists with duplicate docids overwrite those in *out).
** Internal function for loadSegmentLeaf().
*/
static int loadSegmentLeavesInt(fulltext_vtab *v, LeavesReader *pReader,
                                const char *pTerm, int nTerm, int isPrefix,
                                DataBuffer *out){
  /* doclist data is accumulated into pBuffers similar to how one does
  ** increment in binary arithmetic.  If index 0 is empty, the data is
  ** stored there.  If there is data there, it is merged and the
  ** results carried into position 1, with further merge-and-carry
  ** until an empty position is found.
  */
  DataBuffer *pBuffers = NULL;
  int nBuffers = 0, nMaxBuffers = 0, rc;

  assert( nTerm>0 );


  for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
      rc=leavesReaderStep(v, pReader)){
    /* TODO(shess) Really want leavesReaderTermCmp(), but that name is
    ** already taken to compare the terms of two LeavesReaders.  Think
    ** on a better name.  [Meanwhile, break encapsulation rather than
    ** use a confusing name.]
    */

    int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
    if( c>0 ) break;      /* Past any possible matches. */
    if( c==0 ){
      const char *pData = leavesReaderData(pReader);
      int iBuffer, nData = leavesReaderDataBytes(pReader);

      /* Find the first empty buffer. */
      for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
        if( 0==pBuffers[iBuffer].nData ) break;
      }

      /* Out of buffers, add an empty one. */
      if( iBuffer==nBuffers ){
        if( nBuffers==nMaxBuffers ){
          DataBuffer *p;
          nMaxBuffers += 20;

          /* Manual realloc so we can handle NULL appropriately. */
          p = sqlite3_malloc(nMaxBuffers*sizeof(*pBuffers));
          if( p==NULL ){
            rc = SQLITE_NOMEM;
            break;
          }

          if( nBuffers>0 ){
            assert(pBuffers!=NULL);
            memcpy(p, pBuffers, nBuffers*sizeof(*pBuffers));
            sqlite3_free(pBuffers);
          }
          pBuffers = p;
        }
        dataBufferInit(&(pBuffers[nBuffers]), 0);
        nBuffers++;
      }

      /* At this point, must have an empty at iBuffer. */
      assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);

      /* If empty was first buffer, no need for merge logic. */
      if( iBuffer==0 ){
        dataBufferReplace(&(pBuffers[0]), pData, nData);
      }else{
        /* pAcc is the empty buffer the merged data will end up in. */
        DataBuffer *pAcc = &(pBuffers[iBuffer]);
        DataBuffer *p = &(pBuffers[0]);

        /* Handle position 0 specially to avoid need to prime pAcc
        ** with pData/nData.
        */
        dataBufferSwap(p, pAcc);
        docListAccumulateUnion(pAcc, pData, nData);

        /* Accumulate remaining doclists into pAcc. */
        for(++p; p<pAcc; ++p){
          docListAccumulateUnion(pAcc, p->pData, p->nData);

          /* dataBufferReset() could allow a large doclist to blow up
          ** our memory requirements.
          */
          if( p->nCapacity<1024 ){
            dataBufferReset(p);
          }else{
            dataBufferDestroy(p);
            dataBufferInit(p, 0);
          }
        }
      }
    }
  }

  /* Union all the doclists together into *out. */
  /* TODO(shess) What if *out is big?  Sigh. */
  if( rc==SQLITE_OK && nBuffers>0 ){
    int iBuffer;
    for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
      if( pBuffers[iBuffer].nData>0 ){
        if( out->nData==0 ){
          dataBufferSwap(out, &(pBuffers[iBuffer]));
        }else{
          docListAccumulateUnion(out, pBuffers[iBuffer].pData,
                                 pBuffers[iBuffer].nData);
        }
      }
    }
  }

  while( nBuffers-- ){
    dataBufferDestroy(&(pBuffers[nBuffers]));
  }
  if( pBuffers!=NULL ) sqlite3_free(pBuffers);

  return rc;
}

/* Call loadSegmentLeavesInt() with pData/nData as input. */
static int loadSegmentLeaf(fulltext_vtab *v, const char *pData, int nData,
                           const char *pTerm, int nTerm, int isPrefix,
                           DataBuffer *out){
  LeavesReader reader;