/ Check-in [f6e0b080]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Store minimal terms in interior nodes. Whenever there's a break between leaf nodes, instead of storing the entire leftmost term of the rightmost child, store only that portion of the leftmost term necessary to distinguish it from the rightmost term of the leftmost child. (CVS 3513)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:f6e0b080dcfaf554b2c05df5e7d4db69d012fba3
User & Date: shess 2006-11-18 00:12:45
Context
2006-11-18
20:20
Make sure VACUUM cleans up after itself. Ticket #2071. (CVS 3514) check-in: 2fdc147d user: drh tags: trunk
00:12
Store minimal terms in interior nodes. Whenever there's a break between leaf nodes, instead of storing the entire leftmost term of the rightmost child, store only that portion of the leftmost term necessary to distinguish it from the rightmost term of the leftmost child. (CVS 3513) check-in: f6e0b080 user: shess tags: trunk
2006-11-17
21:12
Refactoring groundwork for coming work on interior nodes. Change LeafWriter to use empty data buffer (instead of empty term) to detect an empty block. Code to validate interior nodes. Moderate revisions to leaf-node and doclist validation. Recast leafWriterStep() in terms of LeafWriterStepMerge(). (CVS 3512) check-in: f30771d5 user: shess tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts2/fts2.c.

   148    148   **
   149    149   ** An interior node encodes n terms separating n+1 subtrees.  The
   150    150   ** subtree blocks are contiguous, so only the first subtree's blockid
   151    151   ** is encoded.  The subtree at iBlockid will contain all terms less
   152    152   ** than the first term encoded (or all terms if no term is encoded).
   153    153   ** Otherwise, for terms greater than or equal to pTerm[i] but less
   154    154   ** than pTerm[i+1], the subtree for that term will be rooted at
   155         -** iBlockid+i.
          155  +** iBlockid+i.  Interior nodes only store enough term data to
          156  +** distinguish adjacent children (if the rightmost term of the left
          157  +** child is "something", and the leftmost term of the right child is
          158  +** "wicked", only "w" is stored).
   156    159   **
   157    160   ** New data is spilled to a new interior node at the same height when
   158    161   ** the current node exceeds INTERIOR_MAX bytes (default 2048).
   159    162   ** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
   160    163   ** interior nodes and making the tree too skinny.  The interior nodes
   161    164   ** at a given height are naturally tracked by interior nodes at
   162    165   ** height+1, and so on.
................................................................................
  3956   3959     int iLevel;
  3957   3960     int idx;
  3958   3961     sqlite_int64 iStartBlockid;     /* needed to create the root info */
  3959   3962     sqlite_int64 iEndBlockid;       /* when we're done writing. */
  3960   3963   
  3961   3964     DataBuffer term;                /* previous encoded term */
  3962   3965     DataBuffer data;                /* encoding buffer */
         3966  +
         3967  +  /* bytes of first term in the current node which distinguishes that
         3968  +  ** term from the last term of the previous node.
         3969  +  */
         3970  +  int nTermDistinct;
  3963   3971   
  3964   3972     InteriorWriter parentWriter;    /* if we overflow */
  3965   3973     int has_parent;
  3966   3974   } LeafWriter;
  3967   3975   
  3968   3976   static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
  3969   3977     CLEAR(pWriter);
................................................................................
  4068   4076   
  4069   4077     /* Reconstruct the first term in the leaf for purposes of building
  4070   4078     ** the interior node.
  4071   4079     */
  4072   4080     n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  4073   4081     pStartingTerm = pWriter->data.pData+iData+1+n;
  4074   4082     assert( pWriter->data.nData>iData+1+n+nStartingTerm );
         4083  +  assert( pWriter->nTermDistinct>0 );
         4084  +  assert( pWriter->nTermDistinct<=nStartingTerm );
         4085  +  nStartingTerm = pWriter->nTermDistinct;
  4075   4086   
  4076   4087     if( pWriter->has_parent ){
  4077   4088       interiorWriterAppend(&pWriter->parentWriter,
  4078   4089                            pStartingTerm, nStartingTerm, iBlockid);
  4079   4090     }else{
  4080   4091       interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
  4081   4092                          &pWriter->parentWriter);
................................................................................
  4162   4173   
  4163   4174   static void leafWriterDestroy(LeafWriter *pWriter){
  4164   4175     if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
  4165   4176     dataBufferDestroy(&pWriter->term);
  4166   4177     dataBufferDestroy(&pWriter->data);
  4167   4178   }
  4168   4179   
  4169         -/* Encode a term into the leafWriter, delta-encoding as appropriate. */
  4170         -static void leafWriterEncodeTerm(LeafWriter *pWriter,
  4171         -                                 const char *pTerm, int nTerm){
         4180  +/* Encode a term into the leafWriter, delta-encoding as appropriate.
         4181  +** Returns the length of the new term which distinguishes it from the
         4182  +** previous term, which can be used to set nTermDistinct when a node
         4183  +** boundary is crossed.
         4184  +*/
         4185  +static int leafWriterEncodeTerm(LeafWriter *pWriter,
         4186  +                                const char *pTerm, int nTerm){
  4172   4187     char c[VARINT_MAX+VARINT_MAX];
  4173         -  int n;
         4188  +  int n, nPrefix = 0;
         4189  +
         4190  +  assert( nTerm>0 );
         4191  +  while( nPrefix<pWriter->term.nData &&
         4192  +         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
         4193  +    nPrefix++;
         4194  +    /* Failing this implies that the terms weren't in order. */
         4195  +    assert( nPrefix<nTerm );
         4196  +  }
  4174   4197   
  4175   4198     if( pWriter->data.nData==0 ){
  4176   4199       /* Encode the node header and leading term as:
  4177   4200       **  varint(0)
  4178   4201       **  varint(nTerm)
  4179   4202       **  char pTerm[nTerm]
  4180   4203       */
................................................................................
  4183   4206       dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  4184   4207     }else{
  4185   4208       /* Delta-encode the term as:
  4186   4209       **  varint(nPrefix)
  4187   4210       **  varint(nSuffix)
  4188   4211       **  char pTermSuffix[nSuffix]
  4189   4212       */
  4190         -    int nPrefix = 0;
  4191         -
  4192         -    assert( nTerm>0 );
  4193         -    while( nPrefix<pWriter->term.nData &&
  4194         -           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
  4195         -      nPrefix++;
  4196         -      /* Failing this implies that the terms weren't in order. */
  4197         -      assert( nPrefix<nTerm );
  4198         -    }
  4199         -
  4200   4213       n = putVarint(c, nPrefix);
  4201   4214       n += putVarint(c+n, nTerm-nPrefix);
  4202   4215       dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  4203   4216     }
  4204   4217     dataBufferReplace(&pWriter->term, pTerm, nTerm);
         4218  +
         4219  +  return nPrefix+1;
  4205   4220   }
  4206   4221   
  4207   4222   /* Used to avoid a memmove when a large amount of doclist data is in
  4208   4223   ** the buffer.  This constructs a node and term header before
  4209   4224   ** iDoclistData and flushes the resulting complete node using
  4210   4225   ** leafWriterInternalFlush().
  4211   4226   */
................................................................................
  4234   4249   ** %_segments.
  4235   4250   */
  4236   4251   static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
  4237   4252                                  const char *pTerm, int nTerm,
  4238   4253                                  DLReader *pReaders, int nReaders){
  4239   4254     char c[VARINT_MAX+VARINT_MAX];
  4240   4255     int iTermData = pWriter->data.nData, iDoclistData;
  4241         -  int i, nData, n, nActualData, nActual, rc;
         4256  +  int i, nData, n, nActualData, nActual, rc, nTermDistinct;
  4242   4257   
  4243   4258     ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  4244         -  leafWriterEncodeTerm(pWriter, pTerm, nTerm);
         4259  +  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);
         4260  +
         4261  +  /* Remember nTermDistinct if opening a new node. */
         4262  +  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;
  4245   4263   
  4246   4264     iDoclistData = pWriter->data.nData;
  4247   4265   
  4248   4266     /* Estimate the length of the merged doclist so we can leave space
  4249   4267     ** to encode it.
  4250   4268     */
  4251   4269     for(i=0, nData=0; i<nReaders; i++){
................................................................................
  4279   4297     ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  4280   4298     */
  4281   4299     if( nTerm+nActualData>STANDALONE_MIN ){
  4282   4300       /* Push leaf node from before this term. */
  4283   4301       if( iTermData>0 ){
  4284   4302         rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
  4285   4303         if( rc!=SQLITE_OK ) return rc;
         4304  +
         4305  +      pWriter->nTermDistinct = nTermDistinct;
  4286   4306       }
  4287   4307   
  4288   4308       /* Fix the encoded doclist length. */
  4289   4309       iDoclistData += n - nActual;
  4290   4310       memcpy(pWriter->data.pData+iDoclistData, c, nActual);
  4291   4311   
  4292   4312       /* Push the standalone leaf node. */
................................................................................
  4318   4338     ** doclist lengths.  At some point, change to
  4319   4339     ** pWriter->data.nData>LEAF_MAX.
  4320   4340     */
  4321   4341     if( iTermData+nTerm+nActualData>LEAF_MAX ){
  4322   4342       /* Flush out the leading data as a node */
  4323   4343       rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
  4324   4344       if( rc!=SQLITE_OK ) return rc;
         4345  +
         4346  +    pWriter->nTermDistinct = nTermDistinct;
  4325   4347   
  4326   4348       /* Rebuild header using the current term */
  4327   4349       n = putVarint(pWriter->data.pData, 0);
  4328   4350       n += putVarint(pWriter->data.pData+n, nTerm);
  4329   4351       memcpy(pWriter->data.pData+n, pTerm, nTerm);
  4330   4352       n += nTerm;
  4331   4353