/ Check-in [f6e0b080]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Store minimal terms in interior nodes. Whenever there's a break between leaf nodes, instead of storing the entire leftmost term of the rightmost child, store only that portion of the leftmost term necessary to distinguish it from the rightmost term of the leftmost child. (CVS 3513)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:f6e0b080dcfaf554b2c05df5e7d4db69d012fba3
User & Date: shess 2006-11-18 00:12:45
Context
2006-11-18
20:20
Make sure VACUUM cleans up after itself. Ticket #2071. (CVS 3514) check-in: 2fdc147d user: drh tags: trunk
00:12
Store minimal terms in interior nodes. Whenever there's a break between leaf nodes, instead of storing the entire leftmost term of the rightmost child, store only that portion of the leftmost term necessary to distinguish it from the rightmost term of the leftmost child. (CVS 3513) check-in: f6e0b080 user: shess tags: trunk
2006-11-17
21:12
Refactoring groundwork for coming work on interior nodes. Change LeafWriter to use empty data buffer (instead of empty term) to detect an empty block. Code to validate interior nodes. Moderate revisions to leaf-node and doclist validation. Recast leafWriterStep() in terms of LeafWriterStepMerge(). (CVS 3512) check-in: f30771d5 user: shess tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts2/fts2.c.

148
149
150
151
152
153
154
155



156
157
158
159
160
161
162
....
3956
3957
3958
3959
3960
3961
3962





3963
3964
3965
3966
3967
3968
3969
....
4068
4069
4070
4071
4072
4073
4074



4075
4076
4077
4078
4079
4080
4081
....
4162
4163
4164
4165
4166
4167
4168
4169




4170
4171
4172
4173








4174
4175
4176
4177
4178
4179
4180
....
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
4204


4205
4206
4207
4208
4209
4210
4211
....
4234
4235
4236
4237
4238
4239
4240
4241
4242
4243
4244



4245
4246
4247
4248
4249
4250
4251
....
4279
4280
4281
4282
4283
4284
4285


4286
4287
4288
4289
4290
4291
4292
....
4318
4319
4320
4321
4322
4323
4324


4325
4326
4327
4328
4329
4330
4331
**
** An interior node encodes n terms separating n+1 subtrees.  The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded.  The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i.



**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny.  The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
................................................................................
  int iLevel;
  int idx;
  sqlite_int64 iStartBlockid;     /* needed to create the root info */
  sqlite_int64 iEndBlockid;       /* when we're done writing. */

  DataBuffer term;                /* previous encoded term */
  DataBuffer data;                /* encoding buffer */






  InteriorWriter parentWriter;    /* if we overflow */
  int has_parent;
} LeafWriter;

static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
  CLEAR(pWriter);
................................................................................

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );




  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
................................................................................

static void leafWriterDestroy(LeafWriter *pWriter){
  if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
  dataBufferDestroy(&pWriter->term);
  dataBufferDestroy(&pWriter->data);
}

/* Encode a term into the leafWriter, delta-encoding as appropriate. */




static void leafWriterEncodeTerm(LeafWriter *pWriter,
                                 const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n;









  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
................................................................................
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */
    int nPrefix = 0;

    assert( nTerm>0 );
    while( nPrefix<pWriter->term.nData &&
           pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
      nPrefix++;
      /* Failing this implies that the terms weren't in order. */
      assert( nPrefix<nTerm );
    }

    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  dataBufferReplace(&pWriter->term, pTerm, nTerm);


}

/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
................................................................................
** %_segments.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  leafWriterEncodeTerm(pWriter, pTerm, nTerm);




  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
................................................................................
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;


    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
................................................................................
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;



    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;








|
>
>
>







 







>
>
>
>
>







 







>
>
>







 







|
>
>
>
>
|
|

|
>
>
>
>
>
>
>
>







 







<
<
<
<
<
<
<
<
<
<





>
>







 







|


|
>
>
>







 







>
>







 







>
>







148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
....
3959
3960
3961
3962
3963
3964
3965
3966
3967
3968
3969
3970
3971
3972
3973
3974
3975
3976
3977
....
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
4092
....
4173
4174
4175
4176
4177
4178
4179
4180
4181
4182
4183
4184
4185
4186
4187
4188
4189
4190
4191
4192
4193
4194
4195
4196
4197
4198
4199
4200
4201
4202
4203
....
4206
4207
4208
4209
4210
4211
4212










4213
4214
4215
4216
4217
4218
4219
4220
4221
4222
4223
4224
4225
4226
....
4249
4250
4251
4252
4253
4254
4255
4256
4257
4258
4259
4260
4261
4262
4263
4264
4265
4266
4267
4268
4269
....
4297
4298
4299
4300
4301
4302
4303
4304
4305
4306
4307
4308
4309
4310
4311
4312
....
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
4351
4352
4353
**
** An interior node encodes n terms separating n+1 subtrees.  The
** subtree blocks are contiguous, so only the first subtree's blockid
** is encoded.  The subtree at iBlockid will contain all terms less
** than the first term encoded (or all terms if no term is encoded).
** Otherwise, for terms greater than or equal to pTerm[i] but less
** than pTerm[i+1], the subtree for that term will be rooted at
** iBlockid+i.  Interior nodes only store enough term data to
** distinguish adjacent children (if the rightmost term of the left
** child is "something", and the leftmost term of the right child is
** "wicked", only "w" is stored).
**
** New data is spilled to a new interior node at the same height when
** the current node exceeds INTERIOR_MAX bytes (default 2048).
** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
** interior nodes and making the tree too skinny.  The interior nodes
** at a given height are naturally tracked by interior nodes at
** height+1, and so on.
................................................................................
  int iLevel;
  int idx;
  sqlite_int64 iStartBlockid;     /* needed to create the root info */
  sqlite_int64 iEndBlockid;       /* when we're done writing. */

  DataBuffer term;                /* previous encoded term */
  DataBuffer data;                /* encoding buffer */

  /* bytes of first term in the current node which distinguishes that
  ** term from the last term of the previous node.
  */
  int nTermDistinct;

  InteriorWriter parentWriter;    /* if we overflow */
  int has_parent;
} LeafWriter;

static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
  CLEAR(pWriter);
................................................................................

  /* Reconstruct the first term in the leaf for purposes of building
  ** the interior node.
  */
  n = getVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
  pStartingTerm = pWriter->data.pData+iData+1+n;
  assert( pWriter->data.nData>iData+1+n+nStartingTerm );
  assert( pWriter->nTermDistinct>0 );
  assert( pWriter->nTermDistinct<=nStartingTerm );
  nStartingTerm = pWriter->nTermDistinct;

  if( pWriter->has_parent ){
    interiorWriterAppend(&pWriter->parentWriter,
                         pStartingTerm, nStartingTerm, iBlockid);
  }else{
    interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
                       &pWriter->parentWriter);
................................................................................

static void leafWriterDestroy(LeafWriter *pWriter){
  if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
  dataBufferDestroy(&pWriter->term);
  dataBufferDestroy(&pWriter->data);
}

/* Encode a term into the leafWriter, delta-encoding as appropriate.
** Returns the length of the new term which distinguishes it from the
** previous term, which can be used to set nTermDistinct when a node
** boundary is crossed.
*/
static int leafWriterEncodeTerm(LeafWriter *pWriter,
                                const char *pTerm, int nTerm){
  char c[VARINT_MAX+VARINT_MAX];
  int n, nPrefix = 0;

  assert( nTerm>0 );
  while( nPrefix<pWriter->term.nData &&
         pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
    nPrefix++;
    /* Failing this implies that the terms weren't in order. */
    assert( nPrefix<nTerm );
  }

  if( pWriter->data.nData==0 ){
    /* Encode the node header and leading term as:
    **  varint(0)
    **  varint(nTerm)
    **  char pTerm[nTerm]
    */
................................................................................
    dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
  }else{
    /* Delta-encode the term as:
    **  varint(nPrefix)
    **  varint(nSuffix)
    **  char pTermSuffix[nSuffix]
    */










    n = putVarint(c, nPrefix);
    n += putVarint(c+n, nTerm-nPrefix);
    dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
  }
  dataBufferReplace(&pWriter->term, pTerm, nTerm);

  return nPrefix+1;
}

/* Used to avoid a memmove when a large amount of doclist data is in
** the buffer.  This constructs a node and term header before
** iDoclistData and flushes the resulting complete node using
** leafWriterInternalFlush().
*/
................................................................................
** %_segments.
*/
static int leafWriterStepMerge(fulltext_vtab *v, LeafWriter *pWriter,
                               const char *pTerm, int nTerm,
                               DLReader *pReaders, int nReaders){
  char c[VARINT_MAX+VARINT_MAX];
  int iTermData = pWriter->data.nData, iDoclistData;
  int i, nData, n, nActualData, nActual, rc, nTermDistinct;

  ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
  nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);

  /* Remember nTermDistinct if opening a new node. */
  if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;

  iDoclistData = pWriter->data.nData;

  /* Estimate the length of the merged doclist so we can leave space
  ** to encode it.
  */
  for(i=0, nData=0; i<nReaders; i++){
................................................................................
  ** pWriter->data.nData-iTermData>STANDALONE_MIN.
  */
  if( nTerm+nActualData>STANDALONE_MIN ){
    /* Push leaf node from before this term. */
    if( iTermData>0 ){
      rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
      if( rc!=SQLITE_OK ) return rc;

      pWriter->nTermDistinct = nTermDistinct;
    }

    /* Fix the encoded doclist length. */
    iDoclistData += n - nActual;
    memcpy(pWriter->data.pData+iDoclistData, c, nActual);

    /* Push the standalone leaf node. */
................................................................................
  ** doclist lengths.  At some point, change to
  ** pWriter->data.nData>LEAF_MAX.
  */
  if( iTermData+nTerm+nActualData>LEAF_MAX ){
    /* Flush out the leading data as a node */
    rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
    if( rc!=SQLITE_OK ) return rc;

    pWriter->nTermDistinct = nTermDistinct;

    /* Rebuild header using the current term */
    n = putVarint(pWriter->data.pData, 0);
    n += putVarint(pWriter->data.pData+n, nTerm);
    memcpy(pWriter->data.pData+n, pTerm, nTerm);
    n += nTerm;