SQLite

Check-in [c16900dc76]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implement optimize() function. Backports check-in (5417) from fts3. (CVS 5458)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: c16900dc7603cab30f8729b25361bc88bb37ae43
User & Date: shess 2008-07-22 23:49:44.000
Context
2008-07-22
23:54
Be a bit more susicious of invalid results from the tokenizer. Backports check-in (4514) from fts3. (CVS 5459) (check-in: 311aeb9c2b user: shess tags: trunk)
23:49
Implement optimize() function. Backports check-in (5417) from fts3. (CVS 5458) (check-in: c16900dc76 user: shess tags: trunk)
23:41
Delete all fts2 index data the table becomes empty. Backports check-in (5413) from fts3. (CVS 5457) (check-in: 4c98179be2 user: shess tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts2/fts2.c.
1785
1786
1787
1788
1789
1790
1791

1792
1793
1794
1795
1796
1797
1798
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_LEVEL_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_SEGMENT_STMT,
  SEGDIR_SELECT_ALL_STMT,
  SEGDIR_DELETE_ALL_STMT,


  MAX_STMT                     /* Always at end! */
} fulltext_statement;

/* These must exactly match the enum above. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g.  if a query joins a virtual table to itself?







>







1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_LEVEL_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_SEGMENT_STMT,
  SEGDIR_SELECT_ALL_STMT,
  SEGDIR_DELETE_ALL_STMT,
  SEGDIR_COUNT_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;

/* These must exactly match the enum above. */
/* TODO(shess): Is there some risk that a statement will be used in two
** cursors at once, e.g.  if a query joins a virtual table to itself?
1826
1827
1828
1829
1830
1831
1832

1833
1834
1835
1836
1837
1838
1839
  /* SEGDIR_SELECT_SEGMENT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? and idx = ?",
  /* SEGDIR_SELECT_ALL */
  "select start_block, leaves_end_block, root from %_segdir "
  " order by level desc, idx asc",
  /* SEGDIR_DELETE_ALL */ "delete from %_segdir",

};

/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their







>







1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
  /* SEGDIR_SELECT_SEGMENT */
  "select start_block, leaves_end_block, root from %_segdir "
  " where level = ? and idx = ?",
  /* SEGDIR_SELECT_ALL */
  "select start_block, leaves_end_block, root from %_segdir "
  " order by level desc, idx asc",
  /* SEGDIR_DELETE_ALL */ "delete from %_segdir",
  /* SEGDIR_COUNT */ "select count(*), ifnull(max(level),0) from %_segdir",
};

/*
** A connection to a fulltext index is an instance of the following
** structure.  The xCreate and xConnect methods create an instance
** of this structure and xDestroy and xDisconnect free that instance.
** All other methods receive a pointer to the structure as one of their
1976
1977
1978
1979
1980
1981
1982
1983

1984
1985
1986
1987
1988
1989
1990


1991
1992
1993
1994
1995
1996
1997
1998
*/
static int sql_single_step(sqlite3_stmt *s){
  int rc = sqlite3_step(s);
  return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
}

/* Like sql_get_statement(), but for special replicated LEAF_SELECT
** statements.

*/
/* TODO(shess) Write version for generic statements and then share
** that between the cached-statement functions.
*/
static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
                                  sqlite3_stmt **ppStmt){
  assert( idx>=0 && idx<MERGE_COUNT );


  if( v->pLeafSelectStmts[idx]==NULL ){
    int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
                         LEAF_SELECT);
    if( rc!=SQLITE_OK ) return rc;
  }else{
    int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
    if( rc!=SQLITE_OK ) return rc;
  }







|
>






|
>
>
|







1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
*/
static int sql_single_step(sqlite3_stmt *s){
  int rc = sqlite3_step(s);
  return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
}

/* Like sql_get_statement(), but for special replicated LEAF_SELECT
** statements.  idx -1 is a special case for an uncached version of
** the statement (used in the optimize implementation).
*/
/* TODO(shess) Write version for generic statements and then share
** that between the cached-statement functions.
*/
static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
                                  sqlite3_stmt **ppStmt){
  assert( idx>=-1 && idx<MERGE_COUNT );
  if( idx==-1 ){
    return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
  }else if( v->pLeafSelectStmts[idx]==NULL ){
    int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
                         LEAF_SELECT);
    if( rc!=SQLITE_OK ) return rc;
  }else{
    int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
    if( rc!=SQLITE_OK ) return rc;
  }
2310
2311
2312
2313
2314
2315
2316































2317
2318
2319
2320
2321
2322
2323
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}
































/* TODO(shess) clearPendingTerms() is far down the file because
** writeZeroSegment() is far down the file because LeafWriter is far
** down the file.  Consider refactoring the code to move the non-vtab
** code above the vtab code so that we don't need this forward
** reference.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
  if( rc!=SQLITE_OK ) return rc;

  rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  return sql_single_step(s);
}

/* Returns SQLITE_OK with *pnSegments set to the number of entries in
** %_segdir and *piMaxLevel set to the highest level which has a
** segment.  Otherwise returns the SQLite error which caused failure.
*/
static int segdir_count(fulltext_vtab *v, int *pnSegments, int *piMaxLevel){
  sqlite3_stmt *s;
  int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
  if( rc!=SQLITE_OK ) return rc;

  rc = sqlite3_step(s);
  /* TODO(shess): This case should not be possible?  Should stronger
  ** measures be taken if it happens?
  */
  if( rc==SQLITE_DONE ){
    *pnSegments = 0;
    *piMaxLevel = 0;
    return SQLITE_OK;
  }
  if( rc!=SQLITE_ROW ) return rc;

  *pnSegments = sqlite3_column_int(s, 0);
  *piMaxLevel = sqlite3_column_int(s, 1);

  /* We expect only one row.  We must execute another sqlite3_step()
   * to complete the iteration; otherwise the table will remain locked. */
  rc = sqlite3_step(s);
  if( rc==SQLITE_DONE ) return SQLITE_OK;
  if( rc==SQLITE_ROW ) return SQLITE_ERROR;
  return rc;
}

/* TODO(shess) clearPendingTerms() is far down the file because
** writeZeroSegment() is far down the file because LeafWriter is far
** down the file.  Consider refactoring the code to move the non-vtab
** code above the vtab code so that we don't need this forward
** reference.
*/
5000
5001
5002
5003
5004
5005
5006






5007
5008
5009
5010
5011
5012
5013
** this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}

static void leavesReaderDestroy(LeavesReader *pReader){






  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}

/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the







>
>
>
>
>
>







5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
** this case.  Probably a brittle assumption.
*/
static int leavesReaderReset(LeavesReader *pReader){
  return sqlite3_reset(pReader->pStmt);
}

static void leavesReaderDestroy(LeavesReader *pReader){
  /* If idx is -1, that means we're using a non-cached statement
  ** handle in the optimize() case, so we need to release it.
  */
  if( pReader->pStmt!=NULL && pReader->idx==-1 ){
    sqlite3_finalize(pReader->pStmt);
  }
  leafReaderDestroy(&pReader->leafReader);
  dataBufferDestroy(&pReader->rootData);
  SCRAMBLE(pReader);
}

/* Initialize pReader with the given root data (if iStartBlockid==0
** the leaf data was entirely contained in the root), or from the
5944
5945
5946
5947
5948
5949
5950























































































































































































































































































5951
5952
5953
5954
5955
5956
5957
    snippetAllOffsets(pCursor);
    snippetOffsetText(&pCursor->snippet);
    sqlite3_result_text(pContext,
                        pCursor->snippet.zOffset, pCursor->snippet.nOffset,
                        SQLITE_STATIC);
  }
}
























































































































































































































































































#ifdef SQLITE_TEST
/* Generate an error of the form "<prefix>: <msg>".  If msg is NULL,
** pull the error from the context's db handle.
*/
static void generateError(sqlite3_context *pContext,
                          const char *prefix, const char *msg){







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







5986
5987
5988
5989
5990
5991
5992
5993
5994
5995
5996
5997
5998
5999
6000
6001
6002
6003
6004
6005
6006
6007
6008
6009
6010
6011
6012
6013
6014
6015
6016
6017
6018
6019
6020
6021
6022
6023
6024
6025
6026
6027
6028
6029
6030
6031
6032
6033
6034
6035
6036
6037
6038
6039
6040
6041
6042
6043
6044
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
6094
6095
6096
6097
6098
6099
6100
6101
6102
6103
6104
6105
6106
6107
6108
6109
6110
6111
6112
6113
6114
6115
6116
6117
6118
6119
6120
6121
6122
6123
6124
6125
6126
6127
6128
6129
6130
6131
6132
6133
6134
6135
6136
6137
6138
6139
6140
6141
6142
6143
6144
6145
6146
6147
6148
6149
6150
6151
6152
6153
6154
6155
6156
6157
6158
6159
6160
6161
6162
6163
6164
6165
6166
6167
6168
6169
6170
6171
6172
6173
6174
6175
6176
6177
6178
6179
6180
6181
6182
6183
6184
6185
6186
6187
6188
6189
6190
6191
6192
6193
6194
6195
6196
6197
6198
6199
6200
6201
6202
6203
6204
6205
6206
6207
6208
6209
6210
6211
6212
6213
6214
6215
6216
6217
6218
6219
6220
6221
6222
6223
6224
6225
6226
6227
6228
6229
6230
6231
6232
6233
6234
6235
6236
6237
6238
6239
6240
6241
6242
6243
6244
6245
6246
6247
6248
6249
6250
6251
6252
6253
6254
6255
6256
6257
6258
6259
6260
6261
6262
6263
6264
6265
6266
6267
6268
6269
6270
6271
6272
6273
6274
6275
6276
6277
6278
    snippetAllOffsets(pCursor);
    snippetOffsetText(&pCursor->snippet);
    sqlite3_result_text(pContext,
                        pCursor->snippet.zOffset, pCursor->snippet.nOffset,
                        SQLITE_STATIC);
  }
}

/* OptLeavesReader is nearly identical to LeavesReader, except that
** where LeavesReader is geared towards the merging of complete
** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
** is geared towards implementation of the optimize() function, and
** can merge all segments simultaneously.  This version may be
** somewhat less efficient than LeavesReader because it merges into an
** accumulator rather than doing an N-way merge, but since segment
** size grows exponentially (so segment count logrithmically) this is
** probably not an immediate problem.
*/
/* TODO(shess): Prove that assertion, or extend the merge code to
** merge tree fashion (like the prefix-searching code does).
*/
/* TODO(shess): OptLeavesReader and LeavesReader could probably be
** merged with little or no loss of performance for LeavesReader.  The
** merged code would need to handle >MERGE_COUNT segments, and would
** also need to be able to optionally optimize away deletes.
*/
typedef struct OptLeavesReader {
  /* Segment number, to order readers by age. */
  int segment;
  LeavesReader reader;
} OptLeavesReader;

static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
  return leavesReaderAtEnd(&pReader->reader);
}
static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
  return leavesReaderTermBytes(&pReader->reader);
}
static const char *optLeavesReaderData(OptLeavesReader *pReader){
  return leavesReaderData(&pReader->reader);
}
static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
  return leavesReaderDataBytes(&pReader->reader);
}
static const char *optLeavesReaderTerm(OptLeavesReader *pReader){
  return leavesReaderTerm(&pReader->reader);
}
static int optLeavesReaderStep(fulltext_vtab *v, OptLeavesReader *pReader){
  return leavesReaderStep(v, &pReader->reader);
}
static int optLeavesReaderTermCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
}
/* Order by term ascending, segment ascending (oldest to newest), with
** exhausted readers to the end.
*/
static int optLeavesReaderCmp(OptLeavesReader *lr1, OptLeavesReader *lr2){
  int c = optLeavesReaderTermCmp(lr1, lr2);
  if( c!=0 ) return c;
  return lr1->segment-lr2->segment;
}
/* Bubble pLr[0] to appropriate place in pLr[1..nLr-1].  Assumes that
** pLr[1..nLr-1] is already sorted.
*/
static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
  while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
    OptLeavesReader tmp = pLr[0];
    pLr[0] = pLr[1];
    pLr[1] = tmp;
    nLr--;
    pLr++;
  }
}

/* optimize() helper function.  Put the readers in order and iterate
** through them, merging doclists for matching terms into pWriter.
** Returns SQLITE_OK on success, or the SQLite error code which
** prevented success.
*/
static int optimizeInternal(fulltext_vtab *v,
                            OptLeavesReader *readers, int nReaders,
                            LeafWriter *pWriter){
  int i, rc = SQLITE_OK;
  DataBuffer doclist, merged, tmp;

  /* Order the readers. */
  i = nReaders;
  while( i-- > 0 ){
    optLeavesReaderReorder(&readers[i], nReaders-i);
  }

  dataBufferInit(&doclist, LEAF_MAX);
  dataBufferInit(&merged, LEAF_MAX);

  /* Exhausted readers bubble to the end, so when the first reader is
  ** at eof, all are at eof.
  */
  while( !optLeavesReaderAtEnd(&readers[0]) ){

    /* Figure out how many readers share the next term. */
    for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
      if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
    }

    /* Special-case for no merge. */
    if( i==1 ){
      /* Trim deletions from the doclist. */
      dataBufferReset(&merged);
      docListTrim(DL_DEFAULT,
                  optLeavesReaderData(&readers[0]),
                  optLeavesReaderDataBytes(&readers[0]),
                  -1, DL_DEFAULT, &merged);
    }else{
      DLReader dlReaders[MERGE_COUNT];
      int iReader, nReaders;

      /* Prime the pipeline with the first reader's doclist.  After
      ** one pass index 0 will reference the accumulated doclist.
      */
      dlrInit(&dlReaders[0], DL_DEFAULT,
              optLeavesReaderData(&readers[0]),
              optLeavesReaderDataBytes(&readers[0]));
      iReader = 1;

      assert( iReader<i );  /* Must execute the loop at least once. */
      while( iReader<i ){
        /* Merge 16 inputs per pass. */
        for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
             iReader++, nReaders++ ){
          dlrInit(&dlReaders[nReaders], DL_DEFAULT,
                  optLeavesReaderData(&readers[iReader]),
                  optLeavesReaderDataBytes(&readers[iReader]));
        }

        /* Merge doclists and swap result into accumulator. */
        dataBufferReset(&merged);
        docListMerge(&merged, dlReaders, nReaders);
        tmp = merged;
        merged = doclist;
        doclist = tmp;

        while( nReaders-- > 0 ){
          dlrDestroy(&dlReaders[nReaders]);
        }

        /* Accumulated doclist to reader 0 for next pass. */
        dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
      }

      /* Destroy reader that was left in the pipeline. */
      dlrDestroy(&dlReaders[0]);

      /* Trim deletions from the doclist. */
      dataBufferReset(&merged);
      docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
                  -1, DL_DEFAULT, &merged);
    }

    /* Only pass doclists with hits (skip if all hits deleted). */
    if( merged.nData>0 ){
      rc = leafWriterStep(v, pWriter,
                          optLeavesReaderTerm(&readers[0]),
                          optLeavesReaderTermBytes(&readers[0]),
                          merged.pData, merged.nData);
      if( rc!=SQLITE_OK ) goto err;
    }

    /* Step merged readers to next term and reorder. */
    while( i-- > 0 ){
      rc = optLeavesReaderStep(v, &readers[i]);
      if( rc!=SQLITE_OK ) goto err;

      optLeavesReaderReorder(&readers[i], nReaders-i);
    }
  }

 err:
  dataBufferDestroy(&doclist);
  dataBufferDestroy(&merged);
  return rc;
}

/* Implement optimize() function for FTS3.  optimize(t) merges all
** segments in the fts index into a single segment.  't' is the magic
** table-named column.
*/
static void optimizeFunc(sqlite3_context *pContext,
                         int argc, sqlite3_value **argv){
  fulltext_cursor *pCursor;
  if( argc>1 ){
    sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
  }else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
            sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
  }else{
    fulltext_vtab *v;
    int i, rc, iMaxLevel;
    OptLeavesReader *readers;
    int nReaders;
    LeafWriter writer;
    sqlite3_stmt *s;

    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    v = cursor_vtab(pCursor);

    /* Flush any buffered updates before optimizing. */
    rc = flushPendingTerms(v);
    if( rc!=SQLITE_OK ) goto err;

    rc = segdir_count(v, &nReaders, &iMaxLevel);
    if( rc!=SQLITE_OK ) goto err;
    if( nReaders==0 || nReaders==1 ){
      sqlite3_result_text(pContext, "Index already optimal", -1,
                          SQLITE_STATIC);
      return;
    }

    rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
    if( rc!=SQLITE_OK ) goto err;

    readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
    if( readers==NULL ) goto err;

    /* Note that there will already be a segment at this position
    ** until we call segdir_delete() on iMaxLevel.
    */
    leafWriterInit(iMaxLevel, 0, &writer);

    i = 0;
    while( (rc = sqlite3_step(s))==SQLITE_ROW ){
      sqlite_int64 iStart = sqlite3_column_int64(s, 0);
      sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
      const char *pRootData = sqlite3_column_blob(s, 2);
      int nRootData = sqlite3_column_bytes(s, 2);

      assert( i<nReaders );
      rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
                            &readers[i].reader);
      if( rc!=SQLITE_OK ) break;

      readers[i].segment = i;
      i++;
    }

    /* If we managed to succesfully read them all, optimize them. */
    if( rc==SQLITE_DONE ){
      assert( i==nReaders );
      rc = optimizeInternal(v, readers, nReaders, &writer);
    }

    while( i-- > 0 ){
      leavesReaderDestroy(&readers[i].reader);
    }
    sqlite3_free(readers);

    /* If we've successfully gotten to here, delete the old segments
    ** and flush the interior structure of the new segment.
    */
    if( rc==SQLITE_OK ){
      for( i=0; i<=iMaxLevel; i++ ){
        rc = segdir_delete(v, i);
        if( rc!=SQLITE_OK ) break;
      }

      if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
    }

    leafWriterDestroy(&writer);

    if( rc!=SQLITE_OK ) goto err;

    sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
    return;

    /* TODO(shess): Error-handling needs to be improved along the
    ** lines of the dump_ functions.
    */
 err:
    {
      char buf[512];
      sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
                       sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
      sqlite3_result_error(pContext, buf, -1);
    }
  }
}

#ifdef SQLITE_TEST
/* Generate an error of the form "<prefix>: <msg>".  If msg is NULL,
** pull the error from the context's db handle.
*/
static void generateError(sqlite3_context *pContext,
                          const char *prefix, const char *msg){
6342
6343
6344
6345
6346
6347
6348



6349
6350
6351
6352
6353
6354
6355
){
  if( strcmp(zName,"snippet")==0 ){
    *pxFunc = snippetFunc;
    return 1;
  }else if( strcmp(zName,"offsets")==0 ){
    *pxFunc = snippetOffsetsFunc;
    return 1;



#ifdef SQLITE_TEST
    /* NOTE(shess): These functions are present only for testing
    ** purposes.  No particular effort is made to optimize their
    ** execution or how they build their results.
    */
  }else if( strcmp(zName,"dump_terms")==0 ){
    /* fprintf(stderr, "Found dump_terms\n"); */







>
>
>







6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
){
  if( strcmp(zName,"snippet")==0 ){
    *pxFunc = snippetFunc;
    return 1;
  }else if( strcmp(zName,"offsets")==0 ){
    *pxFunc = snippetOffsetsFunc;
    return 1;
  }else if( strcmp(zName,"optimize")==0 ){
    *pxFunc = optimizeFunc;
    return 1;
#ifdef SQLITE_TEST
    /* NOTE(shess): These functions are present only for testing
    ** purposes.  No particular effort is made to optimize their
    ** execution or how they build their results.
    */
  }else if( strcmp(zName,"dump_terms")==0 ){
    /* fprintf(stderr, "Found dump_terms\n"); */
6475
6476
6477
6478
6479
6480
6481

6482
6483
6484
6485
6486
6487
6488
  ** the two scalar functions. If this is successful, register the
  ** module with sqlite.
  */
  if( SQLITE_OK==rc 
   && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))

#ifdef SQLITE_TEST
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
#endif
  ){
    return sqlite3_create_module_v2(
        db, "fts2", &fts2Module, (void *)pHash, hashDestroy







>







6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
  ** the two scalar functions. If this is successful, register the
  ** module with sqlite.
  */
  if( SQLITE_OK==rc 
   && SQLITE_OK==(rc = sqlite3Fts2InitHashTable(db, pHash, "fts2_tokenizer"))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
#ifdef SQLITE_TEST
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
   && SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
#endif
  ){
    return sqlite3_create_module_v2(
        db, "fts2", &fts2Module, (void *)pHash, hashDestroy
Changes to test/fts2q.test.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 2008 June 26
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The focus
# of this script is testing the FTS2 module's optimize() function.
#
# $Id: fts2q.test,v 1.1 2008/07/22 23:41:26 shess Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
ifcapable !fts2 {













|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 2008 June 26
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The focus
# of this script is testing the FTS2 module's optimize() function.
#
# $Id: fts2q.test,v 1.2 2008/07/22 23:49:44 shess Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is not defined, omit this file.
ifcapable !fts2 {
119
120
121
122
123
124
125

















126




127





































































































































































































128

check_terms   fts2q-1.2   0 0 {a is test this}
check_doclist fts2q-1.2.1 0 0 a {[1 0[2]]}
check_doclist fts2q-1.2.2 0 0 is {[1 0[1]]}
check_doclist fts2q-1.2.3 0 0 test {[1 0[3]]}
check_doclist fts2q-1.2.4 0 0 this {[1 0[0]]}


















# TODO(shess): optimize() tests here.










































































































































































































finish_test







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346

check_terms   fts2q-1.2   0 0 {a is test this}
check_doclist fts2q-1.2.1 0 0 a {[1 0[2]]}
check_doclist fts2q-1.2.2 0 0 is {[1 0[1]]}
check_doclist fts2q-1.2.3 0 0 test {[1 0[3]]}
check_doclist fts2q-1.2.4 0 0 this {[1 0[0]]}

#*************************************************************************
# Test results when everything is optimized manually.
# NOTE(shess): This is a copy of fts2c-1.3.  I've pulled a copy here
# because fts2q-2 and fts2q-3 should have identical results.
db eval {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts2(c);
  INSERT INTO t1 (rowid, c) VALUES (1, 'This is a test');
  INSERT INTO t1 (rowid, c) VALUES (2, 'That was a test');
  INSERT INTO t1 (rowid, c) VALUES (3, 'This is a test');
  DELETE FROM t1 WHERE rowid IN (1,3);
  DROP TABLE IF EXISTS t1old;
  ALTER TABLE t1 RENAME TO t1old;
  CREATE VIRTUAL TABLE t1 USING fts2(c);
  INSERT INTO t1 (rowid, c) SELECT rowid, c FROM t1old;
  DROP TABLE t1old;
}

# Should be a single optimal segment with the same logical results.
do_test fts2q-2.segments {
  execsql {
    SELECT level, idx FROM t1_segdir ORDER BY level, idx;
  }
} {0 0}
do_test fts2q-2.matches {
  execsql {
    SELECT OFFSETS(t1) FROM t1
     WHERE t1 MATCH 'this OR that OR was OR a OR is OR test' ORDER BY rowid;
  }
} {{0 1 0 4 0 2 5 3 0 3 9 1 0 5 11 4}}

check_terms_all fts2q-2.1 {a test that was}
check_doclist_all fts2q-2.1.1 a {[2 0[2]]}
check_doclist_all fts2q-2.1.2 test {[2 0[3]]}
check_doclist_all fts2q-2.1.3 that {[2 0[0]]}
check_doclist_all fts2q-2.1.4 was {[2 0[1]]}

check_terms fts2q-2.2 0 0 {a test that was}
check_doclist fts2q-2.2.1 0 0 a {[2 0[2]]}
check_doclist fts2q-2.2.2 0 0 test {[2 0[3]]}
check_doclist fts2q-2.2.3 0 0 that {[2 0[0]]}
check_doclist fts2q-2.2.4 0 0 was {[2 0[1]]}

#*************************************************************************
# Test results when everything is optimized via optimize().
db eval {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts2(c);
  INSERT INTO t1 (rowid, c) VALUES (1, 'This is a test');
  INSERT INTO t1 (rowid, c) VALUES (2, 'That was a test');
  INSERT INTO t1 (rowid, c) VALUES (3, 'This is a test');
  DELETE FROM t1 WHERE rowid IN (1,3);
  SELECT OPTIMIZE(t1) FROM t1 LIMIT 1;
}

# Should be a single optimal segment with the same logical results.
do_test fts2q-3.segments {
  execsql {
    SELECT level, idx FROM t1_segdir ORDER BY level, idx;
  }
} {0 0}
do_test fts2q-3.matches {
  execsql {
    SELECT OFFSETS(t1) FROM t1
     WHERE t1 MATCH 'this OR that OR was OR a OR is OR test' ORDER BY rowid;
  }
} {{0 1 0 4 0 2 5 3 0 3 9 1 0 5 11 4}}

check_terms_all fts2q-3.1 {a test that was}
check_doclist_all fts2q-3.1.1 a {[2 0[2]]}
check_doclist_all fts2q-3.1.2 test {[2 0[3]]}
check_doclist_all fts2q-3.1.3 that {[2 0[0]]}
check_doclist_all fts2q-3.1.4 was {[2 0[1]]}

check_terms fts2q-3.2 0 0 {a test that was}
check_doclist fts2q-3.2.1 0 0 a {[2 0[2]]}
check_doclist fts2q-3.2.2 0 0 test {[2 0[3]]}
check_doclist fts2q-3.2.3 0 0 that {[2 0[0]]}
check_doclist fts2q-3.2.4 0 0 was {[2 0[1]]}

#*************************************************************************
# Test optimize() against a table involving segment merges.
# NOTE(shess): Since there's no transaction, each of the INSERT/UPDATE
# statements generates a segment.
db eval {
  DROP TABLE IF EXISTS t1;
  CREATE VIRTUAL TABLE t1 USING fts2(c);

  INSERT INTO t1 (rowid, c) VALUES (1, 'This is a test');
  INSERT INTO t1 (rowid, c) VALUES (2, 'That was a test');
  INSERT INTO t1 (rowid, c) VALUES (3, 'This is a test');

  UPDATE t1 SET c = 'This is a test one' WHERE rowid = 1;
  UPDATE t1 SET c = 'That was a test one' WHERE rowid = 2;
  UPDATE t1 SET c = 'This is a test one' WHERE rowid = 3;

  UPDATE t1 SET c = 'This is a test two' WHERE rowid = 1;
  UPDATE t1 SET c = 'That was a test two' WHERE rowid = 2;
  UPDATE t1 SET c = 'This is a test two' WHERE rowid = 3;

  UPDATE t1 SET c = 'This is a test three' WHERE rowid = 1;
  UPDATE t1 SET c = 'That was a test three' WHERE rowid = 2;
  UPDATE t1 SET c = 'This is a test three' WHERE rowid = 3;

  UPDATE t1 SET c = 'This is a test four' WHERE rowid = 1;
  UPDATE t1 SET c = 'That was a test four' WHERE rowid = 2;
  UPDATE t1 SET c = 'This is a test four' WHERE rowid = 3;

  UPDATE t1 SET c = 'This is a test' WHERE rowid = 1;
  UPDATE t1 SET c = 'That was a test' WHERE rowid = 2;
  UPDATE t1 SET c = 'This is a test' WHERE rowid = 3;
}

# 2 segments in level 0, 1 in level 1 (18 segments created, 16
# merged).
do_test fts2q-4.segments {
  execsql {
    SELECT level, idx FROM t1_segdir ORDER BY level, idx;
  }
} {0 0 0 1 1 0}

do_test fts2q-4.matches {
  execsql {
    SELECT OFFSETS(t1) FROM t1
     WHERE t1 MATCH 'this OR that OR was OR a OR is OR test' ORDER BY rowid;
  }
} [list {0 0 0 4 0 4 5 2 0 3 8 1 0 5 10 4} \
        {0 1 0 4 0 2 5 3 0 3 9 1 0 5 11 4} \
        {0 0 0 4 0 4 5 2 0 3 8 1 0 5 10 4}]

check_terms_all fts2q-4.1      {a four is one test that this three two was}
check_doclist_all fts2q-4.1.1  a {[1 0[2]] [2 0[2]] [3 0[2]]}
check_doclist_all fts2q-4.1.2  four {}
check_doclist_all fts2q-4.1.3  is {[1 0[1]] [3 0[1]]}
check_doclist_all fts2q-4.1.4  one {}
check_doclist_all fts2q-4.1.5  test {[1 0[3]] [2 0[3]] [3 0[3]]}
check_doclist_all fts2q-4.1.6  that {[2 0[0]]}
check_doclist_all fts2q-4.1.7  this {[1 0[0]] [3 0[0]]}
check_doclist_all fts2q-4.1.8  three {}
check_doclist_all fts2q-4.1.9  two {}
check_doclist_all fts2q-4.1.10 was {[2 0[1]]}

check_terms fts2q-4.2     0 0 {a four test that was}
check_doclist fts2q-4.2.1 0 0 a {[2 0[2]]}
check_doclist fts2q-4.2.2 0 0 four {[2]}
check_doclist fts2q-4.2.3 0 0 test {[2 0[3]]}
check_doclist fts2q-4.2.4 0 0 that {[2 0[0]]}
check_doclist fts2q-4.2.5 0 0 was {[2 0[1]]}

check_terms fts2q-4.3     0 1 {a four is test this}
check_doclist fts2q-4.3.1 0 1 a {[3 0[2]]}
check_doclist fts2q-4.3.2 0 1 four {[3]}
check_doclist fts2q-4.3.3 0 1 is {[3 0[1]]}
check_doclist fts2q-4.3.4 0 1 test {[3 0[3]]}
check_doclist fts2q-4.3.5 0 1 this {[3 0[0]]}

check_terms fts2q-4.4      1 0 {a four is one test that this three two was}
check_doclist fts2q-4.4.1  1 0 a {[1 0[2]] [2 0[2]] [3 0[2]]}
check_doclist fts2q-4.4.2  1 0 four {[1] [2 0[4]] [3 0[4]]}
check_doclist fts2q-4.4.3  1 0 is {[1 0[1]] [3 0[1]]}
check_doclist fts2q-4.4.4  1 0 one {[1] [2] [3]}
check_doclist fts2q-4.4.5  1 0 test {[1 0[3]] [2 0[3]] [3 0[3]]}
check_doclist fts2q-4.4.6  1 0 that {[2 0[0]]}
check_doclist fts2q-4.4.7  1 0 this {[1 0[0]] [3 0[0]]}
check_doclist fts2q-4.4.8  1 0 three {[1] [2] [3]}
check_doclist fts2q-4.4.9  1 0 two {[1] [2] [3]}
check_doclist fts2q-4.4.10 1 0 was {[2 0[1]]}

# Optimize should leave the result in the level of the highest-level
# prior segment.
do_test fts2q-4.5 {
  execsql {
    SELECT OPTIMIZE(t1) FROM t1 LIMIT 1;
    SELECT level, idx FROM t1_segdir ORDER BY level, idx;
  }
} {{Index optimized} 1 0}

# Identical to fts2q-4.matches.
do_test fts2q-4.5.matches {
  execsql {
    SELECT OFFSETS(t1) FROM t1
     WHERE t1 MATCH 'this OR that OR was OR a OR is OR test' ORDER BY rowid;
  }
} [list {0 0 0 4 0 4 5 2 0 3 8 1 0 5 10 4} \
        {0 1 0 4 0 2 5 3 0 3 9 1 0 5 11 4} \
        {0 0 0 4 0 4 5 2 0 3 8 1 0 5 10 4}]

check_terms_all fts2q-4.5.1     {a is test that this was}
check_doclist_all fts2q-4.5.1.1 a {[1 0[2]] [2 0[2]] [3 0[2]]}
check_doclist_all fts2q-4.5.1.2 is {[1 0[1]] [3 0[1]]}
check_doclist_all fts2q-4.5.1.3 test {[1 0[3]] [2 0[3]] [3 0[3]]}
check_doclist_all fts2q-4.5.1.4 that {[2 0[0]]}
check_doclist_all fts2q-4.5.1.5 this {[1 0[0]] [3 0[0]]}
check_doclist_all fts2q-4.5.1.6 was {[2 0[1]]}

check_terms fts2q-4.5.2     1 0 {a is test that this was}
check_doclist fts2q-4.5.2.1 1 0 a {[1 0[2]] [2 0[2]] [3 0[2]]}
check_doclist fts2q-4.5.2.2 1 0 is {[1 0[1]] [3 0[1]]}
check_doclist fts2q-4.5.2.3 1 0 test {[1 0[3]] [2 0[3]] [3 0[3]]}
check_doclist fts2q-4.5.2.4 1 0 that {[2 0[0]]}
check_doclist fts2q-4.5.2.5 1 0 this {[1 0[0]] [3 0[0]]}
check_doclist fts2q-4.5.2.6 1 0 was {[2 0[1]]}

# Re-optimizing does nothing.
do_test fts2q-5.0 {
  execsql {
    SELECT OPTIMIZE(t1) FROM t1 LIMIT 1;
    SELECT level, idx FROM t1_segdir ORDER BY level, idx;
  }
} {{Index already optimal} 1 0}

# Even if we move things around, still does nothing.
do_test fts2q-5.1 {
  execsql {
    UPDATE t1_segdir SET level = 2 WHERE level = 1 AND idx = 0;
    SELECT OPTIMIZE(t1) FROM t1 LIMIT 1;
    SELECT level, idx FROM t1_segdir ORDER BY level, idx;
  }
} {{Index already optimal} 2 0}

finish_test