SQLite

Check-in [40b5bbf02a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add the "offsets=0" option to fts5, to create a smaller index without term offset information. A few things are currently broken on this branch.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5-offsets
Files: files | file ages | folders
SHA1: 40b5bbf02a824ca73b33aa4ae1c7d5f65b7cda10
User & Date: dan 2015-12-17 20:36:13.853
Context
2015-12-18
19:07
Fix a problem with prefix queries on fts5 offsets=0 tables. (check-in: ad0987d83c user: dan tags: fts5-offsets)
2015-12-17
20:36
Add the "offsets=0" option to fts5, to create a smaller index without term offset information. A few things are currently broken on this branch. (check-in: 40b5bbf02a user: dan tags: fts5-offsets)
14:18
Fix the spellfix1_scriptcode() function to ignore whitespace and punctuation, and to recognize hebrew and arabic scripts. (check-in: 7adfa4a579 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts5/fts5Int.h.
147
148
149
150
151
152
153

154
155
156
157
158
159
160
  u8 *abUnindexed;                /* True for unindexed columns */
  int nPrefix;                    /* Number of prefix indexes */
  int *aPrefix;                   /* Sizes in bytes of nPrefix prefix indexes */
  int eContent;                   /* An FTS5_CONTENT value */
  char *zContent;                 /* content table */ 
  char *zContentRowid;            /* "content_rowid=" option value */ 
  int bColumnsize;                /* "columnsize=" option value (dflt==1) */

  char *zContentExprlist;
  Fts5Tokenizer *pTok;
  fts5_tokenizer *pTokApi;

  /* Values loaded from the %_config table */
  int iCookie;                    /* Incremented when %_config is modified */
  int pgsz;                       /* Approximate page size used in %_data */







>







147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
  u8 *abUnindexed;                /* True for unindexed columns */
  int nPrefix;                    /* Number of prefix indexes */
  int *aPrefix;                   /* Sizes in bytes of nPrefix prefix indexes */
  int eContent;                   /* An FTS5_CONTENT value */
  char *zContent;                 /* content table */ 
  char *zContentRowid;            /* "content_rowid=" option value */ 
  int bColumnsize;                /* "columnsize=" option value (dflt==1) */
  int bOffsets;                   /* "offsets=" option value (dflt==1) */
  char *zContentExprlist;
  Fts5Tokenizer *pTok;
  fts5_tokenizer *pTokApi;

  /* Values loaded from the %_config table */
  int iCookie;                    /* Incremented when %_config is modified */
  int pgsz;                       /* Approximate page size used in %_data */
288
289
290
291
292
293
294







295
296
297
298
299
300
301
/* Malloc utility */
void *sqlite3Fts5MallocZero(int *pRc, int nByte);
char *sqlite3Fts5Strndup(int *pRc, const char *pIn, int nIn);

/* Character set tests (like isspace(), isalpha() etc.) */
int sqlite3Fts5IsBareword(char t);








/*
** End of interface to code in fts5_buffer.c.
**************************************************************************/

/**************************************************************************
** Interface to code in fts5_index.c. fts5_index.c contains code
** to access the data stored in the %_data table.







>
>
>
>
>
>
>







289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
/* Malloc utility */
void *sqlite3Fts5MallocZero(int *pRc, int nByte);
char *sqlite3Fts5Strndup(int *pRc, const char *pIn, int nIn);

/* Character set tests (like isspace(), isalpha() etc.) */
int sqlite3Fts5IsBareword(char t);


/* Bucket of terms object used by the integrity-check in offsets=0 mode.
**
** sqlite3Fts5TermsetNew():
**   Allocate a new, empty term-set and store a pointer to it in the
**   output parameter. Returns an SQLite error code.
**
** sqlite3Fts5TermsetAdd():
**   Arguments are the term-set, a term buffer, the size of the term in
**   bytes, and an output flag. The output flag is set to 1 if the term
**   was already in the set, or to 0 if it was not (in which case it is
**   added). Returns an SQLite error code.
**
** sqlite3Fts5TermsetFree():
**   Free a term-set and all of its entries. Passing NULL is a no-op.
*/
typedef struct Fts5Termset Fts5Termset;
int sqlite3Fts5TermsetNew(Fts5Termset**);
int sqlite3Fts5TermsetAdd(Fts5Termset*, const char*, int, int *pbPresent);
void sqlite3Fts5TermsetFree(Fts5Termset*);

/*
** End of interface to code in fts5_buffer.c.
**************************************************************************/

/**************************************************************************
** Interface to code in fts5_index.c. fts5_index.c contains code
** to access the data stored in the %_data table.
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
** Interface to code in fts5_hash.c. 
*/
typedef struct Fts5Hash Fts5Hash;

/*
** Create a hash table, free a hash table.
*/
int sqlite3Fts5HashNew(Fts5Hash**, int *pnSize);
void sqlite3Fts5HashFree(Fts5Hash*);

int sqlite3Fts5HashWrite(
  Fts5Hash*,
  i64 iRowid,                     /* Rowid for this entry */
  int iCol,                       /* Column token appears in (-ve -> delete) */
  int iPos,                       /* Position of token within column */







|







496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
** Interface to code in fts5_hash.c. 
*/
typedef struct Fts5Hash Fts5Hash;

/*
** Create a hash table, free a hash table.
*/
int sqlite3Fts5HashNew(Fts5Config*, Fts5Hash**, int *pnSize);
void sqlite3Fts5HashFree(Fts5Hash*);

int sqlite3Fts5HashWrite(
  Fts5Hash*,
  i64 iRowid,                     /* Rowid for this entry */
  int iCol,                       /* Column token appears in (-ve -> delete) */
  int iPos,                       /* Position of token within column */
Changes to ext/fts5/fts5_buffer.c.
286
287
288
289
290
291
292










































































293
294
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 0, 0, 0, 0, 1,   /* 0x50 .. 0x5F */
    0, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 .. 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 0, 0, 0, 0, 0    /* 0x70 .. 0x7F */
  };

  return (t & 0x80) || aBareword[(int)t];
}



















































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>


286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 0, 0, 0, 0, 1,   /* 0x50 .. 0x5F */
    0, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 .. 0x6F */
    1, 1, 1, 1, 1, 1, 1, 1,    1, 1, 1, 0, 0, 0, 0, 0    /* 0x70 .. 0x7F */
  };

  return (t & 0x80) || aBareword[(int)t];
}


/*************************************************************************
** Term-set object. A fixed-size hash table of terms, used to determine
** whether or not a term has been added before (see
** sqlite3Fts5TermsetAdd()). Each bucket is a singly-linked list of
** Fts5TermsetEntry objects.
*/
typedef struct Fts5TermsetEntry Fts5TermsetEntry;
struct Fts5TermsetEntry {
  char *pTerm;                    /* nTerm bytes of term text */
  int nTerm;                      /* Size of pTerm in bytes */
  Fts5TermsetEntry *pNext;        /* Next entry in the same hash bucket */
};

struct Fts5Termset {
  Fts5TermsetEntry *apHash[512];  /* Hash buckets (heads of entry lists) */
};

/*
** Allocate a new, zeroed Fts5Termset object and store a pointer to it
** in (*pp). The value returned is whatever sqlite3Fts5MallocZero() left
** in the error-code variable, which starts out as SQLITE_OK.
*/
int sqlite3Fts5TermsetNew(Fts5Termset **pp){
  int rcAlloc = SQLITE_OK;
  Fts5Termset *pNew;

  pNew = sqlite3Fts5MallocZero(&rcAlloc, sizeof(Fts5Termset));
  *pp = pNew;
  return rcAlloc;
}

/*
** Test whether the nTerm byte term pTerm is already part of term-set p,
** and add it if it is not.
**
** Before returning, (*pbPresent) is set to 1 if the term was already in
** the set when this function was called, or to 0 otherwise.
**
** The return value is SQLITE_OK unless the allocation of a new entry
** fails, in which case it is the error code left by
** sqlite3Fts5MallocZero() and the term is not added.
*/
int sqlite3Fts5TermsetAdd(
  Fts5Termset *p, 
  const char *pTerm, int nTerm, 
  int *pbPresent
){
  int rc = SQLITE_OK;
  int i;
  unsigned int hash = 13;         /* Unsigned: wraps instead of overflowing */
  Fts5TermsetEntry *pEntry;

  /* Calculate a hash value for this term. The arithmetic is done using
  ** an unsigned type so that it wraps around (well defined) rather than
  ** overflowing a signed integer (undefined behaviour). This also
  ** guarantees that the bucket index computed below is never negative.
  ** Bytes are read as unsigned char so that the result does not depend
  ** on the signedness of plain char. */
  for(i=0; i<nTerm; i++){
    hash += (hash << 3) + (unsigned char)pTerm[i];
  }
  hash = hash % (unsigned int)ArraySize(p->apHash);

  /* Search the bucket for an existing entry that matches the term. */
  *pbPresent = 0;
  for(pEntry=p->apHash[hash]; pEntry; pEntry=pEntry->pNext){
    if( pEntry->nTerm==nTerm && memcmp(pEntry->pTerm, pTerm, nTerm)==0 ){
      *pbPresent = 1;
      break;
    }
  }

  /* No match was found. Allocate a new entry, with the copy of the term
  ** text stored in the same allocation immediately after the structure,
  ** and link it in at the head of the bucket. */
  if( pEntry==0 ){
    pEntry = sqlite3Fts5MallocZero(&rc, sizeof(Fts5TermsetEntry) + nTerm);
    if( pEntry ){
      pEntry->pTerm = (char*)&pEntry[1];
      pEntry->nTerm = nTerm;
      memcpy(pEntry->pTerm, pTerm, nTerm);
      pEntry->pNext = p->apHash[hash];
      p->apHash[hash] = pEntry;
    }
  }

  return rc;
}

/*
** Free term-set p along with every entry linked into its hash buckets.
** If p is NULL, this function is a no-op.
*/
void sqlite3Fts5TermsetFree(Fts5Termset *p){
  int iSlot;
  if( p==0 ) return;

  for(iSlot=0; iSlot<ArraySize(p->apHash); iSlot++){
    Fts5TermsetEntry *pCur;
    Fts5TermsetEntry *pFollow;
    /* Read the next pointer before freeing the current entry */
    for(pCur=p->apHash[iSlot]; pCur; pCur=pFollow){
      pFollow = pCur->pNext;
      sqlite3_free(pCur);
    }
  }
  sqlite3_free(p);
}



Changes to ext/fts5/fts5_config.c.
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
**
******************************************************************************
**
** This is an SQLite module implementing full-text search.
*/



#include "fts5Int.h"

#define FTS5_DEFAULT_PAGE_SIZE   4050
#define FTS5_DEFAULT_AUTOMERGE      4
#define FTS5_DEFAULT_CRISISMERGE   16
#define FTS5_DEFAULT_HASHSIZE    (1024*1024)








<







10
11
12
13
14
15
16

17
18
19
20
21
22
23
**
******************************************************************************
**
** This is an SQLite module implementing full-text search.
*/



#include "fts5Int.h"

#define FTS5_DEFAULT_PAGE_SIZE   4050
#define FTS5_DEFAULT_AUTOMERGE      4
#define FTS5_DEFAULT_CRISISMERGE   16
#define FTS5_DEFAULT_HASHSIZE    (1024*1024)

340
341
342
343
344
345
346










347
348
349
350
351
352
353
      *pzErr = sqlite3_mprintf("malformed columnsize=... directive");
      rc = SQLITE_ERROR;
    }else{
      pConfig->bColumnsize = (zArg[0]=='1');
    }
    return rc;
  }











  *pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
  return SQLITE_ERROR;
}

/*
** Allocate an instance of the default tokenizer ("simple") at 







>
>
>
>
>
>
>
>
>
>







339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
      *pzErr = sqlite3_mprintf("malformed columnsize=... directive");
      rc = SQLITE_ERROR;
    }else{
      pConfig->bColumnsize = (zArg[0]=='1');
    }
    return rc;
  }

  if( sqlite3_strnicmp("offsets", zCmd, nCmd)==0 ){
    if( (zArg[0]!='0' && zArg[0]!='1') || zArg[1]!='\0' ){
      *pzErr = sqlite3_mprintf("malformed offsets=... directive");
      rc = SQLITE_ERROR;
    }else{
      pConfig->bOffsets = (zArg[0]=='1');
    }
    return rc;
  }

  *pzErr = sqlite3_mprintf("unrecognized option: \"%.*s\"", nCmd, zCmd);
  return SQLITE_ERROR;
}

/*
** Allocate an instance of the default tokenizer ("simple") at 
496
497
498
499
500
501
502

503
504
505
506
507
508
509

  nByte = nArg * (sizeof(char*) + sizeof(u8));
  pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte);
  pRet->abUnindexed = (u8*)&pRet->azCol[nArg];
  pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1);
  pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1);
  pRet->bColumnsize = 1;

#ifdef SQLITE_DEBUG
  pRet->bPrefixIndex = 1;
#endif
  if( rc==SQLITE_OK && sqlite3_stricmp(pRet->zName, FTS5_RANK_NAME)==0 ){
    *pzErr = sqlite3_mprintf("reserved fts5 table name: %s", pRet->zName);
    rc = SQLITE_ERROR;
  }







>







505
506
507
508
509
510
511
512
513
514
515
516
517
518
519

  nByte = nArg * (sizeof(char*) + sizeof(u8));
  pRet->azCol = (char**)sqlite3Fts5MallocZero(&rc, nByte);
  pRet->abUnindexed = (u8*)&pRet->azCol[nArg];
  pRet->zDb = sqlite3Fts5Strndup(&rc, azArg[1], -1);
  pRet->zName = sqlite3Fts5Strndup(&rc, azArg[2], -1);
  pRet->bColumnsize = 1;
  pRet->bOffsets = 1;
#ifdef SQLITE_DEBUG
  pRet->bPrefixIndex = 1;
#endif
  if( rc==SQLITE_OK && sqlite3_stricmp(pRet->zName, FTS5_RANK_NAME)==0 ){
    *pzErr = sqlite3_mprintf("reserved fts5 table name: %s", pRet->zName);
    rc = SQLITE_ERROR;
  }
Changes to ext/fts5/fts5_hash.c.
22
23
24
25
26
27
28

29
30
31
32
33
34
35
** This file contains the implementation of an in-memory hash table used
** to accumulate "term -> doclist" content before it is flushed to a level-0
** segment.
*/


struct Fts5Hash {

  int *pnByte;                    /* Pointer to bytes counter */
  int nEntry;                     /* Number of entries currently in hash */
  int nSlot;                      /* Size of aSlot[] array */
  Fts5HashEntry *pScan;           /* Current ordered scan item */
  Fts5HashEntry **aSlot;          /* Array of hash slots */
};








>







22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
** This file contains the implementation of an in-memory hash table used
** to accumulate "term -> doclist" content before it is flushed to a level-0
** segment.
*/


struct Fts5Hash {
  int bOffsets;                   /* Copy of Fts5Config.bOffsets */
  int *pnByte;                    /* Pointer to bytes counter */
  int nEntry;                     /* Number of entries currently in hash */
  int nSlot;                      /* Size of aSlot[] array */
  Fts5HashEntry *pScan;           /* Current ordered scan item */
  Fts5HashEntry **aSlot;          /* Array of hash slots */
};

75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

93
94
95
96
97
98
99
#define FTS5_HASHENTRYSIZE (sizeof(Fts5HashEntry)-8)



/*
** Allocate a new hash table.
*/
int sqlite3Fts5HashNew(Fts5Hash **ppNew, int *pnByte){
  int rc = SQLITE_OK;
  Fts5Hash *pNew;

  *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
  if( pNew==0 ){
    rc = SQLITE_NOMEM;
  }else{
    int nByte;
    memset(pNew, 0, sizeof(Fts5Hash));
    pNew->pnByte = pnByte;


    pNew->nSlot = 1024;
    nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
    pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc(nByte);
    if( pNew->aSlot==0 ){
      sqlite3_free(pNew);
      *ppNew = 0;







|










>







76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#define FTS5_HASHENTRYSIZE (sizeof(Fts5HashEntry)-8)



/*
** Allocate a new hash table.
*/
int sqlite3Fts5HashNew(Fts5Config *pConfig, Fts5Hash **ppNew, int *pnByte){
  int rc = SQLITE_OK;
  Fts5Hash *pNew;

  *ppNew = pNew = (Fts5Hash*)sqlite3_malloc(sizeof(Fts5Hash));
  if( pNew==0 ){
    rc = SQLITE_NOMEM;
  }else{
    int nByte;
    memset(pNew, 0, sizeof(Fts5Hash));
    pNew->pnByte = pnByte;
    pNew->bOffsets = pConfig->bOffsets;

    pNew->nSlot = 1024;
    nByte = sizeof(Fts5HashEntry*) * pNew->nSlot;
    pNew->aSlot = (Fts5HashEntry**)sqlite3_malloc(nByte);
    if( pNew->aSlot==0 ){
      sqlite3_free(pNew);
      *ppNew = 0;
210
211
212
213
214
215
216

217
218
219
220
221
222
223
  char bByte,                     /* First byte of token */
  const char *pToken, int nToken  /* Token to add or remove to or from index */
){
  unsigned int iHash;
  Fts5HashEntry *p;
  u8 *pPtr;
  int nIncr = 0;                  /* Amount to increment (*pHash->pnByte) by */


  /* Attempt to locate an existing hash entry */
  iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
  for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
    if( p->zKey[0]==bByte 
     && memcmp(&p->zKey[1], pToken, nToken)==0 
     && p->zKey[nToken+1]==0 







>







212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
  char bByte,                     /* First byte of token */
  const char *pToken, int nToken  /* Token to add or remove to or from index */
){
  unsigned int iHash;
  Fts5HashEntry *p;
  u8 *pPtr;
  int nIncr = 0;                  /* Amount to increment (*pHash->pnByte) by */
  int bNew = pHash->bOffsets;     /* If non-delete entry should be written */

  /* Attempt to locate an existing hash entry */
  iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
  for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
    if( p->zKey[0]==bByte 
     && memcmp(&p->zKey[1], pToken, nToken)==0 
     && p->zKey[nToken+1]==0 
246
247
248
249
250
251
252

253
254
255
256
257
258
259
    assert( iHash==fts5HashKey(pHash->nSlot, (u8*)p->zKey, nToken+1) );
    p->zKey[nToken+1] = '\0';
    p->nData = nToken+1 + 1 + FTS5_HASHENTRYSIZE;
    p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
    p->iSzPoslist = p->nData;
    p->nData += 1;
    p->iRowid = iRowid;

    p->pHashNext = pHash->aSlot[iHash];
    pHash->aSlot[iHash] = p;
    pHash->nEntry++;
    nIncr += p->nData;
  }

  /* Check there is enough space to append a new entry. Worst case scenario







>







249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
    assert( iHash==fts5HashKey(pHash->nSlot, (u8*)p->zKey, nToken+1) );
    p->zKey[nToken+1] = '\0';
    p->nData = nToken+1 + 1 + FTS5_HASHENTRYSIZE;
    p->nData += sqlite3Fts5PutVarint(&((u8*)p)[p->nData], iRowid);
    p->iSzPoslist = p->nData;
    p->nData += 1;
    p->iRowid = iRowid;
    p->iCol = (pHash->bOffsets-1);
    p->pHashNext = pHash->aSlot[iHash];
    pHash->aSlot[iHash] = p;
    pHash->nEntry++;
    nIncr += p->nData;
  }

  /* Check there is enough space to append a new entry. Worst case scenario
282
283
284
285
286
287
288
289
290
291

292
293
294
295
296
297




298
299
300
301
302
303

304

305
306

307
308
309
310
311
312
313
  /* If this is a new rowid, append the 4-byte size field for the previous
  ** entry, and the new rowid for this entry.  */
  if( iRowid!=p->iRowid ){
    fts5HashAddPoslistSize(p);
    p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iRowid - p->iRowid);
    p->iSzPoslist = p->nData;
    p->nData += 1;
    p->iCol = 0;
    p->iPos = 0;
    p->iRowid = iRowid;

  }

  if( iCol>=0 ){
    /* Append a new column value, if necessary */
    assert( iCol>=p->iCol );
    if( iCol!=p->iCol ){




      pPtr[p->nData++] = 0x01;
      p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
      p->iCol = iCol;
      p->iPos = 0;
    }


    /* Append the new position offset */

    p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
    p->iPos = iPos;

  }else{
    /* This is a delete. Set the delete flag. */
    p->bDel = 1;
  }
  nIncr += p->nData;

  *pHash->pnByte += nIncr;







|


>






>
>
>
>
|
|
|
|
|
|
>
|
>
|
|
>







286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
  /* If this is a new rowid, append the 4-byte size field for the previous
  ** entry, and the new rowid for this entry.  */
  if( iRowid!=p->iRowid ){
    fts5HashAddPoslistSize(p);
    p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iRowid - p->iRowid);
    p->iSzPoslist = p->nData;
    p->nData += 1;
    p->iCol = (pHash->bOffsets-1);
    p->iPos = 0;
    p->iRowid = iRowid;
    bNew = 1;
  }

  if( iCol>=0 ){
    /* Append a new column value, if necessary */
    assert( iCol>=p->iCol );
    if( iCol!=p->iCol ){
      if( pHash->bOffsets==0 ){
        bNew = 1;
        p->iCol = iPos = iCol;
      }else{
        pPtr[p->nData++] = 0x01;
        p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iCol);
        p->iCol = iCol;
        p->iPos = 0;
      }
    }

    /* Append the new position offset, if necessary */
    if( bNew ){
      p->nData += sqlite3Fts5PutVarint(&pPtr[p->nData], iPos - p->iPos + 2);
      p->iPos = iPos;
    }
  }else{
    /* This is a delete. Set the delete flag. */
    p->bDel = 1;
  }
  nIncr += p->nData;

  *pHash->pnByte += nIncr;
Changes to ext/fts5/fts5_index.c.
3997
3998
3999
4000
4001
4002
4003








4004
4005
4006
4007
4008
4009
4010
4011
4012
4013






















4014
4015
4016
4017
4018
4019
4020
typedef struct PoslistCallbackCtx PoslistCallbackCtx;
struct PoslistCallbackCtx {
  Fts5Buffer *pBuf;               /* Append to this buffer */
  Fts5Colset *pColset;            /* Restrict matches to this column */
  int eState;                     /* See above */
};









/*
** TODO: Make this more efficient!
*/
static int fts5IndexColsetTest(Fts5Colset *pColset, int iCol){
  int i;
  for(i=0; i<pColset->nCol; i++){
    if( pColset->aiCol[i]==iCol ) return 1;
  }
  return 0;
}























static void fts5PoslistFilterCallback(
  Fts5Index *p, 
  void *pContext, 
  const u8 *pChunk, int nChunk
){
  PoslistCallbackCtx *pCtx = (PoslistCallbackCtx*)pContext;







>
>
>
>
>
>
>
>










>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
4010
4011
4012
4013
4014
4015
4016
4017
4018
4019
4020
4021
4022
4023
4024
4025
4026
4027
4028
4029
4030
4031
4032
4033
4034
4035
4036
4037
4038
4039
4040
4041
4042
4043
4044
4045
4046
4047
4048
4049
4050
typedef struct PoslistCallbackCtx PoslistCallbackCtx;
struct PoslistCallbackCtx {
  Fts5Buffer *pBuf;               /* Append to this buffer */
  Fts5Colset *pColset;            /* Restrict matches to this column */
  int eState;                     /* See above */
};

/*
** Context object passed to fts5PoslistOffsetsCallback(). The caller
** zeroes the whole structure before setting pBuf and pColset.
*/
typedef struct PoslistOffsetsCtx PoslistOffsetsCtx;
struct PoslistOffsetsCtx {
  Fts5Buffer *pBuf;               /* Append to this buffer */
  Fts5Colset *pColset;            /* Restrict matches to this column */
  int iRead;                      /* Last value decoded from the input */
  int iWrite;                     /* Last value appended to pBuf */
};

/*
** Return 1 if column iCol is one of the entries in pColset->aiCol[], or
** 0 if it is not. This is a linear scan of the array.
**
** TODO: Make this more efficient!
*/
static int fts5IndexColsetTest(Fts5Colset *pColset, int iCol){
  const int nEntry = pColset->nCol;
  int bFound = 0;
  int ii = 0;
  while( ii<nEntry && bFound==0 ){
    bFound = (pColset->aiCol[ii]==iCol);
    ii++;
  }
  return bFound;
}

/*
** Chunk callback used when filtering by column-set on a table for which
** Fts5Config.bOffsets is zero. pContext points to a PoslistOffsetsCtx.
**
** The chunk is decoded as a series of varints, each storing the
** difference between a value and its predecessor, plus 2. Each decoded
** value is tested against the column-set. Values that pass are
** re-encoded in the same way (relative to the previous value written)
** and appended to the output buffer. No space check is made here.
*/
static void fts5PoslistOffsetsCallback(
  Fts5Index *p, 
  void *pContext, 
  const u8 *pChunk, int nChunk
){
  PoslistOffsetsCtx *pOff = (PoslistOffsetsCtx*)pContext;
  int iOff;
  assert_nc( nChunk>=0 );
  for(iOff=0; iOff<nChunk; ){
    int iDecoded;
    iOff += fts5GetVarint32(&pChunk[iOff], iDecoded);

    /* Convert the delta read from the chunk to an absolute value */
    iDecoded = iDecoded + pOff->iRead - 2;
    pOff->iRead = iDecoded;

    if( fts5IndexColsetTest(pOff->pColset, iDecoded) ){
      fts5BufferSafeAppendVarint(pOff->pBuf, iDecoded + 2 - pOff->iWrite);
      pOff->iWrite = iDecoded;
    }
  }
}

static void fts5PoslistFilterCallback(
  Fts5Index *p, 
  void *pContext, 
  const u8 *pChunk, int nChunk
){
  PoslistCallbackCtx *pCtx = (PoslistCallbackCtx*)pContext;
4075
4076
4077
4078
4079
4080
4081







4082
4083
4084
4085
4086

4087

4088
4089
4090
4091
4092
4093
4094
  Fts5Colset *pColset,
  Fts5Buffer *pBuf
){
  if( 0==fts5BufferGrow(&p->rc, pBuf, pSeg->nPos) ){
    if( pColset==0 ){
      fts5ChunkIterate(p, pSeg, (void*)pBuf, fts5PoslistCallback);
    }else{







      PoslistCallbackCtx sCtx;
      sCtx.pBuf = pBuf;
      sCtx.pColset = pColset;
      sCtx.eState = fts5IndexColsetTest(pColset, 0);
      assert( sCtx.eState==0 || sCtx.eState==1 );

      fts5ChunkIterate(p, pSeg, (void*)&sCtx, fts5PoslistFilterCallback);

    }
  }
}

/*
** IN/OUT parameter (*pa) points to a position list n bytes in size. If
** the position list contains entries for column iCol, then (*pa) is set







>
>
>
>
>
>
>
|
|
|
<
|
>
|
>







4105
4106
4107
4108
4109
4110
4111
4112
4113
4114
4115
4116
4117
4118
4119
4120
4121

4122
4123
4124
4125
4126
4127
4128
4129
4130
4131
4132
  Fts5Colset *pColset,
  Fts5Buffer *pBuf
){
  if( 0==fts5BufferGrow(&p->rc, pBuf, pSeg->nPos) ){
    if( pColset==0 ){
      fts5ChunkIterate(p, pSeg, (void*)pBuf, fts5PoslistCallback);
    }else{
      if( p->pConfig->bOffsets==0 ){
        PoslistOffsetsCtx sCtx;
        memset(&sCtx, 0, sizeof(sCtx));
        sCtx.pBuf = pBuf;
        sCtx.pColset = pColset;
        fts5ChunkIterate(p, pSeg, (void*)&sCtx, fts5PoslistOffsetsCallback);
      }else{
        PoslistCallbackCtx sCtx;
        sCtx.pBuf = pBuf;
        sCtx.pColset = pColset;

        assert( sCtx.eState==0 || sCtx.eState==1 );
        sCtx.eState = fts5IndexColsetTest(pColset, 0);
        fts5ChunkIterate(p, pSeg, (void*)&sCtx, fts5PoslistFilterCallback);
      }
    }
  }
}

/*
** IN/OUT parameter (*pa) points to a position list n bytes in size. If
** the position list contains entries for column iCol, then (*pa) is set
4442
4443
4444
4445
4446
4447
4448
4449
4450
4451
4452
4453
4454
4455
4456
** to the document with rowid iRowid.
*/
int sqlite3Fts5IndexBeginWrite(Fts5Index *p, int bDelete, i64 iRowid){
  assert( p->rc==SQLITE_OK );

  /* Allocate the hash table if it has not already been allocated */
  if( p->pHash==0 ){
    p->rc = sqlite3Fts5HashNew(&p->pHash, &p->nPendingData);
  }

  /* Flush the hash table to disk if required */
  if( iRowid<p->iWriteRowid 
   || (iRowid==p->iWriteRowid && p->bDelete==0)
   || (p->nPendingData > p->pConfig->nHashSize) 
  ){







|







4480
4481
4482
4483
4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
** to the document with rowid iRowid.
*/
int sqlite3Fts5IndexBeginWrite(Fts5Index *p, int bDelete, i64 iRowid){
  assert( p->rc==SQLITE_OK );

  /* Allocate the hash table if it has not already been allocated */
  if( p->pHash==0 ){
    p->rc = sqlite3Fts5HashNew(p->pConfig, &p->pHash, &p->nPendingData);
  }

  /* Flush the hash table to disk if required */
  if( iRowid<p->iWriteRowid 
   || (iRowid==p->iWriteRowid && p->bDelete==0)
   || (p->nPendingData > p->pConfig->nHashSize) 
  ){
4800
4801
4802
4803
4804
4805
4806

4807

4808
4809
4810
4811
4812
4813
4814
  const u8 **pp,                  /* OUT: Pointer to position-list data */
  int *pn,                        /* OUT: Size of position-list in bytes */
  i64 *piRowid                    /* OUT: Current rowid */
){
  Fts5SegIter *pSeg = &pIter->aSeg[ pIter->aFirst[1].iFirst ];
  assert( pIter->pIndex->rc==SQLITE_OK );
  *piRowid = pSeg->iRowid;

  if( pSeg->iLeafOffset+pSeg->nPos<=pSeg->pLeaf->szLeaf ){

    u8 *pPos = &pSeg->pLeaf->p[pSeg->iLeafOffset];
    if( pColset==0 || pIter->bFiltered ){
      *pn = pSeg->nPos;
      *pp = pPos;
    }else if( pColset->nCol==1 ){
      *pp = pPos;
      *pn = fts5IndexExtractCol(pp, pSeg->nPos, pColset->aiCol[0]);







>
|
>







4838
4839
4840
4841
4842
4843
4844
4845
4846
4847
4848
4849
4850
4851
4852
4853
4854
  const u8 **pp,                  /* OUT: Pointer to position-list data */
  int *pn,                        /* OUT: Size of position-list in bytes */
  i64 *piRowid                    /* OUT: Current rowid */
){
  Fts5SegIter *pSeg = &pIter->aSeg[ pIter->aFirst[1].iFirst ];
  assert( pIter->pIndex->rc==SQLITE_OK );
  *piRowid = pSeg->iRowid;
  if( pIter->pIndex->pConfig->bOffsets 
   && pSeg->iLeafOffset+pSeg->nPos<=pSeg->pLeaf->szLeaf 
  ){
    u8 *pPos = &pSeg->pLeaf->p[pSeg->iLeafOffset];
    if( pColset==0 || pIter->bFiltered ){
      *pn = pSeg->nPos;
      *pp = pPos;
    }else if( pColset->nCol==1 ){
      *pp = pPos;
      *pn = fts5IndexExtractCol(pp, pSeg->nPos, pColset->aiCol[0]);
Changes to ext/fts5/fts5_storage.c.
821
822
823
824
825
826
827

828
829
830
831
832
833
834
835
836
837
838
839
840
841

842
843
844
845





846





847
848

849
850
851
852
853
854
855
856
*/
typedef struct Fts5IntegrityCtx Fts5IntegrityCtx;
struct Fts5IntegrityCtx {
  i64 iRowid;
  int iCol;
  int szCol;
  u64 cksum;

  Fts5Config *pConfig;
};

/*
** Tokenization callback used by integrity check.
*/
static int fts5StorageIntegrityCallback(
  void *pContext,                 /* Pointer to Fts5InsertCtx object */
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */
){

  Fts5IntegrityCtx *pCtx = (Fts5IntegrityCtx*)pContext;
  if( (tflags & FTS5_TOKEN_COLOCATED)==0 || pCtx->szCol==0 ){
    pCtx->szCol++;
  }





  pCtx->cksum ^= sqlite3Fts5IndexCksum(





      pCtx->pConfig, pCtx->iRowid, pCtx->iCol, pCtx->szCol-1, pToken, nToken
  );

  return SQLITE_OK;
}

/*
** Check that the contents of the FTS index match that of the %_content
** table. Return SQLITE_OK if they do, or SQLITE_CORRUPT if not. Return
** some other SQLite error code if an error occurs while attempting to
** determine this.







>







|






>




>
>
>
>
>
|
>
>
>
>
>
|
|
>
|







821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
*/
/* Context object passed to fts5StorageIntegrityCallback(). */
typedef struct Fts5IntegrityCtx Fts5IntegrityCtx;
struct Fts5IntegrityCtx {
  i64 iRowid;                     /* Rowid passed to sqlite3Fts5IndexCksum() */
  int iCol;                       /* Column passed to sqlite3Fts5IndexCksum() */
  int szCol;                      /* Tokens counted so far (see callback) */
  u64 cksum;                      /* XOR of all per-token checksums */
  Fts5Termset *pTermset;          /* If not NULL, set of terms already seen */
  Fts5Config *pConfig;            /* Passed to sqlite3Fts5IndexCksum() */
};

/*
** Tokenization callback used by integrity check.
**
** The token count (szCol) is incremented for each token, except that a
** token flagged FTS5_TOKEN_COLOCATED is only counted if the count is
** still zero.
**
** If the context has no term-set, the checksum for the token at
** position (szCol-1) of column iCol is XORed into the running checksum.
** Otherwise the token is added to the term-set and, only if it was not
** already present, a checksum calculated with 0 as the column and iCol
** as the position is XORed in. An error from the term-set is returned
** to the caller.
*/
static int fts5StorageIntegrityCallback(
  void *pContext,                 /* Pointer to Fts5IntegrityCtx object */
  int tflags,
  const char *pToken,             /* Buffer containing token */
  int nToken,                     /* Size of token in bytes */
  int iStart,                     /* Start offset of token */
  int iEnd                        /* End offset of token */
){
  Fts5IntegrityCtx *pInteg = (Fts5IntegrityCtx*)pContext;
  int bSeen = 0;
  int rc;

  if( pInteg->szCol==0 || (tflags & FTS5_TOKEN_COLOCATED)==0 ){
    pInteg->szCol++;
  }

  if( pInteg->pTermset==0 ){
    pInteg->cksum ^= sqlite3Fts5IndexCksum(
        pInteg->pConfig, pInteg->iRowid, pInteg->iCol, pInteg->szCol-1, 
        pToken, nToken
    );
    return SQLITE_OK;
  }

  rc = sqlite3Fts5TermsetAdd(pInteg->pTermset, pToken, nToken, &bSeen);
  if( rc!=SQLITE_OK || bSeen!=0 ){
    return rc;
  }
  pInteg->cksum ^= sqlite3Fts5IndexCksum(
      pInteg->pConfig, pInteg->iRowid, 0, pInteg->iCol, pToken, nToken
  );
  return rc;
}

/*
** Check that the contents of the FTS index match that of the %_content
** table. Return SQLITE_OK if they do, or SQLITE_CORRUPT if not. Return
** some other SQLite error code if an error occurs while attempting to
** determine this.
882
883
884
885
886
887
888




889
890
891
892
893
894
895

896
897
898
899


900
901
902
903
904
905
906
      if( pConfig->bColumnsize ){
        rc = sqlite3Fts5StorageDocsize(p, ctx.iRowid, aColSize);
      }
      for(i=0; rc==SQLITE_OK && i<pConfig->nCol; i++){
        if( pConfig->abUnindexed[i] ) continue;
        ctx.iCol = i;
        ctx.szCol = 0;




        rc = sqlite3Fts5Tokenize(pConfig, 
            FTS5_TOKENIZE_DOCUMENT,
            (const char*)sqlite3_column_text(pScan, i+1),
            sqlite3_column_bytes(pScan, i+1),
            (void*)&ctx,
            fts5StorageIntegrityCallback
        );

        if( pConfig->bColumnsize && ctx.szCol!=aColSize[i] ){
          rc = FTS5_CORRUPT;
        }
        aTotalSize[i] += ctx.szCol;


      }
      if( rc!=SQLITE_OK ) break;
    }
    rc2 = sqlite3_reset(pScan);
    if( rc==SQLITE_OK ) rc = rc2;
  }








>
>
>
>
|
|
|
|
|
|
|
>
|



>
>







895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
      if( pConfig->bColumnsize ){
        rc = sqlite3Fts5StorageDocsize(p, ctx.iRowid, aColSize);
      }
      for(i=0; rc==SQLITE_OK && i<pConfig->nCol; i++){
        if( pConfig->abUnindexed[i] ) continue;
        ctx.iCol = i;
        ctx.szCol = 0;
        if( pConfig->bOffsets==0 ){
          rc = sqlite3Fts5TermsetNew(&ctx.pTermset);
        }
        if( rc==SQLITE_OK ){
          rc = sqlite3Fts5Tokenize(pConfig, 
              FTS5_TOKENIZE_DOCUMENT,
              (const char*)sqlite3_column_text(pScan, i+1),
              sqlite3_column_bytes(pScan, i+1),
              (void*)&ctx,
              fts5StorageIntegrityCallback
          );
        }
        if( rc==SQLITE_OK && pConfig->bColumnsize && ctx.szCol!=aColSize[i] ){
          rc = FTS5_CORRUPT;
        }
        aTotalSize[i] += ctx.szCol;
        sqlite3Fts5TermsetFree(ctx.pTermset);
        ctx.pTermset = 0;
      }
      if( rc!=SQLITE_OK ) break;
    }
    rc2 = sqlite3_reset(pScan);
    if( rc==SQLITE_OK ) rc = rc2;
  }

Added ext/fts5/test/fts5offsets.test.






















































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# 2015 December 18
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS5 module.
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5offsets

# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
ifcapable !fts5 {
  finish_test
  return
}


# Test 1.0: create an fts5 table with three columns and the offsets=0
# option, then populate it with ten rows. The trailing SQL comments give
# the rowid each INSERT is expected to produce.
do_execsql_test 1.0 {
  CREATE VIRTUAL TABLE t1 USING fts5(a, b, c, offsets=0);

  INSERT INTO t1 VALUES('h d g', 'j b b g b', 'i e i d h g g'); -- 1
  INSERT INTO t1 VALUES('h j d', 'j h d a h', 'f d d g g f b'); -- 2
  INSERT INTO t1 VALUES('j c i', 'f f h e f', 'c j i j c h f'); -- 3
  INSERT INTO t1 VALUES('e g g', 'g e d h i', 'e d b e g d c'); -- 4
  INSERT INTO t1 VALUES('b c c', 'd i h a f', 'd i j f a b c'); -- 5
  INSERT INTO t1 VALUES('e d e', 'b c j g d', 'a i f d h b d'); -- 6
  INSERT INTO t1 VALUES('g h e', 'b c d i d', 'e f c i f i c'); -- 7
  INSERT INTO t1 VALUES('c f j', 'j j i e a', 'h a c f d h e'); -- 8
  INSERT INTO t1 VALUES('a h i', 'c i a f a', 'c f d h g d g'); -- 9
  INSERT INTO t1 VALUES('j g g', 'e f e f f', 'h j b i c g e'); -- 10
}

# Test 1.1: run the integrity-check command against the populated table.
do_execsql_test 1.1 {
  INSERT INTO t1(t1) VALUES('integrity-check');
}

# Tests 1.2.*: each entry is a test number, a column-filtered query of the
# form "column:term", and the rowids expected to match. Every query is run
# twice: once as written (1.2.N.1) and once with '*' appended to the query
# string (1.2.N.2). Both runs must return the same rowids.
foreach {tn match res} {
  1 "a:a" {9}
  2 "b:g" {1 4 6}
  3 "c:h" {1 3 6 8 9 10}
} {
  do_execsql_test 1.2.$tn.1 {
    SELECT rowid FROM t1($match);
  } $res

  do_execsql_test 1.2.$tn.2 {
    SELECT rowid FROM t1($match || '*');
  } $res
}

finish_test