/ Check-in [ce972f6a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:When scanning the full-text index as part of the fts5 integrity-check, also run a point query for every term and verify that these results are consistent with those found by the linear scan.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1:ce972f6aab90f6929d018696f1ab3c2649eca802
User & Date: dan 2015-03-21 15:37:19
Context
2015-03-21
15:45
Merge trunk changes with this branch. check-in: 14274391 user: dan tags: fts5
15:37
When scanning the full-text index as part of the fts5 integrity-check, also run a point query for every term and verify that these results are consistent with those found by the linear scan. check-in: ce972f6a user: dan tags: fts5
2015-03-11
14:51
Add an optimization to the fts5 unicode tokenizer code. check-in: f5db4892 user: dan tags: fts5
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5Int.h.

   238    238   
   239    239   /*
   240    240   ** for(
   241    241   **   pIter = sqlite3Fts5IndexQuery(p, "token", 5, 0);
   242    242   **   0==sqlite3Fts5IterEof(pIter);
   243    243   **   sqlite3Fts5IterNext(pIter)
   244    244   ** ){
   245         -**   i64 iDocid = sqlite3Fts5IndexDocid(pIter);
          245  +**   i64 iRowid = sqlite3Fts5IterRowid(pIter);
   246    246   ** }
   247    247   */
   248    248   
   249    249   /*
   250    250   ** Open a new iterator to iterate through all docids that match the
   251    251   ** specified token or token prefix.
   252    252   */

Changes to ext/fts5/fts5_index.c.

  4308   4308   ** error, or some other SQLite error code if another error (e.g. OOM)
  4309   4309   ** occurs.
  4310   4310   */
  4311   4311   int sqlite3Fts5IndexIntegrityCheck(Fts5Index *p, u64 cksum){
  4312   4312     Fts5Config *pConfig = p->pConfig;
  4313   4313     int iIdx;                       /* Used to iterate through indexes */
  4314   4314     u64 cksum2 = 0;                 /* Checksum based on contents of indexes */
         4315  +  u64 cksum3 = 0;                 /* Checksum based on per-term queries */
         4316  +  Fts5Buffer term = {0,0,0};      /* Buffer used to hold most recent term */
  4315   4317   
  4316   4318     /* Check that the internal nodes of each segment match the leaves */
  4317   4319     for(iIdx=0; p->rc==SQLITE_OK && iIdx<=pConfig->nPrefix; iIdx++){
  4318   4320       Fts5Structure *pStruct = fts5StructureRead(p, iIdx);
  4319   4321       if( pStruct ){
  4320   4322         int iLvl, iSeg;
  4321   4323         for(iLvl=0; iLvl<pStruct->nLevel; iLvl++){
................................................................................
  4324   4326             fts5IndexIntegrityCheckSegment(p, iIdx, pSeg);
  4325   4327           }
  4326   4328         }
  4327   4329       }
  4328   4330       fts5StructureRelease(pStruct);
  4329   4331     }
  4330   4332   
  4331         -  /* Check that the checksum of the index matches the argument checksum */
         4333  +  /* The cksum argument passed to this function is a checksum calculated
         4334  +  ** based on all expected entries in the FTS index (including prefix index
         4335  +  ** entries). This block checks that a checksum calculated based on the
         4336  +  ** actual contents of FTS index is identical.
         4337  +  **
         4338  +  ** Two versions of the same checksum are calculated. The first (stack
         4339  +  ** variable cksum2) based on entries extracted from the full-text index
         4340  +  ** while doing a linear scan of each individual index in turn. 
         4341  +  **
         4342  +  ** As each term is visited by the linear scans, a separate query for the
         4343  +  ** same term is performed. cksum3 is calculated based on the entries
         4344  +  ** extracted by these queries.
         4345  +  */
  4332   4346     for(iIdx=0; iIdx<=pConfig->nPrefix; iIdx++){
  4333   4347       Fts5MultiSegIter *pIter;
  4334   4348       Fts5Structure *pStruct = fts5StructureRead(p, iIdx);
  4335   4349       for(fts5MultiIterNew(p, pStruct, iIdx, 0, 0, 0, 0, -1, 0, &pIter);
  4336   4350           fts5MultiIterEof(p, pIter)==0;
  4337   4351           fts5MultiIterNext(p, pIter, 0, 0)
  4338   4352       ){
  4339   4353         Fts5PosIter sPos;           /* Used to iterate through position list */
  4340   4354         int n;                      /* Size of term in bytes */
  4341   4355         i64 iRowid = fts5MultiIterRowid(pIter);
  4342   4356         char *z = (char*)fts5MultiIterTerm(pIter, &n);
  4343   4357   
         4358  +      /* Update cksum2 with the entries associated with the current term
         4359  +      ** and rowid.  */
  4344   4360         for(fts5PosIterInit(p, pIter, &sPos);
  4345   4361             fts5PosIterEof(p, &sPos)==0;
  4346   4362             fts5PosIterNext(p, &sPos)
  4347   4363         ){
  4348   4364           cksum2 ^= fts5IndexEntryCksum(iRowid, sPos.iCol, sPos.iPos, z, n);
  4349         -#if 0
  4350         -        fprintf(stdout, "rowid=%d ", (int)iRowid);
  4351         -        fprintf(stdout, "term=%.*s ", n, z);
  4352         -        fprintf(stdout, "col=%d ", sPos.iCol);
  4353         -        fprintf(stdout, "off=%d\n", sPos.iPos);
  4354         -        fflush(stdout);
  4355         -#endif
         4365  +      }
         4366  +
         4367  +      /* If this is a new term, query for it. Update cksum3 with the results. */
         4368  +      if( p->rc==SQLITE_OK && (term.n!=n || memcmp(term.p, z, n)) ){
         4369  +        Fts5IndexIter *pIdxIter = 0;
         4370  +        int flags = (iIdx==0 ? 0 : FTS5INDEX_QUERY_PREFIX);
         4371  +        int rc = sqlite3Fts5IndexQuery(p, z, n, flags, &pIdxIter);
         4372  +        while( rc==SQLITE_OK && 0==sqlite3Fts5IterEof(pIdxIter) ){
         4373  +          const u8 *pPos;
         4374  +          int nPos;
         4375  +          i64 rowid = sqlite3Fts5IterRowid(pIdxIter);
         4376  +          rc = sqlite3Fts5IterPoslist(pIdxIter, &pPos, &nPos);
         4377  +          if( rc==SQLITE_OK ){
         4378  +            Fts5PoslistReader sReader;
         4379  +            for(sqlite3Fts5PoslistReaderInit(-1, pPos, nPos, &sReader);
         4380  +                sReader.bEof==0;
         4381  +                sqlite3Fts5PoslistReaderNext(&sReader)
         4382  +            ){
         4383  +              int iCol = FTS5_POS2COLUMN(sReader.iPos);
         4384  +              int iOff = FTS5_POS2OFFSET(sReader.iPos);
         4385  +              cksum3 ^= fts5IndexEntryCksum(rowid, iCol, iOff, z, n);
         4386  +            }
         4387  +            rc = sqlite3Fts5IterNext(pIdxIter);
         4388  +          }
         4389  +        }
         4390  +        sqlite3Fts5IterClose(pIdxIter);
         4391  +        fts5BufferSet(&rc, &term, n, (const u8*)z);
         4392  +        p->rc = rc;
  4356   4393         }
  4357   4394       }
  4358   4395       fts5MultiIterFree(p, pIter);
  4359   4396       fts5StructureRelease(pStruct);
  4360   4397     }
  4361   4398     if( p->rc==SQLITE_OK && cksum!=cksum2 ) p->rc = FTS5_CORRUPT;
         4399  +  if( p->rc==SQLITE_OK && cksum!=cksum3 ) p->rc = FTS5_CORRUPT;
  4362   4400   
         4401  +  fts5BufferFree(&term);
  4363   4402     return fts5IndexReturn(p);
  4364   4403   }
  4365   4404   
  4366   4405   
  4367   4406   /*
  4368   4407   ** Indicate that all subsequent calls to sqlite3Fts5IndexWrite() pertain
  4369   4408   ** to the document with rowid iRowid.