/ Check-in [ae502657]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add support for OR and NOT terms in fts1. (CVS 3399)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:ae50265791d1a7500aa3c405a78a9bca8ff0cc08
User & Date: drh 2006-09-09 23:11:51
Context
2006-09-10
03:34
Add some simple test cases for the OR and NOT logic of the fts1 module. Fix lots of bugs discovered while developing these test cases. (CVS 3400) check-in: 70bcff02 user: drh tags: trunk
2006-09-09
23:11
Add support for OR and NOT terms in fts1. (CVS 3399) check-in: ae502657 user: drh tags: trunk
2006-09-08
17:00
Write doclists using a segmented technique to amortize costs better. New items for a term are merged with the term's segment 0 doclist, until that doclist exceeds CHUNK_MAX. Then the segments are merged in exponential fashion, so that segment 1 contains approximately 2*CHUNK_MAX data, segment 2 4*CHUNK_MAX, and so on. (CVS 3398) check-in: b6b93a33 user: shess tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1.c.

   230    230     addPos(d, iPos);
   231    231     appendVarint(d, iStartOffset-d->iLastOffset);
   232    232     d->iLastOffset = iStartOffset;
   233    233     appendVarint(d, iEndOffset-iStartOffset);
   234    234     appendVarint(d, 0);  /* add new terminator */
   235    235   }
   236    236   
          237  +/*
          238  +** A DocListReader object is a cursor into a doclist.  Initialize
          239  +** the cursor to the beginning of the doclist by calling readerInit().
          240  +** Then use routines
          241  +**
          242  +**      peekDocid()
          243  +**      readDocid()
          244  +**      readPosition()
          245  +**      skipPositionList()
          246  +**      and so forth...
          247  +**
          248  +** to read information out of the doclist.  When we reach the end
          249  +** of the doclist, atEnd() returns TRUE.
          250  +*/
   237    251   typedef struct DocListReader {
   238         -  DocList *pDoclist;
   239         -  char *p;
          252  +  DocList *pDoclist;  /* The document list we are stepping through */
          253  +  char *p;            /* Pointer to next unread byte in the doclist */
   240    254     int iLastPos;  /* the last position read, or -1 when not in a position list */
   241    255   } DocListReader;
   242    256   
          257  +/*
          258  +** Initialize the DocListReader r to point to the beginning of pDoclist.
          259  +*/
   243    260   static void readerInit(DocListReader *r, DocList *pDoclist){
   244    261     r->pDoclist = pDoclist;
   245    262     if( pDoclist!=NULL ){
   246    263       r->p = pDoclist->pData;
   247    264     }
   248    265     r->iLastPos = -1;
   249    266   }
   250    267   
          268  +/*
          269  +** Return TRUE if we have reached then end of pReader and there is
          270  +** nothing else left to read.
          271  +*/
   251    272   static int atEnd(DocListReader *pReader){
   252         -  return pReader->p >= docListEnd(pReader->pDoclist);
          273  +  return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
   253    274   }
   254    275   
   255         -/* Peek at the next docid without advancing the read pointer. */
          276  +/* Peek at the next docid without advancing the read pointer. 
          277  +*/
   256    278   static sqlite_int64 peekDocid(DocListReader *pReader){
   257    279     sqlite_int64 ret;
   258    280     assert( !atEnd(pReader) );
   259    281     assert( pReader->iLastPos==-1 );
   260    282     getVarint(pReader->p, &ret);
   261    283     return ret;
   262    284   }
   263    285   
   264         -/* Read the next docid. */
          286  +/* Read the next docid. 
          287  +*/
   265    288   static sqlite_int64 readDocid(DocListReader *pReader){
   266    289     sqlite_int64 ret;
   267    290     assert( !atEnd(pReader) );
   268    291     assert( pReader->iLastPos==-1 );
   269    292     pReader->p += getVarint(pReader->p, &ret);
   270    293     if( pReader->pDoclist->iType>=DL_POSITIONS ){
   271    294       pReader->iLastPos = 0;
................................................................................
   274    297   }
   275    298   
   276    299   /* Read the next position from a position list.
   277    300    * Returns the position, or -1 at the end of the list. */
   278    301   static int readPosition(DocListReader *pReader){
   279    302     int i;
   280    303     int iType = pReader->pDoclist->iType;
   281         -  assert( iType>=DL_POSITIONS );
          304  +
   282    305     assert( !atEnd(pReader) );
   283    306     assert( pReader->iLastPos!=-1 );
   284    307   
          308  +  if( iType<DL_POSITIONS ){
          309  +    return -1;
          310  +  }
   285    311     pReader->p += getVarint32(pReader->p, &i);
   286    312     if( i==0 ){
   287    313       pReader->iLastPos = -1;
   288    314       return -1;
   289    315     }
   290    316     pReader->iLastPos += ((int) i)-1;
   291    317     if( iType>=DL_POSITIONS_OFFSETS ){
................................................................................
   295    321       pReader->p += getVarint32(pReader->p, &iEnd);
   296    322     }
   297    323     return pReader->iLastPos;
   298    324   }
   299    325   
   300    326   /* Skip past the end of a position list. */
   301    327   static void skipPositionList(DocListReader *pReader){
   302         -  while( readPosition(pReader)!=-1 )
   303         -    ;
          328  +  DocList *p = pReader->pDoclist;
          329  +  if( p && p->iType>=DL_POSITIONS ){
          330  +    while( readPosition(pReader)!=-1 ){}
          331  +  }
   304    332   }
   305    333   
   306    334   /* Skip over a docid, including its position list if the doclist has
   307    335    * positions. */
   308    336   static void skipDocument(DocListReader *pReader){
   309    337     readDocid(pReader);
   310         -  if( pReader->pDoclist->iType>=DL_POSITIONS ){
   311         -    skipPositionList(pReader);
   312         -  }
          338  +  skipPositionList(pReader);
   313    339   }
   314    340   
   315    341   /* Skip past all docids which are less than [iDocid].  Returns 1 if a docid
   316    342    * matching [iDocid] was found.  */
   317    343   static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
   318    344     sqlite_int64 d = 0;
   319    345     while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
   320    346       skipDocument(pReader);
   321    347     }
   322    348     return !atEnd(pReader) && d==iDocid;
   323    349   }
   324    350   
          351  +/* Return the first document in a document list.
          352  +*/
   325    353   static sqlite_int64 firstDocid(DocList *d){
   326    354     DocListReader r;
   327    355     readerInit(&r, d);
   328    356     return readDocid(&r);
   329    357   }
   330    358   
          359  +#ifdef SQLITE_DEBUG
          360  +/*
          361  +** This routine is used for debugging purpose only.
          362  +**
          363  +** Write the content of a doclist to standard output.
          364  +*/
          365  +static void printDoclist(DocList *p){
          366  +  DocListReader r;
          367  +  const char *zSep = "";
          368  +
          369  +  readerInit(&r, p);
          370  +  while( !atEnd(&r) ){
          371  +    sqlite_int64 docid = readDocid(&r);
          372  +    if( docid==0 ){
          373  +      skipPositionList(&r);
          374  +      continue;
          375  +    }
          376  +    printf("%s%lld", zSep, docid);
          377  +    zSep =  ",";
          378  +    if( p->iType>=DL_POSITIONS ){
          379  +      int iPos;
          380  +      const char *zDiv = "";
          381  +      printf("(");
          382  +      while( (iPos = readPosition(&r))>=0 ){
          383  +        printf("%s%d", zDiv, iPos);
          384  +        zDiv = ":";
          385  +      }
          386  +      printf(")");
          387  +    }
          388  +  }
          389  +  printf("\n");
          390  +  fflush(stdout);
          391  +}
          392  +#endif /* SQLITE_DEBUG */
          393  +
   331    394   /* Helper function for docListUpdate() and docListAccumulate().
   332    395   ** Splices a doclist element into the doclist represented by r,
   333    396   ** leaving r pointing after the newly spliced element.
   334    397   */
   335    398   static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
   336    399                                    const char *pSource, int nSource){
   337    400     DocList *d = r->pDoclist;
................................................................................
   406    469       char *pSource = updateReader.p;
   407    470       sqlite_int64 iDocid = readDocid(&updateReader);
   408    471       skipPositionList(&updateReader);
   409    472       docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
   410    473     }
   411    474   }
   412    475   
   413         -/* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked
   414         - * on-disk doclist, resulting in another in-memory DocList [out].  [in]
   415         - * and [out] may or may not store position information according to the
   416         - * caller's wishes.  The on-disk doclist always comes with positions.
   417         - *
   418         - * The caller must read each chunk of the on-disk doclist in succession and
   419         - * pass it to mergeBlock().
   420         - *
   421         - * If [in] has positions, then the merge output contains only documents with
   422         - * matching positions in the two input doclists.  If [in] does not have
   423         - * positions, then the merge output contains all documents common to the two
   424         - * input doclists.
   425         - *
   426         - * If [in] is NULL, then the on-disk doclist is copied to [out] directly.
   427         - *
   428         - * A merge is performed using an integer [iPhrasePos] provided by the caller.
   429         - * [iPhrasePos] is subtracted from each position in the on-disk doclist for the
   430         - * purpose of position comparison; this is helpful in implementing phrase
   431         - * searches.
   432         - *
   433         - * A DocListMerge is not yet able to propagate offsets through query
   434         - * processing; we should add that capability soon.
   435         -*/
   436         -/* TODO(shess) Adam indicates that since we no longer can stream
   437         -** ordered doclist chunks, DocListMerge is no longer as useful and
   438         -** should be removed.  Not removing at this time so that the removal
   439         -** doesn't obscure the exponential-chunking change.
   440         -*/
   441         -typedef struct DocListMerge {
   442         -  DocListReader in;
   443         -  DocList *pOut;
   444         -  int iPhrasePos;
   445         -} DocListMerge;
   446         -
   447         -static void mergeInit(DocListMerge *m,
   448         -                      DocList *pIn, int iPhrasePos, DocList *pOut){
   449         -  readerInit(&m->in, pIn);
   450         -  m->pOut = pOut;
   451         -  m->iPhrasePos = iPhrasePos;
   452         -
   453         -  /* can't handle offsets yet */
   454         -  assert( pIn==NULL || pIn->iType<=DL_POSITIONS );
   455         -  assert( pOut->iType<=DL_POSITIONS );
   456         -}
   457         -
   458         -/* A helper function for mergeBlock(), below.  Merge the position lists
   459         - * pointed to by m->in and pBlockReader.
   460         - * If the merge matches, write [iDocid] to m->pOut; if m->pOut
   461         - * has positions then write all matching positions as well. */
   462         -static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,
   463         -                         DocListReader *pBlockReader){
   464         -  int iBlockPos = readPosition(pBlockReader);
   465         -  int iInPos = readPosition(&m->in);
          476  +/*
          477  +** pLeft and pRight are two DocListReaders that are pointing to
          478  +** positions lists of the same document: iDocid. 
          479  +**
          480  +** If there are no instances in pLeft or pRight where the position
          481  +** of pLeft is iOffset less than the position of pRight, then this
          482  +** routine adds nothing to pOut.
          483  +**
          484  +** If there are one or more instances where positions from pLeft
          485  +** are exactly iOffset less than positions from pRight, then add a new
          486  +** document record to pOut and include the positions from pLeft.
          487  +**
          488  +** pLeft and pRight are left pointing at the next document.
          489  +*/
          490  +static void mergePosList(
          491  +  DocListReader *pLeft,    /* Left position list */
          492  +  DocListReader *pRight,   /* Right position list */
          493  +  sqlite_int64 iDocid,     /* The docid from pLeft and pRight */
          494  +  int iOffset,             /* Copy if pLeft.pos+iOffset==pRight.iPos */
          495  +  DocList *pOut            /* Write the merged document record here */
          496  +){
          497  +  int iLeftPos = readPosition(pLeft);
          498  +  int iRightPos = readPosition(pRight);
   466    499     int match = 0;
   467    500   
   468    501     /* Loop until we've reached the end of both position lists. */
   469         -  while( iBlockPos!=-1 || iInPos!=-1 ){
   470         -    if( iBlockPos-m->iPhrasePos==iInPos ){
          502  +  while( iLeftPos!=-1 && iRightPos!=-1 ){
          503  +    if( iLeftPos+iOffset==iRightPos ){
   471    504         if( !match ){
   472         -        docListAddDocid(m->pOut, iDocid);
          505  +        docListAddDocid(pOut, iDocid);
   473    506           match = 1;
   474    507         }
   475         -      if( m->pOut->iType>=DL_POSITIONS ){
   476         -        docListAddPos(m->pOut, iInPos);
          508  +      if( pOut->iType>=DL_POSITIONS ){
          509  +        docListAddPos(pOut, iLeftPos);
   477    510         }
   478         -      iBlockPos = readPosition(pBlockReader);
   479         -      iInPos = readPosition(&m->in);
   480         -    } else if( iInPos==-1 || (iBlockPos!=-1 && iBlockPos-m->iPhrasePos<iInPos) ){
   481         -      iBlockPos = readPosition(pBlockReader);
   482         -    } else {
   483         -      iInPos = readPosition(&m->in);
   484         -    }
   485         -  }
   486         -}
   487         -
   488         -/* A helper function for mergeBlock(), below.  Copy the docid and
   489         - * position list (if wanted) from pBlockReader to pOut. */
   490         -static void copyDocument(DocList *pOut, sqlite_int64 iDocid,
   491         -                         DocListReader *pBlockReader){
   492         -  docListAddDocid(pOut, iDocid);
   493         -  if( pOut->iType<DL_POSITIONS ){
   494         -    skipPositionList(pBlockReader);
   495         -  } else {
   496         -    /* Copy all positions to the output doclist. */
   497         -    int pos;
   498         -    while( (pos = readPosition(pBlockReader))!=-1 ){
   499         -      docListAddPos(pOut, pos);
   500         -    }
   501         -  }
   502         -}
   503         -
   504         -/* Merge one block of an on-disk doclist into a DocListMerge. */
   505         -static void mergeBlock(DocListMerge *m, DocList *pBlock){
   506         -  DocListReader blockReader;
   507         -  assert( pBlock->iType>=DL_POSITIONS );
   508         -  readerInit(&blockReader, pBlock);
   509         -  while( !atEnd(&blockReader) ){
   510         -    sqlite_int64 iDocid = readDocid(&blockReader);
   511         -    if( m->in.pDoclist==NULL ){
   512         -      /* Skip document delete crumbs */
   513         -      if( *blockReader.p=='\0' ){
   514         -        skipPositionList(&blockReader);
   515         -      } else {
   516         -        copyDocument(m->pOut, iDocid, &blockReader);
   517         -      }
   518         -      continue;
   519         -    }
   520         -    if( skipToDocid(&m->in, iDocid) ){  /* we have a docid match */
   521         -      readDocid(&m->in);
   522         -      /* Skip document delete crumbs */
   523         -      if( *blockReader.p=='\0' ){
   524         -        skipPositionList(&blockReader);
          511  +      iLeftPos = readPosition(pLeft);
          512  +      iRightPos = readPosition(pRight);
          513  +    }else if( iRightPos<iLeftPos+iOffset ){
          514  +      iRightPos = readPosition(pRight);
          515  +    }else{
          516  +      iLeftPos = readPosition(pLeft);
          517  +    }
          518  +  }
          519  +  if( iLeftPos>=0 ) skipPositionList(pLeft);
          520  +  if( iRightPos>=0 ) skipPositionList(pRight);
          521  +}
          522  +
          523  +/*
          524  +** Read the next non-deleted docid off of pIn.  Return
          525  +** 0 if we reach the end of pDoclist.
          526  +*/
          527  +static sqlite_int64 nextValidDocid(DocListReader *pIn){
          528  +  sqlite_int64 docid = 0;
          529  +  while( !atEnd(pIn) && (docid = readDocid(pIn))==0 ){
          530  +    skipPositionList(pIn);
          531  +  }
          532  +  return docid;
          533  +}
          534  +
          535  +/* We have two doclists:  pLeft and pRight.
          536  +** Write the intersection of these two doclists into pOut.
          537  +**
          538  +** nLeftPhrase is the number of words of a phrase that have
          539  +** contributed to pLeft.
          540  +*/
          541  +static void mergeBlockAnd(
          542  +  DocList *pLeft,    /* Doclist resulting from the words on the left */
          543  +  DocList *pRight,   /* Doclist for the next word to the right */
          544  +  int nLeftPhrase,   /* Number of matching phrase terms in pLeft */
          545  +  DocList *pOut      /* Write the combined doclist here */
          546  +){
          547  +  DocListReader left, right;
          548  +  sqlite_int64 docidLeft, docidRight;
          549  +
          550  +  readerInit(&left, pLeft);
          551  +  readerInit(&right, pRight);
          552  +  docidLeft = nextValidDocid(&left);
          553  +  docidRight = nextValidDocid(&right);
          554  +
          555  +  while( docidLeft>0 && docidRight>0 ){
          556  +    if( docidLeft<docidRight ){
          557  +      skipPositionList(&left);
          558  +      docidLeft = nextValidDocid(&left);
          559  +    }else if( docidRight<docidLeft ){
          560  +      skipPositionList(&right);
          561  +      docidRight = nextValidDocid(&right);
          562  +    }else{
          563  +      /* Found a match */
          564  +      if( pLeft->iType>=DL_POSITIONS ){
          565  +        mergePosList(&left, &right, docidLeft, nLeftPhrase, pOut);
   525    566         }else{
   526         -        if( m->in.pDoclist->iType>=DL_POSITIONS ){
   527         -          mergePosList(m, iDocid, &blockReader);
   528         -        } else {
   529         -          copyDocument(m->pOut, iDocid, &blockReader);
   530         -        }
          567  +        docListAddDocid(pOut, docidLeft);
          568  +        skipPositionList(&left);
          569  +        skipPositionList(&right);
   531    570         }
   532         -    } else if( !atEnd(&m->in) ){
   533         -      skipPositionList(&blockReader);  /* skip this docid in the block */
   534         -    } else return;  /* nothing more to merge */
          571  +      docidLeft = nextValidDocid(&left);
          572  +      docidRight = nextValidDocid(&right);
          573  +    }
          574  +  }
          575  +}
          576  +
          577  +/* We have two doclists:  pLeft and pRight.
          578  +** Write into pOut all documents that occur in pLeft but not
          579  +** in pRight.
          580  +**
          581  +** The output pOut never holds positions.
          582  +*/
          583  +static void mergeBlockExcept(
          584  +  DocList *pLeft,    /* Doclist resulting from the words on the left */
          585  +  DocList *pRight,   /* Doclist for the next word to the right */
          586  +  DocList *pOut      /* Write the combined doclist here */
          587  +){
          588  +  DocListReader left, right;
          589  +  sqlite_int64 docidLeft, docidRight, priorLeft;
          590  +
          591  +  readerInit(&left, pLeft);
          592  +  readerInit(&right, pRight);
          593  +  docidLeft = nextValidDocid(&left);
          594  +  docidRight = nextValidDocid(&right);
          595  +
          596  +  while( docidLeft>0 ){
          597  +    priorLeft = docidLeft;
          598  +    if( docidRight==0 || docidLeft<docidRight ){
          599  +      docListAddDocid(pOut, docidLeft);
          600  +    }
          601  +    if( docidRight==0 || docidLeft<=docidRight ){
          602  +      skipPositionList(&left);
          603  +      docidLeft = nextValidDocid(&left);
          604  +    }
          605  +    if( docidRight>0 && docidRight<=priorLeft ){
          606  +      skipPositionList(&right);
          607  +      docidRight = nextValidDocid(&right);
          608  +    }
          609  +  }
          610  +}
          611  +
          612  +/* We have two doclists:  pLeft and pRight.
          613  +** Write the union of these two doclists into pOut.
          614  +**
          615  +** The output pOut never holds positions.
          616  +*/
          617  +static void mergeBlockOr(
          618  +  DocList *pLeft,    /* Doclist resulting from the words on the left */
          619  +  DocList *pRight,   /* Doclist for the next word to the right */
          620  +  DocList *pOut      /* Write the combined doclist here */
          621  +){
          622  +  DocListReader left, right;
          623  +  sqlite_int64 docidLeft, docidRight, priorLeft;
          624  +
          625  +  readerInit(&left, pLeft);
          626  +  readerInit(&right, pRight);
          627  +  docidLeft = nextValidDocid(&left);
          628  +  docidRight = nextValidDocid(&right);
          629  +
          630  +  while( docidLeft>0 && docidRight>0 ){
          631  +    if( docidLeft<=docidRight ){
          632  +      docListAddDocid(pOut, docidLeft);
          633  +    }else{
          634  +      docListAddDocid(pOut, docidRight);
          635  +    }
          636  +    priorLeft = docidLeft;
          637  +    if( docidLeft<=docidRight ){
          638  +      skipPositionList(&left);
          639  +      docidLeft = nextValidDocid(&left);
          640  +    }
          641  +    if( docidRight>0 && docidRight<=priorLeft ){
          642  +      skipPositionList(&right);
          643  +      docidRight = nextValidDocid(&right);
          644  +    }
          645  +  }
          646  +  while( docidLeft>0 ){
          647  +    docListAddDocid(pOut, docidLeft);
          648  +    skipPositionList(&left);
          649  +    docidLeft = nextValidDocid(&left);
          650  +  }
          651  +  while( docidRight>0 ){
          652  +    docListAddDocid(pOut, docidRight);
          653  +    skipPositionList(&right);
          654  +    docidRight = nextValidDocid(&right);
   535    655     }
   536    656   }
   537    657   
   538    658   /* Duplicate a string; the caller must free() the returned string.
   539    659    * (We don't use strdup() since it's not part of the standard C library and
   540    660    * may not be available everywhere.) */
   541    661   static char *string_dup(const char *s){
................................................................................
  1160   1280         return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
  1161   1281       default:
  1162   1282         assert( 0 );
  1163   1283         return SQLITE_ERROR;  /* not reached */
  1164   1284     }
  1165   1285   }
  1166   1286   
         1287  +/*
         1288  +** Different kinds of allowed document merge operations.
         1289  +*/
         1290  +#define MERGE_AND    1     /* Intersection of left and right */
         1291  +#define MERGE_OR     2     /* Union of left and right */
         1292  +#define MERGE_EXCEPT 3     /* Documents in left but not in right */
         1293  +
  1167   1294   /* Read the posting list for [pTerm]; AND it with the doclist [pIn] to
  1168   1295    * produce the doclist [out], using the given phrase position [iPhrasePos].
  1169   1296    * (*pSelect) is used to hold an SQLite statement used inside this function;
  1170   1297    * the caller should initialize *pSelect to NULL before the first call.
  1171   1298    */
  1172         -static int mergeQuery(fulltext_vtab *v, const char *pTerm, int nTerm,
  1173         -                       DocList *pIn, int iPhrasePos, DocList *out){
         1299  +static int mergeQuery(
         1300  +  fulltext_vtab *v,              /* The full text index virtual table */
         1301  +  const char *pTerm, int nTerm,  /* Term we are searching for */
         1302  +  DocList *pIn,                  /* Prior search results. NULL for first term */
         1303  +  int iPhrasePos,                /* Offset to first term of phrase search */
         1304  +  int eOp,                       /* MERGE_AND, MERGE_OR, or MERGE_EXCEPT */
         1305  +  DocList *out                   /* Write results here */
         1306  +){
  1174   1307     int rc;
  1175         -  DocListMerge merge;
  1176   1308     DocList doclist;
  1177   1309   
  1178   1310     /* If [pIn] is already empty, there's no point in reading the
  1179   1311      * posting list to AND it in; return immediately. */
  1180         -  if( pIn!=NULL && !pIn->nData ) return SQLITE_OK;
         1312  +  if( pIn!=NULL && eOp==MERGE_AND && !pIn->nData ) return SQLITE_OK;
  1181   1313   
  1182   1314     rc = term_select_all(v, pTerm, nTerm, &doclist);
  1183   1315     if( rc!=SQLITE_OK ) return rc;
  1184   1316   
  1185         -  mergeInit(&merge, pIn, iPhrasePos, out);
  1186         -  mergeBlock(&merge, &doclist);
         1317  +  /* If there is no input and the output wants to contain position
         1318  +  ** information, then make the result the doclist for pTerm.
         1319  +  */
         1320  +  if( pIn==0 && out->iType>=DL_POSITIONS ){
         1321  +    docListDestroy(out);
         1322  +    *out = doclist;
         1323  +    return SQLITE_OK;
         1324  +  }
         1325  +
         1326  +  if( eOp==MERGE_AND && pIn!=0 ){
         1327  +    mergeBlockAnd(pIn, &doclist, iPhrasePos, out);
         1328  +  }else if( eOp==MERGE_OR || pIn==0 ){
         1329  +    mergeBlockOr(pIn, &doclist, out);
         1330  +  }else if( eOp==MERGE_EXCEPT ){
         1331  +    mergeBlockExcept(pIn, &doclist, out);
         1332  +  }
  1187   1333     docListDestroy(&doclist);
  1188   1334   
  1189   1335     return SQLITE_OK;
  1190   1336   }
  1191   1337   
         1338  +/* A single term in a query is represented by an instances of
         1339  +** the following structure.
         1340  +*/
  1192   1341   typedef struct QueryTerm {
  1193         -  int isPhrase;    /* true if this term begins a new phrase */
  1194         -  char *pTerm;
  1195         -  int nTerm;
         1342  +  int firstInPhrase; /* true if this term begins a new phrase */
         1343  +  int isOr;          /* this term is preceded by "OR" */
         1344  +  int isNot;         /* this term is preceded by "-" */
         1345  +  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
         1346  +  int nTerm;         /* Number of bytes in pTerm[] */
  1196   1347   } QueryTerm;
  1197   1348   
  1198   1349   /* A parsed query.
  1199   1350    *
  1200         - * As an example, parsing the query ["four score" years "new nation"] will
  1201         - * yield a Query with 5 terms:
  1202         - *   "four",   isPhrase = 1
  1203         - *   "score",  isPhrase = 0
  1204         - *   "years",  isPhrase = 1
  1205         - *   "new",    isPhrase = 1
  1206         - *   "nation", isPhrase = 0
         1351  + * We could, in theory, allow query strings to be complicated
         1352  + * nested expressions with precedence determined by parentheses.
         1353  + * But none of the major search engines do this.  (Perhaps the
         1354  + * feeling is that an parenthesized expression is two complex of
         1355  + * an idea for the average user to grasp.)  Taking our lead from
         1356  + * the major search engines, we will allow queries to be a list
         1357  + * of terms (with an implied AND operator) or phrases in double-quotes,
         1358  + * with a single optional "-" before each non-phrase term to designate
         1359  + * negation and an optional OR connector.
         1360  + *
         1361  + * OR binds more tightly than the implied AND, which is what the
         1362  + * major search engines seem to do.  So, for example:
         1363  + * 
         1364  + *    [one two OR three]     ==>    one AND (two OR three)
         1365  + *    [one OR two three]     ==>    (one OR two) AND three
         1366  + *
         1367  + * A "-" before a term matches all entries that lack that term.
         1368  + * The "-" must occur immediately before the term with in intervening
         1369  + * space.  This is how the search engines do it.
         1370  + *
         1371  + * A NOT term cannot be the right-hand operand of an OR.  If this
         1372  + * occurs in the query string, the NOT is ignored:
         1373  + *
         1374  + *    [one OR -two]          ==>    one OR two
         1375  + *
  1207   1376    */
  1208   1377   typedef struct Query {
  1209         -  int nTerms;
  1210         -  QueryTerm *pTerms;
         1378  +  int nTerms;           /* Number of terms in the query */
         1379  +  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  1211   1380   } Query;
  1212   1381   
  1213         -static void queryAdd(Query *q, int isPhrase, const char *pTerm, int nTerm){
         1382  +/* Add a new term pTerm[0..nTerm-1] to the query *q.  The new term is
         1383  +** the beginning of a phrase is firstInPhrase is true.
         1384  +*/
         1385  +static void queryAdd(Query *q, int firstInPhrase, const char *pTerm, int nTerm){
  1214   1386     QueryTerm *t;
  1215   1387     ++q->nTerms;
  1216   1388     q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
         1389  +  if( q->pTerms==0 ){
         1390  +    q->nTerms = 0;
         1391  +    return;
         1392  +  }
  1217   1393     t = &q->pTerms[q->nTerms - 1];
  1218         -  t->isPhrase = isPhrase;
  1219         -  t->pTerm = malloc(nTerm);
         1394  +  memset(t, 0, sizeof(*t));
         1395  +  t->firstInPhrase = firstInPhrase;
         1396  +  t->pTerm = malloc(nTerm+1);
  1220   1397     memcpy(t->pTerm, pTerm, nTerm);
         1398  +  t->pTerm[nTerm] = 0;
  1221   1399     t->nTerm = nTerm;
  1222   1400   }
  1223         -    
         1401  +
         1402  +/* Free all of the memory that was malloced in order to build *q.
         1403  +*/
  1224   1404   static void queryDestroy(Query *q){
  1225   1405     int i;
  1226   1406     for(i = 0; i < q->nTerms; ++i){
  1227   1407       free(q->pTerms[i].pTerm);
  1228   1408     }
  1229   1409     free(q->pTerms);
  1230   1410   }
  1231   1411   
         1412  +/*
         1413  +** Parse the text at pSegment[0..nSegment-1].  Add additional terms
         1414  +** to the query being assemblied in pQuery.
         1415  +**
         1416  +** inPhrase is true if pSegment[0..nSegement-1] is contained within
         1417  +** double-quotes.  If inPhrase is true, then only the first term
         1418  +** is marked with firstInPhrase and OR and "-" syntax is ignored.
         1419  +** If inPhrase is false, then every term found is marked with
         1420  +** firstInPhrase and OR and "-" syntax is significant.
         1421  +*/
  1232   1422   static int tokenizeSegment(sqlite3_tokenizer *pTokenizer,
  1233   1423                               const char *pSegment, int nSegment, int inPhrase,
  1234   1424                               Query *pQuery){
  1235   1425     const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  1236   1426     sqlite3_tokenizer_cursor *pCursor;
  1237   1427     int is_first = 1;
         1428  +  int isOr = 0;
  1238   1429     
  1239   1430     int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  1240   1431     if( rc!=SQLITE_OK ) return rc;
  1241   1432     pCursor->pTokenizer = pTokenizer;
  1242   1433   
  1243   1434     while( 1 ){
  1244   1435       const char *pToken;
  1245         -    int nToken, iDummyOffset, iDummyPos;
         1436  +    int nToken, iBegin, iEnd, iPos;
  1246   1437   
  1247   1438       rc = pModule->xNext(pCursor,
  1248   1439                           &pToken, &nToken,
  1249         -                        &iDummyOffset, &iDummyOffset,
  1250         -                        &iDummyPos);
         1440  +                        &iBegin, &iEnd, &iPos);
  1251   1441       if( rc!=SQLITE_OK ) break;
         1442  +    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         1443  +         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
         1444  +      isOr = 1;
         1445  +      continue;
         1446  +    }
  1252   1447       queryAdd(pQuery, !inPhrase || is_first, pToken, nToken);
         1448  +    if( !inPhrase ){
         1449  +      if( isOr ){
         1450  +        pQuery->pTerms[pQuery->nTerms-1].isOr = 1;
         1451  +      }else if( iBegin>0 && pSegment[iBegin-1]=='-' ){
         1452  +        pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
         1453  +      }
         1454  +    }
  1253   1455       is_first = 0;
         1456  +    isOr = 0;
  1254   1457     }
  1255   1458   
  1256   1459     return pModule->xClose(pCursor);
  1257   1460   }
  1258   1461   
  1259   1462   /* Parse a query string, yielding a Query object [pQuery], which the caller
  1260   1463    * must free. */
................................................................................
  1284   1487     if(inPhrase) {  /* unmatched quote */
  1285   1488       queryDestroy(pQuery);
  1286   1489       return SQLITE_ERROR;
  1287   1490     }
  1288   1491     return SQLITE_OK;
  1289   1492   }
  1290   1493   
  1291         -/* Perform a full-text query; return a list of documents in [pResult]. */
         1494  +/* Perform a full-text query using the search expression in
         1495  +** pInput[0..nInput-1].  Return a list of matching documents
         1496  +** in pResult.
         1497  +*/
  1292   1498   static int fulltextQuery(fulltext_vtab *v, const char *pInput, int nInput,
  1293   1499                             DocList **pResult){
  1294   1500     Query q;
  1295   1501     int phrase_start = -1;
  1296   1502     int i;
  1297   1503     DocList *d = NULL;
  1298   1504   
  1299   1505     int rc = parseQuery(v, pInput, nInput, &q);
  1300   1506     if( rc!=SQLITE_OK ) return rc;
  1301   1507   
  1302         -  /* Merge terms. */
         1508  +  /* Merge AND terms. */
  1303   1509     for(i = 0 ; i < q.nTerms ; ++i){
         1510  +    int needPositions;
         1511  +    DocList *next;
         1512  +
         1513  +    if( q.pTerms[i].isNot || q.pTerms[i].isOr ){
         1514  +      /* Handle all OR and NOT terms in a separate pass */
         1515  +      continue;
         1516  +    }
         1517  +
  1304   1518       /* In each merge step, we need to generate positions whenever we're
  1305   1519        * processing a phrase which hasn't ended yet. */
  1306         -    int needPositions = i<q.nTerms-1 && !q.pTerms[i+1].isPhrase;
  1307         -    DocList *next = docListNew(needPositions ? DL_POSITIONS : DL_DOCIDS);
  1308         -    if( q.pTerms[i].isPhrase ){
         1520  +    needPositions = i<q.nTerms-1 && !q.pTerms[i+1].firstInPhrase;
         1521  +    next = docListNew(needPositions ? DL_POSITIONS : DL_DOCIDS);
         1522  +    if( q.pTerms[i].firstInPhrase ){
  1309   1523         phrase_start = i;
  1310   1524       }
  1311   1525       rc = mergeQuery(v, q.pTerms[i].pTerm, q.pTerms[i].nTerm,
  1312         -                     d, i-phrase_start, next);
         1526  +                     d, i-phrase_start, MERGE_AND, next);
  1313   1527       if( rc!=SQLITE_OK ) break;
  1314   1528       if( d!=NULL ){
  1315   1529         docListDelete(d);
  1316   1530       }
  1317   1531       d = next;
  1318   1532     }
         1533  +
         1534  +  /* Do the OR terms */
         1535  +  for(i=0; i<q.nTerms; i++){
         1536  +    DocList *next;
         1537  +    if( !q.pTerms[i].isOr ) continue;
         1538  +    next = docListNew(DL_DOCIDS);
         1539  +    rc = mergeQuery(v, q.pTerms[i].pTerm, q.pTerms[i].nTerm,
         1540  +                    d, 0, MERGE_OR, next);
         1541  +    if( d ){
         1542  +      docListDelete(d);
         1543  +    }
         1544  +    d = next;
         1545  +  }
         1546  +
         1547  +  /* Do the EXCEPT terms */
         1548  +  for(i=0; i<q.nTerms; i++){
         1549  +    DocList *next;
         1550  +    if( !q.pTerms[i].isNot ) continue;
         1551  +    next = docListNew(DL_DOCIDS);
         1552  +    rc = mergeQuery(v, q.pTerms[i].pTerm, q.pTerms[i].nTerm,
         1553  +                    d, 0, MERGE_EXCEPT, next);
         1554  +    if( d ){
         1555  +      docListDelete(d);
         1556  +    }
         1557  +    d = next;
         1558  +  }
  1319   1559   
  1320   1560     queryDestroy(&q);
  1321   1561     *pResult = d;
  1322   1562     return rc;
  1323   1563   }
  1324   1564   
  1325   1565   static int fulltextFilter(sqlite3_vtab_cursor *pCursor,