SQLite

Check-in [820634f71e]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implementation of "column:" modifiers in FTS1 queries. (CVS 3411)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 820634f71e3a3499994f82b56b784d22a7e3cdcf
User & Date: drh 2006-09-13 16:02:44.000
Context
2006-09-13
17:17
Minor code cleanup in FTS1. (CVS 3412) (check-in: fca5928167 user: drh tags: trunk)
16:02
Implementation of "column:" modifiers in FTS1 queries. (CVS 3411) (check-in: 820634f71e user: drh tags: trunk)
15:20
Module spec parser enhancements for FTS1. Now able to cope with column names in the spec that are SQL keywords or have special characters, etc. Also added support for additional control lines. Column names can be followed by a type specifier (which is ignored.) (CVS 3410) (check-in: adb780e0dc user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts1/fts1.c.
1952
1953
1954
1955
1956
1957
1958
1959

1960
1961
1962
1963
1964
1965
1966
1967
1968
  }
}

/* A single term in a query is represented by an instances of
** the following structure.
*/
typedef struct QueryTerm {
  int nPhrase;       /* How many following terms are part of the same phrase */

  int isOr;          /* this term is preceded by "OR" */
  int isNot;         /* this term is preceded by "-" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;


/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase







|
>
|
|







1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
  }
}

/* A single term in a query is represented by an instances of
** the following structure.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iColumn; /* Column of the index that must match this term */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;


/* Return a DocList corresponding to the query term *pTerm.  If *pTerm
** is the first term of a phrase query, go ahead and evaluate the phrase
2024
2025
2026
2027
2028
2029
2030

2031
2032
2033

2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053


2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064




















2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084

2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097






2098
2099
2100
2101
2102
2103
2104
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *    [one OR -two]          ==>    one OR two
 *
 */
typedef struct Query {

  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */

} Query;

/* Add a new term pTerm[0..nTerm-1] to the query *q.
*/
static void queryAdd(Query *q, const char *pTerm, int nTerm){
  QueryTerm *t;
  ++q->nTerms;
  q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
  if( q->pTerms==0 ){
    q->nTerms = 0;
    return;
  }
  t = &q->pTerms[q->nTerms - 1];
  memset(t, 0, sizeof(*t));
  t->pTerm = malloc(nTerm+1);
  memcpy(t->pTerm, pTerm, nTerm);
  t->pTerm[nTerm] = 0;
  t->nTerm = nTerm;
  t->isOr = q->nextIsOr;
  q->nextIsOr = 0;


}

/* Free all of the memory that was malloced in order to build *q.
*/
static void queryDestroy(Query *q){
  int i;
  for(i = 0; i < q->nTerms; ++i){
    free(q->pTerms[i].pTerm);
  }
  free(q->pTerms);
}





















/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assemblied in pQuery.
**
** inPhrase is true if pSegment[0..nSegement-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;

  
  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;






    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){







>



>




















>
>











>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>




















>













>
>
>
>
>
>







2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
 * A NOT term cannot be the right-hand operand of an OR.  If this
 * occurs in the query string, the NOT is ignored:
 *
 *    [one OR -two]          ==>    one OR two
 *
 */
typedef struct Query {
  fulltext_vtab *pFts;  /* The full text index */
  int nTerms;           /* Number of terms in the query */
  QueryTerm *pTerms;    /* Array of terms.  Space obtained from malloc() */
  int nextIsOr;         /* Set the isOr flag on the next inserted term */
  int iColumn;          /* Text word parsed must be in this column */
} Query;

/* Add a new term pTerm[0..nTerm-1] to the query *q.
*/
static void queryAdd(Query *q, const char *pTerm, int nTerm){
  QueryTerm *t;
  ++q->nTerms;
  q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
  if( q->pTerms==0 ){
    q->nTerms = 0;
    return;
  }
  t = &q->pTerms[q->nTerms - 1];
  memset(t, 0, sizeof(*t));
  t->pTerm = malloc(nTerm+1);
  memcpy(t->pTerm, pTerm, nTerm);
  t->pTerm[nTerm] = 0;
  t->nTerm = nTerm;
  t->isOr = q->nextIsOr;
  q->nextIsOr = 0;
  t->iColumn = q->iColumn;
  q->iColumn = -1;
}

/* Free all of the memory that was malloced in order to build *q.
*/
static void queryDestroy(Query *q){
  int i;
  for(i = 0; i < q->nTerms; ++i){
    free(q->pTerms[i].pTerm);
  }
  free(q->pTerms);
}

/*
** Check to see if the string zToken[0...nToken-1] matches any
** column name in the virtual table.   If it does,
** return the zero-indexed column number.  If not, return -1.
*/
static int checkColumnSpecifier(
  fulltext_vtab *pVtab,    /* The virtual table */
  const char *zToken,      /* Text of the token */
  int nToken               /* Number of characters in the token */
){
  int i;
  for(i=0; i<pVtab->nColumn; i++){
    if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
        && pVtab->azColumn[i][nToken]==0 ){
      return i;
    }
  }
  return -1;
}

/*
** Parse the text at pSegment[0..nSegment-1].  Add additional terms
** to the query being assemblied in pQuery.
**
** inPhrase is true if pSegment[0..nSegement-1] is contained within
** double-quotes.  If inPhrase is true, then the first term
** is marked with the number of terms in the phrase less one and
** OR and "-" syntax is ignored.  If inPhrase is false, then every
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
*/
static int tokenizeSegment(
  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
  const char *pSegment, int nSegment,     /* Query expression being parsed */
  int inPhrase,                           /* True if within "..." */
  Query *pQuery                           /* Append results here */
){
  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
  sqlite3_tokenizer_cursor *pCursor;
  int firstIndex = pQuery->nTerms;
  int iCol;
  
  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
  if( rc!=SQLITE_OK ) return rc;
  pCursor->pTokenizer = pTokenizer;

  while( 1 ){
    const char *pToken;
    int nToken, iBegin, iEnd, iPos;

    rc = pModule->xNext(pCursor,
                        &pToken, &nToken,
                        &iBegin, &iEnd, &iPos);
    if( rc!=SQLITE_OK ) break;
    if( !inPhrase &&
        pSegment[iEnd]==':' &&
         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
      pQuery->iColumn = iCol;
      continue;
    }
    if( !inPhrase && pQuery->nTerms>0 && nToken==2
         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
2119
2120
2121
2122
2123
2124
2125


2126
2127
2128
2129
2130
2131
2132
                      Query *pQuery){
  int iInput, inPhrase = 0;

  if( nInput<0 ) nInput = strlen(pInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;



  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    for(i=iInput; i<nInput && pInput[i]!='"'; ++i)
      ;
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, pInput+iInput, i-iInput, inPhrase,







>
>







2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
                      Query *pQuery){
  int iInput, inPhrase = 0;

  if( nInput<0 ) nInput = strlen(pInput);
  pQuery->nTerms = 0;
  pQuery->pTerms = NULL;
  pQuery->nextIsOr = 0;
  pQuery->iColumn = -1;
  pQuery->pFts = v;

  for(iInput=0; iInput<nInput; ++iInput){
    int i;
    for(i=iInput; i<nInput && pInput[i]!='"'; ++i)
      ;
    if( i>iInput ){
      tokenizeSegment(v->pTokenizer, pInput+iInput, i-iInput, inPhrase,
2145
2146
2147
2148
2149
2150
2151



2152
2153
2154
2155
2156
2157
2158
2159

2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172


2173
2174
2175
2176
2177
2178
2179
2180
  }
  return SQLITE_OK;
}

/* Perform a full-text query using the search expression in
** pInput[0..nInput-1].  Return a list of matching documents
** in pResult.



*/
static int fulltextQuery(fulltext_vtab *v, int iColumn,
                         const char *pInput, int nInput, DocList **pResult){
  Query q;
  int i, rc;
  DocList *pLeft = NULL;
  DocList *pRight, *pNew;
  int nNot = 0;


  rc = parseQuery(v, pInput, nInput, &q);
  if( rc!=SQLITE_OK ) return rc;

  /* Merge AND terms. */
  for(i = 0 ; i < q.nTerms; i += q.pTerms[i].nPhrase + 1){

    if( q.pTerms[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      continue;
    }



    rc = docListOfTerm(v, iColumn, &q.pTerms[i], &pRight);
    if( rc ){
      queryDestroy(&q);
      return rc;
    }
    if( pLeft==0 ){
      pLeft = pRight;
    }else{







>
>
>








>













>
>
|







2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
  }
  return SQLITE_OK;
}

/* Perform a full-text query using the search expression in
** pInput[0..nInput-1].  Return a list of matching documents
** in pResult.
**
** Queries must match column iColumn.  Or if iColumn>=nColumn
** they are allowed to match against any column.
*/
static int fulltextQuery(fulltext_vtab *v, int iColumn,
                         const char *pInput, int nInput, DocList **pResult){
  Query q;
  int i, rc;
  DocList *pLeft = NULL;
  DocList *pRight, *pNew;
  int nNot = 0;
  int iCol;

  rc = parseQuery(v, pInput, nInput, &q);
  if( rc!=SQLITE_OK ) return rc;

  /* Merge AND terms. */
  for(i = 0 ; i < q.nTerms; i += q.pTerms[i].nPhrase + 1){

    if( q.pTerms[i].isNot ){
      /* Handle all NOT terms in a separate pass */
      nNot++;
      continue;
    }

    iCol = q.pTerms[i].iColumn;
    if( iCol<0 ) iCol = iColumn;
    rc = docListOfTerm(v, iCol, &q.pTerms[i], &pRight);
    if( rc ){
      queryDestroy(&q);
      return rc;
    }
    if( pLeft==0 ){
      pLeft = pRight;
    }else{
2194
2195
2196
2197
2198
2199
2200


2201
2202
2203
2204
2205
2206
2207
2208
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<q.nTerms;  i += q.pTerms[i].nPhrase + 1){
    if( !q.pTerms[i].isNot ) continue;


    rc = docListOfTerm(v, iColumn, &q.pTerms[i], &pRight);
    if( rc ){
      queryDestroy(&q);
      docListDelete(pLeft);
      return rc;
    }
    pNew = docListNew(DL_DOCIDS);
    docListExceptMerge(pLeft, pRight, pNew);







>
>
|







2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
    /* We do not yet know how to handle a query of only NOT terms */
    return SQLITE_ERROR;
  }

  /* Do the EXCEPT terms */
  for(i=0; i<q.nTerms;  i += q.pTerms[i].nPhrase + 1){
    if( !q.pTerms[i].isNot ) continue;
    iCol = q.pTerms[i].iColumn;
    if( iCol<0 ) iCol = iColumn;
    rc = docListOfTerm(v, iCol, &q.pTerms[i], &pRight);
    if( rc ){
      queryDestroy(&q);
      docListDelete(pLeft);
      return rc;
    }
    pNew = docListNew(DL_DOCIDS);
    docListExceptMerge(pLeft, pRight, pNew);
Changes to test/fts1b.test.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 2006 September 13
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS1 module.
#
# $Id: fts1b.test,v 1.2 2006/09/13 15:20:13 drh Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS1 is defined, omit this file.
ifcapable !fts1 {













|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 2006 September 13
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS1 module.
#
# $Id: fts1b.test,v 1.3 2006/09/13 16:02:44 drh Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS1 is defined, omit this file.
ifcapable !fts1 {
84
85
86
87
88
89
90
91























































92
do_test fts1b-2.1 {
  execsql {
    CREATE VIRTUAL TABLE t2 USING fts1(from,to);
    INSERT INTO t2([from],[to]) VALUES ('one two three', 'four five six');
    SELECT [from], [to] FROM t2
  }
} {{one two three} {four five six}}
























































finish_test








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
do_test fts1b-2.1 {
  execsql {
    CREATE VIRTUAL TABLE t2 USING fts1(from,to);
    INSERT INTO t2([from],[to]) VALUES ('one two three', 'four five six');
    SELECT [from], [to] FROM t2
  }
} {{one two three} {four five six}}


# Compute an SQL string that contains the words one, two, three,... to
# describe bits set in the value $i.  Only the lower 5 bits are examined.
#
proc wordset {i} {
  set x {}
  for {set j 0; set k 1} {$j<5} {incr j; incr k $k} {
    if {$k&$i} {lappend x [lindex {one two three four five} $j]}
  }
  return '$x'
}

# Create a new FTS table with three columns:
#
#    norm:      words for the bits of rowid
#    plusone:   words for the bits of rowid+1
#    invert:    words for the bits of ~rowid
#
db eval {
   CREATE VIRTUAL TABLE t4 USING fts1([norm],'plusone',"invert");
}
for {set i 1} {$i<=15} {incr i} {
  set vset [list [wordset $i] [wordset [expr {$i+1}]] [wordset [expr {~$i}]]]
  db eval "INSERT INTO t4(norm,plusone,invert) VALUES([join $vset ,]);"
}

do_test fts1b-4.1 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'norm:one'}
} {1 3 5 7 9 11 13 15}
do_test fts1b-4.2 {
  execsql {SELECT rowid FROM t4 WHERE norm MATCH 'one'}
} {1 3 5 7 9 11 13 15}
do_test fts1b-4.3 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'one'}
} {1 2 3 4 5 6 7 8 9 10 11 12 13 14 15}
do_test fts1b-4.4 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'plusone:one'}
} {2 4 6 8 10 12 14}
do_test fts1b-4.5 {
  execsql {SELECT rowid FROM t4 WHERE plusone MATCH 'one'}
} {2 4 6 8 10 12 14}
do_test fts1b-4.6 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'norm:one plusone:two'}
} {1 5 9 13}
do_test fts1b-4.7 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'norm:one two'}
} {1 3 5 7 9 11 13 15}
do_test fts1b-4.8 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'plusone:two norm:one'}
} {1 5 9 13}
do_test fts1b-4.9 {
  execsql {SELECT rowid FROM t4 WHERE _all MATCH 'two norm:one'}
} {1 3 5 7 9 11 13 15}


finish_test