SQLite

Check-in [7c4c659240]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Enable prefix-search in query-parsing and snippet generation. If the character immediately after the end of a term is '*', that term is marked for prefix matching. Modify term comparison in snippetOffsetsOfColumn() to respect isPrefix. fts2n.test runs prefix searching through some obvious test cases. (CVS 3893)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 7c4c65924035d9f260f6b64eb92c5c6cf6c04b7b
User & Date: shess 2007-05-01 18:25:53.000
Context
2007-05-02
01:34
Begin adding the zeroblob API to support incremental blob i/o. (CVS 3894) (check-in: 7a01836dde user: drh tags: trunk)
2007-05-01
18:25
Enable prefix-search in query-parsing and snippet generation. If the character immediately after the end of a term is '*', that term is marked for prefix matching. Modify term comparison in snippetOffsetsOfColumn() to respect isPrefix. fts2n.test runs prefix searching through some obvious test cases. (CVS 3893) (check-in: 7c4c659240 user: shess tags: trunk)
17:49
First approximation of incremental blob IO API. (CVS 3892) (check-in: c444836e7b user: danielk1977 tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts2/fts2.c.
3052
3053
3054
3055
3056
3057
3058
3059


3060
3061
3062
3063
3064
3065
3066
3067
    iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm!=nToken ) continue;


      if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS2_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);







|
>
>
|







3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
3064
3065
3066
3067
3068
3069
    iRotorBegin[iRotor&FTS2_ROTOR_MASK] = iBegin;
    iRotorLen[iRotor&FTS2_ROTOR_MASK] = iEnd-iBegin;
    match = 0;
    for(i=0; i<nTerm; i++){
      int iCol;
      iCol = aTerm[i].iColumn;
      if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
      if( aTerm[i].nTerm>nToken ) continue;
      if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
      assert( aTerm[i].nTerm<=nToken );
      if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
      if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
      match |= 1<<i;
      if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
        for(j=aTerm[i].iPhrase-1; j>=0; j--){
          int k = (iRotor-j) & FTS2_ROTOR_MASK;
          snippetAppendMatch(pSnippet, iColumn, i-j,
                iRotorBegin[k], iRotorLen[k]);
3496
3497
3498
3499
3500
3501
3502



3503
3504
3505
3506
3507
3508
3509
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }



    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  if( inPhrase && pQuery->nTerms>firstIndex ){







>
>
>







3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
      pQuery->nextIsOr = 1;
      continue;
    }
    queryAdd(pQuery, pToken, nToken);
    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
    }
    if( iEnd<nSegment && pSegment[iEnd]=='*' ){
      pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
    }
    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
    if( inPhrase ){
      nTerm++;
    }
  }

  if( inPhrase && pQuery->nTerms>firstIndex ){
Added test/fts2n.test.








































































































































































































































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# 2007 April 26
#
# The author disclaims copyright to this source code.
#
#*************************************************************************
# This file implements tests for prefix-searching in the fts2
# component of the SQLite library.
#
# $Id: fts2n.test,v 1.1 2007/05/01 18:25:53 shess Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS2 is defined, omit this file.
ifcapable !fts2 {
  finish_test
  return
}

# A large string to prime the pump with.
set text {
  Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Maecenas
  iaculis mollis ipsum. Praesent rhoncus placerat justo. Duis non quam
  sed turpis posuere placerat. Curabitur et lorem in lorem porttitor
  aliquet. Pellentesque bibendum tincidunt diam. Vestibulum blandit
  ante nec elit. In sapien diam, facilisis eget, dictum sed, viverra
  at, felis. Vestibulum magna. Sed magna dolor, vestibulum rhoncus,
  ornare vel, vulputate sit amet, felis. Integer malesuada, tellus at
  luctus gravida, diam nunc porta nibh, nec imperdiet massa metus eu
  lectus. Aliquam nisi. Nunc fringilla nulla at lectus. Suspendisse
  potenti. Cum sociis natoque penatibus et magnis dis parturient
  montes, nascetur ridiculus mus. Pellentesque odio nulla, feugiat eu,
  suscipit nec, consequat quis, risus.
}

db eval {
  CREATE VIRTUAL TABLE t1 USING fts2(c);

  INSERT INTO t1(rowid, c) VALUES(1, $text);
  INSERT INTO t1(rowid, c) VALUES(2, 'Another lovely row');
}

# Exact match
do_test fts2n-1.1 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lorem'"
} {1}

# And a prefix
do_test fts2n-1.2 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lore*'"
} {1}

# Prefix includes exact match
do_test fts2n-1.3 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lorem*'"
} {1}

# Make certain everything isn't considered a prefix!
do_test fts2n-1.4 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lore'"
} {}

# Prefix across multiple rows.
do_test fts2n-1.5 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lo*'"
} {1 2}

# Likewise, with multiple hits in one document.
do_test fts2n-1.6 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'l*'"
} {1 2}

# Prefix which should only hit one document.
do_test fts2n-1.7 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lov*'"
} {2}

# * not at end is dropped.
do_test fts2n-1.8 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH 'lo *'"
} {}

# Stand-alone * is dropped.
do_test fts2n-1.9 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH '*'"
} {}

# Phrase-query prefix.
do_test fts2n-1.10 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH '\"lovely r*\"'"
} {2}
do_test fts2n-1.11 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH '\"lovely r\"'"
} {}

# Phrase query with multiple prefix matches.
do_test fts2n-1.12 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH '\"a* l*\"'"
} {1 2}

# Phrase query with multiple prefix matches.
do_test fts2n-1.13 {
  execsql "SELECT rowid FROM t1 WHERE t1 MATCH '\"a* l* row\"'"
} {2}




# Test across updates (and, by implication, deletes).

# Version of text without "lorem".
regsub -all {[Ll]orem} $text '' ntext

db eval {
  CREATE VIRTUAL TABLE t2 USING fts2(c);

  INSERT INTO t2(rowid, c) VALUES(1, $text);
  INSERT INTO t2(rowid, c) VALUES(2, 'Another lovely row');
  UPDATE t2 SET c = $ntext WHERE rowid = 1;
}

# Can't see lorem as an exact match.
do_test fts2n-2.1 {
  execsql "SELECT rowid FROM t2 WHERE t2 MATCH 'lorem'"
} {}

# Can't see a prefix of lorem, either.
do_test fts2n-2.2 {
  execsql "SELECT rowid FROM t2 WHERE t2 MATCH 'lore*'"
} {}

# Can see lovely in the other document.
do_test fts2n-2.3 {
  execsql "SELECT rowid FROM t2 WHERE t2 MATCH 'lo*'"
} {2}

# Can still see other hits.
do_test fts2n-2.4 {
  execsql "SELECT rowid FROM t2 WHERE t2 MATCH 'l*'"
} {1 2}

# Prefix which should only hit one document.
do_test fts2n-2.5 {
  execsql "SELECT rowid FROM t2 WHERE t2 MATCH 'lov*'"
} {2}



# Test with a segment which will have multiple levels in the tree.

# Build a big document with lots of unique terms.
set bigtext $text
foreach c {a b c d e} {
  regsub -all {[A-Za-z]+} $bigtext "&$c" t
  append bigtext $t
}

# Populate a table with many copies of the big document, so that we
# can test the number of hits found.  Populate $ret with the expected
# hit counts for each row.  offsets() returns 4 elements for every
# hit.  We'll have 6 hits for row 1, 1 for row 2, and 6*(2^5)==192 for
# $bigtext.
set ret {6 1}
db eval {
  BEGIN;
  CREATE VIRTUAL TABLE t3 USING fts2(c);

  INSERT INTO t3(rowid, c) VALUES(1, $text);
  INSERT INTO t3(rowid, c) VALUES(2, 'Another lovely row');
}
for {set i 0} {$i<100} {incr i} {
  db eval {INSERT INTO t3(rowid, c) VALUES(3+$i, $bigtext)}
  lappend ret 192
}
db eval {COMMIT;}

# Test that we get the expected number of hits.
do_test fts2n-3.1 {
  set t {}
  db eval {SELECT offsets(t3) as o FROM t3 WHERE t3 MATCH 'l*'} {
    set l [llength $o]
    lappend t [expr {$l/4}]
  }
  set t
} $ret

# TODO(shess) It would be useful to test a couple edge cases, but I
# don't know if we have the precision to manage it from here at this
# time.  Prefix hits can cross leaves, which the code above _should_
# hit by virtue of size.  There are two variations on this.  If the
# tree is 2 levels high, the code will find the leaf-node extent
# directly, but if it's higher, the code will have to follow two
# separate interior branches down the tree.  Both should be tested.

finish_test