Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
| SHA1 Hash: | e3ae335670ce7c246524171e47bdbda64d361695 |
|---|---|
| Date: | 2010-01-20 18:32:01 |
| User: | dan |
| Comment: | Change the way the fts3.html document is generated to be more similar to most of the other documents. |
Tags And Properties
- branch=trunk inherited from [b2e03e19ab]
- sym-trunk inherited from [b2e03e19ab]
Changes
Changes to main.mk
51 51 52 # This rule generates all documention files from their sources. The 52 # This rule generates all documention files from their sources. The 53 # special markup on HTML files used to identify testable statements and 53 # special markup on HTML files used to identify testable statements and 54 # requirements are retained in the HTML and so the HTML generated by 54 # requirements are retained in the HTML and so the HTML generated by 55 # this rule is not suitable for publication. This is the first step 55 # this rule is not suitable for publication. This is the first step 56 # only. 56 # only. 57 # 57 # 58 base: tclsh sqlite3.h docdir always | 58 base: tclsh sqlite3.h docdir always parsehtml.so 59 rm -rf doc/images 59 rm -rf doc/images 60 cp -r $(DOC)/images doc 60 cp -r $(DOC)/images doc 61 cp $(SRC)/art/*.gif doc/images 61 cp $(SRC)/art/*.gif doc/images 62 mkdir doc/images/syntax 62 mkdir doc/images/syntax 63 cp $(DOC)/art/syntax/*.gif doc/images/syntax 63 cp $(DOC)/art/syntax/*.gif doc/images/syntax 64 cp $(DOC)/rawpages/* doc 64 cp $(DOC)/rawpages/* doc 65 ./tclsh $(DOC)/wrap.tcl $(DOC) $(SRC) doc $(DOC)/pages/*.in 65 ./tclsh $(DOC)/wrap.tcl $(DOC) $(SRC) doc $(DOC)/pages/*.in
Changes to pages/fancyformat.tcl
230 <div style="font-size:1.5em;margin:1em;color:#80a796">Table Of Contents</div 230 <div style="font-size:1.5em;margin:1em;color:#80a796">Table Of Contents</div 231 <div id=toc> 231 <div id=toc> 232 $::TOC 232 $::TOC 233 </div id> 233 </div id> 234 [FixReferences $body] 234 [FixReferences $body] 235 }] 235 }] 236 } 236 } > 237 > 238 > 239 proc addtoc_cb {tag details} { > 240 upvar #0 ::Addtoc G > 241 switch -glob -- $tag { > 242 > 243 "" { ;# Text node. Copy the text to the output. And the TOC, if applicable. > 244 if {$G(inCodeblock)} { > 245 append G(codeblock) $details > 246 } else { > 247 append G(doc) $details > 248 if {$G(inHeading)} { append G(toc) $details } > 249 if {$G(inTitle)} { append G(title) $details } > 250 } > 251 } > 252 > 253 h[1-6] { ;# A heading. > 254 array set D $details > 255 set level [string range $tag 1 end] > 256 > 257 set HN "" > 258 if {![info exists D(notoc)]} { > 259 if {![info exists D(nonumber)]} { set HN [headingnumber $level] } > 260 > 261 # If the heading does not have an 'id' attribute, generate one. > 262 if {[info exists D(id)]==0} { > 263 if {$HN != ""} { > 264 set D(id) "section_[string map {. _} [string range $HN 0 end-1]]" > 265 } else { > 266 set D(id) "notoc[incr G(notoccounter)]" > 267 } > 268 } > 269 > 270 # Append the entry to the table-of-contents. > 271 append G(toc) "<div style=\"margin-left:[expr $level*6]ex\">" > 272 append G(toc) "<a href=\"#$D(id)\">$HN " > 273 set G(inHeading) 1 > 274 } > 275 catch { unset D(nonumber) } > 276 catch { unset D(notoc) } > 277 > 278 # If there is a "tags" attribute, then add an [hd_fragment] command > 279 # to the output. > 280 if {[info exists D(tags)]} { > 281 append G(doc) "<tcl>[list set ::hd(fragment) $D(id)]</tcl>" > 282 foreach t [split $D(tags) ,] { > 283 append G(doc) "<tcl>[list hd_keywords [string trim $t]]</tcl>" > 284 } > 285 unset D(tags) > 286 } > 287 > 288 append G(doc) [formattag $tag [array get D]] > 289 append G(doc) "$HN " > 290 } > 291 > 292 /h[1-6] { ;# End of current heading. > 293 if {$::Addtoc(inHeading)} { > 294 append G(toc) "</a></div>" > 295 } > 296 set G(inHeading) 0 > 297 append G(doc) [formattag $tag $details] > 298 } > 299 > 300 title { > 301 set G(inTitle) 1 > 302 append G(doc) [formattag $tag $details] > 303 } > 304 /title { > 305 set G(inTitle) 0 > 306 append G(doc) [formattag $tag $details] > 307 } > 308 > 309 codeblock { set G(inCodeblock) 1 } > 310 /codeblock { > 311 append G(doc) [Code $G(codeblock)] > 312 set G(codeblock) "" > 313 set G(inCodeblock) 0 > 314 } > 315 > 316 table { > 317 catch {array unset D} > 318 array set D $details > 319 if {[info exists D(striped)]} { > 320 unset D(striped) > 321 set D(style) "margin:1em auto; width:80%; border-spacing:0" > 322 set G(inStripedTable) 1 > 323 } > 324 append G(doc) [formattag $tag [array get D]] > 325 } > 326 /table { > 327 set G(inStripedTable) 0 > 328 append G(doc) [formattag $tag [array get D]] > 329 } > 330 tr { > 331 catch {array unset D} > 332 array set D $details > 333 switch $G(inStripedTable) { > 334 1 { > 335 set D(style) "text-align:left" > 336 set G(inStripedTable) 2 > 337 } > 338 2 { > 339 set D(style) "text-align:left;background-color:#DDDDDD" > 340 set G(inStripedTable) 1 > 341 } > 342 } > 343 append G(doc) [formattag $tag [array get D]] > 344 } > 345 > 346 default { > 347 if {$G(inCodeblock)} { > 348 append G(codeblock) [formattag $tag $details] > 349 } else { > 350 append G(doc) [formattag $tag $details] > 351 } > 352 } > 353 } > 354 } > 355 > 356 proc formattag {tag details} { > 357 set ret "<$tag" > 358 foreach {key value} $details { > 359 append ret " $key=\"$value\"" > 360 } > 361 append ret ">" > 362 set ret > 363 } > 364 > 365 proc headingnumber {level} { > 366 upvar #0 ::Addtoc G > 367 set ret "" > 368 incr G(heading:$level) > 369 for {set i 1} {$i < 6} {incr i} { > 370 if {$i > $level} { > 371 set G(heading:$i) 0 > 372 } else { > 373 append ret "$G(heading:$i)." > 374 } > 375 } > 376 set ret > 377 } > 378 > 379 proc addtoc {zDoc} { > 380 # If the extension with the [parsehtml] command has not been loaded, > 381 # load it now. > 382 # > 383 if {[info commands parsehtml] == ""} { load ./parsehtml.so } > 384 > 385 # These variables are all used to store state between invocations of > 386 # the [parsehtml] callback used to do preprocessing. > 387 # > 388 set ::Addtoc(heading:1) 0 > 389 set ::Addtoc(heading:2) 0 > 390 set ::Addtoc(heading:3) 0 > 391 set ::Addtoc(heading:4) 0 > 392 set ::Addtoc(heading:5) 0 > 393 set ::Addtoc(heading:6) 0 > 394 set ::Addtoc(inHeading) 0 > 395 set ::Addtoc(inTitle) 0 > 396 set ::Addtoc(inCodeblock) 0 > 397 set ::Addtoc(inStripedTable) 0 > 398 set ::Addtoc(notoccounter) 0 > 399 set ::Addtoc(codeblock) "" > 400 > 401 # The following three are set by the [parsehtml] callback. The title, > 402 # table-of-contents and text of the pre-processed document. > 403 # > 404 set ::Addtoc(title) "" > 405 set ::Addtoc(toc) "" > 406 set ::Addtoc(doc) "" > 407 > 408 parsehtml $zDoc addtoc_cb > 409 > 410 # Variable $toc is set to the HTML text for the table of contents. The > 411 # text "<table_of_contents>" in the input file will be replaced by > 412 # this text. The "<div class=startsearch>" tag tells the script that > 413 # builds the site-search database not to index any text that occurs > 414 # before it. This stops the table of contents from being used for > 415 # snippets on search results pages. > 416 # > 417 set toc [subst { > 418 <div class=fancy> > 419 <div style="font-size:2em;text-align:center;color:#80a796"> > 420 $::Addtoc(title) > 421 </div> > 422 <div style="font-size:1.5em;margin:1em;color:#80a796"> > 423 Table Of Contents</div> > 424 <div id=toc> $::Addtoc(toc) </div> > 425 <div class=startsearch></div> > 426 }] > 427 > 428 string map [list <table_of_contents> $toc] $::Addtoc(doc) > 429 } > 430
Changes to pages/fts3.in
> 1 > 2 <tcl>hd_keywords *fts3 FTS3</tcl> 1 <title>SQLite FTS3 Extension</title> 3 <title>SQLite FTS3 Extension</title> 2 <tcl> < 3 4 4 hd_keywords *fts3 FTS3 | 5 <table_of_contents> 5 source [file join $::DOC pages fancyformat.tcl] < 6 fancyformat_document "SQLite FTS3 Extension" {} { < 7 6 8 <h2 style="margin-left:1.0em"> Overview</h2> | 7 <h2 style="margin-left:1.0em" notoc> Overview</h2> 9 8 10 <p> 9 <p> 11 FTS3 is an SQLite virtual table module that allows users to perform 10 FTS3 is an SQLite virtual table module that allows users to perform 12 full-text searches on a set of documents. The most common (and effective) 11 full-text searches on a set of documents. The most common (and effective) 13 way to describe full-text searches is "what Google, Yahoo and Altavista do 12 way to describe full-text searches is "what Google, Yahoo and Altavista do 14 with documents placed on the World Wide Web". Users input a term, or series 13 with documents placed on the World Wide Web". Users input a term, or series 15 of terms, perhaps connected by a binary operator or grouped together into a 14 of terms, perhaps connected by a binary operator or grouped together into a ................................................................................................................................................................................ 18 specified. This document describes the deployment and usage of FTS3. 17 specified. This document describes the deployment and usage of FTS3. 19 18 20 <p> 19 <p> 21 Portions of the original FTS3 code were contributed to the SQLite project 20 Portions of the original FTS3 code were contributed to the SQLite project 22 by Scott Hess of <a href="http://www.google.com">Google</a>. It is now 21 by Scott Hess of <a href="http://www.google.com">Google</a>. It is now 23 developed and maintained as part of SQLite. 22 developed and maintained as part of SQLite. 24 23 25 [h1 "Introduction to FTS3"] | 24 <h1>Introduction to FTS3</h1> 26 25 27 <p> 26 <p> 28 The FTS3 extension module allows users to create special tables with a 27 The FTS3 extension module allows users to create special tables with a 29 built-in full-text index (hereafter "FTS3 tables"). The full-text index 28 built-in full-text index (hereafter "FTS3 tables"). The full-text index 30 allows the user to efficiently query the database for all rows that contain 29 allows the user to efficiently query the database for all rows that contain 31 one or more instances specified word (hereafter a "token", even if the table 30 one or more instances specified word (hereafter a "token", even if the table 32 contains many large documents. 31 contains many large documents. ................................................................................................................................................................................ 33 32 34 <p> 33 <p> 35 For example, if each of the 517430 documents in the 34 For example, if each of the 517430 documents in the 36 "<a href="http://www.cs.cmu.edu/~enron/">Enron E-Mail Dataset</a>" 35 "<a href="http://www.cs.cmu.edu/~enron/">Enron E-Mail Dataset</a>" 37 is inserted into both the FTS3 table and the ordinary SQLite table 36 is inserted into both the FTS3 table and the ordinary SQLite table 38 created using the following SQL script: 37 created using the following SQL script: 39 38 40 [Code { | 39 <codeblock> 41 CREATE VIRTUAL TABLE enrondata1 USING fts3(content TEXT); /* FTS3 table */ 40 CREATE VIRTUAL TABLE enrondata1 USING fts3(content TEXT); /* FTS3 table */ 42 CREATE TABLE enrondata2(content TEXT); /* Ordinary tabl 41 CREATE TABLE enrondata2(content TEXT); /* Ordinary tabl 43 }] < > 42 </codeblock> 44 43 45 <p> 44 <p> 46 Then either of the two queries below may be executed to find the number of 45 Then either of the two queries below may be executed to find the number of 47 documents in the database that contain the word "linux" (351). Using one 46 documents in the database that contain the word "linux" (351). Using one 48 desktop PC hardware configuration, the query on the FTS3 table returns in 47 desktop PC hardware configuration, the query on the FTS3 table returns in 49 approximately 0.03 seconds, versus 22.5 for querying the ordinary table. 48 approximately 0.03 seconds, versus 22.5 for querying the ordinary table. 50 49 51 [Code { | 50 <codeblock> 52 SELECT count(*) FROM enrondata1 WHERE content MATCH 'linux'; /* 0.03 seconds 51 SELECT count(*) FROM enrondata1 WHERE content MATCH 'linux'; /* 0.03 seconds 53 SELECT count(*) FROM enrondata2 WHERE content LIKE '%linux%'; /* 22.5 seconds 52 SELECT count(*) FROM enrondata2 WHERE content LIKE '%linux%'; /* 22.5 seconds 54 }] < > 53 </codeblock> 55 54 56 <p> 55 <p> 57 Of course, the two queries above are not entirely equivalent. For example 56 Of course, the two queries above are not entirely equivalent. For example 58 the LIKE query matches rows that contain terms such as "linuxophobe" 57 the LIKE query matches rows that contain terms such as "linuxophobe" 59 or "EnterpriseLinux" (as it happens, the Enron E-Mail Dataset does not 58 or "EnterpriseLinux" (as it happens, the Enron E-Mail Dataset does not 60 actually contain any such terms), whereas the MATCH query on the FTS3 table 59 actually contain any such terms), whereas the MATCH query on the FTS3 table 61 selects only those rows that contain "linux" as a discrete token. Both 60 selects only those rows that contain "linux" as a discrete token. Both 62 searches are case-insensitive. The FTS3 table consumes around 2006 MB on 61 searches are case-insensitive. The FTS3 table consumes around 2006 MB on 63 disk compared to just 1453 MB for the ordinary table. Using the same 62 disk compared to just 1453 MB for the ordinary table. Using the same 64 hardware configuration used to perform the SELECT queries above, the FTS3 63 hardware configuration used to perform the SELECT queries above, the FTS3 65 table took just under 31 minutes to populate, versus 25 for the ordinary 64 table took just under 31 minutes to populate, versus 25 for the ordinary 66 table. 65 table. 67 66 68 [h2 "Creating and Destroying FTS3 Tables"] | 67 <h2>Creating and Destroying FTS3 Tables</h2> 69 68 70 <p> 69 <p> 71 Like other virtual table types, new FTS3 tables are created using a 70 Like other virtual table types, new FTS3 tables are created using a 72 \[CREATE VIRTUAL TABLE\] statement. The module name, which follows | 71 [CREATE VIRTUAL TABLE] statement. The module name, which follows 73 the USING keyword, is "fts3". The virtual table module arguments may 72 the USING keyword, is "fts3". The virtual table module arguments may 74 be left empty, in which case an FTS3 table with a single user-defined 73 be left empty, in which case an FTS3 table with a single user-defined 75 column named "content" is created. Alternatively, the module arguments 74 column named "content" is created. Alternatively, the module arguments 76 may be passed a list of comma separated column names. 75 may be passed a list of comma separated column names. 77 76 78 <p> 77 <p> 79 If column names are explicitly provided for the FTS3 table as part of 78 If column names are explicitly provided for the FTS3 table as part of ................................................................................................................................................................................ 80 the CREATE VIRTUAL TABLE statement, then a datatype name may be optionally 79 the CREATE VIRTUAL TABLE statement, then a datatype name may be optionally 81 specified for each column. However, this is pure syntactic sugar, the 80 specified for each column. However, this is pure syntactic sugar, the 82 supplied typenames are not used by FTS3 or the SQLite core for any 81 supplied typenames are not used by FTS3 or the SQLite core for any 83 purpose. The same applies to any constraints specified along with an 82 purpose. The same applies to any constraints specified along with an 84 FTS3 column name - they are parsed but not used or recorded by the system 83 FTS3 column name - they are parsed but not used or recorded by the system 85 in any way. 84 in any way. 86 85 87 [Code { | 86 <codeblock> 88 <i>-- Create an FTS3 table named "data" with one column - "content":</i> 87 <i>-- Create an FTS3 table named "data" with one column - "content":</i> 89 CREATE VIRTUAL TABLE data USING fts3(); 88 CREATE VIRTUAL TABLE data USING fts3(); 90 89 91 <i>-- Create an FTS3 table named "pages" with three columns:</i> 90 <i>-- Create an FTS3 table named "pages" with three columns:</i> 92 CREATE VIRTUAL TABLE pages USING fts3(title, keywords, body); 91 CREATE VIRTUAL TABLE pages USING fts3(title, keywords, body); 93 92 94 <i>-- Create an FTS3 table named "mail" with two columns. Datatypes 93 <i>-- Create an FTS3 table named "mail" with two columns. Datatypes 95 -- and column constraints are specified along with each column. These 94 -- and column constraints are specified along with each column. These 96 -- are completely ignored by FTS3 and SQLite. </i> 95 -- are completely ignored by FTS3 and SQLite. </i> 97 CREATE VIRTUAL TABLE mail USING fts3( 96 CREATE VIRTUAL TABLE mail USING fts3( 98 subject VARCHAR(256) NOT NULL, 97 subject VARCHAR(256) NOT NULL, 99 body TEXT CHECK(length(body)<10240) | 98 body TEXT CHECK(length(body)<10240) 100 ); 99 ); 101 }] < > 100 </codeblock> 102 101 103 <p> 102 <p> 104 As well as a list of columns, the module arguments passed to a CREATE 103 As well as a list of columns, the module arguments passed to a CREATE 105 VIRTUAL TABLE statement used to create an FTS3 table may be used to specify 104 VIRTUAL TABLE statement used to create an FTS3 table may be used to specify 106 a \[tokenizer\]. This is done by specifying a string of the form | 105 a [tokenizer]. This is done by specifying a string of the form 107 "tokenize=<tokenizer name> <tokenizer args>" in place of a column 106 "tokenize=<tokenizer name> <tokenizer args>" in place of a column 108 name, where <tokenizer name> is the name of the tokenizer to use and 107 name, where <tokenizer name> is the name of the tokenizer to use and 109 <tokenizer args> is an optional list of whitespace separated qualifiers 108 <tokenizer args> is an optional list of whitespace separated qualifiers 110 to pass to the tokenizer implementation. A tokenizer specification may be 109 to pass to the tokenizer implementation. A tokenizer specification may be 111 placed anywhere in the column list, but at most one tokenizer declaration is 110 placed anywhere in the column list, but at most one tokenizer declaration is 112 allowed for each CREATE VIRTUAL TABLE statement. The second and subsequent 111 allowed for each CREATE VIRTUAL TABLE statement. The second and subsequent 113 tokenizer declaration are interpreted as column names. 112 tokenizer declaration are interpreted as column names. 114 \[tokenizer|See below\] for a detailed description of using (and, if | 113 [tokenizer|See below] for a detailed description of using (and, if 115 necessary, implementing) a tokenizer. 114 necessary, implementing) a tokenizer. 116 115 117 [Code { | 116 <codeblock> 118 <i>-- Create an FTS3 table named "papers" with two columns that uses</i> 117 <i>-- Create an FTS3 table named "papers" with two columns that uses</i> 119 <i>-- the tokenizer "porter".</i> 118 <i>-- the tokenizer "porter".</i> 120 CREATE VIRTUAL TABLE papers USING fts3(author, document, tokenize=porter); 119 CREATE VIRTUAL TABLE papers USING fts3(author, document, tokenize=porter); 121 120 122 <i>-- Create an FTS3 table with a single column - "content" - that uses</i> 121 <i>-- Create an FTS3 table with a single column - "content" - that uses</i> 123 <i>-- the "simple" tokenizer.</i> 122 <i>-- the "simple" tokenizer.</i> 124 CREATE VIRTUAL TABLE data USING fts3(tokenize=simple); 123 CREATE VIRTUAL TABLE data USING fts3(tokenize=simple); 125 124 126 <i>-- Create an FTS3 table with two columns that uses the "icu" tokenizer.</i> 125 <i>-- Create an FTS3 table with two columns that uses the "icu" tokenizer.</i> 127 <i>-- The qualifier "en_AU" is passed to the tokenizer implementation</i> 126 <i>-- The qualifier "en_AU" is passed to the tokenizer implementation</i> 128 CREATE VIRTUAL TABLE names USING fts3(a, b, tokenize=icu en_AU); 127 CREATE VIRTUAL TABLE names USING fts3(a, b, tokenize=icu en_AU); 129 }] < > 128 </codeblock> 130 129 131 <p> 130 <p> 132 FTS3 tables may be dropped from the database using an ordinary \[DROP TABLE\] | 131 FTS3 tables may be dropped from the database using an ordinary [DROP TABLE] 133 statement. For example: 132 statement. For example: 134 133 135 [Code { | 134 <codeblock> 136 <i>-- Create, then immediately drop, an FTS3 table.</i> 135 <i>-- Create, then immediately drop, an FTS3 table.</i> 137 CREATE VIRTUAL TABLE data USING fts3(); 136 CREATE VIRTUAL TABLE data USING fts3(); 138 DROP TABLE data; 137 DROP TABLE data; 139 }] < > 138 </codeblock> 140 139 141 [h2 "Populating FTS3 Tables"] | 140 <h2>Populating FTS3 Tables</h2> 142 141 143 <p> 142 <p> 144 FTS3 tables are populated using \[INSERT\], \[UPDATE\] and \[DELETE\] | 143 FTS3 tables are populated using [INSERT], [UPDATE] and [DELETE] 145 statements in the same way as ordinary SQLite tables are. 144 statements in the same way as ordinary SQLite tables are. 146 145 147 <p> 146 <p> 148 As well as the columns named by the user (or the "content" column if no 147 As well as the columns named by the user (or the "content" column if no 149 module arguments where specified as part of the \[CREATE VIRTUAL TABLE\] | 148 module arguments where specified as part of the [CREATE VIRTUAL TABLE] 150 statement), each FTS3 table has a "rowid" column. The rowid of an FTS3 149 statement), each FTS3 table has a "rowid" column. The rowid of an FTS3 151 table behaves in the same way as the rowid column of an ordinary SQLite 150 table behaves in the same way as the rowid column of an ordinary SQLite 152 table, except that the values stored in the rowid column of an FTS3 table 151 table, except that the values stored in the rowid column of an FTS3 table 153 remain unchanged if the database is rebuilt using the \[VACUUM\] command. | 152 remain unchanged if the database is rebuilt using the [VACUUM] command. 154 For FTS3 tables, "docid" is allowed as an alias along with the usual "rowid" 153 For FTS3 tables, "docid" is allowed as an alias along with the usual "rowid" 155 "oid" and "_oid_" identifiers. Attempting to insert or update a row with a 154 "oid" and "_oid_" identifiers. Attempting to insert or update a row with a 156 docid value that already exists in the table is an error, just as it would 155 docid value that already exists in the table is an error, just as it would 157 be with an ordinary SQLite table. 156 be with an ordinary SQLite table. 158 157 159 <p> 158 <p> 160 There is one other subtle difference between "docid" and the normal SQLite 159 There is one other subtle difference between "docid" and the normal SQLite ................................................................................................................................................................................ 161 aliases for the rowid column. Normally, if an INSERT or UPDATE statement 160 aliases for the rowid column. Normally, if an INSERT or UPDATE statement 162 assigns discreet values to two or more aliases of the rowid column, SQLite 161 assigns discreet values to two or more aliases of the rowid column, SQLite 163 writes the rightmost of such values specified in the INSERT or UPDATE 162 writes the rightmost of such values specified in the INSERT or UPDATE 164 statement to the database. However, assigning a non-NULL value to both 163 statement to the database. However, assigning a non-NULL value to both 165 the "docid" and one or more of the SQLite rowid aliases when inserting or 164 the "docid" and one or more of the SQLite rowid aliases when inserting or 166 updating an FTS3 table is considered an error. See below for an example. 165 updating an FTS3 table is considered an error. See below for an example. 167 166 168 [Code { | 167 <codeblock> 169 <i>-- Create an FTS3 table</i> 168 <i>-- Create an FTS3 table</i> 170 CREATE VIRTUAL TABLE pages USING fts3(title, body); 169 CREATE VIRTUAL TABLE pages USING fts3(title, body); 171 170 172 <i>-- Insert a row with a specific docid value.</i> 171 <i>-- Insert a row with a specific docid value.</i> 173 INSERT INTO pages(docid, title, body) VALUES(53, 'Home Page', 'SQLite is a sof 172 INSERT INTO pages(docid, title, body) VALUES(53, 'Home Page', 'SQLite is a sof 174 173 175 <i>-- Insert a row and allow FTS3 to assign a docid value using the same algor 174 <i>-- Insert a row and allow FTS3 to assign a docid value using the same algor ................................................................................................................................................................................ 182 181 183 <i>-- Delete the entire table contents.</i> 182 <i>-- Delete the entire table contents.</i> 184 DELETE FROM pages; 183 DELETE FROM pages; 185 184 186 <i>-- The following is an error. It is not possible to assign non-NULL values 185 <i>-- The following is an error. It is not possible to assign non-NULL values 187 <i>-- the rowid and docid columns of an FTS3 table.</i> 186 <i>-- the rowid and docid columns of an FTS3 table.</i> 188 INSERT INTO pages(rowid, docid, title, body) VALUES(1, 2, 'A title', 'A docume 187 INSERT INTO pages(rowid, docid, title, body) VALUES(1, 2, 'A title', 'A docume 189 }] < > 188 </codeblock> 190 189 191 <p> 190 <p> 192 To support full-text queries, FTS3 maintains an inverted index that maps 191 To support full-text queries, FTS3 maintains an inverted index that maps 193 from each unique term or word that appears in the dataset to the locations 192 from each unique term or word that appears in the dataset to the locations 194 in which it appears within the table contents. For the curious, a 193 in which it appears within the table contents. For the curious, a 195 complete description of the \[segment btree|data structure\] used to store | 194 complete description of the [segment btree|data structure] used to store 196 this index within the database file is described below. A feature of 195 this index within the database file is described below. A feature of 197 this data structure is that at any time the database may contain not 196 this data structure is that at any time the database may contain not 198 one index b-tree, but several different b-trees that are incrementally 197 one index b-tree, but several different b-trees that are incrementally 199 merged as rows are inserted, updated and deleted. This technique improves 198 merged as rows are inserted, updated and deleted. This technique improves 200 performance when writing to an FTS3 table, but causes some overhead for 199 performance when writing to an FTS3 table, but causes some overhead for 201 full-text queries that use the index. Executing an SQL statement of the 200 full-text queries that use the index. Executing an SQL statement of the 202 form "INSERT INTO <fts3-table>(<fts3-table>) VALUES('optimize')" 201 form "INSERT INTO <fts3-table>(<fts3-table>) VALUES('optimize')" ................................................................................................................................................................................ 204 b-tree containing the entire index. This can be an expensive operation, 203 b-tree containing the entire index. This can be an expensive operation, 205 but may speed up future queries. 204 but may speed up future queries. 206 205 207 <p> 206 <p> 208 For example, to optimize the full-text index for an FTS3 table named 207 For example, to optimize the full-text index for an FTS3 table named 209 "docs": 208 "docs": 210 209 211 [Code { | 210 <codeblock> 212 <i>-- Optimize the internal structure of FTS3 table "docs".</i> 211 <i>-- Optimize the internal structure of FTS3 table "docs".</i> 213 INSERT INTO docs(docs) VALUES('optimize'); 212 INSERT INTO docs(docs) VALUES('optimize'); 214 }] < > 213 </codeblock> 215 214 216 <p> 215 <p> 217 The statement above may appear syntacticly incorrect to some. Refer to 216 The statement above may appear syntacticly incorrect to some. Refer to 218 the section describing the \[simple fts3 queries\] for an explanation. | 217 the section describing the [simple fts3 queries] for an explanation. 219 218 220 <p> 219 <p> 221 There is another, deprecated, method for invoking the optimize 220 There is another, deprecated, method for invoking the optimize 222 operation using a SELECT statement. New code should use statements 221 operation using a SELECT statement. New code should use statements 223 similar to the INSERT above to optimize FTS3 structures. 222 similar to the INSERT above to optimize FTS3 structures. 224 223 225 [h2 "Querying FTS3 Tables" {} {simple fts3 queries}] | 224 <h2 tags="simple fts3 queries">Simple FTS3 Queries</h2> 226 225 227 <p> 226 <p> 228 As for all other SQLite tables, virtual or otherwise, data is retrieved 227 As for all other SQLite tables, virtual or otherwise, data is retrieved 229 from FTS3 tables using a \[SELECT\] statement. | 228 from FTS3 tables using a [SELECT] statement. 230 229 231 <p> 230 <p> 232 FTS3 tables can be queried efficiently using SELECT statements of two 231 FTS3 tables can be queried efficiently using SELECT statements of two 233 different forms: 232 different forms: 234 233 235 <ul> 234 <ul> 236 <li><p> 235 <li><p> 237 <b>Query by rowid</b>. If the WHERE clause of the SELECT statement 236 <b>Query by rowid</b>. If the WHERE clause of the SELECT statement 238 contains a sub-clause of the form "rowid = ?", where ? is an SQL expression, 237 contains a sub-clause of the form "rowid = ?", where ? is an SQL expression, 239 FTS3 is able to retreive the requested row directly using the equivalent 238 FTS3 is able to retreive the requested row directly using the equivalent 240 of an SQLite \[INTEGER PRIMARY KEY\] index. | 239 of an SQLite [INTEGER PRIMARY KEY] index. 241 240 242 <li><p> 241 <li><p> 243 <b>Full-text query</b>. If the WHERE clause of the SELECT statement contains 242 <b>Full-text query</b>. If the WHERE clause of the SELECT statement contains 244 a sub-clause of the form "<column> MATCH ?", FTS3 is able to use 243 a sub-clause of the form "<column> MATCH ?", FTS3 is able to use 245 the built-in full-text index to restrict the search to those documents 244 the built-in full-text index to restrict the search to those documents 246 that match the full-text query string specified as the right-hand operand 245 that match the full-text query string specified as the right-hand operand 247 of the MATCH clause. 246 of the MATCH clause. ................................................................................................................................................................................ 250 <p> 249 <p> 251 If neither of the two query strategies enumerated above can be used, all 250 If neither of the two query strategies enumerated above can be used, all 252 queries on FTS3 tables are implemented using a linear scan of the entire 251 queries on FTS3 tables are implemented using a linear scan of the entire 253 table. If the table contains large amounts of data, this may be an 252 table. If the table contains large amounts of data, this may be an 254 impractically approach (the first example on this page shows that a linear 253 impractically approach (the first example on this page shows that a linear 255 scan of 1.5 GB of data takes around 30 seconds using a modern PC). 254 scan of 1.5 GB of data takes around 30 seconds using a modern PC). 256 255 257 [Code { | 256 <codeblock> 258 <i>-- The examples in this block assume the following FTS3 table:</i> 257 <i>-- The examples in this block assume the following FTS3 table:</i> 259 CREATE VIRTUAL TABLE mail USING fts3(subject, body); 258 CREATE VIRTUAL TABLE mail USING fts3(subject, body); 260 259 261 SELECT * FROM mail WHERE rowid = 15; <i>-- Fast. Rowid lookup.< 260 SELECT * FROM mail WHERE rowid = 15; <i>-- Fast. Rowid lookup.< 262 SELECT * FROM mail WHERE body MATCH 'sqlite'; <i>-- Fast. Full-text quer 261 SELECT * FROM mail WHERE body MATCH 'sqlite'; <i>-- Fast. Full-text quer 263 SELECT * FROM mail WHERE mail MATCH 'search'; <i>-- Fast. Full-text quer 262 SELECT * FROM mail WHERE mail MATCH 'search'; <i>-- Fast. Full-text quer 264 SELECT * FROM mail WHERE rowid BETWEEN 15 AND 20; <i>-- Slow. Linear scan.</ 263 SELECT * FROM mail WHERE rowid BETWEEN 15 AND 20; <i>-- Slow. Linear scan.</ 265 SELECT * FROM mail WHERE subject = 'database'; <i>-- Slow. Linear scan.</ 264 SELECT * FROM mail WHERE subject = 'database'; <i>-- Slow. Linear scan.</ 266 SELECT * FROM mail WHERE subject MATCH 'database'; <i>-- Fast. Full-text quer 265 SELECT * FROM mail WHERE subject MATCH 'database'; <i>-- Fast. Full-text quer 267 }] < > 266 </codeblock> 268 267 269 <p> 268 <p> 270 In all of the full-text queries above, the right-hand operand of the MATCH 269 In all of the full-text queries above, the right-hand operand of the MATCH 271 operator is a string consisting of a single term. In this case, the MATCH 270 operator is a string consisting of a single term. In this case, the MATCH 272 expression evaluates to true for all documents that contain one or more 271 expression evaluates to true for all documents that contain one or more 273 instances of the specified word ("sqlite", "search" or "database", depending 272 instances of the specified word ("sqlite", "search" or "database", depending 274 on which example you look at). Specifying a single term as the right-hand 273 on which example you look at). Specifying a single term as the right-hand 275 operand of the MATCH operator results in the simplest (and most common) type 274 operand of the MATCH operator results in the simplest (and most common) type 276 of full-text query possible. However more complicated queries are possible, 275 of full-text query possible. However more complicated queries are possible, 277 including phrase searches, term-prefix searches and searches for documents 276 including phrase searches, term-prefix searches and searches for documents 278 containing combinations of terms occuring within a defined proximity of each 277 containing combinations of terms occuring within a defined proximity of each 279 other. The various ways in which the full-text index may be queried are 278 other. The various ways in which the full-text index may be queried are 280 \[FTS3 MATCH|described below\]. | 279 [FTS3 MATCH|described below]. 281 280 282 <p> 281 <p> 283 Normally, full-text queries are case-insensitive. However, this is 282 Normally, full-text queries are case-insensitive. However, this is 284 is dependent on the specific \[tokenizer\] used by the FTS3 table | 283 is dependent on the specific [tokenizer] used by the FTS3 table 285 being queried. Refer to the section on \[tokenizer|tokenizers\] for details. | 284 being queried. Refer to the section on [tokenizer|tokenizers] for details. 286 285 287 <p> 286 <p> 288 The paragraph above notes that a MATCH operator with a simple term as the 287 The paragraph above notes that a MATCH operator with a simple term as the 289 right-hand operand evaluates to true for all documents that contain the 288 right-hand operand evaluates to true for all documents that contain the 290 specified term. In this context, the "document" may refer to either the 289 specified term. In this context, the "document" may refer to either the 291 data stored in a single column of a row of an FTS3 table, or to the contents 290 data stored in a single column of a row of an FTS3 table, or to the contents 292 of all columns in a single row, depending on the identifier used as the 291 of all columns in a single row, depending on the identifier used as the ................................................................................................................................................................................ 294 the left-hand operand of the MATCH operator is an FTS3 table column name, 293 the left-hand operand of the MATCH operator is an FTS3 table column name, 295 then the document that the search term must be contained in is the value 294 then the document that the search term must be contained in is the value 296 stored in the specified column. However, if the identifier is the name 295 stored in the specified column. However, if the identifier is the name 297 of the FTS3 <i>table</i> itself, then the MATCH operator evaluates to true 296 of the FTS3 <i>table</i> itself, then the MATCH operator evaluates to true 298 for each row of the FTS3 table for which any column contains the search 297 for each row of the FTS3 table for which any column contains the search 299 term. The following example demonstrates this: 298 term. The following example demonstrates this: 300 299 301 [Code { | 300 <codeblock> 302 <i>-- Example schema</i> 301 <i>-- Example schema</i> 303 CREATE VIRTUAL TABLE mail USING fts3(subject, body); 302 CREATE VIRTUAL TABLE mail USING fts3(subject, body); 304 303 305 <i>-- Example table population</i> 304 <i>-- Example table population</i> 306 INSERT INTO mail(docid, subject, body) VALUES(1, 'software feedback', 'found i 305 INSERT INTO mail(docid, subject, body) VALUES(1, 'software feedback', 'found i 307 INSERT INTO mail(docid, subject, body) VALUES(2, 'software feedback', 'no feed 306 INSERT INTO mail(docid, subject, body) VALUES(2, 'software feedback', 'no feed 308 INSERT INTO mail(docid, subject, body) VALUES(3, 'slow lunch order', 'was a s 307 INSERT INTO mail(docid, subject, body) VALUES(3, 'slow lunch order', 'was a s 309 308 310 <i>-- Example queries</i> 309 <i>-- Example queries</i> 311 SELECT * FROM mail WHERE subject MATCH 'software'; <i>-- Selects rows 1 and 310 SELECT * FROM mail WHERE subject MATCH 'software'; <i>-- Selects rows 1 and 312 SELECT * FROM mail WHERE body MATCH 'feedback'; <i>-- Selects row 2</i> 311 SELECT * FROM mail WHERE body MATCH 'feedback'; <i>-- Selects row 2</i> 313 SELECT * FROM mail WHERE mail MATCH 'software'; <i>-- Selects rows 1, 2 312 SELECT * FROM mail WHERE mail MATCH 'software'; <i>-- Selects rows 1, 2 314 SELECT * FROM mail WHERE mail MATCH 'slow'; <i>-- Selects rows 1 and 313 SELECT * FROM mail WHERE mail MATCH 'slow'; <i>-- Selects rows 1 and 315 }] < > 314 </codeblock> 316 315 317 <p> 316 <p> 318 At first glance, the final two full-text queries in the example above seem 317 At first glance, the final two full-text queries in the example above seem 319 to be syntacticly incorrect, as there is a table name ("mail") used as 318 to be syntacticly incorrect, as there is a table name ("mail") used as 320 an SQL expression. The reason this is acceptable is that each FTS3 table 319 an SQL expression. The reason this is acceptable is that each FTS3 table 321 actually has a \[sqlite3_declare_vtab|HIDDEN\] column with the same name | 320 actually has a [sqlite3_declare_vtab|HIDDEN] column with the same name 322 as the table itself (in this case, "mail"). The value stored in this 321 as the table itself (in this case, "mail"). The value stored in this 323 column is not meaningful to the application, but can be used as the 322 column is not meaningful to the application, but can be used as the 324 left-hand operand to a MATCH operator. This special column may also be 323 left-hand operand to a MATCH operator. This special column may also be 325 passed as an argument to the \[snippet()|FTS3 auxillary functions\]. | 324 passed as an argument to the [snippet()|FTS3 auxillary functions]. 326 325 327 <p> 326 <p> 328 The following example illustrates the above. The expressions "docs", 327 The following example illustrates the above. The expressions "docs", 329 "docs.docs" and "main.docs.docs" all refer to column "docs". However, the 328 "docs.docs" and "main.docs.docs" all refer to column "docs". However, the 330 expression "main.docs" does not refer to any column. It could be used to 329 expression "main.docs" does not refer to any column. It could be used to 331 refer to a table, but a table name is not allowed in the context in which 330 refer to a table, but a table name is not allowed in the context in which 332 it is used below. 331 it is used below. 333 332 334 [Code { | 333 <codeblock> 335 <i>-- Example schema</i> 334 <i>-- Example schema</i> 336 CREATE VIRTUAL TABLE docs USING fts3(content); 335 CREATE VIRTUAL TABLE docs USING fts3(content); 337 336 338 <i>-- Example queries</i> 337 <i>-- Example queries</i> 339 SELECT * FROM docs WHERE docs MATCH 'sqlite'; <i>-- OK.</i> 338 SELECT * FROM docs WHERE docs MATCH 'sqlite'; <i>-- OK.</i> 340 SELECT * FROM docs WHERE docs.docs MATCH 'sqlite'; <i>-- OK.</i> 339 SELECT * FROM docs WHERE docs.docs MATCH 'sqlite'; <i>-- OK.</i> 341 SELECT * FROM docs WHERE main.docs.docs MATCH 'sqlite'; <i>-- OK.</i> 340 SELECT * FROM docs WHERE main.docs.docs MATCH 'sqlite'; <i>-- OK.</i> 342 SELECT * FROM docs WHERE main.docs MATCH 'sqlite'; <i>-- Error.</i> 341 SELECT * FROM docs WHERE main.docs MATCH 'sqlite'; <i>-- Error.</i> 343 }] < > 342 </codeblock> 344 343 345 [h2 "Summary"] | 344 <h2>Summary</h2> 346 345 347 <p> 346 <p> 348 From the users point of view, FTS3 tables are similar to ordinary SQLite 347 From the users point of view, FTS3 tables are similar to ordinary SQLite 349 tables in many ways. Data may be added to, modified within and removed 348 tables in many ways. Data may be added to, modified within and removed 350 from FTS3 tables using the INSERT, UPDATE and DELETE commands just as 349 from FTS3 tables using the INSERT, UPDATE and DELETE commands just as 351 it may be with ordinary tables. Similarly, the SELECT command may be used 350 it may be with ordinary tables. Similarly, the SELECT command may be used 352 to query data. The following list summarizes the differences between FTS3 351 to query data. The following list summarizes the differences between FTS3 ................................................................................................................................................................................ 358 triggers attached to FTS3 tables. Nor is it possible to use the ALTER TABLE 357 triggers attached to FTS3 tables. Nor is it possible to use the ALTER TABLE 359 command to add extra columns to FTS3 tables (although it is possible to use 358 command to add extra columns to FTS3 tables (although it is possible to use 360 ALTER TABLE to rename an FTS3 table). 359 ALTER TABLE to rename an FTS3 table). 361 360 362 <li><p> 361 <li><p> 363 Data-types specified as part of the "CREATE VIRTUAL TABLE" statement 362 Data-types specified as part of the "CREATE VIRTUAL TABLE" statement 364 used to create an FTS3 table are ignored completely. Instead of the 363 used to create an FTS3 table are ignored completely. Instead of the 365 normal rules for applying type \[affinity\] to inserted values, all | 364 normal rules for applying type [affinity] to inserted values, all 366 values inserted into FTS3 table columns (except the special rowid 365 values inserted into FTS3 table columns (except the special rowid 367 column) are converted to type TEXT before being stored. 366 column) are converted to type TEXT before being stored. 368 367 369 <li><p> 368 <li><p> 370 FTS3 tables permit the special alias "docid" to be used to refer to the 369 FTS3 tables permit the special alias "docid" to be used to refer to the 371 rowid column supported by all \[virtual tables\]. | 370 rowid column supported by all [virtual tables]. 372 371 373 <li><p> 372 <li><p> 374 The \[FTS3 MATCH\] operator is supported for queries based on the built-in | 373 The [FTS3 MATCH] operator is supported for queries based on the built-in 375 full-text index. 374 full-text index. 376 375 377 <li><p> 376 <li><p> 378 The FTS3 auxillary functions, \[snippet|snippet() and offsets()\], are | 377 The FTS3 auxillary functions, [snippet|snippet() and offsets()], are 379 available to support full-text queries. 378 available to support full-text queries. 380 379 381 <li><p> 380 <li><p> 382 Each FTS3 table has a \[sqlite3_declare_vtab()|HIDDEN column\] with the | 381 Each FTS3 table has a [sqlite3_declare_vtab()|HIDDEN column] with the 383 same name as the table itself. The value contained in each row for the 382 same name as the table itself. The value contained in each row for the 384 special column is only useful when used on the left-hand side of a 383 special column is only useful when used on the left-hand side of a 385 \[FTS3 MATCH|MATCH\] operator, or when specified as an argument to one | 384 [FTS3 MATCH|MATCH] operator, or when specified as an argument to one 386 of the \[snippet|FTS3 auxillary functions\]. | 385 of the [snippet|FTS3 auxillary functions]. 387 386 388 387 389 </ol> 388 </ol> 390 389 391 390 392 [h1 "Compiling and Enabling FTS3" {} {compile fts3}] | 391 <h1 tags="compile fts3">Compiling and Enabling FTS3</h1> 393 392 394 <p> 393 <p> 395 Although FTS3 is distributed as part of the SQLite source code, it is not 394 Although FTS3 is distributed as part of the SQLite source code, it is not 396 enabled by default. To build SQLite with FTS3 functionality enabled, define 395 enabled by default. To build SQLite with FTS3 functionality enabled, define 397 the preprocessor macro \[SQLITE_ENABLE_FTS3\] when compiling. New applications | 396 the preprocessor macro [SQLITE_ENABLE_FTS3] when compiling. New applications 398 should also define the \[SQLITE_ENABLE_FTS3_PARENTHESIS\] macro to enable the | 397 should also define the [SQLITE_ENABLE_FTS3_PARENTHESIS] macro to enable the 399 \[enhanced query syntax\] (see below). Usually, this is done by adding the | 398 [enhanced query syntax] (see below). Usually, this is done by adding the 400 following two switches to the compiler command line: 399 following two switches to the compiler command line: 401 400 402 [Code { | 401 <codeblock> 403 -DSQLITE_ENABLE_FTS3 402 -DSQLITE_ENABLE_FTS3 404 -DSQLITE_ENABLE_FTS3_PARENTHESIS 403 -DSQLITE_ENABLE_FTS3_PARENTHESIS 405 }] < > 404 </codeblock> 406 405 407 <p> 406 <p> 408 If using the amalgamation autoconf based build system, setting the CPPFLAGS 407 If using the amalgamation autoconf based build system, setting the CPPFLAGS 409 environment variable while running the 'configure' script is an easy 408 environment variable while running the 'configure' script is an easy 410 way to set these macros. For example, the following command: 409 way to set these macros. For example, the following command: 411 410 412 [Code { | 411 <codeblock> 413 CPPFLAGS="-DSQLITE_ENABLE_FTS3 -DSQLITE_ENABLE_FTS3_PARENTHESIS" ./configure & 412 CPPFLAGS="-DSQLITE_ENABLE_FTS3 -DSQLITE_ENABLE_FTS3_PARENTHESIS" ./configure & 414 }] < > 413 </codeblock> 415 414 416 <p> 415 <p> 417 where <i><configure options></i> are those options normally passed to 416 where <i><configure options></i> are those options normally passed to 418 the configure script, if any. 417 the configure script, if any. 419 418 420 <p> 419 <p> 421 Because FTS3 is a virtual table, it is incompatible with the 420 Because FTS3 is a virtual table, it is incompatible with the 422 \[SQLITE_OMIT_VIRTUALTABLE\] option. | 421 [SQLITE_OMIT_VIRTUALTABLE] option. 423 422 424 <p> 423 <p> 425 If an SQLite build does not include FTS3, then any attempt to prepare an 424 If an SQLite build does not include FTS3, then any attempt to prepare an 426 SQL statement to create an FTS3 table or to drop or access an existing 425 SQL statement to create an FTS3 table or to drop or access an existing 427 FTS3 table in any way will fail. The error message returned will be similar 426 FTS3 table in any way will fail. The error message returned will be similar 428 to "no such module: fts3". 427 to "no such module: fts3". 429 428 430 <p> 429 <p> 431 If the C version of the <a href=http://site.icu-project.org/>ICU library</a> 430 If the C version of the <a href=http://site.icu-project.org/>ICU library</a> 432 is available, then FTS3 may also be compiled with the SQLITE_ENABLE_ICU 431 is available, then FTS3 may also be compiled with the SQLITE_ENABLE_ICU 433 pre-processor macro defined. Compiling with this macro enables an FTS3 432 pre-processor macro defined. Compiling with this macro enables an FTS3 434 \[tokenizer\] that uses the ICU library to split a document into terms | 433 [tokenizer] that uses the ICU library to split a document into terms 435 (words) using the conventions for a specified language and locale. 434 (words) using the conventions for a specified language and locale. 436 435 437 [Code { | 436 <codeblock> 438 -DSQLITE_ENABLE_ICU 437 -DSQLITE_ENABLE_ICU 439 }] < > 438 </codeblock> 440 439 441 440 442 [h1 "Full-text Index Queries" {} {FTS3 MATCH}] | 441 <h1 tags="FTS3 MATCH">Full-text Index Queries</h1> 443 442 444 <p> 443 <p> 445 The most useful thing about FTS3 tables is the queries that may be 444 The most useful thing about FTS3 tables is the queries that may be 446 performed using the built-in full-text index. Full-text queries are 445 performed using the built-in full-text index. Full-text queries are 447 performed by specifying a clause of the form 446 performed by specifying a clause of the form 448 "<column> MATCH <full-text query expression>" to the WHERE 447 "<column> MATCH <full-text query expression>" to the WHERE 449 clause of a SELECT statement that reads data from an FTS3 table. 448 clause of a SELECT statement that reads data from an FTS3 table. 450 \[simple fts3 queries|Simple FTS3 queries\] that return all documents that | 449 [simple fts3 queries|Simple FTS3 queries] that return all documents that 451 contain a given term are described above. In that discussion the right-hand 450 contain a given term are described above. In that discussion the right-hand 452 operand of the MATCH operator was assumed to be a string consisting of a 451 operand of the MATCH operator was assumed to be a string consisting of a 453 single term. This section describes the more complex query types supported 452 single term. This section describes the more complex query types supported 454 by FTS3 tables, and how they may be utilized by specifying a more 453 by FTS3 tables, and how they may be utilized by specifying a more 455 complex query expression as the right-hand operand of a MATCH operator. 454 complex query expression as the right-hand operand of a MATCH operator. 456 455 457 <p> 456 <p> 458 FTS3 tables support three basic query types: 457 FTS3 tables support three basic query types: 459 458 460 <ul> 459 <ul> 461 <li><p><b>Token or token prefix queries</b>. 460 <li><p><b>Token or token prefix queries</b>. 462 An FTS3 table may be queried for all documents that contain a specified 461 An FTS3 table may be queried for all documents that contain a specified 463 term (the \[simple fts3 queries|simple case\] described above), or for | 462 term (the [simple fts3 queries|simple case] described above), or for 464 all documents that contain a term with a specified prefix. As we have 463 all documents that contain a term with a specified prefix. As we have 465 seen, the query expression for a specific term is simply the term itself. 464 seen, the query expression for a specific term is simply the term itself. 466 The query expression used to search for a term prefix is the prefix 465 The query expression used to search for a term prefix is the prefix 467 itself with a '*' character appended to it. For example: 466 itself with a '*' character appended to it. For example: 468 </ul> 467 </ul> 469 468 470 [Code { | 469 <codeblock> 471 <i>-- Virtual table declaration</i> 470 <i>-- Virtual table declaration</i> 472 CREATE VIRTUAL TABLE docs USING fts3(title, body); 471 CREATE VIRTUAL TABLE docs USING fts3(title, body); 473 472 474 <i>-- Query for all documents containing the term "linux":</i> 473 <i>-- Query for all documents containing the term "linux":</i> 475 SELECT * FROM docs WHERE docs MATCH 'linux'; 474 SELECT * FROM docs WHERE docs MATCH 'linux'; 476 475 477 <i>-- Query for all documents containing a term with the prefix "lin". This wi 476 <i>-- Query for all documents containing a term with the prefix "lin". This wi 478 <i>-- all documents that contain "linux", but also those that contain terms "l 477 <i>-- all documents that contain "linux", but also those that contain terms "l 479 <i>--"linker", "linguistic" and so on.</i> 478 <i>--"linker", "linguistic" and so on.</i> 480 SELECT * FROM docs WHERE docs MATCH 'lin*'; 479 SELECT * FROM docs WHERE docs MATCH 'lin*'; 481 }] < > 480 </codeblock> 482 481 483 <ul> 482 <ul> 484 <li style="list-style:none"><p> 483 <li style="list-style:none"><p> 485 Normally, a token or token prefix query is matched against the FTS3 table 484 Normally, a token or token prefix query is matched against the FTS3 table 486 column specified as the right-hand side of the MATCH operator. Or, if the 485 column specified as the right-hand side of the MATCH operator. Or, if the 487 special column with the same name as the FTS3 table itself is specified, 486 special column with the same name as the FTS3 table itself is specified, 488 against all columns. This may be overridden by specifying a column-name 487 against all columns. This may be overridden by specifying a column-name 489 followed by a ":" character before a basic term query. There may be space 488 followed by a ":" character before a basic term query. There may be space 490 between the ":" and the term to query for, but not between the column-name 489 between the ":" and the term to query for, but not between the column-name 491 and the ":" character. For example: 490 and the ":" character. For example: 492 </ul> 491 </ul> 493 492 494 [Code { | 493 <codeblock> 495 <i>-- Query the database for documents for which the term "linux" appears in</ 494 <i>-- Query the database for documents for which the term "linux" appears in</ 496 <i>-- the document title, and the term "problems" appears in either the title< 495 <i>-- the document title, and the term "problems" appears in either the title< 497 <i>-- or body of the document.</i> 496 <i>-- or body of the document.</i> 498 SELECT * FROM docs WHERE docs MATCH 'title:linux problems'; 497 SELECT * FROM docs WHERE docs MATCH 'title:linux problems'; 499 498 500 <i>-- Query the database for documents for which the term "linux" appears in</ 499 <i>-- Query the database for documents for which the term "linux" appears in</ 501 <i>-- the document title, and the term "driver" appears in the body of the doc 500 <i>-- the document title, and the term "driver" appears in the body of the doc 502 <i>-- ("driver" may also appear in the title, but this alone will not satisfy 501 <i>-- ("driver" may also appear in the title, but this alone will not satisfy 503 <i>-- query criteria).</i> 502 <i>-- query criteria).</i> 504 SELECT * FROM docs WHERE body MATCH 'title:linux driver'; 503 SELECT * FROM docs WHERE body MATCH 'title:linux driver'; 505 }] < > 504 </codeblock> 506 505 507 <ul> 506 <ul> 508 <li><p><b>Phrase queries</b>. 507 <li><p><b>Phrase queries</b>. 509 A phrase query is a query that retrieves all documents that contain a 508 A phrase query is a query that retrieves all documents that contain a 510 nominated set of terms or term prefixes in a specified order with no 509 nominated set of terms or term prefixes in a specified order with no 511 intervening tokens. Phrase queries are specified by enclosing a space 510 intervening tokens. Phrase queries are specified by enclosing a space 512 separated sequence of terms or term prefixes in double quotes ("). 511 separated sequence of terms or term prefixes in double quotes ("). 513 For example: 512 For example: 514 </ul> 513 </ul> 515 514 516 [Code { | 515 <codeblock> 517 <i>-- Query for all documents that contain the phrase "linux applications".</i 516 <i>-- Query for all documents that contain the phrase "linux applications".</i 518 SELECT * FROM docs WHERE docs MATCH '"linux applications"'; 517 SELECT * FROM docs WHERE docs MATCH '"linux applications"'; 519 518 520 <i>-- Query for all documents that contain a phrase that matches "lin* app*". 519 <i>-- Query for all documents that contain a phrase that matches "lin* app*". 521 <i>-- "linux applications", this will match common phrases such as "linoleum a 520 <i>-- "linux applications", this will match common phrases such as "linoleum a 522 <i>-- or "link apprentice".</i> 521 <i>-- or "link apprentice".</i> 523 SELECT * FROM docs WHERE docs MATCH '"lin* app*"'; 522 SELECT * FROM docs WHERE docs MATCH '"lin* app*"'; 524 }] < > 523 </codeblock> 525 524 526 <ul> 525 <ul> 527 <li><p><b>NEAR queries</b>. 526 <li><p><b>NEAR queries</b>. 528 A NEAR query is a query that returns documents that contain a two or 527 A NEAR query is a query that returns documents that contain a two or 529 more nominated terms or phrases within a specified proximity of each 528 more nominated terms or phrases within a specified proximity of each 530 other (by default with 10 or less intervening terms). A NEAR query is 529 other (by default with 10 or less intervening terms). A NEAR query is 531 specified by putting the keyword "NEAR" between two phrase, term or 530 specified by putting the keyword "NEAR" between two phrase, term or 532 term prefix queries. To specify a proximity other than the default, 531 term prefix queries. To specify a proximity other than the default, 533 an operator of the form "NEAR/<i><N></i>" may be used, where 532 an operator of the form "NEAR/<i><N></i>" may be used, where 534 <i><N></i> is the maximum number of intervening terms allowed. 533 <i><N></i> is the maximum number of intervening terms allowed. 535 For example: 534 For example: 536 </ul> 535 </ul> 537 536 538 [Code { | 537 <codeblock> 539 <i>-- Virtual table declaration.</i> 538 <i>-- Virtual table declaration.</i> 540 CREATE VIRTUAL TABLE docs USING fts3(); 539 CREATE VIRTUAL TABLE docs USING fts3(); 541 540 542 <i>-- Virtual table data.</i> 541 <i>-- Virtual table data.</i> 543 INSERT INTO docs VALUES('SQLite is an ACID compliant embedded relational datab 542 INSERT INTO docs VALUES('SQLite is an ACID compliant embedded relational datab 544 543 545 <i>-- Search for a document that contains the terms "sqlite" and "database" wi 544 <i>-- Search for a document that contains the terms "sqlite" and "database" wi ................................................................................................................................................................................ 563 <i>-- document stored in table docs.</i> 562 <i>-- document stored in table docs.</i> 564 SELECT * FROM docs WHERE docs MATCH 'database NEAR/2 "ACID compliant"'; 563 SELECT * FROM docs WHERE docs MATCH 'database NEAR/2 "ACID compliant"'; 565 564 566 <i>-- Search for a document that contains the phrase "ACID compliant" and the 565 <i>-- Search for a document that contains the phrase "ACID compliant" and the 567 <i>-- "sqlite" with not more than 2 terms separating the two. This also matche 566 <i>-- "sqlite" with not more than 2 terms separating the two. This also matche 568 <i>-- the only document stored in table docs.</i> 567 <i>-- the only document stored in table docs.</i> 569 SELECT * FROM docs WHERE docs MATCH '"ACID compliant" NEAR/2 sqlite'; 568 SELECT * FROM docs WHERE docs MATCH '"ACID compliant" NEAR/2 sqlite'; 570 }] < > 569 </codeblock> 571 570 572 <ul> 571 <ul> 573 <li style="list-style: none"><p> 572 <li style="list-style: none"><p> 574 More than one NEAR operator may appear in a single query. In this case each 573 More than one NEAR operator may appear in a single query. In this case each 575 pair of terms or phrases separated by a NEAR operator must appear within the 574 pair of terms or phrases separated by a NEAR operator must appear within the 576 specified proximity of each other in the document. Using the same table and 575 specified proximity of each other in the document. Using the same table and 577 data as in the block of examples above: 576 data as in the block of examples above: 578 </ul> 577 </ul> 579 578 > 579 <codeblock> 580 [Code { <i>-- The following query selects documents that contains an instance of | 580 <i>-- The following query selects documents that contains an instance of the te 581 <i>-- "sqlite" separated by two or fewer terms from an instance of the term "a 581 <i>-- "sqlite" separated by two or fewer terms from an instance of the term "a 582 <i>-- which is in turn separated by two or fewer terms from an instance of the 582 <i>-- which is in turn separated by two or fewer terms from an instance of the 583 <i>-- "relational". As it happens, the only document in table docs satisfies t 583 <i>-- "relational". As it happens, the only document in table docs satisfies t 584 SELECT * FROM docs WHERE docs MATCH 'sqlite NEAR/2 acid NEAR/2 relational'; 584 SELECT * FROM docs WHERE docs MATCH 'sqlite NEAR/2 acid NEAR/2 relational'; 585 585 586 <i>-- This query matches no documents. There is an instance of the term "sqlit 586 <i>-- This query matches no documents. There is an instance of the term "sqlit 587 <i>-- sufficient proximity to an instance of "acid" but it is not sufficiently 587 <i>-- sufficient proximity to an instance of "acid" but it is not sufficiently 588 <i>-- to an instance of the term "relational".</i> 588 <i>-- to an instance of the term "relational".</i> 589 SELECT * FROM docs WHERE docs MATCH 'acid NEAR/2 sqlite NEAR/2 relational'; 589 SELECT * FROM docs WHERE docs MATCH 'acid NEAR/2 sqlite NEAR/2 relational'; 590 }] < > 590 </codeblock> 591 591 592 <p> 592 <p> 593 Phrase and NEAR queries may not span multiple columns within a row. 593 Phrase and NEAR queries may not span multiple columns within a row. 594 594 595 <p> 595 <p> 596 The three basic query types described above may be used to query the full-text 596 The three basic query types described above may be used to query the full-text 597 index for the set of documents that match the specified criteria. Using the 597 index for the set of documents that match the specified criteria. Using the ................................................................................................................................................................................ 612 <p> 612 <p> 613 The FTS3 module may be compiled to use one of two slightly different versions 613 The FTS3 module may be compiled to use one of two slightly different versions 614 of the full-text query syntax, the "standard" query syntax and the "enhanced" 614 of the full-text query syntax, the "standard" query syntax and the "enhanced" 615 query syntax. The basic term, term-prefix, phrase and NEAR queries described 615 query syntax. The basic term, term-prefix, phrase and NEAR queries described 616 above are the same in both versions of the syntax. The way in which set 616 above are the same in both versions of the syntax. The way in which set 617 operations are specified is slightly different. The following two sub-sections 617 operations are specified is slightly different. The following two sub-sections 618 describe the part of the two query syntaxes that pertains to set operations. 618 describe the part of the two query syntaxes that pertains to set operations. 619 Refer to the description of how to \[compile fts3\] for compilation notes. | 619 Refer to the description of how to [compile fts3] for compilation notes. 620 620 > 621 <h2 tags="enhanced query syntax"> 621 [h2 "Set Operations Using The Enhanced Query Syntax" {} {enhanced query syntax}] | 622 Set Operations Using The Enhanced Query Syntax</h2> 622 623 623 <p> 624 <p> 624 The enhanced query syntax supports the AND, OR and NOT binary set operators. 625 The enhanced query syntax supports the AND, OR and NOT binary set operators. 625 Each of the two operands to an operator may be a basic FTS3 query, or the 626 Each of the two operands to an operator may be a basic FTS3 query, or the 626 result of another AND, OR or NOT set operation. Operators must be entered 627 result of another AND, OR or NOT set operation. Operators must be entered 627 using capital letters. Otherwise, they are interpreted as basic term queries 628 using capital letters. Otherwise, they are interpreted as basic term queries 628 instead of set operators. 629 instead of set operators. ................................................................................................................................................................................ 630 <p> 631 <p> 631 The AND operator may be implicitly specified. If two basic queries appear 632 The AND operator may be implicitly specified. If two basic queries appear 632 with no operator separating them in an FTS3 query string, the results are 633 with no operator separating them in an FTS3 query string, the results are 633 the same as if the two basic queries were separated by an AND operator. 634 the same as if the two basic queries were separated by an AND operator. 634 For example, the query expression "implicit operator" is a more succinct 635 For example, the query expression "implicit operator" is a more succinct 635 version of "implicit AND operator". 636 version of "implicit AND operator". 636 637 637 [Code { | 638 <codeblock> 638 <i>-- Virtual table declaration</i> 639 <i>-- Virtual table declaration</i> 639 CREATE VIRTUAL TABLE docs USING fts3(); 640 CREATE VIRTUAL TABLE docs USING fts3(); 640 641 641 <i>-- Virtual table data</i> 642 <i>-- Virtual table data</i> 642 INSERT INTO docs(docid, content) VALUES(1, 'a database is a software system'); 643 INSERT INTO docs(docid, content) VALUES(1, 'a database is a software system'); 643 INSERT INTO docs(docid, content) VALUES(2, 'sqlite is a software system'); 644 INSERT INTO docs(docid, content) VALUES(2, 'sqlite is a software system'); 644 INSERT INTO docs(docid, content) VALUES(3, 'sqlite is a database'); 645 INSERT INTO docs(docid, content) VALUES(3, 'sqlite is a database'); ................................................................................................................................................................................ 662 663 663 <i>-- The following query matches no documents. Because "and" is in lowercase 664 <i>-- The following query matches no documents. Because "and" is in lowercase 664 <i>-- it is interpreted as a basic term query instead of an operator. Operator 665 <i>-- it is interpreted as a basic term query instead of an operator. Operator 665 <i>-- be specified using capital letters. In practice, this query will match a 666 <i>-- be specified using capital letters. In practice, this query will match a 666 <i>-- that contain each of the three terms "database", "and" and "sqlite" at l 667 <i>-- that contain each of the three terms "database", "and" and "sqlite" at l 667 <i>-- No documents in the example data above match this criteria.</i> 668 <i>-- No documents in the example data above match this criteria.</i> 668 SELECT * FROM docs WHERE docs MATCH 'database and sqlite'; 669 SELECT * FROM docs WHERE docs MATCH 'database and sqlite'; 669 }] < > 670 </codeblock> 670 671 671 <p> 672 <p> 672 The examples above all use basic full-text term queries as both operands of 673 The examples above all use basic full-text term queries as both operands of 673 the set operations demonstrated. Phrase and NEAR queries may also be used, 674 the set operations demonstrated. Phrase and NEAR queries may also be used, 674 as may the results of other set operations. When more than one set operation 675 as may the results of other set operations. When more than one set operation 675 is present in an FTS3 query, the precedence of operators is as follows: 676 is present in an FTS3 query, the precedence of operators is as follows: 676 677 677 [Table] | 678 <table striped=1> 678 [Tr]<th>Operator<th>Enhanced Query Syntax Precedence | 679 <tr><th>Operator<th>Enhanced Query Syntax Precedence 679 [Tr]<td>NOT <td> Highest precedence (tightest grouping). | 680 <tr><td>NOT <td> Highest precedence (tightest grouping). 680 [Tr]<td>AND <td> | 681 <tr><td>AND <td> 681 [Tr]<td>OR <td> Lowest precedence (loosest grouping). | 682 <tr><td>OR <td> Lowest precedence (loosest grouping). 682 </table> 683 </table> 683 684 684 <p> 685 <p> 685 When using the enhanced query syntax, parenthesis may be used to override 686 When using the enhanced query syntax, parenthesis may be used to override 686 the default precedence of the various operators. For example: 687 the default precedence of the various operators. For example: 687 688 688 [Code { | 689 <codeblock> 689 690 690 <i>-- Return the docid values associated with all documents that contain the</ 691 <i>-- Return the docid values associated with all documents that contain the</ 691 <i>-- two terms "sqlite" and "database", and/or contain the term "library".</i 692 <i>-- two terms "sqlite" and "database", and/or contain the term "library".</i 692 SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database OR library'; 693 SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database OR library'; 693 694 694 <i>-- This query is equivalent to the above.</i> 695 <i>-- This query is equivalent to the above.</i> 695 SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database' 696 SELECT docid FROM docs WHERE docs MATCH 'sqlite AND database' ................................................................................................................................................................................ 704 SELECT docid FROM docs WHERE docs MATCH 'linux' 705 SELECT docid FROM docs WHERE docs MATCH 'linux' 705 INTERSECT 706 INTERSECT 706 SELECT docid FROM ( 707 SELECT docid FROM ( 707 SELECT docid FROM docs WHERE docs MATCH '"sqlite library"' 708 SELECT docid FROM docs WHERE docs MATCH '"sqlite library"' 708 UNION 709 UNION 709 SELECT docid FROM docs WHERE docs MATCH '"sqlite database"' 710 SELECT docid FROM docs WHERE docs MATCH '"sqlite database"' 710 ); 711 ); 711 }] < > 712 </codeblock> 712 713 713 714 714 [h2 "Set Operations Using The Standard Query Syntax"] | 715 <h2>Set Operations Using The Standard Query Syntax</h2> 715 716 716 <p> 717 <p> 717 FTS3 query set operations using the standard query syntax are similar, but 718 FTS3 query set operations using the standard query syntax are similar, but 718 not identical, to set operations with the enhanced query syntax. There 719 not identical, to set operations with the enhanced query syntax. There 719 are four differences, as follows: 720 are four differences, as follows: 720 721 721 <ol> 722 <ol> ................................................................................................................................................................................ 735 may be applied to basic term and term-prefix queries (but not to phrase 736 may be applied to basic term and term-prefix queries (but not to phrase 736 or NEAR queries). A term or term-prefix that has a unary "-" operator 737 or NEAR queries). A term or term-prefix that has a unary "-" operator 737 attached to it may not appear as an operand to an OR operator. An FTS3 738 attached to it may not appear as an operand to an OR operator. An FTS3 738 query may not consist entirely of terms or term-prefix queries with unary 739 query may not consist entirely of terms or term-prefix queries with unary 739 "-" operators attached to them. 740 "-" operators attached to them. 740 </ol> 741 </ol> 741 742 742 [Code { | 743 <codeblock> 743 <i>-- Search for the set of documents that contain the term "sqlite" but do</i 744 <i>-- Search for the set of documents that contain the term "sqlite" but do</i 744 <i>-- not contain the term "database".</i> 745 <i>-- not contain the term "database".</i> 745 SELECT * FROM docs WHERE docs MATCH 'sqlite -database'; 746 SELECT * FROM docs WHERE docs MATCH 'sqlite -database'; 746 }] < > 747 </codeblock> 747 748 748 <ol> 749 <ol> 749 <li value=4><p> The relative precedence of the set operations is different. 750 <li value=4><p> The relative precedence of the set operations is different. 750 In particular, using the standard query syntax the "OR" operator has a 751 In particular, using the standard query syntax the "OR" operator has a 751 higher precedence than "AND". The precedence of operators when using the 752 higher precedence than "AND". The precedence of operators when using the 752 standard query syntax is: 753 standard query syntax is: 753 </ol> 754 </ol> 754 755 755 [Table] | 756 <table striped=1> 756 [Tr]<th>Operator<th>Standard Query Syntax Precedence | 757 <tr><th>Operator<th>Standard Query Syntax Precedence 757 [Tr]<td>Unary "-" <td> Highest precedence (tightest grouping). | 758 <tr><td>Unary "-" <td> Highest precedence (tightest grouping). 758 [Tr]<td>OR <td> | 759 <tr><td>OR <td> 759 [Tr]<td>AND <td> Lowest precedence (loosest grouping). | 760 <tr><td>AND <td> Lowest precedence (loosest grouping). 760 </table> 761 </table> 761 762 762 <ol><li style="list-style:none"> 763 <ol><li style="list-style:none"> 763 The following example illustrates precedence of operators using the standard 764 The following example illustrates precedence of operators using the standard 764 query syntax: 765 query syntax: 765 </ol> 766 </ol> 766 767 767 [Code { | 768 <codeblock> 768 <i>-- Search for documents that contains at least one of the terms "database"< 769 <i>-- Search for documents that contains at least one of the terms "database"< 769 <i>-- and "sqlite", and also contains the term "library". Because of the diffe 770 <i>-- and "sqlite", and also contains the term "library". Because of the diffe 770 <i>-- in operator precedences, this query would have a different interpretatio 771 <i>-- in operator precedences, this query would have a different interpretatio 771 <i>-- the enhanced query syntax.</i> 772 <i>-- the enhanced query syntax.</i> 772 SELECT * FROM docs WHERE docs MATCH 'sqlite OR database library'; 773 SELECT * FROM docs WHERE docs MATCH 'sqlite OR database library'; 773 }] < > 774 </codeblock> 774 775 775 [h1 "Auxillary functions - Snippet, Offsets and Matchinfo" {} snippet offsets] | 776 <h1 tags="snippet, offsets">Auxillary functions - Snippet, Offsets and Matchinfo 776 777 777 <p> 778 <p> 778 The FTS3 module provides three special SQL scalar functions that may be useful 779 The FTS3 module provides three special SQL scalar functions that may be useful 779 to the developers of full-text query systems: "snippet", "offsets" and 780 to the developers of full-text query systems: "snippet", "offsets" and 780 "matchinfo". The purpose of the "snippet" and "offsets" functions is to allow 781 "matchinfo". The purpose of the "snippet" and "offsets" functions is to allow 781 the user to identify the location of queried terms in the returned documents. 782 the user to identify the location of queried terms in the returned documents. 782 The "matchinfo" function provides the user with metrics that may be useful 783 The "matchinfo" function provides the user with metrics that may be useful ................................................................................................................................................................................ 784 785 785 <p> 786 <p> 786 The first argument to all three special SQL scalar functions 787 The first argument to all three special SQL scalar functions 787 must be the the special hidden column of an FTS3 table that has the same 788 must be the the special hidden column of an FTS3 table that has the same 788 name as the table (see above). For example, given an FTS3 table named 789 name as the table (see above). For example, given an FTS3 table named 789 "mail": 790 "mail": 790 791 791 [Code { | 792 <codeblock> 792 SELECT offsets(mail) FROM mail WHERE mail MATCH <full-text query expression 793 SELECT offsets(mail) FROM mail WHERE mail MATCH <full-text query expression 793 SELECT snippet(mail) FROM mail WHERE mail MATCH <full-text query expression 794 SELECT snippet(mail) FROM mail WHERE mail MATCH <full-text query expression 794 SELECT matchinfo(mail) FROM mail WHERE mail MATCH <full-text query expressi 795 SELECT matchinfo(mail) FROM mail WHERE mail MATCH <full-text query expressi 795 }] < > 796 </codeblock> 796 797 797 <p> 798 <p> 798 The three auxillary functions are only useful within a SELECT statement that 799 The three auxillary functions are only useful within a SELECT statement that 799 uses the FTS3 table's full-text index. If used within a SELECT that uses 800 uses the FTS3 table's full-text index. If used within a SELECT that uses 800 the "query by rowid" or "linear scan" strategies, then the snippet and 801 the "query by rowid" or "linear scan" strategies, then the snippet and 801 offsets both return an an empty string, and the matchinfo function returns 802 offsets both return an an empty string, and the matchinfo function returns 802 a blob value zero bytes in size. 803 a blob value zero bytes in size. ................................................................................................................................................................................ 822 823 823 <li> If the matchable phrase in the FTS3 query is restricted to matching 824 <li> If the matchable phrase in the FTS3 query is restricted to matching 824 data in a specified FTS3 table column, then only phrase matches that 825 data in a specified FTS3 table column, then only phrase matches that 825 occur within that column are considered. 826 occur within that column are considered. 826 </ol> 827 </ol> 827 828 828 829 829 [h2 "The Offsets Function"] | 830 <h2>The Offsets Function</h2> 830 831 831 <p> 832 <p> 832 For a SELECT query that uses the full-text index, the offsets() function 833 For a SELECT query that uses the full-text index, the offsets() function 833 returns a text value containing a series of space-separated integers. For 834 returns a text value containing a series of space-separated integers. For 834 each term in each <a href=#matchable>phrase match</a> of the current row, 835 each term in each <a href=#matchable>phrase match</a> of the current row, 835 there are four integers in the returned list. Each set of four integers is 836 there are four integers in the returned list. Each set of four integers is 836 interpreted as follows: 837 interpreted as follows: 837 838 838 [Table] | 839 <table striped=1> 839 [Tr]<th>Integer <th>Interpretation | 840 <tr><th>Integer <th>Interpretation 840 [Tr]<td>0 | 841 <tr><td>0 841 <td>The column number that the term instance occurs in (0 for the 842 <td>The column number that the term instance occurs in (0 for the 842 leftmost column of the FTS3 table, 1 for the next leftmost, etc.). 843 leftmost column of the FTS3 table, 1 for the next leftmost, etc.). 843 [Tr]<td>1 | 844 <tr><td>1 844 <td>The term number of the matching term within the full-text query 845 <td>The term number of the matching term within the full-text query 845 expression. Terms within a query expression are numbered starting 846 expression. Terms within a query expression are numbered starting 846 from 0 in the order that they occur. 847 from 0 in the order that they occur. 847 [Tr]<td>2 | 848 <tr><td>2 848 <td>The byte offset of the matching term within the column. 849 <td>The byte offset of the matching term within the column. 849 [Tr]<td>3 | 850 <tr><td>3 850 <td>The size of the matching term in bytes. 851 <td>The size of the matching term in bytes. 851 </table> 852 </table> 852 853 853 <p> 854 <p> 854 The following block contains examples that use the offsets function. 855 The following block contains examples that use the offsets function. 855 856 856 [Code { | 857 <codeblock> 857 CREATE VIRTUAL TABLE mail USING fts3(subject, body); 858 CREATE VIRTUAL TABLE mail USING fts3(subject, body); 858 INSERT INTO mail VALUES('hello world', 'This message is a hello world message. 859 INSERT INTO mail VALUES('hello world', 'This message is a hello world message. 859 INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more seriou 860 INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more seriou 860 861 861 <i>-- The following query returns a single row (as it matches only the first</ 862 <i>-- The following query returns a single row (as it matches only the first</ 862 <i>-- entry in table "mail". The text returned by the offsets function is</i> 863 <i>-- entry in table "mail". The text returned by the offsets function is</i> 863 <i>-- "0 0 6 5 1 0 24 5".</i> 864 <i>-- "0 0 6 5 1 0 24 5".</i> ................................................................................................................................................................................ 874 SELECT offsets(mail) FROM mail WHERE mail MATCH 'message'; 875 SELECT offsets(mail) FROM mail WHERE mail MATCH 'message'; 875 876 876 <i>-- The following query matches the second row in table "mail". It returns t 877 <i>-- The following query matches the second row in table "mail". It returns t 877 <i>-- text "1 0 28 7 1 1 36 4". Only those occurences of terms "serious" and " 878 <i>-- text "1 0 28 7 1 1 36 4". Only those occurences of terms "serious" and " 878 <i>-- that are part of an instance of the phrase "serious mail" are identified 879 <i>-- that are part of an instance of the phrase "serious mail" are identified 879 <i>-- other occurences of "serious" and "mail" are ignored.</i> 880 <i>-- other occurences of "serious" and "mail" are ignored.</i> 880 SELECT offsets(mail) FROM mail WHERE mail MATCH '"serious mail"'; 881 SELECT offsets(mail) FROM mail WHERE mail MATCH '"serious mail"'; 881 }] < > 882 </codeblock> 882 883 883 [h2 "The Snippet Function"] | 884 <h2>The Snippet Function</h2> 884 885 885 <p> 886 <p> 886 The snippet function is used to create formatted fragments of document text 887 The snippet function is used to create formatted fragments of document text 887 for display as part of a full-text query results report. The snippet function 888 for display as part of a full-text query results report. The snippet function 888 may be passed between one and four arguments, as follows: 889 may be passed between one and four arguments, as follows: 889 890 890 [Table] | 891 <table striped=1> 891 [Tr]<th>Argument <th>Default Value <th>Description | 892 <tr><th>Argument <th>Default Value <th>Description 892 [Tr]<td>0 <td>N/A | 893 <tr><td>0 <td>N/A 893 <td> The first argument to the snippet function must always be the special 894 <td> The first argument to the snippet function must always be the special 894 hidden column of the FTS3 table that takes the same name as the table 895 hidden column of the FTS3 table that takes the same name as the table 895 itself. 896 itself. 896 [Tr]<td>1 <td>"<b>" | 897 <tr><td>1 <td>"<b>" 897 <td> The "start match" text. 898 <td> The "start match" text. 898 [Tr]<td>2 <td>"<b>" | 899 <tr><td>2 <td>"<b>" 899 <td> The "end match" text. 900 <td> The "end match" text. 900 [Tr]<td>3 <td>"<b>...</b>" | 901 <tr><td>3 <td>"<b>...</b>" 901 <td> The "ellipses" text. 902 <td> The "ellipses" text. 902 [Tr]<td>4 <td>-1 | 903 <tr><td>4 <td>-1 903 <td> The FTS3 table column number to extract the returned fragments of 904 <td> The FTS3 table column number to extract the returned fragments of 904 text from. Columns are numbered from left to right starting with 905 text from. Columns are numbered from left to right starting with 905 zero. A negative value indicates that the text may be extracted 906 zero. A negative value indicates that the text may be extracted 906 from any column. 907 from any column. 907 [Tr]<td>5 <td>-15 | 908 <tr><td>5 <td>-15 908 <td> The absolute value of this integer argument is used as the 909 <td> The absolute value of this integer argument is used as the 909 (approximate) number of tokens to include in the returned text 910 (approximate) number of tokens to include in the returned text 910 value. The maximum allowable absolute value is 64. The value of 911 value. The maximum allowable absolute value is 64. The value of 911 this argument is refered to as <i>N</i> in the discussion below. 912 this argument is refered to as <i>N</i> in the discussion below. 912 </table> 913 </table> 913 914 914 <p> 915 <p> ................................................................................................................................................................................ 963 <p> 964 <p> 964 After the <i>M</i> fragments have been located, where <i>M</i> is between 965 After the <i>M</i> fragments have been located, where <i>M</i> is between 965 two and four as described in the paragraphs above, they are joined together 966 two and four as described in the paragraphs above, they are joined together 966 in sorted order with the "ellipses" text separating them. The three 967 in sorted order with the "ellipses" text separating them. The three 967 modifications enumerated earlier are performed on the text before it is 968 modifications enumerated earlier are performed on the text before it is 968 returned. 969 returned. 969 970 970 [Code { | 971 <codeblock> 971 <b>Note: In this block of examples, newlines and whitespace characters have 972 <b>Note: In this block of examples, newlines and whitespace characters have 972 been inserted into the document inserted into the FTS3 table, and the expected 973 been inserted into the document inserted into the FTS3 table, and the expected 973 results described in SQL comments. This is done to enhance readability only, 974 results described in SQL comments. This is done to enhance readability only, 974 they would not be present in actual SQLite commands or output.</b> 975 they would not be present in actual SQLite commands or output.</b> 975 976 976 <i>-- Create and populate an FTS3 table.</i> 977 <i>-- Create and populate an FTS3 table.</i> 977 CREATE VIRTUAL TABLE text USING fts3(); 978 CREATE VIRTUAL TABLE text USING fts3(); ................................................................................................................................................................................ 991 992 992 <i>-- The following query returns the text value:</i> 993 <i>-- The following query returns the text value:</i> 993 <i>--</i> 994 <i>--</i> 994 <i>-- "...the upper portion, [minimum] [temperature] 14-16oC 995 <i>-- "...the upper portion, [minimum] [temperature] 14-16oC 995 <i>-- [minimum] [temperature] 17-20oC. Cold..."</i> 996 <i>-- [minimum] [temperature] 17-20oC. Cold..."</i> 996 <i>--</i> 997 <i>--</i> 997 SELECT snippet(text, '[ ']', '...') FROM text WHERE text MATCH '"min* 998 SELECT snippet(text, '[ ']', '...') FROM text WHERE text MATCH '"min* 998 }] < > 999 </codeblock> 999 1000 1000 [h2 "The Matchinfo Function" matchinfo matchinfo] | 1001 <h2 id=matchinfo tags=matchinfo>The Matchinfo Function</h2> 1001 1002 1002 <p> 1003 <p> 1003 The matchinfo function returns a blob value. If used within a query that 1004 The matchinfo function returns a blob value. If used within a query that 1004 uses the full-text index (not a "query by rowid" or "linear scan"), then 1005 uses the full-text index (not a "query by rowid" or "linear scan"), then 1005 the blob consists of (2 + <i>C</i> * <i>P</i> * 3) 32-bit unsigned 1006 the blob consists of (2 + <i>C</i> * <i>P</i> * 3) 32-bit unsigned 1006 integers in machine byte-order, where <i>C</i> is the number of columns 1007 integers in machine byte-order, where <i>C</i> is the number of columns 1007 in the FTS3 table being queried, and <i>P</i> is the number of 1008 in the FTS3 table being queried, and <i>P</i> is the number of 1008 <a href=#matchable>matchable phrases</a> in the query. 1009 <a href=#matchable>matchable phrases</a> in the query. 1009 1010 1010 <p> 1011 <p> 1011 Phrases and columns are both numbered from left to right starting from 1012 Phrases and columns are both numbered from left to right starting from 1012 zero. 1013 zero. 1013 1014 1014 [Table] | 1015 <table striped=1> 1015 [Tr]<th>Array Element <th>Interpretation | 1016 <tr><th>Array Element <th>Interpretation 1016 [Tr]<td>0 <td> | 1017 <tr><td>0 <td> 1017 Number of matchable phrases in the query expression (value <i>P</i> in 1018 Number of matchable phrases in the query expression (value <i>P</i> in 1018 the formula below). 1019 the formula below). 1019 [Tr]<td>1 <td> | 1020 <tr><td>1 <td> 1020 Number of columns in the FTS3 table being queried (value <i>C</i> in 1021 Number of columns in the FTS3 table being queried (value <i>C</i> in 1021 the formula below). 1022 the formula below). 1022 1023 1023 [Tr]<td style="white-space:nowrap">2 + 3 * (<i>c</i> + <i>C</i>*<i>p</i>) + 0< | 1024 <tr><td style="white-space:nowrap">2 + 3 * (<i>c</i> + <i>C</i>*<i>p</i>) + 0< 1024 Number of phrase matches for matchable phrase <i>p</i> in column 1025 Number of phrase matches for matchable phrase <i>p</i> in column 1025 <i>c</i> of the current FTS3 table row. 1026 <i>c</i> of the current FTS3 table row. 1026 [Tr]<td style="white-space:nowrap">2 + 3 * (<i>c</i> + <i>C</i>*<i>p</i>) + 1< | 1027 <tr><td style="white-space:nowrap">2 + 3 * (<i>c</i> + <i>C</i>*<i>p</i>) + 1< 1027 Sum of the number of phrase matches for matchable phrase <i>p</i> in 1028 Sum of the number of phrase matches for matchable phrase <i>p</i> in 1028 column <i>c</i> for all rows of the FTS3 table. 1029 column <i>c</i> for all rows of the FTS3 table. 1029 [Tr]<td style="white-space:nowrap">2 + 3 * (<i>c</i> + <i>C</i>*<i>p</i>) + 2< | 1030 <tr><td style="white-space:nowrap">2 + 3 * (<i>c</i> + <i>C</i>*<i>p</i>) + 2< 1030 Number of rows of the FTS3 table for which column <i>c</i> contains at 1031 Number of rows of the FTS3 table for which column <i>c</i> contains at 1031 least one phrase match for matchable phrase <i>p</i>. 1032 least one phrase match for matchable phrase <i>p</i>. 1032 </table> 1033 </table> 1033 1034 1034 <p> 1035 <p> 1035 For example: 1036 For example: 1036 1037 1037 [Code { | 1038 <codeblock> 1038 <i>-- Create and populate an FTS3 table with two columns:</i> 1039 <i>-- Create and populate an FTS3 table with two columns:</i> 1039 CREATE VIRTUAL TABLE t1 USING fts3(a, b); 1040 CREATE VIRTUAL TABLE t1 USING fts3(a, b); 1040 INSERT INTO t1 VALUES('transaction default models default', 'Non transaction r 1041 INSERT INTO t1 VALUES('transaction default models default', 'Non transaction r 1041 INSERT INTO t1 VALUES('the default transaction', 'these semantics present'); 1042 INSERT INTO t1 VALUES('the default transaction', 'these semantics present'); 1042 INSERT INTO t1 VALUES('single request', 'default data'); 1043 INSERT INTO t1 VALUES('single request', 'default data'); 1043 1044 1044 <i>-- The following query returns a single row consisting of a single blob</i> 1045 <i>-- The following query returns a single row consisting of a single blob</i> ................................................................................................................................................................................ 1057 <i>-- 0 of any table row. The 3 hits are spread across 2 different rows.</i> 1058 <i>-- 0 of any table row. The 3 hits are spread across 2 different rows.</i> 1058 <i>--</i> 1059 <i>--</i> 1059 <i>-- The next set of three integers (0 1 1) pertain to the hits for "default" 1060 <i>-- The next set of three integers (0 1 1) pertain to the hits for "default" 1060 <i>-- in column 1 of the table (0 in this row, 1 in all rows, spread across </ 1061 <i>-- in column 1 of the table (0 in this row, 1 in all rows, spread across </ 1061 <i>-- 1 rows).</i> 1062 <i>-- 1 rows).</i> 1062 <i>--</i> 1063 <i>--</i> 1063 SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'default transaction "these semant 1064 SELECT matchinfo(t1) FROM t1 WHERE t1 MATCH 'default transaction "these semant 1064 }] < > 1065 </codeblock> 1065 1066 1066 <p> 1067 <p> 1067 The matchinfo function is much faster than either the snippet or offsets 1068 The matchinfo function is much faster than either the snippet or offsets 1068 functions. This is because the implementation of both snippet and offsets 1069 functions. This is because the implementation of both snippet and offsets 1069 is required to retrieve the documents being analyzed from disk, whereas 1070 is required to retrieve the documents being analyzed from disk, whereas 1070 all data required by matchinfo is available as part of the same portions 1071 all data required by matchinfo is available as part of the same portions 1071 of the full-text index that are required to implement the full-text query 1072 of the full-text index that are required to implement the full-text query 1072 itself. This means that of the following two queries, the first may be 1073 itself. This means that of the following two queries, the first may be 1073 an order of magnitude faster than the second: 1074 an order of magnitude faster than the second: 1074 1075 1075 [Code { | 1076 <codeblock> 1076 SELECT docid, matchinfo(tbl) FROM tbl WHERE tbl MATCH <query expression> 1077 SELECT docid, matchinfo(tbl) FROM tbl WHERE tbl MATCH <query expression> 1077 SELECT docid, offsets(tbl) FROM tbl WHERE tbl MATCH <query expression>; 1078 SELECT docid, offsets(tbl) FROM tbl WHERE tbl MATCH <query expression>; 1078 }] < > 1079 </codeblock> 1079 1080 1080 <p> 1081 <p> 1081 The matchinfo function provides much of the information required to calculate 1082 The matchinfo function provides much of the information required to calculate 1082 probabalistic "bag-of-words" relevancy scores such as 1083 probabalistic "bag-of-words" relevancy scores such as 1083 <a href=http://en.wikipedia.org/wiki/Okapi_BM25>Okapi BM25/BM25F</a> that may 1084 <a href=http://en.wikipedia.org/wiki/Okapi_BM25>Okapi BM25/BM25F</a> that may 1084 be used to order results in a full-text search application. Also often 1085 be used to order results in a full-text search application. Also often 1085 used in such functions is the length or relative length of each document 1086 used in such functions is the length or relative length of each document 1086 or document field. Unfortunately, this information is not made available 1087 or document field. Unfortunately, this information is not made available 1087 by the matchinfo function as it would require loading extra data from the 1088 by the matchinfo function as it would require loading extra data from the 1088 database, potentially slowing matchinfo() down by an order of magnitude. 1089 database, potentially slowing matchinfo() down by an order of magnitude. 1089 One solution is for the application to store the lengths of each document 1090 One solution is for the application to store the lengths of each document 1090 or document field in a separate table for use in calculating relevancy 1091 or document field in a separate table for use in calculating relevancy 1091 scores. Appendix A of this document, "\[search application tips\]", contains | 1092 scores. Appendix A of this document, "[search application tips]", contains 1092 an example of using the matchinfo() function efficiently. 1093 an example of using the matchinfo() function efficiently. 1093 1094 1094 [h1 "Tokenizers" tokenizer {tokenizer}] | 1095 <h1 id=tokenizer tags="tokenizer">Tokenizers</h1> 1095 1096 1096 <p> 1097 <p> 1097 An FTS3 tokenizer is a set of rules for extracting terms from a document 1098 An FTS3 tokenizer is a set of rules for extracting terms from a document 1098 or basic FTS3 full-text query. 1099 or basic FTS3 full-text query. 1099 1100 1100 <p> 1101 <p> 1101 Unless a specific tokenizer is specified as part of the CREATE 1102 Unless a specific tokenizer is specified as part of the CREATE ................................................................................................................................................................................ 1144 against similar English language terms. For more information on the 1145 against similar English language terms. For more information on the 1145 Porter Stemmer algorithm, please refer to the page linked above. 1146 Porter Stemmer algorithm, please refer to the page linked above. 1146 1147 1147 <p> 1148 <p> 1148 Example illustrating the difference between the "simple" and "porter" 1149 Example illustrating the difference between the "simple" and "porter" 1149 tokenizers: 1150 tokenizers: 1150 1151 1151 [Code { | 1152 <codeblock> 1152 <i>-- Create a table using the simple tokenizer. Insert a document into it.</i 1153 <i>-- Create a table using the simple tokenizer. Insert a document into it.</i 1153 CREATE VIRTUAL TABLE simple USING fts3(tokenize=simple); 1154 CREATE VIRTUAL TABLE simple USING fts3(tokenize=simple); 1154 INSERT INTO simple VALUES('Right now they''re very frustrated'); 1155 INSERT INTO simple VALUES('Right now they''re very frustrated'); 1155 1156 1156 <i>-- The first of the following two queries matches the document stored in</i 1157 <i>-- The first of the following two queries matches the document stored in</i 1157 <i>-- table "simple". The second does not.</i> 1158 <i>-- table "simple". The second does not.</i> 1158 SELECT * FROM simple WHERE simple MATCH 'Frustrated'); 1159 SELECT * FROM simple WHERE simple MATCH 'Frustrated'); ................................................................................................................................................................................ 1161 <i>-- Create a table using the porter tokenizer. Insert the same document into 1162 <i>-- Create a table using the porter tokenizer. Insert the same document into 1162 CREATE VIRTUAL TABLE porter USING fts3(tokenize=porter); 1163 CREATE VIRTUAL TABLE porter USING fts3(tokenize=porter); 1163 INSERT INTO porter VALUES('Right now they''re very frustrated'); 1164 INSERT INTO porter VALUES('Right now they''re very frustrated'); 1164 1165 1165 <i>-- Both of the following queries match the document stored in table "porter 1166 <i>-- Both of the following queries match the document stored in table "porter 1166 SELECT * FROM porter WHERE porter MATCH 'Frustrated'); 1167 SELECT * FROM porter WHERE porter MATCH 'Frustrated'); 1167 SELECT * FROM porter WHERE porter MATCH 'Frustration'); 1168 SELECT * FROM porter WHERE porter MATCH 'Frustration'); 1168 }] < > 1169 </codeblock> 1169 1170 1170 <p> 1171 <p> 1171 If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor 1172 If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor 1172 symbol defined, then there exists a built-in tokenizer named "icu" 1173 symbol defined, then there exists a built-in tokenizer named "icu" 1173 implemented using the ICU library. The first argument passed to the 1174 implemented using the ICU library. The first argument passed to the 1174 xCreate() method (see fts3_tokenizer.h) of this tokenizer may be 1175 xCreate() method (see fts3_tokenizer.h) of this tokenizer may be 1175 an ICU locale identifier. For example "tr_TR" for Turkish as used 1176 an ICU locale identifier. For example "tr_TR" for Turkish as used 1176 in Turkey, or "en_AU" for English as used in Australia. For example: 1177 in Turkey, or "en_AU" for English as used in Australia. For example: 1177 1178 1178 [Code { | 1179 <codeblock> 1179 CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH) 1180 CREATE VIRTUAL TABLE thai_text USING fts3(text, tokenize=icu th_TH) 1180 }] < > 1181 </codeblock> 1181 1182 1182 <p> 1183 <p> 1183 The ICU tokenizer implementation is very simple. It splits the input 1184 The ICU tokenizer implementation is very simple. It splits the input 1184 text according to the ICU rules for finding word boundaries and discards 1185 text according to the ICU rules for finding word boundaries and discards 1185 any tokens that consist entirely of white-space. This may be suitable 1186 any tokens that consist entirely of white-space. This may be suitable 1186 for some applications in some locales, but not all. If more complex 1187 for some applications in some locales, but not all. If more complex 1187 processing is required, for example to implement stemming or 1188 processing is required, for example to implement stemming or 1188 discard punctuation, this can be done by creating a tokenizer 1189 discard punctuation, this can be done by creating a tokenizer 1189 implementation that uses the ICU tokenizer as part of its implementation. 1190 implementation that uses the ICU tokenizer as part of its implementation. 1190 1191 1191 [h2 "Custom (User Implemented) Tokenizers"] | 1192 <h2>Custom (User Implemented) Tokenizers</h2> 1192 1193 1193 <p> 1194 <p> 1194 As well as the built-in "simple", "porter" and (possibly) "icu" tokenizers, 1195 As well as the built-in "simple", "porter" and (possibly) "icu" tokenizers, 1195 FTS3 exports an interface that allows users to implement custom tokenizers 1196 FTS3 exports an interface that allows users to implement custom tokenizers 1196 using C. The interface used to create a new tokenizer is defined and 1197 using C. The interface used to create a new tokenizer is defined and 1197 described in the fts3_tokenizer.h source file. 1198 described in the fts3_tokenizer.h source file. 1198 1199 ................................................................................................................................................................................ 1208 FTS3 does not expose a C-function that users call to register new 1209 FTS3 does not expose a C-function that users call to register new 1209 tokenizer types with a database handle. Instead, the pointer must 1210 tokenizer types with a database handle. Instead, the pointer must 1210 be encoded as an SQL blob value and passed to FTS3 through the SQL 1211 be encoded as an SQL blob value and passed to FTS3 through the SQL 1211 engine by evaluating a special scalar function, "fts3_tokenizer()". 1212 engine by evaluating a special scalar function, "fts3_tokenizer()". 1212 The fts3_tokenizer() function may be called with one or two arguments, 1213 The fts3_tokenizer() function may be called with one or two arguments, 1213 as follows: 1214 as follows: 1214 1215 1215 [Code { | 1216 <codeblock> 1216 SELECT fts3_tokenizer(<tokenizer-name>); 1217 SELECT fts3_tokenizer(<tokenizer-name>); 1217 SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module p 1218 SELECT fts3_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module p 1218 }] < > 1219 </codeblock> 1219 1220 1220 <p> 1221 <p> 1221 Where <tokenizer-name> is a string identifying the tokenizer and 1222 Where <tokenizer-name> is a string identifying the tokenizer and 1222 <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module 1223 <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module 1223 structure encoded as an SQL blob. If the second argument is present, 1224 structure encoded as an SQL blob. If the second argument is present, 1224 it is registered as tokenizer <tokenizer-name> and a copy of it 1225 it is registered as tokenizer <tokenizer-name> and a copy of it 1225 returned. If only one argument is passed, a pointer to the tokenizer 1226 returned. If only one argument is passed, a pointer to the tokenizer ................................................................................................................................................................................ 1227 encoded as a blob. Or, if no such tokenizer exists, an SQL exception 1228 encoded as a blob. Or, if no such tokenizer exists, an SQL exception 1228 (error) is raised. 1229 (error) is raised. 1229 1230 1230 <p> 1231 <p> 1231 <b>SECURITY WARNING</b>: If the fts3 extension is used in an environment 1232 <b>SECURITY WARNING</b>: If the fts3 extension is used in an environment 1232 where potentially malicious users may execute arbitrary SQL, they should 1233 where potentially malicious users may execute arbitrary SQL, they should 1233 be prevented from invoking the fts3_tokenizer() function, possibly using 1234 be prevented from invoking the fts3_tokenizer() function, possibly using 1234 the \[sqlite3_set_authorizer()|authorisation callback\]. | 1235 the [sqlite3_set_authorizer()|authorisation callback]. 1235 1236 1236 <p> 1237 <p> 1237 The following block contains an example of calling the fts3_tokenizer() 1238 The following block contains an example of calling the fts3_tokenizer() 1238 function from C code: 1239 function from C code: 1239 1240 1240 [Code { | 1241 <codeblock> 1241 <i>/* 1242 <i>/* 1242 ** Register a tokenizer implementation with FTS3. 1243 ** Register a tokenizer implementation with FTS3. 1243 */</i> 1244 */</i> 1244 int registerTokenizer( 1245 int registerTokenizer( 1245 sqlite3 *db, 1246 sqlite3 *db, 1246 char *zName, 1247 char *zName, 1247 const sqlite3_tokenizer_module *p 1248 const sqlite3_tokenizer_module *p ................................................................................................................................................................................ 1285 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ 1286 if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){ 1286 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); 1287 memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp)); 1287 } 1288 } 1288 } 1289 } 1289 1290 1290 return sqlite3_finalize(pStmt); 1291 return sqlite3_finalize(pStmt); 1291 } 1292 } 1292 }] < > 1293 </codeblock> 1293 1294 1294 1295 1295 < 1296 < 1297 [h1 "Data Structures" {} {segment btree}] | 1296 <h1 tags="segment btree">Data Structures</h1> 1298 1297 1299 <p> 1298 <p> 1300 This section describes at a high-level the way the FTS3 module stores its 1299 This section describes at a high-level the way the FTS3 module stores its 1301 index and content in the database. It is <b>not necessary to read or 1300 index and content in the database. It is <b>not necessary to read or 1302 understand the material in this section in order to use FTS3</b> in an 1301 understand the material in this section in order to use FTS3</b> in an 1303 application. However, it may be useful to application developers attempting 1302 application. However, it may be useful to application developers attempting 1304 to analyze and understand FTS3 performance characteristics, or to developers 1303 to analyze and understand FTS3 performance characteristics, or to developers ................................................................................................................................................................................ 1315 named "docid". Following this is one column for each column of the FTS3 1314 named "docid". Following this is one column for each column of the FTS3 1316 virtual table as declared by the user, named by prepending the column name 1315 virtual table as declared by the user, named by prepending the column name 1317 supplied by the user with "c<i>N</i>", where <i>N</i> is the index of the 1316 supplied by the user with "c<i>N</i>", where <i>N</i> is the index of the 1318 column within the table, numbered from left to right starting with 1. Data 1317 column within the table, numbered from left to right starting with 1. Data 1319 types supplied as part of the virtual table declaration are not used as 1318 types supplied as part of the virtual table declaration are not used as 1320 part of the %_content table declaration. For example: 1319 part of the %_content table declaration. For example: 1321 1320 1322 [Code { | 1321 <codeblock> 1323 <i>-- Virtual table declaration</i> 1322 <i>-- Virtual table declaration</i> 1324 CREATE VIRTUAL TABLE abc USING FTS3(a NUMBER, b TEXT, c); 1323 CREATE VIRTUAL TABLE abc USING FTS3(a NUMBER, b TEXT, c); 1325 1324 1326 <i>-- Corresponding %_content table declaration</i> 1325 <i>-- Corresponding %_content table declaration</i> 1327 CREATE TABLE abc_content(docid INTEGER PRIMARY KEY, c0a, c1b, c2c); 1326 CREATE TABLE abc_content(docid INTEGER PRIMARY KEY, c0a, c1b, c2c); 1328 }] < > 1327 </codeblock> 1329 1328 1330 <p> 1329 <p> 1331 The %_content table contains the unadulterated data inserted by the user 1330 The %_content table contains the unadulterated data inserted by the user 1332 into the FTS3 virtual table by the user. If the user does not explicitly 1331 into the FTS3 virtual table by the user. If the user does not explicitly 1333 supply a "docid" value when inserting records, one is selected automatically 1332 supply a "docid" value when inserting records, one is selected automatically 1334 by the system. 1333 by the system. 1335 1334 ................................................................................................................................................................................ 1340 %_content table that contain one or more occurrences of the term. To 1339 %_content table that contain one or more occurrences of the term. To 1341 retrieve all documents that contain a specified term, the FTS3 module 1340 retrieve all documents that contain a specified term, the FTS3 module 1342 queries this index to determine the set of docid values for records that 1341 queries this index to determine the set of docid values for records that 1343 contain the term, then retrieves the required documents from the %_content 1342 contain the term, then retrieves the required documents from the %_content 1344 table. Regardless of the schema of the FTS3 virtual table, the %_segments 1343 table. Regardless of the schema of the FTS3 virtual table, the %_segments 1345 and %_segdir tables are always created as follows: 1344 and %_segdir tables are always created as follows: 1346 1345 1347 [Code { | 1346 <codeblock> 1348 CREATE TABLE %_segments( 1347 CREATE TABLE %_segments( 1349 blockid INTEGER PRIMARY KEY, <i>-- B-tree node id</i> 1348 blockid INTEGER PRIMARY KEY, <i>-- B-tree node id</i> 1350 block blob <i>-- B-tree node data</i> 1349 block blob <i>-- B-tree node data</i> 1351 ); 1350 ); 1352 1351 1353 CREATE TABLE %_segdir( 1352 CREATE TABLE %_segdir( 1354 level INTEGER, 1353 level INTEGER, ................................................................................................................................................................................ 1355 idx INTEGER, 1354 idx INTEGER, 1356 start_block INTEGER, <i>-- Blockid of first node in %_segments 1355 start_block INTEGER, <i>-- Blockid of first node in %_segments 1357 leaves_end_block INTEGER, <i>-- Blockid of last leaf node in %_segm 1356 leaves_end_block INTEGER, <i>-- Blockid of last leaf node in %_segm 1358 end_block INTEGER, <i>-- Blockid of last node in %_segments< 1357 end_block INTEGER, <i>-- Blockid of last node in %_segments< 1359 root BLOB, <i>-- B-tree root node</i> 1358 root BLOB, <i>-- B-tree root node</i> 1360 PRIMARY KEY(level, idx) 1359 PRIMARY KEY(level, idx) 1361 ); 1360 ); 1362 }] < > 1361 </codeblock> 1363 1362 1364 <p> 1363 <p> 1365 The schema depicted above is not designed to store the full-text index 1364 The schema depicted above is not designed to store the full-text index 1366 directly. Instead, it is used to one or more b-tree structures. There 1365 directly. Instead, it is used to one or more b-tree structures. There 1367 is one b-tree for each row in the %_segdir table. The %_segdir table 1366 is one b-tree for each row in the %_segdir table. The %_segdir table 1368 row contains the root node and various meta-data associated with the 1367 row contains the root node and various meta-data associated with the 1369 b-tree structure, and the %_segments table contains all other (non-root) 1368 b-tree structure, and the %_segments table contains all other (non-root) ................................................................................................................................................................................ 1410 (or b-trees) later on. Merging of b-tree structures can be performed as 1409 (or b-trees) later on. Merging of b-tree structures can be performed as 1411 a background task, or once a certain number of separate b-tree structures 1410 a background task, or once a certain number of separate b-tree structures 1412 have been accumulated. Of course, this scheme makes queries more expensive 1411 have been accumulated. Of course, this scheme makes queries more expensive 1413 (as the FTS3 code may have to look up individual terms in more than one 1412 (as the FTS3 code may have to look up individual terms in more than one 1414 b-tree and merge the results), but it has been found that in practice this 1413 b-tree and merge the results), but it has been found that in practice this 1415 overhead is often negligible. 1414 overhead is often negligible. 1416 1415 1417 [h2 "Variable Length Integer (varint) Format"] | 1416 <h2>Variable Length Integer (varint) Format</h2> 1418 1417 1419 <p> 1418 <p> 1420 Integer values stored as part of segment b-tree nodes are encoded using the 1419 Integer values stored as part of segment b-tree nodes are encoded using the 1421 FTS3 varint format. This encoding is similar, but <b>not identical</b>, to the 1420 FTS3 varint format. This encoding is similar, but <b>not identical</b>, to the 1422 the <a href="fileformat.html#varint_format">SQLite varint format</a>. 1421 the <a href="fileformat.html#varint_format">SQLite varint format</a>. 1423 1422 1424 <p> 1423 <p> ................................................................................................................................................................................ 1437 is stored in the remaining seven least signficant bits of each byte. 1436 is stored in the remaining seven least signficant bits of each byte. 1438 The first byte of the encoded representation contains the least significant 1437 The first byte of the encoded representation contains the least significant 1439 seven bits of the encoded integer value. The second byte of the encoded 1438 seven bits of the encoded integer value. The second byte of the encoded 1440 representation, if it is present, contains the seven next least significant 1439 representation, if it is present, contains the seven next least significant 1441 bits of the integer value, and so on. The following table contains examples 1440 bits of the integer value, and so on. The following table contains examples 1442 of encoded integer values: 1441 of encoded integer values: 1443 1442 1444 [Table] | 1443 <table striped=1> 1445 [Tr]<th>Decimal<th>Hexadecimal<th width=100%>Encoded Representation | 1444 <tr><th>Decimal<th>Hexadecimal<th width=100%>Encoded Representation 1446 [Tr]<td>43<td>0x000000000000002B<td>0x2B | 1445 <tr><td>43<td>0x000000000000002B<td>0x2B 1447 [Tr]<td>200815<td>0x000000000003106F<td>0x9C 0xA0 0x0C | 1446 <tr><td>200815<td>0x000000000003106F<td>0x9C 0xA0 0x0C 1448 [Tr]<td>-1<td>0xFFFFFFFFFFFFFFFF<td>0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0x | 1447 <tr><td>-1<td>0xFFFFFFFFFFFFFFFF<td>0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0xFF 0x 1449 </table> 1448 </table> 1450 1449 1451 1450 1452 [h2 "Segment B-Tree Format"] | 1451 <h2>Segment B-Tree Format</h2> 1453 1452 1454 <p> 1453 <p> 1455 Segment b-trees are prefix-compressed b+-trees. There is one segment b-tree 1454 Segment b-trees are prefix-compressed b+-trees. There is one segment b-tree 1456 for each row in the %_segdir table (see above). The root node of the segment 1455 for each row in the %_segdir table (see above). The root node of the segment 1457 b-tree is stored as a blob in the "root" field of the corresponding row 1456 b-tree is stored as a blob in the "root" field of the corresponding row 1458 of the %_segdir table. All other nodes (if any exist) are stored in the 1457 of the %_segdir table. All other nodes (if any exist) are stored in the 1459 "blob" column of the %_segments table. Nodes within the %_segments table are 1458 "blob" column of the %_segments table. Nodes within the %_segments table are 1460 identified by the integer value in the blockid field of the corresponding 1459 identified by the integer value in the blockid field of the corresponding 1461 row. The following table describes the fields of the %_segdir table: 1460 row. The following table describes the fields of the %_segdir table: 1462 1461 1463 [Table] | 1462 <table striped=1> 1464 [Tr]<th>Column <th width=100%>Interpretion | 1463 <tr><th>Column <th width=100%>Interpretion 1465 [Tr]<td>level <td> | 1464 <tr><td>level <td> 1466 Between them, the contents of the "level" and "idx" fields define the 1465 Between them, the contents of the "level" and "idx" fields define the 1467 relative age of the segment b-tree. The smaller the value stored in the 1466 relative age of the segment b-tree. The smaller the value stored in the 1468 "level" field, the more recently the segment b-tree was created. If two 1467 "level" field, the more recently the segment b-tree was created. If two 1469 segment b-trees are of the same "level", the segment with the larger 1468 segment b-trees are of the same "level", the segment with the larger 1470 value stored in the "idx" column is more recent. The PRIMARY KEY constraint 1469 value stored in the "idx" column is more recent. The PRIMARY KEY constraint 1471 on the %_segdir table prevents any two segments from having the same value 1470 on the %_segdir table prevents any two segments from having the same value 1472 for both the "level" and "idx" fields. 1471 for both the "level" and "idx" fields. 1473 [Tr]<td>idx <td> See above. | 1472 <tr><td>idx <td> See above. 1474 [Tr]<td>start_block <td> | 1473 <tr><td>start_block <td> 1475 The blockid that corresponds to the node with the smallest blockid that 1474 The blockid that corresponds to the node with the smallest blockid that 1476 belongs to this segment b-tree. Or zero if the entire segment b-tree 1475 belongs to this segment b-tree. Or zero if the entire segment b-tree 1477 fits on the root node. If it exists, this node is always a leaf node. 1476 fits on the root node. If it exists, this node is always a leaf node. 1478 [Tr]<td>leaves_end_block <td> | 1477 <tr><td>leaves_end_block <td> 1479 The blockid that corresponds to the leaf node with the largest blockid 1478 The blockid that corresponds to the leaf node with the largest blockid 1480 that belongs to this segment b-tree. Or zero if the entire segment b-tree 1479 that belongs to this segment b-tree. Or zero if the entire segment b-tree 1481 fits on the root node. 1480 fits on the root node. 1482 [Tr]<td>end_block <td> | 1481 <tr><td>end_block <td> 1483 The blockid that corresponds to the interior node with the largest 1482 The blockid that corresponds to the interior node with the largest 1484 blockid that belongs to this segment b-tree. Or zero if the entire segment 1483 blockid that belongs to this segment b-tree. Or zero if the entire segment 1485 b-tree fits on the root node. If it exists, this node is always an 1484 b-tree fits on the root node. If it exists, this node is always an 1486 interior node. 1485 interior node. 1487 [Tr]<td>root <td> | 1486 <tr><td>root <td> 1488 Blob containing the root node of the segment b-tree. 1487 Blob containing the root node of the segment b-tree. 1489 </table> 1488 </table> 1490 1489 1491 <p> 1490 <p> 1492 Apart from the root node, the nodes that make up a single segment b-tree are 1491 Apart from the root node, the nodes that make up a single segment b-tree are 1493 always stored using a contiguous sequence of blockids. Furthermore, the 1492 always stored using a contiguous sequence of blockids. Furthermore, the 1494 nodes that make up a single level of the b-tree are themselves stored as 1493 nodes that make up a single level of the b-tree are themselves stored as ................................................................................................................................................................................ 1496 used to store the b-tree leaves are allocated starting with the blockid 1495 used to store the b-tree leaves are allocated starting with the blockid 1497 value stored in the "start_block" column of the corresponding %_segdir row, 1496 value stored in the "start_block" column of the corresponding %_segdir row, 1498 and finishing at the blockid value stored in the "leaves_end_block" 1497 and finishing at the blockid value stored in the "leaves_end_block" 1499 field of the same row. It is therefore possible to iterate through all the 1498 field of the same row. It is therefore possible to iterate through all the 1500 leaves of a segment b-tree, in key order, by traversing the %_segments 1499 leaves of a segment b-tree, in key order, by traversing the %_segments 1501 table in blockid order from "start_block" to "leaves_end_block". 1500 table in blockid order from "start_block" to "leaves_end_block". 1502 1501 1503 [h3 "Segment B-Tree Leaf Nodes"] | 1502 <h3>Segment B-Tree Leaf Nodes</h3> 1504 1503 1505 <p> 1504 <p> 1506 The following diagram depicts the format of a segment b-tree leaf node. 1505 The following diagram depicts the format of a segment b-tree leaf node. 1507 1506 > 1507 <center> > 1508 <img src=images/fts3_leaf_node.png> 1508 [Fig fts3_leaf_node.png "Segment B-Tree Leaf Node Format"] | 1509 <p> Segment B-Tree Leaf Node Format > 1510 </center> 1509 1511 1510 <p> 1512 <p> 1511 The first term stored on each node ("Term 1" in the figure above) is 1513 The first term stored on each node ("Term 1" in the figure above) is 1512 stored verbatim. Each subsequent term is prefix-compressed with respect 1514 stored verbatim. Each subsequent term is prefix-compressed with respect 1513 to its predecessor. Terms are stored within a page in sorted (memcmp) 1515 to its predecessor. Terms are stored within a page in sorted (memcmp) 1514 order. 1516 order. 1515 1517 1516 [h3 "Segment B-Tree Interior Nodes"] | 1518 <h3>Segment B-Tree Interior Nodes</h3> 1517 1519 1518 <p> 1520 <p> 1519 The following diagram depicts the format of a segment b-tree interior 1521 The following diagram depicts the format of a segment b-tree interior 1520 (non-leaf) node. 1522 (non-leaf) node. 1521 1523 > 1524 <center> > 1525 <img src=images/fts3_interior_node.png> 1522 [Fig fts3_interior_node.png "Segment B-Tree Interior Node Format"] | 1526 <p> Segment B-Tree Interior Node Format > 1527 </center> 1523 1528 1524 1529 1525 [h2 "Doclist Format"] | 1530 <h2>Doclist Format</h2> 1526 1531 1527 <p> 1532 <p> 1528 A doclist consists of an array of 64-bit signed integers, serialized using 1533 A doclist consists of an array of 64-bit signed integers, serialized using 1529 the FTS3 varint format. Each doclist entry is made up of a series of two 1534 the FTS3 varint format. Each doclist entry is made up of a series of two 1530 or more integers, as follows: 1535 or more integers, as follows: 1531 1536 1532 <ol> 1537 <ol> ................................................................................................................................................................................ 1546 of storing the term-offset value literally, each integer stored 1551 of storing the term-offset value literally, each integer stored 1547 is the difference between the current term-offset and the previous 1552 is the difference between the current term-offset and the previous 1548 one (or zero if the current term-offset is the first), plus 2. 1553 one (or zero if the current term-offset is the first), plus 2. 1549 </ol> 1554 </ol> 1550 <li> Constant value 0. 1555 <li> Constant value 0. 1551 </ol> 1556 </ol> 1552 1557 > 1558 <center> > 1559 <img src=images/fts3_doclist2.png> 1553 [Fig fts3_doclist2.png "FTS3 Doclist Format"] | 1560 <p> FTS3 Doclist Format > 1561 </center> 1554 1562 > 1563 <center> > 1564 <img src=images/fts3_doclist.png> 1555 [Fig fts3_doclist.png "FTS3 Doclist Entry Format"] | 1565 <p> FTS3 Doclist Entry Format > 1566 </center> 1556 1567 1557 <p> 1568 <p> 1558 For doclists for which the term appears in more than one column of the FTS3 1569 For doclists for which the term appears in more than one column of the FTS3 1559 virtual table, term-offset lists within the doclist are stored in column 1570 virtual table, term-offset lists within the doclist are stored in column 1560 number order. This ensures that the term-offset list associated with 1571 number order. This ensures that the term-offset list associated with 1561 column 0 (if any) is always first, allowing the first two fields of the 1572 column 0 (if any) is always first, allowing the first two fields of the 1562 term-offset list to be omitted in this case. 1573 term-offset list to be omitted in this case. 1563 1574 > 1575 <h1 id=appendix_a nonumber tags="search application tips"> 1564 [h1 "Appendix A: Search Application Tips" {} "search application tips"] | 1576 Appendix A: Search Application Tips > 1577 </h1> 1565 1578 1566 <p> 1579 <p> 1567 FTS3 is primarily designed to support Boolean full-text queries - queries 1580 FTS3 is primarily designed to support Boolean full-text queries - queries 1568 to find the set of documents that match a specified criteria. However, many 1581 to find the set of documents that match a specified criteria. However, many 1569 (most?) search applications require that results are somehow ranked in order 1582 (most?) search applications require that results are somehow ranked in order 1570 of "relevance", where "relevance" is defined as the likelihood that the user 1583 of "relevance", where "relevance" is defined as the likelihood that the user 1571 who performed the search is interested in a specific element of the returned 1584 who performed the search is interested in a specific element of the returned ................................................................................................................................................................................ 1578 1591 1579 <p> 1592 <p> 1580 One very simple scheme might be to count the number of instances of the 1593 One very simple scheme might be to count the number of instances of the 1581 users search terms in each result document. Those documents that contain 1594 users search terms in each result document. Those documents that contain 1582 many instances of the terms are considered more relevant than those with 1595 many instances of the terms are considered more relevant than those with 1583 a small number of instances of each term. In an FTS3 application, the 1596 a small number of instances of each term. In an FTS3 application, the 1584 number of term instances in each result could be determined by counting 1597 number of term instances in each result could be determined by counting 1585 the number of integers in the return value of the \[offsets\] function. | 1598 the number of integers in the return value of the [offsets] function. 1586 The following example shows a query that could be used to obtain the 1599 The following example shows a query that could be used to obtain the 1587 ten most relevant results for a query entered by the user: 1600 ten most relevant results for a query entered by the user: 1588 1601 1589 [Code { | 1602 <codeblock> 1590 <i>-- This example (and all others in this section) assumes the following sche 1603 <i>-- This example (and all others in this section) assumes the following sche 1591 CREATE VIRTUAL TABLE documents USING fts3(title, content); 1604 CREATE VIRTUAL TABLE documents USING fts3(title, content); 1592 1605 1593 <i>-- Assuming the application has supplied an SQLite user function named "cou 1606 <i>-- Assuming the application has supplied an SQLite user function named "cou 1594 <i>-- that returns the number of space-separated integers contained in its onl 1607 <i>-- that returns the number of space-separated integers contained in its onl 1595 <i>-- the following query could be used to return the titles of the 10 documen 1608 <i>-- the following query could be used to return the titles of the 10 documen 1596 <i>-- the greatest number of instances of the users query terms. Hopefully, th 1609 <i>-- the greatest number of instances of the users query terms. Hopefully, th 1597 <i>-- documents will be those that the users considers more or less the most " 1610 <i>-- documents will be those that the users considers more or less the most " 1598 SELECT title FROM documents 1611 SELECT title FROM documents 1599 WHERE documents MATCH <query> 1612 WHERE documents MATCH <query> 1600 ORDER BY countintegers(offsets(document)) DESC 1613 ORDER BY countintegers(offsets(document)) DESC 1601 OFFSET 0 LIMIT 10 1614 OFFSET 0 LIMIT 10 1602 }] < > 1615 </codeblock> 1603 1616 1604 <p> 1617 <p> 1605 The query above could be made to run faster by using the FTS3 \[matchinfo\] | 1618 The query above could be made to run faster by using the FTS3 [matchinfo] 1606 function to determine the number of query term instances that appear in each 1619 function to determine the number of query term instances that appear in each 1607 result. The matchinfo function is much more efficient than the offsets 1620 result. The matchinfo function is much more efficient than the offsets 1608 function. Furthermore, the matchinfo function provides extra information 1621 function. Furthermore, the matchinfo function provides extra information 1609 regarding the overall number of occurences of each query term in the entire 1622 regarding the overall number of occurences of each query term in the entire 1610 document set (not just the current row) and the number of documents in which 1623 document set (not just the current row) and the number of documents in which 1611 each query term appears. This may be used (for example) to attach a higher 1624 each query term appears. This may be used (for example) to attach a higher 1612 weight to less common terms which may increase the overall computed relevancy 1625 weight to less common terms which may increase the overall computed relevancy 1613 of those results the user considers more interesting. 1626 of those results the user considers more interesting. 1614 1627 1615 [Code { | 1628 <codeblock> 1616 <i>-- If the application supplies an SQLite user function called "rank" that</ 1629 <i>-- If the application supplies an SQLite user function called "rank" that</ 1617 <i>-- interprets the blob of data returned by matchinfo and returns a numeric< 1630 <i>-- interprets the blob of data returned by matchinfo and returns a numeric< 1618 <i>-- relevancy based on it, then the following SQL may be used to return the< 1631 <i>-- relevancy based on it, then the following SQL may be used to return the< 1619 <i>-- titles of the 10 most relevant documents in the dataset for a users quer 1632 <i>-- titles of the 10 most relevant documents in the dataset for a users quer 1620 SELECT title FROM documents 1633 SELECT title FROM documents 1621 WHERE documents MATCH <query> 1634 WHERE documents MATCH <query> 1622 ORDER BY rank(matchinfo(document)) DESC 1635 ORDER BY rank(matchinfo(document)) DESC 1623 OFFSET 0 LIMIT 10 1636 OFFSET 0 LIMIT 10 1624 }] < > 1637 </codeblock> 1625 1638 1626 <p> 1639 <p> 1627 The SQL query in the example above uses less CPU than the first example 1640 The SQL query in the example above uses less CPU than the first example 1628 in this section, but still has a non-obvious performance problem. SQLite 1641 in this section, but still has a non-obvious performance problem. SQLite 1629 satisfies this query by retreiving the value of the "title" column and 1642 satisfies this query by retreiving the value of the "title" column and 1630 matchinfo data from the FTS3 module for every row matched by the users 1643 matchinfo data from the FTS3 module for every row matched by the users 1631 query before it sorts and limits the results. Because of the way SQLite's 1644 query before it sorts and limits the results. Because of the way SQLite's ................................................................................................................................................................................ 1644 This means that SQLite will load only the docid and matchinfo data for each 1657 This means that SQLite will load only the docid and matchinfo data for each 1645 row matching the users query into memory, determine the docid values 1658 row matching the users query into memory, determine the docid values 1646 corresponding to the ten most relevant documents, then load only the title 1659 corresponding to the ten most relevant documents, then load only the title 1647 and content information for those 10 documents only. Because both the matchinf 1660 and content information for those 10 documents only. Because both the matchinf 1648 and docid values are gleaned entirely from the full-text index, this results 1661 and docid values are gleaned entirely from the full-text index, this results 1649 in dramatically less data being loaded from the database into memory. 1662 in dramatically less data being loaded from the database into memory. 1650 1663 1651 [Code { | 1664 <codeblock> 1652 SELECT title FROM documents JOIN ( 1665 SELECT title FROM documents JOIN ( 1653 SELECT docid, rank(matchinfo(document)) AS rank 1666 SELECT docid, rank(matchinfo(document)) AS rank 1654 FROM documents 1667 FROM documents 1655 WHERE documents MATCH <query> 1668 WHERE documents MATCH <query> 1656 ORDER BY rank DESC 1669 ORDER BY rank DESC 1657 OFFSET 0 LIMIT 10 1670 OFFSET 0 LIMIT 10 1658 ) AS ranktable USING(docid) 1671 ) AS ranktable USING(docid) 1659 ORDER BY ranktable.rank DESC 1672 ORDER BY ranktable.rank DESC 1660 }] < > 1673 </codeblock> 1661 1674 1662 <p> 1675 <p> 1663 The next block of SQL enhances the query with solutions to two other problems 1676 The next block of SQL enhances the query with solutions to two other problems 1664 that may arise in developing search applications using FTS3: 1677 that may arise in developing search applications using FTS3: 1665 1678 1666 <ol> 1679 <ol> 1667 <li> <p> 1680 <li> <p> 1668 The \[snippet\] function cannot be used with the above query. Because | 1681 The [snippet] function cannot be used with the above query. Because 1669 the outer query does not include a "WHERE ... MATCH" clause, the snippet 1682 the outer query does not include a "WHERE ... MATCH" clause, the snippet 1670 function may not be used with it. One solution is to duplicate the WHERE 1683 function may not be used with it. One solution is to duplicate the WHERE 1671 clause used by the sub-query in the outer query. The overhead associated 1684 clause used by the sub-query in the outer query. The overhead associated 1672 with this is usually negligible. 1685 with this is usually negligible. 1673 <li> <p> 1686 <li> <p> 1674 The relevancy of a document may depend on something other than just 1687 The relevancy of a document may depend on something other than just 1675 the data available in the return value of matchinfo. For example 1688 the data available in the return value of matchinfo. For example ................................................................................................................................................................................ 1681 </ol> 1694 </ol> 1682 1695 1683 <p> 1696 <p> 1684 This version of the query is very similar to that used by the 1697 This version of the query is very similar to that used by the 1685 <a href="http://www.sqlite.org/search?q=fts3">sqlite.org documentation search< 1698 <a href="http://www.sqlite.org/search?q=fts3">sqlite.org documentation search< 1686 application. 1699 application. 1687 1700 1688 [Code { | 1701 <codeblock> 1689 <i>-- This table stores the static weight assigned to each document in FTS3 ta 1702 <i>-- This table stores the static weight assigned to each document in FTS3 ta 1690 <i>-- "documents". For each row in the documents table there is a correspondin 1703 <i>-- "documents". For each row in the documents table there is a correspondin 1691 <i>-- with the same docid value in this table.</i> 1704 <i>-- with the same docid value in this table.</i> 1692 CREATE TABLE documents_data(docid INTEGER PRIMARY KEY, weight); 1705 CREATE TABLE documents_data(docid INTEGER PRIMARY KEY, weight); 1693 1706 1694 <i>-- This query is similar to the one in the block above, except that:</i> 1707 <i>-- This query is similar to the one in the block above, except that:</i> 1695 <i>--</i> 1708 <i>--</i> ................................................................................................................................................................................ 1705 FROM documents JOIN documents_data USING(docid) 1718 FROM documents JOIN documents_data USING(docid) 1706 WHERE documents MATCH <query> 1719 WHERE documents MATCH <query> 1707 ORDER BY rank DESC 1720 ORDER BY rank DESC 1708 OFFSET 0 LIMIT 10 1721 OFFSET 0 LIMIT 10 1709 ) AS ranktable USING(docid) 1722 ) AS ranktable USING(docid) 1710 WHERE documents MATCH <query> 1723 WHERE documents MATCH <query> 1711 ORDER BY ranktable.rank DESC 1724 ORDER BY ranktable.rank DESC 1712 }] < > 1725 </codeblock> 1713 1726 1714 <p> 1727 <p> 1715 All the example queries above return the ten most relevant query results. 1728 All the example queries above return the ten most relevant query results. 1716 By modifying the values used with the OFFSET and LIMIT clauses, a query 1729 By modifying the values used with the OFFSET and LIMIT clauses, a query 1717 to return (say) the next ten most relevant results is easy to construct. 1730 to return (say) the next ten most relevant results is easy to construct. 1718 This may be used to obtain the data required for a search applications second 1731 This may be used to obtain the data required for a search applications second 1719 and subsequent pages of results. 1732 and subsequent pages of results. 1720 1733 1721 <p> 1734 <p> 1722 The next block contains an example rank function that uses matchinfo data 1735 The next block contains an example rank function that uses matchinfo data 1723 implemented in C. Instead of a single weight, it allows a weight to be 1736 implemented in C. Instead of a single weight, it allows a weight to be 1724 externally assigned to each column of each document. It may be registered 1737 externally assigned to each column of each document. It may be registered 1725 with SQLite like any other user function using \[sqlite3_create_function()\]. | 1738 with SQLite like any other user function using [sqlite3_create_function]. 1726 1739 1727 [Code [string map {[ [ ] ]} { | 1740 <codeblock> 1728 <i>/*</i> 1741 <i>/*</i> 1729 <i>** SQLite user defined function to use with matchinfo() to calculate the</i> 1742 <i>** SQLite user defined function to use with matchinfo() to calculate the</i> 1730 <i>** relevancy of an FTS3 match. The value returned is the relevancy score</i> 1743 <i>** relevancy of an FTS3 match. The value returned is the relevancy score</i> 1731 <i>** (a real value greater than or equal to zero). A larger value indicates </i 1744 <i>** (a real value greater than or equal to zero). A larger value indicates </i 1732 <i>** a more relevant document.</i> 1745 <i>** a more relevant document.</i> 1733 <i>**</i> 1746 <i>**</i> 1734 <i>** The overall relevancy returned is the sum of the relevancies of each </i> 1747 <i>** The overall relevancy returned is the sum of the relevancies of each </i> ................................................................................................................................................................................ 1771 <i> /* Check that the number of arguments passed to this function is correct.</ 1784 <i> /* Check that the number of arguments passed to this function is correct.</ 1772 <i> ** If not, jump to wrong_number_args. Set aMatchinfo to point to the array< 1785 <i> ** If not, jump to wrong_number_args. Set aMatchinfo to point to the array< 1773 <i> ** of unsigned integer values returned by FTS3 function matchinfo. Set</i> 1786 <i> ** of unsigned integer values returned by FTS3 function matchinfo. Set</i> 1774 <i> ** nPhrase to contain the number of reportable phrases in the users full-te 1787 <i> ** nPhrase to contain the number of reportable phrases in the users full-te 1775 <i> ** query, and nCol to the number of columns in the table.</i> 1788 <i> ** query, and nCol to the number of columns in the table.</i> 1776 <i> */</i> 1789 <i> */</i> 1777 if( nVal<1 ) goto wrong_number_args; 1790 if( nVal<1 ) goto wrong_number_args; 1778 aMatchinfo = (unsigned int *)sqlite3_value_blob(apVal[0]); | 1791 aMatchinfo = (unsigned int *)sqlite3_value_blob(apVal[0]); 1779 nPhrase = aMatchinfo[0]; | 1792 nPhrase = aMatchinfo[0]; 1780 nCol = aMatchinfo[1]; | 1793 nCol = aMatchinfo[1]; 1781 if( nVal!=(1+nCol) ) goto wrong_number_args; 1794 if( nVal!=(1+nCol) ) goto wrong_number_args; 1782 1795 1783 <i> /* Iterate through each phrase in the users query. */</i> 1796 <i> /* Iterate through each phrase in the users query. */</i> 1784 for(iPhrase=0; iPhrase<nPhrase; iPhrase++){ 1797 for(iPhrase=0; iPhrase<nPhrase; iPhrase++){ 1785 int iCol; <i>/* Current column */</i> 1798 int iCol; <i>/* Current column */</i> 1786 1799 1787 <i> /* Now iterate through each column in the users query. For each column,</ 1800 <i> /* Now iterate through each column in the users query. For each column,</ 1788 <i> ** increment the relevancy score by:</i> 1801 <i> ** increment the relevancy score by:</i> 1789 <i> **</i> 1802 <i> **</i> 1790 <i> ** (<hit count> / <global hit count>) * <column weight> 1803 <i> ** (<hit count> / <global hit count>) * <column weight> 1791 <i> **</i> 1804 <i> **</i> 1792 <i> ** aPhraseinfo[] points to the start of the data for phrase iPhrase. So</ | 1805 <i> ** aPhraseinfo[] points to the start of the data for phrase iPh 1793 <i> ** the hit count and global hit counts for each column are found in </i> 1806 <i> ** the hit count and global hit counts for each column are found in </i> 1794 <i> ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], respectively.</i> | 1807 <i> ** aPhraseinfo[iCol*3] and aPhraseinfo[iCol*3+1], res 1795 <i> */</i> 1808 <i> */</i> 1796 int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3]; | 1809 int *aPhraseinfo = &aMatchinfo[2 + iPhrase*nCol*3]; 1797 for(iCol=0; iCol<nCol; iCol++){ 1810 for(iCol=0; iCol<nCol; iCol++){ 1798 int nHitCount = aPhraseinfo[3*iCol]; | 1811 int nHitCount = aPhraseinfo[3*iCol]; 1799 int nGlobalHitCount = aPhraseinfo[3*iCol+1]; | 1812 int nGlobalHitCount = aPhraseinfo[3*iCol+1]; 1800 double weight = sqlite3_value_double(apVal[iCol+1]); | 1813 double weight = sqlite3_value_double(apVal[iCol+1]); 1801 if( nHitCount>0 ){ 1814 if( nHitCount>0 ){ 1802 score += ((double)nHitCount / (double)nGlobalHitCount) * weight; 1815 score += ((double)nHitCount / (double)nGlobalHitCount) * weight; 1803 } 1816 } 1804 } 1817 } 1805 } 1818 } 1806 1819 1807 sqlite3_result_double(pCtx, score); 1820 sqlite3_result_double(pCtx, score); 1808 return; 1821 return; 1809 1822 1810 <i> /* Jump here if the wrong number of arguments are passed to this function * 1823 <i> /* Jump here if the wrong number of arguments are passed to this function * 1811 wrong_number_args: 1824 wrong_number_args: 1812 sqlite3_result_error(pCtx, "wrong number of arguments to function rank()", -1) 1825 sqlite3_result_error(pCtx, "wrong number of arguments to function rank()", -1) 1813 } 1826 } 1814 }]] < 1815 < 1816 } < 1817 < > 1827 </codeblock>
Changes to wrap.tcl
35 # the final output. 35 # the final output. 36 # 36 # 37 set DOC [lindex $argv 0] 37 set DOC [lindex $argv 0] 38 set SRC [lindex $argv 1] 38 set SRC [lindex $argv 1] 39 set DEST [lindex $argv 2] 39 set DEST [lindex $argv 2] 40 set HOMEDIR [pwd] ;# Also remember our home directory. 40 set HOMEDIR [pwd] ;# Also remember our home directory. 41 41 > 42 source [file dirname [info script]]/pages/fancyformat.tcl > 43 42 # Open the SQLite database. 44 # Open the SQLite database. 43 # 45 # 44 sqlite3 db docinfo.db 46 sqlite3 db docinfo.db 45 db eval { 47 db eval { 46 BEGIN; 48 BEGIN; 47 DELETE FROM link; 49 DELETE FROM link; 48 DELETE FROM keyword; 50 DELETE FROM keyword; ................................................................................................................................................................................ 403 /* rounded corners */ 405 /* rounded corners */ 404 .se { background: url(${path}images/se.png) 100% 100% no-repeat #80a796} 406 .se { background: url(${path}images/se.png) 100% 100% no-repeat #80a796} 405 .sw { background: url(${path}images/sw.png) 0% 100% no-repeat } 407 .sw { background: url(${path}images/sw.png) 0% 100% no-repeat } 406 .ne { background: url(${path}images/ne.png) 100% 0% no-repeat } 408 .ne { background: url(${path}images/ne.png) 100% 0% no-repeat } 407 .nw { background: url(${path}images/nw.png) 0% 0% no-repeat } 409 .nw { background: url(${path}images/nw.png) 0% 0% no-repeat } 408 410 409 /* Things for "fancyformat" documents start here. */ 411 /* Things for "fancyformat" documents start here. */ > 412 .fancy img+p {font-style:italic} 410 .fancy .codeblock i { color: darkblue; } 413 .fancy .codeblock i { color: darkblue; } 411 .fancy h1,.fancy h2,.fancy h3,.fancy h4 {font-weight:normal;color:#80a796} 414 .fancy h1,.fancy h2,.fancy h3,.fancy h4 {font-weight:normal;color:#80a796} 412 .fancy h2 { margin-left: 10px } 415 .fancy h2 { margin-left: 10px } 413 .fancy h3 { margin-left: 20px } 416 .fancy h3 { margin-left: 20px } 414 .fancy h4 { margin-left: 30px } 417 .fancy h4 { margin-left: 30px } 415 .fancy th {white-space:nowrap;text-align:left;border-bottom:solid 1px #444} 418 .fancy th {white-space:nowrap;text-align:left;border-bottom:solid 1px #444} 416 .fancy th, .fancy td {padding: 0.2em 1ex; vertical-align:top} 419 .fancy th, .fancy td {padding: 0.2em 1ex; vertical-align:top} ................................................................................................................................................................................ 612 # 615 # 613 foreach infile [lrange $argv 3 end] { 616 foreach infile [lrange $argv 3 end] { 614 cd $HOMEDIR 617 cd $HOMEDIR 615 puts "Processing $infile" 618 puts "Processing $infile" 616 set fd [open $infile r] 619 set fd [open $infile r] 617 set in [read $fd] 620 set in [read $fd] 618 close $fd 621 close $fd > 622 if {[string first <table_of_contents> $in]>=0} { set in [addtoc $in] } 619 set title {No Title} 623 set title {No Title} 620 regexp {<title>([^\n]*)</title>} $in all title 624 regexp {<title>([^\n]*)</title>} $in all title 621 regsub {<title>[^\n]*</title>} $in {} in 625 regsub {<title>[^\n]*</title>} $in {} in 622 set outfile [file root [file tail $infile]].html 626 set outfile [file root [file tail $infile]].html 623 hd_open_main $outfile 627 hd_open_main $outfile 624 db eval { 628 db eval { 625 INSERT INTO page(filename,pagetitle) 629 INSERT INTO page(filename,pagetitle) ................................................................................................................................................................................ 643 proc hd_requirement {args} {} 647 proc hd_requirement {args} {} 644 foreach infile [lrange $argv 3 end] { 648 foreach infile [lrange $argv 3 end] { 645 cd $HOMEDIR 649 cd $HOMEDIR 646 puts "Processing $infile" 650 puts "Processing $infile" 647 set fd [open $infile r] 651 set fd [open $infile r] 648 set in [read $fd] 652 set in [read $fd] 649 close $fd 653 close $fd > 654 if {[string first <table_of_contents> $in]>=0} { set in [addtoc $in] } 650 set title {No Title} 655 set title {No Title} 651 regexp {<title>([^\n]*)</title>} $in all title 656 regexp {<title>([^\n]*)</title>} $in all title 652 regsub {<title>[^\n]*</title>} $in {} in 657 regsub {<title>[^\n]*</title>} $in {} in 653 set outfile [file root [file tail $infile]].html 658 set outfile [file root [file tail $infile]].html 654 hd_open_main $outfile 659 hd_open_main $outfile 655 hd_header $title $infile 660 hd_header $title $infile 656 regsub -all {<tcl>} $in "\175; eval \173" in 661 regsub -all {<tcl>} $in "\175; eval \173" in