Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add fts5.html. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
519916c18a0b7a5db704035a4aa083da |
User & Date: | dan 2015-03-04 20:19:09.365 |
Context
2015-03-04
| ||
20:56 | Clarification of the "INSERT OR" and "REPLACE" forms of the INSERT statement. (check-in: 58895ad201 user: drh tags: trunk) | |
20:19 | Add fts5.html. (check-in: 519916c18a user: dan tags: trunk) | |
16:59 | Merge typo fix from the 3.8.8 branch. (check-in: 9621e94943 user: drh tags: trunk) | |
Changes
Added images/fts5_formula.odf.
cannot compute difference between binary files
Added images/fts5_formula1.png.
cannot compute difference between binary files
Added images/fts5_formula2.png.
cannot compute difference between binary files
Added images/fts5_formula3.png.
cannot compute difference between binary files
Changes to pages/fancyformat.tcl.
︙ | ︙ | |||
195 196 197 198 199 200 201 | } } } foreach line [split $txt "\n"] { set line [string range $line $nMinSpace end] append out "$line\n" } | | | 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | } } } foreach line [split $txt "\n"] { set line [string range $line $nMinSpace end] append out "$line\n" } append out "</pre></table></div>" return $out } proc fancyformat_document {zTitle lReqfile zBody} { unset -nocomplain ::ffreq unset -nocomplain ::ffreq_children foreach f $lReqfile { |
︙ | ︙ | |||
378 379 380 381 382 383 384 385 386 387 388 389 390 391 | proc addtoc {zDoc} { # If the extension with the [parsehtml] command has not been loaded, # load it now. # if {[info commands parsehtml] == ""} { load ./parsehtml.so } # These variables are all used to store state between invocations of # the [parsehtml] callback used to do preprocessing. # set ::Addtoc(heading:1) 0 set ::Addtoc(heading:2) 0 set ::Addtoc(heading:3) 0 set ::Addtoc(heading:4) 0 | > > > > > > > > > | 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 | proc addtoc {zDoc} { # If the extension with the [parsehtml] command has not been loaded, # load it now. # if {[info commands parsehtml] == ""} { load ./parsehtml.so } # Handle any <tclscript> blocks. # while { [regexp -nocase {<tclscript>(.*?)</tclscript>} $zDoc -> script] } { set sub [eval $script] set sub [string map {& {\&}} $sub] set zDoc [regsub -nocase {<tclscript>.*?</tclscript>} $zDoc $sub] } # These variables are all used to store state between invocations of # the [parsehtml] callback used to do preprocessing. # set ::Addtoc(heading:1) 0 set ::Addtoc(heading:2) 0 set ::Addtoc(heading:3) 0 set ::Addtoc(heading:4) 0 |
︙ | ︙ |
Added pages/fts5.in.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 | <tcl>hd_keywords *fts5 FTS5</tcl> <title>SQLite FTS5 Extension</title> <table_of_contents> <h2 style="margin-left:1.0em" notoc> Overview</h2> <h1>CREATE TABLE Arguments</h1> <p>Each argument specified as part of a "CREATE VIRTUAL TABLE ... USING fts5 ..." statement is either a column name or a configuration option. A column name consists of a single FTS5 bareword or a single string literal quoted in any manner acceptable to SQLite. A configuration option consists of an FTS5 bareword - the option name - followed by an "=" character, followed by the option value. The option value is specified using either a single FTS5 bareword or a string literal, again quoted in any manner acceptable to the SQLite core. Anything else is a syntax error. <h2>Column Names</h2> <p>It is an error to attempt to name an fts5 table column "rowid" or "rank". This is not supported. <h2>Configuration Options</h2> <p> A configuration option consists of an FTS5 bareword - the option name - followed by an "=" character, followed by a either an FTS5 bareword or a string literal. For example: <codeblock> CREATE VIRTUAL TABLE mail USING fts5(sender, title, body, tokenize = 'porter ascii'); </codeblock> <p> There are currently the following configuration options: <ul> <li> The "tokenize" option, used to configure a [FTS5 tokenizers | custom tokenizer]. <li> The "prefix" option, used to add [FTS5 prefix indexes | prefix indexes] to an FTS5 table. <li> The "content" option, used to make the FTS5 table an [FTS5 content option | external content or contentless table]. <li> The "content_rowid" option, used to set the rowid field of an [FTS5 external content tables | external content table]. </ul> <h1>Full-text Query Syntax</h1> <p> The following block contains a summary of the FTS query syntax in BNF form. A detailed explanation follows. <codeblock> <phrase> := string [*] <phrase> := <phrase> + <phrase> <neargroup> := NEAR ( <phrase> <phrase> ... [, N] ) <query> := [colname :] <phrase> <query> := [colname :] <neargroup> <query> := ( <query> ) <query> := <query> AND <query> <query> := <query> OR <query> <query> := <query> NOT <query> </codeblock> <p> Within an FTS expression a <b>string</b> may be specified in one of two ways: <ul> <li> <p>By enclosing it in double quotes ("). Within a string, any embedded double quote characters may be escaped SQL-style - by adding a second double-quote character. <li> <p>As a bareword that includes no whitespace or reserved characters, and is not "AND", "OR" or "NOT" (case sensitive). Reserved characters are: <pre> : ~ ! @ # $ % ^ & * ( ) + , = </pre> In other words, the top row of a regular US keyboard, the plus sign, comma and colon characters. Strings that include any of these characters must be quoted. </ul> <p> FTS queries are made up of <b>phrases</b>. A phrase is an ordered list of one or more tokens. A string is transformed into a phrase by passing it to the FTS table tokenizer. Two phrases can be concatenated into a single large phrase using the "+" operator. For example, assuming the tokenizer module being used tokenizes the input "one.two.three" to three separate tokens, the following three queries all specify the same phrase: <codeblock> ... MATCH '"one two three"' ... MATCH 'one + two + three' ... MATCH '"one two" + three' ... MATCH 'one.two.three' </codeblock> <p> A phrase matches a document if the document contains at least one sub-sequence of tokens that matches the sequence of tokens that make up the phrase. <p> If a "*" character follows a string within an FTS expression, then the final token extracted from the string is marked as a <b>prefix token</b>. As you might expect, a prefix token matches any document token of which it is a prefix. For example, the first two queries in the following block will match any document that contains the token "one" immediately followed by the token "two" and then any token that begins with "thr". <codeblock> ... MATCH '"one two thr" * ' ... MATCH 'one + two + thr*' ... MATCH '"one two thr*"' <b>-- May not work as expected!</b> </codeblock> <p>The final query in the block above may not work as expected. Because the "*" character is inside the double-quotes, it will be passed to the tokenizer, which will likely discard it (or perhaps, depending on the specific tokenizer in use, include it as part of the final token) instead of recognizing it as a special FTS character. <p>Two or more phrases may be grouped into a <b>NEAR group</b>. A NEAR group is specified by the token "NEAR" (case sensitive) followed by an open parenthesis character, followed by two or more whitespace separated phrases, optionally followed by a comma and the numeric parameter <i>N</i>, followed by a close parenthesis. For example: <codeblock> ... MATCH 'NEAR("one two" "three four", 10)' ... MATCH 'NEAR("one two" thr* + four)' </codeblock> <p>If no <i>N</i> parameter is supplied, it defaults to 10. A NEAR group matches a document if the document contains at least one clump of tokens that: <ol> <li> contains at least one instance of each phrase, and <li> for which the number of tokens between the end of the first phrase and the beginning of the last phrase in the clump is less than <i>N</i>. </ol> <p>For example: <codeblock> CREATE VIRTUAL TABLE f USING fts5(x); INSERT INTO f(rowid, x) VALUES(1, 'A B C D x x x E F x'); ... MATCH 'NEAR(e d, 4)'; <i>-- Matches!</i> ... MATCH 'NEAR(e d, 3)'; <i>-- Matches!</i> ... MATCH 'NEAR(e d, 2)'; <i>-- Does not match!</i> ... MATCH 'NEAR("c d" "e f", 3)'; <i>-- Matches!</i> ... MATCH 'NEAR("c" "e f", 3)'; <i>-- Does not match!</i> ... MATCH 'NEAR(a d e, 6)'; <i>-- Matches!</i> ... MATCH 'NEAR(a d e, 5)'; <i>-- Does not match!</i> ... MATCH 'NEAR("a b c d" "b c" "e f", 4)'; <i>-- Matches!</i> ... MATCH 'NEAR("a b c d" "b c" "e f", 3)'; <i>-- Does not match!</i> </codeblock> <p> A single phrase or NEAR group may be restricted to matching text within a specified column of the FTS table by prefixing it with the column name followed by a colon character. Column names may be specified using either of the two forms described for strings above. Unlike strings that are part of phrases, column names are not passed to the tokenizer module. Column names are case-insensitive in the usual way for SQLite column names - upper/lower case equivalence is understood for ASCII-range characters only. <codeblock> ... MATCH 'colname : NEAR("one two" "three four", 10)' ... MATCH '"colname" : one + two + three' </codeblock> <p> Phrases and NEAR groups may be arranged into expressions using <b>boolean operators</b>. In order of precedence, from highest to lowest, the operators are: <table striped=1> <tr><th>Operator <th>Function <tr><td><code><query1> AND <query2></code> <td>Matches if both query1 and query2 match. <tr><td><code><query1> OR <query2></code> <td>Matches if either query1 or query2 match. <tr><td><code><query1> NOT <query2></code> <td>Matches if query1 matches and query2 does not match. </table> <p> Parenthesis may be used to group expressions in order to modify operator precedence in the usual ways. For example: <codeblock> <i>-- Matches documents that contain at least one instance of either "one"</i> <i>-- or "two", but do not contain any instances of token "three".</i> ... MATCH 'one OR two NOT three' <i>-- Match all documents that contain the token "two" but not "three", or</i> <i>-- contain the token "one".</i> ... MATCH 'one OR (two NOT three)' </codeblock> <p> Phrases and NEAR groups may also be connected by <b>implicit AND operators</b>. For simplicity, these are not shown in the BNF grammar above. Essentially, any sequence of phrases or NEAR groups (including those restricted to matching specified columns) separated only by whitespace are handled as if there were an implicit AND operator between each pair of phrases or NEAR groups. Implicit AND operators are never inserted after or before an expression enclosed in parenthesis. For example: <codeblock> ... MATCH 'one two three' <i>-- 'one AND two AND three'</i> ... MATCH 'three "one two"' <i>-- 'three AND "one two"'</i> ... MATCH 'NEAR(one two) three' <i>-- 'NEAR(one two) AND three'</i> ... MATCH 'one OR two three' <i>-- 'one OR two AND three'</i> ... MATCH '(one OR two) three' <i>-- Syntax error!</i> ... MATCH 'func(one two)' <i>-- Syntax error!</i> </codeblock> <h1 tags="FTS5 prefix indexes">Prefix Indexes</h1> <p> By default, FTS5 maintains a single index recording the location of each token instance within the document set. This means that querying for complete tokens is fast, as it requires a single lookup, but querying for a prefix token can be slow, as it requires a range scan. For example, to query for the prefix token "abc*" requires a range scan of all tokens greater than or equal to "abc" and less than "abd". <p> A prefix index is a separate index that records the location of all instances of prefix tokens of a certain length in characters used to speed up queries for prefix tokens. For example, optimizing a query for prefix token "abc*" requires a prefix index of three-character prefixes. <p> To add prefix indexes to an FTS5 table, the "prefix" option is set to either a single positive integer or a text value containing a white-space separated list of one or more positive integer values. A prefix index is created for each integer specified. If more than one "prefix" option is specified as part of a single CREATE VIRTUAL TABLE statement, all apply. <codeblock> <i>-- Two ways to create an FTS5 table that maintains prefix indexes for -- two and three character prefix tokens.</i> CREATE VIRTUAL TABLE ft USING fts5(a, b, prefix='2 3'); CREATE VIRTUAL TABLE ft USING fts5(a, b, prefix=2, prefix=3); </codeblock> <h1 tags="FTS5 tokenizers">Tokenizers</h1> <p> The CREATE VIRTUAL TABLE "tokenize" option is used to configure the specific tokenizer used by the FTS5 table. The option argument must be either an FTS5 bareword, or an SQL text literal. The text of the argument is itself treated as a white-space series of one or more FTS5 barewords or SQL text literals. The first of these is the name of the tokenizer to use. The second and subsequent list elements, if they exist, are arguments passed to the tokenizer implementation. <p> Unlike option values and column names, SQL text literals intended as tokenizers must be quoted using single quote characters. For example: <codeblock> <i>-- The following are all equivalent</i> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter ascii'); CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "porter ascii"); CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = "'porter' 'ascii'"); CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = '''porter'' ''ascii'''); <i>-- But this will fail:</i> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = '"porter" "ascii"'); <i>-- This will fail too:</i> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter' 'ascii'); </codeblock> <p> FTS5 features three built-in tokenizer modules, described in subsequent sections: <ul> <li> The <b>unicode61</b> tokenizer, based on the Unicode 6.1 standard. This is the default. <li> The <b>ascii</b> tokenizer, which assumes all characters outside of the ASCII codepoint range (0-127) are to be treated as token characters. <li> The <b>porter</b> tokenizer, which implements the <a href=http://tartarus.org/martin/PorterStemmer/>porter stemming algorithm</a>. </ul> <p> It is also possible to create custom tokenizers for FTS5. The API for doing so is [custom tokenizers | described here]. <h2>Unicode61 Tokenizer</h2> <p> The unicode tokenizer classifies all unicode characters as either "separator" or "token" characters. By default all space and punctuation characters, as defined by Unicode 6.1, are considered separators, and all other characters as token characters. Each contiguous run of one or more token characters is considered to be a token. The tokenizer is case-insensitive according to the rules defined by Unicode 6.1. <p> By default, diacritics are removed from all Latin script characters. This means, for example, that "A", "a", "À", "à", "Â" and "â" are all considered to be equivalent. <p> Any arguments following "unicode61" in the token specification are treated as a list of alternating option names and values. Unicode61 supports the following options: <table striped=1> <tr><th> Option <th> Usage <tr><td> remove_diacritics <td>This option should be set to "0" or "1". If it is set (the default), diacritics are removed from all latin script characters as described above. If it is clear, they are not. <tr><td> tokenchars <td> This option is used to specify additional unicode characters that should be considered token characters, even if they are white-space or punctuation characters according to Unicode 6.1. All characters in the string that this option is set to are considered token characters. <tr><td> separators <td> This option is used to specify additional unicode characters that should be considered as separator characters, even if they are token characters according to Unicode 6.1. All characters in the string that this option is set to are considered separators. </table> <p> For example: <codeblock> <i>-- Create an FTS5 table that does not remove diacritics from Latin -- script characters, and that considers hyphens and underscore characters -- to be part of tokens. </i> CREATE VIRTUAL TABLE ft USING fts5(a, b, tokenize = "unicode61 remove_diacritics 0 tokenchars '-_'" ); </codeblock> <h2>Ascii Tokenizer</h2> <p> The Ascii tokenizer is similar to the Unicode61 tokenizer, except that: <ul> <li> All non-ASCII characters (those with codepoints greater than 127) are always considered token characters. If any non-ASCII characters are specified as part of the separators option, they are ignored. <li> Case-folding is only performed for ASCII characters. So while "A" and "a" are considered to be equivalent, "Ã" and "ã" are distinct. <li> The remove_diacritics option is not supported. </ul> <p> For example: <codeblock> <i>-- Create an FTS5 table that uses the ascii tokenizer, but does not -- consider numeric characters to be part of tokens.</i> CREATE VIRTUAL TABLE ft USING fts5(a, b, tokenize = "ascii separators '0123456789'" ); </codeblock> <h2>Porter Tokenizer</h2> <p> The porter tokenizer is a wrapper tokenizer. It takes the output of some other tokenizer and applies the <a href=http://tartarus.org/martin/PorterStemmer/>porter stemming algorithm</a> to each token before it returns it to FTS5. This allows search terms like "correction" to match similar words such as "corrected" or "correcting". The porter stemmer algorithm is designed for use with English language terms only - using it with other languages may or may not improve search utility. <p> By default, the porter tokenizer operates as a wrapper around the default tokenizer (unicode61). Or, if one or more extra arguments are added to the "tokenize" option following "porter", they are treated as a specification for the underlying tokenizer that the porter stemmer uses. For example: <codeblock> <i>-- Two ways to create an FTS5 table that uses the porter tokenizer to -- stem the output of the default tokenizer (unicode61). </i> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = porter); CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter unicode61'); <i>-- A porter tokenizer used to stem the output of the unicode61 tokenizer, -- with diacritics removed before stemming.</i> CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = 'porter unicode61 remove_diacritics 1'); </codeblock> <h1 tags="FTS5 content option">External Content and Contentless Tables</h1> <p> Normally, when a row is inserted into an FTS5 table, as well as the various full-text index entries and other data a copy of the row is stored in a private table managed by the FTS5 module. When column values are requested from the FTS5 table by the user or by an auxiliary function implementation, they are read from this private table. The "content" option may be used to create an FTS5 table that stores only FTS full-text index entries. Because the column values themselves are usually much larger than the associated full-text index entries, this can save significant database space. <p> There are two ways to use the "content" option: <ul> <li> By setting it to an empty string to create a contentless FTS5 table. In this case FTS5 assumes that the original column values are unavailable to it when processing queries. Full-text queries and some auxiliary functions can still be used, but no column values apart from the rowid may be read from the table. <li> By setting it to the name of a database object (table, virtual table or view) that may be queried by FTS5 at any time to retrieve the column values. This is known as an "external content" table. In this case all FTS5 functionality may be used, but it is the responsibility of the user to ensure that the contents of the full-text index are consistent with the named database object. If they are not, query results may be unpredictable. </ul> <h2 tags="FTS5 contentless tables">Contentless Tables</h2> <p> A contentless FTS5 table is created by setting the "content" option to an empty string. For example: <codeblock> CREATE VIRTUAL TABLE f1 USING fts5(a, b, c, content=''); </codeblock> <p> Contentless FTS5 tables do not support UPDATE or DELETE statements, or INSERT statements that do not supply a non-NULL value for the rowid field. Rows may be deleted from a contentless table using an [FTS5 delete command]. <p> Attempting to read any column value except the rowid from a contentless FTS5 table returns an SQL NULL value. <h2 tags="FTS5 external content tables">External Content Tables</h2> <p> An external content FTS5 table is created by setting the content option to the name of a table, virtual table or view (hereafter the "content table") within the same database. Whenever column values are required by FTS5, it queries the content table as follows, with the rowid of the row for which values are required bound to the SQL variable: <codeblock> SELECT * FROM <content> WHERE <content_rowid> = ?; </codeblock> <p> In the above, <content> is replaced by the name of the content table. By default, <content_rowid> is replaced by the literal text "rowid". Or, if the "content_rowid" option is set within the CREATE VIRTUAL TABLE statement, by the value of that option. <p> The "*" in the above query must expand to a set of columns consisting of the <column_rowid> column followed by each indexed column, in the same order as they are present in the external content fts5 table. <p> The content table may also be queried as follows: <codeblock> SELECT * FROM <content> ORDER BY <content_rowid> ASC; SELECT * FROM <content> ORDER BY <content_rowid> DESC; </codeblock> <p> It is still the responsibility of the user to ensure that the contents of an external content FTS5 table are kept up to date with the content table. One way to do this is with triggers. For example: <codeblock> <i>-- Create a table. And an external content fts5 table to index it.</i> CREATE TABLE tbl(a INTEGER PRIMARY KEY, b, c); CREATE VIRTUAL TABLE fts_idx USING fts5(b, c, content='tbl', content_rowid='a'); <i>-- Triggers to keep the FTS index up to date.</i> CREATE TRIGGER tbl_ai AFTER INSERT ON tbl BEGIN INSERT INTO fts_idx(rowid, b, c) VALUES (new.a, new.b, new.c); END; CREATE TRIGGER tbl_ad AFTER DELETE ON tbl BEGIN INSERT INTO fts_idx(fts_idx, rowid, b, c) VALUES('delete', old.a, old.b, old.c); END; CREATE TRIGGER tbl_au AFTER UPDATE ON tbl BEGIN INSERT INTO fts_idx(fts_idx, rowid, b, c) VALUES('delete', old.a, old.b, old.c); INSERT INTO fts_idx(rowid, b, c) VALUES (new.a, new.b, new.c); END; </codeblock> <h1> Auxiliary Functions </h1> <h2>Built-in Auxiliary Functions</h2> <h3>The bm25() function</h3> <p> The built-in auxiliary function bm25() returns a real value indicating how well the current row matches the full-text query. The better the match, the larger the value returned. A query such as the following may be used to return matches in order from best to worst match: <codeblock> SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts) DESC </codeblock> <p> In order to calculate a documents score, the full-text query is separated into its component phrases. The bm25 score for document <i>D</i> and query <i>Q</i> is then calculated as follows: <p> <img src="images/fts5_formula1.png" style="width:55ex;margin-left:5ex"> <p> In the above, <i>nPhrase</i> is the number of phrases in the query. <i>|D|</i> is the number of tokens in the current document, and <i>avgdl</i> is the average number of tokens in all documents within the FTS5 table. <i>k<sub>1</sub></i> and <i>b</i> are both constants, hard-coded at 1.2 and 0.75 respectively. <p> <i>IDF(q<sub>i</sub>)</i> is the inverse-document-frequency of query phrase <i>i</i>. It is calculated as follows, where <i>N</i> is the total number of rows in the FTS5 table and <i>n(q<sub>i</sub>)</i> is the total number of rows that contain at least one instance of phrase <i>i</i>: <p> <img src="images/fts5_formula2.png" style="width:55ex;margin-left:5ex"> <p> Finally, <i>f(q<sub>i</sub>,D)</i> is the phrase frequency of phrase <i>i</i>. By default, this is simply the number of occurrences of the phrase within the current row. However, by passing extra real value arguments to the bm25() SQL function, each column of the table may be assigned a different weight and the phrase frequency calculated as follows: <p> <img src="images/fts5_formula3.png" style="width:55ex;margin-left:5ex"> <p> where <i>w<sub>c</sub></i> is the weight assigned to column <i>c</i> and <i>n(q<sub>i</sub>,c)</i> is the number of occurrences of phrase <i>i</i> in column <i>c</i> of the current row. The first argument passed to bm25() following the table name is the weight assigned to the leftmost column of the FTS5 table. The second is the weight assigned to the second leftmost column, and so on. If there are not enough arguments for all table columns, remaining columns are assigned a weight of 1.0. If there are too many trailing arguments, the extras are ignored. For example: <codeblock> <i>-- Assuming the following schema:</i> CREATE VIRTUAL TABLE email USING fts5(sender, title, body); <i>-- Return results in bm25 order, with each phrase hit in the "sender"</i> <i>-- column considered the equal of 10 hits in the "body" column, and</i> <i>-- each hit in the "title" column considered as valuable as 5 hits in</i> <i>-- the "body" column.</i> SELECT * FROM email WHERE email MATCH ? ORDER BY bm25(email, 10.0, 5.0) DESC; </codeblock> <p>Refer to wikipedia for <a href="http://en.wikipedia.org/wiki/Okapi_BM25">more information regarding BM25</a> and its variants. <h3>The highlight() function</h3> <p> The highlight() function returns a copy of the text from a specified column of the current row with extra markup text inserted to mark the start and end of phrase matches. <p>The highlight() must be invoked with exactly three arguments following the table name. To be interpreted as follows: <ol> <li> An integer indicating the index of the FTS table column to read the text from. Columns are numbered from left to right starting at zero. <li> The text to insert before each phrase match. <li> The text to insert after each phrase match. </ol> <p>For example: <codeblock> <i>-- Return a copy of the text from the leftmost column of the current</i> <i>-- row, with phrase matches marked using html "b" tags.</i> SELECT highlight(fts, 0, '<b>' '</b>') FROM fts WHERE fts MATCH ? </codeblock> <p>In cases where two or more phrase instances overlap (share one or more tokens in common), a single open and close marker is inserted for each set of overlapping phrases. For example: <codeblock> <i>-- Assuming this:</i> CREATE VIRTUAL TABLE ft USING fts5(a); INSERT INTO ft VALUES('a b c x c d e'); INSERT INTO ft VALUES('a b c c d e'); INSERT INTO ft VALUES('a b c d e'); <i>-- The following SELECT statement returns these three rows:</i> <i>-- '[a b c] x [c d e]'</i> <i>-- '[a b c] [c d e]'</i> <i>-- '[a b c d e]'</i> SELECT highlight(ft, 0, '[', ']') FROM ft WHERE ft MATCH 'a+b+c AND c+d+e'; </codeblock> <h3>The snippet() function</h3> <p>The snippet() function is similar to highlight(), except that instead of returning entire column values, it automatically selects and extracts a short fragment of document text to process and return. The snippet() function must be passed five parameters following the table name argument: <ol> <li> An integer indicating the index of the FTS table column to select the returned text from. Columns are numbered from left to right starting at zero. A negative value indicates that the column should be automatically selected. <li> The text to insert before each phrase match within the returned text. <li> The text to insert after each phrase match within the returned text. <li> The text to add to the start or end of the selected text to indicate that the returned text does not occur at the start or end of its column, respectively. <li> The maximum number of tokens in the returned text. This must be greater than zero and equal to or less than 64. </ol> <h2 tags="auxiliary function mapping">Sorting by Auxiliary Function Results</h2> <p> All FTS5 tables feature a special hidden column named "rank". If the current query is not a full-text query (i.e. if it does not include a MATCH operator), the value of the "rank" column is always NULL. Otherwise, in a full-text query, column rank contains by default the same value as would be returned by executing the bm25() auxiliary function with no trailing arguments. <p> The difference between reading from the rank column and using the bm25() function directly within the query is only significant when sorting by the returned value. In this case, using "rank" is faster than using bm25(). <codeblock> <i>-- The following queries are logically equivalent. But the second may</i> <i>-- be faster, particularly if the caller abandons the query before</i> <i>-- all rows have been returned (or if the queries were modified to </i> <i>-- include LIMIT clauses).</i> SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts) DESC; SELECT * FROM fts WHERE fts MATCH ? ORDER BY rank DESC; </codeblock> <p> Instead of using bm25() with no trailing arguments, the specific auxiliary function mapped to the rank column may be configured either on a per-query basis, or by setting a different persistent default for the FTS table. <p> In order to change the mapping of the rank column for a single query, a term similar to the following is added to the WHERE clause of a query: <codeblock> rank MATCH 'auxiliary-function-name(arg1, arg2, ...)' </codeblock> <p> The right-hand-side of the MATCH clause must be a constant expression that evaluates to a string consisting of the auxiliary function to invoke, followed by zero or more comma separated arguments within parenthesis. Arguments must be SQL literals. For example: <codeblock> <i>-- The following queries are logically equivalent. But the second may</i> <i>-- be faster. See above. </i> SELECT * FROM fts WHERE fts MATCH ? ORDER BY bm25(fts, 10.0, 5.0) DESC; SELECT * FROM fts WHERE fts MATCH ? AND rank MATCH 'bm25(10.0, 5.0)' ORDER BY rank DESC; </codeblock> <p> The default mapping of the rank column for a table may be modified using the [FTS5 rank configuration option]. <h1>Special INSERT Commands</h1> <h2 tags="FTS5 automerge option">The 'automerge' Configuration Option</h2> <p> Instead of using a single data structure on disk to store the full-text index, FTS5 uses a series of b-trees. Each time a new transaction is committed, a new b-tree containing the contents of the committed transaction is written into the database file. When the full-text index is queried, each b-tree must be queried individually and the results merged before being returned to the user. <p> In order to prevent the number of b-trees in the database from becoming too large (slowing down queries), smaller b-trees are periodically merged into single larger b-trees containing the same data. By default, this happens automatically within INSERT, UPDATE or DELETE statements that modify the full-text index. The 'automerge' parameter determines how many smaller b-trees are merged together at a time. Setting it to a small value can speed up queries (as they have to query and merge the results from fewer b-trees), but can also slow down writing to the database (as each INSERT, UPDATE or DELETE statement has to do more work as part of the automatic merging process). <p> Each of the b-trees that make up the full-text index is assigned to a "level" based on its size. Level-0 b-trees are the smallest, as they contain the contents of a single transaction. Higher level b-trees are the result of merging two or more level-0 b-trees together and so they are larger. FTS5 begins to merge b-trees together once there exist <i>M</i> or more b-trees with the same level, where <i>M</i> is the value of the 'automerge' parameter. <p> The maximum allowed value for the 'automerge' parameter is 16. The default value is 4. Setting the 'automerge' parameter to 0 disables the automatic incremental merging of b-trees altogether. <codeblock> INSERT INTO ft(ft, rank) VALUES('automerge', 8); </codeblock> <h2>The 'crisismerge' Configuration Option</h2> <p>The 'crisismerge' option is similar to 'automerge', in that it determines how and how often the component b-trees that make up the full-text index are merged together. Once there exist <i>C</i> or more b-trees on a single level within the full-text index, where <i>C</i> is the value of the 'crisismerge' option, all b-trees on the level are immediately merged into a single b-tree. <p>The difference between this option and the 'automerge' option is that when the 'automerge' limit is reached FTS5 only begins to merge the b-trees together. Most of the work is performed as part of subsequent INSERT, UPDATE or DELETE operations. Whereas when the 'crisismerge' limit is reached, the offending b-trees are all merged immediately. This means that an INSERT, UPDATE or DELETE that triggers a crisis-merge may take a long time to complete. <p>The default 'crisismerge' value is 16. There is no maximum limit. Attempting to set the 'crisismerge' parameter to a value of 0 or 1 is equivalent to setting it to the default value (16). It is an error to attempt to set the 'crisismerge' option to a negative value. <codeblock> INSERT INTO ft(ft, rank) VALUES('crisismerge', 16); </codeblock> <h2 tags="FTS5 delete command">The 'delete' Command</h2> <p> This command is only available with [FTS5 external content tables | external content] and [FTS5 contentless tables | contentless] tables. It is used to delete the index entries associated with a single row from the full-text index. This command and the [FTS5 delete-all command | delete-all] command are the only ways to remove entries from the full-text index of a contentless table. <p> In order to use this command to delete a row, the text value 'delete' must be inserted into the special column with the same name as the table. The rowid of the row to delete is inserted into the rowid column. The values inserted into the other columns must match the values currently stored in the table. For example: <codeblock> <i>-- Insert a row with rowid=14 into the fts5 table.</i> INSERT INTO ft(rowid, a, b, c) VALUES(14, $a, $b, $c); <i>-- Remove the same row from the fts5 table.</i> INSERT INTO ft(ft, rowid, a, b, c) VALUES('delete', 14, $a, $b, $c); </codeblock> <p> If the values "inserted" into the text columns as part of a 'delete' command are not the same as those currently stored within the table, the results may be unpredictable. <p><i style=color:red>todo: explain why</i> <h2 tags="FTS5 delete-all command">The 'delete-all' Command</h2> <p> This command is only available with [FTS5 external content tables | external content] and [FTS5 contentless tables | contentless] tables. It deletes all entries from the full-text index. <codeblock> INSERT INTO ft(ft) VALUES('delete-all'); </codeblock> <h2>The 'integrity-check' Command</h2> <p> This command is used to verify that the full-text index is consistent with the contents of the FTS5 table or [FTS5 external content tables | content table]. It is not available with [FTS5 contentless tables | contentless tables]. <p>The integrity-check command is invoked by inserting the text value 'integrity-check' into the special column with the same name as the FTS5 table. For example: <codeblock> INSERT INTO ft(ft) VALUES('integrity-check'); </codeblock> <p>If the full-text index is consistent with the contents of the table, the INSERT used to invoke the integrity-check command succeeds. Or, if any discrepancy is found, it fails with an [SQLITE_CORRUPT_VTAB] error. <h2 tags="FTS5 optimize command">The 'optimize' Command</h2> <p>This command merges all individual b-trees that currently make up the full-text index into a single large b-tree structure. This ensures that the full-text index consumes the mimimum space within the database and is in the fastest form to query. <p>Refer to the documentation for the [FTS5 automerge option] for more details regarding the relationship between the full-text index and its component b-trees. <codeblock> INSERT INTO ft(ft) VALUES('optimize'); </codeblock> <h2>The 'pgsz' Configuration Option</h2> <p> This command is used to set the persistent "pgsz" option. <p> The full-text index maintained by FTS5 is stored as a series of fixed-size blobs in a database table. It is not strictly necessary for all blobs that make up a full-text index to be the same size. The pgsz option determines the size of all blobs created by subsequent index writers. The default value is 1000. <codeblock> INSERT INTO ft(ft, rank) VALUES('pgsz', 4072); </codeblock> <h2 tags="FTS5 rank configuration option">The 'rank' Configuration Option</h2> <p> This command is used to set the persistent "rank" option. <p> The rank option is used to change the default auxiliary function mapping for the rank column. The option should be set to a text value in the same format as described for [auxiliary function mapping | "rank MATCH ?"] terms above. For example: <codeblock> INSERT INTO ft(ft, rank) VALUES('rank', 'bm25(10.0, 5.0)'); </codeblock> <h2 tags="FTS5 rebuild command">The 'rebuild' Command</h2> <p> This command first deletes the entire full-text index, then rebuilds it based on the contents of the table or [FTS5 external content tables | content table]. It is not available with [FTS5 contentless tables | contentless tables]. <codeblock> INSERT INTO ft(ft) VALUES('rebuild'); </codeblock> <h1>Extending FTS5</h1> <p>FTS5 features APIs allowing it to be extended by: <ul> <li> Adding new auxiliary functions implemented in C, and <li> Adding new tokenizers, also implemented in C. </ul> <p> The built-in tokenizers and auxiliary functions described in this documented are all implemented using the publicly available API described below. <p> Before a new auxiliary function or tokenizer implementation may be registered with FTS5, an application must obtain a pointer to the "fts5_api" structure. There is one fts5_api structure for each database connection with which the FTS5 extension is registered. To obtain the pointer, the application invokes the SQL user-defined function fts5(), which returns a blob value containing the pointer to the fts5_api structure for the connection. The following example code demonstrates the technique: <codeblock> <i>/* ** Return a pointer to the fts5_api pointer for database connection db. ** If an error occurs, return NULL and leave an error in the database ** handle (accessible using sqlite3_errcode()/errmsg(). */</i> fts5_api *fts5_api_from_db(sqlite3 *db){ fts5_api *pRet = 0; sqlite3_stmt *pStmt = 0; if( SQLITE_OK==sqlite3_prepare(db, "SELECT fts5()", -1, &pStmt, 0) && SQLITE_ROW==sqlite3_step(pStmt) && sizeof(pRet)==sqlite3_column_bytes(pStmt, 0) ){ memcpy(&pRet, sqlite3_column_blob(pStmt, 0), sizeof(pRet)); } sqlite3_finalize(pStmt); return pRet; } </codeblock> <p> The fts5_api structure is defined as follows. It exposes three methods, one each for registering new auxiliary functions and tokenizers, and one for retrieving existing tokenizer. The latter is intended to facilitate the implementation of "tokenizer wrappers" similar to the built-in porter tokenizer. <codeblock> <tclscript> set res "" set ::extract_api_docs_mode fts5_api catch { set res [source [file join $::SRC ext/fts5/extract_api_docs.tcl]] } set res </tclscript> </codeblock> <p> To invoke a method of the fts5_api object, the fts5_api pointer itself should be passed as the methods first argument followed by the other, method specific, arguments. For example: <codeblock> rc = pFts5Api->xCreateTokenizer(pFts5Api, ... other args ...); </codeblock> <p> The fts5_api structure methods are described individually in the following sections. <h2 tags="custom tokenizers">Custom Tokenizers</h2> <p> To create a custom tokenizer, an application must implement three functions: a tokenizer constructor (xCreate), a destructor (xDelete) and a function to do the actual tokenization (xTokenize). The type of each function is as for the member variables of the fts5_tokenizer struct: <codeblock> <tclscript> set res "" set ::extract_api_docs_mode fts5_tokenizer catch { set res [source [file join $::SRC ext/fts5/extract_api_docs.tcl]] } set res </tclscript> </codeblock> <p> When an FTS5 table uses the custom tokenizer, the FTS5 core calls xCreate() once to create a tokenizer, then xTokenize() zero or more times to tokenize strings, then xDelete() to free any resources allocated by xCreate(). More specifically: <tclscript> set res "" set ::extract_api_docs_mode tokenizer_api catch { set res [source [file join $::SRC ext/fts5/extract_api_docs.tcl]] } set res </tclscript> <h2>Custom Auxiliary Functions</h2> <p> Implementing a custom auxiliary function is similar to implementing an [application-defined SQL function | scalar SQL function]. The implementation should be a C function of type fts5_extension_function, defined as follows: <codeblock> <tclscript> set res "" set ::extract_api_docs_mode fts5_extension catch { set res [source [file join $::SRC ext/fts5/extract_api_docs.tcl]] } set res </tclscript> </codeblock> <p> The implementation is registered with the FTS5 module by calling the xCreateFunction() method of the fts5_api object. If there is already an auxiliary function with the same name, it is replaced by the new function. If a non-NULL xDestroy parameter is passed to xCreateFunction(), it is invoked with a copy of the pContext pointer passed as the only argument when the database handle is closed or when the registered auxiliary function is replaced. <p> The final three arguments passed to the auxiliary function callback are similar to the three arguments passed to the implementation of a scalar SQL function. All arguments except the first passed to the auxiliary function are available to the implementation in the apVal[] array. The implementation should return a result or error via the content handle pCtx. <p> The first argument passed to an auxiliary function callback is a pointer to a structure containing methods that may be invoked in order to obtain information regarding the current query or row. The second argument is an opaque handle that should be passed as the first argument to any such method invocation. For example, the following auxiliary function definition returns the total number of tokens in all columns of the current row: <codeblock> <i>/* ** Implementation of an auxiliary function that returns the number ** of tokens in the current row (including all columns). */</i> static void column_size_imp( const Fts5ExtensionApi *pApi, Fts5Context *pFts, sqlite3_context *pCtx, int nVal, sqlite3_value **apVal ){ int rc; int nToken; rc = pApi->xColumnSize(pFts, -1, &nToken); if( rc==SQLITE_OK ){ sqlite3_result_int(pCtx, nToken); }else{ sqlite3_result_error_code(pCtx, rc); } } </codeblock> <p>The following section describes the API offered to auxiliary function implementations in detail. Further examples may be found in the "fts5_aux.c" file of the source code. <h3>Custom Auxiliary Functions API Reference</h3> <codeblock> <tclscript> set res "" set ::extract_api_docs_mode Fts5ExtensionApi catch { set res [source [file join $::SRC ext/fts5/extract_api_docs.tcl]] } set res </tclscript> </codeblock> <tclscript> set res "" unset -nocomplain ::extract_api_docs_mode catch { set res [source [file join $::SRC ext/fts5/extract_api_docs.tcl]] } set res </tclscript> |
Changes to wrap.tcl.
︙ | ︙ | |||
456 457 458 459 460 461 462 | .fancy th, .fancy td {padding: 0.2em 1ex; vertical-align:top} .fancy #toc a { color: darkblue ; text-decoration: none } .fancy .todo { color: #AA3333 ; font-style : italic } .fancy .todo:before { content: 'TODO:' } .fancy p.todo { border: solid #AA3333 1px; padding: 1ex } .fancy img { display:block; } .fancy :link:hover, .fancy :visited:hover { background: wheat } | | | 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 | .fancy th, .fancy td {padding: 0.2em 1ex; vertical-align:top} .fancy #toc a { color: darkblue ; text-decoration: none } .fancy .todo { color: #AA3333 ; font-style : italic } .fancy .todo:before { content: 'TODO:' } .fancy p.todo { border: solid #AA3333 1px; padding: 1ex } .fancy img { display:block; } .fancy :link:hover, .fancy :visited:hover { background: wheat } .fancy p,.fancy ul,.fancy ol,.fancy dl { margin: 1em 5ex } .fancy li p { margin: 1em 0 } /* End of "fancyformat" specific rules. */ </style> } puts $fd {</head>} if {[file exists DRAFT]} { |
︙ | ︙ |