# 2009 December 03
#
# May you do good and not evil.
# May you find forgiveness for yourself and forgive others.
# May you share freely, never taking more than you give.
#
#***********************************************************************
#
# Brute force (random data) tests for FTS3.
#
#-------------------------------------------------------------------------
#
# The FTS3 tests implemented in this file focus on testing that FTS3
# returns the correct set of documents for various types of full-text
# query. This is done using pseudo-randomly generated data and queries.
# The expected result of each query is calculated using Tcl code.
#
#   1. The database is initialized to contain a single table with three
#      columns. 100 rows are inserted into the table. Each of the three
#      values in each row is a document consisting of between 0 and 100
#      terms. Terms are selected from a vocabulary of $G(nVocab) terms.
#
#   2. The following is performed 100 times:
#
#      a. A row is inserted into the database. The row contents are
#         generated as in step 1. The docid is a pseudo-randomly selected
#         value between 0 and 1000000.
#
#      b. A pseudo-randomly selected row is updated. One of its columns is
#         set to contain a new document generated in the same way as the
#         documents in step 1.
#
#      c. A pseudo-randomly selected row is deleted.
#
#      d. For each of several types of fts3 queries, 10 SELECT queries
#         of the form:
#
#            SELECT docid FROM <tbl> WHERE <tbl> MATCH '<query>'
#
#         are evaluated. The results are compared to those calculated by
#         Tcl code in this file. The patterns used for the different query
#         types are:
#
#            1. query = <term>
#            2. query = <prefix>
#            3. query = "<term> <term>"
#            4. query = "<term> <term> <term>"
#            5. query = "<prefix> <prefix> <prefix>"
#            6. query = <term> NEAR <term>
#            7. query = <term> NEAR/11 <term> NEAR/11 <term>
#            8. query = <term> OR <term>
#            9. query = <term> NOT <term>
#           10. query = <term> AND <term>
#           11. query = <term> NEAR <term> OR <term> NEAR <term>
#           12. query = <term> NEAR <term> NOT <term> NEAR <term>
#           13. query = <term> NEAR <term> AND <term> NEAR <term>
#
#         where <term> is a term pseudo-randomly selected from the
#         vocabulary and <prefix> is all but the last character of such a
#         term followed by a "*" character.
#
#      Every second iteration, steps (a) through (d) above are performed
#      within a single transaction. This forces the queries in (d) to
#      read data from both the database and the in-memory hash table
#      that caches the full-text index entries created by steps (a), (b)
#      and (c) until the transaction is committed.
#
# The procedure above is run 5 times, using advisory fts3 node sizes of 50,
# 500, 1000 and 2000 bytes.
#
# After the test using an advisory node-size of 50, an OOM test is run using
# the database. This test is similar to step (d) above, except that it tests
# the effects of transient and persistent OOM conditions encountered while
# executing each query.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If this build does not include FTS3, skip the tests in this file.
#
ifcapable !fts3 { finish_test ; return }
source $testdir/fts3_common.tcl
source $testdir/malloc_common.tcl

set G(nVocab) 100

set nVocab 100
set lVocab [list]

expr srand(0)
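# The data generated below is, roughly, of the following form (the words
# shown here are invented for illustration only - the real vocabulary
# depends on the srand(0)-seeded random sequence):
#
#   vocabulary:  ab cde fg hijk ...        (between 2 and 4 characters each)
#   document:    cde ab ab hijk fg cde     (between 0 and 100 such terms)
#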
# Generate a vocabulary of nVocab words. Each word is between 2 and 4
# characters long.
#
set lChar {a b c d e f g h i j k l m n o p q r s t u v w x y z}
for {set i 0} {$i < $nVocab} {incr i} {
  set    len  [expr int(rand()*3)+2]
  set    word [lindex $lChar [expr int(rand()*26)]]
  append word [lindex $lChar [expr int(rand()*26)]]
  if {$len>2} { append word [lindex $lChar [expr int(rand()*26)]] }
  if {$len>3} { append word [lindex $lChar [expr int(rand()*26)]] }
  lappend lVocab $word
}

proc random_term {} { lindex $::lVocab [expr {int(rand()*$::nVocab)}] }

# Return a document consisting of $nWord arbitrarily selected terms
# from the $::lVocab list.
#
proc generate_doc {nWord} {
  set doc [list]
  for {set i 0} {$i < $nWord} {incr i} {
    lappend doc [random_term]
  }
  return $doc
}

# Primitives to update the table.
#
unset -nocomplain t1
proc insert_row {rowid} {
  set a [generate_doc [expr int((rand()*100))]]
  set b [generate_doc [expr int((rand()*100))]]
  set c [generate_doc [expr int((rand()*100))]]
  execsql { INSERT INTO t1(docid, a, b, c) VALUES($rowid, $a, $b, $c) }
  set ::t1($rowid) [list $a $b $c]
}
proc delete_row {rowid} {
  execsql { DELETE FROM t1 WHERE rowid = $rowid }
  catch {unset ::t1($rowid)}
}
proc update_row {rowid} {
  set cols {a b c}
  set iCol [expr int(rand()*3)]
  set doc [generate_doc [expr int((rand()*100))]]
  lset ::t1($rowid) $iCol $doc
  execsql "UPDATE t1 SET [lindex $cols $iCol] = \$doc WHERE rowid = \$rowid"
}

proc simple_phrase {zPrefix} {
  set ret [list]

  set reg [string map {* {[^ ]*}} $zPrefix]
  set reg " $reg "

  foreach key [lsort -integer [array names ::t1]] {
    set value $::t1($key)
    set cnt [list]
    foreach col $value {
      if {[regexp $reg " $col "]} { lappend ret $key ; break }
    }
  }

  #lsort -uniq -integer $ret
  set ret
}

# This [proc] is used to test the FTS3 matchinfo() function.
#
proc simple_token_matchinfo {zToken bDesc} {

  set nDoc(0) 0
  set nDoc(1) 0
  set nDoc(2) 0
  set nHit(0) 0
  set nHit(1) 0
  set nHit(2) 0

  set dir -inc
  if {$bDesc} { set dir -dec }

  foreach key [array names ::t1] {
    set value $::t1($key)
    set a($key) [list]
    foreach i {0 1 2} col $value {
      set hit [llength [lsearch -all $col $zToken]]
      lappend a($key) $hit
      incr nHit($i) $hit
      if {$hit>0} { incr nDoc($i) }
    }
  }

  set ret [list]
  foreach docid [lsort -integer $dir [array names a]] {
    if { [lindex [lsort -integer $a($docid)] end] } {
      set matchinfo [list 1 3]
      foreach i {0 1 2} hit $a($docid) {
        lappend matchinfo $hit $nHit($i) $nDoc($i)
      }
      lappend ret $docid $matchinfo
    }
  }

  set ret
}

proc simple_near {termlist nNear} {
  set ret [list]

  foreach {key value} [array get ::t1] {
    foreach v $value {

      set l [lsearch -exact -all $v [lindex $termlist 0]]
      foreach T [lrange $termlist 1 end] {
        set l2 [list]
        foreach i $l {
          set iStart [expr $i - $nNear - 1]
          set iEnd [expr $i + $nNear + 1]
          if {$iStart < 0} {set iStart 0}
          foreach i2 [lsearch -exact -all [lrange $v $iStart $iEnd] $T] {
            incr i2 $iStart
            if {$i2 != $i} { lappend l2 $i2 }
          }
        }
        set l [lsort -uniq -integer $l2]
      }

      if {[llength $l]} {
        #puts "MATCH($key): $v"
        lappend ret $key
      }
    }
  }

  lsort -unique -integer $ret
}

# The following three procs:
#
#   setop_not A B
#   setop_or  A B
#   setop_and A B
#
# each take two arguments. Both arguments must be lists of integer values
# sorted by value. The return value is the list produced by evaluating
# the equivalent of "A op B", where op is the FTS3 operator NOT, OR or
# AND.
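#
# For example (the values here are purely illustrative):
#
#   setop_not {1 2 3 4} {2 4}    => {1 3}
#   setop_or  {1 3 5}   {2 3}    => {1 2 3 5}
#   setop_and {1 2 3 4} {2 4 6}  => {2 4}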
#
proc setop_not {A B} {
  foreach b $B { set n($b) {} }
  set ret [list]
  foreach a $A { if {![info exists n($a)]} {lappend ret $a} }
  return $ret
}
proc setop_or {A B} {
  lsort -integer -uniq [concat $A $B]
}
proc setop_and {A B} {
  foreach b $B { set n($b) {} }
  set ret [list]
  foreach a $A { if {[info exists n($a)]} {lappend ret $a} }
  return $ret
}

proc mit {blob} {
  set scan(littleEndian) i*
  set scan(bigEndian) I*
  binary scan $blob $scan($::tcl_platform(byteOrder)) r
  return $r
}
db func mit mit
set sqlite_fts3_enable_parentheses 1

proc do_orderbydocid_test {tn sql res} {
  uplevel [list do_select_test $tn.asc "$sql ORDER BY docid ASC" $res]
  uplevel [list do_select_test $tn.desc "$sql ORDER BY docid DESC" \
    [lsort -int -dec $res]
  ]
}

set NUM_TRIALS 100

foreach {nodesize order} {
  50    DESC
  50    ASC
  500   ASC
  1000  DESC
  2000  ASC
} {
  catch { array unset ::t1 }
  set testname "$nodesize/$order"

  # Create the FTS3 table. Populate it (and the Tcl array) with 100 rows.
  #
  db transaction {
    catchsql { DROP TABLE t1 }
    execsql "CREATE VIRTUAL TABLE t1 USING fts4(a, b, c, order=$order)"
    execsql "INSERT INTO t1(t1) VALUES('nodesize=$nodesize')"
    for {set i 0} {$i < 100} {incr i} { insert_row $i }
  }

  for {set iTest 1} {$iTest <= $NUM_TRIALS} {incr iTest} {
    catchsql COMMIT

    set DO_MALLOC_TEST 0
    set nRep 10
    if {$iTest==100 && $nodesize==50} {
      set DO_MALLOC_TEST 1
      set nRep 2
    }

    set ::testprefix fts3rnd-1.$testname.$iTest

    # Delete one row, update one row and insert one row.
    #
    set rows [array names ::t1]
    set nRow [llength $rows]
    set iUpdate [lindex $rows [expr {int(rand()*$nRow)}]]
    set iDelete $iUpdate
    while {$iDelete == $iUpdate} {
      set iDelete [lindex $rows [expr {int(rand()*$nRow)}]]
    }
    set iInsert $iUpdate
    while {[info exists ::t1($iInsert)]} {
      set iInsert [expr {int(rand()*1000000)}]
    }
    execsql BEGIN
      insert_row $iInsert
      update_row $iUpdate
      delete_row $iDelete
    if {0==($iTest%2)} { execsql COMMIT }

    if {0==($iTest%2)} {
      #do_test 0 { fts3_integrity_check t1 } ok
    }

    # Pick 10 terms from the vocabulary. Check that the results of querying
    # the database for the set of documents containing each of these terms
    # is the same as the result obtained by scanning the contents of the Tcl
    # array for each term.
    #
    for {set i 0} {$i < 10} {incr i} {
      set term [random_term]
      do_select_test 1.$i.asc {
        SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
        ORDER BY docid ASC
      } [simple_token_matchinfo $term 0]
      do_select_test 1.$i.desc {
        SELECT docid, mit(matchinfo(t1)) FROM t1 WHERE t1 MATCH $term
        ORDER BY docid DESC
      } [simple_token_matchinfo $term 1]
    }

    # This time, use all but the last character of each term as a term
    # prefix to query for. Test that querying the Tcl array produces the
    # same results as querying the FTS3 table for the prefix.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set prefix [string range [random_term] 0 end-1]
      set match "${prefix}*"
      do_orderbydocid_test 2.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $match]
    }

    # Similar to the above, except for phrase queries.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set term [list [random_term] [random_term]]
      set match "\"$term\""
      do_orderbydocid_test 3.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $term]
    }

    # Three word phrases.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set term [list [random_term] [random_term] [random_term]]
      set match "\"$term\""
      do_orderbydocid_test 4.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $term]
    }
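    # A note on the phrase tests above, using invented terms purely for
    # illustration: for a two-term phrase such as {ab cde}, the text bound
    # to $match is "ab cde" (double-quotes included), and the expected
    # result is computed by [simple_phrase], which turns any "*" into the
    # regular expression piece [^ ]* and matches the space-padded phrase
    # against each space-padded column value stored in the ::t1 array.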
    # Three word phrases made up of term-prefixes.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set    query "[string range [random_term] 0 end-1]* "
      append query "[string range [random_term] 0 end-1]* "
      append query "[string range [random_term] 0 end-1]*"
      set match "\"$query\""
      do_orderbydocid_test 5.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_phrase $query]
    }

    # A NEAR query with terms as the arguments:
    #
    #     ... MATCH '$term1 NEAR $term2' ...
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set terms [list [random_term] [random_term]]
      set match [join $terms " NEAR "]
      do_orderbydocid_test 6.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_near $terms 10]
    }

    # A 3-way NEAR query with terms as the arguments.
    #
    for {set i 0} {$i < $nRep} {incr i} {
      set terms [list [random_term] [random_term] [random_term]]
      set nNear 11
      set match [join $terms " NEAR/$nNear "]
      do_orderbydocid_test 7.$i {
        SELECT docid FROM t1 WHERE t1 MATCH $match
      } [simple_near $terms $nNear]
    }

    # Set operations on simple term queries.
    #
    foreach {tn op proc} {
       8 OR  setop_or
       9 NOT setop_not
      10 AND setop_and
    } {
      for {set i 0} {$i < $nRep} {incr i} {
        set term1 [random_term]
        set term2 [random_term]
        set match "$term1 $op $term2"
        do_orderbydocid_test $tn.$i {
          SELECT docid FROM t1 WHERE t1 MATCH $match
        } [$proc [simple_phrase $term1] [simple_phrase $term2]]
      }
    }

    # Set operations on NEAR queries.
    #
    foreach {tn op proc} {
      11 OR  setop_or
      12 NOT setop_not
      13 AND setop_and
    } {
      for {set i 0} {$i < $nRep} {incr i} {
        set term1 [random_term]
        set term2 [random_term]
        set term3 [random_term]
        set term4 [random_term]
        set match "$term1 NEAR $term2 $op $term3 NEAR $term4"
        do_orderbydocid_test $tn.$i {
          SELECT docid FROM t1 WHERE t1 MATCH $match
        } [$proc \
            [simple_near [list $term1 $term2] 10] \
            [simple_near [list $term3 $term4] 10]
          ]
      }
    }

    catchsql COMMIT
  }
}

finish_test