/ Check-in [c646e403]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Enhance fts5txt2db.tcl, a script used to generate fts5/fts4 databases for performance testing.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5-perf
Files: files | file ages | folders
SHA1: c646e40350e5aa91abcf52de61fb31275bad38f9
User & Date: dan 2016-01-26 17:08:22
Context
2016-01-26
19:30
Improve the performance of fts5 column filters on detail=col tables. check-in: 249a2d07 user: dan tags: fts5-perf
17:08
Enhance fts5txt2db.tcl, a script used to generate fts5/fts4 databases for performance testing. check-in: c646e403 user: dan tags: fts5-perf
2016-01-23
18:51
Fix an fts5 problem with using both xPhraseFirst() and xPhraseFirstColumn() within a single statement in detail=col mode. check-in: 72d53699 user: dan tags: fts5-perf
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts5/tool/fts5txt2db.tcl.

     1      1   
     2      2   
     3         -proc usage {} {
     4         -  puts stderr "$::argv0 ?OPTIONS? DATABASE FILE1..."
            3  +#-------------------------------------------------------------------------
            4  +# Command line options processor.
            5  +#
            6  +proc command_line_error {O E {msg ""}} {
            7  +  if {$msg != ""} {
            8  +    puts stderr "Error: $msg"
            9  +    puts stderr ""
           10  +  }
           11  +
           12  +  set L [list]
           13  +  foreach o $O {
           14  +    if {[llength $o]==1} {
           15  +      lappend L [string toupper $o]
           16  +    }
           17  +  }
           18  +
           19  +  puts stderr "Usage: $::argv0 ?SWITCHES? $L"
           20  +  puts stderr ""
           21  +  puts stderr "Switches are:"
           22  +  foreach o $O {
           23  +    if {[llength $o]==3} {
           24  +      foreach {a b c} $o {}
           25  +      puts stderr [format "    -%-15s %s (default \"%s\")" "$a VAL" $c $b]
           26  +    } elseif {[llength $o]==2} {
           27  +      foreach {a b} $o {}
           28  +      puts stderr [format "    -%-15s %s" $a $b]
           29  +    }
           30  +  }
     5     31     puts stderr ""
     6         -  puts stderr "Options are"
     7         -  puts stderr "  -fts5"
     8         -  puts stderr "  -fts4"
     9         -  puts stderr "  -colsize <list of column sizes>"
    10         -  puts stderr {
           32  +  puts stderr $E
           33  +  exit -1
           34  +}
           35  +
           36  +proc process_command_line {avar lArgs O E} {
           37  +
           38  +  upvar $avar A
           39  +  set zTrailing ""       ;# True if ... is present in $O
           40  +  set lPosargs [list]
           41  +
           42  +  # Populate A() with default values. Also, for each switch in the command
           43  +  # line spec, set an entry in the idx() array as follows:
           44  +  #
           45  +  #  {tblname t1 "table name to use"}  
           46  +  #      -> [set idx(-tblname) {tblname t1 "table name to use"}  
           47  +  #
           48  +  # For each position parameter, append its name to $lPosargs. If the ...
           49  +  # specifier is present, set $zTrailing to the name of the prefix.
           50  +  #
           51  +  foreach o $O {
           52  +    set nm [lindex $o 0]
           53  +    set nArg [llength $o]
           54  +    switch -- $nArg {
           55  +      1 {
           56  +        if {[string range $nm end-2 end]=="..."} {
           57  +          set zTrailing [string range $nm 0 end-3]
           58  +        } else {
           59  +          lappend lPosargs $nm
           60  +        }
           61  +      }
           62  +      2 {
           63  +        set A($nm) 0
           64  +        set idx(-$nm) $o
           65  +      }
           66  +      3 {
           67  +        set A($nm) [lindex $o 1]
           68  +        set idx(-$nm) $o
           69  +      }
           70  +      default {
           71  +        error "Error in command line specification"
           72  +      }
           73  +    }
           74  +  }
           75  +
           76  +  # Set explicitly specified option values
           77  +  #
           78  +  set nArg [llength $lArgs]
           79  +  for {set i 0} {$i < $nArg} {incr i} {
           80  +    set opt [lindex $lArgs $i]
           81  +    if {[string range $opt 0 0]!="-" || $opt=="--"} break
           82  +    set c [array names idx "${opt}*"]
           83  +    if {[llength $c]==0} { command_line_error $O $E "Unrecognized option: $opt"}
           84  +    if {[llength $c]>1}  { command_line_error $O $E "Ambiguous option: $opt"}
           85  +
           86  +    if {[llength $idx($c)]==3} {
           87  +      if {$i==[llength $lArgs]-1} {
           88  +        command_line_error $O $E "Option requires argument: $c" 
           89  +      }
           90  +      incr i
           91  +      set A([lindex $idx($c) 0]) [lindex $lArgs $i]
           92  +    } else {
           93  +      set A([lindex $idx($c) 0]) 1
           94  +    }
           95  +  }
           96  +
           97  +  # Deal with position arguments.
           98  +  #
           99  +  set nPosarg [llength $lPosargs]
          100  +  set nRem [expr $nArg - $i]
          101  +  if {$nRem < $nPosarg || ($zTrailing=="" && $nRem > $nPosarg)} {
          102  +    command_line_error $O $E
          103  +  }
          104  +  for {set j 0} {$j < $nPosarg} {incr j} {
          105  +    set A([lindex $lPosargs $j]) [lindex $lArgs [expr $j+$i]]
          106  +  }
          107  +  if {$zTrailing!=""} {
          108  +    set A($zTrailing) [lrange $lArgs [expr $j+$i] end]
          109  +  }
          110  +}
          111  +# End of command line options processor.
          112  +#-------------------------------------------------------------------------
          113  +
          114  +
          115  +process_command_line A $argv {
          116  +  {fts5                 "use fts5"}
          117  +  {fts4                 "use fts4"}
          118  +  {colsize   "10 10 10" "list of column sizes"}
          119  +  {tblname   "t1"       "table name to create"}
          120  +  {detail    "full"     "Fts5 detail mode to use"}
          121  +  {repeat    1          "Load each file this many times"}
          122  +  database
          123  +  file...
          124  +} {
    11    125   This script is designed to create fts4/5 tables with more than one column.
    12    126   The -colsize option should be set to a Tcl list of integer values, one for
    13    127   each column in the table. Each value is the number of tokens that will be
    14    128   inserted into the column value for each row. For example, setting the -colsize
    15    129   option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
    16    130   tokens per row in each, respectively.
    17    131   
................................................................................
    18    132   Each "FILE" argument should be a text file. The contents of these text files is
    19    133   split on whitespace characters to form a list of tokens. The first N1 tokens
    20    134   are used for the first column of the first row, where N1 is the first element
    21    135   of the -colsize list. The next N2 are used for the second column of the first
    22    136   row, and so on. Rows are added to the table until the entire list of tokens
    23    137   is exhausted.
    24    138   }
    25         -  exit -1
          139  +
          140  +if {$A(fts4)} {
          141  +  set A(fts) fts4
          142  +} else {
          143  +  set A(fts) fts5
    26    144   }
    27    145   
    28         -set O(aColSize)       [list 10 10 10]
    29         -set O(tblname)        t1
    30         -set O(fts)            fts5
    31         -
    32         -
    33         -set options_with_values {-colsize}
    34         -
    35         -for {set i 0} {$i < [llength $argv]} {incr i} {
    36         -  set opt [lindex $argv $i]
    37         -  if {[string range $opt 0 0]!="-"} break
    38         -
    39         -  if {[lsearch $options_with_values $opt]>=0} {
    40         -    incr i
    41         -    if {$i==[llength $argv]} usage
    42         -    set val [lindex $argv $i]
    43         -  }
    44         -
    45         -  switch -- $opt {
    46         -    -colsize {
    47         -      set O(aColSize) $val
    48         -    }
    49         -
    50         -    -fts4 {
    51         -      set O(fts) fts4
    52         -    }
    53         -
    54         -    -fts5 {
    55         -      set O(fts) fts5
    56         -    }
    57         -  }
    58         -}
    59         -
    60         -if {$i > [llength $argv]-2} usage
    61         -set O(db) [lindex $argv $i]
    62         -set O(files) [lrange $argv [expr $i+1] end]
    63         -
    64         -sqlite3 db $O(db)
          146  +sqlite3 db $A(database)
    65    147   
    66    148   # Create the FTS table in the db. Return a list of the table columns.
    67    149   #
    68    150   proc create_table {} {
    69         -  global O
          151  +  global A
    70    152     set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z]
    71    153   
    72         -  set nCol [llength $O(aColSize)]
          154  +  set nCol [llength $A(colsize)]
    73    155     set cols [lrange $cols 0 [expr $nCol-1]]
    74    156   
    75         -  set sql    "CREATE VIRTUAL TABLE IF NOT EXISTS $O(tblname) USING $O(fts) ("
          157  +  set sql    "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
    76    158     append sql [join $cols ,]
    77         -  append sql ");"
          159  +  if {$A(fts)=="fts5"} { append sql ",detail=$A(detail));" }
    78    160   
    79    161     db eval $sql
    80    162     return $cols
    81    163   }
    82    164   
    83    165   # Return a list of tokens from the named file.
    84    166   #
................................................................................
    85    167   proc readfile {file} {
    86    168     set fd [open $file]
    87    169     set data [read $fd]
    88    170     close $fd
    89    171     split $data
    90    172   }
    91    173   
          174  +proc repeat {L n} {
          175  +  set res [list]
          176  +  for {set i 0} {$i < $n} {incr i} {
          177  +    set res [concat $res $L]
          178  +  }
          179  +  set res
          180  +}
          181  +
    92    182   
    93    183   # Load all the data into a big list of tokens.
    94    184   #
    95    185   set tokens [list]
    96         -foreach f $O(files) {
    97         -  set tokens [concat $tokens [readfile $f]]
          186  +foreach f $A(file) {
          187  +  set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
    98    188   }
    99    189   
   100    190   set N [llength $tokens]
   101    191   set i 0
   102    192   set cols [create_table]
   103         -set sql "INSERT INTO $O(tblname) VALUES(\$[lindex $cols 0]"
          193  +set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
   104    194   foreach c [lrange $cols 1 end] {
   105         -  append sql ", \$A($c)"
          195  +  append sql ", \$R($c)"
   106    196   }
   107    197   append sql ")"
   108    198   
   109    199   db eval BEGIN
   110    200     while {$i < $N} {
   111         -    foreach c $cols s $O(aColSize) {
   112         -      set A($c) [lrange $tokens $i [expr $i+$s-1]]
          201  +    foreach c $cols s $A(colsize) {
          202  +      set R($c) [lrange $tokens $i [expr $i+$s-1]]
   113    203         incr i $s
   114    204       }
   115    205       db eval $sql
   116    206     }
   117    207   db eval COMMIT
   118    208   
   119    209   
   120    210