/ Check-in [c646e403]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Enhance fts5txt2db.tcl, a script used to generate fts5/fts4 databases for performance testing.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5-perf
Files: files | file ages | folders
SHA1: c646e40350e5aa91abcf52de61fb31275bad38f9
User & Date: dan 2016-01-26 17:08:22
Context
2016-01-26
19:30
Improve the performance of fts5 column filters on detail=col tables. check-in: 249a2d07 user: dan tags: fts5-perf
17:08
Enhance fts5txt2db.tcl, a script used to generate fts5/fts4 databases for performance testing. check-in: c646e403 user: dan tags: fts5-perf
2016-01-23
18:51
Fix an fts5 problem with using both xPhraseFirst() and xPhraseFirstColumn() within a single statement in detail=col mode. check-in: 72d53699 user: dan tags: fts5-perf
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/tool/fts5txt2db.tcl.

1
2
3

4




5









6
7
8









9
10




























































































11
12
13
14
15
16
17
..
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77

78
79
80
81
82
83
84
..
85
86
87
88
89
90
91








92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120


proc usage {} {

  puts stderr "$::argv0 ?OPTIONS? DATABASE FILE1..."




  puts stderr ""









  puts stderr "Options are"
  puts stderr "  -fts5"
  puts stderr "  -fts4"









  puts stderr "  -colsize <list of column sizes>"
  puts stderr {




























































































This script is designed to create fts4/5 tables with more than one column.
The -colsize option should be set to a Tcl list of integer values, one for
each column in the table. Each value is the number of tokens that will be
inserted into the column value for each row. For example, setting the -colsize
option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
tokens per row in each, respectively.

................................................................................
Each "FILE" argument should be a text file. The contents of these text files is
split on whitespace characters to form a list of tokens. The first N1 tokens
are used for the first column of the first row, where N1 is the first element
of the -colsize list. The next N2 are used for the second column of the first
row, and so on. Rows are added to the table until the entire list of tokens
is exhausted.
}
  exit -1
}

set O(aColSize)       [list 10 10 10]
set O(tblname)        t1
set O(fts)            fts5


set options_with_values {-colsize}

for {set i 0} {$i < [llength $argv]} {incr i} {
  set opt [lindex $argv $i]
  if {[string range $opt 0 0]!="-"} break

  if {[lsearch $options_with_values $opt]>=0} {
    incr i
    if {$i==[llength $argv]} usage
    set val [lindex $argv $i]
  }

  switch -- $opt {
    -colsize {
      set O(aColSize) $val
    }

    -fts4 {
      set O(fts) fts4
    }

    -fts5 {

      set O(fts) fts5
    }
  }
}

if {$i > [llength $argv]-2} usage
set O(db) [lindex $argv $i]
set O(files) [lrange $argv [expr $i+1] end]

sqlite3 db $O(db)

# Create the FTS table in the db. Return a list of the table columns.
#
proc create_table {} {
  global O
  set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z]

  set nCol [llength $O(aColSize)]
  set cols [lrange $cols 0 [expr $nCol-1]]

  set sql    "CREATE VIRTUAL TABLE IF NOT EXISTS $O(tblname) USING $O(fts) ("
  append sql [join $cols ,]
  append sql ");"


  db eval $sql
  return $cols
}

# Return a list of tokens from the named file.
#
................................................................................
proc readfile {file} {
  set fd [open $file]
  set data [read $fd]
  close $fd
  split $data
}










# Load all the data into a big list of tokens.
#
set tokens [list]
foreach f $O(files) {
  set tokens [concat $tokens [readfile $f]]
}

set N [llength $tokens]
set i 0
set cols [create_table]
set sql "INSERT INTO $O(tblname) VALUES(\$[lindex $cols 0]"
foreach c [lrange $cols 1 end] {
  append sql ", \$A($c)"
}
append sql ")"

db eval BEGIN
  while {$i < $N} {
    foreach c $cols s $O(aColSize) {
      set A($c) [lrange $tokens $i [expr $i+$s-1]]
      incr i $s
    }
    db eval $sql
  }
db eval COMMIT





<
>
|
>
>
>
>
|
>
>
>
>
>
>
>
>
>
|
|
|
>
>
>
>
>
>
>
>
>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
<
<
<
>
|
|
|
<
<
<
<
<
<
|




|


|


|

<
>







 







>
>
>
>
>
>
>
>




|
|





|

|





|
|








1
2

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
...
132
133
134
135
136
137
138

139























140
141



142
143
144
145






146
147
148
149
150
151
152
153
154
155
156
157
158

159
160
161
162
163
164
165
166
...
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210



#-------------------------------------------------------------------------
# Command line options processor.
#
proc command_line_error {O E {msg ""}} {
  if {$msg != ""} {
    puts stderr "Error: $msg"
    puts stderr ""
  }

  set L [list]
  foreach o $O {
    if {[llength $o]==1} {
      lappend L [string toupper $o]
    }
  }

  puts stderr "Usage: $::argv0 ?SWITCHES? $L"
  puts stderr ""
  puts stderr "Switches are:"
  foreach o $O {
    if {[llength $o]==3} {
      foreach {a b c} $o {}
      puts stderr [format "    -%-15s %s (default \"%s\")" "$a VAL" $c $b]
    } elseif {[llength $o]==2} {
      foreach {a b} $o {}
      puts stderr [format "    -%-15s %s" $a $b]
    }
  }
  puts stderr ""
  puts stderr $E
  exit -1
}

proc process_command_line {avar lArgs O E} {

  upvar $avar A
  set zTrailing ""       ;# True if ... is present in $O
  set lPosargs [list]

  # Populate A() with default values. Also, for each switch in the command
  # line spec, set an entry in the idx() array as follows:
  #
  #  {tblname t1 "table name to use"}  
  #      -> [set idx(-tblname) {tblname t1 "table name to use"}  
  #
  # For each position parameter, append its name to $lPosargs. If the ...
  # specifier is present, set $zTrailing to the name of the prefix.
  #
  foreach o $O {
    set nm [lindex $o 0]
    set nArg [llength $o]
    switch -- $nArg {
      1 {
        if {[string range $nm end-2 end]=="..."} {
          set zTrailing [string range $nm 0 end-3]
        } else {
          lappend lPosargs $nm
        }
      }
      2 {
        set A($nm) 0
        set idx(-$nm) $o
      }
      3 {
        set A($nm) [lindex $o 1]
        set idx(-$nm) $o
      }
      default {
        error "Error in command line specification"
      }
    }
  }

  # Set explicitly specified option values
  #
  set nArg [llength $lArgs]
  for {set i 0} {$i < $nArg} {incr i} {
    set opt [lindex $lArgs $i]
    if {[string range $opt 0 0]!="-" || $opt=="--"} break
    set c [array names idx "${opt}*"]
    if {[llength $c]==0} { command_line_error $O $E "Unrecognized option: $opt"}
    if {[llength $c]>1}  { command_line_error $O $E "Ambiguous option: $opt"}

    if {[llength $idx($c)]==3} {
      if {$i==[llength $lArgs]-1} {
        command_line_error $O $E "Option requires argument: $c" 
      }
      incr i
      set A([lindex $idx($c) 0]) [lindex $lArgs $i]
    } else {
      set A([lindex $idx($c) 0]) 1
    }
  }

  # Deal with position arguments.
  #
  set nPosarg [llength $lPosargs]
  set nRem [expr $nArg - $i]
  if {$nRem < $nPosarg || ($zTrailing=="" && $nRem > $nPosarg)} {
    command_line_error $O $E
  }
  for {set j 0} {$j < $nPosarg} {incr j} {
    set A([lindex $lPosargs $j]) [lindex $lArgs [expr $j+$i]]
  }
  if {$zTrailing!=""} {
    set A($zTrailing) [lrange $lArgs [expr $j+$i] end]
  }
}
# End of command line options processor.
#-------------------------------------------------------------------------


process_command_line A $argv {
  {fts5                 "use fts5"}
  {fts4                 "use fts4"}
  {colsize   "10 10 10" "list of column sizes"}
  {tblname   "t1"       "table name to create"}
  {detail    "full"     "Fts5 detail mode to use"}
  {repeat    1          "Load each file this many times"}
  database
  file...
} {
This script is designed to create fts4/5 tables with more than one column.
The -colsize option should be set to a Tcl list of integer values, one for
each column in the table. Each value is the number of tokens that will be
inserted into the column value for each row. For example, setting the -colsize
option to "5 10" creates an FTS table with 2 columns, with roughly 5 and 10
tokens per row in each, respectively.

................................................................................
Each "FILE" argument should be a text file. The contents of these text files is
split on whitespace characters to form a list of tokens. The first N1 tokens
are used for the first column of the first row, where N1 is the first element
of the -colsize list. The next N2 are used for the second column of the first
row, and so on. Rows are added to the table until the entire list of tokens
is exhausted.
}

























if {$A(fts4)} {
  set A(fts) fts4



} else {
  set A(fts) fts5
}







sqlite3 db $A(database)

# Create the FTS table in the db. Return a list of the table columns.
#
proc create_table {} {
  global A
  set cols [list a b c d e f g h i j k l m n o p q r s t u v w x y z]

  set nCol [llength $A(colsize)]
  set cols [lrange $cols 0 [expr $nCol-1]]

  set sql    "CREATE VIRTUAL TABLE IF NOT EXISTS $A(tblname) USING $A(fts) ("
  append sql [join $cols ,]

  if {$A(fts)=="fts5"} { append sql ",detail=$A(detail));" }

  db eval $sql
  return $cols
}

# Return a list of tokens from the named file.
#
................................................................................
proc readfile {file} {
  set fd [open $file]
  set data [read $fd]
  close $fd
  split $data
}

proc repeat {L n} {
  set res [list]
  for {set i 0} {$i < $n} {incr i} {
    set res [concat $res $L]
  }
  set res
}


# Load all the data into a big list of tokens.
#
set tokens [list]
foreach f $A(file) {
  set tokens [concat $tokens [repeat [readfile $f] $A(repeat)]]
}

set N [llength $tokens]
set i 0
set cols [create_table]
set sql "INSERT INTO $A(tblname) VALUES(\$R([lindex $cols 0])"
foreach c [lrange $cols 1 end] {
  append sql ", \$R($c)"
}
append sql ")"

db eval BEGIN
  while {$i < $N} {
    foreach c $cols s $A(colsize) {
      set R($c) [lrange $tokens $i [expr $i+$s-1]]
      incr i $s
    }
    db eval $sql
  }
db eval COMMIT