Documentation Source Text: Artifact [66023215d3]

Artifact 66023215d344f44718955c8e27bed8dc8ea38d35:

File search/buildsearchdb.tcl — part of check-in [26dedb8a9a] at 2012-05-30 14:25:11 on branch trunk — Fix the "search" box so that it accepts upper and lower case search terms (with case folding) and OR and NEAR search and phrase searches. Exclude some of the aggregate documents from indexing. (user: drh size: 6046)

load ./parsehtml.so

# Return a list of relative paths to documents that should be included 
# in the index.
#
proc document_list {} {
  set files [list]
  foreach f [glob *.html c3ref/*.html releaselog/*.html] {
    if {![string match *crossref* $f]
     && ![string match fileio.html $f]
     && ![string match capi3ref.html $f]
     && ![string match changes.html $f]
     && ![string match btreemodule.html $f]
    } { lappend files $f }
  }
  return $files
}

proc readfile {zFile} {
  set fd [open $zFile]
  set ret [read $fd]
  close $fd
  return $ret
}

proc keywordparse_callback {tag details} {
  global K P
  switch -- [string tolower $tag] {
    "" {
      if {[info exists K(hyperlink)]} {
        append K($K(hyperlink)) $details
      }
    }
    "a" {
      array set D $details
      if {[info exists D(href)]} { set K(hyperlink) $D(href) }
    }
    "/a" {
      unset -nocomplain P(hyperlink)
    }
  }
}

#-------------------------------------------------------------------------
# This function is used as the callback when parsing ordinary documents 
# (not the keywords document).
#
# Rules for extracting fragment "titles". A fragment title consists of
# all text that follows the tag that opens the fragment until either:
#
#   1. 80 characters have been parsed, or
#   2. 8 characters have been parsed and one of the following is 
#        encountered:
#      a) A block element opening or closing tag, or
#      b) A <br> element, or
#      c) A "." character.
#
#   3. 8 characters have been parsed and a <br> tag or "." character is
#      encountered
#
proc docparse_callback {tag details} {
  global P
  set tag [string tolower $tag]
  switch -glob -- $tag {
    "" {
      append P(text) " $details"
      if {$P(isTitle)} { append P(title) $details }
      if {[llength $P(fragments)]} { 
        append P(ftext) " $details" 
      }
    }

    "title"  { set P(isTitle) 1 }
    "/title" { set P(isTitle) 0 }

    "a" { 
      array set D $details
      if {[info exists D(name)]} {
        if {[llength $P(fragments)]} { 
          lappend P(fragments) $P(ftitle) $P(ftext) 
        }
        lappend P(fragments) $D(name)
        set P(ftext) ""
        set P(ftitle) ""
        catch { unset P(ftitleclose) }
      }
    }
    "h*" {
      array set D $details
      if {[info exists D(id)]} {
        if {[llength $P(fragments)]} { 
          lappend P(fragments) $P(ftitle) $P(ftext) 
        }
        lappend P(fragments) $D(id)
        set P(ftext) ""
        set P(ftitle) ""
      }
    }

    div {
      array set D $details
      if {[info exists D(class)] && $D(class) == "startsearch"} { 
        set P(text) "" 
      }
    }
  }

  set ftext [string trim $P(ftext) " \v\n"]
  if {[string length $ftext]>4 && $P(ftitle) == ""} {
    set blocktags [list                               \
      br td /td th /th p /p                           \
      h1 h2 h3 h4 h5 h /h1 /h2 /h3 /h4 /h5 /h
    ]
    if {[lsearch $blocktags $tag]>=0} {
      set P(ftitle) $ftext
      set P(ftext)  ""
    } elseif {[string length $ftext]>80} {
      set idx [string last " " [string range $ftext 0 79]]
      if {$idx<0} { set idx 80 }
      set P(ftitle) [string range $ftext 0 [expr $idx-1]]
      set P(ftext)  [string range $ftext $idx end]
    } 
  }
}

proc insert_entry {url keywords title content} {
  set nTitle    [tokencount $::tokenizer $title]
  set nKeywords [tokencount $::tokenizer $keywords]
  set nContent  [tokencount $::tokenizer $content]
  db eval {
    INSERT INTO page VALUES($keywords, $title, $content);
    INSERT INTO pagedata VALUES(NULL, $nKeywords, $nTitle, $nContent, $url);
  }
}

proc rebuild_database {} {
  set report [list]

  db transaction {
    # Create the database schema. If the schema already exists, then those
    # tables that contain document data are dropped and recreated by this
    # proc. The 'config' table is left untouched.
    #
    db eval {
      CREATE TABLE IF NOT EXISTS config(item TEXT, value TEXT);

      DROP TABLE IF EXISTS page;
      DROP TABLE IF EXISTS pagedata;
      CREATE VIRTUAL TABLE page USING fts3(
        keywords,                           -- Document keywords
        title,                              -- Title (or first heading)
        content,                            -- Complete document text
        tokenize porter                     -- Built-in porter tokenizer
      );
      CREATE TABLE pagedata(
        docid INTEGER PRIMARY KEY,          -- Maps to docid of page
        nk INTEGER, nt INTEGER, nc INTEGER, -- Tokens in each ft field
        url TEXT                            -- Relative path to document
      );
    }

    set zHtml [readfile keyword_index.html]
    parsehtml $zHtml keywordparse_callback

    # Scan the file-system for HTML documents. Add each document found to
    # the page and pagedata tables.
    foreach file [document_list] {
      set zHtml [readfile $file]

      array unset ::P
      set ::P(text) ""                 ;# The full document text
      set ::P(isTitle) 0               ;# True while parsing contents of <title>
      set ::P(fragments) [list]        ;# List of document fragments parsed
      set ::P(ftext) ""                ;# Text of current document fragment 

      parsehtml $zHtml docparse_callback
      if {[info exists ::P(ftitle)]} {
        lappend ::P(fragments) $::P(ftitle) $::P(ftext)
      }

      set len [string length $::P(text)]
      set keyword ""
      catch { set keyword $::K($file) }
      if {![info exists ::P(title)]} {set ::P(title) "No Title"}
      insert_entry $file $keyword $::P(title) $::P(text)

      foreach {name title text} $::P(fragments) {
        set url "$file#$name"
        set keyword ""
        catch { set keyword [string trim $::K($url) " \n\v"] }
        insert_entry $url $keyword $title $text
      }
    }

    db eval { INSERT INTO page(page) VALUES('optimize') }
  }
  db eval VACUUM

  set report
}

cd doc
sqlite3 db search.db
set ::tokenizer [db one {SELECT fts3_tokenizer('porter')}]
rebuild_database