Documentation Source Text

Artifact Content
Login

Artifact 66023215d344f44718955c8e27bed8dc8ea38d35:



load ./parsehtml.so

# Build the list of documents to index.
#
# Globs the current directory for *.html, c3ref/*.html and
# releaselog/*.html and returns the matching relative paths, minus any
# path that matches one of the exclusion patterns listed in the body.
#
proc document_list {} {
  # Patterns (string match syntax) of paths that must not be indexed.
  set excluded {
    *crossref*
    fileio.html
    capi3ref.html
    changes.html
    btreemodule.html
  }

  set result {}
  foreach path [glob *.html c3ref/*.html releaselog/*.html] {
    set keep 1
    foreach pattern $excluded {
      if {[string match $pattern $path]} {
        set keep 0
        break
      }
    }
    if {$keep} {
      lappend result $path
    }
  }
  return $result
}

# Return the entire contents of the file named zFile.
#
# The file is opened with the default channel settings. Any error raised
# by open or read is propagated to the caller; if read fails, the channel
# is closed before the error is re-raised so that it does not leak.
#
proc readfile {zFile} {
  set fd [open $zFile]
  if {[catch { read $fd } ret]} {
    # Save the error state before close can overwrite it.
    set savedInfo $::errorInfo
    set savedCode $::errorCode
    catch { close $fd }
    error $ret $savedInfo $savedCode
  }
  close $fd
  return $ret
}

# Parser callback used when scanning the keyword index document.
#
# tag is compared case-insensitively. For an "a" tag, details is treated
# as a list of attribute name/value pairs; for the empty tag it is treated
# as text. State is kept in the global array K:
#
#   K(hyperlink)  href of the <a> element currently open, if any
#   K(<href>)     text accumulated inside links that point at <href>
#
proc keywordparse_callback {tag details} {
  global K
  switch -- [string tolower $tag] {
    "" {
      # Text only counts as keyword text while a link is open.
      if {[info exists K(hyperlink)]} {
        append K($K(hyperlink)) $details
      }
    }
    "a" {
      array set D $details
      if {[info exists D(href)]} { set K(hyperlink) $D(href) }
    }
    "/a" {
      # Clear the current link. The marker lives in K, not P, so it must
      # be unset there or text after the link keeps being appended.
      unset -nocomplain K(hyperlink)
    }
  }
}

#-------------------------------------------------------------------------
# This function is used as the callback when parsing ordinary documents 
# (not the keywords document).
#
# Rules for extracting fragment "titles". A fragment title consists of
# all text that follows the tag that opens the fragment until either:
#
#   1. 80 characters have been parsed, or
#   2. 8 characters have been parsed and one of the following is
#      encountered:
#      a) A block element opening or closing tag, or
#      b) A <br> element, or
#      c) A "." character.
#
# Note: the implementation below differs from these rules in two ways:
# it requires more than 4 characters (not 8) before a title may be closed,
# and it does not treat a "." character as a terminator.
#
# Parser callback for ordinary documents. Accumulates results in the
# global array P, which the caller must initialise with the elements
# text, isTitle, fragments and ftext before parsing:
#
#   P(text)       all document text seen so far (reset at a
#                 <div class=startsearch> element)
#   P(title)      text found between <title> and </title>
#   P(fragments)  flat list of name/title/text triples; the triple for
#                 the fragment currently open holds only its name
#   P(ftitle)     title of the current fragment ("" until decided)
#   P(ftext)      text of the current fragment
#
# tag is compared case-insensitively; the empty tag denotes text, passed
# in details. For element tags, details is treated as a list of attribute
# name/value pairs.
#
proc docparse_callback {tag details} {
  global P
  set tag [string tolower $tag]
  switch -glob -- $tag {
    "" {
      append P(text) " $details"
      if {$P(isTitle)} { append P(title) $details }
      # Fragment text is only collected once a fragment has been opened.
      if {[llength $P(fragments)]} {
        append P(ftext) " $details"
      }
    }

    "title"  { set P(isTitle) 1 }
    "/title" { set P(isTitle) 0 }

    "a" {
      # An anchor with a name attribute opens a new fragment. Complete
      # the triple of the previous fragment first, if there is one.
      array set D $details
      if {[info exists D(name)]} {
        if {[llength $P(fragments)]} {
          lappend P(fragments) $P(ftitle) $P(ftext)
        }
        lappend P(fragments) $D(name)
        set P(ftext) ""
        set P(ftitle) ""
        unset -nocomplain P(ftitleclose)
      }
    }
    "h*" {
      # Any tag starting with h that carries an id attribute opens a new
      # fragment. NOTE(review): the glob also matches non-heading tags
      # such as hr, head and html; confirm that this is intended.
      array set D $details
      if {[info exists D(id)]} {
        if {[llength $P(fragments)]} {
          lappend P(fragments) $P(ftitle) $P(ftext)
        }
        lappend P(fragments) $D(id)
        set P(ftext) ""
        set P(ftitle) ""
      }
    }

    div {
      # Discard everything collected before the start-of-search marker.
      array set D $details
      if {[info exists D(class)] && $D(class) == "startsearch"} {
        set P(text) ""
      }
    }
  }

  # Decide the fragment title once more than 4 characters of fragment
  # text are available and no title has been chosen yet.
  set ftext [string trim $P(ftext) " \v\n"]
  if {[string length $ftext]>4 && $P(ftitle) == ""} {
    # Tags that end a title. h6 and /h6 replace the entries h and /h,
    # which are not HTML tags, so that h6 headings are handled like
    # h1 through h5.
    set blocktags {
      br td /td th /th p /p
      h1 h2 h3 h4 h5 h6 /h1 /h2 /h3 /h4 /h5 /h6
    }
    if {[lsearch -exact $blocktags $tag]>=0} {
      set P(ftitle) $ftext
      set P(ftext)  ""
    } elseif {[string length $ftext]>80} {
      # Too long for a title: split at the last space within the first
      # 80 characters, or at character 80 if there is no space.
      set idx [string last " " [string range $ftext 0 79]]
      if {$idx<0} { set idx 80 }
      set P(ftitle) [string range $ftext 0 [expr {$idx-1}]]
      set P(ftext)  [string range $ftext $idx end]
    }
  }
}

# Add one searchable entry to the database.
#
#   url       value stored in the last column of the pagedata row
#   keywords  keyword text for the entry
#   title     title text for the entry
#   content   body text for the entry
#
# The three text fields are inserted into the page table. Their token
# counts, as reported by tokencount using ::tokenizer, are stored together
# with the url in the pagedata table. The pagedata key is left NULL.
# NOTE(review): presumably the key SQLite assigns is expected to equal
# the docid of the row just added to page - confirm.
#
proc insert_entry {url keywords title content} {
  # Count the tokens of each field, storing each result in the variable
  # that the SQL below refers to.
  foreach {countVar fieldText} [list \
    nTitle    $title                 \
    nKeywords $keywords              \
    nContent  $content               \
  ] {
    set $countVar [tokencount $::tokenizer $fieldText]
  }
  db eval {
    INSERT INTO page VALUES($keywords, $title, $content);
    INSERT INTO pagedata VALUES(NULL, $nKeywords, $nTitle, $nContent, $url);
  }
}

# Rebuild the search index from scratch.
#
# Inside a single transaction: recreate the page and pagedata tables,
# parse keyword_index.html to collect link keywords into ::K, then parse
# every document returned by document_list and insert one entry for the
# document itself plus one entry per fragment found in it. Afterwards the
# full-text index is optimized and the database is vacuumed.
#
# Returns the value of the local "report" list, to which nothing is ever
# appended, so the result is always an empty list.
#
proc rebuild_database {} {
  set report [list]

  db transaction {
    # Set up the schema. The document tables are always dropped and
    # created afresh; the config table is only created when missing and
    # is otherwise left alone.
    db eval {
      CREATE TABLE IF NOT EXISTS config(item TEXT, value TEXT);

      DROP TABLE IF EXISTS page;
      DROP TABLE IF EXISTS pagedata;
      CREATE VIRTUAL TABLE page USING fts3(
        keywords,                           -- Document keywords
        title,                              -- Title (or first heading)
        content,                            -- Complete document text
        tokenize porter                     -- Built-in porter tokenizer
      );
      CREATE TABLE pagedata(
        docid INTEGER PRIMARY KEY,          -- Maps to docid of page
        nk INTEGER, nt INTEGER, nc INTEGER, -- Tokens in each ft field
        url TEXT                            -- Relative path to document
      );
    }

    # Collect the keywords attached to each link target.
    parsehtml [readfile keyword_index.html] keywordparse_callback

    # Index every document, then each of its fragments.
    foreach file [document_list] {
      set zHtml [readfile $file]

      # Reset the parser state shared with docparse_callback:
      #   text       the full document text
      #   isTitle    true while parsing the contents of <title>
      #   fragments  list of document fragments parsed
      #   ftext      text of the current document fragment
      array unset ::P
      array set ::P [list text "" isTitle 0 fragments [list] ftext ""]

      parsehtml $zHtml docparse_callback

      # The last fragment is still open when parsing ends; complete it.
      if {[info exists ::P(ftitle)]} {
        lappend ::P(fragments) $::P(ftitle) $::P(ftext)
      }

      # Entry for the document as a whole.
      if {[info exists ::K($file)]} {
        set keyword $::K($file)
      } else {
        set keyword ""
      }
      if {![info exists ::P(title)]} {
        set ::P(title) "No Title"
      }
      insert_entry $file $keyword $::P(title) $::P(text)

      # One entry per fragment, addressed as file#name.
      foreach {name title text} $::P(fragments) {
        set url "$file#$name"
        if {[info exists ::K($url)]} {
          set keyword [string trim $::K($url) " \n\v"]
        } else {
          set keyword ""
        }
        insert_entry $url $keyword $title $text
      }
    }

    db eval { INSERT INTO page(page) VALUES('optimize') }
  }
  db eval VACUUM

  return $report
}

# Script driver. Everything below runs when the file is sourced.
#
# Work from inside the "doc" directory: the glob patterns in document_list
# and the file names passed to readfile are all relative paths.
cd doc
# Open search.db and make it available as the "db" command used by
# insert_entry and rebuild_database.
sqlite3 db search.db
# Look up the porter tokenizer once; insert_entry hands this value to
# tokencount for every field it counts.
set ::tokenizer [db one {SELECT fts3_tokenizer('porter')}]
rebuild_database