Documentation Source Text: Artifact [923c5323d4]

Artifact 923c5323d474718cc6cfdaebe072ac35a3a4f919:

File search/parsehtml.c — part of check-in [bfc81cf6e4] at 2010-01-08 17:57:37 on branch trunk — Filter out <script> tags when building the search database. (user: dan size: 7032)

/*
** This file contains some functions implemented in C used by the 
** buildsearchdb.tcl script. There are two Tcl commands:
**
**   parsehtml
**   tokencount
**
** Tcl command [parsehtml] is a helper command used to extract text and 
** markup tags from the HTML documents in the SQLite documentation. The
** [tokencount] command uses an FTS3 tokenizer to count the number of
** tokens in a document. Both of these are used while building the database
** only.
*/

#include <tcl.h>
#include <string.h>
#include <strings.h>
#include <assert.h>
#include <ctype.h>
#include <math.h>

/*
** True if c has bit 0x80 clear and isspace() reports it as white-space.
** Values with bit 0x80 set are rejected before isspace() is ever called.
** The argument may be evaluated twice, so it must be free of side effects.
*/
#define ISSPACE(c) (((c)&0x80)==0 && isspace(c))

#include "fts3_tokenizer.h"
#include "sqlite3.h"

/* Short aliases for unsigned integer types. */
typedef unsigned int u32;
typedef unsigned char u8;
typedef sqlite3_uint64 u64;

/*
** The set of handles needed to run one tokenizer over one document. A
** zero-filled structure means "nothing open"; tokenizerClose() leaves the
** structure in that state.
*/
typedef struct STokenizer {
  sqlite3_tokenizer_module *pMod;     /* Module whose x*() methods are used */
  sqlite3_tokenizer *pTokenizer;      /* Tokenizer obtained from xCreate() */
  sqlite3_tokenizer_cursor *pCursor;  /* Cursor obtained from xOpen() */
} STokenizer;

/*
** Release whatever *p currently holds: the cursor first (via xClose), then
** the tokenizer (via xDestroy). Handles that are NULL are skipped. The
** structure is zeroed afterwards.
*/
static void tokenizerClose(STokenizer *p){
  sqlite3_tokenizer_module *pModule = p->pMod;

  if( p->pCursor!=0 ){
    pModule->xClose(p->pCursor);
  }
  if( p->pTokenizer!=0 ){
    pModule->xDestroy(p->pTokenizer);
  }
  memset(p, 0, sizeof(*p));
}

/*
** Create a tokenizer and open a cursor on a document.
**
** pTokenizer must be a byte-array object whose first sizeof(void*) bytes
** are a pointer to an sqlite3_tokenizer_module. pDocument supplies the text
** to tokenize.
**
** On success, TCL_OK is returned and *pOut holds the module, tokenizer and
** cursor; the caller must eventually pass it to tokenizerClose(). On
** failure, *pOut is left zeroed, an error message is appended to the
** interpreter result and TCL_ERROR is returned.
*/
static int tokenizerOpen(
  Tcl_Interp *interp,
  Tcl_Obj *pTokenizer,
  Tcl_Obj *pDocument,
  STokenizer *pOut                /* OUT: Structure containing tokenizer */
){
  sqlite3_tokenizer_module *pMod; /* Tokenizer module */
  unsigned char *aBlob;           /* Bytes of the pTokenizer object */
  int nBlob = 0;                  /* Number of bytes in aBlob */
  int rc;                         /* Return code */
  const char *zDoc;               /* Pointer to pDocument string buffer */
  int nDoc;                       /* Number of bytes in buffer zDoc */
  const char *zFail;              /* Error message (if an error occurs) */

  memset(pOut, 0, sizeof(STokenizer));

  /* Refuse a blob too short to contain a pointer, and a NULL module
  ** pointer, rather than reading out of bounds or dereferencing garbage. */
  aBlob = Tcl_GetByteArrayFromObj(pTokenizer, &nBlob);
  if( aBlob==0 || nBlob<(int)sizeof(pMod) ){
    zFail = "Invalid tokenizer module pointer";
    goto failed;
  }
  memcpy(&pMod, aBlob, sizeof(pMod));
  if( pMod==0 ){
    zFail = "Invalid tokenizer module pointer";
    goto failed;
  }

  rc = pMod->xCreate(0, 0, &pOut->pTokenizer);
  if( rc!=SQLITE_OK ){ zFail = "Error in xCreate()"; goto failed; }
  pOut->pMod = pMod;

  zDoc = Tcl_GetStringFromObj(pDocument, &nDoc);
  rc = pMod->xOpen(pOut->pTokenizer, zDoc, nDoc, &pOut->pCursor);
  if( rc!=SQLITE_OK ){ zFail = "Error in xOpen()"; goto failed; }

  pOut->pCursor->pTokenizer = pOut->pTokenizer;
  return TCL_OK;

 failed:
  /* tokenizerClose() only touches handles that were actually set. */
  tokenizerClose(pOut);
  /* The variadic argument list must end with a pointer-typed NULL. */
  Tcl_AppendResult(interp, zFail, (char *)NULL);
  return TCL_ERROR;
}

/*
** Advance the cursor held by p to the next token by calling the module's
** xNext() method, and return whatever xNext() returns. The offsets and the
** token number are passed back through the OUT parameters; the two
** remaining xNext() outputs are received into locals and discarded.
*/
static int tokenizerNext(
  STokenizer *p,                  /* Tokenizer wrapper object */
  int *piStart,                   /* OUT: Byte offset of start of token */
  int *piEnd,                     /* OUT: Byte offset of end of token */
  int *piCurrent                  /* OUT: Token number */
){
  const char *zUnused;            /* Discarded xNext() output */
  int nUnused;                    /* Discarded xNext() output */
  sqlite3_tokenizer_module *pModule = p->pMod;
  int rc;

  rc = pModule->xNext(
      p->pCursor, &zUnused, &nUnused, piStart, piEnd, piCurrent
  );
  return rc;
}

/*
** Tcl command: tokencount TOKENIZER DOCUMENT
**
** Open TOKENIZER on DOCUMENT (see tokenizerOpen() for the expected form of
** TOKENIZER) and set the interpreter result to the number of tokens that
** the tokenizer produces. An error is returned if the tokenizer cannot be
** opened or if iteration ends with anything other than SQLITE_DONE.
*/
static int tokencountcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *const objv[]
){
  STokenizer sToken;              /* Open tokenizer and cursor */
  int rc;                         /* SQLite, then Tcl, return code */
  int nToken = 0;                 /* Number of tokens seen so far */
  int iStart, iEnd, iPos;         /* tokenizerNext() outputs; not used */

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "TOKENIZER DOCUMENT");
    return TCL_ERROR;
  }

  if( tokenizerOpen(interp, objv[1], objv[2], &sToken)!=TCL_OK ){
    return TCL_ERROR;
  }

  /* Count one token for every SQLITE_OK returned. */
  while( SQLITE_OK==(rc = tokenizerNext(&sToken, &iStart, &iEnd, &iPos)) ){
    nToken++;
  }

  if( rc!=SQLITE_DONE ){
    /* The variadic argument list must end with a pointer-typed NULL. */
    Tcl_AppendResult(interp, "Error in xNext() 3", (char *)NULL);
    rc = TCL_ERROR;
  }else{
    Tcl_SetObjResult(interp, Tcl_NewIntObj(nToken));
    rc = TCL_OK;
  }

  tokenizerClose(&sToken);
  return rc;
}


/*
** Evaluate the callback command stored in aCall[0..nElem-1] with pArg1 and
** pArg2 appended as its last two words. A reference to each argument is
** held for the duration of the call and dropped afterwards, so arguments
** passed in with a reference count of zero are freed before returning.
** Returns the result code of Tcl_EvalObjv().
*/
static int parsehtmlInvoke(
  Tcl_Interp *interp,
  Tcl_Obj **aCall,                /* Array with room for nElem+2 words */
  int nElem,                      /* Number of fixed leading words */
  Tcl_Obj *pArg1,                 /* First appended argument */
  Tcl_Obj *pArg2                  /* Second appended argument */
){
  int rc;

  aCall[nElem] = pArg1;
  aCall[nElem+1] = pArg2;
  Tcl_IncrRefCount(pArg1);
  Tcl_IncrRefCount(pArg2);
  rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0);
  Tcl_DecrRefCount(pArg1);
  Tcl_DecrRefCount(pArg2);
  aCall[nElem] = 0;
  aCall[nElem+1] = 0;
  return rc;
}

/*
** Tcl command: parsehtml HTML SCRIPT
**
** Scan the document HTML from start to end. SCRIPT is treated as a list of
** command words, and it is invoked with two extra words appended:
**
**   SCRIPT "" TEXT       for each run of text up to the next '<' or the end
**                        of the document (TEXT may be empty)
**   SCRIPT TAG ATTRS     for each markup tag
**
** TAG is the tag name as it appears in the document, so a closing tag keeps
** its leading "/". ATTRS is a list of attribute names and values; an
** attribute written without "=VALUE" gets the value 1. Double-quoted values
** have their quotes removed.
**
** <style> and <script> elements (matched case-insensitively) and comments
** are skipped together with their content; SCRIPT is not invoked for them.
**
** If SCRIPT returns anything other than TCL_OK, parsing stops and that code
** is returned.
*/
static int parsehtmlcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj * const objv[]
){
  char *zHtml;                    /* Text of the document */
  char *z;                        /* Current parse position in zHtml */
  Tcl_Obj **aCall;                /* Words of SCRIPT plus two argument slots */
  int nElem;                      /* Number of words in SCRIPT */
  Tcl_Obj **aElem;                /* Words of SCRIPT (owned by objv[2]) */
  int rc;
  int i;

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "HTML SCRIPT");
    return TCL_ERROR;
  }
  zHtml = Tcl_GetString(objv[1]);

  rc = Tcl_ListObjGetElements(interp, objv[2], &nElem, &aElem);
  if( rc!=TCL_OK ) return rc;

  /* Take a private copy of the script words and hold a reference to each
  ** one, so that they stay valid even if a callback causes objv[2] to lose
  ** its list representation. */
  aCall = (Tcl_Obj **)ckalloc(sizeof(Tcl_Obj *)*(nElem+2));
  for(i=0; i<nElem; i++){
    aCall[i] = aElem[i];
    Tcl_IncrRefCount(aCall[i]);
  }
  aCall[nElem] = 0;
  aCall[nElem+1] = 0;

  z = zHtml;
  while( *z ){
    char *zText = z;
    while( *z && *z!='<' ) z++;

    /* Invoke the callback script for the chunk of text just parsed. */
    rc = parsehtmlInvoke(interp, aCall, nElem,
        Tcl_NewObj(), Tcl_NewStringObj(zText, (int)(z-zText))
    );
    if( rc!=TCL_OK ) break;

    /* Unless z is at the end of the document, it now points to the start
    ** of a markup tag. Either an opening or a closing tag. Parse it up and
    ** invoke the callback script. */
    if( *z ){
      int nTag;
      char *zTag;
      z++;

      while( ISSPACE(*z) ) z++;
      zTag = z;

      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      nTag = (int)(z-zTag);

      if( nTag==5 && 0==strncasecmp("style", zTag, 5) ){
        while( *z && strncasecmp("/style>", z, 7) ) z++;
      }else if( nTag>=3 && 0==memcmp("!--", zTag, 3) ){
        /* Search from just after the "!--". When the comment contains no
        ** white-space (e.g. "<!--x-->"), z has already been advanced to
        ** the closing '>', which is past the start of the terminator. */
        z = zTag + 3;
        while( *z && strncmp("-->", z, 3) ) z++;
      }else if( nTag==6 && 0==strncasecmp("script", zTag, 6) ){
        while( *z && strncasecmp("/script>", z, 8) ) z++;
      }else{
        Tcl_Obj *pParam = Tcl_NewObj();

        while( *z && *z!='>' ){
          char *zAttr;

          /* Gobble up white-space */
          while( ISSPACE(*z) ) z++;
          zAttr = z;

          /* Advance to the end of the attribute name */
          while( *z && *z!='>' && !ISSPACE(*z) && *z!='=' ) z++;
          if( z==zAttr ) zAttr = 0;

          if( zAttr ){
            Tcl_Obj *pAttr = Tcl_NewStringObj(zAttr, (int)(z-zAttr));
            Tcl_ListObjAppendElement(interp, pParam, pAttr);
          }
          while( ISSPACE(*z) ) z++;

          if( *z=='=' ){
            int nVal;
            char *zVal;
            z++;
            while( ISSPACE(*z) ) z++;
            zVal = z;

            if( *zVal=='"' ){
              zVal++;
              z++;
              while( *z && *z!='"' ) z++;
              nVal = (int)(z-zVal);
              /* Step over the closing quote, but never over the NUL that
              ** ends the document if the quote is unterminated. */
              if( *z ) z++;
            }else{
              while( *z && !ISSPACE(*z) && *z!='>' ) z++;
              nVal = (int)(z-zVal);
            }
            Tcl_ListObjAppendElement(interp,pParam,Tcl_NewStringObj(zVal,nVal));
          }else if( zAttr ){
            Tcl_ListObjAppendElement(interp, pParam, Tcl_NewIntObj(1));
          }
        }

        /* pParam is released by parsehtmlInvoke() once the call returns. */
        rc = parsehtmlInvoke(interp, aCall, nElem,
            Tcl_NewStringObj(zTag, nTag), pParam
        );
      }
      if( rc!=TCL_OK ) break;

      /* Move past the '>' that closes the tag or the skipped element. */
      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      if( *z ) z++;
    }

  }

  /* Common exit for success and callback failure: drop the references
  ** taken on the script words and free the argument array. */
  for(i=0; i<nElem; i++){
    Tcl_DecrRefCount(aCall[i]);
  }
  ckfree((char *)aCall);
  return rc;
}

/*
** Extension entry point. When built with USE_TCL_STUBS, the stubs table is
** initialised first (requesting Tcl "8.4") and TCL_ERROR is returned if
** that fails. Then the [parsehtml] and [tokencount] commands are registered
** with interp, in that order, and TCL_OK is returned.
*/
int Parsehtml_Init(Tcl_Interp *interp){
  static const struct {
    const char *zName;            /* Name of the Tcl command */
    Tcl_ObjCmdProc *xProc;        /* C function implementing it */
  } aCmd[] = {
    { "parsehtml",  parsehtmlcmd },
    { "tokencount", tokencountcmd },
  };
  int i;

#ifdef USE_TCL_STUBS
  if( Tcl_InitStubs(interp, "8.4", 0)==0 ){
    return TCL_ERROR;
  }
#endif

  for(i=0; i<(int)(sizeof(aCmd)/sizeof(aCmd[0])); i++){
    Tcl_CreateObjCommand(interp, aCmd[i].zName, aCmd[i].xProc, 0, 0);
  }

  return TCL_OK;
}