/*
** This file contains some functions implemented in C used by the
** buildsearchdb.tcl script. There are two Tcl commands:
**
** parsehtml
** tokencount
**
** Tcl command [parsehtml] is a helper command used to extract text and
** markup tags from the HTML documents in the SQLite documentation. The
** [tokencount] command uses an FTS3 tokenizer to count the number of
** tokens in a document. Both of these are used while building the database
** only.
*/
#include <tcl.h>
#include <string.h>
#include <strings.h>
#include <assert.h>
#include <ctype.h>
#include <math.h>
/*
** True (1) if c is an ASCII white-space character, false (0) otherwise.
** Bytes with the high bit set are rejected before isspace() is consulted,
** so that isspace() is never handed a negative value or a byte from a
** multi-byte sequence. Note that the argument is evaluated more than once.
*/
#define ISSPACE(c) (!((c)&0x80) && isspace(c))
#include "fts3_tokenizer.h"
#include "sqlite3.h"
/* Short aliases for integer types; u64 maps onto SQLite's sqlite3_uint64. */
typedef unsigned int u32;
typedef unsigned char u8;
typedef sqlite3_uint64 u64;
/*
** Wrapper that bundles together everything needed to run an FTS3
** tokenizer over a single document. An all-zero structure represents
** "nothing open".
*/
typedef struct STokenizer {
  sqlite3_tokenizer_module *pMod;      /* Module supplying the xXXX methods */
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer instance from xCreate() */
  sqlite3_tokenizer_cursor *pCursor;   /* Cursor over the document (xOpen()) */
} STokenizer;
/*
** Release whatever resources are held by tokenizer wrapper p: the cursor
** (via xClose) and then the tokenizer instance (via xDestroy), each only
** if present. The structure is zeroed afterwards, so calling this on an
** already-closed or freshly zeroed wrapper does nothing harmful.
*/
static void tokenizerClose(STokenizer *p){
  sqlite3_tokenizer_module *pModule = p->pMod;

  if( p->pCursor!=0 ){
    pModule->xClose(p->pCursor);
  }
  if( p->pTokenizer!=0 ){
    pModule->xDestroy(p->pTokenizer);
  }
  memset(p, 0, sizeof(*p));
}
/*
** Create a tokenizer and open a cursor on a document.
**
** pTokenizer must be a byte-array object whose first sizeof(pointer) bytes
** hold a pointer to an sqlite3_tokenizer_module structure. pDocument is
** the text to tokenize. On success *pOut is populated and TCL_OK is
** returned; the caller must eventually call tokenizerClose() on it.
**
** On failure *pOut is left zeroed, an error message is appended to the
** interpreter result and TCL_ERROR is returned.
*/
static int tokenizerOpen(
  Tcl_Interp *interp,
  Tcl_Obj *pTokenizer,
  Tcl_Obj *pDocument,
  STokenizer *pOut                /* OUT: Structure containing tokenizer */
){
  sqlite3_tokenizer_module *pMod = 0;  /* Tokenizer module */
  unsigned char *aPtr;            /* Bytes of the pTokenizer byte-array */
  int nPtr = 0;                   /* Number of bytes in aPtr */
  int rc;                         /* Return code */
  const char *zDoc;               /* Pointer to pDocument string buffer */
  int nDoc;                       /* Number of bytes in buffer zDoc */
  const char *zFail;              /* Error message (if an error occurs) */

  memset(pOut, 0, sizeof(STokenizer));

  /* Refuse to decode a module pointer from a buffer that is too small to
  ** contain one - copying regardless would read past the end of it. */
  aPtr = Tcl_GetByteArrayFromObj(pTokenizer, &nPtr);
  if( aPtr==0 || nPtr<(int)sizeof(pMod) ){
    zFail = "Invalid tokenizer module pointer";
    goto failed;
  }
  memcpy(&pMod, aPtr, sizeof(pMod));
  if( pMod==0 ){
    zFail = "Invalid tokenizer module pointer";
    goto failed;
  }

  rc = pMod->xCreate(0, 0, &pOut->pTokenizer);
  if( rc!=SQLITE_OK ){ zFail = "Error in xCreate()"; goto failed; }
  pOut->pMod = pMod;

  zDoc = Tcl_GetStringFromObj(pDocument, &nDoc);
  rc = pMod->xOpen(pOut->pTokenizer, zDoc, nDoc, &pOut->pCursor);
  if( rc!=SQLITE_OK ){ zFail = "Error in xOpen()"; goto failed; }

  /* xOpen() does not link the cursor back to its tokenizer; do so here. */
  pOut->pCursor->pTokenizer = pOut->pTokenizer;
  return TCL_OK;

 failed:
  tokenizerClose(pOut);
  /* The argument list must end with a null *pointer*; a bare 0 is an int. */
  Tcl_AppendResult(interp, zFail, (char *)0);
  return TCL_ERROR;
}
/*
** Advance the tokenizer cursor to the next token by calling the module's
** xNext() method, and return whatever xNext() returns (the caller in this
** file treats SQLITE_OK as "got a token" and SQLITE_DONE as end of input).
** The token text and its length are fetched but discarded; only the byte
** offsets and token number are passed back to the caller.
*/
static int tokenizerNext(
  STokenizer *p,                  /* Tokenizer wrapper object */
  int *piStart,                   /* OUT: Byte offset of start of token */
  int *piEnd,                     /* OUT: Byte offset of end of token */
  int *piCurrent                  /* OUT: Token number */
){
  const char *zUnused = 0;        /* Token text (ignored) */
  int nUnused = 0;                /* Size of token text in bytes (ignored) */
  sqlite3_tokenizer_module *pModule = p->pMod;
  int rc;

  rc = pModule->xNext(
      p->pCursor, &zUnused, &nUnused, piStart, piEnd, piCurrent
  );
  return rc;
}
/*
** Tcl command: tokencount TOKENIZER DOCUMENT
**
** TOKENIZER is a byte-array containing a pointer to a tokenizer module
** (see tokenizerOpen()). The command result is the number of tokens the
** tokenizer finds in DOCUMENT. If the tokenizer cannot be opened or
** xNext() reports anything other than SQLITE_OK or SQLITE_DONE, a Tcl
** error is raised.
*/
static int tokencountcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *const objv[]
){
  STokenizer sToken;              /* Tokenizer open on DOCUMENT */
  int rc;                         /* SQLite, then Tcl, return code */
  int nToken = 0;                 /* Number of tokens seen so far */
  int i1, i2, i3;                 /* Token offsets/number (unused) */

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "TOKENIZER DOCUMENT");
    return TCL_ERROR;
  }
  if( tokenizerOpen(interp, objv[1], objv[2], &sToken) ) return TCL_ERROR;

  /* Count one token for each successful call to xNext(). */
  while( SQLITE_OK==(rc = tokenizerNext(&sToken, &i1, &i2, &i3)) ) nToken++;

  if( rc!=SQLITE_DONE ){
    /* The argument list must end with a null *pointer*; a bare 0 is an
    ** int, which is not guaranteed to be passed as a pointer-sized null. */
    Tcl_AppendResult(interp, "Error in xNext() 3", (char *)0);
    rc = TCL_ERROR;
  }else{
    Tcl_SetObjResult(interp, Tcl_NewIntObj(nToken));
    rc = TCL_OK;
  }

  tokenizerClose(&sToken);
  return rc;
}
/*
** Tcl command: parsehtml HTML SCRIPT
**
** Split document HTML into alternating chunks of text and markup tags.
** SCRIPT is interpreted as a list of words forming a command prefix; it is
** invoked once per chunk with two extra arguments appended:
**
**   * For a chunk of text (possibly empty): an empty string, followed by
**     the text.
**
**   * For a tag: the tag name (including any leading "/"), followed by a
**     list of alternating attribute names and values. An attribute with
**     no "=value" part is given the value 1.
**
** <style> and <script> elements and <!-- --> comments are skipped in
** their entirety without invoking SCRIPT.
**
** If an invocation of SCRIPT returns anything other than TCL_OK, parsing
** stops and that code is returned. Otherwise TCL_OK is returned once the
** whole document has been processed.
*/
static int parsehtmlcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj * const objv[]
){
  char *zHtml;                    /* Nul-terminated document text */
  char *z;                        /* Current parse position within zHtml */
  Tcl_Obj **aCall;                /* Words of SCRIPT plus two argument slots */
  int nElem;                      /* Number of words in SCRIPT */
  Tcl_Obj **aElem;                /* The words of SCRIPT */
  int rc;                         /* Tcl return code */

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "HTML SCRIPT");
    return TCL_ERROR;
  }
  zHtml = Tcl_GetString(objv[1]);
  rc = Tcl_ListObjGetElements(interp, objv[2], &nElem, &aElem);
  if( rc!=TCL_OK ) return rc;

  /* Build the argument vector used for each callback. The final two
  ** slots are filled in anew before every invocation. */
  aCall = (Tcl_Obj **)ckalloc(sizeof(Tcl_Obj *)*(nElem+2));
  if( nElem>0 ){
    memcpy(aCall, aElem, sizeof(Tcl_Obj *)*nElem);
  }
  aCall[nElem] = 0;
  aCall[nElem+1] = 0;

  rc = TCL_OK;
  z = zHtml;
  while( *z ){
    char *zText = z;
    while( *z && *z!='<' ) z++;

    /* Invoke the callback script for the chunk of text just parsed. */
    aCall[nElem] = Tcl_NewObj();
    aCall[nElem+1] = Tcl_NewStringObj(zText, z-zText);
    Tcl_IncrRefCount(aCall[nElem]);
    Tcl_IncrRefCount(aCall[nElem+1]);
    rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0);
    Tcl_DecrRefCount(aCall[nElem]);
    Tcl_DecrRefCount(aCall[nElem+1]);
    if( rc!=TCL_OK ) goto parsehtml_out;

    /* Unless z is at the end of the document, it now points to the start
    ** of a markup tag. Either an opening or a closing tag. Parse it up and
    ** invoke the callback script. */
    if( *z ){
      int nTag;                   /* Size of tag name in bytes */
      char *zTag;                 /* Start of tag name */

      z++;
      while( ISSPACE(*z) ) z++;
      zTag = z;
      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      nTag = z-zTag;

      if( nTag==5 && 0==strncasecmp("style", zTag, 5) ){
        while( *z && strncasecmp("/style>", z, 7) ) z++;
      }else if( nTag>=3 && 0==strncmp("!--", zTag, 3) ){
        /* The scan for the tag name above may already have run past the
        ** end of a comment that contains no white-space ("<!--x-->"), so
        ** rewind before searching for the terminator. Starting one byte
        ** into the opener also recognizes the degenerate empty comments
        ** "<!-->" and "<!--->". */
        z = &zTag[1];
        while( *z && strncmp("-->", z, 3) ) z++;
      }else if( nTag==6 && 0==strncasecmp("script", zTag, 6) ){
        while( *z && strncasecmp("/script>", z, 8) ) z++;
      }else{
        Tcl_Obj *pParam = Tcl_NewObj();   /* List of attribute names/values */

        while( *z && *z!='>' ){
          char *zAttr;            /* Attribute name, or 0 if there is none */

          /* Gobble up white-space */
          while( ISSPACE(*z) ) z++;
          zAttr = z;

          /* Advance to the end of the attribute name */
          while( *z && *z!='>' && !ISSPACE(*z) && *z!='=' ) z++;
          if( z==zAttr ) zAttr = 0;
          if( zAttr ){
            Tcl_Obj *pAttr = Tcl_NewStringObj(zAttr, z-zAttr);
            Tcl_ListObjAppendElement(interp, pParam, pAttr);
          }

          while( ISSPACE(*z) ) z++;
          if( *z=='=' ){
            int nVal;             /* Size of attribute value in bytes */
            char *zVal;           /* Start of attribute value */

            z++;
            while( ISSPACE(*z) ) z++;
            zVal = z;
            if( *zVal=='"' ){
              zVal++;
              z++;
              while( *z && *z!='"' ) z++;
              nVal = z-zVal;
              /* Step over the closing quote. If the value is unterminated
              ** z is already at the end of the document and must not be
              ** moved beyond the nul-terminator. */
              if( *z ) z++;
            }else{
              while( *z && !ISSPACE(*z) && *z!='>' ) z++;
              nVal = z-zVal;
            }
            Tcl_ListObjAppendElement(interp,pParam,Tcl_NewStringObj(zVal,nVal));
          }else if( zAttr ){
            /* Attribute present without a value */
            Tcl_ListObjAppendElement(interp, pParam, Tcl_NewIntObj(1));
          }
        }

        /* Invoke the callback script for the tag just parsed. */
        aCall[nElem] = Tcl_NewStringObj(zTag, nTag);
        aCall[nElem+1] = pParam;
        Tcl_IncrRefCount(aCall[nElem]);
        Tcl_IncrRefCount(aCall[nElem+1]);
        rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0);
        Tcl_DecrRefCount(aCall[nElem]);
        Tcl_DecrRefCount(aCall[nElem+1]);
        if( rc!=TCL_OK ) goto parsehtml_out;
      }

      /* Skip past the '>' that terminates the tag (or skipped block). */
      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      if( *z ) z++;
    }
  }

 parsehtml_out:
  /* All exits made after the allocation pass through here, so that the
  ** argument vector is always released. */
  ckfree((char *)aCall);
  return rc;
}
/*
** Extension entry point. Initializes the Tcl stubs table (when built with
** USE_TCL_STUBS defined) and registers the [parsehtml] and [tokencount]
** commands with interpreter interp. Returns TCL_OK, or TCL_ERROR if stubs
** initialization fails.
*/
int Parsehtml_Init(Tcl_Interp *interp){
  static const struct {
    const char *zName;            /* Name of the Tcl command */
    Tcl_ObjCmdProc *xProc;        /* C function implementing it */
  } aCmd[] = {
    { "parsehtml",  parsehtmlcmd  },
    { "tokencount", tokencountcmd },
  };
  unsigned int i;

#ifdef USE_TCL_STUBS
  if( !Tcl_InitStubs(interp, "8.4", 0) ) return TCL_ERROR;
#endif

  for(i=0; i<sizeof(aCmd)/sizeof(aCmd[0]); i++){
    Tcl_CreateObjCommand(interp, aCmd[i].zName, aCmd[i].xProc, 0, 0);
  }
  return TCL_OK;
}