/*
** This file contains the [parsehtml] command, a helper command used to extract
** text and markup tags from the HTML documents in the documentation.
*/
#include
#include
#include
#include
#include
#include
/*
** True if c is a 7-bit value for which isspace() returns true. The test
** against 0x80 is evaluated first, so values with bit 7 set (for example a
** negative char holding part of a multi-byte sequence) never reach isspace().
** The argument is evaluated twice, so it must not have side effects.
*/
#define ISSPACE(c) (((c)&0x80)==0 && isspace(c))
#include "sqlite3.h"
/*
** Short names for unsigned integer types. u64 is defined in terms of the
** sqlite3_uint64 type. NOTE(review): u32 and u8 are presumably intended
** to be 32 and 8 bits wide - confirm for the target platform. None of these
** names is referenced in the visible part of this file.
*/
typedef unsigned int u32;
typedef unsigned char u8;
typedef sqlite3_uint64 u64;
/*
** Invoke the callback command for a single parse event.
**
** The first nElem entries of aCall[] hold the words of the callback command.
** The array must have room for four more entries, which this function
** overwrites with the arguments:
**
**     aCall[nElem+0]   The tag name (nTag bytes starting at zTag)
**     aCall[nElem+1]   pParam, or a new empty object if pParam is NULL
**     aCall[nElem+2]   iOffset
**     aCall[nElem+3]   iEndOffset
**
** A reference is taken on each of the four arguments before the command is
** evaluated and released again afterwards. The value returned is the result
** code of Tcl_EvalObjv().
*/
static int doTagCallback(
  Tcl_Interp *interp,
  Tcl_Obj **aCall,
  int nElem,
  const char *zTag, int nTag,
  int iOffset, int iEndOffset,
  Tcl_Obj *pParam
){
  Tcl_Obj **aArg = &aCall[nElem];      /* The four appended argument slots */
  int res;
  int i;

  aArg[0] = Tcl_NewStringObj(zTag, nTag);
  aArg[1] = (pParam ? pParam : Tcl_NewObj());
  aArg[2] = Tcl_NewIntObj(iOffset);
  aArg[3] = Tcl_NewIntObj(iEndOffset);
  for(i=0; i<4; i++){
    Tcl_IncrRefCount(aArg[i]);
  }

  res = Tcl_EvalObjv(interp, nElem+4, aCall, 0);

  for(i=0; i<4; i++){
    Tcl_DecrRefCount(aArg[i]);
  }
  return res;
}
/*
** Invoke the callback command for a run of document text.
**
** The text is the nText bytes starting at zText. It is reported through
** doTagCallback() using an empty tag name, with the text itself passed as
** the parameter object. If nText is zero or negative, no callback is made
** and TCL_OK is returned. Otherwise the result code of doTagCallback() is
** returned.
*/
static int doTextCallback(
  Tcl_Interp *interp,
  Tcl_Obj **aCall,
  int nElem,
  const char *zText, int nText,
  int iOffset, int iEndOffset
){
  Tcl_Obj *pContent;

  if( nText<=0 ){
    return TCL_OK;
  }
  pContent = Tcl_NewStringObj(zText, nText);
  return doTagCallback(
      interp, aCall, nElem, "", 0, iOffset, iEndOffset, pContent
  );
}
/*
** Tcl command: parsehtml HTML SCRIPT
**
** Scan the string HTML from start to end. SCRIPT is interpreted as a Tcl
** list containing the leading words of a callback command. For each run of
** text and each markup tag encountered, the callback is invoked with four
** arguments appended:
**
**     TAG PARAM START-OFFSET END-OFFSET
**
** For a run of text, TAG is an empty string and PARAM is the text. For a
** markup tag, TAG is the tag token exactly as it appears after the '<' (so
** a closing tag includes its leading '/') and PARAM is a list of alternating
** attribute names and values. An attribute that has no "=VALUE" part is
** assigned the value 1. The offsets are offsets into the string returned
** by Tcl_GetString() for HTML; END-OFFSET for a tag is one past its '>'.
**
** Special cases:
**
**   *  No callbacks are made for a "style" tag (any case), for a tag whose
**      token begins with "script", or for a comment ("!--"). The document
**      is skipped up to the matching "/style>", "/script>" or "-->".
**
**   *  Following the callback for a "tcl" tag, everything up to the next
**      "</tcl>" is reported as a single run of text, followed by a callback
**      for a "/tcl" tag. Both of these callbacks are passed 0 for each
**      offset.
**
** Parsing stops as soon as a callback returns anything other than TCL_OK,
** and that result code is returned. If the wrong number of arguments is
** supplied, or SCRIPT is not a valid list, a Tcl error is returned.
*/
static int parsehtmlcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj * const objv[]
){
  char *zHtml;                    /* Text of the document */
  char *z;                        /* Current parse position within zHtml */
  Tcl_Obj **aCall;                /* Words of SCRIPT plus 4 argument slots */
  int nElem;                      /* Number of words in SCRIPT */
  Tcl_Obj **aElem;                /* The words of SCRIPT */
  int rc;

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "HTML SCRIPT");
    return TCL_ERROR;
  }
  zHtml = Tcl_GetString(objv[1]);
  rc = Tcl_ListObjGetElements(interp, objv[2], &nElem, &aElem);
  if( rc!=TCL_OK ) return rc;

  /* Build the array used to invoke the callback. The final four slots are
  ** populated by doTagCallback() each time it is called. */
  aCall = (Tcl_Obj **)ckalloc(sizeof(Tcl_Obj *)*(nElem+4));
  if( nElem>0 ){
    memcpy(aCall, aElem, sizeof(Tcl_Obj *)*nElem);
  }
  memset(&aCall[nElem], 0, 4*sizeof(Tcl_Obj*));

  z = zHtml;
  while( *z ){
    char *zText = z;
    while( *z && *z!='<' ) z++;

    /* Invoke the callback script for the chunk of text just parsed. */
    rc = doTextCallback(interp,aCall,nElem,zText,z-zText,zText-zHtml,z-zHtml);
    if( rc!=TCL_OK ) goto parsehtml_out;

    /* Unless z is at the end of the document, it now points to the start of
    ** a markup tag. Either an opening or a closing tag. Parse it up and
    ** invoke the callback script. */
    if( *z ){
      int nTag;
      char *zTag;
      int iOffset;                /* Offset of open tag (the '<' character) */

      assert( *z=='<' );
      iOffset = z - zHtml;
      z++;
      while( ISSPACE(*z) ) z++;
      zTag = z;
      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      nTag = z-zTag;

      if( nTag==5 && 0==strncasecmp("style", zTag, 5) ){
        while( *z && strncasecmp("/style>", z, 7 ) ) z++;
      }else if( nTag>=3 && 0==memcmp("!--", zTag, 3) ){
        /* Search for the end of the comment starting immediately after the
        ** "!--". Otherwise the terminator of a comment written without
        ** whitespace, such as "<!--x-->", has already been passed over as
        ** part of the tag token and would never be found. */
        z = &zTag[3];
        while( *z && strncasecmp("-->", z, 3 ) ) z++;
      }else if( nTag>=6 && 0==memcmp("script", zTag, 6) ){
        /* The nTag>=6 test keeps memcmp() from reading beyond the token. */
        while( *z && strncasecmp("/script>", z, 8 ) ) z++;
      }else{
        Tcl_Obj *pParam = Tcl_NewObj();
        while( *z && *z!='>' ){
          char *zAttr;

          /* Gobble up white-space */
          while( ISSPACE(*z) ) z++;
          zAttr = z;

          /* Advance to the end of the attribute name */
          while( *z && *z!='>' && !ISSPACE(*z) && *z!='=' ) z++;
          if( z==zAttr ) zAttr = 0;
          if( zAttr ){
            Tcl_Obj *pAttr = Tcl_NewStringObj(zAttr, z-zAttr);
            Tcl_ListObjAppendElement(interp, pParam, pAttr);
          }
          while( ISSPACE(*z) ) z++;

          if( *z=='=' ){
            int nVal;
            char *zVal;
            z++;
            while( ISSPACE(*z) ) z++;
            zVal = z;
            if( *zVal=='"' ){
              zVal++;
              z++;
              while( *z && *z!='"' ) z++;
              nVal = z-zVal;
              /* Step over the closing quote. If the document ended before
              ** the quote was closed, z must stay on the terminator. */
              if( *z ) z++;
            }else{
              while( *z && !ISSPACE(*z) && *z!='>' ) z++;
              nVal = z-zVal;
            }
            Tcl_ListObjAppendElement(interp,pParam,Tcl_NewStringObj(zVal,nVal));
          }else if( zAttr ){
            Tcl_ListObjAppendElement(interp, pParam, Tcl_NewIntObj(1));
          }
        }

        rc = doTagCallback(interp,
            aCall, nElem, zTag, nTag, iOffset, 1+z-zHtml, pParam
        );
        if( rc!=TCL_OK ) goto parsehtml_out;

        if( nTag==3 && memcmp(zTag, "tcl", 3)==0 ){
          /* The text begins after the '>' that closes the <tcl> tag. If the
          ** document ended inside the tag there is no text at all. */
          const char *zText = (*z ? &z[1] : z);
          while( *z && strncasecmp("</tcl>", z, 6) ) z++;
          rc = doTextCallback(interp, aCall, nElem, zText, z-zText, 0, 0);
          if( rc!=TCL_OK ) goto parsehtml_out;
          rc = doTagCallback(interp, aCall, nElem, "/tcl", 4, 0, 0, 0);
          if( rc!=TCL_OK ) goto parsehtml_out;
          if( *z ) z++;
        }
      }

      /* Skip the remainder of the tag, including the closing '>' */
      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      if( *z ) z++;
    }
  }

  /* All exits made after aCall[] is allocated pass through here, so that
  ** the array is released. rc is TCL_OK unless a callback failed. */
parsehtml_out:
  ckfree((char *)aCall);
  return rc;
}
/*
** Register the [parsehtml] command with interpreter interp.
**
** If USE_TCL_STUBS is defined, Tcl_InitStubs() is called first, requesting
** version "8.4". TCL_ERROR is returned if that call fails. Otherwise, the
** command is created and TCL_OK returned.
*/
int Parsehtml_Init(Tcl_Interp *interp){
#ifdef USE_TCL_STUBS
  if( !Tcl_InitStubs(interp, "8.4", 0) ){
    return TCL_ERROR;
  }
#endif

  Tcl_CreateObjCommand(interp, "parsehtml", parsehtmlcmd, 0, 0);
  return TCL_OK;
}