/* ** This file contains some functions implemented in C used by the ** buildsearchdb.tcl script. There are two Tcl commands: ** ** parsehtml ** tokencount ** ** Tcl command [parsehtml] is a helper command used to extract text and ** markup tags from the HTML documents in the SQLite documentation. The ** [tokencount] command uses an FTS3 tokenizer to count the number of ** tokens in a document. Both of these are used while building the database ** only. */ #include #include #include #include #include #include #define ISSPACE(c) (((c)&0x80)==0 && isspace(c)) #include "fts3_tokenizer.h" #include "sqlite3.h" typedef unsigned int u32; typedef unsigned char u8; typedef sqlite3_uint64 u64; typedef struct STokenizer STokenizer; struct STokenizer { sqlite3_tokenizer_module *pMod; sqlite3_tokenizer *pTokenizer; sqlite3_tokenizer_cursor *pCursor; }; static void tokenizerClose(STokenizer *p){ if( p->pCursor ){ p->pMod->xClose(p->pCursor); } if( p->pTokenizer ){ p->pMod->xDestroy(p->pTokenizer); } memset(p, 0, sizeof(STokenizer)); } static int tokenizerOpen( Tcl_Interp *interp, Tcl_Obj *pTokenizer, Tcl_Obj *pDocument, STokenizer *pOut /* OUT: Structure containing tokenizer */ ){ sqlite3_tokenizer_module *pMod; /* Tokenizer module */ int rc; /* Return code */ const char *zDoc; /* Pointer to pDocument string buffer */ int nDoc; /* Number of bytes in buffer zDoc */ const char *zFail; /* Error message (if an error occurs) */ memset(pOut, 0, sizeof(STokenizer)); memcpy(&pMod, Tcl_GetByteArrayFromObj(pTokenizer, 0), sizeof(pMod)); rc = pMod->xCreate(0, 0, &pOut->pTokenizer); if( rc!=SQLITE_OK ){ zFail = "Error in xCreate()"; goto failed; } pOut->pMod = pMod; zDoc = Tcl_GetStringFromObj(pDocument, &nDoc); rc = pMod->xOpen(pOut->pTokenizer, zDoc, nDoc, &pOut->pCursor); if( rc!=SQLITE_OK ){ zFail = "Error in xOpen()"; goto failed; } pOut->pCursor->pTokenizer = pOut->pTokenizer; return TCL_OK; failed: tokenizerClose(pOut); Tcl_AppendResult(interp, zFail, 0); return TCL_ERROR; } static 
int tokenizerNext( STokenizer *p, /* Tokenizer wrapper object */ int *piStart, /* OUT: Byte offset of start of token */ int *piEnd, /* OUT: Byte offset of end of token */ int *piCurrent /* OUT: Token number */ ){ const char *z; int n; return p->pMod->xNext(p->pCursor, &z, &n, piStart, piEnd, piCurrent); } /* ** Tcl command: tokencount TOKENIZER DOCUMENT */ static int tokencountcmd( ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[] ){ STokenizer sToken; int rc; int nToken = 0; int i1, i2, i3; if( objc!=3 ){ Tcl_WrongNumArgs(interp, 1, objv, "TOKENIZER DOCUMENT"); return TCL_ERROR; } if( tokenizerOpen(interp, objv[1], objv[2], &sToken) ) return TCL_ERROR; while( SQLITE_OK==(rc = tokenizerNext(&sToken, &i1, &i2, &i3)) ) nToken++; if( rc!=SQLITE_DONE ){ Tcl_AppendResult(interp, "Error in xNext() 3", 0); rc = TCL_ERROR; }else{ Tcl_SetObjResult(interp, Tcl_NewIntObj(nToken)); rc = TCL_OK; } tokenizerClose(&sToken); return rc; } /* ** Tcl command: parsehtml HTML SCRIPT */ static int parsehtmlcmd( ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj * const objv[] ){ char *zHtml; char *z; Tcl_Obj **aCall; int nElem; Tcl_Obj **aElem; int rc; if( objc!=3 ){ Tcl_WrongNumArgs(interp, 1, objv, "HTML SCRIPT"); return TCL_ERROR; } zHtml = Tcl_GetString(objv[1]); rc = Tcl_ListObjGetElements(interp, objv[2], &nElem, &aElem); if( rc!=TCL_OK ) return rc; aCall = (Tcl_Obj **)ckalloc(sizeof(Tcl_Obj *)*(nElem+2)); memcpy(aCall, aElem, sizeof(Tcl_Obj *)*nElem); aCall[nElem] = 0; aCall[nElem+1] = 0; z = zHtml; while( *z ){ char *zText = z; while( *z && *z!='<' ) z++; /* Invoke the callback script for the chunk of text just parsed. 
*/ Tcl_IncrRefCount( aCall[nElem] = Tcl_NewObj() ); Tcl_IncrRefCount( aCall[nElem+1] = Tcl_NewStringObj(zText, z-zText) ); rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0); Tcl_DecrRefCount( aCall[nElem] ); Tcl_DecrRefCount( aCall[nElem+1] ); if( rc!=TCL_OK ) return rc; /* Unless is at the end of the document, z now points to the start of a ** markup tag. Either an opening or a closing tag. Parse it up and ** invoke the callback script. */ if( *z ){ int nTag; char *zTag; z++; while( ISSPACE(*z) ) z++; zTag = z; while( *z && !ISSPACE(*z) && *z!='>' ) z++; nTag = z-zTag; if( nTag==5 && 0==strncasecmp("style", zTag, 5) ){ while( *z && strncasecmp("/style>", z, 7 ) ) z++; } else if( nTag>=3 && 0==memcmp("!--", zTag, 3) ){ while( *z && strncasecmp("-->", z, 3 ) ) z++; } else if( nTag>=3 && 0==memcmp("script", zTag, 6) ){ while( *z && strncasecmp("/script>", z, 8 ) ) z++; } else { Tcl_Obj *pParam = Tcl_NewObj(); while( *z && *z!='>' ){ char *zAttr; /* Gobble up white-space */ while( ISSPACE(*z) ) z++; zAttr = z; /* Advance to the end of the attribute name */ while( *z && *z!='>' && !ISSPACE(*z) && *z!='=' ) z++; if( z==zAttr ) zAttr = 0; if( zAttr ){ Tcl_Obj *pAttr = Tcl_NewStringObj(zAttr, z-zAttr); Tcl_ListObjAppendElement(interp, pParam, pAttr); } while( ISSPACE(*z) ) z++; if( *z=='=' ){ int nVal; char *zVal; z++; while( ISSPACE(*z) ) z++; zVal = z; if( *zVal=='"' ){ zVal++; z++; while( *z && *z!='"' ) z++; nVal = z-zVal; z++; }else{ while( *z && !ISSPACE(*z) && *z!='>' ) z++; nVal = z-zVal; } Tcl_ListObjAppendElement(interp,pParam,Tcl_NewStringObj(zVal,nVal)); }else if( zAttr ){ Tcl_ListObjAppendElement(interp, pParam, Tcl_NewIntObj(1)); } } Tcl_IncrRefCount( aCall[nElem] = Tcl_NewStringObj(zTag, nTag) ); Tcl_IncrRefCount( aCall[nElem+1] = pParam ); rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0); Tcl_DecrRefCount( aCall[nElem] ); Tcl_DecrRefCount( aCall[nElem+1] ); if( rc!=TCL_OK ) return rc; } while( *z && !ISSPACE(*z) && *z!='>' ) z++; if( *z ) z++; } } return TCL_OK; } 
/*
** Register the [parsehtml] and [tokencount] commands with interpreter
** interp. When compiled with USE_TCL_STUBS the stubs table is initialized
** first, and TCL_ERROR is returned if that fails. Otherwise returns
** TCL_OK.
*/
int Parsehtml_Init(Tcl_Interp *interp){
  static const struct {
    const char *zName;            /* Name of the Tcl command */
    Tcl_ObjCmdProc *xProc;        /* C function implementing the command */
  } aCmd[] = {
    { "parsehtml",  parsehtmlcmd  },
    { "tokencount", tokencountcmd },
  };
  unsigned int iCmd;

#ifdef USE_TCL_STUBS
  if( !Tcl_InitStubs(interp, "8.4", 0) ) return TCL_ERROR;
#endif

  for(iCmd=0; iCmd<sizeof(aCmd)/sizeof(aCmd[0]); iCmd++){
    Tcl_CreateObjCommand(interp, aCmd[iCmd].zName, aCmd[iCmd].xProc, 0, 0);
  }
  return TCL_OK;
}