/*
** This file contains some functions implemented in C used by the search.tcl
** script. There are two Tcl commands:
**
**   parsehtml
**   tokencount
**
** Tcl command [parsehtml] is a helper command used to extract text and
** markup tags from the HTML documents in the SQLite documentation. The
** [tokencount] command uses an FTS3 tokenizer to count the number of
** tokens in a document. Both of these are used while building the database
** only.
**
** There are also two SQL user functions registered:
**
**   rank()
**   erank()
**
** rank() interprets the return value of the FTS3 matchinfo() function and
** returns a score for the match (a real number). The higher the score, the
** more relevant the document is considered. This is used to order query
** results when the user searches the database. The rank() function takes
** (nCol+1) arguments, where nCol is the number of columns in the FTS3
** table. The first argument is the return value of matchinfo(). The
** second argument is the number of tokens in column 0 of the current FTS3
** table row. The third argument is the number of tokens in column 1, and
** so on.
**
** Function erank() is called in exactly the same way as rank(). Instead
** of returning a score, it returns an HTML formatted table containing
** data that may be used to understand how the score for the current row
** was calculated.
*/ #include #include #include #include #include #define ISSPACE(c) (((c)&0x80)==0 && isspace(c)) #include "fts3_tokenizer.h" #include "sqlite3.h" typedef unsigned int u32; typedef unsigned char u8; typedef sqlite3_uint64 u64; typedef struct STokenizer STokenizer; struct STokenizer { sqlite3_tokenizer_module *pMod; sqlite3_tokenizer *pTokenizer; sqlite3_tokenizer_cursor *pCursor; }; static void tokenizerClose(STokenizer *p){ if( p->pCursor ){ p->pMod->xClose(p->pCursor); } if( p->pTokenizer ){ p->pMod->xDestroy(p->pTokenizer); } memset(p, 0, sizeof(STokenizer)); } static int tokenizerOpen( Tcl_Interp *interp, Tcl_Obj *pTokenizer, Tcl_Obj *pDocument, STokenizer *pOut /* OUT: Structure containing tokenizer */ ){ sqlite3_tokenizer_module *pMod; /* Tokenizer module */ int rc; /* Return code */ const char *zDoc; /* Pointer to pDocument string buffer */ int nDoc; /* Number of bytes in buffer zDoc */ const char *zFail; /* Error message (if an error occurs) */ memset(pOut, 0, sizeof(STokenizer)); memcpy(&pMod, Tcl_GetByteArrayFromObj(pTokenizer, 0), sizeof(pMod)); rc = pMod->xCreate(0, 0, &pOut->pTokenizer); if( rc!=SQLITE_OK ){ zFail = "Error in xCreate()"; goto failed; } pOut->pMod = pMod; zDoc = Tcl_GetStringFromObj(pDocument, &nDoc); rc = pMod->xOpen(pOut->pTokenizer, zDoc, nDoc, &pOut->pCursor); if( rc!=SQLITE_OK ){ zFail = "Error in xOpen()"; goto failed; } pOut->pCursor->pTokenizer = pOut->pTokenizer; return TCL_OK; failed: tokenizerClose(pOut); Tcl_AppendResult(interp, zFail, 0); return TCL_ERROR; } static int tokenizerNext( STokenizer *p, /* Tokenizer wrapper object */ int *piStart, /* OUT: Byte offset of start of token */ int *piEnd, /* OUT: Byte offset of end of token */ int *piCurrent /* OUT: Token number */ ){ const char *z; int n; return p->pMod->xNext(p->pCursor, &z, &n, piStart, piEnd, piCurrent); } /* ** Tcl command: tokencount TOKENIZER DOCUMENT */ static int tokencountcmd( ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *const objv[] ){ 
STokenizer sToken; int rc; int nToken = 0; int i1, i2, i3; if( objc!=3 ){ Tcl_WrongNumArgs(interp, 1, objv, "TOKENIZER DOCUMENT"); return TCL_ERROR; } if( tokenizerOpen(interp, objv[1], objv[2], &sToken) ) return TCL_ERROR; while( SQLITE_OK==(rc = tokenizerNext(&sToken, &i1, &i2, &i3)) ) nToken++; if( rc!=SQLITE_DONE ){ Tcl_AppendResult(interp, "Error in xNext() 3", 0); rc = TCL_ERROR; }else{ Tcl_SetObjResult(interp, Tcl_NewIntObj(nToken)); rc = TCL_OK; } tokenizerClose(&sToken); return rc; } /* ** Tcl command: parsehtml HTML SCRIPT */ static int parsehtmlcmd( ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj * const objv[] ){ char *zHtml; char *z; Tcl_Obj **aCall; int nElem; Tcl_Obj **aElem; int rc; if( objc!=3 ){ Tcl_WrongNumArgs(interp, 1, objv, "HTML SCRIPT"); return TCL_ERROR; } zHtml = Tcl_GetString(objv[1]); rc = Tcl_ListObjGetElements(interp, objv[2], &nElem, &aElem); if( rc!=TCL_OK ) return rc; aCall = (Tcl_Obj **)ckalloc(sizeof(Tcl_Obj *)*(nElem+2)); memcpy(aCall, aElem, sizeof(Tcl_Obj *)*nElem); aCall[nElem] = 0; aCall[nElem+1] = 0; z = zHtml; while( *z ){ char *zText = z; while( *z && *z!='<' ) z++; /* Invoke the callback script for the chunk of text just parsed. */ Tcl_IncrRefCount( aCall[nElem] = Tcl_NewObj() ); Tcl_IncrRefCount( aCall[nElem+1] = Tcl_NewStringObj(zText, z-zText) ); rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0); Tcl_DecrRefCount( aCall[nElem] ); Tcl_DecrRefCount( aCall[nElem+1] ); if( rc!=TCL_OK ) return rc; /* Unless is at the end of the document, z now points to the start of a ** markup tag. Either an opening or a closing tag. Parse it up and ** invoke the callback script. 
*/ if( *z ){ int nTag; char *zTag; z++; while( ISSPACE(*z) ) z++; zTag = z; while( *z && !ISSPACE(*z) && *z!='>' ) z++; nTag = z-zTag; if( nTag==5 && 0==sqlite3_strnicmp("style", zTag, 5) ){ while( *z && sqlite3_strnicmp("/style>", z, 6 ) ) z++; } else if( nTag>=3 && 0==memcmp("!--", zTag, 3) ){ while( *z && sqlite3_strnicmp("-->", z, 3 ) ) z++; } else { Tcl_Obj *pParam = Tcl_NewObj(); while( *z && *z!='>' ){ char *zAttr; /* Gobble up white-space */ while( ISSPACE(*z) ) z++; zAttr = z; /* Advance to the end of the attribute name */ while( *z && *z!='>' && !ISSPACE(*z) && *z!='=' ) z++; if( z==zAttr ) zAttr = 0; if( zAttr ){ Tcl_Obj *pAttr = Tcl_NewStringObj(zAttr, z-zAttr); Tcl_ListObjAppendElement(interp, pParam, pAttr); } while( ISSPACE(*z) ) z++; if( *z=='=' ){ int nVal; char *zVal; z++; while( ISSPACE(*z) ) z++; zVal = z; if( *zVal=='"' ){ zVal++; z++; while( *z && *z!='"' ) z++; nVal = z-zVal; z++; }else{ while( *z && !ISSPACE(*z) && *z!='>' ) z++; nVal = z-zVal; } Tcl_ListObjAppendElement(interp,pParam,Tcl_NewStringObj(zVal,nVal)); }else if( zAttr ){ Tcl_ListObjAppendElement(interp, pParam, Tcl_NewIntObj(1)); } } Tcl_IncrRefCount( aCall[nElem] = Tcl_NewStringObj(zTag, nTag) ); Tcl_IncrRefCount( aCall[nElem+1] = pParam ); rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0); Tcl_DecrRefCount( aCall[nElem] ); Tcl_DecrRefCount( aCall[nElem+1] ); if( rc!=TCL_OK ) return rc; } while( *z && !ISSPACE(*z) && *z!='>' ) z++; if( *z ) z++; } } return TCL_OK; } /* ** Implementation of search result ranking function. */ static void rankfunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){ u32 *aMatchinfo; double score = 0.0; int iCol; int iPhrase; int nCol; int nPhrase; int isExplain = sqlite3_user_data(pCtx); char *zExplain = 0; if( nVal==0 ) goto wna; aMatchinfo = (u32 *)sqlite3_value_blob(apVal[0]); nPhrase = aMatchinfo[0]; nCol = aMatchinfo[1]; if( nVal!=nCol+1 ) goto wna; if( isExplain ) zExplain = sqlite3_mprintf(""); for(iCol=0; iCol
%d.( ", zExplain, iCol); } for(iPhrase=0; iPhrase = %.4f", zExplain, nToken, colscore); } } if( isExplain ){ sqlite3_result_text(pCtx, sqlite3_mprintf( "%z
= %.4f
", zExplain, score ), -1, sqlite3_free); }else{ sqlite3_result_double(pCtx, score); } return; wna: sqlite3_result_error(pCtx,"wrong number of arguments to function rank()",-1); } int Sqlite3_Init(Tcl_Interp *interp); static int initDb(sqlite3 *db, char **pzErr, void *p){ sqlite3_create_function(db, "rank",-1, SQLITE_UTF8, 0, rankfunc,0,0); sqlite3_create_function(db, "erank", -1, SQLITE_UTF8, (void*)1, rankfunc,0,0); } int Parsehtml_Init(Tcl_Interp *interp){ #ifdef USE_TCL_STUBS if (Tcl_InitStubs(interp, "8.4", 0) == 0) { return TCL_ERROR; } #endif Tcl_CreateObjCommand(interp, "parsehtml", parsehtmlcmd, 0, 0); Tcl_CreateObjCommand(interp, "tokencount", tokencountcmd, 0, 0); sqlite3_auto_extension(initDb); return TCL_OK; } static int AppInit(Tcl_Interp *interp) { int rc; rc = Sqlite3_Init(interp); if( rc!=TCL_OK ) return rc; rc = Parsehtml_Init(interp); return rc; } int main(int argc, char *argv[]) { Tcl_Main(argc, argv, AppInit); return 0; }