Documentation Source Text

Artifact Content
Login

Artifact 7c28a91fa1c96798c88c25a5008cb8d851e0c4f3:



/*
** This file contains some functions implemented in C used by the search.tcl
** script. There are two Tcl commands:
**
**   parsehtml
**   tokencount
**
** Tcl command [parsehtml] is a helper command used to extract text and 
** markup tags from the HTML documents in the SQLite documentation. The
** [tokencount] command uses an FTS3 tokenizer to count the number of
** tokens in a document. Both of these are used while building the database
** only.
**
** There are also two SQL user functions registered:
**
**   rank()
**   erank()
**
** rank() interprets the return value of the FTS3 matchinfo() function and
** returns a score for the match (a real number). The higher the score, the
** more relevant the document is considered. This is used to order query
** results when the user searches the database. The rank() function takes
** (nCol+1) arguments, where nCol is the number of columns in the FTS3
** table. The first argument is the return value of matchinfo(). The
** second argument is the number of tokens in column 0 of the current FTS3 
** table row. The third argument is the number of tokens in column 1, and
** so on.
**
** Function erank() is called in exactly the same way as rank(). Instead
** of returning a score, it returns an HTML formatted table containing
** data that may be used to understand how the score for the current row
** was calculated.
*/

#include <tcl.h>
#include <string.h>
#include <assert.h>
#include <ctype.h>
#include <math.h>

/*
** Evaluate to true if c is a white-space character according to isspace().
** Values with bit 0x80 set are rejected before isspace() is consulted, so
** isspace() is never invoked for them. Note that the argument expression
** may be evaluated twice.
*/
#define ISSPACE(c) (((c)&0x80)==0 && isspace(c))

#include "fts3_tokenizer.h"
#include "sqlite3.h"

/*
** Shorthand integer types. u32 is the element type used by rankfunc() to
** read the blob passed as its first argument.
*/
typedef unsigned int u32;
typedef unsigned char u8;
typedef sqlite3_uint64 u64;

/*
** Wrapper object grouping a tokenizer module, a tokenizer instance created
** by that module and a cursor opened on that instance. It is populated by
** tokenizerOpen() and released by tokenizerClose().
*/
typedef struct STokenizer STokenizer;
struct STokenizer {
  sqlite3_tokenizer_module *pMod;       /* Module methods (xCreate, xOpen...) */
  sqlite3_tokenizer *pTokenizer;        /* Instance created by pMod->xCreate() */
  sqlite3_tokenizer_cursor *pCursor;    /* Cursor opened by pMod->xOpen() */
};

/*
** Release whatever resources are held by tokenizer wrapper p: the cursor is
** closed first (if there is one), then the tokenizer instance is destroyed
** (if there is one). Finally the wrapper is zeroed.
*/
static void tokenizerClose(STokenizer *p){
  sqlite3_tokenizer_cursor *pCsr = p->pCursor;
  sqlite3_tokenizer *pTok = p->pTokenizer;

  if( pCsr!=0 ){
    p->pMod->xClose(pCsr);
  }
  if( pTok!=0 ){
    p->pMod->xDestroy(pTok);
  }
  memset(p, 0, sizeof(*p));
}

/*
** Create a tokenizer instance and open a cursor on a document.
**
** Argument pTokenizer must be a byte-array object whose content is the raw
** bytes of a pointer to an sqlite3_tokenizer_module structure. Argument
** pDocument supplies the text to be tokenized.
**
** If successful, TCL_OK is returned and *pOut is populated. The caller is
** then responsible for eventually calling tokenizerClose() on pOut. If an
** error occurs, *pOut is left zeroed, an error message is appended to the
** interpreter result and TCL_ERROR is returned.
*/
static int tokenizerOpen(
  Tcl_Interp *interp,
  Tcl_Obj *pTokenizer,
  Tcl_Obj *pDocument,
  STokenizer *pOut                /* OUT: Structure containing tokenizer */
){
  sqlite3_tokenizer_module *pMod = 0; /* Tokenizer module */
  const unsigned char *aPtr;      /* Byte array holding the module pointer */
  int nPtr = 0;                   /* Number of bytes in aPtr[] */
  int rc;                         /* Return code */
  const char *zDoc;               /* Pointer to pDocument string buffer */
  int nDoc;                       /* Number of bytes in buffer zDoc */
  const char *zFail;              /* Error message (if an error occurs) */

  memset(pOut, 0, sizeof(STokenizer));

  /* Verify that the byte array is large enough to hold a pointer before
  ** copying one out of it. Otherwise memcpy() would read past the end of
  ** the buffer. */
  aPtr = Tcl_GetByteArrayFromObj(pTokenizer, &nPtr);
  if( aPtr==0 || nPtr<(int)sizeof(pMod) ){
    zFail = "Invalid tokenizer module pointer";
    goto failed;
  }
  memcpy(&pMod, aPtr, sizeof(pMod));
  if( pMod==0 ){
    zFail = "Invalid tokenizer module pointer";
    goto failed;
  }

  rc = pMod->xCreate(0, 0, &pOut->pTokenizer);
  if( rc!=SQLITE_OK ){ zFail = "Error in xCreate()"; goto failed; }
  pOut->pMod = pMod;

  zDoc = Tcl_GetStringFromObj(pDocument, &nDoc);
  rc = pMod->xOpen(pOut->pTokenizer, zDoc, nDoc, &pOut->pCursor);
  if( rc!=SQLITE_OK ){ zFail = "Error in xOpen()"; goto failed; }

  /* Link the new cursor back to the tokenizer that created it. */
  pOut->pCursor->pTokenizer = pOut->pTokenizer;
  return TCL_OK;

 failed:
  tokenizerClose(pOut);
  /* The variable argument list must be terminated by a null *pointer*; a
  ** plain integer 0 is not guaranteed to be passed as one. */
  Tcl_AppendResult(interp, zFail, (char *)0);
  return TCL_ERROR;
}

/*
** Advance the cursor of tokenizer wrapper p to the next token by invoking
** the module xNext() method. The token text and its length are discarded;
** only the offsets and the token number are passed back to the caller.
** The value returned is whatever xNext() returned.
*/
static int tokenizerNext(
  STokenizer *p,                  /* Tokenizer wrapper object */
  int *piStart,                   /* OUT: Byte offset of start of token */
  int *piEnd,                     /* OUT: Byte offset of end of token */
  int *piCurrent                  /* OUT: Token number */
){
  const char *zToken;             /* Token text (unused) */
  int nToken;                     /* Size of token text in bytes (unused) */
  int rc;                         /* Value returned by xNext() */

  rc = p->pMod->xNext(
      p->pCursor, &zToken, &nToken, piStart, piEnd, piCurrent
  );
  return rc;
}

/*
** Tcl command: tokencount TOKENIZER DOCUMENT
**
** Tokenize DOCUMENT using the tokenizer identified by TOKENIZER (both
** arguments are handed to tokenizerOpen()) and set the interpreter result
** to the number of tokens found. If the tokenizer cannot be opened, or if
** iterating through the tokens ends with anything other than SQLITE_DONE,
** an error message is left in the interpreter and TCL_ERROR is returned.
*/
static int tokencountcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj *const objv[]
){
  STokenizer sToken;              /* Tokenizer wrapper */
  int rc;                         /* Return code */
  int nToken = 0;                 /* Number of tokens seen so far */
  int iStart, iEnd, iPos;         /* Outputs of tokenizerNext() (unused) */

  (void)clientData;

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "TOKENIZER DOCUMENT");
    return TCL_ERROR;
  }

  if( tokenizerOpen(interp, objv[1], objv[2], &sToken)!=TCL_OK ){
    return TCL_ERROR;
  }

  while( SQLITE_OK==(rc = tokenizerNext(&sToken, &iStart, &iEnd, &iPos)) ){
    nToken++;
  }
  if( rc!=SQLITE_DONE ){
    /* The argument list must be terminated by a null pointer, not int 0. */
    Tcl_AppendResult(interp, "Error in xNext() 3", (char *)0);
    rc = TCL_ERROR;
  }else{
    Tcl_SetObjResult(interp, Tcl_NewIntObj(nToken));
    rc = TCL_OK;
  }

  tokenizerClose(&sToken);
  return rc;
}


/*
** Tcl command: parsehtml HTML SCRIPT
**
** Scan the HTML text and invoke SCRIPT (a list forming a command prefix)
** with two extra arguments appended for each item encountered:
**
**   * For each run of text between tags (possibly empty), the first extra
**     argument is an empty string and the second is the text.
**
**   * For each markup tag, the first extra argument is the tag name exactly
**     as it appears in the document (a closing tag retains its leading '/')
**     and the second is a list of alternating attribute names and values.
**     Attributes with no value are assigned the value 1.
**
** No callback is made for <style> tags or for comments; the scan skips
** forward to the closing "/style" tag or to the "-->" terminator.
**
** If a callback returns anything other than TCL_OK, parsing stops and that
** value is returned. Otherwise TCL_OK is returned.
*/
static int parsehtmlcmd(
  ClientData clientData,
  Tcl_Interp *interp,
  int objc,
  Tcl_Obj * const objv[]
){
  char *zHtml;                    /* Document text */
  char *z;                        /* Current parse position within zHtml */
  Tcl_Obj **aCall;                /* Callback command: SCRIPT + 2 arguments */
  int nElem;                      /* Number of elements in SCRIPT */
  Tcl_Obj **aElem;                /* Elements of SCRIPT */
  int rc;                         /* Return code */

  (void)clientData;

  if( objc!=3 ){
    Tcl_WrongNumArgs(interp, 1, objv, "HTML SCRIPT");
    return TCL_ERROR;
  }
  zHtml = Tcl_GetString(objv[1]);

  rc = Tcl_ListObjGetElements(interp, objv[2], &nElem, &aElem);
  if( rc!=TCL_OK ) return rc;
  aCall = (Tcl_Obj **)ckalloc(sizeof(Tcl_Obj *)*(nElem+2));
  memcpy(aCall, aElem, sizeof(Tcl_Obj *)*nElem);
  aCall[nElem] = 0;
  aCall[nElem+1] = 0;

  z = zHtml;
  while( *z ){
    char *zText = z;
    while( *z && *z!='<' ) z++;

    /* Invoke the callback script for the chunk of text just parsed. */
    Tcl_IncrRefCount( aCall[nElem]   = Tcl_NewObj() );
    Tcl_IncrRefCount( aCall[nElem+1] = Tcl_NewStringObj(zText, z-zText) );
    rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0);
    Tcl_DecrRefCount( aCall[nElem] );
    Tcl_DecrRefCount( aCall[nElem+1] );
    if( rc!=TCL_OK ) goto parse_out;

    /* Unless z is at the end of the document, it now points to the start
    ** of a markup tag. Either an opening or a closing tag. Parse it up and
    ** invoke the callback script. */
    if( *z ){
      int nTag;
      char *zTag;
      z++;

      while( ISSPACE(*z) ) z++;
      zTag = z;

      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      nTag = z-zTag;

      if( nTag==5 && 0==sqlite3_strnicmp("style", zTag, 5) ){
        /* Skip the style-sheet. Only the first 6 bytes ("/style") of the
        ** closing tag are compared. */
        while( *z && sqlite3_strnicmp("/style>", z, 6 ) ) z++;
      } else if( nTag>=3 && 0==memcmp("!--", zTag, 3) ){
        /* Skip the comment. The search for the terminator restarts right
        ** after the opening "!--": for a comment containing no white-space
        ** (e.g. "<!--x-->") the tag-name scan above has already advanced z
        ** on to the '>' of the terminator, which would otherwise be
        ** missed. */
        z = zTag+3;
        while( *z && sqlite3_strnicmp("-->", z, 3 ) ) z++;
      } else {
        Tcl_Obj *pParam = Tcl_NewObj();

        while( *z && *z!='>' ){
          char *zAttr;

          /* Gobble up white-space */
          while( ISSPACE(*z) ) z++;
          zAttr = z;

          /* Advance to the end of the attribute name */
          while( *z && *z!='>' && !ISSPACE(*z) && *z!='=' ) z++;
          if( z==zAttr ) zAttr = 0;

          if( zAttr ){
            Tcl_Obj *pAttr = Tcl_NewStringObj(zAttr, z-zAttr);
            Tcl_ListObjAppendElement(interp, pParam, pAttr);
          }
          while( ISSPACE(*z) ) z++;

          if( *z=='=' ){
            int nVal;
            char *zVal;
            z++;
            while( ISSPACE(*z) ) z++;
            zVal = z;

            if( *zVal=='"' ){
              zVal++;
              z++;
              while( *z && *z!='"' ) z++;
              nVal = z-zVal;
              /* Step over the closing quote. If the quote is missing, z
              ** already points at the nul-terminator and must not be
              ** advanced past the end of the buffer. */
              if( *z ) z++;
            }else{
              while( *z && !ISSPACE(*z) && *z!='>' ) z++;
              nVal = z-zVal;
            }
            Tcl_ListObjAppendElement(interp,pParam,Tcl_NewStringObj(zVal,nVal));
          }else if( zAttr ){
            Tcl_ListObjAppendElement(interp, pParam, Tcl_NewIntObj(1));
          }
        }

        Tcl_IncrRefCount( aCall[nElem]   = Tcl_NewStringObj(zTag, nTag) );
        Tcl_IncrRefCount( aCall[nElem+1] = pParam );
        rc = Tcl_EvalObjv(interp, nElem+2, aCall, 0);
        Tcl_DecrRefCount( aCall[nElem] );
        Tcl_DecrRefCount( aCall[nElem+1] );
        if( rc!=TCL_OK ) goto parse_out;
      }

      /* Move past the '>' that closes the tag (if any). */
      while( *z && !ISSPACE(*z) && *z!='>' ) z++;
      if( *z ) z++;
    }

  }
  rc = TCL_OK;

 parse_out:
  /* Free the callback array on every exit path. */
  ckfree((char *)aCall);
  return rc;
}


/*
** Implementation of search result ranking functions rank() and erank().
**
** The first argument is a blob interpreted as an array of 32-bit unsigned
** integers. Element 0 is the number of phrases (nPhrase) and element 1 the
** number of columns (nCol). These are followed by nPhrase*nCol "global"
** counts and then nPhrase*nCol "hit" counts for the current row. Each of
** the remaining nCol arguments is the number of tokens in the corresponding
** column of the current row.
**
** The score for a column is the sum over all phrases of (hits/global),
** divided by log10(100+nToken). The overall score is the sum of the column
** scores.
**
** If the user-data pointer registered with the function is NULL, the score
** is returned as a real number. Otherwise an HTML table describing how the
** score was calculated is returned as text.
**
** An error is returned if the number of arguments does not match nCol+1, or
** if the blob is missing or too small for the values it declares.
*/
static void rankfunc(sqlite3_context *pCtx, int nVal, sqlite3_value **apVal){
  u32 *aMatchinfo;                /* Blob passed as first argument */
  int nByte;                      /* Size of aMatchinfo[] in bytes */
  int nSlot;                      /* Number of u32 values following header */
  double score = 0.0;             /* Accumulated score */
  int iCol;
  int iPhrase;
  int nCol;
  int nPhrase;

  /* sqlite3_user_data() returns a pointer. Convert it to a boolean
  ** explicitly rather than assigning the pointer to an int. */
  int isExplain = (sqlite3_user_data(pCtx)!=0);
  char *zExplain = 0;

  if( nVal==0 ) goto wna;
  aMatchinfo = (u32 *)sqlite3_value_blob(apVal[0]);
  nByte = sqlite3_value_bytes(apVal[0]);
  if( aMatchinfo==0 || nByte<(int)(2*sizeof(u32)) ) goto bad_blob;

  nPhrase = aMatchinfo[0];
  nCol = aMatchinfo[1];
  if( nPhrase<0 || nCol<0 ) goto bad_blob;
  if( nVal-1!=nCol ) goto wna;

  /* Ensure that all 2*nPhrase*nCol values read by the loops below actually
  ** lie within the blob. Written as a division to avoid integer overflow. */
  nSlot = nByte/(int)sizeof(u32) - 2;
  if( nCol>0 && nPhrase>(nSlot/2)/nCol ) goto bad_blob;

  if( isExplain ) zExplain = sqlite3_mprintf("<table width=100%%>");

  for(iCol=0; iCol<nCol; iCol++){
    int nToken = sqlite3_value_int(apVal[iCol+1]);
    double colscore = 0.0;
    if( isExplain ){
      zExplain = sqlite3_mprintf("%z<tr><td>%d.<td>( ", zExplain, iCol);
    }
    for(iPhrase=0; iPhrase<nPhrase; iPhrase++){
      u32 nGlobal = aMatchinfo[2 + iPhrase*nCol + iCol];
      u32 nHit = aMatchinfo[2 + nPhrase*nCol + iPhrase*nCol + iCol];

      /* The division is skipped when there are no hits in this row. */
      if( nHit ) colscore += (double)nHit / (double)nGlobal;
      if( isExplain ){
        const char *zDiv = (iPhrase==0 ? "" : "+ ");
        /* nHit and nGlobal are unsigned, so format them with %u. */
        zExplain = sqlite3_mprintf("%z%s%u/%u ", zExplain, zDiv, nHit, nGlobal);
      }
    }
    colscore = colscore / (log(100+nToken)/log(10));
    score += colscore;
    if( isExplain ){
      zExplain = sqlite3_mprintf(
          "%z) / log(100+%d)<td> = %.4f", zExplain, nToken, colscore);
    }
  }

  if( isExplain ){
    /* The %z conversion frees zExplain. Ownership of the new string is
    ** passed to SQLite along with the sqlite3_free() destructor. */
    sqlite3_result_text(pCtx, sqlite3_mprintf(
        "%z<tr><td><td width=100%%><td>= <b>%.4f</b></table>", zExplain, score
    ), -1, sqlite3_free);
  }else{
    sqlite3_result_double(pCtx, score);
  }
  return;

 wna:
  sqlite3_result_error(pCtx,"wrong number of arguments to function rank()",-1);
  return;

 bad_blob:
  sqlite3_result_error(pCtx,"invalid matchinfo blob passed to function rank()",-1);
}

int Sqlite3_Init(Tcl_Interp *interp);

/*
** Register the rank() and erank() SQL functions with database handle db.
** Both are implemented by rankfunc(); erank() is distinguished by its
** non-NULL user-data pointer. Arguments pzErr and p are unused.
**
** Returns SQLITE_OK if both functions are registered successfully, or the
** error code from sqlite3_create_function() otherwise.
*/
static int initDb(sqlite3 *db, char **pzErr, void *p){
  int rc;

  (void)pzErr;
  (void)p;

  rc = sqlite3_create_function(db, "rank", -1, SQLITE_UTF8, 0, rankfunc, 0, 0);
  if( rc==SQLITE_OK ){
    rc = sqlite3_create_function(
        db, "erank", -1, SQLITE_UTF8, (void*)1, rankfunc, 0, 0
    );
  }
  return rc;
}

/*
** Package initialization routine. Arranges for initDb() to be run on
** database connections via sqlite3_auto_extension() and creates the Tcl
** commands [parsehtml] and [tokencount] in interpreter interp.
**
** Returns TCL_OK if successful, or TCL_ERROR if the Tcl stubs table cannot
** be initialized or the auto-extension cannot be registered.
*/
int Parsehtml_Init(Tcl_Interp *interp){
#ifdef USE_TCL_STUBS
  if (Tcl_InitStubs(interp, "8.4", 0) == 0) {
    return TCL_ERROR;
  }
#endif

  /* sqlite3_auto_extension() accepts a generic function pointer, so the
  ** entry point must be cast explicitly. Its return code is also checked,
  ** as registration can fail. */
  if( sqlite3_auto_extension((void(*)(void))initDb)!=SQLITE_OK ){
    Tcl_AppendResult(interp, "Error in sqlite3_auto_extension()", (char *)0);
    return TCL_ERROR;
  }

  Tcl_CreateObjCommand(interp, "parsehtml",  parsehtmlcmd, 0, 0);
  Tcl_CreateObjCommand(interp, "tokencount", tokencountcmd, 0, 0);

  return TCL_OK;
}


/*
** Application initialization callback handed to Tcl_Main(). Runs
** Sqlite3_Init() and then, only if that returned TCL_OK, Parsehtml_Init().
** The result of the last initializer that ran is returned.
*/
static int AppInit(Tcl_Interp *interp) {
  int rc = Sqlite3_Init(interp);
  if( rc==TCL_OK ){
    rc = Parsehtml_Init(interp);
  }
  return rc;
}

/*
** Program entry point. Hands control to Tcl_Main(), with AppInit() as the
** application initialization callback.
*/
int main(int argc, char **argv) {
  Tcl_Main(argc, argv, AppInit);
  return 0;
}