Documentation Source Text

Artifact [10f99ddbf1]
Login

Artifact 10f99ddbf1337b8553f8c1014038d3e3c63a52ad30dbceb73a0725f2897d66cb:



/*
** This file contains the implementation of a custom FTS5 tokenizer. This
** tokenizer implements the following special features:
**
**   * For all tokens that match the pattern "SQLITE_XXX" (case sensitive),
**     "XXX" is added as a synonym for SQLITE_XXX.
**
**   * For all tokens that match the pattern "sqlite3_xxx" (case sensitive),
**     "xxx" is added as a synonym for sqlite3_xxx.
**
** By default, this file builds a TCL extension.  But if the -DSQLITE_EXT
** compile-time option is used, then an SQLite extension is built instead.
*/

#ifdef SQLITE_EXT
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#else
#include <sqlite3.h>
#include <tcl.h>
#endif
#include <string.h>
#include <assert.h>


/*************************************************************************
** This is generic code copied from the FTS5 documentation.
**
** Return a pointer to the fts5_api pointer for database connection db.
** If an error occurs, return NULL and leave an error in the database 
** handle (accessible using sqlite3_errcode()/errmsg()).
*/
fts5_api *fts5_api_from_db(sqlite3 *db){
  fts5_api *pRet = 0;
  sqlite3_stmt *pStmt = 0;
  if( SQLITE_OK==sqlite3_prepare(db, "SELECT fts5(?1)", -1, &pStmt, 0) ){
    sqlite3_bind_pointer(pStmt, 1, (void*)&pRet, "fts5_api_ptr", 0);
    sqlite3_step(pStmt);
  }
  sqlite3_finalize(pStmt);
  return pRet;
}
/************************************************************************/

/*
** Simple ranking function used by search script. Assumes the queried
** table has the following 5 indexed columns:
**
**     apis,                      -- C APIs 
**     keywords,                  -- Keywords
**     title1,                    -- Document title
**     title2,                    -- Heading title, if any
**     content,                   -- Document text
**
** This function returns the following integer values:
**
**   10000 - all phrases present in "keywords".
**   1000 - all phrases present in "keywords", "title1" or "title2".
**   100 - all phrases present in "keywords", "title1" or "title2" or "apis".
**
** It adds a bonus of 10 if either of the above and the condition 
** (xRowid()>1000 && (xRowid() % 1000)==1) is true.
**
*/
void srankFunc(
  const Fts5ExtensionApi *pApi,   /* API offered by current FTS version */
  Fts5Context *pFts,              /* First arg to pass to pApi functions */
  sqlite3_context *pCtx,          /* Context for returning result/error */
  int nVal,                       /* Number of values in apVal[] array */
  sqlite3_value **apVal           /* Array of trailing arguments */
){
  int nPhrase;                    /* Number of phrases in query */
  int i;                          /* Used to iterate through phrases */
  int rc;                         /* Return code */
  int n1 = 0;
  int n2 = 0;
  int n3 = 0;
  int iScore = 0;                 /* Returned value */
  sqlite3_int64 iRowid;           /* Rowid for current row */

  iRowid = pApi->xRowid(pFts);
#if 0
  if( iRowid<1000 ) return;
#endif
  nPhrase = pApi->xPhraseCount(pFts);
  for(i=0; i<nPhrase; i++){
    Fts5PhraseIter iter;
    int ic, io;

    rc = pApi->xPhraseFirst(pFts, i, &iter, &ic, &io);
    if( rc!=SQLITE_OK ){
      sqlite3_result_error(pCtx, "Error in xPhraseFirst/xPhraseNext", -1);
      return;
    }

    if( ic==0 ){
      while( ic==0 ) pApi->xPhraseNext(pFts, &iter, &ic, &io);
      if( ic<0 ) ic = 0;
    }

    if( ic==1 ) n1++;
    if( ic==2 || ic==3 ) n2++;
    if( ic==0 ) n3++;
  }

  if( n1==nPhrase ){ iScore = 10000; }
  else if( n1+n2==nPhrase ){ iScore = 1000; }
  else if( n1+n2+n3==nPhrase ){ iScore = 100; }

  if( iScore && iRowid>1000 && (iRowid % 1000)==1 ){
    iScore += 10;
  }

  sqlite3_result_int(pCtx, iScore);
}



typedef struct STokenizer STokenizer;
typedef struct STokenCtx STokenCtx;

/*
** Tokenizer type. Casts to Fts5Tokenizer.
*/
struct STokenizer {
  fts5_tokenizer porter;
  Fts5Tokenizer *pPorter;
};

/*
** Context passed through underlying tokenizer to wrapper callback.
*/
struct STokenCtx {
  void *pCtx;
  int (*xToken)(void*, int, const char*, int, int, int);
};

static int stokenCreate(
  void *pCtx, 
  const char **azArg, int nArg, 
  Fts5Tokenizer **ppOut
){
  fts5_api *pApi = (fts5_api*)pCtx;
  STokenizer *p;
  void *pPorterCtx;
  int rc;

  /* Allocate the Fts5Tokenizer object for this tokenizer. */
  p = sqlite3_malloc(sizeof(STokenizer));
  if( p ){
    memset(p, 0, sizeof(STokenizer));
  }else{
    return SQLITE_NOMEM;
  }

  /* Locate and allocate the porter tokenizer */
  rc = pApi->xFindTokenizer(pApi, "porter", &pPorterCtx, &p->porter);
  if( rc==SQLITE_OK ){
    rc = p->porter.xCreate(pPorterCtx, azArg, nArg, &p->pPorter);
  }

  /* Return the new tokenizer to the caller */
  if( rc!=SQLITE_OK ){
    sqlite3_free(p);
    p = 0;
  }
  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

static void stokenDelete(Fts5Tokenizer *pTokenizer){
  STokenizer *p = (STokenizer*)pTokenizer;
  p->porter.xDelete(p->pPorter);
  sqlite3_free(p);
}

static int stokenTokenizeCb(
  void *pCtx,         /* Copy of 2nd argument to xTokenize() */
  int tflags,         /* Mask of FTS5_TOKEN_* flags */
  const char *pToken, /* Pointer to buffer containing token */
  int nToken,         /* Size of token in bytes */
  int iStart,         /* Byte offset of token within input text */
  int iEnd            /* Byte offset of end of token within input text */
){
  STokenCtx *p = (STokenCtx*)pCtx;
  int rc = p->xToken(p->pCtx, 0, pToken, nToken, iStart, iEnd);
  if( rc==SQLITE_OK && nToken>7 && 0==memcmp("sqlite_", pToken, 7) ){
    rc = p->xToken(
        p->pCtx, FTS5_TOKEN_COLOCATED, pToken+7, nToken-7, iStart, iEnd);
  }

  if( rc==SQLITE_OK && nToken>8 && 0==memcmp("sqlite3_", pToken, 8) ){
    rc = p->xToken(
        p->pCtx, FTS5_TOKEN_COLOCATED, pToken+8, nToken-8, iStart, iEnd);
  }

  return rc;
}

/*
** Tokenizer type for "html" tokenizer. Casts to Fts5Tokenizer.
*/
typedef struct HtmlTokenizer HtmlTokenizer;
struct HtmlTokenizer {
  fts5_tokenizer tokenizer;
  Fts5Tokenizer *pTokenizer;
};

static int htmlCreate(
  void *pCtx, 
  const char **azArg, int nArg, 
  Fts5Tokenizer **ppOut
){
  fts5_api *pApi = (fts5_api*)pCtx;
  HtmlTokenizer *p = 0;
  int rc = SQLITE_OK;

  if( nArg==0 ){
    rc = SQLITE_ERROR;
  }else{
    /* Allocate the Fts5Tokenizer object for this tokenizer. */
    p = sqlite3_malloc(sizeof(HtmlTokenizer));
    if( p ){
      memset(p, 0, sizeof(HtmlTokenizer));
    }else{
      return SQLITE_NOMEM;
    }
  }

  if( rc==SQLITE_OK ){
    /* Locate and allocate the next tokenizer */
    void *pNextCtx = 0;
    rc = pApi->xFindTokenizer(pApi, azArg[0], &pNextCtx, &p->tokenizer);
    if( rc==SQLITE_OK ){
      rc = p->tokenizer.xCreate(pNextCtx, &azArg[1], nArg-1, &p->pTokenizer);
    }
  }

  /* Return the new tokenizer to the caller */
  if( rc!=SQLITE_OK ){
    sqlite3_free(p);
    p = 0;
  }
  *ppOut = (Fts5Tokenizer*)p;
  return rc;
}

static void htmlDelete(Fts5Tokenizer *pTokenizer){
  HtmlTokenizer *p = (HtmlTokenizer*)pTokenizer;
  p->tokenizer.xDelete(p->pTokenizer);
  sqlite3_free(p);
}

static int htmlTokenize(
  Fts5Tokenizer *pTokenizer, 
  void *pCtx,
  int flags,            /* Mask of FTS5_TOKENIZE_* flags */
  const char *pText, int nText, 
  int (*xToken)(
    void *pCtx,         /* Copy of 2nd argument to xTokenize() */
    int tflags,         /* Mask of FTS5_TOKEN_* flags */
    const char *pToken, /* Pointer to buffer containing token */
    int nToken,         /* Size of token in bytes */
    int iStart,         /* Byte offset of token within input text */
    int iEnd            /* Byte offset of end of token within input text */
  )
){
  HtmlTokenizer *p = (HtmlTokenizer*)pTokenizer;
  char *zOut;
  int i;
  int bTag=0;
  int rc;
  
  zOut = sqlite3_malloc(nText+1);
  if( zOut==0 ){
    return SQLITE_NOMEM;
  }
  for(i=0; i<nText; i++){
    char c = pText[i];
    if( bTag==0 && c=='<' ) bTag = 1;
    zOut[i] = bTag ? ' ' : c;
    if( bTag==1 && c=='>' ) bTag = 0;
  }

  rc = p->tokenizer.xTokenize(p->pTokenizer, pCtx, flags, zOut, nText, xToken);
  sqlite3_free(zOut);
  return rc;
}

static int stokenTokenize(
  Fts5Tokenizer *pTokenizer, 
  void *pCtx,
  int flags,            /* Mask of FTS5_TOKENIZE_* flags */
  const char *pText, int nText, 
  int (*xToken)(
    void *pCtx,         /* Copy of 2nd argument to xTokenize() */
    int tflags,         /* Mask of FTS5_TOKEN_* flags */
    const char *pToken, /* Pointer to buffer containing token */
    int nToken,         /* Size of token in bytes */
    int iStart,         /* Byte offset of token within input text */
    int iEnd            /* Byte offset of end of token within input text */
  )
){
  STokenizer *p = (STokenizer*)pTokenizer;
  int rc;

  if( flags==FTS5_TOKENIZE_DOCUMENT ){
    STokenCtx ctx;
    ctx.xToken = xToken;
    ctx.pCtx = pCtx;
    rc = p->porter.xTokenize(
        p->pPorter, (void*)&ctx, flags, pText, nText, stokenTokenizeCb
    );
  }else{
    rc = p->porter.xTokenize(p->pPorter, pCtx, flags, pText, nText, xToken);
  }

  return rc;
}


static int register_tokenizer(sqlite3 *db, char **pzErr, void *p){
  fts5_api *pApi;
  fts5_tokenizer t;
  int rc;

  pApi = fts5_api_from_db(db);
  if( pApi==0 ){
    *pzErr = sqlite3_mprintf("fts5_api_from_db: %s", sqlite3_errmsg(db));
    return SQLITE_ERROR;
  }

  t.xCreate = stokenCreate;
  t.xDelete = stokenDelete;
  t.xTokenize = stokenTokenize;
  rc = pApi->xCreateTokenizer(pApi, "stoken", (void*)pApi, &t, 0);

  if( rc==SQLITE_OK ){
    t.xCreate = htmlCreate;
    t.xDelete = htmlDelete;
    t.xTokenize = htmlTokenize;
    rc = pApi->xCreateTokenizer(pApi, "html", (void*)pApi, &t, 0);
  }

  if( rc==SQLITE_OK ){
    rc = pApi->xCreateFunction(pApi, "srank", 0, srankFunc, 0);
  }

  return rc;
}

#ifdef SQLITE_EXT
#ifdef _WIN32
__declspec(dllexport)
#endif
int sqlite3_ftsext_init(
  sqlite3 *db, 
  char **pzErrMsg, 
  const sqlite3_api_routines *pApi
){
  int rc = SQLITE_OK;
  SQLITE_EXTENSION_INIT2(pApi);
  (void)pzErrMsg;  /* Unused parameter */
  rc = register_tokenizer(db, 0, 0);
  return rc;
}

#else
int Fts5ext_Init(Tcl_Interp *interp){
#ifdef USE_TCL_STUBS
  if (Tcl_InitStubs(interp, "8.4", 0) == 0) {
    return TCL_ERROR;
  }
#endif
  sqlite3_auto_extension((void (*)(void))register_tokenizer);
  return TCL_OK;
}
#endif