SQLite

Artifact [bbfa4e3b]
Login

Artifact bbfa4e3b2a26ef17d4edc6d98cd4a3f5396d998a:


     1
     2
     3
     4
     5
     6
     7
     8
     9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84
    85
    86
    87
    88
    89
    90
    91
    92
    93
    94
    95
    96
    97
    98
    99
   100
   101
   102
   103
   104
   105
   106
   107
   108
   109
   110
   111
   112
   113
   114
   115
   116
   117
   118
   119
   120
   121
   122
   123
   124
   125
   126
   127
   128
   129
   130
   131
   132
   133
   134
   135
   136
   137
   138
   139
   140
   141
   142
   143
   144
   145
   146
   147
   148
   149
   150
   151
   152
   153
   154
   155
   156
   157
   158
   159
   160
   161
   162
   163
   164
   165
   166
   167
   168
   169
   170
   171
   172
   173
   174
/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/

#include <assert.h>
#if !defined(__APPLE__)
#include <malloc.h>
#else
#include <stdlib.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "tokenizer.h"

/* Duplicate a string; the caller must free() the returned string.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  char *str = malloc(strlen(s) + 1);
  strcpy(str, s);
  return str;
}

typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  const char *zDelim;          /* token delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  const char *pCurrent;        /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nTokenBytes;             /* actual size of current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;

static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */

static int simpleCreate(
  int argc, const char **argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    t->zDelim = string_dup(argv[1]);
  } else {
    /* Build a string excluding alphanumeric ASCII characters */
    char zDelim[0x80];               /* nul-terminated, so nul not a member */
    int i, j;
    for(i=1, j=0; i<0x80; i++){
      if( !isalnum(i) ){
        zDelim[j++] = i;
      }
    }
    zDelim[j++] = '\0';
    assert( j<=sizeof(zDelim) );
    t->zDelim = string_dup(zDelim);
  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}

static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *t = (simple_tokenizer *) pTokenizer;

  free((void *) t->zDelim);
  free(t);

  return SQLITE_OK;
}

static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,
  const char *pInput, int nBytes,
  sqlite3_tokenizer_cursor **ppCursor
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
  c->pInput = pInput;
  c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
  c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
  c->iToken = 0;
  c->zToken = NULL;               /* no space allocated, yet. */
  c->nTokenBytes = 0;
  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}

static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;

  if( NULL!=c->zToken ){
    free(c->zToken);
  }
  free(c);

  return SQLITE_OK;
}

static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  int ii;

  while( c->pCurrent-c->pInput<c->nBytes ){
    int n = (int) strcspn(c->pCurrent, t->zDelim);
    if( n>0 ){
      if( n+1>c->nTokenAllocated ){
        c->zToken = realloc(c->zToken, n+1);
      }
      for(ii=0; ii<n; ii++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        char ch = c->pCurrent[ii];
        c->zToken[ii] = (unsigned char)ch<0x80 ? tolower((unsigned char)ch):ch;
      }
      c->zToken[n] = '\0';
      *ppToken = c->zToken;
      *pnBytes = n;
      *piStartOffset = (int) (c->pCurrent-c->pInput);
      *piEndOffset = *piStartOffset+n;
      *piPosition = c->iToken++;
      c->pCurrent += n + 1;

      return SQLITE_OK;
    }
    c->pCurrent += n + 1;
    /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
    ** to happen in two places, though, which is annoying.
    */
  }
  return SQLITE_DONE;
}

static sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,
  simpleDestroy,
  simpleOpen,
  simpleClose,
  simpleNext,
};

void get_simple_tokenizer_module(
  sqlite3_tokenizer_module **ppModule
){
  *ppModule = &simpleTokenizerModule;
}