SQLite

Check-in [2631ceaeef]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 2631ceaeefaca3aa837e3b439399f13c51456914
User & Date: shess 2006-09-01 00:05:17.000
Context
2006-09-01
00:33
Make fts1.c not rely on nul-terminated strings. Mostly a matter of making sure we always pass around ptr/len, but there were a few places where we actually relied on nul-termination.

An earlier change had additionally changed appropriate sqlite3_bind_text() calls to sqlite3_bind_blob(). I've found that this changes what's actually stored in the database, so backed those changes out. Also (and this is weird), I found that I could no longer do straight-forward = queries against %_term.term at a command-line. (CVS 3379) (check-in: 5844db1aa9 user: shess tags: trunk)

00:05
Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378) (check-in: 2631ceaeef user: shess tags: trunk)
2006-08-31
15:07
Refactor the FTS1 module so that its name is "fts1" instead of "fulltext", so that all symbols with external linkage begin with "sqlite3Fts1", and so that all filenames begin with "fts1". (CVS 3377) (check-in: e1891f0dc5 user: drh tags: trunk)
Changes
Side-by-Side Diff Ignore Whitespace Patch
Changes to ext/fts1/fts1_tokenizer1.c.
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

46
47
48
49
50
51
52

53
54

55
56
57
58
59




60
61
62
63
64
65
66
67

68
69
70
71
72
73
74










75
76

77
78
79
80



81
82

83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

98
99
100
101
102
103
104
105
106
107
108
109
110
111
112

113
114

115
116
117
118
119
120
121
122
123
124
125
126

127
128
129
130
131
132
133
134
135
136
137
138
139



140

141

142
143
144
145
146














147
148

149
150
151
152
153


154
155
156

157
158
159


160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
25
26
27
28
29
30
31











32
33

34
35
36
37
38
39
40

41
42

43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58

59
60
61
62
63
64
65

66
67
68
69
70
71
72
73
74
75
76

77




78
79
80


81




82
83
84
85
86
87
88




89

90
91
92
93
94
95
96
97
98
99
100
101
102

103
104

105

106
107
108
109
110
111
112
113



114

115

116
117
118
119
120
121
122
123
124
125
126
127
128

129
130
131





132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

147
148
149
150


151
152
153


154
155


156
157
158

159
160
161




162
163
164
165
166
167
168







-
-
-
-
-
-
-
-
-
-
-


-
+






-
+

-
+
-




+
+
+
+







-
+






-
+
+
+
+
+
+
+
+
+
+

-
+
-
-
-
-
+
+
+
-
-
+
-
-
-
-







-
-
-
-
+
-













-
+

-
+
-








-
-
-
+
-

-










+
+
+
-
+

+
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+

-
+



-
-
+
+

-
-
+

-
-
+
+

-



-
-
-
-







#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts1_tokenizer.h"

/* Duplicate a string; the caller must free() the returned string.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  char *str = malloc(strlen(s) + 1);
  strcpy(str, s);
  return str;
}

typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  const char *zDelim;          /* token delimiters */
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  const char *pCurrent;        /* current position in pInput */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  char *pToken;                /* storage for current token */
  int nTokenBytes;             /* actual size of current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;

static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */

static int isDelim(simple_tokenizer *t, unsigned char c){
  return c<0x80 && t->delim[c];
}

static int simpleCreate(
  int argc, const char **argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
  t = (simple_tokenizer *) calloc(sizeof(simple_tokenizer), 1);
  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    t->zDelim = string_dup(argv[1]);
    int i, n = strlen(argv[1]);
    for(i=0; i<n; i++){
      unsigned char ch = argv[1][i];
      /* We explicitly don't support UTF-8 delimiters for now. */
      if( ch>=0x80 ){
        free(t);
        return SQLITE_ERROR;
      }
      t->delim[ch] = 1;
    }
  } else {
    /* Build a string excluding alphanumeric ASCII characters */
    /* Mark non-alphanumeric ASCII characters as delimiters */
    char zDelim[0x80];               /* nul-terminated, so nul not a member */
    int i, j;
    for(i=1, j=0; i<0x80; i++){
      if( !isalnum(i) ){
    int i;
    for(i=1; i<0x80; i++){
      t->delim[i] = !isalnum(i);
        zDelim[j++] = i;
      }
    }
    }
    zDelim[j++] = '\0';
    assert( j<=sizeof(zDelim) );
    t->zDelim = string_dup(zDelim);
  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}

static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *t = (simple_tokenizer *) pTokenizer;

  free((void *) t->zDelim);
  free(t);
  free(pTokenizer);

  return SQLITE_OK;
}

static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,
  const char *pInput, int nBytes,
  sqlite3_tokenizer_cursor **ppCursor
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
  c->pInput = pInput;
  c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
  c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
  c->iOffset = 0;                 /* start tokenizing at the beginning */
  c->iToken = 0;
  c->zToken = NULL;               /* no space allocated, yet. */
  c->pToken = NULL;               /* no space allocated, yet. */
  c->nTokenBytes = 0;
  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}

static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;

  if( NULL!=c->zToken ){
    free(c->zToken);
  free(c->pToken);
  }
  free(c);

  return SQLITE_OK;
}

static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
  int ii;
    int iStartOffset;

    /* Scan past delimiter characters */
  while( c->pCurrent-c->pInput<c->nBytes ){
    int n = (int) strcspn(c->pCurrent, t->zDelim);
    if( n>0 ){
      if( n+1>c->nTokenAllocated ){
        c->zToken = realloc(c->zToken, n+1);
    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        c->pToken = realloc(c->pToken, n);
      }
      for(ii=0; ii<n; ii++){
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        char ch = c->pCurrent[ii];
        c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
      }
      c->zToken[n] = '\0';
      *ppToken = c->zToken;
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = (int) (c->pCurrent-c->pInput);
      *piEndOffset = *piStartOffset+n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;
      c->pCurrent += n + 1;

      return SQLITE_OK;
    }
    c->pCurrent += n + 1;
    /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
    ** to happen in two places, though, which is annoying.
    */
  }
  return SQLITE_DONE;
}

static sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,