/ Check-in [2631ceae]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:2631ceaeefaca3aa837e3b439399f13c51456914
User & Date: shess 2006-09-01 00:05:17
Context
2006-09-01
00:33
Make fts1.c not rely on nul-terminated strings. Mostly a matter of making sure we always pass around ptr/len, but there were a few places where we actually relied on nul-termination.

An earlier change had additionally changed appropriate sqlite3_bind_text() calls to sqlite3_bind_blob(). I've found that this changes what's actually stored in the database, so backed those changes out. Also (and this is weird), I found that I could no longer do straight-forward = queries against %_term.term at a command-line. (CVS 3379) check-in: 5844db1a user: shess tags: trunk

00:05
Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378) check-in: 2631ceae user: shess tags: trunk
2006-08-31
15:07
Refactor the FTS1 module so that its name is "fts1" instead of "fulltext", so that all symbols with external linkage begin with "sqlite3Fts1", and so that all filenames begin with "fts1". (CVS 3377) check-in: e1891f0d user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1_tokenizer1.c.

25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45

46
47
48
49
50
51
52
53
54
55
56
57
58
59




60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144














145
146
147
148

149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts1_tokenizer.h"

/* Duplicate a string; the caller must free() the returned string.
 * (We don't use strdup() since it's not part of the standard C library and
 * may not be available everywhere.) */
/* TODO(shess) Copied from fulltext.c, consider util.c for such
** things. */
static char *string_dup(const char *s){
  char *str = malloc(strlen(s) + 1);
  strcpy(str, s);
  return str;
}

typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  const char *zDelim;          /* token delimiters */

} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  const char *pCurrent;        /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *zToken;                /* storage for current token */
  int nTokenBytes;             /* actual size of current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;

static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */





static int simpleCreate(
  int argc, const char **argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    t->zDelim = string_dup(argv[1]);
  } else {
    /* Build a string excluding alphanumeric ASCII characters */
    char zDelim[0x80];               /* nul-terminated, so nul not a member */
    int i, j;
    for(i=1, j=0; i<0x80; i++){
      if( !isalnum(i) ){
        zDelim[j++] = i;
      }
    }
    zDelim[j++] = '\0';
    assert( j<=sizeof(zDelim) );
    t->zDelim = string_dup(zDelim);



  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}

static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  simple_tokenizer *t = (simple_tokenizer *) pTokenizer;

  free((void *) t->zDelim);
  free(t);

  return SQLITE_OK;
}

static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,
  const char *pInput, int nBytes,
  sqlite3_tokenizer_cursor **ppCursor
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
  c->pInput = pInput;
  c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
  c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
  c->iToken = 0;
  c->zToken = NULL;               /* no space allocated, yet. */
  c->nTokenBytes = 0;
  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}

static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;

  if( NULL!=c->zToken ){
    free(c->zToken);
  }
  free(c);

  return SQLITE_OK;
}

static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  int ii;

  while( c->pCurrent-c->pInput<c->nBytes ){
    int n = (int) strcspn(c->pCurrent, t->zDelim);
    if( n>0 ){














      if( n+1>c->nTokenAllocated ){
        c->zToken = realloc(c->zToken, n+1);
      }
      for(ii=0; ii<n; ii++){

        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        char ch = c->pCurrent[ii];
        c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
      }
      c->zToken[n] = '\0';
      *ppToken = c->zToken;
      *pnBytes = n;
      *piStartOffset = (int) (c->pCurrent-c->pInput);
      *piEndOffset = *piStartOffset+n;
      *piPosition = c->iToken++;
      c->pCurrent += n + 1;

      return SQLITE_OK;
    }
    c->pCurrent += n + 1;
    /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
    ** to happen in two places, though, which is annoying.
    */
  }
  return SQLITE_DONE;
}

static sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,







<
<
<
<
<
<
<
<
<
<
<


<
>






|

|
<




>
>
>
>







|






|
|
|
|
|
|
|
|
|
|
|
|
|
>
>
>







<
<
<
|
<













|

|
<








<
<
|
<

<










|

|
|
<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
|

<
>



|
|

<
|

|
|

<



<
<
<
<







25
26
27
28
29
30
31











32
33

34
35
36
37
38
39
40
41
42
43

44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88



89

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105

106
107
108
109
110
111
112
113


114

115

116
117
118
119
120
121
122
123
124
125
126
127
128
129

130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146

147
148
149
150
151
152
153

154
155
156
157
158

159
160
161




162
163
164
165
166
167
168
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts1_tokenizer.h"












typedef struct simple_tokenizer {
  sqlite3_tokenizer base;

  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */

  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;

static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */

static int isDelim(simple_tokenizer *t, unsigned char c){
  return c<0x80 && t->delim[c];
}

static int simpleCreate(
  int argc, const char **argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) calloc(sizeof(simple_tokenizer), 1);
  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    int i, n = strlen(argv[1]);
    for(i=0; i<n; i++){
      unsigned char ch = argv[1][i];
      /* We explicitly don't support UTF-8 delimiters for now. */
      if( ch>=0x80 ){
        free(t);
        return SQLITE_ERROR;
      }
      t->delim[ch] = 1;
    }
  } else {
    /* Mark non-alphanumeric ASCII characters as delimiters */
    int i;
    for(i=1; i<0x80; i++){
      t->delim[i] = !isalnum(i);
    }
  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}

static int simpleDestroy(sqlite3_tokenizer *pTokenizer){



  free(pTokenizer);

  return SQLITE_OK;
}

static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,
  const char *pInput, int nBytes,
  sqlite3_tokenizer_cursor **ppCursor
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
  c->pInput = pInput;
  c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
  c->iOffset = 0;                 /* start tokenizing at the beginning */
  c->iToken = 0;
  c->pToken = NULL;               /* no space allocated, yet. */

  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}

static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;


  free(c->pToken);

  free(c);

  return SQLITE_OK;
}

static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,
  const char **ppToken, int *pnBytes,
  int *piStartOffset, int *piEndOffset, int *piPosition
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;


    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        c->pToken = realloc(c->pToken, n);
      }

      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
      }

      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;


      return SQLITE_OK;
    }




  }
  return SQLITE_DONE;
}

static sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,