/ Check-in [2631ceae]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 2631ceaeefaca3aa837e3b439399f13c51456914
User & Date: shess 2006-09-01 00:05:17
Context
2006-09-01
00:33
Make fts1.c not rely on nul-terminated strings. Mostly a matter of making sure we always pass around ptr/len, but there were a few places where we actually relied on nul-termination.

An earlier change had additionally changed the appropriate sqlite3_bind_text() calls to sqlite3_bind_blob(). I've found that this changes what's actually stored in the database, so I backed those changes out. Also (and this is weird), I found that I could no longer do straightforward = queries against %_term.term at the command line. (CVS 3379) check-in: 5844db1a user: shess tags: trunk

00:05
Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378) check-in: 2631ceae user: shess tags: trunk
2006-08-31
15:07
Refactor the FTS1 module so that its name is "fts1" instead of "fulltext", so that all symbols with external linkage begin with "sqlite3Fts1", and so that all filenames begin with "fts1". (CVS 3377) check-in: e1891f0d user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1_tokenizer1.c.

    25     25   #endif
    26     26   #include <stdio.h>
    27     27   #include <string.h>
    28     28   #include <ctype.h>
    29     29   
    30     30   #include "fts1_tokenizer.h"
    31     31   
    32         -/* Duplicate a string; the caller must free() the returned string.
    33         - * (We don't use strdup() since it's not part of the standard C library and
    34         - * may not be available everywhere.) */
    35         -/* TODO(shess) Copied from fulltext.c, consider util.c for such
    36         -** things. */
    37         -static char *string_dup(const char *s){
    38         -  char *str = malloc(strlen(s) + 1);
    39         -  strcpy(str, s);
    40         -  return str;
    41         -}
    42         -
    43     32   typedef struct simple_tokenizer {
    44     33     sqlite3_tokenizer base;
    45         -  const char *zDelim;          /* token delimiters */
           34  +  char delim[128];             /* flag ASCII delimiters */
    46     35   } simple_tokenizer;
    47     36   
    48     37   typedef struct simple_tokenizer_cursor {
    49     38     sqlite3_tokenizer_cursor base;
    50     39     const char *pInput;          /* input we are tokenizing */
    51     40     int nBytes;                  /* size of the input */
    52         -  const char *pCurrent;        /* current position in pInput */
           41  +  int iOffset;                 /* current position in pInput */
    53     42     int iToken;                  /* index of next token to be returned */
    54         -  char *zToken;                /* storage for current token */
    55         -  int nTokenBytes;             /* actual size of current token */
           43  +  char *pToken;                /* storage for current token */
    56     44     int nTokenAllocated;         /* space allocated to zToken buffer */
    57     45   } simple_tokenizer_cursor;
    58     46   
    59     47   static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
           48  +
           49  +static int isDelim(simple_tokenizer *t, unsigned char c){
           50  +  return c<0x80 && t->delim[c];
           51  +}
    60     52   
    61     53   static int simpleCreate(
    62     54     int argc, const char **argv,
    63     55     sqlite3_tokenizer **ppTokenizer
    64     56   ){
    65     57     simple_tokenizer *t;
    66     58   
    67         -  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
           59  +  t = (simple_tokenizer *) calloc(sizeof(simple_tokenizer), 1);
    68     60     /* TODO(shess) Delimiters need to remain the same from run to run,
    69     61     ** else we need to reindex.  One solution would be a meta-table to
    70     62     ** track such information in the database, then we'd only want this
    71     63     ** information on the initial create.
    72     64     */
    73     65     if( argc>1 ){
    74         -    t->zDelim = string_dup(argv[1]);
    75         -  } else {
    76         -    /* Build a string excluding alphanumeric ASCII characters */
    77         -    char zDelim[0x80];               /* nul-terminated, so nul not a member */
    78         -    int i, j;
    79         -    for(i=1, j=0; i<0x80; i++){
    80         -      if( !isalnum(i) ){
    81         -        zDelim[j++] = i;
           66  +    int i, n = strlen(argv[1]);
           67  +    for(i=0; i<n; i++){
           68  +      unsigned char ch = argv[1][i];
           69  +      /* We explicitly don't support UTF-8 delimiters for now. */
           70  +      if( ch>=0x80 ){
           71  +        free(t);
           72  +        return SQLITE_ERROR;
    82     73         }
           74  +      t->delim[ch] = 1;
    83     75       }
    84         -    zDelim[j++] = '\0';
    85         -    assert( j<=sizeof(zDelim) );
    86         -    t->zDelim = string_dup(zDelim);
           76  +  } else {
           77  +    /* Mark non-alphanumeric ASCII characters as delimiters */
           78  +    int i;
           79  +    for(i=1; i<0x80; i++){
           80  +      t->delim[i] = !isalnum(i);
           81  +    }
    87     82     }
    88     83   
    89     84     *ppTokenizer = &t->base;
    90     85     return SQLITE_OK;
    91     86   }
    92     87   
    93     88   static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
    94         -  simple_tokenizer *t = (simple_tokenizer *) pTokenizer;
    95         -
    96         -  free((void *) t->zDelim);
    97         -  free(t);
    98         -
           89  +  free(pTokenizer);
    99     90     return SQLITE_OK;
   100     91   }
   101     92   
   102     93   static int simpleOpen(
   103     94     sqlite3_tokenizer *pTokenizer,
   104     95     const char *pInput, int nBytes,
   105     96     sqlite3_tokenizer_cursor **ppCursor
   106     97   ){
   107     98     simple_tokenizer_cursor *c;
   108     99   
   109    100     c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
   110    101     c->pInput = pInput;
   111    102     c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
   112         -  c->pCurrent = c->pInput;        /* start tokenizing at the beginning */
          103  +  c->iOffset = 0;                 /* start tokenizing at the beginning */
   113    104     c->iToken = 0;
   114         -  c->zToken = NULL;               /* no space allocated, yet. */
   115         -  c->nTokenBytes = 0;
          105  +  c->pToken = NULL;               /* no space allocated, yet. */
   116    106     c->nTokenAllocated = 0;
   117    107   
   118    108     *ppCursor = &c->base;
   119    109     return SQLITE_OK;
   120    110   }
   121    111   
   122    112   static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
   123    113     simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
   124         -
   125         -  if( NULL!=c->zToken ){
   126         -    free(c->zToken);
   127         -  }
          114  +  free(c->pToken);
   128    115     free(c);
   129         -
   130    116     return SQLITE_OK;
   131    117   }
   132    118   
   133    119   static int simpleNext(
   134    120     sqlite3_tokenizer_cursor *pCursor,
   135    121     const char **ppToken, int *pnBytes,
   136    122     int *piStartOffset, int *piEndOffset, int *piPosition
   137    123   ){
   138    124     simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
   139    125     simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
   140         -  int ii;
          126  +  unsigned char *p = (unsigned char *)c->pInput;
          127  +
          128  +  while( c->iOffset<c->nBytes ){
          129  +    int iStartOffset;
          130  +
          131  +    /* Scan past delimiter characters */
          132  +    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
          133  +      c->iOffset++;
          134  +    }
          135  +
          136  +    /* Count non-delimiter characters. */
          137  +    iStartOffset = c->iOffset;
          138  +    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
          139  +      c->iOffset++;
          140  +    }
   141    141   
   142         -  while( c->pCurrent-c->pInput<c->nBytes ){
   143         -    int n = (int) strcspn(c->pCurrent, t->zDelim);
   144         -    if( n>0 ){
   145         -      if( n+1>c->nTokenAllocated ){
   146         -        c->zToken = realloc(c->zToken, n+1);
          142  +    if( c->iOffset>iStartOffset ){
          143  +      int i, n = c->iOffset-iStartOffset;
          144  +      if( n>c->nTokenAllocated ){
          145  +        c->pToken = realloc(c->pToken, n);
   147    146         }
   148         -      for(ii=0; ii<n; ii++){
          147  +      for(i=0; i<n; i++){
   149    148           /* TODO(shess) This needs expansion to handle UTF-8
   150    149           ** case-insensitivity.
   151    150           */
   152         -        char ch = c->pCurrent[ii];
   153         -        c->zToken[ii] = (unsigned char)ch<0x80 ? tolower(ch) : ch;
          151  +        unsigned char ch = p[iStartOffset+i];
          152  +        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
   154    153         }
   155         -      c->zToken[n] = '\0';
   156         -      *ppToken = c->zToken;
          154  +      *ppToken = c->pToken;
   157    155         *pnBytes = n;
   158         -      *piStartOffset = (int) (c->pCurrent-c->pInput);
   159         -      *piEndOffset = *piStartOffset+n;
          156  +      *piStartOffset = iStartOffset;
          157  +      *piEndOffset = c->iOffset;
   160    158         *piPosition = c->iToken++;
   161         -      c->pCurrent += n + 1;
   162    159   
   163    160         return SQLITE_OK;
   164    161       }
   165         -    c->pCurrent += n + 1;
   166         -    /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
   167         -    ** to happen in two places, though, which is annoying.
   168         -    */
   169    162     }
   170    163     return SQLITE_DONE;
   171    164   }
   172    165   
   173    166   static sqlite3_tokenizer_module simpleTokenizerModule = {
   174    167     0,
   175    168     simpleCreate,