Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
2631ceaeefaca3aa837e3b439399f13c |
User & Date: | shess 2006-09-01 00:05:17.000 |
Context
2006-09-01
| ||
00:33 |
Make fts1.c not rely on nul-terminated strings. Mostly a matter of
making sure we always pass around ptr/len, but there were a few places
where we actually relied on nul-termination.
An earlier change had additionally changed appropriate sqlite3_bind_text() calls to sqlite3_bind_blob(). I've found that this changes what's actually stored in the database, so backed those changes out. Also (and this is weird), I found that I could no longer do straight-forward = queries against %_term.term at a command-line. (CVS 3379) (check-in: 5844db1aa9 user: shess tags: trunk) | |
00:05 | Make tokenizer not rely on nul-terminated text. Instead of using strcspn() and a nul-terminated delimiter list, I just flagged delimiters in an array and wrote things inline. Submitting this for review separately because it's pretty standalone. (CVS 3378) (check-in: 2631ceaeef user: shess tags: trunk) | |
2006-08-31
| ||
15:07 | Refactor the FTS1 module so that its name is "fts1" instead of "fulltext", so that all symbols with external linkage begin with "sqlite3Fts1", and so that all filenames begin with "fts1". (CVS 3377) (check-in: e1891f0dc5 user: drh tags: trunk) | |
Changes
Changes to ext/fts1/fts1_tokenizer1.c.
︙ | ︙ | |||
25 26 27 28 29 30 31 | #endif #include <stdio.h> #include <string.h> #include <ctype.h> #include "fts1_tokenizer.h" | < < < < < < < < < < < | | | < > > > > | | > > > > > > > > > | < | | | < | < < < < < < < | < | | < < < | < < > > > | > | > > | > > > > > | > > | | | | | < | | | < < < < < | 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 | #endif #include <stdio.h> #include <string.h> #include <ctype.h> #include "fts1_tokenizer.h" typedef struct simple_tokenizer { sqlite3_tokenizer base; char delim[128]; /* flag ASCII delimiters */ } simple_tokenizer; typedef struct simple_tokenizer_cursor { sqlite3_tokenizer_cursor base; const char *pInput; /* input we are tokenizing */ int nBytes; /* size of the input */ int iOffset; /* current position in pInput */ int iToken; /* index of next token to be returned */ char *pToken; /* storage for current token */ int nTokenAllocated; /* space allocated to zToken buffer */ } simple_tokenizer_cursor; static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ static int isDelim(simple_tokenizer *t, unsigned char c){ return c<0x80 && t->delim[c]; } static int simpleCreate( int argc, const char **argv, sqlite3_tokenizer **ppTokenizer ){ simple_tokenizer *t; t = (simple_tokenizer *) calloc(sizeof(simple_tokenizer), 1); /* TODO(shess) Delimiters need to remain the same from run to run, ** else we need to reindex. One solution would be a meta-table to ** track such information in the database, then we'd only want this ** information on the initial create. */ if( argc>1 ){ int i, n = strlen(argv[1]); for(i=0; i<n; i++){ unsigned char ch = argv[1][i]; /* We explicitly don't support UTF-8 delimiters for now. */ if( ch>=0x80 ){ free(t); return SQLITE_ERROR; } t->delim[ch] = 1; } } else { /* Mark non-alphanumeric ASCII characters as delimiters */ int i; for(i=1; i<0x80; i++){ t->delim[i] = !isalnum(i); } } *ppTokenizer = &t->base; return SQLITE_OK; } static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ free(pTokenizer); return SQLITE_OK; } static int simpleOpen( sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes, sqlite3_tokenizer_cursor **ppCursor ){ simple_tokenizer_cursor *c; c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); c->pInput = pInput; c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; c->iOffset = 0; /* start tokenizing at the beginning */ c->iToken = 0; c->pToken = NULL; /* no space allocated, yet. */ c->nTokenAllocated = 0; *ppCursor = &c->base; return SQLITE_OK; } static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; free(c->pToken); free(c); return SQLITE_OK; } static int simpleNext( sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes, int *piStartOffset, int *piEndOffset, int *piPosition ){ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; unsigned char *p = (unsigned char *)c->pInput; while( c->iOffset<c->nBytes ){ int iStartOffset; /* Scan past delimiter characters */ while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){ c->iOffset++; } /* Count non-delimiter characters. */ iStartOffset = c->iOffset; while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){ c->iOffset++; } if( c->iOffset>iStartOffset ){ int i, n = c->iOffset-iStartOffset; if( n>c->nTokenAllocated ){ c->pToken = realloc(c->pToken, n); } for(i=0; i<n; i++){ /* TODO(shess) This needs expansion to handle UTF-8 ** case-insensitivity. */ unsigned char ch = p[iStartOffset+i]; c->pToken[i] = ch<0x80 ? tolower(ch) : ch; } *ppToken = c->pToken; *pnBytes = n; *piStartOffset = iStartOffset; *piEndOffset = c->iOffset; *piPosition = c->iToken++; return SQLITE_OK; } } return SQLITE_DONE; } static sqlite3_tokenizer_module simpleTokenizerModule = { 0, simpleCreate, |
︙ | ︙ |