Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | A first implementation of a full-text search module for SQLite. (CVS 3363) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
b0d8e0d314d6f77b7d4b5dd00c694a13 |
User & Date: | adamd 2006-08-23 23:58:50.000 |
Context
2006-08-24
| ||
02:42 | Tcl interface does filename translation prior to calling sqlite3_open(). Ticket #1937. (CVS 3364) (check-in: 5696e0cb77 user: drh tags: trunk) | |
2006-08-23
| ||
23:58 | A first implementation of a full-text search module for SQLite. (CVS 3363) (check-in: b0d8e0d314 user: adamd tags: trunk) | |
20:07 | Add the new experimental sqlite3_auto_extension() API. (CVS 3362) (check-in: a85fc877eb user: drh tags: trunk) | |
Changes
Added ext/fts1/ft_hash.c.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 | /* ** 2001 September 22 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** This is the implementation of generic hash-tables used in SQLite. ** We've modified it slightly to serve as a standalone hash table ** implementation for the full-text indexing module. */ #include <assert.h> #include <stdlib.h> #include <string.h> #include "ft_hash.h" void *malloc_and_zero(int n){ void *p = malloc(n); if( p ){ memset(p, 0, n); } return p; } /* Turn bulk memory into a hash table object by initializing the ** fields of the Hash structure. ** ** "pNew" is a pointer to the hash table that is to be initialized. ** keyClass is one of the constants HASH_INT, HASH_POINTER, ** HASH_BINARY, or HASH_STRING. The value of keyClass ** determines what kind of key the hash table will use. "copyKey" is ** true if the hash table should make its own private copy of keys and ** false if it should just use the supplied pointer. CopyKey only makes ** sense for HASH_STRING and HASH_BINARY and is ignored ** for other key classes. */ void HashInit(Hash *pNew, int keyClass, int copyKey){ assert( pNew!=0 ); assert( keyClass>=HASH_STRING && keyClass<=HASH_BINARY ); pNew->keyClass = keyClass; #if 0 if( keyClass==HASH_POINTER || keyClass==HASH_INT ) copyKey = 0; #endif pNew->copyKey = copyKey; pNew->first = 0; pNew->count = 0; pNew->htsize = 0; pNew->ht = 0; pNew->xMalloc = malloc_and_zero; pNew->xFree = free; } /* Remove all entries from a hash table. Reclaim all memory. ** Call this routine to delete a hash table or to reset a hash table ** to the empty state. */ void HashClear(Hash *pH){ HashElem *elem; /* For looping over all elements of the table */ assert( pH!=0 ); elem = pH->first; pH->first = 0; if( pH->ht ) pH->xFree(pH->ht); pH->ht = 0; pH->htsize = 0; while( elem ){ HashElem *next_elem = elem->next; if( pH->copyKey && elem->pKey ){ pH->xFree(elem->pKey); } pH->xFree(elem); elem = next_elem; } pH->count = 0; } #if 0 /* NOT USED */ /* ** Hash and comparison functions when the mode is HASH_INT */ static int intHash(const void *pKey, int nKey){ return nKey ^ (nKey<<8) ^ (nKey>>8); } static int intCompare(const void *pKey1, int n1, const void *pKey2, int n2){ return n2 - n1; } #endif #if 0 /* NOT USED */ /* ** Hash and comparison functions when the mode is HASH_POINTER */ static int ptrHash(const void *pKey, int nKey){ uptr x = Addr(pKey); return x ^ (x<<8) ^ (x>>8); } static int ptrCompare(const void *pKey1, int n1, const void *pKey2, int n2){ if( pKey1==pKey2 ) return 0; if( pKey1<pKey2 ) return -1; return 1; } #endif /* ** Hash and comparison functions when the mode is HASH_STRING */ static int strHash(const void *pKey, int nKey){ const char *z = (const char *)pKey; int h = 0; if( nKey<=0 ) nKey = (int) strlen(z); while( nKey > 0 ){ h = (h<<3) ^ h ^ *z++; nKey--; } return h & 0x7fffffff; } static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){ if( n1!=n2 ) return 1; return strncmp((const char*)pKey1,(const char*)pKey2,n1); } /* ** Hash and comparison functions when the mode is HASH_BINARY */ static int binHash(const void *pKey, int nKey){ int h = 0; const char *z = (const char *)pKey; while( nKey-- > 0 ){ h = (h<<3) ^ h ^ *(z++); } return h & 0x7fffffff; } static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){ if( n1!=n2 ) return 1; return memcmp(pKey1,pKey2,n1); } /* ** Return a pointer to the appropriate hash function given the key class. ** ** The C syntax in this function definition may be unfamilar to some ** programmers, so we provide the following additional explanation: ** ** The name of the function is "hashFunction". The function takes a ** single parameter "keyClass". The return value of hashFunction() ** is a pointer to another function. Specifically, the return value ** of hashFunction() is a pointer to a function that takes two parameters ** with types "const void*" and "int" and returns an "int". */ static int (*hashFunction(int keyClass))(const void*,int){ #if 0 /* HASH_INT and HASH_POINTER are never used */ switch( keyClass ){ case HASH_INT: return &intHash; case HASH_POINTER: return &ptrHash; case HASH_STRING: return &strHash; case HASH_BINARY: return &binHash;; default: break; } return 0; #else if( keyClass==HASH_STRING ){ return &strHash; }else{ assert( keyClass==HASH_BINARY ); return &binHash; } #endif } /* ** Return a pointer to the appropriate hash function given the key class. ** ** For help in interpreted the obscure C code in the function definition, ** see the header comment on the previous function. */ static int (*compareFunction(int keyClass))(const void*,int,const void*,int){ #if 0 /* HASH_INT and HASH_POINTER are never used */ switch( keyClass ){ case HASH_INT: return &intCompare; case HASH_POINTER: return &ptrCompare; case HASH_STRING: return &strCompare; case HASH_BINARY: return &binCompare; default: break; } return 0; #else if( keyClass==HASH_STRING ){ return &strCompare; }else{ assert( keyClass==HASH_BINARY ); return &binCompare; } #endif } /* Link an element into the hash table */ static void insertElement( Hash *pH, /* The complete hash table */ struct _ht *pEntry, /* The entry into which pNew is inserted */ HashElem *pNew /* The element to be inserted */ ){ HashElem *pHead; /* First element already in pEntry */ pHead = pEntry->chain; if( pHead ){ pNew->next = pHead; pNew->prev = pHead->prev; if( pHead->prev ){ pHead->prev->next = pNew; } else { pH->first = pNew; } pHead->prev = pNew; }else{ pNew->next = pH->first; if( pH->first ){ pH->first->prev = pNew; } pNew->prev = 0; pH->first = pNew; } pEntry->count++; pEntry->chain = pNew; } /* Resize the hash table so that it cantains "new_size" buckets. ** "new_size" must be a power of 2. The hash table might fail ** to resize if sqliteMalloc() fails. */ static void rehash(Hash *pH, int new_size){ struct _ht *new_ht; /* The new hash table */ HashElem *elem, *next_elem; /* For looping over existing elements */ int (*xHash)(const void*,int); /* The hash function */ assert( (new_size & (new_size-1))==0 ); new_ht = (struct _ht *)pH->xMalloc( new_size*sizeof(struct _ht) ); if( new_ht==0 ) return; if( pH->ht ) pH->xFree(pH->ht); pH->ht = new_ht; pH->htsize = new_size; xHash = hashFunction(pH->keyClass); for(elem=pH->first, pH->first=0; elem; elem = next_elem){ int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1); next_elem = elem->next; insertElement(pH, &new_ht[h], elem); } } /* This function (for internal use only) locates an element in an ** hash table that matches the given key. The hash for this key has ** already been computed and is passed as the 4th parameter. */ static HashElem *findElementGivenHash( const Hash *pH, /* The pH to be searched */ const void *pKey, /* The key we are searching for */ int nKey, int h /* The hash for this key. */ ){ HashElem *elem; /* Used to loop thru the element list */ int count; /* Number of elements left to test */ int (*xCompare)(const void*,int,const void*,int); /* comparison function */ if( pH->ht ){ struct _ht *pEntry = &pH->ht[h]; elem = pEntry->chain; count = pEntry->count; xCompare = compareFunction(pH->keyClass); while( count-- && elem ){ if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){ return elem; } elem = elem->next; } } return 0; } /* Remove a single entry from the hash table given a pointer to that ** element and a hash on the element's key. */ static void removeElementGivenHash( Hash *pH, /* The pH containing "elem" */ HashElem* elem, /* The element to be removed from the pH */ int h /* Hash value for the element */ ){ struct _ht *pEntry; if( elem->prev ){ elem->prev->next = elem->next; }else{ pH->first = elem->next; } if( elem->next ){ elem->next->prev = elem->prev; } pEntry = &pH->ht[h]; if( pEntry->chain==elem ){ pEntry->chain = elem->next; } pEntry->count--; if( pEntry->count<=0 ){ pEntry->chain = 0; } if( pH->copyKey && elem->pKey ){ pH->xFree(elem->pKey); } pH->xFree( elem ); pH->count--; if( pH->count<=0 ){ assert( pH->first==0 ); assert( pH->count==0 ); HashClear(pH); } } /* Attempt to locate an element of the hash table pH with a key ** that matches pKey,nKey. Return the data for this element if it is ** found, or NULL if there is no match. */ void *HashFind(const Hash *pH, const void *pKey, int nKey){ int h; /* A hash on key */ HashElem *elem; /* The element that matches key */ int (*xHash)(const void*,int); /* The hash function */ if( pH==0 || pH->ht==0 ) return 0; xHash = hashFunction(pH->keyClass); assert( xHash!=0 ); h = (*xHash)(pKey,nKey); assert( (pH->htsize & (pH->htsize-1))==0 ); elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1)); return elem ? elem->data : 0; } /* Insert an element into the hash table pH. The key is pKey,nKey ** and the data is "data". ** ** If no element exists with a matching key, then a new ** element is created. A copy of the key is made if the copyKey ** flag is set. NULL is returned. ** ** If another element already exists with the same key, then the ** new data replaces the old data and the old data is returned. ** The key is not copied in this instance. If a malloc fails, then ** the new data is returned and the hash table is unchanged. ** ** If the "data" parameter to this function is NULL, then the ** element corresponding to "key" is removed from the hash table. */ void *HashInsert(Hash *pH, const void *pKey, int nKey, void *data){ int hraw; /* Raw hash value of the key */ int h; /* the hash of the key modulo hash table size */ HashElem *elem; /* Used to loop thru the element list */ HashElem *new_elem; /* New element added to the pH */ int (*xHash)(const void*,int); /* The hash function */ assert( pH!=0 ); xHash = hashFunction(pH->keyClass); assert( xHash!=0 ); hraw = (*xHash)(pKey, nKey); assert( (pH->htsize & (pH->htsize-1))==0 ); h = hraw & (pH->htsize-1); elem = findElementGivenHash(pH,pKey,nKey,h); if( elem ){ void *old_data = elem->data; if( data==0 ){ removeElementGivenHash(pH,elem,h); }else{ elem->data = data; } return old_data; } if( data==0 ) return 0; new_elem = (HashElem*)pH->xMalloc( sizeof(HashElem) ); if( new_elem==0 ) return data; if( pH->copyKey && pKey!=0 ){ new_elem->pKey = pH->xMalloc( nKey ); if( new_elem->pKey==0 ){ pH->xFree(new_elem); return data; } memcpy((void*)new_elem->pKey, pKey, nKey); }else{ new_elem->pKey = (void*)pKey; } new_elem->nKey = nKey; pH->count++; if( pH->htsize==0 ){ rehash(pH,8); if( pH->htsize==0 ){ pH->count = 0; pH->xFree(new_elem); return data; } } if( pH->count > pH->htsize ){ rehash(pH,pH->htsize*2); } assert( pH->htsize>0 ); assert( (pH->htsize & (pH->htsize-1))==0 ); h = hraw & (pH->htsize-1); insertElement(pH, &pH->ht[h], new_elem); new_elem->data = data; return 0; } |
Added ext/fts1/ft_hash.h.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* ** 2001 September 22 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** This is the header file for the generic hash-table implemenation ** used in SQLite. We've modified it slightly to serve as a standalone ** hash table implementation for the full-text indexing module. ** */ #ifndef _HASH_H_ #define _HASH_H_ /* Forward declarations of structures. */ typedef struct Hash Hash; typedef struct HashElem HashElem; /* A complete hash table is an instance of the following structure. ** The internals of this structure are intended to be opaque -- client ** code should not attempt to access or modify the fields of this structure ** directly. Change this structure only by using the routines below. ** However, many of the "procedures" and "functions" for modifying and ** accessing this structure are really macros, so we can't really make ** this structure opaque. */ struct Hash { char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */ char copyKey; /* True if copy of key made on insert */ int count; /* Number of entries in this table */ HashElem *first; /* The first element of the array */ void *(*xMalloc)(int); /* malloc() function to use */ void (*xFree)(void *); /* free() function to use */ int htsize; /* Number of buckets in the hash table */ struct _ht { /* the hash table */ int count; /* Number of entries with this hash */ HashElem *chain; /* Pointer to first entry with this hash */ } *ht; }; /* Each element in the hash table is an instance of the following ** structure. All elements are stored on a single doubly-linked list. ** ** Again, this structure is intended to be opaque, but it can't really ** be opaque because it is used by macros. */ struct HashElem { HashElem *next, *prev; /* Next and previous elements in the table */ void *data; /* Data associated with this element */ void *pKey; int nKey; /* Key associated with this element */ }; /* ** There are 4 different modes of operation for a hash table: ** ** HASH_INT nKey is used as the key and pKey is ignored. ** ** HASH_POINTER pKey is used as the key and nKey is ignored. ** ** HASH_STRING pKey points to a string that is nKey bytes long ** (including the null-terminator, if any). Case ** is respected in comparisons. ** ** HASH_BINARY pKey points to binary data nKey bytes long. ** memcmp() is used to compare keys. ** ** A copy of the key is made for HASH_STRING and HASH_BINARY ** if the copyKey parameter to HashInit is 1. */ /* #define HASH_INT 1 // NOT USED */ /* #define HASH_POINTER 2 // NOT USED */ #define HASH_STRING 3 #define HASH_BINARY 4 /* ** Access routines. To delete, insert a NULL pointer. */ void HashInit(Hash*, int keytype, int copyKey); void *HashInsert(Hash*, const void *pKey, int nKey, void *pData); void *HashFind(const Hash*, const void *pKey, int nKey); void HashClear(Hash*); /* ** Macros for looping over all elements of a hash table. The idiom is ** like this: ** ** Hash h; ** HashElem *p; ** ... ** for(p=HashFirst(&h); p; p=HashNext(p)){ ** SomeStructure *pData = HashData(p); ** // do something with pData ** } */ #define HashFirst(H) ((H)->first) #define HashNext(E) ((E)->next) #define HashData(E) ((E)->data) #define HashKey(E) ((E)->pKey) #define HashKeysize(E) ((E)->nKey) /* ** Number of entries in a hash table */ #define HashCount(H) ((H)->count) #endif /* _HASH_H_ */ |
Added ext/fts1/fulltext.c.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 | /* The author disclaims copyright to this source code. * * This is an SQLite module implementing full-text search. */ #include <assert.h> #if !defined(__APPLE__) #include <malloc.h> #else #include <stdlib.h> #endif #include <stdio.h> #include <string.h> #include <ctype.h> #include "fulltext.h" #include "ft_hash.h" #include "tokenizer.h" #include "sqlite3.h" #include "sqlite3ext.h" SQLITE_EXTENSION_INIT1 /* utility functions */ /* We encode variable-length integers in little-endian order using seven bits * per byte as follows: ** ** KEY: ** A = 0xxxxxxx 7 bits of data and one flag bit ** B = 1xxxxxxx 7 bits of data and one flag bit ** ** 7 bits - A ** 14 bits - BA ** 21 bits - BBA ** and so on. */ /* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */ #define VARINT_MAX 10 /* Write a 64-bit variable-length integer to memory starting at p[0]. * The length of data written will be between 1 and VARINT_MAX bytes. * The number of bytes written is returned. */ int putVarint(char *p, sqlite_int64 v){ unsigned char *q = (unsigned char *) p; sqlite_uint64 vu = v; do{ *q++ = (unsigned char) ((vu & 0x7f) | 0x80); vu >>= 7; }while( vu!=0 ); q[-1] &= 0x7f; /* turn off high bit in final byte */ assert( q - (unsigned char *)p <= VARINT_MAX ); return (int) (q - (unsigned char *)p); } /* Read a 64-bit variable-length integer from memory starting at p[0]. * Return the number of bytes read, or 0 on error. * The value is stored in *v. */ int getVarint(const char *p, sqlite_int64 *v){ const unsigned char *q = (const unsigned char *) p; sqlite_uint64 x = 0, y = 1; while( (*q & 0x80) == 0x80 ){ x += y * (*q++ & 0x7f); y <<= 7; if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */ assert( 0 ); return 0; } } x += y * (*q++); *v = (sqlite_int64) x; return (int) (q - (unsigned char *)p); } int getVarint32(const char *p, int *pi){ sqlite_int64 i; int ret = getVarint(p, &i); *pi = (int) i; assert( *pi==i ); return ret; } /*** Document lists *** * * A document list holds a sorted list of varint-encoded document IDs. * * A doclist with type DL_POSITIONS_OFFSETS is stored like this: * * array { * varint docid; * array { * varint position; (delta from previous position plus 1, or 0 for end) * varint startOffset; (delta from previous startOffset) * varint endOffset; (delta from startOffset) * } * } * * Here, array { X } means zero or more occurrences of X, adjacent in memory. * * A doclist with type DL_POSITIONS is like the above, but holds only docids * and positions without offset information. * * A doclist with type DL_DOCIDS is like the above, but holds only docids * without positions or offset information. * * On disk, every document list has positions and offsets, so we don't bother * to serialize a doclist's type. * * We don't yet delta-encode document IDs; doing so will probably be a * modest win. * * NOTE(shess) I've thought of a slightly (1%) better offset encoding. * After the first offset, estimate the next offset by using the * current token position and the previous token position and offset, * offset to handle some variance. So the estimate would be * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded * as normal. Offsets more than 64 chars from the estimate are * encoded as the delta to the previous start offset + 128. An * additional tiny increment can be gained by using the end offset of * the previous token to make the estimate a tiny bit more precise. */ typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */ } DocListType; typedef struct DocList { char *pData; int nData; DocListType iType; int iLastPos; /* the last position written */ int iLastOffset; /* the last start offset written */ } DocList; /* Initialize a new DocList to hold the given data. */ static void docListInit(DocList *d, DocListType iType, const char *pData, int nData){ d->nData = nData; if( nData>0 ){ d->pData = malloc(nData); memcpy(d->pData, pData, nData); } else { d->pData = NULL; } d->iType = iType; d->iLastPos = 0; d->iLastOffset = 0; } /* Create a new dynamically-allocated DocList. */ static DocList *docListNew(DocListType iType){ DocList *d = (DocList *) malloc(sizeof(DocList)); docListInit(d, iType, 0, 0); return d; } static void docListDestroy(DocList *d){ free(d->pData); #ifndef NDEBUG memset(d, 0x55, sizeof(*d)); #endif } static void docListDelete(DocList *d){ docListDestroy(d); free(d); } static char *docListEnd(DocList *d){ return d->pData + d->nData; } /* Append a varint to a DocList's data. */ static void appendVarint(DocList *d, sqlite_int64 i){ char c[VARINT_MAX]; int n = putVarint(c, i); d->pData = realloc(d->pData, d->nData + n); memcpy(d->pData + d->nData, c, n); d->nData += n; } static void docListAddDocid(DocList *d, sqlite_int64 iDocid){ appendVarint(d, iDocid); d->iLastPos = 0; } /* Add a position to the last position list in a doclist. */ static void docListAddPos(DocList *d, int iPos){ assert( d->iType>=DL_POSITIONS ); appendVarint(d, iPos-d->iLastPos+1); d->iLastPos = iPos; } static void docListAddPosOffset(DocList *d, int iPos, int iStartOffset, int iEndOffset){ assert( d->iType==DL_POSITIONS_OFFSETS ); docListAddPos(d, iPos); appendVarint(d, iStartOffset-d->iLastOffset); d->iLastOffset = iStartOffset; appendVarint(d, iEndOffset-iStartOffset); } /* Terminate the last position list in the given doclist. */ static void docListAddEndPos(DocList *d){ appendVarint(d, 0); } typedef struct DocListReader { DocList *pDoclist; char *p; int iLastPos; /* the last position read */ } DocListReader; static void readerInit(DocListReader *r, DocList *pDoclist){ r->pDoclist = pDoclist; if( pDoclist!=NULL ){ r->p = pDoclist->pData; } r->iLastPos = 0; } static int readerAtEnd(DocListReader *pReader){ return pReader->p >= docListEnd(pReader->pDoclist); } /* Peek at the next docid without advancing the read pointer. */ static sqlite_int64 peekDocid(DocListReader *pReader){ sqlite_int64 ret; assert( !readerAtEnd(pReader) ); getVarint(pReader->p, &ret); return ret; } /* Read the next docid. */ static sqlite_int64 readDocid(DocListReader *pReader){ sqlite_int64 ret; assert( !readerAtEnd(pReader) ); pReader->p += getVarint(pReader->p, &ret); pReader->iLastPos = 0; return ret; } /* Read the next position from a position list. * Returns the position, or -1 at the end of the list. */ static int readPosition(DocListReader *pReader){ int i; int iType = pReader->pDoclist->iType; assert( iType>=DL_POSITIONS ); assert( !readerAtEnd(pReader) ); pReader->p += getVarint32(pReader->p, &i); if( i==0 ){ pReader->iLastPos = -1; return -1; } pReader->iLastPos += ((int) i)-1; if( iType>=DL_POSITIONS_OFFSETS ){ /* Skip over offsets, ignoring them for now. */ int iStart, iEnd; pReader->p += getVarint32(pReader->p, &iStart); pReader->p += getVarint32(pReader->p, &iEnd); } return pReader->iLastPos; } /* Skip past the end of a position list. */ static void skipPositionList(DocListReader *pReader){ while( readPosition(pReader)!=-1 ) ; } /* Skip over a docid, including its position list if the doclist has * positions. */ static void skipDocument(DocListReader *pReader){ readDocid(pReader); if( pReader->pDoclist->iType >= DL_POSITIONS ){ skipPositionList(pReader); } } static sqlite_int64 firstDocid(DocList *d){ DocListReader r; readerInit(&r, d); return readDocid(&r); } /* Doclist multi-tool. Pass pUpdate==NULL to delete the indicated docid; * otherwise pUpdate, which must contain only the single docid [iDocid], is * inserted (if not present) or updated (if already present). */ static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){ int modified = 0; DocListReader reader; char *p; assert( d->iType==pUpdate->iType); if( pUpdate!=NULL ){ assert( iDocid==firstDocid(pUpdate) ); } readerInit(&reader, d); while( !readerAtEnd(&reader) ){ if( peekDocid(&reader) >= iDocid ) break; skipDocument(&reader); } p = reader.p; /* Delete if there is a matching element. */ if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){ skipDocument(&reader); memmove(p, reader.p, docListEnd(d) - reader.p); d->nData -= (reader.p - p); modified = 1; } /* Insert if indicated. */ if( pUpdate!=NULL ){ int iDoclist; int nNeed; docListAddEndPos(pUpdate); nNeed = pUpdate->nData; iDoclist = p - d->pData; d->pData = realloc(d->pData, d->nData+nNeed); p = d->pData + iDoclist; memmove(p+nNeed, p, docListEnd(d) - p); memcpy(p, pUpdate->pData, nNeed); p += nNeed; d->nData += nNeed; modified = 1; } return modified; } /* Split the second half of doclist d into a separate doclist d2. Returns 1 * if successful, or 0 if d contains a single document and hence can't be * split. */ static int docListSplit(DocList *d, DocList *d2){ const char *pSplitPoint = d->pData + d->nData / 2; DocListReader reader; readerInit(&reader, d); while( reader.p<pSplitPoint ){ skipDocument(&reader); } if( readerAtEnd(&reader) ) return 0; docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p); d->nData = reader.p - d->pData; d->pData = realloc(d->pData, d->nData); return 1; } /* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked * on-disk doclist, resulting in another in-memory DocList [out]. [in] * and [out] may or may not store position information according to the * caller's wishes. The on-disk doclist always comes with positions. * * The caller must read each chunk of the on-disk doclist in succession and * pass it to mergeBlock(). * * If [in] has positions, then the merge output contains only documents with * matching positions in the two input doclists. If [in] does not have * positions, then the merge output contains all documents common to the two * input doclists. * * If [in] is NULL, then the on-disk doclist is copied to [out] directly. * * A merge is performed using an integer [iOffset] provided by the caller. * [iOffset] is subtracted from each position in the on-disk doclist for the * purpose of position comparison; this is helpful in implementing phrase * searches. * * A DocListMerge is not yet able to propagate offsets through query * processing; we should add that capability soon. */ typedef struct DocListMerge { DocListReader in; DocList *pOut; int iOffset; } DocListMerge; static void mergeInit(DocListMerge *m, DocList *pIn, int iOffset, DocList *pOut){ readerInit(&m->in, pIn); m->pOut = pOut; m->iOffset = iOffset; /* can't handle offsets yet */ assert( pIn==NULL || pIn->iType <= DL_POSITIONS ); assert( pOut->iType <= DL_POSITIONS ); } /* A helper function for mergeBlock(), below. Merge the position lists * pointed to by m->in and pBlockReader. * If the merge matches, write [iDocid] to m->pOut; if m->pOut * has positions then write all matching positions as well. */ static void mergePosList(DocListMerge *m, sqlite_int64 iDocid, DocListReader *pBlockReader){ int block_pos = readPosition(pBlockReader); int in_pos = readPosition(&m->in); int match = 0; while( block_pos!=-1 || in_pos!=-1 ){ if( block_pos-m->iOffset==in_pos ){ if( !match ){ docListAddDocid(m->pOut, iDocid); match = 1; } if( m->pOut->iType >= DL_POSITIONS ){ docListAddPos(m->pOut, in_pos); } block_pos = readPosition(pBlockReader); in_pos = readPosition(&m->in); } else if( in_pos==-1 || block_pos!=-1 && block_pos-m->iOffset<in_pos ){ block_pos = readPosition(pBlockReader); } else { in_pos = readPosition(&m->in); } } if( m->pOut->iType >= DL_POSITIONS && match ){ docListAddEndPos(m->pOut); } } /* Merge one block of an on-disk doclist into a DocListMerge. */ static void mergeBlock(DocListMerge *m, DocList *pBlock){ DocListReader blockReader; assert( pBlock->iType >= DL_POSITIONS ); readerInit(&blockReader, pBlock); while( !readerAtEnd(&blockReader) ){ sqlite_int64 iDocid = readDocid(&blockReader); if( m->in.pDoclist!=NULL ){ while( 1 ){ if( readerAtEnd(&m->in) ) return; /* nothing more to merge */ if( peekDocid(&m->in)>=iDocid ) break; skipDocument(&m->in); } if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */ skipPositionList(&blockReader); /* skip this docid in the block */ continue; } readDocid(&m->in); } /* We have a document match. */ if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){ /* We don't need to do a poslist merge. */ docListAddDocid(m->pOut, iDocid); if( m->pOut->iType >= DL_POSITIONS ){ /* Copy all positions to the output doclist. */ while( 1 ){ int pos = readPosition(&blockReader); if( pos==-1 ) break; docListAddPos(m->pOut, pos); } docListAddEndPos(m->pOut); } else skipPositionList(&blockReader); continue; } mergePosList(m, iDocid, &blockReader); } } static char *string_dup_n(const char *s, int n){ char *str = malloc(n + 1); memcpy(str, s, n); str[n] = '\0'; return str; } /* Duplicate a string; the caller must free() the returned string. * (We don't use strdup() since it's not part of the standard C library and * may not be available everywhere.) */ static char *string_dup(const char *s){ return string_dup_n(s, strlen(s)); } /* Format a string, replacing each occurrence of the % character with * zName. This may be more convenient than sqlite_mprintf() * when one string is used repeatedly in a format string. * The caller must free() the returned string. */ static char *string_format(const char *zFormat, const char *zName){ const char *p; size_t len = 0; size_t nName = strlen(zName); char *result; char *r; /* first compute length needed */ for(p = zFormat ; *p ; ++p){ len += (*p=='%' ? nName : 1); } len += 1; /* for null terminator */ r = result = malloc(len); for(p = zFormat; *p; ++p){ if( *p=='%' ){ memcpy(r, zName, nName); r += nName; } else { *r++ = *p; } } *r++ = '\0'; assert( r == result + len ); return result; } static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){ char *zCommand = string_format(zFormat, zName); int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL); free(zCommand); return rc; } static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt, const char *zFormat){ char *zCommand = string_format(zFormat, zName); int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL); free(zCommand); return rc; } /* end utility functions */ #define QUERY_GENERIC 0 #define QUERY_FULLTEXT 1 #define CHUNK_MAX 1024 typedef enum fulltext_statement { CONTENT_INSERT_STMT, CONTENT_SELECT_STMT, CONTENT_DELETE_STMT, TERM_SELECT_STMT, TERM_CHUNK_SELECT_STMT, TERM_INSERT_STMT, TERM_UPDATE_STMT, TERM_DELETE_STMT, MAX_STMT /* Always at end! */ } fulltext_statement; /* These must exactly match the enum above. */ /* TODO(adam): Is there some risk that a statement (in particular, ** pTermSelectStmt) will be used in two cursors at once, e.g. if a ** query joins a virtual table to itself? If so perhaps we should ** move some of these to the cursor object. */ const char *fulltext_zStatement[MAX_STMT] = { /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)", /* CONTENT_SELECT */ "select content from %_content where rowid = ?", /* CONTENT_DELETE */ "delete from %_content where rowid = ?", /* TERM_SELECT */ "select rowid, doclist from %_term where term = ? and first = ?", /* TERM_CHUNK_SELECT */ "select max(first) from %_term where term = ? and first <= ?", /* TERM_INSERT */ "insert into %_term (term, first, doclist) values (?, ?, ?)", /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?", /* TERM_DELETE */ "delete from %_term where rowid = ?", }; typedef struct fulltext_vtab { sqlite3_vtab base; sqlite3 *db; const char *zName; /* virtual table name */ sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */ /* Precompiled statements which we keep as long as the table is ** open. */ sqlite3_stmt *pFulltextStatements[MAX_STMT]; } fulltext_vtab; typedef struct fulltext_cursor { sqlite3_vtab_cursor base; int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */ sqlite3_stmt *pStmt; int eof; /* The following is used only when iCursorType == QUERY_FULLTEXT. */ DocListReader result; } fulltext_cursor; static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){ return (fulltext_vtab *) c->base.pVtab; } static sqlite3_module fulltextModule; /* forward declaration */ /* Puts a freshly-prepared statement determined by iStmt in *ppStmt. ** If the indicated statement has never been prepared, it is prepared ** and cached, otherwise the cached version is reset. */ static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt, sqlite3_stmt **ppStmt){ assert( iStmt<MAX_STMT ); if( v->pFulltextStatements[iStmt]==NULL ){ int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt], fulltext_zStatement[iStmt]); if( rc!=SQLITE_OK ) return rc; } else { int rc = sqlite3_reset(v->pFulltextStatements[iStmt]); if( rc!=SQLITE_OK ) return rc; } *ppStmt = v->pFulltextStatements[iStmt]; return SQLITE_OK; } /* Step the indicated statement, handling errors SQLITE_BUSY (by ** retrying) and SQLITE_SCHEMA (by re-preparing and transferring ** bindings to the new statement). ** TODO(adam): We should extend this function so that it can work with ** statements declared locally, not only globally cached statements. */ static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt, sqlite3_stmt **ppStmt){ int rc; sqlite3_stmt *s = *ppStmt; assert( iStmt<MAX_STMT ); assert( s==v->pFulltextStatements[iStmt] ); while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){ sqlite3_stmt *pNewStmt; if( rc==SQLITE_BUSY ) continue; if( rc!=SQLITE_ERROR ) return rc; rc = sqlite3_reset(s); if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR; v->pFulltextStatements[iStmt] = NULL; /* Still in s */ rc = sql_get_statement(v, iStmt, &pNewStmt); if( rc!=SQLITE_OK ) goto err; *ppStmt = pNewStmt; rc = sqlite3_transfer_bindings(s, pNewStmt); if( rc!=SQLITE_OK ) goto err; rc = sqlite3_finalize(s); if( rc!=SQLITE_OK ) return rc; s = pNewStmt; } return rc; err: sqlite3_finalize(s); return rc; } /* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK. ** Useful for statements like UPDATE, where we expect no results. */ static int sql_single_step_statement(fulltext_vtab *v, fulltext_statement iStmt, sqlite3_stmt **ppStmt){ int rc = sql_step_statement(v, iStmt, ppStmt); return (rc==SQLITE_DONE) ? SQLITE_OK : rc; } /* insert into %_content (rowid, content) values ([rowid], [zContent]) */ static int content_insert(fulltext_vtab *v, sqlite3_value *rowid, const char *zContent, int nContent){ sqlite3_stmt *s; int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_value(s, 1, rowid); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC); if( rc!=SQLITE_OK ) return rc; return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s); } /* select content from %_content where rowid = [iRow] * The caller must delete the returned string. */ static int content_select(fulltext_vtab *v, sqlite_int64 iRow, char **pzContent){ sqlite3_stmt *s; int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 1, iRow); if( rc!=SQLITE_OK ) return rc; rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s); if( rc!=SQLITE_ROW ) return rc; *pzContent = string_dup((const char *)sqlite3_column_text(s, 0)); /* We expect only one row. We must execute another sqlite3_step() * to complete the iteration; otherwise the table will remain locked. */ rc = sqlite3_step(s); if( rc==SQLITE_DONE ) return SQLITE_OK; free(*pzContent); return rc; } /* delete from %_content where rowid = [iRow ] */ static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){ sqlite3_stmt *s; int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 1, iRow); if( rc!=SQLITE_OK ) return rc; return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s); } /* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst] * If found, returns SQLITE_OK; the caller must free the returned doclist. * If no rows found, returns SQLITE_ERROR. */ static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm, sqlite_int64 iFirst, sqlite_int64 *rowid, DocList *out){ sqlite3_stmt *s; int rc = sql_get_statement(v, TERM_SELECT_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 2, iFirst); if( rc!=SQLITE_OK ) return rc; rc = sql_step_statement(v, TERM_SELECT_STMT, &s); if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; *rowid = sqlite3_column_int64(s, 0); docListInit(out, DL_POSITIONS_OFFSETS, sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1)); /* We expect only one row. We must execute another sqlite3_step() * to complete the iteration; otherwise the table will remain locked. */ rc = sqlite3_step(s); return rc==SQLITE_DONE ? SQLITE_OK : rc; } /* select max(first) from %_term where term = [zTerm] and first <= [iFirst] * If found, returns SQLITE_ROW and result in *piResult; if the query returns * NULL (meaning no row found) returns SQLITE_DONE. */ static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm, sqlite_int64 iFirst, sqlite_int64 *piResult){ sqlite3_stmt *s; int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 2, iFirst); if( rc!=SQLITE_OK ) return rc; rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s); if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc; switch( sqlite3_column_type(s, 0) ){ case SQLITE_NULL: rc = SQLITE_DONE; break; case SQLITE_INTEGER: *piResult = sqlite3_column_int64(s, 0); break; default: return SQLITE_ERROR; } /* We expect only one row. We must execute another sqlite3_step() * to complete the iteration; otherwise the table will remain locked. */ if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR; return rc; } /* insert into %_term (term, first, doclist) values ([zTerm], [iFirst], [doclist]) */ static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm, sqlite_int64 iFirst, DocList *doclist){ sqlite3_stmt *s; int rc = sql_get_statement(v, TERM_INSERT_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 2, iFirst); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC); if( rc!=SQLITE_OK ) return rc; return sql_single_step_statement(v, TERM_INSERT_STMT, &s); } /* update %_term set doclist = [doclist] where rowid = [rowid] */ static int term_update(fulltext_vtab *v, sqlite_int64 rowid, DocList *doclist){ sqlite3_stmt *s; int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 2, rowid); if( rc!=SQLITE_OK ) return rc; return sql_single_step_statement(v, TERM_UPDATE_STMT, &s); } static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){ sqlite3_stmt *s; int rc = sql_get_statement(v, TERM_DELETE_STMT, &s); if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_int64(s, 1, rowid); if( rc!=SQLITE_OK ) return rc; return sql_single_step_statement(v, TERM_DELETE_STMT, &s); } static void fulltext_vtab_destroy(fulltext_vtab *v){ int iStmt; for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){ if( v->pFulltextStatements[iStmt]!=NULL ){ sqlite3_finalize(v->pFulltextStatements[iStmt]); v->pFulltextStatements[iStmt] = NULL; } } if( v->pTokenizer!=NULL ){ v->pTokenizer->pModule->xDestroy(v->pTokenizer); v->pTokenizer = NULL; } free((void *) v->zName); free(v); } /* Current interface: ** argv[0] - module name ** argv[1] - database name ** argv[2] - table name ** argv[3] - tokenizer name (optional, a sensible default is provided) ** argv[4..] - passed to tokenizer (optional based on tokenizer) **/ static int fulltextConnect(sqlite3 *db, void *pAux, int argc, char **argv, sqlite3_vtab **ppVTab){ int rc; fulltext_vtab *v; sqlite3_tokenizer_module *m = NULL; assert( argc>=3 ); v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab)); /* sqlite will initialize v->base */ v->db = db; v->zName = string_dup(argv[2]); v->pTokenizer = NULL; if( argc==3 ){ get_simple_tokenizer_module(&m); } else { /* TODO(shess) For now, add new tokenizers as else if clauses. */ if( !strcmp(argv[3], "simple") ){ get_simple_tokenizer_module(&m); } else { assert( "unrecognized tokenizer"==NULL ); } } /* TODO(shess) Since tokenization impacts the index, the parameters ** to the tokenizer need to be identical when a persistent virtual ** table is re-created. One solution would be a meta-table to track ** such information in the database. Then we could verify that the ** information is identical on subsequent creates. */ /* TODO(shess) Why isn't argv already (const char **)? */ rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer); if( rc!=SQLITE_OK ) return rc; v->pTokenizer->pModule = m; /* TODO: verify the existence of backing tables foo_content, foo_term */ rc = sqlite3_declare_vtab(db, "create table x(content text)"); if( rc!=SQLITE_OK ) return rc; memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements)); *ppVTab = &v->base; return SQLITE_OK; } static int fulltextCreate(sqlite3 *db, void *pAux, int argc, char **argv, sqlite3_vtab **ppVTab){ int rc; assert( argc>=3 ); /* The %_content table holds the text of each full-text item, with ** the rowid used as the docid. ** ** The %_term table maps each term to a document list blob ** containing elements sorted by ascending docid, each element ** encoded as: ** ** docid varint-encoded ** token count varint-encoded ** "count" token elements (poslist): ** position varint-encoded as delta from previous position ** start offset varint-encoded as delta from previous start offset ** end offset varint-encoded as delta from start offset ** ** Additionally, doclist blobs can be chunked into multiple rows, ** using "first" to order the blobs. "first" is simply the first ** docid in the blob. */ /* ** NOTE(shess) That last sentence is incorrect in the face of ** deletion, which can leave a doclist that doesn't contain the ** first from that row. I _believe_ this does not matter to the ** operation of the system, but it might be reasonable to update ** appropriately in case this assumption becomes more important. */ rc = sql_exec(db, argv[2], "create table %_content(content text);" "create table %_term(term text, first integer, doclist blob);" "create index %_index on %_term(term, first)"); if( rc!=SQLITE_OK ) return rc; return fulltextConnect(db, pAux, argc, argv, ppVTab); } /* Decide how to handle an SQL query. * At the moment, MATCH queries can include implicit boolean ANDs; we * haven't implemented phrase searches or OR yet. */ static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){ int i; for(i=0; i<pInfo->nConstraint; ++i){ const struct sqlite3_index_constraint *pConstraint; pConstraint = &pInfo->aConstraint[i]; if( pConstraint->iColumn==0 && pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH && pConstraint->usable ){ /* a full-text search */ pInfo->aConstraintUsage[i].argvIndex = 1; pInfo->aConstraintUsage[i].omit = 1; pInfo->idxNum = QUERY_FULLTEXT; pInfo->estimatedCost = 1.0; /* an arbitrary value for now */ return SQLITE_OK; } } pInfo->idxNum = QUERY_GENERIC; return SQLITE_OK; } static int fulltextDisconnect(sqlite3_vtab *pVTab){ fulltext_vtab_destroy((fulltext_vtab *)pVTab); return SQLITE_OK; } static int fulltextDestroy(sqlite3_vtab *pVTab){ fulltext_vtab *v = (fulltext_vtab *)pVTab; int rc = sql_exec(v->db, v->zName, "drop table %_content; drop table %_term"); if( rc!=SQLITE_OK ) return rc; fulltext_vtab_destroy((fulltext_vtab *)pVTab); return SQLITE_OK; } static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){ fulltext_cursor *c; c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1); /* sqlite will initialize c->base */ *ppCursor = &c->base; return SQLITE_OK; } static int fulltextClose(sqlite3_vtab_cursor *pCursor){ fulltext_cursor *c = (fulltext_cursor *) pCursor; sqlite3_finalize(c->pStmt); if( c->result.pDoclist!=NULL ){ docListDelete(c->result.pDoclist); } free(c); return SQLITE_OK; } static int fulltextNext(sqlite3_vtab_cursor *pCursor){ fulltext_cursor *c = (fulltext_cursor *) pCursor; sqlite_int64 iDocid; int rc; switch( c->iCursorType ){ case QUERY_GENERIC: /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ rc = sqlite3_step(c->pStmt); switch( rc ){ case SQLITE_ROW: c->eof = 0; return SQLITE_OK; case SQLITE_DONE: c->eof = 1; return SQLITE_OK; default: c->eof = 1; return rc; } case QUERY_FULLTEXT: rc = sqlite3_reset(c->pStmt); if( rc!=SQLITE_OK ) return rc; if( readerAtEnd(&c->result)){ c->eof = 1; return SQLITE_OK; } iDocid = readDocid(&c->result); rc = sqlite3_bind_int64(c->pStmt, 1, iDocid); if( rc!=SQLITE_OK ) return rc; /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */ rc = sqlite3_step(c->pStmt); if( rc==SQLITE_ROW ){ /* the case we expect */ c->eof = 0; return SQLITE_OK; } /* an error occurred; abort */ return rc==SQLITE_DONE ? SQLITE_ERROR : rc; default: assert( 0 ); return SQLITE_ERROR; /* not reached */ } } static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm, sqlite3_stmt **ppStmt){ int rc; if( *ppStmt ){ rc = sqlite3_reset(*ppStmt); } else { rc = sql_prepare(v->db, v->zName, ppStmt, "select doclist from %_term where term = ? order by first"); } if( rc!=SQLITE_OK ) return rc; rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT); if( rc!=SQLITE_OK ) return rc; return sqlite3_step(*ppStmt); /* TODO(adamd): handle schema error */ } /* Read the posting list for [zTerm]; AND it with the doclist [in] to * produce the doclist [out], using the given offset [iOffset] for phrase * matching. * (*pSelect) is used to hold an SQLite statement used inside this function; * the caller should initialize *pSelect to NULL before the first call. */ static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect, const char *zTerm, DocList *pIn, int iOffset, DocList *out){ int rc; DocListMerge merge; if( pIn!=NULL && !pIn->nData ){ /* If [pIn] is already empty, there's no point in reading the * posting list to AND it in; return immediately. */ return SQLITE_OK; } rc = term_select_doclist(v, zTerm, -1, pSelect); if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc; mergeInit(&merge, pIn, iOffset, out); while( rc==SQLITE_ROW ){ DocList block; docListInit(&block, DL_POSITIONS_OFFSETS, sqlite3_column_blob(*pSelect, 0), sqlite3_column_bytes(*pSelect, 0)); mergeBlock(&merge, &block); docListDestroy(&block); rc = sqlite3_step(*pSelect); if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){ return rc; } } return SQLITE_OK; } typedef struct QueryTerm { int is_phrase; /* true if this term begins a new phrase */ const char *zTerm; } QueryTerm; /* A parsed query. * * As an example, parsing the query ["four score" years "new nation"] will * yield a Query with 5 terms: * "four", is_phrase = 1 * "score", is_phrase = 0 * "years", is_phrase = 1 * "new", is_phrase = 1 * "nation", is_phrase = 0 */ typedef struct Query { int nTerms; QueryTerm *pTerm; } Query; void query_add(Query *q, int is_phrase, const char *zTerm){ QueryTerm *t; ++q->nTerms; q->pTerm = realloc(q->pTerm, q->nTerms * sizeof(q->pTerm[0])); t = &q->pTerm[q->nTerms - 1]; t->is_phrase = is_phrase; t->zTerm = zTerm; } void query_free(Query *q){ int i; for(i = 0; i < q->nTerms; ++i){ free((void *) q->pTerm[i].zTerm); } free(q->pTerm); } int tokenize_segment(sqlite3_tokenizer *pTokenizer, const char *zQuery, int in_phrase, Query *pQuery){ sqlite3_tokenizer_module *pModule = pTokenizer->pModule; sqlite3_tokenizer_cursor *pCursor; int is_first = 1; int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor); if( rc!=SQLITE_OK ) return rc; pCursor->pTokenizer = pTokenizer; while( 1 ){ const char *zToken; int nToken, iStartOffset, iEndOffset, dummy_pos; rc = pModule->xNext(pCursor, &zToken, &nToken, &iStartOffset, &iEndOffset, &dummy_pos); if( rc!=SQLITE_OK ) break; query_add(pQuery, !in_phrase || is_first, string_dup_n(zToken, nToken)); is_first = 0; } return pModule->xClose(pCursor); } /* Parse a query string, yielding a Query object. */ int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){ char *zQuery1 = string_dup(zQuery); int in_phrase = 0; char *s = zQuery1; pQuery->nTerms = 0; pQuery->pTerm = NULL; while( *s ){ char *t = s; while( *t ){ if( *t=='"' ){ *t++ = '\0'; break; } ++t; } if( *s ){ tokenize_segment(v->pTokenizer, s, in_phrase, pQuery); } s = t; in_phrase = !in_phrase; } free(zQuery1); return SQLITE_OK; } /* Perform a full-text query; return a list of documents in [pResult]. */ static int fulltext_query(fulltext_vtab *v, const char *zQuery, DocList **pResult){ Query q; int phrase_start = -1; int i; sqlite3_stmt *pSelect = NULL; DocList *d = NULL; int rc = parse_query(v, zQuery, &q); if( rc!=SQLITE_OK ) return rc; /* Merge terms. */ for(i = 0 ; i < q.nTerms ; ++i){ /* In each merge step, we need to generate positions whenever we're * processing a phrase which hasn't ended yet. */ int need_positions = i<q.nTerms-1 && !q.pTerm[i+1].is_phrase; DocList *next = docListNew(need_positions ? DL_POSITIONS : DL_DOCIDS); if( q.pTerm[i].is_phrase ){ phrase_start = i; } rc = query_merge(v, &pSelect, q.pTerm[i].zTerm, d, i - phrase_start, next); if( rc!=SQLITE_OK ) break; if( d!=NULL ){ docListDelete(d); } d = next; } sqlite3_finalize(pSelect); query_free(&q); *pResult = d; return rc; } static int fulltextFilter(sqlite3_vtab_cursor *pCursor, int idxNum, const char *idxStr, int argc, sqlite3_value **argv){ fulltext_cursor *c = (fulltext_cursor *) pCursor; fulltext_vtab *v = cursor_vtab(c); int rc; const char *zStatement; c->iCursorType = idxNum; switch( idxNum ){ case QUERY_GENERIC: zStatement = "select rowid, content from %_content"; break; case QUERY_FULLTEXT: /* full-text search */ { const char *zQuery = (const char *)sqlite3_value_text(argv[0]); DocList *pResult; assert( argc==1 ); rc = fulltext_query(v, zQuery, &pResult); if( rc!=SQLITE_OK ) return rc; readerInit(&c->result, pResult); zStatement = "select rowid, content from %_content where rowid = ?"; break; } default: assert( 0 ); } rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement); if( rc!=SQLITE_OK ) return rc; return fulltextNext(pCursor); } static int fulltextEof(sqlite3_vtab_cursor *pCursor){ fulltext_cursor *c = (fulltext_cursor *) pCursor; return c->eof; } static int fulltextColumn(sqlite3_vtab_cursor *pCursor, sqlite3_context *pContext, int idxCol){ fulltext_cursor *c = (fulltext_cursor *) pCursor; const char *s; assert( idxCol==0 ); s = (const char *) sqlite3_column_text(c->pStmt, 1); sqlite3_result_text(pContext, s, -1, SQLITE_TRANSIENT); return SQLITE_OK; } static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){ fulltext_cursor *c = (fulltext_cursor *) pCursor; *pRowid = sqlite3_column_int64(c->pStmt, 0); return SQLITE_OK; } /* Build a hash table containing all terms in zText. */ static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer, const char *zText, sqlite_int64 iDocid){ sqlite3_tokenizer_cursor *pCursor; const char *pToken; int nTokenBytes; int iStartOffset, iEndOffset, iPosition; int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor); if( rc!=SQLITE_OK ) return rc; pCursor->pTokenizer = pTokenizer; HashInit(terms, HASH_STRING, 1); while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor, &pToken, &nTokenBytes, &iStartOffset, &iEndOffset, &iPosition) ){ DocList *p; /* Positions can't be negative; we use -1 as a terminator internally. */ if( iPosition<0 ) { rc = SQLITE_ERROR; goto err; } p = HashFind(terms, pToken, nTokenBytes); if( p==NULL ){ p = docListNew(DL_POSITIONS_OFFSETS); docListAddDocid(p, iDocid); HashInsert(terms, pToken, nTokenBytes, p); } docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset); } err: /* TODO(shess) Check return? Should this be able to cause errors at ** this point? Actually, same question about sqlite3_finalize(), ** though one could argue that failure there means that the data is ** not durable. *ponder* */ pTokenizer->pModule->xClose(pCursor); return rc; } /* Update the %_terms table to map the term [zTerm] to the given rowid. */ static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm, sqlite_int64 iDocid, DocList *p){ sqlite_int64 iFirst; sqlite_int64 iIndexRow; DocList doclist; char *pBlob = NULL; int nBlob = 0; int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); if( rc==SQLITE_DONE ){ docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0); if( docListUpdate(&doclist, iDocid, p) ){ rc = term_insert(v, zTerm, nTerm, iDocid, &doclist); docListDestroy(&doclist); return rc; } return SQLITE_OK; } if( rc!=SQLITE_ROW ) return SQLITE_ERROR; /* This word is in the index; add this document ID to its blob. */ rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); if( rc!=SQLITE_OK ) return rc; if( docListUpdate(&doclist, iDocid, p) ){ /* If the blob is too big, split it in half. */ if( doclist.nData>CHUNK_MAX ){ DocList half; if( docListSplit(&doclist, &half) ){ rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half); docListDestroy(&half); if( rc!=SQLITE_OK ) goto err; } } rc = term_update(v, iIndexRow, &doclist); } err: docListDestroy(&doclist); return rc; } /* Insert a row into the full-text index; set *piRowid to be the ID of the * new row. */ static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid, const char *zText, sqlite_int64 *piRowid){ Hash terms; /* maps term string -> PosList */ HashElem *e; int rc = content_insert(v, pRequestRowid, zText, -1); if( rc!=SQLITE_OK ) return rc; *piRowid = sqlite3_last_insert_rowid(v->db); if( !zText ) return SQLITE_OK; /* nothing to index */ rc = build_terms(&terms, v->pTokenizer, zText, *piRowid); if( rc!=SQLITE_OK ) return rc; for(e=HashFirst(&terms); e; e=HashNext(e)){ DocList *p = HashData(e); rc = index_insert_term(v, HashKey(e), HashKeysize(e), *piRowid, p); if( rc!=SQLITE_OK ) break; } for(e=HashFirst(&terms); e; e=HashNext(e)){ DocList *p = HashData(e); docListDelete(p); } HashClear(&terms); return rc; } static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm, sqlite_int64 iDocid){ sqlite_int64 iFirst; sqlite_int64 iIndexRow; DocList doclist; int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst); if( rc!=SQLITE_ROW ) return SQLITE_ERROR; rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist); if( rc!=SQLITE_OK ) return rc; if( docListUpdate(&doclist, iDocid, NULL) ){ if( doclist.nData>0 ){ rc = term_update(v, iIndexRow, &doclist); } else { /* empty posting list */ rc = term_delete(v, iIndexRow); } } docListDestroy(&doclist); return rc; } /* Delete a row from the full-text index. */ static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){ char *zText; Hash terms; HashElem *e; int rc = content_select(v, iRow, &zText); if( rc!=SQLITE_OK ) return rc; rc = build_terms(&terms, v->pTokenizer, zText, iRow); free(zText); if( rc!=SQLITE_OK ) return rc; for(e=HashFirst(&terms); e; e=HashNext(e)){ rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow); if( rc!=SQLITE_OK ) break; } for(e=HashFirst(&terms); e; e=HashNext(e)){ DocList *p = HashData(e); docListDelete(p); } HashClear(&terms); return content_delete(v, iRow); } static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg, sqlite_int64 *pRowid){ fulltext_vtab *v = (fulltext_vtab *) pVtab; if( nArg<2 ){ return index_delete(v, sqlite3_value_int64(ppArg[0])); } if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){ return SQLITE_ERROR; /* an update; not yet supported */ } assert( nArg==3 ); /* ppArg[1] = rowid, ppArg[2] = content */ return index_insert(v, ppArg[1], (const char *)sqlite3_value_text(ppArg[2]), pRowid); } static sqlite3_module fulltextModule = { 0, fulltextCreate, fulltextConnect, fulltextBestIndex, fulltextDisconnect, fulltextDestroy, fulltextOpen, fulltextClose, fulltextFilter, fulltextNext, fulltextEof, fulltextColumn, fulltextRowid, fulltextUpdate }; int fulltext_init(sqlite3 *db){ return sqlite3_create_module(db, "fulltext", &fulltextModule, 0); } #if !SQLITE_CORE int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg, const sqlite3_api_routines *pApi){ SQLITE_EXTENSION_INIT2(pApi) return fulltext_init(db); } #endif |
Added ext/fts1/fulltext.h.
> > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 | #include "sqlite3.h" #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ int fulltext_init(sqlite3 *db); #ifdef __cplusplus } /* extern "C" */ #endif /* __cplusplus */ |
Added ext/fts1/simple_tokenizer.c.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | /* ** The author disclaims copyright to this source code. ** ************************************************************************* ** Implementation of the "simple" full-text-search tokenizer. */ #include <assert.h> #if !defined(__APPLE__) #include <malloc.h> #else #include <stdlib.h> #endif #include <stdio.h> #include <string.h> #include <ctype.h> #include "tokenizer.h" /* Duplicate a string; the caller must free() the returned string. * (We don't use strdup() since it's not part of the standard C library and * may not be available everywhere.) */ /* TODO(shess) Copied from fulltext.c, consider util.c for such ** things. */ static char *string_dup(const char *s){ char *str = malloc(strlen(s) + 1); strcpy(str, s); return str; } typedef struct simple_tokenizer { sqlite3_tokenizer base; const char *zDelim; /* token delimiters */ } simple_tokenizer; typedef struct simple_tokenizer_cursor { sqlite3_tokenizer_cursor base; const char *pInput; /* input we are tokenizing */ int nBytes; /* size of the input */ const char *pCurrent; /* current position in pInput */ int iToken; /* index of next token to be returned */ char *zToken; /* storage for current token */ int nTokenBytes; /* actual size of current token */ int nTokenAllocated; /* space allocated to zToken buffer */ } simple_tokenizer_cursor; static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */ static int simpleCreate( int argc, const char **argv, sqlite3_tokenizer **ppTokenizer ){ simple_tokenizer *t; t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer)); /* TODO(shess) Delimiters need to remain the same from run to run, ** else we need to reindex. One solution would be a meta-table to ** track such information in the database, then we'd only want this ** information on the initial create. */ if( argc>1 ){ t->zDelim = string_dup(argv[1]); } else { /* Build a string of non-alphanumeric ASCII characters */ char zDelim[128]; /* nul-terminated, so nul not a member */ int i, j; for(i=1, j=0; i<0x80; i++){ if( !isalnum(i) ){ zDelim[j++] = i; } } zDelim[j++] = '\0'; assert( j<=sizeof(zDelim) ); t->zDelim = string_dup(zDelim); } *ppTokenizer = &t->base; return SQLITE_OK; } static int simpleDestroy(sqlite3_tokenizer *pTokenizer){ simple_tokenizer *t = (simple_tokenizer *) pTokenizer; free((void *) t->zDelim); free(t); return SQLITE_OK; } static int simpleOpen( sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes, sqlite3_tokenizer_cursor **ppCursor ){ simple_tokenizer_cursor *c; c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor)); c->pInput = pInput; c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes; c->pCurrent = c->pInput; /* start tokenizing at the beginning */ c->iToken = 0; c->zToken = NULL; /* no space allocated, yet. */ c->nTokenBytes = 0; c->nTokenAllocated = 0; *ppCursor = &c->base; return SQLITE_OK; } static int simpleClose(sqlite3_tokenizer_cursor *pCursor){ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; if( NULL!=c->zToken ){ free(c->zToken); } free(c); return SQLITE_OK; } static int simpleNext( sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes, int *piStartOffset, int *piEndOffset, int *piPosition ){ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor; simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer; int ii; while( c->pCurrent-c->pInput<c->nBytes ){ int n = (int) strcspn(c->pCurrent, t->zDelim); if( n>0 ){ if( n+1>c->nTokenAllocated ){ c->zToken = realloc(c->zToken, n+1); } for(ii=0; ii<n; ii++){ c->zToken[ii] = tolower(c->pCurrent[ii]); } c->zToken[n] = '\0'; *ppToken = c->zToken; *pnBytes = n; *piStartOffset = (int) (c->pCurrent-c->pInput); *piEndOffset = *piStartOffset+n; *piPosition = c->iToken++; c->pCurrent += n + 1; return SQLITE_OK; } c->pCurrent += n + 1; /* TODO(shess) could strspn() to skip delimiters en masse. Needs ** to happen in two places, though, which is annoying. */ } return SQLITE_DONE; } static sqlite3_tokenizer_module simpleTokenizerModule = { 0, simpleCreate, simpleDestroy, simpleOpen, simpleClose, simpleNext, }; void get_simple_tokenizer_module( sqlite3_tokenizer_module **ppModule ){ *ppModule = &simpleTokenizerModule; } |
Added ext/fts1/tokenizer.h.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | /* ** 2006 July 10 ** ** The author disclaims copyright to this source code. ** ************************************************************************* ** Defines the interface to tokenizers used by fulltext-search. There ** are three basic components: ** ** sqlite3_tokenizer_module is a singleton defining the tokenizer ** interface functions. This is essentially the class structure for ** tokenizers. ** ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps ** including customization information defined at creation time. ** ** sqlite3_tokenizer_cursor is generated by a tokenizer to generate ** tokens from a particular input. */ #ifndef _TOKENIZER_H_ #define _TOKENIZER_H_ /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time. ** If tokenizers are to be allowed to call sqlite3_*() functions, then ** we will need a way to register the API consistently. */ #include "sqlite3.h" /* ** Structures used by the tokenizer interface. */ typedef struct sqlite3_tokenizer sqlite3_tokenizer; typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; struct sqlite3_tokenizer_module { int iVersion; /* currently 0 */ /* ** Create and destroy a tokenizer. argc/argv are passed down from ** the fulltext virtual table creation to allow customization. */ int (*xCreate)(int argc, const char **argv, sqlite3_tokenizer **ppTokenizer); int (*xDestroy)(sqlite3_tokenizer *pTokenizer); /* ** Tokenize a particular input. Call xOpen() to prepare to ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then ** xClose() to free any internal state. The pInput passed to ** xOpen() must exist until the cursor is closed. The ppToken ** result from xNext() is only valid until the next call to xNext() ** or until xClose() is called. */ /* TODO(shess) current implementation requires pInput to be ** nul-terminated. This should either be fixed, or pInput/nBytes ** should be converted to zInput. */ int (*xOpen)(sqlite3_tokenizer *pTokenizer, const char *pInput, int nBytes, sqlite3_tokenizer_cursor **ppCursor); int (*xClose)(sqlite3_tokenizer_cursor *pCursor); int (*xNext)(sqlite3_tokenizer_cursor *pCursor, const char **ppToken, int *pnBytes, int *piStartOffset, int *piEndOffset, int *piPosition); }; struct sqlite3_tokenizer { sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ /* Tokenizer implementations will typically add additional fields */ }; struct sqlite3_tokenizer_cursor { sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */ /* Tokenizer implementations will typically add additional fields */ }; /* ** Get the module for a tokenizer which generates tokens based on a ** set of non-token characters. The default is to break tokens at any ** non-alnum character, though the set of delimiters can also be ** specified by the first argv argument to xCreate(). */ /* TODO(shess) This doesn't belong here. Need some sort of ** registration process. */ void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule); #endif /* _TOKENIZER_H_ */ |