Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
f6c3abdc6c5e916e5366ba28fb1cd06c |
User & Date: | shess 2007-03-29 16:30:39.000 |
Context
2007-03-29
| ||
17:07 | Add a couple of test cases to improve coverage testing. (CVS 3747) (check-in: 0b22ce3637 user: danielk1977 tags: trunk) | |
16:30 | Don't call ctype functions on hi-bit chars. Some platforms raise assertions when this occurs, and it's almost certainly not the right thing to do in the first place. (CVS 3746) (check-in: f6c3abdc6c user: shess tags: trunk) | |
15:00 | Assume the malloc-failed flag cannot already be set when calling sqlite3_errmsg(16)(). (CVS 3745) (check-in: 54fa22273d user: danielk1977 tags: trunk) | |
Changes
Changes to ext/fts1/fts1.c.
︙ | ︙ | |||
172 173 174 175 176 177 178 179 180 181 182 183 184 185 | * offset to handle some variance. So the estimate would be * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded * as normal. Offsets more than 64 chars from the estimate are * encoded as the delta to the previous start offset + 128. An * additional tiny increment can be gained by using the end offset of * the previous token to make the estimate a tiny bit more precise. */ typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */ } DocListType; | > > > > > > > > > > > > > > > > > > > | 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 | * offset to handle some variance. So the estimate would be * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded * as normal. Offsets more than 64 chars from the estimate are * encoded as the delta to the previous start offset + 128. An * additional tiny increment can be gained by using the end offset of * the previous token to make the estimate a tiny bit more precise. */ /* It is not safe to call isspace(), tolower(), or isalnum() on ** hi-bit-set characters. This is the same solution used in the ** tokenizer. */ /* TODO(shess) The snippet-generation code should be using the ** tokenizer-generated tokens rather than doing its own local ** tokenization. */ /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ static int safe_isspace(char c){ return (c&0x80)==0 ? isspace(c) : 0; } static int safe_tolower(char c){ return (c&0x80)==0 ? tolower(c) : c; } static int safe_isalnum(char c){ return (c&0x80)==0 ? isalnum(c) : 0; } typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */ } DocListType; |
︙ | ︙ | |||
1532 1533 1534 1535 1536 1537 1538 | int i, c; switch( *z ){ case 0: { *tokenType = TOKEN_EOF; return 0; } case ' ': case '\t': case '\n': case '\f': case '\r': { | | | 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 | int i, c; switch( *z ){ case 0: { *tokenType = TOKEN_EOF; return 0; } case ' ': case '\t': case '\n': case '\f': case '\r': { for(i=1; safe_isspace(z[i]); i++){} *tokenType = TOKEN_SPACE; return i; } case '\'': case '"': { int delim = z[0]; for(i=1; (c=z[i])!=0; i++){ |
︙ | ︙ | |||
1684 1685 1686 1687 1688 1689 1690 | ** input: delimiters ( '[' , ']' , '...' ) ** output: [ ] ... */ static void tokenListToIdList(char **azIn){ int i, j; if( azIn ){ for(i=0, j=-1; azIn[i]; i++){ | | | 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 | ** input: delimiters ( '[' , ']' , '...' ) ** output: [ ] ... */ static void tokenListToIdList(char **azIn){ int i, j; if( azIn ){ for(i=0, j=-1; azIn[i]; i++){ if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ dequoteString(azIn[i]); if( j>=0 ){ azIn[j] = azIn[i]; } j++; } } |
︙ | ︙ | |||
1733 1734 1735 1736 1737 1738 1739 | ** ** Ignore leading space in *s. ** ** To put it another way, return true if the first token of ** s[] is t[]. */ static int startsWith(const char *s, const char *t){ | | | | | 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 | ** ** Ignore leading space in *s. ** ** To put it another way, return true if the first token of ** s[] is t[]. */ static int startsWith(const char *s, const char *t){ while( safe_isspace(*s) ){ s++; } while( *t ){ if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; } return *s!='_' && !safe_isalnum(*s); } /* ** An instance of this structure defines the "spec" of a ** full text index. This structure is populated by parseSpec ** and use by fulltextConnect and fulltextCreate. */ |
︙ | ︙ | |||
1849 1850 1851 1852 1853 1854 1855 | clearTableSpec(pSpec); return SQLITE_NOMEM; } for(i=0; i<pSpec->nColumn; i++){ char *p; pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); for (p = pSpec->azContentColumn[i]; *p ; ++p) { | | | 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 | clearTableSpec(pSpec); return SQLITE_NOMEM; } for(i=0; i<pSpec->nColumn; i++){ char *p; pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); for (p = pSpec->azContentColumn[i]; *p ; ++p) { if( !safe_isalnum(*p) ) *p = '_'; } } /* ** Parse the tokenizer specification string. */ pSpec->azTokenizer = tokenizeString(zTokenizer, &n); |
︙ | ︙ | |||
2326 2327 2328 2329 2330 2331 2332 | return aMatch[i].iStart; } if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ return aMatch[i-1].iStart; } } for(i=1; i<=10; i++){ | | | | | | 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 | return aMatch[i].iStart; } if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ return aMatch[i-1].iStart; } } for(i=1; i<=10; i++){ if( safe_isspace(zDoc[iBreak-i]) ){ return iBreak - i + 1; } if( safe_isspace(zDoc[iBreak+i]) ){ return iBreak + i + 1; } } return iBreak; } /* ** If the StringBuffer does not end in white space, add a single ** space character to the end. */ static void appendWhiteSpace(StringBuffer *p){ if( p->len==0 ) return; if( safe_isspace(p->s[p->len-1]) ) return; append(p, " "); } /* ** Remove white space from the end of the StringBuffer */ static void trimWhiteSpace(StringBuffer *p){ while( p->len>0 && safe_isspace(p->s[p->len-1]) ){ p->len--; } } /* |
︙ | ︙ |
Changes to ext/fts2/fts2.c.
︙ | ︙ | |||
299 300 301 302 303 304 305 306 307 308 309 310 311 312 | */ #if 0 # define TRACE(A) printf A; fflush(stdout) #else # define TRACE(A) #endif typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */ } DocListType; | > > > > > > > > > > > > > > > > > > > | 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 | */ #if 0 # define TRACE(A) printf A; fflush(stdout) #else # define TRACE(A) #endif /* It is not safe to call isspace(), tolower(), or isalnum() on ** hi-bit-set characters. This is the same solution used in the ** tokenizer. */ /* TODO(shess) The snippet-generation code should be using the ** tokenizer-generated tokens rather than doing its own local ** tokenization. */ /* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */ static int safe_isspace(char c){ return (c&0x80)==0 ? isspace(c) : 0; } static int safe_tolower(char c){ return (c&0x80)==0 ? tolower(c) : c; } static int safe_isalnum(char c){ return (c&0x80)==0 ? isalnum(c) : 0; } typedef enum DocListType { DL_DOCIDS, /* docids only */ DL_POSITIONS, /* docids + positions */ DL_POSITIONS_OFFSETS /* docids + positions + offsets */ } DocListType; |
︙ | ︙ | |||
500 501 502 503 504 505 506 | if( i>0 ) append(sb, ", "); append(sb, azString[i]); } } static int endsInWhiteSpace(StringBuffer *p){ return stringBufferLength(p)>0 && | | | 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 | if( i>0 ) append(sb, ", "); append(sb, azString[i]); } } static int endsInWhiteSpace(StringBuffer *p){ return stringBufferLength(p)>0 && safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]); } /* If the StringBuffer ends in something other than white space, add a ** single space character to the end. */ static void appendWhiteSpace(StringBuffer *p){ if( stringBufferLength(p)==0 ) return; |
︙ | ︙ | |||
2190 2191 2192 2193 2194 2195 2196 | int i, c; switch( *z ){ case 0: { *tokenType = TOKEN_EOF; return 0; } case ' ': case '\t': case '\n': case '\f': case '\r': { | | | 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 | int i, c; switch( *z ){ case 0: { *tokenType = TOKEN_EOF; return 0; } case ' ': case '\t': case '\n': case '\f': case '\r': { for(i=1; safe_isspace(z[i]); i++){} *tokenType = TOKEN_SPACE; return i; } case '\'': case '"': { int delim = z[0]; for(i=1; (c=z[i])!=0; i++){ |
︙ | ︙ | |||
2342 2343 2344 2345 2346 2347 2348 | ** input: delimiters ( '[' , ']' , '...' ) ** output: [ ] ... */ static void tokenListToIdList(char **azIn){ int i, j; if( azIn ){ for(i=0, j=-1; azIn[i]; i++){ | | | 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 | ** input: delimiters ( '[' , ']' , '...' ) ** output: [ ] ... */ static void tokenListToIdList(char **azIn){ int i, j; if( azIn ){ for(i=0, j=-1; azIn[i]; i++){ if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){ dequoteString(azIn[i]); if( j>=0 ){ azIn[j] = azIn[i]; } j++; } } |
︙ | ︙ | |||
2391 2392 2393 2394 2395 2396 2397 | ** ** Ignore leading space in *s. ** ** To put it another way, return true if the first token of ** s[] is t[]. */ static int startsWith(const char *s, const char *t){ | | | | | 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 | ** ** Ignore leading space in *s. ** ** To put it another way, return true if the first token of ** s[] is t[]. */ static int startsWith(const char *s, const char *t){ while( safe_isspace(*s) ){ s++; } while( *t ){ if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0; } return *s!='_' && !safe_isalnum(*s); } /* ** An instance of this structure defines the "spec" of a ** full text index. This structure is populated by parseSpec ** and use by fulltextConnect and fulltextCreate. */ |
︙ | ︙ | |||
2507 2508 2509 2510 2511 2512 2513 | clearTableSpec(pSpec); return SQLITE_NOMEM; } for(i=0; i<pSpec->nColumn; i++){ char *p; pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); for (p = pSpec->azContentColumn[i]; *p ; ++p) { | | | 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 | clearTableSpec(pSpec); return SQLITE_NOMEM; } for(i=0; i<pSpec->nColumn; i++){ char *p; pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]); for (p = pSpec->azContentColumn[i]; *p ; ++p) { if( !safe_isalnum(*p) ) *p = '_'; } } /* ** Parse the tokenizer specification string. */ pSpec->azTokenizer = tokenizeString(zTokenizer, &n); |
︙ | ︙ | |||
2967 2968 2969 2970 2971 2972 2973 | return aMatch[i].iStart; } if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ return aMatch[i-1].iStart; } } for(i=1; i<=10; i++){ | | | | 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 | return aMatch[i].iStart; } if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ return aMatch[i-1].iStart; } } for(i=1; i<=10; i++){ if( safe_isspace(zDoc[iBreak-i]) ){ return iBreak - i + 1; } if( safe_isspace(zDoc[iBreak+i]) ){ return iBreak + i + 1; } } return iBreak; } |
︙ | ︙ |
Added test/fts1k.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | # 2007 March 28 # # The author disclaims copyright to this source code. # #************************************************************************* # This file implements regression tests for SQLite library. The focus # of this script is testing isspace/isalnum/tolower problems with the # FTS1 module. Unfortunately, this code isn't a really principled set # of tests, because it's impossible to know where new uses of these # functions might appear. # # $Id: fts1k.test,v 1.1 2007/03/29 16:30:41 shess Exp $ # set testdir [file dirname $argv0] source $testdir/tester.tcl # If SQLITE_ENABLE_FTS1 is not defined, omit this file. ifcapable !fts1 { finish_test return } # Tests that startsWith() (calls isspace, tolower, isalnum) can handle # hi-bit chars. parseSpec() also calls isalnum here. do_test fts1k-1.1 { execsql "CREATE VIRTUAL TABLE t1 USING fts1(content, \x80)" } {} # Additionally tests isspace() call in getToken(), and isalnum() call # in tokenListToIdList(). do_test fts1k-1.2 { catch { execsql "CREATE VIRTUAL TABLE t2 USING fts1(content, tokenize \x80)" } sqlite3_errmsg $DB } "unknown tokenizer: \x80" # Additionally test final isalnum() in startsWith(). do_test fts1k-1.3 { execsql "CREATE VIRTUAL TABLE t3 USING fts1(content, tokenize\x80)" } {} # The snippet-generation code has calls to isspace() which are sort of # hard to get to. It finds convenient breakpoints by starting ~40 # chars before and after the matched term, and scanning ~10 chars # around that position for isspace() characters. The long word with # embedded hi-bit chars causes one of these isspace() calls to be # exercised. 
The version with a couple extra spaces should cause the # other isspace() call to be exercised. [Both cases have been tested # in the debugger, but I'm hoping to continue to catch it if simple # constant changes change things slightly.] # # The trailing and leading hi-bit chars help with code which tests for # isspace() to coalesce multiple spaces. set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80" set phrase1 "$word $word $word target $word $word $word" set phrase2 "$word $word $word target $word $word $word" db eval {CREATE VIRTUAL TABLE t4 USING fts1(content)} db eval "INSERT INTO t4 (content) VALUES ('$phrase1')" db eval "INSERT INTO t4 (content) VALUES ('$phrase2')" do_test fts1k-1.4 { execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'} } {1 111 2 117} finish_test |
Added test/fts2l.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | # 2007 March 28 # # The author disclaims copyright to this source code. # #************************************************************************* # This file implements regression tests for SQLite library. The focus # of this script is testing isspace/isalnum/tolower problems with the # FTS2 module. Unfortunately, this code isn't a really principled set # of tests, because it's impossible to know where new uses of these # functions might appear. # # $Id: fts2l.test,v 1.1 2007/03/29 16:30:41 shess Exp $ # set testdir [file dirname $argv0] source $testdir/tester.tcl # If SQLITE_ENABLE_FTS2 is not defined, omit this file. ifcapable !fts2 { finish_test return } # Tests that startsWith() (calls isspace, tolower, isalnum) can handle # hi-bit chars. parseSpec() also calls isalnum here. do_test fts2l-1.1 { execsql "CREATE VIRTUAL TABLE t1 USING fts2(content, \x80)" } {} # Additionally tests isspace() call in getToken(), and isalnum() call # in tokenListToIdList(). do_test fts2l-1.2 { catch { execsql "CREATE VIRTUAL TABLE t2 USING fts2(content, tokenize \x80)" } sqlite3_errmsg $DB } "unknown tokenizer: \x80" # Additionally test final isalnum() in startsWith(). do_test fts2l-1.3 { execsql "CREATE VIRTUAL TABLE t3 USING fts2(content, tokenize\x80)" } {} # The snippet-generation code has calls to isspace() which are sort of # hard to get to. It finds convenient breakpoints by starting ~40 # chars before and after the matched term, and scanning ~10 chars # around that position for isspace() characters. The long word with # embedded hi-bit chars causes one of these isspace() calls to be # exercised. 
The version with a couple extra spaces should cause the # other isspace() call to be exercised. [Both cases have been tested # in the debugger, but I'm hoping to continue to catch it if simple # constant changes change things slightly.] # # The trailing and leading hi-bit chars help with code which tests for # isspace() to coalesce multiple spaces. set word "\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80xxxxx\x80" set phrase1 "$word $word $word target $word $word $word" set phrase2 "$word $word $word target $word $word $word" db eval {CREATE VIRTUAL TABLE t4 USING fts2(content)} db eval "INSERT INTO t4 (content) VALUES ('$phrase1')" db eval "INSERT INTO t4 (content) VALUES ('$phrase2')" do_test fts2l-1.4 { execsql {SELECT rowid, length(snippet(t4)) FROM t4 WHERE t4 MATCH 'target'} } {1 111 2 117} finish_test |