Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Rework the UTF8 reader logic in order to avoid the use of malloc(). Ticket #2523. (CVS 4175) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
9a059cb6bced5cdc950f7816602ac92d |
User & Date: | drh 2007-07-23 19:12:42.000 |
Context
2007-07-23
| ||
19:26 | Check the return value of sqlite3PagerWrite() when autovacuuming. Ticket #2524. (CVS 4176) (check-in: b4a5c62b85 user: drh tags: trunk) | |
19:12 | Rework the UTF8 reader logic in order to avoid the use of malloc(). Ticket #2523. (CVS 4175) (check-in: 9a059cb6bc user: drh tags: trunk) | |
2007-07-22
| ||
19:10 | Fix a bad sizeof in vdbe.c. Ticket #2522. (CVS 4174) (check-in: 77ebc3feb0 user: drh tags: trunk) | |
Changes
Changes to src/func.c.
︙ | ︙ | |||
12 13 14 15 16 17 18 | ** This file contains the C functions that implement various SQL ** functions of SQLite. ** ** There is only one exported symbol in this file - the function ** sqliteRegisterBuildinFunctions() found at the bottom of the file. ** All other code has file scope. ** | | > | 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | ** This file contains the C functions that implement various SQL ** functions of SQLite. ** ** There is only one exported symbol in this file - the function ** sqliteRegisterBuildinFunctions() found at the bottom of the file. ** All other code has file scope. ** ** $Id: func.c,v 1.162 2007/07/23 19:12:42 drh Exp $ */ #include "sqliteInt.h" #include <ctype.h> /* #include <math.h> */ #include <stdlib.h> #include <assert.h> #include "vdbeInt.h" #include "os.h" /* ** Return the collating function associated with a function. */ static CollSeq *sqlite3GetFuncCollSeq(sqlite3_context *context){ return context->pColl; } |
︙ | ︙ | |||
393 394 395 396 397 398 399 | /* The correct SQL-92 behavior is for the LIKE operator to ignore ** case. Thus 'a' LIKE 'A' would be true. */ static const struct compareInfo likeInfoNorm = { '%', '_', 0, 1 }; /* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator ** is case sensitive causing 'a' LIKE 'A' to be false */ static const struct compareInfo likeInfoAlt = { '%', '_', 0, 0 }; | < < < < < < < < < | 394 395 396 397 398 399 400 401 402 403 404 405 406 407 | /* The correct SQL-92 behavior is for the LIKE operator to ignore ** case. Thus 'a' LIKE 'A' would be true. */ static const struct compareInfo likeInfoNorm = { '%', '_', 0, 1 }; /* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator ** is case sensitive causing 'a' LIKE 'A' to be false */ static const struct compareInfo likeInfoAlt = { '%', '_', 0, 0 }; /* ** Compare two UTF-8 strings for equality where the first string can ** potentially be a "glob" expression. Return true (1) if they ** are the same and false (0) if they are different. ** ** Globbing rules: ** |
︙ | ︙ | |||
436 437 438 439 440 441 442 | */ static int patternCompare( const u8 *zPattern, /* The glob pattern */ const u8 *zString, /* The string to compare against the glob */ const struct compareInfo *pInfo, /* Information about how to do the compare */ const int esc /* The escape character */ ){ | | < | > | | | < < > > | | < > | | < | | > | < > | | | | | > > > | | > > | | < | | < > | < < > | | | > > > | | | < | > | | < < < | > > | | < | > | < > | < > > > | < < | 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 | */ static int patternCompare( const u8 *zPattern, /* The glob pattern */ const u8 *zString, /* The string to compare against the glob */ const struct compareInfo *pInfo, /* Information about how to do the compare */ const int esc /* The escape character */ ){ int c, c2; int invert; int seen; u8 matchOne = pInfo->matchOne; u8 matchAll = pInfo->matchAll; u8 matchSet = pInfo->matchSet; u8 noCase = pInfo->noCase; int prevEscape = 0; /* True if the previous character was 'escape' */ while( (c = sqlite3Utf8Read(zPattern,0,&zPattern))!=0 ){ if( !prevEscape && c==matchAll ){ while( (c=sqlite3Utf8Read(zPattern,0,&zPattern)) == matchAll || c == matchOne ){ if( c==matchOne && sqlite3Utf8Read(zString, 0, &zString)==0 ){ return 0; } } if( c==0 ){ return 1; }else if( c==esc ){ c = sqlite3Utf8Read(zPattern, 0, &zPattern); if( c==0 ){ return 0; } }else if( c==matchSet ){ assert( esc==0 ); /* This is GLOB, not LIKE */ assert( matchSet<0x80 ); /* '[' is a single-byte character */ while( *zString && patternCompare(&zPattern[-1],zString,pInfo,esc)==0 ){ SQLITE_SKIP_UTF8(zString); } return 
*zString!=0; } while( (c2 = sqlite3Utf8Read(zString,0,&zString))!=0 ){ if( noCase ){ c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2; c = c<0x80 ? sqlite3UpperToLower[c] : c; while( c2 != 0 && c2 != c ){ c2 = sqlite3Utf8Read(zString, 0, &zString); if( c2<0x80 ) c2 = sqlite3UpperToLower[c2]; } }else{ while( c2 != 0 && c2 != c ){ c2 = sqlite3Utf8Read(zString, 0, &zString); } } if( c2==0 ) return 0; if( patternCompare(zPattern,zString,pInfo,esc) ) return 1; } return 0; }else if( !prevEscape && c==matchOne ){ if( sqlite3Utf8Read(zString, 0, &zString)==0 ){ return 0; } }else if( c==matchSet ){ int prior_c = 0; assert( esc==0 ); /* This only occurs for GLOB, not LIKE */ seen = 0; invert = 0; c = sqlite3Utf8Read(zString, 0, &zString); if( c==0 ) return 0; c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); if( c2=='^' ){ invert = 1; c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); } if( c2==']' ){ if( c==']' ) seen = 1; c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); } while( c2 && c2!=']' ){ if( c2=='-' && zPattern[0]!=']' && zPattern[0]!=0 && prior_c>0 ){ c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); if( c>=prior_c && c<=c2 ) seen = 1; prior_c = 0; }else{ if( c==c2 ){ seen = 1; } prior_c = c2; } c2 = sqlite3Utf8Read(zPattern, 0, &zPattern); } if( c2==0 || (seen ^ invert)==0 ){ return 0; } }else if( esc==c && !prevEscape ){ prevEscape = 1; }else{ c2 = sqlite3Utf8Read(zString, 0, &zString); if( noCase ){ c = c<0x80 ? sqlite3UpperToLower[c] : c; c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2; } if( c!=c2 ){ return 0; } prevEscape = 0; } } return *zString==0; } /* |
︙ | ︙ | |||
586 587 588 589 590 591 592 | const unsigned char *zEsc = sqlite3_value_text(argv[2]); if( zEsc==0 ) return; if( sqlite3Utf8CharLen((char*)zEsc, -1)!=1 ){ sqlite3_result_error(context, "ESCAPE expression must be a single character", -1); return; } | | | 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 | const unsigned char *zEsc = sqlite3_value_text(argv[2]); if( zEsc==0 ) return; if( sqlite3Utf8CharLen((char*)zEsc, -1)!=1 ){ sqlite3_result_error(context, "ESCAPE expression must be a single character", -1); return; } escape = sqlite3Utf8Read(zEsc, 0, &zEsc); } if( zA && zB ){ struct compareInfo *pInfo = sqlite3_user_data(context); #ifdef SQLITE_TEST sqlite3_like_count++; #endif |
︙ | ︙ |
Changes to src/sqliteInt.h.
1 2 3 4 5 6 7 8 9 10 11 12 13 | /* ** 2001 September 15 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** Internal interface definitions for SQLite. ** | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | /* ** 2001 September 15 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** Internal interface definitions for SQLite. ** ** @(#) $Id: sqliteInt.h,v 1.579 2007/07/23 19:12:42 drh Exp $ */ #ifndef _SQLITEINT_H_ #define _SQLITEINT_H_ #include "sqliteLimit.h" #if defined(SQLITE_TCL) || defined(TCLSH) |
︙ | ︙ | |||
1552 1553 1554 1555 1556 1557 1558 | * This global flag is set for performance testing of triggers. When it is set * SQLite will perform the overhead of building new and old trigger references * even when no triggers exist */ extern int sqlite3_always_code_trigger_setup; /* | < < < < | < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 | * This global flag is set for performance testing of triggers. When it is set * SQLite will perform the overhead of building new and old trigger references * even when no triggers exist */ extern int sqlite3_always_code_trigger_setup; /* ** Assuming zIn points to the first byte of a UTF-8 character, ** advance zIn to point to the first byte of the next UTF-8 character. */ #define SQLITE_SKIP_UTF8(zIn) { \ if( (*(zIn++))>=0xc0 ){ \ while( (*zIn & 0xc0)==0x80 ){ zIn++; } \ } \ } /* ** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production ** builds) or a function call (for debugging). If it is a function call, ** it allows the operator to set a breakpoint at the spot where database ** corruption is first detected. */ |
︙ | ︙ | |||
1826 1827 1828 1829 1830 1831 1832 | int sqlite3FixTriggerStep(DbFixer*, TriggerStep*); int sqlite3AtoF(const char *z, double*); char *sqlite3_snprintf(int,char*,const char*,...); int sqlite3GetInt32(const char *, int*); int sqlite3FitsIn64Bits(const char *); int sqlite3Utf16ByteLen(const void *pData, int nChar); int sqlite3Utf8CharLen(const char *pData, int nByte); | | | 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 | int sqlite3FixTriggerStep(DbFixer*, TriggerStep*); int sqlite3AtoF(const char *z, double*); char *sqlite3_snprintf(int,char*,const char*,...); int sqlite3GetInt32(const char *, int*); int sqlite3FitsIn64Bits(const char *); int sqlite3Utf16ByteLen(const void *pData, int nChar); int sqlite3Utf8CharLen(const char *pData, int nByte); int sqlite3Utf8Read(const u8*, const u8*, const u8**); int sqlite3PutVarint(unsigned char *, u64); int sqlite3GetVarint(const unsigned char *, u64 *); int sqlite3GetVarint32(const unsigned char *, u32 *); int sqlite3VarintLen(u64 v); void sqlite3IndexAffinityStr(Vdbe *, Index *); void sqlite3TableAffinityStr(Vdbe *, Table *); char sqlite3CompareAffinity(Expr *pExpr, char aff2); |
︙ | ︙ |
Changes to src/utf.c.
︙ | ︙ | |||
8 9 10 11 12 13 14 | ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** | | | 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** ** $Id: utf.c,v 1.52 2007/07/23 19:12:42 drh Exp $ ** ** Notes on UTF-8: ** ** Byte-0 Byte-1 Byte-2 Byte-3 Value ** 0xxxxxxx 00000000 00000000 0xxxxxxx ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx |
︙ | ︙ | |||
55 56 57 58 59 60 61 62 63 64 65 66 67 68 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, }; #define WRITE_UTF8(zOut, c) { \ if( c<0x00080 ){ \ *zOut++ = (c&0xFF); \ } \ else if( c<0x00800 ){ \ *zOut++ = 0xC0 + ((c>>6)&0x1F); \ | > | 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00, }; #define WRITE_UTF8(zOut, c) { \ if( c<0x00080 ){ \ *zOut++ = (c&0xFF); \ } \ else if( c<0x00800 ){ \ *zOut++ = 0xC0 + ((c>>6)&0x1F); \ |
︙ | ︙ | |||
121 122 123 124 125 126 127 128 129 130 131 132 133 134 | if( c>=0xD800 && c<0xE000 ){ \ int c2 = ((*zIn++)<<8); \ c2 += (*zIn++); \ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ } \ } /* ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). */ /* #define TRANSLATE_TRACE 1 */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | if( c>=0xD800 && c<0xE000 ){ \ int c2 = ((*zIn++)<<8); \ c2 += (*zIn++); \ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ } \ } /* ** Translate a single UTF-8 character. Return the unicode value. ** ** During translation, assume that the byte that zTerm points ** is a 0x00. ** ** Write a pointer to the next unread byte back into *pzNext. ** ** Notes On Invalid UTF-8: ** ** * This routine never allows a 7-bit character (0x00 through 0x7f) to ** be encoded as a multi-byte character. Any multi-byte character that ** attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd. ** ** * This routine never allows a UTF16 surrogate value to be encoded. ** If a multi-byte character attempts to encode a value between ** 0xd800 and 0xe000 then it is rendered as 0xfffd. ** ** * Bytes in the range of 0x80 through 0xbf which occur as the first ** byte of a character are interpreted as single-byte characters ** and rendered as themselves even though they are technically ** invalid characters. ** ** * This routine accepts an infinite number of different UTF8 encodings ** for unicode values 0x80 and greater. 
It does not change over-length ** encodings to 0xfffd as some systems recommend. */ int sqlite3Utf8Read( const unsigned char *z, /* First byte of UTF-8 character */ const unsigned char *zTerm, /* Pretend this byte is 0x00 */ const unsigned char **pzNext /* Write first byte past UTF-8 char here */ ){ int c = *(z++); if( c>=0xc0 ){ c = sqlite3UtfTrans1[c-0xc0]; while( z!=zTerm && (*z & 0xc0)==0x80 ){ c = (c<<6) + (0x3f & *(z++)); } if( c<0x80 || (c&0xFFFFF800)==0xD800 || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } } *pzNext = z; return c; } /* ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate(). */ /* #define TRANSLATE_TRACE 1 */ |
︙ | ︙ | |||
215 216 217 218 219 220 221 | if( !zOut ) return SQLITE_NOMEM; }else{ zOut = zShort; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | < < < | < < < | 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 | if( !zOut ) return SQLITE_NOMEM; }else{ zOut = zShort; } z = zOut; if( pMem->enc==SQLITE_UTF8 ){ if( desiredEnc==SQLITE_UTF16LE ){ /* UTF-8 -> UTF-16 Little-endian */ while( zIn<zTerm ){ c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); WRITE_UTF16LE(z, c); } }else{ assert( desiredEnc==SQLITE_UTF16BE ); /* UTF-8 -> UTF-16 Big-endian */ while( zIn<zTerm ){ c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); WRITE_UTF16BE(z, c); } } pMem->n = z - zOut; *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ |
︙ | ︙ | |||
473 474 475 476 477 478 479 | ** ** The translation is done in-place (since it is impossible for the ** correct UTF-8 encoding to be longer than a malformed encoding). */ int sqlite3Utf8To8(unsigned char *zIn){ unsigned char *zOut = zIn; unsigned char *zStart = zIn; | > | | | < > > | | 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 | ** ** The translation is done in-place (since it is impossible for the ** correct UTF-8 encoding to be longer than a malformed encoding). */ int sqlite3Utf8To8(unsigned char *zIn){ unsigned char *zOut = zIn; unsigned char *zStart = zIn; unsigned char *zTerm; /* NOTE(review): zTerm is declared here but never assigned in this function before it is passed to sqlite3Utf8Read() below, so the terminator pointer that call compares against is indeterminate - confirm and initialize */ u32 c; while( zIn[0] ){ c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn); if( c!=0xfffd ){ WRITE_UTF8(zOut, c); } } *zOut = 0; return zOut - zStart; } #endif #if defined(SQLITE_TEST) /* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3UtfSelfTest(){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; unsigned char *zTerm; int n; unsigned int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = z-zBuf; z[0] = 0; zTerm = z; z = zBuf; c = sqlite3Utf8Read(z, zTerm, (const u8**)&z); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ |
︙ | ︙ |