Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Convert invalid surrogates to 0xfffd when translating UTF. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA3-256: |
7fab1393c2b22b1f3b159b631e06e7e0 |
User & Date: | drh 2020-02-17 23:08:16 |
References
2020-05-20
| ||
15:02 | Back out the change from [7fab1393c2b22b1f] that tries to convert invalid surrogate characters in UTF16 into the replacement character 0xfffd, as we find that this breaks some software. (check-in: 4218c7b7 user: drh tags: trunk) | |
Context
2020-02-18
| ||
19:49 | Add the new sqlite3_create_filename() and sqlite3_free_filename() interfaces for use by Shims. Use these interfaces inside the multiplexor. (check-in: 9469f36a user: drh tags: trunk) | |
2020-02-17
| ||
23:08 | Convert invalid surrogates to 0xfffd when translating UTF. (check-in: 7fab1393 user: drh tags: trunk) | |
19:25 | A better (smaller and faster) solution to ticket [4374860b29383380]. (check-in: abc473fb user: drh tags: trunk) | |
Changes
Changes to src/utf.c.
︙ | ︙ | |||
101 102 103 104 105 106 107 | *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ *zOut++ = (u8)(c&0x00FF); \ } \ } | < < < < < < < < < < < < < < < < < < < < | 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03)); \ *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0)); \ *zOut++ = (u8)(0x00DC + ((c>>8)&0x03)); \ *zOut++ = (u8)(c&0x00FF); \ } \ } /* ** Translate a single UTF-8 character. Return the unicode value. ** ** During translation, assume that the byte that zTerm points ** is a 0x00. ** ** Write a pointer to the next unread byte back into *pzNext. |
︙ | ︙ | |||
297 298 299 300 301 302 303 | pMem->n = (int)(z - zOut); *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ | | > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > | 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | pMem->n = (int)(z - zOut); *z++ = 0; }else{ assert( desiredEnc==SQLITE_UTF8 ); if( pMem->enc==SQLITE_UTF16LE ){ /* UTF-16 Little-endian -> UTF-8 */ while( zIn<zTerm ){ c = *(zIn++); c += (*(zIn++))<<8; if( c>=0xd800 && c<0xe000 ){ if( c>=0xdc00 || zIn>=zTerm ){ c = 0xfffd; }else{ int c2 = *(zIn++); c2 += (*(zIn++))<<8; if( c2<0xdc00 || c2>=0xe000 ){ zIn -= 2; c = 0xfffd; }else{ c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000; } } } WRITE_UTF8(z, c); } }else{ /* UTF-16 Big-endian -> UTF-8 */ while( zIn<zTerm ){ c = (*(zIn++))<<8; c += *(zIn++); if( c>=0xd800 && c<0xe000 ){ if( c>=0xdc00 || zIn>=zTerm ){ c = 0xfffd; }else{ int c2 = (*(zIn++))<<8; c2 += *(zIn++); if( c2<0xdc00 || c2>=0xe000 ){ zIn -= 2; c = 0xfffd; }else{ c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000; } } } WRITE_UTF8(z, c); } } pMem->n = (int)(z - zOut); } *z = 0; assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len ); |
︙ | ︙ | |||
462 463 464 465 466 467 468 | ** in pZ. nChar must be non-negative. */ int sqlite3Utf16ByteLen(const void *zIn, int nChar){ int c; unsigned char const *z = zIn; int n = 0; | | | | | < < < < > | | < | > | 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 | ** in pZ. nChar must be non-negative. */ int sqlite3Utf16ByteLen(const void *zIn, int nChar){ int c; unsigned char const *z = zIn; int n = 0; if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++; while( n<nChar ){ c = z[0]; z += 2; if( c>=0xd8 && c<0xdc && z[0]>=0xdc && z[0]<0xe0 ) z += 2; n++; } return (int)(z-(unsigned char const *)zIn) - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE); } #if defined(SQLITE_TEST) /* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. |
︙ | ︙ | |||
501 502 503 504 505 506 507 | z[0] = 0; z = zBuf; c = sqlite3Utf8Read((const u8**)&z); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); | < < < < < < < < < < < < < < < < < < < < < < < < | 508 509 510 511 512 513 514 515 516 517 518 519 | z[0] = 0; z = zBuf; c = sqlite3Utf8Read((const u8**)&z); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } } #endif /* SQLITE_TEST */ #endif /* SQLITE_OMIT_UTF16 */ |
Changes to test/tkt-3fe897352e.test.
︙ | ︙ | |||
29 30 31 32 33 34 35 | sqlite3 db :memory: db eval { PRAGMA encoding=UTF8; CREATE TABLE t1(x); INSERT INTO t1 VALUES(hex_to_utf16be('D800')); SELECT hex(x) FROM t1; } | | | | | | 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | sqlite3 db :memory: db eval { PRAGMA encoding=UTF8; CREATE TABLE t1(x); INSERT INTO t1 VALUES(hex_to_utf16be('D800')); SELECT hex(x) FROM t1; } } {EFBFBD} do_test tkt-3fe89-1.2 { db eval { DELETE FROM t1; INSERT INTO t1 VALUES(hex_to_utf16le('00D8')); SELECT hex(x) FROM t1; } } {EFBFBD} do_test tkt-3fe89-1.3 { db eval { DELETE FROM t1; INSERT INTO t1 VALUES(hex_to_utf16be('DFFF')); SELECT hex(x) FROM t1; } } {EFBFBD} do_test tkt-3fe89-1.4 { db eval { DELETE FROM t1; INSERT INTO t1 VALUES(hex_to_utf16le('FFDF')); SELECT hex(x) FROM t1; } } {EFBFBD} finish_test |