Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | When converting UTF8 or UTF16 strings, change overlong strings and other illegal codes to 0xFFFD. Ticket #2029. (CVS 3479) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
0c6736df9cb4c3c8f6224e30df939cea |
User & Date: | drh 2006-10-19 01:58:44.000 |
Context
2006-10-19
| ||
20:27 | fts2 support for testing. These are a prelude to adding some test scripts. (CVS 3480) (check-in: 004ad1943f user: shess tags: trunk) | |
01:58 | When converting UTF8 or UTF16 strings, change overlong strings and other illegal codes to 0xFFFD. Ticket #2029. (CVS 3479) (check-in: 0c6736df9c user: drh tags: trunk) | |
2006-10-18
| ||
23:26 | Fix a problems that arise if malloc() fails while compiling SELECT statements within a TRIGGER. (CVS 3478) (check-in: ee4894b499 user: drh tags: trunk) | |
Changes
Changes to src/utf.c.
︙ | ︙ | |||
8 9 10 11 12 13 14 | ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** | | | 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 | ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** This file contains routines used to translate between UTF-8, ** UTF-16, UTF-16BE, and UTF-16LE. ** ** $Id: utf.c,v 1.43 2006/10/19 01:58:44 drh Exp $ ** ** Notes on UTF-8: ** ** Byte-0 Byte-1 Byte-2 Byte-3 Value ** 0xxxxxxx 00000000 00000000 0xxxxxxx ** 110yyyyy 10xxxxxx 00000000 00000yyy yyxxxxxx ** 1110zzzz 10yyyyyy 10xxxxxx 00000000 zzzzyyyy yyxxxxxx |
︙ | ︙ | |||
60 61 62 63 64 65 66 | */ #include "sqliteInt.h" #include <assert.h> #include "vdbeInt.h" /* ** This table maps from the first byte of a UTF-8 character to the number | | | | | | | | | | | | > > > > > > > > > > > > > | > > > | 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 | */ #include "sqliteInt.h" #include <assert.h> #include "vdbeInt.h" /* ** This table maps from the first byte of a UTF-8 character to the number ** of trailing bytes expected. A value '4' indicates that the table key ** is not a legal first byte for a UTF-8 character. */ static const u8 xtra_utf8_bytes[256] = { /* 0xxxxxxx */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10wwwwww */ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* 110yyyyy */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 1110zzzz */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 11110yyy */ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, }; /* ** This table maps from the number of trailing bytes in a UTF-8 character ** to an integer constant that is effectively calculated for each character ** read by a naive implementation of a UTF-8 character reader. The code ** in the READ_UTF8 macro explains things best. */ static const int xtra_utf8_bits[] = { 0, 12416, /* (0xC0 << 6) + (0x80) */ 925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */ 63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ }; /* ** If a UTF-8 character contains N bytes extra bytes (N bytes follow ** the initial byte so that the total character length is N+1) then ** masking the character with utf8_mask[N] must produce a non-zero ** result. Otherwise, we have an (illegal) overlong encoding. */ static const int utf_mask[] = { 0x00000000, 0xffffff80, 0xfffff800, 0xffff0000, }; #define READ_UTF8(zIn, c) { \ int xtra; \ c = *(zIn)++; \ xtra = xtra_utf8_bytes[c]; \ switch( xtra ){ \ case 4: c = (int)0xFFFD; break; \ case 3: c = (c<<6) + *(zIn)++; \ case 2: c = (c<<6) + *(zIn)++; \ case 1: c = (c<<6) + *(zIn)++; \ c -= xtra_utf8_bits[xtra]; \ if( (utf_mask[xtra]&c)==0 \ || (c&0xFFFFF800)==0xD800 \ || (c&0xFFFFFFFE)==0xFFFE ){ c = 0xFFFD; } \ } \ } int sqlite3ReadUtf8(const unsigned char *z){ int c; READ_UTF8(z, c); return c; } |
︙ | ︙ | |||
177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | #define READ_UTF16LE(zIn, c){ \ c = (*zIn++); \ c += ((*zIn++)<<8); \ if( c>=0xD800 && c<=0xE000 ){ \ int c2 = (*zIn++); \ c2 += ((*zIn++)<<8); \ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ } \ } #define READ_UTF16BE(zIn, c){ \ c = ((*zIn++)<<8); \ c += (*zIn++); \ if( c>=0xD800 && c<=0xE000 ){ \ int c2 = ((*zIn++)<<8); \ c2 += (*zIn++); \ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ } \ } #define SKIP_UTF16BE(zIn){ \ if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn+1)==0x00)) ){ \ zIn += 4; \ }else{ \ | > > | 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 | #define READ_UTF16LE(zIn, c){ \ c = (*zIn++); \ c += ((*zIn++)<<8); \ if( c>=0xD800 && c<=0xE000 ){ \ int c2 = (*zIn++); \ c2 += ((*zIn++)<<8); \ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ } \ } #define READ_UTF16BE(zIn, c){ \ c = ((*zIn++)<<8); \ c += (*zIn++); \ if( c>=0xD800 && c<=0xE000 ){ \ int c2 = ((*zIn++)<<8); \ c2 += (*zIn++); \ c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10); \ if( (c & 0xFFFF0000)==0 ) c = 0xFFFD; \ } \ } #define SKIP_UTF16BE(zIn){ \ if( *zIn>=0xD8 && (*zIn<0xE0 || (*zIn==0xE0 && *(zIn+1)==0x00)) ){ \ zIn += 4; \ }else{ \ |
︙ | ︙ | |||
552 553 554 555 556 557 558 | #if defined(SQLITE_TEST) /* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3utfSelfTest(){ | | > > > | | 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 | #if defined(SQLITE_TEST) /* ** This routine is called from the TCL test function "translate_selftest". ** It checks that the primitives for serializing and deserializing ** characters in each encoding are inverses of each other. */ void sqlite3utfSelfTest(){ unsigned int i, t; unsigned char zBuf[20]; unsigned char *z; int n; unsigned int c; for(i=0; i<0x00110000; i++){ z = zBuf; WRITE_UTF8(z, i); n = z-zBuf; z = zBuf; READ_UTF8(z, c); t = i; if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; assert( c==t ); assert( (z-zBuf)==n ); } for(i=0; i<0x00110000; i++){ if( i>=0xD800 && i<=0xE000 ) continue; z = zBuf; WRITE_UTF16LE(z, i); n = z-zBuf; |
︙ | ︙ |