SQLite

Check-in [7fab1393]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Convert invalid surrogates to 0xfffd when translating UTF.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA3-256: 7fab1393c2b22b1f3b159b631e06e7e0d3900850ee249c38e4d3cdd0aacf637e
User & Date: drh 2020-02-17 23:08:16
References
2020-05-20
15:02
Back out the change from [7fab1393c2b22b1f] that tries to convert invalid surrogate characters in UTF16 into the replacement character 0xfffd, as we find that this breaks some software. (check-in: 4218c7b7 user: drh tags: trunk)
Context
2020-02-18
19:49
Add the new sqlite3_create_filename() and sqlite3_free_filename() interfaces for use by Shims. Use these interfaces inside the multiplexor. (check-in: 9469f36a user: drh tags: trunk)
2020-02-17
23:08
Convert invalid surrogates to 0xfffd when translating UTF. (check-in: 7fab1393 user: drh tags: trunk)
19:25
A better (smaller and faster) solution to ticket [4374860b29383380]. (check-in: abc473fb user: drh tags: trunk)
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/utf.c.

101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
    *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03));              \
    *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
    *zOut++ = (u8)(0x00DC + ((c>>8)&0x03));                         \
    *zOut++ = (u8)(c&0x00FF);                                       \
  }                                                                 \
}

#define READ_UTF16LE(zIn, TERM, c){                                   \
  c = (*zIn++);                                                       \
  c += ((*zIn++)<<8);                                                 \
  if( c>=0xD800 && c<0xE000 && TERM ){                                \
    int c2 = (*zIn++);                                                \
    c2 += ((*zIn++)<<8);                                              \
    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
  }                                                                   \
}

#define READ_UTF16BE(zIn, TERM, c){                                   \
  c = ((*zIn++)<<8);                                                  \
  c += (*zIn++);                                                      \
  if( c>=0xD800 && c<0xE000 && TERM ){                                \
    int c2 = ((*zIn++)<<8);                                           \
    c2 += (*zIn++);                                                   \
    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
  }                                                                   \
}

/*
** Translate a single UTF-8 character.  Return the unicode value.
**
** During translation, assume that the byte that zTerm points
** is a 0x00.
**
** Write a pointer to the next unread byte back into *pzNext.







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







101
102
103
104
105
106
107




















108
109
110
111
112
113
114
    *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03));              \
    *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
    *zOut++ = (u8)(0x00DC + ((c>>8)&0x03));                         \
    *zOut++ = (u8)(c&0x00FF);                                       \
  }                                                                 \
}





















/*
** Translate a single UTF-8 character.  Return the unicode value.
**
** During translation, assume that the byte that zTerm points
** is a 0x00.
**
** Write a pointer to the next unread byte back into *pzNext.
297
298
299
300
301
302
303
304















305
306
307
308
309

310














311
312
313
314
315
316
317
    pMem->n = (int)(z - zOut);
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
      while( zIn<zTerm ){
        READ_UTF16LE(zIn, zIn<zTerm, c); 















        WRITE_UTF8(z, c);
      }
    }else{
      /* UTF-16 Big-endian -> UTF-8 */
      while( zIn<zTerm ){

        READ_UTF16BE(zIn, zIn<zTerm, c); 














        WRITE_UTF8(z, c);
      }
    }
    pMem->n = (int)(z - zOut);
  }
  *z = 0;
  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );







|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>





>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>







277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
    pMem->n = (int)(z - zOut);
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
      while( zIn<zTerm ){
        c = *(zIn++);
        c += (*(zIn++))<<8;
        if( c>=0xd800 && c<0xe000 ){
          if( c>=0xdc00 || zIn>=zTerm ){
            c = 0xfffd;
          }else{
            int c2 = *(zIn++);
            c2 += (*(zIn++))<<8;
            if( c2<0xdc00 || c2>=0xe000 ){
              zIn -= 2;
              c = 0xfffd;
            }else{
              c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
            }
          }
        }
        WRITE_UTF8(z, c);
      }
    }else{
      /* UTF-16 Big-endian -> UTF-8 */
      while( zIn<zTerm ){
        c = (*(zIn++))<<8;
        c += *(zIn++);
        if( c>=0xd800 && c<0xe000 ){
          if( c>=0xdc00 || zIn>=zTerm ){
            c = 0xfffd;
          }else{
            int c2 = (*(zIn++))<<8;
            c2 += *(zIn++);
            if( c2<0xdc00 || c2>=0xe000 ){
              zIn -= 2;
              c = 0xfffd;
            }else{
              c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
            }
          }
        }
        WRITE_UTF8(z, c);
      }
    }
    pMem->n = (int)(z - zOut);
  }
  *z = 0;
  assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476

477
478
479
480

481
482
483
484
485
486
487
** in pZ.  nChar must be non-negative.
*/
int sqlite3Utf16ByteLen(const void *zIn, int nChar){
  int c;
  unsigned char const *z = zIn;
  int n = 0;
  
  if( SQLITE_UTF16NATIVE==SQLITE_UTF16BE ){
    while( n<nChar ){
      READ_UTF16BE(z, 1, c);
      n++;
    }
  }else{
    while( n<nChar ){
      READ_UTF16LE(z, 1, c);

      n++;
    }
  }
  return (int)(z-(unsigned char const *)zIn);

}

#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.







|
|
|
|
<
<
<
<
>
|
|
<
|
>







472
473
474
475
476
477
478
479
480
481
482




483
484
485

486
487
488
489
490
491
492
493
494
** in pZ.  nChar must be non-negative.
*/
int sqlite3Utf16ByteLen(const void *zIn, int nChar){
  int c;
  unsigned char const *z = zIn;
  int n = 0;
  
  if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++;
  while( n<nChar ){
    c = z[0];
    z += 2;




    if( c>=0xd8 && c<0xdc && z[0]>=0xdc && z[0]<0xe0 ) z += 2;
    n++;
  }

  return (int)(z-(unsigned char const *)zIn) 
              - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE);
}

#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
    z[0] = 0;
    z = zBuf;
    c = sqlite3Utf8Read((const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16LE(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    READ_UTF16LE(z, 1, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){
    if( i>=0xD800 && i<0xE000 ) continue;
    z = zBuf;
    WRITE_UTF16BE(z, i);
    n = (int)(z-zBuf);
    assert( n>0 && n<=4 );
    z[0] = 0;
    z = zBuf;
    READ_UTF16BE(z, 1, c);
    assert( c==i );
    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
#endif /* SQLITE_OMIT_UTF16 */







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<





508
509
510
511
512
513
514
























515
516
517
518
519
    z[0] = 0;
    z = zBuf;
    c = sqlite3Utf8Read((const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
























    assert( (z-zBuf)==n );
  }
}
#endif /* SQLITE_TEST */
#endif /* SQLITE_OMIT_UTF16 */

Changes to test/tkt-3fe897352e.test.

29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
  sqlite3 db :memory:
  db eval {
    PRAGMA encoding=UTF8;
    CREATE TABLE t1(x);
    INSERT INTO t1 VALUES(hex_to_utf16be('D800'));
    SELECT hex(x) FROM t1;
  }
} {EDA080}
do_test tkt-3fe89-1.2 {
  db eval {
    DELETE FROM t1;
    INSERT INTO t1 VALUES(hex_to_utf16le('00D8'));
    SELECT hex(x) FROM t1;
  }
} {EDA080}
do_test tkt-3fe89-1.3 {
  db eval {
    DELETE FROM t1;
    INSERT INTO t1 VALUES(hex_to_utf16be('DFFF'));
    SELECT hex(x) FROM t1;
  }
} {EDBFBF}
do_test tkt-3fe89-1.4 {
  db eval {
    DELETE FROM t1;
    INSERT INTO t1 VALUES(hex_to_utf16le('FFDF'));
    SELECT hex(x) FROM t1;
  }
} {EDBFBF}


finish_test







|






|






|






|



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
  sqlite3 db :memory:
  db eval {
    PRAGMA encoding=UTF8;
    CREATE TABLE t1(x);
    INSERT INTO t1 VALUES(hex_to_utf16be('D800'));
    SELECT hex(x) FROM t1;
  }
} {EFBFBD}
do_test tkt-3fe89-1.2 {
  db eval {
    DELETE FROM t1;
    INSERT INTO t1 VALUES(hex_to_utf16le('00D8'));
    SELECT hex(x) FROM t1;
  }
} {EFBFBD}
do_test tkt-3fe89-1.3 {
  db eval {
    DELETE FROM t1;
    INSERT INTO t1 VALUES(hex_to_utf16be('DFFF'));
    SELECT hex(x) FROM t1;
  }
} {EFBFBD}
do_test tkt-3fe89-1.4 {
  db eval {
    DELETE FROM t1;
    INSERT INTO t1 VALUES(hex_to_utf16le('FFDF'));
    SELECT hex(x) FROM t1;
  }
} {EFBFBD}


finish_test