/ Artifact [d7a61c1d]
Login

Artifact d7a61c1dfdac3eb091d43341a674032dca5a34e122f78ef0b5bd2d5a31967dde:


     1  /*
     2  ** 2004 April 13
     3  **
     4  ** The author disclaims copyright to this source code.  In place of
     5  ** a legal notice, here is a blessing:
     6  **
     7  **    May you do good and not evil.
     8  **    May you find forgiveness for yourself and forgive others.
     9  **    May you share freely, never taking more than you give.
    10  **
    11  *************************************************************************
    12  ** This file contains routines used to translate between UTF-8, 
    13  ** UTF-16, UTF-16BE, and UTF-16LE.
    14  **
    15  ** Notes on UTF-8:
    16  **
    17  **   Byte-0    Byte-1    Byte-2    Byte-3    Value
    18  **  0xxxxxxx                                 00000000 00000000 0xxxxxxx
    19  **  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
    20  **  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
    21  **  11110uuu  10uuzzzz  10yyyyyy  10xxxxxx   000uuuuu zzzzyyyy yyxxxxxx
    22  **
    23  **
    24  ** Notes on UTF-16:  (with wwww+1==uuuuu)
    25  **
    26  **      Word-0               Word-1          Value
    27  **  110110ww wwzzzzyy   110111yy yyxxxxxx    000uuuuu zzzzyyyy yyxxxxxx
    28  **  zzzzyyyy yyxxxxxx                        00000000 zzzzyyyy yyxxxxxx
    29  **
    30  **
    31  ** BOM or Byte Order Mark:
    32  **     0xff 0xfe   little-endian utf-16 follows
    33  **     0xfe 0xff   big-endian utf-16 follows
    34  **
    35  */
    36  #include "sqliteInt.h"
    37  #include <assert.h>
    38  #include "vdbeInt.h"
    39  
    40  #if !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0
    41  /*
    42  ** The following constant value is used by the SQLITE_BIGENDIAN and
    43  ** SQLITE_LITTLEENDIAN macros.  It is only defined here when this file
        ** is compiled outside of the amalgamation and SQLITE_BYTEORDER is 0.
        ** NOTE(review): presumably those macros inspect the first byte of this
        ** integer to detect byte order at run time - confirm in sqliteInt.h.
    44  */
    45  const int sqlite3one = 1;
    46  #endif /* !defined(SQLITE_AMALGAMATION) && SQLITE_BYTEORDER==0 */
    47  
    48  /*
    49  ** This lookup table is used to help decode the first byte of
    50  ** a multi-byte UTF8 character.
    51  */
    52  static const unsigned char sqlite3Utf8Trans1[] = {
    53    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    54    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    55    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    56    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    57    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    58    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    59    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    60    0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
    61  };
    62  
    63  
    64  #define WRITE_UTF8(zOut, c) {                          \
    65    if( c<0x00080 ){                                     \
    66      *zOut++ = (u8)(c&0xFF);                            \
    67    }                                                    \
    68    else if( c<0x00800 ){                                \
    69      *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
    70      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    71    }                                                    \
    72    else if( c<0x10000 ){                                \
    73      *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
    74      *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    75      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    76    }else{                                               \
    77      *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
    78      *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
    79      *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
    80      *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
    81    }                                                    \
    82  }
    83  
    84  #define WRITE_UTF16LE(zOut, c) {                                    \
    85    if( c<=0xFFFF ){                                                  \
    86      *zOut++ = (u8)(c&0x00FF);                                       \
    87      *zOut++ = (u8)((c>>8)&0x00FF);                                  \
    88    }else{                                                            \
    89      *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
    90      *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03));              \
    91      *zOut++ = (u8)(c&0x00FF);                                       \
    92      *zOut++ = (u8)(0x00DC + ((c>>8)&0x03));                         \
    93    }                                                                 \
    94  }
    95  
    96  #define WRITE_UTF16BE(zOut, c) {                                    \
    97    if( c<=0xFFFF ){                                                  \
    98      *zOut++ = (u8)((c>>8)&0x00FF);                                  \
    99      *zOut++ = (u8)(c&0x00FF);                                       \
   100    }else{                                                            \
   101      *zOut++ = (u8)(0x00D8 + (((c-0x10000)>>18)&0x03));              \
   102      *zOut++ = (u8)(((c>>10)&0x003F) + (((c-0x10000)>>10)&0x00C0));  \
   103      *zOut++ = (u8)(0x00DC + ((c>>8)&0x03));                         \
   104      *zOut++ = (u8)(c&0x00FF);                                       \
   105    }                                                                 \
   106  }
   107  
   108  /*
   109  ** Translate a single UTF-8 character.  Return the unicode value.
   110  **
   111  ** During translation, assume that the byte that zTerm points to
   112  ** is a 0x00.
   113  **
   114  ** Write a pointer to the next unread byte back into *pzNext.
   115  **
   116  ** Notes On Invalid UTF-8:
   117  **
   118  **  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
   119  **     be encoded as a multi-byte character.  Any multi-byte character that
   120  **     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
   121  **
   122  **  *  This routine never allows a UTF16 surrogate value to be encoded.
   123  **     If a multi-byte character attempts to encode a value between
   124  **     0xd800 and 0xe000 then it is rendered as 0xfffd.
   125  **
   126  **  *  Bytes in the range of 0x80 through 0xbf which occur as the first
   127  **     byte of a character are interpreted as single-byte characters
   128  **     and rendered as themselves even though they are technically
   129  **     invalid characters.
   130  **
   131  **  *  This routine accepts over-length UTF8 encodings
   132  **     for unicode values 0x80 and greater.  It does not change over-length
   133  **     encodings to 0xfffd as some systems recommend.
   134  */
   135  #define READ_UTF8(zIn, zTerm, c)                           \
   136    c = *(zIn++);                                            \
   137    if( c>=0xc0 ){                                           \
   138      c = sqlite3Utf8Trans1[c-0xc0];                         \
   139      while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
   140        c = (c<<6) + (0x3f & *(zIn++));                      \
   141      }                                                      \
   142      if( c<0x80                                             \
   143          || (c&0xFFFFF800)==0xD800                          \
   144          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
   145    }
   146  u32 sqlite3Utf8Read(
   147    const unsigned char **pz    /* Pointer to string from which to read char */
   148  ){
   149    unsigned int c;
   150  
   151    /* Same as READ_UTF8() above but without the zTerm parameter.
   152    ** For this routine, we assume the UTF8 string is always zero-terminated.
   153    */
   154    c = *((*pz)++);
   155    if( c>=0xc0 ){
   156      c = sqlite3Utf8Trans1[c-0xc0];
   157      while( (*(*pz) & 0xc0)==0x80 ){
   158        c = (c<<6) + (0x3f & *((*pz)++));
   159      }
   160      if( c<0x80
   161          || (c&0xFFFFF800)==0xD800
   162          || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
   163    }
   164    return c;
   165  }
   166  
   167  
   168  
   169  
   170  /*
   171  ** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
   172  ** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
   173  */ 
   174  /* #define TRANSLATE_TRACE 1 */
   175  
   176  #ifndef SQLITE_OMIT_UTF16
   177  /*
   178  ** This routine transforms the internal text encoding used by pMem to
   179  ** desiredEnc. It is an error if the string is already of the desired
   180  ** encoding, or if *pMem does not contain a string value.
   181  */
   182  SQLITE_NOINLINE int sqlite3VdbeMemTranslate(Mem *pMem, u8 desiredEnc){
   183    sqlite3_int64 len;          /* Maximum length of output string in bytes */
   184    unsigned char *zOut;        /* Output buffer */
   185    unsigned char *zIn;         /* Input iterator */
   186    unsigned char *zTerm;       /* End of input */
   187    unsigned char *z;           /* Output iterator */
   188    unsigned int c;
   189  
   190    assert( pMem->db==0 || sqlite3_mutex_held(pMem->db->mutex) );
   191    assert( pMem->flags&MEM_Str );
   192    assert( pMem->enc!=desiredEnc );
   193    assert( pMem->enc!=0 );
   194    assert( pMem->n>=0 );
   195  
   196  #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG)
   197    {
   198      StrAccum acc;
   199      char zBuf[1000];
   200      sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0);  
   201      sqlite3VdbeMemPrettyPrint(pMem, &acc);
   202      fprintf(stderr, "INPUT:  %s\n", sqlite3StrAccumFinish(&acc));
   203    }
   204  #endif
   205  
   206    /* If the translation is between UTF-16 little and big endian, then 
   207    ** all that is required is to swap the byte order. This case is handled
   208    ** differently from the others.
   209    */
   210    if( pMem->enc!=SQLITE_UTF8 && desiredEnc!=SQLITE_UTF8 ){
   211      u8 temp;
   212      int rc;
   213      rc = sqlite3VdbeMemMakeWriteable(pMem);
   214      if( rc!=SQLITE_OK ){
   215        assert( rc==SQLITE_NOMEM );
   216        return SQLITE_NOMEM_BKPT;
   217      }
   218      zIn = (u8*)pMem->z;
   219      zTerm = &zIn[pMem->n&~1];
   220      while( zIn<zTerm ){
   221        temp = *zIn;
   222        *zIn = *(zIn+1);
   223        zIn++;
   224        *zIn++ = temp;
   225      }
   226      pMem->enc = desiredEnc;
   227      goto translate_out;
   228    }
   229  
   230    /* Set len to the maximum number of bytes required in the output buffer. */
   231    if( desiredEnc==SQLITE_UTF8 ){
   232      /* When converting from UTF-16, the maximum growth results from
   233      ** translating a 2-byte character to a 4-byte UTF-8 character.
   234      ** A single byte is required for the output string
   235      ** nul-terminator.
   236      */
   237      pMem->n &= ~1;
   238      len = 2 * (sqlite3_int64)pMem->n + 1;
   239    }else{
   240      /* When converting from UTF-8 to UTF-16 the maximum growth is caused
   241      ** when a 1-byte UTF-8 character is translated into a 2-byte UTF-16
   242      ** character. Two bytes are required in the output buffer for the
   243      ** nul-terminator.
   244      */
   245      len = 2 * (sqlite3_int64)pMem->n + 2;
   246    }
   247  
   248    /* Set zIn to point at the start of the input buffer and zTerm to point 1
   249    ** byte past the end.
   250    **
   251    ** Variable zOut is set to point at the output buffer, space obtained
   252    ** from sqlite3_malloc().
   253    */
   254    zIn = (u8*)pMem->z;
   255    zTerm = &zIn[pMem->n];
   256    zOut = sqlite3DbMallocRaw(pMem->db, len);
   257    if( !zOut ){
   258      return SQLITE_NOMEM_BKPT;
   259    }
   260    z = zOut;
   261  
   262    if( pMem->enc==SQLITE_UTF8 ){
   263      if( desiredEnc==SQLITE_UTF16LE ){
   264        /* UTF-8 -> UTF-16 Little-endian */
   265        while( zIn<zTerm ){
   266          READ_UTF8(zIn, zTerm, c);
   267          WRITE_UTF16LE(z, c);
   268        }
   269      }else{
   270        assert( desiredEnc==SQLITE_UTF16BE );
   271        /* UTF-8 -> UTF-16 Big-endian */
   272        while( zIn<zTerm ){
   273          READ_UTF8(zIn, zTerm, c);
   274          WRITE_UTF16BE(z, c);
   275        }
   276      }
   277      pMem->n = (int)(z - zOut);
   278      *z++ = 0;
   279    }else{
   280      assert( desiredEnc==SQLITE_UTF8 );
   281      if( pMem->enc==SQLITE_UTF16LE ){
   282        /* UTF-16 Little-endian -> UTF-8 */
   283        while( zIn<zTerm ){
   284          c = *(zIn++);
   285          c += (*(zIn++))<<8;
   286          if( c>=0xd800 && c<0xe000 ){
   287  #ifdef SQLITE_REPLACE_INVALID_UTF
   288            if( c>=0xdc00 || zIn>=zTerm ){
   289              c = 0xfffd;
   290            }else{
   291              int c2 = *(zIn++);
   292              c2 += (*(zIn++))<<8;
   293              if( c2<0xdc00 || c2>=0xe000 ){
   294                zIn -= 2;
   295                c = 0xfffd;
   296              }else{
   297                c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
   298              }
   299            }
   300  #else
   301            if( zIn<zTerm ){
   302              int c2 = (*zIn++);
   303              c2 += ((*zIn++)<<8);
   304              c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
   305            }
   306  #endif
   307          }
   308          WRITE_UTF8(z, c);
   309        }
   310      }else{
   311        /* UTF-16 Big-endian -> UTF-8 */
   312        while( zIn<zTerm ){
   313          c = (*(zIn++))<<8;
   314          c += *(zIn++);
   315          if( c>=0xd800 && c<0xe000 ){
   316  #ifdef SQLITE_REPLACE_INVALID_UTF
   317            if( c>=0xdc00 || zIn>=zTerm ){
   318              c = 0xfffd;
   319            }else{
   320              int c2 = (*(zIn++))<<8;
   321              c2 += *(zIn++);
   322              if( c2<0xdc00 || c2>=0xe000 ){
   323                zIn -= 2;
   324                c = 0xfffd;
   325              }else{
   326                c = ((c&0x3ff)<<10) + (c2&0x3ff) + 0x10000;
   327              }
   328            }
   329  #else
   330            if( zIn<zTerm ){
   331              int c2 = ((*zIn++)<<8);
   332              c2 += (*zIn++);
   333              c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);
   334            }
   335  #endif
   336          }
   337          WRITE_UTF8(z, c);
   338        }
   339      }
   340      pMem->n = (int)(z - zOut);
   341    }
   342    *z = 0;
   343    assert( (pMem->n+(desiredEnc==SQLITE_UTF8?1:2))<=len );
   344  
345 c = pMem->flags; 346 sqlite3VdbeMemRelease(pMem); 347 pMem->flags = MEM_Str|MEM_Term|(c&(MEM_AffMask|MEM_Subtype));
348 pMem->enc = desiredEnc; 349 pMem->z = (char*)zOut; 350 pMem->zMalloc = pMem->z; 351 pMem->szMalloc = sqlite3DbMallocSize(pMem->db, pMem->z); 352 353 translate_out: 354 #if defined(TRANSLATE_TRACE) && defined(SQLITE_DEBUG) 355 { 356 StrAccum acc; 357 char zBuf[1000]; 358 sqlite3StrAccumInit(&acc, 0, zBuf, sizeof(zBuf), 0); 359 sqlite3VdbeMemPrettyPrint(pMem, &acc); 360 fprintf(stderr, "OUTPUT: %s\n", sqlite3StrAccumFinish(&acc)); 361 } 362 #endif 363 return SQLITE_OK; 364 } 365 #endif /* SQLITE_OMIT_UTF16 */ 366 367 #ifndef SQLITE_OMIT_UTF16 368 /* 369 ** This routine checks for a byte-order mark at the beginning of the 370 ** UTF-16 string stored in *pMem. If one is present, it is removed and 371 ** the encoding of the Mem adjusted. This routine does not do any 372 ** byte-swapping, it just sets Mem.enc appropriately. 373 ** 374 ** The allocation (static, dynamic etc.) and encoding of the Mem may be 375 ** changed by this function. 376 */ 377 int sqlite3VdbeMemHandleBom(Mem *pMem){ 378 int rc = SQLITE_OK; 379 u8 bom = 0; 380 381 assert( pMem->n>=0 ); 382 if( pMem->n>1 ){ 383 u8 b1 = *(u8 *)pMem->z; 384 u8 b2 = *(((u8 *)pMem->z) + 1); 385 if( b1==0xFE && b2==0xFF ){ 386 bom = SQLITE_UTF16BE; 387 } 388 if( b1==0xFF && b2==0xFE ){ 389 bom = SQLITE_UTF16LE; 390 } 391 } 392 393 if( bom ){ 394 rc = sqlite3VdbeMemMakeWriteable(pMem); 395 if( rc==SQLITE_OK ){ 396 pMem->n -= 2; 397 memmove(pMem->z, &pMem->z[2], pMem->n); 398 pMem->z[pMem->n] = '\0'; 399 pMem->z[pMem->n+1] = '\0'; 400 pMem->flags |= MEM_Term; 401 pMem->enc = bom; 402 } 403 } 404 return rc; 405 } 406 #endif /* SQLITE_OMIT_UTF16 */ 407 408 /* 409 ** pZ is a UTF-8 encoded unicode string. If nByte is less than zero, 410 ** return the number of unicode characters in pZ up to (but not including) 411 ** the first 0x00 byte. If nByte is not less than zero, return the 412 ** number of unicode characters in the first nByte of pZ (or up to 413 ** the first 0x00, whichever comes first). 
414 */ 415 int sqlite3Utf8CharLen(const char *zIn, int nByte){ 416 int r = 0; 417 const u8 *z = (const u8*)zIn; 418 const u8 *zTerm; 419 if( nByte>=0 ){ 420 zTerm = &z[nByte]; 421 }else{ 422 zTerm = (const u8*)(-1); 423 } 424 assert( z<=zTerm ); 425 while( *z!=0 && z<zTerm ){ 426 SQLITE_SKIP_UTF8(z); 427 r++; 428 } 429 return r; 430 } 431 432 /* This test function is not currently used by the automated test-suite. 433 ** Hence it is only available in debug builds. 434 */ 435 #if defined(SQLITE_TEST) && defined(SQLITE_DEBUG) 436 /* 437 ** Translate UTF-8 to UTF-8. 438 ** 439 ** This has the effect of making sure that the string is well-formed 440 ** UTF-8. Miscoded characters are removed. 441 ** 442 ** The translation is done in-place and aborted if the output 443 ** overruns the input. 444 */ 445 int sqlite3Utf8To8(unsigned char *zIn){ 446 unsigned char *zOut = zIn; 447 unsigned char *zStart = zIn; 448 u32 c; 449 450 while( zIn[0] && zOut<=zIn ){ 451 c = sqlite3Utf8Read((const u8**)&zIn); 452 if( c!=0xfffd ){ 453 WRITE_UTF8(zOut, c); 454 } 455 } 456 *zOut = 0; 457 return (int)(zOut - zStart); 458 } 459 #endif 460 461 #ifndef SQLITE_OMIT_UTF16 462 /* 463 ** Convert a UTF-16 string in the native encoding into a UTF-8 string. 464 ** Memory to hold the UTF-8 string is obtained from sqlite3_malloc and must 465 ** be freed by the calling function. 466 ** 467 ** NULL is returned if there is an allocation error. 
468 */ 469 char *sqlite3Utf16to8(sqlite3 *db, const void *z, int nByte, u8 enc){ 470 Mem m; 471 memset(&m, 0, sizeof(m)); 472 m.db = db; 473 sqlite3VdbeMemSetStr(&m, z, nByte, enc, SQLITE_STATIC); 474 sqlite3VdbeChangeEncoding(&m, SQLITE_UTF8); 475 if( db->mallocFailed ){ 476 sqlite3VdbeMemRelease(&m); 477 m.z = 0; 478 } 479 assert( (m.flags & MEM_Term)!=0 || db->mallocFailed ); 480 assert( (m.flags & MEM_Str)!=0 || db->mallocFailed ); 481 assert( m.z || db->mallocFailed ); 482 return m.z; 483 } 484 485 /* 486 ** zIn is a UTF-16 encoded unicode string at least nChar characters long. 487 ** Return the number of bytes in the first nChar unicode characters 488 ** in pZ. nChar must be non-negative. 489 */ 490 int sqlite3Utf16ByteLen(const void *zIn, int nChar){ 491 int c; 492 unsigned char const *z = zIn; 493 int n = 0; 494 495 if( SQLITE_UTF16NATIVE==SQLITE_UTF16LE ) z++; 496 while( n<nChar ){ 497 c = z[0]; 498 z += 2; 499 if( c>=0xd8 && c<0xdc && z[0]>=0xdc && z[0]<0xe0 ) z += 2; 500 n++; 501 } 502 return (int)(z-(unsigned char const *)zIn) 503 - (SQLITE_UTF16NATIVE==SQLITE_UTF16LE); 504 } 505 506 #if defined(SQLITE_TEST) 507 /* 508 ** This routine is called from the TCL test function "translate_selftest". 509 ** It checks that the primitives for serializing and deserializing 510 ** characters in each encoding are inverses of each other. 511 */ 512 void sqlite3UtfSelfTest(void){ 513 unsigned int i, t; 514 unsigned char zBuf[20]; 515 unsigned char *z; 516 int n; 517 unsigned int c; 518 519 for(i=0; i<0x00110000; i++){ 520 z = zBuf; 521 WRITE_UTF8(z, i); 522 n = (int)(z-zBuf); 523 assert( n>0 && n<=4 ); 524 z[0] = 0; 525 z = zBuf; 526 c = sqlite3Utf8Read((const u8**)&z); 527 t = i; 528 if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD; 529 if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD; 530 assert( c==t ); 531 assert( (z-zBuf)==n ); 532 } 533 } 534 #endif /* SQLITE_TEST */ 535 #endif /* SQLITE_OMIT_UTF16 */