SQLite

Check-in [9a059cb6bc]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Rework the UTF8 reader logic in order to avoid the use of malloc(). Ticket #2523. (CVS 4175)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 9a059cb6bced5cdc950f7816602ac92d89a899be
User & Date: drh 2007-07-23 19:12:42.000
Context
2007-07-23
19:26
Check the return value of sqlite3PagerWrite() when autovacuuming. Ticket #2524. (CVS 4176) (check-in: b4a5c62b85 user: drh tags: trunk)
19:12
Rework the UTF8 reader logic in order to avoid the use of malloc(). Ticket #2523. (CVS 4175) (check-in: 9a059cb6bc user: drh tags: trunk)
2007-07-22
19:10
Fix a bad sizeof in vdbe.c. Ticket #2522. (CVS 4174) (check-in: 77ebc3feb0 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/func.c.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27

28
29
30
31
32
33
34
** This file contains the C functions that implement various SQL
** functions of SQLite.  
**
** There is only one exported symbol in this file - the function
** sqliteRegisterBuildinFunctions() found at the bottom of the file.
** All other code has file scope.
**
** $Id: func.c,v 1.161 2007/06/22 15:21:16 danielk1977 Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>
/* #include <math.h> */
#include <stdlib.h>
#include <assert.h>
#include "vdbeInt.h"
#include "os.h"


/*
** Return the collating function associated with a function.
*/
static CollSeq *sqlite3GetFuncCollSeq(sqlite3_context *context){
  return context->pColl;
}







|








>







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
** This file contains the C functions that implement various SQL
** functions of SQLite.  
**
** There is only one exported symbol in this file - the function
** sqliteRegisterBuildinFunctions() found at the bottom of the file.
** All other code has file scope.
**
** $Id: func.c,v 1.162 2007/07/23 19:12:42 drh Exp $
*/
#include "sqliteInt.h"
#include <ctype.h>
/* #include <math.h> */
#include <stdlib.h>
#include <assert.h>
#include "vdbeInt.h"
#include "os.h"


/*
** Return the collating function associated with a function.
*/
static CollSeq *sqlite3GetFuncCollSeq(sqlite3_context *context){
  return context->pColl;
}
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
/* The correct SQL-92 behavior is for the LIKE operator to ignore
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
/* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };

/*
** Read a single UTF-8 character and return its value.
*/
u32 sqlite3ReadUtf8(const unsigned char *z){
  u32 c;
  SQLITE_READ_UTF8(z, c);
  return c;
}

/*
** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" expression.  Return true (1) if they
** are the same and false (0) if they are different.
**
** Globbing rules:
**







<
<
<
<
<
<
<
<
<







394
395
396
397
398
399
400









401
402
403
404
405
406
407
/* The correct SQL-92 behavior is for the LIKE operator to ignore
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
/* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };










/*
** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" expression.  Return true (1) if they
** are the same and false (0) if they are different.
**
** Globbing rules:
**
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454

455
456
457
458
459
460
461


462
463
464

465
466
467
468
469

470
471
472
473
474

475
476
477
478
479



480
481

482

483
484
485
486
487
488
489

490
491
492

493
494
495
496
497
498
499
500
501



502
503
504
505
506
507
508
509
510
511

512
513
514
515
516
517


518
519
520
521
522

523
524
525
526

527
528
529



530
531
532
533
534
535
536
537
538
539
540
*/
static int patternCompare(
  const u8 *zPattern,              /* The glob pattern */
  const u8 *zString,               /* The string to compare against the glob */
  const struct compareInfo *pInfo, /* Information about how to do the compare */
  const int esc                    /* The escape character */
){
  register int c;
  int invert;
  int seen;
  int c2;
  u8 matchOne = pInfo->matchOne;
  u8 matchAll = pInfo->matchAll;
  u8 matchSet = pInfo->matchSet;
  u8 noCase = pInfo->noCase; 
  int prevEscape = 0;     /* True if the previous character was 'escape' */

  while( (c = *zPattern)!=0 ){
    if( !prevEscape && c==matchAll ){

      while( (c=zPattern[1]) == matchAll || c == matchOne ){
        if( c==matchOne ){
          if( *zString==0 ) return 0;
          SQLITE_SKIP_UTF8(zString);
        }
        zPattern++;
      }


      if( c && esc && sqlite3ReadUtf8(&zPattern[1])==esc ){
        u8 const *zTemp = &zPattern[1];
        SQLITE_SKIP_UTF8(zTemp);

        c = *zTemp;
      }
      if( c==0 ) return 1;
      if( c==matchSet ){
        assert( esc==0 );   /* This is GLOB, not LIKE */

        while( *zString && patternCompare(&zPattern[1],zString,pInfo,esc)==0 ){
          SQLITE_SKIP_UTF8(zString);
        }
        return *zString!=0;
      }else{

        while( (c2 = *zString)!=0 ){
          if( noCase ){
            c2 = sqlite3UpperToLower[c2];
            c = sqlite3UpperToLower[c];
            while( c2 != 0 && c2 != c ){ c2 = sqlite3UpperToLower[*++zString]; }



          }else{
            while( c2 != 0 && c2 != c ){ c2 = *++zString; }

          }

          if( c2==0 ) return 0;
          if( patternCompare(&zPattern[1],zString,pInfo,esc) ) return 1;
          SQLITE_SKIP_UTF8(zString);
        }
        return 0;
      }
    }else if( !prevEscape && c==matchOne ){

      if( *zString==0 ) return 0;
      SQLITE_SKIP_UTF8(zString);
      zPattern++;

    }else if( c==matchSet ){
      int prior_c = 0;
      assert( esc==0 );    /* This only occurs for GLOB, not LIKE */
      seen = 0;
      invert = 0;
      c = sqlite3ReadUtf8(zString);
      if( c==0 ) return 0;
      c2 = *++zPattern;
      if( c2=='^' ){ invert = 1; c2 = *++zPattern; }



      if( c2==']' ){
        if( c==']' ) seen = 1;
        c2 = *++zPattern;
      }
      while( (c2 = sqlite3ReadUtf8(zPattern))!=0 && c2!=']' ){
        if( c2=='-' && zPattern[1]!=']' && zPattern[1]!=0 && prior_c>0 ){
          zPattern++;
          c2 = sqlite3ReadUtf8(zPattern);
          if( c>=prior_c && c<=c2 ) seen = 1;
          prior_c = 0;

        }else if( c==c2 ){
          seen = 1;
          prior_c = c2;
        }else{
          prior_c = c2;
        }


        SQLITE_SKIP_UTF8(zPattern);
      }
      if( c2==0 || (seen ^ invert)==0 ) return 0;
      SQLITE_SKIP_UTF8(zString);
      zPattern++;

    }else if( esc && !prevEscape && sqlite3ReadUtf8(zPattern)==esc){
      prevEscape = 1;
      SQLITE_SKIP_UTF8(zPattern);
    }else{

      if( noCase ){
        if( sqlite3UpperToLower[c] != sqlite3UpperToLower[*zString] ) return 0;
      }else{



        if( c != *zString ) return 0;
      }
      zPattern++;
      zString++;
      prevEscape = 0;
    }
  }
  return *zString==0;
}

/*







|


<






|

>
|
|
|
<

<

>
>
|
|
<
>
|
|
<
|
|
>
|



<
>
|
|
|
|
|
>
>
>
|
|
>

>
|
|
<
|
|
<

>
|
<
<
>





|

|
|
>
>
>


|

|
|
<
|


>
|
|
<
<
<
|
>
>
|

|
<
|
>
|

<

>

|
<
>
>
>
|

<
<







428
429
430
431
432
433
434
435
436
437

438
439
440
441
442
443
444
445
446
447
448
449

450

451
452
453
454
455

456
457
458

459
460
461
462
463
464
465

466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481

482
483

484
485
486


487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

506
507
508
509
510
511



512
513
514
515
516
517

518
519
520
521

522
523
524
525

526
527
528
529
530


531
532
533
534
535
536
537
*/
static int patternCompare(
  const u8 *zPattern,              /* The glob pattern */
  const u8 *zString,               /* The string to compare against the glob */
  const struct compareInfo *pInfo, /* Information about how to do the compare */
  const int esc                    /* The escape character */
){
  int c, c2;
  int invert;
  int seen;

  u8 matchOne = pInfo->matchOne;
  u8 matchAll = pInfo->matchAll;
  u8 matchSet = pInfo->matchSet;
  u8 noCase = pInfo->noCase; 
  int prevEscape = 0;     /* True if the previous character was 'escape' */

  while( (c = sqlite3Utf8Read(zPattern,0,&zPattern))!=0 ){
    if( !prevEscape && c==matchAll ){
      while( (c=sqlite3Utf8Read(zPattern,0,&zPattern)) == matchAll
               || c == matchOne ){
        if( c==matchOne && sqlite3Utf8Read(zString, 0, &zString)==0 ){
          return 0;

        }

      }
      if( c==0 ){
        return 1;
      }else if( c==esc ){
        c = sqlite3Utf8Read(zPattern, 0, &zPattern);

        if( c==0 ){
          return 0;
        }

      }else if( c==matchSet ){
        assert( esc==0 );         /* This is GLOB, not LIKE */
        assert( matchSet<0x80 );  /* '[' is a single-byte character */
        while( *zString && patternCompare(&zPattern[-1],zString,pInfo,esc)==0 ){
          SQLITE_SKIP_UTF8(zString);
        }
        return *zString!=0;

      }
      while( (c2 = sqlite3Utf8Read(zString,0,&zString))!=0 ){
        if( noCase ){
          c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2;
          c = c<0x80 ? sqlite3UpperToLower[c] : c;
          while( c2 != 0 && c2 != c ){
            c2 = sqlite3Utf8Read(zString, 0, &zString);
            if( c2<0x80 ) c2 = sqlite3UpperToLower[c2];
          }
        }else{
          while( c2 != 0 && c2 != c ){
            c2 = sqlite3Utf8Read(zString, 0, &zString);
          }
        }
        if( c2==0 ) return 0;
        if( patternCompare(zPattern,zString,pInfo,esc) ) return 1;

      }
      return 0;

    }else if( !prevEscape && c==matchOne ){
      if( sqlite3Utf8Read(zString, 0, &zString)==0 ){
        return 0;


      }
    }else if( c==matchSet ){
      int prior_c = 0;
      assert( esc==0 );    /* This only occurs for GLOB, not LIKE */
      seen = 0;
      invert = 0;
      c = sqlite3Utf8Read(zString, 0, &zString);
      if( c==0 ) return 0;
      c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
      if( c2=='^' ){
        invert = 1;
        c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
      }
      if( c2==']' ){
        if( c==']' ) seen = 1;
        c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
      }
      while( c2 && c2!=']' ){
        if( c2=='-' && zPattern[0]!=']' && zPattern[0]!=0 && prior_c>0 ){

          c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
          if( c>=prior_c && c<=c2 ) seen = 1;
          prior_c = 0;
        }else{
          if( c==c2 ){
            seen = 1;



          }
          prior_c = c2;
        }
        c2 = sqlite3Utf8Read(zPattern, 0, &zPattern);
      }
      if( c2==0 || (seen ^ invert)==0 ){

        return 0;
      }
    }else if( esc==c && !prevEscape ){
      prevEscape = 1;

    }else{
      c2 = sqlite3Utf8Read(zString, 0, &zString);
      if( noCase ){
        c = c<0x80 ? sqlite3UpperToLower[c] : c;

        c2 = c2<0x80 ? sqlite3UpperToLower[c2] : c2;
      }
      if( c!=c2 ){
        return 0;
      }


      prevEscape = 0;
    }
  }
  return *zString==0;
}

/*
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
    const unsigned char *zEsc = sqlite3_value_text(argv[2]);
    if( zEsc==0 ) return;
    if( sqlite3Utf8CharLen((char*)zEsc, -1)!=1 ){
      sqlite3_result_error(context, 
          "ESCAPE expression must be a single character", -1);
      return;
    }
    escape = sqlite3ReadUtf8(zEsc);
  }
  if( zA && zB ){
    struct compareInfo *pInfo = sqlite3_user_data(context);
#ifdef SQLITE_TEST
    sqlite3_like_count++;
#endif
    







|







583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
    const unsigned char *zEsc = sqlite3_value_text(argv[2]);
    if( zEsc==0 ) return;
    if( sqlite3Utf8CharLen((char*)zEsc, -1)!=1 ){
      sqlite3_result_error(context, 
          "ESCAPE expression must be a single character", -1);
      return;
    }
    escape = sqlite3Utf8Read(zEsc, 0, &zEsc);
  }
  if( zA && zB ){
    struct compareInfo *pInfo = sqlite3_user_data(context);
#ifdef SQLITE_TEST
    sqlite3_like_count++;
#endif
    
Changes to src/sqliteInt.h.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
** 2001 September 15
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** Internal interface definitions for SQLite.
**
** @(#) $Id: sqliteInt.h,v 1.578 2007/06/26 10:38:55 danielk1977 Exp $
*/
#ifndef _SQLITEINT_H_
#define _SQLITEINT_H_
#include "sqliteLimit.h"


#if defined(SQLITE_TCL) || defined(TCLSH)













|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/*
** 2001 September 15
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** Internal interface definitions for SQLite.
**
** @(#) $Id: sqliteInt.h,v 1.579 2007/07/23 19:12:42 drh Exp $
*/
#ifndef _SQLITEINT_H_
#define _SQLITEINT_H_
#include "sqliteLimit.h"


#if defined(SQLITE_TCL) || defined(TCLSH)
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
 * This global flag is set for performance testing of triggers. When it is set
 * SQLite will perform the overhead of building new and old trigger references 
 * even when no triggers exist
 */
extern int sqlite3_always_code_trigger_setup;

/*
** A lookup table used by the SQLITE_READ_UTF8 macro.  The definition
** is in utf.c.
*/
extern const unsigned char sqlite3UtfTrans1[];

/*
** Macros for reading UTF8 characters.
**
** SQLITE_READ_UTF8(x,c) reads a single UTF8 value out of x and writes
** that value into c.  The type of x must be unsigned char*.  The type
** of c must be unsigned int.
**
** SQLITE_SKIP_UTF8(x) advances x forward by one character.  The type of
** x must be unsigned char*.
**
** Notes On Invalid UTF-8:
**
**  *  These macros never allow a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  These macros never allow a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xe000 then it is rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  These routines accept an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  They do not change over-length
**     encodings to 0xfffd as some systems recommend.
** 
*/
#define SQLITE_READ_UTF8(zIn, c) {                     \
  c = *(zIn++);                                        \
  if( c>=0xc0 ){                                       \
    c = sqlite3UtfTrans1[c-0xc0];                      \
    while( (*zIn & 0xc0)==0x80 ){                      \
      c = (c<<6) + (0x3f & *(zIn++));                  \
    }                                                  \
    if( c<0x80                                         \
        || (c&0xFFFFF800)==0xD800                      \
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }    \
  }                                                    \
}
#define SQLITE_SKIP_UTF8(zIn) {                        \
  if( (*(zIn++))>=0xc0 ){                              \
    while( (*zIn & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}




/*
** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production
** builds) or a function call (for debugging).  If it is a function call,
** it allows the operator to set a breakpoint at the spot where database
** corruption is first detected.
*/







<
<
<
<
|
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<

<
<
<
<
<
<
<
<
<
<
<
<





<
<
<







1552
1553
1554
1555
1556
1557
1558




1559

1560



























1561












1562
1563
1564
1565
1566



1567
1568
1569
1570
1571
1572
1573
 * This global flag is set for performance testing of triggers. When it is set
 * SQLite will perform the overhead of building new and old trigger references 
 * even when no triggers exist
 */
extern int sqlite3_always_code_trigger_setup;

/*




** Assuming zIn points to the first byte of a UTF-8 character,

** advance zIn to point to the first byte of the next UTF-8 character.



























*/












#define SQLITE_SKIP_UTF8(zIn) {                        \
  if( (*(zIn++))>=0xc0 ){                              \
    while( (*zIn & 0xc0)==0x80 ){ zIn++; }             \
  }                                                    \
}




/*
** The SQLITE_CORRUPT_BKPT macro can be either a constant (for production
** builds) or a function call (for debugging).  If it is a function call,
** it allows the operator to set a breakpoint at the spot where database
** corruption is first detected.
*/
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
int sqlite3FixTriggerStep(DbFixer*, TriggerStep*);
int sqlite3AtoF(const char *z, double*);
char *sqlite3_snprintf(int,char*,const char*,...);
int sqlite3GetInt32(const char *, int*);
int sqlite3FitsIn64Bits(const char *);
int sqlite3Utf16ByteLen(const void *pData, int nChar);
int sqlite3Utf8CharLen(const char *pData, int nByte);
u32 sqlite3ReadUtf8(const unsigned char *);
int sqlite3PutVarint(unsigned char *, u64);
int sqlite3GetVarint(const unsigned char *, u64 *);
int sqlite3GetVarint32(const unsigned char *, u32 *);
int sqlite3VarintLen(u64 v);
void sqlite3IndexAffinityStr(Vdbe *, Index *);
void sqlite3TableAffinityStr(Vdbe *, Table *);
char sqlite3CompareAffinity(Expr *pExpr, char aff2);







|







1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
int sqlite3FixTriggerStep(DbFixer*, TriggerStep*);
int sqlite3AtoF(const char *z, double*);
char *sqlite3_snprintf(int,char*,const char*,...);
int sqlite3GetInt32(const char *, int*);
int sqlite3FitsIn64Bits(const char *);
int sqlite3Utf16ByteLen(const void *pData, int nChar);
int sqlite3Utf8CharLen(const char *pData, int nByte);
int sqlite3Utf8Read(const u8*, const u8*, const u8**);
int sqlite3PutVarint(unsigned char *, u64);
int sqlite3GetVarint(const unsigned char *, u64 *);
int sqlite3GetVarint32(const unsigned char *, u32 *);
int sqlite3VarintLen(u64 v);
void sqlite3IndexAffinityStr(Vdbe *, Index *);
void sqlite3TableAffinityStr(Vdbe *, Table *);
char sqlite3CompareAffinity(Expr *pExpr, char aff2);
Changes to src/utf.c.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8, 
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.51 2007/05/23 16:23:09 danielk1977 Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx







|







8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file contains routines used to translate between UTF-8, 
** UTF-16, UTF-16BE, and UTF-16LE.
**
** $Id: utf.c,v 1.52 2007/07/23 19:12:42 drh Exp $
**
** Notes on UTF-8:
**
**   Byte-0    Byte-1    Byte-2    Byte-3    Value
**  0xxxxxxx                                 00000000 00000000 0xxxxxxx
**  110yyyyy  10xxxxxx                       00000000 00000yyy yyxxxxxx
**  1110zzzz  10yyyyyy  10xxxxxx             00000000 zzzzyyyy yyxxxxxx
55
56
57
58
59
60
61

62
63
64
65
66
67
68
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};


#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (c&0xFF);                                \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + ((c>>6)&0x1F);                    \







>







55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
};


#define WRITE_UTF8(zOut, c) {                          \
  if( c<0x00080 ){                                     \
    *zOut++ = (c&0xFF);                                \
  }                                                    \
  else if( c<0x00800 ){                                \
    *zOut++ = 0xC0 + ((c>>6)&0x1F);                    \
121
122
123
124
125
126
127
















































128
129
130
131
132
133
134
  if( c>=0xD800 && c<0xE000 ){                                       \
    int c2 = ((*zIn++)<<8);                                           \
    c2 += (*zIn++);                                                   \
    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
    if( (c & 0xFFFF0000)==0 ) c = 0xFFFD;                             \
  }                                                                   \
}

















































/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/ 
/* #define TRANSLATE_TRACE 1 */








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
  if( c>=0xD800 && c<0xE000 ){                                       \
    int c2 = ((*zIn++)<<8);                                           \
    c2 += (*zIn++);                                                   \
    c = (c2&0x03FF) + ((c&0x003F)<<10) + (((c&0x03C0)+0x0040)<<10);   \
    if( (c & 0xFFFF0000)==0 ) c = 0xFFFD;                             \
  }                                                                   \
}

/*
** Translate a single UTF-8 character.  Return the unicode value.
**
** During translation, assume that the byte that zTerm points
** is a 0x00.
**
** Write a pointer to the next unread byte back into *pzNext.
**
** Notes On Invalid UTF-8:
**
**  *  This routine never allows a 7-bit character (0x00 through 0x7f) to
**     be encoded as a multi-byte character.  Any multi-byte character that
**     attempts to encode a value between 0x00 and 0x7f is rendered as 0xfffd.
**
**  *  This routine never allows a UTF16 surrogate value to be encoded.
**     If a multi-byte character attempts to encode a value between
**     0xd800 and 0xe000 then it is rendered as 0xfffd.
**
**  *  Bytes in the range of 0x80 through 0xbf which occur as the first
**     byte of a character are interpreted as single-byte characters
**     and rendered as themselves even though they are technically
**     invalid characters.
**
**  *  This routine accepts an infinite number of different UTF8 encodings
**     for unicode values 0x80 and greater.  It do not change over-length
**     encodings to 0xfffd as some systems recommend.
*/
int sqlite3Utf8Read(
  const unsigned char *z,         /* First byte of UTF-8 character */
  const unsigned char *zTerm,     /* Pretend this byte is 0x00 */
  const unsigned char **pzNext    /* Write first byte past UTF-8 char here */
){
  int c = *(z++);
  if( c>=0xc0 ){
    c = sqlite3UtfTrans1[c-0xc0];
    while( z!=zTerm && (*z & 0xc0)==0x80 ){
      c = (c<<6) + (0x3f & *(z++));
    }
    if( c<0x80
        || (c&0xFFFFF800)==0xD800
        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }
  }
  *pzNext = z;
  return c;
}



/*
** If the TRANSLATE_TRACE macro is defined, the value of each Mem is
** printed on stderr on the way into and out of sqlite3VdbeMemTranslate().
*/ 
/* #define TRANSLATE_TRACE 1 */

215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
    if( !zOut ) return SQLITE_NOMEM;
  }else{
    zOut = zShort;
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
    unsigned int iExtra = 0xD800;

    if( 0==(pMem->flags&MEM_Term) && zTerm>zIn && (zTerm[-1]&0x80) ){
      /* This UTF8 string is not nul-terminated, and the last byte is
      ** not a character in the ascii range (codpoints 0..127). This
      ** means the SQLITE_READ_UTF8() macro might read past the end
      ** of the allocated buffer.
      **
      ** There are four possibilities:
      **
      **   1. The last byte is the first byte of a non-ASCII character,
      **
      **   2. The final N bytes of the input string are continuation bytes
      **      and immediately preceding them is the first byte of a 
      **      non-ASCII character.
      **
      **   3. The final N bytes of the input string are continuation bytes
      **      and immediately preceding them is a byte that encodes a 
      **      character in the ASCII range.
      **
      **   4. The entire string consists of continuation characters.
      **
      ** Cases (3) and (4) require no special handling. The SQLITE_READ_UTF8()
      ** macro will not overread the buffer in these cases.
      */
      unsigned char *zExtra = &zTerm[-1];
      while( zExtra>zIn && (zExtra[0]&0xC0)==0x80 ){
        zExtra--;
      }

      if( (zExtra[0]&0xC0)==0xC0 ){
        /* Make a copy of the last character encoding in the input string.
        ** Then make sure it is nul-terminated and use SQLITE_READ_UTF8()
        ** to decode the codepoint. Store the codepoint in variable iExtra,
        ** it will be appended to the output string later.
        */
        unsigned char *zFree = 0;
        unsigned char zBuf[16];
        int nExtra = (pMem->n+zIn-zExtra);
        zTerm = zExtra;
        if( nExtra>15 ){
          zExtra = sqliteMallocRaw(nExtra+1);
          if( !zExtra ){
            return SQLITE_NOMEM;
          }
          zFree = zExtra;
        }else{
          zExtra = zBuf;
        }
        memcpy(zExtra, zTerm, nExtra);
        zExtra[nExtra] = '\0';
        SQLITE_READ_UTF8(zExtra, iExtra);
        sqliteFree(zFree);
      }
    }

    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        SQLITE_READ_UTF8(zIn, c); 
        WRITE_UTF16LE(z, c);
      }
      if( iExtra!=0xD800 ){
        WRITE_UTF16LE(z, iExtra);
      }
    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        SQLITE_READ_UTF8(zIn, c); 
        WRITE_UTF16BE(z, c);
      }
      if( iExtra!=0xD800 ){
        WRITE_UTF16BE(z, iExtra);
      }
    }
    pMem->n = z - zOut;
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<



|


<
<
<




|


<
<
<







264
265
266
267
268
269
270
























































271
272
273
274
275
276



277
278
279
280
281
282
283



284
285
286
287
288
289
290
    if( !zOut ) return SQLITE_NOMEM;
  }else{
    zOut = zShort;
  }
  z = zOut;

  if( pMem->enc==SQLITE_UTF8 ){
























































    if( desiredEnc==SQLITE_UTF16LE ){
      /* UTF-8 -> UTF-16 Little-endian */
      while( zIn<zTerm ){
        c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
        WRITE_UTF16LE(z, c);
      }



    }else{
      assert( desiredEnc==SQLITE_UTF16BE );
      /* UTF-8 -> UTF-16 Big-endian */
      while( zIn<zTerm ){
        c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);
        WRITE_UTF16BE(z, c);
      }



    }
    pMem->n = z - zOut;
    *z++ = 0;
  }else{
    assert( desiredEnc==SQLITE_UTF8 );
    if( pMem->enc==SQLITE_UTF16LE ){
      /* UTF-16 Little-endian -> UTF-8 */
473
474
475
476
477
478
479

480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503

504
505
506
507
508
509
510
511

512
513
514
515
516
517
518
519
520
**
** The translation is done in-place (since it is impossible for the
** correct UTF-8 encoding to be longer than a malformed encoding).
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;

  int c;

  while(1){
    SQLITE_READ_UTF8(zIn, c);
    if( c==0 ) break;
    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return zOut - zStart;
}
#endif

#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
*/
void sqlite3UtfSelfTest(){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;

  int n;
  unsigned int c;

  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z[0] = 0;

    z = zBuf;
    SQLITE_READ_UTF8(z, c);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){







>
|

|
|
<



















>








>

|







460
461
462
463
464
465
466
467
468
469
470
471

472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
**
** The translation is done in-place (since it is impossible for the
** correct UTF-8 encoding to be longer than a malformed encoding).
*/
int sqlite3Utf8To8(unsigned char *zIn){
  unsigned char *zOut = zIn;
  unsigned char *zStart = zIn;
  unsigned char *zTerm;
  u32 c;

  while( zIn[0] ){
    c = sqlite3Utf8Read(zIn, zTerm, (const u8**)&zIn);

    if( c!=0xfffd ){
      WRITE_UTF8(zOut, c);
    }
  }
  *zOut = 0;
  return zOut - zStart;
}
#endif

#if defined(SQLITE_TEST)
/*
** This routine is called from the TCL test function "translate_selftest".
** It checks that the primitives for serializing and deserializing
** characters in each encoding are inverses of each other.
*/
void sqlite3UtfSelfTest(){
  unsigned int i, t;
  unsigned char zBuf[20];
  unsigned char *z;
  unsigned char *zTerm;
  int n;
  unsigned int c;

  for(i=0; i<0x00110000; i++){
    z = zBuf;
    WRITE_UTF8(z, i);
    n = z-zBuf;
    z[0] = 0;
    zTerm = z;
    z = zBuf;
    c = sqlite3Utf8Read(z, zTerm, (const u8**)&z);
    t = i;
    if( i>=0xD800 && i<=0xDFFF ) t = 0xFFFD;
    if( (i&0xFFFFFFFE)==0xFFFE ) t = 0xFFFD;
    assert( c==t );
    assert( (z-zBuf)==n );
  }
  for(i=0; i<0x00110000; i++){