/ Changes On Branch normalize
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Changes In Branch normalize Excluding Merge-Ins

This is equivalent to a diff from 90cb01d8 to 658f4225

2018-01-24
15:07
Add the normalize.c extension. (check-in: 16ebe558 user: drh tags: trunk)
2018-01-08
20:04
Test cases for sqlite3_normalize(). (Closed-Leaf check-in: 658f4225 user: drh tags: normalize)
19:29
Special handling of the NULL keyword. Sometimes it is a literal, and sometimes it is a keyword. (check-in: db5d138e user: drh tags: normalize)
16:54
First code for an auxiliary function that will normalize an SQL statement. (check-in: 84814aac user: drh tags: normalize)
14:24
Fix compilation of 'sqltclsh.exe' for MSVC. (check-in: abb112d9 user: mistachkin tags: trunk)
2018-01-07
23:28
Avoid the use of utimensat() on older unix platforms. (check-in: 90cb01d8 user: drh tags: trunk)
21:58
Remove the dependency on test_windirent.h from the generated shell.c file. (check-in: 0a50c9e3 user: drh tags: trunk)

Changes to Makefile.in.

   436    436     $(TOP)/ext/misc/fuzzer.c \
   437    437     $(TOP)/ext/fts5/fts5_tcl.c \
   438    438     $(TOP)/ext/fts5/fts5_test_mi.c \
   439    439     $(TOP)/ext/fts5/fts5_test_tok.c \
   440    440     $(TOP)/ext/misc/ieee754.c \
   441    441     $(TOP)/ext/misc/mmapwarm.c \
   442    442     $(TOP)/ext/misc/nextchar.c \
          443  +  $(TOP)/ext/misc/normalize.c \
   443    444     $(TOP)/ext/misc/percentile.c \
   444    445     $(TOP)/ext/misc/regexp.c \
   445    446     $(TOP)/ext/misc/remember.c \
   446    447     $(TOP)/ext/misc/series.c \
   447    448     $(TOP)/ext/misc/spellfix.c \
   448    449     $(TOP)/ext/misc/totype.c \
   449    450     $(TOP)/ext/misc/unionvtab.c \

Changes to Makefile.msc.

  1496   1496     $(TOP)\ext\misc\fuzzer.c \
  1497   1497     $(TOP)\ext\fts5\fts5_tcl.c \
  1498   1498     $(TOP)\ext\fts5\fts5_test_mi.c \
  1499   1499     $(TOP)\ext\fts5\fts5_test_tok.c \
  1500   1500     $(TOP)\ext\misc\ieee754.c \
  1501   1501     $(TOP)\ext\misc\mmapwarm.c \
  1502   1502     $(TOP)\ext\misc\nextchar.c \
         1503  +  $(TOP)\ext\misc\normalize.c \
  1503   1504     $(TOP)\ext\misc\percentile.c \
  1504   1505     $(TOP)\ext\misc\regexp.c \
  1505   1506     $(TOP)\ext\misc\remember.c \
  1506   1507     $(TOP)\ext\misc\series.c \
  1507   1508     $(TOP)\ext\misc\spellfix.c \
  1508   1509     $(TOP)\ext\misc\totype.c \
  1509   1510     $(TOP)\ext\misc\unionvtab.c \

Added ext/misc/normalize.c.

            1  +/*
            2  +** 2018-01-08
            3  +**
            4  +** The author disclaims copyright to this source code.  In place of
            5  +** a legal notice, here is a blessing:
            6  +**
            7  +**    May you do good and not evil.
            8  +**    May you find forgiveness for yourself and forgive others.
            9  +**    May you share freely, never taking more than you give.
           10  +**
           11  +******************************************************************************
           12  +**
           13  +** This file contains code to implement the sqlite3_normalize() function.
           14  +**
           15  +**    char *sqlite3_normalize(const char *zSql);
           16  +**
           17  +** This function takes an SQL string as input and returns a "normalized"
           18  +** version of that string in memory obtained from sqlite3_malloc64().  The
           19  +** caller is responsible for ensuring that the returned memory is freed.
           20  +**
           21  +** If a memory allocation error occurs, this routine returns NULL.
           22  +**
           23  +** The normalization consists of the following transformations:
           24  +**
           25  +**   (1)  Convert every literal (string, blob literal, numeric constant,
           26  +**        or "NULL" constant) into a ?
           27  +**
           28  +**   (2)  Remove all superfluous whitespace, including comments.  Change
           29  +**        all required whitespace to a single space character.
           30  +**
           31  +**   (3)  Lowercase all ASCII characters.
           32  +**
           33  +**   (4)  If an IN or NOT IN operator is followed by a list of 1 or more
           34  +**        values, convert that list into "(?,?,?)".
           35  +**
           36  +** The purpose of normalization is two-fold:
           37  +**
           38  +**   (1)  Sanitize queries by removing potentially private or sensitive
           39  +**        information contained in literals.
           40  +**
           41  +**   (2)  Identify structurally identical queries by comparing their
           42  +**        normalized forms.
           43  +**
           44  +** Command-Line Utility
           45  +** --------------------
           46  +**
           47  +** This file also contains code for a command-line utility that converts
           48  +** SQL queries in text files into their normalized forms.  To build the
           49  +** command-line program, compile this file with -DSQLITE_NORMALIZE_CLI
           50  +** and link it against the SQLite library.
           51  +*/
           52  +#include <sqlite3.h>
           53  +#include <string.h>
           54  +
           55  +/*
           56  +** Implementation note:
           57  +**
           58  +** Much of the tokenizer logic is copied out of the tokenize.c source file
           59  +** of SQLite.  That logic could be simplified for this particular application,
           60  +** but that would impose a risk of introducing subtle errors.  It is best to
           61  +** keep the code as close to the original as possible.
           62  +**
           63  +** The tokenize code is in sync with the SQLite core as of 2018-01-08.
           64  +** Any future changes to the core tokenizer might require corresponding
           65  +** adjustments to the tokenizer logic in this module.
           66  +*/
           67  +
           68  +
           69  +/* Character classes for tokenizing
           70  +**
           71  +** In the sqlite3GetToken() function, a switch() on aiClass[c] is implemented
           72  +** using a lookup table, whereas a switch() directly on c uses a binary search.
           73  +** The lookup table is much faster.  To maximize speed, and to ensure that
           74  +** a lookup table is used, all of the classes need to be small integers and
           75  +** all of them need to be used within the switch.
           76  +*/
           77  +#define CC_X          0    /* The letter 'x', or start of BLOB literal */
           78  +#define CC_KYWD       1    /* Alphabetics or '_'.  Usable in a keyword */
           79  +#define CC_ID         2    /* unicode characters usable in IDs */
           80  +#define CC_DIGIT      3    /* Digits */
           81  +#define CC_DOLLAR     4    /* '$' */
           82  +#define CC_VARALPHA   5    /* '@', '#', ':'.  Alphabetic SQL variables */
           83  +#define CC_VARNUM     6    /* '?'.  Numeric SQL variables */
           84  +#define CC_SPACE      7    /* Space characters */
           85  +#define CC_QUOTE      8    /* '"', '\'', or '`'.  String literals, quoted ids */
           86  +#define CC_QUOTE2     9    /* '['.   [...] style quoted ids */
           87  +#define CC_PIPE      10    /* '|'.   Bitwise OR or concatenate */
           88  +#define CC_MINUS     11    /* '-'.  Minus or SQL-style comment */
           89  +#define CC_LT        12    /* '<'.  Part of < or <= or <> */
           90  +#define CC_GT        13    /* '>'.  Part of > or >= */
           91  +#define CC_EQ        14    /* '='.  Part of = or == */
           92  +#define CC_BANG      15    /* '!'.  Part of != */
           93  +#define CC_SLASH     16    /* '/'.  / or c-style comment */
           94  +#define CC_LP        17    /* '(' */
           95  +#define CC_RP        18    /* ')' */
           96  +#define CC_SEMI      19    /* ';' */
           97  +#define CC_PLUS      20    /* '+' */
           98  +#define CC_STAR      21    /* '*' */
           99  +#define CC_PERCENT   22    /* '%' */
          100  +#define CC_COMMA     23    /* ',' */
          101  +#define CC_AND       24    /* '&' */
          102  +#define CC_TILDA     25    /* '~' */
          103  +#define CC_DOT       26    /* '.' */
          104  +#define CC_ILLEGAL   27    /* Illegal character */
          105  +
          106  +static const unsigned char aiClass[] = {
          107  +/*         x0  x1  x2  x3  x4  x5  x6  x7  x8  x9  xa  xb  xc  xd  xe  xf */
          108  +/* 0x */   27, 27, 27, 27, 27, 27, 27, 27, 27,  7,  7, 27,  7,  7, 27, 27,
          109  +/* 1x */   27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
          110  +/* 2x */    7, 15,  8,  5,  4, 22, 24,  8, 17, 18, 21, 20, 23, 11, 26, 16,
          111  +/* 3x */    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  5, 19, 12, 14, 13,  6,
          112  +/* 4x */    5,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          113  +/* 5x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  9, 27, 27, 27,  1,
          114  +/* 6x */    8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
          115  +/* 7x */    1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1, 27, 10, 27, 25, 27,
          116  +/* 8x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          117  +/* 9x */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          118  +/* Ax */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          119  +/* Bx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          120  +/* Cx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          121  +/* Dx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          122  +/* Ex */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
          123  +/* Fx */    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
          124  +};
          125  +
          126  +/* An array to map all upper-case characters into their corresponding
          127  +** lower-case character. 
          128  +**
          129  +** SQLite only considers US-ASCII (or EBCDIC) characters.  We do not
          130  +** handle case conversions for the UTF character set since the tables
          131  +** involved are nearly as big or bigger than SQLite itself.
          132  +*/
          133  +static const unsigned char sqlite3UpperToLower[] = {
          134  +      0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
          135  +     18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
          136  +     36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
          137  +     54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 97, 98, 99,100,101,102,103,
          138  +    104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,
          139  +    122, 91, 92, 93, 94, 95, 96, 97, 98, 99,100,101,102,103,104,105,106,107,
          140  +    108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,
          141  +    126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,
          142  +    144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,
          143  +    162,163,164,165,166,167,168,169,170,171,172,173,174,175,176,177,178,179,
          144  +    180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,
          145  +    198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,
          146  +    216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,
          147  +    234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,
          148  +    252,253,254,255
          149  +};
          150  +
          151  +/*
          152  +** The following 256 byte lookup table is used to support SQLites built-in
          153  +** equivalents to the following standard library functions:
          154  +**
          155  +**   isspace()                        0x01
          156  +**   isalpha()                        0x02
          157  +**   isdigit()                        0x04
          158  +**   isalnum()                        0x06
          159  +**   isxdigit()                       0x08
          160  +**   toupper()                        0x20
          161  +**   SQLite identifier character      0x40
          162  +**   Quote character                  0x80
          163  +**
          164  +** Bit 0x20 is set if the mapped character requires translation to upper
          165  +** case. i.e. if the character is a lower-case ASCII character.
          166  +** If x is a lower-case ASCII character, then its upper-case equivalent
          167  +** is (x - 0x20). Therefore toupper() can be implemented as:
          168  +**
          169  +**   (x & ~(map[x]&0x20))
          170  +**
          171  +** The equivalent of tolower() is implemented using the sqlite3UpperToLower[]
          172  +** array. tolower() is used more often than toupper() by SQLite.
          173  +**
          174  +** Bit 0x40 is set if the character is non-alphanumeric and can be used in an 
          175  +** SQLite identifier.  Identifiers are alphanumerics, "_", "$", and any
          176  +** non-ASCII UTF character. Hence the test for whether or not a character is
          177  +** part of an identifier is 0x46.
          178  +*/
          179  +static const unsigned char sqlite3CtypeMap[256] = {
          180  +  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 00..07    ........ */
          181  +  0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,  /* 08..0f    ........ */
          182  +  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 10..17    ........ */
          183  +  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 18..1f    ........ */
          184  +  0x01, 0x00, 0x80, 0x00, 0x40, 0x00, 0x00, 0x80,  /* 20..27     !"#$%&' */
          185  +  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 28..2f    ()*+,-./ */
          186  +  0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c, 0x0c,  /* 30..37    01234567 */
          187  +  0x0c, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 38..3f    89:;<=>? */
          188  +
          189  +  0x00, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x0a, 0x02,  /* 40..47    @ABCDEFG */
          190  +  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 48..4f    HIJKLMNO */
          191  +  0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,  /* 50..57    PQRSTUVW */
          192  +  0x02, 0x02, 0x02, 0x80, 0x00, 0x00, 0x00, 0x40,  /* 58..5f    XYZ[\]^_ */
          193  +  0x80, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x2a, 0x22,  /* 60..67    `abcdefg */
          194  +  0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 68..6f    hijklmno */
          195  +  0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22, 0x22,  /* 70..77    pqrstuvw */
          196  +  0x22, 0x22, 0x22, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 78..7f    xyz{|}~. */
          197  +
          198  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 80..87    ........ */
          199  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 88..8f    ........ */
          200  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 90..97    ........ */
          201  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* 98..9f    ........ */
          202  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a0..a7    ........ */
          203  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* a8..af    ........ */
          204  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b0..b7    ........ */
          205  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* b8..bf    ........ */
          206  +
          207  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c0..c7    ........ */
          208  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* c8..cf    ........ */
          209  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d0..d7    ........ */
          210  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* d8..df    ........ */
          211  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e0..e7    ........ */
          212  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* e8..ef    ........ */
          213  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,  /* f0..f7    ........ */
          214  +  0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40   /* f8..ff    ........ */
          215  +};
          216  +#define sqlite3Toupper(x)   ((x)&~(sqlite3CtypeMap[(unsigned char)(x)]&0x20))
          217  +#define sqlite3Isspace(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x01)
          218  +#define sqlite3Isalnum(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x06)
          219  +#define sqlite3Isalpha(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x02)
          220  +#define sqlite3Isdigit(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x04)
          221  +#define sqlite3Isxdigit(x)  (sqlite3CtypeMap[(unsigned char)(x)]&0x08)
          222  +#define sqlite3Tolower(x)   (sqlite3UpperToLower[(unsigned char)(x)])
          223  +#define sqlite3Isquote(x)   (sqlite3CtypeMap[(unsigned char)(x)]&0x80)
          224  +
          225  +
          226  +/*
          227  +** If X is a character that can be used in an identifier then
          228  +** IdChar(X) will be true.  Otherwise it is false.
          229  +**
          230  +** For ASCII, any character with the high-order bit set is
          231  +** allowed in an identifier.  For 7-bit characters, 
          232  +** sqlite3IsIdChar[X] must be 1.
          233  +**
          234  +** For EBCDIC, the rules are more complex but have the same
          235  +** end result.
          236  +**
          237  +** Ticket #1066.  the SQL standard does not allow '$' in the
          238  +** middle of identifiers.  But many SQL implementations do. 
          239  +** SQLite will allow '$' in identifiers for compatibility.
          240  +** But the feature is undocumented.
          241  +*/
          242  +#define IdChar(C)  ((sqlite3CtypeMap[(unsigned char)C]&0x46)!=0)
          243  +
          244  +/*
          245  +** Ignore testcase() macros
          246  +*/
          247  +#define testcase(X)
          248  +
          249  +/*
          250  +** Token values
          251  +*/
          252  +#define TK_SPACE    0
          253  +#define TK_NAME     1
          254  +#define TK_LITERAL  2
          255  +#define TK_PUNCT    3
          256  +#define TK_ERROR    4
          257  +
          258  +#define TK_MINUS    TK_PUNCT
          259  +#define TK_LP       TK_PUNCT
          260  +#define TK_RP       TK_PUNCT
          261  +#define TK_SEMI     TK_PUNCT
          262  +#define TK_PLUS     TK_PUNCT
          263  +#define TK_STAR     TK_PUNCT
          264  +#define TK_SLASH    TK_PUNCT
          265  +#define TK_REM      TK_PUNCT
          266  +#define TK_EQ       TK_PUNCT
          267  +#define TK_LE       TK_PUNCT
          268  +#define TK_NE       TK_PUNCT
          269  +#define TK_LSHIFT   TK_PUNCT
          270  +#define TK_LT       TK_PUNCT
          271  +#define TK_GE       TK_PUNCT
          272  +#define TK_RSHIFT   TK_PUNCT
          273  +#define TK_GT       TK_PUNCT
          274  +#define TK_GE       TK_PUNCT
          275  +#define TK_BITOR    TK_PUNCT
          276  +#define TK_CONCAT   TK_PUNCT
          277  +#define TK_COMMA    TK_PUNCT
          278  +#define TK_BITAND   TK_PUNCT
          279  +#define TK_BITNOT   TK_PUNCT
          280  +#define TK_STRING   TK_LITERAL
          281  +#define TK_ID       TK_NAME
          282  +#define TK_ILLEGAL  TK_ERROR
          283  +#define TK_DOT      TK_PUNCT
          284  +#define TK_INTEGER  TK_LITERAL
          285  +#define TK_FLOAT    TK_LITERAL
          286  +#define TK_VARIABLE TK_LITERAL
          287  +#define TK_BLOB     TK_LITERAL
          288  +
          289  +/*
          290  +** Return the length (in bytes) of the token that begins at z[0]. 
          291  +** Store the token type in *tokenType before returning.
          292  +*/
          293  +static int sqlite3GetToken(const unsigned char *z, int *tokenType){
          294  +  int i, c;
          295  +  switch( aiClass[*z] ){  /* Switch on the character-class of the first byte
          296  +                          ** of the token. See the comment on the CC_ defines
          297  +                          ** above. */
          298  +    case CC_SPACE: {
          299  +      for(i=1; sqlite3Isspace(z[i]); i++){}
          300  +      *tokenType = TK_SPACE;
          301  +      return i;
          302  +    }
          303  +    case CC_MINUS: {
          304  +      if( z[1]=='-' ){
          305  +        for(i=2; (c=z[i])!=0 && c!='\n'; i++){}
          306  +        *tokenType = TK_SPACE;
          307  +        return i;
          308  +      }
          309  +      *tokenType = TK_MINUS;
          310  +      return 1;
          311  +    }
          312  +    case CC_LP: {
          313  +      *tokenType = TK_LP;
          314  +      return 1;
          315  +    }
          316  +    case CC_RP: {
          317  +      *tokenType = TK_RP;
          318  +      return 1;
          319  +    }
          320  +    case CC_SEMI: {
          321  +      *tokenType = TK_SEMI;
          322  +      return 1;
          323  +    }
          324  +    case CC_PLUS: {
          325  +      *tokenType = TK_PLUS;
          326  +      return 1;
          327  +    }
          328  +    case CC_STAR: {
          329  +      *tokenType = TK_STAR;
          330  +      return 1;
          331  +    }
          332  +    case CC_SLASH: {
          333  +      if( z[1]!='*' || z[2]==0 ){
          334  +        *tokenType = TK_SLASH;
          335  +        return 1;
          336  +      }
          337  +      for(i=3, c=z[2]; (c!='*' || z[i]!='/') && (c=z[i])!=0; i++){}
          338  +      if( c ) i++;
          339  +      *tokenType = TK_SPACE;
          340  +      return i;
          341  +    }
          342  +    case CC_PERCENT: {
          343  +      *tokenType = TK_REM;
          344  +      return 1;
          345  +    }
          346  +    case CC_EQ: {
          347  +      *tokenType = TK_EQ;
          348  +      return 1 + (z[1]=='=');
          349  +    }
          350  +    case CC_LT: {
          351  +      if( (c=z[1])=='=' ){
          352  +        *tokenType = TK_LE;
          353  +        return 2;
          354  +      }else if( c=='>' ){
          355  +        *tokenType = TK_NE;
          356  +        return 2;
          357  +      }else if( c=='<' ){
          358  +        *tokenType = TK_LSHIFT;
          359  +        return 2;
          360  +      }else{
          361  +        *tokenType = TK_LT;
          362  +        return 1;
          363  +      }
          364  +    }
          365  +    case CC_GT: {
          366  +      if( (c=z[1])=='=' ){
          367  +        *tokenType = TK_GE;
          368  +        return 2;
          369  +      }else if( c=='>' ){
          370  +        *tokenType = TK_RSHIFT;
          371  +        return 2;
          372  +      }else{
          373  +        *tokenType = TK_GT;
          374  +        return 1;
          375  +      }
          376  +    }
          377  +    case CC_BANG: {
          378  +      if( z[1]!='=' ){
          379  +        *tokenType = TK_ILLEGAL;
          380  +        return 1;
          381  +      }else{
          382  +        *tokenType = TK_NE;
          383  +        return 2;
          384  +      }
          385  +    }
          386  +    case CC_PIPE: {
          387  +      if( z[1]!='|' ){
          388  +        *tokenType = TK_BITOR;
          389  +        return 1;
          390  +      }else{
          391  +        *tokenType = TK_CONCAT;
          392  +        return 2;
          393  +      }
          394  +    }
          395  +    case CC_COMMA: {
          396  +      *tokenType = TK_COMMA;
          397  +      return 1;
          398  +    }
          399  +    case CC_AND: {
          400  +      *tokenType = TK_BITAND;
          401  +      return 1;
          402  +    }
          403  +    case CC_TILDA: {
          404  +      *tokenType = TK_BITNOT;
          405  +      return 1;
          406  +    }
          407  +    case CC_QUOTE: {
          408  +      int delim = z[0];
          409  +      testcase( delim=='`' );
          410  +      testcase( delim=='\'' );
          411  +      testcase( delim=='"' );
          412  +      for(i=1; (c=z[i])!=0; i++){
          413  +        if( c==delim ){
          414  +          if( z[i+1]==delim ){
          415  +            i++;
          416  +          }else{
          417  +            break;
          418  +          }
          419  +        }
          420  +      }
          421  +      if( c=='\'' ){
          422  +        *tokenType = TK_STRING;
          423  +        return i+1;
          424  +      }else if( c!=0 ){
          425  +        *tokenType = TK_ID;
          426  +        return i+1;
          427  +      }else{
          428  +        *tokenType = TK_ILLEGAL;
          429  +        return i;
          430  +      }
          431  +    }
          432  +    case CC_DOT: {
          433  +      if( !sqlite3Isdigit(z[1]) ){
          434  +        *tokenType = TK_DOT;
          435  +        return 1;
          436  +      }
          437  +      /* If the next character is a digit, this is a floating point
          438  +      ** number that begins with ".".  Fall thru into the next case */
          439  +    }
          440  +    case CC_DIGIT: {
          441  +      *tokenType = TK_INTEGER;
          442  +      if( z[0]=='0' && (z[1]=='x' || z[1]=='X') && sqlite3Isxdigit(z[2]) ){
          443  +        for(i=3; sqlite3Isxdigit(z[i]); i++){}
          444  +        return i;
          445  +      }
          446  +      for(i=0; sqlite3Isdigit(z[i]); i++){}
          447  +      if( z[i]=='.' ){
          448  +        i++;
          449  +        while( sqlite3Isdigit(z[i]) ){ i++; }
          450  +        *tokenType = TK_FLOAT;
          451  +      }
          452  +      if( (z[i]=='e' || z[i]=='E') &&
          453  +           ( sqlite3Isdigit(z[i+1]) 
          454  +            || ((z[i+1]=='+' || z[i+1]=='-') && sqlite3Isdigit(z[i+2]))
          455  +           )
          456  +      ){
          457  +        i += 2;
          458  +        while( sqlite3Isdigit(z[i]) ){ i++; }
          459  +        *tokenType = TK_FLOAT;
          460  +      }
          461  +      while( IdChar(z[i]) ){
          462  +        *tokenType = TK_ILLEGAL;
          463  +        i++;
          464  +      }
          465  +      return i;
          466  +    }
          467  +    case CC_QUOTE2: {
          468  +      for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
          469  +      *tokenType = c==']' ? TK_ID : TK_ILLEGAL;
          470  +      return i;
          471  +    }
          472  +    case CC_VARNUM: {
          473  +      *tokenType = TK_VARIABLE;
          474  +      for(i=1; sqlite3Isdigit(z[i]); i++){}
          475  +      return i;
          476  +    }
          477  +    case CC_DOLLAR:
          478  +    case CC_VARALPHA: {
          479  +      int n = 0;
          480  +      testcase( z[0]=='$' );  testcase( z[0]=='@' );
          481  +      testcase( z[0]==':' );  testcase( z[0]=='#' );
          482  +      *tokenType = TK_VARIABLE;
          483  +      for(i=1; (c=z[i])!=0; i++){
          484  +        if( IdChar(c) ){
          485  +          n++;
          486  +        }else if( c=='(' && n>0 ){
          487  +          do{
          488  +            i++;
          489  +          }while( (c=z[i])!=0 && !sqlite3Isspace(c) && c!=')' );
          490  +          if( c==')' ){
          491  +            i++;
          492  +          }else{
          493  +            *tokenType = TK_ILLEGAL;
          494  +          }
          495  +          break;
          496  +        }else if( c==':' && z[i+1]==':' ){
          497  +          i++;
          498  +        }else{
          499  +          break;
          500  +        }
          501  +      }
          502  +      if( n==0 ) *tokenType = TK_ILLEGAL;
          503  +      return i;
          504  +    }
          505  +    case CC_KYWD: {
          506  +      for(i=1; aiClass[z[i]]<=CC_KYWD; i++){}
          507  +      if( IdChar(z[i]) ){
          508  +        /* This token started out using characters that can appear in keywords,
          509  +        ** but z[i] is a character not allowed within keywords, so this must
          510  +        ** be an identifier instead */
          511  +        i++;
          512  +        break;
          513  +      }
          514  +      *tokenType = TK_ID;
          515  +      return i;
          516  +    }
          517  +    case CC_X: {
          518  +      testcase( z[0]=='x' ); testcase( z[0]=='X' );
          519  +      if( z[1]=='\'' ){
          520  +        *tokenType = TK_BLOB;
          521  +        for(i=2; sqlite3Isxdigit(z[i]); i++){}
          522  +        if( z[i]!='\'' || i%2 ){
          523  +          *tokenType = TK_ILLEGAL;
          524  +          while( z[i] && z[i]!='\'' ){ i++; }
          525  +        }
          526  +        if( z[i] ) i++;
          527  +        return i;
          528  +      }
          529  +      /* If it is not a BLOB literal, then it must be an ID, since no
          530  +      ** SQL keywords start with the letter 'x'.  Fall through */
          531  +    }
          532  +    case CC_ID: {
          533  +      i = 1;
          534  +      break;
          535  +    }
          536  +    default: {
          537  +      *tokenType = TK_ILLEGAL;
          538  +      return 1;
          539  +    }
          540  +  }
          541  +  while( IdChar(z[i]) ){ i++; }
          542  +  *tokenType = TK_ID;
          543  +  return i;
          544  +}
          545  +
          546  +char *sqlite3_normalize(const char *zSql){
          547  +  char *z;              /* The output string */
          548  +  sqlite3_int64 nZ;     /* Size of the output string in bytes */
          549  +  sqlite3_int64 nSql;   /* Size of the input string in bytes */
          550  +  int i;                /* Next character to read from zSql[] */
          551  +  int j;                /* Next slot to fill in on z[] */
          552  +  int tokenType;        /* Type of the next token */
          553  +  int n;                /* Size of the next token */
          554  +  int k;                /* Loop counter */
          555  +
          556  +  nSql = strlen(zSql);
          557  +  nZ = nSql;
          558  +  z = sqlite3_malloc64( nZ+2 );
          559  +  if( z==0 ) return 0;
          560  +  for(i=j=0; zSql[i]; i += n){
          561  +    n = sqlite3GetToken((unsigned char*)zSql+i, &tokenType);
          562  +    switch( tokenType ){
          563  +      case TK_SPACE: {
          564  +        break;
          565  +      }
          566  +      case TK_ERROR: {
          567  +        sqlite3_free(z);
          568  +        return 0;
          569  +      }
          570  +      case TK_LITERAL: {
          571  +        z[j++] = '?';
          572  +        break;
          573  +      }
          574  +      case TK_PUNCT:
          575  +      case TK_NAME: {
          576  +        if( n==4 && sqlite3_strnicmp(zSql+i,"NULL",4)==0 ){
          577  +          if( (j>=3 && strncmp(z+j-2,"is",2)==0 && !IdChar(z[j-3]))
          578  +           || (j>=4 && strncmp(z+j-3,"not",3)==0 && !IdChar(z[j-4]))
          579  +          ){
          580  +            /* NULL is a keyword in this case, not a literal value */
          581  +          }else{
          582  +            /* Here the NULL is a literal value */
          583  +            z[j++] = '?';
          584  +            break;
          585  +          }
          586  +        }
          587  +        if( j>0 && IdChar(z[j-1]) && IdChar(zSql[i]) ) z[j++] = ' ';
          588  +        for(k=0; k<n; k++){
          589  +          z[j++] = sqlite3Tolower(zSql[i+k]);
          590  +        }
          591  +        break;
          592  +      }
          593  +    }
          594  +  }
          595  +  while( j>0 && z[j-1]==' ' ){ j--; }
          596  +  if( i>0 && z[j-1]!=';' ){ z[j++] = ';'; }
          597  +  z[j] = 0;
          598  +
          599  +  /* Make a second pass converting "in(...)" where the "..." is not a
          600  +  ** SELECT statement into "in(?,?,?)" */
          601  +  for(i=0; i<j; i=n){
          602  +    char *zIn = strstr(z+i, "in(");
          603  +    int nParen;
          604  +    if( zIn==0 ) break;
          605  +    n = (int)(zIn-z)+3;  /* Index of first char past "in(" */
          606  +    if( n && IdChar(zIn[-1]) ) continue;
          607  +    if( strncmp(zIn, "in(select",9)==0 && !IdChar(zIn[9]) ) continue;
          608  +    if( strncmp(zIn, "in(with",7)==0 && !IdChar(zIn[7]) ) continue;
          609  +    for(nParen=1, k=0; z[n+k]; k++){
          610  +      if( z[n+k]=='(' ) nParen++;
          611  +      if( z[n+k]==')' ){
          612  +        nParen--;
          613  +        if( nParen==0 ) break;
          614  +      }
          615  +    }
          616  +    /* k is the number of bytes in the "..." within "in(...)" */
          617  +    if( k<5 ){
          618  +      z = sqlite3_realloc64(z, j+(5-k)+1);
          619  +      if( z==0 ) return 0;
          620  +      memmove(z+n+5, z+n+k, j-(n+k));
          621  +    }else if( k>5 ){
          622  +      memmove(z+n+5, z+n+k, j-(n+k));
          623  +    }
          624  +    j = j-k+5;
          625  +    z[j] = 0;
          626  +    memcpy(z+n, "?,?,?", 5);
          627  +  }
          628  +  return z;
          629  +}
          630  +
          631  +/*
          632  +** For testing purposes, or to build a stand-alone SQL normalizer program,
          633  +** compile this one source file with the -DSQLITE_NORMALIZE_CLI and link
          634  +** it against any SQLite library.  The resulting command-line program will
          635  +** run sqlite3_normalize() over the text of all files named on the command-
          636  +** line and show the result on standard output.
          637  +*/
          638  +#ifdef SQLITE_NORMALIZE_CLI
          639  +#include <stdio.h>
          640  +#include <stdlib.h>
          641  +
          642  +/*
          643  +** Break zIn up into separate SQL statements and run sqlite3_normalize()
          644  +** on each one.  Print the result of each run.
          645  +*/
          646  +static void normalizeFile(char *zIn){
          647  +  int i;
          648  +  if( zIn==0 ) return;
          649  +  for(i=0; zIn[i]; i++){
          650  +    char cSaved;
          651  +    if( zIn[i]!=';' ) continue;
          652  +    cSaved = zIn[i+1];
          653  +    zIn[i+1] = 0;
          654  +    if( sqlite3_complete(zIn) ){
          655  +      char *zOut = sqlite3_normalize(zIn);
          656  +      if( zOut ){
          657  +        printf("%s\n", zOut);
          658  +        sqlite3_free(zOut);
          659  +      }else{
          660  +        fprintf(stderr, "ERROR: %s\n", zIn);
          661  +      }
          662  +      zIn[i+1] = cSaved;
          663  +      zIn += i+1;
          664  +      i = -1;
          665  +    }else{
          666  +      zIn[i+1] = cSaved;
          667  +    }
          668  +  }
          669  +}
          670  +
          671  +/*
          672  +** The main routine for "sql_normalize".  Read files named on the
          673  +** command-line and run the text of each through sqlite3_normalize().
          674  +*/
          675  +int main(int argc, char **argv){
          676  +  int i;
          677  +  FILE *in;
          678  +  char *zBuf = 0;
          679  +  sqlite3_int64 sz, got;
          680  +
          681  +  for(i=1; i<argc; i++){
          682  +    in = fopen(argv[i], "rb");
          683  +    if( in==0 ){
          684  +      fprintf(stderr, "cannot open \"%s\"\n", argv[i]);
          685  +      continue;
          686  +    }
          687  +    fseek(in, 0, SEEK_END);
          688  +    sz = ftell(in);
          689  +    rewind(in);
          690  +    zBuf = sqlite3_realloc64(zBuf, sz+1);
          691  +    if( zBuf==0 ){
          692  +      fprintf(stderr, "failed to malloc for %lld bytes\n", sz);
          693  +      exit(1);
          694  +    }
          695  +    got = fread(zBuf, 1, sz, in);
          696  +    fclose(in);
          697  +    if( got!=sz ){
          698  +      fprintf(stderr, "only able to read %lld of %lld bytes from \"%s\"\n",
          699  +              got, sz, argv[i]);
          700  +    }else{
          701  +      zBuf[got] = 0;
          702  +      normalizeFile(zBuf);
          703  +    }
          704  +  }
          705  +  sqlite3_free(zBuf);
          706  +}
          707  +#endif /* SQLITE_NORMALIZE_CLI */

Changes to main.mk.

   357    357     $(TOP)/ext/misc/csv.c \
   358    358     $(TOP)/ext/misc/eval.c \
   359    359     $(TOP)/ext/misc/fileio.c \
   360    360     $(TOP)/ext/misc/fuzzer.c \
   361    361     $(TOP)/ext/misc/ieee754.c \
   362    362     $(TOP)/ext/misc/mmapwarm.c \
   363    363     $(TOP)/ext/misc/nextchar.c \
          364  +  $(TOP)/ext/misc/normalize.c \
   364    365     $(TOP)/ext/misc/percentile.c \
   365    366     $(TOP)/ext/misc/regexp.c \
   366    367     $(TOP)/ext/misc/remember.c \
   367    368     $(TOP)/ext/misc/series.c \
   368    369     $(TOP)/ext/misc/spellfix.c \
   369    370     $(TOP)/ext/misc/totype.c \
   370    371     $(TOP)/ext/misc/unionvtab.c \

Changes to src/test1.c.

  4554   4554     }
  4555   4555   
  4556   4556     zBuf = (char*)Tcl_GetByteArrayFromObj(objv[1], 0);
  4557   4557     Tcl_SetObjResult(interp, Tcl_NewIntObj(sqlite3_complete16(zBuf)));
  4558   4558   #endif /* SQLITE_OMIT_COMPLETE && SQLITE_OMIT_UTF16 */
  4559   4559     return TCL_OK;
  4560   4560   }
         4561  +
         4562  +/*
         4563  +** Usage: sqlite3_normalize SQL
         4564  +**
         4565  +** Return the normalized value for an SQL statement.
         4566  +*/
         4567  +static int SQLITE_TCLAPI test_normalize(
         4568  +  void * clientData,
         4569  +  Tcl_Interp *interp,
         4570  +  int objc,
         4571  +  Tcl_Obj *CONST objv[]
         4572  +){
         4573  +  char *zSql;
         4574  +  char *zNorm;
         4575  +  extern char *sqlite3_normalize(const char*);
         4576  +
         4577  +  if( objc!=2 ){
         4578  +    Tcl_WrongNumArgs(interp, 1, objv, "SQL");
         4579  +    return TCL_ERROR;
         4580  +  }
         4581  +
         4582  +  zSql = (char*)Tcl_GetString(objv[1]);
         4583  +  zNorm = sqlite3_normalize(zSql);
         4584  +  if( zNorm ){
         4585  +    Tcl_SetObjResult(interp, Tcl_NewStringObj(zNorm, -1));
         4586  +    sqlite3_free(zNorm);
         4587  +  }
         4588  +  return TCL_OK;
         4589  +}
  4561   4590   
  4562   4591   /*
  4563   4592   ** Usage: sqlite3_step STMT
  4564   4593   **
  4565   4594   ** Advance the statement to the next row.
  4566   4595   */
  4567   4596   static int SQLITE_TCLAPI test_step(
................................................................................
  7543   7572        { "sqlite3_extended_errcode",      test_ex_errcode    ,0 },
  7544   7573        { "sqlite3_errmsg",                test_errmsg        ,0 },
  7545   7574        { "sqlite3_errmsg16",              test_errmsg16      ,0 },
  7546   7575        { "sqlite3_open",                  test_open          ,0 },
  7547   7576        { "sqlite3_open16",                test_open16        ,0 },
  7548   7577        { "sqlite3_open_v2",               test_open_v2       ,0 },
  7549   7578        { "sqlite3_complete16",            test_complete16    ,0 },
         7579  +     { "sqlite3_normalize",             test_normalize     ,0 },
  7550   7580   
  7551   7581        { "sqlite3_prepare",               test_prepare       ,0 },
  7552   7582        { "sqlite3_prepare16",             test_prepare16     ,0 },
  7553   7583        { "sqlite3_prepare_v2",            test_prepare_v2    ,0 },
  7554   7584        { "sqlite3_prepare_tkt3134",       test_prepare_tkt3134, 0},
  7555   7585        { "sqlite3_prepare16_v2",          test_prepare16_v2  ,0 },
  7556   7586        { "sqlite3_finalize",              test_finalize      ,0 },

Added test/normalize.test.

            1  +# 2018-01-08
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#***********************************************************************
           11  +#
           12  +# Tests for the sqlite3_normalize() extension function.
           13  +#
           14  +
           15  +set testdir [file dirname $argv0]
           16  +source $testdir/tester.tcl
           17  +set testprefix normalize
           18  +
           19  +foreach {tnum sql norm} {
           20  +  100
           21  +  {SELECT * FROM t1 WHERE a IN (1) AND b=51.42}
           22  +  {select*from t1 where a in(?,?,?)and b=?;}
           23  +
           24  +  110
           25  +  {SELECT a, b+15, c FROM t1 WHERE d NOT IN (SELECT x FROM t2);}
           26  +  {select a,b+?,c from t1 where d not in(select x from t2);}
           27  +
           28  +  120
           29  +  { SELECT NULL, b FROM t1 -- comment text
           30  +     WHERE d IN (WITH t(a) AS (VALUES(5)) /* CTE */
           31  +                 SELECT a FROM t)
           32  +        OR e='hello';
           33  +  }
           34  +  {select?,b from t1 where d in(with t(a)as(values(?))select a from t)or e=?;}
           35  +
           36  +  121
           37  +  {/*Initial comment*/
           38  +   -- another comment line
           39  +   SELECT NULL  /* comment */ , b FROM t1 -- comment text
           40  +     WHERE d IN (WITH t(a) AS (VALUES(5)) /* CTE */
           41  +                 SELECT a FROM t)
           42  +        OR e='hello';
           43  +  }
           44  +  {select?,b from t1 where d in(with t(a)as(values(?))select a from t)or e=?;}
           45  +
           46  +  130
           47  +  {/* Query containing parameters */
           48  +   SELECT x,$::abc(15),y,@abc,z,?99,w FROM t1 /* Trailing comment */}
           49  +  {select x,?,y,?,z,?,w from t1;}
           50  +
           51  +  140
           52  +  {/* Long list on the RHS of IN */
           53  +   SELECT 15 IN (1,2,3,(SELECT * FROM t1),'xyz',x'abcd',22*(x+5),null);}
           54  +  {select?in(?,?,?);}
           55  +
           56  +  150
           57  +  {SELECT x'abc'; -- illegal token}
           58  +  {}
           59  +
           60  +  160
           61  +  {SELECT a,NULL,b FROM t1 WHERE c IS NOT NULL or D is null or e=5}
           62  +  {select a,?,b from t1 where c is not null or d is null or e=?;}
           63  +
           64  +  170
           65  +  {/* IN list exactly 5 bytes long */
           66  +   SELECT * FROM t1 WHERE x IN (1,2,3);}
           67  +  {select*from t1 where x in(?,?,?);}
           68  +} {
           69  +  do_test $tnum [list sqlite3_normalize $sql] $norm
           70  +}
           71  +
           72  +finish_test