/ Check-in [25ba1f84]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Merge fts4-unicode branch with trunk.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 25ba1f84f2b98d50ac1e2b9849b59ee902c2cca7
User & Date: dan 2012-05-26 18:42:21
Context
2012-05-27
22:42
Merge into trunk the changes that permit :memory: databases to use shared cache. check-in: e72179f3 user: drh tags: trunk
2012-05-26
18:42
Merge fts4-unicode branch with trunk. check-in: 25ba1f84 user: dan tags: trunk
18:28
If SQLITE_DISABLE_FTS3_UNICODE is defined, do not build the "unicode61" tokenizer. Closed-Leaf check-in: e71495a8 user: dan tags: fts4-unicode
2012-05-22
02:45
Version 3.7.12.1 check-in: 6d326d44 user: drh tags: trunk, release, version-3.7.12.1
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts3/README.tokenizers.

     7      7     statement:
     8      8   
     9      9       CREATE VIRTUAL TABLE <table-name> USING fts3(
    10     10         <columns ...> [, tokenize <tokenizer-name> [<tokenizer-args>]]
    11     11       );
    12     12   
    13     13     The built-in tokenizers (valid values to pass as <tokenizer name>) are
    14         -  "simple" and "porter".
           14  +  "simple", "porter" and "unicode61".
    15     15   
    16     16     <tokenizer-args> should consist of zero or more white-space separated
    17     17     arguments to pass to the selected tokenizer implementation. The 
    18     18     interpretation of the arguments, if any, depends on the individual 
    19     19     tokenizer.
    20     20   
    21     21   2. Custom Tokenizers

Changes to ext/fts3/fts3.c.

  3550   3550   **
  3551   3551   ** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
  3552   3552   ** to by the argument to point to the "simple" tokenizer implementation.
  3553   3553   ** And so on.
  3554   3554   */
  3555   3555   void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
  3556   3556   void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
         3557  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
         3558  +void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const**ppModule);
         3559  +#endif
  3557   3560   #ifdef SQLITE_ENABLE_ICU
  3558   3561   void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
  3559   3562   #endif
  3560   3563   
  3561   3564   /*
  3562   3565   ** Initialise the fts3 extension. If this extension is built as part
  3563   3566   ** of the sqlite library, then this function is called directly by
................................................................................
  3565   3568   ** function is called by the sqlite3_extension_init() entry point.
  3566   3569   */
  3567   3570   int sqlite3Fts3Init(sqlite3 *db){
  3568   3571     int rc = SQLITE_OK;
  3569   3572     Fts3Hash *pHash = 0;
  3570   3573     const sqlite3_tokenizer_module *pSimple = 0;
  3571   3574     const sqlite3_tokenizer_module *pPorter = 0;
         3575  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
         3576  +  const sqlite3_tokenizer_module *pUnicode = 0;
         3577  +#endif
  3572   3578   
  3573   3579   #ifdef SQLITE_ENABLE_ICU
  3574   3580     const sqlite3_tokenizer_module *pIcu = 0;
  3575   3581     sqlite3Fts3IcuTokenizerModule(&pIcu);
  3576   3582   #endif
         3583  +
         3584  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
         3585  +  sqlite3Fts3UnicodeTokenizer(&pUnicode);
         3586  +#endif
  3577   3587   
  3578   3588   #ifdef SQLITE_TEST
  3579   3589     rc = sqlite3Fts3InitTerm(db);
  3580   3590     if( rc!=SQLITE_OK ) return rc;
  3581   3591   #endif
  3582   3592   
  3583   3593     rc = sqlite3Fts3InitAux(db);
................................................................................
  3594   3604       sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
  3595   3605     }
  3596   3606   
  3597   3607     /* Load the built-in tokenizers into the hash table */
  3598   3608     if( rc==SQLITE_OK ){
  3599   3609       if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
  3600   3610        || sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter) 
         3611  +
         3612  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
         3613  +     || sqlite3Fts3HashInsert(pHash, "unicode61", 10, (void *)pUnicode) 
         3614  +#endif
  3601   3615   #ifdef SQLITE_ENABLE_ICU
  3602   3616        || (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
  3603   3617   #endif
  3604   3618       ){
  3605   3619         rc = SQLITE_NOMEM;
  3606   3620       }
  3607   3621     }

Changes to ext/fts3/fts3Int.h.

   536    536   int sqlite3Fts3MsrIncrNext(
   537    537       Fts3Table *, Fts3MultiSegReader *, sqlite3_int64 *, char **, int *);
   538    538   int sqlite3Fts3EvalPhrasePoslist(Fts3Cursor *, Fts3Expr *, int iCol, char **); 
   539    539   int sqlite3Fts3MsrOvfl(Fts3Cursor *, Fts3MultiSegReader *, int *);
   540    540   int sqlite3Fts3MsrIncrRestart(Fts3MultiSegReader *pCsr);
   541    541   
   542    542   int sqlite3Fts3DeferredTokenList(Fts3DeferredToken *, char **, int *);
          543  +
          544  +/* fts3_unicode2.c (functions generated by parsing unicode text files) */
          545  +int sqlite3FtsUnicodeTolower(int);
          546  +int sqlite3FtsUnicodeIsalnum(int);
   543    547   
   544    548   #endif /* !SQLITE_CORE || SQLITE_ENABLE_FTS3 */
   545    549   #endif /* _FTSINT_H */

Added ext/fts3/fts3_unicode.c.

            1  +/*
            2  +** 2012 May 24
            3  +**
            4  +** The author disclaims copyright to this source code.  In place of
            5  +** a legal notice, here is a blessing:
            6  +**
            7  +**    May you do good and not evil.
            8  +**    May you find forgiveness for yourself and forgive others.
            9  +**    May you share freely, never taking more than you give.
           10  +**
           11  +******************************************************************************
           12  +**
           13  +** Implementation of the "unicode" full-text-search tokenizer.
           14  +*/
           15  +
           16  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
           17  +
           18  +#include "fts3Int.h"
           19  +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
           20  +
           21  +#include <assert.h>
           22  +#include <stdlib.h>
           23  +#include <stdio.h>
           24  +#include <string.h>
           25  +
           26  +#include "fts3_tokenizer.h"
           27  +
           28  +/*
           29  +** The following two macros - READ_UTF8 and WRITE_UTF8 - have been copied
           30  +** from the sqlite3 source file utf.c. If this file is compiled as part
           31  +** of the amalgamation, they are not required.
           32  +*/
           33  +#ifndef SQLITE_AMALGAMATION
           34  +
           35  +static const unsigned char sqlite3Utf8Trans1[] = {
           36  +  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
           37  +  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
           38  +  0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
           39  +  0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
           40  +  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
           41  +  0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
           42  +  0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
           43  +  0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x00, 0x00,
           44  +};
           45  +
           46  +#define READ_UTF8(zIn, zTerm, c)                           \
           47  +  c = *(zIn++);                                            \
           48  +  if( c>=0xc0 ){                                           \
           49  +    c = sqlite3Utf8Trans1[c-0xc0];                         \
           50  +    while( zIn!=zTerm && (*zIn & 0xc0)==0x80 ){            \
           51  +      c = (c<<6) + (0x3f & *(zIn++));                      \
           52  +    }                                                      \
           53  +    if( c<0x80                                             \
           54  +        || (c&0xFFFFF800)==0xD800                          \
           55  +        || (c&0xFFFFFFFE)==0xFFFE ){  c = 0xFFFD; }        \
           56  +  }
           57  +
           58  +#define WRITE_UTF8(zOut, c) {                          \
           59  +  if( c<0x00080 ){                                     \
           60  +    *zOut++ = (u8)(c&0xFF);                            \
           61  +  }                                                    \
           62  +  else if( c<0x00800 ){                                \
           63  +    *zOut++ = 0xC0 + (u8)((c>>6)&0x1F);                \
           64  +    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
           65  +  }                                                    \
           66  +  else if( c<0x10000 ){                                \
           67  +    *zOut++ = 0xE0 + (u8)((c>>12)&0x0F);               \
           68  +    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
           69  +    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
           70  +  }else{                                               \
           71  +    *zOut++ = 0xF0 + (u8)((c>>18) & 0x07);             \
           72  +    *zOut++ = 0x80 + (u8)((c>>12) & 0x3F);             \
           73  +    *zOut++ = 0x80 + (u8)((c>>6) & 0x3F);              \
           74  +    *zOut++ = 0x80 + (u8)(c & 0x3F);                   \
           75  +  }                                                    \
           76  +}
           77  +
           78  +#endif /* ifndef SQLITE_AMALGAMATION */
           79  +
           80  +typedef struct unicode_tokenizer unicode_tokenizer;
           81  +typedef struct unicode_cursor unicode_cursor;
           82  +
           83  +struct unicode_tokenizer {
           84  +  sqlite3_tokenizer base;
           85  +};
           86  +
           87  +struct unicode_cursor {
           88  +  sqlite3_tokenizer_cursor base;
           89  +  const unsigned char *aInput;    /* Input text being tokenized */
           90  +  int nInput;                     /* Size of aInput[] in bytes */
           91  +  int iOff;                       /* Current offset within aInput[] */
           92  +  int iToken;                     /* Index of next token to be returned */
           93  +  char *zToken;                   /* storage for current token */
           94  +  int nAlloc;                     /* space allocated at zToken */
           95  +};
           96  +
           97  +/*
           98  +** Create a new tokenizer instance.
           99  +*/
          100  +static int unicodeCreate(
          101  +  int nArg,                       /* Size of array argv[] */
          102  +  const char * const *azArg,      /* Tokenizer creation arguments */
          103  +  sqlite3_tokenizer **pp          /* OUT: New tokenizer handle */
          104  +){
          105  +  unicode_tokenizer *pNew;        /* New tokenizer object */
          106  +  pNew = (unicode_tokenizer *) sqlite3_malloc(sizeof(unicode_tokenizer));
          107  +  if( pNew==NULL ){
          108  +    return SQLITE_NOMEM;
          109  +  }
          110  +  memset(pNew, 0, sizeof(unicode_tokenizer));
          111  +  *pp = &pNew->base;
          112  +  return SQLITE_OK;
          113  +}
          114  +
          115  +/*
          116  +** Destroy a tokenizer allocated by unicodeCreate().
          117  +*/
          118  +static int unicodeDestroy(sqlite3_tokenizer *pTokenizer){
          119  +  sqlite3_free(pTokenizer);
          120  +  return SQLITE_OK;
          121  +}
          122  +
          123  +/*
          124  +** Prepare to begin tokenizing a particular string.  The input
          125  +** string to be tokenized is pInput[0..nBytes-1].  A cursor
          126  +** used to incrementally tokenize this string is returned in 
          127  +** *ppCursor.
          128  +*/
          129  +static int unicodeOpen(
          130  +  sqlite3_tokenizer *p,           /* The tokenizer */
          131  +  const char *aInput,             /* Input string */
          132  +  int nInput,                     /* Size of string aInput in bytes */
          133  +  sqlite3_tokenizer_cursor **pp   /* OUT: New cursor object */
          134  +){
          135  +  unicode_cursor *pCsr;
          136  +
          137  +  pCsr = (unicode_cursor *)sqlite3_malloc(sizeof(unicode_cursor));
          138  +  if( pCsr==0 ){
          139  +    return SQLITE_NOMEM;
          140  +  }
          141  +  memset(pCsr, 0, sizeof(unicode_cursor));
          142  +
          143  +  pCsr->aInput = (const unsigned char *)aInput;
          144  +  if( aInput==0 ){
          145  +    pCsr->nInput = 0;
          146  +  }else if( nInput<0 ){
          147  +    pCsr->nInput = (int)strlen(aInput);
          148  +  }else{
          149  +    pCsr->nInput = nInput;
          150  +  }
          151  +
          152  +  *pp = &pCsr->base;
          153  +  UNUSED_PARAMETER(p);
          154  +  return SQLITE_OK;
          155  +}
          156  +
          157  +/*
          158  +** Close a tokenization cursor previously opened by a call to
          159  +** simpleOpen() above.
          160  +*/
          161  +static int unicodeClose(sqlite3_tokenizer_cursor *pCursor){
          162  +  unicode_cursor *pCsr = (unicode_cursor *) pCursor;
          163  +  sqlite3_free(pCsr->zToken);
          164  +  sqlite3_free(pCsr);
          165  +  return SQLITE_OK;
          166  +}
          167  +
          168  +/*
          169  +** Extract the next token from a tokenization cursor.  The cursor must
          170  +** have been opened by a prior call to simpleOpen().
          171  +*/
          172  +static int unicodeNext(
          173  +  sqlite3_tokenizer_cursor *p,    /* Cursor returned by simpleOpen */
          174  +  const char **paToken,           /* OUT: Token text */
          175  +  int *pnToken,                   /* OUT: Number of bytes at *paToken */
          176  +  int *piStart,                   /* OUT: Starting offset of token */
          177  +  int *piEnd,                     /* OUT: Ending offset of token */
          178  +  int *piPos                      /* OUT: Position integer of token */
          179  +){
          180  +  unicode_cursor *pCsr = (unicode_cursor *)p;
          181  +  int iCode;
          182  +  char *zOut;
          183  +  const unsigned char *z = &pCsr->aInput[pCsr->iOff];
          184  +  const unsigned char *zStart = z;
          185  +  const unsigned char *zEnd;
          186  +  const unsigned char *zTerm = &pCsr->aInput[pCsr->nInput];
          187  +
          188  +  /* Scan past any delimiter characters before the start of the next token.
          189  +  ** Return SQLITE_DONE early if this takes us all the way to the end of 
          190  +  ** the input.  */
          191  +  while( z<zTerm ){
          192  +    READ_UTF8(z, zTerm, iCode);
          193  +    if( sqlite3FtsUnicodeIsalnum(iCode) ) break;
          194  +    zStart = z;
          195  +  }
          196  +  if( zStart>=zTerm ) return SQLITE_DONE;
          197  +
          198  +  zOut = pCsr->zToken;
          199  +  do {
          200  +    /* Grow the output buffer if required. */
          201  +    if( (zOut-pCsr->zToken)>=(pCsr->nAlloc-4) ){
          202  +      char *zNew = sqlite3_realloc(pCsr->zToken, pCsr->nAlloc+64);
          203  +      if( !zNew ) return SQLITE_NOMEM;
          204  +      zOut = &zNew[zOut - pCsr->zToken];
          205  +      pCsr->zToken = zNew;
          206  +      pCsr->nAlloc += 64;
          207  +    }
          208  +
          209  +    /* Write the folded case of the last character read to the output */
          210  +    zEnd = z;
          211  +    WRITE_UTF8(zOut, sqlite3FtsUnicodeTolower(iCode));
          212  +
          213  +    /* If the cursor is not at EOF, read the next character */
          214  +    if( z>=zTerm ) break;
          215  +    READ_UTF8(z, zTerm, iCode);
          216  +  }while( sqlite3FtsUnicodeIsalnum(iCode) );
          217  +
          218  +  /* Set the output variables and return. */
          219  +  pCsr->iOff = (z - pCsr->aInput);
          220  +  *paToken = pCsr->zToken;
          221  +  *pnToken = zOut - pCsr->zToken;
          222  +  *piStart = (zStart - pCsr->aInput);
          223  +  *piEnd = (zEnd - pCsr->aInput);
          224  +  *piPos = pCsr->iToken++;
          225  +  return SQLITE_OK;
          226  +}
          227  +
          228  +/*
          229  +** Set *ppModule to a pointer to the sqlite3_tokenizer_module 
          230  +** structure for the unicode tokenizer.
          231  +*/
          232  +void sqlite3Fts3UnicodeTokenizer(sqlite3_tokenizer_module const **ppModule){
          233  +  static const sqlite3_tokenizer_module module = {
          234  +    0,
          235  +    unicodeCreate,
          236  +    unicodeDestroy,
          237  +    unicodeOpen,
          238  +    unicodeClose,
          239  +    unicodeNext,
          240  +    0,
          241  +  };
          242  +  *ppModule = &module;
          243  +}
          244  +
          245  +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
          246  +#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */

Added ext/fts3/fts3_unicode2.c.

            1  +/*
            2  +** 2012 May 25
            3  +**
            4  +** The author disclaims copyright to this source code.  In place of
            5  +** a legal notice, here is a blessing:
            6  +**
            7  +**    May you do good and not evil.
            8  +**    May you find forgiveness for yourself and forgive others.
            9  +**    May you share freely, never taking more than you give.
           10  +**
           11  +******************************************************************************
           12  +*/
           13  +
           14  +/*
           15  +** DO NOT EDIT THIS MACHINE GENERATED FILE.
           16  +*/
           17  +
           18  +#ifndef SQLITE_DISABLE_FTS3_UNICODE
           19  +
           20  +#include <assert.h>
           21  +
           22  +/*
           23  +** Return true if the argument corresponds to a unicode codepoint
           24  +** classified as either a letter or a number. Otherwise false.
           25  +**
           26  +** The results are undefined if the value passed to this function
           27  +** is less than zero.
           28  +*/
           29  +int sqlite3FtsUnicodeIsalnum(int c){
           30  +  /* Each unsigned integer in the following array corresponds to a contiguous
           31  +  ** range of unicode codepoints that are not either letters or numbers (i.e.
           32  +  ** codepoints for which this function should return 0).
           33  +  **
           34  +  ** The most significant 22 bits in each 32-bit value contain the first 
           35  +  ** codepoint in the range. The least significant 10 bits are used to store
           36  +  ** the size of the range (always at least 1). In other words, the value 
           37  +  ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
           38  +  ** C. It is not possible to represent a range larger than 1023 codepoints 
           39  +  ** using this format.
           40  +  */
           41  +  const static unsigned int aEntry[] = {
           42  +    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
           43  +    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
           44  +    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
           45  +    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
           46  +    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
           47  +    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
           48  +    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
           49  +    0x001B9C07, 0x001BF402, 0x001C000E, 0x001C3C01, 0x001C4401,
           50  +    0x001CC01B, 0x001E980B, 0x001FAC09, 0x001FD804, 0x00205804,
           51  +    0x00206C09, 0x00209403, 0x0020A405, 0x0020C00F, 0x00216403,
           52  +    0x00217801, 0x0023901B, 0x00240004, 0x0024E803, 0x0024F812,
           53  +    0x00254407, 0x00258804, 0x0025C001, 0x00260403, 0x0026F001,
           54  +    0x0026F807, 0x00271C02, 0x00272C03, 0x00275C01, 0x00278802,
           55  +    0x0027C802, 0x0027E802, 0x00280403, 0x0028F001, 0x0028F805,
           56  +    0x00291C02, 0x00292C03, 0x00294401, 0x0029C002, 0x0029D401,
           57  +    0x002A0403, 0x002AF001, 0x002AF808, 0x002B1C03, 0x002B2C03,
           58  +    0x002B8802, 0x002BC002, 0x002C0403, 0x002CF001, 0x002CF807,
           59  +    0x002D1C02, 0x002D2C03, 0x002D5802, 0x002D8802, 0x002DC001,
           60  +    0x002E0801, 0x002EF805, 0x002F1803, 0x002F2804, 0x002F5C01,
           61  +    0x002FCC08, 0x00300403, 0x0030F807, 0x00311803, 0x00312804,
           62  +    0x00315402, 0x00318802, 0x0031FC01, 0x00320802, 0x0032F001,
           63  +    0x0032F807, 0x00331803, 0x00332804, 0x00335402, 0x00338802,
           64  +    0x00340802, 0x0034F807, 0x00351803, 0x00352804, 0x00355C01,
           65  +    0x00358802, 0x0035E401, 0x00360802, 0x00372801, 0x00373C06,
           66  +    0x00375801, 0x00376008, 0x0037C803, 0x0038C401, 0x0038D007,
           67  +    0x0038FC01, 0x00391C09, 0x00396802, 0x003AC401, 0x003AD006,
           68  +    0x003AEC02, 0x003B2006, 0x003C041F, 0x003CD00C, 0x003DC417,
           69  +    0x003E340B, 0x003E6424, 0x003EF80F, 0x003F380D, 0x0040AC14,
           70  +    0x00412806, 0x00415804, 0x00417803, 0x00418803, 0x00419C07,
           71  +    0x0041C404, 0x0042080C, 0x00423C01, 0x00426806, 0x0043EC01,
           72  +    0x004D740C, 0x004E400A, 0x00500001, 0x0059B402, 0x005A0001,
           73  +    0x005A6C02, 0x005BAC03, 0x005C4803, 0x005CC805, 0x005D4802,
           74  +    0x005DC802, 0x005ED023, 0x005F6004, 0x005F7401, 0x0060000F,
           75  +    0x0062A401, 0x0064800C, 0x0064C00C, 0x00650001, 0x00651002,
           76  +    0x0066C011, 0x00672002, 0x00677822, 0x00685C05, 0x00687802,
           77  +    0x0069540A, 0x0069801D, 0x0069FC01, 0x006A8007, 0x006AA006,
           78  +    0x006C0005, 0x006CD011, 0x006D6823, 0x006E0003, 0x006E840D,
           79  +    0x006F980E, 0x006FF004, 0x00709014, 0x0070EC05, 0x0071F802,
           80  +    0x00730008, 0x00734019, 0x0073B401, 0x0073C803, 0x00770027,
           81  +    0x0077F004, 0x007EF401, 0x007EFC03, 0x007F3403, 0x007F7403,
           82  +    0x007FB403, 0x007FF402, 0x00800065, 0x0081A806, 0x0081E805,
           83  +    0x00822805, 0x0082801A, 0x00834021, 0x00840002, 0x00840C04,
           84  +    0x00842002, 0x00845001, 0x00845803, 0x00847806, 0x00849401,
           85  +    0x00849C01, 0x0084A401, 0x0084B801, 0x0084E802, 0x00850005,
           86  +    0x00852804, 0x00853C01, 0x00864264, 0x00900027, 0x0091000B,
           87  +    0x0092704E, 0x00940200, 0x009C0475, 0x009E53B9, 0x00AD400A,
           88  +    0x00B39406, 0x00B3BC03, 0x00B3E404, 0x00B3F802, 0x00B5C001,
           89  +    0x00B5FC01, 0x00B7804F, 0x00B8C00C, 0x00BA001A, 0x00BA6C59,
           90  +    0x00BC00D6, 0x00BFC00C, 0x00C00005, 0x00C02019, 0x00C0A807,
           91  +    0x00C0D802, 0x00C0F403, 0x00C26404, 0x00C28001, 0x00C3EC01,
           92  +    0x00C64002, 0x00C6580A, 0x00C70024, 0x00C8001F, 0x00C8A81E,
           93  +    0x00C94001, 0x00C98020, 0x00CA2827, 0x00CB003F, 0x00CC0100,
           94  +    0x01370040, 0x02924037, 0x0293F802, 0x02983403, 0x0299BC10,
           95  +    0x029A7C01, 0x029BC008, 0x029C0017, 0x029C8002, 0x029E2402,
           96  +    0x02A00801, 0x02A01801, 0x02A02C01, 0x02A08C09, 0x02A0D804,
           97  +    0x02A1D004, 0x02A20002, 0x02A2D011, 0x02A33802, 0x02A38012,
           98  +    0x02A3E003, 0x02A4980A, 0x02A51C0D, 0x02A57C01, 0x02A60004,
           99  +    0x02A6CC1B, 0x02A77802, 0x02A8A40E, 0x02A90C01, 0x02A93002,
          100  +    0x02A97004, 0x02A9DC03, 0x02A9EC01, 0x02AAC001, 0x02AAC803,
          101  +    0x02AADC02, 0x02AAF802, 0x02AB0401, 0x02AB7802, 0x02ABAC07,
          102  +    0x02ABD402, 0x02AF8C0B, 0x03600001, 0x036DFC02, 0x036FFC02,
          103  +    0x037FFC02, 0x03E3FC01, 0x03EC7801, 0x03ECA401, 0x03EEC810,
          104  +    0x03F4F802, 0x03F7F002, 0x03F8001A, 0x03F88007, 0x03F8C023,
          105  +    0x03F95013, 0x03F9A004, 0x03FBFC01, 0x03FC040F, 0x03FC6807,
          106  +    0x03FCEC06, 0x03FD6C0B, 0x03FF8007, 0x03FFA007, 0x03FFE405,
          107  +    0x04040003, 0x0404DC09, 0x0405E411, 0x0406400C, 0x0407402E,
          108  +    0x040E7C01, 0x040F4001, 0x04215C01, 0x04247C01, 0x0424FC01,
          109  +    0x04280403, 0x04281402, 0x04283004, 0x0428E003, 0x0428FC01,
          110  +    0x04294009, 0x0429FC01, 0x042CE407, 0x04400003, 0x0440E016,
          111  +    0x04420003, 0x0442C012, 0x04440003, 0x04449C0E, 0x04450004,
          112  +    0x04460003, 0x0446CC0E, 0x04471404, 0x045AAC0D, 0x0491C004,
          113  +    0x05BD442E, 0x05BE3C04, 0x074000F6, 0x07440027, 0x0744A4B5,
          114  +    0x07480046, 0x074C0057, 0x075B0401, 0x075B6C01, 0x075BEC01,
          115  +    0x075C5401, 0x075CD401, 0x075D3C01, 0x075DBC01, 0x075E2401,
          116  +    0x075EA401, 0x075F0C01, 0x07BBC002, 0x07C0002C, 0x07C0C064,
          117  +    0x07C2800F, 0x07C2C40E, 0x07C3040F, 0x07C3440F, 0x07C4401F,
          118  +    0x07C4C03C, 0x07C5C02B, 0x07C7981D, 0x07C8402B, 0x07C90009,
          119  +    0x07C94002, 0x07CC0021, 0x07CCC006, 0x07CCDC46, 0x07CE0014,
          120  +    0x07CE8025, 0x07CF1805, 0x07CF8011, 0x07D0003F, 0x07D10001,
          121  +    0x07D108B6, 0x07D3E404, 0x07D4003E, 0x07D50004, 0x07D54018,
          122  +    0x07D7EC46, 0x07D9140B, 0x07DA0046, 0x07DC0074, 0x38000401,
          123  +    0x38008060, 0x380400F0, 0x3C000001, 0x3FFFF401, 0x40000001,
          124  +    0x43FFF401,
          125  +  };
          126  +  static const unsigned int aAscii[4] = {
          127  +    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
          128  +  };
          129  +
          130  +  if( c<128 ){
          131  +    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
          132  +  }else if( c<(1<<22) ){
          133  +    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
          134  +    int iRes;
          135  +    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
          136  +    int iLo = 0;
          137  +    while( iHi>=iLo ){
          138  +      int iTest = (iHi + iLo) / 2;
          139  +      if( key >= aEntry[iTest] ){
          140  +        iRes = iTest;
          141  +        iLo = iTest+1;
          142  +      }else{
          143  +        iHi = iTest-1;
          144  +      }
          145  +    }
          146  +    assert( aEntry[0]<key );
          147  +    assert( key>=aEntry[iRes] );
          148  +    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
          149  +  }
          150  +  return 1;
          151  +}
          152  +
          153  +
          154  +/*
          155  +** Interpret the argument as a unicode codepoint. If the codepoint
          156  +** is an upper case character that has a lower case equivalent,
          157  +** return the codepoint corresponding to the lower case version.
          158  +** Otherwise, return a copy of the argument.
          159  +**
          160  +** The results are undefined if the value passed to this function
          161  +** is less than zero.
          162  +*/
          163  +int sqlite3FtsUnicodeTolower(int c){
          164  +  /* Each entry in the following array defines a rule for folding a range
          165  +  ** of codepoints to lower case. The rule applies to a range of nRange
          166  +  ** codepoints starting at codepoint iCode.
          167  +  **
          168  +  ** If the least significant bit in flags is clear, then the rule applies
          169  +  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
          170  +  ** need to be folded). Or, if it is set, then the rule only applies to
          171  +  ** every second codepoint in the range, starting with codepoint C.
          172  +  **
          173  +  ** The 7 most significant bits in flags are an index into the aiOff[]
          174  +  ** array. If a specific codepoint C does require folding, then its lower
          175  +  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
          176  +  **
          177  +  ** The contents of this array are generated by parsing the CaseFolding.txt
          178  +  ** file distributed as part of the "Unicode Character Database". See
          179  +  ** http://www.unicode.org for details.
          180  +  */
          181  +  static const struct TableEntry {
          182  +    unsigned short iCode;
          183  +    unsigned char flags;
          184  +    unsigned char nRange;
          185  +  } aEntry[] = {
          186  +    {65, 14, 26},          {181, 64, 1},          {192, 14, 23},
          187  +    {216, 14, 7},          {256, 1, 48},          {306, 1, 6},
          188  +    {313, 1, 16},          {330, 1, 46},          {376, 116, 1},
          189  +    {377, 1, 6},           {383, 104, 1},         {385, 50, 1},
          190  +    {386, 1, 4},           {390, 44, 1},          {391, 0, 1},
          191  +    {393, 42, 2},          {395, 0, 1},           {398, 32, 1},
          192  +    {399, 38, 1},          {400, 40, 1},          {401, 0, 1},
          193  +    {403, 42, 1},          {404, 46, 1},          {406, 52, 1},
          194  +    {407, 48, 1},          {408, 0, 1},           {412, 52, 1},
          195  +    {413, 54, 1},          {415, 56, 1},          {416, 1, 6},
          196  +    {422, 60, 1},          {423, 0, 1},           {425, 60, 1},
          197  +    {428, 0, 1},           {430, 60, 1},          {431, 0, 1},
          198  +    {433, 58, 2},          {435, 1, 4},           {439, 62, 1},
          199  +    {440, 0, 1},           {444, 0, 1},           {452, 2, 1},
          200  +    {453, 0, 1},           {455, 2, 1},           {456, 0, 1},
          201  +    {458, 2, 1},           {459, 1, 18},          {478, 1, 18},
          202  +    {497, 2, 1},           {498, 1, 4},           {502, 122, 1},
          203  +    {503, 134, 1},         {504, 1, 40},          {544, 110, 1},
          204  +    {546, 1, 18},          {570, 70, 1},          {571, 0, 1},
          205  +    {573, 108, 1},         {574, 68, 1},          {577, 0, 1},
          206  +    {579, 106, 1},         {580, 28, 1},          {581, 30, 1},
          207  +    {582, 1, 10},          {837, 36, 1},          {880, 1, 4},
          208  +    {886, 0, 1},           {902, 18, 1},          {904, 16, 3},
          209  +    {908, 26, 1},          {910, 24, 2},          {913, 14, 17},
          210  +    {931, 14, 9},          {962, 0, 1},           {975, 4, 1},
          211  +    {976, 140, 1},         {977, 142, 1},         {981, 146, 1},
          212  +    {982, 144, 1},         {984, 1, 24},          {1008, 136, 1},
          213  +    {1009, 138, 1},        {1012, 130, 1},        {1013, 128, 1},
          214  +    {1015, 0, 1},          {1017, 152, 1},        {1018, 0, 1},
          215  +    {1021, 110, 3},        {1024, 34, 16},        {1040, 14, 32},
          216  +    {1120, 1, 34},         {1162, 1, 54},         {1216, 6, 1},
          217  +    {1217, 1, 14},         {1232, 1, 88},         {1329, 22, 38},
          218  +    {4256, 66, 38},        {4295, 66, 1},         {4301, 66, 1},
          219  +    {7680, 1, 150},        {7835, 132, 1},        {7838, 96, 1},
          220  +    {7840, 1, 96},         {7944, 150, 8},        {7960, 150, 6},
          221  +    {7976, 150, 8},        {7992, 150, 8},        {8008, 150, 6},
          222  +    {8025, 151, 8},        {8040, 150, 8},        {8072, 150, 8},
          223  +    {8088, 150, 8},        {8104, 150, 8},        {8120, 150, 2},
          224  +    {8122, 126, 2},        {8124, 148, 1},        {8126, 100, 1},
          225  +    {8136, 124, 4},        {8140, 148, 1},        {8152, 150, 2},
          226  +    {8154, 120, 2},        {8168, 150, 2},        {8170, 118, 2},
          227  +    {8172, 152, 1},        {8184, 112, 2},        {8186, 114, 2},
          228  +    {8188, 148, 1},        {8486, 98, 1},         {8490, 92, 1},
          229  +    {8491, 94, 1},         {8498, 12, 1},         {8544, 8, 16},
          230  +    {8579, 0, 1},          {9398, 10, 26},        {11264, 22, 47},
          231  +    {11360, 0, 1},         {11362, 88, 1},        {11363, 102, 1},
          232  +    {11364, 90, 1},        {11367, 1, 6},         {11373, 84, 1},
          233  +    {11374, 86, 1},        {11375, 80, 1},        {11376, 82, 1},
          234  +    {11378, 0, 1},         {11381, 0, 1},         {11390, 78, 2},
          235  +    {11392, 1, 100},       {11499, 1, 4},         {11506, 0, 1},
          236  +    {42560, 1, 46},        {42624, 1, 24},        {42786, 1, 14},
          237  +    {42802, 1, 62},        {42873, 1, 4},         {42877, 76, 1},
          238  +    {42878, 1, 10},        {42891, 0, 1},         {42893, 74, 1},
          239  +    {42896, 1, 4},         {42912, 1, 10},        {42922, 72, 1},
          240  +    {65313, 14, 26},       
          241  +  };
          242  +  static const unsigned short aiOff[] = {
          243  +   1,     2,     8,     15,    16,    26,    28,    32,    
          244  +   37,    38,    40,    48,    63,    64,    69,    71,    
          245  +   79,    80,    116,   202,   203,   205,   206,   207,   
          246  +   209,   210,   211,   213,   214,   217,   218,   219,   
          247  +   775,   7264,  10792, 10795, 23228, 23256, 30204, 54721, 
          248  +   54753, 54754, 54756, 54787, 54793, 54809, 57153, 57274, 
          249  +   57921, 58019, 58363, 61722, 65268, 65341, 65373, 65406, 
          250  +   65408, 65410, 65415, 65424, 65436, 65439, 65450, 65462, 
          251  +   65472, 65476, 65478, 65480, 65482, 65488, 65506, 65511, 
          252  +   65514, 65521, 65527, 65528, 65529, 
          253  +  };
          254  +
          255  +  int ret = c;
          256  +
          257  +  assert( c>=0 );
          258  +  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
          259  +
          260  +  if( c<128 ){
          261  +    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
          262  +  }else if( c<65536 ){
          263  +    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
          264  +    int iLo = 0;
          265  +    int iRes = -1;
          266  +
          267  +    while( iHi>=iLo ){
          268  +      int iTest = (iHi + iLo) / 2;
          269  +      int cmp = (c - aEntry[iTest].iCode);
          270  +      if( cmp>=0 ){
          271  +        iRes = iTest;
          272  +        iLo = iTest+1;
          273  +      }else{
          274  +        iHi = iTest-1;
          275  +      }
          276  +    }
          277  +    assert( iRes<0 || c>=aEntry[iRes].iCode );
          278  +
          279  +    if( iRes>=0 ){
          280  +      const struct TableEntry *p = &aEntry[iRes];
          281  +      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
          282  +        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
          283  +        assert( ret>0 );
          284  +      }
          285  +    }
          286  +  }
          287  +  
          288  +  else if( c>=66560 && c<66600 ){
          289  +    ret = c + 40;
          290  +  }
          291  +
          292  +  return ret;
          293  +}
          294  +#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */

Changes to ext/fts3/fts3_write.c.

  3170   3170     if( sqlite3_step(pStmt)==SQLITE_ROW ){
  3171   3171       fts3DecodeIntArray(nStat, a,
  3172   3172            sqlite3_column_blob(pStmt, 0),
  3173   3173            sqlite3_column_bytes(pStmt, 0));
  3174   3174     }else{
  3175   3175       memset(a, 0, sizeof(u32)*(nStat) );
  3176   3176     }
  3177         -  sqlite3_reset(pStmt);
         3177  +  rc = sqlite3_reset(pStmt);
         3178  +  if( rc!=SQLITE_OK ){
         3179  +    sqlite3_free(a);
         3180  +    *pRC = rc;
         3181  +    return;
         3182  +  }
  3178   3183     if( nChng<0 && a[0]<(u32)(-nChng) ){
  3179   3184       a[0] = 0;
  3180   3185     }else{
  3181   3186       a[0] += nChng;
  3182   3187     }
  3183   3188     for(i=0; i<p->nColumn+1; i++){
  3184   3189       u32 x = a[i+1];

Added ext/fts3/unicode/CaseFolding.txt.

            1  +# CaseFolding-6.1.0.txt
            2  +# Date: 2011-07-25, 21:21:56 GMT [MD]
            3  +#
            4  +# Unicode Character Database
            5  +# Copyright (c) 1991-2011 Unicode, Inc.
            6  +# For terms of use, see http://www.unicode.org/terms_of_use.html
            7  +# For documentation, see http://www.unicode.org/reports/tr44/
            8  +#
            9  +# Case Folding Properties
           10  +#
           11  +# This file is a supplement to the UnicodeData file.
           12  +# It provides a case folding mapping generated from the Unicode Character Database.
           13  +# If all characters are mapped according to the full mapping below, then
           14  +# case differences (according to UnicodeData.txt and SpecialCasing.txt)
           15  +# are eliminated.
           16  +#
           17  +# The data supports both implementations that require simple case foldings
           18  +# (where string lengths don't change), and implementations that allow full case folding
           19  +# (where string lengths may grow). Note that where they can be supported, the
           20  +# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
           21  +#
           22  +# All code points not listed in this file map to themselves.
           23  +#
           24  +# NOTE: case folding does not preserve normalization formats!
           25  +#
           26  +# For information on case folding, including how to have case folding 
           27  +# preserve normalization formats, see Section 3.13 Default Case Algorithms in
           28  +# The Unicode Standard, Version 5.0.
           29  +#
           30  +# ================================================================================
           31  +# Format
           32  +# ================================================================================
           33  +# The entries in this file are in the following machine-readable format:
           34  +#
           35  +# <code>; <status>; <mapping>; # <name>
           36  +#
           37  +# The status field is:
           38  +# C: common case folding, common mappings shared by both simple and full mappings.
           39  +# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
           40  +# S: simple case folding, mappings to single characters where different from F.
           41  +# T: special case for uppercase I and dotted uppercase I
           42  +#    - For non-Turkic languages, this mapping is normally not used.
           43  +#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
           44  +#      Note that the Turkic mappings do not maintain canonical equivalence without additional processing.
           45  +#      See the discussions of case mapping in the Unicode Standard for more information.
           46  +#
           47  +# Usage:
           48  +#  A. To do a simple case folding, use the mappings with status C + S.
           49  +#  B. To do a full case folding, use the mappings with status C + F.
           50  +#
           51  +#    The mappings with status T can be used or omitted depending on the desired case-folding
           52  +#    behavior. (The default option is to exclude them.)
           53  +#
           54  +# =================================================================
           55  +
           56  +# Property: Case_Folding
           57  +
           58  +#  All code points not explicitly listed for Case_Folding
           59  +#  have the value C for the status field, and the code point itself for the mapping field.
           60  +
           61  +# @missing: 0000..10FFFF; C; <code point>
           62  +
           63  +# =================================================================
           64  +0041; C; 0061; # LATIN CAPITAL LETTER A
           65  +0042; C; 0062; # LATIN CAPITAL LETTER B
           66  +0043; C; 0063; # LATIN CAPITAL LETTER C
           67  +0044; C; 0064; # LATIN CAPITAL LETTER D
           68  +0045; C; 0065; # LATIN CAPITAL LETTER E
           69  +0046; C; 0066; # LATIN CAPITAL LETTER F
           70  +0047; C; 0067; # LATIN CAPITAL LETTER G
           71  +0048; C; 0068; # LATIN CAPITAL LETTER H
           72  +0049; C; 0069; # LATIN CAPITAL LETTER I
           73  +0049; T; 0131; # LATIN CAPITAL LETTER I
           74  +004A; C; 006A; # LATIN CAPITAL LETTER J
           75  +004B; C; 006B; # LATIN CAPITAL LETTER K
           76  +004C; C; 006C; # LATIN CAPITAL LETTER L
           77  +004D; C; 006D; # LATIN CAPITAL LETTER M
           78  +004E; C; 006E; # LATIN CAPITAL LETTER N
           79  +004F; C; 006F; # LATIN CAPITAL LETTER O
           80  +0050; C; 0070; # LATIN CAPITAL LETTER P
           81  +0051; C; 0071; # LATIN CAPITAL LETTER Q
           82  +0052; C; 0072; # LATIN CAPITAL LETTER R
           83  +0053; C; 0073; # LATIN CAPITAL LETTER S
           84  +0054; C; 0074; # LATIN CAPITAL LETTER T
           85  +0055; C; 0075; # LATIN CAPITAL LETTER U
           86  +0056; C; 0076; # LATIN CAPITAL LETTER V
           87  +0057; C; 0077; # LATIN CAPITAL LETTER W
           88  +0058; C; 0078; # LATIN CAPITAL LETTER X
           89  +0059; C; 0079; # LATIN CAPITAL LETTER Y
           90  +005A; C; 007A; # LATIN CAPITAL LETTER Z
           91  +00B5; C; 03BC; # MICRO SIGN
           92  +00C0; C; 00E0; # LATIN CAPITAL LETTER A WITH GRAVE
           93  +00C1; C; 00E1; # LATIN CAPITAL LETTER A WITH ACUTE
           94  +00C2; C; 00E2; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX
           95  +00C3; C; 00E3; # LATIN CAPITAL LETTER A WITH TILDE
           96  +00C4; C; 00E4; # LATIN CAPITAL LETTER A WITH DIAERESIS
           97  +00C5; C; 00E5; # LATIN CAPITAL LETTER A WITH RING ABOVE
           98  +00C6; C; 00E6; # LATIN CAPITAL LETTER AE
           99  +00C7; C; 00E7; # LATIN CAPITAL LETTER C WITH CEDILLA
          100  +00C8; C; 00E8; # LATIN CAPITAL LETTER E WITH GRAVE
          101  +00C9; C; 00E9; # LATIN CAPITAL LETTER E WITH ACUTE
          102  +00CA; C; 00EA; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX
          103  +00CB; C; 00EB; # LATIN CAPITAL LETTER E WITH DIAERESIS
          104  +00CC; C; 00EC; # LATIN CAPITAL LETTER I WITH GRAVE
          105  +00CD; C; 00ED; # LATIN CAPITAL LETTER I WITH ACUTE
          106  +00CE; C; 00EE; # LATIN CAPITAL LETTER I WITH CIRCUMFLEX
          107  +00CF; C; 00EF; # LATIN CAPITAL LETTER I WITH DIAERESIS
          108  +00D0; C; 00F0; # LATIN CAPITAL LETTER ETH
          109  +00D1; C; 00F1; # LATIN CAPITAL LETTER N WITH TILDE
          110  +00D2; C; 00F2; # LATIN CAPITAL LETTER O WITH GRAVE
          111  +00D3; C; 00F3; # LATIN CAPITAL LETTER O WITH ACUTE
          112  +00D4; C; 00F4; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX
          113  +00D5; C; 00F5; # LATIN CAPITAL LETTER O WITH TILDE
          114  +00D6; C; 00F6; # LATIN CAPITAL LETTER O WITH DIAERESIS
          115  +00D8; C; 00F8; # LATIN CAPITAL LETTER O WITH STROKE
          116  +00D9; C; 00F9; # LATIN CAPITAL LETTER U WITH GRAVE
          117  +00DA; C; 00FA; # LATIN CAPITAL LETTER U WITH ACUTE
          118  +00DB; C; 00FB; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX
          119  +00DC; C; 00FC; # LATIN CAPITAL LETTER U WITH DIAERESIS
          120  +00DD; C; 00FD; # LATIN CAPITAL LETTER Y WITH ACUTE
          121  +00DE; C; 00FE; # LATIN CAPITAL LETTER THORN
          122  +00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
          123  +0100; C; 0101; # LATIN CAPITAL LETTER A WITH MACRON
          124  +0102; C; 0103; # LATIN CAPITAL LETTER A WITH BREVE
          125  +0104; C; 0105; # LATIN CAPITAL LETTER A WITH OGONEK
          126  +0106; C; 0107; # LATIN CAPITAL LETTER C WITH ACUTE
          127  +0108; C; 0109; # LATIN CAPITAL LETTER C WITH CIRCUMFLEX
          128  +010A; C; 010B; # LATIN CAPITAL LETTER C WITH DOT ABOVE
          129  +010C; C; 010D; # LATIN CAPITAL LETTER C WITH CARON
          130  +010E; C; 010F; # LATIN CAPITAL LETTER D WITH CARON
          131  +0110; C; 0111; # LATIN CAPITAL LETTER D WITH STROKE
          132  +0112; C; 0113; # LATIN CAPITAL LETTER E WITH MACRON
          133  +0114; C; 0115; # LATIN CAPITAL LETTER E WITH BREVE
          134  +0116; C; 0117; # LATIN CAPITAL LETTER E WITH DOT ABOVE
          135  +0118; C; 0119; # LATIN CAPITAL LETTER E WITH OGONEK
          136  +011A; C; 011B; # LATIN CAPITAL LETTER E WITH CARON
          137  +011C; C; 011D; # LATIN CAPITAL LETTER G WITH CIRCUMFLEX
          138  +011E; C; 011F; # LATIN CAPITAL LETTER G WITH BREVE
          139  +0120; C; 0121; # LATIN CAPITAL LETTER G WITH DOT ABOVE
          140  +0122; C; 0123; # LATIN CAPITAL LETTER G WITH CEDILLA
          141  +0124; C; 0125; # LATIN CAPITAL LETTER H WITH CIRCUMFLEX
          142  +0126; C; 0127; # LATIN CAPITAL LETTER H WITH STROKE
          143  +0128; C; 0129; # LATIN CAPITAL LETTER I WITH TILDE
          144  +012A; C; 012B; # LATIN CAPITAL LETTER I WITH MACRON
          145  +012C; C; 012D; # LATIN CAPITAL LETTER I WITH BREVE
          146  +012E; C; 012F; # LATIN CAPITAL LETTER I WITH OGONEK
          147  +0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
          148  +0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
          149  +0132; C; 0133; # LATIN CAPITAL LIGATURE IJ
          150  +0134; C; 0135; # LATIN CAPITAL LETTER J WITH CIRCUMFLEX
          151  +0136; C; 0137; # LATIN CAPITAL LETTER K WITH CEDILLA
          152  +0139; C; 013A; # LATIN CAPITAL LETTER L WITH ACUTE
          153  +013B; C; 013C; # LATIN CAPITAL LETTER L WITH CEDILLA
          154  +013D; C; 013E; # LATIN CAPITAL LETTER L WITH CARON
          155  +013F; C; 0140; # LATIN CAPITAL LETTER L WITH MIDDLE DOT
          156  +0141; C; 0142; # LATIN CAPITAL LETTER L WITH STROKE
          157  +0143; C; 0144; # LATIN CAPITAL LETTER N WITH ACUTE
          158  +0145; C; 0146; # LATIN CAPITAL LETTER N WITH CEDILLA
          159  +0147; C; 0148; # LATIN CAPITAL LETTER N WITH CARON
          160  +0149; F; 02BC 006E; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
          161  +014A; C; 014B; # LATIN CAPITAL LETTER ENG
          162  +014C; C; 014D; # LATIN CAPITAL LETTER O WITH MACRON
          163  +014E; C; 014F; # LATIN CAPITAL LETTER O WITH BREVE
          164  +0150; C; 0151; # LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
          165  +0152; C; 0153; # LATIN CAPITAL LIGATURE OE
          166  +0154; C; 0155; # LATIN CAPITAL LETTER R WITH ACUTE
          167  +0156; C; 0157; # LATIN CAPITAL LETTER R WITH CEDILLA
          168  +0158; C; 0159; # LATIN CAPITAL LETTER R WITH CARON
          169  +015A; C; 015B; # LATIN CAPITAL LETTER S WITH ACUTE
          170  +015C; C; 015D; # LATIN CAPITAL LETTER S WITH CIRCUMFLEX
          171  +015E; C; 015F; # LATIN CAPITAL LETTER S WITH CEDILLA
          172  +0160; C; 0161; # LATIN CAPITAL LETTER S WITH CARON
          173  +0162; C; 0163; # LATIN CAPITAL LETTER T WITH CEDILLA
          174  +0164; C; 0165; # LATIN CAPITAL LETTER T WITH CARON
          175  +0166; C; 0167; # LATIN CAPITAL LETTER T WITH STROKE
          176  +0168; C; 0169; # LATIN CAPITAL LETTER U WITH TILDE
          177  +016A; C; 016B; # LATIN CAPITAL LETTER U WITH MACRON
          178  +016C; C; 016D; # LATIN CAPITAL LETTER U WITH BREVE
          179  +016E; C; 016F; # LATIN CAPITAL LETTER U WITH RING ABOVE
          180  +0170; C; 0171; # LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
          181  +0172; C; 0173; # LATIN CAPITAL LETTER U WITH OGONEK
          182  +0174; C; 0175; # LATIN CAPITAL LETTER W WITH CIRCUMFLEX
          183  +0176; C; 0177; # LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
          184  +0178; C; 00FF; # LATIN CAPITAL LETTER Y WITH DIAERESIS
          185  +0179; C; 017A; # LATIN CAPITAL LETTER Z WITH ACUTE
          186  +017B; C; 017C; # LATIN CAPITAL LETTER Z WITH DOT ABOVE
          187  +017D; C; 017E; # LATIN CAPITAL LETTER Z WITH CARON
          188  +017F; C; 0073; # LATIN SMALL LETTER LONG S
          189  +0181; C; 0253; # LATIN CAPITAL LETTER B WITH HOOK
          190  +0182; C; 0183; # LATIN CAPITAL LETTER B WITH TOPBAR
          191  +0184; C; 0185; # LATIN CAPITAL LETTER TONE SIX
          192  +0186; C; 0254; # LATIN CAPITAL LETTER OPEN O
          193  +0187; C; 0188; # LATIN CAPITAL LETTER C WITH HOOK
          194  +0189; C; 0256; # LATIN CAPITAL LETTER AFRICAN D
          195  +018A; C; 0257; # LATIN CAPITAL LETTER D WITH HOOK
          196  +018B; C; 018C; # LATIN CAPITAL LETTER D WITH TOPBAR
          197  +018E; C; 01DD; # LATIN CAPITAL LETTER REVERSED E
          198  +018F; C; 0259; # LATIN CAPITAL LETTER SCHWA
          199  +0190; C; 025B; # LATIN CAPITAL LETTER OPEN E
          200  +0191; C; 0192; # LATIN CAPITAL LETTER F WITH HOOK
          201  +0193; C; 0260; # LATIN CAPITAL LETTER G WITH HOOK
          202  +0194; C; 0263; # LATIN CAPITAL LETTER GAMMA
          203  +0196; C; 0269; # LATIN CAPITAL LETTER IOTA
          204  +0197; C; 0268; # LATIN CAPITAL LETTER I WITH STROKE
          205  +0198; C; 0199; # LATIN CAPITAL LETTER K WITH HOOK
          206  +019C; C; 026F; # LATIN CAPITAL LETTER TURNED M
          207  +019D; C; 0272; # LATIN CAPITAL LETTER N WITH LEFT HOOK
          208  +019F; C; 0275; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
          209  +01A0; C; 01A1; # LATIN CAPITAL LETTER O WITH HORN
          210  +01A2; C; 01A3; # LATIN CAPITAL LETTER OI
          211  +01A4; C; 01A5; # LATIN CAPITAL LETTER P WITH HOOK
          212  +01A6; C; 0280; # LATIN LETTER YR
          213  +01A7; C; 01A8; # LATIN CAPITAL LETTER TONE TWO
          214  +01A9; C; 0283; # LATIN CAPITAL LETTER ESH
          215  +01AC; C; 01AD; # LATIN CAPITAL LETTER T WITH HOOK
          216  +01AE; C; 0288; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
          217  +01AF; C; 01B0; # LATIN CAPITAL LETTER U WITH HORN
          218  +01B1; C; 028A; # LATIN CAPITAL LETTER UPSILON
          219  +01B2; C; 028B; # LATIN CAPITAL LETTER V WITH HOOK
          220  +01B3; C; 01B4; # LATIN CAPITAL LETTER Y WITH HOOK
          221  +01B5; C; 01B6; # LATIN CAPITAL LETTER Z WITH STROKE
          222  +01B7; C; 0292; # LATIN CAPITAL LETTER EZH
          223  +01B8; C; 01B9; # LATIN CAPITAL LETTER EZH REVERSED
          224  +01BC; C; 01BD; # LATIN CAPITAL LETTER TONE FIVE
          225  +01C4; C; 01C6; # LATIN CAPITAL LETTER DZ WITH CARON
          226  +01C5; C; 01C6; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
          227  +01C7; C; 01C9; # LATIN CAPITAL LETTER LJ
          228  +01C8; C; 01C9; # LATIN CAPITAL LETTER L WITH SMALL LETTER J
          229  +01CA; C; 01CC; # LATIN CAPITAL LETTER NJ
          230  +01CB; C; 01CC; # LATIN CAPITAL LETTER N WITH SMALL LETTER J
          231  +01CD; C; 01CE; # LATIN CAPITAL LETTER A WITH CARON
          232  +01CF; C; 01D0; # LATIN CAPITAL LETTER I WITH CARON
          233  +01D1; C; 01D2; # LATIN CAPITAL LETTER O WITH CARON
          234  +01D3; C; 01D4; # LATIN CAPITAL LETTER U WITH CARON
          235  +01D5; C; 01D6; # LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
          236  +01D7; C; 01D8; # LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
          237  +01D9; C; 01DA; # LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
          238  +01DB; C; 01DC; # LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
          239  +01DE; C; 01DF; # LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON
          240  +01E0; C; 01E1; # LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON
          241  +01E2; C; 01E3; # LATIN CAPITAL LETTER AE WITH MACRON
          242  +01E4; C; 01E5; # LATIN CAPITAL LETTER G WITH STROKE
          243  +01E6; C; 01E7; # LATIN CAPITAL LETTER G WITH CARON
          244  +01E8; C; 01E9; # LATIN CAPITAL LETTER K WITH CARON
          245  +01EA; C; 01EB; # LATIN CAPITAL LETTER O WITH OGONEK
          246  +01EC; C; 01ED; # LATIN CAPITAL LETTER O WITH OGONEK AND MACRON
          247  +01EE; C; 01EF; # LATIN CAPITAL LETTER EZH WITH CARON
          248  +01F0; F; 006A 030C; # LATIN SMALL LETTER J WITH CARON
          249  +01F1; C; 01F3; # LATIN CAPITAL LETTER DZ
          250  +01F2; C; 01F3; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
          251  +01F4; C; 01F5; # LATIN CAPITAL LETTER G WITH ACUTE
          252  +01F6; C; 0195; # LATIN CAPITAL LETTER HWAIR
          253  +01F7; C; 01BF; # LATIN CAPITAL LETTER WYNN
          254  +01F8; C; 01F9; # LATIN CAPITAL LETTER N WITH GRAVE
          255  +01FA; C; 01FB; # LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
          256  +01FC; C; 01FD; # LATIN CAPITAL LETTER AE WITH ACUTE
          257  +01FE; C; 01FF; # LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
          258  +0200; C; 0201; # LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
          259  +0202; C; 0203; # LATIN CAPITAL LETTER A WITH INVERTED BREVE
          260  +0204; C; 0205; # LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
          261  +0206; C; 0207; # LATIN CAPITAL LETTER E WITH INVERTED BREVE
          262  +0208; C; 0209; # LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
          263  +020A; C; 020B; # LATIN CAPITAL LETTER I WITH INVERTED BREVE
          264  +020C; C; 020D; # LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
          265  +020E; C; 020F; # LATIN CAPITAL LETTER O WITH INVERTED BREVE
          266  +0210; C; 0211; # LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
          267  +0212; C; 0213; # LATIN CAPITAL LETTER R WITH INVERTED BREVE
          268  +0214; C; 0215; # LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
          269  +0216; C; 0217; # LATIN CAPITAL LETTER U WITH INVERTED BREVE
          270  +0218; C; 0219; # LATIN CAPITAL LETTER S WITH COMMA BELOW
          271  +021A; C; 021B; # LATIN CAPITAL LETTER T WITH COMMA BELOW
          272  +021C; C; 021D; # LATIN CAPITAL LETTER YOGH
          273  +021E; C; 021F; # LATIN CAPITAL LETTER H WITH CARON
          274  +0220; C; 019E; # LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
          275  +0222; C; 0223; # LATIN CAPITAL LETTER OU
          276  +0224; C; 0225; # LATIN CAPITAL LETTER Z WITH HOOK
          277  +0226; C; 0227; # LATIN CAPITAL LETTER A WITH DOT ABOVE
          278  +0228; C; 0229; # LATIN CAPITAL LETTER E WITH CEDILLA
          279  +022A; C; 022B; # LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
          280  +022C; C; 022D; # LATIN CAPITAL LETTER O WITH TILDE AND MACRON
          281  +022E; C; 022F; # LATIN CAPITAL LETTER O WITH DOT ABOVE
          282  +0230; C; 0231; # LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON
          283  +0232; C; 0233; # LATIN CAPITAL LETTER Y WITH MACRON
          284  +023A; C; 2C65; # LATIN CAPITAL LETTER A WITH STROKE
          285  +023B; C; 023C; # LATIN CAPITAL LETTER C WITH STROKE
          286  +023D; C; 019A; # LATIN CAPITAL LETTER L WITH BAR
          287  +023E; C; 2C66; # LATIN CAPITAL LETTER T WITH DIAGONAL STROKE
          288  +0241; C; 0242; # LATIN CAPITAL LETTER GLOTTAL STOP
          289  +0243; C; 0180; # LATIN CAPITAL LETTER B WITH STROKE
          290  +0244; C; 0289; # LATIN CAPITAL LETTER U BAR
          291  +0245; C; 028C; # LATIN CAPITAL LETTER TURNED V
          292  +0246; C; 0247; # LATIN CAPITAL LETTER E WITH STROKE
          293  +0248; C; 0249; # LATIN CAPITAL LETTER J WITH STROKE
          294  +024A; C; 024B; # LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL
          295  +024C; C; 024D; # LATIN CAPITAL LETTER R WITH STROKE
          296  +024E; C; 024F; # LATIN CAPITAL LETTER Y WITH STROKE
          297  +0345; C; 03B9; # COMBINING GREEK YPOGEGRAMMENI
          298  +0370; C; 0371; # GREEK CAPITAL LETTER HETA
          299  +0372; C; 0373; # GREEK CAPITAL LETTER ARCHAIC SAMPI
          300  +0376; C; 0377; # GREEK CAPITAL LETTER PAMPHYLIAN DIGAMMA
          301  +0386; C; 03AC; # GREEK CAPITAL LETTER ALPHA WITH TONOS
          302  +0388; C; 03AD; # GREEK CAPITAL LETTER EPSILON WITH TONOS
          303  +0389; C; 03AE; # GREEK CAPITAL LETTER ETA WITH TONOS
          304  +038A; C; 03AF; # GREEK CAPITAL LETTER IOTA WITH TONOS
          305  +038C; C; 03CC; # GREEK CAPITAL LETTER OMICRON WITH TONOS
          306  +038E; C; 03CD; # GREEK CAPITAL LETTER UPSILON WITH TONOS
          307  +038F; C; 03CE; # GREEK CAPITAL LETTER OMEGA WITH TONOS
          308  +0390; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
          309  +0391; C; 03B1; # GREEK CAPITAL LETTER ALPHA
          310  +0392; C; 03B2; # GREEK CAPITAL LETTER BETA
          311  +0393; C; 03B3; # GREEK CAPITAL LETTER GAMMA
          312  +0394; C; 03B4; # GREEK CAPITAL LETTER DELTA
          313  +0395; C; 03B5; # GREEK CAPITAL LETTER EPSILON
          314  +0396; C; 03B6; # GREEK CAPITAL LETTER ZETA
          315  +0397; C; 03B7; # GREEK CAPITAL LETTER ETA
          316  +0398; C; 03B8; # GREEK CAPITAL LETTER THETA
          317  +0399; C; 03B9; # GREEK CAPITAL LETTER IOTA
          318  +039A; C; 03BA; # GREEK CAPITAL LETTER KAPPA
          319  +039B; C; 03BB; # GREEK CAPITAL LETTER LAMDA
          320  +039C; C; 03BC; # GREEK CAPITAL LETTER MU
          321  +039D; C; 03BD; # GREEK CAPITAL LETTER NU
          322  +039E; C; 03BE; # GREEK CAPITAL LETTER XI
          323  +039F; C; 03BF; # GREEK CAPITAL LETTER OMICRON
          324  +03A0; C; 03C0; # GREEK CAPITAL LETTER PI
          325  +03A1; C; 03C1; # GREEK CAPITAL LETTER RHO
          326  +03A3; C; 03C3; # GREEK CAPITAL LETTER SIGMA
          327  +03A4; C; 03C4; # GREEK CAPITAL LETTER TAU
          328  +03A5; C; 03C5; # GREEK CAPITAL LETTER UPSILON
          329  +03A6; C; 03C6; # GREEK CAPITAL LETTER PHI
          330  +03A7; C; 03C7; # GREEK CAPITAL LETTER CHI
          331  +03A8; C; 03C8; # GREEK CAPITAL LETTER PSI
          332  +03A9; C; 03C9; # GREEK CAPITAL LETTER OMEGA
          333  +03AA; C; 03CA; # GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
          334  +03AB; C; 03CB; # GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
          335  +03B0; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
          336  +03C2; C; 03C3; # GREEK SMALL LETTER FINAL SIGMA
          337  +03CF; C; 03D7; # GREEK CAPITAL KAI SYMBOL
          338  +03D0; C; 03B2; # GREEK BETA SYMBOL
          339  +03D1; C; 03B8; # GREEK THETA SYMBOL
          340  +03D5; C; 03C6; # GREEK PHI SYMBOL
          341  +03D6; C; 03C0; # GREEK PI SYMBOL
          342  +03D8; C; 03D9; # GREEK LETTER ARCHAIC KOPPA
          343  +03DA; C; 03DB; # GREEK LETTER STIGMA
          344  +03DC; C; 03DD; # GREEK LETTER DIGAMMA
          345  +03DE; C; 03DF; # GREEK LETTER KOPPA
          346  +03E0; C; 03E1; # GREEK LETTER SAMPI
          347  +03E2; C; 03E3; # COPTIC CAPITAL LETTER SHEI
          348  +03E4; C; 03E5; # COPTIC CAPITAL LETTER FEI
          349  +03E6; C; 03E7; # COPTIC CAPITAL LETTER KHEI
          350  +03E8; C; 03E9; # COPTIC CAPITAL LETTER HORI
          351  +03EA; C; 03EB; # COPTIC CAPITAL LETTER GANGIA
          352  +03EC; C; 03ED; # COPTIC CAPITAL LETTER SHIMA
          353  +03EE; C; 03EF; # COPTIC CAPITAL LETTER DEI
          354  +03F0; C; 03BA; # GREEK KAPPA SYMBOL
          355  +03F1; C; 03C1; # GREEK RHO SYMBOL
          356  +03F4; C; 03B8; # GREEK CAPITAL THETA SYMBOL
          357  +03F5; C; 03B5; # GREEK LUNATE EPSILON SYMBOL
          358  +03F7; C; 03F8; # GREEK CAPITAL LETTER SHO
          359  +03F9; C; 03F2; # GREEK CAPITAL LUNATE SIGMA SYMBOL
          360  +03FA; C; 03FB; # GREEK CAPITAL LETTER SAN
          361  +03FD; C; 037B; # GREEK CAPITAL REVERSED LUNATE SIGMA SYMBOL
          362  +03FE; C; 037C; # GREEK CAPITAL DOTTED LUNATE SIGMA SYMBOL
          363  +03FF; C; 037D; # GREEK CAPITAL REVERSED DOTTED LUNATE SIGMA SYMBOL
          364  +0400; C; 0450; # CYRILLIC CAPITAL LETTER IE WITH GRAVE
          365  +0401; C; 0451; # CYRILLIC CAPITAL LETTER IO
          366  +0402; C; 0452; # CYRILLIC CAPITAL LETTER DJE
          367  +0403; C; 0453; # CYRILLIC CAPITAL LETTER GJE
          368  +0404; C; 0454; # CYRILLIC CAPITAL LETTER UKRAINIAN IE
          369  +0405; C; 0455; # CYRILLIC CAPITAL LETTER DZE
          370  +0406; C; 0456; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
          371  +0407; C; 0457; # CYRILLIC CAPITAL LETTER YI
          372  +0408; C; 0458; # CYRILLIC CAPITAL LETTER JE
          373  +0409; C; 0459; # CYRILLIC CAPITAL LETTER LJE
          374  +040A; C; 045A; # CYRILLIC CAPITAL LETTER NJE
          375  +040B; C; 045B; # CYRILLIC CAPITAL LETTER TSHE
          376  +040C; C; 045C; # CYRILLIC CAPITAL LETTER KJE
          377  +040D; C; 045D; # CYRILLIC CAPITAL LETTER I WITH GRAVE
          378  +040E; C; 045E; # CYRILLIC CAPITAL LETTER SHORT U
          379  +040F; C; 045F; # CYRILLIC CAPITAL LETTER DZHE
          380  +0410; C; 0430; # CYRILLIC CAPITAL LETTER A
          381  +0411; C; 0431; # CYRILLIC CAPITAL LETTER BE
          382  +0412; C; 0432; # CYRILLIC CAPITAL LETTER VE
          383  +0413; C; 0433; # CYRILLIC CAPITAL LETTER GHE
          384  +0414; C; 0434; # CYRILLIC CAPITAL LETTER DE
          385  +0415; C; 0435; # CYRILLIC CAPITAL LETTER IE
          386  +0416; C; 0436; # CYRILLIC CAPITAL LETTER ZHE
          387  +0417; C; 0437; # CYRILLIC CAPITAL LETTER ZE
          388  +0418; C; 0438; # CYRILLIC CAPITAL LETTER I
          389  +0419; C; 0439; # CYRILLIC CAPITAL LETTER SHORT I
          390  +041A; C; 043A; # CYRILLIC CAPITAL LETTER KA
          391  +041B; C; 043B; # CYRILLIC CAPITAL LETTER EL
          392  +041C; C; 043C; # CYRILLIC CAPITAL LETTER EM
          393  +041D; C; 043D; # CYRILLIC CAPITAL LETTER EN
          394  +041E; C; 043E; # CYRILLIC CAPITAL LETTER O
          395  +041F; C; 043F; # CYRILLIC CAPITAL LETTER PE
          396  +0420; C; 0440; # CYRILLIC CAPITAL LETTER ER
          397  +0421; C; 0441; # CYRILLIC CAPITAL LETTER ES
          398  +0422; C; 0442; # CYRILLIC CAPITAL LETTER TE
          399  +0423; C; 0443; # CYRILLIC CAPITAL LETTER U
          400  +0424; C; 0444; # CYRILLIC CAPITAL LETTER EF
          401  +0425; C; 0445; # CYRILLIC CAPITAL LETTER HA
          402  +0426; C; 0446; # CYRILLIC CAPITAL LETTER TSE
          403  +0427; C; 0447; # CYRILLIC CAPITAL LETTER CHE
          404  +0428; C; 0448; # CYRILLIC CAPITAL LETTER SHA
          405  +0429; C; 0449; # CYRILLIC CAPITAL LETTER SHCHA
          406  +042A; C; 044A; # CYRILLIC CAPITAL LETTER HARD SIGN
          407  +042B; C; 044B; # CYRILLIC CAPITAL LETTER YERU
          408  +042C; C; 044C; # CYRILLIC CAPITAL LETTER SOFT SIGN
          409  +042D; C; 044D; # CYRILLIC CAPITAL LETTER E
          410  +042E; C; 044E; # CYRILLIC CAPITAL LETTER YU
          411  +042F; C; 044F; # CYRILLIC CAPITAL LETTER YA
          412  +0460; C; 0461; # CYRILLIC CAPITAL LETTER OMEGA
          413  +0462; C; 0463; # CYRILLIC CAPITAL LETTER YAT
          414  +0464; C; 0465; # CYRILLIC CAPITAL LETTER IOTIFIED E
          415  +0466; C; 0467; # CYRILLIC CAPITAL LETTER LITTLE YUS
          416  +0468; C; 0469; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS
          417  +046A; C; 046B; # CYRILLIC CAPITAL LETTER BIG YUS
          418  +046C; C; 046D; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS
          419  +046E; C; 046F; # CYRILLIC CAPITAL LETTER KSI
          420  +0470; C; 0471; # CYRILLIC CAPITAL LETTER PSI
          421  +0472; C; 0473; # CYRILLIC CAPITAL LETTER FITA
          422  +0474; C; 0475; # CYRILLIC CAPITAL LETTER IZHITSA
          423  +0476; C; 0477; # CYRILLIC CAPITAL LETTER IZHITSA WITH DOUBLE GRAVE ACCENT
          424  +0478; C; 0479; # CYRILLIC CAPITAL LETTER UK
          425  +047A; C; 047B; # CYRILLIC CAPITAL LETTER ROUND OMEGA
          426  +047C; C; 047D; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO
          427  +047E; C; 047F; # CYRILLIC CAPITAL LETTER OT
          428  +0480; C; 0481; # CYRILLIC CAPITAL LETTER KOPPA
          429  +048A; C; 048B; # CYRILLIC CAPITAL LETTER SHORT I WITH TAIL
          430  +048C; C; 048D; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN
          431  +048E; C; 048F; # CYRILLIC CAPITAL LETTER ER WITH TICK
          432  +0490; C; 0491; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN
          433  +0492; C; 0493; # CYRILLIC CAPITAL LETTER GHE WITH STROKE
          434  +0494; C; 0495; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK
          435  +0496; C; 0497; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
          436  +0498; C; 0499; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER
          437  +049A; C; 049B; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER
          438  +049C; C; 049D; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE
          439  +049E; C; 049F; # CYRILLIC CAPITAL LETTER KA WITH STROKE
          440  +04A0; C; 04A1; # CYRILLIC CAPITAL LETTER BASHKIR KA
          441  +04A2; C; 04A3; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER
          442  +04A4; C; 04A5; # CYRILLIC CAPITAL LIGATURE EN GHE
          443  +04A6; C; 04A7; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK
          444  +04A8; C; 04A9; # CYRILLIC CAPITAL LETTER ABKHASIAN HA
          445  +04AA; C; 04AB; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER
          446  +04AC; C; 04AD; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER
          447  +04AE; C; 04AF; # CYRILLIC CAPITAL LETTER STRAIGHT U
          448  +04B0; C; 04B1; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE
          449  +04B2; C; 04B3; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER
          450  +04B4; C; 04B5; # CYRILLIC CAPITAL LIGATURE TE TSE
          451  +04B6; C; 04B7; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER
          452  +04B8; C; 04B9; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE
          453  +04BA; C; 04BB; # CYRILLIC CAPITAL LETTER SHHA
          454  +04BC; C; 04BD; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE
          455  +04BE; C; 04BF; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER
          456  +04C0; C; 04CF; # CYRILLIC LETTER PALOCHKA
          457  +04C1; C; 04C2; # CYRILLIC CAPITAL LETTER ZHE WITH BREVE
          458  +04C3; C; 04C4; # CYRILLIC CAPITAL LETTER KA WITH HOOK
          459  +04C5; C; 04C6; # CYRILLIC CAPITAL LETTER EL WITH TAIL
          460  +04C7; C; 04C8; # CYRILLIC CAPITAL LETTER EN WITH HOOK
          461  +04C9; C; 04CA; # CYRILLIC CAPITAL LETTER EN WITH TAIL
          462  +04CB; C; 04CC; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
          463  +04CD; C; 04CE; # CYRILLIC CAPITAL LETTER EM WITH TAIL
          464  +04D0; C; 04D1; # CYRILLIC CAPITAL LETTER A WITH BREVE
          465  +04D2; C; 04D3; # CYRILLIC CAPITAL LETTER A WITH DIAERESIS
          466  +04D4; C; 04D5; # CYRILLIC CAPITAL LIGATURE A IE
          467  +04D6; C; 04D7; # CYRILLIC CAPITAL LETTER IE WITH BREVE
          468  +04D8; C; 04D9; # CYRILLIC CAPITAL LETTER SCHWA
          469  +04DA; C; 04DB; # CYRILLIC CAPITAL LETTER SCHWA WITH DIAERESIS
          470  +04DC; C; 04DD; # CYRILLIC CAPITAL LETTER ZHE WITH DIAERESIS
          471  +04DE; C; 04DF; # CYRILLIC CAPITAL LETTER ZE WITH DIAERESIS
          472  +04E0; C; 04E1; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE
          473  +04E2; C; 04E3; # CYRILLIC CAPITAL LETTER I WITH MACRON
          474  +04E4; C; 04E5; # CYRILLIC CAPITAL LETTER I WITH DIAERESIS
          475  +04E6; C; 04E7; # CYRILLIC CAPITAL LETTER O WITH DIAERESIS
          476  +04E8; C; 04E9; # CYRILLIC CAPITAL LETTER BARRED O
          477  +04EA; C; 04EB; # CYRILLIC CAPITAL LETTER BARRED O WITH DIAERESIS
          478  +04EC; C; 04ED; # CYRILLIC CAPITAL LETTER E WITH DIAERESIS
          479  +04EE; C; 04EF; # CYRILLIC CAPITAL LETTER U WITH MACRON
          480  +04F0; C; 04F1; # CYRILLIC CAPITAL LETTER U WITH DIAERESIS
          481  +04F2; C; 04F3; # CYRILLIC CAPITAL LETTER U WITH DOUBLE ACUTE
          482  +04F4; C; 04F5; # CYRILLIC CAPITAL LETTER CHE WITH DIAERESIS
          483  +04F6; C; 04F7; # CYRILLIC CAPITAL LETTER GHE WITH DESCENDER
          484  +04F8; C; 04F9; # CYRILLIC CAPITAL LETTER YERU WITH DIAERESIS
          485  +04FA; C; 04FB; # CYRILLIC CAPITAL LETTER GHE WITH STROKE AND HOOK
          486  +04FC; C; 04FD; # CYRILLIC CAPITAL LETTER HA WITH HOOK
          487  +04FE; C; 04FF; # CYRILLIC CAPITAL LETTER HA WITH STROKE
          488  +0500; C; 0501; # CYRILLIC CAPITAL LETTER KOMI DE
          489  +0502; C; 0503; # CYRILLIC CAPITAL LETTER KOMI DJE
          490  +0504; C; 0505; # CYRILLIC CAPITAL LETTER KOMI ZJE
          491  +0506; C; 0507; # CYRILLIC CAPITAL LETTER KOMI DZJE
          492  +0508; C; 0509; # CYRILLIC CAPITAL LETTER KOMI LJE
          493  +050A; C; 050B; # CYRILLIC CAPITAL LETTER KOMI NJE
          494  +050C; C; 050D; # CYRILLIC CAPITAL LETTER KOMI SJE
          495  +050E; C; 050F; # CYRILLIC CAPITAL LETTER KOMI TJE
          496  +0510; C; 0511; # CYRILLIC CAPITAL LETTER REVERSED ZE
          497  +0512; C; 0513; # CYRILLIC CAPITAL LETTER EL WITH HOOK
          498  +0514; C; 0515; # CYRILLIC CAPITAL LETTER LHA
          499  +0516; C; 0517; # CYRILLIC CAPITAL LETTER RHA
          500  +0518; C; 0519; # CYRILLIC CAPITAL LETTER YAE
          501  +051A; C; 051B; # CYRILLIC CAPITAL LETTER QA
          502  +051C; C; 051D; # CYRILLIC CAPITAL LETTER WE
          503  +051E; C; 051F; # CYRILLIC CAPITAL LETTER ALEUT KA
          504  +0520; C; 0521; # CYRILLIC CAPITAL LETTER EL WITH MIDDLE HOOK
          505  +0522; C; 0523; # CYRILLIC CAPITAL LETTER EN WITH MIDDLE HOOK
          506  +0524; C; 0525; # CYRILLIC CAPITAL LETTER PE WITH DESCENDER
          507  +0526; C; 0527; # CYRILLIC CAPITAL LETTER SHHA WITH DESCENDER
          508  +0531; C; 0561; # ARMENIAN CAPITAL LETTER AYB
          509  +0532; C; 0562; # ARMENIAN CAPITAL LETTER BEN
          510  +0533; C; 0563; # ARMENIAN CAPITAL LETTER GIM
          511  +0534; C; 0564; # ARMENIAN CAPITAL LETTER DA
          512  +0535; C; 0565; # ARMENIAN CAPITAL LETTER ECH
          513  +0536; C; 0566; # ARMENIAN CAPITAL LETTER ZA
          514  +0537; C; 0567; # ARMENIAN CAPITAL LETTER EH
          515  +0538; C; 0568; # ARMENIAN CAPITAL LETTER ET
          516  +0539; C; 0569; # ARMENIAN CAPITAL LETTER TO
          517  +053A; C; 056A; # ARMENIAN CAPITAL LETTER ZHE
          518  +053B; C; 056B; # ARMENIAN CAPITAL LETTER INI
          519  +053C; C; 056C; # ARMENIAN CAPITAL LETTER LIWN
          520  +053D; C; 056D; # ARMENIAN CAPITAL LETTER XEH
          521  +053E; C; 056E; # ARMENIAN CAPITAL LETTER CA
          522  +053F; C; 056F; # ARMENIAN CAPITAL LETTER KEN
          523  +0540; C; 0570; # ARMENIAN CAPITAL LETTER HO
          524  +0541; C; 0571; # ARMENIAN CAPITAL LETTER JA
          525  +0542; C; 0572; # ARMENIAN CAPITAL LETTER GHAD
          526  +0543; C; 0573; # ARMENIAN CAPITAL LETTER CHEH
          527  +0544; C; 0574; # ARMENIAN CAPITAL LETTER MEN
          528  +0545; C; 0575; # ARMENIAN CAPITAL LETTER YI
          529  +0546; C; 0576; # ARMENIAN CAPITAL LETTER NOW
          530  +0547; C; 0577; # ARMENIAN CAPITAL LETTER SHA
          531  +0548; C; 0578; # ARMENIAN CAPITAL LETTER VO
          532  +0549; C; 0579; # ARMENIAN CAPITAL LETTER CHA
          533  +054A; C; 057A; # ARMENIAN CAPITAL LETTER PEH
          534  +054B; C; 057B; # ARMENIAN CAPITAL LETTER JHEH
          535  +054C; C; 057C; # ARMENIAN CAPITAL LETTER RA
          536  +054D; C; 057D; # ARMENIAN CAPITAL LETTER SEH
          537  +054E; C; 057E; # ARMENIAN CAPITAL LETTER VEW
          538  +054F; C; 057F; # ARMENIAN CAPITAL LETTER TIWN
          539  +0550; C; 0580; # ARMENIAN CAPITAL LETTER REH
          540  +0551; C; 0581; # ARMENIAN CAPITAL LETTER CO
          541  +0552; C; 0582; # ARMENIAN CAPITAL LETTER YIWN
          542  +0553; C; 0583; # ARMENIAN CAPITAL LETTER PIWR
          543  +0554; C; 0584; # ARMENIAN CAPITAL LETTER KEH
          544  +0555; C; 0585; # ARMENIAN CAPITAL LETTER OH
          545  +0556; C; 0586; # ARMENIAN CAPITAL LETTER FEH
          546  +0587; F; 0565 0582; # ARMENIAN SMALL LIGATURE ECH YIWN
          547  +10A0; C; 2D00; # GEORGIAN CAPITAL LETTER AN
          548  +10A1; C; 2D01; # GEORGIAN CAPITAL LETTER BAN
          549  +10A2; C; 2D02; # GEORGIAN CAPITAL LETTER GAN
          550  +10A3; C; 2D03; # GEORGIAN CAPITAL LETTER DON
          551  +10A4; C; 2D04; # GEORGIAN CAPITAL LETTER EN
          552  +10A5; C; 2D05; # GEORGIAN CAPITAL LETTER VIN
          553  +10A6; C; 2D06; # GEORGIAN CAPITAL LETTER ZEN
          554  +10A7; C; 2D07; # GEORGIAN CAPITAL LETTER TAN
          555  +10A8; C; 2D08; # GEORGIAN CAPITAL LETTER IN
          556  +10A9; C; 2D09; # GEORGIAN CAPITAL LETTER KAN
          557  +10AA; C; 2D0A; # GEORGIAN CAPITAL LETTER LAS
          558  +10AB; C; 2D0B; # GEORGIAN CAPITAL LETTER MAN
          559  +10AC; C; 2D0C; # GEORGIAN CAPITAL LETTER NAR
          560  +10AD; C; 2D0D; # GEORGIAN CAPITAL LETTER ON
          561  +10AE; C; 2D0E; # GEORGIAN CAPITAL LETTER PAR
          562  +10AF; C; 2D0F; # GEORGIAN CAPITAL LETTER ZHAR
          563  +10B0; C; 2D10; # GEORGIAN CAPITAL LETTER RAE
          564  +10B1; C; 2D11; # GEORGIAN CAPITAL LETTER SAN
          565  +10B2; C; 2D12; # GEORGIAN CAPITAL LETTER TAR
          566  +10B3; C; 2D13; # GEORGIAN CAPITAL LETTER UN
          567  +10B4; C; 2D14; # GEORGIAN CAPITAL LETTER PHAR
          568  +10B5; C; 2D15; # GEORGIAN CAPITAL LETTER KHAR
          569  +10B6; C; 2D16; # GEORGIAN CAPITAL LETTER GHAN
          570  +10B7; C; 2D17; # GEORGIAN CAPITAL LETTER QAR
          571  +10B8; C; 2D18; # GEORGIAN CAPITAL LETTER SHIN
          572  +10B9; C; 2D19; # GEORGIAN CAPITAL LETTER CHIN
          573  +10BA; C; 2D1A; # GEORGIAN CAPITAL LETTER CAN
          574  +10BB; C; 2D1B; # GEORGIAN CAPITAL LETTER JIL
          575  +10BC; C; 2D1C; # GEORGIAN CAPITAL LETTER CIL
          576  +10BD; C; 2D1D; # GEORGIAN CAPITAL LETTER CHAR
          577  +10BE; C; 2D1E; # GEORGIAN CAPITAL LETTER XAN
          578  +10BF; C; 2D1F; # GEORGIAN CAPITAL LETTER JHAN
          579  +10C0; C; 2D20; # GEORGIAN CAPITAL LETTER HAE
          580  +10C1; C; 2D21; # GEORGIAN CAPITAL LETTER HE
          581  +10C2; C; 2D22; # GEORGIAN CAPITAL LETTER HIE
          582  +10C3; C; 2D23; # GEORGIAN CAPITAL LETTER WE
          583  +10C4; C; 2D24; # GEORGIAN CAPITAL LETTER HAR
          584  +10C5; C; 2D25; # GEORGIAN CAPITAL LETTER HOE
          585  +10C7; C; 2D27; # GEORGIAN CAPITAL LETTER YN
          586  +10CD; C; 2D2D; # GEORGIAN CAPITAL LETTER AEN
          587  +1E00; C; 1E01; # LATIN CAPITAL LETTER A WITH RING BELOW
          588  +1E02; C; 1E03; # LATIN CAPITAL LETTER B WITH DOT ABOVE
          589  +1E04; C; 1E05; # LATIN CAPITAL LETTER B WITH DOT BELOW
          590  +1E06; C; 1E07; # LATIN CAPITAL LETTER B WITH LINE BELOW
          591  +1E08; C; 1E09; # LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE
          592  +1E0A; C; 1E0B; # LATIN CAPITAL LETTER D WITH DOT ABOVE
          593  +1E0C; C; 1E0D; # LATIN CAPITAL LETTER D WITH DOT BELOW
          594  +1E0E; C; 1E0F; # LATIN CAPITAL LETTER D WITH LINE BELOW
          595  +1E10; C; 1E11; # LATIN CAPITAL LETTER D WITH CEDILLA
          596  +1E12; C; 1E13; # LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW
          597  +1E14; C; 1E15; # LATIN CAPITAL LETTER E WITH MACRON AND GRAVE
          598  +1E16; C; 1E17; # LATIN CAPITAL LETTER E WITH MACRON AND ACUTE
          599  +1E18; C; 1E19; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW
          600  +1E1A; C; 1E1B; # LATIN CAPITAL LETTER E WITH TILDE BELOW
          601  +1E1C; C; 1E1D; # LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE
          602  +1E1E; C; 1E1F; # LATIN CAPITAL LETTER F WITH DOT ABOVE
          603  +1E20; C; 1E21; # LATIN CAPITAL LETTER G WITH MACRON
          604  +1E22; C; 1E23; # LATIN CAPITAL LETTER H WITH DOT ABOVE
          605  +1E24; C; 1E25; # LATIN CAPITAL LETTER H WITH DOT BELOW
          606  +1E26; C; 1E27; # LATIN CAPITAL LETTER H WITH DIAERESIS
          607  +1E28; C; 1E29; # LATIN CAPITAL LETTER H WITH CEDILLA
          608  +1E2A; C; 1E2B; # LATIN CAPITAL LETTER H WITH BREVE BELOW
          609  +1E2C; C; 1E2D; # LATIN CAPITAL LETTER I WITH TILDE BELOW
          610  +1E2E; C; 1E2F; # LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE
          611  +1E30; C; 1E31; # LATIN CAPITAL LETTER K WITH ACUTE
          612  +1E32; C; 1E33; # LATIN CAPITAL LETTER K WITH DOT BELOW
          613  +1E34; C; 1E35; # LATIN CAPITAL LETTER K WITH LINE BELOW
          614  +1E36; C; 1E37; # LATIN CAPITAL LETTER L WITH DOT BELOW
          615  +1E38; C; 1E39; # LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON
          616  +1E3A; C; 1E3B; # LATIN CAPITAL LETTER L WITH LINE BELOW
          617  +1E3C; C; 1E3D; # LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW
          618  +1E3E; C; 1E3F; # LATIN CAPITAL LETTER M WITH ACUTE
          619  +1E40; C; 1E41; # LATIN CAPITAL LETTER M WITH DOT ABOVE
          620  +1E42; C; 1E43; # LATIN CAPITAL LETTER M WITH DOT BELOW
          621  +1E44; C; 1E45; # LATIN CAPITAL LETTER N WITH DOT ABOVE
          622  +1E46; C; 1E47; # LATIN CAPITAL LETTER N WITH DOT BELOW
          623  +1E48; C; 1E49; # LATIN CAPITAL LETTER N WITH LINE BELOW
          624  +1E4A; C; 1E4B; # LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW
          625  +1E4C; C; 1E4D; # LATIN CAPITAL LETTER O WITH TILDE AND ACUTE
          626  +1E4E; C; 1E4F; # LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS
          627  +1E50; C; 1E51; # LATIN CAPITAL LETTER O WITH MACRON AND GRAVE
          628  +1E52; C; 1E53; # LATIN CAPITAL LETTER O WITH MACRON AND ACUTE
          629  +1E54; C; 1E55; # LATIN CAPITAL LETTER P WITH ACUTE
          630  +1E56; C; 1E57; # LATIN CAPITAL LETTER P WITH DOT ABOVE
          631  +1E58; C; 1E59; # LATIN CAPITAL LETTER R WITH DOT ABOVE
          632  +1E5A; C; 1E5B; # LATIN CAPITAL LETTER R WITH DOT BELOW
          633  +1E5C; C; 1E5D; # LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON
          634  +1E5E; C; 1E5F; # LATIN CAPITAL LETTER R WITH LINE BELOW
          635  +1E60; C; 1E61; # LATIN CAPITAL LETTER S WITH DOT ABOVE
          636  +1E62; C; 1E63; # LATIN CAPITAL LETTER S WITH DOT BELOW
          637  +1E64; C; 1E65; # LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE
          638  +1E66; C; 1E67; # LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE
          639  +1E68; C; 1E69; # LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE
          640  +1E6A; C; 1E6B; # LATIN CAPITAL LETTER T WITH DOT ABOVE
          641  +1E6C; C; 1E6D; # LATIN CAPITAL LETTER T WITH DOT BELOW
          642  +1E6E; C; 1E6F; # LATIN CAPITAL LETTER T WITH LINE BELOW
          643  +1E70; C; 1E71; # LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW
          644  +1E72; C; 1E73; # LATIN CAPITAL LETTER U WITH DIAERESIS BELOW
          645  +1E74; C; 1E75; # LATIN CAPITAL LETTER U WITH TILDE BELOW
          646  +1E76; C; 1E77; # LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW
          647  +1E78; C; 1E79; # LATIN CAPITAL LETTER U WITH TILDE AND ACUTE
          648  +1E7A; C; 1E7B; # LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS
          649  +1E7C; C; 1E7D; # LATIN CAPITAL LETTER V WITH TILDE
          650  +1E7E; C; 1E7F; # LATIN CAPITAL LETTER V WITH DOT BELOW
          651  +1E80; C; 1E81; # LATIN CAPITAL LETTER W WITH GRAVE
          652  +1E82; C; 1E83; # LATIN CAPITAL LETTER W WITH ACUTE
          653  +1E84; C; 1E85; # LATIN CAPITAL LETTER W WITH DIAERESIS
          654  +1E86; C; 1E87; # LATIN CAPITAL LETTER W WITH DOT ABOVE
          655  +1E88; C; 1E89; # LATIN CAPITAL LETTER W WITH DOT BELOW
          656  +1E8A; C; 1E8B; # LATIN CAPITAL LETTER X WITH DOT ABOVE
          657  +1E8C; C; 1E8D; # LATIN CAPITAL LETTER X WITH DIAERESIS
          658  +1E8E; C; 1E8F; # LATIN CAPITAL LETTER Y WITH DOT ABOVE
          659  +1E90; C; 1E91; # LATIN CAPITAL LETTER Z WITH CIRCUMFLEX
          660  +1E92; C; 1E93; # LATIN CAPITAL LETTER Z WITH DOT BELOW
          661  +1E94; C; 1E95; # LATIN CAPITAL LETTER Z WITH LINE BELOW
          662  +1E96; F; 0068 0331; # LATIN SMALL LETTER H WITH LINE BELOW
          663  +1E97; F; 0074 0308; # LATIN SMALL LETTER T WITH DIAERESIS
          664  +1E98; F; 0077 030A; # LATIN SMALL LETTER W WITH RING ABOVE
          665  +1E99; F; 0079 030A; # LATIN SMALL LETTER Y WITH RING ABOVE
          666  +1E9A; F; 0061 02BE; # LATIN SMALL LETTER A WITH RIGHT HALF RING
          667  +1E9B; C; 1E61; # LATIN SMALL LETTER LONG S WITH DOT ABOVE
          668  +1E9E; F; 0073 0073; # LATIN CAPITAL LETTER SHARP S
          669  +1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
          670  +1EA0; C; 1EA1; # LATIN CAPITAL LETTER A WITH DOT BELOW
          671  +1EA2; C; 1EA3; # LATIN CAPITAL LETTER A WITH HOOK ABOVE
          672  +1EA4; C; 1EA5; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE
          673  +1EA6; C; 1EA7; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE
          674  +1EA8; C; 1EA9; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE
          675  +1EAA; C; 1EAB; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE
          676  +1EAC; C; 1EAD; # LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW
          677  +1EAE; C; 1EAF; # LATIN CAPITAL LETTER A WITH BREVE AND ACUTE
          678  +1EB0; C; 1EB1; # LATIN CAPITAL LETTER A WITH BREVE AND GRAVE
          679  +1EB2; C; 1EB3; # LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE
          680  +1EB4; C; 1EB5; # LATIN CAPITAL LETTER A WITH BREVE AND TILDE
          681  +1EB6; C; 1EB7; # LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW
          682  +1EB8; C; 1EB9; # LATIN CAPITAL LETTER E WITH DOT BELOW
          683  +1EBA; C; 1EBB; # LATIN CAPITAL LETTER E WITH HOOK ABOVE
          684  +1EBC; C; 1EBD; # LATIN CAPITAL LETTER E WITH TILDE
          685  +1EBE; C; 1EBF; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE
          686  +1EC0; C; 1EC1; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE
          687  +1EC2; C; 1EC3; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE
          688  +1EC4; C; 1EC5; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE
          689  +1EC6; C; 1EC7; # LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW
          690  +1EC8; C; 1EC9; # LATIN CAPITAL LETTER I WITH HOOK ABOVE
          691  +1ECA; C; 1ECB; # LATIN CAPITAL LETTER I WITH DOT BELOW
          692  +1ECC; C; 1ECD; # LATIN CAPITAL LETTER O WITH DOT BELOW
          693  +1ECE; C; 1ECF; # LATIN CAPITAL LETTER O WITH HOOK ABOVE
          694  +1ED0; C; 1ED1; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE
          695  +1ED2; C; 1ED3; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE
          696  +1ED4; C; 1ED5; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE
          697  +1ED6; C; 1ED7; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE
          698  +1ED8; C; 1ED9; # LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW
          699  +1EDA; C; 1EDB; # LATIN CAPITAL LETTER O WITH HORN AND ACUTE
          700  +1EDC; C; 1EDD; # LATIN CAPITAL LETTER O WITH HORN AND GRAVE
          701  +1EDE; C; 1EDF; # LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE
          702  +1EE0; C; 1EE1; # LATIN CAPITAL LETTER O WITH HORN AND TILDE
          703  +1EE2; C; 1EE3; # LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW
          704  +1EE4; C; 1EE5; # LATIN CAPITAL LETTER U WITH DOT BELOW
          705  +1EE6; C; 1EE7; # LATIN CAPITAL LETTER U WITH HOOK ABOVE
          706  +1EE8; C; 1EE9; # LATIN CAPITAL LETTER U WITH HORN AND ACUTE
          707  +1EEA; C; 1EEB; # LATIN CAPITAL LETTER U WITH HORN AND GRAVE
          708  +1EEC; C; 1EED; # LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE
          709  +1EEE; C; 1EEF; # LATIN CAPITAL LETTER U WITH HORN AND TILDE
          710  +1EF0; C; 1EF1; # LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW
          711  +1EF2; C; 1EF3; # LATIN CAPITAL LETTER Y WITH GRAVE
          712  +1EF4; C; 1EF5; # LATIN CAPITAL LETTER Y WITH DOT BELOW
          713  +1EF6; C; 1EF7; # LATIN CAPITAL LETTER Y WITH HOOK ABOVE
          714  +1EF8; C; 1EF9; # LATIN CAPITAL LETTER Y WITH TILDE
          715  +1EFA; C; 1EFB; # LATIN CAPITAL LETTER MIDDLE-WELSH LL
          716  +1EFC; C; 1EFD; # LATIN CAPITAL LETTER MIDDLE-WELSH V
          717  +1EFE; C; 1EFF; # LATIN CAPITAL LETTER Y WITH LOOP
          718  +1F08; C; 1F00; # GREEK CAPITAL LETTER ALPHA WITH PSILI
          719  +1F09; C; 1F01; # GREEK CAPITAL LETTER ALPHA WITH DASIA
          720  +1F0A; C; 1F02; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA
          721  +1F0B; C; 1F03; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA
          722  +1F0C; C; 1F04; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA
          723  +1F0D; C; 1F05; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA
          724  +1F0E; C; 1F06; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI
          725  +1F0F; C; 1F07; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI
          726  +1F18; C; 1F10; # GREEK CAPITAL LETTER EPSILON WITH PSILI
          727  +1F19; C; 1F11; # GREEK CAPITAL LETTER EPSILON WITH DASIA
          728  +1F1A; C; 1F12; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND VARIA
          729  +1F1B; C; 1F13; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND VARIA
          730  +1F1C; C; 1F14; # GREEK CAPITAL LETTER EPSILON WITH PSILI AND OXIA
          731  +1F1D; C; 1F15; # GREEK CAPITAL LETTER EPSILON WITH DASIA AND OXIA
          732  +1F28; C; 1F20; # GREEK CAPITAL LETTER ETA WITH PSILI
          733  +1F29; C; 1F21; # GREEK CAPITAL LETTER ETA WITH DASIA
          734  +1F2A; C; 1F22; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA
          735  +1F2B; C; 1F23; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA
          736  +1F2C; C; 1F24; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA
          737  +1F2D; C; 1F25; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA
          738  +1F2E; C; 1F26; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI
          739  +1F2F; C; 1F27; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI
          740  +1F38; C; 1F30; # GREEK CAPITAL LETTER IOTA WITH PSILI
          741  +1F39; C; 1F31; # GREEK CAPITAL LETTER IOTA WITH DASIA
          742  +1F3A; C; 1F32; # GREEK CAPITAL LETTER IOTA WITH PSILI AND VARIA
          743  +1F3B; C; 1F33; # GREEK CAPITAL LETTER IOTA WITH DASIA AND VARIA
          744  +1F3C; C; 1F34; # GREEK CAPITAL LETTER IOTA WITH PSILI AND OXIA
          745  +1F3D; C; 1F35; # GREEK CAPITAL LETTER IOTA WITH DASIA AND OXIA
          746  +1F3E; C; 1F36; # GREEK CAPITAL LETTER IOTA WITH PSILI AND PERISPOMENI
          747  +1F3F; C; 1F37; # GREEK CAPITAL LETTER IOTA WITH DASIA AND PERISPOMENI
          748  +1F48; C; 1F40; # GREEK CAPITAL LETTER OMICRON WITH PSILI
          749  +1F49; C; 1F41; # GREEK CAPITAL LETTER OMICRON WITH DASIA
          750  +1F4A; C; 1F42; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND VARIA
          751  +1F4B; C; 1F43; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND VARIA
          752  +1F4C; C; 1F44; # GREEK CAPITAL LETTER OMICRON WITH PSILI AND OXIA
          753  +1F4D; C; 1F45; # GREEK CAPITAL LETTER OMICRON WITH DASIA AND OXIA
          754  +1F50; F; 03C5 0313; # GREEK SMALL LETTER UPSILON WITH PSILI
          755  +1F52; F; 03C5 0313 0300; # GREEK SMALL LETTER UPSILON WITH PSILI AND VARIA
          756  +1F54; F; 03C5 0313 0301; # GREEK SMALL LETTER UPSILON WITH PSILI AND OXIA
          757  +1F56; F; 03C5 0313 0342; # GREEK SMALL LETTER UPSILON WITH PSILI AND PERISPOMENI
          758  +1F59; C; 1F51; # GREEK CAPITAL LETTER UPSILON WITH DASIA
          759  +1F5B; C; 1F53; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND VARIA
          760  +1F5D; C; 1F55; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND OXIA
          761  +1F5F; C; 1F57; # GREEK CAPITAL LETTER UPSILON WITH DASIA AND PERISPOMENI
          762  +1F68; C; 1F60; # GREEK CAPITAL LETTER OMEGA WITH PSILI
          763  +1F69; C; 1F61; # GREEK CAPITAL LETTER OMEGA WITH DASIA
          764  +1F6A; C; 1F62; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA
          765  +1F6B; C; 1F63; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA
          766  +1F6C; C; 1F64; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA
          767  +1F6D; C; 1F65; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA
          768  +1F6E; C; 1F66; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI
          769  +1F6F; C; 1F67; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI
          770  +1F80; F; 1F00 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND YPOGEGRAMMENI
          771  +1F81; F; 1F01 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND YPOGEGRAMMENI
          772  +1F82; F; 1F02 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND VARIA AND YPOGEGRAMMENI
          773  +1F83; F; 1F03 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND VARIA AND YPOGEGRAMMENI
          774  +1F84; F; 1F04 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND OXIA AND YPOGEGRAMMENI
          775  +1F85; F; 1F05 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND OXIA AND YPOGEGRAMMENI
          776  +1F86; F; 1F06 03B9; # GREEK SMALL LETTER ALPHA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
          777  +1F87; F; 1F07 03B9; # GREEK SMALL LETTER ALPHA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
          778  +1F88; F; 1F00 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
          779  +1F88; S; 1F80; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PROSGEGRAMMENI
          780  +1F89; F; 1F01 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
          781  +1F89; S; 1F81; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PROSGEGRAMMENI
          782  +1F8A; F; 1F02 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
          783  +1F8A; S; 1F82; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND VARIA AND PROSGEGRAMMENI
          784  +1F8B; F; 1F03 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
          785  +1F8B; S; 1F83; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND VARIA AND PROSGEGRAMMENI
          786  +1F8C; F; 1F04 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
          787  +1F8C; S; 1F84; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND OXIA AND PROSGEGRAMMENI
          788  +1F8D; F; 1F05 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
          789  +1F8D; S; 1F85; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND OXIA AND PROSGEGRAMMENI
          790  +1F8E; F; 1F06 03B9; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
          791  +1F8E; S; 1F86; # GREEK CAPITAL LETTER ALPHA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
          792  +1F8F; F; 1F07 03B9; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
          793  +1F8F; S; 1F87; # GREEK CAPITAL LETTER ALPHA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
          794  +1F90; F; 1F20 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND YPOGEGRAMMENI
          795  +1F91; F; 1F21 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND YPOGEGRAMMENI
          796  +1F92; F; 1F22 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND VARIA AND YPOGEGRAMMENI
          797  +1F93; F; 1F23 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND VARIA AND YPOGEGRAMMENI
          798  +1F94; F; 1F24 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND OXIA AND YPOGEGRAMMENI
          799  +1F95; F; 1F25 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND OXIA AND YPOGEGRAMMENI
          800  +1F96; F; 1F26 03B9; # GREEK SMALL LETTER ETA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
          801  +1F97; F; 1F27 03B9; # GREEK SMALL LETTER ETA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
          802  +1F98; F; 1F20 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
          803  +1F98; S; 1F90; # GREEK CAPITAL LETTER ETA WITH PSILI AND PROSGEGRAMMENI
          804  +1F99; F; 1F21 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
          805  +1F99; S; 1F91; # GREEK CAPITAL LETTER ETA WITH DASIA AND PROSGEGRAMMENI
          806  +1F9A; F; 1F22 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
          807  +1F9A; S; 1F92; # GREEK CAPITAL LETTER ETA WITH PSILI AND VARIA AND PROSGEGRAMMENI
          808  +1F9B; F; 1F23 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
          809  +1F9B; S; 1F93; # GREEK CAPITAL LETTER ETA WITH DASIA AND VARIA AND PROSGEGRAMMENI
          810  +1F9C; F; 1F24 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
          811  +1F9C; S; 1F94; # GREEK CAPITAL LETTER ETA WITH PSILI AND OXIA AND PROSGEGRAMMENI
          812  +1F9D; F; 1F25 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
          813  +1F9D; S; 1F95; # GREEK CAPITAL LETTER ETA WITH DASIA AND OXIA AND PROSGEGRAMMENI
          814  +1F9E; F; 1F26 03B9; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
          815  +1F9E; S; 1F96; # GREEK CAPITAL LETTER ETA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
          816  +1F9F; F; 1F27 03B9; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
          817  +1F9F; S; 1F97; # GREEK CAPITAL LETTER ETA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
          818  +1FA0; F; 1F60 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND YPOGEGRAMMENI
          819  +1FA1; F; 1F61 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND YPOGEGRAMMENI
          820  +1FA2; F; 1F62 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
          821  +1FA3; F; 1F63 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND VARIA AND YPOGEGRAMMENI
          822  +1FA4; F; 1F64 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND OXIA AND YPOGEGRAMMENI
          823  +1FA5; F; 1F65 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND OXIA AND YPOGEGRAMMENI
          824  +1FA6; F; 1F66 03B9; # GREEK SMALL LETTER OMEGA WITH PSILI AND PERISPOMENI AND YPOGEGRAMMENI
          825  +1FA7; F; 1F67 03B9; # GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI AND YPOGEGRAMMENI
          826  +1FA8; F; 1F60 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
          827  +1FA8; S; 1FA0; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PROSGEGRAMMENI
          828  +1FA9; F; 1F61 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
          829  +1FA9; S; 1FA1; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PROSGEGRAMMENI
          830  +1FAA; F; 1F62 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
          831  +1FAA; S; 1FA2; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND VARIA AND PROSGEGRAMMENI
          832  +1FAB; F; 1F63 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
          833  +1FAB; S; 1FA3; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND VARIA AND PROSGEGRAMMENI
          834  +1FAC; F; 1F64 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
          835  +1FAC; S; 1FA4; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND OXIA AND PROSGEGRAMMENI
          836  +1FAD; F; 1F65 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
          837  +1FAD; S; 1FA5; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND OXIA AND PROSGEGRAMMENI
          838  +1FAE; F; 1F66 03B9; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
          839  +1FAE; S; 1FA6; # GREEK CAPITAL LETTER OMEGA WITH PSILI AND PERISPOMENI AND PROSGEGRAMMENI
          840  +1FAF; F; 1F67 03B9; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
          841  +1FAF; S; 1FA7; # GREEK CAPITAL LETTER OMEGA WITH DASIA AND PERISPOMENI AND PROSGEGRAMMENI
          842  +1FB2; F; 1F70 03B9; # GREEK SMALL LETTER ALPHA WITH VARIA AND YPOGEGRAMMENI
          843  +1FB3; F; 03B1 03B9; # GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
          844  +1FB4; F; 03AC 03B9; # GREEK SMALL LETTER ALPHA WITH OXIA AND YPOGEGRAMMENI
          845  +1FB6; F; 03B1 0342; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI
          846  +1FB7; F; 03B1 0342 03B9; # GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
          847  +1FB8; C; 1FB0; # GREEK CAPITAL LETTER ALPHA WITH VRACHY
          848  +1FB9; C; 1FB1; # GREEK CAPITAL LETTER ALPHA WITH MACRON
          849  +1FBA; C; 1F70; # GREEK CAPITAL LETTER ALPHA WITH VARIA
          850  +1FBB; C; 1F71; # GREEK CAPITAL LETTER ALPHA WITH OXIA
          851  +1FBC; F; 03B1 03B9; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
          852  +1FBC; S; 1FB3; # GREEK CAPITAL LETTER ALPHA WITH PROSGEGRAMMENI
          853  +1FBE; C; 03B9; # GREEK PROSGEGRAMMENI
          854  +1FC2; F; 1F74 03B9; # GREEK SMALL LETTER ETA WITH VARIA AND YPOGEGRAMMENI
          855  +1FC3; F; 03B7 03B9; # GREEK SMALL LETTER ETA WITH YPOGEGRAMMENI
          856  +1FC4; F; 03AE 03B9; # GREEK SMALL LETTER ETA WITH OXIA AND YPOGEGRAMMENI
          857  +1FC6; F; 03B7 0342; # GREEK SMALL LETTER ETA WITH PERISPOMENI
          858  +1FC7; F; 03B7 0342 03B9; # GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
          859  +1FC8; C; 1F72; # GREEK CAPITAL LETTER EPSILON WITH VARIA
          860  +1FC9; C; 1F73; # GREEK CAPITAL LETTER EPSILON WITH OXIA
          861  +1FCA; C; 1F74; # GREEK CAPITAL LETTER ETA WITH VARIA
          862  +1FCB; C; 1F75; # GREEK CAPITAL LETTER ETA WITH OXIA
          863  +1FCC; F; 03B7 03B9; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
          864  +1FCC; S; 1FC3; # GREEK CAPITAL LETTER ETA WITH PROSGEGRAMMENI
          865  +1FD2; F; 03B9 0308 0300; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND VARIA
          866  +1FD3; F; 03B9 0308 0301; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA
          867  +1FD6; F; 03B9 0342; # GREEK SMALL LETTER IOTA WITH PERISPOMENI
          868  +1FD7; F; 03B9 0308 0342; # GREEK SMALL LETTER IOTA WITH DIALYTIKA AND PERISPOMENI
          869  +1FD8; C; 1FD0; # GREEK CAPITAL LETTER IOTA WITH VRACHY
          870  +1FD9; C; 1FD1; # GREEK CAPITAL LETTER IOTA WITH MACRON
          871  +1FDA; C; 1F76; # GREEK CAPITAL LETTER IOTA WITH VARIA
          872  +1FDB; C; 1F77; # GREEK CAPITAL LETTER IOTA WITH OXIA
          873  +1FE2; F; 03C5 0308 0300; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND VARIA
          874  +1FE3; F; 03C5 0308 0301; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA
          875  +1FE4; F; 03C1 0313; # GREEK SMALL LETTER RHO WITH PSILI
          876  +1FE6; F; 03C5 0342; # GREEK SMALL LETTER UPSILON WITH PERISPOMENI
          877  +1FE7; F; 03C5 0308 0342; # GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND PERISPOMENI
          878  +1FE8; C; 1FE0; # GREEK CAPITAL LETTER UPSILON WITH VRACHY
          879  +1FE9; C; 1FE1; # GREEK CAPITAL LETTER UPSILON WITH MACRON
          880  +1FEA; C; 1F7A; # GREEK CAPITAL LETTER UPSILON WITH VARIA
          881  +1FEB; C; 1F7B; # GREEK CAPITAL LETTER UPSILON WITH OXIA
          882  +1FEC; C; 1FE5; # GREEK CAPITAL LETTER RHO WITH DASIA
          883  +1FF2; F; 1F7C 03B9; # GREEK SMALL LETTER OMEGA WITH VARIA AND YPOGEGRAMMENI
          884  +1FF3; F; 03C9 03B9; # GREEK SMALL LETTER OMEGA WITH YPOGEGRAMMENI
          885  +1FF4; F; 03CE 03B9; # GREEK SMALL LETTER OMEGA WITH OXIA AND YPOGEGRAMMENI
          886  +1FF6; F; 03C9 0342; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI
          887  +1FF7; F; 03C9 0342 03B9; # GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
          888  +1FF8; C; 1F78; # GREEK CAPITAL LETTER OMICRON WITH VARIA
          889  +1FF9; C; 1F79; # GREEK CAPITAL LETTER OMICRON WITH OXIA
          890  +1FFA; C; 1F7C; # GREEK CAPITAL LETTER OMEGA WITH VARIA
          891  +1FFB; C; 1F7D; # GREEK CAPITAL LETTER OMEGA WITH OXIA
          892  +1FFC; F; 03C9 03B9; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
          893  +1FFC; S; 1FF3; # GREEK CAPITAL LETTER OMEGA WITH PROSGEGRAMMENI
          894  +2126; C; 03C9; # OHM SIGN
          895  +212A; C; 006B; # KELVIN SIGN
          896  +212B; C; 00E5; # ANGSTROM SIGN
          897  +2132; C; 214E; # TURNED CAPITAL F
          898  +2160; C; 2170; # ROMAN NUMERAL ONE
          899  +2161; C; 2171; # ROMAN NUMERAL TWO
          900  +2162; C; 2172; # ROMAN NUMERAL THREE
          901  +2163; C; 2173; # ROMAN NUMERAL FOUR
          902  +2164; C; 2174; # ROMAN NUMERAL FIVE
          903  +2165; C; 2175; # ROMAN NUMERAL SIX
          904  +2166; C; 2176; # ROMAN NUMERAL SEVEN
          905  +2167; C; 2177; # ROMAN NUMERAL EIGHT
          906  +2168; C; 2178; # ROMAN NUMERAL NINE
          907  +2169; C; 2179; # ROMAN NUMERAL TEN
          908  +216A; C; 217A; # ROMAN NUMERAL ELEVEN
          909  +216B; C; 217B; # ROMAN NUMERAL TWELVE
          910  +216C; C; 217C; # ROMAN NUMERAL FIFTY
          911  +216D; C; 217D; # ROMAN NUMERAL ONE HUNDRED
          912  +216E; C; 217E; # ROMAN NUMERAL FIVE HUNDRED
          913  +216F; C; 217F; # ROMAN NUMERAL ONE THOUSAND
          914  +2183; C; 2184; # ROMAN NUMERAL REVERSED ONE HUNDRED
          915  +24B6; C; 24D0; # CIRCLED LATIN CAPITAL LETTER A
          916  +24B7; C; 24D1; # CIRCLED LATIN CAPITAL LETTER B
          917  +24B8; C; 24D2; # CIRCLED LATIN CAPITAL LETTER C
          918  +24B9; C; 24D3; # CIRCLED LATIN CAPITAL LETTER D
          919  +24BA; C; 24D4; # CIRCLED LATIN CAPITAL LETTER E
          920  +24BB; C; 24D5; # CIRCLED LATIN CAPITAL LETTER F
          921  +24BC; C; 24D6; # CIRCLED LATIN CAPITAL LETTER G
          922  +24BD; C; 24D7; # CIRCLED LATIN CAPITAL LETTER H
          923  +24BE; C; 24D8; # CIRCLED LATIN CAPITAL LETTER I
          924  +24BF; C; 24D9; # CIRCLED LATIN CAPITAL LETTER J
          925  +24C0; C; 24DA; # CIRCLED LATIN CAPITAL LETTER K
          926  +24C1; C; 24DB; # CIRCLED LATIN CAPITAL LETTER L
          927  +24C2; C; 24DC; # CIRCLED LATIN CAPITAL LETTER M
          928  +24C3; C; 24DD; # CIRCLED LATIN CAPITAL LETTER N
          929  +24C4; C; 24DE; # CIRCLED LATIN CAPITAL LETTER O
          930  +24C5; C; 24DF; # CIRCLED LATIN CAPITAL LETTER P
          931  +24C6; C; 24E0; # CIRCLED LATIN CAPITAL LETTER Q
          932  +24C7; C; 24E1; # CIRCLED LATIN CAPITAL LETTER R
          933  +24C8; C; 24E2; # CIRCLED LATIN CAPITAL LETTER S
          934  +24C9; C; 24E3; # CIRCLED LATIN CAPITAL LETTER T
          935  +24CA; C; 24E4; # CIRCLED LATIN CAPITAL LETTER U
          936  +24CB; C; 24E5; # CIRCLED LATIN CAPITAL LETTER V
          937  +24CC; C; 24E6; # CIRCLED LATIN CAPITAL LETTER W
          938  +24CD; C; 24E7; # CIRCLED LATIN CAPITAL LETTER X
          939  +24CE; C; 24E8; # CIRCLED LATIN CAPITAL LETTER Y
          940  +24CF; C; 24E9; # CIRCLED LATIN CAPITAL LETTER Z
          941  +2C00; C; 2C30; # GLAGOLITIC CAPITAL LETTER AZU
          942  +2C01; C; 2C31; # GLAGOLITIC CAPITAL LETTER BUKY
          943  +2C02; C; 2C32; # GLAGOLITIC CAPITAL LETTER VEDE
          944  +2C03; C; 2C33; # GLAGOLITIC CAPITAL LETTER GLAGOLI
          945  +2C04; C; 2C34; # GLAGOLITIC CAPITAL LETTER DOBRO
          946  +2C05; C; 2C35; # GLAGOLITIC CAPITAL LETTER YESTU
          947  +2C06; C; 2C36; # GLAGOLITIC CAPITAL LETTER ZHIVETE
          948  +2C07; C; 2C37; # GLAGOLITIC CAPITAL LETTER DZELO
          949  +2C08; C; 2C38; # GLAGOLITIC CAPITAL LETTER ZEMLJA
          950  +2C09; C; 2C39; # GLAGOLITIC CAPITAL LETTER IZHE
          951  +2C0A; C; 2C3A; # GLAGOLITIC CAPITAL LETTER INITIAL IZHE
          952  +2C0B; C; 2C3B; # GLAGOLITIC CAPITAL LETTER I
          953  +2C0C; C; 2C3C; # GLAGOLITIC CAPITAL LETTER DJERVI
          954  +2C0D; C; 2C3D; # GLAGOLITIC CAPITAL LETTER KAKO
          955  +2C0E; C; 2C3E; # GLAGOLITIC CAPITAL LETTER LJUDIJE
          956  +2C0F; C; 2C3F; # GLAGOLITIC CAPITAL LETTER MYSLITE
          957  +2C10; C; 2C40; # GLAGOLITIC CAPITAL LETTER NASHI
          958  +2C11; C; 2C41; # GLAGOLITIC CAPITAL LETTER ONU
          959  +2C12; C; 2C42; # GLAGOLITIC CAPITAL LETTER POKOJI
          960  +2C13; C; 2C43; # GLAGOLITIC CAPITAL LETTER RITSI
          961  +2C14; C; 2C44; # GLAGOLITIC CAPITAL LETTER SLOVO
          962  +2C15; C; 2C45; # GLAGOLITIC CAPITAL LETTER TVRIDO
          963  +2C16; C; 2C46; # GLAGOLITIC CAPITAL LETTER UKU
          964  +2C17; C; 2C47; # GLAGOLITIC CAPITAL LETTER FRITU
          965  +2C18; C; 2C48; # GLAGOLITIC CAPITAL LETTER HERU
          966  +2C19; C; 2C49; # GLAGOLITIC CAPITAL LETTER OTU
          967  +2C1A; C; 2C4A; # GLAGOLITIC CAPITAL LETTER PE
          968  +2C1B; C; 2C4B; # GLAGOLITIC CAPITAL LETTER SHTA
          969  +2C1C; C; 2C4C; # GLAGOLITIC CAPITAL LETTER TSI
          970  +2C1D; C; 2C4D; # GLAGOLITIC CAPITAL LETTER CHRIVI
          971  +2C1E; C; 2C4E; # GLAGOLITIC CAPITAL LETTER SHA
          972  +2C1F; C; 2C4F; # GLAGOLITIC CAPITAL LETTER YERU
          973  +2C20; C; 2C50; # GLAGOLITIC CAPITAL LETTER YERI
          974  +2C21; C; 2C51; # GLAGOLITIC CAPITAL LETTER YATI
          975  +2C22; C; 2C52; # GLAGOLITIC CAPITAL LETTER SPIDERY HA
          976  +2C23; C; 2C53; # GLAGOLITIC CAPITAL LETTER YU
          977  +2C24; C; 2C54; # GLAGOLITIC CAPITAL LETTER SMALL YUS
          978  +2C25; C; 2C55; # GLAGOLITIC CAPITAL LETTER SMALL YUS WITH TAIL
          979  +2C26; C; 2C56; # GLAGOLITIC CAPITAL LETTER YO
          980  +2C27; C; 2C57; # GLAGOLITIC CAPITAL LETTER IOTATED SMALL YUS
          981  +2C28; C; 2C58; # GLAGOLITIC CAPITAL LETTER BIG YUS
          982  +2C29; C; 2C59; # GLAGOLITIC CAPITAL LETTER IOTATED BIG YUS
          983  +2C2A; C; 2C5A; # GLAGOLITIC CAPITAL LETTER FITA
          984  +2C2B; C; 2C5B; # GLAGOLITIC CAPITAL LETTER IZHITSA
          985  +2C2C; C; 2C5C; # GLAGOLITIC CAPITAL LETTER SHTAPIC
          986  +2C2D; C; 2C5D; # GLAGOLITIC CAPITAL LETTER TROKUTASTI A
          987  +2C2E; C; 2C5E; # GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
          988  +2C60; C; 2C61; # LATIN CAPITAL LETTER L WITH DOUBLE BAR
          989  +2C62; C; 026B; # LATIN CAPITAL LETTER L WITH MIDDLE TILDE
          990  +2C63; C; 1D7D; # LATIN CAPITAL LETTER P WITH STROKE
          991  +2C64; C; 027D; # LATIN CAPITAL LETTER R WITH TAIL
          992  +2C67; C; 2C68; # LATIN CAPITAL LETTER H WITH DESCENDER
          993  +2C69; C; 2C6A; # LATIN CAPITAL LETTER K WITH DESCENDER
          994  +2C6B; C; 2C6C; # LATIN CAPITAL LETTER Z WITH DESCENDER
          995  +2C6D; C; 0251; # LATIN CAPITAL LETTER ALPHA
          996  +2C6E; C; 0271; # LATIN CAPITAL LETTER M WITH HOOK
          997  +2C6F; C; 0250; # LATIN CAPITAL LETTER TURNED A
          998  +2C70; C; 0252; # LATIN CAPITAL LETTER TURNED ALPHA
          999  +2C72; C; 2C73; # LATIN CAPITAL LETTER W WITH HOOK
         1000  +2C75; C; 2C76; # LATIN CAPITAL LETTER HALF H
         1001  +2C7E; C; 023F; # LATIN CAPITAL LETTER S WITH SWASH TAIL
         1002  +2C7F; C; 0240; # LATIN CAPITAL LETTER Z WITH SWASH TAIL
         1003  +2C80; C; 2C81; # COPTIC CAPITAL LETTER ALFA
         1004  +2C82; C; 2C83; # COPTIC CAPITAL LETTER VIDA
         1005  +2C84; C; 2C85; # COPTIC CAPITAL LETTER GAMMA
         1006  +2C86; C; 2C87; # COPTIC CAPITAL LETTER DALDA
         1007  +2C88; C; 2C89; # COPTIC CAPITAL LETTER EIE
         1008  +2C8A; C; 2C8B; # COPTIC CAPITAL LETTER SOU
         1009  +2C8C; C; 2C8D; # COPTIC CAPITAL LETTER ZATA
         1010  +2C8E; C; 2C8F; # COPTIC CAPITAL LETTER HATE
         1011  +2C90; C; 2C91; # COPTIC CAPITAL LETTER THETHE
         1012  +2C92; C; 2C93; # COPTIC CAPITAL LETTER IAUDA
         1013  +2C94; C; 2C95; # COPTIC CAPITAL LETTER KAPA
         1014  +2C96; C; 2C97; # COPTIC CAPITAL LETTER LAULA
         1015  +2C98; C; 2C99; # COPTIC CAPITAL LETTER MI
         1016  +2C9A; C; 2C9B; # COPTIC CAPITAL LETTER NI
         1017  +2C9C; C; 2C9D; # COPTIC CAPITAL LETTER KSI
         1018  +2C9E; C; 2C9F; # COPTIC CAPITAL LETTER O
         1019  +2CA0; C; 2CA1; # COPTIC CAPITAL LETTER PI
         1020  +2CA2; C; 2CA3; # COPTIC CAPITAL LETTER RO
         1021  +2CA4; C; 2CA5; # COPTIC CAPITAL LETTER SIMA
         1022  +2CA6; C; 2CA7; # COPTIC CAPITAL LETTER TAU
         1023  +2CA8; C; 2CA9; # COPTIC CAPITAL LETTER UA
         1024  +2CAA; C; 2CAB; # COPTIC CAPITAL LETTER FI
         1025  +2CAC; C; 2CAD; # COPTIC CAPITAL LETTER KHI
         1026  +2CAE; C; 2CAF; # COPTIC CAPITAL LETTER PSI
         1027  +2CB0; C; 2CB1; # COPTIC CAPITAL LETTER OOU
         1028  +2CB2; C; 2CB3; # COPTIC CAPITAL LETTER DIALECT-P ALEF
         1029  +2CB4; C; 2CB5; # COPTIC CAPITAL LETTER OLD COPTIC AIN
         1030  +2CB6; C; 2CB7; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC EIE
         1031  +2CB8; C; 2CB9; # COPTIC CAPITAL LETTER DIALECT-P KAPA
         1032  +2CBA; C; 2CBB; # COPTIC CAPITAL LETTER DIALECT-P NI
         1033  +2CBC; C; 2CBD; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC NI
         1034  +2CBE; C; 2CBF; # COPTIC CAPITAL LETTER OLD COPTIC OOU
         1035  +2CC0; C; 2CC1; # COPTIC CAPITAL LETTER SAMPI
         1036  +2CC2; C; 2CC3; # COPTIC CAPITAL LETTER CROSSED SHEI
         1037  +2CC4; C; 2CC5; # COPTIC CAPITAL LETTER OLD COPTIC SHEI
         1038  +2CC6; C; 2CC7; # COPTIC CAPITAL LETTER OLD COPTIC ESH
         1039  +2CC8; C; 2CC9; # COPTIC CAPITAL LETTER AKHMIMIC KHEI
         1040  +2CCA; C; 2CCB; # COPTIC CAPITAL LETTER DIALECT-P HORI
         1041  +2CCC; C; 2CCD; # COPTIC CAPITAL LETTER OLD COPTIC HORI
         1042  +2CCE; C; 2CCF; # COPTIC CAPITAL LETTER OLD COPTIC HA
         1043  +2CD0; C; 2CD1; # COPTIC CAPITAL LETTER L-SHAPED HA
         1044  +2CD2; C; 2CD3; # COPTIC CAPITAL LETTER OLD COPTIC HEI
         1045  +2CD4; C; 2CD5; # COPTIC CAPITAL LETTER OLD COPTIC HAT
         1046  +2CD6; C; 2CD7; # COPTIC CAPITAL LETTER OLD COPTIC GANGIA
         1047  +2CD8; C; 2CD9; # COPTIC CAPITAL LETTER OLD COPTIC DJA
         1048  +2CDA; C; 2CDB; # COPTIC CAPITAL LETTER OLD COPTIC SHIMA
         1049  +2CDC; C; 2CDD; # COPTIC CAPITAL LETTER OLD NUBIAN SHIMA
         1050  +2CDE; C; 2CDF; # COPTIC CAPITAL LETTER OLD NUBIAN NGI
         1051  +2CE0; C; 2CE1; # COPTIC CAPITAL LETTER OLD NUBIAN NYI
         1052  +2CE2; C; 2CE3; # COPTIC CAPITAL LETTER OLD NUBIAN WAU
         1053  +2CEB; C; 2CEC; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC SHEI
         1054  +2CED; C; 2CEE; # COPTIC CAPITAL LETTER CRYPTOGRAMMIC GANGIA
         1055  +2CF2; C; 2CF3; # COPTIC CAPITAL LETTER BOHAIRIC KHEI
         1056  +A640; C; A641; # CYRILLIC CAPITAL LETTER ZEMLYA
         1057  +A642; C; A643; # CYRILLIC CAPITAL LETTER DZELO
         1058  +A644; C; A645; # CYRILLIC CAPITAL LETTER REVERSED DZE
         1059  +A646; C; A647; # CYRILLIC CAPITAL LETTER IOTA
         1060  +A648; C; A649; # CYRILLIC CAPITAL LETTER DJERV
         1061  +A64A; C; A64B; # CYRILLIC CAPITAL LETTER MONOGRAPH UK
         1062  +A64C; C; A64D; # CYRILLIC CAPITAL LETTER BROAD OMEGA
         1063  +A64E; C; A64F; # CYRILLIC CAPITAL LETTER NEUTRAL YER
         1064  +A650; C; A651; # CYRILLIC CAPITAL LETTER YERU WITH BACK YER
         1065  +A652; C; A653; # CYRILLIC CAPITAL LETTER IOTIFIED YAT
         1066  +A654; C; A655; # CYRILLIC CAPITAL LETTER REVERSED YU
         1067  +A656; C; A657; # CYRILLIC CAPITAL LETTER IOTIFIED A
         1068  +A658; C; A659; # CYRILLIC CAPITAL LETTER CLOSED LITTLE YUS
         1069  +A65A; C; A65B; # CYRILLIC CAPITAL LETTER BLENDED YUS
         1070  +A65C; C; A65D; # CYRILLIC CAPITAL LETTER IOTIFIED CLOSED LITTLE YUS
         1071  +A65E; C; A65F; # CYRILLIC CAPITAL LETTER YN
         1072  +A660; C; A661; # CYRILLIC CAPITAL LETTER REVERSED TSE
         1073  +A662; C; A663; # CYRILLIC CAPITAL LETTER SOFT DE
         1074  +A664; C; A665; # CYRILLIC CAPITAL LETTER SOFT EL
         1075  +A666; C; A667; # CYRILLIC CAPITAL LETTER SOFT EM
         1076  +A668; C; A669; # CYRILLIC CAPITAL LETTER MONOCULAR O
         1077  +A66A; C; A66B; # CYRILLIC CAPITAL LETTER BINOCULAR O
         1078  +A66C; C; A66D; # CYRILLIC CAPITAL LETTER DOUBLE MONOCULAR O
         1079  +A680; C; A681; # CYRILLIC CAPITAL LETTER DWE
         1080  +A682; C; A683; # CYRILLIC CAPITAL LETTER DZWE
         1081  +A684; C; A685; # CYRILLIC CAPITAL LETTER ZHWE
         1082  +A686; C; A687; # CYRILLIC CAPITAL LETTER CCHE
         1083  +A688; C; A689; # CYRILLIC CAPITAL LETTER DZZE
         1084  +A68A; C; A68B; # CYRILLIC CAPITAL LETTER TE WITH MIDDLE HOOK
         1085  +A68C; C; A68D; # CYRILLIC CAPITAL LETTER TWE
         1086  +A68E; C; A68F; # CYRILLIC CAPITAL LETTER TSWE
         1087  +A690; C; A691; # CYRILLIC CAPITAL LETTER TSSE
         1088  +A692; C; A693; # CYRILLIC CAPITAL LETTER TCHE
         1089  +A694; C; A695; # CYRILLIC CAPITAL LETTER HWE
         1090  +A696; C; A697; # CYRILLIC CAPITAL LETTER SHWE
         1091  +A722; C; A723; # LATIN CAPITAL LETTER EGYPTOLOGICAL ALEF
         1092  +A724; C; A725; # LATIN CAPITAL LETTER EGYPTOLOGICAL AIN
         1093  +A726; C; A727; # LATIN CAPITAL LETTER HENG
         1094  +A728; C; A729; # LATIN CAPITAL LETTER TZ
         1095  +A72A; C; A72B; # LATIN CAPITAL LETTER TRESILLO
         1096  +A72C; C; A72D; # LATIN CAPITAL LETTER CUATRILLO
         1097  +A72E; C; A72F; # LATIN CAPITAL LETTER CUATRILLO WITH COMMA
         1098  +A732; C; A733; # LATIN CAPITAL LETTER AA
         1099  +A734; C; A735; # LATIN CAPITAL LETTER AO
         1100  +A736; C; A737; # LATIN CAPITAL LETTER AU
         1101  +A738; C; A739; # LATIN CAPITAL LETTER AV
         1102  +A73A; C; A73B; # LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR
         1103  +A73C; C; A73D; # LATIN CAPITAL LETTER AY
         1104  +A73E; C; A73F; # LATIN CAPITAL LETTER REVERSED C WITH DOT
         1105  +A740; C; A741; # LATIN CAPITAL LETTER K WITH STROKE
         1106  +A742; C; A743; # LATIN CAPITAL LETTER K WITH DIAGONAL STROKE
         1107  +A744; C; A745; # LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE
         1108  +A746; C; A747; # LATIN CAPITAL LETTER BROKEN L
         1109  +A748; C; A749; # LATIN CAPITAL LETTER L WITH HIGH STROKE
         1110  +A74A; C; A74B; # LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY
         1111  +A74C; C; A74D; # LATIN CAPITAL LETTER O WITH LOOP
         1112  +A74E; C; A74F; # LATIN CAPITAL LETTER OO
         1113  +A750; C; A751; # LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER
         1114  +A752; C; A753; # LATIN CAPITAL LETTER P WITH FLOURISH
         1115  +A754; C; A755; # LATIN CAPITAL LETTER P WITH SQUIRREL TAIL
         1116  +A756; C; A757; # LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER
         1117  +A758; C; A759; # LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE
         1118  +A75A; C; A75B; # LATIN CAPITAL LETTER R ROTUNDA
         1119  +A75C; C; A75D; # LATIN CAPITAL LETTER RUM ROTUNDA
         1120  +A75E; C; A75F; # LATIN CAPITAL LETTER V WITH DIAGONAL STROKE
         1121  +A760; C; A761; # LATIN CAPITAL LETTER VY
         1122  +A762; C; A763; # LATIN CAPITAL LETTER VISIGOTHIC Z
         1123  +A764; C; A765; # LATIN CAPITAL LETTER THORN WITH STROKE
         1124  +A766; C; A767; # LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER
         1125  +A768; C; A769; # LATIN CAPITAL LETTER VEND
         1126  +A76A; C; A76B; # LATIN CAPITAL LETTER ET
         1127  +A76C; C; A76D; # LATIN CAPITAL LETTER IS
         1128  +A76E; C; A76F; # LATIN CAPITAL LETTER CON
         1129  +A779; C; A77A; # LATIN CAPITAL LETTER INSULAR D
         1130  +A77B; C; A77C; # LATIN CAPITAL LETTER INSULAR F
         1131  +A77D; C; 1D79; # LATIN CAPITAL LETTER INSULAR G
         1132  +A77E; C; A77F; # LATIN CAPITAL LETTER TURNED INSULAR G
         1133  +A780; C; A781; # LATIN CAPITAL LETTER TURNED L
         1134  +A782; C; A783; # LATIN CAPITAL LETTER INSULAR R
         1135  +A784; C; A785; # LATIN CAPITAL LETTER INSULAR S
         1136  +A786; C; A787; # LATIN CAPITAL LETTER INSULAR T
         1137  +A78B; C; A78C; # LATIN CAPITAL LETTER SALTILLO
         1138  +A78D; C; 0265; # LATIN CAPITAL LETTER TURNED H
         1139  +A790; C; A791; # LATIN CAPITAL LETTER N WITH DESCENDER
         1140  +A792; C; A793; # LATIN CAPITAL LETTER C WITH BAR
         1141  +A7A0; C; A7A1; # LATIN CAPITAL LETTER G WITH OBLIQUE STROKE
         1142  +A7A2; C; A7A3; # LATIN CAPITAL LETTER K WITH OBLIQUE STROKE
         1143  +A7A4; C; A7A5; # LATIN CAPITAL LETTER N WITH OBLIQUE STROKE
         1144  +A7A6; C; A7A7; # LATIN CAPITAL LETTER R WITH OBLIQUE STROKE
         1145  +A7A8; C; A7A9; # LATIN CAPITAL LETTER S WITH OBLIQUE STROKE
         1146  +A7AA; C; 0266; # LATIN CAPITAL LETTER H WITH HOOK
         1147  +FB00; F; 0066 0066; # LATIN SMALL LIGATURE FF
         1148  +FB01; F; 0066 0069; # LATIN SMALL LIGATURE FI
         1149  +FB02; F; 0066 006C; # LATIN SMALL LIGATURE FL
         1150  +FB03; F; 0066 0066 0069; # LATIN SMALL LIGATURE FFI
         1151  +FB04; F; 0066 0066 006C; # LATIN SMALL LIGATURE FFL
         1152  +FB05; F; 0073 0074; # LATIN SMALL LIGATURE LONG S T
         1153  +FB06; F; 0073 0074; # LATIN SMALL LIGATURE ST
         1154  +FB13; F; 0574 0576; # ARMENIAN SMALL LIGATURE MEN NOW
         1155  +FB14; F; 0574 0565; # ARMENIAN SMALL LIGATURE MEN ECH
         1156  +FB15; F; 0574 056B; # ARMENIAN SMALL LIGATURE MEN INI
         1157  +FB16; F; 057E 0576; # ARMENIAN SMALL LIGATURE VEW NOW
         1158  +FB17; F; 0574 056D; # ARMENIAN SMALL LIGATURE MEN XEH
         1159  +FF21; C; FF41; # FULLWIDTH LATIN CAPITAL LETTER A
         1160  +FF22; C; FF42; # FULLWIDTH LATIN CAPITAL LETTER B
         1161  +FF23; C; FF43; # FULLWIDTH LATIN CAPITAL LETTER C
         1162  +FF24; C; FF44; # FULLWIDTH LATIN CAPITAL LETTER D
         1163  +FF25; C; FF45; # FULLWIDTH LATIN CAPITAL LETTER E
         1164  +FF26; C; FF46; # FULLWIDTH LATIN CAPITAL LETTER F
         1165  +FF27; C; FF47; # FULLWIDTH LATIN CAPITAL LETTER G
         1166  +FF28; C; FF48; # FULLWIDTH LATIN CAPITAL LETTER H
         1167  +FF29; C; FF49; # FULLWIDTH LATIN CAPITAL LETTER I
         1168  +FF2A; C; FF4A; # FULLWIDTH LATIN CAPITAL LETTER J
         1169  +FF2B; C; FF4B; # FULLWIDTH LATIN CAPITAL LETTER K
         1170  +FF2C; C; FF4C; # FULLWIDTH LATIN CAPITAL LETTER L
         1171  +FF2D; C; FF4D; # FULLWIDTH LATIN CAPITAL LETTER M
         1172  +FF2E; C; FF4E; # FULLWIDTH LATIN CAPITAL LETTER N
         1173  +FF2F; C; FF4F; # FULLWIDTH LATIN CAPITAL LETTER O
         1174  +FF30; C; FF50; # FULLWIDTH LATIN CAPITAL LETTER P
         1175  +FF31; C; FF51; # FULLWIDTH LATIN CAPITAL LETTER Q
         1176  +FF32; C; FF52; # FULLWIDTH LATIN CAPITAL LETTER R
         1177  +FF33; C; FF53; # FULLWIDTH LATIN CAPITAL LETTER S
         1178  +FF34; C; FF54; # FULLWIDTH LATIN CAPITAL LETTER T
         1179  +FF35; C; FF55; # FULLWIDTH LATIN CAPITAL LETTER U
         1180  +FF36; C; FF56; # FULLWIDTH LATIN CAPITAL LETTER V
         1181  +FF37; C; FF57; # FULLWIDTH LATIN CAPITAL LETTER W
         1182  +FF38; C; FF58; # FULLWIDTH LATIN CAPITAL LETTER X
         1183  +FF39; C; FF59; # FULLWIDTH LATIN CAPITAL LETTER Y
         1184  +FF3A; C; FF5A; # FULLWIDTH LATIN CAPITAL LETTER Z
         1185  +10400; C; 10428; # DESERET CAPITAL LETTER LONG I
         1186  +10401; C; 10429; # DESERET CAPITAL LETTER LONG E
         1187  +10402; C; 1042A; # DESERET CAPITAL LETTER LONG A
         1188  +10403; C; 1042B; # DESERET CAPITAL LETTER LONG AH
         1189  +10404; C; 1042C; # DESERET CAPITAL LETTER LONG O
         1190  +10405; C; 1042D; # DESERET CAPITAL LETTER LONG OO
         1191  +10406; C; 1042E; # DESERET CAPITAL LETTER SHORT I
         1192  +10407; C; 1042F; # DESERET CAPITAL LETTER SHORT E
         1193  +10408; C; 10430; # DESERET CAPITAL LETTER SHORT A
         1194  +10409; C; 10431; # DESERET CAPITAL LETTER SHORT AH
         1195  +1040A; C; 10432; # DESERET CAPITAL LETTER SHORT O
         1196  +1040B; C; 10433; # DESERET CAPITAL LETTER SHORT OO
         1197  +1040C; C; 10434; # DESERET CAPITAL LETTER AY
         1198  +1040D; C; 10435; # DESERET CAPITAL LETTER OW
         1199  +1040E; C; 10436; # DESERET CAPITAL LETTER WU
         1200  +1040F; C; 10437; # DESERET CAPITAL LETTER YEE
         1201  +10410; C; 10438; # DESERET CAPITAL LETTER H
         1202  +10411; C; 10439; # DESERET CAPITAL LETTER PEE
         1203  +10412; C; 1043A; # DESERET CAPITAL LETTER BEE
         1204  +10413; C; 1043B; # DESERET CAPITAL LETTER TEE
         1205  +10414; C; 1043C; # DESERET CAPITAL LETTER DEE
         1206  +10415; C; 1043D; # DESERET CAPITAL LETTER CHEE
         1207  +10416; C; 1043E; # DESERET CAPITAL LETTER JEE
         1208  +10417; C; 1043F; # DESERET CAPITAL LETTER KAY
         1209  +10418; C; 10440; # DESERET CAPITAL LETTER GAY
         1210  +10419; C; 10441; # DESERET CAPITAL LETTER EF
         1211  +1041A; C; 10442; # DESERET CAPITAL LETTER VEE
         1212  +1041B; C; 10443; # DESERET CAPITAL LETTER ETH
         1213  +1041C; C; 10444; # DESERET CAPITAL LETTER THEE
         1214  +1041D; C; 10445; # DESERET CAPITAL LETTER ES
         1215  +1041E; C; 10446; # DESERET CAPITAL LETTER ZEE
         1216  +1041F; C; 10447; # DESERET CAPITAL LETTER ESH
         1217  +10420; C; 10448; # DESERET CAPITAL LETTER ZHEE
         1218  +10421; C; 10449; # DESERET CAPITAL LETTER ER
         1219  +10422; C; 1044A; # DESERET CAPITAL LETTER EL
         1220  +10423; C; 1044B; # DESERET CAPITAL LETTER EM
         1221  +10424; C; 1044C; # DESERET CAPITAL LETTER EN
         1222  +10425; C; 1044D; # DESERET CAPITAL LETTER ENG
         1223  +10426; C; 1044E; # DESERET CAPITAL LETTER OI
         1224  +10427; C; 1044F; # DESERET CAPITAL LETTER EW

Added ext/fts3/unicode/UnicodeData.txt.

more than 10,000 changes

Added ext/fts3/unicode/mkunicode.tcl.

            1  +
            2  +
            3  +# Parameter $zName must be a path to the file UnicodeData.txt. This command
            4  +# reads the file and returns a list of codepoints (integers), in file order.
            5  +# The list contains every codepoint whose "General Category" begins with
            6  +# neither "L" (Letter) nor "N" (Number). A non-empty line that does not
            7  +# split into exactly 15 ";"-separated fields raises "parse error: <line>".
            8  +proc an_load_unicodedata_text {zName} {
            9  +  set lField {
           10  +    code
           11  +    character_name
           12  +    general_category
           13  +    canonical_combining_classes
           14  +    bidirectional_category
           15  +    character_decomposition_mapping
           16  +    decimal_digit_value
           17  +    digit_value
           18  +    numeric_value
           19  +    mirrored
           20  +    unicode_1_name
           21  +    iso10646_comment_field
           22  +    uppercase_mapping
           23  +    lowercase_mapping
           24  +    titlecase_mapping
           25  +  }
           26  +
           27  +  # Read the whole file and close it before parsing, so that a parse error
           28  +  # below cannot leak the channel.
           29  +  set fd [open $zName]
           30  +  set zData [read $fd]
           31  +  close $fd
           32  +
           33  +  set lRet [list]
           34  +  foreach line [split $zData "\n"] {
           35  +    if {$line eq ""} continue
           36  +    set fields [split $line ";"]
           37  +    if {[llength $fields] != [llength $lField]} { error "parse error: $line" }
           38  +    foreach $lField $fields {}
           39  +    # $code is hexadecimal; format converts it without evaluating file data.
           40  +    set iCode [format %d "0x$code"]
           41  +    set bAlnum [expr {[lsearch {L N} [string index $general_category 0]]>=0}]
           42  +    if { !$bAlnum } { lappend lRet $iCode }
           43  +  }
           44  +  set lRet
           45  +}
           46  +
           47  +proc an_load_separator_ranges {} {
           48  +  # Collapse the codepoints that an_load_unicodedata_text returns for the
           49  +  # file named by the global ${unicodedata.txt} into {first count} pairs,
           50  +  # one pair per run of consecutive codepoints. Returns the list of pairs.
           51  +  global unicodedata.txt
           52  +  set lRange [list]
           53  +  unset -nocomplain iFirst nRange
           54  +  foreach sep [an_load_unicodedata_text ${unicodedata.txt}] {
           55  +    if {[info exists iFirst] && $sep == ($iFirst+$nRange)} {
           56  +      incr nRange          ;# $sep extends the run that is currently open
           57  +      continue
           58  +    }
           59  +    # $sep starts a new run: flush the open one (if any) first.
           60  +    if {[info exists iFirst]} { lappend lRange [list $iFirst $nRange] }
           61  +    set iFirst $sep
           62  +    set nRange 1
           63  +  }
           64  +  # The final run is still open here. With no codepoints at all iFirst was
           65  +  # never set, and reading it below raises an error.
           66  +  lappend lRange [list $iFirst $nRange]
           67  +}
           68  +
           69  +proc an_print_range_array {lRange} {
           70  +  # Print the C table aEntry[]: every {first count} pair in $lRange becomes
           71  +  # one 32-bit word, (first<<10)+count. Raises an error if a pair cannot fit.
           72  +  set iFirstMax 0
           73  +  set nRangeMax 0
           74  +  foreach range $lRange {
           75  +    foreach {iFirst nRange} $range {}
           76  +    if {$iFirst > $iFirstMax} {set iFirstMax $iFirst}
           77  +    if {$nRange > $nRangeMax} {set nRangeMax $nRange}
           78  +  }
           79  +  if {$iFirstMax >= (1<<22)} {error "first-max is too large for format"}
           80  +  if {$nRangeMax >= (1<<10)} {error "range-max is too large for format"}
           81  +
           82  +  puts -nonewline "  "
           83  +  puts [string trim {
           84  +  /* Each unsigned integer in the following array corresponds to a contiguous
           85  +  ** range of unicode codepoints that are not either letters or numbers (i.e.
           86  +  ** codepoints for which this function should return 0).
           87  +  **
           88  +  ** The most significant 22 bits in each 32-bit value contain the first 
           89  +  ** codepoint in the range. The least significant 10 bits are used to store
           90  +  ** the size of the range (always at least 1). In other words, the value 
           91  +  ** ((C<<10) + N) represents a range of N codepoints starting with codepoint 
           92  +  ** C. It is not possible to represent a range larger than 1023 codepoints 
           93  +  ** using this format.
           94  +  */
           95  +  }]
           96  +  puts -nonewline "  static const unsigned int aEntry\[\] = \{"
           97  +  set i 0
           98  +  foreach range $lRange {
           99  +    foreach {iFirst nRange} $range {}
          100  +    if {($i % 5)==0} {puts "" ; puts -nonewline "   "}  ;# five words per row
          101  +    puts -nonewline [format " 0x%08X," [expr {($iFirst<<10) + $nRange}]]
          102  +    incr i
          103  +  }
          104  +  puts ""
          105  +  puts "  \};"
          106  +}
          107  +
          108  +proc an_print_ascii_bitmap {lRange} {
          109  +  # Print the C table aAscii[4], a 128-bit map in which bit (c & 31) of word
          110  +  # (c >> 5) is set for every codepoint c <= 127 covered by a pair in $lRange.
          111  +  set lWord [list 0 0 0 0]
          112  +  foreach range $lRange {
          113  +    foreach {iFirst nRange} $range {}
          114  +    set iEnd [expr {$iFirst + $nRange}]
          115  +    for {set c $iFirst} {$c < $iEnd} {incr c} {
          116  +      if {$c > 127} continue
          117  +      set iWord [expr {$c >> 5}]
          118  +      lset lWord $iWord [expr {[lindex $lWord $iWord] | (1 << ($c & 0x001F))}]
          119  +    }
          120  +  }
          121  +
          122  +  puts "  static const unsigned int aAscii\[4\] = \{"
          123  +  set zRow "   "
          124  +  foreach w $lWord { append zRow [format " 0x%08X," $w] }
          125  +  puts $zRow
          126  +  puts "  \};"
          127  +}
          128  +
          129  +proc print_isalnum {zFunc lRange} {
          130  +  # Print the C function "int $zFunc(int c)": its header comment, the tables
          131  +  # from an_print_range_array and an_print_ascii_bitmap, then the lookup code.
          132  +  # The emitted code zero-initialises iRes and shifts an unsigned 1 (bit 31).
          133  +  puts "/*\n** Return true if the argument corresponds to a unicode codepoint"
          134  +  puts "** classified as either a letter or a number. Otherwise false."
          135  +  puts "**\n** The results are undefined if the value passed to this function"
          136  +  puts "** is less than zero."
          137  +  puts "*/\nint ${zFunc}\(int c)\{"
          138  +  an_print_range_array $lRange
          139  +  an_print_ascii_bitmap $lRange
          140  +  puts {
          141  +  if( c<128 ){
          142  +    return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
          143  +  }else if( c<(1<<22) ){
          144  +    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
          145  +    int iRes = 0;
          146  +    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
          147  +    int iLo = 0;
          148  +    while( iHi>=iLo ){
          149  +      int iTest = (iHi + iLo) / 2;
          150  +      if( key >= aEntry[iTest] ){
          151  +        iRes = iTest;
          152  +        iLo = iTest+1;
          153  +      }else{
          154  +        iHi = iTest-1;
          155  +      }
          156  +    }
          157  +    assert( aEntry[0]<key );
          158  +    assert( key>=aEntry[iRes] );
          159  +    return (c >= ((aEntry[iRes]>>10) + (aEntry[iRes]&0x3FF)));
          160  +  }
          161  +  return 1;}
          162  +  puts "\}"
          163  +}
          164  +
          165  +proc print_test_isalnum {zFunc lRange} {
          166  +  foreach range $lRange {
          167  +    foreach {iFirst nRange} $range {}
          168  +    for {set i $iFirst} {$i < ($iFirst+$nRange)} {incr i} { set a($i) 1 }
          169  +  }
          170  +
          171  +  puts "static int isalnum_test(int *piCode)\{"
          172  +  puts -nonewline "  unsigned char aAlnum\[\] = \{"
          173  +  for {set i 0} {$i < 70000} {incr i} {
          174  +    if {($i % 32)==0} { puts "" ; puts -nonewline "    " }
          175  +    set bFlag [expr ![info exists a($i)]]
          176  +    puts -nonewline "${bFlag},"
          177  +  }
          178  +  puts ""
          179  +  puts "  \};"
          180  +
          181  +  puts -nonewline "  int aLargeSep\[\] = \{"
          182  +  set i 0
          183  +  foreach iSep [lsort -integer [array names a]] {
          184  +    if {$iSep<70000} continue
          185  +    if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
          186  +    puts -nonewline " $iSep,"
          187  +    incr i
          188  +  }
          189  +  puts ""
          190  +  puts "  \};"
          191  +  puts -nonewline "  int aLargeOther\[\] = \{"
          192  +  set i 0
          193  +  foreach iSep [lsort -integer [array names a]] {
          194  +    if {$iSep<70000} continue
          195  +    if {[info exists a([expr $iSep-1])]==0} {
          196  +      if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
          197  +      puts -nonewline " [expr $iSep-1],"
          198  +      incr i
          199  +    }
          200  +    if {[info exists a([expr $iSep+1])]==0} {
          201  +      if {($i % 8)==0} { puts "" ; puts -nonewline "   " }
          202  +      puts -nonewline " [expr $iSep+1],"
          203  +      incr i
          204  +    }
          205  +  }
          206  +  puts ""
          207  +  puts "  \};"
          208  +
          209  +  puts [subst -nocommands {
          210  +  int i;
          211  +  for(i=0; i<sizeof(aAlnum)/sizeof(aAlnum[0]); i++){
          212  +    if( ${zFunc}(i)!=aAlnum[i] ){
          213  +      *piCode = i;
          214  +      return 1;
          215  +    }
          216  +  }
          217  +  for(i=0; i<sizeof(aLargeSep)/sizeof(aLargeSep[0]); i++){
          218  +    if( ${zFunc}(aLargeSep[i])!=0 ){
          219  +      *piCode = aLargeSep[i];
          220  +      return 1;
          221  +    }
          222  +  }
          223  +  for(i=0; i<sizeof(aLargeOther)/sizeof(aLargeOther[0]); i++){
          224  +    if( ${zFunc}(aLargeOther[i])!=1 ){
          225  +      *piCode = aLargeOther[i];
          226  +      return 1;
          227  +    }
          228  +  }
          229  +  }]
          230  +  puts "  return 0;"
          231  +  puts "\}"
          232  +}
          233  +
          234  +#-------------------------------------------------------------------------
          235  +
          236  +proc tl_load_casefolding_txt {zName} {
          237  +  global tl_lookup_table
          238  +
          239  +  set fd [open $zName]
          240  +  while { ![eof $fd] } {
          241  +    set line [gets $fd]
          242  +    if {[string range $line 0 0] == "#"} continue
          243  +    if {$line == ""} continue
          244  +
          245  +    foreach x {a b c d} {unset -nocomplain $x}
          246  +    foreach {a b c d} [split $line ";"] {}
          247  +
          248  +    set a2 [list]
          249  +    set c2 [list]
          250  +    foreach elem $a { lappend a2 [expr "0x[string trim $elem]"] }
          251  +    foreach elem $c { lappend c2 [expr "0x[string trim $elem]"] }
          252  +    set b [string trim $b]
          253  +    set d [string trim $d]
          254  +
          255  +    if {$b=="C" || $b=="S"} { set tl_lookup_table($a2) $c2 }
          256  +  }
          257  +}
          258  +
          259  +proc tl_create_records {} {
          260  +  global tl_lookup_table
          261  +
          262  +  set iFirst ""
          263  +  set nOff 0
          264  +  set nRange 0
          265  +  set nIncr 0
          266  +
          267  +  set lRecord [list]
          268  +  foreach code [lsort -integer [array names tl_lookup_table]] {
          269  +    set mapping $tl_lookup_table($code)
          270  +    if {$iFirst == ""} {
          271  +      set iFirst $code
          272  +      set nOff   [expr $mapping - $code]
          273  +      set nRange 1
          274  +      set nIncr 1
          275  +    } else {
          276  +      set diff [expr $code - ($iFirst + ($nIncr * ($nRange - 1)))]
          277  +      if { $nRange==1 && ($diff==1 || $diff==2) } {
          278  +        set nIncr $diff
          279  +      }
          280  +
          281  +      if {$diff != $nIncr || ($mapping - $code)!=$nOff} {
          282  +        if { $nRange==1 } {set nIncr 1}
          283  +        lappend lRecord [list $iFirst $nIncr $nRange $nOff]
          284  +        set iFirst $code
          285  +        set nOff   [expr $mapping - $code]
          286  +        set nRange 1
          287  +        set nIncr 1
          288  +      } else {
          289  +        incr nRange
          290  +      }
          291  +    }
          292  +  }
          293  +
          294  +  lappend lRecord [list $iFirst $nIncr $nRange $nOff]
          295  +
          296  +  set lRecord
          297  +}
          298  +
          299  +proc tl_print_table_header {} {
          300  +  puts -nonewline "  "
          301  +  puts [string trim {
          302  +  /* Each entry in the following array defines a rule for folding a range
          303  +  ** of codepoints to lower case. The rule applies to a range of nRange
          304  +  ** codepoints starting at codepoint iCode.
          305  +  **
          306  +  ** If the least significant bit in flags is clear, then the rule applies
          307  +  ** to all nRange codepoints (i.e. all nRange codepoints are upper case and
          308  +  ** need to be folded). Or, if it is set, then the rule only applies to
          309  +  ** every second codepoint in the range, starting with codepoint iCode.
          310  +  **
          311  +  ** The 7 most significant bits in flags are an index into the aiOff[]
          312  +  ** array. If a specific codepoint C does require folding, then its lower
          313  +  ** case equivalent is ((C + aiOff[flags>>1]) & 0xFFFF).
          314  +  **
          315  +  ** The contents of this array are generated by parsing the CaseFolding.txt
          316  +  ** file distributed as part of the "Unicode Character Database". See
          317  +  ** http://www.unicode.org for details.
          318  +  */
          319  +  }]
          320  +  puts "  static const struct TableEntry \{"
          321  +  puts "    unsigned short iCode;"
          322  +  puts "    unsigned char flags;"
          323  +  puts "    unsigned char nRange;"
          324  +  puts "  \} aEntry\[\] = \{"
          325  +}
          326  +
          327  +proc tl_print_table_entry {togglevar entry liOff} {
          328  +  upvar $togglevar t
          329  +  foreach {iFirst nIncr nRange nOff} $entry {}
          330  +
          331  +  if {$iFirst > (1<<16)} { return 1 }
          332  +
          333  +  if {[info exists t]==0} {set t 0}
          334  +  if {$t==0} { puts -nonewline "    " }
          335  +
          336  +  set flags 0
          337  +  if {$nIncr==2} { set flags 1 ; set nRange [expr $nRange * 2]}
          338  +  if {$nOff<0}   { incr nOff [expr (1<<16)] }
          339  +
          340  +  set idx [lsearch $liOff $nOff]
          341  +  if {$idx<0} {error "malfunction generating aiOff"}
          342  +  set flags [expr $flags + $idx*2]
          343  +
          344  +  set txt "{$iFirst, $flags, $nRange},"
          345  +  if {$t==2} {
          346  +    puts $txt
          347  +  } else {
          348  +    puts -nonewline [format "% -23s" $txt]
          349  +  }
          350  +  set t [expr ($t+1)%3]
          351  +
          352  +  return 0
          353  +}
          354  +
          355  +proc tl_print_table_footer {togglevar} {
          356  +  upvar $togglevar t
          357  +  if {$t!=0} {puts ""}
          358  +  puts "  \};"
          359  +}
          360  +
          361  +proc tl_print_if_entry {entry} {
          362  +  foreach {iFirst nIncr nRange nOff} $entry {}
          363  +  if {$nIncr==2} {error "tl_print_if_entry needs improvement!"}
          364  +
          365  +  puts "  else if( c>=$iFirst && c<[expr $iFirst+$nRange] )\{"
          366  +  puts "    ret = c + $nOff;"
          367  +  puts "  \}"
          368  +}
          369  +
          370  +proc tl_generate_ioff_table {lRecord} {
          371  +  foreach entry $lRecord {
          372  +    foreach {iFirst nIncr nRange iOff} $entry {}
          373  +    if {$iOff<0}   { incr iOff [expr (1<<16)] }
          374  +    if {[info exists a($iOff)]} continue
          375  +    set a($iOff) 1
          376  +  }
          377  +
          378  +  set liOff [lsort -integer [array names a]]
          379  +  if {[llength $liOff]>128} { error "Too many distinct ioffs" }
          380  +  return $liOff
          381  +}
          382  +
          383  +proc tl_print_ioff_table {liOff} {
          384  +  puts -nonewline "  static const unsigned short aiOff\[\] = \{"
          385  +  set i 0
          386  +  foreach off $liOff {
          387  +    if {($i % 8)==0} {puts "" ; puts -nonewline "   "}
          388  +    puts -nonewline [format "% -7s" "$off,"]
          389  +    incr i
          390  +  }
          391  +  puts ""
          392  +  puts "  \};"
          393  +
          394  +}
          395  +
          396  +proc print_tolower {zFunc} {
          397  +
          398  +  set lRecord [tl_create_records]
          399  +
          400  +  set lHigh [list]
          401  +  puts "/*"
          402  +  puts "** Interpret the argument as a unicode codepoint. If the codepoint"
          403  +  puts "** is an upper case character that has a lower case equivalent,"
          404  +  puts "** return the codepoint corresponding to the lower case version."
          405  +  puts "** Otherwise, return a copy of the argument."
          406  +  puts "**"
          407  +  puts "** The results are undefined if the value passed to this function"
          408  +  puts "** is less than zero."
          409  +  puts "*/"
          410  +  puts "int ${zFunc}\(int c)\{"
          411  +
          412  +  set liOff [tl_generate_ioff_table $lRecord]
          413  +  tl_print_table_header
          414  +  foreach entry $lRecord { 
          415  +    if {[tl_print_table_entry toggle $entry $liOff]} { 
          416  +      lappend lHigh $entry 
          417  +    } 
          418  +  }
          419  +  tl_print_table_footer toggle
          420  +  tl_print_ioff_table $liOff
          421  +
          422  +  puts {
          423  +  int ret = c;
          424  +
          425  +  assert( c>=0 );
          426  +  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
          427  +
          428  +  if( c<128 ){
          429  +    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
          430  +  }else if( c<65536 ){
          431  +    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
          432  +    int iLo = 0;
          433  +    int iRes = -1;
          434  +
          435  +    while( iHi>=iLo ){
          436  +      int iTest = (iHi + iLo) / 2;
          437  +      int cmp = (c - aEntry[iTest].iCode);
          438  +      if( cmp>=0 ){
          439  +        iRes = iTest;
          440  +        iLo = iTest+1;
          441  +      }else{
          442  +        iHi = iTest-1;
          443  +      }
          444  +    }
          445  +    assert( iRes<0 || c>=aEntry[iRes].iCode );
          446  +
          447  +    if( iRes>=0 ){
          448  +      const struct TableEntry *p = &aEntry[iRes];
          449  +      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
          450  +        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
          451  +        assert( ret>0 );
          452  +      }
          453  +    }
          454  +  }
          455  +  }
          456  +
          457  +  foreach entry $lHigh {
          458  +    tl_print_if_entry $entry
          459  +  }
          460  +
          461  +  puts ""
          462  +  puts "  return ret;"
          463  +  puts "\}"
          464  +}
          465  +
          466  +proc print_tolower_test {zFunc} {
          467  +  global tl_lookup_table
          468  +
          469  +  puts "static int tolower_test(int *piCode)\{"
          470  +  puts -nonewline "  static int aLookup\[\] = \{"
          471  +  for {set i 0} {$i < 70000} {incr i} {
          472  +    set expected $i
          473  +    catch { set expected $tl_lookup_table($i) }
          474  +    if {($i % 8)==0}  { puts "" ; puts -nonewline "    " }
          475  +    puts -nonewline "$expected, "
          476  +  }
          477  +  puts "  \};"
          478  +  puts "  int i;"
          479  +  puts "  for(i=0; i<sizeof(aLookup)/sizeof(aLookup\[0\]); i++)\{"
          480  +  puts "    if( ${zFunc}\(i)!=aLookup\[i\] )\{"
          481  +  puts "      *piCode = i;"
          482  +  puts "      return 1;"
          483  +  puts "    \}"
          484  +  puts "  \}"
          485  +  puts "  return 0;"
          486  +  puts "\}"
          487  +}
          488  +
          489  +
          490  +proc print_fileheader {} {
          491  +  puts [string trim {
          492  +/*
          493  +** 2012 May 25
          494  +**
          495  +** The author disclaims copyright to this source code.  In place of
          496  +** a legal notice, here is a blessing:
          497  +**
          498  +**    May you do good and not evil.
          499  +**    May you find forgiveness for yourself and forgive others.
          500  +**    May you share freely, never taking more than you give.
          501  +**
          502  +******************************************************************************
          503  +*/
          504  +
          505  +/*
          506  +** DO NOT EDIT THIS MACHINE GENERATED FILE.
          507  +*/
          508  +  }]
          509  +  puts ""
          510  +  puts "#ifndef SQLITE_DISABLE_FTS3_UNICODE"
          511  +  puts ""
          512  +  puts "#include <assert.h>"
          513  +  puts ""
          514  +}
          515  +
          516  +proc print_test_main {} {
          517  +  puts ""
          518  +  puts "#include <stdio.h>"
          519  +  puts ""
          520  +  puts "int main(int argc, char **argv)\{"
          521  +  puts "  int r1, r2;"
          522  +  puts "  int code;"
          523  +  puts "  r1 = isalnum_test(&code);"
          524  +  puts "  if( r1 ) printf(\"isalnum(): Problem with code %d\\n\",code);"
          525  +  puts "  else printf(\"isalnum(): test passed\\n\");"
          526  +  puts "  r2 = tolower_test(&code);"
          527  +  puts "  if( r2 ) printf(\"tolower(): Problem with code %d\\n\",code);"
          528  +  puts "  else printf(\"tolower(): test passed\\n\");"
          529  +  puts "  return (r1 || r2);"
          530  +  puts "\}"
          531  +}
          532  +
          533  +# Process the command line arguments. Exit early if they are not to
          534  +# our liking.
          535  +#
          536  +proc usage {} {
          537  +  puts -nonewline stderr "Usage: $::argv0 ?-test? i"
          538  +  puts            stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
          539  +  exit 1
          540  +}
          541  +if {[llength $argv]!=2 && [llength $argv]!=3} usage
          542  +if {[llength $argv]==3 && [lindex $argv 0]!="-test"} usage
          543  +set unicodedata.txt [lindex $argv end]
          544  +set casefolding.txt [lindex $argv end-1]
          545  +set generate_test_code [expr {[llength $argv]==3}]
          546  +
          547  +# Print the isalnum() function to stdout.
          548  +#
          549  +print_fileheader
          550  +set lRange [an_load_separator_ranges]
          551  +print_isalnum sqlite3FtsUnicodeIsalnum $lRange
          552  +
          553  +# Leave a gap between the two generated C functions.
          554  +#
          555  +puts ""
          556  +puts ""
          557  +
          558  +# Print the tolower() function to stdout.
          559  +#
          560  +tl_load_casefolding_txt ${casefolding.txt}
          561  +print_tolower sqlite3FtsUnicodeTolower
          562  +
          563  +# Print the test routines and main() function to stdout, if -test 
          564  +# was specified.
          565  +#
          566  +if {$::generate_test_code} {
          567  +  print_test_isalnum sqlite3FtsUnicodeIsalnum $lRange
          568  +  print_tolower_test sqlite3FtsUnicodeTolower 
          569  +  print_test_main 
          570  +}
          571  +
          572  +puts "#endif /* ifndef SQLITE_DISABLE_FTS3_UNICODE */"
          573  +

Changes to main.mk.

    51     51   # Object files for the SQLite library.
    52     52   #
    53     53   LIBOBJ+= alter.o analyze.o attach.o auth.o \
    54     54            backup.o bitvec.o btmutex.o btree.o build.o \
    55     55            callback.o complete.o ctime.o date.o delete.o expr.o fault.o fkey.o \
    56     56            fts3.o fts3_aux.o fts3_expr.o fts3_hash.o fts3_icu.o fts3_porter.o \
    57     57            fts3_snippet.o fts3_tokenizer.o fts3_tokenizer1.o \
           58  +	 fts3_unicode.o fts3_unicode2.o \
    58     59            fts3_write.o func.o global.o hash.o \
    59     60            icu.o insert.o journal.o legacy.o loadext.o \
    60     61            main.o malloc.o mem0.o mem1.o mem2.o mem3.o mem5.o \
    61     62            memjournal.o \
    62     63            mutex.o mutex_noop.o mutex_os2.o mutex_unix.o mutex_w32.o \
    63     64            notify.o opcodes.o os.o os_os2.o os_unix.o os_win.o \
    64     65            pager.o parse.o pcache.o pcache1.o pragma.o prepare.o printf.o \
................................................................................
   194    195     $(TOP)/ext/fts3/fts3_hash.h \
   195    196     $(TOP)/ext/fts3/fts3_icu.c \
   196    197     $(TOP)/ext/fts3/fts3_porter.c \
   197    198     $(TOP)/ext/fts3/fts3_snippet.c \
   198    199     $(TOP)/ext/fts3/fts3_tokenizer.h \
   199    200     $(TOP)/ext/fts3/fts3_tokenizer.c \
   200    201     $(TOP)/ext/fts3/fts3_tokenizer1.c \
          202  +  $(TOP)/ext/fts3/fts3_unicode.c \
          203  +  $(TOP)/ext/fts3/fts3_unicode2.c \
   201    204     $(TOP)/ext/fts3/fts3_write.c
   202    205   SRC += \
   203    206     $(TOP)/ext/icu/sqliteicu.h \
   204    207     $(TOP)/ext/icu/icu.c
   205    208   SRC += \
   206    209     $(TOP)/ext/rtree/rtree.h \
   207    210     $(TOP)/ext/rtree/rtree.c
................................................................................
   503    506   	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_porter.c
   504    507   
   505    508   fts3_tokenizer.o:	$(TOP)/ext/fts3/fts3_tokenizer.c $(HDR) $(EXTHDR)
   506    509   	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_tokenizer.c
   507    510   
   508    511   fts3_tokenizer1.o:	$(TOP)/ext/fts3/fts3_tokenizer1.c $(HDR) $(EXTHDR)
   509    512   	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_tokenizer1.c
          513  +
          514  +fts3_unicode.o:	$(TOP)/ext/fts3/fts3_unicode.c $(HDR) $(EXTHDR)
          515  +	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_unicode.c
          516  +
          517  +fts3_unicode2.o:	$(TOP)/ext/fts3/fts3_unicode2.c $(HDR) $(EXTHDR)
          518  +	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_unicode2.c
   510    519   
   511    520   fts3_write.o:	$(TOP)/ext/fts3/fts3_write.c $(HDR) $(EXTHDR)
   512    521   	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts3/fts3_write.c
   513    522   
   514    523   rtree.o:	$(TOP)/ext/rtree/rtree.c $(HDR) $(EXTHDR)
   515    524   	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/rtree/rtree.c
   516    525   

Changes to src/test_config.c.

   302    302   #endif
   303    303   
   304    304   #ifdef SQLITE_ENABLE_FTS3
   305    305     Tcl_SetVar2(interp, "sqlite_options", "fts3", "1", TCL_GLOBAL_ONLY);
   306    306   #else
   307    307     Tcl_SetVar2(interp, "sqlite_options", "fts3", "0", TCL_GLOBAL_ONLY);
   308    308   #endif
          309  +
          310  +#if !defined(SQLITE_ENABLE_FTS3) || defined(SQLITE_DISABLE_FTS3_UNICODE)
          311  +  Tcl_SetVar2(interp, "sqlite_options", "fts3_unicode", "0", TCL_GLOBAL_ONLY);
          312  +#else
          313  +  Tcl_SetVar2(interp, "sqlite_options", "fts3_unicode", "1", TCL_GLOBAL_ONLY);
          314  +#endif
   309    315   
   310    316   #ifdef SQLITE_OMIT_GET_TABLE
   311    317     Tcl_SetVar2(interp, "sqlite_options", "gettable", "0", TCL_GLOBAL_ONLY);
   312    318   #else
   313    319     Tcl_SetVar2(interp, "sqlite_options", "gettable", "1", TCL_GLOBAL_ONLY);
   314    320   #endif
   315    321   

Changes to test/fts3fault2.test.

   126    126     faultsim_restore_and_reopen
   127    127     db eval {SELECT * FROM sqlite_master}
   128    128   } -body {
   129    129     execsql { INSERT INTO ft(ft) VALUES('rebuild') }
   130    130   } -test {
   131    131     faultsim_test_result {0 {}}
   132    132   }
          133  +
          134  +ifcapable fts3_unicode {
          135  +  do_test 5.0 {
          136  +    faultsim_delete_and_reopen
          137  +    execsql {
          138  +      CREATE VIRTUAL TABLE ft USING fts4(a, tokenize=unicode61);
          139  +    }
          140  +    faultsim_save_and_close
          141  +  } {}
          142  +  
          143  +  do_faultsim_test 5.1 -faults oom* -prep {
          144  +    faultsim_restore_and_reopen
          145  +    db eval {SELECT * FROM sqlite_master}
          146  +  } -body {
          147  +    execsql { INSERT INTO ft VALUES('the quick brown fox'); }
          148  +    execsql { INSERT INTO ft VALUES(
          149  +       'theunusuallylongtokenthatjustdragsonandonandonandthendragsonsomemoreeof'
          150  +      );
          151  +    }
          152  +    execsql { SELECT docid FROM ft WHERE ft MATCH 'th*' }
          153  +  } -test {
          154  +    faultsim_test_result {0 {1 2}}
          155  +  }
          156  +}
   133    157   
   134    158   finish_test

Added test/fts4unicode.test.

            1  +# 2012 May 25
            2  +#
            3  +# The author disclaims copyright to this source code.  In place of
            4  +# a legal notice, here is a blessing:
            5  +#
            6  +#    May you do good and not evil.
            7  +#    May you find forgiveness for yourself and forgive others.
            8  +#    May you share freely, never taking more than you give.
            9  +#
           10  +#*************************************************************************
           11  +#
           12  +# The tests in this file focus on testing the "unicode61" FTS tokenizer.
           13  +#
           14  +
           15  +set testdir [file dirname $argv0]
           16  +source $testdir/tester.tcl
           17  +ifcapable !fts3_unicode { finish_test ; return }
           18  +set ::testprefix fts4unicode
           19  +
           20  +proc do_unicode_token_test {tn input res} {
           21  +  set input [string map {' ''} $input]
           22  +  uplevel [list do_execsql_test $tn "
           23  +    SELECT fts3_tokenizer_test('unicode61', '$input');
           24  +  " [list [list {*}$res]]]
           25  +}
           26  +
           27  +do_unicode_token_test 1.0 {a B c D} {0 a a 1 b B 2 c c 3 d D}
           28  +do_unicode_token_test 1.1 {  } {0   1   2  }
           29  +do_unicode_token_test 1.2 {xx xx xx} {0 xx xx 1 xx xx 2 xx xx}
           30  +
           31  +# 0x00DF is a small "sharp s". 0x1E9E is a capital sharp s.
           32  +do_unicode_token_test 1.3 "\uDF" "0 \uDF \uDF"
           33  +do_unicode_token_test 1.4 "\u1E9E" "0  \u1E9E"
           34  +do_unicode_token_test 1.5 "\u1E9E" "0 \uDF \u1E9E"
           35  +
           36  +do_unicode_token_test 1.6 "The quick brown fox" {
           37  +  0 the The 1 quick quick 2 brown brown 3 fox fox
           38  +}
           39  +do_unicode_token_test 1.7 "The\u00bfquick\u224ebrown\u2263fox" {
           40  +  0 the The 1 quick quick 2 brown brown 3 fox fox
           41  +}
           42  +
           43  +#-------------------------------------------------------------------------
           44  +#
           45  +set docs [list {
           46  +  Enhance the INSERT syntax to allow multiple rows to be inserted via the
           47  +  VALUES clause.
           48  +} {
           49  +  Enhance the CREATE VIRTUAL TABLE command to support the IF NOT EXISTS clause.
           50  +} {
           51  +  Added the sqlite3_stricmp() interface as a counterpart to sqlite3_strnicmp().
           52  +} {
           53  +  Added the sqlite3_db_readonly() interface.
           54  +} {
           55  +  Added the SQLITE_FCNTL_PRAGMA file control, giving VFS implementations the
           56  +  ability to add new PRAGMA statements or to override built-in PRAGMAs.  
           57  +} {
           58  +  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
           59  +  the same row that contains the maximum x value.
           60  +} {
           61  +  Added support for the FTS4 languageid option.
           62  +} {
           63  +  Documented support for the FTS4 content option. This feature has actually
           64  +  been in the code since version 3.7.9 but is only now considered to be
           65  +  officially supported.  
           66  +} {
           67  +  Pending statements no longer block ROLLBACK. Instead, the pending statement
           68  +  will return SQLITE_ABORT upon next access after the ROLLBACK.  
           69  +} {
           70  +  Improvements to the handling of CSV inputs in the command-line shell
           71  +} {
           72  +  Fix a bug introduced in version 3.7.10 that might cause a LEFT JOIN to be
           73  +  incorrectly converted into an INNER JOIN if the WHERE clause indexable terms
           74  +  connected by OR.  
           75  +}]
           76  +
           77  +set map(a) [list "\u00C4" "\u00E4"]  ; # LATIN LETTER A WITH DIAERESIS
           78  +set map(e) [list "\u00CB" "\u00EB"]  ; # LATIN LETTER E WITH DIAERESIS
           79  +set map(i) [list "\u00CF" "\u00EF"]  ; # LATIN LETTER I WITH DIAERESIS
           80  +set map(o) [list "\u00D6" "\u00F6"]  ; # LATIN LETTER O WITH DIAERESIS
           81  +set map(u) [list "\u00DC" "\u00FC"]  ; # LATIN LETTER U WITH DIAERESIS
           82  +set map(y) [list "\u0178" "\u00FF"]  ; # LATIN LETTER Y WITH DIAERESIS
           83  +set map(h) [list "\u1E26" "\u1E27"]  ; # LATIN LETTER H WITH DIAERESIS
           84  +set map(w) [list "\u1E84" "\u1E85"]  ; # LATIN LETTER W WITH DIAERESIS
           85  +set map(x) [list "\u1E8C" "\u1E8D"]  ; # LATIN LETTER X WITH DIAERESIS
           86  +foreach k [array names map] {
           87  +  lappend mappings [string toupper $k] [lindex $map($k) 0] 
           88  +  lappend mappings $k [lindex $map($k) 1]
           89  +}
           90  +proc mapdoc {doc} { 
           91  +  set doc [regsub -all {[[:space:]]+} $doc " "]
           92  +  string map $::mappings [string trim $doc] 
           93  +}
           94  +
           95  +do_test 2.0 {
           96  +  execsql { CREATE VIRTUAL TABLE t2 USING fts4(tokenize=unicode61, x); }
           97  +  foreach doc $docs {
           98  +    set d [mapdoc $doc]
           99  +    execsql { INSERT INTO t2 VALUES($d) }
          100  +  }
          101  +} {}
          102  +
          103  +do_test 2.1 {
          104  +  set q [mapdoc "row"]
          105  +  execsql { SELECT * FROM t2 WHERE t2 MATCH $q }
          106  +} [list [mapdoc {
          107  +  Queries of the form: "SELECT max(x), y FROM table" returns the value of y on
          108  +  the same row that contains the maximum x value.
          109  +}]]
          110  +
          111  +foreach {tn query snippet} {
          112  +  2 "row" {
          113  +     ...returns the value of y on the same [row] that contains 
          114  +     the maximum x value.
          115  +  }
          116  +  3 "ROW" {
          117  +     ...returns the value of y on the same [row] that contains 
          118  +     the maximum x value.
          119  +  }
          120  +  4 "rollback" {
          121  +     ...[ROLLBACK]. Instead, the pending statement
          122  +     will return SQLITE_ABORT upon next access after the [ROLLBACK].
          123  +  }
          124  +  5 "rOllback" {
          125  +     ...[ROLLBACK]. Instead, the pending statement
          126  +     will return SQLITE_ABORT upon next access after the [ROLLBACK].
          127  +  }
          128  +  6 "lang*" {
          129  +     Added support for the FTS4 [languageid] option.
          130  +  }
          131  +} {
          132  +  do_test 2.$tn {
          133  +    set q [mapdoc $query]
          134  +    execsql { SELECT snippet(t2, '[', ']', '...') FROM t2 WHERE t2 MATCH $q }
          135  +  } [list [mapdoc $snippet]]
          136  +}
          137  +
          138  +#-------------------------------------------------------------------------
          139  +# Make sure the unicode61 tokenizer does not crash if it is passed a 
          140  +# NULL pointer.
          141  +reset_db
          142  +do_execsql_test 3.1 {
          143  +  CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x, y);
          144  +  INSERT INTO t1 VALUES(NULL, 'a b c');
          145  +}
          146  +
          147  +do_execsql_test 3.2 {
          148  +  SELECT snippet(t1, '[', ']') FROM t1 WHERE t1 MATCH 'b'
          149  +} {{a [b] c}}
          150  +
          151  +do_execsql_test 3.3 {
          152  +  BEGIN;
          153  +  DELETE FROM t1;
          154  +  INSERT INTO t1 VALUES('b b b b b b b b b b b', 'b b b b b b b b b b b b b');
          155  +  INSERT INTO t1 SELECT * FROM t1;
          156  +  INSERT INTO t1 SELECT * FROM t1;
          157  +  INSERT INTO t1 SELECT * FROM t1;
          158  +  INSERT INTO t1 SELECT * FROM t1;
          159  +  INSERT INTO t1 SELECT * FROM t1;
          160  +  INSERT INTO t1 SELECT * FROM t1;
          161  +  INSERT INTO t1 SELECT * FROM t1;
          162  +  INSERT INTO t1 SELECT * FROM t1;
          163  +  INSERT INTO t1 SELECT * FROM t1;
          164  +  INSERT INTO t1 SELECT * FROM t1;
          165  +  INSERT INTO t1 SELECT * FROM t1;
          166  +  INSERT INTO t1 SELECT * FROM t1;
          167  +  INSERT INTO t1 SELECT * FROM t1;
          168  +  INSERT INTO t1 SELECT * FROM t1;
          169  +  INSERT INTO t1 SELECT * FROM t1;
          170  +  INSERT INTO t1 SELECT * FROM t1;
          171  +  INSERT INTO t1 VALUES('a b c', NULL);
          172  +  INSERT INTO t1 VALUES('a x c', NULL);
          173  +  COMMIT;
          174  +}
          175  +
          176  +do_execsql_test 3.4 {
          177  +  SELECT * FROM t1 WHERE t1 MATCH 'a b';
          178  +} {{a b c} {}}
          179  +
          180  +#-------------------------------------------------------------------------
          181  +#
          182  +reset_db
          183  +
          184  +do_test 4.1 {
          185  +  set a "abc\uFFFEdef"
          186  +  set b "abc\uD800def"
          187  +  set c "\uFFFEdef"
          188  +  set d "\uD800def"
          189  +  execsql {
          190  +    CREATE VIRTUAL TABLE t1 USING fts4(tokenize=unicode61, x);
          191  +    INSERT INTO t1 VALUES($a);
          192  +    INSERT INTO t1 VALUES($b);
          193  +    INSERT INTO t1 VALUES($c);
          194  +    INSERT INTO t1 VALUES($d);
          195  +  }
          196  +} {}
          197  +
          198  +do_test 4.2 {
          199  +  set a [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0x62}]
          200  +  set b [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0x62}]
          201  +  set c [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
          202  +  set d [binary format c* {0x61 0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF 0x62}]
          203  +  execsql {
          204  +    INSERT INTO t1 VALUES($a);
          205  +    INSERT INTO t1 VALUES($b);
          206  +    INSERT INTO t1 VALUES($c);
          207  +    INSERT INTO t1 VALUES($d);
          208  +  }
          209  +} {}
          210  +
          211  +do_test 4.3 {
          212  +  set a [binary format c* {0xF7 0xBF 0xBF 0xBF}]
          213  +  set b [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF}]
          214  +  set c [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF}]
          215  +  set d [binary format c* {0xF7 0xBF 0xBF 0xBF 0xBF 0xBF 0xBF}]
          216  +  execsql {
          217  +    INSERT INTO t1 VALUES($a);
          218  +    INSERT INTO t1 VALUES($b);
          219  +    INSERT INTO t1 VALUES($c);
          220  +    INSERT INTO t1 VALUES($d);
          221  +  }
          222  +} {}
          223  +
          224  +
          225  +
          226  +finish_test
          227  +

Changes to test/permutations.test.

   181    181     fts3near.test fts3query.test fts3shared.test fts3snippet.test 
   182    182     fts3sort.test
   183    183     fts3fault.test fts3malloc.test fts3matchinfo.test
   184    184     fts3aux1.test fts3comp1.test fts3auto.test
   185    185     fts4aa.test fts4content.test
   186    186     fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test
   187    187     fts3corrupt2.test fts3first.test fts4langid.test fts4merge.test
   188         -  fts4check.test
          188  +  fts4check.test fts4unicode.test
   189    189   }
   190    190   
   191    191   
   192    192   lappend ::testsuitelist xxx
   193    193   #-------------------------------------------------------------------------
   194    194   # Define the coverage related test suites:
   195    195   #

Changes to tool/mksqlite3c.tcl.

   312    312      fts3_expr.c
   313    313      fts3_hash.c
   314    314      fts3_porter.c
   315    315      fts3_tokenizer.c
   316    316      fts3_tokenizer1.c
   317    317      fts3_write.c
   318    318      fts3_snippet.c
          319  +   fts3_unicode.c
          320  +   fts3_unicode2.c
   319    321   
   320    322      rtree.c
   321    323      icu.c
   322    324      fts3_icu.c
   323    325   } {
   324    326     copy_file tsrc/$file
   325    327   }
   326    328   
   327    329   close $out