/ Check-in [7adfa4a5]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix the spellfix1_scriptcode() function to ignore whitespace and punctuation, and to recognize hebrew and arabic scripts.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:7adfa4a5794e47f97491c08abeaaac90e826b331
User & Date: drh 2015-12-17 14:18:21
Context
2015-12-18
16:29
Micro-optimizations and comment fixes on the mem5.c memory allocator module. check-in: 8bf5e056 user: drh tags: trunk
03:59
Reduce the size of the CellInfo object from 32 to 24 bytes on 64-bit machines. Closed-Leaf check-in: 78507154 user: drh tags: optimize-cellinfo
2015-12-17
20:36
Add the "offsets=0" option to fts5, to create a smaller index without term offset information. A few things are currently broken on this branch. check-in: 40b5bbf0 user: dan tags: fts5-offsets
17:30
Reduce the size of the VdbeCursor object by a pointer (the pBt pointer used for ephemeral tables). check-in: 98b710c3 user: drh tags: optimize-vdbecursor
14:18
Fix the spellfix1_scriptcode() function to ignore whitespace and punctuation, and to recognize hebrew and arabic scripts. check-in: 7adfa4a5 user: drh tags: trunk
13:28
Fixes for harmless compiler warnings. check-in: 85ebd46c user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/misc/spellfix.c.

1713
1714
1715
1716
1717
1718
1719


1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730




1731
1732
1733
1734
1735
1736
1737


1738
1739
1740
1741
1742
1743
1744
  int nIn = sqlite3_value_bytes(argv[0]);
  int c, sz;
  int scriptMask = 0;
  int res;
# define SCRIPT_LATIN       0x0001
# define SCRIPT_CYRILLIC    0x0002
# define SCRIPT_GREEK       0x0004



  while( nIn>0 ){
    c = utf8Read(zIn, nIn, &sz);
    zIn += sz;
    nIn -= sz;
    if( c<0x02af ){
      scriptMask |= SCRIPT_LATIN;
    }else if( c>=0x0400 && c<=0x04ff ){
      scriptMask |= SCRIPT_CYRILLIC;
    }else if( c>=0x0386 && c<=0x03ce ){
      scriptMask |= SCRIPT_GREEK;




    }
  }
  switch( scriptMask ){
    case 0:                res = 999; break;
    case SCRIPT_LATIN:     res = 215; break;
    case SCRIPT_CYRILLIC:  res = 220; break;
    case SCRIPT_GREEK:     res = 200; break;


    default:               res = 998; break;
  }
  sqlite3_result_int(context, res);
}

/* End transliterate
******************************************************************************







>
>





|





>
>
>
>







>
>







1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
  int nIn = sqlite3_value_bytes(argv[0]);
  int c, sz;
  int scriptMask = 0;
  int res;
# define SCRIPT_LATIN       0x0001
# define SCRIPT_CYRILLIC    0x0002
# define SCRIPT_GREEK       0x0004
# define SCRIPT_HEBREW      0x0008
# define SCRIPT_ARABIC      0x0010

  while( nIn>0 ){
    c = utf8Read(zIn, nIn, &sz);
    zIn += sz;
    nIn -= sz;
    if( c<0x02af && (c>=0x80 || midClass[c&0x7f]<CCLASS_DIGIT) ){
      scriptMask |= SCRIPT_LATIN;
    }else if( c>=0x0400 && c<=0x04ff ){
      scriptMask |= SCRIPT_CYRILLIC;
    }else if( c>=0x0386 && c<=0x03ce ){
      scriptMask |= SCRIPT_GREEK;
    }else if( c>=0x0590 && c<=0x05ff ){
      scriptMask |= SCRIPT_HEBREW;
    }else if( c>=0x0600 && c<=0x06ff ){
      scriptMask |= SCRIPT_ARABIC;
    }
  }
  switch( scriptMask ){
    case 0:                res = 999; break;
    case SCRIPT_LATIN:     res = 215; break;
    case SCRIPT_CYRILLIC:  res = 220; break;
    case SCRIPT_GREEK:     res = 200; break;
    case SCRIPT_HEBREW:    res = 125; break;
    case SCRIPT_ARABIC:    res = 160; break;
    default:               res = 998; break;
  }
  sqlite3_result_int(context, res);
}

/* End transliterate
******************************************************************************

Added test/spellfix3.test.























































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
# 2015-12-17
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl
set testprefix spellfix3

ifcapable !vtab { finish_test ; return }

load_static_extension db spellfix

do_execsql_test 100 {
  SELECT spellfix1_scriptcode('And God said, “Let there be light”');
} {215}
do_execsql_test 110 {
  SELECT spellfix1_scriptcode('Бог сказал: "Да будет свет"');
} {220}
do_execsql_test 120 {
  SELECT spellfix1_scriptcode('και ειπεν ο θεος γενηθητω φως και εγενετο φως');
} {200}
do_execsql_test 130 {
  SELECT spellfix1_scriptcode('וַיֹּ֥אמֶר אֱלֹהִ֖ים יְהִ֣י א֑וֹר וַֽיְהִי־אֽוֹר׃');
} {125}
do_execsql_test 140 {
  SELECT spellfix1_scriptcode('فِي ذَلِكَ الوَقتِ، قالَ اللهُ: لِيَكُنْ نُورٌ. فَصَارَ نُورٌ.');
} {160}
do_execsql_test 200 {
  SELECT spellfix1_scriptcode('+3.14159');
} {999}
do_execsql_test 210 {
  SELECT spellfix1_scriptcode('And God said: "Да будет свет"');
} {998}

finish_test