SQLite

Check-in [fb10bbb9f9]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix some problems with building fts5 and fts3 together using the amalgamation.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1: fb10bbb9f9c4481e6043d323a3018a4ec68eb0ff
User & Date: dan 2015-02-02 11:32:20.159
Context
2015-02-02
11:58
Ensure generated header file fts5parse.h is included in sqlite3.c. (check-in: bc7be2fcfd user: dan tags: fts5)
11:32
Fix some problems with building fts5 and fts3 together using the amalgamation. (check-in: fb10bbb9f9 user: dan tags: fts5)
09:40
Merge latest trunk changes with this branch. (check-in: 76212f2c9a user: dan tags: fts5)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts3/unicode/mkunicode.tcl.
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int remove_diacritic(int c)\{"
  puts "  unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i







|







113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
  puts "** If the argument is a codepoint corresponding to a lowercase letter"
  puts "** in the ASCII range with a diacritic added, return the codepoint"
  puts "** of the ASCII letter only. For example, if passed 235 - \"LATIN"
  puts "** SMALL LETTER E WITH DIAERESIS\" - return 65 (\"LATIN SMALL LETTER"
  puts "** E\"). The resuls of passing a codepoint that corresponds to an"
  puts "** uppercase letter are undefined."
  puts "*/"
  puts "static int ${::remove_diacritic}(int c)\{"
  puts "  unsigned short aDia\[\] = \{"
  puts -nonewline "        0, "
  set i 1
  foreach r $lRange {
    foreach {iCode nRange} $r {}
    if {($i % 8)==0} {puts "" ; puts -nonewline "    " }
    incr i
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
  }
  tl_print_table_footer toggle
  tl_print_ioff_table $liOff

  puts {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');







|







622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
    if {[tl_print_table_entry toggle $entry $liOff]} { 
      lappend lHigh $entry 
    } 
  }
  tl_print_table_footer toggle
  tl_print_ioff_table $liOff

  puts [subst -nocommands {
  int ret = c;

  assert( c>=0 );
  assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );

  if( c<128 ){
    if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  }

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

  puts ""
  puts "  return ret;"







|

|







655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = ${::remove_diacritic}(ret);
  }
  }]

  foreach entry $lHigh {
    tl_print_if_entry $entry
  }

  puts ""
  puts "  return ret;"
768
769
770
771
772
773
774

775
776
777
778
779
780
781
782
783
784
785

786
787
788
789
790
791
792
  puts            stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
  exit 1
}
if {[llength $argv]<2} usage
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]


set generate_test_code 0
set generate_fts5_code 0
set function_prefix "sqlite3Fts"
for {set i 0} {$i < [llength $argv]-2} {incr i} {
  switch -- [lindex $argv $i] {
    -test {
      set generate_test_code 1
    }
    -fts5 {
      set function_prefix sqlite3Fts5
      set generate_fts5_code 1

    }
    default {
      usage
    }
  }
}








>











>







768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
  puts            stderr "<CaseFolding.txt file> <UnicodeData.txt file>"
  exit 1
}
if {[llength $argv]<2} usage
set unicodedata.txt [lindex $argv end]
set casefolding.txt [lindex $argv end-1]

set remove_diacritic remove_diacritic
set generate_test_code 0
set generate_fts5_code 0
set function_prefix "sqlite3Fts"
for {set i 0} {$i < [llength $argv]-2} {incr i} {
  switch -- [lindex $argv $i] {
    -test {
      set generate_test_code 1
    }
    -fts5 {
      set function_prefix sqlite3Fts5
      set generate_fts5_code 1
      set remove_diacritic fts5_remove_diacritic
    }
    default {
      usage
    }
  }
}

Changes to ext/fts5/fts5Int.h.
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
**
*/
#ifndef _FTS5INT_H
#define _FTS5INT_H

#include "fts5.h"
#include "sqliteInt.h"
#include "fts3_tokenizer.h"


/*
** Maximum number of prefix indexes on single FTS5 table. This must be
** less than 32. If it is set to anything large than that, an #error
** directive in fts5_index.c will cause the build to fail.
*/







<







12
13
14
15
16
17
18

19
20
21
22
23
24
25
**
*/
#ifndef _FTS5INT_H
#define _FTS5INT_H

#include "fts5.h"
#include "sqliteInt.h"



/*
** Maximum number of prefix indexes on single FTS5 table. This must be
** less than 32. If it is set to anything large than that, an #error
** directive in fts5_index.c will cause the build to fail.
*/
Changes to ext/fts5/fts5_expr.c.
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#define FTS5_EOF 0

typedef struct Fts5ExprTerm Fts5ExprTerm;

/*
** Functions generated by lemon from fts5parse.y.
*/
void *sqlite3Fts5ParserAlloc(void *(*mallocProc)(size_t));
void sqlite3Fts5ParserFree(void*, void (*freeProc)(void*));
void sqlite3Fts5Parser(void*, int, Fts5Token, Fts5Parse*);

struct Fts5Expr {
  Fts5Index *pIndex;
  Fts5ExprNode *pRoot;
  int bDesc;                      /* Iterate in descending docid order */







|







21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#define FTS5_EOF 0

typedef struct Fts5ExprTerm Fts5ExprTerm;

/*
** Functions generated by lemon from fts5parse.y.
*/
void *sqlite3Fts5ParserAlloc(void *(*mallocProc)(u64));
void sqlite3Fts5ParserFree(void*, void (*freeProc)(void*));
void sqlite3Fts5Parser(void*, int, Fts5Token, Fts5Parse*);

struct Fts5Expr {
  Fts5Index *pIndex;
  Fts5ExprNode *pRoot;
  int bDesc;                      /* Iterate in descending docid order */
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
    }
  }

  *pz = &pToken->p[pToken->n];
  return tok;
}

static void *fts5ParseAlloc(size_t t){ return sqlite3_malloc((int)t); }
static void fts5ParseFree(void *p){ sqlite3_free(p); }

int sqlite3Fts5ExprNew(
  Fts5Config *pConfig,            /* FTS5 Configuration */
  const char *zExpr,              /* Expression text */
  Fts5Expr **ppNew, 
  char **pzErr







|







175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
    }
  }

  *pz = &pToken->p[pToken->n];
  return tok;
}

static void *fts5ParseAlloc(u64 t){ return sqlite3_malloc((int)t); }
static void fts5ParseFree(void *p){ sqlite3_free(p); }

int sqlite3Fts5ExprNew(
  Fts5Config *pConfig,            /* FTS5 Configuration */
  const char *zExpr,              /* Expression text */
  Fts5Expr **ppNew, 
  char **pzErr
Changes to ext/fts5/fts5_index.c.
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
        if( pLeaf ){
          if( fts5GetU16(&pLeaf->p[0])!=0 ) p->rc = FTS5_CORRUPT;
          fts5DataRelease(pLeaf);
        }
      }

      fts5DlidxIterFree(pDlidx);
      // fts5DlidxIterTestReverse(p, iIdx, iSegid, iter.iLeaf);
    }
  }

  if( p->rc==SQLITE_OK && iter.iLeaf!=pSeg->pgnoLast ){
    p->rc = FTS5_CORRUPT;
  }








|







3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
        if( pLeaf ){
          if( fts5GetU16(&pLeaf->p[0])!=0 ) p->rc = FTS5_CORRUPT;
          fts5DataRelease(pLeaf);
        }
      }

      fts5DlidxIterFree(pDlidx);
      fts5DlidxIterTestReverse(p, iIdx, iSegid, iter.iLeaf);
    }
  }

  if( p->rc==SQLITE_OK && iter.iLeaf!=pSeg->pgnoLast ){
    p->rc = FTS5_CORRUPT;
  }

Changes to ext/fts5/fts5_tokenize.c.
557
558
559
560
561
562
563

564
565
566
567
568
569
570
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
};


static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );







>







557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
  const char *zSuffix;
  int nSuffix;
  int (*xCond)(char *zStem, int nStem);
  const char *zOutput;
  int nOutput;
};

#if 0
static int fts5PorterApply(char *aBuf, int *pnBuf, PorterRule *aRule){
  int ret = -1;
  int nBuf = *pnBuf;
  PorterRule *p;

  for(p=aRule; p->zSuffix; p++){
    assert( strlen(p->zSuffix)==p->nSuffix );
580
581
582
583
584
585
586

587
588
589
590
591
592
593
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}


static int fts5PorterIsVowel(char c, int bYIsVowel){
  return (
      c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
  );
}








>







581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
      *pnBuf = nStem + p->nOutput;
      ret = p - aRule;
    }
  }

  return ret;
}
#endif

static int fts5PorterIsVowel(char c, int bYIsVowel){
  return (
      c=='a' || c=='e' || c=='i' || c=='o' || c=='u' || (bYIsVowel && c=='y')
  );
}

Changes to ext/fts5/fts5_unicode2.c.
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  ** The most significant 22 bits in each 32-bit value contain the first 
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value 
  ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
  ** C. It is not possible to represent a range larger than 1023 codepoints 
  ** using this format.
  */
  const static unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,







|







34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
  ** The most significant 22 bits in each 32-bit value contain the first 
  ** codepoint in the range. The least significant 10 bits are used to store
  ** the size of the range (always at least 1). In other words, the value 
  ** ((C<<22) + N) represents a range of N codepoints starting with codepoint 
  ** C. It is not possible to represent a range larger than 1023 codepoints 
  ** using this format.
  */
  static const unsigned int aEntry[] = {
    0x00000030, 0x0000E807, 0x00016C06, 0x0001EC2F, 0x0002AC07,
    0x0002D001, 0x0002D803, 0x0002EC01, 0x0002FC01, 0x00035C01,
    0x0003DC01, 0x000B0804, 0x000B480E, 0x000B9407, 0x000BB401,
    0x000BBC81, 0x000DD401, 0x000DF801, 0x000E1002, 0x000E1C01,
    0x000FD801, 0x00120808, 0x00156806, 0x00162402, 0x00163C01,
    0x00164437, 0x0017CC02, 0x00180005, 0x00181816, 0x00187802,
    0x00192C15, 0x0019A804, 0x0019C001, 0x001B5001, 0x001B580F,
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( c<128 ){
    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;







|







126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
    0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
  };

  if( c<128 ){
    return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
  }else if( c<(1<<22) ){
    unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
    int iRes = 0;
    int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
    int iLo = 0;
    while( iHi>=iLo ){
      int iTest = (iHi + iLo) / 2;
      if( key >= aEntry[iTest] ){
        iRes = iTest;
        iLo = iTest+1;
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int remove_diacritic(int c){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 







|







154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
** If the argument is a codepoint corresponding to a lowercase letter
** in the ASCII range with a diacritic added, return the codepoint
** of the ASCII letter only. For example, if passed 235 - "LATIN
** SMALL LETTER E WITH DIAERESIS" - return 65 ("LATIN SMALL LETTER
** E"). The resuls of passing a codepoint that corresponds to an
** uppercase letter are undefined.
*/
static int fts5_remove_diacritic(int c){
  unsigned short aDia[] = {
        0,  1797,  1848,  1859,  1891,  1928,  1940,  1995, 
     2024,  2040,  2060,  2110,  2168,  2206,  2264,  2286, 
     2344,  2383,  2472,  2488,  2516,  2596,  2668,  2732, 
     2782,  2842,  2894,  2954,  2984,  3000,  3028,  3336, 
     3456,  3696,  3712,  3728,  3744,  3896,  3912,  3928, 
     3968,  4008,  4040,  4106,  4138,  4170,  4202,  4234, 
197
198
199
200
201
202
203
204

205
206
207
208
209
210
211
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);
};



/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
int sqlite3Fts5UnicodeIsdiacritic(int c){







<
>







197
198
199
200
201
202
203

204
205
206
207
208
209
210
211
      iLo = iTest+1;
    }else{
      iHi = iTest-1;
    }
  }
  assert( key>=aDia[iRes] );
  return ((c > (aDia[iRes]>>3) + (aDia[iRes]&0x07)) ? c : (int)aChar[iRes]);

}


/*
** Return true if the argument interpreted as a unicode codepoint
** is a diacritical modifier character.
*/
int sqlite3Fts5UnicodeIsdiacritic(int c){
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = remove_diacritic(ret);
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS5) */







|









347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
      const struct TableEntry *p = &aEntry[iRes];
      if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
        ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
        assert( ret>0 );
      }
    }

    if( bRemoveDiacritic ) ret = fts5_remove_diacritic(ret);
  }
  
  else if( c>=66560 && c<66600 ){
    ret = c + 40;
  }

  return ret;
}
#endif /* defined(SQLITE_ENABLE_FTS5) */
Changes to main.mk.
239
240
241
242
243
244
245
246

247
248
249
250
251
252
253
   $(TOP)/ext/fts5/fts5.c \
   $(TOP)/ext/fts5/fts5_config.c \
   $(TOP)/ext/fts5/fts5_expr.c \
   $(TOP)/ext/fts5/fts5_hash.c \
   $(TOP)/ext/fts5/fts5_index.c \
   fts5parse.c \
   $(TOP)/ext/fts5/fts5_storage.c \
   $(TOP)/ext/fts5/fts5_tokenize.c 



# Generated source code files
#
SRC += \
  keywordhash.h \
  opcodes.c \







|
>







239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
   $(TOP)/ext/fts5/fts5.c \
   $(TOP)/ext/fts5/fts5_config.c \
   $(TOP)/ext/fts5/fts5_expr.c \
   $(TOP)/ext/fts5/fts5_hash.c \
   $(TOP)/ext/fts5/fts5_index.c \
   fts5parse.c \
   $(TOP)/ext/fts5/fts5_storage.c \
   $(TOP)/ext/fts5/fts5_tokenize.c \
   $(TOP)/ext/fts5/fts5_unicode2.c 


# Generated source code files
#
SRC += \
  keywordhash.h \
  opcodes.c \
630
631
632
633
634
635
636
637

638
639
640
641
642
643
644
	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts5/fts5_unicode2.c

fts5parse.c:	$(TOP)/ext/fts5/fts5parse.y lemon 
	cp $(TOP)/ext/fts5/fts5parse.y .
	rm -f fts5parse.h
	./lemon $(OPTS) fts5parse.y
	mv fts5parse.c fts5parse.c.orig
	cat fts5parse.c.orig | sed 's/yy/fts5yy/g' | sed 's/YY/fts5YY/g' > fts5parse.c



userauth.o:	$(TOP)/ext/userauth/userauth.c $(HDR) $(EXTHDR)
	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/userauth/userauth.c


# Rules for building test programs and for running tests







|
>







631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/fts5/fts5_unicode2.c

fts5parse.c:	$(TOP)/ext/fts5/fts5parse.y lemon 
	cp $(TOP)/ext/fts5/fts5parse.y .
	rm -f fts5parse.h
	./lemon $(OPTS) fts5parse.y
	mv fts5parse.c fts5parse.c.orig
	cat fts5parse.c.orig | sed 's/yy/fts5yy/g' | sed 's/YY/fts5YY/g' \
		| sed 's/TOKEN/FTS5TOKEN/g' > fts5parse.c


userauth.o:	$(TOP)/ext/userauth/userauth.c $(HDR) $(EXTHDR)
	$(TCCX) -DSQLITE_CORE -c $(TOP)/ext/userauth/userauth.c


# Rules for building test programs and for running tests
Changes to tool/mksqlite3c.tcl.
338
339
340
341
342
343
344

345
346
347
348
349
350
351
   fts5_config.c
   fts5_expr.c
   fts5_hash.c
   fts5_index.c
   fts5parse.c
   fts5_storage.c
   fts5_tokenize.c


   rtree.c
   icu.c
   fts3_icu.c
} {
  copy_file tsrc/$file
}







>







338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
   fts5_config.c
   fts5_expr.c
   fts5_hash.c
   fts5_index.c
   fts5parse.c
   fts5_storage.c
   fts5_tokenize.c
   fts5_unicode2.c

   rtree.c
   icu.c
   fts3_icu.c
} {
  copy_file tsrc/$file
}