/ Check-in [af8d43a4]
Login
Overview
Comment:Fix prefix indexes so that they work in characters, not bytes.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | fts5
Files: files | file ages | folders
SHA1:af8d43a4a08528bbae25ee38fe25de8a86f8a21c
User & Date: dan 2015-01-13 17:25:08
Context
2015-01-17
17:48
Improve the performance of the fts5 porter tokenizer implementation. check-in: 96ea6004 user: dan tags: fts5
2015-01-13
17:25
Fix prefix indexes so that they work in characters, not bytes. check-in: af8d43a4 user: dan tags: fts5
2015-01-12
17:58
Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii". check-in: f22dbcca user: dan tags: fts5
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ext/fts5/fts5_index.c.

77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
....
4044
4045
4046
4047
4048
4049
4050

































4051
4052
4053
4054
4055
4056
4057
....
4060
4061
4062
4063
4064
4065
4066


4067
4068

4069
4070
4071
4072
4073
4074
4075
4076
4077
....
4103
4104
4105
4106
4107
4108
4109
4110

4111
4112
4113
4114
4115
4116
4117
4118
....
4126
4127
4128
4129
4130
4131
4132

4133
4134
4135
4136
4137
4138
4139
4140
4141
....
4598
4599
4600
4601
4602
4603
4604

















































4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617





4618
4619
4620
**   Then, for each level from 0 to nMax:
**
**     + number of input segments in ongoing merge.
**     + total number of segments in level.
**     + for each segment from oldest to newest:
**         + segment id (always > 0)
**         + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.)
**         + first leaf page number (often 1)
**         + final leaf page number
**
** 2. The Averages Record:
**
**   A single record within the %_data table. The data is a list of varints.
**   The first value is the number of rows in the index. Then, for each column
**   from left to right, the total number of tokens in the column for all 
................................................................................
      sqlite3_free(p->apHash);
    }
    sqlite3_free(p->zDataTbl);
    sqlite3_free(p);
  }
  return rc;
}


































/*
** Calculate and return a checksum that is the XOR of the index entry
** checksum of all entries that would be generated by the token specified
** by the final 5 arguments.
*/
u64 sqlite3Fts5IndexCksum(
................................................................................
  int iCol,                       /* Column term appears in */
  int iPos,                       /* Position term appears in */
  const char *pTerm, int nTerm    /* Term at iPos */
){
  u64 ret = 0;                    /* Return value */
  int iIdx;                       /* For iterating through indexes */



  for(iIdx=0; iIdx<=pConfig->nPrefix; iIdx++){
    int n = ((iIdx==pConfig->nPrefix) ? nTerm : pConfig->aPrefix[iIdx]);

    if( n<=nTerm ){
      ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, n);
    }
  }

  return ret;
}

/*
................................................................................
    }
  }

  /* Add the new token to the main terms hash table. And to each of the
  ** prefix hash tables that it is large enough for. */
  fts5AddTermToHash(p, 0, iCol, iPos, pToken, nToken);
  for(i=0; i<pConfig->nPrefix; i++){
    if( nToken>=pConfig->aPrefix[i] ){

      fts5AddTermToHash(p, i+1, iCol, iPos, pToken, pConfig->aPrefix[i]);
    }
  }

  return fts5IndexReturn(p);
}

/*
................................................................................
  Fts5IndexIter **ppIter          /* OUT: New iterator object */
){
  Fts5IndexIter *pRet;
  int iIdx = 0;

  if( flags & FTS5INDEX_QUERY_PREFIX ){
    Fts5Config *pConfig = p->pConfig;

    for(iIdx=1; iIdx<=pConfig->nPrefix; iIdx++){
      if( pConfig->aPrefix[iIdx-1]==nToken ) break;
    }
    if( iIdx>pConfig->nPrefix ){
      iIdx = -1;
    }
  }

  pRet = (Fts5IndexIter*)sqlite3Fts5MallocZero(&p->rc, sizeof(Fts5IndexIter));
................................................................................
    sqlite3_result_text(pCtx, (const char*)s.p, s.n, SQLITE_TRANSIENT);
  }else{
    sqlite3_result_error_code(pCtx, rc);
  }
  fts5BufferFree(&s);
}



















































/*
** This is called as part of registering the FTS5 module with database
** connection db. It registers several user-defined scalar functions useful
** with FTS5.
**
** If successful, SQLITE_OK is returned. If an error occurs, some other
** SQLite error code is returned instead.
*/
int sqlite3Fts5IndexInit(sqlite3 *db){
  int rc = sqlite3_create_function(
      db, "fts5_decode", 2, SQLITE_UTF8, 0, fts5DecodeFunction, 0, 0
  );





  return rc;
}








|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
|
<
>
|
|







 







|
>
|







 







>

|







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>













>
>
>
>
>



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
....
4044
4045
4046
4047
4048
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
....
4093
4094
4095
4096
4097
4098
4099
4100
4101
4102

4103
4104
4105
4106
4107
4108
4109
4110
4111
4112
....
4138
4139
4140
4141
4142
4143
4144
4145
4146
4147
4148
4149
4150
4151
4152
4153
4154
....
4162
4163
4164
4165
4166
4167
4168
4169
4170
4171
4172
4173
4174
4175
4176
4177
4178
....
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
4645
4646
4647
4648
4649
4650
4651
4652
4653
4654
4655
4656
4657
4658
4659
4660
4661
4662
4663
4664
4665
4666
4667
4668
4669
4670
4671
4672
4673
4674
4675
4676
4677
4678
4679
4680
4681
4682
4683
4684
4685
4686
4687
4688
4689
4690
4691
4692
4693
4694
4695
4696
4697
4698
4699
4700
4701
4702
4703
4704
4705
4706
4707
4708
4709
4710
4711
**   Then, for each level from 0 to nMax:
**
**     + number of input segments in ongoing merge.
**     + total number of segments in level.
**     + for each segment from oldest to newest:
**         + segment id (always > 0)
**         + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.)
**         + first leaf page number (often 1, always greater than 0)
**         + final leaf page number
**
** 2. The Averages Record:
**
**   A single record within the %_data table. The data is a list of varints.
**   The first value is the number of rows in the index. Then, for each column
**   from left to right, the total number of tokens in the column for all 
................................................................................
      sqlite3_free(p->apHash);
    }
    sqlite3_free(p->zDataTbl);
    sqlite3_free(p);
  }
  return rc;
}

/*
** Argument p points to a buffer containing utf-8 text that is n bytes in 
** size. Return the number of bytes in the nChar character prefix of the
** buffer, or 0 if there are less than nChar characters in total.
*/
static int fts5IndexCharlenToBytelen(const char *p, int nByte, int nChar){
  int n = 0;
  int i;
  for(i=0; i<nChar; i++){
    if( n>=nByte ) return 0;      /* Input contains fewer than nChar chars */
    if( (unsigned char)p[n++]>=0xc0 ){
      while( (p[n] & 0xc0)==0x80 ) n++;
    }
  }
  return n;
}

/*
** pIn is a UTF-8 encoded string, nIn bytes in size. Return the number of
** unicode characters in the string.
*/
int fts5IndexCharlen(const char *pIn, int nIn){
  int nChar = 0;            
  int i = 0;
  while( i<nIn ){
    if( (unsigned char)pIn[i++]>=0xc0 ){
      while( i<nIn && (pIn[i] & 0xc0)==0x80 ) i++;
    }
    nChar++;
  }
  return nChar;
}

/*
** Calculate and return a checksum that is the XOR of the index entry
** checksum of all entries that would be generated by the token specified
** by the final 5 arguments.
*/
u64 sqlite3Fts5IndexCksum(
................................................................................
  int iCol,                       /* Column term appears in */
  int iPos,                       /* Position term appears in */
  const char *pTerm, int nTerm    /* Term at iPos */
){
  u64 ret = 0;                    /* Return value */
  int iIdx;                       /* For iterating through indexes */

  ret = fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nTerm);

  for(iIdx=0; iIdx<pConfig->nPrefix; iIdx++){

    int nByte = fts5IndexCharlenToBytelen(pTerm, nTerm, pConfig->aPrefix[iIdx]);
    if( nByte ){
      ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nByte);
    }
  }

  return ret;
}

/*
................................................................................
    }
  }

  /* Add the new token to the main terms hash table. And to each of the
  ** prefix hash tables that it is large enough for. */
  fts5AddTermToHash(p, 0, iCol, iPos, pToken, nToken);
  for(i=0; i<pConfig->nPrefix; i++){
    int nByte = fts5IndexCharlenToBytelen(pToken, nToken, pConfig->aPrefix[i]);
    if( nByte ){
      fts5AddTermToHash(p, i+1, iCol, iPos, pToken, nByte);
    }
  }

  return fts5IndexReturn(p);
}

/*
................................................................................
  Fts5IndexIter **ppIter          /* OUT: New iterator object */
){
  Fts5IndexIter *pRet;
  int iIdx = 0;

  if( flags & FTS5INDEX_QUERY_PREFIX ){
    Fts5Config *pConfig = p->pConfig;
    int nChar = fts5IndexCharlen(pToken, nToken);
    for(iIdx=1; iIdx<=pConfig->nPrefix; iIdx++){
      if( pConfig->aPrefix[iIdx-1]==nChar ) break;
    }
    if( iIdx>pConfig->nPrefix ){
      iIdx = -1;
    }
  }

  pRet = (Fts5IndexIter*)sqlite3Fts5MallocZero(&p->rc, sizeof(Fts5IndexIter));
................................................................................
    sqlite3_result_text(pCtx, (const char*)s.p, s.n, SQLITE_TRANSIENT);
  }else{
    sqlite3_result_error_code(pCtx, rc);
  }
  fts5BufferFree(&s);
}

/*
** The implementation of user-defined scalar function fts5_rowid().
*/
static void fts5RowidFunction(
  sqlite3_context *pCtx,          /* Function call context */
  int nArg,                       /* Number of args (always 2) */
  sqlite3_value **apVal           /* Function arguments */
){
  const char *zArg;
  if( nArg==0 ){
    sqlite3_result_error(pCtx, "should be: fts5_rowid(subject, ....)", -1);
  }else{
    zArg = (const char*)sqlite3_value_text(apVal[0]);
    if( 0==sqlite3_stricmp(zArg, "segment") ){
      i64 iRowid;
      int idx, segid, height, pgno;
      if( nArg!=5 ){
        sqlite3_result_error(pCtx, 
            "should be: fts5_rowid('segment', idx, segid, height, pgno))", -1
        );
      }else{
        idx = sqlite3_value_int(apVal[1]);
        segid = sqlite3_value_int(apVal[2]);
        height = sqlite3_value_int(apVal[3]);
        pgno = sqlite3_value_int(apVal[4]);
        iRowid = FTS5_SEGMENT_ROWID(idx, segid, height, pgno);
        sqlite3_result_int64(pCtx, iRowid);
      }
    }else if( 0==sqlite3_stricmp(zArg, "start-of-index") ){
      i64 iRowid;
      int idx;
      if( nArg!=2 ){
        sqlite3_result_error(pCtx, 
            "should be: fts5_rowid('start-of-index', idx)", -1
        );
      }else{
        idx = sqlite3_value_int(apVal[1]);
        iRowid = FTS5_SEGMENT_ROWID(idx, 1, 0, 0);
        sqlite3_result_int64(pCtx, iRowid);
      }
    }else {
      sqlite3_result_error(pCtx, 
        "first arg to fts5_rowid() must be 'segment' "
        "or 'start-of-index' ..."
        , -1
      );
    }
  }
}

/*
** This is called as part of registering the FTS5 module with database
** connection db. It registers several user-defined scalar functions useful
** with FTS5.
**
** If successful, SQLITE_OK is returned. If an error occurs, some other
** SQLite error code is returned instead.
*/
int sqlite3Fts5IndexInit(sqlite3 *db){
  int rc = sqlite3_create_function(
      db, "fts5_decode", 2, SQLITE_UTF8, 0, fts5DecodeFunction, 0, 0
  );
  if( rc==SQLITE_OK ){
    rc = sqlite3_create_function(
        db, "fts5_rowid", -1, SQLITE_UTF8, 0, fts5RowidFunction, 0, 0
    );
  }
  return rc;
}

Added ext/fts5/test/fts5prefix.test.

























































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# 2015 Jan 13
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#
#

source [file join [file dirname [info script]] fts5_common.tcl]
set testprefix fts5prefix


#-------------------------------------------------------------------------
# Check that prefix indexes really do index n-character prefixes, not 
# n-byte prefixes. Use the ascii tokenizer so as not to be confused by
# diacritic removal.
#
do_execsql_test 1.0 { 
  CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = ascii, prefix = 2) 
}

do_test 1.2 {
  foreach {rowid string} {
    1 "\xCA\xCB\xCC\xCD"
    2 "\u1234\u5678\u4321\u8765"
  } {
    execsql { INSERT INTO t1(rowid, x) VALUES($rowid, $string) }
  }
} {}

do_execsql_test 1.1.2 {
  INSERT INTO t1(t1) VALUES('integrity-check');
}

#db eval { select fts5_decode(id, block) AS d FROM t1_data; } { puts $d }

foreach o {1 2} {
  if {$o==2} breakpoint
  foreach {tn q res} {
    1 "SELECT rowid FROM t1 WHERE t1 MATCH '\xCA\xCB*'" 1
    2 "SELECT rowid FROM t1 WHERE t1 MATCH '\u1234\u5678*'" 2
  } {
    do_execsql_test 1.$o.$tn $q $res
  }

  execsql {
    DELETE FROM t1_data WHERE 
    rowid>=fts5_rowid('start-of-index', 0) AND 
    rowid<fts5_rowid('start-of-index', 1);
  }
}


finish_test