Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Fix prefix indexes so that they work in characters, not bytes. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | fts5 |
Files: | files | file ages | folders |
SHA1: |
af8d43a4a08528bbae25ee38fe25de8a |
User & Date: | dan 2015-01-13 17:25:08.235 |
Context
2015-01-17
| ||
17:48 | Improve the performance of the fts5 porter tokenizer implementation. (check-in: 96ea600440 user: dan tags: fts5) | |
2015-01-13
| ||
17:25 | Fix prefix indexes so that they work in characters, not bytes. (check-in: af8d43a4a0 user: dan tags: fts5) | |
2015-01-12
| ||
17:58 | Optimize the unicode61 tokenizer so that it handles ascii text faster. Make it the default tokenizer. Change the name of the simple tokenizer to "ascii". (check-in: f22dbccad9 user: dan tags: fts5) | |
Changes
Changes to ext/fts5/fts5_index.c.
︙ | ︙ | |||
77 78 79 80 81 82 83 | ** Then, for each level from 0 to nMax: ** ** + number of input segments in ongoing merge. ** + total number of segments in level. ** + for each segment from oldest to newest: ** + segment id (always > 0) ** + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.) | | | 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | ** Then, for each level from 0 to nMax: ** ** + number of input segments in ongoing merge. ** + total number of segments in level. ** + for each segment from oldest to newest: ** + segment id (always > 0) ** + b-tree height (1 -> root is leaf, 2 -> root is parent of leaf etc.) ** + first leaf page number (often 1, always greater than 0) ** + final leaf page number ** ** 2. The Averages Record: ** ** A single record within the %_data table. The data is a list of varints. ** The first value is the number of rows in the index. Then, for each column ** from left to right, the total number of tokens in the column for all |
︙ | ︙ | |||
4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 | sqlite3_free(p->apHash); } sqlite3_free(p->zDataTbl); sqlite3_free(p); } return rc; } /* ** Calculate and return a checksum that is the XOR of the index entry ** checksum of all entries that would be generated by the token specified ** by the final 5 arguments. */ u64 sqlite3Fts5IndexCksum( Fts5Config *pConfig, /* Configuration object */ i64 iRowid, /* Document term appears in */ int iCol, /* Column term appears in */ int iPos, /* Position term appears in */ const char *pTerm, int nTerm /* Term at iPos */ ){ u64 ret = 0; /* Return value */ int iIdx; /* For iterating through indexes */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | | | 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 | sqlite3_free(p->apHash); } sqlite3_free(p->zDataTbl); sqlite3_free(p); } return rc; } /* ** Argument p points to a buffer containing utf-8 text that is n bytes in ** size. Return the number of bytes in the nChar character prefix of the ** buffer, or 0 if there are less than nChar characters in total. */ static int fts5IndexCharlenToBytelen(const char *p, int nByte, int nChar){ int n = 0; int i; for(i=0; i<nChar; i++){ if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ if( (unsigned char)p[n++]>=0xc0 ){ while( (p[n] & 0xc0)==0x80 ) n++; } } return n; } /* ** pIn is a UTF-8 encoded string, nIn bytes in size. Return the number of ** unicode characters in the string. */ int fts5IndexCharlen(const char *pIn, int nIn){ int nChar = 0; int i = 0; while( i<nIn ){ if( (unsigned char)pIn[i++]>=0xc0 ){ while( i<nIn && (pIn[i] & 0xc0)==0x80 ) i++; } nChar++; } return nChar; } /* ** Calculate and return a checksum that is the XOR of the index entry ** checksum of all entries that would be generated by the token specified ** by the final 5 arguments. */ u64 sqlite3Fts5IndexCksum( Fts5Config *pConfig, /* Configuration object */ i64 iRowid, /* Document term appears in */ int iCol, /* Column term appears in */ int iPos, /* Position term appears in */ const char *pTerm, int nTerm /* Term at iPos */ ){ u64 ret = 0; /* Return value */ int iIdx; /* For iterating through indexes */ ret = fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nTerm); for(iIdx=0; iIdx<pConfig->nPrefix; iIdx++){ int nByte = fts5IndexCharlenToBytelen(pTerm, nTerm, pConfig->aPrefix[iIdx]); if( nByte ){ ret ^= fts5IndexEntryCksum(iRowid, iCol, iPos, pTerm, nByte); } } return ret; } /* |
︙ | ︙ | |||
4103 4104 4105 4106 4107 4108 4109 | } } /* Add the new token to the main terms hash table. And to each of the ** prefix hash tables that it is large enough for. */ fts5AddTermToHash(p, 0, iCol, iPos, pToken, nToken); for(i=0; i<pConfig->nPrefix; i++){ | | > | | 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 | } } /* Add the new token to the main terms hash table. And to each of the ** prefix hash tables that it is large enough for. */ fts5AddTermToHash(p, 0, iCol, iPos, pToken, nToken); for(i=0; i<pConfig->nPrefix; i++){ int nByte = fts5IndexCharlenToBytelen(pToken, nToken, pConfig->aPrefix[i]); if( nByte ){ fts5AddTermToHash(p, i+1, iCol, iPos, pToken, nByte); } } return fts5IndexReturn(p); } /* |
︙ | ︙ | |||
4126 4127 4128 4129 4130 4131 4132 4133 | Fts5IndexIter **ppIter /* OUT: New iterator object */ ){ Fts5IndexIter *pRet; int iIdx = 0; if( flags & FTS5INDEX_QUERY_PREFIX ){ Fts5Config *pConfig = p->pConfig; for(iIdx=1; iIdx<=pConfig->nPrefix; iIdx++){ | > | | 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 | Fts5IndexIter **ppIter /* OUT: New iterator object */ ){ Fts5IndexIter *pRet; int iIdx = 0; if( flags & FTS5INDEX_QUERY_PREFIX ){ Fts5Config *pConfig = p->pConfig; int nChar = fts5IndexCharlen(pToken, nToken); for(iIdx=1; iIdx<=pConfig->nPrefix; iIdx++){ if( pConfig->aPrefix[iIdx-1]==nChar ) break; } if( iIdx>pConfig->nPrefix ){ iIdx = -1; } } pRet = (Fts5IndexIter*)sqlite3Fts5MallocZero(&p->rc, sizeof(Fts5IndexIter)); |
︙ | ︙ | |||
4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 | sqlite3_result_text(pCtx, (const char*)s.p, s.n, SQLITE_TRANSIENT); }else{ sqlite3_result_error_code(pCtx, rc); } fts5BufferFree(&s); } /* ** This is called as part of registering the FTS5 module with database ** connection db. It registers several user-defined scalar functions useful ** with FTS5. ** ** If successful, SQLITE_OK is returned. If an error occurs, some other ** SQLite error code is returned instead. */ int sqlite3Fts5IndexInit(sqlite3 *db){ int rc = sqlite3_create_function( db, "fts5_decode", 2, SQLITE_UTF8, 0, fts5DecodeFunction, 0, 0 ); return rc; } | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 | sqlite3_result_text(pCtx, (const char*)s.p, s.n, SQLITE_TRANSIENT); }else{ sqlite3_result_error_code(pCtx, rc); } fts5BufferFree(&s); } /* ** The implementation of user-defined scalar function fts5_rowid(). */ static void fts5RowidFunction( sqlite3_context *pCtx, /* Function call context */ int nArg, /* Number of args (always 2) */ sqlite3_value **apVal /* Function arguments */ ){ const char *zArg; if( nArg==0 ){ sqlite3_result_error(pCtx, "should be: fts5_rowid(subject, ....)", -1); }else{ zArg = (const char*)sqlite3_value_text(apVal[0]); if( 0==sqlite3_stricmp(zArg, "segment") ){ i64 iRowid; int idx, segid, height, pgno; if( nArg!=5 ){ sqlite3_result_error(pCtx, "should be: fts5_rowid('segment', idx, segid, height, pgno))", -1 ); }else{ idx = sqlite3_value_int(apVal[1]); segid = sqlite3_value_int(apVal[2]); height = sqlite3_value_int(apVal[3]); pgno = sqlite3_value_int(apVal[4]); iRowid = FTS5_SEGMENT_ROWID(idx, segid, height, pgno); sqlite3_result_int64(pCtx, iRowid); } }else if( 0==sqlite3_stricmp(zArg, "start-of-index") ){ i64 iRowid; int idx; if( nArg!=2 ){ sqlite3_result_error(pCtx, "should be: fts5_rowid('start-of-index', idx)", -1 ); }else{ idx = sqlite3_value_int(apVal[1]); iRowid = FTS5_SEGMENT_ROWID(idx, 1, 0, 0); sqlite3_result_int64(pCtx, iRowid); } }else { sqlite3_result_error(pCtx, "first arg to fts5_rowid() must be 'segment' " "or 'start-of-index' ..." , -1 ); } } } /* ** This is called as part of registering the FTS5 module with database ** connection db. It registers several user-defined scalar functions useful ** with FTS5. ** ** If successful, SQLITE_OK is returned. If an error occurs, some other ** SQLite error code is returned instead. */ int sqlite3Fts5IndexInit(sqlite3 *db){ int rc = sqlite3_create_function( db, "fts5_decode", 2, SQLITE_UTF8, 0, fts5DecodeFunction, 0, 0 ); if( rc==SQLITE_OK ){ rc = sqlite3_create_function( db, "fts5_rowid", -1, SQLITE_UTF8, 0, fts5RowidFunction, 0, 0 ); } return rc; } |
Added ext/fts5/test/fts5prefix.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 | # 2015 Jan 13 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #*********************************************************************** # # source [file join [file dirname [info script]] fts5_common.tcl] set testprefix fts5prefix #------------------------------------------------------------------------- # Check that prefix indexes really do index n-character prefixes, not # n-byte prefixes. Use the ascii tokenizer so as not to be confused by # diacritic removal. # do_execsql_test 1.0 { CREATE VIRTUAL TABLE t1 USING fts5(x, tokenize = ascii, prefix = 2) } do_test 1.2 { foreach {rowid string} { 1 "\xCA\xCB\xCC\xCD" 2 "\u1234\u5678\u4321\u8765" } { execsql { INSERT INTO t1(rowid, x) VALUES($rowid, $string) } } } {} do_execsql_test 1.1.2 { INSERT INTO t1(t1) VALUES('integrity-check'); } #db eval { select fts5_decode(id, block) AS d FROM t1_data; } { puts $d } foreach o {1 2} { if {$o==2} breakpoint foreach {tn q res} { 1 "SELECT rowid FROM t1 WHERE t1 MATCH '\xCA\xCB*'" 1 2 "SELECT rowid FROM t1 WHERE t1 MATCH '\u1234\u5678*'" 2 } { do_execsql_test 1.$o.$tn $q $res } execsql { DELETE FROM t1_data WHERE rowid>=fts5_rowid('start-of-index', 0) AND rowid<fts5_rowid('start-of-index', 1); } } finish_test |