Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add "doclist index" records to the database. These are to make navigating within very large doclists faster. They are not yet used by queries. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | fts5 |
Files: | files | file ages | folders |
SHA1: |
89377421ff69f2450364987afe781b6d |
User & Date: | dan 2014-08-01 11:16:25.207 |
Context
2014-08-01
| ||
19:27 | Have the fts5 integrity-check verify that doclist indexes match the contents of the leaf pages that they index. (check-in: 37a7d3035e user: dan tags: fts5) | |
11:16 | Add "doclist index" records to the database. These are to make navigating within very large doclists faster. They are not yet used by queries. (check-in: 89377421ff user: dan tags: fts5) | |
2014-07-31
| ||
17:53 | Add a comment explaining why fts5 cannot cache "sorter statements". (check-in: e6af3b7a3c user: dan tags: fts5) | |
Changes
Changes to ext/fts5/fts5_index.c.
︙ | ︙ | |||
43 44 45 46 47 48 49 50 51 52 53 54 55 56 | */ #define FTS5_DEFAULT_PAGE_SIZE 1000 #define FTS5_WORK_UNIT 64 /* Number of leaf pages in unit of work */ #define FTS5_MIN_MERGE 4 /* Minimum number of segments to merge */ /* ** Details: ** ** The %_data table managed by this module, ** ** CREATE TABLE %_data(id INTEGER PRIMARY KEY, block BLOB); ** | > > | 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | */ #define FTS5_DEFAULT_PAGE_SIZE 1000 #define FTS5_WORK_UNIT 64 /* Number of leaf pages in unit of work */ #define FTS5_MIN_MERGE 4 /* Minimum number of segments to merge */ #define FTS5_MIN_DLIDX_SIZE 4 /* Add dlidx if this many empty pages */ /* ** Details: ** ** The %_data table managed by this module, ** ** CREATE TABLE %_data(id INTEGER PRIMARY KEY, block BLOB); ** |
︙ | ︙ | |||
180 181 182 183 184 185 186 | ** * either an 0x00 or 0x01 byte. If the value 0x01 is used, then there ** is an associated index-by-rowid record. ** * the number of zero-term leaves as a varint. ** ** 5. Segment doclist indexes: ** ** A list of varints - the first docid on each page (starting with the | | | > > | 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | ** * either an 0x00 or 0x01 byte. If the value 0x01 is used, then there ** is an associated index-by-rowid record. ** * the number of zero-term leaves as a varint. ** ** 5. Segment doclist indexes: ** ** A list of varints - the first docid on each page (starting with the ** first termless page) of the doclist. First element in the list is a ** literal docid. Each docid thereafter is a (negative) delta. If there ** are no docids at all on a page, a 0x00 byte takes the place of the ** delta value. */ /* ** Rowids for the averages and structure records in the %_data table. */ #define FTS5_AVERAGES_ROWID 1 /* Rowid used for the averages record */ #define FTS5_STRUCTURE_ROWID(iIdx) (10 + (iIdx)) /* For structure records */ |
︙ | ︙ | |||
231 232 233 234 235 236 237 | #endif /* ** The height of segment b-trees is actually limited to one less than ** (1<<HEIGHT_BITS). This is because the rowid address space for nodes ** with such a height is used by doclist indexes. */ | | | 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | #endif /* ** The height of segment b-trees is actually limited to one less than ** (1<<HEIGHT_BITS). This is because the rowid address space for nodes ** with such a height is used by doclist indexes. */ #define FTS5_SEGMENT_MAX_HEIGHT ((1 << FTS5_DATA_HEIGHT_B)-1) /* ** The rowid for the doclist index associated with leaf page pgno of segment ** segid in index idx. */ #define FTS5_DOCLIST_IDX_ROWID(idx, segid, pgno) \ FTS5_SEGMENT_ROWID(idx, segid, FTS5_SEGMENT_MAX_HEIGHT, pgno) |
︙ | ︙ | |||
373 374 375 376 377 378 379 | ** An object of type Fts5SegWriter is used to write to segments. */ struct Fts5PageWriter { int pgno; /* Page number for this page */ Fts5Buffer buf; /* Buffer containing page data */ Fts5Buffer term; /* Buffer containing previous term on page */ }; | < > > > | 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 | ** An object of type Fts5SegWriter is used to write to segments. */ struct Fts5PageWriter { int pgno; /* Page number for this page */ Fts5Buffer buf; /* Buffer containing page data */ Fts5Buffer term; /* Buffer containing previous term on page */ }; struct Fts5SegWriter { int iIdx; /* Index to write to */ int iSegid; /* Segid to write to */ int nWriter; /* Number of entries in aWriter */ Fts5PageWriter *aWriter; /* Array of PageWriter objects */ i64 iPrevRowid; /* Previous docid written to current leaf */ u8 bFirstRowidInDoclist; /* True if next rowid is first in doclist */ u8 bFirstRowidInPage; /* True if next rowid is first in page */ int nLeafWritten; /* Number of leaf pages written */ int nEmpty; /* Number of contiguous term-less nodes */ Fts5Buffer dlidx; /* Doclist index */ i64 iDlidxPrev; /* Previous rowid appended to dlidx */ int bDlidxPrevValid; /* True if iDlidxPrev is valid */ }; /* ** Object for iterating through the merged results of one or more segments, ** visiting each term/docid pair in the merged data. ** ** nSeg is always a power of two greater than or equal to the number of |
︙ | ︙ | |||
530 531 532 533 534 535 536 | ** An Fts5BtreeIter object is used to iterate through all entries in the ** b-tree hierarchy belonging to a single fts5 segment. In this case the ** "b-tree hierarchy" is all b-tree nodes except leaves. Each entry in the ** b-tree hierarchy consists of the following: ** ** iLeaf: The page number of the leaf page the entry points to. ** | | | 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 | ** An Fts5BtreeIter object is used to iterate through all entries in the ** b-tree hierarchy belonging to a single fts5 segment. In this case the ** "b-tree hierarchy" is all b-tree nodes except leaves. Each entry in the ** b-tree hierarchy consists of the following: ** ** iLeaf: The page number of the leaf page the entry points to. ** ** term: A split-key that all terms on leaf page $iLeaf must be greater ** than or equal to. The "term" associated with the first b-tree ** hierarchy entry (the one that points to leaf page 1) is always ** an empty string. ** ** nEmpty: The number of empty (termless) leaf pages that immediately ** following iLeaf. ** |
︙ | ︙ | |||
1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 | */ static void fts5SegIterInit( Fts5Index *p, int iIdx, /* Config.aHash[] index of FTS index */ Fts5StructureSegment *pSeg, /* Description of segment */ Fts5SegIter *pIter /* Object to populate */ ){ if( p->rc==SQLITE_OK ){ memset(pIter, 0, sizeof(*pIter)); pIter->pSeg = pSeg; pIter->iIdx = iIdx; pIter->iLeafPgno = pSeg->pgnoFirst-1; fts5SegIterNextPage(p, pIter); | > > > > > > > > > | 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 | */ static void fts5SegIterInit( Fts5Index *p, int iIdx, /* Config.aHash[] index of FTS index */ Fts5StructureSegment *pSeg, /* Description of segment */ Fts5SegIter *pIter /* Object to populate */ ){ if( pSeg->pgnoFirst==0 ){ /* This happens if the segment is being used as an input to an incremental ** merge and all data has already been "trimmed". See function ** fts5TrimSegments() for details. In this case leave the iterator empty. ** The caller will see the (pIter->pLeaf==0) and assume the iterator is ** at EOF already. */ assert( pIter->pLeaf==0 ); return; } if( p->rc==SQLITE_OK ){ memset(pIter, 0, sizeof(*pIter)); pIter->pSeg = pSeg; pIter->iIdx = iIdx; pIter->iLeafPgno = pSeg->pgnoFirst-1; fts5SegIterNextPage(p, pIter); |
︙ | ︙ | |||
2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 | int i; for(i=0; i<nNew && i<nOld; i++){ if( pOld[i]!=pNew[i] ) break; } return i; } /* ** This is called once for each leaf page except the first that contains ** at least one term. Argument (nTerm/pTerm) is the split-key - a term that ** is larger than all terms written to earlier leaves, and equal to or ** smaller than the first term on the new leaf. ** | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 | int i; for(i=0; i<nNew && i<nOld; i++){ if( pOld[i]!=pNew[i] ) break; } return i; } /* ** If an "nEmpty" record must be written to the b-tree before the next ** term, write it now. */ static void fts5WriteBtreeNEmpty(Fts5Index *p, Fts5SegWriter *pWriter){ if( pWriter->nEmpty ){ Fts5PageWriter *pPg = &pWriter->aWriter[1]; int bFlag = 0; if( pWriter->nEmpty>=FTS5_MIN_DLIDX_SIZE ){ i64 iKey = FTS5_DOCLIST_IDX_ROWID( pWriter->iIdx, pWriter->iSegid, pWriter->aWriter[0].pgno - 1 - pWriter->nEmpty ); fts5DataWrite(p, iKey, pWriter->dlidx.p, pWriter->dlidx.n); bFlag = 1; } fts5BufferAppendVarint(&p->rc, &pPg->buf, bFlag); fts5BufferAppendVarint(&p->rc, &pPg->buf, pWriter->nEmpty); pWriter->nEmpty = 0; } /* Whether or not it was written to disk, zero the doclist index at this ** point */ sqlite3Fts5BufferZero(&pWriter->dlidx); pWriter->bDlidxPrevValid = 0; } /* ** This is called once for each leaf page except the first that contains ** at least one term. Argument (nTerm/pTerm) is the split-key - a term that ** is larger than all terms written to earlier leaves, and equal to or ** smaller than the first term on the new leaf. ** |
︙ | ︙ | |||
2093 2094 2095 2096 2097 2098 2099 | fts5BufferAppendVarint(&p->rc, &pNew->buf, 1); pWriter->nWriter++; pWriter->aWriter = aNew; } pPage = &pWriter->aWriter[iHeight]; | | < < < < < | 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 | fts5BufferAppendVarint(&p->rc, &pNew->buf, 1); pWriter->nWriter++; pWriter->aWriter = aNew; } pPage = &pWriter->aWriter[iHeight]; fts5WriteBtreeNEmpty(p, pWriter); if( pPage->buf.n>=p->pgsz ){ /* pPage will be written to disk. The term will be written into the ** parent of pPage. */ i64 iRowid = FTS5_SEGMENT_ROWID( pWriter->iIdx, pWriter->iSegid, iHeight, pPage->pgno ); |
︙ | ︙ | |||
2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 | } } static void fts5WriteBtreeNoTerm( Fts5Index *p, /* FTS5 backend object */ Fts5SegWriter *pWriter /* Writer object */ ){ pWriter->nEmpty++; } static void fts5WriteFlushLeaf(Fts5Index *p, Fts5SegWriter *pWriter){ static const u8 zero[] = { 0x00, 0x00, 0x00, 0x00 }; Fts5PageWriter *pPage = &pWriter->aWriter[0]; i64 iRowid; if( pPage->term.n==0 ){ | > > > > > > > > > > > > > > > > > > > > > > > > > | 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 | } } static void fts5WriteBtreeNoTerm( Fts5Index *p, /* FTS5 backend object */ Fts5SegWriter *pWriter /* Writer object */ ){ if( pWriter->bFirstRowidInPage ){ /* No rowids on this page. Append an 0x00 byte to the current ** doclist-index */ sqlite3Fts5BufferAppendVarint(&p->rc, &pWriter->dlidx, 0); } pWriter->nEmpty++; } /* ** Rowid iRowid has just been appended to the current leaf page. As it is ** the first on its page, append an entry to the current doclist-index. */ static void fts5WriteDlidxAppend( Fts5Index *p, Fts5SegWriter *pWriter, i64 iRowid ){ i64 iVal; if( pWriter->bDlidxPrevValid ){ iVal = pWriter->iDlidxPrev - iRowid; }else{ iVal = iRowid; } sqlite3Fts5BufferAppendVarint(&p->rc, &pWriter->dlidx, iVal); pWriter->bDlidxPrevValid = 1; pWriter->iDlidxPrev = iRowid; } static void fts5WriteFlushLeaf(Fts5Index *p, Fts5SegWriter *pWriter){ static const u8 zero[] = { 0x00, 0x00, 0x00, 0x00 }; Fts5PageWriter *pPage = &pWriter->aWriter[0]; i64 iRowid; if( pPage->term.n==0 ){ |
︙ | ︙ | |||
2222 2223 2224 2225 2226 2227 2228 | Fts5Index *p, Fts5SegWriter *pWriter, i64 iRowid ){ Fts5PageWriter *pPage = &pWriter->aWriter[0]; /* If this is to be the first docid written to the page, set the | | > | > > > | 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 | Fts5Index *p, Fts5SegWriter *pWriter, i64 iRowid ){ Fts5PageWriter *pPage = &pWriter->aWriter[0]; /* If this is to be the first docid written to the page, set the ** docid-pointer in the page-header. Also append a value to the dlidx ** buffer, in case a doclist-index is required. */ if( pWriter->bFirstRowidInPage ){ fts5PutU16(pPage->buf.p, pPage->buf.n); fts5WriteDlidxAppend(p, pWriter, iRowid); } /* Write the docid. */ if( pWriter->bFirstRowidInDoclist || pWriter->bFirstRowidInPage ){ fts5BufferAppendVarint(&p->rc, &pPage->buf, iRowid); }else{ assert( iRowid<pWriter->iPrevRowid ); fts5BufferAppendVarint(&p->rc, &pPage->buf, pWriter->iPrevRowid - iRowid); |
︙ | ︙ | |||
2297 2298 2299 2300 2301 2302 2303 2304 2305 | } } /* Write the doclist terminator */ fts5WriteAppendZerobyte(p, pWriter); } static void fts5WriteFinish( Fts5Index *p, | > > > > | | | | < < | > | 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 | } } /* Write the doclist terminator */ fts5WriteAppendZerobyte(p, pWriter); } /* ** Flush any data cached by the writer object to the database. Free any ** allocations associated with the writer. */ static void fts5WriteFinish( Fts5Index *p, Fts5SegWriter *pWriter, /* Writer object */ int *pnHeight, /* OUT: Height of the b-tree */ int *pnLeaf /* OUT: Number of leaf pages in b-tree */ ){ int i; *pnLeaf = pWriter->aWriter[0].pgno; *pnHeight = pWriter->nWriter; fts5WriteFlushLeaf(p, pWriter); if( pWriter->nWriter>1 ){ fts5WriteBtreeNEmpty(p, pWriter); } for(i=1; i<pWriter->nWriter; i++){ Fts5PageWriter *pPg = &pWriter->aWriter[i]; i64 iRow = FTS5_SEGMENT_ROWID(pWriter->iIdx, pWriter->iSegid, i, pPg->pgno); fts5DataWrite(p, iRow, pPg->buf.p, pPg->buf.n); } for(i=0; i<pWriter->nWriter; i++){ Fts5PageWriter *pPg = &pWriter->aWriter[i]; fts5BufferFree(&pPg->term); fts5BufferFree(&pPg->buf); } sqlite3_free(pWriter->aWriter); sqlite3Fts5BufferFree(&pWriter->dlidx); } static void fts5WriteInit( Fts5Index *p, Fts5SegWriter *pWriter, int iIdx, int iSegid ){ |
︙ | ︙ | |||
3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 | assert( nArg==2 ); memset(&s, 0, sizeof(Fts5Buffer)); iRowid = sqlite3_value_int64(apVal[0]); n = sqlite3_value_bytes(apVal[1]); a = sqlite3_value_blob(apVal[1]); fts5DecodeRowid(iRowid, &iIdx, &iSegid, &iHeight, &iPgno); if( iSegid==0 ){ if( iRowid==FTS5_AVERAGES_ROWID ){ sqlite3Fts5BufferAppendPrintf(&rc, &s, "{averages} "); }else{ sqlite3Fts5BufferAppendPrintf(&rc, &s, "{structure idx=%d}", (int)(iRowid-10) ); | > > > > > > > > > > > > > > > > > > > > > > | 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 | assert( nArg==2 ); memset(&s, 0, sizeof(Fts5Buffer)); iRowid = sqlite3_value_int64(apVal[0]); n = sqlite3_value_bytes(apVal[1]); a = sqlite3_value_blob(apVal[1]); fts5DecodeRowid(iRowid, &iIdx, &iSegid, &iHeight, &iPgno); if( iHeight==FTS5_SEGMENT_MAX_HEIGHT ){ int i = 0; i64 iPrev; sqlite3Fts5BufferAppendPrintf(&rc, &s, "(dlidx idx=%d segid=%d pgno=%d)", iIdx, iSegid, iHeight, iPgno ); if( n>0 ){ i = getVarint(&a[i], (u64*)&iPrev); sqlite3Fts5BufferAppendPrintf(&rc, &s, " %lld", iPrev); } while( i<n ){ i64 iVal; i += getVarint(&a[i], (u64*)&iVal); if( iVal==0 ){ sqlite3Fts5BufferAppendPrintf(&rc, &s, " x"); }else{ iPrev = iPrev - iVal; sqlite3Fts5BufferAppendPrintf(&rc, &s, " %lld", iPrev); } } }else if( iSegid==0 ){ if( iRowid==FTS5_AVERAGES_ROWID ){ sqlite3Fts5BufferAppendPrintf(&rc, &s, "{averages} "); }else{ sqlite3Fts5BufferAppendPrintf(&rc, &s, "{structure idx=%d}", (int)(iRowid-10) ); |
︙ | ︙ |
Added test/fts5ah.test.
> > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | # 2014 June 17 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # This file implements regression tests for SQLite library. The # focus of this script is testing the FTS5 module. # set testdir [file dirname $argv0] source $testdir/tester.tcl set testprefix fts5ah # If SQLITE_ENABLE_FTS3 is defined, omit this file. ifcapable !fts3 { finish_test return } #------------------------------------------------------------------------- # This file contains tests for very large doclists. # do_test 1.0 { execsql { CREATE VIRTUAL TABLE t1 USING fts5(a) } execsql { INSERT INTO t1(t1) VALUES('pgsz=128') } for {set i 1} {$i <= 10000} {incr i} { set v {x x x x x x x x x x x x x x x x x x x x} if {($i % 2139)==0} {lset v 3 Y ; lappend Y $i} if {($i % 1577)==0} {lset v 5 W ; lappend W $i} execsql { INSERT INTO t1 VALUES($v) } } } {} do_execsql_test 1.1 { SELECT rowid FROM t1 WHERE t1 MATCH 'x AND w' } [lsort -integer -decr $W] do_execsql_test 1.2 { SELECT rowid FROM t1 WHERE t1 MATCH 'y AND x' } [lsort -integer -decr $Y] do_execsql_test 1.3 { INSERT INTO t1(t1) VALUES('integrity-check'); } do_execsql_test 1.4 { SELECT count(*) FROM t1_data } finish_test |