Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Implementation of the snippet() function for FTS1. Includes a few simple test cases but more testing is needed. (CVS 3431) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
c7ee60d00976efab25a830e741653801 |
User & Date: | drh 2006-09-21 02:03:09.000 |
Context
2006-09-21
| ||
11:02 | Be more aggressive with the SQLITE_OMIT_VACUUM macro. Saves about 150 bytes of code space. (CVS 3432) (check-in: 7e618db457 user: drh tags: trunk) | |
02:03 | Implementation of the snippet() function for FTS1. Includes a few simple test cases but more testing is needed. (CVS 3431) (check-in: c7ee60d009 user: drh tags: trunk) | |
2006-09-18
| ||
21:14 | Fixed a build problem in sqlite3_extension_init(). (CVS 3430) (check-in: bb2e1871cb user: adamd tags: trunk) | |
Changes
Changes to ext/fts1/fts1.c.
︙ | ︙ | |||
/*
** Initialize a StringBuffer to hold an empty string.
**
** If the initial allocation fails the buffer is left in a well-defined
** empty state (s==0, len==0, alloced==0) instead of dereferencing a
** NULL pointer.  nappend() knows how to recover from that state.
*/
void initStringBuffer(StringBuffer *sb){
  sb->len = 0;
  sb->s = malloc(100);
  if( sb->s ){
    sb->alloced = 100;
    sb->s[0] = '\0';
  }else{
    sb->alloced = 0;
  }
}

/*
** Append the first nFrom bytes of zFrom to the string held in sb,
** growing the buffer as required.  The result is always
** zero-terminated.
**
** If memory cannot be obtained, the accumulated text is discarded and
** the buffer is reset to empty (the same outcome as before, but without
** leaking the old allocation or dereferencing NULL).  A negative nFrom
** is ignored.
*/
void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
  if( nFrom<0 ) return;
  if( sb->s==0 || sb->len + nFrom >= sb->alloced ){
    int nNew = sb->len + nFrom + 100;
    /* Keep the old pointer until realloc() is known to have succeeded;
    ** assigning the result straight to sb->s would leak it on failure. */
    char *zNew = realloc(sb->s, nNew+1);
    if( zNew==0 ){
      free(sb->s);
      initStringBuffer(sb);
      return;
    }
    sb->s = zNew;
    sb->alloced = nNew;
  }
  if( nFrom>0 ){
    memcpy(sb->s + sb->len, zFrom, nFrom);
    sb->len += nFrom;
  }
  sb->s[sb->len] = 0;
}

/*
** Append the zero-terminated string zFrom to sb.  A NULL zFrom is
** treated as an empty string.
*/
void append(StringBuffer *sb, const char *zFrom){
  if( zFrom==0 ) return;
  nappend(sb, zFrom, (int)strlen(zFrom));
}

/* We encode variable-length integers in little-endian order using seven bits
 * per byte as follows:
**
** KEY:
**         A = 0xxxxxxx    7 bits of data and one flag bit
**
** (the remainder of this comment lies outside the excerpt shown here)
*/
︙ | ︙ | |||
905 906 907 908 909 910 911 | ** An instance of the following structure keeps track of generated ** matching-word offset information and snippets. */ typedef struct Snippet { int nMatch; /* Total number of matches */ int nAlloc; /* Space allocated for aMatch[] */ struct snippetMatch { /* One entry for each matching term */ | | < < > > | 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 | ** An instance of the following structure keeps track of generated ** matching-word offset information and snippets. */ typedef struct Snippet { int nMatch; /* Total number of matches */ int nAlloc; /* Space allocated for aMatch[] */ struct snippetMatch { /* One entry for each matching term */ char snStatus; /* Status flag for use while constructing snippets */ short int iCol; /* The column that contains the match */ short int iTerm; /* The index in Query.pTerms[] of the matching term */ short int nByte; /* Number of bytes in the term */ int iStart; /* The offset to the first character of the term */ } *aMatch; /* Points to space obtained from malloc */ char *zOffset; /* Text rendering of aMatch[] */ int nOffset; /* strlen(zOffset) */ char *zSnippet; /* Snippet text */ int nSnippet; /* strlen(zSnippet) */ } Snippet; typedef enum QueryType { QUERY_GENERIC, /* table scan */ QUERY_ROWID, /* lookup by rowid */ QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/ |
︙ | ︙ | |||
1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 | /* Free all of the dynamically allocated memory held by the ** Snippet */ static void snippetClear(Snippet *p){ free(p->aMatch); free(p->zOffset); memset(p, 0, sizeof(*p)); } /* ** Append a single entry to the p->aMatch[] log. */ static void snippetAppendMatch( Snippet *p, /* Append the entry to this snippet */ | > | 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 | /* Free all of the dynamically allocated memory held by the ** Snippet */ static void snippetClear(Snippet *p){ free(p->aMatch); free(p->zOffset); free(p->zSnippet); memset(p, 0, sizeof(*p)); } /* ** Append a single entry to the p->aMatch[] log. */ static void snippetAppendMatch( Snippet *p, /* Append the entry to this snippet */ |
︙ | ︙ | |||
2021 2022 2023 2024 2025 2026 2027 | p->nMatch = 0; p->nAlloc = 0; return; } } i = p->nMatch++; pMatch = &p->aMatch[i]; | < | 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 | p->nMatch = 0; p->nAlloc = 0; return; } } i = p->nMatch++; pMatch = &p->aMatch[i]; pMatch->iCol = iCol; pMatch->iTerm = iTerm; pMatch->iStart = iStart; pMatch->nByte = nByte; } /* |
︙ | ︙ | |||
2162 2163 2164 2165 2166 2167 2168 | cnt++; } p->zOffset = sb.s; p->nOffset = sb.len; } /* | | | > < > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > | > > > > > | > > | > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > | | | > | > > > > > > > > > > > > > > | < > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 | cnt++; } p->zOffset = sb.s; p->nOffset = sb.len; } /* ** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set ** of matching words some of which might be in zDoc. zDoc is column ** number iCol. ** ** iBreak is suggested spot in zDoc where we could begin or end an ** excerpt. Return a value similar to iBreak but possibly adjusted ** to be a little left or right so that the break point is better. 
*/ static int wordBoundary( int iBreak, /* The suggested break point */ const char *zDoc, /* Document text */ int nDoc, /* Number of bytes in zDoc[] */ struct snippetMatch *aMatch, /* Matching words */ int nMatch, /* Number of entries in aMatch[] */ int iCol /* The column number for zDoc[] */ ){ int i; if( iBreak<=10 ){ return 0; } if( iBreak>=nDoc-10 ){ return nDoc; } for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){} while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; } if( i<nMatch ){ if( aMatch[i].iStart<iBreak+10 ){ return aMatch[i].iStart; } if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){ return aMatch[i-1].iStart; } } for(i=1; i<=10; i++){ if( isspace(zDoc[iBreak-i]) ){ return iBreak - i + 1; } if( isspace(zDoc[iBreak+i]) ){ return iBreak + i + 1; } } return iBreak; } /* ** Allowed values for Snippet.aMatch[].snStatus */ #define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */ #define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */ /* ** Generate the text of a snippet. 
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol = -1;           /* Column of the previous excerpt, or -1 */
  int tailOffset = -1;        /* End offset of the previous excerpt */
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int wantEllipsis;
  int tailEllipsis = 0;       /* True if an ellipsis is owed after the tail */
  int iMatch;

  /* The markup strings may be NULL (sqlite3_value_text() yields NULL for
  ** an SQL NULL argument).  Treat NULL as empty so that append() never
  ** hands a NULL pointer to strlen(). */
  if( zStartMark==0 ) zStartMark = "";
  if( zEndMark==0 ) zEndMark = "";
  if( zEllipsis==0 ) zEllipsis = "";

  /* Discard any snippet left over from a previous call. */
  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  /* Mark the first match of every query term as DESIRED and everything
  ** else as IGNORE.  nDesired counts the DESIRED entries still to show. */
  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  /* Emit one excerpt around each DESIRED match that is still outstanding. */
  iMatch = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);

    /* The excerpt starts about 40 bytes before the match, adjusted to a
    ** better break point. */
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
      wantEllipsis = 0;
    }else{
      wantEllipsis = 1;
    }

    /* If this excerpt begins within 20 bytes of the end of the previous
    ** one in the same column, run them together with no ellipsis. */
    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
      wantEllipsis = 0;
      tailEllipsis = 0;
    }
    if( wantEllipsis || tailEllipsis ){
      append(&sb, zEllipsis);
    }

    /* The excerpt ends about 40 bytes after the match, likewise adjusted. */
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }

    /* Copy zDoc[iStart..iEnd-1], wrapping every match found inside that
    ** range in the start/end markup. */
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart ){ iMatch++; }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        /* This term has now been shown, so a later DESIRED entry for the
        ** same term no longer needs an excerpt of its own. */
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  if( tailEllipsis ){
    append(&sb, zEllipsis);
  }

  /* The cursor's Snippet now holds the buffer built above. */
  pCursor->snippet.zSnippet = sb.s;
  pCursor->snippet.nSnippet = sb.len;
}

/*
** Close the cursor.  For additional information see the documentation
** on the xClose method of the virtual table interface.
*/
︙ | ︙ | |||
2843 2844 2845 2846 2847 2848 2849 2850 | ){ fulltext_cursor *pCursor; if( argc<1 ) return; if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); }else{ memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); | > > > > > > > > > > > > | > > > | 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 | ){ fulltext_cursor *pCursor; if( argc<1 ) return; if( sqlite3_value_type(argv[0])!=SQLITE_BLOB || sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){ sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1); }else{ const char *zStart = "<b>"; const char *zEnd = "</b>"; const char *zEllipsis = "<b>...</b>"; memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor)); if( argc>=2 ){ zStart = (const char*)sqlite3_value_text(argv[1]); if( argc>=3 ){ zEnd = (const char*)sqlite3_value_text(argv[2]); if( argc>=4 ){ zEllipsis = (const char*)sqlite3_value_text(argv[3]); } } } snippetAllOffsets(pCursor); snippetText(pCursor, zStart, zEnd, zEllipsis); sqlite3_result_text(pContext, pCursor->snippet.zSnippet, pCursor->snippet.nSnippet, SQLITE_STATIC); } } /* ** Implementation of the offsets() function for FTS1 */ static void snippetOffsetsFunc( |
︙ | ︙ |
Changes to test/fts1c.test.
1 2 3 4 5 6 7 8 9 10 11 12 13 | # 2006 September 14 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # This file implements regression tests for SQLite library. The # focus of this script is testing the FTS1 module. # | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 | # 2006 September 14 # # The author disclaims copyright to this source code. In place of # a legal notice, here is a blessing: # # May you do good and not evil. # May you find forgiveness for yourself and forgive others. # May you share freely, never taking more than you give. # #************************************************************************* # This file implements regression tests for SQLite library. The # focus of this script is testing the FTS1 module. # # $Id: fts1c.test,v 1.7 2006/09/21 02:03:11 drh Exp $ # set testdir [file dirname $argv0] source $testdir/tester.tcl # If SQLITE_ENABLE_FTS1 is defined, omit this file. ifcapable !fts1 { |
︙ | ︙ | |||
1113 1114 1115 1116 1117 1118 1119 1120 1121 | } {32 {3 0 94 5 3 0 114 5 3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7 3 1 493 7}} do_test fts1c-3.2 { execsql { SELECT rowid, offsets(email) FROM email WHERE body MATCH '"child product"' } } {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}} finish_test | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 | } {32 {3 0 94 5 3 0 114 5 3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7 3 1 493 7}} do_test fts1c-3.2 { execsql { SELECT rowid, offsets(email) FROM email WHERE body MATCH '"child product"' } } {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}} # Snippet generator tests # do_test fts1c-4.1 { execsql { SELECT snippet(email) FROM email WHERE email MATCH 'subject:gas reminder' } } {{Alert Posted 10:00 AM November 20,2000: E-<b>GAS</b> Request <b>Reminder</b>}} do_test fts1c-4.2 { execsql { SELECT snippet(email) FROM email WHERE email MATCH 'christmas candlelight' } } {{<b>...</b>place.? What do you think about going here <b>Christmas</b> eve?? They have an 11:00 a.m. service and a <b>candlelight</b> service at 5:00 p.m., among others. 
<b>...</b>}} do_test fts1c-4.3 { execsql { SELECT snippet(email) FROM email WHERE email MATCH 'deal sheet potential reuse' } } {{EOL-Accenture <b>Deal</b> <b>Sheet</b><b>...</b>intent Review Enron asset base for <b>potential</b> <b>reuse</b>/ licensing Contract negotiations <b>...</b>}} do_test fts1c-4.4 { execsql { SELECT snippet(email,'<<<','>>>',' ') FROM email WHERE email MATCH 'deal sheet potential reuse' } } {{EOL-Accenture <<<Deal>>> <<<Sheet>>> intent Review Enron asset base for <<<potential>>> <<<reuse>>>/ licensing Contract negotiations }} finish_test |