/ Check-in [c7ee60d0]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implementation of the snippet() function for FTS1. Includes a few simple test cases but more testing is needed. (CVS 3431)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:c7ee60d00976efab25a830e7416538010c734129
User & Date: drh 2006-09-21 02:03:09
Context
2006-09-21
11:02
Be more aggressive with the SQLITE_OMIT_VACUUM macro. Saves about 150 bytes of code space. (CVS 3432) check-in: 7e618db4 user: drh tags: trunk
02:03
Implementation of the snippet() function for FTS1. Includes a few simple test cases but more testing is needed. (CVS 3431) check-in: c7ee60d0 user: drh tags: trunk
2006-09-18
21:14
Fixed a build problem in sqlite3_extension_init(). (CVS 3430) check-in: bb2e1871 user: adamd tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to ext/fts1/fts1.c.

    49     49   void initStringBuffer(StringBuffer *sb){
    50     50     sb->len = 0;
    51     51     sb->alloced = 100;
    52     52     sb->s = malloc(100);
    53     53     sb->s[0] = '\0';
    54     54   }
    55     55   
    56         -void append(StringBuffer *sb, const char *zFrom){
    57         -  int nFrom = strlen(zFrom);
           56  +void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
    58     57     if( sb->len + nFrom >= sb->alloced ){
    59     58       sb->alloced = sb->len + nFrom + 100;
    60     59       sb->s = realloc(sb->s, sb->alloced+1);
    61     60       if( sb->s==0 ){
    62     61         initStringBuffer(sb);
    63     62         return;
    64     63       }
    65     64     }
    66         -  strcpy(sb->s + sb->len, zFrom);
           65  +  memcpy(sb->s + sb->len, zFrom, nFrom);
    67     66     sb->len += nFrom;
           67  +  sb->s[sb->len] = 0;
           68  +}
           69  +void append(StringBuffer *sb, const char *zFrom){
           70  +  nappend(sb, zFrom, strlen(zFrom));
    68     71   }
    69     72   
    70     73   /* We encode variable-length integers in little-endian order using seven bits
    71     74    * per byte as follows:
    72     75   **
    73     76   ** KEY:
    74     77   **         A = 0xxxxxxx    7 bits of data and one flag bit
................................................................................
   905    908   ** An instance of the following structure keeps track of generated
   906    909   ** matching-word offset information and snippets.
   907    910   */
   908    911   typedef struct Snippet {
   909    912     int nMatch;     /* Total number of matches */
   910    913     int nAlloc;     /* Space allocated for aMatch[] */
   911    914     struct snippetMatch { /* One entry for each matching term */
   912         -    char exemplar;       /* True if this match should be shown in the snippet */
          915  +    char snStatus;       /* Status flag for use while constructing snippets */
   913    916       short int iCol;      /* The column that contains the match */
   914    917       short int iTerm;     /* The index in Query.pTerms[] of the matching term */
   915    918       short int nByte;     /* Number of bytes in the term */
   916         -    short int nContext;  /* Number of bytes of context for this match */
   917    919       int iStart;          /* The offset to the first character of the term */
   918         -    int iContext;        /* Start of the context */
   919    920     } *aMatch;      /* Points to space obtained from malloc */
   920    921     char *zOffset;  /* Text rendering of aMatch[] */
   921    922     int nOffset;    /* strlen(zOffset) */
          923  +  char *zSnippet; /* Snippet text */
          924  +  int nSnippet;   /* strlen(zSnippet) */
   922    925   } Snippet;
   923    926   
   924    927   
   925    928   typedef enum QueryType {
   926    929     QUERY_GENERIC,   /* table scan */
   927    930     QUERY_ROWID,     /* lookup by rowid */
   928    931     QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
................................................................................
  1998   2001   
  1999   2002   /* Free all of the dynamically allocated memory held by the
  2000   2003   ** Snippet
  2001   2004   */
  2002   2005   static void snippetClear(Snippet *p){
  2003   2006     free(p->aMatch);
  2004   2007     free(p->zOffset);
         2008  +  free(p->zSnippet);
  2005   2009     memset(p, 0, sizeof(*p));
  2006   2010   }
  2007   2011   /*
  2008   2012   ** Append a single entry to the p->aMatch[] log.
  2009   2013   */
  2010   2014   static void snippetAppendMatch(
  2011   2015     Snippet *p,               /* Append the entry to this snippet */
................................................................................
  2021   2025         p->nMatch = 0;
  2022   2026         p->nAlloc = 0;
  2023   2027         return;
  2024   2028       }
  2025   2029     }
  2026   2030     i = p->nMatch++;
  2027   2031     pMatch = &p->aMatch[i];
  2028         -  pMatch->exemplar = 0;
  2029   2032     pMatch->iCol = iCol;
  2030   2033     pMatch->iTerm = iTerm;
  2031   2034     pMatch->iStart = iStart;
  2032   2035     pMatch->nByte = nByte;
  2033   2036   }
  2034   2037   
  2035   2038   /*
................................................................................
  2162   2165       cnt++;
  2163   2166     }
  2164   2167     p->zOffset = sb.s;
  2165   2168     p->nOffset = sb.len;
  2166   2169   }
  2167   2170   
  2168   2171   /*
  2169         -** Scan all matches in Snippet and mark the exemplars.  Exemplars are
  2170         -** matches that we definitely want to include in the snippet.
         2172  +** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
         2173  +** of matching words some of which might be in zDoc.  zDoc is column
         2174  +** number iCol.
  2171   2175   **
  2172         -** Generally speaking, each keyword in the search phrase will have
  2173         -** a single exemplar.  When a keyword matches at multiple points
  2174         -** within the document, the trick is figuring which of these matches
  2175         -** should be the examplar.
         2176  +** iBreak is suggested spot in zDoc where we could begin or end an
         2177  +** excerpt.  Return a value similar to iBreak but possibly adjusted
         2178  +** to be a little left or right so that the break point is better.
  2176   2179   */
  2177         -static void snippetFindExemplars(Snippet *p, Query *pQ){
         2180  +static int wordBoundary(
         2181  +  int iBreak,                   /* The suggested break point */
         2182  +  const char *zDoc,             /* Document text */
         2183  +  int nDoc,                     /* Number of bytes in zDoc[] */
         2184  +  struct snippetMatch *aMatch,  /* Matching words */
         2185  +  int nMatch,                   /* Number of entries in aMatch[] */
         2186  +  int iCol                      /* The column number for zDoc[] */
         2187  +){
         2188  +  int i;
         2189  +  if( iBreak<=10 ){
         2190  +    return 0;
         2191  +  }
         2192  +  if( iBreak>=nDoc-10 ){
         2193  +    return nDoc;
         2194  +  }
         2195  +  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
         2196  +  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
         2197  +  if( i<nMatch ){
         2198  +    if( aMatch[i].iStart<iBreak+10 ){
         2199  +      return aMatch[i].iStart;
         2200  +    }
         2201  +    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
         2202  +      return aMatch[i-1].iStart;
         2203  +    }
         2204  +  }
         2205  +  for(i=1; i<=10; i++){
         2206  +    if( isspace(zDoc[iBreak-i]) ){
         2207  +      return iBreak - i + 1;
         2208  +    }
         2209  +    if( isspace(zDoc[iBreak+i]) ){
         2210  +      return iBreak + i + 1;
         2211  +    }
         2212  +  }
         2213  +  return iBreak;
         2214  +}
         2215  +
         2216  +/*
         2217  +** Allowed values for Snippet.aMatch[].snStatus
         2218  +*/
         2219  +#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
         2220  +#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */  
         2221  +
         2222  +/*
         2223  +** Generate the text of a snippet.
         2224  +*/
         2225  +static void snippetText(
         2226  +  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
         2227  +  const char *zStartMark,     /* Markup to appear before each match */
         2228  +  const char *zEndMark,       /* Markup to appear after each match */
         2229  +  const char *zEllipsis       /* Ellipsis mark */
         2230  +){
  2178   2231     int i, j;
  2179         -  for(i=0; i<pQ->nTerms; i++){
  2180         -    for(j=0; j<p->nMatch; j++){
  2181         -      if( p->aMatch[j].iTerm==i ){
  2182         -        p->aMatch[j].exemplar = 1;
         2232  +  struct snippetMatch *aMatch;
         2233  +  int nMatch;
         2234  +  int nDesired;
         2235  +  StringBuffer sb;
         2236  +  int tailCol = -1;
         2237  +  int tailOffset = -1;
         2238  +  int iCol;
         2239  +  int nDoc;
         2240  +  const char *zDoc;
         2241  +  int iStart, iEnd;
         2242  +  int wantEllipsis;
         2243  +  int tailEllipsis = 0;
         2244  +  int iMatch;
         2245  +  
         2246  +
         2247  +  free(pCursor->snippet.zSnippet);
         2248  +  pCursor->snippet.zSnippet = 0;
         2249  +  aMatch = pCursor->snippet.aMatch;
         2250  +  nMatch = pCursor->snippet.nMatch;
         2251  +  initStringBuffer(&sb);
         2252  +
         2253  +  for(i=0; i<nMatch; i++){
         2254  +    aMatch[i].snStatus = SNIPPET_IGNORE;
         2255  +  }
         2256  +  nDesired = 0;
         2257  +  for(i=0; i<pCursor->q.nTerms; i++){
         2258  +    for(j=0; j<nMatch; j++){
         2259  +      if( aMatch[j].iTerm==i ){
         2260  +        aMatch[j].snStatus = SNIPPET_DESIRED;
         2261  +        nDesired++;
  2183   2262           break;
  2184   2263         }
  2185   2264       }
  2186   2265     }
  2187         -}
  2188   2266   
  2189         -static void snippetText(Snippet *p, Query *pQ){
  2190         -  
         2267  +  iMatch = 0;
         2268  +  for(i=0; i<nMatch && nDesired>0; i++){
         2269  +    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
         2270  +    nDesired--;
         2271  +    iCol = aMatch[i].iCol;
         2272  +    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
         2273  +    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
         2274  +    iStart = aMatch[i].iStart - 40;
         2275  +    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
         2276  +    if( iStart<=10 ){
         2277  +      iStart = 0;
         2278  +      wantEllipsis = 0;
         2279  +    }else{
         2280  +      wantEllipsis = 1;
         2281  +    }
         2282  +    if( iCol==tailCol && iStart<=tailOffset+20 ){
         2283  +      iStart = tailOffset;
         2284  +      wantEllipsis = 0;
         2285  +      tailEllipsis = 0;
         2286  +    }
         2287  +    if( wantEllipsis || tailEllipsis ){
         2288  +      append(&sb, zEllipsis);
         2289  +    }
         2290  +    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
         2291  +    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
         2292  +    if( iEnd>=nDoc-10 ){
         2293  +      iEnd = nDoc;
         2294  +      tailEllipsis = 0;
         2295  +    }else{
         2296  +      tailEllipsis = 1;
         2297  +    }
         2298  +    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
         2299  +    while( iStart<iEnd ){
         2300  +      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart ){ iMatch++; }
         2301  +      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd ){
         2302  +        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
         2303  +        iStart = aMatch[iMatch].iStart;
         2304  +        append(&sb, zStartMark);
         2305  +        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
         2306  +        append(&sb, zEndMark);
         2307  +        iStart += aMatch[iMatch].nByte;
         2308  +        for(j=iMatch+1; j<nMatch; j++){
         2309  +          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
         2310  +              && aMatch[j].snStatus==SNIPPET_DESIRED ){
         2311  +            nDesired--;
         2312  +            aMatch[j].snStatus = SNIPPET_IGNORE;
         2313  +          }
         2314  +        }
         2315  +      }else{
         2316  +        nappend(&sb, &zDoc[iStart], iEnd - iStart);
         2317  +        iStart = iEnd;
         2318  +      }
         2319  +    }
         2320  +    tailCol = iCol;
         2321  +    tailOffset = iEnd;
         2322  +  }
         2323  +  if( tailEllipsis ){
         2324  +    append(&sb, zEllipsis);
         2325  +  }
         2326  +  pCursor->snippet.zSnippet = sb.s;
         2327  +  pCursor->snippet.nSnippet = sb.len;  
  2191   2328   }
  2192   2329   
  2193   2330   
  2194   2331   /*
  2195   2332   ** Close the cursor.  For additional information see the documentation
  2196   2333   ** on the xClose method of the virtual table interface.
  2197   2334   */
................................................................................
  2843   2980   ){
  2844   2981     fulltext_cursor *pCursor;
  2845   2982     if( argc<1 ) return;
  2846   2983     if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
  2847   2984         sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
  2848   2985       sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
  2849   2986     }else{
         2987  +    const char *zStart = "<b>";
         2988  +    const char *zEnd = "</b>";
         2989  +    const char *zEllipsis = "<b>...</b>";
  2850   2990       memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
  2851         -    /* TODO:  Return the snippet */
         2991  +    if( argc>=2 ){
         2992  +      zStart = (const char*)sqlite3_value_text(argv[1]);
         2993  +      if( argc>=3 ){
         2994  +        zEnd = (const char*)sqlite3_value_text(argv[2]);
         2995  +        if( argc>=4 ){
         2996  +          zEllipsis = (const char*)sqlite3_value_text(argv[3]);
         2997  +        }
         2998  +      }
         2999  +    }
         3000  +    snippetAllOffsets(pCursor);
         3001  +    snippetText(pCursor, zStart, zEnd, zEllipsis);
         3002  +    sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
         3003  +                        pCursor->snippet.nSnippet, SQLITE_STATIC);
  2852   3004     }
  2853   3005   }
  2854   3006   
  2855   3007   /*
  2856   3008   ** Implementation of the offsets() function for FTS1
  2857   3009   */
  2858   3010   static void snippetOffsetsFunc(

Changes to test/fts1c.test.

     7      7   #    May you find forgiveness for yourself and forgive others.
     8      8   #    May you share freely, never taking more than you give.
     9      9   #
    10     10   #*************************************************************************
    11     11   # This file implements regression tests for SQLite library.  The
    12     12   # focus of this script is testing the FTS1 module.
    13     13   #
    14         -# $Id: fts1c.test,v 1.6 2006/09/18 02:12:48 drh Exp $
           14  +# $Id: fts1c.test,v 1.7 2006/09/21 02:03:11 drh Exp $
    15     15   #
    16     16   
    17     17   set testdir [file dirname $argv0]
    18     18   source $testdir/tester.tcl
    19     19   
    20     20   # If SQLITE_ENABLE_FTS1 is defined, omit this file.
    21     21   ifcapable !fts1 {
................................................................................
  1113   1113   } {32 {3 0 94 5 3 0 114 5 3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7 3 1 493 7}}
  1114   1114   do_test fts1c-3.2 {
  1115   1115     execsql {
  1116   1116       SELECT rowid, offsets(email) FROM email
  1117   1117        WHERE body MATCH '"child product"'
  1118   1118     }
  1119   1119   } {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}}
         1120  +
         1121  +# Snippet generator tests
         1122  +#
         1123  +do_test fts1c-4.1 {
         1124  +  execsql {
         1125  +    SELECT snippet(email) FROM email
         1126  +     WHERE email MATCH 'subject:gas reminder'
         1127  +  }
         1128  +} {{Alert Posted 10:00 AM November 20,2000: E-<b>GAS</b> Request <b>Reminder</b>}}
         1129  +do_test fts1c-4.2 {
         1130  +  execsql {
         1131  +    SELECT snippet(email) FROM email
         1132  +     WHERE email MATCH 'christmas candlelight'
         1133  +  }
         1134  +} {{<b>...</b>place.? What do you think about going here <b>Christmas</b> 
         1135  +eve?? They have an 11:00 a.m. service and a <b>candlelight</b> service at 5:00 p.m., 
         1136  +among others.
         1137  +
         1138  +<b>...</b>}}
         1139  +
         1140  +do_test fts1c-4.3 {
         1141  +  execsql {
         1142  +    SELECT snippet(email) FROM email
         1143  +     WHERE email MATCH 'deal sheet potential reuse'
         1144  +  }
         1145  +} {{EOL-Accenture <b>Deal</b> <b>Sheet</b><b>...</b>intent
         1146  +     Review Enron asset base for <b>potential</b> <b>reuse</b>/ licensing
         1147  +     Contract negotiations
         1148  +
         1149  +<b>...</b>}}
         1150  +do_test fts1c-4.4 {
         1151  +  execsql {
         1152  +    SELECT snippet(email,'<<<','>>>',' ') FROM email
         1153  +     WHERE email MATCH 'deal sheet potential reuse'
         1154  +  }
         1155  +} {{EOL-Accenture <<<Deal>>> <<<Sheet>>> intent
         1156  +     Review Enron asset base for <<<potential>>> <<<reuse>>>/ licensing
         1157  +     Contract negotiations
         1158  +
         1159  + }}
  1120   1160   
  1121   1161   finish_test