SQLite

Check-in [c7ee60d009]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Implementation of the snippet() function for FTS1. Includes a few simple test cases but more testing is needed. (CVS 3431)
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: c7ee60d00976efab25a830e7416538010c734129
User & Date: drh 2006-09-21 02:03:09.000
Context
2006-09-21
11:02
Be more aggressive with the SQLITE_OMIT_VACUUM macro. Saves about 150 bytes of code space. (CVS 3432) (check-in: 7e618db457 user: drh tags: trunk)
02:03
Implementation of the snippet() function for FTS1. Includes a few simple test cases but more testing is needed. (CVS 3431) (check-in: c7ee60d009 user: drh tags: trunk)
2006-09-18
21:14
Fixed a build problem in sqlite3_extension_init(). (CVS 3430) (check-in: bb2e1871cb user: adamd tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to ext/fts1/fts1.c.
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67




68
69
70
71
72
73
74
void initStringBuffer(StringBuffer *sb){
  sb->len = 0;
  sb->alloced = 100;
  sb->s = malloc(100);
  sb->s[0] = '\0';
}

void append(StringBuffer *sb, const char *zFrom){
  int nFrom = strlen(zFrom);
  if( sb->len + nFrom >= sb->alloced ){
    sb->alloced = sb->len + nFrom + 100;
    sb->s = realloc(sb->s, sb->alloced+1);
    if( sb->s==0 ){
      initStringBuffer(sb);
      return;
    }
  }
  strcpy(sb->s + sb->len, zFrom);
  sb->len += nFrom;




}

/* We encode variable-length integers in little-endian order using seven bits
 * per byte as follows:
**
** KEY:
**         A = 0xxxxxxx    7 bits of data and one flag bit







|
<








|

>
>
>
>







49
50
51
52
53
54
55
56

57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
void initStringBuffer(StringBuffer *sb){
  sb->len = 0;
  sb->alloced = 100;
  sb->s = malloc(100);
  sb->s[0] = '\0';
}

void nappend(StringBuffer *sb, const char *zFrom, int nFrom){

  if( sb->len + nFrom >= sb->alloced ){
    sb->alloced = sb->len + nFrom + 100;
    sb->s = realloc(sb->s, sb->alloced+1);
    if( sb->s==0 ){
      initStringBuffer(sb);
      return;
    }
  }
  memcpy(sb->s + sb->len, zFrom, nFrom);
  sb->len += nFrom;
  sb->s[sb->len] = 0;
}
void append(StringBuffer *sb, const char *zFrom){
  nappend(sb, zFrom, strlen(zFrom));
}

/* We encode variable-length integers in little-endian order using seven bits
 * per byte as follows:
**
** KEY:
**         A = 0xxxxxxx    7 bits of data and one flag bit
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921


922
923
924
925
926
927
928
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char exemplar;       /* True if this match should be shown in the snippet */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    short int nByte;     /* Number of bytes in the term */
    short int nContext;  /* Number of bytes of context for this match */
    int iStart;          /* The offset to the first character of the term */
    int iContext;        /* Start of the context */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */


} Snippet;


typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/







|



<

<



>
>







908
909
910
911
912
913
914
915
916
917
918

919

920
921
922
923
924
925
926
927
928
929
930
931
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    short int nByte;     /* Number of bytes in the term */

    int iStart;          /* The offset to the first character of the term */

  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;


typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_ROWID,     /* lookup by rowid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
1998
1999
2000
2001
2002
2003
2004

2005
2006
2007
2008
2009
2010
2011

/* Free all of the dynamically allocated memory held by the
** Snippet
*/
static void snippetClear(Snippet *p){
  free(p->aMatch);
  free(p->zOffset);

  memset(p, 0, sizeof(*p));
}
/*
** Append a single entry to the p->aMatch[] log.
*/
static void snippetAppendMatch(
  Snippet *p,               /* Append the entry to this snippet */







>







2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015

/* Free all of the dynamically allocated memory held by the
** Snippet
*/
static void snippetClear(Snippet *p){
  free(p->aMatch);
  free(p->zOffset);
  free(p->zSnippet);
  memset(p, 0, sizeof(*p));
}
/*
** Append a single entry to the p->aMatch[] log.
*/
static void snippetAppendMatch(
  Snippet *p,               /* Append the entry to this snippet */
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
      p->nMatch = 0;
      p->nAlloc = 0;
      return;
    }
  }
  i = p->nMatch++;
  pMatch = &p->aMatch[i];
  pMatch->exemplar = 0;
  pMatch->iCol = iCol;
  pMatch->iTerm = iTerm;
  pMatch->iStart = iStart;
  pMatch->nByte = nByte;
}

/*







<







2025
2026
2027
2028
2029
2030
2031

2032
2033
2034
2035
2036
2037
2038
      p->nMatch = 0;
      p->nAlloc = 0;
      return;
    }
  }
  i = p->nMatch++;
  pMatch = &p->aMatch[i];

  pMatch->iCol = iCol;
  pMatch->iTerm = iTerm;
  pMatch->iStart = iStart;
  pMatch->nByte = nByte;
}

/*
2162
2163
2164
2165
2166
2167
2168
2169
2170

2171
2172





















2173

















2174





2175


2176
2177





2178





















2179




2180
2181
2182

2183
2184
2185
2186
2187














2188
2189




2190









































2191
2192
2193
2194
2195
2196
2197
    cnt++;
  }
  p->zOffset = sb.s;
  p->nOffset = sb.len;
}

/*
** Scan all matches in Snippet and mark the exemplars.  Exemplars are
** matches that we definitely want to include in the snippet.

**
** Generally speaking, each keyword in the search phrase will have





















** a single exemplar.  When a keyword matches at multiple points

















** within the document, the trick is figuring which of these matches





** should be the examplar.


*/
static void snippetFindExemplars(Snippet *p, Query *pQ){





  int i, j;





















  for(i=0; i<pQ->nTerms; i++){




    for(j=0; j<p->nMatch; j++){
      if( p->aMatch[j].iTerm==i ){
        p->aMatch[j].exemplar = 1;

        break;
      }
    }
  }
}















static void snippetText(Snippet *p, Query *pQ){




  









































}


/*
** Close the cursor.  For additional information see the documentation
** on the xClose method of the virtual table interface.
*/







|
|
>

<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
|
>
>

|
>
>
>
>
>

>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
|
|
|
>




|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
<
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175

2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281

2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
    cnt++;
  }
  p->zOffset = sb.s;
  p->nOffset = sb.len;
}

/*
** zDoc[0..nDoc-1] is phrase of text.  aMatch[0..nMatch-1] are a set
** of matching words some of which might be in zDoc.  zDoc is column
** number iCol.
**

** iBreak is suggested spot in zDoc where we could begin or end an
** excerpt.  Return a value similar to iBreak but possibly adjusted
** to be a little left or right so that the break point is better.
*/
static int wordBoundary(
  int iBreak,                   /* The suggested break point */
  const char *zDoc,             /* Document text */
  int nDoc,                     /* Number of bytes in zDoc[] */
  struct snippetMatch *aMatch,  /* Matching words */
  int nMatch,                   /* Number of entries in aMatch[] */
  int iCol                      /* The column number for zDoc[] */
){
  int i;
  if( iBreak<=10 ){
    return 0;
  }
  if( iBreak>=nDoc-10 ){
    return nDoc;
  }
  for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
  while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
  if( i<nMatch ){
    if( aMatch[i].iStart<iBreak+10 ){
      return aMatch[i].iStart;
    }
    if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
      return aMatch[i-1].iStart;
    }
  }
  for(i=1; i<=10; i++){
    if( isspace(zDoc[iBreak-i]) ){
      return iBreak - i + 1;
    }
    if( isspace(zDoc[iBreak+i]) ){
      return iBreak + i + 1;
    }
  }
  return iBreak;
}

/*
** Allowed values for Snippet.aMatch[].snStatus
*/
#define SNIPPET_IGNORE  0   /* It is ok to omit this match from the snippet */
#define SNIPPET_DESIRED 1   /* We want to include this match in the snippet */  

/*
** Generate the text of a snippet.
*/
static void snippetText(
  fulltext_cursor *pCursor,   /* The cursor we need the snippet for */
  const char *zStartMark,     /* Markup to appear before each match */
  const char *zEndMark,       /* Markup to appear after each match */
  const char *zEllipsis       /* Ellipsis mark */
){
  int i, j;
  struct snippetMatch *aMatch;
  int nMatch;
  int nDesired;
  StringBuffer sb;
  int tailCol = -1;
  int tailOffset = -1;
  int iCol;
  int nDoc;
  const char *zDoc;
  int iStart, iEnd;
  int wantEllipsis;
  int tailEllipsis = 0;
  int iMatch;
  

  free(pCursor->snippet.zSnippet);
  pCursor->snippet.zSnippet = 0;
  aMatch = pCursor->snippet.aMatch;
  nMatch = pCursor->snippet.nMatch;
  initStringBuffer(&sb);

  for(i=0; i<nMatch; i++){
    aMatch[i].snStatus = SNIPPET_IGNORE;
  }
  nDesired = 0;
  for(i=0; i<pCursor->q.nTerms; i++){
    for(j=0; j<nMatch; j++){
      if( aMatch[j].iTerm==i ){
        aMatch[j].snStatus = SNIPPET_DESIRED;
        nDesired++;
        break;
      }
    }
  }

  iMatch = 0;
  for(i=0; i<nMatch && nDesired>0; i++){
    if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
    nDesired--;
    iCol = aMatch[i].iCol;
    zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
    nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
    iStart = aMatch[i].iStart - 40;
    iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iStart<=10 ){
      iStart = 0;
      wantEllipsis = 0;
    }else{
      wantEllipsis = 1;
    }

    if( iCol==tailCol && iStart<=tailOffset+20 ){
      iStart = tailOffset;
      wantEllipsis = 0;
      tailEllipsis = 0;
    }
    if( wantEllipsis || tailEllipsis ){
      append(&sb, zEllipsis);
    }
    iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
    iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
    if( iEnd>=nDoc-10 ){
      iEnd = nDoc;
      tailEllipsis = 0;
    }else{
      tailEllipsis = 1;
    }
    while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
    while( iStart<iEnd ){
      while( iMatch<nMatch && aMatch[iMatch].iStart<iStart ){ iMatch++; }
      if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd ){
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
        iStart = aMatch[iMatch].iStart;
        append(&sb, zStartMark);
        nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
        append(&sb, zEndMark);
        iStart += aMatch[iMatch].nByte;
        for(j=iMatch+1; j<nMatch; j++){
          if( aMatch[j].iTerm==aMatch[iMatch].iTerm
              && aMatch[j].snStatus==SNIPPET_DESIRED ){
            nDesired--;
            aMatch[j].snStatus = SNIPPET_IGNORE;
          }
        }
      }else{
        nappend(&sb, &zDoc[iStart], iEnd - iStart);
        iStart = iEnd;
      }
    }
    tailCol = iCol;
    tailOffset = iEnd;
  }
  if( tailEllipsis ){
    append(&sb, zEllipsis);
  }
  pCursor->snippet.zSnippet = sb.s;
  pCursor->snippet.nSnippet = sb.len;  
}


/*
** Close the cursor.  For additional information see the documentation
** on the xClose method of the virtual table interface.
*/
2843
2844
2845
2846
2847
2848
2849



2850









2851



2852
2853
2854
2855
2856
2857
2858
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
  }else{



    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));









    /* TODO:  Return the snippet */



  }
}

/*
** Implementation of the offsets() function for FTS1
*/
static void snippetOffsetsFunc(







>
>
>

>
>
>
>
>
>
>
>
>
|
>
>
>







2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
2990
2991
2992
2993
2994
2995
2996
2997
2998
2999
3000
3001
3002
3003
3004
3005
3006
3007
3008
3009
3010
){
  fulltext_cursor *pCursor;
  if( argc<1 ) return;
  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
    sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
  }else{
    const char *zStart = "<b>";
    const char *zEnd = "</b>";
    const char *zEllipsis = "<b>...</b>";
    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
    if( argc>=2 ){
      zStart = (const char*)sqlite3_value_text(argv[1]);
      if( argc>=3 ){
        zEnd = (const char*)sqlite3_value_text(argv[2]);
        if( argc>=4 ){
          zEllipsis = (const char*)sqlite3_value_text(argv[3]);
        }
      }
    }
    snippetAllOffsets(pCursor);
    snippetText(pCursor, zStart, zEnd, zEllipsis);
    sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
                        pCursor->snippet.nSnippet, SQLITE_STATIC);
  }
}

/*
** Implementation of the offsets() function for FTS1
*/
static void snippetOffsetsFunc(
Changes to test/fts1c.test.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 2006 September 14
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS1 module.
#
# $Id: fts1c.test,v 1.6 2006/09/18 02:12:48 drh Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS1 is defined, omit this file.
ifcapable !fts1 {













|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# 2006 September 14
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#*************************************************************************
# This file implements regression tests for SQLite library.  The
# focus of this script is testing the FTS1 module.
#
# $Id: fts1c.test,v 1.7 2006/09/21 02:03:11 drh Exp $
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

# If SQLITE_ENABLE_FTS1 is defined, omit this file.
ifcapable !fts1 {
1113
1114
1115
1116
1117
1118
1119
1120








































1121
} {32 {3 0 94 5 3 0 114 5 3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7 3 1 493 7}}
do_test fts1c-3.2 {
  execsql {
    SELECT rowid, offsets(email) FROM email
     WHERE body MATCH '"child product"'
  }
} {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}}









































finish_test








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
} {32 {3 0 94 5 3 0 114 5 3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7 3 1 493 7}}
do_test fts1c-3.2 {
  execsql {
    SELECT rowid, offsets(email) FROM email
     WHERE body MATCH '"child product"'
  }
} {32 {3 0 207 5 3 1 213 7 3 0 245 5 3 1 251 7 3 0 409 5 3 1 415 7}}

# Snippet generator tests
#
do_test fts1c-4.1 {
  execsql {
    SELECT snippet(email) FROM email
     WHERE email MATCH 'subject:gas reminder'
  }
} {{Alert Posted 10:00 AM November 20,2000: E-<b>GAS</b> Request <b>Reminder</b>}}
do_test fts1c-4.2 {
  execsql {
    SELECT snippet(email) FROM email
     WHERE email MATCH 'christmas candlelight'
  }
} {{<b>...</b>place.? What do you think about going here <b>Christmas</b> 
eve?? They have an 11:00 a.m. service and a <b>candlelight</b> service at 5:00 p.m., 
among others.

<b>...</b>}}

do_test fts1c-4.3 {
  execsql {
    SELECT snippet(email) FROM email
     WHERE email MATCH 'deal sheet potential reuse'
  }
} {{EOL-Accenture <b>Deal</b> <b>Sheet</b><b>...</b>intent
     Review Enron asset base for <b>potential</b> <b>reuse</b>/ licensing
     Contract negotiations

<b>...</b>}}
do_test fts1c-4.4 {
  execsql {
    SELECT snippet(email,'<<<','>>>',' ') FROM email
     WHERE email MATCH 'deal sheet potential reuse'
  }
} {{EOL-Accenture <<<Deal>>> <<<Sheet>>> intent
     Review Enron asset base for <<<potential>>> <<<reuse>>>/ licensing
     Contract negotiations

 }}

finish_test