Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add the xLanguageid method to sqlite3_fts3_tokenizer versions 1 and greater. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | fts4-languageid |
Files: | files | file ages | folders |
SHA1: |
f8e9c445dd358c40e5a7bf3756b9f291 |
User & Date: | dan 2012-03-03 18:46:41.456 |
Context
2012-03-05
| ||
15:33 | Merge the fts4-languageid branch with the trunk. (check-in: 99a9073b5e user: dan tags: trunk) | |
2012-03-03
| ||
18:46 | Add the xLanguageid method to sqlite3_fts3_tokenizer versions 1 and greater. (Closed-Leaf check-in: f8e9c445dd user: dan tags: fts4-languageid) | |
2012-03-02
| ||
19:53 | Fix problems with combining content= and languageid= in a single fts4 table. (check-in: 22491e7bc3 user: dan tags: fts4-languageid) | |
Changes
Changes to ext/fts3/fts3.c.
︙ | ︙ | |||
2954 2955 2956 2957 2958 2959 2960 | int iCol = idxNum-FTS3_FULLTEXT_SEARCH; const char *zQuery = (const char *)sqlite3_value_text(apVal[0]); if( zQuery==0 && sqlite3_value_type(apVal[0])!=SQLITE_NULL ){ return SQLITE_NOMEM; } | > > > | | < < < | 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 | int iCol = idxNum-FTS3_FULLTEXT_SEARCH; const char *zQuery = (const char *)sqlite3_value_text(apVal[0]); if( zQuery==0 && sqlite3_value_type(apVal[0])!=SQLITE_NULL ){ return SQLITE_NOMEM; } pCsr->iLangid = 0; if( nVal==2 ) pCsr->iLangid = sqlite3_value_int(apVal[1]); rc = sqlite3Fts3ExprParse(p->pTokenizer, pCsr->iLangid, p->azColumn, p->bHasStat, p->nColumn, iCol, zQuery, -1, &pCsr->pExpr ); if( rc!=SQLITE_OK ){ if( rc==SQLITE_ERROR ){ static const char *zErr = "malformed MATCH expression: [%s]"; p->base.zErrMsg = sqlite3_mprintf(zErr, zQuery); } return rc; } rc = sqlite3Fts3ReadLock(p); if( rc!=SQLITE_OK ) return rc; rc = fts3EvalStart(pCsr); sqlite3Fts3SegmentsClose(p); if( rc!=SQLITE_OK ) return rc; |
︙ | ︙ |
Changes to ext/fts3/fts3Int.h.
︙ | ︙ | |||
494 495 496 497 498 499 500 | void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*); void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *, const char *, const char *, int, int ); void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *); /* fts3_expr.c */ | | > > > > | 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 | void sqlite3Fts3Offsets(sqlite3_context*, Fts3Cursor*); void sqlite3Fts3Snippet(sqlite3_context *, Fts3Cursor *, const char *, const char *, const char *, int, int ); void sqlite3Fts3Matchinfo(sqlite3_context *, Fts3Cursor *, const char *); /* fts3_expr.c */ int sqlite3Fts3ExprParse(sqlite3_tokenizer *, int, char **, int, int, int, const char *, int, Fts3Expr ** ); void sqlite3Fts3ExprFree(Fts3Expr *); #ifdef SQLITE_TEST int sqlite3Fts3ExprInitTestInterface(sqlite3 *db); int sqlite3Fts3InitTerm(sqlite3 *db); #endif int sqlite3Fts3OpenTokenizer(sqlite3_tokenizer *, int, const char *, int, sqlite3_tokenizer_cursor ** ); /* fts3_aux.c */ int sqlite3Fts3InitAux(sqlite3 *db); void sqlite3Fts3EvalPhraseCleanup(Fts3Phrase *); int sqlite3Fts3MsrIncrStart( |
︙ | ︙ |
Changes to ext/fts3/fts3_expr.c.
︙ | ︙ | |||
88 89 90 91 92 93 94 95 96 97 98 99 100 101 | ** FTSQUERY_PHRASE with a unary "-" attached to it. i.e. "mysql" in the ** FTS3 query "sqlite -mysql". Otherwise, ParseContext.isNot is set to ** zero. */ typedef struct ParseContext ParseContext; struct ParseContext { sqlite3_tokenizer *pTokenizer; /* Tokenizer module */ const char **azCol; /* Array of column names for fts3 table */ int bFts4; /* True to allow FTS4-only syntax */ int nCol; /* Number of entries in azCol[] */ int iDefaultCol; /* Default column to query */ int isNot; /* True if getNextNode() sees a unary - */ sqlite3_context *pCtx; /* Write error message here */ int nNest; /* Number of nested brackets */ | > | 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | ** FTSQUERY_PHRASE with a unary "-" attached to it. i.e. "mysql" in the ** FTS3 query "sqlite -mysql". Otherwise, ParseContext.isNot is set to ** zero. */ typedef struct ParseContext ParseContext; struct ParseContext { sqlite3_tokenizer *pTokenizer; /* Tokenizer module */ int iLangid; /* Language id used with tokenizer */ const char **azCol; /* Array of column names for fts3 table */ int bFts4; /* True to allow FTS4-only syntax */ int nCol; /* Number of entries in azCol[] */ int iDefaultCol; /* Default column to query */ int isNot; /* True if getNextNode() sees a unary - */ sqlite3_context *pCtx; /* Write error message here */ int nNest; /* Number of nested brackets */ |
︙ | ︙ | |||
123 124 125 126 127 128 129 130 131 132 133 134 135 136 | */ static void *fts3MallocZero(int nByte){ void *pRet = sqlite3_malloc(nByte); if( pRet ) memset(pRet, 0, nByte); return pRet; } /* ** Extract the next token from buffer z (length n) using the tokenizer ** and other information (column names etc.) in pParse. Create an Fts3Expr ** structure of type FTSQUERY_PHRASE containing a phrase consisting of this ** single token and set *ppExpr to point to it. If the end of the buffer is ** reached before a token is found, set *ppExpr to zero. It is the | > > > > > > > > > > > > > > > > > > > > > > > > > > > | 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 | */ static void *fts3MallocZero(int nByte){ void *pRet = sqlite3_malloc(nByte); if( pRet ) memset(pRet, 0, nByte); return pRet; } int sqlite3Fts3OpenTokenizer( sqlite3_tokenizer *pTokenizer, int iLangid, const char *z, int n, sqlite3_tokenizer_cursor **ppCsr ){ sqlite3_tokenizer_module const *pModule = pTokenizer->pModule; sqlite3_tokenizer_cursor *pCsr = 0; int rc; rc = pModule->xOpen(pTokenizer, z, n, &pCsr); assert( rc==SQLITE_OK || pCsr==0 ); if( rc==SQLITE_OK ){ pCsr->pTokenizer = pTokenizer; if( pModule->iVersion>=1 ){ rc = pModule->xLanguageid(pCsr, iLangid); if( rc!=SQLITE_OK ){ pModule->xClose(pCsr); pCsr = 0; } } } *ppCsr = pCsr; return rc; } /* ** Extract the next token from buffer z (length n) using the tokenizer ** and other information (column names etc.) in pParse. Create an Fts3Expr ** structure of type FTSQUERY_PHRASE containing a phrase consisting of this ** single token and set *ppExpr to point to it. If the end of the buffer is ** reached before a token is found, set *ppExpr to zero. It is the |
︙ | ︙ | |||
150 151 152 153 154 155 156 | sqlite3_tokenizer *pTokenizer = pParse->pTokenizer; sqlite3_tokenizer_module const *pModule = pTokenizer->pModule; int rc; sqlite3_tokenizer_cursor *pCursor; Fts3Expr *pRet = 0; int nConsumed = 0; | | < < | 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | sqlite3_tokenizer *pTokenizer = pParse->pTokenizer; sqlite3_tokenizer_module const *pModule = pTokenizer->pModule; int rc; sqlite3_tokenizer_cursor *pCursor; Fts3Expr *pRet = 0; int nConsumed = 0; rc = sqlite3Fts3OpenTokenizer(pTokenizer, pParse->iLangid, z, n, &pCursor); if( rc==SQLITE_OK ){ const char *zToken; int nToken, iStart, iEnd, iPosition; int nByte; /* total space to allocate */ rc = pModule->xNext(pCursor, &zToken, &nToken, &iStart, &iEnd, &iPosition); if( rc==SQLITE_OK ){ nByte = sizeof(Fts3Expr) + sizeof(Fts3Phrase) + nToken; pRet = (Fts3Expr *)fts3MallocZero(nByte); if( !pRet ){ rc = SQLITE_NOMEM; }else{ pRet->eType = FTSQUERY_PHRASE; |
︙ | ︙ | |||
264 265 266 267 268 269 270 | ** ** Buffer zTemp: Contains copies of all tokens. ** ** The second pass, in the block that begins "if( rc==SQLITE_DONE )" below, ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase ** structures. */ | > | < | 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 | ** ** Buffer zTemp: Contains copies of all tokens. ** ** The second pass, in the block that begins "if( rc==SQLITE_DONE )" below, ** appends buffer zTemp to buffer p, and fills in the Fts3Expr and Fts3Phrase ** structures. */ rc = sqlite3Fts3OpenTokenizer( pTokenizer, pParse->iLangid, zInput, nInput, &pCursor); if( rc==SQLITE_OK ){ int ii; for(ii=0; rc==SQLITE_OK; ii++){ const char *zByte; int nByte, iBegin, iEnd, iPos; rc = pModule->xNext(pCursor, &zByte, &nByte, &iBegin, &iEnd, &iPos); if( rc==SQLITE_OK ){ Fts3PhraseToken *pToken; |
︙ | ︙ | |||
741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 | ** that appears on the left-hand-side of the MATCH operator (the default ** column to match against for tokens for which a column name is not explicitly ** specified as part of the query string), or -1 if tokens may by default ** match any table column. */ int sqlite3Fts3ExprParse( sqlite3_tokenizer *pTokenizer, /* Tokenizer module */ char **azCol, /* Array of column names for fts3 table */ int bFts4, /* True to allow FTS4-only syntax */ int nCol, /* Number of entries in azCol[] */ int iDefaultCol, /* Default column to query */ const char *z, int n, /* Text of MATCH query */ Fts3Expr **ppExpr /* OUT: Parsed query structure */ ){ int nParsed; int rc; ParseContext sParse; sParse.pTokenizer = pTokenizer; sParse.azCol = (const char **)azCol; sParse.nCol = nCol; sParse.iDefaultCol = iDefaultCol; | > > > > < | 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 | ** that appears on the left-hand-side of the MATCH operator (the default ** column to match against for tokens for which a column name is not explicitly ** specified as part of the query string), or -1 if tokens may by default ** match any table column. */ int sqlite3Fts3ExprParse( sqlite3_tokenizer *pTokenizer, /* Tokenizer module */ int iLangid, /* Language id for tokenizer */ char **azCol, /* Array of column names for fts3 table */ int bFts4, /* True to allow FTS4-only syntax */ int nCol, /* Number of entries in azCol[] */ int iDefaultCol, /* Default column to query */ const char *z, int n, /* Text of MATCH query */ Fts3Expr **ppExpr /* OUT: Parsed query structure */ ){ int nParsed; int rc; ParseContext sParse; memset(&sParse, 0, sizeof(ParseContext)); sParse.pTokenizer = pTokenizer; sParse.iLangid = iLangid; sParse.azCol = (const char **)azCol; sParse.nCol = nCol; sParse.iDefaultCol = iDefaultCol; sParse.bFts4 = bFts4; if( z==0 ){ *ppExpr = 0; return SQLITE_OK; } if( n<0 ){ n = (int)strlen(z); |
︙ | ︙ | |||
946 947 948 949 950 951 952 | goto exprtest_out; } for(ii=0; ii<nCol; ii++){ azCol[ii] = (char *)sqlite3_value_text(argv[ii+2]); } rc = sqlite3Fts3ExprParse( | | | 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 | goto exprtest_out; } for(ii=0; ii<nCol; ii++){ azCol[ii] = (char *)sqlite3_value_text(argv[ii+2]); } rc = sqlite3Fts3ExprParse( pTokenizer, 0, azCol, 0, nCol, nCol, zExpr, nExpr, &pExpr ); if( rc!=SQLITE_OK && rc!=SQLITE_NOMEM ){ sqlite3_result_error(context, "Error parsing expression", -1); }else if( rc==SQLITE_NOMEM || !(zBuf = exprToString(pExpr, 0)) ){ sqlite3_result_error_nomem(context); }else{ sqlite3_result_text(context, zBuf, -1, SQLITE_TRANSIENT); |
︙ | ︙ |
Changes to ext/fts3/fts3_snippet.c.
︙ | ︙ | |||
528 529 530 531 532 533 534 535 536 537 538 539 540 541 | ** This is done as part of extracting the snippet text, not when selecting ** the snippet. Snippet selection is done based on doclists only, so there ** is no way for fts3BestSnippet() to know whether or not the document ** actually contains terms that follow the final highlighted term. */ static int fts3SnippetShift( Fts3Table *pTab, /* FTS3 table snippet comes from */ int nSnippet, /* Number of tokens desired for snippet */ const char *zDoc, /* Document text to extract snippet from */ int nDoc, /* Size of buffer zDoc in bytes */ int *piPos, /* IN/OUT: First token of snippet */ u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */ ){ u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */ | > | 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 | ** This is done as part of extracting the snippet text, not when selecting ** the snippet. Snippet selection is done based on doclists only, so there ** is no way for fts3BestSnippet() to know whether or not the document ** actually contains terms that follow the final highlighted term. */ static int fts3SnippetShift( Fts3Table *pTab, /* FTS3 table snippet comes from */ int iLangid, /* Language id to use in tokenizing */ int nSnippet, /* Number of tokens desired for snippet */ const char *zDoc, /* Document text to extract snippet from */ int nDoc, /* Size of buffer zDoc in bytes */ int *piPos, /* IN/OUT: First token of snippet */ u64 *pHlmask /* IN/OUT: Mask of tokens to highlight */ ){ u64 hlmask = *pHlmask; /* Local copy of initial highlight-mask */ |
︙ | ︙ | |||
563 564 565 566 567 568 569 | sqlite3_tokenizer_module *pMod; sqlite3_tokenizer_cursor *pC; pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) ** or more tokens in zDoc/nDoc. */ | | < | 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 | sqlite3_tokenizer_module *pMod; sqlite3_tokenizer_cursor *pC; pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; /* Open a cursor on zDoc/nDoc. Check if there are (nSnippet+nDesired) ** or more tokens in zDoc/nDoc. */ rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, iLangid, zDoc, nDoc, &pC); if( rc!=SQLITE_OK ){ return rc; } while( rc==SQLITE_OK && iCurrent<(nSnippet+nDesired) ){ const char *ZDUMMY; int DUMMY1, DUMMY2, DUMMY3; rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &DUMMY2, &DUMMY3, &iCurrent); } pMod->xClose(pC); if( rc!=SQLITE_OK && rc!=SQLITE_DONE ){ return rc; } |
︙ | ︙ | |||
627 628 629 630 631 632 633 | } return SQLITE_OK; } nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol); /* Open a token cursor on the document. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; | | < | 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 | } return SQLITE_OK; } nDoc = sqlite3_column_bytes(pCsr->pStmt, iCol); /* Open a token cursor on the document. */ pMod = (sqlite3_tokenizer_module *)pTab->pTokenizer->pModule; rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc,nDoc,&pC); if( rc!=SQLITE_OK ){ return rc; } while( rc==SQLITE_OK ){ int iBegin; /* Offset in zDoc of start of token */ int iFin; /* Offset in zDoc of end of token */ int isHighlight; /* True for highlighted terms */ rc = pMod->xNext(pC, &ZDUMMY, &DUMMY1, &iBegin, &iFin, &iCurrent); |
︙ | ︙ | |||
653 654 655 656 657 658 659 | } break; } if( iCurrent<iPos ){ continue; } if( !isShiftDone ){ int n = nDoc - iBegin; | | > > | 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 | } break; } if( iCurrent<iPos ){ continue; } if( !isShiftDone ){ int n = nDoc - iBegin; rc = fts3SnippetShift( pTab, pCsr->iLangid, nSnippet, &zDoc[iBegin], n, &iPos, &hlmask ); isShiftDone = 1; /* Now that the shift has been done, check if the initial "..." are ** required. They are required if (a) this is not the first fragment, ** or (b) this fragment does not begin at position 0 of its column. */ if( rc==SQLITE_OK && (iPos>0 || iFragment>0) ){ |
︙ | ︙ | |||
1386 1387 1388 1389 1390 1391 1392 | continue; } rc = SQLITE_NOMEM; goto offsets_out; } /* Initialize a tokenizer iterator to iterate through column iCol. */ | > | > < | 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 | continue; } rc = SQLITE_NOMEM; goto offsets_out; } /* Initialize a tokenizer iterator to iterate through column iCol. */ rc = sqlite3Fts3OpenTokenizer(pTab->pTokenizer, pCsr->iLangid, zDoc, nDoc, &pC ); if( rc!=SQLITE_OK ) goto offsets_out; rc = pMod->xNext(pC, &ZDUMMY, &NDUMMY, &iStart, &iEnd, &iCurrent); while( rc==SQLITE_OK ){ int i; /* Used to loop through terms */ int iMinPos = 0x7FFFFFFF; /* Position of next token */ TermOffset *pTerm = 0; /* TermOffset associated with next token */ |
︙ | ︙ |
Changes to ext/fts3/fts3_test.c.
︙ | ︙ | |||
9 10 11 12 13 14 15 16 17 18 19 20 21 22 | ** May you share freely, never taking more than you give. ** ****************************************************************************** ** ** This file is not part of the production FTS code. It is only used for ** testing. It contains a Tcl command that can be used to test if a document ** matches an FTS NEAR expression. */ #include <tcl.h> #include <string.h> #include <assert.h> #ifdef SQLITE_TEST | > > > | 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | ** May you share freely, never taking more than you give. ** ****************************************************************************** ** ** This file is not part of the production FTS code. It is only used for ** testing. It contains a Tcl command that can be used to test if a document ** matches an FTS NEAR expression. ** ** As of March 2012, it also contains a version 1 tokenizer used for testing ** that the sqlite3_tokenizer_module.xLanguage() method is invoked correctly. */ #include <tcl.h> #include <string.h> #include <assert.h> #ifdef SQLITE_TEST |
︙ | ︙ | |||
310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 | Tcl_SetObjResult(interp, pRet); Tcl_DecrRefCount(pRet); #endif return TCL_OK; } int Sqlitetestfts3_Init(Tcl_Interp *interp){ Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0); Tcl_CreateObjCommand(interp, "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0 ); return TCL_OK; } #endif /* ifdef SQLITE_TEST */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 | Tcl_SetObjResult(interp, pRet); Tcl_DecrRefCount(pRet); #endif return TCL_OK; } /************************************************************************** ** Beginning of test tokenizer code. ** ** For language 0, this tokenizer is similar to the default 'simple' ** tokenizer. For other languages L, the following: ** ** * Odd numbered languages are case-sensitive. Even numbered ** languages are not. ** ** * Language ids 100 or greater are considered an error. ** ** The implementation assumes that the input contains only ASCII characters ** (i.e. those that may be encoded in UTF-8 using a single byte). */ typedef struct test_tokenizer { sqlite3_tokenizer base; } test_tokenizer; typedef struct test_tokenizer_cursor { sqlite3_tokenizer_cursor base; const char *aInput; /* Input being tokenized */ int nInput; /* Size of the input in bytes */ int iInput; /* Current offset in aInput */ int iToken; /* Index of next token to be returned */ char *aBuffer; /* Buffer containing current token */ int nBuffer; /* Number of bytes allocated at pToken */ int iLangid; /* Configured language id */ } test_tokenizer_cursor; static int testTokenizerCreate( int argc, const char * const *argv, sqlite3_tokenizer **ppTokenizer ){ test_tokenizer *pNew; pNew = sqlite3_malloc(sizeof(test_tokenizer)); if( !pNew ) return SQLITE_NOMEM; memset(pNew, 0, sizeof(test_tokenizer)); *ppTokenizer = (sqlite3_tokenizer *)pNew; return SQLITE_OK; } static int testTokenizerDestroy(sqlite3_tokenizer *pTokenizer){ test_tokenizer *p = (test_tokenizer *)pTokenizer; sqlite3_free(p); return SQLITE_OK; } static int testTokenizerOpen( sqlite3_tokenizer *pTokenizer, /* The tokenizer */ const char *pInput, int nBytes, /* String to be tokenized */ sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ ){ int rc = SQLITE_OK; /* Return code */ test_tokenizer_cursor *pCsr; /* New cursor object */ UNUSED_PARAMETER(pTokenizer); pCsr = (test_tokenizer_cursor *)sqlite3_malloc(sizeof(test_tokenizer_cursor)); if( pCsr==0 ){ rc = SQLITE_NOMEM; }else{ memset(pCsr, 0, sizeof(test_tokenizer_cursor)); pCsr->aInput = pInput; if( nBytes<0 ){ pCsr->nInput = strlen(pInput); }else{ pCsr->nInput = nBytes; } } *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; return rc; } static int testTokenizerClose(sqlite3_tokenizer_cursor *pCursor){ test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; sqlite3_free(pCsr->aBuffer); sqlite3_free(pCsr); return SQLITE_OK; } static int testIsTokenChar(char c){ return (c>='a' && c<='z') || (c>='A' && c<='Z'); } static int testTolower(char c){ char ret = c; if( ret>='A' && ret<='Z') ret = ret - ('A'-'a'); return ret; } static int testTokenizerNext( sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by testTokenizerOpen */ const char **ppToken, /* OUT: *ppToken is the token text */ int *pnBytes, /* OUT: Number of bytes in token */ int *piStartOffset, /* OUT: Starting offset of token */ int *piEndOffset, /* OUT: Ending offset of token */ int *piPosition /* OUT: Position integer of token */ ){ test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; int rc = SQLITE_OK; const char *p; const char *pEnd; p = &pCsr->aInput[pCsr->iInput]; pEnd = &pCsr->aInput[pCsr->nInput]; /* Skip past any white-space */ assert( p<=pEnd ); while( p<pEnd && testIsTokenChar(*p)==0 ) p++; if( p==pEnd ){ rc = SQLITE_DONE; }else{ /* Advance to the end of the token */ const char *pToken = p; int nToken; while( p<pEnd && testIsTokenChar(*p) ) p++; nToken = p-pToken; /* Copy the token into the buffer */ if( nToken>pCsr->nBuffer ){ sqlite3_free(pCsr->aBuffer); pCsr->aBuffer = sqlite3_malloc(nToken); } if( pCsr->aBuffer==0 ){ rc = SQLITE_NOMEM; }else{ int i; if( pCsr->iLangid & 0x00000001 ){ for(i=0; i<nToken; i++) pCsr->aBuffer[i] = pToken[i]; }else{ for(i=0; i<nToken; i++) pCsr->aBuffer[i] = testTolower(pToken[i]); } pCsr->iToken++; pCsr->iInput = p - pCsr->aInput; *ppToken = pCsr->aBuffer; *pnBytes = nToken; *piStartOffset = pToken - pCsr->aInput; *piEndOffset = p - pCsr->aInput; *piPosition = pCsr->iToken; } } return rc; } static int testTokenizerLanguage( sqlite3_tokenizer_cursor *pCursor, int iLangid ){ int rc = SQLITE_OK; test_tokenizer_cursor *pCsr = (test_tokenizer_cursor *)pCursor; pCsr->iLangid = iLangid; if( pCsr->iLangid>=100 ){ rc = SQLITE_ERROR; } return rc; } static int fts3_test_tokenizer_cmd( ClientData clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ static const sqlite3_tokenizer_module testTokenizerModule = { 1, testTokenizerCreate, testTokenizerDestroy, testTokenizerOpen, testTokenizerClose, testTokenizerNext, testTokenizerLanguage }; const sqlite3_tokenizer_module *pPtr = &testTokenizerModule; if( objc!=1 ){ Tcl_WrongNumArgs(interp, 1, objv, ""); return TCL_ERROR; } Tcl_SetObjResult(interp, Tcl_NewByteArrayObj( (const unsigned char *)&pPtr, sizeof(sqlite3_tokenizer_module *) )); return TCL_OK; } /* ** End of tokenizer code. **************************************************************************/ int Sqlitetestfts3_Init(Tcl_Interp *interp){ Tcl_CreateObjCommand(interp, "fts3_near_match", fts3_near_match_cmd, 0, 0); Tcl_CreateObjCommand(interp, "fts3_configure_incr_load", fts3_configure_incr_load_cmd, 0, 0 ); Tcl_CreateObjCommand( interp, "fts3_test_tokenizer", fts3_test_tokenizer_cmd, 0, 0 ); return TCL_OK; } #endif /* ifdef SQLITE_TEST */ |
Changes to ext/fts3/fts3_tokenizer.c.
︙ | ︙ | |||
284 285 286 287 288 289 290 | Tcl_IncrRefCount(pRet); if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ zErr = "error in xCreate()"; goto finish; } pTokenizer->pModule = p; | | < | 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 | Tcl_IncrRefCount(pRet); if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){ zErr = "error in xCreate()"; goto finish; } pTokenizer->pModule = p; if( sqlite3Fts3OpenTokenizer(pTokenizer, 0, zInput, nInput, &pCsr) ){ zErr = "error in xOpen()"; goto finish; } while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){ Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos)); Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); zToken = &zInput[iStart]; nToken = iEnd-iStart; Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken)); |
︙ | ︙ |
Changes to ext/fts3/fts3_tokenizer.h.
︙ | ︙ | |||
48 49 50 51 52 53 54 | typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; typedef struct sqlite3_tokenizer sqlite3_tokenizer; typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; struct sqlite3_tokenizer_module { /* | | | 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 | typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module; typedef struct sqlite3_tokenizer sqlite3_tokenizer; typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor; struct sqlite3_tokenizer_module { /* ** Structure version. Should always be set to 0 or 1. */ int iVersion; /* ** Create a new tokenizer. The values in the argv[] array are the ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL ** TABLE statement that created the fts3 table. For example, if |
︙ | ︙ | |||
129 130 131 132 133 134 135 136 137 138 139 140 141 142 | int (*xNext)( sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */ const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */ int *piStartOffset, /* OUT: Byte offset of token in input buffer */ int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ int *piPosition /* OUT: Number of tokens returned before this one */ ); }; struct sqlite3_tokenizer { const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ /* Tokenizer implementations will typically add additional fields */ }; | > > > > > > > > > | 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 | int (*xNext)( sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */ const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */ int *piStartOffset, /* OUT: Byte offset of token in input buffer */ int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */ int *piPosition /* OUT: Number of tokens returned before this one */ ); /*********************************************************************** ** Methods below this point are only available if iVersion>=1. */ /* ** Configure the language id of a tokenizer cursor. */ int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid); }; struct sqlite3_tokenizer { const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */ /* Tokenizer implementations will typically add additional fields */ }; |
︙ | ︙ |
Changes to ext/fts3/fts3_write.c.
︙ | ︙ | |||
653 654 655 656 657 658 659 660 661 662 663 664 665 666 | ** pending-terms hash-table. The docid used is that currently stored in ** p->iPrevDocid, and the column is specified by argument iCol. ** ** If successful, SQLITE_OK is returned. Otherwise, an SQLite error code. */ static int fts3PendingTermsAdd( Fts3Table *p, /* Table into which text will be inserted */ const char *zText, /* Text of document to be inserted */ int iCol, /* Column into which text is being inserted */ u32 *pnWord /* OUT: Number of tokens inserted */ ){ int rc; int iStart; int iEnd; | > | 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 | ** pending-terms hash-table. The docid used is that currently stored in ** p->iPrevDocid, and the column is specified by argument iCol. ** ** If successful, SQLITE_OK is returned. Otherwise, an SQLite error code. */ static int fts3PendingTermsAdd( Fts3Table *p, /* Table into which text will be inserted */ int iLangid, /* Language id to use */ const char *zText, /* Text of document to be inserted */ int iCol, /* Column into which text is being inserted */ u32 *pnWord /* OUT: Number of tokens inserted */ ){ int rc; int iStart; int iEnd; |
︙ | ︙ | |||
682 683 684 685 686 687 688 | ** zText==0. In this case, add zero token entries to the hash table and ** return early. */ if( zText==0 ){ *pnWord = 0; return SQLITE_OK; } | | < | 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | ** zText==0. In this case, add zero token entries to the hash table and ** return early. */ if( zText==0 ){ *pnWord = 0; return SQLITE_OK; } rc = sqlite3Fts3OpenTokenizer(pTokenizer, iLangid, zText, -1, &pCsr); if( rc!=SQLITE_OK ){ return rc; } xNext = pModule->xNext; while( SQLITE_OK==rc && SQLITE_OK==(rc = xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos)) ){ int i; if( iPos>=nWord ) nWord = iPos+1; |
︙ | ︙ | |||
779 780 781 782 783 784 785 | ** This function is called by the xUpdate() method as part of an INSERT ** operation. It adds entries for each term in the new record to the ** pendingTerms hash table. ** ** Argument apVal is the same as the similarly named argument passed to ** fts3InsertData(). Parameter iDocid is the docid of the new row. */ | | > > > > > | | 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 | ** This function is called by the xUpdate() method as part of an INSERT ** operation. It adds entries for each term in the new record to the ** pendingTerms hash table. ** ** Argument apVal is the same as the similarly named argument passed to ** fts3InsertData(). Parameter iDocid is the docid of the new row. */ static int fts3InsertTerms( Fts3Table *p, int iLangid, sqlite3_value **apVal, u32 *aSz ){ int i; /* Iterator variable */ for(i=2; i<p->nColumn+2; i++){ const char *zText = (const char *)sqlite3_value_text(apVal[i]); int rc = fts3PendingTermsAdd(p, iLangid, zText, i-2, &aSz[i-2]); if( rc!=SQLITE_OK ){ return rc; } aSz[p->nColumn] += sqlite3_value_bytes(apVal[i]); } return SQLITE_OK; } |
︙ | ︙ | |||
929 930 931 932 933 934 935 | sqlite3_stmt *pSelect; if( *pRC ) return; rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pSelect, &pRowid); if( rc==SQLITE_OK ){ if( SQLITE_ROW==sqlite3_step(pSelect) ){ int i; | < | | < | | 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 | sqlite3_stmt *pSelect; if( *pRC ) return; rc = fts3SqlStmt(p, SQL_SELECT_CONTENT_BY_ROWID, &pSelect, &pRowid); if( rc==SQLITE_OK ){ if( SQLITE_ROW==sqlite3_step(pSelect) ){ int i; int iLangid = langidFromSelect(p, pSelect); rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pSelect, 0)); for(i=1; rc==SQLITE_OK && i<=p->nColumn; i++){ const char *zText = (const char *)sqlite3_column_text(pSelect, i); rc = fts3PendingTermsAdd(p, iLangid, zText, -1, &aSz[i-1]); aSz[p->nColumn] += sqlite3_column_bytes(pSelect, i); } if( rc!=SQLITE_OK ){ sqlite3_reset(pSelect); *pRC = rc; return; } |
︙ | ︙ | |||
3098 3099 3100 3101 3102 3103 3104 | aSzIns = &aSz[p->nColumn+1]; aSzDel = &aSzIns[p->nColumn+1]; } } while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){ int iCol; | > | < < | | 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 | aSzIns = &aSz[p->nColumn+1]; aSzDel = &aSzIns[p->nColumn+1]; } } while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pStmt) ){ int iCol; int iLangid = langidFromSelect(p, pStmt); rc = fts3PendingTermsDocid(p, iLangid, sqlite3_column_int64(pStmt, 0)); aSz[p->nColumn] = 0; for(iCol=0; rc==SQLITE_OK && iCol<p->nColumn; iCol++){ const char *z = (const char *) sqlite3_column_text(pStmt, iCol+1); rc = fts3PendingTermsAdd(p, iLangid, z, iCol, &aSz[iCol]); aSz[p->nColumn] += sqlite3_column_bytes(pStmt, iCol+1); } if( p->bHasDocsize ){ fts3InsertDocsize(&rc, p, aSz); } if( rc!=SQLITE_OK ){ sqlite3_finalize(pStmt); |
︙ | ︙ | |||
3223 3224 3225 3226 3227 3228 3229 | assert( pCsr->isRequireSeek==0 ); iDocid = sqlite3_column_int64(pCsr->pStmt, 0); for(i=0; i<p->nColumn && rc==SQLITE_OK; i++){ const char *zText = (const char *)sqlite3_column_text(pCsr->pStmt, i+1); sqlite3_tokenizer_cursor *pTC = 0; | | < | 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 | assert( pCsr->isRequireSeek==0 ); iDocid = sqlite3_column_int64(pCsr->pStmt, 0); for(i=0; i<p->nColumn && rc==SQLITE_OK; i++){ const char *zText = (const char *)sqlite3_column_text(pCsr->pStmt, i+1); sqlite3_tokenizer_cursor *pTC = 0; rc = sqlite3Fts3OpenTokenizer(pT, pCsr->iLangid, zText, -1, &pTC); while( rc==SQLITE_OK ){ char const *zToken; /* Buffer containing token */ int nToken; /* Number of bytes in token */ int iDum1, iDum2; /* Dummy variables */ int iPos; /* Position of token in zText */ rc = pModule->xNext(pTC, &zToken, &nToken, &iDum1, &iDum2, &iPos); for(pDef=pCsr->pDeferred; pDef && rc==SQLITE_OK; pDef=pDef->pNext){ Fts3PhraseToken *pPT = pDef->pToken; if( (pDef->iCol>=p->nColumn || pDef->iCol==i) && (pPT->bFirst==0 || iPos==0) && (pPT->n==nToken || (pPT->isPrefix && pPT->n<nToken)) && (0==memcmp(zToken, pPT->z, pPT->n)) |
︙ | ︙ | |||
3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 | assert( sqlite3_value_type(apVal[0])==SQLITE_INTEGER ); rc = fts3DeleteByRowid(p, apVal[0], &nChng, aSzDel); isRemove = 1; } /* If this is an INSERT or UPDATE operation, insert the new record. */ if( nArg>1 && rc==SQLITE_OK ){ if( bInsertDone==0 ){ rc = fts3InsertData(p, apVal, pRowid); if( rc==SQLITE_CONSTRAINT && p->zContentTbl==0 ){ rc = FTS_CORRUPT_VTAB; } } if( rc==SQLITE_OK && (!isRemove || *pRowid!=p->iPrevDocid ) ){ | > | < < < | | 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 | assert( sqlite3_value_type(apVal[0])==SQLITE_INTEGER ); rc = fts3DeleteByRowid(p, apVal[0], &nChng, aSzDel); isRemove = 1; } /* If this is an INSERT or UPDATE operation, insert the new record. */ if( nArg>1 && rc==SQLITE_OK ){ int iLangid = sqlite3_value_int(apVal[2 + p->nColumn + 2]); if( bInsertDone==0 ){ rc = fts3InsertData(p, apVal, pRowid); if( rc==SQLITE_CONSTRAINT && p->zContentTbl==0 ){ rc = FTS_CORRUPT_VTAB; } } if( rc==SQLITE_OK && (!isRemove || *pRowid!=p->iPrevDocid ) ){ rc = fts3PendingTermsDocid(p, iLangid, *pRowid); } if( rc==SQLITE_OK ){ assert( p->iPrevDocid==*pRowid ); rc = fts3InsertTerms(p, iLangid, apVal, aSzIns); } if( p->bHasDocsize ){ fts3InsertDocsize(&rc, p, aSzIns); } nChng++; } |
︙ | ︙ |
Changes to test/fts4langid.test.
︙ | ︙ | |||
36 37 38 39 40 41 42 | # 2.2.* - Same as 2.1.*, after an 'optimize' command. # # 2.3.* - Same as 2.1.*, after a 'rebuild' command. # # 3.* - Tests with content= tables. Both where there is a real # underlying content table and where there is not. # | < | 36 37 38 39 40 41 42 43 44 45 46 47 48 49 | # 2.2.* - Same as 2.1.*, after an 'optimize' command. # # 2.3.* - Same as 2.1.*, after a 'rebuild' command. # # 3.* - Tests with content= tables. Both where there is a real # underlying content table and where there is not. # # 4.* - Test that if one is provided, the tokenizer xLanguage method # is called to configure the tokenizer before tokenizing query # or document text. # # 5.* - Test the fts4aux table when the associated FTS4 table contains # multiple languages. # |
︙ | ︙ | |||
338 339 340 341 342 343 344 | do_test_query1 3.3.3 {zero one two} { and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two] } do_test_query1 3.3.4 {"zero one" OR "one two"} { or_merge_lists [rowid_list "zero one"] [rowid_list "one two"] } | > > | > > > > > > | > > > > > > > | > > > > > > > > > > > > > > > > > > | > > > > > | 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 | do_test_query1 3.3.3 {zero one two} { and_merge_lists [rowid_list zero] [rowid_list one] [rowid_list two] } do_test_query1 3.3.4 {"zero one" OR "one two"} { or_merge_lists [rowid_list "zero one"] [rowid_list "one two"] } #------------------------------------------------------------------------- # Test cases 4.* # proc build_multilingual_db_2 {db} { $db eval { CREATE VIRTUAL TABLE t4 USING fts4( tokenize=testtokenizer, languageid=lid ); } for {set i 0} {$i < 50} {incr i} { execsql { INSERT INTO t4(docid, content, lid) VALUES($i, 'The Quick Brown Fox', $i) } } } do_test 4.1.0 { reset_db set ptr [fts3_test_tokenizer] execsql { SELECT fts3_tokenizer('testtokenizer', $ptr) } build_multilingual_db_2 db } {} do_execsql_test 4.1.1 { SELECT docid FROM t4 WHERE t4 MATCH 'quick'; } {0} do_execsql_test 4.1.2 { SELECT docid FROM t4 WHERE t4 MATCH 'quick' AND lid=1; } {} do_execsql_test 4.1.3 { SELECT docid FROM t4 WHERE t4 MATCH 'Quick' AND lid=1; } {1} for {set i 0} {$i < 50} {incr i} { do_execsql_test 4.1.4.$i { SELECT count(*) FROM t4 WHERE t4 MATCH 'fox' AND lid=$i; } [expr 0==($i%2)] } do_catchsql_test 4.1.5 { INSERT INTO t4(content, lid) VALUES('hello world', 101) } {1 {SQL logic error or missing database}} finish_test |
Changes to test/permutations.test.
︙ | ︙ | |||
180 181 182 183 184 185 186 | fts3defer.test fts3defer2.test fts3e.test fts3expr.test fts3expr2.test fts3near.test fts3query.test fts3shared.test fts3snippet.test fts3sort.test fts3fault.test fts3malloc.test fts3matchinfo.test fts3aux1.test fts3comp1.test fts3auto.test fts4aa.test fts4content.test fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test | | < | 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | fts3defer.test fts3defer2.test fts3e.test fts3expr.test fts3expr2.test fts3near.test fts3query.test fts3shared.test fts3snippet.test fts3sort.test fts3fault.test fts3malloc.test fts3matchinfo.test fts3aux1.test fts3comp1.test fts3auto.test fts4aa.test fts4content.test fts3conf.test fts3prefix.test fts3fault2.test fts3corrupt.test fts3corrupt2.test fts3first.test fts4langid.test } lappend ::testsuitelist xxx #------------------------------------------------------------------------- # Define the coverage related test suites: # |
︙ | ︙ |