/* Check that off_t can represent 2**63 - 1 correctly.
We can't simply define LARGE_OFF_T to be 9223372036854775807,
since some C++ compilers masquerading as C compilers
incorrectly reject 9223372036854775807. */
-#define LARGE_OFF_T (((off_t) 1 << 62) - 1 + ((off_t) 1 << 62))
+#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31))
int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721
&& LARGE_OFF_T % 2147483647 == 1)
? 1 : -1];
int
main ()
@@ -10369,14 +10373,12 @@
if test "x${TCLLIBDIR+set}" != "xset" ; then
TCLLIBDIR='$(libdir)'
for i in `echo 'puts stdout $auto_path' | ${TCLSH_CMD}` ; do
- if test -d $i ; then
- TCLLIBDIR=$i
- break
- fi
+ TCLLIBDIR=$i
+ break
done
TCLLIBDIR="${TCLLIBDIR}/sqlite3"
fi
@@ -10394,71 +10396,15 @@
RELEASE=`cat $srcdir/VERSION`
{ $as_echo "$as_me:${as_lineno-$LINENO}: Release set to $RELEASE" >&5
$as_echo "$as_me: Release set to $RELEASE" >&6;}
-
-##########
-# Handle --with-wasi-sdk=DIR
-#
-# This must be early because it changes the toolchain.
-#
-
-# Check whether --with-wasi-sdk was given.
-if test "${with_wasi_sdk+set}" = set; then :
- withval=$with_wasi_sdk; with_wasisdk=${withval}
-fi
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for WASI SDK directory" >&5
-$as_echo_n "checking for WASI SDK directory... " >&6; }
-if ${ac_cv_c_wasi_sdk+:} false; then :
- $as_echo_n "(cached) " >&6
-else
-
- # First check to see if --with-tcl was specified.
- if test x"${with_wasi_sdk}" != x ; then
- if ! test -d "${with_wasi_sdk}" ; then
- as_fn_error $? "${with_wasi_sdk} directory doesn't exist" "$LINENO" 5
- fi
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: ${with_wasi_sdk}: using wasi-sdk clang, disabling: tcl, CLI shell, DLL" >&5
-$as_echo "${with_wasi_sdk}: using wasi-sdk clang, disabling: tcl, CLI shell, DLL" >&6; }
- use_wasi_sdk=yes
- else
- use_wasi_sdk=no
- fi
-
-fi
-
-if test "${use_wasi_sdk}" = "no" ; then
- HAVE_WASI_SDK=""
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-else
- HAVE_WASI_SDK=1
-# Changing --host and --target have no effect here except to possibly
-# cause confusion. autoconf has finished processing them by this
-# point.
-#
-# host_alias=wasm32-wasi
-# target=wasm32-wasi
-#
-# Merely changing CC and LD to the wasi-sdk's is enough to get
-# sqlite3.o building in WASM format.
- CC="${with_wasi_sdk}/bin/clang"
- LD="${with_wasi_sdk}/bin/wasm-ld"
- RANLIB="${with_wasi_sdk}/bin/llvm-ranlib"
- cross_compiling=yes
- enable_threadsafe=no
- use_tcl=no
- enable_tcl=no
- # libtool is apparently hard-coded to use gcc for linking DLLs, so
- # we disable the DLL build...
- enable_shared=no
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-fi
-
+VERSION_NUMBER=`cat $srcdir/VERSION \
+ | sed 's/[^0-9]/ /g' \
+ | awk '{printf "%d%03d%03d",$1,$2,$3}'`
+{ $as_echo "$as_me:${as_lineno-$LINENO}: Version number set to $VERSION_NUMBER" >&5
+$as_echo "$as_me: Version number set to $VERSION_NUMBER" >&6;}
#########
# Locate a compiler for the build machine. This compiler should
# generate command-line programs that run on the build machine.
@@ -11316,32 +11262,25 @@
# Check whether --enable-debug was given.
if test "${enable_debug+set}" = set; then :
enableval=$enable_debug;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build type" >&5
-$as_echo_n "checking build type... " >&6; }
if test "${enable_debug}" = "yes" ; then
TARGET_DEBUG="-DSQLITE_DEBUG=1 -DSQLITE_ENABLE_SELECTTRACE -DSQLITE_ENABLE_WHERETRACE -O0"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: debug" >&5
-$as_echo "debug" >&6; }
else
TARGET_DEBUG="-DNDEBUG"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: release" >&5
-$as_echo "release" >&6; }
fi
#########
# See whether we should use the amalgamation to build
-
# Check whether --enable-amalgamation was given.
if test "${enable_amalgamation+set}" = set; then :
enableval=$enable_amalgamation;
fi
if test "${enable_amalgamation}" = "no" ; then
USE_AMALGAMATION=0
fi
#########
@@ -11486,113 +11425,10 @@
fi
else
OPT_FEATURE_FLAGS="-DSQLITE_OMIT_LOAD_EXTENSION=1"
fi
-
-##########
-# Do we want to support math functions
-#
-# Check whether --enable-math was given.
-if test "${enable_math+set}" = set; then :
- enableval=$enable_math;
-fi
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support math functions" >&5
-$as_echo_n "checking whether to support math functions... " >&6; }
-if test "$enable_math" = "no"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
- OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MATH_FUNCTIONS"
- { $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing ceil" >&5
-$as_echo_n "checking for library containing ceil... " >&6; }
-if ${ac_cv_search_ceil+:} false; then :
- $as_echo_n "(cached) " >&6
-else
- ac_func_search_save_LIBS=$LIBS
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h. */
-
-/* Override any GCC internal prototype to avoid an error.
- Use char because int might match the return type of a GCC
- builtin and then its argument prototype would still apply. */
-#ifdef __cplusplus
-extern "C"
-#endif
-char ceil ();
-int
-main ()
-{
-return ceil ();
- ;
- return 0;
-}
-_ACEOF
-for ac_lib in '' m; do
- if test -z "$ac_lib"; then
- ac_res="none required"
- else
- ac_res=-l$ac_lib
- LIBS="-l$ac_lib $ac_func_search_save_LIBS"
- fi
- if ac_fn_c_try_link "$LINENO"; then :
- ac_cv_search_ceil=$ac_res
-fi
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext
- if ${ac_cv_search_ceil+:} false; then :
- break
-fi
-done
-if ${ac_cv_search_ceil+:} false; then :
-
-else
- ac_cv_search_ceil=no
-fi
-rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_search_ceil" >&5
-$as_echo "$ac_cv_search_ceil" >&6; }
-ac_res=$ac_cv_search_ceil
-if test "$ac_res" != no; then :
- test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
-
-fi
-
-fi
-
-##########
-# Do we want to support JSON functions
-#
-# Check whether --enable-json was given.
-if test "${enable_json+set}" = set; then :
- enableval=$enable_json;
-fi
-
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support JSON functions" >&5
-$as_echo_n "checking whether to support JSON functions... " >&6; }
-if test "$enable_json" = "no"; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
- OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_OMIT_JSON"
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-fi
-
-########
-# The --enable-all argument is short-hand to enable
-# multiple extensions.
-# Check whether --enable-all was given.
-if test "${enable_all+set}" = set; then :
- enableval=$enable_all;
-fi
-
##########
# Do we want to support memsys3 and/or memsys5
#
# Check whether --enable-memsys5 was given.
@@ -11631,30 +11467,19 @@
# Check whether --enable-fts3 was given.
if test "${enable_fts3+set}" = set; then :
enableval=$enable_fts3;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support FTS3" >&5
-$as_echo_n "checking whether to support FTS3... " >&6; }
if test "${enable_fts3}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS3"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
fi
# Check whether --enable-fts4 was given.
if test "${enable_fts4+set}" = set; then :
enableval=$enable_fts4;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support FTS4" >&5
-$as_echo_n "checking whether to support FTS4... " >&6; }
-if test "${enable_fts4}" = "yes" -o "${enable_all}" = "yes" ; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+if test "${enable_fts4}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS4"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing log" >&5
$as_echo_n "checking for library containing log... " >&6; }
if ${ac_cv_search_log+:} false; then :
$as_echo_n "(cached) " >&6
@@ -11708,24 +11533,17 @@
if test "$ac_res" != no; then :
test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
fi
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
fi
# Check whether --enable-fts5 was given.
if test "${enable_fts5+set}" = set; then :
enableval=$enable_fts5;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support FTS5" >&5
-$as_echo_n "checking whether to support FTS5... " >&6; }
-if test "${enable_fts5}" = "yes" -o "${enable_all}" = "yes" ; then
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+if test "${enable_fts5}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS5"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing log" >&5
$as_echo_n "checking for library containing log... " >&6; }
if ${ac_cv_search_log+:} false; then :
$as_echo_n "(cached) " >&6
@@ -11779,13 +11597,21 @@
if test "$ac_res" != no; then :
test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
fi
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
+fi
+
+#########
+# See whether we should enable JSON1
+# Check whether --enable-json1 was given.
+if test "${enable_json1+set}" = set; then :
+ enableval=$enable_json1;
+fi
+
+if test "${enable_json1}" = "yes" ; then
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_JSON1"
fi
#########
# See whether we should enable the LIMIT clause on UPDATE and DELETE
# statements.
@@ -11792,19 +11618,12 @@
# Check whether --enable-update-limit was given.
if test "${enable_update_limit+set}" = set; then :
enableval=$enable_update_limit;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support LIMIT on UPDATE and DELETE statements" >&5
-$as_echo_n "checking whether to support LIMIT on UPDATE and DELETE statements... " >&6; }
if test "${enable_update_limit}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_UPDATE_DELETE_LIMIT"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
fi
#########
# See whether we should enable GEOPOLY
# Check whether --enable-geopoly was given.
@@ -11812,57 +11631,36 @@
enableval=$enable_geopoly; enable_geopoly=yes
else
enable_geopoly=no
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support GEOPOLY" >&5
-$as_echo_n "checking whether to support GEOPOLY... " >&6; }
-if test "${enable_geopoly}" = "yes" -o "${enable_all}" = "yes" ; then
+if test "${enable_geopoly}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_GEOPOLY"
enable_rtree=yes
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
fi
#########
# See whether we should enable RTREE
# Check whether --enable-rtree was given.
if test "${enable_rtree+set}" = set; then :
enableval=$enable_rtree;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support RTREE" >&5
-$as_echo_n "checking whether to support RTREE... " >&6; }
if test "${enable_rtree}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_RTREE"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
fi
#########
# See whether we should enable the SESSION extension
# Check whether --enable-session was given.
if test "${enable_session+set}" = set; then :
enableval=$enable_session;
fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support SESSION" >&5
-$as_echo_n "checking whether to support SESSION... " >&6; }
-if test "${enable_session}" = "yes" -o "${enable_all}" = "yes" ; then
+if test "${enable_session}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_SESSION"
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_PREUPDATE_HOOK"
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
-else
- { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
fi
#########
# attempt to duplicate any OMITS and ENABLES into the ${OPT_FEATURE_FLAGS} parameter
for option in $CFLAGS $CPPFLAGS
@@ -11926,25 +11724,14 @@
else
USE_GCOV=0
fi
-#########
-# Enable/disabled amalagamation line macros
-########
-AMALGAMATION_LINE_MACROS=--linemacros=0
-if test "${amalgamation_line_macros}" = "yes" ; then
- AMALGAMATION_LINE_MACROS=--linemacros=1
-fi
-if test "${amalgamation_line_macros}" = "no" ; then
- AMALGAMATION_LINE_MACROS=--linemacros=0
-fi
-
#########
# Output the config header
-ac_config_headers="$ac_config_headers sqlite_cfg.h"
+ac_config_headers="$ac_config_headers config.h"
#########
# Generate the output files.
#
@@ -12455,11 +12242,11 @@
cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# Save the log message, to keep $0 and so on meaningful, and to
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by sqlite $as_me 3.41.0, which was
+This file was extended by sqlite $as_me 3.28.0, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
CONFIG_HEADERS = $CONFIG_HEADERS
CONFIG_LINKS = $CONFIG_LINKS
@@ -12521,11 +12308,11 @@
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-sqlite config.status 3.41.0
+sqlite config.status 3.28.0
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
Copyright (C) 2012 Free Software Foundation, Inc.
This config.status script is free software; the Free Software Foundation
@@ -12903,11 +12690,11 @@
# Handling of arguments.
for ac_config_target in $ac_config_targets
do
case $ac_config_target in
"libtool") CONFIG_COMMANDS="$CONFIG_COMMANDS libtool" ;;
- "sqlite_cfg.h") CONFIG_HEADERS="$CONFIG_HEADERS sqlite_cfg.h" ;;
+ "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;;
"Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;;
"sqlite3.pc") CONFIG_FILES="$CONFIG_FILES sqlite3.pc" ;;
*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;
esac
Index: configure.ac
==================================================================
--- configure.ac
+++ configure.ac
@@ -68,15 +68,15 @@
#
# The filename extension for executables on the
# target platform. "" for Unix and ".exe" for windows.
#
# This configure.in file is easy to reuse on other projects. Just
-# change the argument to AC_INIT. And disable any features that
+# change the argument to AC_INIT(). And disable any features that
# you don't need (for example BLT) by erasing or commenting out
# the corresponding code.
#
-AC_INIT([sqlite],[m4_esyscmd(cat VERSION | tr -d '\n')])
+AC_INIT(sqlite, m4_esyscmd([cat VERSION | tr -d '\n']))
dnl Make sure the local VERSION file matches this configure script
sqlite_version_sanity_check=`cat $srcdir/VERSION | tr -d '\n'`
if test "$PACKAGE_VERSION" != "$sqlite_version_sanity_check" ; then
AC_MSG_ERROR([configure script is out of date:
@@ -86,11 +86,11 @@
fi
#########
# Programs needed
#
-LT_INIT
+AC_PROG_LIBTOOL
AC_PROG_INSTALL
#########
# Enable large file support (if special flags are necessary)
#
@@ -132,14 +132,12 @@
AC_ARG_VAR([TCLLIBDIR], [Where to install tcl plugin])
if test "x${TCLLIBDIR+set}" != "xset" ; then
TCLLIBDIR='$(libdir)'
for i in `echo 'puts stdout $auto_path' | ${TCLSH_CMD}` ; do
- if test -d $i ; then
- TCLLIBDIR=$i
- break
- fi
+ TCLLIBDIR=$i
+ break
done
TCLLIBDIR="${TCLLIBDIR}/sqlite3"
fi
@@ -155,60 +153,15 @@
AC_MSG_NOTICE(Version set to $VERSION)
AC_SUBST(VERSION)
RELEASE=`cat $srcdir/VERSION`
AC_MSG_NOTICE(Release set to $RELEASE)
AC_SUBST(RELEASE)
-
-##########
-# Handle --with-wasi-sdk=DIR
-#
-# This must be early because it changes the toolchain.
-#
-AC_ARG_WITH(wasi-sdk,
-AS_HELP_STRING([--with-wasi-sdk=DIR],
- [directory containing the WASI SDK. Triggers cross-compile to WASM.]), with_wasisdk=${withval})
-AC_MSG_CHECKING([for WASI SDK directory])
-AC_CACHE_VAL(ac_cv_c_wasi_sdk,[
- # First check to see if --with-tcl was specified.
- if test x"${with_wasi_sdk}" != x ; then
- if ! test -d "${with_wasi_sdk}" ; then
- AC_MSG_ERROR([${with_wasi_sdk} directory doesn't exist])
- fi
- AC_MSG_RESULT([${with_wasi_sdk}: using wasi-sdk clang, disabling: tcl, CLI shell, DLL])
- use_wasi_sdk=yes
- else
- use_wasi_sdk=no
- fi
-])
-if test "${use_wasi_sdk}" = "no" ; then
- HAVE_WASI_SDK=""
- AC_MSG_RESULT([no])
-else
- HAVE_WASI_SDK=1
-# Changing --host and --target have no effect here except to possibly
-# cause confusion. autoconf has finished processing them by this
-# point.
-#
-# host_alias=wasm32-wasi
-# target=wasm32-wasi
-#
-# Merely changing CC and LD to the wasi-sdk's is enough to get
-# sqlite3.o building in WASM format.
- CC="${with_wasi_sdk}/bin/clang"
- LD="${with_wasi_sdk}/bin/wasm-ld"
- RANLIB="${with_wasi_sdk}/bin/llvm-ranlib"
- cross_compiling=yes
- enable_threadsafe=no
- use_tcl=no
- enable_tcl=no
- # libtool is apparently hard-coded to use gcc for linking DLLs, so
- # we disable the DLL build...
- enable_shared=no
- AC_MSG_RESULT([yes])
-fi
-AC_SUBST(HAVE_WASI_SDK)
-
+VERSION_NUMBER=[`cat $srcdir/VERSION \
+ | sed 's/[^0-9]/ /g' \
+ | awk '{printf "%d%03d%03d",$1,$2,$3}'`]
+AC_MSG_NOTICE(Version number set to $VERSION_NUMBER)
+AC_SUBST(VERSION_NUMBER)
#########
# Locate a compiler for the build machine. This compiler should
# generate command-line programs that run on the build machine.
#
@@ -227,11 +180,11 @@
##########
# Do we want to support multithreaded use of sqlite
#
AC_ARG_ENABLE(threadsafe,
-AS_HELP_STRING([--disable-threadsafe],[Disable mutexing]))
+AC_HELP_STRING([--disable-threadsafe],[Disable mutexing]))
AC_MSG_CHECKING([whether to support threadsafe operation])
if test "$enable_threadsafe" = "no"; then
SQLITE_THREADSAFE=0
AC_MSG_RESULT([no])
else
@@ -247,11 +200,11 @@
##########
# Do we want to support release
#
AC_ARG_ENABLE(releasemode,
-AS_HELP_STRING([--enable-releasemode],[Support libtool link to release mode]),,enable_releasemode=no)
+AC_HELP_STRING([--enable-releasemode],[Support libtool link to release mode]),,enable_releasemode=no)
AC_MSG_CHECKING([whether to support shared library linked as release mode or not])
if test "$enable_releasemode" = "no"; then
ALLOWRELEASE=""
AC_MSG_RESULT([no])
else
@@ -262,11 +215,11 @@
##########
# Do we want temporary databases in memory
#
AC_ARG_ENABLE(tempstore,
-AS_HELP_STRING([--enable-tempstore],[Use an in-ram database for temporary tables (never,no,yes,always)]),,enable_tempstore=no)
+AC_HELP_STRING([--enable-tempstore],[Use an in-ram database for temporary tables (never,no,yes,always)]),,enable_tempstore=no)
AC_MSG_CHECKING([whether to use an in-ram database for temporary tables])
case "$enable_tempstore" in
never )
TEMP_STORE=0
AC_MSG_RESULT([never])
@@ -302,19 +255,11 @@
AC_MSG_RESULT(yes)
else
AC_MSG_RESULT(unknown)
fi
if test "$CYGWIN" != "yes"; then
- m4_warn([obsolete],
-[AC_CYGWIN is obsolete: use AC_CANONICAL_HOST and check if $host_os
-matches *cygwin*])dnl
-AC_CANONICAL_HOST
-case $host_os in
- *cygwin* ) CYGWIN=yes;;
- * ) CYGWIN=no;;
-esac
-
+ AC_CYGWIN
fi
if test "$CYGWIN" = "yes"; then
BUILD_EXEEXT=.exe
else
BUILD_EXEEXT=$EXEEXT
@@ -345,14 +290,14 @@
# This code is derived from the SC_PATH_TCLCONFIG and SC_LOAD_TCLCONFIG
# macros in the in the tcl.m4 file of the standard TCL distribution.
# Those macros could not be used directly since we have to make some
# minor changes to accomodate systems that do not have TCL installed.
#
-AC_ARG_ENABLE(tcl, AS_HELP_STRING([--disable-tcl],[do not build TCL extension]),
+AC_ARG_ENABLE(tcl, AC_HELP_STRING([--disable-tcl],[do not build TCL extension]),
[use_tcl=$enableval],[use_tcl=yes])
if test "${use_tcl}" = "yes" ; then
- AC_ARG_WITH(tcl, AS_HELP_STRING([--with-tcl=DIR],[directory containing tcl configuration (tclConfig.sh)]), with_tclconfig=${withval})
+ AC_ARG_WITH(tcl, AC_HELP_STRING([--with-tcl=DIR],[directory containing tcl configuration (tclConfig.sh)]), with_tclconfig=${withval})
AC_MSG_CHECKING([for Tcl configuration])
AC_CACHE_VAL(ac_cv_c_tclconfig,[
# First check to see if --with-tcl was specified.
if test x"${with_tclconfig}" != x ; then
if test -f "${with_tclconfig}/tclConfig.sh" ; then
@@ -530,15 +475,15 @@
TARGET_READLINE_LIBS=""
TARGET_READLINE_INC=""
TARGET_HAVE_READLINE=0
TARGET_HAVE_EDITLINE=0
AC_ARG_ENABLE([editline],
- [AS_HELP_STRING([--enable-editline],[enable BSD editline support])],
+ [AC_HELP_STRING([--enable-editline],[enable BSD editline support])],
[with_editline=$enableval],
[with_editline=auto])
AC_ARG_ENABLE([readline],
- [AS_HELP_STRING([--disable-readline],[disable readline support])],
+ [AC_HELP_STRING([--disable-readline],[disable readline support])],
[with_readline=$enableval],
[with_readline=auto])
if test x"$with_editline" != xno; then
sLIBS=$LIBS
@@ -550,11 +495,11 @@
fi
if test x"$with_readline" != xno; then
found="yes"
AC_ARG_WITH([readline-lib],
- [AS_HELP_STRING([--with-readline-lib],[specify readline library])],
+ [AC_HELP_STRING([--with-readline-lib],[specify readline library])],
[with_readline_lib=$withval],
[with_readline_lib="auto"])
if test "x$with_readline_lib" = xauto; then
save_LIBS="$LIBS"
LIBS=""
@@ -565,11 +510,11 @@
else
TARGET_READLINE_LIBS="$with_readline_lib"
fi
AC_ARG_WITH([readline-inc],
- [AS_HELP_STRING([--with-readline-inc],[specify readline include paths])],
+ [AC_HELP_STRING([--with-readline-inc],[specify readline include paths])],
[with_readline_inc=$withval],
[with_readline_inc="auto"])
if test "x$with_readline_inc" = xauto; then
AC_CHECK_HEADER(readline.h, [found="yes"], [
found="no"
@@ -610,27 +555,23 @@
#
AC_SEARCH_LIBS(fdatasync, [rt])
#########
# check for debug enabled
-AC_ARG_ENABLE(debug, AS_HELP_STRING([--enable-debug],[enable debugging & verbose explain]))
-AC_MSG_CHECKING([build type])
+AC_ARG_ENABLE(debug, AC_HELP_STRING([--enable-debug],[enable debugging & verbose explain]))
if test "${enable_debug}" = "yes" ; then
TARGET_DEBUG="-DSQLITE_DEBUG=1 -DSQLITE_ENABLE_SELECTTRACE -DSQLITE_ENABLE_WHERETRACE -O0"
- AC_MSG_RESULT([debug])
else
TARGET_DEBUG="-DNDEBUG"
- AC_MSG_RESULT([release])
fi
AC_SUBST(TARGET_DEBUG)
#########
# See whether we should use the amalgamation to build
-
-AC_ARG_ENABLE(amalgamation, AS_HELP_STRING([--disable-amalgamation],
+AC_ARG_ENABLE(amalgamation, AC_HELP_STRING([--disable-amalgamation],
[Disable the amalgamation and instead build all files separately]))
if test "${enable_amalgamation}" = "no" ; then
USE_AMALGAMATION=0
fi
AC_SUBST(USE_AMALGAMATION)
#########
@@ -639,66 +580,33 @@
AC_SEARCH_LIBS(deflate, z, [HAVE_ZLIB="-DSQLITE_HAVE_ZLIB=1"], [HAVE_ZLIB=""])
AC_SUBST(HAVE_ZLIB)
#########
# See whether we should allow loadable extensions
-AC_ARG_ENABLE(load-extension, AS_HELP_STRING([--disable-load-extension],
+AC_ARG_ENABLE(load-extension, AC_HELP_STRING([--disable-load-extension],
[Disable loading of external extensions]),,[enable_load_extension=yes])
if test "${enable_load_extension}" = "yes" ; then
OPT_FEATURE_FLAGS=""
AC_SEARCH_LIBS(dlopen, dl)
else
OPT_FEATURE_FLAGS="-DSQLITE_OMIT_LOAD_EXTENSION=1"
fi
-##########
-# Do we want to support math functions
-#
-AC_ARG_ENABLE(math,
-AS_HELP_STRING([--disable-math],[Disable math functions]))
-AC_MSG_CHECKING([whether to support math functions])
-if test "$enable_math" = "no"; then
- AC_MSG_RESULT([no])
-else
- AC_MSG_RESULT([yes])
- OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MATH_FUNCTIONS"
- AC_SEARCH_LIBS(ceil, m)
-fi
-
-##########
-# Do we want to support JSON functions
-#
-AC_ARG_ENABLE(json,
-AS_HELP_STRING([--disable-json],[Disable JSON functions]))
-AC_MSG_CHECKING([whether to support JSON functions])
-if test "$enable_json" = "no"; then
- AC_MSG_RESULT([no])
- OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_OMIT_JSON"
-else
- AC_MSG_RESULT([yes])
-fi
-
-########
-# The --enable-all argument is short-hand to enable
-# multiple extensions.
-AC_ARG_ENABLE(all, AS_HELP_STRING([--enable-all],
- [Enable FTS4, FTS5, Geopoly, RTree, Sessions]))
-
##########
# Do we want to support memsys3 and/or memsys5
#
AC_ARG_ENABLE(memsys5,
- AS_HELP_STRING([--enable-memsys5],[Enable MEMSYS5]))
+ AC_HELP_STRING([--enable-memsys5],[Enable MEMSYS5]))
AC_MSG_CHECKING([whether to support MEMSYS5])
if test "${enable_memsys5}" = "yes"; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MEMSYS5"
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
fi
AC_ARG_ENABLE(memsys3,
- AS_HELP_STRING([--enable-memsys3],[Enable MEMSYS3]))
+ AC_HELP_STRING([--enable-memsys3],[Enable MEMSYS3]))
AC_MSG_CHECKING([whether to support MEMSYS3])
if test "${enable_memsys3}" = "yes" -a "${enable_memsys5}" = "no"; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MEMSYS3"
AC_MSG_RESULT([yes])
else
@@ -705,90 +613,69 @@
AC_MSG_RESULT([no])
fi
#########
# See whether we should enable Full Text Search extensions
-AC_ARG_ENABLE(fts3, AS_HELP_STRING([--enable-fts3],
+AC_ARG_ENABLE(fts3, AC_HELP_STRING([--enable-fts3],
[Enable the FTS3 extension]))
-AC_MSG_CHECKING([whether to support FTS3])
if test "${enable_fts3}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS3"
- AC_MSG_RESULT([yes])
-else
- AC_MSG_RESULT([no])
fi
-AC_ARG_ENABLE(fts4, AS_HELP_STRING([--enable-fts4],
+AC_ARG_ENABLE(fts4, AC_HELP_STRING([--enable-fts4],
[Enable the FTS4 extension]))
-AC_MSG_CHECKING([whether to support FTS4])
-if test "${enable_fts4}" = "yes" -o "${enable_all}" = "yes" ; then
- AC_MSG_RESULT([yes])
+if test "${enable_fts4}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS4"
AC_SEARCH_LIBS([log],[m])
-else
- AC_MSG_RESULT([no])
fi
-AC_ARG_ENABLE(fts5, AS_HELP_STRING([--enable-fts5],
+AC_ARG_ENABLE(fts5, AC_HELP_STRING([--enable-fts5],
[Enable the FTS5 extension]))
-AC_MSG_CHECKING([whether to support FTS5])
-if test "${enable_fts5}" = "yes" -o "${enable_all}" = "yes" ; then
- AC_MSG_RESULT([yes])
+if test "${enable_fts5}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS5"
AC_SEARCH_LIBS([log],[m])
-else
- AC_MSG_RESULT([no])
+fi
+
+#########
+# See whether we should enable JSON1
+AC_ARG_ENABLE(json1, AC_HELP_STRING([--enable-json1],[Enable the JSON1 extension]))
+if test "${enable_json1}" = "yes" ; then
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_JSON1"
fi
#########
# See whether we should enable the LIMIT clause on UPDATE and DELETE
# statements.
-AC_ARG_ENABLE(update-limit, AS_HELP_STRING([--enable-update-limit],
+AC_ARG_ENABLE(update-limit, AC_HELP_STRING([--enable-update-limit],
[Enable the UPDATE/DELETE LIMIT clause]))
-AC_MSG_CHECKING([whether to support LIMIT on UPDATE and DELETE statements])
if test "${enable_update_limit}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_UPDATE_DELETE_LIMIT"
- AC_MSG_RESULT([yes])
-else
- AC_MSG_RESULT([no])
fi
#########
# See whether we should enable GEOPOLY
-AC_ARG_ENABLE(geopoly, AS_HELP_STRING([--enable-geopoly],
+AC_ARG_ENABLE(geopoly, AC_HELP_STRING([--enable-geopoly],
[Enable the GEOPOLY extension]),
[enable_geopoly=yes],[enable_geopoly=no])
-AC_MSG_CHECKING([whether to support GEOPOLY])
-if test "${enable_geopoly}" = "yes" -o "${enable_all}" = "yes" ; then
+if test "${enable_geopoly}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_GEOPOLY"
enable_rtree=yes
- AC_MSG_RESULT([yes])
-else
- AC_MSG_RESULT([no])
fi
#########
# See whether we should enable RTREE
-AC_ARG_ENABLE(rtree, AS_HELP_STRING([--enable-rtree],
+AC_ARG_ENABLE(rtree, AC_HELP_STRING([--enable-rtree],
[Enable the RTREE extension]))
-AC_MSG_CHECKING([whether to support RTREE])
if test "${enable_rtree}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_RTREE"
- AC_MSG_RESULT([yes])
-else
- AC_MSG_RESULT([no])
fi
#########
# See whether we should enable the SESSION extension
-AC_ARG_ENABLE(session, AS_HELP_STRING([--enable-session],
+AC_ARG_ENABLE(session, AC_HELP_STRING([--enable-session],
[Enable the SESSION extension]))
-AC_MSG_CHECKING([whether to support SESSION])
-if test "${enable_session}" = "yes" -o "${enable_all}" = "yes" ; then
+if test "${enable_session}" = "yes" ; then
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_SESSION"
OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_PREUPDATE_HOOK"
- AC_MSG_RESULT([yes])
-else
- AC_MSG_RESULT([no])
fi
#########
# attempt to duplicate any OMITS and ENABLES into the ${OPT_FEATURE_FLAGS} parameter
for option in $CFLAGS $CPPFLAGS
@@ -840,39 +727,27 @@
BUILD_CFLAGS=$ac_temp_BUILD_CFLAGS
#########
# See whether we should use GCOV
-AC_ARG_ENABLE(gcov, AS_HELP_STRING([--enable-gcov],
+AC_ARG_ENABLE(gcov, AC_HELP_STRING([--enable-gcov],
[Enable coverage testing using gcov]))
if test "${use_gcov}" = "yes" ; then
USE_GCOV=1
else
USE_GCOV=0
fi
AC_SUBST(USE_GCOV)
-#########
-# Enable/disabled amalagamation line macros
-########
-AMALGAMATION_LINE_MACROS=--linemacros=0
-if test "${amalgamation_line_macros}" = "yes" ; then
- AMALGAMATION_LINE_MACROS=--linemacros=1
-fi
-if test "${amalgamation_line_macros}" = "no" ; then
- AMALGAMATION_LINE_MACROS=--linemacros=0
-fi
-AC_SUBST(AMALGAMATION_LINE_MACROS)
#########
# Output the config header
-AC_CONFIG_HEADERS(sqlite_cfg.h)
+AC_CONFIG_HEADERS(config.h)
#########
# Generate the output files.
#
AC_SUBST(BUILD_CFLAGS)
-AC_CONFIG_FILES([
+AC_OUTPUT([
Makefile
sqlite3.pc
])
-AC_OUTPUT
DELETED doc/json-enhancements.md
Index: doc/json-enhancements.md
==================================================================
--- doc/json-enhancements.md
+++ /dev/null
@@ -1,144 +0,0 @@
-# JSON Functions Enhancements (2022)
-
-This document summaries enhancements to the SQLite JSON support added in
-early 2022.
-
-## 1.0 Change summary:
-
- 1. New **->** and **->>** operators that work like MySQL and PostgreSQL (PG).
- 2. JSON functions are built-in rather than being an extension. They
- are included by default, but can be omitted using the
- -DSQLITE_OMIT_JSON compile-time option.
-
-
-## 2.0 New operators **->** and **->>**
-
-The SQLite language adds two new binary operators **->** and **->>**.
-Both operators are similar to json_extract(). The left operand is
-JSON and the right operand is a JSON path expression (possibly abbreviated
-for compatibility with PG - see below). So they are similar to a
-two-argument call to json_extract().
-
-The difference between -> and ->> (and json_extract()) is as follows:
-
- * The -> operator always returns JSON.
-
- * The ->> operator converts the answer into a primitive SQL datatype
- such as TEXT, INTEGER, REAL, or NULL. If a JSON object or array
- is selected, that object or array is rendered as text. If a JSON
- value is selected, that value is converted into its corresponding
- SQL type
-
- * The json_extract() interface returns JSON when a JSON object or
- array is selected, or a primitive SQL datatype when a JSON value
- is selected. This is different from MySQL, in which json_extract()
- always returns JSON, but the difference is retained because it has
- worked that way for 6 years and changing it now would likely break
- a lot of legacy code.
-
-In MySQL and PG, the ->> operator always returns TEXT (or NULL) and never
-INTEGER or REAL. This is due to limitations in the type handling capabilities
-of those systems. In MySQL and PG, the result type a function or operator
-may only depend on the type of its arguments, never the value of its arguments.
-But the underlying JSON type depends on the value of the JSON path
-expression, not the type of the JSON path expression (which is always TEXT).
-Hence, the result type of ->> in MySQL and PG is unable to vary according
-to the type of the JSON value being extracted.
-
-The type system in SQLite is more general. Functions in SQLite are able
-to return different datatypes depending on the value of their arguments.
-So the ->> operator in SQLite is able to return TEXT, INTEGER, REAL, or NULL
-depending on the JSON type of the value being extracted. This means that
-the behavior of the ->> is slightly different in SQLite versus MySQL and PG
-in that it will sometimes return INTEGER and REAL values, depending on its
-inputs. It is possible to implement the ->> operator in SQLite so that it
-always operates exactly like MySQL and PG and always returns TEXT or NULL,
-but I have been unable to think of any situations where returning the
-actual JSON value this would cause problems, so I'm including the enhanced
-functionality in SQLite.
-
-The table below attempts to summarize the differences between the
--> and ->> operators and the json_extract() function, for SQLite, MySQL,
-and PG. JSON values are shown using their SQL text representation but
-in a bold font.
-
-
-
-JSON | PATH | -> operator (all) | ->> operator (MySQL/PG)
- | ->> operator (SQLite) | json_extract() (SQLite)
- |
---|
**'{"a":123}'** | '$.a' | **'123'** | '123' | 123 | 123
- |
**'{"a":4.5}'** | '$.a' | **'4.5'** | '4.5' | 4.5 | 4.5
- |
**'{"a":"xyz"}'** | '$.a' | **'"xyz"'** | 'xyz' | 'xyz' | 'xyz'
- |
**'{"a":null}'** | '$.a' | **'null'** | NULL | NULL | NULL
- |
**'{"a":[6,7,8]}'** | '$.a' | **'[6,7,8]'** | '[6,7,8]' | '[6,7,8]' | **'[6,7,8]'**
- |
**'{"a":{"x":9}}'** | '$.a' | **'{"x":9}'** | '{"x":9}' | '{"x":9}' | **'{"x":9}'**
- |
**'{"b":999}'** | '$.a' | NULL | NULL | NULL | NULL
- |
-
-Important points about the table above:
-
- * The -> operator always returns either JSON or NULL.
-
- * The ->> operator never returns JSON. It always returns TEXT or NULL, or in the
- case of SQLite, INTEGER or REAL.
-
- * The MySQL json_extract() function works exactly the same
- as the MySQL -> operator.
-
- * The SQLite json_extract() operator works like -> for JSON objects and
- arrays, and like ->> for JSON values.
-
- * The -> operator works the same for all systems.
-
- * The only difference in ->> between SQLite and other systems is that
- when the JSON value is numeric, SQLite returns a numeric SQL value,
- whereas the other systems return a text representation of the numeric
- value.
-
-### 2.1 Abbreviated JSON path expressions for PG compatibility
-
-The table above always shows the full JSON path expression: '$.a'. But
-PG does not accept this syntax. PG only allows a single JSON object label
-name or a single integer array index. In order to provide compatibility
-with PG, The -> and ->> operators in SQLite are extended to also support
-a JSON object label or an integer array index for the right-hand side
-operand, in addition to a full JSON path expression.
-
-Thus, a -> or ->> operator that works on MySQL will work in
-SQLite. And a -> or ->> operator that works in PG will work in SQLite.
-But because SQLite supports the union of the disjoint capabilities of
-MySQL and PG, there will always be -> and ->> operators that work in
-SQLite that do not work in one of MySQL and PG. This is an unavoidable
-consequence of the different syntax for -> and ->> in MySQL and PG.
-
-In the following table, assume that "value1" is a JSON object and
-"value2" is a JSON array.
-
-
-SQL expression | Works in MySQL? | Works in PG? | Works in SQLite
- |
---|
value1->'$.a' | yes | no | yes
- |
value1->'a' | no | yes | yes
- |
value2->'$[2]' | yes | no | yes
- |
value2->2 | no | yes | yes
- |
-
-The abbreviated JSON path expressions only work for the -> and ->> operators
-in SQLite. The json_extract() function, and all other built-in SQLite
-JSON functions, continue to require complete JSON path expressions for their
-PATH arguments.
-
-## 3.0 JSON moved into the core
-
-The JSON interface is now moved into the SQLite core.
-
-When originally written in 2015, the JSON functions were an extension
-that could be optionally included at compile-time, or loaded at run-time.
-The implementation was in a source file named ext/misc/json1.c in the
-source tree. JSON functions were only compiled in if the
--DSQLITE_ENABLE_JSON1 compile-time option was used.
-
-After these enhancements, the JSON functions are now built-ins.
-The source file that implements the JSON functions is moved to src/json.c.
-No special compile-time options are needed to load JSON into the build.
-Instead, there is a new -DSQLITE_OMIT_JSON compile-time option to leave
-them out.
Index: doc/lemon.html
==================================================================
--- doc/lemon.html
+++ doc/lemon.html
@@ -1,11 +1,10 @@
The Lemon Parser Generator
-
-
+
The Lemon Parser Generator
Lemon is an LALR(1) parser generator for C.
It does the same job as "bison" and "yacc".
But Lemon is not a bison or yacc clone. Lemon
@@ -22,45 +21,15 @@
or embedded controllers.
This document is an introduction to the Lemon
parser generator.
-
-1.0 Table of Contents
-
-
-
-2.0 Security Note
+Security Note
The language parser code created by Lemon is very robust and
is well-suited for use in internet-facing applications that need to
-safely process maliciously crafted inputs.
+safely process maliciously crafted inputs.
The "lemon.exe" command-line tool itself works great when given a valid
input grammar file and almost always gives helpful
error messages for malformed inputs. However, it is possible for
a malicious user to craft a grammar file that will cause
@@ -72,64 +41,60 @@
- Parser code generated by lemon → Robust and secure
- The "lemon.exe" command line tool itself → Not so much
-
-3.0 Theory of Operation
+Theory of Operation
-Lemon is computer program that translates a context free grammar (CFG)
+
The main goal of Lemon is to translate a context free grammar (CFG)
for a particular language into C code that implements a parser for
that language.
-The Lemon program has two inputs:
+The program has two inputs:
- The grammar specification.
- A parser template file.
-Typically, only the grammar specification is supplied by the programmer.
-Lemon comes with a default parser template
-("lempar.c")
-that works fine for most applications. But the user is free to substitute
-a different parser template if desired.
+Typically, only the grammar specification is supplied by the programmer.
+Lemon comes with a default parser template which works fine for most
+applications. But the user is free to substitute a different parser
+template if desired.
Depending on command-line options, Lemon will generate up to
-three output files.
+three output files.
-- C code to implement a parser for the input grammar.
-
- A header file defining an integer ID for each terminal symbol
- (or "token").
+
- C code to implement the parser.
+
- A header file defining an integer ID for each terminal symbol.
- An information file that describes the states of the generated parser
automaton.
-By default, all three of these output files are generated.
+By default, all three of these output files are generated.
The header file is suppressed if the "-m" command-line option is
used and the report file is omitted when "-q" is selected.
The grammar specification file uses a ".y" suffix, by convention.
In the examples used in this document, we'll assume the name of the
grammar file is "gram.y". A typical use of Lemon would be the
-following command:
+following command:
lemon gram.y
-This command will generate three output files named "gram.c",
+This command will generate three output files named "gram.c",
"gram.h" and "gram.out".
The first is C code to implement the parser. The second
is the header file that defines numerical values for all
terminal symbols, and the last is the report that explains
the states used by the parser automaton.
-
-3.1 Command Line Options
+Command Line Options
The behavior of Lemon can be modified using command-line options.
You can obtain a list of the available command-line options together
-with a brief explanation of what each does by typing
+with a brief explanation of what each does by typing
lemon "-?"
-As of this writing, the following command-line options are supported:
+As of this writing, the following command-line options are supported:
- -b
Show only the basis for each parser state in the report file.
- -c
Do not compress the generated action tables. The parser will be a
@@ -137,17 +102,13 @@
- -ddirectory
Write all output files into directory. Normally, output files
are written into the directory that contains the input grammar file.
- -Dname
Define C preprocessor macro name. This macro is usable by
-"%ifdef",
-"%ifndef", and
-"%if lines
+"%ifdef" and
+"%ifndef" lines
in the grammar file.
-
- -E
-Run the "%if" preprocessor step only and print the revised grammar
-file.
- -g
Do not generate a parser. Instead write the input grammar to standard
output with all comments, actions, and other extraneous text removed.
- -l
Omit "#line" directives in the generated parser C code.
@@ -160,58 +121,57 @@
- -q
Suppress generation of the report file.
- -r
Do not sort or renumber the parser states as part of optimization.
- -s
-Show parser statistics before exiting.
+Show parser statistics before exiting.
- -Tfile
Use file as the template for the generated C-code parser implementation.
- -x
Print the Lemon version number.
-
-3.2 The Parser Interface
+The Parser Interface
Lemon doesn't generate a complete, working program. It only generates
a few subroutines that implement a parser. This section describes
the interface to those subroutines. It is up to the programmer to
call these subroutines in an appropriate way in order to produce a
complete system.
Before a program begins using a Lemon-generated parser, the program
must first create the parser.
-A new parser is created as follows:
+A new parser is created as follows:
void *pParser = ParseAlloc( malloc );
-The ParseAlloc() routine allocates and initializes a new parser and
+The ParseAlloc() routine allocates and initializes a new parser and
returns a pointer to it.
The actual data structure used to represent a parser is opaque —
its internal structure is not visible or usable by the calling routine.
For this reason, the ParseAlloc() routine returns a pointer to void
rather than a pointer to some particular structure.
The sole argument to the ParseAlloc() routine is a pointer to the
subroutine used to allocate memory. Typically this means malloc().
After a program is finished using a parser, it can reclaim all
-memory allocated by that parser by calling
+memory allocated by that parser by calling
ParseFree(pParser, free);
-The first argument is the same pointer returned by ParseAlloc(). The
+The first argument is the same pointer returned by ParseAlloc(). The
second argument is a pointer to the function used to release bulk
memory back to the system.
After a parser has been allocated using ParseAlloc(), the programmer
must supply the parser with a sequence of tokens (terminal symbols) to
be parsed. This is accomplished by calling the following function
-once for each token:
+once for each token:
Parse(pParser, hTokenID, sTokenData, pArg);
-The first argument to the Parse() routine is the pointer returned by
+The first argument to the Parse() routine is the pointer returned by
ParseAlloc().
The second argument is a small positive integer that tells the parser the
type of the next token in the data stream.
There is one token type for each terminal symbol in the grammar.
The gram.h file generated by Lemon contains #define statements that
@@ -233,11 +193,11 @@
with this argument except to pass it through to action routines.
This is a convenient mechanism for passing state information down
to the action routines without having to use global variables.
A typical use of a Lemon parser might look something like the
-following:
+following:
1 ParseTree *ParseFile(const char *zFilename){
2 Tokenizer *pTokenizer;
3 void *pParser;
4 Token sToken;
@@ -254,11 +214,11 @@
15 ParseFree(pParser, free );
16 TokenizerFree(pTokenizer);
17 return sState.treeRoot;
18 }
-This example shows a user-written routine that parses a file of
+This example shows a user-written routine that parses a file of
text and returns a pointer to the parse tree.
(All error-handling code is omitted from this example to keep it
simple.)
We assume the existence of some kind of tokenizer which is created
using TokenizerCreate() on line 8 and deleted by TokenizerFree()
@@ -266,11 +226,11 @@
next token from the input file and puts its type in the
integer variable hTokenId. The sToken variable is assumed to be
some kind of structure that contains details about each token,
such as its complete text, what line it occurs on, etc.
-This example also assumes the existence of a structure of type
+
This example also assumes the existence of a structure of type
ParserState that holds state information about a particular parse.
An instance of such a structure is created on line 6 and initialized
on line 10. A pointer to this structure is passed into the Parse()
routine as the optional 4th argument.
The action routine specified by the grammar for the parser can use
@@ -277,11 +237,11 @@
the ParserState structure to hold whatever information is useful and
appropriate. In the example, we note that the treeRoot field of
the ParserState structure is left pointing to the root of the parse
tree.
-The core of this example as it relates to Lemon is as follows:
+The core of this example as it relates to Lemon is as follows:
ParseFile(){
pParser = ParseAlloc( malloc );
while( GetNextToken(pTokenizer,&hTokenId, &sToken) ){
Parse(pParser, hTokenId, sToken);
@@ -288,11 +248,11 @@
}
Parse(pParser, 0, sToken);
ParseFree(pParser, free );
}
-Basically, what a program has to do to use a Lemon-generated parser
+Basically, what a program has to do to use a Lemon-generated parser
is first create the parser, then send it lots of tokens obtained by
tokenizing an input source. When the end of input is reached, the
Parse() routine should be called one last time with a token type
of 0. This step is necessary to inform the parser that the end of
input has been reached. Finally, we reclaim memory used by the
@@ -299,143 +259,59 @@
parser by calling ParseFree().
There is one other interface routine that should be mentioned
before we move on.
The ParseTrace() function can be used to generate debugging output
-from the parser. A prototype for this routine is as follows:
+from the parser. A prototype for this routine is as follows:
ParseTrace(FILE *stream, char *zPrefix);
-After this routine is called, a short (one-line) message is written
+After this routine is called, a short (one-line) message is written
to the designated output stream every time the parser changes states
or calls an action routine. Each such message is prefaced using
the text given by zPrefix. This debugging output can be turned off
by calling ParseTrace() again with a first argument of NULL (0).
-
-3.2.1 Allocating The Parse Object On Stack
-
-If all calls to the Parse() interface are made from within
-%code directives, then the parse
-object can be allocated from the stack rather than from the heap.
-These are the steps:
-
-
-- Declare a local variable of type "yyParser"
-
- Initialize the variable using ParseInit()
-
- Pass a pointer to the variable in calls ot Parse()
-
- Deallocate substructure in the parse variable using ParseFinalize().
-
-
-The following code illustrates how this is done:
-
-
- ParseFile(){
- yyParser x;
- ParseInit( &x );
- while( GetNextToken(pTokenizer,&hTokenId, &sToken) ){
- Parse(&x, hTokenId, sToken);
- }
- Parse(&x, 0, sToken);
- ParseFinalize( &x );
- }
-
-
-
-3.2.2 Interface Summary
-
-Here is a quick overview of the C-language interface to a
-Lemon-generated parser:
-
-
-void *ParseAlloc( (void*(*malloc)(size_t) );
-void ParseFree(void *pParser, (void(*free)(void*) );
-void Parse(void *pParser, int tokenCode, ParseTOKENTYPE token, ...);
-void ParseTrace(FILE *stream, char *zPrefix);
-
-
-Notes:
-
-
-
-3.3 Differences With YACC and BISON
+Differences With YACC and BISON
Programmers who have previously used the yacc or bison parser
generator will notice several important differences between yacc and/or
-bison and Lemon.
+bison and Lemon.
- In yacc and bison, the parser calls the tokenizer. In Lemon,
the tokenizer calls the parser.
- Lemon uses no global variables. Yacc and bison use global variables
to pass information between the tokenizer and parser.
- Lemon allows multiple parsers to be running simultaneously. Yacc
and bison do not.
-These differences may cause some initial confusion for programmers
+These differences may cause some initial confusion for programmers
with prior yacc and bison experience.
But after years of experience using Lemon, I firmly
believe that the Lemon way of doing things is better.
Updated as of 2016-02-16:
The text above was written in the 1990s.
We are told that Bison has lately been enhanced to support the
-tokenizer-calls-parser paradigm used by Lemon, eliminating the
+tokenizer-calls-parser paradigm used by Lemon, and to obviate the
need for global variables.
-
-3.4 Building The "lemon" or "lemon.exe" Executable
-
-The "lemon" or "lemon.exe" program is built from a single file
-of C-code named
-"lemon.c".
-The Lemon source code is generic C89 code that uses
-no unusual or non-standard libraries. Any
-reasonable C compiler should suffice to compile the lemon program.
-A command-line like the following will usually work:
-
-
-cc -o lemon lemon.c
-
On Windows machines with Visual C++ installed, bring up a
-"VS20NN x64 Native Tools Command Prompt" window and enter:
-
-
-cl lemon.c
-
-
-Compiling Lemon really is that simple.
-Additional compiler options such as
-"-O2" or "-g" or "-Wall" can be added if desired, but they are not
-necessary.
-
-
-
-4.0 Input File Syntax
+Input File Syntax
The main purpose of the grammar specification file for Lemon is
to define the grammar for the parser. But the input file also
specifies additional information Lemon requires to do its job.
Most of the work in using Lemon is in writing an appropriate
grammar file.
-The grammar file for Lemon is, for the most part, a free format.
+
The grammar file for Lemon is, for the most part, free format.
It does not have sections or divisions like yacc or bison. Any
-declaration can occur at any point in the file. Lemon ignores
-whitespace (except where it is needed to separate tokens), and it
-honors the same commenting conventions as C and C++.
+declaration can occur at any point in the file.
+Lemon ignores whitespace (except where it is needed to separate
+tokens), and it honors the same commenting conventions as C and C++.
-
-4.1 Terminals and Nonterminals
+Terminals and Nonterminals
A terminal symbol (token) is any string of alphanumeric
and/or underscore characters
that begins with an uppercase letter.
A terminal can contain lowercase letters after the first character,
@@ -456,12 +332,11 @@
names or to be individual characters included in single quotes, like
this: ')' or '$'. Lemon does not allow this alternative form for
terminal symbols. With Lemon, all symbols, terminals and nonterminals,
must have alphanumeric names.
-
-4.2 Grammar Rules
+Grammar Rules
The main component of a Lemon grammar file is a sequence of grammar
rules.
Each grammar rule consists of a nonterminal symbol followed by
the special symbol "::=" and then a list of terminals and/or nonterminals.
@@ -470,17 +345,18 @@
rule can be empty.
Rules can occur in any order, except that the left-hand side of the
first rule is assumed to be the start symbol for the grammar (unless
specified otherwise using the %start_symbol
directive described below.)
-A typical sequence of grammar rules might look something like this:
+A typical sequence of grammar rules might look something like this:
expr ::= expr PLUS expr.
expr ::= expr TIMES expr.
expr ::= LPAREN expr RPAREN.
expr ::= VALUE.
+
There is one non-terminal in this example, "expr", and five
terminal symbols or tokens: "PLUS", "TIMES", "LPAREN",
"RPAREN" and "VALUE".
@@ -488,14 +364,15 @@
of C code that will be executed whenever a grammar rule is reduced
by the parser.
In Lemon, this action is specified by putting the C code (contained
within curly braces {...}) immediately after the
period that closes the rule.
-For example:
+For example:
expr ::= expr PLUS expr. { printf("Doing an addition...\n"); }
+
In order to be useful, grammar actions must normally be linked to
their associated grammar rules.
In yacc and bison, this is accomplished by embedding a "$$" in the
action to stand for the value of the left-hand side of the rule and
@@ -508,45 +385,45 @@
rule and say "$7" when you really mean "$8".
Lemon avoids the need to count grammar symbols by assigning symbolic
names to each symbol in a grammar rule and then using those symbolic
names in the action.
-In yacc or bison, one would write this:
+In yacc or bison, one would write this:
expr -> expr PLUS expr { $$ = $1 + $3; };
-But in Lemon, the same rule becomes the following:
+But in Lemon, the same rule becomes the following:
expr(A) ::= expr(B) PLUS expr(C). { A = B+C; }
-In the Lemon rule, any symbol in parentheses after a grammar rule
+In the Lemon rule, any symbol in parentheses after a grammar rule
symbol becomes a place holder for that symbol in the grammar rule.
This place holder can then be used in the associated C action to
-stand for the value of that symbol.
+stand for the value of that symbol.
The Lemon notation for linking a grammar rule with its reduce
action is superior to yacc/bison on several counts.
First, as mentioned above, the Lemon method avoids the need to
count grammar symbols.
Secondly, if a terminal or nonterminal in a Lemon grammar rule
includes a linking symbol in parentheses but that linking symbol
is not actually used in the reduce action, then an error message
is generated.
-For example, the rule
+For example, the rule
expr(A) ::= expr(B) PLUS expr(C). { A = B; }
-will generate an error because the linking symbol "C" is used
+will generate an error because the linking symbol "C" is used
in the grammar rule but not in the reduce action.
The Lemon notation for linking grammar rules to reduce actions
also facilitates the use of destructors for reclaiming memory
allocated by the values of terminals and nonterminals on the
right-hand side of a rule.
-
-4.3 Precedence Rules
+
+Precedence Rules
Lemon resolves parsing ambiguities in exactly the same way as
yacc and bison. A shift-reduce conflict is resolved in favor
of the shift, and a reduce-reduce conflict is resolved by reducing
whichever rule comes first in the grammar file.
@@ -560,76 +437,76 @@
%right or
%nonassoc directives. Terminal symbols
mentioned in earlier directives have a lower precedence than
terminal symbols mentioned in later directives. For example:
-
+
%left AND.
%left OR.
%nonassoc EQ NE GT GE LT LE.
%left PLUS MINUS.
%left TIMES DIVIDE MOD.
%right EXP NOT.
-
+
In the preceding sequence of directives, the AND operator is
defined to have the lowest precedence. The OR operator is one
precedence level higher. And so forth. Hence, the grammar would
-attempt to group the ambiguous expression
+attempt to group the ambiguous expression
a AND b OR c
-like this
+like this
a AND (b OR c).
-The associativity (left, right or nonassoc) is used to determine
+The associativity (left, right or nonassoc) is used to determine
the grouping when the precedence is the same. AND is left-associative
-in our example, so
+in our example, so
a AND b AND c
-is parsed like this
+is parsed like this
(a AND b) AND c.
-The EXP operator is right-associative, though, so
+The EXP operator is right-associative, though, so
a EXP b EXP c
-is parsed like this
+is parsed like this
a EXP (b EXP c).
-The nonassoc precedence is used for non-associative operators.
-So
+The nonassoc precedence is used for non-associative operators.
+So
a EQ b EQ c
-is an error.
+is an error.
The precedence of non-terminals is transferred to rules as follows:
The precedence of a grammar rule is equal to the precedence of the
left-most terminal symbol in the rule for which a precedence is
defined. This is normally what you want, but in those cases where
-you want the precedence of a grammar rule to be something different,
+you want the precedence of a grammar rule to be something different,
you can specify an alternative precedence symbol by putting the
symbol in square braces after the period at the end of the rule and
before any C-code. For example:
-
+
expr = MINUS expr. [NOT]
-
+
This rule has a precedence equal to that of the NOT symbol, not the
MINUS symbol as would have been the case by default.
With the knowledge of how precedence is assigned to terminal
symbols and individual
grammar rules, we can now explain precisely how parsing conflicts
are resolved in Lemon. Shift-reduce conflicts are resolved
-as follows:
+as follows:
- If either the token to be shifted or the rule to be reduced
lacks precedence information, then resolve in favor of the
shift, but report a parsing conflict.
- If the precedence of the token to be shifted is greater than
@@ -645,11 +522,11 @@
left-associative, then resolve in favor of the reduce.
No parsing conflict is reported.
- Otherwise, resolve the conflict by doing the shift, and
report a parsing conflict.
-Reduce-reduce conflicts are resolved this way:
+Reduce-reduce conflicts are resolved this way:
- If either reduce rule
lacks precedence information, then resolve in favor of the
rule that appears first in the grammar, and report a parsing
conflict.
@@ -658,12 +535,11 @@
precedence, and do not report a conflict.
- Otherwise, resolve the conflict by reducing by the rule that
appears first in the grammar, and report a parsing conflict.
-
-4.4 Special Directives
+Special Directives
The input grammar to Lemon consists of grammar rules and special
directives. We've described all the grammar rules, so now we'll
talk about the special directives.
@@ -671,21 +547,19 @@
the grammar rules, or after the grammar rules, or in the midst of the
grammar rules. It doesn't matter. The relative order of
directives used to assign precedence to terminals is important, but
other than that, the order of directives in Lemon is arbitrary.
-Lemon supports the following special directives:
+Lemon supports the following special directives:
-Each of these directives will be described separately in the
+Each of these directives will be described separately in the
following sections:
-
-4.4.1 The %code directive
+
+The %code directive
The %code directive is used to specify additional C code that
is added to the end of the main output file. This is similar to
the %include directive except that
%include is inserted at the beginning of the main output file.
@@ -718,62 +591,59 @@
%code is typically used to include some action routines or perhaps
a tokenizer or even the "main()" function
as part of the output file.
-There can be multiple %code directives. The arguments of
-all %code directives are concatenated.
-
-
-4.4.2 The %default_destructor directive
+
+The %default_destructor directive
The %default_destructor directive specifies a destructor to
use for non-terminals that do not have their own destructor
specified by a separate %destructor directive. See the documentation
-on the %destructor directive below for
+on the %destructor directive below for
additional information.
In some grammars, many different non-terminal symbols have the
same data type and hence the same destructor. This directive is
a convenient way to specify the same destructor for all those
non-terminals using a single statement.
-
-4.4.3 The %default_type directive
+
+The %default_type directive
The %default_type directive specifies the data type of non-terminal
symbols that do not have their own data type defined using a separate
%type directive.
-
-4.4.4 The %destructor directive
+
+The %destructor directive
The %destructor directive is used to specify a destructor for
a non-terminal symbol.
(See also the %token_destructor
directive which is used to specify a destructor for terminal symbols.)
A non-terminal's destructor is called to dispose of the
non-terminal's value whenever the non-terminal is popped from
-the stack. This includes all of the following circumstances:
+the stack. This includes all of the following circumstances:
- When a rule reduces and the value of a non-terminal on
the right-hand side is not linked to C code.
- When the stack is popped during error processing.
- When the ParseFree() function runs.
-The destructor can do whatever it wants with the value of
+The destructor can do whatever it wants with the value of
the non-terminal, but its design is to deallocate memory
or other resources held by that non-terminal.
-Consider an example:
+Consider an example:
%type nt {void*}
%destructor nt { free($$); }
nt(A) ::= ID NUM. { A = malloc( 100 ); }
-This example is a bit contrived, but it serves to illustrate how
+This example is a bit contrived, but it serves to illustrate how
destructors work. The example shows a non-terminal named
"nt" that holds values of type "void*". When the rule for
an "nt" reduces, it sets the value of the non-terminal to
space obtained from malloc(). Later, when the nt non-terminal
is popped from the stack, the destructor will fire and call
@@ -792,54 +662,54 @@
Destructors help avoid memory leaks by automatically freeing
allocated objects when they go out of scope.
To do the same using yacc or bison is much more difficult.
-
-4.4.5 The %extra_argument directive
+
+The %extra_argument directive
-The %extra_argument directive instructs Lemon to add a 4th parameter
+The %extra_argument directive instructs Lemon to add a 4th parameter
to the parameter list of the Parse() function it generates. Lemon
doesn't do anything itself with this extra argument, but it does
make the argument available to C-code action routines, destructors,
and so forth. For example, if the grammar file contains:
-
+
%extra_argument { MyStruct *pAbc }
-
+
Then the Parse() function generated will have a 4th parameter
of type "MyStruct*" and all action routines will have access to
a variable named "pAbc" that is the value of the 4th parameter
in the most recent call to Parse().
The %extra_context directive works the same except that it
is passed in on the ParseAlloc() or ParseInit() routines instead of
-on Parse().
+on Parse().
-
-4.4.6 The %extra_context directive
+
+The %extra_context directive
-The %extra_context directive instructs Lemon to add a 2nd parameter
-to the parameter list of the ParseAlloc() and ParseInit() functions. Lemon
+The %extra_context directive instructs Lemon to add a 2nd parameter
+to the parameter list of the ParseAlloc() and ParseInit() functions. Lemon
doesn't do anything itself with this extra argument, but it does
store the value and make it available to C-code action routines, destructors,
and so forth. For example, if the grammar file contains:
-
+
%extra_context { MyStruct *pAbc }
-
+
-Then the ParseAlloc() and ParseInit() functions will have an 2nd parameter
+
Then the ParseAlloc() and ParseInit() functions will have a 2nd parameter
of type "MyStruct*" and all action routines will have access to
-a variable named "pAbc" that is the value of that 2nd parameter.
+a variable named "pAbc" that is the value of that 2nd parameter.
The %extra_argument directive works the same except that it
-is passed in on the Parse() routine instead of on ParseAlloc()/ParseInit().
+is passed in on the Parse() routine instead of on ParseAlloc()/ParseInit().
-
-4.4.7 The %fallback directive
+
+The %fallback directive
The %fallback directive specifies an alternative meaning for one
or more tokens. The alternative meaning is tried if the original token
would have generated a syntax error.
@@ -851,11 +721,11 @@
them all. Programmers will, therefore, sometimes mistakenly use an
obscure language keyword for an identifier. The %fallback directive
provides a mechanism to tell the parser: "If you are unable to parse
this keyword, try treating it as an identifier instead."
-The syntax of %fallback is as follows:
+The syntax of %fallback is as follows:
%fallback ID TOKEN... .
@@ -864,43 +734,32 @@
The first token name is the fallback token — the
token to which all the other tokens fall back to. The second and subsequent
arguments are tokens which fall back to the token identified by the first
argument.
-
-4.4.8 The %if directive and its friends
+
+The %ifdef, %ifndef, and %endif directives
-The %if, %ifdef, %ifndef, %else,
-and %endif directives
-are similar to #if, #ifdef, #ifndef, #else, and #endif in the C-preprocessor,
+
The %ifdef, %ifndef, and %endif directives
+are similar to #ifdef, #ifndef, and #endif in the C-preprocessor,
just not as general.
Each of these directives must begin at the left margin. No whitespace
is allowed between the "%" and the directive name.
Grammar text in between "%ifdef MACRO" and the next nested
"%endif" is
ignored unless the "-DMACRO" command-line option is used. Grammar text
between "%ifndef MACRO" and the next nested "%endif" is
-included except when the "-DMACRO" command-line option is used.
-
-
The text in between "%if CONDITIONAL" and its
-corresponding %endif is included only if CONDITIONAL
-is true. The CONDITION is one or more macro names, optionally connected
-using the "||" and "&&" binary operators, the "!" unary operator,
-and grouped using balanced parentheses. Each term is true if the
-corresponding macro exists, and false if it does not exist.
-
-An optional "%else" directive can occur anywhere in between a
-%ifdef, %ifndef, or %if directive and
-its corresponding %endif.
-
-Note that the argument to %ifdef and %ifndef is
-intended to be a single preprocessor symbol name, not a general expression.
-Use the "%if" directive for general expressions.
-
-
-4.4.9 The %include directive
+included except when the "-DMACRO" command-line option is used.
+
+Note that the argument to %ifdef and %ifndef must
+be a single preprocessor symbol name, not a general expression.
+There is no "%else" directive.
+
+
+
+The %include directive
The %include directive specifies C code that is included at the
top of the generated parser. You can include any text you want —
the Lemon parser generator copies it blindly. If you have multiple
%include directives in your grammar file, their values are concatenated
@@ -909,22 +768,22 @@
The %include directive is very handy for getting some extra #include
preprocessor statements at the beginning of the generated parser.
For example:
-
+
%include {#include <unistd.h>}
-
+
This might be needed, for example, if some of the C actions in the
grammar call functions that are prototyped in unistd.h.
Use the %code directive to add code to
the end of the generated parser.
-
-4.4.10 The %left directive
+
+The %left directive
The %left directive is used (along with the
%right and
%nonassoc directives) to declare
precedences of terminal symbols.
@@ -931,18 +790,18 @@
Every terminal symbol whose name appears after
a %left directive but before the next period (".") is
given the same left-associative precedence value. Subsequent
%left directives have higher precedence. For example:
-
+
%left AND.
%left OR.
%nonassoc EQ NE GT GE LT LE.
%left PLUS MINUS.
%left TIMES DIVIDE MOD.
%right EXP NOT.
-
+
Note the period that terminates each %left,
%right or %nonassoc
directive.
@@ -949,237 +808,214 @@
LALR(1) grammars can get into a situation where they require
a large amount of stack space if you make heavy use of right-associative
operators. For this reason, it is recommended that you use %left
rather than %right whenever possible.
-
-4.4.11 The %name directive
+
+The %name directive
By default, the functions generated by Lemon all begin with the
five-character string "Parse". You can change this string to something
different using the %name directive. For instance:
-
+
%name Abcde
-
+
Putting this directive in the grammar file will cause Lemon to generate
-functions named
+functions named
- AbcdeAlloc(),
- AbcdeFree(),
- AbcdeTrace(), and
- Abcde().
-The %name directive allows you to generate two or more different
+The %name directive allows you to generate two or more different
parsers and link them all into the same executable.
-
-4.4.12 The %nonassoc directive
+
+The %nonassoc directive
This directive is used to assign non-associative precedence to
one or more terminal symbols. See the section on
precedence rules
or on the %left directive
for additional information.
-
-4.4.13 The %parse_accept directive
+
+The %parse_accept directive
The %parse_accept directive specifies a block of C code that is
executed whenever the parser accepts its input string. To "accept"
an input string means that the parser was able to process all tokens
without error.
For example:
-
+
%parse_accept {
printf("parsing complete!\n");
}
-
+
-
-4.4.14 The %parse_failure directive
+
+The %parse_failure directive
The %parse_failure directive specifies a block of C code that
is executed whenever the parser fails to complete. This code is not
executed until the parser has tried and failed to resolve an input
error using its usual error recovery strategy. The routine is
only invoked when parsing is unable to continue.
-
+
%parse_failure {
fprintf(stderr,"Giving up. Parser is hopelessly lost...\n");
}
-
+
-
-4.4.15 The %right directive
+
+The %right directive
This directive is used to assign right-associative precedence to
one or more terminal symbols. See the section on
precedence rules
or on the %left directive for additional information.
-
-4.4.16 The %stack_overflow directive
+
+The %stack_overflow directive
The %stack_overflow directive specifies a block of C code that
is executed if the parser's internal stack ever overflows. Typically
this just prints an error message. After a stack overflow, the parser
will be unable to continue and must be reset.
-
+
%stack_overflow {
fprintf(stderr,"Giving up. Parser stack overflow\n");
}
-
+
You can help prevent parser stack overflows by avoiding the use
of right recursion and right-precedence operators in your grammar.
Use left recursion and left-precedence operators instead to
encourage rules to reduce sooner and keep the stack size down.
-For example, do rules like this:
+For example, do rules like this:
list ::= list element. // left-recursion. Good!
list ::= .
-Not like this:
+Not like this:
list ::= element list. // right-recursion. Bad!
list ::= .
-
+
-
-4.4.17 The %stack_size directive
+
+The %stack_size directive
If stack overflow is a problem and you can't resolve the trouble
by using left-recursion, then you might want to increase the size
of the parser's stack using this directive. Put a positive integer
after the %stack_size directive and Lemon will generate a parser
with a stack of the requested size. The default value is 100.
-
+
%stack_size 2000
-
+
-
-4.4.18 The %start_symbol directive
+
+The %start_symbol directive
By default, the start symbol for the grammar that Lemon generates
is the first non-terminal that appears in the grammar file. But you
can choose a different start symbol using the
%start_symbol directive.
-
+
%start_symbol prog
-
-
-
-4.4.19 The %syntax_error directive
-
-See Error Processing.
-
-
-4.4.20 The %token directive
-
-Tokens are normally created automatically, the first time they are used.
-Any identifier that begins with an upper-case letter is a token.
-
-
Sometimes it is useful to declare tokens in advance, however. The
-integer values assigned to each token are determined by the order in which
-the tokens are seen. So by declaring tokens in advance, it is possible to
-cause some tokens to have low-numbered values, which might be desirable in
-some grammars, or to have sequential values assigned to a sequence of
-related tokens. For this reason, the %token directive is provided to
-declare tokens in advance. The syntax is as follows:
-
-
-%token TOKEN TOKEN... .
-
-
-The %token directive is followed by zero or more token symbols and
-terminated by a single ".". Each token named is created if it does not
-already exist. Tokens are created in order.
-
-
-
-
4.4.21 The %token_class directive
+
+
+
+The %syntax_error directive
+
+See Error Processing.
+
+
+The %token_class directive
Undocumented. Appears to be related to the MULTITERMINAL concept.
Implementation.
-
-4.4.22 The %token_destructor directive
+
+The %token_destructor directive
The %destructor directive assigns a destructor to a non-terminal
symbol. (See the description of the
%destructor directive above.)
The %token_destructor directive does the same thing
for all terminal symbols.
-Unlike non-terminal symbols, which may each have a different data type
+
Unlike non-terminal symbols which may each have a different data type
for their values, terminals all use the same data type (defined by
the %token_type directive)
and so they use a common destructor.
Other than that, the token destructor works just like the non-terminal
destructors.
-
-4.4.23 The %token_prefix directive
+
+The %token_prefix directive
Lemon generates #defines that assign small integer constants
to each terminal symbol in the grammar. If desired, Lemon will
add a prefix specified by this directive
to each of the #defines it generates.
-So if the default output of Lemon looked like this:
+So if the default output of Lemon looked like this:
#define AND 1
#define MINUS 2
#define OR 3
#define PLUS 4
-You can insert a statement into the grammar like this:
+You can insert a statement into the grammar like this:
%token_prefix TOKEN_
-to cause Lemon to produce these symbols instead:
+to cause Lemon to produce these symbols instead:
#define TOKEN_AND 1
#define TOKEN_MINUS 2
#define TOKEN_OR 3
#define TOKEN_PLUS 4
-
+
-
-4.4.24 The %token_type and %type directives
+
+The %token_type and %type directives
These directives are used to specify the data types for values
on the parser's stack associated with terminal and non-terminal
symbols. The values of all terminal symbols must be of the same
type. This turns out to be the same data type as the 3rd parameter
to the Parse() function generated by Lemon. Typically, you will
-make the value of a terminal symbol be a pointer to some kind of
+make the value of a terminal symbol be a pointer to some kind of
token structure. Like this:
-
+
%token_type {Token*}
-
+
If the data type of terminals is not specified, the default value
is "void*".
Non-terminal symbols can each have their own data types. Typically
the data type of a non-terminal is a pointer to the root of a parse tree
structure that contains all information about that non-terminal.
For example:
-
+
%type expr {Expr*}
-
+
Each entry on the parser's stack is actually a union containing
instances of all data types for every non-terminal and terminal symbol.
Lemon will automatically use the correct element of this union depending
on what the corresponding non-terminal or terminal symbol is. But
@@ -1187,23 +1023,23 @@
will be the size of its largest element. So if you have a single
non-terminal whose data type requires 1K of storage, then your 100
entry parser stack will require 100K of heap space. If you are willing
and able to pay that price, fine. You just need to know.
-
-4.4.25 The %wildcard directive
+
+The %wildcard directive
The %wildcard directive is followed by a single token name and a
period. This directive specifies that the identified token should
match any input token.
When the generated parser has the choice of matching an input against
the wildcard token and some other token, the other token is always used.
The wildcard token is only matched if there are no alternatives.
-
-5.0 Error Processing
+
+Error Processing
After extensive experimentation over several years, it has been
discovered that the error recovery strategy used by yacc is about
as good as it gets. And so that is what Lemon uses.
@@ -1222,43 +1058,7 @@
is invoked and the parser resets itself to its start state, ready
to begin parsing a new file. This is what will happen at the very
first syntax error, of course, if there are no instances of the
"error" non-terminal in your grammar.
-
-6.0 History of Lemon
-
-Lemon was originally written by Richard Hipp sometime in the late
-1980s on a Sun4 Workstation using K&R C.
-There was a companion LL(1) parser generator program named "Lime", the
-source code to which has been lost.
-
-The lemon.c source file was originally many separate files that were
-compiled together to generate the "lemon" executable. Sometime in the
-1990s, the individual source code files were combined together into
-the current single large "lemon.c" source file. You can still see traces
-of original filenames in the code.
-
-Since 2001, Lemon has been part of the
-SQLite project and the source code
-to Lemon has been managed as a part of the
-SQLite source tree in the following
-files:
-
-
-
-
-7.0 Copyright
-
-All of the source code to Lemon, including the template parser file
-"lempar.c" and this documentation file ("lemon.html") are in the public
-domain. You can use the code for any purpose and without attribution.
-
-The code comes with no warranty. If it breaks, you get to keep both
-pieces.
-
DELETED doc/trusted-schema.md
Index: doc/trusted-schema.md
==================================================================
--- doc/trusted-schema.md
+++ /dev/null
@@ -1,142 +0,0 @@
-# The new-security-options branch
-
-## The problem that the [new-security-options](/timeline?r=new-security-options) branch tries to solve
-
-An attacker might modify the schema of an SQLite database by adding
-structures that cause code to run when some other application opens and
-reads the database. For example, the attacker might replace a table
-definition with a view. Or the attacker might add triggers to tables
-or views, or add new CHECK constraints or generated columns or indexes
-with expressions in the index list or in the WHERE clause. If the
-added features invoke SQL functions or virtual tables with side effects,
-that might cause harm to the system if run by a high-privilege victim.
-Or, the added features might exfiltrate information if the database is
-read by a high-privilege victim.
-
-The changes in this branch strive to make it easier for high-privilege
-applications to safely read SQLite database files that might have been
-maliciously corrupted by an attacker.
-
-## Overview of changes in [new-security-options](/timeline?r=new-security-options)
-
-The basic idea is to tag every SQL function and virtual table with one
-of three risk levels:
-
- 1. Innocuous
- 2. Normal
- 3. Direct-Only
-
-Innocuous functions/vtabs are safe and can be used at any time.
-Direct-only elements, in contrast, might cause side-effects and
-should only be used from top-level SQL, not from within triggers or views nor
-in elements of the schema such as CHECK constraint, DEFAULT values,
-generated columns, index expressions, or in the WHERE clause of a
-partial index that are potentially under the control of an attacker.
-Normal elements behave like Innocuous if TRUSTED\_SCHEMA=on
-and behave like direct-only if TRUSTED\_SCHEMA=off.
-
-Application-defined functions and virtual tables go in as Normal unless
-the application takes deliberate steps to change the risk level.
-
-For backwards compatibility, the default is TRUSTED\_SCHEMA=on. Documentation
-will be updated to recommend applications turn TRUSTED\_SCHEMA to off.
-
-An innocuous function or virtual table is one that can only read content
-from the database file in which it resides, and can only alter the database
-in which it resides. Most SQL functions are innocuous. For example, there
-is no harm in an attacker running the abs() function.
-
-Direct-only elements have side-effects that go outside the database file
-in which they live, or return information from outside of the database file.
-Examples of direct-only elements include:
-
- 1. The fts3\_tokenizer() function
- 2. The writefile() function
- 3. The readfile() function
- 4. The zipvfs virtual table
- 5. The csv virtual table
-
-We do not want an attacker to be able to add these kinds of things to
-the database schema and possibly trick a high-privilege application
-into performing any of these actions. Therefore, functions and vtabs
-with side-effects are marked as Direct-Only.
-
-Legacy applications might add other risky functions or vtabs. Those will
-go in as "Normal" by default. For optimal security, we want those risky
-app-defined functions and vtabs to be direct-only, but making that the
-default might break some legacy applications. Hence, all app-defined
-functions and vtabs go in as Normal, but the application can switch them
-over to "Direct-Only" behavior using a single pragma.
-
-The restrictions on the use of functions and virtual tables do not apply
-to TEMP. A TEMP VIEW or a TEMP TRIGGER can use any valid SQL function
-or virtual table. The idea is that TEMP views and triggers must be
-directly created by the application and are thus under the control of the
-application. TEMP views and triggers cannot be created by an attacker who
-corrupts the schema of a persistent database file. Hence TEMP views and
-triggers are safe.
-
-## Specific changes
-
- 1. New sqlite3\_db\_config() option SQLITE\_DBCONFIG\_TRUSTED\_SCHEMA for
- turning TRUSTED\_SCHEMA on and off. It defaults to ON.
-
- 2. Compile-time option -DSQLITE\_TRUSTED\_SCHEMA=0 causes the default
- TRUSTED\_SCHEMA setting to be off.
-
- 3. New pragma "PRAGMA trusted\_schema=(ON\|OFF);". This provides access
- to the TRUSTED_SCHEMA setting for applications coded using scripting
- languages or other secondary languages where they are unable to make
- calls to sqlite3\_db\_config().
-
- 4. New options for the "enc" parameter to sqlite3\_create\_function() and
- its kin:
-
- - _SQLITE\_INNOCUOUS_ → tags the new functions as Innocuous
-
- _SQLITE\_DIRECTONLY_ → tags the new functions as Direct-Only
-
-
- 5. New options to sqlite3\_vtab\_config():
-
- - _SQLITE\_VTAB\_INNOCUOUS_ → tags the vtab as Innocuous
-
- _SQLITE\_VTAB\_DIRECTONLY_ → tags the vtab as Direct-Only
-
-
- 6. Change many of the functions and virtual tables in the SQLite source
- tree to use one of the tags above.
-
- 7. Enhanced PRAGMA function\_list and virtual-table "pragma\_function\_list"
- with additional columns. The columns now are:
-
- - _name_ → Name of the function
-
- _builtin_ → 1 for built-in functions. 0 otherwise.
-
- _type_ → 's'=Scalar, 'a'=Aggregate, 'w'=Window
-
- _enc_ → 'utf8', 'utf16le', or 'utf16be'
-
- _narg_ → number of arguments
-
- _flags_ → Bitmask of SQLITE\_INNOCUOUS, SQLITE\_DIRECTONLY,
- SQLITE\_DETERMINISTIC, SQLITE\_SUBTYPE, and
- SQLITE\_FUNC\_INTERNAL flags.
-
- The last four columns are new.
-
- 8. The function\_list PRAGMA now also shows all entries for each function.
- So, for example, if a function can take either 2 or 3 arguments,
- there are separate rows for the 2-argument and 3-argument versions of
- the function.
-
-## Additional Notes
-
-The function_list enhancements allow the application to query the set
-of SQL functions that meet various criteria. For example, to see all
-SQL functions that are never allowed to be used in the schema or in
-trigger or views:
-
-~~~
- SELECT DISTINCT name FROM pragma_function_list
- WHERE (flags & 0x80000)!=0
- ORDER BY name;
-~~~
-
-Doing the same is not possible for virtual tables, as a virtual table
-might be Innocuous, Normal, or Direct-Only depending on the arguments
-passed into the xConnect method.
DELETED doc/vdbesort-memory.md
Index: doc/vdbesort-memory.md
==================================================================
--- doc/vdbesort-memory.md
+++ /dev/null
@@ -1,49 +0,0 @@
-
-20-11-2020
-
-# Memory Allocation In vdbesort.c
-
-Memory allocation is slightly different depending on:
-
- * whether or not SQLITE_CONFIG_SMALL_MALLOC is set, and
- * whether or not worker threads are enabled.
-
-## SQLITE_CONFIG_SMALL_MALLOC=0
-
-Assuming SQLITE_CONFIG_SMALL_MALLOC is not set, keys passed to the sorter are
-added to an in-memory buffer. This buffer is grown using sqlite3Realloc() as
-required until it reaches the size configured for the main pager cache using "PRAGMA
-cache_size". i.e. if the user has executed "PRAGMA main.cache_size = -2048",
-then this buffer is allowed to grow up to 2MB in size.
-
-Once the buffer has grown to its threshold, keys are sorted and written to
-a temp file. If worker threads are not enabled, this is the only significant
-allocation the sorter module makes. After keys are sorted and flushed out to
-the temp file, the buffer is reused to accumulate the next batch of keys.
-
-If worker threads are available, then the buffer is passed to a worker thread
-to sort and flush once it is full, and a new buffer allocated to allow the
-main thread to continue to accumulate keys. Buffers are reused once they
-have been flushed, so in this case at most (nWorker+1) buffers are allocated
-and used, where nWorker is the number of configured worker threads.
-
-There are no other significant users of heap memory in the sorter module.
-Once sorted buffers of keys have been flushed to disk, they are read back
-either by mapping the file (via sqlite3_file.xFetch()) or else read back
-in one page at a time.
-
-All buffers are allocated by the main thread. A sorter object is associated
-with a single database connection, to which it holds a pointer.
-
-## SQLITE_CONFIG_SMALL_MALLOC=1
-
-This case is similar to the above, except that instead of accumulating
-multiple keys in a single large buffer, sqlite3VdbeSorterWrite() stores
-keys in a regular heap-memory linked list (one allocation per element).
-List elements are freed as they are flushed to disk, either by the main
-thread or by a worker thread.
-
-Each time a key is added to the sorter (and an allocation made),
-sqlite3HeapNearlyFull() is called. If it returns true, the current
-list of keys is flushed to a temporary file, even if it has not yet
-reached the size threshold.
DELETED doc/wal-lock.md
Index: doc/wal-lock.md
==================================================================
--- doc/wal-lock.md
+++ /dev/null
@@ -1,88 +0,0 @@
-# Wal-Mode Blocking Locks
-
-On some Unix-like systems, SQLite may be configured to use POSIX blocking locks
-by:
-
- * building the library with SQLITE\_ENABLE\_SETLK\_TIMEOUT defined, and
- * configuring a timeout in ms using the sqlite3\_busy\_timeout() API.
-
-Blocking locks may be advantageous as (a) waiting database clients do not
-need to continuously poll the database lock, and (b) using blocking locks
-facilitates transfer of OS priority between processes when a high priority
-process is blocked by a lower priority one.
-
-Only read/write clients use blocking locks. Clients that have read-only access
-to the \*-shm file never use blocking locks.
-
-Threads or processes that access a single database at a time never deadlock as
-a result of blocking database locks. But it is of course possible for threads
-that lock multiple databases simultaneously to do so. In most cases the OS will
-detect the deadlock and return an error.
-
-## Wal Recovery
-
-Wal database "recovery" is a process required when the number of connected
-database clients changes from zero to one. In this case, a client is
-considered to connect to the database when it first reads data from it.
-Before recovery commences, an exclusive WRITER lock is taken.
-
-Without blocking locks, if two clients attempt recovery simultaneously, one
-fails to obtain the WRITER lock and either invokes the busy-handler callback or
-returns SQLITE\_BUSY to the user. With blocking locks configured, the second
-client blocks on the WRITER lock.
-
-## Database Readers
-
-Usually, read-only transactions are not blocked by any other database clients, so they
-have no need of blocking locks.
-
-If a read-only transaction is being opened on a snapshot, the CHECKPOINTER
-lock is required briefly as part of opening the transaction (to check that a
-checkpointer is not currently overwriting the snapshot being opened). A
-blocking lock is used to obtain the CHECKPOINTER lock in this case. A snapshot
-opener may therefore block on and transfer priority to a checkpointer in some
-cases.
-
-## Database Writers
-
-A database writer must obtain the exclusive WRITER lock. It uses a blocking
-lock to do so if any of the following are true:
-
- * the transaction is an implicit one consisting of a single DML or DDL
- statement, or
- * the transaction is opened using BEGIN IMMEDIATE or BEGIN EXCLUSIVE, or
- * the first SQL statement executed following the BEGIN command is a DML or
- DDL statement (not a read-only statement like a SELECT).
-
-In other words, in all cases except when an open read-transaction is upgraded
-to a write-transaction. In that case a non-blocking lock is used.
-
-## Database Checkpointers
-
-Database checkpointers take the following locks, in order:
-
- * The exclusive CHECKPOINTER lock.
- * The exclusive WRITER lock (FULL, RESTART and TRUNCATE only).
- * Exclusive lock on read-mark slots 1-N. These are immediately released after being taken.
- * Exclusive lock on read-mark 0.
- * Exclusive lock on read-mark slots 1-N again. These are immediately released
- after being taken (RESTART and TRUNCATE only).
-
-All of the above use blocking locks.
-
-## Summary
-
-With blocking locks configured, the only cases in which clients should see an
-SQLITE\_BUSY error are:
-
- * if the OS does not grant a blocking lock before the configured timeout
- expires, and
- * when an open read-transaction is upgraded to a write-transaction.
-
-In all other cases the blocking locks implementation should prevent clients
-from having to handle SQLITE\_BUSY errors and facilitate appropriate transfer
-of priorities between competing clients.
-
-Clients that lock multiple databases simultaneously must be wary of deadlock.
-
-
Index: ext/async/sqlite3async.c
==================================================================
--- ext/async/sqlite3async.c
+++ ext/async/sqlite3async.c
@@ -1702,5 +1702,6 @@
va_end(ap);
return rc;
}
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_ASYNCIO) */
+
Index: ext/async/sqlite3async.h
==================================================================
--- ext/async/sqlite3async.h
+++ ext/async/sqlite3async.h
@@ -218,5 +218,6 @@
#ifdef __cplusplus
} /* End of the 'extern "C"' block */
#endif
#endif /* ifndef __SQLITEASYNC_H_ */
+
Index: ext/expert/expert1.test
==================================================================
--- ext/expert/expert1.test
+++ ext/expert/expert1.test
@@ -26,23 +26,19 @@
if {[info commands sqlite3_expert_new]==""} {
finish_test
return
}
-
set CLI [test_binary_name sqlite3]
set CMD [test_binary_name sqlite3_expert]
proc squish {txt} {
regsub -all {[[:space:]]+} $txt { }
}
proc do_setup_rec_test {tn setup sql res} {
reset_db
- if {[info exists ::set_main_db_name]} {
- dbconfig_maindbname_icecube db
- }
db eval $setup
uplevel [list do_rec_test $tn $sql $res]
}
foreach {tn setup} {
@@ -78,14 +74,10 @@
set tst [subst -nocommands {set {} [squish [join {$result}]]}]
uplevel [list do_test $tn $tst [string trim [squish $res]]]
}
}
3 {
- if {[info commands sqlite3_expert_new]==""} { continue }
- set ::set_main_db_name 1
- }
- 4 {
if {![file executable $CLI]} { continue }
proc do_rec_test {tn sql res} {
set res [squish [string trim $res]]
set tst [subst -nocommands {
@@ -101,93 +93,91 @@
do_setup_rec_test $tn.1 { CREATE TABLE t1(a, b, c) } {
SELECT * FROM t1
} {
(no new indexes)
- SCAN t1
+ SCAN TABLE t1
}
do_setup_rec_test $tn.2 {
CREATE TABLE t1(a, b, c);
} {
SELECT * FROM t1 WHERE b>?;
} {
CREATE INDEX t1_idx_00000062 ON t1(b);
- SEARCH t1 USING INDEX t1_idx_00000062 (b>?)
+ SEARCH TABLE t1 USING INDEX t1_idx_00000062 (b>?)
}
do_setup_rec_test $tn.3 {
CREATE TABLE t1(a, b, c);
} {
SELECT * FROM t1 WHERE b COLLATE nocase BETWEEN ? AND ?
} {
CREATE INDEX t1_idx_3e094c27 ON t1(b COLLATE NOCASE);
- SEARCH t1 USING INDEX t1_idx_3e094c27 (b>? AND b)
+ SEARCH TABLE t1 USING INDEX t1_idx_3e094c27 (b>? AND b)
}
do_setup_rec_test $tn.4 {
CREATE TABLE t1(a, b, c);
} {
SELECT a FROM t1 ORDER BY b;
} {
CREATE INDEX t1_idx_00000062 ON t1(b);
- SCAN t1 USING INDEX t1_idx_00000062
+ SCAN TABLE t1 USING INDEX t1_idx_00000062
}
do_setup_rec_test $tn.5 {
CREATE TABLE t1(a, b, c);
} {
SELECT a FROM t1 WHERE a=? ORDER BY b;
} {
CREATE INDEX t1_idx_000123a7 ON t1(a, b);
- SEARCH t1 USING COVERING INDEX t1_idx_000123a7 (a=?)
+ SEARCH TABLE t1 USING COVERING INDEX t1_idx_000123a7 (a=?)
}
-if 0 {
do_setup_rec_test $tn.6 {
CREATE TABLE t1(a, b, c);
} {
SELECT min(a) FROM t1
} {
CREATE INDEX t1_idx_00000061 ON t1(a);
- SEARCH t1 USING COVERING INDEX t1_idx_00000061
-}
+ SEARCH TABLE t1 USING COVERING INDEX t1_idx_00000061
}
do_setup_rec_test $tn.7 {
CREATE TABLE t1(a, b, c);
} {
SELECT * FROM t1 ORDER BY a, b, c;
} {
CREATE INDEX t1_idx_033e95fe ON t1(a, b, c);
- SCAN t1 USING COVERING INDEX t1_idx_033e95fe
+ SCAN TABLE t1 USING COVERING INDEX t1_idx_033e95fe
}
#do_setup_rec_test $tn.1.8 {
# CREATE TABLE t1(a, b, c);
#} {
# SELECT * FROM t1 ORDER BY a ASC, b COLLATE nocase DESC, c ASC;
#} {
# CREATE INDEX t1_idx_5be6e222 ON t1(a, b COLLATE NOCASE DESC, c);
-# 0|0|0|SCAN t1 USING COVERING INDEX t1_idx_5be6e222
+# 0|0|0|SCAN TABLE t1 USING COVERING INDEX t1_idx_5be6e222
#}
do_setup_rec_test $tn.8.1 {
CREATE TABLE t1(a COLLATE NOCase, b, c);
} {
SELECT * FROM t1 WHERE a=?
} {
CREATE INDEX t1_idx_00000061 ON t1(a);
- SEARCH t1 USING INDEX t1_idx_00000061 (a=?)
+ SEARCH TABLE t1 USING INDEX t1_idx_00000061 (a=?)
}
do_setup_rec_test $tn.8.2 {
CREATE TABLE t1(a, b COLLATE nocase, c);
} {
SELECT * FROM t1 ORDER BY a ASC, b DESC, c ASC;
} {
CREATE INDEX t1_idx_5cb97285 ON t1(a, b DESC, c);
- SCAN t1 USING COVERING INDEX t1_idx_5cb97285
+ SCAN TABLE t1 USING COVERING INDEX t1_idx_5cb97285
}
# Tables with names that require quotes.
#
@@ -194,21 +184,21 @@
do_setup_rec_test $tn.9.1 {
CREATE TABLE "t t"(a, b, c);
} {
SELECT * FROM "t t" WHERE a=?
} {
- CREATE INDEX "t t_idx_00000061" ON "t t"(a);
- SEARCH t t USING INDEX t t_idx_00000061 (a=?)
+ CREATE INDEX 't t_idx_00000061' ON 't t'(a);
+ SEARCH TABLE t t USING INDEX t t_idx_00000061 (a=?)
}
do_setup_rec_test $tn.9.2 {
CREATE TABLE "t t"(a, b, c);
} {
SELECT * FROM "t t" WHERE b BETWEEN ? AND ?
} {
- CREATE INDEX "t t_idx_00000062" ON "t t"(b);
- SEARCH t t USING INDEX t t_idx_00000062 (b>? AND b)
+ CREATE INDEX 't t_idx_00000062' ON 't t'(b);
+ SEARCH TABLE t t USING INDEX t t_idx_00000062 (b>? AND b)
}
# Columns with names that require quotes.
#
do_setup_rec_test $tn.10.1 {
@@ -215,20 +205,20 @@
CREATE TABLE t3(a, "b b", c);
} {
SELECT * FROM t3 WHERE "b b" = ?
} {
CREATE INDEX t3_idx_00050c52 ON t3('b b');
- SEARCH t3 USING INDEX t3_idx_00050c52 (b b=?)
+ SEARCH TABLE t3 USING INDEX t3_idx_00050c52 (b b=?)
}
do_setup_rec_test $tn.10.2 {
CREATE TABLE t3(a, "b b", c);
} {
SELECT * FROM t3 ORDER BY "b b"
} {
CREATE INDEX t3_idx_00050c52 ON t3('b b');
- SCAN t3 USING INDEX t3_idx_00050c52
+ SCAN TABLE t3 USING INDEX t3_idx_00050c52
}
# Transitive constraints
#
do_setup_rec_test $tn.11.1 {
@@ -237,12 +227,12 @@
} {
SELECT * FROM t5, t6 WHERE a=? AND b=c AND c=?
} {
CREATE INDEX t5_idx_000123a7 ON t5(a, b);
CREATE INDEX t6_idx_00000063 ON t6(c);
- SEARCH t6 USING INDEX t6_idx_00000063 (c=?)
- SEARCH t5 USING COVERING INDEX t5_idx_000123a7 (a=? AND b=?)
+ SEARCH TABLE t6 USING INDEX t6_idx_00000063 (c=?)
+ SEARCH TABLE t5 USING COVERING INDEX t5_idx_000123a7 (a=? AND b=?)
}
# OR terms.
#
do_setup_rec_test $tn.12.1 {
@@ -252,13 +242,13 @@
} {
CREATE INDEX t7_idx_00000062 ON t7(b);
CREATE INDEX t7_idx_00000061 ON t7(a);
MULTI-INDEX OR
INDEX 1
- SEARCH t7 USING INDEX t7_idx_00000061 (a=?)
+ SEARCH TABLE t7 USING INDEX t7_idx_00000061 (a=?)
INDEX 2
- SEARCH t7 USING INDEX t7_idx_00000062 (b=?)
+ SEARCH TABLE t7 USING INDEX t7_idx_00000062 (b=?)
}
# rowid terms.
#
do_setup_rec_test $tn.13.1 {
@@ -265,27 +255,27 @@
CREATE TABLE t8(a, b);
} {
SELECT * FROM t8 WHERE rowid=?
} {
(no new indexes)
- SEARCH t8 USING INTEGER PRIMARY KEY (rowid=?)
+ SEARCH TABLE t8 USING INTEGER PRIMARY KEY (rowid=?)
}
do_setup_rec_test $tn.13.2 {
CREATE TABLE t8(a, b);
} {
SELECT * FROM t8 ORDER BY rowid
} {
(no new indexes)
- SCAN t8
+ SCAN TABLE t8
}
do_setup_rec_test $tn.13.3 {
CREATE TABLE t8(a, b);
} {
SELECT * FROM t8 WHERE a=? ORDER BY rowid
} {
CREATE INDEX t8_idx_00000061 ON t8(a);
- SEARCH t8 USING INDEX t8_idx_00000061 (a=?)
+ SEARCH TABLE t8 USING INDEX t8_idx_00000061 (a=?)
}
# Triggers
#
do_setup_rec_test $tn.14 {
@@ -296,11 +286,11 @@
END;
} {
INSERT INTO t9 VALUES(?, ?, ?);
} {
CREATE INDEX t10_idx_00000062 ON t10(b);
- SEARCH t10 USING INDEX t10_idx_00000062 (b=?)
+ SEARCH TABLE t10 USING INDEX t10_idx_00000062 (b=?)
}
do_setup_rec_test $tn.15 {
CREATE TABLE t1(a, b);
CREATE TABLE t2(c, d);
@@ -312,95 +302,21 @@
INSERT INTO t2 SELECT (i-1)/20, (i-1)/5 FROM s;
} {
SELECT * FROM t2, t1 WHERE b=? AND d=? AND t2.rowid=t1.rowid
} {
CREATE INDEX t2_idx_00000064 ON t2(d);
- SEARCH t2 USING INDEX t2_idx_00000064 (d=?)
- SEARCH t1 USING INTEGER PRIMARY KEY (rowid=?)
+ SEARCH TABLE t2 USING INDEX t2_idx_00000064 (d=?)
+ SEARCH TABLE t1 USING INTEGER PRIMARY KEY (rowid=?)
}
do_setup_rec_test $tn.16 {
CREATE TABLE t1(a, b);
} {
SELECT * FROM t1 WHERE b IS NOT NULL;
} {
(no new indexes)
- SCAN t1
-}
-
-do_setup_rec_test $tn.17.1 {
- CREATE TABLE example (A INTEGER, B INTEGER, C INTEGER, PRIMARY KEY (A,B));
-} {
- SELECT * FROM example WHERE a=?
-} {
- (no new indexes)
- SEARCH example USING INDEX sqlite_autoindex_example_1 (A=?)
-}
-do_setup_rec_test $tn.17.2 {
- CREATE TABLE example (A INTEGER, B INTEGER, C INTEGER, PRIMARY KEY (A,B));
-} {
- SELECT * FROM example WHERE b=?
-} {
- CREATE INDEX example_idx_00000042 ON example(B);
- SEARCH example USING INDEX example_idx_00000042 (B=?)
-}
-do_setup_rec_test $tn.17.3 {
- CREATE TABLE example (A INTEGER, B INTEGER, C INTEGER, PRIMARY KEY (A,B));
-} {
- SELECT * FROM example WHERE a=? AND b=?
-} {
- (no new indexes)
- SEARCH example USING INDEX sqlite_autoindex_example_1 (A=? AND B=?)
-}
-do_setup_rec_test $tn.17.4 {
- CREATE TABLE example (A INTEGER, B INTEGER, C INTEGER, PRIMARY KEY (A,B));
-} {
- SELECT * FROM example WHERE a=? AND b>?
-} {
- (no new indexes)
- SEARCH example USING INDEX sqlite_autoindex_example_1 (A=? AND B>?)
-}
-do_setup_rec_test $tn.17.5 {
- CREATE TABLE example (A INTEGER, B INTEGER, C INTEGER, PRIMARY KEY (A,B));
-} {
- SELECT * FROM example WHERE a>? AND b=?
-} {
- CREATE INDEX example_idx_0000cb3f ON example(B, A);
- SEARCH example USING INDEX example_idx_0000cb3f (B=? AND A>?)
-}
-
-do_setup_rec_test $tn.18.0 {
- CREATE TABLE SomeObject (
- a INTEGER PRIMARY KEY,
- x TEXT GENERATED ALWAYS AS(HEX(a)) VIRTUAL
- );
-} {
- SELECT x FROM SomeObject;
-} {
- (no new indexes)
- SCAN SomeObject
-}
-do_setup_rec_test $tn.18.1 {
- CREATE TABLE SomeObject (
- a INTEGER PRIMARY KEY,
- x TEXT GENERATED ALWAYS AS(HEX(a)) VIRTUAL
- );
-} {
- SELECT * FROM SomeObject WHERE x=?;
-} {
- CREATE INDEX SomeObject_idx_00000078 ON SomeObject(x);
- SEARCH SomeObject USING COVERING INDEX SomeObject_idx_00000078 (x=?)
-}
-
-
-do_setup_rec_test $tn.19.0 {
- CREATE TABLE t1("index");
-} {
- SELECT * FROM t1 ORDER BY "index";
-} {
- CREATE INDEX t1_idx_01a7214e ON t1('index');
- SCAN t1 USING COVERING INDEX t1_idx_01a7214e
+ SCAN TABLE t1
}
}
proc do_candidates_test {tn sql res} {
@@ -416,37 +332,37 @@
uplevel [list do_test $tn [list set {} $candidates] $res]
}
reset_db
-do_execsql_test 5.0 {
+do_execsql_test 4.0 {
CREATE TABLE t1(a, b);
CREATE TABLE t2(c, d);
WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<100)
INSERT INTO t1 SELECT (i-1)/50, (i-1)/20 FROM s;
WITH s(i) AS ( VALUES(1) UNION ALL SELECT i+1 FROM s WHERE i<100)
INSERT INTO t2 SELECT (i-1)/20, (i-1)/5 FROM s;
}
-do_candidates_test 5.1 {
+do_candidates_test 4.1 {
SELECT * FROM t1,t2 WHERE (b=? OR a=?) AND (c=? OR d=?)
} {
CREATE INDEX t1_idx_00000062 ON t1(b); -- stat1: 100 20
CREATE INDEX t1_idx_00000061 ON t1(a); -- stat1: 100 50
CREATE INDEX t2_idx_00000063 ON t2(c); -- stat1: 100 20
CREATE INDEX t2_idx_00000064 ON t2(d); -- stat1: 100 5
}
-do_candidates_test 5.2 {
+do_candidates_test 4.2 {
SELECT * FROM t1,t2 WHERE a=? AND b=? AND c=? AND d=?
} {
CREATE INDEX t1_idx_000123a7 ON t1(a, b); -- stat1: 100 50 17
CREATE INDEX t2_idx_0001295b ON t2(c, d); -- stat1: 100 20 5
}
-do_execsql_test 5.3 {
+do_execsql_test 4.3 {
CREATE INDEX t1_idx_00000061 ON t1(a); -- stat1: 100 50
CREATE INDEX t1_idx_00000062 ON t1(b); -- stat1: 100 20
CREATE INDEX t1_idx_000123a7 ON t1(a, b); -- stat1: 100 50 16
CREATE INDEX t2_idx_00000063 ON t2(c); -- stat1: 100 20
@@ -461,7 +377,8 @@
t1 t1_idx_000123a7 {100 50 17}
t2 t2_idx_00000063 {100 20}
t2 t2_idx_00000064 {100 5}
t2 t2_idx_0001295b {100 20 5}
}
+
finish_test
Index: ext/expert/sqlite3expert.c
==================================================================
--- ext/expert/sqlite3expert.c
+++ ext/expert/sqlite3expert.c
@@ -12,27 +12,10 @@
*/
#include "sqlite3expert.h"
#include
#include
#include
-
-#if !defined(SQLITE_AMALGAMATION)
-#if defined(SQLITE_COVERAGE_TEST) || defined(SQLITE_MUTATION_TEST)
-# define SQLITE_OMIT_AUXILIARY_SAFETY_CHECKS 1
-#endif
-#if defined(SQLITE_OMIT_AUXILIARY_SAFETY_CHECKS)
-# define ALWAYS(X) (1)
-# define NEVER(X) (0)
-#elif !defined(NDEBUG)
-# define ALWAYS(X) ((X)?1:(assert(0),0))
-# define NEVER(X) ((X)?(assert(0),1):0)
-#else
-# define ALWAYS(X) (X)
-# define NEVER(X) (X)
-#endif
-#endif /* !defined(SQLITE_AMALGAMATION) */
-
#ifndef SQLITE_OMIT_VIRTUALTABLE
typedef sqlite3_int64 i64;
typedef sqlite3_uint64 u64;
@@ -695,37 +678,25 @@
IdxTable **ppOut, /* OUT: New object (if successful) */
char **pzErrmsg /* OUT: Error message (if not) */
){
sqlite3_stmt *p1 = 0;
int nCol = 0;
- int nTab;
- int nByte;
+ int nTab = STRLEN(zTab);
+ int nByte = sizeof(IdxTable) + nTab + 1;
IdxTable *pNew = 0;
int rc, rc2;
char *pCsr = 0;
- int nPk = 0;
- *ppOut = 0;
- if( zTab==0 ) return SQLITE_ERROR;
- nTab = STRLEN(zTab);
- nByte = sizeof(IdxTable) + nTab + 1;
- rc = idxPrintfPrepareStmt(db, &p1, pzErrmsg, "PRAGMA table_xinfo=%Q", zTab);
+ rc = idxPrintfPrepareStmt(db, &p1, pzErrmsg, "PRAGMA table_info=%Q", zTab);
while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(p1) ){
const char *zCol = (const char*)sqlite3_column_text(p1, 1);
- const char *zColSeq = 0;
- if( zCol==0 ){
- rc = SQLITE_ERROR;
- break;
- }
nByte += 1 + STRLEN(zCol);
rc = sqlite3_table_column_metadata(
- db, "main", zTab, zCol, 0, &zColSeq, 0, 0, 0
+ db, "main", zTab, zCol, 0, &zCol, 0, 0, 0
);
- if( zColSeq==0 ) zColSeq = "binary";
- nByte += 1 + STRLEN(zColSeq);
+ nByte += 1 + STRLEN(zCol);
nCol++;
- nPk += (sqlite3_column_int(p1, 5)>0);
}
rc2 = sqlite3_reset(p1);
if( rc==SQLITE_OK ) rc = rc2;
nByte += sizeof(IdxColumn) * nCol;
@@ -739,27 +710,23 @@
}
nCol = 0;
while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(p1) ){
const char *zCol = (const char*)sqlite3_column_text(p1, 1);
- const char *zColSeq = 0;
- int nCopy;
- if( zCol==0 ) continue;
- nCopy = STRLEN(zCol) + 1;
+ int nCopy = STRLEN(zCol) + 1;
pNew->aCol[nCol].zName = pCsr;
- pNew->aCol[nCol].iPk = (sqlite3_column_int(p1, 5)==1 && nPk==1);
+ pNew->aCol[nCol].iPk = sqlite3_column_int(p1, 5);
memcpy(pCsr, zCol, nCopy);
pCsr += nCopy;
rc = sqlite3_table_column_metadata(
- db, "main", zTab, zCol, 0, &zColSeq, 0, 0, 0
+ db, "main", zTab, zCol, 0, &zCol, 0, 0, 0
);
if( rc==SQLITE_OK ){
- if( zColSeq==0 ) zColSeq = "binary";
- nCopy = STRLEN(zColSeq) + 1;
+ nCopy = STRLEN(zCol) + 1;
pNew->aCol[nCol].zColl = pCsr;
- memcpy(pCsr, zColSeq, nCopy);
+ memcpy(pCsr, zCol, nCopy);
pCsr += nCopy;
}
nCol++;
}
@@ -766,13 +733,13 @@
idxFinalize(&rc, p1);
if( rc!=SQLITE_OK ){
sqlite3_free(pNew);
pNew = 0;
- }else if( ALWAYS(pNew!=0) ){
+ }else{
pNew->zName = pCsr;
- if( ALWAYS(pNew->zName!=0) ) memcpy(pNew->zName, zTab, nTab+1);
+ memcpy(pNew->zName, zTab, nTab+1);
}
*ppOut = pNew;
return rc;
}
@@ -818,14 +785,10 @@
** Return true if zId must be quoted in order to use it as an SQL
** identifier, or false otherwise.
*/
static int idxIdentifierRequiresQuotes(const char *zId){
int i;
- int nId = STRLEN(zId);
-
- if( sqlite3_keyword_check(zId, nId) ) return 1;
-
for(i=0; zId[i]; i++){
if( !(zId[i]=='_')
&& !(zId[i]>='0' && zId[i]<='9')
&& !(zId[i]>='a' && zId[i]<='z')
&& !(zId[i]>='A' && zId[i]<='Z')
@@ -898,11 +861,10 @@
while( rc==SQLITE_OK && sqlite3_step(pIdxList)==SQLITE_ROW ){
int bMatch = 1;
IdxConstraint *pT = pTail;
sqlite3_stmt *pInfo = 0;
const char *zIdx = (const char*)sqlite3_column_text(pIdxList, 1);
- if( zIdx==0 ) continue;
/* Zero the IdxConstraint.bFlag values in the pEq list */
for(pIter=pEq; pIter; pIter=pIter->pLink) pIter->bFlag = 0;
rc = idxPrintfPrepareStmt(dbm, &pInfo, 0, "PRAGMA index_xInfo=%Q", zIdx);
@@ -944,23 +906,10 @@
*pRc = rc;
return 0;
}
-/* Callback for sqlite3_exec() with query with leading count(*) column.
- * The first argument is expected to be an int*, referent to be incremented
- * if that leading column is not exactly '0'.
- */
-static int countNonzeros(void* pCount, int nc,
- char* azResults[], char* azColumns[]){
- (void)azColumns; /* Suppress unused parameter warning */
- if( nc>0 && (azResults[0][0]!='0' || azResults[0][1]!=0) ){
- *((int *)pCount) += 1;
- }
- return 0;
-}
-
static int idxCreateFromCons(
sqlite3expert *p,
IdxScan *pScan,
IdxConstraint *pEq,
IdxConstraint *pTail
@@ -983,57 +932,30 @@
}
if( rc==SQLITE_OK ){
/* Hash the list of columns to come up with a name for the index */
const char *zTable = pScan->pTab->zName;
- int quoteTable = idxIdentifierRequiresQuotes(zTable);
- char *zName = 0; /* Index name */
- int collisions = 0;
- do{
- int i;
- char *zFind;
- for(i=0; zCols[i]; i++){
- h += ((h<<3) + zCols[i]);
- }
- sqlite3_free(zName);
- zName = sqlite3_mprintf("%s_idx_%08x", zTable, h);
- if( zName==0 ) break;
- /* Is is unique among table, view and index names? */
- zFmt = "SELECT count(*) FROM sqlite_schema WHERE name=%Q"
- " AND type in ('index','table','view')";
- zFind = sqlite3_mprintf(zFmt, zName);
- i = 0;
- rc = sqlite3_exec(dbm, zFind, countNonzeros, &i, 0);
- assert(rc==SQLITE_OK);
- sqlite3_free(zFind);
- if( i==0 ){
- collisions = 0;
- break;
- }
- ++collisions;
- }while( collisions<50 && zName!=0 );
- if( collisions ){
- /* This return means "Gave up trying to find a unique index name." */
- rc = SQLITE_BUSY_TIMEOUT;
- }else if( zName==0 ){
+ char *zName; /* Index name */
+ int i;
+ for(i=0; zCols[i]; i++){
+ h += ((h<<3) + zCols[i]);
+ }
+ zName = sqlite3_mprintf("%s_idx_%08x", zTable, h);
+ if( zName==0 ){
rc = SQLITE_NOMEM;
}else{
- if( quoteTable ){
- zFmt = "CREATE INDEX \"%w\" ON \"%w\"(%s)";
+ if( idxIdentifierRequiresQuotes(zTable) ){
+ zFmt = "CREATE INDEX '%q' ON %Q(%s)";
}else{
zFmt = "CREATE INDEX %s ON %s(%s)";
}
zIdx = sqlite3_mprintf(zFmt, zName, zTable, zCols);
if( !zIdx ){
rc = SQLITE_NOMEM;
}else{
rc = sqlite3_exec(dbm, zIdx, 0, 0, p->pzErrmsg);
- if( rc!=SQLITE_OK ){
- rc = SQLITE_BUSY_TIMEOUT;
- }else{
- idxHashAdd(&rc, &p->hIdx, zName, zIdx);
- }
+ idxHashAdd(&rc, &p->hIdx, zName, zIdx);
}
sqlite3_free(zName);
sqlite3_free(zIdx);
}
}
@@ -1181,11 +1103,11 @@
/*
** This function is called after candidate indexes have been created. It
** runs all the queries to see which indexes they prefer, and populates
** IdxStatement.zIdx and IdxStatement.zEQP with the results.
*/
-static int idxFindIndexes(
+int idxFindIndexes(
sqlite3expert *p,
char **pzErr /* OUT: Error message (sqlite3_malloc) */
){
IdxStatement *pStmt;
sqlite3 *dbm = p->dbm;
@@ -1204,23 +1126,18 @@
while( rc==SQLITE_OK && sqlite3_step(pExplain)==SQLITE_ROW ){
/* int iId = sqlite3_column_int(pExplain, 0); */
/* int iParent = sqlite3_column_int(pExplain, 1); */
/* int iNotUsed = sqlite3_column_int(pExplain, 2); */
const char *zDetail = (const char*)sqlite3_column_text(pExplain, 3);
- int nDetail;
+ int nDetail = STRLEN(zDetail);
int i;
- if( !zDetail ) continue;
- nDetail = STRLEN(zDetail);
-
for(i=0; ipTab;
const char *zTab = pTab->zName;
const char *zSql =
- "SELECT 'CREATE TEMP' || substr(sql, 7) FROM sqlite_schema "
+ "SELECT 'CREATE TEMP' || substr(sql, 7) FROM sqlite_master "
"WHERE tbl_name = %Q AND type IN ('table', 'trigger') "
"ORDER BY type;";
sqlite3_stmt *pSelect = 0;
int rc = SQLITE_OK;
char *zWrite = 0;
@@ -1310,11 +1227,10 @@
/* Create the table and its triggers in the temp schema */
rc = idxPrintfPrepareStmt(p->db, &pSelect, pzErr, zSql, zTab, zTab);
while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pSelect) ){
const char *zCreate = (const char*)sqlite3_column_text(pSelect, 0);
- if( zCreate==0 ) continue;
rc = sqlite3_exec(p->dbv, zCreate, 0, 0, pzErr);
}
idxFinalize(&rc, pSelect);
/* Rename the table in the temp schema to zInt */
@@ -1400,26 +1316,25 @@
**
** 1) Add an entry to the p->pTable list, and
** 2) Create the equivalent virtual table in dbv.
*/
rc = idxPrepareStmt(p->db, &pSchema, pzErrmsg,
- "SELECT type, name, sql, 1 FROM sqlite_schema "
+ "SELECT type, name, sql, 1 FROM sqlite_master "
"WHERE type IN ('table','view') AND name NOT LIKE 'sqlite_%%' "
" UNION ALL "
- "SELECT type, name, sql, 2 FROM sqlite_schema "
+ "SELECT type, name, sql, 2 FROM sqlite_master "
"WHERE type = 'trigger'"
- " AND tbl_name IN(SELECT name FROM sqlite_schema WHERE type = 'view') "
+ " AND tbl_name IN(SELECT name FROM sqlite_master WHERE type = 'view') "
"ORDER BY 4, 1"
);
while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pSchema) ){
const char *zType = (const char*)sqlite3_column_text(pSchema, 0);
const char *zName = (const char*)sqlite3_column_text(pSchema, 1);
const char *zSql = (const char*)sqlite3_column_text(pSchema, 2);
- if( zType==0 || zName==0 ) continue;
if( zType[0]=='v' || zType[1]=='r' ){
- if( zSql ) rc = sqlite3_exec(p->dbv, zSql, 0, 0, pzErrmsg);
+ rc = sqlite3_exec(p->dbv, zSql, 0, 0, pzErrmsg);
}else{
IdxTable *pTab;
rc = idxGetTableInfo(p->db, zName, &pTab, pzErrmsg);
if( rc==SQLITE_OK ){
int i;
@@ -1552,11 +1467,10 @@
break;
case SQLITE_BLOB:
case SQLITE_TEXT: {
int nByte = sqlite3_value_bytes(argv[1]);
- const void *pData = 0;
if( nByte>pSlot->nByte ){
char *zNew = (char*)sqlite3_realloc(pSlot->z, nByte*2);
if( zNew==0 ){
sqlite3_result_error_nomem(pCtx);
return;
@@ -1564,15 +1478,13 @@
pSlot->nByte = nByte*2;
pSlot->z = zNew;
}
pSlot->n = nByte;
if( pSlot->eType==SQLITE_BLOB ){
- pData = sqlite3_value_blob(argv[1]);
- if( pData ) memcpy(pSlot->z, pData, nByte);
+ memcpy(pSlot->z, sqlite3_value_blob(argv[1]), nByte);
}else{
- pData = sqlite3_value_text(argv[1]);
- memcpy(pSlot->z, pData, nByte);
+ memcpy(pSlot->z, sqlite3_value_text(argv[1]), nByte);
}
break;
}
}
}
@@ -1579,11 +1491,11 @@
static int idxLargestIndex(sqlite3 *db, int *pnMax, char **pzErr){
int rc = SQLITE_OK;
const char *zMax =
"SELECT max(i.seqno) FROM "
- " sqlite_schema AS s, "
+ " sqlite_master AS s, "
" pragma_index_list(s.name) AS l, "
" pragma_index_info(l.name) AS i "
"WHERE s.type = 'table'";
sqlite3_stmt *pMax = 0;
@@ -1732,11 +1644,11 @@
sqlite3_stmt *pIndexXInfo = 0;
sqlite3_stmt *pWrite = 0;
const char *zAllIndex =
"SELECT s.rowid, s.name, l.name FROM "
- " sqlite_schema AS s, "
+ " sqlite_master AS s, "
" pragma_index_list(s.name) AS l "
"WHERE s.type = 'table'";
const char *zIndexXInfo =
"SELECT name, coll FROM pragma_index_xinfo(?) WHERE key";
const char *zWrite = "INSERT INTO sqlite_stat1 VALUES(?, ?, ?)";
@@ -1779,11 +1691,10 @@
while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pAllIndex) ){
i64 iRowid = sqlite3_column_int64(pAllIndex, 0);
const char *zTab = (const char*)sqlite3_column_text(pAllIndex, 1);
const char *zIdx = (const char*)sqlite3_column_text(pAllIndex, 2);
- if( zTab==0 || zIdx==0 ) continue;
if( p->iSample<100 && iPrev!=iRowid ){
samplectx.target = (double)p->iSample / 100.0;
samplectx.iTarget = p->iSample;
samplectx.nRow = 0.0;
samplectx.nRet = 0.0;
@@ -1801,19 +1712,17 @@
idxFinalize(&rc, pAllIndex);
idxFinalize(&rc, pIndexXInfo);
idxFinalize(&rc, pWrite);
- if( pCtx ){
- for(i=0; inSlot; i++){
- sqlite3_free(pCtx->aSlot[i].z);
- }
- sqlite3_free(pCtx);
+ for(i=0; inSlot; i++){
+ sqlite3_free(pCtx->aSlot[i].z);
}
+ sqlite3_free(pCtx);
if( rc==SQLITE_OK ){
- rc = sqlite3_exec(p->dbm, "ANALYZE sqlite_schema", 0, 0, 0);
+ rc = sqlite3_exec(p->dbm, "ANALYZE sqlite_master", 0, 0, 0);
}
sqlite3_exec(p->db, "DROP TABLE IF EXISTS temp."UNIQUE_TABLE_NAME,0,0,0);
return rc;
}
@@ -1846,18 +1755,18 @@
}
/* Copy the entire schema of database [db] into [dbm]. */
if( rc==SQLITE_OK ){
- sqlite3_stmt *pSql = 0;
+ sqlite3_stmt *pSql;
rc = idxPrintfPrepareStmt(pNew->db, &pSql, pzErrmsg,
- "SELECT sql FROM sqlite_schema WHERE name NOT LIKE 'sqlite_%%'"
+ "SELECT sql FROM sqlite_master WHERE name NOT LIKE 'sqlite_%%'"
" AND sql NOT LIKE 'CREATE VIRTUAL %%'"
);
while( rc==SQLITE_OK && SQLITE_ROW==sqlite3_step(pSql) ){
const char *zSql = (const char*)sqlite3_column_text(pSql, 0);
- if( zSql ) rc = sqlite3_exec(pNew->dbm, zSql, 0, 0, pzErrmsg);
+ rc = sqlite3_exec(pNew->dbm, zSql, 0, 0, pzErrmsg);
}
idxFinalize(&rc, pSql);
}
/* Create the vtab schema */
@@ -1959,14 +1868,10 @@
rc = idxProcessTriggers(p, pzErr);
/* Create candidate indexes within the in-memory database file */
if( rc==SQLITE_OK ){
rc = idxCreateCandidates(p);
- }else if ( rc==SQLITE_BUSY_TIMEOUT ){
- if( pzErr )
- *pzErr = sqlite3_mprintf("Cannot find a unique index name to propose.");
- return rc;
}
/* Generate the stat1 data */
if( rc==SQLITE_OK ){
rc = idxPopulateStat1(p, pzErr);
@@ -2043,6 +1948,6 @@
sqlite3_free(p->zCandidates);
sqlite3_free(p);
}
}
-#endif /* ifndef SQLITE_OMIT_VIRTUALTABLE */
+#endif /* ifndef SQLITE_OMIT_VIRTUAL_TABLE */
Index: ext/expert/sqlite3expert.h
==================================================================
--- ext/expert/sqlite3expert.h
+++ ext/expert/sqlite3expert.h
@@ -8,12 +8,12 @@
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
*/
-#if !defined(SQLITEEXPERT_H)
-#define SQLITEEXPERT_H 1
+
+
#include "sqlite3.h"
typedef struct sqlite3expert sqlite3expert;
/*
@@ -163,6 +163,6 @@
** should be one call to this function for each successful call to
** sqlite3-expert_new().
*/
void sqlite3_expert_destroy(sqlite3expert*);
-#endif /* !defined(SQLITEEXPERT_H) */
+
ADDED ext/fts1/README.txt
Index: ext/fts1/README.txt
==================================================================
--- /dev/null
+++ ext/fts1/README.txt
@@ -0,0 +1,2 @@
+This folder contains source code to the first full-text search
+extension for SQLite.
ADDED ext/fts1/ft_hash.c
Index: ext/fts1/ft_hash.c
==================================================================
--- /dev/null
+++ ext/fts1/ft_hash.c
@@ -0,0 +1,404 @@
+/*
+** 2001 September 22
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** This is the implementation of generic hash-tables used in SQLite.
+** We've modified it slightly to serve as a standalone hash table
+** implementation for the full-text indexing module.
+*/
+#include
+#include
+#include
+
+#include "ft_hash.h"
+
+void *malloc_and_zero(int n){
+ void *p = malloc(n);
+ if( p ){
+ memset(p, 0, n);
+ }
+ return p;
+}
+
+/* Turn bulk memory into a hash table object by initializing the
+** fields of the Hash structure.
+**
+** "pNew" is a pointer to the hash table that is to be initialized.
+** keyClass is one of the constants HASH_INT, HASH_POINTER,
+** HASH_BINARY, or HASH_STRING. The value of keyClass
+** determines what kind of key the hash table will use. "copyKey" is
+** true if the hash table should make its own private copy of keys and
+** false if it should just use the supplied pointer. CopyKey only makes
+** sense for HASH_STRING and HASH_BINARY and is ignored
+** for other key classes.
+*/
+void HashInit(Hash *pNew, int keyClass, int copyKey){
+ assert( pNew!=0 );
+ assert( keyClass>=HASH_STRING && keyClass<=HASH_BINARY );
+ pNew->keyClass = keyClass;
+#if 0
+ if( keyClass==HASH_POINTER || keyClass==HASH_INT ) copyKey = 0;
+#endif
+ pNew->copyKey = copyKey;
+ pNew->first = 0;
+ pNew->count = 0;
+ pNew->htsize = 0;
+ pNew->ht = 0;
+ pNew->xMalloc = malloc_and_zero;
+ pNew->xFree = free;
+}
+
+/* Remove all entries from a hash table. Reclaim all memory.
+** Call this routine to delete a hash table or to reset a hash table
+** to the empty state.
+*/
+void HashClear(Hash *pH){
+ HashElem *elem; /* For looping over all elements of the table */
+
+ assert( pH!=0 );
+ elem = pH->first;
+ pH->first = 0;
+ if( pH->ht ) pH->xFree(pH->ht);
+ pH->ht = 0;
+ pH->htsize = 0;
+ while( elem ){
+ HashElem *next_elem = elem->next;
+ if( pH->copyKey && elem->pKey ){
+ pH->xFree(elem->pKey);
+ }
+ pH->xFree(elem);
+ elem = next_elem;
+ }
+ pH->count = 0;
+}
+
+#if 0 /* NOT USED */
+/*
+** Hash and comparison functions when the mode is HASH_INT
+*/
+static int intHash(const void *pKey, int nKey){
+ return nKey ^ (nKey<<8) ^ (nKey>>8);
+}
+static int intCompare(const void *pKey1, int n1, const void *pKey2, int n2){
+ return n2 - n1;
+}
+#endif
+
+#if 0 /* NOT USED */
+/*
+** Hash and comparison functions when the mode is HASH_POINTER
+*/
+static int ptrHash(const void *pKey, int nKey){
+ uptr x = Addr(pKey);
+ return x ^ (x<<8) ^ (x>>8);
+}
+static int ptrCompare(const void *pKey1, int n1, const void *pKey2, int n2){
+ if( pKey1==pKey2 ) return 0;
+ if( pKey1 0 ){
+ h = (h<<3) ^ h ^ *z++;
+ nKey--;
+ }
+ return h & 0x7fffffff;
+}
+static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
+ if( n1!=n2 ) return 1;
+ return strncmp((const char*)pKey1,(const char*)pKey2,n1);
+}
+
+/*
+** Hash and comparison functions when the mode is HASH_BINARY
+*/
+static int binHash(const void *pKey, int nKey){
+ int h = 0;
+ const char *z = (const char *)pKey;
+ while( nKey-- > 0 ){
+ h = (h<<3) ^ h ^ *(z++);
+ }
+ return h & 0x7fffffff;
+}
+static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
+ if( n1!=n2 ) return 1;
+ return memcmp(pKey1,pKey2,n1);
+}
+
+/*
+** Return a pointer to the appropriate hash function given the key class.
+**
+** The C syntax in this function definition may be unfamiliar to some
+** programmers, so we provide the following additional explanation:
+**
+** The name of the function is "hashFunction". The function takes a
+** single parameter "keyClass". The return value of hashFunction()
+** is a pointer to another function. Specifically, the return value
+** of hashFunction() is a pointer to a function that takes two parameters
+** with types "const void*" and "int" and returns an "int".
+*/
+static int (*hashFunction(int keyClass))(const void*,int){
+#if 0 /* HASH_INT and HASH_POINTER are never used */
+ switch( keyClass ){
+ case HASH_INT: return &intHash;
+ case HASH_POINTER: return &ptrHash;
+ case HASH_STRING: return &strHash;
+ case HASH_BINARY: return &binHash;;
+ default: break;
+ }
+ return 0;
+#else
+ if( keyClass==HASH_STRING ){
+ return &strHash;
+ }else{
+ assert( keyClass==HASH_BINARY );
+ return &binHash;
+ }
+#endif
+}
+
+/*
+** Return a pointer to the appropriate comparison function given the key class.
+**
+** For help in interpreting the obscure C code in the function definition,
+** see the header comment on the previous function.
+*/
+static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
+#if 0 /* HASH_INT and HASH_POINTER are never used */
+ switch( keyClass ){
+ case HASH_INT: return &intCompare;
+ case HASH_POINTER: return &ptrCompare;
+ case HASH_STRING: return &strCompare;
+ case HASH_BINARY: return &binCompare;
+ default: break;
+ }
+ return 0;
+#else
+ if( keyClass==HASH_STRING ){
+ return &strCompare;
+ }else{
+ assert( keyClass==HASH_BINARY );
+ return &binCompare;
+ }
+#endif
+}
+
+/* Link an element into the hash table
+*/
+static void insertElement(
+ Hash *pH, /* The complete hash table */
+ struct _ht *pEntry, /* The entry into which pNew is inserted */
+ HashElem *pNew /* The element to be inserted */
+){
+ HashElem *pHead; /* First element already in pEntry */
+ pHead = pEntry->chain;
+ if( pHead ){
+ pNew->next = pHead;
+ pNew->prev = pHead->prev;
+ if( pHead->prev ){ pHead->prev->next = pNew; }
+ else { pH->first = pNew; }
+ pHead->prev = pNew;
+ }else{
+ pNew->next = pH->first;
+ if( pH->first ){ pH->first->prev = pNew; }
+ pNew->prev = 0;
+ pH->first = pNew;
+ }
+ pEntry->count++;
+ pEntry->chain = pNew;
+}
+
+
+/* Resize the hash table so that it contains "new_size" buckets.
+** "new_size" must be a power of 2. The hash table might fail
+** to resize if pH->xMalloc() fails.
+*/
+static void rehash(Hash *pH, int new_size){
+ struct _ht *new_ht; /* The new hash table */
+ HashElem *elem, *next_elem; /* For looping over existing elements */
+ int (*xHash)(const void*,int); /* The hash function */
+
+ assert( (new_size & (new_size-1))==0 );
+ new_ht = (struct _ht *)pH->xMalloc( new_size*sizeof(struct _ht) );
+ if( new_ht==0 ) return;
+ if( pH->ht ) pH->xFree(pH->ht);
+ pH->ht = new_ht;
+ pH->htsize = new_size;
+ xHash = hashFunction(pH->keyClass);
+ for(elem=pH->first, pH->first=0; elem; elem = next_elem){
+ int h = (*xHash)(elem->pKey, elem->nKey) & (new_size-1);
+ next_elem = elem->next;
+ insertElement(pH, &new_ht[h], elem);
+ }
+}
+
+/* This function (for internal use only) locates an element in a
+** hash table that matches the given key. The hash for this key has
+** already been computed and is passed as the 4th parameter.
+*/
+static HashElem *findElementGivenHash(
+ const Hash *pH, /* The pH to be searched */
+ const void *pKey, /* The key we are searching for */
+ int nKey,
+ int h /* The hash for this key. */
+){
+ HashElem *elem; /* Used to loop thru the element list */
+ int count; /* Number of elements left to test */
+ int (*xCompare)(const void*,int,const void*,int); /* comparison function */
+
+ if( pH->ht ){
+ struct _ht *pEntry = &pH->ht[h];
+ elem = pEntry->chain;
+ count = pEntry->count;
+ xCompare = compareFunction(pH->keyClass);
+ while( count-- && elem ){
+ if( (*xCompare)(elem->pKey,elem->nKey,pKey,nKey)==0 ){
+ return elem;
+ }
+ elem = elem->next;
+ }
+ }
+ return 0;
+}
+
+/* Remove a single entry from the hash table given a pointer to that
+** element and a hash on the element's key.
+*/
+static void removeElementGivenHash(
+ Hash *pH, /* The pH containing "elem" */
+ HashElem* elem, /* The element to be removed from the pH */
+ int h /* Hash value for the element */
+){
+ struct _ht *pEntry;
+ if( elem->prev ){
+ elem->prev->next = elem->next;
+ }else{
+ pH->first = elem->next;
+ }
+ if( elem->next ){
+ elem->next->prev = elem->prev;
+ }
+ pEntry = &pH->ht[h];
+ if( pEntry->chain==elem ){
+ pEntry->chain = elem->next;
+ }
+ pEntry->count--;
+ if( pEntry->count<=0 ){
+ pEntry->chain = 0;
+ }
+ if( pH->copyKey && elem->pKey ){
+ pH->xFree(elem->pKey);
+ }
+ pH->xFree( elem );
+ pH->count--;
+ if( pH->count<=0 ){
+ assert( pH->first==0 );
+ assert( pH->count==0 );
+ HashClear(pH);
+ }
+}
+
+/* Attempt to locate an element of the hash table pH with a key
+** that matches pKey,nKey. Return the data for this element if it is
+** found, or NULL if there is no match.
+*/
+void *HashFind(const Hash *pH, const void *pKey, int nKey){
+ int h; /* A hash on key */
+ HashElem *elem; /* The element that matches key */
+ int (*xHash)(const void*,int); /* The hash function */
+
+ if( pH==0 || pH->ht==0 ) return 0;
+ xHash = hashFunction(pH->keyClass);
+ assert( xHash!=0 );
+ h = (*xHash)(pKey,nKey);
+ assert( (pH->htsize & (pH->htsize-1))==0 );
+ elem = findElementGivenHash(pH,pKey,nKey, h & (pH->htsize-1));
+ return elem ? elem->data : 0;
+}
+
+/* Insert an element into the hash table pH. The key is pKey,nKey
+** and the data is "data".
+**
+** If no element exists with a matching key, then a new
+** element is created. A copy of the key is made if the copyKey
+** flag is set. NULL is returned.
+**
+** If another element already exists with the same key, then the
+** new data replaces the old data and the old data is returned.
+** The key is not copied in this instance. If a malloc fails, then
+** the new data is returned and the hash table is unchanged.
+**
+** If the "data" parameter to this function is NULL, then the
+** element corresponding to "key" is removed from the hash table.
+*/
+void *HashInsert(Hash *pH, const void *pKey, int nKey, void *data){
+ int hraw; /* Raw hash value of the key */
+ int h; /* the hash of the key modulo hash table size */
+ HashElem *elem; /* Used to loop thru the element list */
+ HashElem *new_elem; /* New element added to the pH */
+ int (*xHash)(const void*,int); /* The hash function */
+
+ assert( pH!=0 );
+ xHash = hashFunction(pH->keyClass);
+ assert( xHash!=0 );
+ hraw = (*xHash)(pKey, nKey);
+ assert( (pH->htsize & (pH->htsize-1))==0 );
+ h = hraw & (pH->htsize-1);
+ elem = findElementGivenHash(pH,pKey,nKey,h);
+ if( elem ){
+ void *old_data = elem->data;
+ if( data==0 ){
+ removeElementGivenHash(pH,elem,h);
+ }else{
+ elem->data = data;
+ }
+ return old_data;
+ }
+ if( data==0 ) return 0;
+ new_elem = (HashElem*)pH->xMalloc( sizeof(HashElem) );
+ if( new_elem==0 ) return data;
+ if( pH->copyKey && pKey!=0 ){
+ new_elem->pKey = pH->xMalloc( nKey );
+ if( new_elem->pKey==0 ){
+ pH->xFree(new_elem);
+ return data;
+ }
+ memcpy((void*)new_elem->pKey, pKey, nKey);
+ }else{
+ new_elem->pKey = (void*)pKey;
+ }
+ new_elem->nKey = nKey;
+ pH->count++;
+ if( pH->htsize==0 ){
+ rehash(pH,8);
+ if( pH->htsize==0 ){
+ pH->count = 0;
+ pH->xFree(new_elem);
+ return data;
+ }
+ }
+ if( pH->count > pH->htsize ){
+ rehash(pH,pH->htsize*2);
+ }
+ assert( pH->htsize>0 );
+ assert( (pH->htsize & (pH->htsize-1))==0 );
+ h = hraw & (pH->htsize-1);
+ insertElement(pH, &pH->ht[h], new_elem);
+ new_elem->data = data;
+ return 0;
+}
ADDED ext/fts1/ft_hash.h
Index: ext/fts1/ft_hash.h
==================================================================
--- /dev/null
+++ ext/fts1/ft_hash.h
@@ -0,0 +1,111 @@
+/*
+** 2001 September 22
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** This is the header file for the generic hash-table implementation
+** used in SQLite. We've modified it slightly to serve as a standalone
+** hash table implementation for the full-text indexing module.
+**
+*/
+#ifndef _HASH_H_
+#define _HASH_H_
+
+/* Forward declarations of structures. */
+typedef struct Hash Hash;
+typedef struct HashElem HashElem;
+
+/* A complete hash table is an instance of the following structure.
+** The internals of this structure are intended to be opaque -- client
+** code should not attempt to access or modify the fields of this structure
+** directly. Change this structure only by using the routines below.
+** However, many of the "procedures" and "functions" for modifying and
+** accessing this structure are really macros, so we can't really make
+** this structure opaque.
+*/
+struct Hash {
+ char keyClass; /* HASH_INT, _POINTER, _STRING, _BINARY */
+ char copyKey; /* True if copy of key made on insert */
+ int count; /* Number of entries in this table */
+ HashElem *first; /* The first element of the list of all entries */
+ void *(*xMalloc)(int); /* malloc() function to use */
+ void (*xFree)(void *); /* free() function to use */
+ int htsize; /* Number of buckets in the hash table (0 or a power of 2) */
+ struct _ht { /* the hash table */
+ int count; /* Number of entries with this hash */
+ HashElem *chain; /* Pointer to first entry with this hash */
+ } *ht;
+};
+
+/* Each element in the hash table is an instance of the following
+** structure. All elements are stored on a single doubly-linked list.
+**
+** Again, this structure is intended to be opaque, but it can't really
+** be opaque because it is used by macros.
+*/
+struct HashElem {
+ HashElem *next, *prev; /* Next and previous elements in the table */
+ void *data; /* Data associated with this element */
+ void *pKey; int nKey; /* Key associated with this element */
+};
+
+/*
+** There are 4 different modes of operation for a hash table:
+**
+** HASH_INT nKey is used as the key and pKey is ignored.
+**
+** HASH_POINTER pKey is used as the key and nKey is ignored.
+**
+** HASH_STRING pKey points to a string that is nKey bytes long
+** (including the null-terminator, if any). Case
+** is respected in comparisons.
+**
+** HASH_BINARY pKey points to binary data nKey bytes long.
+** memcmp() is used to compare keys.
+**
+** A copy of the key is made for HASH_STRING and HASH_BINARY
+** if the copyKey parameter to HashInit is 1.
+*/
+/* #define HASH_INT 1 // NOT USED */
+/* #define HASH_POINTER 2 // NOT USED */
+#define HASH_STRING 3
+#define HASH_BINARY 4
+
+/*
+** Access routines. To delete, insert a NULL pointer.
+*/
+void HashInit(Hash*, int keytype, int copyKey);
+void *HashInsert(Hash*, const void *pKey, int nKey, void *pData);
+void *HashFind(const Hash*, const void *pKey, int nKey);
+void HashClear(Hash*);
+
+/*
+** Macros for looping over all elements of a hash table. The idiom is
+** like this:
+**
+** Hash h;
+** HashElem *p;
+** ...
+** for(p=HashFirst(&h); p; p=HashNext(p)){
+** SomeStructure *pData = HashData(p);
+** // do something with pData
+** }
+*/
+#define HashFirst(H) ((H)->first)
+#define HashNext(E) ((E)->next)
+#define HashData(E) ((E)->data)
+#define HashKey(E) ((E)->pKey)
+#define HashKeysize(E) ((E)->nKey)
+
+/*
+** Number of entries in a hash table
+*/
+#define HashCount(H) ((H)->count)
+
+#endif /* _HASH_H_ */
ADDED ext/fts1/fts1.c
Index: ext/fts1/fts1.c
==================================================================
--- /dev/null
+++ ext/fts1/fts1.c
@@ -0,0 +1,3348 @@
+/* fts1 has a design flaw which can lead to database corruption (see
+** below). It is recommended not to use it any longer, instead use
+** fts3 (or higher). If you believe that your use of fts1 is safe,
+** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
+*/
+#if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \
+ && !defined(SQLITE_ENABLE_BROKEN_FTS1)
+#error fts1 has a design flaw and has been deprecated.
+#endif
+/* The flaw is that fts1 uses the content table's unaliased rowid as
+** the unique docid. fts1 embeds the rowid in the index it builds,
+** and expects the rowid to not change. The SQLite VACUUM operation
+** will renumber such rowids, thereby breaking fts1. If you are using
+** fts1 in a system which has disabled VACUUM, then you can continue
+** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
+** VACUUM, though systems using auto_vacuum are unlikely to invoke
+** VACUUM.
+**
+** fts1 should be safe even across VACUUM if you only insert documents
+** and never delete.
+*/
+
+/* The author disclaims copyright to this source code.
+ *
+ * This is an SQLite module implementing full-text search.
+ */
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS1 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS1 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
+*/
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
+
+#if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
+# define SQLITE_CORE 1
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "fts1.h"
+#include "fts1_hash.h"
+#include "fts1_tokenizer.h"
+#include "sqlite3.h"
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT1
+
+
+#if 0
+# define TRACE(A) printf A; fflush(stdout)
+#else
+# define TRACE(A)
+#endif
+
+/* utility functions */
+
+typedef struct StringBuffer {
+ int len; /* length, not including null terminator */
+ int alloced; /* Space allocated for s[] */
+ char *s; /* Content of the string */
+} StringBuffer;
+
+/* Reset sb to an empty string backed by a freshly allocated 100-byte
+** buffer.  Whatever sb->s referred to beforehand is not freed here.
+** NOTE(review): the malloc() result is written through without a NULL
+** check, so an allocation failure dereferences a NULL pointer.
+*/
+static void initStringBuffer(StringBuffer *sb){
+ sb->len = 0;
+ sb->alloced = 100;
+ sb->s = malloc(100);
+ sb->s[0] = '\0';
+}
+
+/* Append the first nFrom bytes of zFrom to sb, growing the buffer when
+** required and keeping the content null-terminated.
+**
+** If the buffer cannot be grown, the old buffer is released, sb is
+** reset to an empty string by initStringBuffer() and the append is
+** dropped.
+*/
+static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
+  if( sb->len + nFrom >= sb->alloced ){
+    char *zNew;
+    sb->alloced = sb->len + nFrom + 100;
+    zNew = realloc(sb->s, sb->alloced+1);
+    if( zNew==0 ){
+      /* realloc() leaves the old block allocated on failure; free it
+      ** before initStringBuffer() overwrites sb->s, otherwise it leaks. */
+      free(sb->s);
+      initStringBuffer(sb);
+      return;
+    }
+    sb->s = zNew;
+  }
+  memcpy(sb->s + sb->len, zFrom, nFrom);
+  sb->len += nFrom;
+  sb->s[sb->len] = 0;
+}
+static void append(StringBuffer *sb, const char *zFrom){
+ nappend(sb, zFrom, strlen(zFrom));
+}
+
+/* We encode variable-length integers in little-endian order using seven bits
+ * per byte as follows:
+**
+** KEY:
+** A = 0xxxxxxx 7 bits of data and one flag bit
+** B = 1xxxxxxx 7 bits of data and one flag bit
+**
+** 7 bits - A
+** 14 bits - BA
+** 21 bits - BBA
+** and so on.
+*/
+
+/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
+#define VARINT_MAX 10
+
+/* Encode v as a variable-length integer at p[0], least-significant
+ * seven bits first.  Every byte except the last has its high bit set.
+ * Between 1 and VARINT_MAX bytes are written; the count is returned. */
+static int putVarint(char *p, sqlite_int64 v){
+  unsigned char *pOut = (unsigned char *) p;
+  sqlite_uint64 rest = v;
+  int n = 0;
+  for(;;){
+    unsigned char b = (unsigned char) (rest & 0x7f);
+    rest >>= 7;
+    if( rest==0 ){
+      pOut[n++] = b;            /* final byte: high bit clear */
+      break;
+    }
+    pOut[n++] = (unsigned char) (b | 0x80);
+  }
+  assert( n <= VARINT_MAX );
+  return n;
+}
+
+/* Read a 64-bit variable-length integer from memory starting at p[0].
+ * Return the number of bytes read, or 0 on error.
+ * The value is stored in *v.  On error *v is set to 0 so that callers
+ * which ignore the return value never read an indeterminate value. */
+static int getVarint(const char *p, sqlite_int64 *v){
+  const unsigned char *q = (const unsigned char *) p;
+  sqlite_uint64 x = 0, y = 1;
+  while( (*q & 0x80) == 0x80 ){
+    x += y * (*q++ & 0x7f);
+    y <<= 7;
+    if( q - (unsigned char *)p >= VARINT_MAX ){  /* bad data */
+      assert( 0 );
+      *v = 0;
+      return 0;
+    }
+  }
+  x += y * (*q++);
+  *v = (sqlite_int64) x;
+  return (int) (q - (unsigned char *)p);
+}
+
+/* Read a varint that is expected to fit in an int.  The value is stored
+ * in *pi and the byte count reported by getVarint() is returned. */
+static int getVarint32(const char *p, int *pi){
+  sqlite_int64 v64;
+  int nByte;
+
+  nByte = getVarint(p, &v64);
+  *pi = (int) v64;
+  assert( *pi==v64 );
+  return nByte;
+}
+
+/*** Document lists ***
+ *
+ * A document list holds a sorted list of varint-encoded document IDs.
+ *
+ * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
+ *
+ * array {
+ * varint docid;
+ * array {
+ * varint position; (delta from previous position plus POS_BASE)
+ * varint startOffset; (delta from previous startOffset)
+ * varint endOffset; (delta from startOffset)
+ * }
+ * }
+ *
+ * Here, array { X } means zero or more occurrences of X, adjacent in memory.
+ *
+ * A position list may hold positions for text in multiple columns. A position
+ * POS_COLUMN is followed by a varint containing the index of the column for
+ * following positions in the list. Any positions appearing before any
+ * occurrences of POS_COLUMN are for column 0.
+ *
+ * A doclist with type DL_POSITIONS is like the above, but holds only docids
+ * and positions without offset information.
+ *
+ * A doclist with type DL_DOCIDS is like the above, but holds only docids
+ * without positions or offset information.
+ *
+ * On disk, every document list has positions and offsets, so we don't bother
+ * to serialize a doclist's type.
+ *
+ * We don't yet delta-encode document IDs; doing so will probably be a
+ * modest win.
+ *
+ * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
+ * After the first offset, estimate the next offset by using the
+ * current token position and the previous token position and offset,
+ * offset to handle some variance. So the estimate would be
+ * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
+ * as normal. Offsets more than 64 chars from the estimate are
+ * encoded as the delta to the previous start offset + 128. An
+ * additional tiny increment can be gained by using the end offset of
+ * the previous token to make the estimate a tiny bit more precise.
+*/
+
+/* It is not safe to call isspace(), tolower(), or isalnum() on
+** hi-bit-set characters. This is the same solution used in the
+** tokenizer.
+*/
+/* TODO(shess) The snippet-generation code should be using the
+** tokenizer-generated tokens rather than doing its own local
+** tokenization.
+*/
+/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
+static int safe_isspace(char c){
+  if( (c&0x80)!=0 ) return 0;   /* hi-bit-set bytes are never whitespace */
+  return isspace((unsigned char)c);
+}
+static int safe_tolower(char c){
+  if( (c&0x80)!=0 ) return c;   /* hi-bit-set bytes pass through unchanged */
+  return tolower((unsigned char)c);
+}
+static int safe_isalnum(char c){
+  if( (c&0x80)!=0 ) return 0;   /* hi-bit-set bytes are never alphanumeric */
+  return isalnum((unsigned char)c);
+}
+
+typedef enum DocListType {
+ DL_DOCIDS, /* docids only */
+ DL_POSITIONS, /* docids + positions */
+ DL_POSITIONS_OFFSETS /* docids + positions + offsets */
+} DocListType;
+
+/*
+** By default, only positions and not offsets are stored in the doclists.
+** To change this so that offsets are stored too, compile with
+**
+** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
+**
+*/
+#ifndef DL_DEFAULT
+# define DL_DEFAULT DL_POSITIONS
+#endif
+
+typedef struct DocList {
+ char *pData; /* encoded doclist bytes, from malloc()/realloc() */
+ int nData; /* number of bytes used in pData[] */
+ DocListType iType; /* which fields each entry carries */
+ int iLastColumn; /* the last column written */
+ int iLastPos; /* the last position written */
+ int iLastOffset; /* the last start offset written */
+} DocList;
+
+enum {
+ POS_END = 0, /* end of this position list */
+ POS_COLUMN, /* followed by new column number */
+ POS_BASE
+};
+
+/* Initialize a new DocList to hold the given data.
+** When nData>0 the nData bytes at pData are copied into a fresh malloc()
+** buffer; otherwise the list starts out empty with a NULL buffer.  The
+** "last written" column, position and offset are all reset to zero.
+** NOTE(review): the malloc() result is handed to memcpy() unchecked.
+*/
+static void docListInit(DocList *d, DocListType iType,
+ const char *pData, int nData){
+ d->nData = nData;
+ if( nData>0 ){
+ d->pData = malloc(nData);
+ memcpy(d->pData, pData, nData);
+ } else {
+ d->pData = NULL;
+ }
+ d->iType = iType;
+ d->iLastColumn = 0;
+ d->iLastPos = d->iLastOffset = 0;
+}
+
+/* Create a new dynamically-allocated, empty DocList of the given type.
+** Returns NULL if the allocation fails.  Release the result with
+** docListDelete(). */
+static DocList *docListNew(DocListType iType){
+  DocList *d = (DocList *) malloc(sizeof(DocList));
+  if( d==NULL ) return NULL;   /* do not initialize through a NULL pointer */
+  docListInit(d, iType, 0, 0);
+  return d;
+}
+
+/* Free the data buffer owned by d.  The DocList structure itself is not
+** freed.  Unless NDEBUG is defined, the structure is overwritten with
+** 0x55 bytes afterwards.
+*/
+static void docListDestroy(DocList *d){
+ free(d->pData);
+#ifndef NDEBUG
+ memset(d, 0x55, sizeof(*d));
+#endif
+}
+
+static void docListDelete(DocList *d){
+ docListDestroy(d);
+ free(d);
+}
+
+static char *docListEnd(DocList *d){
+ return d->pData + d->nData;
+}
+
+/* Append a varint to a DocList's data.
+** The value is encoded into a local scratch buffer first, then the data
+** buffer is grown by exactly the encoded size and the bytes are copied in.
+** NOTE(review): the realloc() result is assigned straight to d->pData and
+** used without a NULL check.
+*/
+static void appendVarint(DocList *d, sqlite_int64 i){
+ char c[VARINT_MAX];
+ int n = putVarint(c, i);
+ d->pData = realloc(d->pData, d->nData + n);
+ memcpy(d->pData + d->nData, c, n);
+ d->nData += n;
+}
+
+/* Start a new document entry for iDocid.  For doclists that carry
+** positions, an empty (terminator-only) position list is written and the
+** "last written" state is reset for the new document. */
+static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
+  appendVarint(d, iDocid);
+  if( d->iType<DL_POSITIONS ) return;   /* docid-only list: nothing more */
+  appendVarint(d, POS_END);             /* initially empty position list */
+  d->iLastColumn = 0;
+  d->iLastOffset = 0;
+  d->iLastPos = 0;
+}
+
+/* helper function for docListAddPos and docListAddPosOffset
+**
+** Drops the trailing POS_END byte of the current position list and then
+** appends the position, preceded by a POS_COLUMN marker and the column
+** number when the column changes.  The caller must append a new POS_END
+** terminator afterwards.
+*/
+static void addPos(DocList *d, int iColumn, int iPos){
+ assert( d->nData>0 );
+ --d->nData; /* remove previous terminator */
+ if( iColumn!=d->iLastColumn ){
+ assert( iColumn>d->iLastColumn );
+ appendVarint(d, POS_COLUMN);
+ appendVarint(d, iColumn);
+ d->iLastColumn = iColumn;
+ d->iLastPos = d->iLastOffset = 0; /* deltas restart in a new column */
+ }
+ assert( iPos>=d->iLastPos );
+ appendVarint(d, iPos-d->iLastPos+POS_BASE); /* delta, biased by POS_BASE */
+ d->iLastPos = iPos;
+}
+
+/* Add a position to the last position list in a doclist. */
+static void docListAddPos(DocList *d, int iColumn, int iPos){
+ assert( d->iType==DL_POSITIONS );
+ addPos(d, iColumn, iPos);
+ appendVarint(d, POS_END); /* add new terminator */
+}
+
+/*
+** Add a position and starting and ending offsets to a doclist.
+**
+** If the doclist is setup to handle only positions, then insert
+** the position only and ignore the offsets.
+*/
+static void docListAddPosOffset(
+ DocList *d, /* Doclist under construction */
+ int iColumn, /* Column the inserted term is part of */
+ int iPos, /* Position of the inserted term */
+ int iStartOffset, /* Starting offset of inserted term */
+ int iEndOffset /* Ending offset of inserted term */
+){
+ assert( d->iType>=DL_POSITIONS );
+ addPos(d, iColumn, iPos);
+ if( d->iType==DL_POSITIONS_OFFSETS ){
+ assert( iStartOffset>=d->iLastOffset );
+ appendVarint(d, iStartOffset-d->iLastOffset);
+ d->iLastOffset = iStartOffset;
+ assert( iEndOffset>=iStartOffset );
+ appendVarint(d, iEndOffset-iStartOffset);
+ }
+ appendVarint(d, POS_END); /* add new terminator */
+}
+
+/*
+** A DocListReader object is a cursor into a doclist. Initialize
+** the cursor to the beginning of the doclist by calling readerInit().
+** Then use routines
+**
+** peekDocid()
+** readDocid()
+** readPosition()
+** skipPositionList()
+** and so forth...
+**
+** to read information out of the doclist. When we reach the end
+** of the doclist, atEnd() returns TRUE.
+*/
+typedef struct DocListReader {
+ DocList *pDoclist; /* The document list we are stepping through */
+ char *p; /* Pointer to next unread byte in the doclist */
+ int iLastColumn;
+ int iLastPos; /* the last position read, or -1 when not in a position list */
+} DocListReader;
+
+/*
+** Initialize the DocListReader r to point to the beginning of pDoclist.
+** pDoclist may be NULL, in which case the reader is immediately at end
+** and its read pointer is set to NULL rather than left indeterminate.
+*/
+static void readerInit(DocListReader *r, DocList *pDoclist){
+  r->pDoclist = pDoclist;
+  r->p = pDoclist!=NULL ? pDoclist->pData : NULL;
+  r->iLastColumn = -1;
+  r->iLastPos = -1;
+}
+
+/*
+** Return TRUE if we have reached the end of pReader and there is
+** nothing else left to read.  A reader without a doclist is always
+** at its end.
+*/
+static int atEnd(DocListReader *pReader){
+  DocList *pList = pReader->pDoclist;
+  if( pList==0 ) return 1;
+  return pReader->p >= docListEnd(pList);
+}
+
+/* Peek at the next docid without advancing the read pointer.
+*/
+static sqlite_int64 peekDocid(DocListReader *pReader){
+ sqlite_int64 ret;
+ assert( !atEnd(pReader) );
+ assert( pReader->iLastPos==-1 );
+ getVarint(pReader->p, &ret);
+ return ret;
+}
+
+/* Read the next docid and advance past it.  See also nextDocid().
+** For doclists that carry positions, the reader is left positioned at
+** the start of the document's position list.
+*/
+static sqlite_int64 readDocid(DocListReader *pReader){
+  sqlite_int64 iDocid;
+  int nByte;
+
+  assert( !atEnd(pReader) );
+  assert( pReader->iLastPos==-1 );
+  nByte = getVarint(pReader->p, &iDocid);
+  pReader->p += nByte;
+  if( pReader->pDoclist->iType>=DL_POSITIONS ){
+    pReader->iLastColumn = 0;
+    pReader->iLastPos = 0;
+  }
+  return iDocid;
+}
+
+/* Read the next position and column index from a position list.
+ * Returns the position, or -1 at the end of the list.  Doclists that
+ * hold no positions always yield -1. */
+static int readPosition(DocListReader *pReader, int *iColumn){
+  int i;
+  int iType = pReader->pDoclist->iType;
+
+  if( pReader->iLastPos==-1 ){
+    return -1;
+  }
+  assert( !atEnd(pReader) );
+
+  if( iType<DL_POSITIONS ){
+    return -1;
+  }
+  pReader->p += getVarint32(pReader->p, &i);
+  if( i==POS_END ){
+    pReader->iLastColumn = pReader->iLastPos = -1;
+    *iColumn = -1;
+    return -1;
+  }
+  if( i==POS_COLUMN ){
+    /* Column switch: read the new column, then the first position in it. */
+    pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
+    pReader->iLastPos = 0;
+    pReader->p += getVarint32(pReader->p, &i);
+    assert( i>=POS_BASE );
+  }
+  pReader->iLastPos += ((int) i)-POS_BASE;
+  if( iType>=DL_POSITIONS_OFFSETS ){
+    /* Skip over offsets, ignoring them for now. */
+    int iStart, iEnd;
+    pReader->p += getVarint32(pReader->p, &iStart);
+    pReader->p += getVarint32(pReader->p, &iEnd);
+  }
+  *iColumn = pReader->iLastColumn;
+  return pReader->iLastPos;
+}
+
+/* Advance the reader past the remainder of the current position list.
+** Does nothing when the reader has no doclist or the doclist holds no
+** positions. */
+static void skipPositionList(DocListReader *pReader){
+  DocList *pList = pReader->pDoclist;
+  int iColumn;
+
+  if( pList==0 || pList->iType<DL_POSITIONS ) return;
+  for(;;){
+    if( readPosition(pReader, &iColumn)==-1 ) break;
+  }
+}
+
+/* Skip over a docid, including its position list if the doclist has
+ * positions. */
+static void skipDocument(DocListReader *pReader){
+ readDocid(pReader);
+ skipPositionList(pReader);
+}
+
+/* Skip past all docids which are less than [iDocid].  Returns 1 if a docid
+ * matching [iDocid] was found. */
+static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
+  sqlite_int64 d = 0;
+  while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
+    skipDocument(pReader);
+  }
+  return !atEnd(pReader) && d==iDocid;
+}
+
+/* Return the first document in a document list.
+*/
+static sqlite_int64 firstDocid(DocList *d){
+  DocListReader r;
+  readerInit(&r, d);
+  return readDocid(&r);
+}
+
+#ifdef SQLITE_DEBUG
+/*
+** This routine is used for debugging purpose only.
+**
+** Write the content of a doclist to standard output.
+*/
+static void printDoclist(DocList *p){
+  DocListReader r;
+  const char *zSep = "";
+
+  readerInit(&r, p);
+  while( !atEnd(&r) ){
+    sqlite_int64 docid = readDocid(&r);
+    if( docid==0 ){
+      skipPositionList(&r);
+      continue;
+    }
+    printf("%s%lld", zSep, docid);
+    zSep = ",";
+    if( p->iType>=DL_POSITIONS ){
+      int iPos, iCol;
+      const char *zDiv = "";
+      printf("(");
+      while( (iPos = readPosition(&r, &iCol))>=0 ){
+        printf("%s%d:%d", zDiv, iCol, iPos);
+        zDiv = ":";
+      }
+      printf(")");
+    }
+  }
+  printf("\n");
+  fflush(stdout);
+}
+#endif /* SQLITE_DEBUG */
+
+/* Trim the given doclist to contain only positions in column
+ * [iRestrictColumn].
+ *
+ * The result is rebuilt into a new DL_POSITIONS doclist which then
+ * replaces *in.  Every docid is carried over, even when none of its
+ * positions are in the requested column. */
+static void docListRestrictColumn(DocList *in, int iRestrictColumn){
+ DocListReader r;
+ DocList out;
+
+ assert( in->iType>=DL_POSITIONS );
+ readerInit(&r, in);
+ docListInit(&out, DL_POSITIONS, NULL, 0);
+
+ while( !atEnd(&r) ){
+ sqlite_int64 iDocid = readDocid(&r);
+ int iPos, iColumn;
+
+ docListAddDocid(&out, iDocid);
+ while( (iPos = readPosition(&r, &iColumn)) != -1 ){
+ if( iColumn==iRestrictColumn ){
+ docListAddPos(&out, iColumn, iPos);
+ }
+ }
+ }
+
+ /* Release the old buffer, then take over the rebuilt list by value. */
+ docListDestroy(in);
+ *in = out;
+}
+
+/* Rebuild the given doclist so that it keeps only those docids which
+ * still have at least one position.  The result is a DL_POSITIONS
+ * doclist that replaces *in. */
+static void docListDiscardEmpty(DocList *in) {
+  DocListReader rd;
+  DocList kept;
+
+  /* TODO: It would be nice to implement this operation in place; that
+   * could save a significant amount of memory in queries with long doclists. */
+  assert( in->iType>=DL_POSITIONS );
+  readerInit(&rd, in);
+  docListInit(&kept, DL_POSITIONS, NULL, 0);
+
+  while( !atEnd(&rd) ){
+    sqlite_int64 iDocid = readDocid(&rd);
+    int iCol;
+    int iPos = readPosition(&rd, &iCol);
+    if( iPos==-1 ) continue;        /* no positions: drop this docid */
+    docListAddDocid(&kept, iDocid);
+    do{
+      docListAddPos(&kept, iCol, iPos);
+      iPos = readPosition(&rd, &iCol);
+    }while( iPos!=-1 );
+  }
+
+  docListDestroy(in);
+  *in = kept;
+}
+
+/* Helper function for docListUpdate() and docListAccumulate().
+** Splices a doclist element into the doclist represented by r,
+** leaving r pointing after the newly spliced element.
+*/
+static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
+                                 const char *pSource, int nSource){
+  DocList *d = r->pDoclist;
+  char *pTarget;
+  int nTarget, found;
+
+  found = skipToDocid(r, iDocid);
+
+  /* Describe slice in d to place pSource/nSource. */
+  pTarget = r->p;
+  if( found ){
+    skipDocument(r);
+    nTarget = r->p-pTarget;
+  }else{
+    nTarget = 0;
+  }
+
+  /* The sense of the following is that there are three possibilities.
+  ** If nTarget==nSource, we should not move any memory nor realloc.
+  ** If nTarget>nSource, trim target and realloc.
+  ** If nTarget<nSource, realloc then expand target.
+  */
+  if( nTarget>nSource ){
+    memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
+  }
+  if( nTarget!=nSource ){
+    int iDoclist = pTarget-d->pData;
+    d->pData = realloc(d->pData, d->nData+nSource-nTarget);
+    pTarget = d->pData+iDoclist;
+  }
+  if( nTarget<nSource ){
+    memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
+  }
+
+  memcpy(pTarget, pSource, nSource);
+
+  /* Update the data size and the reader position. */
+  d->nData += nSource-nTarget;
+  r->p = pTarget+nSource;
+}
+
+/* Insert/update pUpdate into the doclist. */
+static void docListUpdate(DocList *d, DocList *pUpdate){
+ DocListReader reader;
+
+ assert( d!=NULL && pUpdate!=NULL );
+ assert( d->iType==pUpdate->iType);
+
+ readerInit(&reader, d);
+ docListSpliceElement(&reader, firstDocid(pUpdate),
+ pUpdate->pData, pUpdate->nData);
+}
+
+/* Propagate elements from pUpdate to pAcc, overwriting elements with
+** matching docids.
+**
+** A NULL or empty pUpdate leaves pAcc untouched.  An empty pAcc simply
+** receives a copy of pUpdate's bytes.  Otherwise each element of pUpdate
+** is spliced into pAcc in turn with docListSpliceElement().
+** NOTE(review): the malloc() result below is passed to memcpy() unchecked.
+*/
+static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
+ DocListReader accReader, updateReader;
+
+ /* Handle edge cases where one doclist is empty. */
+ assert( pAcc!=NULL );
+ if( pUpdate==NULL || pUpdate->nData==0 ) return;
+ if( pAcc->nData==0 ){
+ pAcc->pData = malloc(pUpdate->nData);
+ memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
+ pAcc->nData = pUpdate->nData;
+ return;
+ }
+
+ readerInit(&accReader, pAcc);
+ readerInit(&updateReader, pUpdate);
+
+ while( !atEnd(&updateReader) ){
+ char *pSource = updateReader.p; /* start of this element's bytes */
+ sqlite_int64 iDocid = readDocid(&updateReader);
+ skipPositionList(&updateReader);
+ docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
+ }
+}
+
+/*
+** Read the next docid off of pIn.  Return 0 if we reach the end.
+** Any unread remainder of the current position list is skipped first.
+**
+** TODO: This assumes that docids are never 0, but they may actually be 0
+** since users can choose docids when inserting into a full-text table.
+** Fix this.
+*/
+static sqlite_int64 nextDocid(DocListReader *pIn){
+ skipPositionList(pIn);
+ return atEnd(pIn) ? 0 : readDocid(pIn);
+}
+
+/*
+** pLeft and pRight are two DocListReaders that are pointing to
+** positions lists of the same document: iDocid.
+**
+** If there are no instances in pLeft or pRight where the position
+** of pLeft is one less than the position of pRight, then this
+** routine adds nothing to pOut.
+**
+** If there are one or more instances where positions from pLeft
+** are exactly one less than positions from pRight, then add a new
+** document record to pOut.  If pOut wants to hold positions, then
+** include the positions from pRight that are one more than a
+** position in pLeft.  In other words:  pRight.iPos==pLeft.iPos+1.
+**
+** pLeft and pRight are left pointing at the next document record.
+*/
+static void mergePosList(
+  DocListReader *pLeft,    /* Left position list */
+  DocListReader *pRight,   /* Right position list */
+  sqlite_int64 iDocid,     /* The docid from pLeft and pRight */
+  DocList *pOut            /* Write the merged document record here */
+){
+  int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
+  int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
+  int match = 0;
+
+  /* Loop until we've reached the end of both position lists. */
+  while( iLeftPos!=-1 && iRightPos!=-1 ){
+    if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
+      if( !match ){
+        docListAddDocid(pOut, iDocid);
+        match = 1;
+      }
+      if( pOut->iType>=DL_POSITIONS ){
+        docListAddPos(pOut, iRightCol, iRightPos);
+      }
+      iLeftPos = readPosition(pLeft, &iLeftCol);
+      iRightPos = readPosition(pRight, &iRightCol);
+    }else if( iRightCol<iLeftCol ||
+              (iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
+      /* Right side is behind: advance it. */
+      iRightPos = readPosition(pRight, &iRightCol);
+    }else{
+      /* Left side is behind: advance it. */
+      iLeftPos = readPosition(pLeft, &iLeftCol);
+    }
+  }
+  if( iLeftPos>=0 ) skipPositionList(pLeft);
+  if( iRightPos>=0 ) skipPositionList(pRight);
+}
+
+/* We have two doclists:  pLeft and pRight.
+** Write the phrase intersection of these two doclists into pOut.
+**
+** A phrase intersection means that two documents only match
+** if pLeft.iPos+1==pRight.iPos.
+**
+** The output pOut may or may not contain positions.  If pOut
+** does contain positions, they are the positions of pRight.
+*/
+static void docListPhraseMerge(
+  DocList *pLeft,    /* Doclist resulting from the words on the left */
+  DocList *pRight,   /* Doclist for the next word to the right */
+  DocList *pOut      /* Write the combined doclist here */
+){
+  DocListReader left, right;
+  sqlite_int64 docidLeft, docidRight;
+
+  readerInit(&left, pLeft);
+  readerInit(&right, pRight);
+  docidLeft = nextDocid(&left);
+  docidRight = nextDocid(&right);
+
+  while( docidLeft>0 && docidRight>0 ){
+    if( docidLeft<docidRight ){
+      docidLeft = nextDocid(&left);
+    }else if( docidRight<docidLeft ){
+      docidRight = nextDocid(&right);
+    }else{
+      mergePosList(&left, &right, docidLeft, pOut);
+      docidLeft = nextDocid(&left);
+      docidRight = nextDocid(&right);
+    }
+  }
+}
+
+/* We have two doclists:  pLeft and pRight.
+** Write the intersection of these two doclists into pOut.
+** Only docids are matched.  Position information is ignored.
+**
+** The output pOut never holds positions.
+*/
+static void docListAndMerge(
+  DocList *pLeft,    /* Doclist resulting from the words on the left */
+  DocList *pRight,   /* Doclist for the next word to the right */
+  DocList *pOut      /* Write the combined doclist here */
+){
+  DocListReader left, right;
+  sqlite_int64 docidLeft, docidRight;
+
+  assert( pOut->iType<DL_POSITIONS );
+
+  readerInit(&left, pLeft);
+  readerInit(&right, pRight);
+  docidLeft = nextDocid(&left);
+  docidRight = nextDocid(&right);
+
+  while( docidLeft>0 && docidRight>0 ){
+    if( docidLeft<docidRight ){
+      docidLeft = nextDocid(&left);
+    }else if( docidRight<docidLeft ){
+      docidRight = nextDocid(&right);
+    }else{
+      docListAddDocid(pOut, docidLeft);
+      docidLeft = nextDocid(&left);
+      docidRight = nextDocid(&right);
+    }
+  }
+}
+
+/* We have two doclists:  pLeft and pRight.
+** Write the union of these two doclists into pOut.
+** Only docids are matched.  Position information is ignored.
+**
+** The output pOut never holds positions.
+*/
+static void docListOrMerge(
+  DocList *pLeft,    /* Doclist resulting from the words on the left */
+  DocList *pRight,   /* Doclist for the next word to the right */
+  DocList *pOut      /* Write the combined doclist here */
+){
+  DocListReader left, right;
+  sqlite_int64 docidLeft, docidRight, priorLeft;
+
+  readerInit(&left, pLeft);
+  readerInit(&right, pRight);
+  docidLeft = nextDocid(&left);
+  docidRight = nextDocid(&right);
+
+  while( docidLeft>0 && docidRight>0 ){
+    if( docidLeft<=docidRight ){
+      docListAddDocid(pOut, docidLeft);
+    }else{
+      docListAddDocid(pOut, docidRight);
+    }
+    priorLeft = docidLeft;
+    if( docidLeft<=docidRight ){
+      docidLeft = nextDocid(&left);
+    }
+    if( docidRight>0 && docidRight<=priorLeft ){
+      docidRight = nextDocid(&right);
+    }
+  }
+  while( docidLeft>0 ){
+    docListAddDocid(pOut, docidLeft);
+    docidLeft = nextDocid(&left);
+  }
+  while( docidRight>0 ){
+    docListAddDocid(pOut, docidRight);
+    docidRight = nextDocid(&right);
+  }
+}
+
+/* We have two doclists:  pLeft and pRight.
+** Write into pOut all documents that occur in pLeft but not
+** in pRight.
+**
+** Only docids are matched.  Position information is ignored.
+**
+** The output pOut never holds positions.
+*/
+static void docListExceptMerge(
+  DocList *pLeft,    /* Doclist resulting from the words on the left */
+  DocList *pRight,   /* Doclist for the next word to the right */
+  DocList *pOut      /* Write the combined doclist here */
+){
+  DocListReader left, right;
+  sqlite_int64 docidLeft, docidRight, priorLeft;
+
+  readerInit(&left, pLeft);
+  readerInit(&right, pRight);
+  docidLeft = nextDocid(&left);
+  docidRight = nextDocid(&right);
+
+  while( docidLeft>0 && docidRight>0 ){
+    priorLeft = docidLeft;
+    if( docidLeft<docidRight ){
+      /* Left docid cannot appear on the right any more: keep it. */
+      docListAddDocid(pOut, docidLeft);
+    }
+    if( docidLeft<=docidRight ){
+      docidLeft = nextDocid(&left);
+    }
+    if( docidRight>0 && docidRight<=priorLeft ){
+      docidRight = nextDocid(&right);
+    }
+  }
+  /* Right side exhausted: everything left over on the left survives. */
+  while( docidLeft>0 ){
+    docListAddDocid(pOut, docidLeft);
+    docidLeft = nextDocid(&left);
+  }
+}
+
+/* Return a null-terminated copy of the first n bytes of s, or NULL if
+ * memory cannot be allocated.  The caller must free() the result. */
+static char *string_dup_n(const char *s, int n){
+  char *str = malloc(n + 1);
+  if( str==NULL ) return NULL;   /* do not copy into a failed allocation */
+  memcpy(str, s, n);
+  str[n] = '\0';
+  return str;
+}
+
+/* Duplicate a string; the caller must free() the returned string.
+ * (We don't use strdup() since it is not part of the standard C library and
+ * may not be available everywhere.) */
+static char *string_dup(const char *s){
+ return string_dup_n(s, strlen(s));
+}
+
+/* Format a string, replacing each occurrence of the % character with
+ * zDb.zName.  This may be more convenient than sqlite_mprintf()
+ * when one string is used repeatedly in a format string.
+ * The caller must free() the returned string.  Returns NULL if memory
+ * for the result cannot be allocated. */
+static char *string_format(const char *zFormat,
+                           const char *zDb, const char *zName){
+  const char *p;
+  size_t len = 0;
+  size_t nDb = strlen(zDb);
+  size_t nName = strlen(zName);
+  size_t nFullTableName = nDb+1+nName;
+  char *result;
+  char *r;
+
+  /* first compute length needed */
+  for(p = zFormat ; *p ; ++p){
+    len += (*p=='%' ? nFullTableName : 1);
+  }
+  len += 1;  /* for null terminator */
+
+  r = result = malloc(len);
+  if( result==NULL ) return NULL;   /* do not write through a NULL buffer */
+  for(p = zFormat; *p; ++p){
+    if( *p=='%' ){
+      memcpy(r, zDb, nDb);
+      r += nDb;
+      *r++ = '.';
+      memcpy(r, zName, nName);
+      r += nName;
+    } else {
+      *r++ = *p;
+    }
+  }
+  *r++ = '\0';
+  assert( r == result + len );
+  return result;
+}
+
+/* Expand zFormat with string_format() and run the resulting SQL with
+** sqlite3_exec().  Returns the sqlite3_exec() result code, or
+** SQLITE_NOMEM if the SQL text could not be built. */
+static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
+                    const char *zFormat){
+  char *zCommand = string_format(zFormat, zDb, zName);
+  int rc;
+  if( zCommand==NULL ) return SQLITE_NOMEM;   /* never hand NULL SQL on */
+  TRACE(("FTS1 sql: %s\n", zCommand));
+  rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
+  free(zCommand);
+  return rc;
+}
+
+/* Expand zFormat with string_format() and compile the resulting SQL
+** into *ppStmt with sqlite3_prepare().  Returns the sqlite3_prepare()
+** result code, or SQLITE_NOMEM (with *ppStmt set to NULL) if the SQL
+** text could not be built. */
+static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
+                       sqlite3_stmt **ppStmt, const char *zFormat){
+  char *zCommand = string_format(zFormat, zDb, zName);
+  int rc;
+  if( zCommand==NULL ){
+    *ppStmt = NULL;
+    return SQLITE_NOMEM;
+  }
+  TRACE(("FTS1 prepare: %s\n", zCommand));
+  rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
+  free(zCommand);
+  return rc;
+}
+
+/* end utility functions */
+
+/* Forward reference */
+typedef struct fulltext_vtab fulltext_vtab;
+
+/* A single term in a query is represented by an instances of
+** the following structure.
+*/
+typedef struct QueryTerm {
+ short int nPhrase; /* How many following terms are part of the same phrase */
+ short int iPhrase; /* This is the i-th term of a phrase. */
+ short int iColumn; /* Column of the index that must match this term */
+ signed char isOr; /* this term is preceded by "OR" */
+ signed char isNot; /* this term is preceded by "-" */
+ char *pTerm; /* text of the term. '\000' terminated. malloced */
+ int nTerm; /* Number of bytes in pTerm[] */
+} QueryTerm;
+
+
+/* A query string is parsed into a Query structure.
+ *
+ * We could, in theory, allow query strings to be complicated
+ * nested expressions with precedence determined by parentheses.
+ * But none of the major search engines do this. (Perhaps the
+ * feeling is that a parenthesized expression is too complex
+ * an idea for the average user to grasp.) Taking our lead from
+ * the major search engines, we will allow queries to be a list
+ * of terms (with an implied AND operator) or phrases in double-quotes,
+ * with a single optional "-" before each non-phrase term to designate
+ * negation and an optional OR connector.
+ *
+ * OR binds more tightly than the implied AND, which is what the
+ * major search engines seem to do. So, for example:
+ *
+ * [one two OR three] ==> one AND (two OR three)
+ * [one OR two three] ==> (one OR two) AND three
+ *
+ * A "-" before a term matches all entries that lack that term.
+ * The "-" must occur immediately before the term with no intervening
+ * space. This is how the search engines do it.
+ *
+ * A NOT term cannot be the right-hand operand of an OR. If this
+ * occurs in the query string, the NOT is ignored:
+ *
+ * [one OR -two] ==> one OR two
+ *
+ */
+typedef struct Query {
+ fulltext_vtab *pFts; /* The full text index */
+ int nTerms; /* Number of terms in the query */
+ QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
+ int nextIsOr; /* Set the isOr flag on the next inserted term */
+ int nextColumn; /* Next word parsed must be in this column */
+ int dfltColumn; /* The default column */
+} Query;
+
+
+/*
+** An instance of the following structure keeps track of generated
+** matching-word offset information and snippets.
+*/
+typedef struct Snippet {
+ int nMatch; /* Total number of matches */
+ int nAlloc; /* Space allocated for aMatch[] */
+ struct snippetMatch { /* One entry for each matching term */
+ char snStatus; /* Status flag for use while constructing snippets */
+ short int iCol; /* The column that contains the match */
+ short int iTerm; /* The index in Query.pTerms[] of the matching term */
+ short int nByte; /* Number of bytes in the term */
+ int iStart; /* The offset to the first character of the term */
+ } *aMatch; /* Points to space obtained from malloc */
+ char *zOffset; /* Text rendering of aMatch[] */
+ int nOffset; /* strlen(zOffset) */
+ char *zSnippet; /* Snippet text */
+ int nSnippet; /* strlen(zSnippet) */
+} Snippet;
+
+
+typedef enum QueryType {
+ QUERY_GENERIC, /* table scan */
+ QUERY_ROWID, /* lookup by rowid */
+ QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
+} QueryType;
+
+/* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
+** before we start aggregating into larger segments. Lower CHUNK_MAX
+** means that for a given input we have more individual segments per
+** term, which means more rows in the table and a bigger index (due to
+** both more rows and bigger rowids). But it also reduces the average
+** cost of adding new elements to the segment 0 doclist, and it seems
+** to reduce the number of pages read and written during inserts. 256
+** was chosen by measuring insertion times for a certain input (first
+** 10k documents of Enron corpus), though including query performance
+** in the decision may argue for a larger value.
+*/
+#define CHUNK_MAX 256
+
+/*
+** Identifiers for the prepared statements cached per virtual table.
+** Each value is an index into fulltext_zStatement[] and into
+** fulltext_vtab.pFulltextStatements[], so the numbering must stay
+** dense and start at zero.
+*/
+typedef enum fulltext_statement {
+  /* Statements against the %_content table */
+  CONTENT_INSERT_STMT  = 0,
+  CONTENT_SELECT_STMT  = 1,
+  CONTENT_UPDATE_STMT  = 2,
+  CONTENT_DELETE_STMT  = 3,
+
+  /* Statements against the %_term table */
+  TERM_SELECT_STMT     = 4,
+  TERM_SELECT_ALL_STMT = 5,
+  TERM_INSERT_STMT     = 6,
+  TERM_UPDATE_STMT     = 7,
+  TERM_DELETE_STMT     = 8,
+
+  MAX_STMT             = 9   /* Always at end! */
+} fulltext_statement;
+
+/* SQL text for each cached statement, indexed by fulltext_statement.
+** These must exactly match the enum above.
+**
+** A NULL entry means the SQL depends on the column list and is built
+** at prepare time (see sql_get_statement()).  The leading '%' in table
+** names is a placeholder; presumably sql_prepare() expands it from the
+** database/table name it is given -- confirm against sql_prepare().
+*/
+/* TODO(adam): Is there some risk that a statement (in particular,
+** pTermSelectStmt) will be used in two cursors at once, e.g. if a
+** query joins a virtual table to itself? If so perhaps we should
+** move some of these to the cursor object.
+*/
+static const char *const fulltext_zStatement[MAX_STMT] = {
+ /* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
+ /* CONTENT_SELECT */ "select * from %_content where rowid = ?",
+ /* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
+ /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
+
+ /* TERM_SELECT */
+ "select rowid, doclist from %_term where term = ? and segment = ?",
+ /* TERM_SELECT_ALL */
+ "select doclist from %_term where term = ? order by segment",
+ /* TERM_INSERT */
+ "insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
+ /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
+ /* TERM_DELETE */ "delete from %_term where rowid = ?",
+};
+
+/*
+** A connection to a fulltext index is an instance of the following
+** structure. The xCreate and xConnect methods create an instance
+** of this structure and xDestroy and xDisconnect free that instance.
+** All other methods receive a pointer to the structure as one of their
+** arguments.
+**
+** Ownership, as implemented by constructVtab() and
+** fulltext_vtab_destroy(): azColumn and the azContentColumn array are
+** released with free(); each azContentColumn[i] string is released
+** with sqlite3_free(); zDb and zName are released together with
+** azColumn and must not be freed separately.
+*/
+struct fulltext_vtab {
+ sqlite3_vtab base; /* Base class used by SQLite core */
+ sqlite3 *db; /* The database connection */
+ const char *zDb; /* logical database name */
+ const char *zName; /* virtual table name */
+ int nColumn; /* number of columns in virtual table */
+ char **azColumn; /* column names. malloced */
+ char **azContentColumn; /* column names in content table; malloced */
+ sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
+
+ /* Precompiled statements which we keep as long as the table is
+ ** open.  Indexed by fulltext_statement; a NULL slot means "not yet
+ ** prepared".
+ */
+ sqlite3_stmt *pFulltextStatements[MAX_STMT];
+};
+
+/*
+** When the core wants to do a query, it creates a cursor using a
+** call to xOpen. This structure is an instance of a cursor. It
+** is destroyed by xClose.
+**
+** fulltextOpen() allocates the structure zero-filled, so every field
+** starts out as 0/NULL.
+*/
+typedef struct fulltext_cursor {
+ sqlite3_vtab_cursor base; /* Base class used by SQLite core */
+ QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
+ sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
+ int eof; /* True if at End Of Results */
+ Query q; /* Parsed query string */
+ Snippet snippet; /* Cached snippet for the current row */
+ int iColumn; /* Column being searched */
+ DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */
+} fulltext_cursor;
+
+/* Return the fulltext_vtab that owns cursor c. */
+static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
+  sqlite3_vtab *pBase = c->base.pVtab;
+  return (fulltext_vtab *)pBase;
+}
+
+static const sqlite3_module fulltextModule; /* forward declaration */
+
+/* Append a list of strings separated by commas to a StringBuffer.
+**
+**   sb        buffer to append to
+**   nString   number of entries in azString[]
+**   azString  the strings; entry i is preceded by ", " when i>0
+*/
+static void appendList(StringBuffer *sb, int nString, char **azString){
+  int i;
+  for(i=0; i<nString; ++i){
+    if( i>0 ) append(sb, ", ");
+    append(sb, azString[i]);
+  }
+}
+
+/* Return a dynamically generated statement of the form
+ *   insert into %_content (rowid, ...) values (?, ...)
+ *
+ * One column name from v->azContentColumn[] and one "?" placeholder is
+ * emitted per content column, after the leading rowid.  The returned
+ * text is the StringBuffer's storage; sql_get_statement() releases it
+ * with free() once the statement has been prepared.
+ */
+static const char *contentInsertStatement(fulltext_vtab *v){
+  StringBuffer sb;
+  int i;
+
+  initStringBuffer(&sb);
+  append(&sb, "insert into %_content (rowid, ");
+  appendList(&sb, v->nColumn, v->azContentColumn);
+  append(&sb, ") values (?");
+  for(i=0; i<v->nColumn; ++i){
+    append(&sb, ", ?");
+  }
+  append(&sb, ")");
+  return sb.s;
+}
+
+/* Return a dynamically generated statement of the form
+ *   update %_content set [col_0] = ?, [col_1] = ?, ...
+ *                    where rowid = ?
+ *
+ * Parameters 1..nColumn are the new column values and parameter
+ * nColumn+1 is the rowid.  The returned text is the StringBuffer's
+ * storage; sql_get_statement() releases it with free() once the
+ * statement has been prepared.
+ */
+static const char *contentUpdateStatement(fulltext_vtab *v){
+  StringBuffer sb;
+  int i;
+
+  initStringBuffer(&sb);
+  append(&sb, "update %_content set ");
+  for(i=0; i<v->nColumn; ++i){
+    if( i>0 ){
+      append(&sb, ", ");
+    }
+    append(&sb, v->azContentColumn[i]);
+    append(&sb, " = ?");
+  }
+  append(&sb, " where rowid = ?");
+  return sb.s;
+}
+
+/* Puts a ready-to-bind statement determined by iStmt in *ppStmt.
+** If the indicated statement has never been prepared, it is prepared
+** with sql_prepare() and cached in v->pFulltextStatements[]; otherwise
+** the cached version is reset with sqlite3_reset().
+**
+** Returns SQLITE_OK on success.  On failure the error code from
+** sql_prepare() or sqlite3_reset() is returned and *ppStmt is left
+** untouched.
+*/
+static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
+                             sqlite3_stmt **ppStmt){
+  assert( iStmt<MAX_STMT );
+  if( v->pFulltextStatements[iStmt]==NULL ){
+    const char *zStmt;
+    int rc;
+    switch( iStmt ){
+      case CONTENT_INSERT_STMT:
+        zStmt = contentInsertStatement(v); break;
+      case CONTENT_UPDATE_STMT:
+        zStmt = contentUpdateStatement(v); break;
+      default:
+        zStmt = fulltext_zStatement[iStmt];
+    }
+    rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
+                     zStmt);
+    /* Only the dynamically generated statements own their text. */
+    if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
+    if( rc!=SQLITE_OK ) return rc;
+  } else {
+    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
+    if( rc!=SQLITE_OK ) return rc;
+  }
+
+  *ppStmt = v->pFulltextStatements[iStmt];
+  return SQLITE_OK;
+}
+
+/* Step the indicated statement.  *ppStmt must be the cached statement
+** v->pFulltextStatements[iStmt].
+**
+** SQLITE_BUSY is handled by stepping again.  SQLITE_ROW and
+** SQLITE_DONE are returned as-is.  On SQLITE_ERROR the statement is
+** removed from the cache and finalized, *ppStmt is set to NULL, and
+** the result of sqlite3_finalize() is returned.  Any other code is
+** returned immediately with the statement left in the cache.
+**
+** TODO(adam): We should extend this function so that it can work with
+** statements declared locally, not only globally cached statements.
+*/
+static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
+                              sqlite3_stmt **ppStmt){
+  int rc;
+  sqlite3_stmt *s = *ppStmt;
+  assert( iStmt<MAX_STMT && s==v->pFulltextStatements[iStmt] );
+
+  while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
+    if( rc==SQLITE_BUSY ) continue;
+    if( rc!=SQLITE_ERROR ) return rc;
+
+    /* If an SQLITE_SCHEMA error has occurred, then finalizing this
+     * statement is going to delete the fulltext_vtab structure.  If
+     * the statement just executed is in the pFulltextStatements[]
+     * array, it will be finalized twice.  So remove it before
+     * calling sqlite3_finalize().
+     */
+    v->pFulltextStatements[iStmt] = NULL;
+    rc = sqlite3_finalize(s);
+    *ppStmt = NULL;   /* the handle is dead; do not leave it dangling */
+    break;
+  }
+  return rc;
+}
+
+/* Step a statement that is not expected to produce rows (for example
+** an UPDATE).  Behaves like sql_step_statement() except that
+** SQLITE_DONE is reported to the caller as SQLITE_OK.
+*/
+static int sql_single_step_statement(fulltext_vtab *v,
+                                     fulltext_statement iStmt,
+                                     sqlite3_stmt **ppStmt){
+  int rc = sql_step_statement(v, iStmt, ppStmt);
+  if( rc==SQLITE_DONE ){
+    rc = SQLITE_OK;
+  }
+  return rc;
+}
+
+/* insert into %_content (rowid, ...) values ([rowid], [pValues])
+**
+** Parameter 1 is bound to rowid and parameters 2..nColumn+1 to
+** pValues[0..nColumn-1].  Returns SQLITE_OK on success, otherwise the
+** first error reported while fetching, binding or stepping the
+** statement.
+*/
+static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
+                          sqlite3_value **pValues){
+  sqlite3_stmt *s;
+  int i;
+  int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = sqlite3_bind_value(s, 1, rowid);
+  if( rc!=SQLITE_OK ) return rc;
+
+  for(i=0; i<v->nColumn; ++i){
+    rc = sqlite3_bind_value(s, 2+i, pValues[i]);
+    if( rc!=SQLITE_OK ) return rc;
+  }
+
+  return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
+}
+
+/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
+ *                  where rowid = [iRowid]
+ *
+ * Parameters 1..nColumn are bound to pValues[0..nColumn-1] and
+ * parameter nColumn+1 to iRowid.  Returns SQLITE_OK on success,
+ * otherwise the first error reported while fetching, binding or
+ * stepping the statement.
+ */
+static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
+                          sqlite_int64 iRowid){
+  sqlite3_stmt *s;
+  int i;
+  int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
+  if( rc!=SQLITE_OK ) return rc;
+
+  for(i=0; i<v->nColumn; ++i){
+    rc = sqlite3_bind_value(s, 1+i, pValues[i]);
+    if( rc!=SQLITE_OK ) return rc;
+  }
+
+  rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
+  if( rc!=SQLITE_OK ) return rc;
+
+  return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
+}
+
+/* Release an array of nString malloc'ed strings and then the array
+** itself.  NULL entries are allowed, and a NULL array is a no-op.
+*/
+static void freeStringArray(int nString, const char **pString){
+  int i;
+
+  if( pString==NULL ) return;
+  for (i=0 ; i < nString ; ++i) {
+    free((void *) pString[i]);   /* free(NULL) is a no-op */
+  }
+  free((void *) pString);
+}
+
+/* select * from %_content where rowid = [iRow]
+ * The caller must delete the returned array and all strings in it.
+ * null fields will be NULL in the returned array.
+ *
+ * On success returns SQLITE_OK and stores an array of v->nColumn
+ * strings in *pValues.  On any other return *pValues is NULL; in
+ * particular, when no row matches, the SQLITE_DONE from the first
+ * step is returned, and SQLITE_NOMEM is returned if the result array
+ * cannot be allocated.
+ *
+ * TODO: Perhaps we should return pointer/length strings here for consistency
+ * with other code which uses pointer/length. */
+static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
+                          const char ***pValues){
+  sqlite3_stmt *s;
+  const char **values;
+  int i;
+  int rc;
+
+  *pValues = NULL;
+
+  rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = sqlite3_bind_int64(s, 1, iRow);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
+  if( rc!=SQLITE_ROW ) return rc;
+
+  values = (const char **) malloc(v->nColumn * sizeof(const char *));
+  if( values==NULL ){
+    /* Do not leave the statement mid-iteration (and the table locked). */
+    sqlite3_reset(s);
+    return SQLITE_NOMEM;
+  }
+  for(i=0; i<v->nColumn; ++i){
+    if( sqlite3_column_type(s, i)==SQLITE_NULL ){
+      values[i] = NULL;
+    }else{
+      values[i] = string_dup((char*)sqlite3_column_text(s, i));
+    }
+  }
+
+  /* We expect only one row.  We must execute another sqlite3_step()
+   * to complete the iteration; otherwise the table will remain locked. */
+  rc = sqlite3_step(s);
+  if( rc==SQLITE_DONE ){
+    *pValues = values;
+    return SQLITE_OK;
+  }
+
+  freeStringArray(v->nColumn, values);
+  return rc;
+}
+
+/* delete from %_content where rowid = [iRow]
+**
+** Returns SQLITE_OK on success, otherwise the first error reported
+** while fetching, binding or stepping the statement.
+*/
+static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
+  sqlite3_stmt *pStmt;
+  int rc;
+
+  rc = sql_get_statement(v, CONTENT_DELETE_STMT, &pStmt);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int64(pStmt, 1, iRow);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sql_single_step_statement(v, CONTENT_DELETE_STMT, &pStmt);
+  }
+  return rc;
+}
+
+/* select rowid, doclist from %_term
+ *   where term = [pTerm] and segment = [iSegment]
+ *
+ * When a row is found, *rowid and *out are filled in and SQLITE_ROW is
+ * returned; the caller must free the returned doclist.  When no row
+ * is found SQLITE_DONE is returned.  Any other value is an error code.
+ */
+static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
+                       int iSegment,
+                       sqlite_int64 *rowid, DocList *out){
+  sqlite3_stmt *pStmt;
+  int rc;
+
+  rc = sql_get_statement(v, TERM_SELECT_STMT, &pStmt);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_text(pStmt, 1, pTerm, nTerm, SQLITE_STATIC);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int(pStmt, 2, iSegment);
+  }
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = sql_step_statement(v, TERM_SELECT_STMT, &pStmt);
+  if( rc!=SQLITE_ROW ) return rc;
+
+  *rowid = sqlite3_column_int64(pStmt, 0);
+  docListInit(out, DL_DEFAULT,
+              sqlite3_column_blob(pStmt, 1), sqlite3_column_bytes(pStmt, 1));
+
+  /* Only one row is expected.  Step once more so the iteration is
+   * completed; otherwise the table would remain locked. */
+  rc = sqlite3_step(pStmt);
+  if( rc==SQLITE_DONE ){
+    rc = SQLITE_ROW;
+  }
+  return rc;
+}
+
+/* Load the segment doclists for term pTerm and merge them in
+** appropriate order into out. Returns SQLITE_OK if successful. If
+** there are no segments for pTerm, successfully returns an empty
+** doclist in out.
+**
+** Each document consists of 1 or more "columns". The number of
+** columns is v->nColumn. If iColumn==v->nColumn, then return
+** position information about all columns. If iColumn<v->nColumn,
+** then only return position information about the iColumn-th column
+** (where the first column is 0).
+**
+** NOTE(review): in this copy of the file the text between the iColumn
+** parameter and the column-restriction test has been lost (the rest
+** of the parameter list, the local declarations, the statement setup
+** and the head of the row loop are missing).  Restore it from the
+** upstream source before relying on this function.
+*/
+static int term_select_all(
+ fulltext_vtab *v, /* The fulltext index we are querying against */
+ int iColumn, /* If nColumn ){ /* querying a single column */
+ docListRestrictColumn(&old, iColumn);
+ }
+
+ /* doclist contains the newer data, so write it over old. Then
+ ** steal accumulated result for doclist.
+ */
+ docListAccumulate(&old, &doclist);
+ docListDestroy(&doclist);
+ doclist = old;
+ }
+ /* Anything other than a clean end of iteration is an error: drop the
+ ** partially accumulated doclist and report the code. */
+ if( rc!=SQLITE_DONE ){
+ docListDestroy(&doclist);
+ return rc;
+ }
+
+ docListDiscardEmpty(&doclist);
+ *out = doclist;
+ return SQLITE_OK;
+}
+
+/* insert into %_term (rowid, term, segment, doclist)
+**              values ([piRowid], [pTerm], [iSegment], [doclist])
+**
+** When piRowid is NULL the rowid parameter is bound to NULL so that
+** sqlite selects the rowid; otherwise *piRowid is used.
+**
+** NOTE(shess) piRowid is IN, with values of "space of int64" plus
+** null, it is not used to pass data back to the caller.
+**
+** Returns SQLITE_OK on success, otherwise the first error reported
+** while fetching, binding or stepping the statement.
+*/
+static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
+                       const char *pTerm, int nTerm,
+                       int iSegment, DocList *doclist){
+  sqlite3_stmt *pStmt;
+  int rc;
+
+  rc = sql_get_statement(v, TERM_INSERT_STMT, &pStmt);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = (piRowid!=NULL) ? sqlite3_bind_int64(pStmt, 1, *piRowid)
+                       : sqlite3_bind_null(pStmt, 1);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_text(pStmt, 2, pTerm, nTerm, SQLITE_STATIC);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int(pStmt, 3, iSegment);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_blob(pStmt, 4, doclist->pData, doclist->nData,
+                           SQLITE_STATIC);
+  }
+  if( rc!=SQLITE_OK ) return rc;
+
+  return sql_single_step_statement(v, TERM_INSERT_STMT, &pStmt);
+}
+
+/* update %_term set doclist = [doclist] where rowid = [rowid]
+**
+** Returns SQLITE_OK on success, otherwise the first error reported
+** while fetching, binding or stepping the statement.
+*/
+static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
+                       DocList *doclist){
+  sqlite3_stmt *pStmt;
+  int rc;
+
+  rc = sql_get_statement(v, TERM_UPDATE_STMT, &pStmt);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_blob(pStmt, 1, doclist->pData, doclist->nData,
+                           SQLITE_STATIC);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int64(pStmt, 2, rowid);
+  }
+  if( rc!=SQLITE_OK ) return rc;
+
+  return sql_single_step_statement(v, TERM_UPDATE_STMT, &pStmt);
+}
+
+/* delete from %_term where rowid = [rowid]
+**
+** Returns SQLITE_OK on success, otherwise the first error reported
+** while fetching, binding or stepping the statement.
+*/
+static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
+  sqlite3_stmt *pStmt;
+  int rc;
+
+  rc = sql_get_statement(v, TERM_DELETE_STMT, &pStmt);
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int64(pStmt, 1, rowid);
+  }
+  if( rc!=SQLITE_OK ) return rc;
+
+  return sql_single_step_statement(v, TERM_DELETE_STMT, &pStmt);
+}
+
+/*
+** Free the memory used to contain a fulltext_vtab structure.
+**
+** Finalizes every cached prepared statement, destroys the tokenizer
+** (if one was created), releases the column-name storage and finally
+** the structure itself.  v must not be used after this call.
+*/
+static void fulltext_vtab_destroy(fulltext_vtab *v){
+  int iStmt, i;
+
+  TRACE(("FTS1 Destroy %p\n", v));
+  for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
+    if( v->pFulltextStatements[iStmt]!=NULL ){
+      sqlite3_finalize(v->pFulltextStatements[iStmt]);
+      v->pFulltextStatements[iStmt] = NULL;
+    }
+  }
+
+  if( v->pTokenizer!=NULL ){
+    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
+    v->pTokenizer = NULL;
+  }
+
+  free(v->azColumn);
+  /* The content-column names come from sqlite3_mprintf(), so each one
+  ** is released with sqlite3_free(); the array itself is malloc'ed. */
+  for(i = 0; i < v->nColumn; ++i) {
+    sqlite3_free(v->azContentColumn[i]);
+  }
+  free(v->azContentColumn);
+  free(v);
+}
+
+/*
+** Token types for parsing the arguments to xConnect or xCreate.
+** getToken() stores one of these in *tokenType.
+*/
+#define TOKEN_EOF 0 /* End of input: the NUL terminator was reached */
+#define TOKEN_SPACE 1 /* Any kind of whitespace */
+#define TOKEN_ID 2 /* An identifier, including the [bracketed] form */
+#define TOKEN_STRING 3 /* A string literal quoted with ', " or ` */
+#define TOKEN_PUNCT 4 /* A single punctuation character */
+
+/*
+** If X is a character that can be used in an identifier then
+** IdChar(X) will be true. Otherwise it is false.
+**
+** For ASCII, any character with the high-order bit set is
+** allowed in an identifier. For 7-bit characters,
+** isIdChar[X-0x20] must be 1; the table starts at 0x20, and
+** characters below 0x20 are never identifier characters.
+**
+** Ticket #1066. The SQL standard does not allow '$' in the
+** middle of identifiers. But many SQL implementations do.
+** SQLite will allow '$' in identifiers for compatibility.
+** But the feature is undocumented.
+*/
+static const char isIdChar[] = {
+/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
+ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
+};
+/* NOTE: IdChar() assigns its argument to a variable named "c" that
+** must be declared in the calling scope. */
+#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
+
+
+/*
+** Return the length in bytes of the token that begins at z[0] and
+** store its type (one of the TOKEN_* values) in *tokenType.
+**
+**   NUL               -> TOKEN_EOF, length 0
+**   whitespace run    -> TOKEN_SPACE
+**   '...' "..." `...` -> TOKEN_STRING; a doubled delimiter inside the
+**                        literal does not end it, and an unterminated
+**                        literal extends to the end of the input
+**   [...]             -> TOKEN_ID, up to and including the ']' (or to
+**                        the end of the input if there is none)
+**   identifier chars  -> TOKEN_ID
+**   anything else     -> TOKEN_PUNCT, length 1
+*/
+static int getToken(const char *z, int *tokenType){
+  int n;   /* Length of the token scanned so far */
+  int c;   /* Scratch character; also written by IdChar() */
+
+  switch( z[0] ){
+    case 0:
+      *tokenType = TOKEN_EOF;
+      return 0;
+
+    case ' ': case '\t': case '\n': case '\f': case '\r':
+      n = 1;
+      while( safe_isspace(z[n]) ) n++;
+      *tokenType = TOKEN_SPACE;
+      return n;
+
+    case '`':
+    case '\'':
+    case '"': {
+      int delim = z[0];
+      n = 1;
+      while( (c = z[n])!=0 ){
+        if( c==delim ){
+          if( z[n+1]!=delim ) break;   /* closing delimiter */
+          n++;                         /* doubled delimiter: skip both */
+        }
+        n++;
+      }
+      *tokenType = TOKEN_STRING;
+      return c!=0 ? n+1 : n;           /* count the closing delimiter */
+    }
+
+    case '[':
+      n = 1;
+      while( z[n]!=0 && z[n]!=']' ) n++;
+      if( z[n]==']' ) n++;             /* include the closing bracket */
+      *tokenType = TOKEN_ID;
+      return n;
+
+    default:
+      if( IdChar(z[0]) ){
+        n = 1;
+        while( IdChar(z[n]) ) n++;
+        *tokenType = TOKEN_ID;
+        return n;
+      }
+      break;
+  }
+  *tokenType = TOKEN_PUNCT;
+  return 1;
+}
+
+/*
+** A token extracted from a string is an instance of the following
+** structure.  z points into the string being tokenized; no copy of
+** the text is made.
+*/
+typedef struct Token {
+ const char *z; /* Pointer to token text. Not '\000' terminated */
+ short int n; /* Length of the token text in bytes. */
+} Token;
+
+/*
+** Given a input string (which is really one of the argv[] parameters
+** passed into xConnect or xCreate) split the string up into tokens.
+** Return an array of pointers to '\000' terminated strings, one string
+** for each non-whitespace token.
+**
+** The returned array is terminated by a single NULL pointer.
+**
+** Space to hold the returned array is obtained from a single
+** malloc and should be freed by passing the return value to free().
+** The individual strings within the token list are all a part of
+** the single memory allocation and will all be freed at once.
+**
+** NOTE(review): in this copy of the file the text after "for(i=0; i"
+** below has been lost: the end of this function and the beginnings of
+** the routines that follow it are missing, so the closing lines belong
+** to a different function.  Restore from the upstream source.
+**
+** NOTE(review): aToken[] has room for strlen(z) entries, but the loop
+** also records the terminating TOKEN_EOF token, so an input made only
+** of one-byte tokens (e.g. "a") appears to write one entry past the
+** allocation.  Neither malloc() result is checked.  Verify and fix
+** once the function body has been restored.
+*/
+static char **tokenizeString(const char *z, int *pnToken){
+ int nToken = 0;
+ Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
+ int n = 1;
+ int e, i;
+ int totalSize = 0;
+ char **azToken;
+ char *zCopy;
+ /* Pass 1: record where each non-whitespace token starts and how long
+ ** it is, and add up the space needed for the NUL-terminated copies. */
+ while( n>0 ){
+ n = getToken(z, &e);
+ if( e!=TOKEN_SPACE ){
+ aToken[nToken].z = z;
+ aToken[nToken].n = n;
+ nToken++;
+ totalSize += n+1;
+ }
+ z += n;
+ }
+ /* One allocation: the pointer array first, the string copies after it. */
+ azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
+ zCopy = (char*)&azToken[nToken];
+ nToken--;
+ for(i=0; i=0 ){
+ azIn[j] = azIn[i];
+ }
+ j++;
+ }
+ }
+ azIn[j] = 0;
+ }
+}
+
+
+/*
+** Skip leading whitespace in zIn and isolate the first token that
+** follows: the token is NUL-terminated in place, passed through
+** dequoteString(), and a pointer to it is returned.
+**
+** If only whitespace remains, 0 is returned and *pzTail is set to the
+** end of the input.  Otherwise *pzTail is set to the byte after the
+** first byte of the token.
+*/
+static char *firstToken(char *zIn, char **pzTail){
+  int ttype;
+  int n = getToken(zIn, &ttype);
+
+  while( ttype==TOKEN_SPACE ){
+    zIn += n;
+    n = getToken(zIn, &ttype);
+  }
+  if( ttype==TOKEN_EOF ){
+    *pzTail = zIn;
+    return 0;
+  }
+  zIn[n] = 0;
+  *pzTail = &zIn[1];
+  dequoteString(zIn);
+  return zIn;
+}
+
+/* Return true if the first token of s[] is t[], that is:
+**
+**   * after skipping leading whitespace, s begins with the string t,
+**     ignoring case, and
+**   * the character of s that follows (possibly the terminating NUL)
+**     is neither '_' nor alphanumeric.
+*/
+static int startsWith(const char *s, const char *t){
+  const char *zS = s;
+  const char *zT;
+
+  while( safe_isspace(*zS) ) zS++;
+  for(zT=t; *zT; zT++, zS++){
+    if( safe_tolower(*zS)!=safe_tolower(*zT) ) return 0;
+  }
+  if( *zS=='_' ) return 0;
+  return !safe_isalnum(*zS);
+}
+
+/*
+** An instance of this structure defines the "spec" of a
+** full text index. This structure is populated by parseSpec
+** and used by fulltextConnect and fulltextCreate.
+**
+** zDb and zName point into the allocation that azColumn refers to.
+** constructVtab() takes over azColumn and azContentColumn and sets
+** both fields to 0 here.
+*/
+typedef struct TableSpec {
+ const char *zDb; /* Logical database name */
+ const char *zName; /* Name of the full-text index */
+ int nColumn; /* Number of columns to be indexed */
+ char **azColumn; /* Original names of columns to be indexed */
+ char **azContentColumn; /* Column names for %_content */
+ char **azTokenizer; /* Name of tokenizer and its arguments */
+} TableSpec;
+
+/*
+** Reclaim all of the memory used by a TableSpec.
+**
+** If the spec still owns azContentColumn (ownership was not handed to
+** a fulltext_vtab by constructVtab()), the individual column names,
+** which parseSpec() obtains from sqlite3_mprintf(), are released with
+** sqlite3_free() before the array itself is freed.
+*/
+static void clearTableSpec(TableSpec *p) {
+  free(p->azColumn);
+  if( p->azContentColumn!=0 ){
+    int i;
+    for(i=0; i<p->nColumn; i++){
+      sqlite3_free(p->azContentColumn[i]);
+    }
+    free(p->azContentColumn);
+  }
+  free(p->azTokenizer);
+}
+
+/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
+ *
+ * CREATE VIRTUAL TABLE email
+ * USING fts1(subject, body, tokenize mytokenizer(myarg))
+ *
+ * We return parsed information in a TableSpec structure.
+ *
+ * Returns SQLITE_OK on success, or SQLITE_NOMEM if the content-column
+ * array cannot be allocated (the spec is cleared in that case).
+ *
+ * NOTE(review): in this copy of the file three spans of this function
+ * have lost text: after "for(i=n=0; i" (the argv copy loop), after
+ * "for(i=3; i" (the column/tokenizer scan) and in the header of the
+ * content-column loop ("inColumn" was presumably "i<pSpec->nColumn").
+ * Restore them from the upstream source.
+ */
+static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
+ char**pzErr){
+ int i, n;
+ char *z, *zDummy;
+ char **azArg;
+ const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
+
+ assert( argc>=3 );
+ /* Current interface:
+ ** argv[0] - module name
+ ** argv[1] - database name
+ ** argv[2] - table name
+ ** argv[3..] - columns, optionally followed by tokenizer specification
+ ** and snippet delimiters specification.
+ */
+
+ /* Make a copy of the complete argv[][] array in a single allocation.
+ ** The argv[][] array is read-only and transient. We can write to the
+ ** copy in order to modify things and the copy is persistent.
+ */
+ memset(pSpec, 0, sizeof(*pSpec));
+ for(i=n=0; izDb = azArg[1];
+ pSpec->zName = azArg[2];
+ pSpec->nColumn = 0;
+ pSpec->azColumn = azArg;
+ zTokenizer = "tokenize simple";
+ for(i=3; inColumn] = firstToken(azArg[i], &zDummy);
+ pSpec->nColumn++;
+ }
+ }
+ /* No columns were named: fall back to a single column "content". */
+ if( pSpec->nColumn==0 ){
+ azArg[0] = "content";
+ pSpec->nColumn = 1;
+ }
+
+ /*
+ ** Construct the list of content column names.
+ **
+ ** Each content column name will be of the form cNNAAAA
+ ** where NN is the column number and AAAA is the sanitized
+ ** column name. "sanitized" means that special characters are
+ ** converted to "_". The cNN prefix guarantees that all column
+ ** names are unique.
+ **
+ ** The AAAA suffix is not strictly necessary. It is included
+ ** for the convenience of people who might examine the generated
+ ** %_content table and wonder what the columns are used for.
+ */
+ pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
+ if( pSpec->azContentColumn==0 ){
+ clearTableSpec(pSpec);
+ return SQLITE_NOMEM;
+ }
+ for(i=0; inColumn; i++){
+ char *p;
+ /* NOTE(review): the sqlite3_mprintf() result is dereferenced below
+ ** without a NULL check. */
+ pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
+ for (p = pSpec->azContentColumn[i]; *p ; ++p) {
+ if( !safe_isalnum(*p) ) *p = '_';
+ }
+ }
+
+ /*
+ ** Parse the tokenizer specification string.
+ */
+ pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
+ tokenListToIdList(pSpec->azTokenizer);
+
+ return SQLITE_OK;
+}
+
+/*
+** Generate a CREATE TABLE statement that describes the schema of
+** the virtual table. Return a pointer to this schema string.
+**
+** Space is obtained from sqlite3_mprintf() and should be freed
+** using sqlite3_free().
+**
+** NOTE(review): in this copy of the file the text after "for(i=0; i"
+** below has been lost.  The rest of fulltextSchema() and the opening
+** of the following routine (constructVtab(), judging by its callers)
+** are missing, so from the "v->db = db;" line onward the code belongs
+** to that second function.  Restore from the upstream source.
+*/
+static char *fulltextSchema(
+ int nColumn, /* Number of columns */
+ const char *const* azColumn, /* List of columns */
+ const char *zTableName /* Name of the table */
+){
+ int i;
+ char *zSchema, *zNext;
+ const char *zSep = "(";
+ zSchema = sqlite3_mprintf("CREATE TABLE x");
+ for(i=0; ibase */
+ v->db = db;
+ v->zDb = spec->zDb; /* Freed when azColumn is freed */
+ v->zName = spec->zName; /* Freed when azColumn is freed */
+ v->nColumn = spec->nColumn;
+ /* Ownership of both column arrays moves from the spec to the vtab. */
+ v->azContentColumn = spec->azContentColumn;
+ spec->azContentColumn = 0;
+ v->azColumn = spec->azColumn;
+ spec->azColumn = 0;
+
+ /* NOTE(review): this early return does not go through the err: label,
+ ** so v is not destroyed on this path -- looks like a leak; confirm. */
+ if( spec->azTokenizer==0 ){
+ return SQLITE_NOMEM;
+ }
+ /* TODO(shess) For now, add new tokenizers as else if clauses. */
+ if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
+ sqlite3Fts1SimpleTokenizerModule(&m);
+ }else if( startsWith(spec->azTokenizer[0], "porter") ){
+ sqlite3Fts1PorterTokenizerModule(&m);
+ }else{
+ *pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
+ rc = SQLITE_ERROR;
+ goto err;
+ }
+ /* Count the tokenizer arguments; entry 0 is the tokenizer name, so
+ ** only entries 1..n-1 are handed to xCreate. */
+ for(n=0; spec->azTokenizer[n]; n++){}
+ if( n ){
+ rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
+ &v->pTokenizer);
+ }else{
+ rc = m->xCreate(0, 0, &v->pTokenizer);
+ }
+ if( rc!=SQLITE_OK ) goto err;
+ v->pTokenizer->pModule = m;
+
+ /* TODO: verify the existence of backing tables foo_content, foo_term */
+
+ schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
+ spec->zName);
+ rc = sqlite3_declare_vtab(db, schema);
+ sqlite3_free(schema);
+ if( rc!=SQLITE_OK ) goto err;
+
+ memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
+
+ *ppVTab = &v->base;
+ TRACE(("FTS1 Connect %p\n", v));
+
+ return rc;
+
+err:
+ fulltext_vtab_destroy(v);
+ return rc;
+}
+
+/*
+** xConnect: attach to an existing full-text index.  The arguments are
+** parsed into a TableSpec and handed to constructVtab(); whatever the
+** spec still owns afterwards is released.  Returns the first error
+** from parseSpec() or constructVtab(), or SQLITE_OK.
+*/
+static int fulltextConnect(
+  sqlite3 *db,
+  void *pAux,
+  int argc, const char *const*argv,
+  sqlite3_vtab **ppVTab,
+  char **pzErr
+){
+  TableSpec spec;
+  int rc;
+
+  rc = parseSpec(&spec, argc, argv, pzErr);
+  if( rc==SQLITE_OK ){
+    rc = constructVtab(db, &spec, ppVTab, pzErr);
+    clearTableSpec(&spec);
+  }
+  return rc;
+}
+
+ /* The %_content table holds the text of each document, with
+ ** the rowid used as the docid.
+ **
+ ** The %_term table maps each term to a document list blob
+ ** containing elements sorted by ascending docid, each element
+ ** encoded as:
+ **
+ ** docid varint-encoded
+ ** token elements:
+ ** position+1 varint-encoded as delta from previous position
+ ** start offset varint-encoded as delta from previous start offset
+ ** end offset varint-encoded as delta from start offset
+ **
+ ** The sentinel position of 0 indicates the end of the token list.
+ **
+ ** Additionally, doclist blobs are chunked into multiple segments,
+ ** using segment to order the segments. New elements are added to
+ ** the segment at segment 0, until it exceeds CHUNK_MAX. Then
+ ** segment 0 is deleted, and the doclist is inserted at segment 1.
+ ** If there is already a doclist at segment 1, the segment 0 doclist
+ ** is merged with it, the segment 1 doclist is deleted, and the
+ ** merged doclist is inserted at segment 2, repeating those
+ ** operations until an insert succeeds.
+ **
+ ** Since this structure doesn't allow us to update elements in place
+ ** in case of deletion or update, these are simply written to
+ ** segment 0 (with an empty token list in case of deletion), with
+ ** docListAccumulate() taking care to retain lower-segment
+ ** information in preference to higher-segment information.
+ */
+ /* TODO(shess) Provide a VACUUM type operation which both removes
+ ** deleted elements which are no longer necessary, and duplicated
+ ** elements. I suspect this will probably not be necessary in
+ ** practice, though.
+ */
+/*
+** xCreate: create the backing %_content and %_term tables for a new
+** full-text index and then build the virtual table object.  Returns
+** SQLITE_OK on success or the first error encountered.
+*/
+static int fulltextCreate(sqlite3 *db, void *pAux,
+                          int argc, const char * const *argv,
+                          sqlite3_vtab **ppVTab, char **pzErr){
+  TableSpec spec;
+  StringBuffer schema;
+  int rc;
+  TRACE(("FTS1 Create\n"));
+
+  rc = parseSpec(&spec, argc, argv, pzErr);
+  if( rc!=SQLITE_OK ) return rc;
+
+  /* The %_content table has one column per indexed column. */
+  initStringBuffer(&schema);
+  append(&schema, "CREATE TABLE %_content(");
+  appendList(&schema, spec.nColumn, spec.azContentColumn);
+  append(&schema, ")");
+  rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
+  free(schema.s);
+
+  if( rc==SQLITE_OK ){
+    rc = sql_exec(db, spec.zDb, spec.zName,
+      "create table %_term(term text, segment integer, doclist blob, "
+                          "primary key(term, segment));");
+  }
+  if( rc==SQLITE_OK ){
+    rc = constructVtab(db, &spec, ppVTab, pzErr);
+  }
+
+  clearTableSpec(&spec);
+  return rc;
+}
+
+/* Decide how to handle an SQL query.
+**
+** The first usable constraint that is either an equality test on the
+** rowid (iColumn==-1) or a MATCH on a real column selects the
+** strategy: pInfo->idxNum becomes QUERY_ROWID or QUERY_FULLTEXT plus
+** the column number, and the constraint value is requested as
+** argument 1 with the core's own re-check omitted.  If no such
+** constraint exists the strategy is QUERY_GENERIC.  Always returns
+** SQLITE_OK.
+*/
+static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
+  int i;
+  TRACE(("FTS1 BestIndex\n"));
+
+  for(i=0; i<pInfo->nConstraint; ++i){
+    const struct sqlite3_index_constraint *pConstraint;
+    pConstraint = &pInfo->aConstraint[i];
+    if( pConstraint->usable ) {
+      if( pConstraint->iColumn==-1 &&
+          pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
+        pInfo->idxNum = QUERY_ROWID;      /* lookup by rowid */
+        TRACE(("FTS1 QUERY_ROWID\n"));
+      } else if( pConstraint->iColumn>=0 &&
+                 pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
+        /* full-text search */
+        pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
+        TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
+      } else continue;
+
+      pInfo->aConstraintUsage[i].argvIndex = 1;
+      pInfo->aConstraintUsage[i].omit = 1;
+
+      /* An arbitrary value for now.
+       * TODO: Perhaps rowid matches should be considered cheaper than
+       * full-text searches. */
+      pInfo->estimatedCost = 1.0;
+
+      return SQLITE_OK;
+    }
+  }
+  pInfo->idxNum = QUERY_GENERIC;
+  return SQLITE_OK;
+}
+
+/*
+** xDisconnect: release the in-memory virtual table object.  The
+** backing tables are left untouched.  Always returns SQLITE_OK.
+*/
+static int fulltextDisconnect(sqlite3_vtab *pVTab){
+  fulltext_vtab *v = (fulltext_vtab *)pVTab;
+  TRACE(("FTS1 Disconnect %p\n", pVTab));
+  fulltext_vtab_destroy(v);
+  return SQLITE_OK;
+}
+
+/*
+** xDestroy: drop the backing %_content and %_term tables and, if that
+** succeeds, release the in-memory virtual table object.  When the
+** drop fails its error code is returned and the object is kept.
+*/
+static int fulltextDestroy(sqlite3_vtab *pVTab){
+  fulltext_vtab *v = (fulltext_vtab *)pVTab;
+  int rc;
+
+  TRACE(("FTS1 Destroy %p\n", pVTab));
+  rc = sql_exec(v->db, v->zDb, v->zName,
+                "drop table if exists %_content;"
+                "drop table if exists %_term;"
+                );
+  if( rc==SQLITE_OK ){
+    fulltext_vtab_destroy(v);
+  }
+  return rc;
+}
+
+/*
+** xOpen: allocate a new, zero-filled cursor and return it through
+** *ppCursor.  Returns SQLITE_OK, or SQLITE_NOMEM if the allocation
+** fails (in which case *ppCursor is not written).
+*/
+static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
+  fulltext_cursor *c;
+
+  c = (fulltext_cursor *) calloc(1, sizeof(fulltext_cursor));
+  if( c==NULL ) return SQLITE_NOMEM;
+  /* sqlite will initialize c->base */
+  *ppCursor = &c->base;
+  TRACE(("FTS1 Open %p: %p\n", pVTab, c));
+
+  return SQLITE_OK;
+}
+
+
+/* Release the text of every term held by *q, then the term array
+** itself, and leave *q zero-filled.
+*/
+static void queryClear(Query *q){
+  int iTerm = 0;
+  while( iTerm<q->nTerms ){
+    free(q->pTerms[iTerm].pTerm);
+    iTerm++;
+  }
+  free(q->pTerms);
+  memset(q, 0, sizeof(*q));
+}
+
+/* Release the snippet text, the offset text and the match array held
+** by the Snippet, and leave *p zero-filled.
+*/
+static void snippetClear(Snippet *p){
+  free(p->zSnippet);
+  free(p->zOffset);
+  free(p->aMatch);
+  memset(p, 0, sizeof(*p));
+}
+/*
+** Append a single entry to the p->aMatch[] log, growing the array
+** when needed.
+**
+** If the array cannot be grown, the existing entries are released,
+** the log is reset to empty (nMatch==0, nAlloc==0, aMatch==0) and the
+** new entry is dropped.  The snStatus field of the new entry is not
+** set here.
+*/
+static void snippetAppendMatch(
+  Snippet *p,               /* Append the entry to this snippet */
+  int iCol, int iTerm,      /* The column and query term */
+  int iStart, int nByte     /* Offset and size of the match */
+){
+  int i;
+  struct snippetMatch *pMatch;
+  if( p->nMatch+1>=p->nAlloc ){
+    int nNew = p->nAlloc*2 + 10;
+    /* Keep the old pointer until realloc() is known to have succeeded
+    ** so that the old block is not leaked on failure. */
+    struct snippetMatch *aNew = realloc(p->aMatch, nNew*sizeof(p->aMatch[0]) );
+    if( aNew==0 ){
+      free(p->aMatch);
+      p->aMatch = 0;
+      p->nMatch = 0;
+      p->nAlloc = 0;
+      return;
+    }
+    p->aMatch = aNew;
+    p->nAlloc = nNew;
+  }
+  i = p->nMatch++;
+  pMatch = &p->aMatch[i];
+  pMatch->iCol = iCol;
+  pMatch->iTerm = iTerm;
+  pMatch->iStart = iStart;
+  pMatch->nByte = nByte;
+}
+
+/*
+** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
+**
+** Slots are selected with (index & FTS1_ROTOR_MASK), so FTS1_ROTOR_SZ
+** must be a power of two.
+*/
+#define FTS1_ROTOR_SZ (32)
+#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
+
+/*
+** Add entries to pSnippet->aMatch[] for every match that occurs against
+** document zDoc[0..nDoc-1] which is stored in column iColumn.
+**
+** The document is re-tokenized with the index's tokenizer.  The start
+** offset and length of the most recent FTS1_ROTOR_SZ tokens are kept
+** in a circular buffer.  Returns silently if the tokenizer cursor
+** cannot be opened.
+**
+** NOTE(review): in this copy of the file the text after "for(i=0; i"
+** in the token loop has been lost (the per-term comparison and the
+** phrase-match test are missing).  Restore from the upstream source.
+*/
+static void snippetOffsetsOfColumn(
+ Query *pQuery,
+ Snippet *pSnippet,
+ int iColumn,
+ const char *zDoc,
+ int nDoc
+){
+ const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
+ sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
+ sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
+ fulltext_vtab *pVtab; /* The full text index */
+ int nColumn; /* Number of columns in the index */
+ const QueryTerm *aTerm; /* Query string terms */
+ int nTerm; /* Number of query string terms */
+ int i, j; /* Loop counters */
+ int rc; /* Return code */
+ unsigned int match, prevMatch; /* Phrase search bitmasks */
+ const char *zToken; /* Next token from the tokenizer */
+ int nToken; /* Size of zToken */
+ int iBegin, iEnd, iPos; /* Offsets of beginning and end */
+
+ /* The following variables keep a circular buffer of the last
+ ** few tokens */
+ unsigned int iRotor = 0; /* Index of current token */
+ int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */
+ int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */
+
+ pVtab = pQuery->pFts;
+ nColumn = pVtab->nColumn;
+ pTokenizer = pVtab->pTokenizer;
+ pTModule = pTokenizer->pModule;
+ rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
+ if( rc ) return;
+ pTCursor->pTokenizer = pTokenizer;
+ aTerm = pQuery->pTerms;
+ nTerm = pQuery->nTerms;
+ /* Only the first FTS1_ROTOR_SZ-1 query terms are considered. */
+ if( nTerm>=FTS1_ROTOR_SZ ){
+ nTerm = FTS1_ROTOR_SZ - 1;
+ }
+ prevMatch = 0;
+ while(1){
+ rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
+ if( rc ) break;
+ /* Remember where the current token starts and how long it is. */
+ iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
+ iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
+ match = 0;
+ for(i=0; i=0 && iCol1 && (prevMatch & (1<=0; j--){
+ int k = (iRotor-j) & FTS1_ROTOR_MASK;
+ snippetAppendMatch(pSnippet, iColumn, i-j,
+ iRotorBegin[k], iRotorLen[k]);
+ }
+ }
+ }
+ prevMatch = match<<1;
+ iRotor++;
+ }
+ pTModule->xClose(pTCursor);
+}
+
+
+/*
+** Compute the match offsets for the current row of the query and log
+** them in p->snippet.  Nothing is done when matches have already been
+** logged or when the query has no terms.
+**
+** When the cursor type selects one valid column only that column is
+** scanned; otherwise every column is scanned.  Column i of the index
+** is read from result column i+1 of p->pStmt.
+*/
+static void snippetAllOffsets(fulltext_cursor *p){
+  int nCol;     /* Number of columns in the index */
+  int iOnly;    /* Column selected by the cursor type, if in range */
+  int iCol;     /* Column currently being scanned */
+  int iEnd;     /* Last column to scan */
+
+  if( p->snippet.nMatch || p->q.nTerms==0 ) return;
+
+  nCol = p->q.pFts->nColumn;
+  iOnly = p->iCursorType - QUERY_FULLTEXT;
+  if( iOnly>=0 && iOnly<nCol ){
+    iCol = iOnly;
+    iEnd = iOnly;
+  }else{
+    iCol = 0;
+    iEnd = nCol-1;
+  }
+  for(; iCol<=iEnd; iCol++){
+    const char *zDoc = (const char*)sqlite3_column_text(p->pStmt, iCol+1);
+    int nDoc = sqlite3_column_bytes(p->pStmt, iCol+1);
+    snippetOffsetsOfColumn(&p->q, &p->snippet, iCol, zDoc, nDoc);
+  }
+}
+
+/*
+** Convert the information in the aMatch[] array of the snippet
+** into the string zOffset[0..nOffset-1].
+**
+** Each match contributes four decimal integers (iCol, iTerm, iStart,
+** nByte) separated by single spaces; consecutive matches are also
+** separated by one space.  If p->zOffset is already set this routine
+** does nothing.
+*/
+static void snippetOffsetText(Snippet *p){
+  int i;
+  int cnt = 0;        /* Number of matches rendered so far */
+  StringBuffer sb;
+  char zBuf[200];
+  if( p->zOffset ) return;
+  initStringBuffer(&sb);
+  for(i=0; i<p->nMatch; i++){
+    struct snippetMatch *pMatch = &p->aMatch[i];
+    /* zBuf[0] holds the separator.  The first match is formatted at
+    ** zBuf[0], overwriting it; later matches are formatted at zBuf[1]
+    ** so the leading space is kept. */
+    zBuf[0] = ' ';
+    sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
+        pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
+    append(&sb, zBuf);
+    cnt++;
+  }
+  p->zOffset = sb.s;
+  p->nOffset = sb.len;
+}
+
+/*
+** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
+** of matching words some of which might be in zDoc. zDoc is column
+** number iCol.
+**
+** iBreak is suggested spot in zDoc where we could begin or end an
+** excerpt. Return a value similar to iBreak but possibly adjusted
+** to be a little left or right so that the break point is better.
+**
+** A break within 10 bytes of the start of the document snaps to 0 and
+** a break within 10 bytes of the end snaps to nDoc.
+*/
+static int wordBoundary(
+ int iBreak, /* The suggested break point */
+ const char *zDoc, /* Document text */
+ int nDoc, /* Number of bytes in zDoc[] */
+ struct snippetMatch *aMatch, /* Matching words */
+ int nMatch, /* Number of entries in aMatch[] */
+ int iCol /* The column number for zDoc[] */
+){
+ int i;
+ if( iBreak<=10 ){
+ return 0;
+ }
+ if( iBreak>=nDoc-10 ){
+ return nDoc;
+ }
+ /* NOTE(review): the next line looks truncated -- text between a '<'
+ ** and a later '>' appears to have been lost, so the loop header and
+ ** the statements that led up to this test are missing.  Restore it
+ ** from the upstream fts1.c before relying on this hunk. */
+ for(i=0; i0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
+ return aMatch[i-1].iStart;
+ }
+ }
+ /* Look up to 10 bytes to either side of iBreak for white space and
+ ** break just after the first one found, checking the left side first
+ ** at each distance. */
+ for(i=1; i<=10; i++){
+ if( safe_isspace(zDoc[iBreak-i]) ){
+ return iBreak - i + 1;
+ }
+ if( safe_isspace(zDoc[iBreak+i]) ){
+ return iBreak + i + 1;
+ }
+ }
+ return iBreak;
+}
+
+/*
+** Append a single space to the StringBuffer unless it is empty or its
+** last byte is already white space.
+*/
+static void appendWhiteSpace(StringBuffer *p){
+  if( p->len!=0 && !safe_isspace(p->s[p->len-1]) ){
+    append(p, " ");
+  }
+}
+
+/*
+** Remove white space from the end of the StringBuffer.  Only the
+** length field is reduced; the bytes in the buffer are not modified.
+*/
+static void trimWhiteSpace(StringBuffer *p){
+  for(;;){
+    if( p->len<=0 ) break;
+    if( !safe_isspace(p->s[p->len-1]) ) break;
+    p->len--;
+  }
+}
+
+
+
+/*
+** Allowed values for Snippet.aMatch[].snStatus
+**
+** snippetText() skips every match whose status is not SNIPPET_DESIRED.
+*/
+#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
+#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
+
+/*
+** Generate the text of a snippet.
+**
+** Any previously generated snippet held by the cursor is freed first.
+** The new text is accumulated in a StringBuffer and handed to the
+** cursor (snippet.zSnippet / snippet.nSnippet) at the end.
+*/
+static void snippetText(
+ fulltext_cursor *pCursor, /* The cursor we need the snippet for */
+ const char *zStartMark, /* Markup to appear before each match */
+ const char *zEndMark, /* Markup to appear after each match */
+ const char *zEllipsis /* Ellipsis mark */
+){
+ int i, j;
+ struct snippetMatch *aMatch;
+ int nMatch;
+ int nDesired;
+ StringBuffer sb;
+ int tailCol;
+ int tailOffset;
+ int iCol;
+ int nDoc;
+ const char *zDoc;
+ int iStart, iEnd;
+ int tailEllipsis = 0;
+ int iMatch;
+
+
+ /* Discard the snippet from any earlier call on this cursor. */
+ free(pCursor->snippet.zSnippet);
+ pCursor->snippet.zSnippet = 0;
+ aMatch = pCursor->snippet.aMatch;
+ nMatch = pCursor->snippet.nMatch;
+ initStringBuffer(&sb);
+
+ /* NOTE(review): the next two lines look truncated -- text between a
+ ** '<' and a later '>' appears to have been lost, taking with it the
+ ** loop conditions, the code that sets snStatus/nDesired and the
+ ** initialization of iMatch, tailCol and tailOffset.  Restore from the
+ ** upstream fts1.c before relying on this hunk. */
+ for(i=0; iq.nTerms; i++){
+ for(j=0; j0; i++){
+ if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
+ nDesired--;
+ iCol = aMatch[i].iCol;
+ zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
+ nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
+ /* Start the excerpt about 40 bytes before the match, nudged to a
+ ** better break point by wordBoundary(). */
+ iStart = aMatch[i].iStart - 40;
+ iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
+ if( iStart<=10 ){
+ iStart = 0;
+ }
+ /* If this excerpt would begin within 20 bytes of where the previous
+ ** one ended in the same column, continue from that point instead. */
+ if( iCol==tailCol && iStart<=tailOffset+20 ){
+ iStart = tailOffset;
+ }
+ /* Otherwise separate the excerpts with the ellipsis mark. */
+ if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
+ trimWhiteSpace(&sb);
+ appendWhiteSpace(&sb);
+ append(&sb, zEllipsis);
+ appendWhiteSpace(&sb);
+ }
+ /* End the excerpt about 40 bytes after the match. */
+ iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
+ iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
+ if( iEnd>=nDoc-10 ){
+ iEnd = nDoc;
+ tailEllipsis = 0;
+ }else{
+ tailEllipsis = 1;
+ }
+ /* NOTE(review): the next line is also truncated; the code that copies
+ ** document text and the start/end marks into sb, closes the loop and
+ ** appends the trailing ellipsis is missing from this hunk. */
+ while( iMatchsnippet.zSnippet = sb.s;
+ pCursor->snippet.nSnippet = sb.len;
+}
+
+
+/*
+** Close the cursor.  For additional information see the documentation
+** on the xClose method of the virtual table interface.
+**
+** Finalizes the cursor's statement, clears the parsed query and the
+** snippet, deletes the result doclist if there is one, and frees the
+** cursor itself.  Always returns SQLITE_OK.
+*/
+static int fulltextClose(sqlite3_vtab_cursor *pCursor){
+  fulltext_cursor *pCsr = (fulltext_cursor *)pCursor;
+  DocList *pList;
+
+  TRACE(("FTS1 Close %p\n", pCsr));
+  sqlite3_finalize(pCsr->pStmt);
+  queryClear(&pCsr->q);
+  snippetClear(&pCsr->snippet);
+  pList = pCsr->result.pDoclist;
+  if( pList ){
+    docListDelete(pList);
+  }
+  free(pCsr);
+  return SQLITE_OK;
+}
+
+/*
+** Advance the cursor to its next row and update the eof flag.
+**
+** For cursor types below QUERY_FULLTEXT the statement is simply
+** stepped.  For a full-text query the statement is reset, the next
+** docid is taken from the result reader, bound as parameter 1 and the
+** statement stepped; a docid of 0 marks the end of the results, and a
+** docid that yields no row is reported as SQLITE_ERROR.
+*/
+static int fulltextNext(sqlite3_vtab_cursor *pCursor){
+  fulltext_cursor *pCsr = (fulltext_cursor *)pCursor;
+  sqlite_int64 iNextDoc;
+  int rc;
+
+  TRACE(("FTS1 Next %p\n", pCursor));
+  snippetClear(&pCsr->snippet);
+
+  if( pCsr->iCursorType < QUERY_FULLTEXT ){
+    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
+    rc = sqlite3_step(pCsr->pStmt);
+    if( rc==SQLITE_ROW ){
+      pCsr->eof = 0;
+      return SQLITE_OK;
+    }
+    /* Either the scan is finished or it failed; both end iteration. */
+    pCsr->eof = 1;
+    return rc==SQLITE_DONE ? SQLITE_OK : rc;
+  }
+
+  /* full-text query */
+  rc = sqlite3_reset(pCsr->pStmt);
+  if( rc!=SQLITE_OK ) return rc;
+
+  iNextDoc = nextDocid(&pCsr->result);
+  if( iNextDoc==0 ){
+    pCsr->eof = 1;
+    return SQLITE_OK;
+  }
+  rc = sqlite3_bind_int64(pCsr->pStmt, 1, iNextDoc);
+  if( rc!=SQLITE_OK ) return rc;
+  /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
+  rc = sqlite3_step(pCsr->pStmt);
+  if( rc==SQLITE_ROW ){   /* the case we expect */
+    pCsr->eof = 0;
+    return SQLITE_OK;
+  }
+  /* an error occurred; abort */
+  if( rc==SQLITE_DONE ) return SQLITE_ERROR;
+  return rc;
+}
+
+
+/* Return a DocList corresponding to the query term *pQTerm.  If *pQTerm
+** is the first term of a phrase query, go ahead and evaluate the phrase
+** query and return the doclist for the entire phrase query.
+**
+** On success SQLITE_OK is returned and the new doclist is written to
+** *ppResult; the caller then owns it.  On failure the error code from
+** term_select_all() is returned, *ppResult is left untouched and every
+** doclist allocated here is released.
+*/
+static int docListOfTerm(
+  fulltext_vtab *v,     /* The full text index */
+  int iColumn,          /* column to restrict to.  No restriction if >=nColumn */
+  QueryTerm *pQTerm,    /* Term we are looking for, or 1st term of a phrase */
+  DocList **ppResult    /* Write the result here */
+){
+  DocList *pLeft, *pRight, *pNew;
+  int i, rc;
+
+  pLeft = docListNew(DL_POSITIONS);
+  rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
+  if( rc ){
+    docListDelete(pLeft);
+    return rc;
+  }
+  /* pQTerm[1..nPhrase] are the remaining terms of the phrase. */
+  for(i=1; i<=pQTerm->nPhrase; i++){
+    pRight = docListNew(DL_POSITIONS);
+    rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
+    if( rc ){
+      docListDelete(pLeft);
+      docListDelete(pRight);
+      return rc;
+    }
+    /* Every merge except the last produces a DL_POSITIONS list; the
+    ** last one produces a DL_DOCIDS list. */
+    pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
+    docListPhraseMerge(pLeft, pRight, pNew);
+    docListDelete(pLeft);
+    docListDelete(pRight);
+    pLeft = pNew;
+  }
+  *ppResult = pLeft;
+  return SQLITE_OK;
+}
+
+/* Add a new term pTerm[0..nTerm-1] to the query *q.
+**
+** The term text is copied into freshly allocated, NUL-terminated
+** storage.  The new term takes its isOr and iColumn settings from
+** q->nextIsOr and q->nextColumn, which are then reset to 0 and to
+** q->dfltColumn respectively.
+**
+** If memory cannot be obtained the query is left exactly as it was
+** (q->nTerms does not change), so a caller can detect the failure by
+** comparing q->nTerms before and after the call.
+*/
+static void queryAdd(Query *q, const char *pTerm, int nTerm){
+  QueryTerm *aNew;    /* Enlarged term array */
+  QueryTerm *t;       /* The newly added term */
+  char *zCopy;        /* Private copy of the term text */
+
+  zCopy = malloc(nTerm+1);
+  if( zCopy==0 ) return;
+
+  /* Grow through a temporary so the existing array is neither lost nor
+  ** leaked when realloc() fails. */
+  aNew = realloc(q->pTerms, (q->nTerms+1) * sizeof(q->pTerms[0]));
+  if( aNew==0 ){
+    free(zCopy);
+    return;
+  }
+  q->pTerms = aNew;
+  t = &q->pTerms[q->nTerms];
+  q->nTerms++;
+
+  memset(t, 0, sizeof(*t));
+  memcpy(zCopy, pTerm, nTerm);
+  zCopy[nTerm] = 0;
+  t->pTerm = zCopy;
+  t->nTerm = nTerm;
+  t->isOr = q->nextIsOr;
+  q->nextIsOr = 0;
+  t->iColumn = q->nextColumn;
+  q->nextColumn = q->dfltColumn;
+}
+
+/*
+** Check to see if the string zToken[0...nToken-1] matches any
+** column name in the virtual table.  If it does,
+** return the zero-indexed column number.  If not, return -1.
+**
+** The comparison is an exact, case-sensitive match on the whole name.
+*/
+static int checkColumnSpecifier(
+  fulltext_vtab *pVtab,    /* The virtual table */
+  const char *zToken,      /* Text of the token */
+  int nToken               /* Number of characters in the token */
+){
+  int i;
+  for(i=0; i<pVtab->nColumn; i++){
+    const char *zCol = pVtab->azColumn[i];
+    /* Compare lengths first so that memcmp() never reads beyond the
+    ** terminator of a column name that is shorter than the token. */
+    if( strlen(zCol)==(size_t)nToken && memcmp(zCol, zToken, nToken)==0 ){
+      return i;
+    }
+  }
+  return -1;
+}
+
+/*
+** Parse the text at pSegment[0..nSegment-1].  Add additional terms
+** to the query being assembled in pQuery.
+**
+** inPhrase is true if pSegment[0..nSegment-1] is contained within
+** double-quotes.  If inPhrase is true, then the first term
+** is marked with the number of terms in the phrase less one and
+** OR and "-" syntax is ignored.  If inPhrase is false, then every
+** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
+**
+** Outside a phrase, a token that is immediately followed by ':' and
+** names a column selects that column for the next term, and the token
+** "OR" (when at least one term already exists) marks the next term as
+** an alternative.  Neither is added as a term.
+**
+** Returns the tokenizer's xOpen error if the segment cannot be opened,
+** SQLITE_NOMEM if a term could not be added, and otherwise the result
+** of the tokenizer's xClose method.
+*/
+static int tokenizeSegment(
+  sqlite3_tokenizer *pTokenizer,          /* The tokenizer to use */
+  const char *pSegment, int nSegment,     /* Query expression being parsed */
+  int inPhrase,                           /* True if within "..." */
+  Query *pQuery                           /* Append results here */
+){
+  const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
+  sqlite3_tokenizer_cursor *pCursor;
+  int firstIndex = pQuery->nTerms;
+  int iCol;
+  int nTerm = 1;
+
+  int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
+  if( rc!=SQLITE_OK ) return rc;
+  pCursor->pTokenizer = pTokenizer;
+
+  while( 1 ){
+    const char *pToken;
+    int nToken, iBegin, iEnd, iPos;
+    int nBefore;
+
+    rc = pModule->xNext(pCursor,
+                        &pToken, &nToken,
+                        &iBegin, &iEnd, &iPos);
+    if( rc!=SQLITE_OK ) break;
+    /* Only look at pSegment[iEnd] when it lies inside the segment. */
+    if( !inPhrase &&
+        iEnd<nSegment && pSegment[iEnd]==':' &&
+         (iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
+      pQuery->nextColumn = iCol;
+      continue;
+    }
+    if( !inPhrase && pQuery->nTerms>0 && nToken==2
+         && pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
+      pQuery->nextIsOr = 1;
+      continue;
+    }
+    nBefore = pQuery->nTerms;
+    queryAdd(pQuery, pToken, nToken);
+    if( pQuery->nTerms!=nBefore+1 ){
+      /* queryAdd() could not allocate the term.  There is no new last
+      ** term to annotate, so stop rather than index pTerms[] wrongly. */
+      pModule->xClose(pCursor);
+      return SQLITE_NOMEM;
+    }
+    if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
+      pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
+    }
+    pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
+    if( inPhrase ){
+      nTerm++;
+    }
+  }
+
+  if( inPhrase && pQuery->nTerms>firstIndex ){
+    pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
+  }
+
+  return pModule->xClose(pCursor);
+}
+
+/* Parse a query string, yielding a Query object pQuery.
+**
+** The calling function will need to queryClear() to clean up
+** the dynamically allocated memory held by pQuery.
+**
+** A NULL zInput is treated as empty input and a negative nInput means
+** "use strlen(zInput)".  pQuery is reset to an empty query bound to
+** index v with dfltColumn as its default column before parsing begins.
+*/
+static int parseQuery(
+ fulltext_vtab *v, /* The fulltext index */
+ const char *zInput, /* Input text of the query string */
+ int nInput, /* Size of the input text */
+ int dfltColumn, /* Default column of the index to match against */
+ Query *pQuery /* Write the parse results here. */
+){
+ int iInput, inPhrase = 0;
+
+ if( zInput==0 ) nInput = 0;
+ if( nInput<0 ) nInput = strlen(zInput);
+ pQuery->nTerms = 0;
+ pQuery->pTerms = NULL;
+ pQuery->nextIsOr = 0;
+ pQuery->nextColumn = dfltColumn;
+ pQuery->dfltColumn = dfltColumn;
+ pQuery->pFts = v;
+
+ /* NOTE(review): the loop header below and the line starting "if( i="
+ ** look truncated -- text between a '<' and a later '>' appears to
+ ** have been lost, including the end of this function and the start
+ ** of the header comment of the function that follows.  Restore from
+ ** the upstream fts1.c before relying on this hunk.
+ ** NOTE(review): the return value of tokenizeSegment() is not checked
+ ** in the visible code. */
+ for(iInput=0; iInputiInput ){
+ tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
+ pQuery);
+ }
+ iInput = i;
+ if( i=nColumn
+** they are allowed to match against any column.
+*/
+/* Parse the query string zInput[0..nInput-1] into *pQuery, evaluate it
+** against index v and write the resulting doclist to *pResult.
+**
+** Terms that are not negated are combined with AND; a run of terms
+** flagged isOr is OR-ed into the term that precedes it before the AND
+** is applied.  Negated terms are then removed from the result in a
+** second pass.  A query consisting only of negated terms is rejected
+** with SQLITE_ERROR.  A query with no terms yields *pResult==NULL.
+**
+** On any error the intermediate doclists built here are released and
+** *pResult is not written.
+*/
+static int fulltextQuery(
+  fulltext_vtab *v,      /* The full text index */
+  int iColumn,           /* Match against this column by default */
+  const char *zInput,    /* The query string */
+  int nInput,            /* Number of bytes in zInput[] */
+  DocList **pResult,     /* Write the result doclist here */
+  Query *pQuery          /* Put parsed query string here */
+){
+  int i, iNext, rc;
+  DocList *pLeft = NULL;
+  DocList *pRight, *pNew, *pOr;
+  int nNot = 0;
+  QueryTerm *aTerm;
+
+  rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
+  if( rc!=SQLITE_OK ) return rc;
+
+  /* Merge AND terms. */
+  aTerm = pQuery->pTerms;
+  for(i = 0; i<pQuery->nTerms; i=iNext){
+    if( aTerm[i].isNot ){
+      /* Handle all NOT terms in a separate pass */
+      nNot++;
+      iNext = i + aTerm[i].nPhrase+1;
+      continue;
+    }
+    iNext = i + aTerm[i].nPhrase + 1;
+    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
+    if( rc ){
+      if( pLeft ) docListDelete(pLeft);
+      queryClear(pQuery);
+      return rc;
+    }
+    while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
+      rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
+      iNext += aTerm[iNext].nPhrase + 1;
+      if( rc ){
+        docListDelete(pRight);
+        if( pLeft ) docListDelete(pLeft);
+        queryClear(pQuery);
+        return rc;
+      }
+      pNew = docListNew(DL_DOCIDS);
+      docListOrMerge(pRight, pOr, pNew);
+      docListDelete(pRight);
+      docListDelete(pOr);
+      pRight = pNew;
+    }
+    if( pLeft==0 ){
+      pLeft = pRight;
+    }else{
+      pNew = docListNew(DL_DOCIDS);
+      docListAndMerge(pLeft, pRight, pNew);
+      docListDelete(pRight);
+      docListDelete(pLeft);
+      pLeft = pNew;
+    }
+  }
+
+  if( nNot && pLeft==0 ){
+    /* We do not yet know how to handle a query of only NOT terms */
+    return SQLITE_ERROR;
+  }
+
+  /* Do the EXCEPT terms */
+  for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
+    if( !aTerm[i].isNot ) continue;
+    rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
+    if( rc ){
+      queryClear(pQuery);
+      docListDelete(pLeft);
+      return rc;
+    }
+    pNew = docListNew(DL_DOCIDS);
+    docListExceptMerge(pLeft, pRight, pNew);
+    docListDelete(pRight);
+    docListDelete(pLeft);
+    pLeft = pNew;
+  }
+
+  *pResult = pLeft;
+  return rc;
+}
+
+/*
+** This is the xFilter interface for the virtual table.  See
+** the virtual table xFilter method documentation for additional
+** information.
+**
+** If idxNum==QUERY_GENERIC then do a full table scan against
+** the %_content table.
+**
+** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
+** in the %_content table.
+**
+** If idxNum>=QUERY_FULLTEXT then use the full text index.  The
+** column on the left-hand side of the MATCH operator is column
+** number idxNum-QUERY_FULLTEXT, 0 indexed.  argv[0] is the right-hand
+** side of the MATCH operator.
+**
+** Any statement left on the cursor by an earlier call is finalized
+** and replaced.  The cursor is positioned on its first row by calling
+** fulltextNext() before returning.  Returns SQLITE_NOMEM if the SQL
+** text cannot be allocated.
+*/
+/* TODO(shess) Upgrade the cursor initialization and destruction to
+** account for fulltextFilter() being called multiple times on the
+** same cursor.  The current solution is very fragile.  Apply fix to
+** fts2 as appropriate.
+*/
+static int fulltextFilter(
+  sqlite3_vtab_cursor *pCursor,     /* The cursor used for this query */
+  int idxNum, const char *idxStr,   /* Which indexing scheme to use */
+  int argc, sqlite3_value **argv    /* Arguments for the indexing scheme */
+){
+  fulltext_cursor *c = (fulltext_cursor *) pCursor;
+  fulltext_vtab *v = cursor_vtab(c);
+  int rc;
+  char *zSql;
+
+  TRACE(("FTS1 Filter %p\n",pCursor));
+
+  zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
+                          idxNum==QUERY_GENERIC ? "" : "where rowid=?");
+  if( zSql==0 ) return SQLITE_NOMEM;
+  /* Clear the handle after finalizing it so that a failed prepare
+  ** cannot leave a stale pointer to be finalized again at close. */
+  sqlite3_finalize(c->pStmt);
+  c->pStmt = 0;
+  rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
+  sqlite3_free(zSql);
+  if( rc!=SQLITE_OK ) return rc;
+
+  c->iCursorType = idxNum;
+  switch( idxNum ){
+    case QUERY_GENERIC:
+      break;
+
+    case QUERY_ROWID:
+      rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
+      if( rc!=SQLITE_OK ) return rc;
+      break;
+
+    default:   /* full-text search */
+    {
+      const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
+      DocList *pResult;
+      assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
+      assert( argc==1 );
+      queryClear(&c->q);
+      rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
+      if( rc!=SQLITE_OK ) return rc;
+      /* Drop the doclist of any earlier query before installing the
+      ** new one in the reader. */
+      if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
+      readerInit(&c->result, pResult);
+      break;
+    }
+  }
+
+  return fulltextNext(pCursor);
+}
+
+/* This is the xEof method of the virtual table.  The SQLite core
+** calls this routine to find out if it has reached the end of
+** a query's result set.  It reports the cursor's eof flag as-is.
+*/
+static int fulltextEof(sqlite3_vtab_cursor *pCursor){
+  return ((fulltext_cursor *)pCursor)->eof;
+}
+
+/* This is the xColumn method of the virtual table.  The SQLite
+** core calls this method during a query when it needs the value
+** of a column from the virtual table.  This method needs to use
+** one of the sqlite3_result_*() routines to store the requested
+** value back in the pContext.
+**
+** Columns 0..nColumn-1 are taken from the cursor's statement (shifted
+** by one, since result column 0 of the statement is the rowid).
+** Column nColumn is the extra column named after the table; its value
+** is a blob holding a copy of the cursor pointer.  For any other
+** index no result is set.  Always returns SQLITE_OK.
+*/
+static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
+                          sqlite3_context *pContext, int idxCol){
+  fulltext_cursor *c = (fulltext_cursor *) pCursor;
+  fulltext_vtab *v = cursor_vtab(c);
+
+  if( idxCol<v->nColumn ){
+    sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
+    sqlite3_result_value(pContext, pVal);
+  }else if( idxCol==v->nColumn ){
+    /* The extra column whose name is the same as the table.
+    ** Return a blob which is a pointer to the cursor
+    */
+    sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
+  }
+  return SQLITE_OK;
+}
+
+/* This is the xRowid method.  The SQLite core calls this routine to
+** retrieve the rowid for the current row of the result set.  The
+** rowid is read from result column 0 of the cursor's statement and
+** written to *pRowid.  Always returns SQLITE_OK.
+*/
+static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
+  sqlite3_stmt *pStmt = ((fulltext_cursor *)pCursor)->pStmt;
+  *pRowid = sqlite3_column_int64(pStmt, 0);
+  return SQLITE_OK;
+}
+
+/* Add all terms in [zText] to the given hash table.  A term seen for
+ * the first time gets a new doclist holding [iDocid].  If [iColumn] >= 0,
+ * the position and offsets of every occurrence are also recorded under
+ * that column number.
+ *
+ * Returns the tokenizer's xOpen error if the text cannot be opened,
+ * SQLITE_ERROR if the tokenizer reports a negative position, and
+ * SQLITE_OK otherwise. */
+static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
+                      const char *zText, int iColumn){
+  sqlite3_tokenizer *pTokenizer = v->pTokenizer;
+  const sqlite3_tokenizer_module *pMod = pTokenizer->pModule;
+  sqlite3_tokenizer_cursor *pCursor;
+  const char *pToken;
+  int nTokenBytes;
+  int iStartOffset, iEndOffset, iPosition;
+  int rc;
+
+  rc = pMod->xOpen(pTokenizer, zText, -1, &pCursor);
+  if( rc!=SQLITE_OK ) return rc;
+  pCursor->pTokenizer = pTokenizer;
+
+  for(;;){
+    DocList *pList;
+
+    /* Any result other than SQLITE_OK ends the scan. */
+    if( pMod->xNext(pCursor, &pToken, &nTokenBytes,
+                    &iStartOffset, &iEndOffset, &iPosition)!=SQLITE_OK ){
+      break;
+    }
+
+    /* Positions can't be negative; we use -1 as a terminator internally. */
+    if( iPosition<0 ){
+      pMod->xClose(pCursor);
+      return SQLITE_ERROR;
+    }
+
+    pList = fts1HashFind(terms, pToken, nTokenBytes);
+    if( pList==NULL ){
+      pList = docListNew(DL_DEFAULT);
+      docListAddDocid(pList, iDocid);
+      fts1HashInsert(terms, pToken, nTokenBytes, pList);
+    }
+    if( iColumn>=0 ){
+      docListAddPosOffset(pList, iColumn, iPosition, iStartOffset, iEndOffset);
+    }
+  }
+
+  /* TODO(shess) Check return?  Should this be able to cause errors at
+  ** this point?  Actually, same question about sqlite3_finalize(),
+  ** though one could argue that failure there means that the data is
+  ** not durable.  *ponder*
+  */
+  pMod->xClose(pCursor);
+  return SQLITE_OK;
+}
+
+/* Update the %_terms table to map the term [pTerm] to the given rowid.
+**
+** The doclist [d] is merged into the doclist stored for the term in
+** segment 0.  If no row exists yet a new one is inserted.  If the
+** merged doclist is larger than CHUNK_MAX the row is deleted and the
+** data is pushed into higher-numbered segments instead.
+*/
+static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
+ DocList *d){
+ sqlite_int64 iIndexRow;
+ DocList doclist;
+ int iSegment = 0, rc;
+
+ rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
+ if( rc==SQLITE_DONE ){
+ /* No existing row for this term: start from an empty doclist. */
+ docListInit(&doclist, DL_DEFAULT, 0, 0);
+ docListUpdate(&doclist, d);
+ /* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
+ rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
+ goto err;
+ }
+ if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
+
+ docListUpdate(&doclist, d);
+ if( doclist.nData<=CHUNK_MAX ){
+ rc = term_update(v, iIndexRow, &doclist);
+ goto err;
+ }
+
+ /* Doclist doesn't fit, delete what's there, and accumulate
+ ** forward.
+ */
+ rc = term_delete(v, iIndexRow);
+ if( rc!=SQLITE_OK ) goto err;
+
+ /* Try to insert the doclist into a higher segment bucket. On
+ ** failure, accumulate existing doclist with the doclist from that
+ ** bucket, and put results in the next bucket.
+ */
+ iSegment++;
+ while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
+ &doclist))!=SQLITE_OK ){
+ sqlite_int64 iSegmentRow;
+ DocList old;
+ int rc2;
+
+ /* Retain old error in case the term_insert() error was really an
+ ** error rather than a bounced insert.
+ */
+ rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
+ if( rc2!=SQLITE_ROW ) goto err;
+
+ rc = term_delete(v, iSegmentRow);
+ if( rc!=SQLITE_OK ) goto err;
+
+ /* Reusing lowest-number deleted row keeps the index smaller. */
+ /* NOTE(review): the next line looks truncated -- text between a '<'
+ ** and a later '>' appears to have been lost, including the rest of
+ ** this loop, the "err" label and the start of the following function
+ ** (apparently insertTerms(), judging by the callers below).  Restore
+ ** from the upstream fts1.c before relying on this hunk. */
+ if( iSegmentRownColumn ; ++i){
+ char *zText = (char*)sqlite3_value_text(pValues[i]);
+ int rc = buildTerms(v, terms, iRowid, zText, i);
+ if( rc!=SQLITE_OK ) return rc;
+ }
+ return SQLITE_OK;
+}
+
+/* Add empty doclists for all terms in the given row's content to the hash
+ * table [pTerms].
+ *
+ * The row's current column values are fetched with content_select() and
+ * each is tokenized with iColumn=-1, so no positions are recorded.
+ * Returns the first error reported by content_select() or buildTerms(),
+ * or SQLITE_OK.  The fetched values are released in either case. */
+static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
+  const char **pValues;
+  int i;
+
+  int rc = content_select(v, iRowid, &pValues);
+  if( rc!=SQLITE_OK ) return rc;
+
+  for(i = 0 ; i < v->nColumn; ++i) {
+    rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
+    if( rc!=SQLITE_OK ) break;
+  }
+
+  freeStringArray(v->nColumn, pValues);
+  /* Report a tokenizer failure instead of masking it with SQLITE_OK. */
+  return rc;
+}
+
+/* Insert a row into the %_content table; set *piRowid to be the ID of the
+ * new row.  Fill [pTerms] with new doclists for the %_term table.
+ *
+ * *piRowid is only written when the content insert succeeds. */
+static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
+                        sqlite3_value **pValues,
+                        sqlite_int64 *piRowid, fts1Hash *pTerms){
+  int rc = content_insert(v, pRequestRowid, pValues);  /* execute an SQL INSERT */
+  if( rc==SQLITE_OK ){
+    *piRowid = sqlite3_last_insert_rowid(v->db);
+    rc = insertTerms(v, pTerms, *piRowid, pValues);
+  }
+  return rc;
+}
+
+/* Delete a row from the %_content table; fill [pTerms] with empty doclists
+ * to be written to the %_term table.  The content row is only deleted
+ * when collecting its terms succeeded. */
+static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
+  int rc = deleteTerms(v, pTerms, iRow);
+  if( rc==SQLITE_OK ){
+    rc = content_delete(v, iRow);  /* execute an SQL DELETE */
+  }
+  return rc;
+}
+
+/* Update a row in the %_content table; fill [pTerms] with new doclists for the
+ * %_term table.  The three steps run in order and the first failure is
+ * returned without attempting the remaining steps. */
+static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
+                        sqlite3_value **pValues, fts1Hash *pTerms){
+  /* Generate an empty doclist for each term that previously appeared in this
+   * row. */
+  int rc = deleteTerms(v, pTerms, iRow);
+
+  if( rc==SQLITE_OK ){
+    rc = content_update(v, pValues, iRow);  /* execute an SQL UPDATE */
+  }
+  if( rc==SQLITE_OK ){
+    /* Now add positions for terms which appear in the updated row. */
+    rc = insertTerms(v, pTerms, iRow, pValues);
+  }
+  return rc;
+}
+
+/* This function implements the xUpdate callback; it is the top-level entry
+ * point for inserting, deleting or updating a row in a full-text table.
+ *
+ * The row operation collects the affected terms in a temporary hash
+ * table.  If it succeeds, each term's doclist is written out with
+ * index_insert_term().  The doclists and the hash table are released
+ * before returning, whatever the outcome. */
+static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
+                          sqlite_int64 *pRowid){
+  fulltext_vtab *v = (fulltext_vtab *) pVtab;
+  fts1Hash terms;   /* maps term string -> PosList */
+  fts1HashElem *e;
+  int rc;
+
+  TRACE(("FTS1 Update %p\n", pVtab));
+
+  fts1HashInit(&terms, FTS1_HASH_STRING, 1);
+
+  if( nArg<2 ){
+    /* A delete: ppArg[0] = rowid of the row to remove */
+    rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
+  }else if( sqlite3_value_type(ppArg[0])==SQLITE_NULL ){
+    /* An insert:
+     * ppArg[1] = requested rowid
+     * ppArg[2..2+v->nColumn-1] = values
+     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
+     */
+    assert( nArg==2+v->nColumn+1);
+    rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
+  }else{
+    /* An update:
+     * ppArg[0] = old rowid
+     * ppArg[1] = new rowid
+     * ppArg[2..2+v->nColumn-1] = values
+     * ppArg[2+v->nColumn] = value for magic column (we ignore this)
+     */
+    sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
+    if( sqlite3_value_type(ppArg[1])==SQLITE_INTEGER &&
+        sqlite3_value_int64(ppArg[1])==rowid ){
+      assert( nArg==2+v->nColumn+1);
+      rc = index_update(v, rowid, &ppArg[2], &terms);
+    }else{
+      rc = SQLITE_ERROR;  /* we don't allow changing the rowid */
+    }
+  }
+
+  if( rc==SQLITE_OK ){
+    /* Write updated doclists to disk. */
+    e = fts1HashFirst(&terms);
+    while( e ){
+      DocList *pList = fts1HashData(e);
+      rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), pList);
+      if( rc!=SQLITE_OK ) break;
+      e = fts1HashNext(e);
+    }
+  }
+
+  /* clean up */
+  e = fts1HashFirst(&terms);
+  while( e ){
+    DocList *pList = fts1HashData(e);
+    docListDelete(pList);
+    e = fts1HashNext(e);
+  }
+  fts1HashClear(&terms);
+
+  return rc;
+}
+
+/*
+** Implementation of the snippet() function for FTS1
+**
+** argv[0] must be the blob produced by the table's magic column, i.e.
+** a copy of the cursor pointer.  Optional arguments 1..3 give the
+** start mark, end mark and ellipsis text; an argument that is omitted
+** or is SQL NULL falls back to the built-in default.
+**
+** NOTE(review): the default marks below are empty strings and the
+** error text mentions "html_snippet"; the defaults may have lost HTML
+** markup in extraction -- confirm against the upstream fts1.c.
+*/
+static void snippetFunc(
+  sqlite3_context *pContext,
+  int argc,
+  sqlite3_value **argv
+){
+  fulltext_cursor *pCursor;
+  if( argc<1 ) return;
+  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
+      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
+    sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
+  }else{
+    const char *zStart = "";
+    const char *zEnd = "";
+    const char *zEllipsis = "...";
+    const char *z;
+    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
+    /* sqlite3_value_text() returns NULL for an SQL NULL; keep the
+    ** default in that case rather than handing NULL to snippetText(). */
+    if( argc>=2 && (z = (const char*)sqlite3_value_text(argv[1]))!=0 ){
+      zStart = z;
+    }
+    if( argc>=3 && (z = (const char*)sqlite3_value_text(argv[2]))!=0 ){
+      zEnd = z;
+    }
+    if( argc>=4 && (z = (const char*)sqlite3_value_text(argv[3]))!=0 ){
+      zEllipsis = z;
+    }
+    snippetAllOffsets(pCursor);
+    snippetText(pCursor, zStart, zEnd, zEllipsis);
+    /* The buffer belongs to the cursor and is freed by the next call to
+    ** snippetText(), so SQLite must take its own copy. */
+    sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
+                        pCursor->snippet.nSnippet, SQLITE_TRANSIENT);
+  }
+}
+
+/*
+** Implementation of the offsets() function for FTS1
+**
+** argv[0] must be the blob produced by the table's magic column, i.e.
+** a copy of the cursor pointer.  The result is the text built by
+** snippetOffsetText() for the cursor's current row.
+*/
+static void snippetOffsetsFunc(
+  sqlite3_context *pContext,
+  int argc,
+  sqlite3_value **argv
+){
+  fulltext_cursor *pCursor;
+  if( argc<1 ) return;
+  if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
+      sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
+    sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
+  }else{
+    memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
+    snippetAllOffsets(pCursor);
+    snippetOffsetText(&pCursor->snippet);
+    /* The buffer belongs to the cursor's snippet, which is cleared when
+    ** the cursor moves or closes, so SQLite must take its own copy. */
+    sqlite3_result_text(pContext,
+                        pCursor->snippet.zOffset, pCursor->snippet.nOffset,
+                        SQLITE_TRANSIENT);
+  }
+}
+
+/*
+** This routine implements the xFindFunction method for the FTS1
+** virtual table.
+**
+** For the names "snippet" and "offsets" the matching implementation is
+** stored in *pxFunc and 1 is returned.  For any other name 0 is
+** returned and *pxFunc is left alone.  *ppArg is never written.
+*/
+static int fulltextFindFunction(
+  sqlite3_vtab *pVtab,
+  int nArg,
+  const char *zName,
+  void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
+  void **ppArg
+){
+  void (*xImpl)(sqlite3_context*,int,sqlite3_value**) = 0;
+
+  if( strcmp(zName,"snippet")==0 ){
+    xImpl = snippetFunc;
+  }else if( strcmp(zName,"offsets")==0 ){
+    xImpl = snippetOffsetsFunc;
+  }
+  if( xImpl==0 ) return 0;
+  *pxFunc = xImpl;
+  return 1;
+}
+
+/*
+** Rename an fts1 table.
+**
+** Both backing tables, <name>_content and <name>_term, are renamed by
+** a single two-statement SQL script.  Returns SQLITE_NOMEM if the
+** script cannot be allocated, otherwise the result of sqlite3_exec().
+*/
+static int fulltextRename(
+  sqlite3_vtab *pVtab,
+  const char *zName
+){
+  fulltext_vtab *p = (fulltext_vtab *)pVtab;
+  int rc;
+  char *zSql;
+
+  zSql = sqlite3_mprintf(
+    "ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
+    "ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
+    , p->zDb, p->zName, zName
+    , p->zDb, p->zName, zName
+  );
+  if( zSql==0 ) return SQLITE_NOMEM;
+  rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
+  sqlite3_free(zSql);
+  return rc;
+}
+
+/* Method table registered for the "fts1" virtual table module.  The
+** transaction hooks (xBegin, xSync, xCommit, xRollback) are left unset.
+*/
+static const sqlite3_module fulltextModule = {
+  0,                      /* iVersion */
+  fulltextCreate,         /* xCreate */
+  fulltextConnect,        /* xConnect */
+  fulltextBestIndex,      /* xBestIndex */
+  fulltextDisconnect,     /* xDisconnect */
+  fulltextDestroy,        /* xDestroy */
+  fulltextOpen,           /* xOpen */
+  fulltextClose,          /* xClose */
+  fulltextFilter,         /* xFilter */
+  fulltextNext,           /* xNext */
+  fulltextEof,            /* xEof */
+  fulltextColumn,         /* xColumn */
+  fulltextRowid,          /* xRowid */
+  fulltextUpdate,         /* xUpdate */
+  0,                      /* xBegin */
+  0,                      /* xSync */
+  0,                      /* xCommit */
+  0,                      /* xRollback */
+  fulltextFindFunction,   /* xFindFunction */
+  fulltextRename,         /* xRename */
+};
+
+/* Register the FTS1 module with database connection db: declare the
+** overloadable functions "snippet" and "offsets" (any argument count),
+** then create the "fts1" module.  The return value is that of
+** sqlite3_create_module(); the overload results are not examined.
+*/
+int sqlite3Fts1Init(sqlite3 *db){
+  static const char *const azOverload[] = { "snippet", "offsets" };
+  int i;
+  for(i=0; i<(int)(sizeof(azOverload)/sizeof(azOverload[0])); i++){
+    sqlite3_overload_function(db, azOverload[i], -1);
+  }
+  return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
+}
+
+#if !SQLITE_CORE
+/* Entry point used when FTS1 is built as a loadable extension.  It
+** records the API routine table and then registers the module with
+** sqlite3Fts1Init().  pzErrMsg is not used.
+*/
+#ifdef _WIN32
+__declspec(dllexport)
+#endif
+int sqlite3_fts1_init(sqlite3 *db, char **pzErrMsg,
+                      const sqlite3_api_routines *pApi){
+  int rc;
+  SQLITE_EXTENSION_INIT2(pApi)
+  rc = sqlite3Fts1Init(db);
+  return rc;
+}
+#endif
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ADDED ext/fts1/fts1.h
Index: ext/fts1/fts1.h
==================================================================
--- /dev/null
+++ ext/fts1/fts1.h
@@ -0,0 +1,11 @@
+#include "sqlite3.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+int sqlite3Fts1Init(sqlite3 *db);  /* Register the "fts1" module with db */
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
ADDED ext/fts1/fts1_hash.c
Index: ext/fts1/fts1_hash.c
==================================================================
--- /dev/null
+++ ext/fts1/fts1_hash.c
@@ -0,0 +1,369 @@
+/*
+** 2001 September 22
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** This is the implementation of generic hash-tables used in SQLite.
+** We've modified it slightly to serve as a standalone hash table
+** implementation for the full-text indexing module.
+*/
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS1 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS1 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
+*/
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
+
+
+#include "fts1_hash.h"
+
+/*
+** Default allocator installed in fts1Hash.xMalloc by
+** sqlite3Fts1HashInit(): obtain n bytes from malloc() and clear them.
+** Returns NULL if malloc() fails.
+*/
+static void *malloc_and_zero(int n){
+  void *pBuf = malloc(n);
+  if( pBuf==0 ) return 0;
+  memset(pBuf, 0, n);
+  return pBuf;
+}
+
+/* Initialize the caller-supplied fts1Hash structure pNew so that it
+** describes an empty hash table.
+**
+** keyClass selects the kind of key and must be FTS1_HASH_STRING or
+** FTS1_HASH_BINARY.  When copyKey is true the table keeps its own
+** private copy of every key; when it is false the table stores the
+** pointer it was given.
+**
+** The allocator pair is set to malloc_and_zero() and free().
+*/
+void sqlite3Fts1HashInit(fts1Hash *pNew, int keyClass, int copyKey){
+  assert( pNew!=0 );
+  assert( keyClass>=FTS1_HASH_STRING && keyClass<=FTS1_HASH_BINARY );
+
+  /* Configuration */
+  pNew->keyClass = keyClass;
+  pNew->copyKey = copyKey;
+  pNew->xMalloc = malloc_and_zero;
+  pNew->xFree = free;
+
+  /* No elements and no bucket array yet */
+  pNew->count = 0;
+  pNew->first = 0;
+  pNew->ht = 0;
+  pNew->htsize = 0;
+}
+
+/* Discard every entry of hash table pH and release all memory the
+** table owns: the bucket array, every element and, when copyKey is
+** set, every private key copy.  The table is left empty and may be
+** reused.  Use this both to reset a table and to destroy it.
+*/
+void sqlite3Fts1HashClear(fts1Hash *pH){
+  fts1HashElem *pElem;     /* Element currently being freed */
+  fts1HashElem *pNext;     /* Its successor, saved before the free */
+
+  assert( pH!=0 );
+  pElem = pH->first;
+  pH->first = 0;
+  if( pH->ht ) pH->xFree(pH->ht);
+  pH->ht = 0;
+  pH->htsize = 0;
+  for(; pElem; pElem=pNext){
+    pNext = pElem->next;
+    if( pH->copyKey && pElem->pKey ) pH->xFree(pElem->pKey);
+    pH->xFree(pElem);
+  }
+  pH->count = 0;
+}
+
+/*
+** Hash and comparison functions when the mode is FTS1_HASH_STRING
+**
+** strHash() hashes the nKey bytes at pKey.  If nKey is zero or
+** negative the length is taken from strlen(pKey).  The result is
+** always in the range 0..0x7fffffff.
+**
+** The accumulator is unsigned so that the repeated left shift wraps
+** instead of overflowing a signed int, which is undefined behaviour.
+*/
+static int strHash(const void *pKey, int nKey){
+  const char *z = (const char *)pKey;
+  unsigned int h = 0;
+  if( nKey<=0 ) nKey = (int) strlen(z);
+  while( nKey > 0 ){
+    h = (h<<3) ^ h ^ *z++;
+    nKey--;
+  }
+  return (int)(h & 0x7fffffff);
+}
+/* Compare two FTS1_HASH_STRING keys.  Keys of different length never
+** match (1 is returned); otherwise the result is that of strncmp()
+** over the common length, so 0 means the keys are equal.
+*/
+static int strCompare(const void *pKey1, int n1, const void *pKey2, int n2){
+  const char *zLeft = (const char*)pKey1;
+  const char *zRight = (const char*)pKey2;
+  return n1==n2 ? strncmp(zLeft, zRight, n1) : 1;
+}
+
+/*
+** Hash and comparison functions when the mode is FTS1_HASH_BINARY
+**
+** binHash() hashes exactly nKey bytes starting at pKey; a count of
+** zero or less hashes nothing and yields 0.  The result is always in
+** the range 0..0x7fffffff.
+**
+** The accumulator is unsigned so that the repeated left shift wraps
+** instead of overflowing a signed int, which is undefined behaviour.
+*/
+static int binHash(const void *pKey, int nKey){
+  unsigned int h = 0;
+  const char *z = (const char *)pKey;
+  while( nKey-- > 0 ){
+    h = (h<<3) ^ h ^ *(z++);
+  }
+  return (int)(h & 0x7fffffff);
+}
+/* Compare two FTS1_HASH_BINARY keys.  Keys of different length never
+** match (1 is returned); otherwise the result is that of memcmp(),
+** so 0 means the keys are equal.
+*/
+static int binCompare(const void *pKey1, int n1, const void *pKey2, int n2){
+  return n1==n2 ? memcmp(pKey1, pKey2, n1) : 1;
+}
+
+/*
+** Return a pointer to the hash function that goes with keyClass.
+**
+** The declaration syntax may be unfamiliar, so here it is spelled
+** out: "hashFunction" is a function of one parameter, "keyClass".
+** Its return value is itself a pointer to a function, namely one
+** that takes a "const void*" and an "int" and returns an "int".
+**
+** FTS1_HASH_STRING selects strHash(); the only other permitted class,
+** FTS1_HASH_BINARY, selects binHash().
+*/
+static int (*hashFunction(int keyClass))(const void*,int){
+  if( keyClass!=FTS1_HASH_STRING ){
+    assert( keyClass==FTS1_HASH_BINARY );
+    return &binHash;
+  }
+  return &strHash;
+}
+
+/*
+** Return a pointer to the key-comparison function that goes with
+** keyClass: strCompare() for FTS1_HASH_STRING and binCompare() for
+** FTS1_HASH_BINARY.
+**
+** The return type is a pointer to a function taking
+** (const void*, int, const void*, int) and returning int.
+*/
+static int (*compareFunction(int keyClass))(const void*,int,const void*,int){
+  if( keyClass!=FTS1_HASH_STRING ){
+    assert( keyClass==FTS1_HASH_BINARY );
+    return &binCompare;
+  }
+  return &strCompare;
+}
+
+/* Link element pNew into bucket pEntry of hash table pH.
+**
+** All elements of the table live on one doubly-linked list headed by
+** pH->first, and the members of a bucket are adjacent on that list.
+** If the bucket already has members, pNew is spliced in directly in
+** front of the current bucket head; otherwise it is pushed onto the
+** front of the whole list.  Either way pNew becomes the new head of
+** the bucket and the bucket's entry count goes up by one.
+*/
+static void insertElement(
+  fts1Hash *pH,            /* The complete hash table */
+  struct _fts1ht *pEntry,  /* The entry into which pNew is inserted */
+  fts1HashElem *pNew       /* The element to be inserted */
+){
+  fts1HashElem *pBucketHead = pEntry->chain;
+
+  if( pBucketHead==0 ){
+    /* Empty bucket: pNew goes to the front of the global list */
+    pNew->next = pH->first;
+    if( pH->first ){ pH->first->prev = pNew; }
+    pNew->prev = 0;
+    pH->first = pNew;
+  }else{
+    /* Occupied bucket: splice pNew in just before its current head */
+    fts1HashElem *pBefore = pBucketHead->prev;
+    pNew->next = pBucketHead;
+    pNew->prev = pBefore;
+    if( pBefore ){
+      pBefore->next = pNew;
+    }else{
+      pH->first = pNew;
+    }
+    pBucketHead->prev = pNew;
+  }
+  pEntry->count++;
+  pEntry->chain = pNew;
+}
+
+
+/* Resize the hash table so that it contains "new_size" buckets.
+** "new_size" must be a power of 2.
+**
+** If the allocation of the new bucket array fails the table is left
+** exactly as it was.  On success the old array is freed and every
+** element is relinked into the new one.  The new array is not cleared
+** here; the code relies on pH->xMalloc handing back zeroed memory, as
+** malloc_and_zero() does.
+*/
+static void rehash(fts1Hash *pH, int new_size){
+  struct _fts1ht *aBucket;         /* The new bucket array */
+  fts1HashElem *pElem;             /* Element being relinked */
+  int (*xHash)(const void*,int);   /* The hash function */
+
+  assert( (new_size & (new_size-1))==0 );
+  aBucket = (struct _fts1ht *)pH->xMalloc( new_size*sizeof(struct _fts1ht) );
+  if( aBucket==0 ) return;
+  if( pH->ht ) pH->xFree(pH->ht);
+  pH->ht = aBucket;
+  pH->htsize = new_size;
+  xHash = hashFunction(pH->keyClass);
+
+  /* Detach the whole element list, then re-insert one by one */
+  pElem = pH->first;
+  pH->first = 0;
+  while( pElem ){
+    fts1HashElem *pNext = pElem->next;
+    int iBucket = (*xHash)(pElem->pKey, pElem->nKey) & (new_size-1);
+    insertElement(pH, &aBucket[iBucket], pElem);
+    pElem = pNext;
+  }
+}
+
+/* Internal lookup helper.  Search bucket h of hash table pH for an
+** element whose key equals pKey,nKey and return it, or return NULL
+** if there is none.  h is the bucket index (the key's hash already
+** reduced to the table size) and is passed as the 4th parameter.
+** A table with no bucket array never matches.
+*/
+static fts1HashElem *findElementGivenHash(
+  const fts1Hash *pH,   /* The pH to be searched */
+  const void *pKey,     /* The key we are searching for */
+  int nKey,
+  int h                 /* The hash for this key. */
+){
+  struct _fts1ht *pBucket;                           /* Bucket to scan */
+  fts1HashElem *pElem;                               /* Current candidate */
+  int nLeft;                                         /* Candidates left */
+  int (*xCompare)(const void*,int,const void*,int);  /* comparison function */
+
+  if( pH->ht==0 ) return 0;
+  pBucket = &pH->ht[h];
+  xCompare = compareFunction(pH->keyClass);
+  pElem = pBucket->chain;
+  for(nLeft=pBucket->count; nLeft && pElem; nLeft--, pElem=pElem->next){
+    if( (*xCompare)(pElem->pKey,pElem->nKey,pKey,nKey)==0 ){
+      return pElem;
+    }
+  }
+  return 0;
+}
+
+/* Unlink element "elem" from hash table pH and free it.  h is the
+** index of the bucket that holds the element.
+**
+** The element is taken off the table-wide list and off its bucket,
+** its private key copy (if copyKey is set) is released, and the
+** element itself is freed.  When the last element goes, the whole
+** table is reset with fts1HashClear(), which also frees the bucket
+** array.
+*/
+static void removeElementGivenHash(
+  fts1Hash *pH,         /* The pH containing "elem" */
+  fts1HashElem* elem,   /* The element to be removed from the pH */
+  int h                 /* Hash value for the element */
+){
+  fts1HashElem *pBefore = elem->prev;
+  fts1HashElem *pAfter = elem->next;
+  struct _fts1ht *pBucket = &pH->ht[h];
+
+  /* Take the element off the table-wide list */
+  if( pBefore ){
+    pBefore->next = pAfter;
+  }else{
+    pH->first = pAfter;
+  }
+  if( pAfter ){
+    pAfter->prev = pBefore;
+  }
+
+  /* Fix up the bucket */
+  if( pBucket->chain==elem ){
+    pBucket->chain = pAfter;
+  }
+  pBucket->count--;
+  if( pBucket->count<=0 ){
+    pBucket->chain = 0;
+  }
+
+  /* Release the memory */
+  if( pH->copyKey && elem->pKey ){
+    pH->xFree(elem->pKey);
+  }
+  pH->xFree( elem );
+  pH->count--;
+  if( pH->count<=0 ){
+    assert( pH->first==0 );
+    assert( pH->count==0 );
+    fts1HashClear(pH);
+  }
+}
+
+/* Look up the key pKey,nKey in hash table pH.  Return the data
+** pointer stored with the matching element, or NULL if there is no
+** match, if pH is NULL, or if the table has no bucket array yet.
+*/
+void *sqlite3Fts1HashFind(const fts1Hash *pH, const void *pKey, int nKey){
+  int iBucket;                     /* Bucket the key hashes to */
+  fts1HashElem *pHit;              /* The element that matches key */
+  int (*xHash)(const void*,int);   /* The hash function */
+
+  if( pH==0 || pH->ht==0 ) return 0;
+  xHash = hashFunction(pH->keyClass);
+  assert( xHash!=0 );
+  iBucket = (*xHash)(pKey,nKey);
+  assert( (pH->htsize & (pH->htsize-1))==0 );
+  iBucket &= pH->htsize-1;
+  pHit = findElementGivenHash(pH, pKey, nKey, iBucket);
+  if( pHit==0 ) return 0;
+  return pHit->data;
+}
+
+/* Insert an element into the hash table pH.  The key is pKey,nKey
+** and the data is "data".
+**
+** If no element exists with a matching key, then a new
+** element is created.  A copy of the key is made if the copyKey
+** flag is set.  NULL is returned.
+**
+** If another element already exists with the same key, then the
+** new data replaces the old data and the old data is returned.
+** The key is not copied in this instance.  If a malloc fails, then
+** the new data is returned and the hash table is unchanged.
+**
+** If the "data" parameter to this function is NULL, then the
+** element corresponding to "key" is removed from the hash table.
+*/
+void *sqlite3Fts1HashInsert(
+  fts1Hash *pH,        /* The hash table to insert into */
+  const void *pKey,    /* The key */
+  int nKey,            /* Number of bytes in the key */
+  void *data           /* The data */
+){
+  int hraw;                        /* Raw hash value of the key */
+  int h;                           /* the hash of the key modulo hash table size */
+  fts1HashElem *elem;              /* Used to loop thru the element list */
+  fts1HashElem *new_elem;          /* New element added to the pH */
+  int (*xHash)(const void*,int);   /* The hash function */
+
+  assert( pH!=0 );
+  xHash = hashFunction(pH->keyClass);
+  assert( xHash!=0 );
+  hraw = (*xHash)(pKey, nKey);
+  assert( (pH->htsize & (pH->htsize-1))==0 );
+  h = hraw & (pH->htsize-1);
+
+  /* Existing key: replace the data, or delete when data is NULL */
+  elem = findElementGivenHash(pH,pKey,nKey,h);
+  if( elem ){
+    void *old_data = elem->data;
+    if( data==0 ){
+      removeElementGivenHash(pH,elem,h);
+    }else{
+      elem->data = data;
+    }
+    return old_data;
+  }
+
+  /* Deleting a key that is not present is a no-op */
+  if( data==0 ) return 0;
+
+  new_elem = (fts1HashElem*)pH->xMalloc( sizeof(fts1HashElem) );
+  if( new_elem==0 ) return data;
+  if( pH->copyKey && pKey!=0 ){
+    new_elem->pKey = pH->xMalloc( nKey );
+    if( new_elem->pKey==0 ){
+      pH->xFree(new_elem);
+      return data;
+    }
+    memcpy((void*)new_elem->pKey, pKey, nKey);
+  }else{
+    new_elem->pKey = (void*)pKey;
+  }
+  new_elem->nKey = nKey;
+  pH->count++;
+  if( pH->htsize==0 ){
+    rehash(pH,8);
+    if( pH->htsize==0 ){
+      /* The first bucket array could not be allocated.  Undo the
+      ** insert, releasing the private key copy as well as the element
+      ** so that nothing leaks.  The key is only ours to free when
+      ** copyKey is set; otherwise pKey belongs to the caller. */
+      pH->count = 0;
+      if( pH->copyKey && new_elem->pKey ){
+        pH->xFree(new_elem->pKey);
+      }
+      pH->xFree(new_elem);
+      return data;
+    }
+  }
+  if( pH->count > pH->htsize ){
+    /* A failed grow is harmless: the table just stays denser */
+    rehash(pH,pH->htsize*2);
+  }
+  assert( pH->htsize>0 );
+  assert( (pH->htsize & (pH->htsize-1))==0 );
+  h = hraw & (pH->htsize-1);
+  insertElement(pH, &pH->ht[h], new_elem);
+  new_elem->data = data;
+  return 0;
+}
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ADDED ext/fts1/fts1_hash.h
Index: ext/fts1/fts1_hash.h
==================================================================
--- /dev/null
+++ ext/fts1/fts1_hash.h
@@ -0,0 +1,112 @@
+/*
+** 2001 September 22
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** This is the header file for the generic hash-table implementation
+** used in SQLite. We've modified it slightly to serve as a standalone
+** hash table implementation for the full-text indexing module.
+**
+*/
+#ifndef _FTS1_HASH_H_
+#define _FTS1_HASH_H_
+
+/* Forward declarations of structures. */
+typedef struct fts1Hash fts1Hash;
+typedef struct fts1HashElem fts1HashElem;
+
+/* A complete hash table is an instance of the following structure.
+** The internals of this structure are intended to be opaque -- client
+** code should not attempt to access or modify the fields of this structure
+** directly. Change this structure only by using the routines below.
+** However, many of the "procedures" and "functions" for modifying and
+** accessing this structure are really macros, so we can't really make
+** this structure opaque.
+*/
+struct fts1Hash {
+ char keyClass; /* FTS1_HASH_STRING or FTS1_HASH_BINARY */
+ char copyKey; /* True if copy of key made on insert */
+ int count; /* Number of entries in this table */
+ fts1HashElem *first; /* Head of the doubly-linked list of all entries */
+ void *(*xMalloc)(int); /* malloc() function to use */
+ void (*xFree)(void *); /* free() function to use */
+ int htsize; /* Number of buckets in the hash table (0 or a power of 2) */
+ struct _fts1ht { /* the hash table: array of htsize buckets, or NULL */
+ int count; /* Number of entries with this hash */
+ fts1HashElem *chain; /* Pointer to first entry with this hash */
+ } *ht;
+};
+
+/* Each element in the hash table is an instance of the following
+** structure. All elements are stored on a single doubly-linked list.
+**
+** Again, this structure is intended to be opaque, but it can't really
+** be opaque because it is used by macros.
+*/
+struct fts1HashElem {
+ fts1HashElem *next, *prev; /* Next and previous elements in the table */
+ void *data; /* Data associated with this element */
+ /* pKey is a private copy owned by the table when the table's copyKey
+ ** flag is set; otherwise it is the pointer supplied on insert. */
+ void *pKey; int nKey; /* Key associated with this element */
+};
+
+/*
+** There are 2 different modes of operation for a hash table:
+**
+** FTS1_HASH_STRING pKey points to a string that is nKey bytes long
+** (including the null-terminator, if any). Case
+** is respected in comparisons.
+**
+** FTS1_HASH_BINARY pKey points to binary data nKey bytes long.
+** memcmp() is used to compare keys.
+**
+** A copy of the key is made if the copyKey parameter to fts1HashInit is 1.
+*/
+#define FTS1_HASH_STRING 1
+#define FTS1_HASH_BINARY 2
+
+/*
+** Access routines. To delete, insert a NULL pointer.
+**
+** Init prepares an empty table, Insert adds, replaces or (with NULL
+** data) removes an entry, Find returns the data stored for a key or
+** NULL, and Clear removes every entry and frees the table's memory.
+*/
+void sqlite3Fts1HashInit(fts1Hash*, int keytype, int copyKey);
+void *sqlite3Fts1HashInsert(fts1Hash*, const void *pKey, int nKey, void *pData);
+void *sqlite3Fts1HashFind(const fts1Hash*, const void *pKey, int nKey);
+void sqlite3Fts1HashClear(fts1Hash*);
+
+/*
+** Shorthand for the functions above
+*/
+#define fts1HashInit sqlite3Fts1HashInit
+#define fts1HashInsert sqlite3Fts1HashInsert
+#define fts1HashFind sqlite3Fts1HashFind
+#define fts1HashClear sqlite3Fts1HashClear
+
+/*
+** Macros for looping over all elements of a hash table. The idiom is
+** like this:
+**
+** fts1Hash h;
+** fts1HashElem *p;
+** ...
+** for(p=fts1HashFirst(&h); p; p=fts1HashNext(p)){
+** SomeStructure *pData = fts1HashData(p);
+** // do something with pData
+** }
+*/
+#define fts1HashFirst(H) ((H)->first)
+#define fts1HashNext(E) ((E)->next)
+#define fts1HashData(E) ((E)->data)
+#define fts1HashKey(E) ((E)->pKey)
+#define fts1HashKeysize(E) ((E)->nKey)
+
+/*
+** Number of entries in a hash table
+*/
+#define fts1HashCount(H) ((H)->count)
+
+#endif /* _FTS1_HASH_H_ */
ADDED ext/fts1/fts1_porter.c
Index: ext/fts1/fts1_porter.c
==================================================================
--- /dev/null
+++ ext/fts1/fts1_porter.c
@@ -0,0 +1,643 @@
+/*
+** 2006 September 30
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** Implementation of the full-text-search tokenizer that implements
+** a Porter stemmer.
+*/
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS1 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS1 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
+*/
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
+
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "fts1_tokenizer.h"
+
+/*
+** Class derived from sqlite3_tokenizer.  It adds no state of its own;
+** the base object is the first (and only) member, so a pointer to one
+** can be cast to a pointer to the other.
+*/
+typedef struct porter_tokenizer {
+ sqlite3_tokenizer base; /* Base class */
+} porter_tokenizer;
+
+/*
+** Class derived from sqlite3_tokenizer_cursor.  Holds the input being
+** scanned, the scan position, and a reusable buffer for the most
+** recently returned token.
+*/
+typedef struct porter_tokenizer_cursor {
+ sqlite3_tokenizer_cursor base; /* Base class - must be first */
+ const char *zInput; /* input we are tokenizing */
+ int nInput; /* size of the input */
+ int iOffset; /* current position in zInput */
+ int iToken; /* index of next token to be returned */
+ char *zToken; /* storage for current token */
+ int nAllocated; /* space allocated to zToken buffer */
+} porter_tokenizer_cursor;
+
+
+/* Forward declaration */
+static const sqlite3_tokenizer_module porterTokenizerModule;
+
+
+/*
+** Create a new tokenizer instance and hand it back through
+** *ppTokenizer.  The argc/argv arguments are accepted but not used.
+** Returns SQLITE_NOMEM if the allocation fails, else SQLITE_OK.
+*/
+static int porterCreate(
+  int argc, const char * const *argv,
+  sqlite3_tokenizer **ppTokenizer
+){
+  porter_tokenizer *pNew = (porter_tokenizer *) calloc(1, sizeof(*pNew));
+  if( pNew==NULL ){
+    return SQLITE_NOMEM;
+  }
+  *ppTokenizer = &pNew->base;
+  return SQLITE_OK;
+}
+
+/*
+** Destroy a tokenizer previously returned by porterCreate(), freeing
+** its memory.  Always returns SQLITE_OK.
+*/
+static int porterDestroy(sqlite3_tokenizer *pTokenizer){
+  void *pMem = pTokenizer;
+  free(pMem);
+  return SQLITE_OK;
+}
+
+/*
+** Prepare to begin tokenizing a particular string.  The input
+** string to be tokenized is zInput[0..nInput-1].  A NULL zInput is
+** treated as empty, and a negative nInput means "use strlen(zInput)".
+** A cursor used to incrementally tokenize this string is returned in
+** *ppCursor.  The cursor keeps the zInput pointer; it does not copy
+** the text.
+**
+** Returns SQLITE_NOMEM if the cursor cannot be allocated.
+*/
+static int porterOpen(
+  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
+  const char *zInput, int nInput,        /* String to be tokenized */
+  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
+){
+  int nByte;                             /* Effective input length */
+  porter_tokenizer_cursor *pCsr;
+
+  pCsr = (porter_tokenizer_cursor *) malloc(sizeof(*pCsr));
+  if( pCsr==NULL ) return SQLITE_NOMEM;
+
+  if( zInput==0 ){
+    nByte = 0;
+  }else{
+    nByte = nInput<0 ? (int)strlen(zInput) : nInput;
+  }
+
+  pCsr->zInput = zInput;
+  pCsr->nInput = nByte;
+  pCsr->iOffset = 0;       /* start tokenizing at the beginning */
+  pCsr->iToken = 0;
+  pCsr->zToken = NULL;     /* no space allocated, yet. */
+  pCsr->nAllocated = 0;
+
+  *ppCursor = &pCsr->base;
+  return SQLITE_OK;
+}
+
+/*
+** Close a tokenization cursor previously opened by a call to
+** porterOpen() above, releasing the token buffer and then the cursor
+** itself.  Always returns SQLITE_OK.
+*/
+static int porterClose(sqlite3_tokenizer_cursor *pCursor){
+  porter_tokenizer_cursor *pCsr = (porter_tokenizer_cursor *) pCursor;
+  char *zBuf = pCsr->zToken;
+  free(zBuf);
+  free(pCsr);
+  return SQLITE_OK;
+}
+/*
+** Vowel or consonant
+**
+** One entry per lower-case letter, indexed by (letter - 'a'):
+**   0  the letter is a vowel (a, e, i, o, u)
+**   1  the letter is a consonant
+**   2  the letter is 'y', whose class depends on its neighbour and is
+**      resolved by isConsonant() and isVowel()
+*/
+static const char cType[] = {
+ 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
+ 1, 1, 1, 2, 1
+};
+
+/*
+** isConsonant() and isVowel() determine if the first character in
+** the string they point to is a consonant or a vowel, according
+** to Porter rules.
+**
+** A consonant is any letter other than 'a', 'e', 'i', 'o', or 'u'.
+** 'Y' is a consonant unless it follows another consonant,
+** in which case it is a vowel.
+**
+** In these routines the letters are in reverse order, so the letter
+** that "precedes" z[0] in the word is z[1].  The 'y' rule therefore
+** becomes: 'y' is a consonant when it is the last character of the
+** reversed string or when the character after it is a vowel.
+**
+** Both routines return 0 at the end of the string.
+*/
+static int isVowel(const char*);
+static int isConsonant(const char *z){
+  int eType;               /* cType[] class of the first character */
+  char x = *z;
+  if( x==0 ) return 0;
+  assert( x>='a' && x<='z' );
+  eType = cType[x-'a'];
+  if( eType>=2 ){
+    /* 'y': decided by the neighbouring letter */
+    return z[1]==0 || isVowel(z + 1);
+  }
+  return eType;
+}
+/* Return true if the first character of the (reversed) string z is a
+** vowel.  Returns 0 at the end of the string.  A 'y' counts as a vowel
+** exactly when the character after it is a consonant.
+*/
+static int isVowel(const char *z){
+  int eType;               /* cType[] class of the first character */
+  char x = *z;
+  if( x==0 ) return 0;
+  assert( x>='a' && x<='z' );
+  eType = cType[x-'a'];
+  if( eType>=2 ){
+    /* 'y': decided by the neighbouring letter */
+    return isConsonant(z + 1);
+  }
+  return 1-eType;
+}
+
+/*
+** Let any sequence of one or more vowels be represented by V and let
+** C be a sequence of one or more consonants.  Then every word can be
+** represented as:
+**
+**           [C] (VC){m} [V]
+**
+** In prose:  A word is an optional consonant run followed by zero or
+** more vowel-consonant pairs followed by an optional vowel run.  "m"
+** is the number of vowel-consonant pairs.
+**
+** Return true if the m-value for z is 1 or more.  In other words,
+** return true if z contains at least one vowel that is followed
+** by a consonant.
+**
+** In this routine z[] is in reverse order.  So we are really looking
+** for an instance of a consonant followed by a vowel.
+*/
+static int m_gt_0(const char *z){
+  const char *p = z;
+  for(; isVowel(p); p++){}
+  if( p[0]==0 ) return 0;
+  for(; isConsonant(p); p++){}
+  return p[0]!=0;
+}
+
+/* Like m_gt_0() above except that we are looking for a value of m
+** which is exactly 1.  z[] is in reverse order.
+*/
+static int m_eq_1(const char *z){
+  const char *p = z;
+  for(; isVowel(p); p++){}
+  if( p[0]==0 ) return 0;
+  for(; isConsonant(p); p++){}
+  if( p[0]==0 ) return 0;
+  for(; isVowel(p); p++){}
+  if( p[0]==0 ) return 1;
+  for(; isConsonant(p); p++){}
+  return p[0]==0;
+}
+
+/* Like m_gt_0() above except that we are looking for a value of m>1
+** instead of m>0.  z[] is in reverse order.
+*/
+static int m_gt_1(const char *z){
+  const char *p = z;
+  for(; isVowel(p); p++){}
+  if( p[0]==0 ) return 0;
+  for(; isConsonant(p); p++){}
+  if( p[0]==0 ) return 0;
+  for(; isVowel(p); p++){}
+  if( p[0]==0 ) return 0;
+  for(; isConsonant(p); p++){}
+  return p[0]!=0;
+}
+
+/*
+** Return TRUE if there is a vowel anywhere within the
+** zero-terminated string z.
+*/
+static int hasVowel(const char *z){
+  const char *p = z;
+  for(; isConsonant(p); p++){}
+  return p[0]!=0;
+}
+
+/*
+** Return TRUE if the word ends in a double consonant.
+**
+** The text is reversed here.  So we are really looking at
+** the first two characters of z[].
+*/
+static int doubleConsonant(const char *z){
+  if( !isConsonant(z) ) return 0;
+  if( z[0]!=z[1] ) return 0;
+  return isConsonant(z+1)!=0;
+}
+
+/*
+** Return TRUE if the word ends with three letters which
+** are consonant-vowel-consonant and where the final consonant
+** is not 'w', 'x', or 'y'.
+**
+** The word is reversed here.  So we are really checking the
+** first three letters and the first one cannot be in [wxy].
+*/
+static int star_oh(const char *z){
+  /* Final letter: a consonant other than w, x or y */
+  if( z[0]==0 || !isConsonant(z) ) return 0;
+  if( z[0]=='w' || z[0]=='x' || z[0]=='y' ) return 0;
+  /* Preceded by a vowel ... */
+  if( z[1]==0 || !isVowel(z+1) ) return 0;
+  /* ... which is itself preceded by a consonant */
+  return z[2]!=0 && isConsonant(z+2);
+}
+
+/*
+** If the word ends with zFrom and xCond() is true for the stem
+** of the word that precedes the zFrom ending, then change the
+** ending to zTo.
+**
+** The input word *pz and zFrom are both in reverse order.  zTo
+** is in normal order.  The replacement is written into the bytes
+** immediately in front of the stem and *pz is moved to the new
+** start of the (reversed) word.
+**
+** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
+** match.  Note that TRUE is returned even if xCond() fails and
+** no substitution occurs.  A NULL xCond means "no condition".
+*/
+static int stem(
+  char **pz,                  /* The word being stemmed (Reversed) */
+  const char *zFrom,          /* If the ending matches this... (Reversed) */
+  const char *zTo,            /* ... change the ending to this (not reversed) */
+  int (*xCond)(const char*)   /* Condition that must be true */
+){
+  char *zStem = *pz;
+
+  /* Match the ending; on success zStem is left at the start of the stem */
+  for(; *zFrom; zFrom++, zStem++){
+    if( *zFrom!=*zStem ) return 0;
+  }
+  if( xCond && !xCond(zStem) ) return 1;
+
+  /* Prepend zTo, reversing it as it is copied */
+  for(; *zTo; zTo++){
+    *(--zStem) = *zTo;
+  }
+  *pz = zStem;
+  return 1;
+}
+
+/*
+** This is the fallback stemmer used when the porter stemmer is
+** inappropriate.  The input word is copied into the output with
+** US-ASCII case folding.  If the input word is too long (more
+** than 20 bytes if it contains no digits or more than 6 bytes if
+** it contains digits) then word is truncated to 20 or 6 bytes
+** by taking 10 or 3 bytes from the beginning and end.
+**
+** zOut must have room for at least nIn bytes, and for nIn+1 bytes
+** when the word is not truncated.  The output is zero-terminated and
+** its length (not counting the terminator) is written to *pnOut.
+*/
+static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
+  int i, mx, j;
+  int hasDigit = 0;
+  for(i=0; i<nIn; i++){
+    int c = zIn[i];
+    if( c>='A' && c<='Z' ){
+      zOut[i] = c - 'A' + 'a';
+    }else{
+      if( c>='0' && c<='9' ) hasDigit = 1;
+      zOut[i] = c;
+    }
+  }
+  mx = hasDigit ? 3 : 10;
+  if( nIn>mx*2 ){
+    /* Keep the first mx bytes and slide the last mx bytes down */
+    for(j=mx, i=nIn-mx; i<nIn; i++, j++){
+      zOut[j] = zOut[i];
+    }
+    i = j;
+  }
+  zOut[i] = 0;
+  *pnOut = i;
+}
+
+
+/*
+** Stem the input word zIn[0..nIn-1].  Store the output in zOut and
+** write its length (not counting the zero terminator) into *pnOut.
+**
+** Upper-case US-ASCII letters are folded to lower case.  The Porter
+** rules are applied only to words of 3 to 20 bytes that consist
+** entirely of characters in [a-zA-Z]; every other word is handed to
+** copy_stemmer() above.
+**
+** The word is first copied, reversed and case-folded, into the tail
+** of zReverse[].  All of the suffix tests below therefore look at the
+** start of z[], and suffix literals passed to stem() are spelled
+** backwards.  The last five bytes of zReverse[] are zeroed so that
+** the z[1]..z[4] look-aheads in the steps below stay inside the
+** array even when the stemmed word has become very short.
+*/
+static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
+  int i, j, c;
+  char zReverse[28];
+  char *z, *z2;
+  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
+    /* The word is too big or too small for the porter stemmer.
+    ** Fallback to the copy stemmer */
+    copy_stemmer(zIn, nIn, zOut, pnOut);
+    return;
+  }
+  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
+    c = zIn[i];
+    if( c>='A' && c<='Z' ){
+      zReverse[j] = c + 'a' - 'A';
+    }else if( c>='a' && c<='z' ){
+      zReverse[j] = c;
+    }else{
+      /* The use of a character not in [a-zA-Z] means that we fallback
+      ** to the copy stemmer */
+      copy_stemmer(zIn, nIn, zOut, pnOut);
+      return;
+    }
+  }
+  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
+  z = &zReverse[j+1];
+
+
+  /* Step 1a */
+  if( z[0]=='s' ){
+    if(
+     !stem(&z, "sess", "ss", 0) &&
+     !stem(&z, "sei", "i", 0)  &&
+     !stem(&z, "ss", "ss", 0)
+    ){
+      z++;
+    }
+  }
+
+  /* Step 1b */
+  z2 = z;
+  if( stem(&z, "dee", "ee", m_gt_0) ){
+    /* Do nothing.  The work was all in the test */
+  }else if(
+     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
+      && z!=z2
+  ){
+     if( stem(&z, "ta", "ate", 0) ||
+         stem(&z, "lb", "ble", 0) ||
+         stem(&z, "zi", "ize", 0) ){
+       /* Do nothing.  The work was all in the test */
+     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
+       z++;
+     }else if( m_eq_1(z) && star_oh(z) ){
+       *(--z) = 'e';
+     }
+  }
+
+  /* Step 1c */
+  if( z[0]=='y' && hasVowel(z+1) ){
+    z[0] = 'i';
+  }
+
+  /* Step 2 */
+  switch( z[1] ){
+   case 'a':
+     stem(&z, "lanoita", "ate", m_gt_0) ||
+     stem(&z, "lanoit", "tion", m_gt_0);
+     break;
+   case 'c':
+     stem(&z, "icne", "ence", m_gt_0) ||
+     stem(&z, "icna", "ance", m_gt_0);
+     break;
+   case 'e':
+     stem(&z, "rezi", "ize", m_gt_0);
+     break;
+   case 'g':
+     stem(&z, "igol", "log", m_gt_0);
+     break;
+   case 'l':
+     stem(&z, "ilb", "ble", m_gt_0) ||
+     stem(&z, "illa", "al", m_gt_0) ||
+     stem(&z, "iltne", "ent", m_gt_0) ||
+     stem(&z, "ile", "e", m_gt_0) ||
+     stem(&z, "ilsuo", "ous", m_gt_0);
+     break;
+   case 'o':
+     stem(&z, "noitazi", "ize", m_gt_0) ||
+     stem(&z, "noita", "ate", m_gt_0) ||
+     stem(&z, "rota", "ate", m_gt_0);
+     break;
+   case 's':
+     stem(&z, "msila", "al", m_gt_0) ||
+     stem(&z, "ssenevi", "ive", m_gt_0) ||
+     stem(&z, "ssenluf", "ful", m_gt_0) ||
+     stem(&z, "ssensuo", "ous", m_gt_0);
+     break;
+   case 't':
+     stem(&z, "itila", "al", m_gt_0) ||
+     stem(&z, "itivi", "ive", m_gt_0) ||
+     stem(&z, "itilib", "ble", m_gt_0);
+     break;
+  }
+
+  /* Step 3 */
+  switch( z[0] ){
+   case 'e':
+     stem(&z, "etaci", "ic", m_gt_0) ||
+     stem(&z, "evita", "", m_gt_0)   ||
+     stem(&z, "ezila", "al", m_gt_0);
+     break;
+   case 'i':
+     stem(&z, "itici", "ic", m_gt_0);
+     break;
+   case 'l':
+     stem(&z, "laci", "ic", m_gt_0) ||
+     stem(&z, "luf", "", m_gt_0);
+     break;
+   case 's':
+     stem(&z, "ssen", "", m_gt_0);
+     break;
+  }
+
+  /* Step 4 */
+  switch( z[1] ){
+   case 'a':
+     if( z[0]=='l' && m_gt_1(z+2) ){
+       z += 2;
+     }
+     break;
+   case 'c':
+     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
+       z += 4;
+     }
+     break;
+   case 'e':
+     if( z[0]=='r' && m_gt_1(z+2) ){
+       z += 2;
+     }
+     break;
+   case 'i':
+     if( z[0]=='c' && m_gt_1(z+2) ){
+       z += 2;
+     }
+     break;
+   case 'l':
+     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
+       z += 4;
+     }
+     break;
+   case 'n':
+     if( z[0]=='t' ){
+       if( z[2]=='a' ){
+         if( m_gt_1(z+3) ){
+           z += 3;
+         }
+       }else if( z[2]=='e' ){
+         stem(&z, "tneme", "", m_gt_1) ||
+         stem(&z, "tnem", "", m_gt_1) ||
+         stem(&z, "tne", "", m_gt_1);
+       }
+     }
+     break;
+   case 'o':
+     if( z[0]=='u' ){
+       if( m_gt_1(z+2) ){
+         z += 2;
+       }
+     }else if( z[3]=='s' || z[3]=='t' ){
+       stem(&z, "noi", "", m_gt_1);
+     }
+     break;
+   case 's':
+     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
+       z += 3;
+     }
+     break;
+   case 't':
+     stem(&z, "eta", "", m_gt_1) ||
+     stem(&z, "iti", "", m_gt_1);
+     break;
+   case 'u':
+     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
+       z += 3;
+     }
+     break;
+   case 'v':
+   case 'z':
+     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
+       z += 3;
+     }
+     break;
+  }
+
+  /* Step 5a */
+  if( z[0]=='e' ){
+    if( m_gt_1(z+1) ){
+      z++;
+    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
+      z++;
+    }
+  }
+
+  /* Step 5b */
+  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
+    z++;
+  }
+
+  /* z[] is now the stemmed word in reverse order.  Flip it back
+  ** around into forward order and return.
+  */
+  *pnOut = i = strlen(z);
+  zOut[i] = 0;
+  while( *z ){
+    zOut[--i] = *(z++);
+  }
+}
+
+/*
+** Characters that can be part of a token.  We assume any character
+** whose value is 0x80 or greater (any byte of a multi-byte UTF
+** character) can be part of a token.  In other words, delimiters all
+** must have values of 0x7f or lower.
+**
+** isIdChar[] covers 0x30 through 0x7f and is indexed by (ch - 0x30);
+** everything below 0x30 is a delimiter.  The table marks the digits,
+** the ASCII letters and '_' as token characters.
+**
+** idChar(C) and isDelim(C) are exact opposites.  Both assign their
+** argument to a variable named "ch", which must be declared in the
+** scope where the macro is used.
+*/
+static const char isIdChar[] = {
+/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
+};
+#define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
+#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
+
+/*
+** Extract the next token from a tokenization cursor.  The cursor must
+** have been opened by a prior call to porterOpen().
+**
+** On SQLITE_OK the stemmed token is returned in *pzToken / *pnBytes
+** (the text lives in the cursor's buffer and is overwritten by the
+** next call), *piStartOffset and *piEndOffset give the byte range of
+** the unstemmed token within the input, and *piPosition is the
+** zero-based token index.  SQLITE_DONE is returned when the input is
+** exhausted and SQLITE_NOMEM if the token buffer cannot be grown.
+*/
+static int porterNext(
+  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
+  const char **pzToken,               /* OUT: *pzToken is the token text */
+  int *pnBytes,                       /* OUT: Number of bytes in token */
+  int *piStartOffset,                 /* OUT: Starting offset of token */
+  int *piEndOffset,                   /* OUT: Ending offset of token */
+  int *piPosition                     /* OUT: Position integer of token */
+){
+  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
+  const char *z = c->zInput;
+
+  while( c->iOffset<c->nInput ){
+    int iStartOffset, ch;   /* ch is scratch space for isDelim() */
+
+    /* Scan past delimiter characters */
+    while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
+      c->iOffset++;
+    }
+
+    /* Count non-delimiter characters. */
+    iStartOffset = c->iOffset;
+    while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
+      c->iOffset++;
+    }
+
+    if( c->iOffset>iStartOffset ){
+      int n = c->iOffset-iStartOffset;
+      if( n>c->nAllocated ){
+        /* Grow through a temporary so that a failed realloc() leaves
+        ** the old buffer and its recorded size intact; porterClose()
+        ** can then still free it. */
+        char *zNew = realloc(c->zToken, n+20);
+        if( zNew==NULL ) return SQLITE_NOMEM;
+        c->zToken = zNew;
+        c->nAllocated = n+20;
+      }
+      porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
+      *pzToken = c->zToken;
+      *piStartOffset = iStartOffset;
+      *piEndOffset = c->iOffset;
+      *piPosition = c->iToken++;
+      return SQLITE_OK;
+    }
+  }
+  return SQLITE_DONE;
+}
+
+/*
+** The set of routines that implement the porter-stemmer tokenizer.
+** The initializer is positional: the version number 0 followed by the
+** create, destroy, open, close and next callbacks defined above.
+*/
+static const sqlite3_tokenizer_module porterTokenizerModule = {
+ 0,
+ porterCreate,
+ porterDestroy,
+ porterOpen,
+ porterClose,
+ porterNext,
+};
+
+/*
+** Return, in *ppModule, a pointer to the porter tokenizer module.
+** Nothing is allocated: the pointer refers to the static
+** porterTokenizerModule table above.
+*/
+void sqlite3Fts1PorterTokenizerModule(
+ sqlite3_tokenizer_module const**ppModule
+){
+ *ppModule = &porterTokenizerModule;
+}
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ADDED ext/fts1/fts1_tokenizer.h
Index: ext/fts1/fts1_tokenizer.h
==================================================================
--- /dev/null
+++ ext/fts1/fts1_tokenizer.h
@@ -0,0 +1,90 @@
+/*
+** 2006 July 10
+**
+** The author disclaims copyright to this source code.
+**
+*************************************************************************
+** Defines the interface to tokenizers used by fulltext-search. There
+** are three basic components:
+**
+** sqlite3_tokenizer_module is a singleton defining the tokenizer
+** interface functions. This is essentially the class structure for
+** tokenizers.
+**
+** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
+** including customization information defined at creation time.
+**
+** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
+** tokens from a particular input.
+*/
+#ifndef _FTS1_TOKENIZER_H_
+#define _FTS1_TOKENIZER_H_
+
+/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
+** If tokenizers are to be allowed to call sqlite3_*() functions, then
+** we will need a way to register the API consistently.
+*/
+#include "sqlite3.h"
+
+/*
+** Structures used by the tokenizer interface.
+*/
+typedef struct sqlite3_tokenizer sqlite3_tokenizer;
+typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
+typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
+
+struct sqlite3_tokenizer_module {
+ int iVersion; /* currently 0 */
+
+ /*
+ ** Create and destroy a tokenizer. argc/argv are passed down from
+ ** the fulltext virtual table creation to allow customization.
+ */
+ int (*xCreate)(int argc, const char *const*argv,
+ sqlite3_tokenizer **ppTokenizer);
+ int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
+
+ /*
+ ** Tokenize a particular input. Call xOpen() to prepare to
+ ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
+ ** xClose() to free any internal state. The pInput passed to
+ ** xOpen() must exist until the cursor is closed. The ppToken
+ ** result from xNext() is only valid until the next call to xNext()
+ ** or until xClose() is called.
+ */
+ /* TODO(shess) current implementation requires pInput to be
+ ** nul-terminated. This should either be fixed, or pInput/nBytes
+ ** should be converted to zInput.
+ */
+ int (*xOpen)(sqlite3_tokenizer *pTokenizer,
+ const char *pInput, int nBytes,
+ sqlite3_tokenizer_cursor **ppCursor);
+ int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
+ int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
+ const char **ppToken, int *pnBytes,
+ int *piStartOffset, int *piEndOffset, int *piPosition);
+};
+
+struct sqlite3_tokenizer {
+ const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
+ /* Tokenizer implementations will typically add additional fields */
+};
+
+struct sqlite3_tokenizer_cursor {
+ sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
+ /* Tokenizer implementations will typically add additional fields */
+};
+
+/*
+** Get the module for a tokenizer which generates tokens based on a
+** set of non-token characters. The default is to break tokens at any
+** non-alnum character, though the set of delimiters can also be
+** specified by the first argv argument to xCreate().
+*/
+/* TODO(shess) This doesn't belong here. Need some sort of
+** registration process.
+*/
+void sqlite3Fts1SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
+void sqlite3Fts1PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
+
+#endif /* _FTS1_TOKENIZER_H_ */
ADDED ext/fts1/fts1_tokenizer1.c
Index: ext/fts1/fts1_tokenizer1.c
==================================================================
--- /dev/null
+++ ext/fts1/fts1_tokenizer1.c
@@ -0,0 +1,221 @@
+/*
+** The author disclaims copyright to this source code.
+**
+*************************************************************************
+** Implementation of the "simple" full-text-search tokenizer.
+*/
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS1 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS1 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
+*/
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
+
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "fts1_tokenizer.h"
+
+/*
+** A "simple" tokenizer instance: the generic tokenizer base followed by
+** a table indexed by 7-bit character code.  A nonzero entry marks that
+** character as a token delimiter (see isDelim()).
+*/
+typedef struct simple_tokenizer {
+ sqlite3_tokenizer base;
+ char delim[128]; /* flag ASCII delimiters */
+} simple_tokenizer;
+
+/*
+** State for one pass over an input string.  pToken is a scratch buffer
+** that simpleNext() grows on demand and simpleClose() frees.
+*/
+typedef struct simple_tokenizer_cursor {
+ sqlite3_tokenizer_cursor base;
+ const char *pInput; /* input we are tokenizing */
+ int nBytes; /* size of the input */
+ int iOffset; /* current position in pInput */
+ int iToken; /* index of next token to be returned */
+ char *pToken; /* storage for current token */
+ int nTokenAllocated; /* space allocated to pToken buffer */
+} simple_tokenizer_cursor;
+
+
+/* Forward declaration */
+static const sqlite3_tokenizer_module simpleTokenizerModule;
+
+/*
+** Return 1 if c is a 7-bit character flagged in t->delim[], else 0.
+** Bytes with the high bit set are never delimiters.
+*/
+static int isDelim(simple_tokenizer *t, unsigned char c){
+  if( c>=0x80 ){
+    return 0;
+  }
+  return t->delim[c]!=0;
+}
+
+/*
+** Create a new tokenizer instance.
+**
+** If argc>1, every byte of argv[1] becomes a delimiter; a byte with the
+** high bit set makes the call fail with SQLITE_ERROR.  Otherwise every
+** ASCII character in 1..0x7f that is not alphanumeric is a delimiter.
+**
+** On success *ppTokenizer receives the new object (release it with
+** simpleDestroy()) and SQLITE_OK is returned.  SQLITE_NOMEM is returned
+** if the allocation fails.
+**
+** NOTE(review): base.pModule is left zeroed by calloc() here; presumably
+** the caller fills it in - confirm.
+*/
+static int simpleCreate(
+  int argc, const char * const *argv,
+  sqlite3_tokenizer **ppTokenizer
+){
+  simple_tokenizer *t;
+
+  t = (simple_tokenizer *) calloc(sizeof(*t), 1);
+  if( t==NULL ) return SQLITE_NOMEM;
+
+  /* TODO(shess) Delimiters need to remain the same from run to run,
+  ** else we need to reindex.  One solution would be a meta-table to
+  ** track such information in the database, then we'd only want this
+  ** information on the initial create.
+  */
+  if( argc>1 ){
+    int i, n = strlen(argv[1]);
+    for(i=0; i<n; i++){
+      unsigned char ch = argv[1][i];
+      /* delim[] only has room for 7-bit characters, so a delimiter with
+      ** the high bit set cannot be represented. */
+      if( ch>=0x80 ){
+        free(t);
+        return SQLITE_ERROR;
+      }
+      t->delim[ch] = 1;
+    }
+  } else {
+    /* Mark non-alphanumeric ASCII characters as delimiters */
+    int i;
+    for(i=1; i<0x80; i++){
+      t->delim[i] = !isalnum(i);
+    }
+  }
+
+  *ppTokenizer = &t->base;
+  return SQLITE_OK;
+}
+
+/*
+** Destroy a tokenizer created by simpleCreate().  The base object is the
+** first member of the simple_tokenizer allocation, so freeing the base
+** pointer releases the whole tokenizer.  Always returns SQLITE_OK.
+*/
+static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
+ free(pTokenizer);
+ return SQLITE_OK;
+}
+
+/*
+** Begin tokenizing pInput[0..nBytes-1] and hand back a fresh cursor in
+** *ppCursor.  A NULL pInput is treated as empty input; a negative nBytes
+** means "use strlen(pInput)".  Returns SQLITE_OK, or SQLITE_NOMEM if the
+** cursor cannot be allocated.
+**
+** NOTE(review): pTokenizer is not stored in the cursor here although
+** simpleNext() reads base.pTokenizer; presumably the caller sets it -
+** confirm.
+*/
+static int simpleOpen(
+  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
+  const char *pInput, int nBytes,        /* String to be tokenized */
+  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
+){
+  simple_tokenizer_cursor *pCsr;
+  int nInput;
+
+  pCsr = (simple_tokenizer_cursor *) malloc(sizeof(*pCsr));
+  if( pCsr==NULL ) return SQLITE_NOMEM;
+
+  /* Work out how many bytes of input there are. */
+  if( pInput==0 ){
+    nInput = 0;
+  }else{
+    nInput = nBytes<0 ? (int)strlen(pInput) : nBytes;
+  }
+
+  pCsr->pInput = pInput;
+  pCsr->nBytes = nInput;
+  pCsr->iOffset = 0;            /* start tokenizing at the beginning */
+  pCsr->iToken = 0;
+  pCsr->pToken = NULL;          /* no space allocated, yet. */
+  pCsr->nTokenAllocated = 0;
+
+  *ppCursor = &pCsr->base;
+  return SQLITE_OK;
+}
+
+/*
+** Close a tokenization cursor previously opened by a call to
+** simpleOpen() above.  Frees the token scratch buffer and then the
+** cursor itself; the input string is not owned by the cursor and is
+** left alone.  Always returns SQLITE_OK.
+*/
+static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
+ simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
+ free(c->pToken);
+ free(c);
+ return SQLITE_OK;
+}
+
+/*
+** Extract the next token from a tokenization cursor.  The cursor must
+** have been opened by a prior call to simpleOpen().
+**
+** Skips delimiters, collects the following run of non-delimiter bytes
+** and copies it into the cursor's scratch buffer with ASCII letters
+** folded to lower case.  Returns SQLITE_OK with the OUT parameters
+** filled in, SQLITE_DONE at end of input, or SQLITE_NOMEM if the
+** scratch buffer cannot be grown.  *ppToken is not nul-terminated and is
+** overwritten by the next call.
+*/
+static int simpleNext(
+  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
+  const char **ppToken,               /* OUT: *ppToken is the token text */
+  int *pnBytes,                       /* OUT: Number of bytes in token */
+  int *piStartOffset,                 /* OUT: Starting offset of token */
+  int *piEndOffset,                   /* OUT: Ending offset of token */
+  int *piPosition                     /* OUT: Position integer of token */
+){
+  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
+  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
+  unsigned char *p = (unsigned char *)c->pInput;
+
+  while( c->iOffset<c->nBytes ){
+    int iStartOffset;
+
+    /* Scan past delimiter characters */
+    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
+      c->iOffset++;
+    }
+
+    /* Count non-delimiter characters. */
+    iStartOffset = c->iOffset;
+    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
+      c->iOffset++;
+    }
+
+    if( c->iOffset>iStartOffset ){
+      int i, n = c->iOffset-iStartOffset;
+      if( n>c->nTokenAllocated ){
+        /* Grow through a temporary so that a failed realloc() neither
+        ** leaks the old buffer nor leaves nTokenAllocated out of step
+        ** with the buffer that is actually held. */
+        int nNew = n+20;
+        char *pNew = realloc(c->pToken, nNew);
+        if( pNew==NULL ) return SQLITE_NOMEM;
+        c->pToken = pNew;
+        c->nTokenAllocated = nNew;
+      }
+      for(i=0; i<n; i++){
+        /* Only ASCII is case-folded; bytes with the high bit set are
+        ** copied through unchanged. */
+        unsigned char ch = p[iStartOffset+i];
+        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
+      }
+      *ppToken = c->pToken;
+      *pnBytes = n;
+      *piStartOffset = iStartOffset;
+      *piEndOffset = c->iOffset;
+      *piPosition = c->iToken++;
+
+      return SQLITE_OK;
+    }
+  }
+  return SQLITE_DONE;
+}
+
+/*
+** The set of routines that implement the simple tokenizer.
+** The initializer is positional, so the entries must stay in the order
+** in which struct sqlite3_tokenizer_module declares its members.
+*/
+static const sqlite3_tokenizer_module simpleTokenizerModule = {
+ 0, /* iVersion */
+ simpleCreate, /* xCreate */
+ simpleDestroy, /* xDestroy */
+ simpleOpen, /* xOpen */
+ simpleClose, /* xClose */
+ simpleNext, /* xNext */
+};
+
+/*
+** Store a pointer to the simple tokenizer module in *ppModule.
+** Nothing is allocated: the module is the file-scope constant
+** simpleTokenizerModule, so the caller must not free the result.
+*/
+void sqlite3Fts1SimpleTokenizerModule(
+ sqlite3_tokenizer_module const**ppModule
+){
+ *ppModule = &simpleTokenizerModule;
+}
+
+#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
ADDED ext/fts1/fulltext.c
Index: ext/fts1/fulltext.c
==================================================================
--- /dev/null
+++ ext/fts1/fulltext.c
@@ -0,0 +1,1511 @@
+/* The author disclaims copyright to this source code.
+ *
+ * This is an SQLite module implementing full-text search.
+ */
+
+#include <assert.h>
+#if !defined(__APPLE__)
+#include <malloc.h>
+#else
+#include <stdlib.h>
+#endif
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "fulltext.h"
+#include "ft_hash.h"
+#include "tokenizer.h"
+#include "sqlite3.h"
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT1
+
+/* utility functions */
+
+/* We encode variable-length integers in little-endian order using seven bits
+ * per byte as follows:
+**
+** KEY:
+** A = 0xxxxxxx 7 bits of data and one flag bit
+** B = 1xxxxxxx 7 bits of data and one flag bit
+**
+** 7 bits - A
+** 14 bits - BA
+** 21 bits - BBA
+** and so on.
+*/
+
+/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
+#define VARINT_MAX 10
+
+/* Encode v as a variable-length integer at p[0..].
+ * Seven bits are emitted per byte, least-significant group first; every
+ * byte except the last has its high bit set.  Between 1 and VARINT_MAX
+ * bytes are written and that count is returned. */
+static int putVarint(char *p, sqlite_int64 v){
+  unsigned char *pOut = (unsigned char *) p;
+  sqlite_uint64 rest = v;
+  int n = 0;
+
+  for(;;){
+    unsigned char low7 = (unsigned char) (rest & 0x7f);
+    rest >>= 7;
+    if( rest==0 ){
+      pOut[n++] = low7;            /* final byte: high bit clear */
+      break;
+    }
+    pOut[n++] = (unsigned char) (low7 | 0x80);
+  }
+  assert( n <= VARINT_MAX );
+  return n;
+}
+
+/* Read a 64-bit variable-length integer from memory starting at p[0].
+ * Return the number of bytes read, or 0 on error.
+ * The value is stored in *v.  On error (VARINT_MAX continuation bytes
+ * seen without a terminating byte) *v is set to 0 so that callers which
+ * ignore the return value never read an uninitialized result. */
+static int getVarint(const char *p, sqlite_int64 *v){
+  const unsigned char *pStart = (const unsigned char *) p;
+  const unsigned char *q = pStart;
+  sqlite_uint64 x = 0, y = 1;
+  while( (*q & 0x80) == 0x80 ){
+    x += y * (*q++ & 0x7f);
+    y <<= 7;
+    if( q - pStart >= VARINT_MAX ){  /* bad data */
+      assert( 0 );
+      *v = 0;
+      return 0;
+    }
+  }
+  x += y * (*q++);
+  *v = (sqlite_int64) x;
+  return (int) (q - pStart);
+}
+
+/* Read a varint with getVarint() and store it in *pi truncated to int.
+ * The assert checks that the decoded value fits in an int; in NDEBUG
+ * builds an oversized value is silently truncated.
+ * Returns getVarint()'s result: bytes consumed, or 0 on bad data. */
+static int getVarint32(const char *p, int *pi){
+ sqlite_int64 i;
+ int ret = getVarint(p, &i);
+ *pi = (int) i;
+ assert( *pi==i );
+ return ret;
+}
+
+/*** Document lists ***
+ *
+ * A document list holds a sorted list of varint-encoded document IDs.
+ *
+ * A doclist with type DL_POSITIONS_OFFSETS is stored like this:
+ *
+ * array {
+ * varint docid;
+ * array {
+ * varint position; (delta from previous position plus 1, or 0 for end)
+ * varint startOffset; (delta from previous startOffset)
+ * varint endOffset; (delta from startOffset)
+ * }
+ * }
+ *
+ * Here, array { X } means zero or more occurrences of X, adjacent in memory.
+ *
+ * A doclist with type DL_POSITIONS is like the above, but holds only docids
+ * and positions without offset information.
+ *
+ * A doclist with type DL_DOCIDS is like the above, but holds only docids
+ * without positions or offset information.
+ *
+ * On disk, every document list has positions and offsets, so we don't bother
+ * to serialize a doclist's type.
+ *
+ * We don't yet delta-encode document IDs; doing so will probably be a
+ * modest win.
+ *
+ * NOTE(shess) I've thought of a slightly (1%) better offset encoding.
+ * After the first offset, estimate the next offset by using the
+ * current token position and the previous token position and offset,
+ * offset to handle some variance. So the estimate would be
+ * (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
+ * as normal. Offsets more than 64 chars from the estimate are
+ * encoded as the delta to the previous start offset + 128. An
+ * additional tiny increment can be gained by using the end offset of
+ * the previous token to make the estimate a tiny bit more precise.
+*/
+
+typedef enum DocListType {
+ DL_DOCIDS, /* docids only */
+ DL_POSITIONS, /* docids + positions */
+ DL_POSITIONS_OFFSETS /* docids + positions + offsets */
+} DocListType;
+
+typedef struct DocList {
+ char *pData;
+ int nData;
+ DocListType iType;
+ int iLastPos; /* the last position written */
+ int iLastOffset; /* the last start offset written */
+} DocList;
+
+/* Initialize *d as a doclist of type iType holding a private copy of
+ * pData[0..nData-1].  With nData<=0 the list starts with no buffer.
+ * The write-side delta state (iLastPos, iLastOffset) is reset to 0.
+ * NOTE(review): the malloc() result is not checked before memcpy(). */
+static void docListInit(DocList *d, DocListType iType,
+                        const char *pData, int nData){
+  d->iType = iType;
+  d->iLastPos = 0;
+  d->iLastOffset = 0;
+  d->nData = nData;
+  d->pData = NULL;
+  if( nData<=0 ){
+    return;
+  }
+  d->pData = malloc(nData);
+  memcpy(d->pData, pData, nData);
+}
+
+/* Create a new dynamically-allocated, empty DocList of the given type.
+ * Release it with docListDelete().
+ * NOTE(review): the malloc() result is passed to docListInit() without
+ * a NULL check. */
+static DocList *docListNew(DocListType iType){
+ DocList *d = (DocList *) malloc(sizeof(DocList));
+ docListInit(d, iType, 0, 0);
+ return d;
+}
+
+/* Release the data buffer owned by *d without freeing d itself (for
+ * DocLists that are not heap-allocated).  Unless NDEBUG is defined the
+ * structure is then overwritten with 0x55 bytes so that use after
+ * destroy is easier to spot. */
+static void docListDestroy(DocList *d){
+ free(d->pData);
+#ifndef NDEBUG
+ memset(d, 0x55, sizeof(*d));
+#endif
+}
+
+/* Destroy a DocList obtained from docListNew(): release its data buffer
+ * and then the DocList structure itself. */
+static void docListDelete(DocList *d){
+ docListDestroy(d);
+ free(d);
+}
+
+/* Return a pointer one past the last byte of the doclist's data. */
+static char *docListEnd(DocList *d){
+ return d->pData + d->nData;
+}
+
+/* Append a varint to a DocList's data.  The buffer is grown by exactly
+ * the encoded size on every call.
+ * NOTE(review): the realloc() result is stored and used without a NULL
+ * check. */
+static void appendVarint(DocList *d, sqlite_int64 i){
+ char c[VARINT_MAX];
+ int n = putVarint(c, i);
+ d->pData = realloc(d->pData, d->nData + n);
+ memcpy(d->pData + d->nData, c, n);
+ d->nData += n;
+}
+
+/* Start a new document entry: append iDocid and reset the position
+ * delta base, since positions are delta-encoded per document.
+ * iLastOffset is not reset here. */
+static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
+ appendVarint(d, iDocid);
+ d->iLastPos = 0;
+}
+
+/* Add a position to the last position list in a doclist.  The value
+ * stored is the delta from the previous position plus 1, which keeps 0
+ * free to act as the end-of-list marker (see docListAddEndPos()). */
+static void docListAddPos(DocList *d, int iPos){
+ assert( d->iType>=DL_POSITIONS );
+ appendVarint(d, iPos-d->iLastPos+1);
+ d->iLastPos = iPos;
+}
+
+/* Add a position together with its byte offsets to a
+ * DL_POSITIONS_OFFSETS doclist.  Three varints are appended: the
+ * position (via docListAddPos()), the start offset as a delta from the
+ * previous start offset, and the end offset as a delta from this start
+ * offset. */
+static void docListAddPosOffset(DocList *d, int iPos,
+ int iStartOffset, int iEndOffset){
+ assert( d->iType==DL_POSITIONS_OFFSETS );
+ docListAddPos(d, iPos);
+ appendVarint(d, iStartOffset-d->iLastOffset);
+ d->iLastOffset = iStartOffset;
+ appendVarint(d, iEndOffset-iStartOffset);
+}
+
+/* Terminate the last position list in the given doclist by appending
+ * the 0 marker that readPosition() reports as -1. */
+static void docListAddEndPos(DocList *d){
+ appendVarint(d, 0);
+}
+
+typedef struct DocListReader {
+ DocList *pDoclist;
+ char *p;
+ int iLastPos; /* the last position read */
+} DocListReader;
+
+/* Point reader r at the start of pDoclist.  pDoclist may be NULL, in
+ * which case the read pointer is set to NULL rather than being left
+ * uninitialized; such a reader must still not be passed to the read
+ * functions, which dereference r->pDoclist. */
+static void readerInit(DocListReader *r, DocList *pDoclist){
+  r->pDoclist = pDoclist;
+  r->p = (pDoclist!=NULL) ? pDoclist->pData : NULL;
+  r->iLastPos = 0;
+}
+
+/* Return nonzero once the read pointer has reached the end of the
+ * doclist's data.  Dereferences pReader->pDoclist, so the reader must
+ * have been initialized with a non-NULL doclist. */
+static int readerAtEnd(DocListReader *pReader){
+ return pReader->p >= docListEnd(pReader->pDoclist);
+}
+
+/* Peek at the next docid without advancing the read pointer.
+ * The reader must not be at the end and must be positioned on a docid;
+ * this function cannot tell a docid from any other varint. */
+static sqlite_int64 peekDocid(DocListReader *pReader){
+ sqlite_int64 ret;
+ assert( !readerAtEnd(pReader) );
+ getVarint(pReader->p, &ret);
+ return ret;
+}
+
+/* Read the next docid and advance past it.  The position delta base is
+ * reset because positions are delta-encoded within each document. */
+static sqlite_int64 readDocid(DocListReader *pReader){
+ sqlite_int64 ret;
+ assert( !readerAtEnd(pReader) );
+ pReader->p += getVarint(pReader->p, &ret);
+ pReader->iLastPos = 0;
+ return ret;
+}
+
+/* Read the next position from a position list.
+ * Returns the position, or -1 at the end of the list.
+ *
+ * Positions are stored as (delta from previous position)+1, with a
+ * stored 0 marking the end of the list.  For DL_POSITIONS_OFFSETS lists
+ * the two offset varints that follow each position are consumed and
+ * discarded.  After the end marker iLastPos is left at -1 until the next
+ * readDocid() resets it. */
+static int readPosition(DocListReader *pReader){
+ int i;
+ int iType = pReader->pDoclist->iType;
+ assert( iType>=DL_POSITIONS );
+ assert( !readerAtEnd(pReader) );
+
+ pReader->p += getVarint32(pReader->p, &i);
+ if( i==0 ){
+ /* End-of-list marker. */
+ pReader->iLastPos = -1;
+ return -1;
+ }
+ /* Undo the +1 bias applied by docListAddPos(). */
+ pReader->iLastPos += ((int) i)-1;
+ if( iType>=DL_POSITIONS_OFFSETS ){
+ /* Skip over offsets, ignoring them for now. */
+ int iStart, iEnd;
+ pReader->p += getVarint32(pReader->p, &iStart);
+ pReader->p += getVarint32(pReader->p, &iEnd);
+ }
+ return pReader->iLastPos;
+}
+
+/* Consume positions until the end-of-list marker has been read, leaving
+ * the reader just past the current document's position list. */
+static void skipPositionList(DocListReader *pReader){
+  int iPos;
+  do{
+    iPos = readPosition(pReader);
+  }while( iPos!=-1 );
+}
+
+/* Skip over a docid, including its position list if the doclist has
+ * positions.  On return the reader is at the next docid or at the end
+ * of the doclist. */
+static void skipDocument(DocListReader *pReader){
+ readDocid(pReader);
+ if( pReader->pDoclist->iType >= DL_POSITIONS ){
+ skipPositionList(pReader);
+ }
+}
+
+/* Return the first docid stored in d.  d must be non-empty: readDocid()
+ * asserts that the reader is not already at the end. */
+static sqlite_int64 firstDocid(DocList *d){
+ DocListReader r;
+ readerInit(&r, d);
+ return readDocid(&r);
+}
+
+/* Doclist multi-tool.  Pass pUpdate==NULL to delete the indicated docid;
+ * otherwise pUpdate, which must contain only the single docid [iDocid], is
+ * inserted (if not present) or updated (if already present).
+ *
+ * Returns 1 if d was modified, 0 otherwise.  When pUpdate is given, an
+ * end-of-positions marker is appended to pUpdate itself before it is
+ * copied, so the caller's pUpdate is changed by this call.
+ * NOTE(review): the realloc() result is used without a NULL check. */
+static int docListUpdate(DocList *d, sqlite_int64 iDocid, DocList *pUpdate){
+  int modified = 0;
+  DocListReader reader;
+  char *p;
+
+  if( pUpdate!=NULL ){
+    assert( d->iType==pUpdate->iType);
+    assert( iDocid==firstDocid(pUpdate) );
+  }
+
+  /* Advance to the first document whose docid is >= iDocid. */
+  readerInit(&reader, d);
+  while( !readerAtEnd(&reader) && peekDocid(&reader)<iDocid ){
+    skipDocument(&reader);
+  }
+
+  p = reader.p;
+  /* Delete if there is a matching element. */
+  if( !readerAtEnd(&reader) && iDocid==peekDocid(&reader) ){
+    skipDocument(&reader);
+    memmove(p, reader.p, docListEnd(d) - reader.p);
+    d->nData -= (reader.p - p);
+    modified = 1;
+  }
+
+  /* Insert if indicated. */
+  if( pUpdate!=NULL ){
+    /* Remember the insertion point as an index: realloc() may move the
+    ** buffer and invalidate p. */
+    int iDoclist = p-d->pData;
+    docListAddEndPos(pUpdate);
+
+    d->pData = realloc(d->pData, d->nData+pUpdate->nData);
+    p = d->pData + iDoclist;
+
+    memmove(p+pUpdate->nData, p, docListEnd(d) - p);
+    memcpy(p, pUpdate->pData, pUpdate->nData);
+    d->nData += pUpdate->nData;
+    modified = 1;
+  }
+
+  return modified;
+}
+
+/* Split the second half of doclist d into a separate doclist d2.  Returns 1
+ * if successful, or 0 if d contains a single document and hence can't be
+ * split.
+ *
+ * The split falls on the first document boundary at or after the
+ * midpoint of d's data.  On success d2 is initialized with a copy of the
+ * tail and d is truncated; on failure d2 is not touched. */
+static int docListSplit(DocList *d, DocList *d2){
+  const char *pSplitPoint = d->pData + d->nData / 2;
+  DocListReader reader;
+
+  readerInit(&reader, d);
+  while( reader.p<pSplitPoint ){
+    skipDocument(&reader);
+  }
+  /* Nothing remains after the split point, so there is no tail. */
+  if( readerAtEnd(&reader) ) return 0;
+  docListInit(d2, d->iType, reader.p, docListEnd(d) - reader.p);
+  d->nData = reader.p - d->pData;
+  d->pData = realloc(d->pData, d->nData);
+  return 1;
+}
+
+/* A DocListMerge computes the AND of an in-memory DocList [in] and a chunked
+ * on-disk doclist, resulting in another in-memory DocList [out]. [in]
+ * and [out] may or may not store position information according to the
+ * caller's wishes. The on-disk doclist always comes with positions.
+ *
+ * The caller must read each chunk of the on-disk doclist in succession and
+ * pass it to mergeBlock().
+ *
+ * If [in] has positions, then the merge output contains only documents with
+ * matching positions in the two input doclists. If [in] does not have
+ * positions, then the merge output contains all documents common to the two
+ * input doclists.
+ *
+ * If [in] is NULL, then the on-disk doclist is copied to [out] directly.
+ *
+ * A merge is performed using an integer [iOffset] provided by the caller.
+ * [iOffset] is subtracted from each position in the on-disk doclist for the
+ * purpose of position comparison; this is helpful in implementing phrase
+ * searches.
+ *
+ * A DocListMerge is not yet able to propagate offsets through query
+ * processing; we should add that capability soon.
+*/
+typedef struct DocListMerge {
+ DocListReader in;
+ DocList *pOut;
+ int iOffset;
+} DocListMerge;
+
+/* Prepare merge state m: read from pIn (may be NULL, meaning "copy the
+ * on-disk doclist straight to the output"), write to pOut, and subtract
+ * iOffset from on-disk positions when comparing.  Neither pIn nor pOut
+ * may carry offsets. */
+static void mergeInit(DocListMerge *m,
+ DocList *pIn, int iOffset, DocList *pOut){
+ readerInit(&m->in, pIn);
+ m->pOut = pOut;
+ m->iOffset = iOffset;
+
+ /* can't handle offsets yet */
+ assert( pIn==NULL || pIn->iType <= DL_POSITIONS );
+ assert( pOut->iType <= DL_POSITIONS );
+}
+
+/* A helper function for mergeBlock(), below.  Merge the position lists
+ * pointed to by m->in and pBlockReader.
+ * If the merge matches, write [iDocid] to m->pOut; if m->pOut
+ * has positions then write all matching positions as well.
+ *
+ * A block position matches an input position when
+ * (block position - m->iOffset) equals the input position.  Both lists
+ * are consumed through their end markers before returning. */
+static void mergePosList(DocListMerge *m, sqlite_int64 iDocid,
+                         DocListReader *pBlockReader){
+  int block_pos = readPosition(pBlockReader);
+  int in_pos = readPosition(&m->in);
+  int match = 0;
+  while( block_pos!=-1 || in_pos!=-1 ){
+    if( block_pos-m->iOffset==in_pos ){
+      if( !match ){
+        docListAddDocid(m->pOut, iDocid);
+        match = 1;
+      }
+      if( m->pOut->iType >= DL_POSITIONS ){
+        docListAddPos(m->pOut, in_pos);
+      }
+      block_pos = readPosition(pBlockReader);
+      in_pos = readPosition(&m->in);
+    } else if( in_pos==-1 || (block_pos!=-1 && block_pos-m->iOffset<in_pos) ){
+      /* The block side is behind, or the input side is exhausted. */
+      block_pos = readPosition(pBlockReader);
+    } else {
+      in_pos = readPosition(&m->in);
+    }
+  }
+  if( m->pOut->iType >= DL_POSITIONS && match ){
+    docListAddEndPos(m->pOut);
+  }
+}
+
+/* Merge one block of an on-disk doclist into a DocListMerge.
+ *
+ * For each document in pBlock:
+ *   - with no input doclist, the document is copied to the output;
+ *   - otherwise the input reader is advanced to the first docid that is
+ *     >= the block's docid, and the document is dropped unless the two
+ *     docids are equal;
+ *   - when the input carries positions, the two position lists are
+ *     intersected by mergePosList().
+ * Returns early as soon as the input doclist is exhausted. */
+static void mergeBlock(DocListMerge *m, DocList *pBlock){
+ DocListReader blockReader;
+ assert( pBlock->iType >= DL_POSITIONS );
+ readerInit(&blockReader, pBlock);
+ while( !readerAtEnd(&blockReader) ){
+ sqlite_int64 iDocid = readDocid(&blockReader);
+ if( m->in.pDoclist!=NULL ){
+ while( 1 ){
+ if( readerAtEnd(&m->in) ) return; /* nothing more to merge */
+ if( peekDocid(&m->in)>=iDocid ) break;
+ skipDocument(&m->in);
+ }
+ if( peekDocid(&m->in)>iDocid ){ /* [pIn] has no match with iDocid */
+ skipPositionList(&blockReader); /* skip this docid in the block */
+ continue;
+ }
+ /* Docids are equal: consume the docid from the input side too. */
+ readDocid(&m->in);
+ }
+ /* We have a document match. */
+ if( m->in.pDoclist==NULL || m->in.pDoclist->iType < DL_POSITIONS ){
+ /* We don't need to do a poslist merge. */
+ docListAddDocid(m->pOut, iDocid);
+ if( m->pOut->iType >= DL_POSITIONS ){
+ /* Copy all positions to the output doclist. */
+ while( 1 ){
+ int pos = readPosition(&blockReader);
+ if( pos==-1 ) break;
+ docListAddPos(m->pOut, pos);
+ }
+ docListAddEndPos(m->pOut);
+ } else skipPositionList(&blockReader);
+ continue;
+ }
+ mergePosList(m, iDocid, &blockReader);
+ }
+}
+
+/* Return a newly allocated, nul-terminated copy of the first n bytes of
+ * s, or NULL if the allocation fails.  The caller must free() the
+ * result. */
+static char *string_dup_n(const char *s, int n){
+  char *str = malloc(n + 1);
+  if( str==NULL ) return NULL;
+  memcpy(str, s, n);
+  str[n] = '\0';
+  return str;
+}
+
+/* Duplicate a string; the caller must free() the returned string.
+ * (We don't use strdup() since it's not part of the standard C library and
+ * may not be available everywhere.)
+ * s must not be NULL: it is passed straight to strlen(). */
+static char *string_dup(const char *s){
+ return string_dup_n(s, strlen(s));
+}
+
+/* Build a copy of zFormat in which every '%' character is replaced by
+ * zName.  There is no escape for a literal '%'.  This is handy when the
+ * same name appears several times in one template.
+ * The caller must free() the returned string.
+ * NOTE(review): the malloc() result is not checked. */
+static char *string_format(const char *zFormat, const char *zName){
+  size_t nName = strlen(zName);
+  size_t nTotal = 1;              /* starts at 1 for the nul terminator */
+  size_t i;
+  char *zOut;
+  char *zWrite;
+
+  /* Pass 1: size the result. */
+  for(i=0; zFormat[i]!='\0'; i++){
+    if( zFormat[i]=='%' ){
+      nTotal += nName;
+    }else{
+      nTotal += 1;
+    }
+  }
+
+  /* Pass 2: fill it in. */
+  zOut = malloc(nTotal);
+  zWrite = zOut;
+  for(i=0; zFormat[i]!='\0'; i++){
+    if( zFormat[i]!='%' ){
+      *zWrite++ = zFormat[i];
+      continue;
+    }
+    memcpy(zWrite, zName, nName);
+    zWrite += nName;
+  }
+  *zWrite++ = '\0';
+  assert( zWrite == zOut + nTotal );
+  return zOut;
+}
+
+/* Expand zFormat with string_format() (each '%' becomes zName), run the
+ * resulting SQL with sqlite3_exec() and return its result code.  No row
+ * callback is installed and no error message is requested. */
+static int sql_exec(sqlite3 *db, const char *zName, const char *zFormat){
+ char *zCommand = string_format(zFormat, zName);
+ int rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
+ free(zCommand);
+ return rc;
+}
+
+/* Expand zFormat with string_format() (each '%' becomes zName), compile
+ * the resulting SQL into *ppStmt with sqlite3_prepare() and return its
+ * result code. */
+static int sql_prepare(sqlite3 *db, const char *zName, sqlite3_stmt **ppStmt,
+ const char *zFormat){
+ char *zCommand = string_format(zFormat, zName);
+ int rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
+ free(zCommand);
+ return rc;
+}
+
+/* end utility functions */
+
+#define QUERY_GENERIC 0
+#define QUERY_FULLTEXT 1
+
+#define CHUNK_MAX 1024
+
+typedef enum fulltext_statement {
+ CONTENT_INSERT_STMT,
+ CONTENT_SELECT_STMT,
+ CONTENT_DELETE_STMT,
+
+ TERM_SELECT_STMT,
+ TERM_CHUNK_SELECT_STMT,
+ TERM_INSERT_STMT,
+ TERM_UPDATE_STMT,
+ TERM_DELETE_STMT,
+
+ MAX_STMT /* Always at end! */
+} fulltext_statement;
+
+/* These must exactly match the enum above. */
+/* TODO(adam): Is there some risk that a statement (in particular,
+** pTermSelectStmt) will be used in two cursors at once, e.g. if a
+** query joins a virtual table to itself? If so perhaps we should
+** move some of these to the cursor object.
+*/
+static const char *fulltext_zStatement[MAX_STMT] = {
+ /* CONTENT_INSERT */ "insert into %_content (rowid, content) values (?, ?)",
+ /* CONTENT_SELECT */ "select content from %_content where rowid = ?",
+ /* CONTENT_DELETE */ "delete from %_content where rowid = ?",
+
+ /* TERM_SELECT */
+ "select rowid, doclist from %_term where term = ? and first = ?",
+ /* TERM_CHUNK_SELECT */
+ "select max(first) from %_term where term = ? and first <= ?",
+ /* TERM_INSERT */
+ "insert into %_term (term, first, doclist) values (?, ?, ?)",
+ /* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
+ /* TERM_DELETE */ "delete from %_term where rowid = ?",
+};
+
+typedef struct fulltext_vtab {
+ sqlite3_vtab base;
+ sqlite3 *db;
+ const char *zName; /* virtual table name */
+ sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
+
+ /* Precompiled statements which we keep as long as the table is
+ ** open.
+ */
+ sqlite3_stmt *pFulltextStatements[MAX_STMT];
+} fulltext_vtab;
+
+typedef struct fulltext_cursor {
+ sqlite3_vtab_cursor base;
+ int iCursorType; /* QUERY_GENERIC or QUERY_FULLTEXT */
+
+ sqlite3_stmt *pStmt;
+
+ int eof;
+
+ /* The following is used only when iCursorType == QUERY_FULLTEXT. */
+ DocListReader result;
+} fulltext_cursor;
+
+/* Return the fulltext_vtab that cursor c belongs to, by casting the
+ * generic sqlite3_vtab pointer stored in the cursor's base. */
+static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
+ return (fulltext_vtab *) c->base.pVtab;
+}
+
+static sqlite3_module fulltextModule; /* forward declaration */
+
+/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
+** If the indicated statement has never been prepared, it is prepared
+** and cached, otherwise the cached version is reset.
+**
+** Returns SQLITE_OK on success; otherwise the error from preparing or
+** resetting the statement, in which case *ppStmt is not written.  The
+** statement stays owned by the vtab and must not be finalized by the
+** caller.
+*/
+static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
+                             sqlite3_stmt **ppStmt){
+  assert( iStmt<MAX_STMT );
+  if( v->pFulltextStatements[iStmt]==NULL ){
+    int rc = sql_prepare(v->db, v->zName, &v->pFulltextStatements[iStmt],
+                         fulltext_zStatement[iStmt]);
+    if( rc!=SQLITE_OK ) return rc;
+  } else {
+    int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
+    if( rc!=SQLITE_OK ) return rc;
+  }
+
+  *ppStmt = v->pFulltextStatements[iStmt];
+  return SQLITE_OK;
+}
+
+/* Step the indicated statement, handling errors SQLITE_BUSY (by
+** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
+** bindings to the new statement).
+**
+** *ppStmt must be the cached statement for iStmt.  If the statement has
+** to be re-prepared, *ppStmt is updated to the replacement.  Returns
+** SQLITE_ROW or SQLITE_DONE on success, otherwise an error code.
+**
+** TODO(adam): We should extend this function so that it can work with
+** statements declared locally, not only globally cached statements.
+*/
+static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
+                              sqlite3_stmt **ppStmt){
+  int rc;
+  sqlite3_stmt *s = *ppStmt;
+  assert( iStmt<MAX_STMT );
+  assert( s==v->pFulltextStatements[iStmt] );
+
+  while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
+    sqlite3_stmt *pNewStmt;
+
+    /* NOTE(review): BUSY is retried immediately, with no back-off or
+    ** retry limit. */
+    if( rc==SQLITE_BUSY ) continue;
+    if( rc!=SQLITE_ERROR ) return rc;
+
+    /* sqlite3_step() reported a generic error; the specific code is
+    ** obtained from sqlite3_reset(). */
+    rc = sqlite3_reset(s);
+    if( rc!=SQLITE_SCHEMA ) return SQLITE_ERROR;
+
+    /* Clear the cache slot so sql_get_statement() compiles a fresh
+    ** statement instead of handing back the stale one. */
+    v->pFulltextStatements[iStmt] = NULL;   /* Still in s */
+    rc = sql_get_statement(v, iStmt, &pNewStmt);
+    if( rc!=SQLITE_OK ) goto err;
+    *ppStmt = pNewStmt;
+
+    rc = sqlite3_transfer_bindings(s, pNewStmt);
+    if( rc!=SQLITE_OK ) goto err;
+
+    rc = sqlite3_finalize(s);
+    if( rc!=SQLITE_OK ) return rc;
+    s = pNewStmt;
+  }
+  return rc;
+
+ err:
+  sqlite3_finalize(s);
+  return rc;
+}
+
+/* Step a statement that is expected to produce no rows (INSERT, UPDATE,
+** DELETE).  Behaves like sql_step_statement() except that SQLITE_DONE is
+** reported as SQLITE_OK; every other code, including SQLITE_ROW, is
+** passed through unchanged.
+*/
+static int sql_single_step_statement(fulltext_vtab *v,
+                                     fulltext_statement iStmt,
+                                     sqlite3_stmt **ppStmt){
+  int rc = sql_step_statement(v, iStmt, ppStmt);
+  if( rc==SQLITE_DONE ){
+    rc = SQLITE_OK;
+  }
+  return rc;
+}
+
+/* insert into %_content (rowid, content) values ([rowid], [zContent])
+ * zContent is bound with SQLITE_STATIC, so the buffer must stay valid
+ * until the statement has been stepped (which happens before return).
+ * Returns SQLITE_OK on success or the first error encountered. */
+static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
+ const char *zContent, int nContent){
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_value(s, 1, rowid);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_text(s, 2, zContent, nContent, SQLITE_STATIC);
+ if( rc!=SQLITE_OK ) return rc;
+
+ return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
+}
+
+/* select content from %_content where rowid = [iRow]
+ * The caller must delete the returned string.
+ *
+ * On SQLITE_OK, *pzContent is a malloc()ed copy of the row's content; a
+ * NULL column value is returned as an empty string.  On any other
+ * return *pzContent is either untouched (failure before the row was
+ * read; this includes SQLITE_DONE when no row exists) or set to NULL,
+ * so it never points at freed memory. */
+static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
+                          char **pzContent){
+  sqlite3_stmt *s;
+  const char *zText;
+  int rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = sqlite3_bind_int64(s, 1, iRow);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
+  if( rc!=SQLITE_ROW ) return rc;
+
+  /* sqlite3_column_text() yields NULL for an SQL NULL; string_dup()
+  ** would pass that to strlen(), so substitute an empty string. */
+  zText = (const char *)sqlite3_column_text(s, 0);
+  *pzContent = string_dup(zText ? zText : "");
+
+  /* We expect only one row.  We must execute another sqlite3_step()
+   * to complete the iteration; otherwise the table will remain locked. */
+  rc = sqlite3_step(s);
+  if( rc==SQLITE_DONE ) return SQLITE_OK;
+
+  free(*pzContent);
+  *pzContent = NULL;
+  return rc;
+}
+
+/* delete from %_content where rowid = [iRow]
+ * Returns SQLITE_OK on success or the first error encountered. */
+static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 1, iRow);
+ if( rc!=SQLITE_OK ) return rc;
+
+ return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
+}
+
+/* select rowid, doclist from %_term where term = [zTerm] and first = [iFirst]
+ * If found, returns SQLITE_OK; the caller must free the returned doclist.
+ * If no rows found, returns SQLITE_ERROR.
+ *
+ * *out is initialized with docListInit(), which copies the blob, so it
+ * does not point into statement-owned memory.  Note that *out has
+ * already been initialized if the final sqlite3_step() fails. */
+static int term_select(fulltext_vtab *v, const char *zTerm, int nTerm,
+ sqlite_int64 iFirst,
+ sqlite_int64 *rowid,
+ DocList *out){
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_TRANSIENT);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 2, iFirst);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
+ if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
+
+ *rowid = sqlite3_column_int64(s, 0);
+ /* Doclists stored on disk always carry positions and offsets. */
+ docListInit(out, DL_POSITIONS_OFFSETS,
+ sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
+
+ /* We expect only one row. We must execute another sqlite3_step()
+ * to complete the iteration; otherwise the table will remain locked. */
+ rc = sqlite3_step(s);
+ return rc==SQLITE_DONE ? SQLITE_OK : rc;
+}
+
+/* select max(first) from %_term where term = [zTerm] and first <= [iFirst]
+ * If found, returns SQLITE_ROW and result in *piResult; if the query returns
+ * NULL (meaning no row found) returns SQLITE_DONE.
+ *
+ * SQLITE_ERROR is returned if the query yields no row at all, if the
+ * result column is neither NULL nor an integer, or if the statement
+ * yields more than one row.  *piResult is written only in the
+ * SQLITE_INTEGER case.
+ */
+static int term_chunk_select(fulltext_vtab *v, const char *zTerm, int nTerm,
+ sqlite_int64 iFirst, sqlite_int64 *piResult){
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, TERM_CHUNK_SELECT_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 2, iFirst);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sql_step_statement(v, TERM_CHUNK_SELECT_STMT, &s);
+ if( rc!=SQLITE_ROW ) return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
+
+ /* rc is SQLITE_ROW here and stays so unless the column is NULL. */
+ switch( sqlite3_column_type(s, 0) ){
+ case SQLITE_NULL:
+ rc = SQLITE_DONE;
+ break;
+ case SQLITE_INTEGER:
+ *piResult = sqlite3_column_int64(s, 0);
+ break;
+ default:
+ return SQLITE_ERROR;
+ }
+ /* We expect only one row. We must execute another sqlite3_step()
+ * to complete the iteration; otherwise the table will remain locked. */
+ if( sqlite3_step(s) != SQLITE_DONE ) return SQLITE_ERROR;
+ return rc;
+}
+
+/* insert into %_term (term, first, doclist)
+ values ([zTerm], [iFirst], [doclist])
+ The term and the doclist data are bound with SQLITE_STATIC, so both
+ must stay valid until the statement has been stepped (which happens
+ before return).  Returns SQLITE_OK or the first error encountered. */
+static int term_insert(fulltext_vtab *v, const char *zTerm, int nTerm,
+ sqlite_int64 iFirst, DocList *doclist){
+ sqlite3_stmt *s;
+ int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_text(s, 1, zTerm, nTerm, SQLITE_STATIC);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_int64(s, 2, iFirst);
+ if( rc!=SQLITE_OK ) return rc;
+
+ rc = sqlite3_bind_blob(s, 3, doclist->pData, doclist->nData, SQLITE_STATIC);
+ if( rc!=SQLITE_OK ) return rc;
+
+ return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
+}
+
+/* update %_term set doclist = [doclist] where rowid = [rowid]
+**
+** Binds the new doclist blob and the target rowid, then runs the cached
+** TERM_UPDATE_STMT statement once.  The first failing step's result code
+** is returned unchanged.
+*/
+static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
+                       DocList *doclist){
+  sqlite3_stmt *pStmt;
+  int rc = sql_get_statement(v, TERM_UPDATE_STMT, &pStmt);
+
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_blob(pStmt, 1, doclist->pData, doclist->nData,
+                           SQLITE_STATIC);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int64(pStmt, 2, rowid);
+  }
+  if( rc!=SQLITE_OK ){
+    return rc;
+  }
+  return sql_single_step_statement(v, TERM_UPDATE_STMT, &pStmt);
+}
+
+/* Run the cached TERM_DELETE_STMT statement with [rowid] bound to its
+** single parameter.  The first failing step's result code is returned
+** unchanged.
+*/
+static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
+  sqlite3_stmt *pStmt;
+  int rc = sql_get_statement(v, TERM_DELETE_STMT, &pStmt);
+
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_int64(pStmt, 1, rowid);
+  }
+  if( rc!=SQLITE_OK ){
+    return rc;
+  }
+  return sql_single_step_statement(v, TERM_DELETE_STMT, &pStmt);
+}
+
+/* Release everything owned by the virtual table [v]: finalize each cached
+** statement, destroy the tokenizer if one was created, then free the table
+** name and the structure itself.  [v] must not be used after this call.
+*/
+static void fulltext_vtab_destroy(fulltext_vtab *v){
+  int iStmt;
+  /* Take the slot count from the array itself so the loop bound cannot
+  ** drift from the declaration. */
+  const int nStmt = (int)(sizeof(v->pFulltextStatements)
+                          / sizeof(v->pFulltextStatements[0]));
+
+  for( iStmt=0; iStmt<nStmt; iStmt++ ){
+    if( v->pFulltextStatements[iStmt]!=NULL ){
+      sqlite3_finalize(v->pFulltextStatements[iStmt]);
+      v->pFulltextStatements[iStmt] = NULL;
+    }
+  }
+
+  if( v->pTokenizer!=NULL ){
+    v->pTokenizer->pModule->xDestroy(v->pTokenizer);
+    v->pTokenizer = NULL;
+  }
+
+  free((void *) v->zName);
+  free(v);
+}
+
+/* Current interface:
+** argv[0] - module name
+** argv[1] - database name
+** argv[2] - table name
+** argv[3] - tokenizer name (optional, a sensible default is provided)
+** argv[4..] - passed to tokenizer (optional based on tokenizer)
+**
+** Allocates the fulltext_vtab, creates its tokenizer and declares the
+** virtual table schema.  On success *ppVTab receives the new table and
+** SQLITE_OK is returned.  On failure everything allocated here is released
+** and an error code is returned; for an unrecognized tokenizer name,
+** *pzErr is set to a message obtained from sqlite3_mprintf().
+**/
+static int fulltextConnect(
+  sqlite3 *db,
+  void *pAux,
+  int argc,
+  const char * const *argv,
+  sqlite3_vtab **ppVTab,
+  char **pzErr
+){
+  int rc;
+  fulltext_vtab *v;
+  sqlite3_tokenizer_module *m = NULL;
+
+  assert( argc>=3 );
+  v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
+  if( v==NULL ) return SQLITE_NOMEM;
+  /* sqlite will initialize v->base */
+  v->db = db;
+  v->pTokenizer = NULL;
+  /* Clear the statement cache up front so fulltext_vtab_destroy() is safe
+  ** to call on every error path below. */
+  memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
+  v->zName = string_dup(argv[2]);
+  if( v->zName==NULL ){
+    free(v);
+    return SQLITE_NOMEM;
+  }
+
+  /* TODO(shess) For now, add new tokenizers as else if clauses. */
+  if( argc==3 || !strcmp(argv[3], "simple") ){
+    get_simple_tokenizer_module(&m);
+  } else {
+    /* Report the problem instead of dereferencing a NULL module when
+    ** assertions are compiled out. */
+    if( pzErr!=NULL ){
+      *pzErr = sqlite3_mprintf("unrecognized tokenizer: %s", argv[3]);
+    }
+    fulltext_vtab_destroy(v);
+    return SQLITE_ERROR;
+  }
+
+  /* TODO(shess) Since tokenization impacts the index, the parameters
+  ** to the tokenizer need to be identical when a persistent virtual
+  ** table is re-created.  One solution would be a meta-table to track
+  ** such information in the database.  Then we could verify that the
+  ** information is identical on subsequent creates.
+  */
+  /* TODO(shess) Why isn't argv already (const char **)? */
+  rc = m->xCreate(argc-3, (const char **) (argv+3), &v->pTokenizer);
+  if( rc!=SQLITE_OK ){
+    /* No usable tokenizer exists; make sure destroy does not touch one. */
+    v->pTokenizer = NULL;
+    goto connect_failed;
+  }
+  v->pTokenizer->pModule = m;
+
+  /* TODO: verify the existence of backing tables foo_content, foo_term */
+
+  rc = sqlite3_declare_vtab(db, "create table x(content text)");
+  if( rc!=SQLITE_OK ) goto connect_failed;
+
+  *ppVTab = &v->base;
+  return SQLITE_OK;
+
+connect_failed:
+  fulltext_vtab_destroy(v);
+  return rc;
+}
+
+/* Create the backing tables for a new full-text table named argv[2],
+** then finish through fulltextConnect().  Returns the sql_exec() error
+** if the tables cannot be created, otherwise the result of
+** fulltextConnect().
+*/
+static int fulltextCreate(
+  sqlite3 *db,
+  void *pAux,
+  int argc,
+  const char * const *argv,
+  sqlite3_vtab **ppVTab,
+  char **pzErr
+){
+  int rc;
+  assert( argc>=3 );
+
+  /* Storage layout:
+  **
+  ** %_content stores the text of each full-text item; its rowid serves
+  ** as the docid.
+  **
+  ** %_term maps a term to a document list blob.  The blob holds elements
+  ** sorted by ascending docid, each encoded as:
+  **
+  **   docid                       varint-encoded
+  **   token count                 varint-encoded
+  **   "count" token elements (poslist):
+  **     position                  varint-encoded as delta from previous position
+  **     start offset              varint-encoded as delta from previous start offset
+  **     end offset                varint-encoded as delta from start offset
+  **
+  ** A doclist may be chunked across several rows; "first" orders the
+  ** chunks and is simply the first docid in the blob.
+  **
+  ** NOTE(shess) That last statement is incorrect in the face of
+  ** deletion, which can leave a doclist that doesn't contain the
+  ** first from that row.  I _believe_ this does not matter to the
+  ** operation of the system, but it might be reasonable to update
+  ** appropriately in case this assumption becomes more important.
+  */
+  rc = sql_exec(db, argv[2],
+    "create table %_content(content text);"
+    "create table %_term(term text, first integer, doclist blob);"
+    "create index %_index on %_term(term, first)");
+  if( rc==SQLITE_OK ){
+    rc = fulltextConnect(db, pAux, argc, argv, ppVTab, pzErr);
+  }
+  return rc;
+}
+
+/* Decide how to handle an SQL query.
+ * At the moment, MATCH queries can include implicit boolean ANDs; we
+ * haven't implemented phrase searches or OR yet.
+ *
+ * The first usable MATCH constraint on column 0 selects QUERY_FULLTEXT and
+ * is passed to the filter step as its first argument; if there is none,
+ * QUERY_GENERIC is selected.  Always returns SQLITE_OK. */
+static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
+  int i;
+
+  for(i=0; i<pInfo->nConstraint; ++i){
+    const struct sqlite3_index_constraint *pConstraint;
+    pConstraint = &pInfo->aConstraint[i];
+    if( pConstraint->iColumn==0 &&
+        pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH &&
+        pConstraint->usable ){   /* a full-text search */
+      pInfo->aConstraintUsage[i].argvIndex = 1;
+      pInfo->aConstraintUsage[i].omit = 1;
+      pInfo->idxNum = QUERY_FULLTEXT;
+      pInfo->estimatedCost = 1.0;   /* an arbitrary value for now */
+      return SQLITE_OK;
+    }
+  }
+  pInfo->idxNum = QUERY_GENERIC;
+  return SQLITE_OK;
+}
+
+/* Disconnect from the virtual table: free the in-memory state only. */
+static int fulltextDisconnect(sqlite3_vtab *pVTab){
+  fulltext_vtab *v = (fulltext_vtab *)pVTab;
+  fulltext_vtab_destroy(v);
+  return SQLITE_OK;
+}
+
+/* Drop the backing %_content and %_term tables, then free the in-memory
+** state.  If dropping fails, the error is returned and the in-memory
+** state is left untouched.
+*/
+static int fulltextDestroy(sqlite3_vtab *pVTab){
+  fulltext_vtab *v = (fulltext_vtab *)pVTab;
+  int rc;
+
+  rc = sql_exec(v->db, v->zName,
+                "drop table %_content; drop table %_term");
+  if( rc==SQLITE_OK ){
+    fulltext_vtab_destroy(v);
+  }
+  return rc;
+}
+
+/* Allocate a zero-filled cursor and hand it back through *ppCursor.
+** Returns SQLITE_NOMEM if the allocation fails, SQLITE_OK otherwise.
+*/
+static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
+  fulltext_cursor *c;
+
+  c = (fulltext_cursor *) calloc(1, sizeof(fulltext_cursor));
+  if( c==NULL ) return SQLITE_NOMEM;
+  /* sqlite will initialize c->base */
+  *ppCursor = &c->base;
+
+  return SQLITE_OK;
+}
+
+/* Close a cursor: finalize its statement, delete any result doclist it
+** holds, and free the cursor itself.  Always returns SQLITE_OK.
+*/
+static int fulltextClose(sqlite3_vtab_cursor *pCursor){
+  fulltext_cursor *pCsr = (fulltext_cursor *) pCursor;
+
+  sqlite3_finalize(pCsr->pStmt);
+  if( pCsr->result.pDoclist ){
+    docListDelete(pCsr->result.pDoclist);
+  }
+  free(pCsr);
+
+  return SQLITE_OK;
+}
+
+/* Advance the cursor to its next row and update c->eof.
+**
+** QUERY_GENERIC:  step the content statement; a row clears eof, anything
+**                 else sets it, and only a real error is reported.
+** QUERY_FULLTEXT: take the next docid from the result reader and fetch
+**                 that content row; a missing row is SQLITE_ERROR.
+*/
+static int fulltextNext(sqlite3_vtab_cursor *pCursor){
+  fulltext_cursor *c = (fulltext_cursor *) pCursor;
+  int rc;
+
+  if( c->iCursorType==QUERY_GENERIC ){
+    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
+    rc = sqlite3_step(c->pStmt);
+    c->eof = (rc!=SQLITE_ROW);
+    return (rc==SQLITE_ROW || rc==SQLITE_DONE) ? SQLITE_OK : rc;
+  }
+
+  if( c->iCursorType==QUERY_FULLTEXT ){
+    sqlite_int64 iDocid;
+
+    rc = sqlite3_reset(c->pStmt);
+    if( rc!=SQLITE_OK ) return rc;
+
+    if( readerAtEnd(&c->result) ){
+      c->eof = 1;
+      return SQLITE_OK;
+    }
+    iDocid = readDocid(&c->result);
+    rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
+    if( rc!=SQLITE_OK ) return rc;
+    /* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
+    rc = sqlite3_step(c->pStmt);
+    if( rc!=SQLITE_ROW ){
+      /* an error occurred; abort */
+      return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
+    }
+    c->eof = 0;   /* the case we expect */
+    return SQLITE_OK;
+  }
+
+  assert( 0 );
+  return SQLITE_ERROR;  /* not reached */
+}
+
+/* Start reading the doclist rows for [pTerm] in "first" order.
+** *ppStmt is prepared on first use and reset on later calls.  Returns the
+** result of the first sqlite3_step(), or an earlier error code if
+** preparing, resetting or binding fails.
+*/
+static int term_select_doclist(fulltext_vtab *v, const char *pTerm, int nTerm,
+                               sqlite3_stmt **ppStmt){
+  int rc;
+
+  if( *ppStmt==NULL ){
+    rc = sql_prepare(v->db, v->zName, ppStmt,
+      "select doclist from %_term where term = ? order by first");
+  }else{
+    rc = sqlite3_reset(*ppStmt);
+  }
+  if( rc==SQLITE_OK ){
+    rc = sqlite3_bind_text(*ppStmt, 1, pTerm, nTerm, SQLITE_TRANSIENT);
+  }
+  if( rc!=SQLITE_OK ){
+    return rc;
+  }
+
+  return sqlite3_step(*ppStmt);   /* TODO(adamd): handle schema error */
+}
+
+/* AND the posting list of [zTerm] into the doclist [pIn], producing [out].
+ * [iOffset] is the offset handed to the merge for phrase matching.
+ *
+ * (*pSelect) caches the SQLite statement used here across calls; the
+ * caller must set *pSelect to NULL before the first call.
+ *
+ * Returns SQLITE_OK once every doclist row has been merged, otherwise the
+ * error code of the failing statement call.
+ */
+static int query_merge(fulltext_vtab *v, sqlite3_stmt **pSelect,
+                       const char *zTerm,
+                       DocList *pIn, int iOffset, DocList *out){
+  DocListMerge merge;
+  int rc;
+
+  /* An already-empty [pIn] cannot match anything further, so skip
+   * reading the posting list altogether. */
+  if( pIn!=NULL && !pIn->nData ){
+    return SQLITE_OK;
+  }
+
+  rc = term_select_doclist(v, zTerm, -1, pSelect);
+  if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ){
+    return rc;
+  }
+
+  mergeInit(&merge, pIn, iOffset, out);
+  for( ; rc==SQLITE_ROW; rc = sqlite3_step(*pSelect) ){
+    DocList block;
+    docListInit(&block, DL_POSITIONS_OFFSETS,
+                sqlite3_column_blob(*pSelect, 0),
+                sqlite3_column_bytes(*pSelect, 0));
+    mergeBlock(&merge, &block);
+    docListDestroy(&block);
+  }
+
+  /* The loop ends on SQLITE_DONE (success) or on a step error. */
+  return rc==SQLITE_DONE ? SQLITE_OK : rc;
+}
+
+/* One term of a parsed query.  zTerm is a heap-allocated copy of the
+ * token; query_free() releases it. */
+typedef struct QueryTerm {
+ int is_phrase; /* true if this term begins a new phrase */
+ const char *zTerm; /* the term text, freed by query_free() */
+} QueryTerm;
+
+/* A parsed query.
+ *
+ * As an example, parsing the query ["four score" years "new nation"] will
+ * yield a Query with 5 terms:
+ * "four", is_phrase = 1
+ * "score", is_phrase = 0
+ * "years", is_phrase = 1
+ * "new", is_phrase = 1
+ * "nation", is_phrase = 0
+ *
+ * pTerm is an array of nTerms entries that query_add() grows with
+ * realloc() and query_free() releases.
+ */
+typedef struct Query {
+ int nTerms; /* number of entries in pTerm */
+ QueryTerm *pTerm; /* the terms, in query order */
+} Query;
+
+/* Append a term to [q].  Ownership of [zTerm] passes to the query, which
+** frees it in query_free().
+**
+** If the term array cannot be grown, [q] is left unchanged (q->nTerms does
+** not increase) and [zTerm] is freed here, so nothing leaks and the
+** existing terms stay valid.
+*/
+static void query_add(Query *q, int is_phrase, const char *zTerm){
+  QueryTerm *t;
+  /* Keep the old array reachable until realloc() is known to succeed. */
+  QueryTerm *aNew = realloc(q->pTerm, (q->nTerms + 1) * sizeof(q->pTerm[0]));
+  if( aNew==NULL ){
+    free((void *) zTerm);
+    return;
+  }
+  q->pTerm = aNew;
+  t = &q->pTerm[q->nTerms];
+  ++q->nTerms;
+  t->is_phrase = is_phrase;
+  t->zTerm = zTerm;
+}
+
+/* Free every term string held by [q], then the term array itself.  The
+** Query structure is not freed.
+*/
+static void query_free(Query *q){
+  int iTerm = 0;
+  while( iTerm<q->nTerms ){
+    free((void *) q->pTerm[iTerm].zTerm);
+    iTerm++;
+  }
+  free(q->pTerm);
+}
+
+/* Tokenize one segment of a query string and append each token to
+** [pQuery].  Outside a phrase every token starts a new phrase; inside a
+** phrase only the first token of the segment does.
+**
+** Returns the tokenizer's xClose() result when tokenization ran to
+** SQLITE_DONE.  If xNext() reports an error, or a token cannot be copied,
+** that error is returned instead of being mistaken for end-of-input.
+*/
+static int tokenize_segment(sqlite3_tokenizer *pTokenizer,
+                            const char *zQuery, int in_phrase,
+                            Query *pQuery){
+  sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
+  sqlite3_tokenizer_cursor *pCursor;
+  int is_first = 1;
+  int rcClose;
+
+  int rc = pModule->xOpen(pTokenizer, zQuery, -1, &pCursor);
+  if( rc!=SQLITE_OK ) return rc;
+  pCursor->pTokenizer = pTokenizer;
+
+  while( 1 ){
+    const char *zToken;
+    const char *zCopy;
+    int nToken, iStartOffset, iEndOffset, dummy_pos;
+
+    rc = pModule->xNext(pCursor,
+                        &zToken, &nToken,
+                        &iStartOffset, &iEndOffset,
+                        &dummy_pos);
+    if( rc!=SQLITE_OK ) break;
+    zCopy = string_dup_n(zToken, nToken);
+    if( zCopy==NULL ){
+      rc = SQLITE_NOMEM;
+      break;
+    }
+    query_add(pQuery, !in_phrase || is_first, zCopy);
+    is_first = 0;
+  }
+
+  /* Always close the cursor, but let a tokenization error take priority
+  ** over the close result. */
+  rcClose = pModule->xClose(pCursor);
+  return rc==SQLITE_DONE ? rcClose : rc;
+}
+
+/* Parse a query string, yielding a Query object.
+**
+** The string is split at double-quote characters; segments alternate
+** between "outside a phrase" and "inside a phrase", starting outside.
+** Each non-empty segment is tokenized into [pQuery].
+**
+** Returns SQLITE_OK on success.  On failure an error code is returned and
+** [pQuery] is left empty (nTerms==0, pTerm==NULL), with any terms parsed
+** so far released.
+*/
+static int parse_query(fulltext_vtab *v, const char *zQuery, Query *pQuery){
+  char *zQuery1;
+  char *s;
+  int in_phrase = 0;
+  int rc = SQLITE_OK;
+
+  pQuery->nTerms = 0;
+  pQuery->pTerm = NULL;
+
+  /* Work on a private copy because the quotes are overwritten with NULs. */
+  zQuery1 = string_dup(zQuery);
+  if( zQuery1==NULL ) return SQLITE_NOMEM;
+  s = zQuery1;
+
+  while( *s ){
+    char *t = s;
+    while( *t ){
+      if( *t=='"' ){
+        *t++ = '\0';
+        break;
+      }
+      ++t;
+    }
+    if( *s ){
+      rc = tokenize_segment(v->pTokenizer, s, in_phrase, pQuery);
+      if( rc!=SQLITE_OK ) break;
+    }
+    s = t;
+    in_phrase = !in_phrase;
+  }
+
+  free(zQuery1);
+  if( rc!=SQLITE_OK ){
+    /* Do not hand a half-built query back to the caller. */
+    query_free(pQuery);
+    pQuery->nTerms = 0;
+    pQuery->pTerm = NULL;
+  }
+  return rc;
+}
+
+/* Perform a full-text query; return a list of documents in [pResult]. */
+static int fulltext_query(fulltext_vtab *v, const char *zQuery,
+ DocList **pResult){
+ Query q;
+ int phrase_start = -1;
+ int i;
+ sqlite3_stmt *pSelect = NULL;
+ DocList *d = NULL;
+
+ int rc = parse_query(v, zQuery, &q);
+ if( rc!=SQLITE_OK ) return rc;
+
+ /* Merge terms. */
+ for(i = 0 ; i < q.nTerms ; ++i){
+ /* In each merge step, we need to generate positions whenever we're
+ * processing a phrase which hasn't ended yet. */
+ int need_positions = iiCursorType = idxNum;
+ switch( idxNum ){
+ case QUERY_GENERIC:
+ zStatement = "select rowid, content from %_content";
+ break;
+
+ case QUERY_FULLTEXT: /* full-text search */
+ {
+ const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
+ DocList *pResult;
+ assert( argc==1 );
+ rc = fulltext_query(v, zQuery, &pResult);
+ if( rc!=SQLITE_OK ) return rc;
+ readerInit(&c->result, pResult);
+ zStatement = "select rowid, content from %_content where rowid = ?";
+ break;
+ }
+
+ default:
+ assert( 0 );
+ }
+
+ rc = sql_prepare(v->db, v->zName, &c->pStmt, zStatement);
+ if( rc!=SQLITE_OK ) return rc;
+
+ return fulltextNext(pCursor);
+}
+
+/* Report the end-of-results flag maintained by fulltextNext(). */
+static int fulltextEof(sqlite3_vtab_cursor *pCursor){
+  return ((fulltext_cursor *) pCursor)->eof;
+}
+
+/* Return the value of the only column (index 0): the text in column 1 of
+** the cursor's current statement row, copied into the result.
+*/
+static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
+                          sqlite3_context *pContext, int idxCol){
+  fulltext_cursor *pCsr = (fulltext_cursor *) pCursor;
+  const char *zContent;
+
+  assert( idxCol==0 );
+  zContent = (const char *) sqlite3_column_text(pCsr->pStmt, 1);
+  sqlite3_result_text(pContext, zContent, -1, SQLITE_TRANSIENT);
+
+  return SQLITE_OK;
+}
+
+/* Store the rowid of the current row (column 0 of the cursor's
+** statement) in *pRowid.
+*/
+static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
+  fulltext_cursor *pCsr = (fulltext_cursor *) pCursor;
+  sqlite_int64 iRowid = sqlite3_column_int64(pCsr->pStmt, 0);
+
+  *pRowid = iRowid;
+  return SQLITE_OK;
+}
+
+/* Build a hash table containing all terms in zText.
+**
+** Each distinct token maps to a DocList holding [iDocid] and one
+** position/offset entry per occurrence.  On SQLITE_OK the caller owns
+** [terms] and must delete every DocList and clear the hash.
+**
+** On any failure (tokenizer error or a negative position) the partially
+** built hash is emptied here before returning, so callers may simply
+** return the error code.
+*/
+static int build_terms(Hash *terms, sqlite3_tokenizer *pTokenizer,
+                       const char *zText, sqlite_int64 iDocid){
+  sqlite3_tokenizer_cursor *pCursor;
+  const char *pToken;
+  int nTokenBytes;
+  int iStartOffset, iEndOffset, iPosition;
+
+  int rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
+  if( rc!=SQLITE_OK ) return rc;
+
+  pCursor->pTokenizer = pTokenizer;
+  HashInit(terms, HASH_STRING, 1);
+  while( (rc = pTokenizer->pModule->xNext(pCursor,
+                                          &pToken, &nTokenBytes,
+                                          &iStartOffset, &iEndOffset,
+                                          &iPosition))==SQLITE_OK ){
+    DocList *p;
+
+    /* Positions can't be negative; we use -1 as a terminator internally. */
+    if( iPosition<0 ){
+      rc = SQLITE_ERROR;
+      break;
+    }
+
+    p = HashFind(terms, pToken, nTokenBytes);
+    if( p==NULL ){
+      p = docListNew(DL_POSITIONS_OFFSETS);
+      docListAddDocid(p, iDocid);
+      HashInsert(terms, pToken, nTokenBytes, p);
+    }
+    docListAddPosOffset(p, iPosition, iStartOffset, iEndOffset);
+  }
+  /* SQLITE_DONE is the normal end of input; anything else is an error. */
+  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
+
+  /* TODO(shess) Check return?  Should this be able to cause errors at
+  ** this point?  Actually, same question about sqlite3_finalize(),
+  ** though one could argue that failure there means that the data is
+  ** not durable.  *ponder*
+  */
+  pTokenizer->pModule->xClose(pCursor);
+
+  if( rc!=SQLITE_OK ){
+    /* Callers return immediately on error, so release the doclists here. */
+    HashElem *e;
+    for(e=HashFirst(terms); e; e=HashNext(e)){
+      DocList *p = HashData(e);
+      docListDelete(p);
+    }
+    HashClear(terms);
+  }
+  return rc;
+}
+/* Update the %_terms table to map the term [zTerm] to the given rowid.
+**
+** [p] is the doclist built for this document.  term_chunk_select() looks
+** up the chunk whose "first" is the largest value <= [iDocid]:
+**   - SQLITE_DONE: no such chunk; a new row keyed by [iDocid] is inserted.
+**   - SQLITE_ROW:  the chunk is loaded, updated with [p], split when it
+**                  grows past CHUNK_MAX, and written back.
+** Any other lookup result is reported as SQLITE_ERROR.
+*/
+static int index_insert_term(fulltext_vtab *v, const char *zTerm, int nTerm,
+ sqlite_int64 iDocid, DocList *p){
+ sqlite_int64 iFirst;
+ sqlite_int64 iIndexRow;
+ DocList doclist;
+
+ int rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iFirst);
+ if( rc==SQLITE_DONE ){
+ /* No existing chunk: start from an empty doclist. */
+ docListInit(&doclist, DL_POSITIONS_OFFSETS, 0, 0);
+ if( docListUpdate(&doclist, iDocid, p) ){
+ rc = term_insert(v, zTerm, nTerm, iDocid, &doclist);
+ docListDestroy(&doclist);
+ return rc;
+ }
+ /* NOTE(review): doclist is not destroyed on this path; presumably it
+ ** holds no allocation when docListUpdate() returns 0 -- confirm. */
+ return SQLITE_OK;
+ }
+ if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
+
+ /* This word is in the index; add this document ID to its blob. */
+
+ rc = term_select(v, zTerm, nTerm, iFirst, &iIndexRow, &doclist);
+ if( rc!=SQLITE_OK ) return rc;
+
+ if( docListUpdate(&doclist, iDocid, p) ){
+ /* If the blob is too big, split it in half. */
+ if( doclist.nData>CHUNK_MAX ){
+ DocList half;
+ if( docListSplit(&doclist, &half) ){
+ /* The split-off half becomes a new row keyed by its first docid. */
+ rc = term_insert(v, zTerm, nTerm, firstDocid(&half), &half);
+ docListDestroy(&half);
+ if( rc!=SQLITE_OK ) goto err;
+ }
+ }
+ rc = term_update(v, iIndexRow, &doclist);
+ }
+
+err:
+ /* Reached on success as well: the loaded doclist is always released. */
+ docListDestroy(&doclist);
+ return rc;
+}
+
+/* Add a document to the full-text index and report its rowid through
+ * *piRowid.  The content row is written first; if [zText] is NULL there
+ * is nothing to index and the function stops there.  Otherwise every
+ * term of [zText] is inserted, stopping at the first failure. */
+static int index_insert(fulltext_vtab *v,
+                        sqlite3_value *pRequestRowid, const char *zText,
+                        sqlite_int64 *piRowid){
+  Hash terms;   /* maps term string -> PosList */
+  HashElem *pElem;
+  int rc;
+
+  rc = content_insert(v, pRequestRowid, zText, -1);
+  if( rc!=SQLITE_OK ) return rc;
+  *piRowid = sqlite3_last_insert_rowid(v->db);
+
+  if( zText==NULL ) return SQLITE_OK;   /* nothing to index */
+
+  rc = build_terms(&terms, v->pTokenizer, zText, *piRowid);
+  if( rc!=SQLITE_OK ) return rc;
+
+  /* Index each term until one fails. */
+  pElem = HashFirst(&terms);
+  while( pElem!=NULL && rc==SQLITE_OK ){
+    DocList *pList = HashData(pElem);
+    rc = index_insert_term(v, HashKey(pElem), HashKeysize(pElem),
+                           *piRowid, pList);
+    pElem = HashNext(pElem);
+  }
+
+  /* Release every doclist, whether or not indexing succeeded. */
+  pElem = HashFirst(&terms);
+  while( pElem!=NULL ){
+    DocList *pList = HashData(pElem);
+    docListDelete(pList);
+    pElem = HashNext(pElem);
+  }
+  HashClear(&terms);
+  return rc;
+}
+
+/* Remove document [iDocid] from the doclist chunk of [zTerm] that covers
+** it.  The chunk row is rewritten if it still holds data, or deleted if
+** it became empty.  Returns SQLITE_ERROR when no covering chunk is found.
+*/
+static int index_delete_term(fulltext_vtab *v, const char *zTerm, int nTerm,
+                             sqlite_int64 iDocid){
+  sqlite_int64 iChunkFirst;
+  sqlite_int64 iChunkRow;
+  DocList chunk;
+  int rc;
+
+  rc = term_chunk_select(v, zTerm, nTerm, iDocid, &iChunkFirst);
+  if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
+
+  rc = term_select(v, zTerm, nTerm, iChunkFirst, &iChunkRow, &chunk);
+  if( rc!=SQLITE_OK ) return rc;
+
+  if( docListUpdate(&chunk, iDocid, NULL) ){
+    /* An empty posting list is removed rather than stored. */
+    rc = chunk.nData>0 ? term_update(v, iChunkRow, &chunk)
+                       : term_delete(v, iChunkRow);
+  }
+  docListDestroy(&chunk);
+  return rc;
+}
+
+/* Delete a row from the full-text index.
+**
+** The stored text of row [iRow] is re-tokenized to find the terms whose
+** doclists mention it, each of those doclists is updated, and finally the
+** content row itself is deleted.  If updating any term fails, that error
+** is returned and the content row is left in place rather than being
+** deleted with stale index entries still pointing at it.
+*/
+static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
+  char *zText;
+  Hash terms;
+  HashElem *e;
+
+  int rc = content_select(v, iRow, &zText);
+  if( rc!=SQLITE_OK ) return rc;
+
+  rc = build_terms(&terms, v->pTokenizer, zText, iRow);
+  free(zText);
+  if( rc!=SQLITE_OK ) return rc;
+
+  for(e=HashFirst(&terms); e; e=HashNext(e)){
+    rc = index_delete_term(v, HashKey(e), HashKeysize(e), iRow);
+    if( rc!=SQLITE_OK ) break;
+  }
+  /* Release the doclists on both the success and the failure path. */
+  for(e=HashFirst(&terms); e; e=HashNext(e)){
+    DocList *p = HashData(e);
+    docListDelete(p);
+  }
+  HashClear(&terms);
+
+  /* Propagate a term-update failure instead of discarding it. */
+  if( rc!=SQLITE_OK ) return rc;
+
+  return content_delete(v, iRow);
+}
+
+/* Handle a write to the virtual table.
+**   nArg<2                  delete the row whose rowid is ppArg[0]
+**   ppArg[0] is not NULL    an update of an existing row; unsupported
+**   otherwise               insert: ppArg[1] = rowid, ppArg[2] = content
+*/
+static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
+                          sqlite_int64 *pRowid){
+  fulltext_vtab *v = (fulltext_vtab *) pVtab;
+  const char *zContent;
+
+  if( nArg>=2 ){
+    if( sqlite3_value_type(ppArg[0])!=SQLITE_NULL ){
+      return SQLITE_ERROR;   /* an update; not yet supported */
+    }
+    assert( nArg==3 );   /* ppArg[1] = rowid, ppArg[2] = content */
+    zContent = (const char *)sqlite3_value_text(ppArg[2]);
+    return index_insert(v, ppArg[1], zContent, pRowid);
+  }
+
+  return index_delete(v, sqlite3_value_int64(ppArg[0]));
+}
+
+/* Method table registered under the name "fulltext" by fulltext_init().
+** Entries are positional; the slot names in the comments follow the
+** sqlite3_module declaration in sqlite3.h.  Slots after xUpdate are not
+** listed and are therefore zero-initialized.
+*/
+static sqlite3_module fulltextModule = {
+ 0, /* iVersion */
+ fulltextCreate, /* xCreate */
+ fulltextConnect, /* xConnect */
+ fulltextBestIndex, /* xBestIndex */
+ fulltextDisconnect, /* xDisconnect */
+ fulltextDestroy, /* xDestroy */
+ fulltextOpen, /* xOpen */
+ fulltextClose, /* xClose */
+ fulltextFilter, /* xFilter */
+ fulltextNext, /* xNext */
+ fulltextEof, /* xEof */
+ fulltextColumn, /* xColumn */
+ fulltextRowid, /* xRowid */
+ fulltextUpdate /* xUpdate */
+};
+
+/* Register the "fulltext" virtual table module with [db] and return the
+** result code of sqlite3_create_module().
+*/
+int fulltext_init(sqlite3 *db){
+  int rc = sqlite3_create_module(db, "fulltext", &fulltextModule, 0);
+  return rc;
+}
+
+/* Entry point compiled only when this file is not built as part of the
+** SQLite core (SQLITE_CORE unset or 0).  It initializes the extension API
+** from [pApi] and registers the module through fulltext_init().
+** [pzErrMsg] is not used.  On _WIN32 the symbol is exported from the DLL.
+*/
+#if !SQLITE_CORE
+#ifdef _WIN32
+__declspec(dllexport)
+#endif
+int sqlite3_fulltext_init(sqlite3 *db, char **pzErrMsg,
+ const sqlite3_api_routines *pApi){
+ SQLITE_EXTENSION_INIT2(pApi)
+ return fulltext_init(db);
+}
+#endif
ADDED ext/fts1/fulltext.h
Index: ext/fts1/fulltext.h
==================================================================
--- /dev/null
+++ ext/fts1/fulltext.h
@@ -0,0 +1,11 @@
+/*
+** Public interface of the fts1 "fulltext" virtual table module.
+*/
+#ifndef _FULLTEXT_H_
+#define _FULLTEXT_H_
+
+#include "sqlite3.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  /* __cplusplus */
+
+/* Register the "fulltext" module with [db].  Returns an SQLite result
+** code. */
+int fulltext_init(sqlite3 *db);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif  /* __cplusplus */
+
+#endif /* _FULLTEXT_H_ */
ADDED ext/fts1/simple_tokenizer.c
Index: ext/fts1/simple_tokenizer.c
==================================================================
--- /dev/null
+++ ext/fts1/simple_tokenizer.c
@@ -0,0 +1,174 @@
+/*
+** The author disclaims copyright to this source code.
+**
+*************************************************************************
+** Implementation of the "simple" full-text-search tokenizer.
+*/
+
+#include <assert.h>
+#if !defined(__APPLE__)
+#include <malloc.h>
+#else
+#include <stdlib.h>
+#endif
+#include <stdio.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "tokenizer.h"
+
+/* Duplicate a string; the caller must free() the returned string.
+ * (We don't use strdup() since it's not part of the standard C library and
+ * may not be available everywhere.)
+ *
+ * Returns NULL if [s] is NULL or if memory cannot be allocated. */
+/* TODO(shess) Copied from fulltext.c, consider util.c for such
+** things. */
+static char *string_dup(const char *s){
+  size_t nByte;
+  char *str;
+
+  if( s==NULL ) return NULL;
+  nByte = strlen(s) + 1;   /* include the nul terminator */
+  str = malloc(nByte);
+  if( str!=NULL ){
+    memcpy(str, s, nByte);
+  }
+  return str;
+}
+
+/* Tokenizer instance created by simpleCreate().  zDelim is a heap copy of
+** the delimiter set and is freed by simpleDestroy(). */
+typedef struct simple_tokenizer {
+ sqlite3_tokenizer base;
+ const char *zDelim; /* token delimiters */
+} simple_tokenizer;
+
+/* Per-input cursor created by simpleOpen().  pInput is borrowed from the
+** caller; zToken is owned by the cursor and freed by simpleClose(). */
+typedef struct simple_tokenizer_cursor {
+ sqlite3_tokenizer_cursor base;
+ const char *pInput; /* input we are tokenizing */
+ int nBytes; /* size of the input */
+ const char *pCurrent; /* current position in pInput */
+ int iToken; /* index of next token to be returned */
+ char *zToken; /* storage for current token */
+ int nTokenBytes; /* actual size of current token */
+ int nTokenAllocated; /* space allocated to zToken buffer */
+} simple_tokenizer_cursor;
+
+static sqlite3_tokenizer_module simpleTokenizerModule;/* forward declaration */
+
+/* Create a "simple" tokenizer.
+**
+** If more than one argument is given, argv[1] is used as the set of
+** delimiter characters; otherwise every non-alphanumeric ASCII character
+** (1..0x7f) is a delimiter.  base.pModule is not set here.
+**
+** Returns SQLITE_OK and stores the tokenizer in *ppTokenizer, or
+** SQLITE_NOMEM (leaving *ppTokenizer untouched) if allocation fails.
+*/
+static int simpleCreate(
+  int argc, const char **argv,
+  sqlite3_tokenizer **ppTokenizer
+){
+  simple_tokenizer *t;
+
+  t = (simple_tokenizer *) malloc(sizeof(simple_tokenizer));
+  if( t==NULL ) return SQLITE_NOMEM;
+  /* TODO(shess) Delimiters need to remain the same from run to run,
+  ** else we need to reindex.  One solution would be a meta-table to
+  ** track such information in the database, then we'd only want this
+  ** information on the initial create.
+  */
+  if( argc>1 ){
+    t->zDelim = string_dup(argv[1]);
+  } else {
+    /* Build a string excluding alphanumeric ASCII characters */
+    char zDelim[0x80];               /* nul-terminated, so nul not a member */
+    int i, j;
+    for(i=1, j=0; i<0x80; i++){
+      if( !isalnum(i) ){
+        zDelim[j++] = (char)i;
+      }
+    }
+    zDelim[j++] = '\0';
+    assert( j<=(int)sizeof(zDelim) );
+    t->zDelim = string_dup(zDelim);
+  }
+  if( t->zDelim==NULL ){
+    /* Without a delimiter set the tokenizer is unusable. */
+    free(t);
+    return SQLITE_NOMEM;
+  }
+
+  *ppTokenizer = &t->base;
+  return SQLITE_OK;
+}
+
+/* Free a tokenizer created by simpleCreate(), including its delimiter
+** string.  Always returns SQLITE_OK.
+*/
+static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
+  simple_tokenizer *pSimple = (simple_tokenizer *) pTokenizer;
+  void *pDelim = (void *) pSimple->zDelim;
+
+  free(pDelim);
+  free(pSimple);
+
+  return SQLITE_OK;
+}
+
+/* Open a cursor over [pInput].  A negative [nBytes] means "use
+** strlen(pInput)"; a NULL [pInput] is treated as empty input.  The input
+** is not copied.  base.pTokenizer is not set here.
+**
+** Returns SQLITE_OK and stores the cursor in *ppCursor, or SQLITE_NOMEM
+** if the cursor cannot be allocated.
+*/
+static int simpleOpen(
+  sqlite3_tokenizer *pTokenizer,
+  const char *pInput, int nBytes,
+  sqlite3_tokenizer_cursor **ppCursor
+){
+  simple_tokenizer_cursor *c;
+
+  c = (simple_tokenizer_cursor *) malloc(sizeof(simple_tokenizer_cursor));
+  if( c==NULL ) return SQLITE_NOMEM;
+  c->pInput = pInput;
+  if( pInput==NULL ){
+    c->nBytes = 0;                   /* nothing to tokenize */
+  }else{
+    c->nBytes = nBytes<0 ? (int) strlen(pInput) : nBytes;
+  }
+  c->pCurrent = c->pInput;           /* start tokenizing at the beginning */
+  c->iToken = 0;
+  c->zToken = NULL;                  /* no space allocated, yet. */
+  c->nTokenBytes = 0;
+  c->nTokenAllocated = 0;
+
+  *ppCursor = &c->base;
+  return SQLITE_OK;
+}
+
+/* Free a cursor created by simpleOpen() together with its token buffer.
+** Always returns SQLITE_OK.
+*/
+static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
+  simple_tokenizer_cursor *pCsr = (simple_tokenizer_cursor *) pCursor;
+
+  free(pCsr->zToken);   /* free(NULL) is a no-op */
+  free(pCsr);
+
+  return SQLITE_OK;
+}
+
+/* Produce the next token of the cursor's input.
+**
+** A token is a maximal run of non-delimiter bytes.  ASCII letters are
+** folded to lower case; bytes >= 0x80 are copied unchanged.  On SQLITE_OK:
+**   *ppToken        nul-terminated token, valid until the next call
+**   *pnBytes        token length in bytes
+**   *piStartOffset  byte offset of the token in the input
+**   *piEndOffset    byte offset just past the token
+**   *piPosition     0-based index of the token
+** Returns SQLITE_DONE when the input is exhausted and SQLITE_NOMEM if the
+** token buffer cannot be grown.  The input must be nul-terminated.
+*/
+static int simpleNext(
+  sqlite3_tokenizer_cursor *pCursor,
+  const char **ppToken, int *pnBytes,
+  int *piStartOffset, int *piEndOffset, int *piPosition
+){
+  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
+  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
+  int ii;
+
+  while( c->pCurrent-c->pInput<c->nBytes ){
+    int nLeft = c->nBytes - (int) (c->pCurrent-c->pInput);
+    int n = (int) strcspn(c->pCurrent, t->zDelim);
+    /* Never let a token run past the declared end of the input. */
+    if( n>nLeft ) n = nLeft;
+    if( n>0 ){
+      if( n+1>c->nTokenAllocated ){
+        char *zNew = realloc(c->zToken, n+1);
+        if( zNew==NULL ) return SQLITE_NOMEM;
+        c->zToken = zNew;
+        c->nTokenAllocated = n+1;    /* remember the size so we can reuse it */
+      }
+      for(ii=0; ii<n; ii++){
+        /* TODO(shess) This needs expansion to handle UTF-8
+        ** case-insensitivity.
+        */
+        char ch = c->pCurrent[ii];
+        c->zToken[ii] = (unsigned char)ch<0x80 ? tolower((unsigned char)ch):ch;
+      }
+      c->zToken[n] = '\0';
+      *ppToken = c->zToken;
+      *pnBytes = n;
+      *piStartOffset = (int) (c->pCurrent-c->pInput);
+      *piEndOffset = *piStartOffset+n;
+      *piPosition = c->iToken++;
+      c->pCurrent += n + 1;          /* skip the token and one delimiter */
+
+      return SQLITE_OK;
+    }
+    c->pCurrent += n + 1;
+    /* TODO(shess) could strspn() to skip delimiters en masse.  Needs
+    ** to happen in two places, though, which is annoying.
+    */
+  }
+  return SQLITE_DONE;
+}
+
+/* Method table for the "simple" tokenizer.  Entries are positional and
+** follow the field order of struct sqlite3_tokenizer_module. */
+static sqlite3_tokenizer_module simpleTokenizerModule = {
+ 0, /* iVersion */
+ simpleCreate, /* xCreate */
+ simpleDestroy, /* xDestroy */
+ simpleOpen, /* xOpen */
+ simpleClose, /* xClose */
+ simpleNext, /* xNext */
+};
+
+/* Store the address of the "simple" tokenizer's method table in
+** *ppModule. */
+void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule){
+  sqlite3_tokenizer_module *pSimple = &simpleTokenizerModule;
+  *ppModule = pSimple;
+}
ADDED ext/fts1/tokenizer.h
Index: ext/fts1/tokenizer.h
==================================================================
--- /dev/null
+++ ext/fts1/tokenizer.h
@@ -0,0 +1,89 @@
+/*
+** 2006 July 10
+**
+** The author disclaims copyright to this source code.
+**
+*************************************************************************
+** Defines the interface to tokenizers used by fulltext-search. There
+** are three basic components:
+**
+** sqlite3_tokenizer_module is a singleton defining the tokenizer
+** interface functions. This is essentially the class structure for
+** tokenizers.
+**
+** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
+** including customization information defined at creation time.
+**
+** sqlite3_tokenizer_cursor is generated by a tokenizer to generate
+** tokens from a particular input.
+*/
+#ifndef _TOKENIZER_H_
+#define _TOKENIZER_H_
+
+/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
+** If tokenizers are to be allowed to call sqlite3_*() functions, then
+** we will need a way to register the API consistently.
+*/
+#include "sqlite3.h"
+
+/*
+** Structures used by the tokenizer interface.
+*/
+typedef struct sqlite3_tokenizer sqlite3_tokenizer;
+typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
+typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
+
+/* Table of function pointers implementing one kind of tokenizer. */
+struct sqlite3_tokenizer_module {
+ int iVersion; /* currently 0 */
+
+ /*
+ ** Create and destroy a tokenizer. argc/argv are passed down from
+ ** the fulltext virtual table creation to allow customization.
+ */
+ int (*xCreate)(int argc, const char **argv,
+ sqlite3_tokenizer **ppTokenizer);
+ int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
+
+ /*
+ ** Tokenize a particular input. Call xOpen() to prepare to
+ ** tokenize, xNext() repeatedly until it returns SQLITE_DONE, then
+ ** xClose() to free any internal state. The pInput passed to
+ ** xOpen() must exist until the cursor is closed. The ppToken
+ ** result from xNext() is only valid until the next call to xNext()
+ ** or until xClose() is called.
+ */
+ /* TODO(shess) current implementation requires pInput to be
+ ** nul-terminated. This should either be fixed, or pInput/nBytes
+ ** should be converted to zInput.
+ */
+ int (*xOpen)(sqlite3_tokenizer *pTokenizer,
+ const char *pInput, int nBytes,
+ sqlite3_tokenizer_cursor **ppCursor);
+ int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
+ /* Reports one token per call through the output parameters. */
+ int (*xNext)(sqlite3_tokenizer_cursor *pCursor,
+ const char **ppToken, int *pnBytes,
+ int *piStartOffset, int *piEndOffset, int *piPosition);
+};
+
+/* Base part of every tokenizer instance.  Implementations embed it as
+** their first member so a pointer to it can be cast back to the
+** implementation's own type. */
+struct sqlite3_tokenizer {
+ sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
+ /* Tokenizer implementations will typically add additional fields */
+};
+
+/* Base part of every tokenizer cursor.  Implementations embed it as
+** their first member so a pointer to it can be cast back to the
+** implementation's own type. */
+struct sqlite3_tokenizer_cursor {
+ sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
+ /* Tokenizer implementations will typically add additional fields */
+};
+
+/*
+** Get the module for a tokenizer which generates tokens based on a
+** set of non-token characters. The default is to break tokens at any
+** non-alnum character, though the set of delimiters can also be
+** specified by the first argv argument to xCreate().
+*/
+/* TODO(shess) This doesn't belong here. Need some sort of
+** registration process.
+*/
+void get_simple_tokenizer_module(sqlite3_tokenizer_module **ppModule);
+
+#endif /* _TOKENIZER_H_ */
ADDED ext/fts2/README.tokenizers
Index: ext/fts2/README.tokenizers
==================================================================
--- /dev/null
+++ ext/fts2/README.tokenizers
@@ -0,0 +1,133 @@
+
+1. FTS2 Tokenizers
+
+ When creating a new full-text table, FTS2 allows the user to select
+ the text tokenizer implementation to be used when indexing text
+ by specifying a "tokenizer" clause as part of the CREATE VIRTUAL TABLE
+ statement:
+
+ CREATE VIRTUAL TABLE <table-name> USING fts2(
+ <columns ...> [, tokenizer <tokenizer-name> [<tokenizer-args>]]
+ );
+
+ The built-in tokenizers (valid values to pass as <tokenizer-name>) are
+ "simple" and "porter".
+
+ <tokenizer-args> should consist of zero or more white-space separated
+ arguments to pass to the selected tokenizer implementation. The
+ interpretation of the arguments, if any, depends on the individual
+ tokenizer.
+
+2. Custom Tokenizers
+
+ FTS2 allows users to provide custom tokenizer implementations. The
+ interface used to create a new tokenizer is defined and described in
+ the fts2_tokenizer.h source file.
+
+ Registering a new FTS2 tokenizer is similar to registering a new
+ virtual table module with SQLite. The user passes a pointer to a
+ structure containing pointers to various callback functions that
+ make up the implementation of the new tokenizer type. For tokenizers,
+ the structure (defined in fts2_tokenizer.h) is called
+ "sqlite3_tokenizer_module".
+
+ FTS2 does not expose a C-function that users call to register new
+ tokenizer types with a database handle. Instead, the pointer must
+ be encoded as an SQL blob value and passed to FTS2 through the SQL
+ engine by evaluating a special scalar function, "fts2_tokenizer()".
+ The fts2_tokenizer() function may be called with one or two arguments,
+ as follows:
+
+ SELECT fts2_tokenizer(<tokenizer-name>);
+ SELECT fts2_tokenizer(<tokenizer-name>, <sqlite3_tokenizer_module ptr>);
+
+ Where <tokenizer-name> is a string identifying the tokenizer and
+ <sqlite3_tokenizer_module ptr> is a pointer to an sqlite3_tokenizer_module
+ structure encoded as an SQL blob. If the second argument is present,
+ it is registered as tokenizer <tokenizer-name> and a copy of it is
+ returned. If only one argument is passed, a pointer to the tokenizer
+ implementation currently registered as <tokenizer-name> is returned,
+ encoded as a blob. Or, if no such tokenizer exists, an SQL exception
+ (error) is raised.
+
+ SECURITY: If the fts2 extension is used in an environment where potentially
+ malicious users may execute arbitrary SQL (i.e. gears), they should be
+ prevented from invoking the fts2_tokenizer() function, possibly using the
+ authorisation callback.
+
+ See "Sample code" below for an example of calling the fts2_tokenizer()
+ function from C code.
+
+3. ICU Library Tokenizers
+
+ If this extension is compiled with the SQLITE_ENABLE_ICU pre-processor
+ symbol defined, then there exists a built-in tokenizer named "icu"
+ implemented using the ICU library. The first argument passed to the
+ xCreate() method (see fts2_tokenizer.h) of this tokenizer may be
+ an ICU locale identifier. For example "tr_TR" for Turkish as used
+ in Turkey, or "en_AU" for English as used in Australia. For example:
+
+ "CREATE VIRTUAL TABLE thai_text USING fts2(text, tokenizer icu th_TH)"
+
+ The ICU tokenizer implementation is very simple. It splits the input
+ text according to the ICU rules for finding word boundaries and discards
+ any tokens that consist entirely of white-space. This may be suitable
+ for some applications in some locales, but not all. If more complex
+ processing is required, for example to implement stemming or
+ discard punctuation, this can be done by creating a tokenizer
+ implementation that uses the ICU tokenizer as part of its implementation.
+
+ When using the ICU tokenizer this way, it is safe to overwrite the
+ contents of the strings returned by the xNext() method (see
+ fts2_tokenizer.h).
+
+4. Sample code.
+
+ The following two code samples illustrate the way C code should invoke
+ the fts2_tokenizer() scalar function:
+
+ int registerTokenizer(
+ sqlite3 *db,
+ char *zName,
+ const sqlite3_tokenizer_module *p
+ ){
+ int rc;
+ sqlite3_stmt *pStmt;
+ const char zSql[] = "SELECT fts2_tokenizer(?, ?)";
+
+ rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
+ sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
+ sqlite3_step(pStmt);
+
+ return sqlite3_finalize(pStmt);
+ }
+
+ int queryTokenizer(
+ sqlite3 *db,
+ char *zName,
+ const sqlite3_tokenizer_module **pp
+ ){
+ int rc;
+ sqlite3_stmt *pStmt;
+ const char zSql[] = "SELECT fts2_tokenizer(?)";
+
+ *pp = 0;
+ rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
+ if( rc!=SQLITE_OK ){
+ return rc;
+ }
+
+ sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
+ if( SQLITE_ROW==sqlite3_step(pStmt) ){
+ if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
+ memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
+ }
+ }
+
+ return sqlite3_finalize(pStmt);
+ }
ADDED ext/fts2/README.txt
Index: ext/fts2/README.txt
==================================================================
--- /dev/null
+++ ext/fts2/README.txt
@@ -0,0 +1,4 @@
+This folder contains source code to the second full-text search
+extension for SQLite. While the API is the same, this version uses a
+substantially different storage schema from fts1, so tables will need
+to be rebuilt.
ADDED ext/fts2/fts2.c
Index: ext/fts2/fts2.c
==================================================================
--- /dev/null
+++ ext/fts2/fts2.c
@@ -0,0 +1,6860 @@
+/* fts2 has a design flaw which can lead to database corruption (see
+** below). It is recommended not to use it any longer, instead use
+** fts3 (or higher). If you believe that your use of fts2 is safe,
+** add -DSQLITE_ENABLE_BROKEN_FTS2=1 to your CFLAGS.
+*/
+#if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)) \
+ && !defined(SQLITE_ENABLE_BROKEN_FTS2)
+#error fts2 has a design flaw and has been deprecated.
+#endif
+/* The flaw is that fts2 uses the content table's unaliased rowid as
+** the unique docid. fts2 embeds the rowid in the index it builds,
+** and expects the rowid to not change. The SQLite VACUUM operation
+** will renumber such rowids, thereby breaking fts2. If you are using
+** fts2 in a system which has disabled VACUUM, then you can continue
+** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
+** VACUUM, though systems using auto_vacuum are unlikely to invoke
+** VACUUM.
+**
+** Unlike fts1, which is safe across VACUUM if you never delete
+** documents, fts2 has a second exposure to this flaw, in the segments
+** table. So fts2 should be considered unsafe across VACUUM in all
+** cases.
+*/
+
+/*
+** 2006 Oct 10
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+******************************************************************************
+**
+** This is an SQLite module implementing full-text search.
+*/
+
+/*
+** The code in this file is only compiled if:
+**
+** * The FTS2 module is being built as an extension
+** (in which case SQLITE_CORE is not defined), or
+**
+** * The FTS2 module is being built into the core of
+** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
+*/
+
+/* TODO(shess) Consider exporting this comment to an HTML file or the
+** wiki.
+*/
+/* The full-text index is stored in a series of b+tree (-like)
+** structures called segments which map terms to doclists. The
+** structures are like b+trees in layout, but are constructed from the
+** bottom up in optimal fashion and are not updatable. Since trees
+** are built from the bottom up, things will be described from the
+** bottom up.
+**
+**
+**** Varints ****
+** The basic unit of encoding is a variable-length integer called a
+** varint. We encode variable-length integers in little-endian order
+** using seven bits per byte as follows:
+**
+** KEY:
+** A = 0xxxxxxx 7 bits of data and one flag bit
+** B = 1xxxxxxx 7 bits of data and one flag bit
+**
+** 7 bits - A
+** 14 bits - BA
+** 21 bits - BBA
+** and so on.
+**
+** This is identical to how sqlite encodes varints (see util.c).
+**
+**
+**** Document lists ****
+** A doclist (document list) holds a docid-sorted list of hits for a
+** given term. Doclists hold docids, and can optionally associate
+** token positions and offsets with docids.
+**
+** A DL_POSITIONS_OFFSETS doclist is stored like this:
+**
+** array {
+** varint docid;
+** array { (position list for column 0)
+** varint position; (delta from previous position plus POS_BASE)
+** varint startOffset; (delta from previous startOffset)
+** varint endOffset; (delta from startOffset)
+** }
+** array {
+** varint POS_COLUMN; (marks start of position list for new column)
+** varint column; (index of new column)
+** array {
+** varint position; (delta from previous position plus POS_BASE)
+** varint startOffset;(delta from previous startOffset)
+** varint endOffset; (delta from startOffset)
+** }
+** }
+** varint POS_END; (marks end of positions for this document.)
+** }
+**
+** Here, array { X } means zero or more occurrences of X, adjacent in
+** memory. A "position" is an index of a token in the token stream
+** generated by the tokenizer, while an "offset" is a byte offset,
+** both based at 0. Note that POS_END and POS_COLUMN occur in the
+** same logical place as the position element, and act as sentinels
+** ending a position list array.
+**
+** A DL_POSITIONS doclist omits the startOffset and endOffset
+** information. A DL_DOCIDS doclist omits both the position and
+** offset information, becoming an array of varint-encoded docids.
+**
+** On-disk data is stored as type DL_DEFAULT, so we don't serialize
+** the type. Due to how deletion is implemented in the segmentation
+** system, on-disk doclists MUST store at least positions.
+**
+**
+**** Segment leaf nodes ****
+** Segment leaf nodes store terms and doclists, ordered by term. Leaf
+** nodes are written using LeafWriter, and read using LeafReader (to
+** iterate through a single leaf node's data) and LeavesReader (to
+** iterate through a segment's entire leaf layer). Leaf nodes have
+** the format:
+**
+** varint iHeight; (height from leaf level, always 0)
+** varint nTerm; (length of first term)
+** char pTerm[nTerm]; (content of first term)
+** varint nDoclist; (length of term's associated doclist)
+** char pDoclist[nDoclist]; (content of doclist)
+** array {
+** (further terms are delta-encoded)
+** varint nPrefix; (length of prefix shared with previous term)
+** varint nSuffix; (length of unshared suffix)
+** char pTermSuffix[nSuffix];(unshared suffix of next term)
+** varint nDoclist; (length of term's associated doclist)
+** char pDoclist[nDoclist]; (content of doclist)
+** }
+**
+** Here, array { X } means zero or more occurrences of X, adjacent in
+** memory.
+**
+** Leaf nodes are broken into blocks which are stored contiguously in
+** the %_segments table in sorted order. This means that when the end
+** of a node is reached, the next term is in the node with the next
+** greater node id.
+**
+** New data is spilled to a new leaf node when the current node
+** exceeds LEAF_MAX bytes (default 2048). New data which itself is
+** larger than STANDALONE_MIN (default 1024) is placed in a standalone
+** node (a leaf node with a single term and doclist). The goal of
+** these settings is to pack together groups of small doclists while
+** making it efficient to directly access large doclists. The
+** assumption is that large doclists represent terms which are more
+** likely to be query targets.
+**
+** TODO(shess) It may be useful for blocking decisions to be more
+** dynamic. For instance, it may make more sense to have a 2.5k leaf
+** node rather than splitting into 2k and .5k nodes. My intuition is
+** that this might extend through 2x or 4x the pagesize.
+**
+**
+**** Segment interior nodes ****
+** Segment interior nodes store blockids for subtree nodes and terms
+** to describe what data is stored by each subtree. Interior
+** nodes are written using InteriorWriter, and read using
+** InteriorReader. InteriorWriters are created as needed when
+** SegmentWriter creates new leaf nodes, or when an interior node
+** itself grows too big and must be split. The format of interior
+** nodes:
+**
+** varint iHeight; (height from leaf level, always >0)
+** varint iBlockid; (block id of node's leftmost subtree)
+** optional {
+** varint nTerm; (length of first term)
+** char pTerm[nTerm]; (content of first term)
+** array {
+** (further terms are delta-encoded)
+** varint nPrefix; (length of shared prefix with previous term)
+** varint nSuffix; (length of unshared suffix)
+** char pTermSuffix[nSuffix]; (unshared suffix of next term)
+** }
+** }
+**
+** Here, optional { X } means an optional element, while array { X }
+** means zero or more occurrences of X, adjacent in memory.
+**
+** An interior node encodes n terms separating n+1 subtrees. The
+** subtree blocks are contiguous, so only the first subtree's blockid
+** is encoded. The subtree at iBlockid will contain all terms less
+** than the first term encoded (or all terms if no term is encoded).
+** Otherwise, for terms greater than or equal to pTerm[i] but less
+** than pTerm[i+1], the subtree for that term will be rooted at
+** iBlockid+i. Interior nodes only store enough term data to
+** distinguish adjacent children (if the rightmost term of the left
+** child is "something", and the leftmost term of the right child is
+** "wicked", only "w" is stored).
+**
+** New data is spilled to a new interior node at the same height when
+** the current node exceeds INTERIOR_MAX bytes (default 2048).
+** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
+** interior nodes and making the tree too skinny. The interior nodes
+** at a given height are naturally tracked by interior nodes at
+** height+1, and so on.
+**
+**
+**** Segment directory ****
+** The segment directory in table %_segdir stores meta-information for
+** merging and deleting segments, and also the root node of the
+** segment's tree.
+**
+** The root node is the top node of the segment's tree after encoding
+** the entire segment, restricted to ROOT_MAX bytes (default 1024).
+** This could be either a leaf node or an interior node. If the top
+** node requires more than ROOT_MAX bytes, it is flushed to %_segments
+** and a new root interior node is generated (which should always fit
+** within ROOT_MAX because it only needs space for 2 varints, the
+** height and the blockid of the previous root).
+**
+** The meta-information in the segment directory is:
+** level - segment level (see below)
+** idx - index within level
+** - (level,idx uniquely identify a segment)
+** start_block - first leaf node
+** leaves_end_block - last leaf node
+** end_block - last block (including interior nodes)
+** root - contents of root node
+**
+** If the root node is a leaf node, then start_block,
+** leaves_end_block, and end_block are all 0.
+**
+**
+**** Segment merging ****
+** To amortize update costs, segments are grouped into levels and
+** merged in batches. Each increase in level represents exponentially
+** more documents.
+**
+** New documents (actually, document updates) are tokenized and
+** written individually (using LeafWriter) to a level 0 segment, with
+** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
+** level 0 segments are merged into a single level 1 segment. Level 1
+** is populated like level 0, and eventually MERGE_COUNT level 1
+** segments are merged to a single level 2 segment (representing
+** MERGE_COUNT^2 updates), and so on.
+**
+** A segment merge traverses all segments at a given level in
+** parallel, performing a straightforward sorted merge. Since segment
+** leaf nodes are written in to the %_segments table in order, this
+** merge traverses the underlying sqlite disk structures efficiently.
+** After the merge, all segment blocks from the merged level are
+** deleted.
+**
+** MERGE_COUNT controls how often we merge segments. 16 seems to be
+** somewhat of a sweet spot for insertion performance. 32 and 64 show
+** very similar performance numbers to 16 on insertion, though they're
+** a tiny bit slower (perhaps due to more overhead in merge-time
+** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
+** 16, 2 about 66% slower than 16.
+**
+** At query time, high MERGE_COUNT increases the number of segments
+** which need to be scanned and merged. For instance, with 100k docs
+** inserted:
+**
+** MERGE_COUNT segments
+** 16 25
+** 8 12
+** 4 10
+** 2 6
+**
+** This appears to have only a moderate impact on queries for very
+** frequent terms (which are somewhat dominated by segment merge
+** costs), and infrequent and non-existent terms still seem to be fast
+** even with many segments.
+**
+** TODO(shess) That said, it would be nice to have a better query-side
+** argument for MERGE_COUNT of 16. Also, it is possible/likely that
+** optimizations to things like doclist merging will swing the sweet
+** spot around.
+**
+**
+**
+**** Handling of deletions and updates ****
+** Since we're using a segmented structure, with no docid-oriented
+** index into the term index, we clearly cannot simply update the term
+** index when a document is deleted or updated. For deletions, we
+** write an empty doclist (varint(docid) varint(POS_END)), for updates
+** we simply write the new doclist. Segment merges overwrite older
+** data for a particular docid with newer data, so deletes or updates
+** will eventually overtake the earlier data and knock it out. The
+** query logic likewise merges doclists so that newer data knocks out
+** older data.
+**
+** TODO(shess) Provide a VACUUM type operation to clear out all
+** deletions and duplications. This would basically be a forced merge
+** into a single segment.
+*/
+
+#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
+
+#if defined(SQLITE_ENABLE_FTS2) && !defined(SQLITE_CORE)
+# define SQLITE_CORE 1
+#endif
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "fts2.h"
+#include "fts2_hash.h"
+#include "fts2_tokenizer.h"
+#include "sqlite3.h"
+#include "sqlite3ext.h"
+SQLITE_EXTENSION_INIT1
+
+
+/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
+** would be nice to order the file better, perhaps something along the
+** lines of:
+**
+** - utility functions
+** - table setup functions
+** - table update functions
+** - table query functions
+**
+** Put the query functions last because they're likely to reference
+** typedefs or functions from the table update section.
+*/
+
+#if 0
+# define TRACE(A) printf A; fflush(stdout)
+#else
+# define TRACE(A)
+#endif
+
+/* It is not safe to call isspace(), tolower(), or isalnum() on
+** hi-bit-set characters. This is the same solution used in the
+** tokenizer.
+*/
+/* TODO(shess) The snippet-generation code should be using the
+** tokenizer-generated tokens rather than doing its own local
+** tokenization.
+*/
+/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
+static int safe_isspace(char c){
+  /* Exactly the six ASCII white-space characters; nothing else matches. */
+  switch( c ){
+    case ' ':
+    case '\t':
+    case '\n':
+    case '\r':
+    case '\v':
+    case '\f':
+      return 1;
+    default:
+      return 0;
+  }
+}
+static int safe_tolower(char c){
+  /* Only 'A'..'Z' are folded; every other value is returned unchanged. */
+  if( c<'A' || c>'Z' ) return c;
+  return c - 'A' + 'a';
+}
+static int safe_isalnum(char c){
+  /* True for ASCII digits and ASCII letters of either case. */
+  if( c>='0' && c<='9' ) return 1;
+  if( c>='A' && c<='Z' ) return 1;
+  return c>='a' && c<='z';
+}
+
+/* How much detail a doclist carries for each document.  The order of
+** the enumerators is significant: code in this file compares them with
+** relational operators (for example iType>=DL_POSITIONS).
+*/
+typedef enum DocListType {
+ DL_DOCIDS, /* docids only */
+ DL_POSITIONS, /* docids + positions */
+ DL_POSITIONS_OFFSETS /* docids + positions + offsets */
+} DocListType;
+
+/*
+** By default, only positions and not offsets are stored in the doclists.
+** To change this so that offsets are stored too, compile with
+**
+** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
+**
+** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
+** into (no deletes or updates).
+*/
+#ifndef DL_DEFAULT
+# define DL_DEFAULT DL_POSITIONS
+#endif
+
+/* Reserved values that may appear where a position is expected in a
+** position list.
+*/
+enum {
+ POS_END = 0, /* end of this position list */
+ POS_COLUMN, /* followed by new column number */
+ POS_BASE /* bias added to each stored position delta */
+};
+
+/* MERGE_COUNT controls how often we merge segments (see comment at
+** top of file).
+*/
+#define MERGE_COUNT 16
+
+/* utility functions */
+
+/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
+** record to prevent errors of the form:
+**
+** my_function(SomeType *b){
+** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
+** }
+*/
+/* TODO(shess) Obvious candidates for a header file. */
+#define CLEAR(b) memset(b, '\0', sizeof(*(b)))
+
+#ifndef NDEBUG
+# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
+#else
+# define SCRAMBLE(b)
+#endif
+
+/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
+#define VARINT_MAX 10
+
+/* Encode v as a variable-length integer starting at p[0], seven bits
+** per byte with the least-significant group first.  Every byte except
+** the last has its high bit set.  Between 1 and VARINT_MAX bytes are
+** written and the number written is returned.
+*/
+static int putVarint(char *p, sqlite_int64 v){
+  unsigned char *zOut = (unsigned char *) p;
+  sqlite_uint64 uRest = v;
+  int nByte = 0;
+  for(;;){
+    zOut[nByte++] = (unsigned char) ((uRest & 0x7f) | 0x80);
+    uRest >>= 7;
+    if( uRest==0 ) break;
+  }
+  zOut[nByte-1] &= 0x7f;  /* the final byte carries no continuation flag */
+  assert( nByte <= VARINT_MAX );
+  return nByte;
+}
+
+/* Decode a variable-length integer starting at p[0] and store it in *v.
+** Returns the number of bytes consumed.  If VARINT_MAX bytes go by
+** without a terminating byte the data is bad: an assert fires in debug
+** builds, 0 is returned, and *v is not written.
+*/
+static int getVarint(const char *p, sqlite_int64 *v){
+  const unsigned char *zIn = (const unsigned char *) p;
+  sqlite_uint64 uValue = 0;
+  sqlite_uint64 uScale = 1;      /* weight of the current 7-bit group */
+  int nByte = 0;
+  while( (zIn[nByte] & 0x80)!=0 ){
+    uValue += uScale * (zIn[nByte] & 0x7f);
+    uScale <<= 7;
+    nByte++;
+    if( nByte >= VARINT_MAX ){   /* bad data */
+      assert( 0 );
+      return 0;
+    }
+  }
+  uValue += uScale * zIn[nByte];
+  nByte++;
+  *v = (sqlite_int64) uValue;
+  return nByte;
+}
+
+/* Decode the varint at p and store it in *pi narrowed to int.  Debug
+** builds assert that the narrowing lost nothing.  The return value is
+** whatever getVarint() returned.
+*/
+static int getVarint32(const char *p, int *pi){
+  sqlite_int64 iWide;
+  int nByte = getVarint(p, &iWide);
+  *pi = (int) iWide;
+  assert( *pi==iWide );
+  return nByte;
+}
+
+/*******************************************************************/
+/* DataBuffer is used to collect data into a buffer in piecemeal
+** fashion. It implements the usual distinction between amount of
+** data currently stored (nData) and buffer capacity (nCapacity).
+**
+** dataBufferInit - create a buffer with given initial capacity.
+** dataBufferReset - forget buffer's data, retaining capacity.
+** dataBufferDestroy - free buffer's data.
+** dataBufferSwap - swap contents of two buffers.
+** dataBufferExpand - expand capacity without adding data.
+** dataBufferAppend - append data.
+** dataBufferAppend2 - append two pieces of data at once.
+** dataBufferReplace - replace buffer's data.
+*/
+/* Growable byte buffer.  pData may be NULL when the buffer was created
+** with zero capacity (dataBufferInit allocates nothing in that case).
+*/
+typedef struct DataBuffer {
+ char *pData; /* Pointer to malloc'ed buffer. */
+ int nCapacity; /* Size of pData buffer. */
+ int nData; /* End of data loaded into pData. */
+} DataBuffer;
+
+/* Start pBuffer off empty with room for nCapacity bytes.  A capacity of
+** zero allocates nothing and leaves pData NULL.  The pointer returned
+** by sqlite3_malloc() is stored without being checked.
+*/
+static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
+  assert( nCapacity>=0 );
+  pBuffer->pData = NULL;
+  if( nCapacity!=0 ){
+    pBuffer->pData = sqlite3_malloc(nCapacity);
+  }
+  pBuffer->nCapacity = nCapacity;
+  pBuffer->nData = 0;
+}
+/* Forget the buffered bytes; the allocation and nCapacity are kept. */
+static void dataBufferReset(DataBuffer *pBuffer){
+  pBuffer->nData = 0;
+}
+/* Release the allocation, if there is one.  Debug builds then fill the
+** struct with a scramble pattern via SCRAMBLE().
+*/
+static void dataBufferDestroy(DataBuffer *pBuffer){
+  if( pBuffer->pData ){
+    sqlite3_free(pBuffer->pData);
+  }
+  SCRAMBLE(pBuffer);
+}
+/* Exchange the complete contents (pointer, capacity, length) of the
+** two buffers.
+*/
+static void dataBufferSwap(DataBuffer *pBuffer1, DataBuffer *pBuffer2){
+  DataBuffer saved;
+  saved = *pBuffer2;
+  *pBuffer2 = *pBuffer1;
+  *pBuffer1 = saved;
+}
+/* Make sure nAddCapacity more bytes fit after the current data.  When
+** the buffer is too small it grows to exactly nData+nAddCapacity;
+** otherwise nothing changes.
+** NOTE(review): the sqlite3_realloc() result is stored unchecked --
+** confirm how callers are expected to cope with allocation failure.
+*/
+static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
+  int nNeeded;
+  assert( nAddCapacity>0 );
+  nNeeded = pBuffer->nData+nAddCapacity;
+  if( nNeeded<=pBuffer->nCapacity ) return;
+  /* TODO(shess) Consider expanding more aggressively.  Note that the
+  ** underlying malloc implementation may take care of such things for
+  ** us already.
+  */
+  pBuffer->nCapacity = nNeeded;
+  pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
+}
+/* Append nSource bytes from pSource, growing the buffer if needed.
+** Both a non-NULL source and a positive length are asserted.
+*/
+static void dataBufferAppend(DataBuffer *pBuffer,
+                             const char *pSource, int nSource){
+  char *zDest;
+  assert( nSource>0 && pSource!=NULL );
+  dataBufferExpand(pBuffer, nSource);
+  zDest = pBuffer->pData+pBuffer->nData;
+  memcpy(zDest, pSource, nSource);
+  pBuffer->nData += nSource;
+}
+/* Append two pieces back to back with a single capacity check. */
+static void dataBufferAppend2(DataBuffer *pBuffer,
+                              const char *pSource1, int nSource1,
+                              const char *pSource2, int nSource2){
+  const int nTotal = nSource1+nSource2;
+  char *zDest;
+  assert( nSource1>0 && pSource1!=NULL );
+  assert( nSource2>0 && pSource2!=NULL );
+  dataBufferExpand(pBuffer, nTotal);
+  zDest = pBuffer->pData+pBuffer->nData;
+  memcpy(zDest, pSource1, nSource1);
+  memcpy(zDest+nSource1, pSource2, nSource2);
+  pBuffer->nData += nTotal;
+}
+/* Throw away the current contents and store pSource/nSource instead. */
+static void dataBufferReplace(DataBuffer *pBuffer,
+                              const char *pSource, int nSource){
+  pBuffer->nData = 0;
+  dataBufferAppend(pBuffer, pSource, nSource);
+}
+
+/* StringBuffer is a null-terminated version of DataBuffer.  b.nData
+** counts the terminator, so the string length is b.nData-1 (see
+** stringBufferLength).
+*/
+typedef struct StringBuffer {
+ DataBuffer b; /* Includes null terminator. */
+} StringBuffer;
+
+/* Set sb up as the empty string: 100 bytes of capacity holding only the
+** terminating NUL.
+*/
+static void initStringBuffer(StringBuffer *sb){
+  DataBuffer *pBuf = &sb->b;
+  dataBufferInit(pBuf, 100);
+  dataBufferReplace(pBuf, "", 1);
+}
+/* Length of the string, not counting the terminator. */
+static int stringBufferLength(StringBuffer *sb){
+  int nWithTerminator = sb->b.nData;
+  return nWithTerminator-1;
+}
+/* Pointer to the buffered, NUL-terminated text. */
+static char *stringBufferData(StringBuffer *sb){
+  DataBuffer *pBuf = &sb->b;
+  return pBuf->pData;
+}
+/* Release the underlying DataBuffer. */
+static void stringBufferDestroy(StringBuffer *sb){
+  DataBuffer *pBuf = &sb->b;
+  dataBufferDestroy(pBuf);
+}
+
+/* Append the first nFrom bytes of zFrom and re-terminate the string.
+** A non-positive nFrom leaves the buffer untouched.
+*/
+static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
+  assert( sb->b.nData>0 );
+  if( nFrom<=0 ) return;
+  sb->b.nData -= 1;   /* step back over the old terminator */
+  dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
+}
+/* Append the whole NUL-terminated string zFrom. */
+static void append(StringBuffer *sb, const char *zFrom){
+  int nFrom = strlen(zFrom);
+  nappend(sb, zFrom, nFrom);
+}
+
+/* Append the nString entries of azString to sb, separated by ", ".
+** No separator is written before the first entry.
+*/
+static void appendList(StringBuffer *sb, int nString, char **azString){
+  int i;
+  for(i=0; i<nString; ++i){
+    if( i>0 ) append(sb, ", ");
+    append(sb, azString[i]);
+  }
+}
+
+/* True if the string is non-empty and its last character is white
+** space as judged by safe_isspace().
+*/
+static int endsInWhiteSpace(StringBuffer *p){
+  int nLen = stringBufferLength(p);
+  if( nLen<=0 ) return 0;
+  return safe_isspace(stringBufferData(p)[nLen-1])!=0;
+}
+
+/* If the StringBuffer ends in something other than white space, add a
+** single space character to the end.  An empty buffer stays empty.
+*/
+static void appendWhiteSpace(StringBuffer *p){
+  if( stringBufferLength(p)!=0 && !endsInWhiteSpace(p) ){
+    append(p, " ");
+  }
+}
+
+/* Remove white space from the end of the StringBuffer */
+static void trimWhiteSpace(StringBuffer *p){
+  while( endsInWhiteSpace(p) ){
+    /* Shorten by one and move the terminator onto the dropped byte. */
+    p->b.nData--;
+    p->b.pData[p->b.nData-1] = '\0';
+  }
+}
+
+/*******************************************************************/
+/* DLReader is used to read document elements from a doclist. The
+** current docid is cached, so dlrDocid() is fast. DLReader does not
+** own the doclist buffer.
+**
+** dlrAtEnd - true if there's no more data to read.
+** dlrDocid - docid of current document.
+** dlrDocData - doclist data for current document (including docid).
+** dlrDocDataBytes - length of same.
+** dlrAllDataBytes - length of all remaining data.
+** dlrPosData - position data for current document.
+** dlrPosDataLen - length of pos data for current document (incl POS_END).
+** dlrStep - advance to the next document.
+** dlrInit - initialize for a doclist of given type against given data.
+** dlrDestroy - clean up.
+**
+** Expected usage is something like:
+**
+** DLReader reader;
+** dlrInit(&reader, iType, pData, nData);
+** while( !dlrAtEnd(&reader) ){
+** // calls to dlrDocid() and kin.
+** dlrStep(&reader);
+** }
+** dlrDestroy(&reader);
+*/
+/* Cursor over a doclist.  pData/nData always describe the data that
+** remains, beginning with the current element.
+*/
+typedef struct DLReader {
+ DocListType iType; /* type of doclist being read */
+ const char *pData; /* start of the current element */
+ int nData; /* bytes remaining, current element included */
+
+ sqlite_int64 iDocid; /* docid of the current element */
+ int nElement; /* size in bytes of the current element */
+} DLReader;
+
+/* True once no doclist data remains. */
+static int dlrAtEnd(DLReader *pReader){
+  assert( pReader->nData>=0 );
+  return !pReader->nData;
+}
+/* Docid of the current element (cached by dlrStep). */
+static sqlite_int64 dlrDocid(DLReader *pReader){
+  sqlite_int64 iCurrent;
+  assert( !dlrAtEnd(pReader) );
+  iCurrent = pReader->iDocid;
+  return iCurrent;
+}
+/* Start of the current element, docid varint included. */
+static const char *dlrDocData(DLReader *pReader){
+  const char *pElement;
+  assert( !dlrAtEnd(pReader) );
+  pElement = pReader->pData;
+  return pElement;
+}
+/* Size in bytes of the current element. */
+static int dlrDocDataBytes(DLReader *pReader){
+  int nBytes;
+  assert( !dlrAtEnd(pReader) );
+  nBytes = pReader->nElement;
+  return nBytes;
+}
+/* Size in bytes of everything left, current element included. */
+static int dlrAllDataBytes(DLReader *pReader){
+  int nBytes;
+  assert( !dlrAtEnd(pReader) );
+  nBytes = pReader->nData;
+  return nBytes;
+}
+/* TODO(shess) Consider adding a field to track iDocid varint length
+** to make these two functions faster.  This might matter (a tiny bit)
+** for queries.
+*/
+/* Pointer just past the docid varint of the current element. */
+static const char *dlrPosData(DLReader *pReader){
+  sqlite_int64 iIgnored;
+  int nDocid = getVarint(pReader->pData, &iIgnored);
+  assert( !dlrAtEnd(pReader) );
+  return pReader->pData + nDocid;
+}
+/* Size of the current element with the docid varint left out. */
+static int dlrPosDataLen(DLReader *pReader){
+  sqlite_int64 iIgnored;
+  int nDocid = getVarint(pReader->pData, &iIgnored);
+  assert( !dlrAtEnd(pReader) );
+  return pReader->nElement - nDocid;
+}
+/* Move the reader past the current element and, if data remains,
+** decode the next one: iDocid is advanced by the stored delta and
+** nElement is set to the new element's size in bytes.  Must not be
+** called once dlrAtEnd() is true.
+*/
+static void dlrStep(DLReader *pReader){
+  assert( !dlrAtEnd(pReader) );
+
+  /* Skip past current doclist element. */
+  assert( pReader->nElement<=pReader->nData );
+  pReader->pData += pReader->nElement;
+  pReader->nData -= pReader->nElement;
+
+  /* If there is more data, read the next doclist element. */
+  if( pReader->nData!=0 ){
+    sqlite_int64 iDocidDelta;
+    int iDummy, n = getVarint(pReader->pData, &iDocidDelta);
+    pReader->iDocid += iDocidDelta;
+    if( pReader->iType>=DL_POSITIONS ){
+      /* A position list follows the docid, so at least the POS_END
+      ** marker must still lie ahead.
+      */
+      assert( n<pReader->nData );
+      while( 1 ){
+        n += getVarint32(pReader->pData+n, &iDummy);
+        assert( n<=pReader->nData );
+        if( iDummy==POS_END ) break;
+        if( iDummy==POS_COLUMN ){
+          /* Skip the column number. */
+          n += getVarint32(pReader->pData+n, &iDummy);
+          assert( n<pReader->nData );
+        }else if( pReader->iType==DL_POSITIONS_OFFSETS ){
+          /* Skip the start and end offset that follow each position. */
+          n += getVarint32(pReader->pData+n, &iDummy);
+          n += getVarint32(pReader->pData+n, &iDummy);
+          assert( n<pReader->nData );
+        }
+      }
+    }
+    pReader->nElement = n;
+    assert( pReader->nElement<=pReader->nData );
+  }
+}
+/* Point pReader at the nData bytes of doclist at pData, which hold a
+** doclist of type iType, and load the first element.  The data must be
+** non-NULL and non-empty.
+*/
+static void dlrInit(DLReader *pReader, DocListType iType,
+                    const char *pData, int nData){
+  assert( pData!=NULL && nData!=0 );
+  pReader->iDocid = 0;
+  pReader->nElement = 0;
+  pReader->nData = nData;
+  pReader->pData = pData;
+  pReader->iType = iType;
+
+  /* With nElement at zero this step skips nothing and simply decodes
+  ** the first element.  There must be a first element.
+  */
+  dlrStep(pReader);
+}
+/* Nothing is owned by the reader; debug builds scramble the struct. */
+static void dlrDestroy(DLReader *pReader){
+  SCRAMBLE(pReader);
+}
+
+#ifndef NDEBUG
+/* Debug-only walk over a doclist asserting that every element decodes
+** within the nData bytes supplied.  If pLastDocid is not NULL it
+** receives the last docid found, which is convenient in other
+** assertions for DLWriter.
+*/
+static void docListValidate(DocListType iType, const char *pData, int nData,
+                            sqlite_int64 *pLastDocid){
+  sqlite_int64 iDocid = 0;       /* running sum of the docid deltas */
+  assert( nData>0 );
+  assert( pData!=0 );
+  assert( pData+nData>pData );
+  while( nData!=0 ){
+    sqlite_int64 iDelta;
+    int nUsed = getVarint(pData, &iDelta);
+    iDocid += iDelta;
+    if( iType>DL_DOCIDS ){
+      int iVal;
+      for(;;){
+        nUsed += getVarint32(pData+nUsed, &iVal);
+        if( iVal==POS_END ) break;
+        if( iVal==POS_COLUMN ){
+          nUsed += getVarint32(pData+nUsed, &iVal);
+        }else if( iType>DL_POSITIONS ){
+          nUsed += getVarint32(pData+nUsed, &iVal);
+          nUsed += getVarint32(pData+nUsed, &iVal);
+        }
+        assert( nUsed<=nData );
+      }
+    }
+    assert( nUsed<=nData );
+    pData += nUsed;
+    nData -= nUsed;
+  }
+  if( pLastDocid ) *pLastDocid = iDocid;
+}
+#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
+#else
+#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
+#endif
+
+/*******************************************************************/
+/* DLWriter is used to write doclist data to a DataBuffer. DLWriter
+** always appends to the buffer and does not own it.
+**
+** dlwInit - initialize to write a given type doclist to a buffer.
+** dlwDestroy - clear the writer's memory. Does not free buffer.
+** dlwAppend - append raw doclist data to buffer.
+** dlwCopy - copy next doclist from reader to writer.
+** dlwAdd - construct doclist element and append to buffer.
+** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
+*/
+/* State for appending doclist data to a DataBuffer the writer does not
+** own.
+*/
+typedef struct DLWriter {
+ DocListType iType; /* type of doclist being written */
+ DataBuffer *b; /* destination buffer */
+ sqlite_int64 iPrevDocid; /* last docid written; base for the next delta */
+#ifndef NDEBUG
+ int has_iPrevDocid; /* debug only: set once dlwAdd() has written a docid */
+#endif
+} DLWriter;
+
+/* Prepare pWriter to append a doclist of type iType to buffer b.  The
+** previous docid starts at zero, so the first docid is stored whole.
+*/
+static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){
+  pWriter->iType = iType;
+  pWriter->b = b;
+#ifndef NDEBUG
+  pWriter->has_iPrevDocid = 0;
+#endif
+  pWriter->iPrevDocid = 0;
+}
+/* The buffer is not touched; debug builds scramble the writer. */
+static void dlwDestroy(DLWriter *pWriter){
+  SCRAMBLE(pWriter);
+}
+/* Append the nData bytes of already-encoded doclist at pData to the
+** writer's buffer, re-encoding only the leading docid so that it is a
+** delta from the last docid this writer produced.
+**
+** iFirstDocid is the first docid in the doclist in pData.  It is
+** needed because pData may point within a larger doclist, in which
+** case the first item would be delta-encoded.
+**
+** iLastDocid is the final docid in the doclist in pData.  It is
+** needed to create the new iPrevDocid for future delta-encoding.  The
+** code could decode the passed doclist to recreate iLastDocid, but
+** the only current user (docListMerge) already has decoded this
+** information.
+*/
+/* TODO(shess) This has become just a helper for docListMerge.
+** Consider a refactor to make this cleaner.
+*/
+static void dlwAppend(DLWriter *pWriter,
+                      const char *pData, int nData,
+                      sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
+  sqlite_int64 iDocid = 0;
+  char c[VARINT_MAX];
+  int nFirstOld, nFirstNew;     /* Old and new varint len of first docid. */
+#ifndef NDEBUG
+  sqlite_int64 iLastDocidDelta;
+#endif
+
+  /* Recode the initial docid as delta from iPrevDocid.  Only a
+  ** DL_DOCIDS doclist may consist of nothing but that docid.
+  */
+  nFirstOld = getVarint(pData, &iDocid);
+  assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
+  nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid);
+
+  /* Verify that the incoming doclist is valid AND that it ends with
+  ** the expected docid.  This is essential because we'll trust this
+  ** docid in future delta-encoding.
+  */
+  ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
+  assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );
+
+  /* Append recoded initial docid and everything else.  Rest of docids
+  ** should have been delta-encoded from previous initial docid.
+  */
+  if( nFirstOld<nData ){
+    dataBufferAppend2(pWriter->b, c, nFirstNew,
+                      pData+nFirstOld, nData-nFirstOld);
+  }else{
+    dataBufferAppend(pWriter->b, c, nFirstNew);
+  }
+  pWriter->iPrevDocid = iLastDocid;
+}
+/* Copy the reader's current element to the writer.  A single element
+** has the same first and last docid.
+*/
+static void dlwCopy(DLWriter *pWriter, DLReader *pReader){
+  const char *pElement = dlrDocData(pReader);
+  int nElement = dlrDocDataBytes(pReader);
+  sqlite_int64 iDocid = dlrDocid(pReader);
+  dlwAppend(pWriter, pElement, nElement, iDocid, iDocid);
+}
+/* Append iDocid, delta-encoded against the previous docid, to a
+** DL_DOCIDS doclist.
+*/
+static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
+  char aDelta[VARINT_MAX];
+  int nDelta;
+
+  /* Docids must ascend. */
+  assert( !pWriter->has_iPrevDocid || iDocid>pWriter->iPrevDocid );
+  assert( pWriter->iType==DL_DOCIDS );
+
+  nDelta = putVarint(aDelta, iDocid-pWriter->iPrevDocid);
+  dataBufferAppend(pWriter->b, aDelta, nDelta);
+  pWriter->iPrevDocid = iDocid;
+#ifndef NDEBUG
+  pWriter->has_iPrevDocid = 1;
+#endif
+}
+
+/*******************************************************************/
+/* PLReader is used to read data from a document's position list. As
+** the caller steps through the list, data is cached so that varints
+** only need to be decoded once.
+**
+** plrInit, plrDestroy - create/destroy a reader.
+** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
+** plrAtEnd - at end of stream, only call plrDestroy once true.
+** plrStep - step to the next element.
+*/
+/* Cursor over the position list of one doclist element; plrInit()
+** takes the data from a DLReader's current element.
+*/
+typedef struct PLReader {
+ /* These refer to the next position's data. nData will reach 0 when
+ ** reading the last position, so plrStep() signals EOF by setting
+ ** pData to NULL.
+ */
+ const char *pData;
+ int nData;
+
+ DocListType iType; /* doclist type, copied from the DLReader */
+ int iColumn; /* the last column read */
+ int iPosition; /* the last position read */
+ int iStartOffset; /* the last start offset read */
+ int iEndOffset; /* the last end offset read */
+} PLReader;
+
+/* True once plrStep() has signalled end of list by clearing pData. */
+static int plrAtEnd(PLReader *pReader){
+  return !pReader->pData;
+}
+/* Column of the position most recently read. */
+static int plrColumn(PLReader *pReader){
+  int iResult;
+  assert( !plrAtEnd(pReader) );
+  iResult = pReader->iColumn;
+  return iResult;
+}
+/* Token position most recently read. */
+static int plrPosition(PLReader *pReader){
+  int iResult;
+  assert( !plrAtEnd(pReader) );
+  iResult = pReader->iPosition;
+  return iResult;
+}
+/* Start offset most recently read. */
+static int plrStartOffset(PLReader *pReader){
+  int iResult;
+  assert( !plrAtEnd(pReader) );
+  iResult = pReader->iStartOffset;
+  return iResult;
+}
+/* End offset most recently read. */
+static int plrEndOffset(PLReader *pReader){
+  int iResult;
+  assert( !plrAtEnd(pReader) );
+  iResult = pReader->iEndOffset;
+  return iResult;
+}
+/* Decode the next position into the reader's cached fields.  End of
+** list is reported by setting pData to NULL, either because the data
+** ran out or because POS_END was read.
+*/
+static void plrStep(PLReader *pReader){
+  int iVal, nUsed;
+
+  assert( !plrAtEnd(pReader) );
+
+  /* The previous step consumed the final byte. */
+  if( pReader->nData==0 ){
+    pReader->pData = NULL;
+    return;
+  }
+
+  nUsed = getVarint32(pReader->pData, &iVal);
+  if( iVal==POS_COLUMN ){
+    /* New column: positions and start offsets restart from zero. */
+    nUsed += getVarint32(pReader->pData+nUsed, &pReader->iColumn);
+    pReader->iPosition = 0;
+    pReader->iStartOffset = 0;
+    nUsed += getVarint32(pReader->pData+nUsed, &iVal);
+  }
+  /* Should never see adjacent column changes. */
+  assert( iVal!=POS_COLUMN );
+
+  if( iVal==POS_END ){
+    pReader->nData = 0;
+    pReader->pData = NULL;
+    return;
+  }
+
+  pReader->iPosition += iVal-POS_BASE;
+  if( pReader->iType==DL_POSITIONS_OFFSETS ){
+    int iStartDelta, iLength;
+    nUsed += getVarint32(pReader->pData+nUsed, &iStartDelta);
+    nUsed += getVarint32(pReader->pData+nUsed, &iLength);
+    pReader->iStartOffset += iStartDelta;
+    pReader->iEndOffset = pReader->iStartOffset+iLength;
+  }
+  assert( nUsed<=pReader->nData );
+  pReader->pData += nUsed;
+  pReader->nData -= nUsed;
+}
+
+ /* Point pReader at the position list of pDLReader's current document
+ ** and step onto its first entry (or straight to the end state if the
+ ** list holds none).
+ */
+ static void plrInit(PLReader *pReader, DLReader *pDLReader){
+   /* Running totals start at zero; plrStep() accumulates deltas. */
+   pReader->iEndOffset = 0;
+   pReader->iStartOffset = 0;
+   pReader->iPosition = 0;
+   pReader->iColumn = 0;
+
+   pReader->pData = dlrPosData(pDLReader);
+   pReader->nData = dlrPosDataLen(pDLReader);
+   pReader->iType = pDLReader->iType;
+
+   plrStep(pReader);
+ }
+ /* Tear down a PLReader.  The only action taken is to hand the
+ ** structure to the SCRAMBLE() macro (defined elsewhere in this file);
+ ** nothing is freed here.
+ */
+ static void plrDestroy(PLReader *pReader){
+ SCRAMBLE(pReader);
+ }
+
+/*******************************************************************/
+/* PLWriter is used in constructing a document's position list. As a
+** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
+** PLWriter writes to the associated DLWriter's buffer.
+**
+** plwInit - init for writing a document's poslist.
+** plwDestroy - clear a writer.
+** plwAdd - append position and offset information.
+** plwCopy - copy next position's data from reader to writer.
+** plwTerminate - add any necessary doclist terminator.
+**
+** Calling plwAdd() after plwTerminate() may result in a corrupt
+** doclist.
+*/
+/* TODO(shess) Until we've written the second item, we can cache the
+** first item's information. Then we'd have three states:
+**
+** - initialized with docid, no positions.
+** - docid and one position.
+** - docid and multiple positions.
+**
+** Only the last state needs to actually write to dlw->b, which would
+** be an improvement in the DLCollector case.
+*/
+ /* Writer state for one document's position list.  Output goes to the
+ ** buffer of the associated DLWriter (dlw->b).
+ */
+ typedef struct PLWriter {
+ DLWriter *dlw; /* doclist writer whose buffer receives the output */
+
+ int iColumn; /* the last column written */
+ int iPos; /* the last position written; plwTerminate() sets it to -1
+ ** in debug builds to mark the writer as terminated */
+ int iOffset; /* the last start offset written */
+ } PLWriter;
+
+ /* Append one position (and, for DL_POSITIONS_OFFSETS doclists, its
+ ** offsets) to the doclist being built.  A no-op for DL_DOCIDS
+ ** doclists.  A change of column emits a POS_COLUMN marker and restarts
+ ** the position and offset deltas from zero.  Positions within a
+ ** column, and start offsets, must be non-decreasing.
+ */
+ /* TODO(shess) In the case where the parent is reading these values
+ ** from a PLReader, we could optimize to a copy if that PLReader has
+ ** the same type as pWriter.
+ */
+ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
+                    int iStartOffset, int iEndOffset){
+   /* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
+   ** iStartOffsetDelta, and iEndOffsetDelta.
+   */
+   char aBuf[5*VARINT_MAX];
+   char *pOut = aBuf;               /* next free byte in aBuf */
+   DLWriter *dlw = pWriter->dlw;
+
+   /* Ban plwAdd() after plwTerminate(). */
+   assert( pWriter->iPos!=-1 );
+
+   if( dlw->iType==DL_DOCIDS ) return;
+
+   if( iColumn!=pWriter->iColumn ){
+     pOut += putVarint(pOut, POS_COLUMN);
+     pOut += putVarint(pOut, iColumn);
+     pWriter->iColumn = iColumn;
+     pWriter->iPos = 0;
+     pWriter->iOffset = 0;
+   }
+
+   /* Position is stored as a delta, biased by POS_BASE. */
+   assert( iPos>=pWriter->iPos );
+   pOut += putVarint(pOut, POS_BASE+(iPos-pWriter->iPos));
+   pWriter->iPos = iPos;
+
+   if( dlw->iType==DL_POSITIONS_OFFSETS ){
+     /* Start offset as a delta, end offset as a length past start. */
+     assert( iStartOffset>=pWriter->iOffset );
+     pOut += putVarint(pOut, iStartOffset-pWriter->iOffset);
+     pWriter->iOffset = iStartOffset;
+     assert( iEndOffset>=iStartOffset );
+     pOut += putVarint(pOut, iEndOffset-iStartOffset);
+   }
+
+   dataBufferAppend(dlw->b, aBuf, (int)(pOut-aBuf));
+ }
+ /* Append the entry pReader currently rests on to pWriter. */
+ static void plwCopy(PLWriter *pWriter, PLReader *pReader){
+   int iCol = plrColumn(pReader);
+   int iPos = plrPosition(pReader);
+   int iStart = plrStartOffset(pReader);
+   int iEnd = plrEndOffset(pReader);
+   plwAdd(pWriter, iCol, iPos, iStart, iEnd);
+ }
+ /* Begin a new document in dlw's doclist: append iDocid as a varint
+ ** delta from the previously written docid, record it as the new
+ ** previous docid, and reset pWriter's column/position/offset state.
+ */
+ static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){
+   char aDelta[VARINT_MAX];
+   int nDelta;
+
+   pWriter->dlw = dlw;
+
+   /* Docids must ascend. */
+   assert( !dlw->has_iPrevDocid || iDocid>dlw->iPrevDocid );
+   nDelta = putVarint(aDelta, iDocid-dlw->iPrevDocid);
+   dataBufferAppend(dlw->b, aDelta, nDelta);
+   dlw->iPrevDocid = iDocid;
+ #ifndef NDEBUG
+   dlw->has_iPrevDocid = 1;
+ #endif
+
+   pWriter->iOffset = 0;
+   pWriter->iPos = 0;
+   pWriter->iColumn = 0;
+ }
+ /* Finish the current document's position list.  For doclist types
+ ** above DL_DOCIDS a POS_END varint is appended; DL_DOCIDS doclists
+ ** need no terminator.
+ */
+ /* TODO(shess) Should plwDestroy() also terminate the doclist? But
+ ** then plwDestroy() would no longer be just a destructor, it would
+ ** also be doing work, which isn't consistent with the overall idiom.
+ ** Another option would be for plwAdd() to always append any necessary
+ ** terminator, so that the output is always correct. But that would
+ ** add incremental work to the common case with the only benefit being
+ ** API elegance. Punt for now.
+ */
+ static void plwTerminate(PLWriter *pWriter){
+   DLWriter *dlw = pWriter->dlw;
+
+   if( dlw->iType>DL_DOCIDS ){
+     char aEnd[VARINT_MAX];
+     int nEnd = putVarint(aEnd, POS_END);
+     dataBufferAppend(dlw->b, aEnd, nEnd);
+   }
+ #ifndef NDEBUG
+   /* Mark as terminated for assert in plwAdd(). */
+   pWriter->iPos = -1;
+ #endif
+ }
+ /* Tear down a PLWriter.  The only action taken is to hand the
+ ** structure to the SCRAMBLE() macro (defined elsewhere in this file);
+ ** no terminator is written and nothing is freed here.
+ */
+ static void plwDestroy(PLWriter *pWriter){
+ SCRAMBLE(pWriter);
+ }
+
+/*******************************************************************/
+/* DLCollector wraps PLWriter and DLWriter to provide a
+** dynamically-allocated doclist area to use during tokenization.
+**
+** dlcNew - malloc up and initialize a collector.
+** dlcDelete - destroy a collector and all contained items.
+** dlcAddPos - append position and offset information.
+** dlcAddDoclist - add the collected doclist to the given buffer.
+** dlcNext - terminate the current document and open another.
+*/
+ typedef struct DLCollector {
+ DataBuffer b; /* storage for the collected doclist; dlw writes into it */
+ DLWriter dlw; /* doclist writer bound to b by dlcNew() */
+ PLWriter plw; /* position-list writer for the current document */
+ } DLCollector;
+
+ /* Append the collected doclist to b.  For doclist types above
+ ** DL_DOCIDS a POS_END terminator for the still-open final document is
+ ** appended as well; the collector's own buffer is not modified.
+ */
+ /* TODO(shess) This could also be done by calling plwTerminate() and
+ ** dataBufferAppend(). I tried that, expecting nominal performance
+ ** differences, but it seemed to pretty reliably be worth 1% to code
+ ** it this way. I suspect it is the incremental malloc overhead (some
+ ** percentage of the plwTerminate() calls will cause a realloc), so
+ ** this might be worth revisiting if the DataBuffer implementation
+ ** changes.
+ */
+ static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){
+   DataBuffer *pSrc = &pCollector->b;
+   char aEnd[VARINT_MAX];
+   int nEnd;
+
+   /* Docid-only doclists carry no position terminator. */
+   if( pCollector->dlw.iType<=DL_DOCIDS ){
+     dataBufferAppend(b, pSrc->pData, pSrc->nData);
+     return;
+   }
+
+   nEnd = putVarint(aEnd, POS_END);
+   dataBufferAppend2(b, pSrc->pData, pSrc->nData, aEnd, nEnd);
+ }
+ /* Terminate the current document's position list and open a new
+ ** document with docid iDocid.
+ */
+ static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
+   PLWriter *pPlw = &pCollector->plw;
+
+   plwTerminate(pPlw);
+   plwDestroy(pPlw);
+   plwInit(pPlw, &pCollector->dlw, iDocid);
+ }
+ /* Append position and offset information for the current document;
+ ** a thin wrapper over plwAdd() on the collector's PLWriter.
+ */
+ static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
+                       int iStartOffset, int iEndOffset){
+   PLWriter *pPlw = &pCollector->plw;
+   plwAdd(pPlw, iColumn, iPos, iStartOffset, iEndOffset);
+ }
+
+ /* Allocate and initialize a collector for a doclist of type iType,
+ ** with iDocid opened as its first document.
+ **
+ ** Returns NULL if sqlite3_malloc() fails.  A non-NULL result must be
+ ** released with dlcDelete().
+ */
+ static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
+   DLCollector *pCollector = sqlite3_malloc(sizeof(*pCollector));
+
+   /* Do not initialize through a NULL pointer on allocation failure. */
+   if( pCollector==NULL ) return NULL;
+
+   dataBufferInit(&pCollector->b, 0);
+   dlwInit(&pCollector->dlw, iType, &pCollector->b);
+   plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
+   return pCollector;
+ }
+ /* Destroy a collector created by dlcNew(): tear down its members in
+ ** the reverse of the order dlcNew() set them up, then free the
+ ** structure itself.
+ */
+ static void dlcDelete(DLCollector *pCollector){
+   PLWriter *pPlw = &pCollector->plw;
+   DLWriter *pDlw = &pCollector->dlw;
+   DataBuffer *pBuf = &pCollector->b;
+
+   plwDestroy(pPlw);
+   dlwDestroy(pDlw);
+   dataBufferDestroy(pBuf);
+
+   SCRAMBLE(pCollector);
+   sqlite3_free(pCollector);
+ }
+
+
+ /* Re-encode the iType doclist in pData/nData as an iOutType doclist
+ ** appended to *out, dropping data that is not needed.  When iColumn
+ ** is -1 every column is kept; otherwise only positions in column
+ ** iColumn are kept.  A document with no kept positions is omitted
+ ** from the output entirely.  iOutType must not exceed iType.
+ */
+ /* NOTE(shess) This code is only valid after all doclists are merged.
+ ** If this is run before merges, then doclist items which represent
+ ** deletion will be trimmed, and will thus not effect a deletion
+ ** during the merge.
+ */
+ static void docListTrim(DocListType iType, const char *pData, int nData,
+                         int iColumn, DocListType iOutType, DataBuffer *out){
+   DLReader dlReader;
+   DLWriter dlWriter;
+
+   assert( iOutType<=iType );
+
+   dlrInit(&dlReader, iType, pData, nData);
+   dlwInit(&dlWriter, iOutType, out);
+
+   for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
+     PLReader plReader;
+     PLWriter plWriter;
+     int bStarted = 0;     /* true once this docid has been written */
+
+     for(plrInit(&plReader, &dlReader);
+         !plrAtEnd(&plReader);
+         plrStep(&plReader)){
+       /* Skip positions that belong to a column we are not keeping. */
+       if( iColumn!=-1 && plrColumn(&plReader)!=iColumn ) continue;
+
+       /* The docid is written lazily, on the first kept position. */
+       if( !bStarted ){
+         plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
+         bStarted = 1;
+       }
+       plwCopy(&plWriter, &plReader);
+     }
+     if( bStarted ){
+       plwTerminate(&plWriter);
+       plwDestroy(&plWriter);
+     }
+
+     plrDestroy(&plReader);
+   }
+   dlwDestroy(&dlWriter);
+   dlrDestroy(&dlReader);
+ }
+
+ /* Used by docListMerge() to keep doclists in the ascending order by
+ ** docid, then ascending order by age (so the newest comes first).
+ ** orderedDLReaderCmp() defines the ordering: readers at end sort
+ ** last, then smaller docids first, then larger idx first.
+ */
+ typedef struct OrderedDLReader {
+ DLReader *pReader; /* the doclist reader being ordered */
+
+ /* TODO(shess) If we assume that docListMerge pReaders is ordered by
+ ** age (which we do), then we could use pReader comparisons to break
+ ** ties.
+ */
+ int idx; /* tie-breaker among equal docids; larger idx sorts first */
+ } OrderedDLReader;
+
+ /* Order eof to end, then by docid asc, idx desc.
+ **
+ ** Returns <0 if r1 sorts before r2, >0 if r1 sorts after r2, and 0
+ ** only when both readers are at end.
+ */
+ static int orderedDLReaderCmp(OrderedDLReader *r1, OrderedDLReader *r2){
+   if( dlrAtEnd(r1->pReader) ){
+     if( dlrAtEnd(r2->pReader) ) return 0;  /* Both atEnd(). */
+     return 1;                              /* Only r1 atEnd(). */
+   }
+   if( dlrAtEnd(r2->pReader) ) return -1;   /* Only r2 atEnd(). */
+
+   if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
+   if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
+
+   /* Descending on idx. */
+   return r2->idx-r1->idx;
+ }
+
+ /* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
+ ** p[1..n-1] is already sorted.  The element is swapped forward one
+ ** slot at a time until it no longer compares greater than its
+ ** successor or it reaches the end of the array.
+ */
+ /* TODO(shess) Is this frequent enough to warrant a binary search?
+ ** Before implementing that, instrument the code to check. In most
+ ** current usage, I expect that p[0] will be less than p[1] a very
+ ** high proportion of the time.
+ */
+ static void orderedDLReaderReorder(OrderedDLReader *p, int n){
+   int i;
+   for(i=0; i+1<n && orderedDLReaderCmp(&p[i], &p[i+1])>0; i++){
+     OrderedDLReader held = p[i];
+     p[i] = p[i+1];
+     p[i+1] = held;
+   }
+ }
+
+ /* Given an array of doclist readers, merge their doclist elements
+ ** into out in sorted order (by docid), dropping elements from older
+ ** readers when there is a duplicate docid. pReaders is assumed to be
+ ** ordered by age, oldest first.
+ **
+ ** With a single reader its data is appended to out verbatim.  All
+ ** readers must share the same doclist type.
+ */
+ /* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably
+ ** be fixed.
+ */
+ static void docListMerge(DataBuffer *out,
+                          DLReader *pReaders, int nReaders){
+   OrderedDLReader readers[MERGE_COUNT];
+   DLWriter writer;
+   int i, n;
+   const char *pStart = 0;
+   int nStart = 0;
+   sqlite_int64 iFirstDocid = 0, iLastDocid = 0;
+
+   assert( nReaders>0 );
+   if( nReaders==1 ){
+     dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
+     return;
+   }
+
+   assert( nReaders<=MERGE_COUNT );
+   n = 0;
+   for(i=0; i<nReaders; i++){
+     assert( pReaders[i].iType==pReaders[0].iType );
+     readers[i].pReader = pReaders+i;
+     readers[i].idx = i;
+     n += dlrAllDataBytes(&pReaders[i]);
+   }
+   /* Conservatively size output to sum of inputs. Output should end
+   ** up strictly smaller than input.
+   */
+   dataBufferExpand(out, n);
+
+   /* Get the readers into sorted order. */
+   while( i-->0 ){
+     orderedDLReaderReorder(readers+i, nReaders-i);
+   }
+
+   dlwInit(&writer, pReaders[0].iType, out);
+   while( !dlrAtEnd(readers[0].pReader) ){
+     sqlite_int64 iDocid = dlrDocid(readers[0].pReader);
+
+     /* If this is a continuation of the current buffer to copy, extend
+     ** that buffer. memcpy() seems to be more efficient if it has
+     ** lots of data to copy.
+     */
+     if( dlrDocData(readers[0].pReader)==pStart+nStart ){
+       nStart += dlrDocDataBytes(readers[0].pReader);
+     }else{
+       if( pStart!=0 ){
+         dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
+       }
+       pStart = dlrDocData(readers[0].pReader);
+       nStart = dlrDocDataBytes(readers[0].pReader);
+       iFirstDocid = iDocid;
+     }
+     iLastDocid = iDocid;
+     dlrStep(readers[0].pReader);
+
+     /* Drop all of the older elements with the same docid. */
+     for(i=1; i<nReaders &&
+              !dlrAtEnd(readers[i].pReader) &&
+              dlrDocid(readers[i].pReader)==iDocid; i++){
+       dlrStep(readers[i].pReader);
+     }
+
+     /* Get the readers back into order. */
+     while( i-->0 ){
+       orderedDLReaderReorder(readers+i, nReaders-i);
+     }
+   }
+
+   /* Copy over any remaining elements. */
+   if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
+   dlwDestroy(&writer);
+ }
+
+/* Helper function for posListUnion(). Compares the current position
+** between left and right, returning as standard C idiom of <0 if
+ ** left<right, >0 if left>right, and 0 if left==right. "End" always
+** compares greater.
+*/
+static int posListCmp(PLReader *pLeft, PLReader *pRight){
+ assert( pLeft->iType==pRight->iType );
+ if( pLeft->iType==DL_DOCIDS ) return 0;
+
+ if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
+ if( plrAtEnd(pRight) ) return -1;
+
+ if( plrColumn(pLeft)plrColumn(pRight) ) return 1;
+
+ if( plrPosition(pLeft)