/* ** This C program extracts all "words" from an input document and adds them ** to an SQLite database. A "word" is any contiguous sequence of alphabetic ** characters. All digits, punctuation, and whitespace characters are ** word separators. The database stores a single entry for each distinct ** word together with a count of the number of occurrences of that word. ** A fresh database is created automatically on each run. ** ** wordcount DATABASE INPUTFILE ** ** The INPUTFILE name can be omitted, in which case input is taken from ** standard input. ** ** Option: ** ** ** Modes: ** ** Insert mode means: ** (1) INSERT OR IGNORE INTO wordcount VALUES($new,1) ** (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new -- if (1) is a noop ** ** Update mode means: ** (1) INSERT OR IGNORE INTO wordcount VALUES($new,0) ** (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new ** ** Replace mode means: ** (1) REPLACE INTO wordcount ** VALUES($new,ifnull((SELECT cnt FROM wordcount WHERE word=$new),0)+1); ** ** Select mode means: ** (1) SELECT 1 FROM wordcount WHERE word=$new ** (2) INSERT INTO wordcount VALUES($new,1) -- if (1) returns nothing ** (3) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new -- if (1) returns TRUE ** ** Delete mode means: ** (1) DELETE FROM wordcount WHERE word=$new ** ** Query mode means: ** (1) SELECT cnt FROM wordcount WHERE word=$new ** ** Note that delete mode and query mode are only useful for preexisting ** databases. The wordcount table is created using IF NOT EXISTS so this ** utility can be run multiple times on the same database file. The ** --without-rowid, --nocase, and --pagesize parameters are only effective ** when creating a new database and are harmless no-ops on preexisting ** databases. ** ****************************************************************************** ** ** Compile as follows: ** ** gcc -I. wordcount.c sqlite3.c -ldl -lpthreads ** ** Or: ** ** gcc -I. 
-DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \ ** wordcount.c sqlite3.c */ #include #include #include #include #include #include "sqlite3.h" #ifndef _WIN32 # include #else # include #endif #define ISALPHA(X) isalpha((unsigned char)(X)) const char zHelp[] = "Usage: wordcount [OPTIONS] DATABASE [INPUT]\n" " --all Repeat the test for all test modes\n" " --cachesize NNN Use a cache size of NNN\n" " --commit NNN Commit after every NNN operations\n" " --delete Use DELETE mode\n" " --insert Use INSERT mode (the default)\n" " --journal MMMM Use PRAGMA journal_mode=MMMM\n" " --nocase Add the NOCASE collating sequence to the words.\n" " --nosync Use PRAGMA synchronous=OFF\n" " --pagesize NNN Use a page size of NNN\n" " --query Use QUERY mode\n" " --replace Use REPLACE mode\n" " --select Use SELECT mode\n" " --stats Show sqlite3_status() results at the end.\n" " --summary Show summary information on the collected data.\n" " --tag NAME Tag all output using NAME. Use only stdout.\n" " --timer Time the operation of this program\n" " --trace Enable sqlite3_trace() output.\n" " --update Use UPDATE mode\n" " --without-rowid Use a WITHOUT ROWID table to store the words.\n" ; /* Output tag */ char *zTag = "--"; /* Return the current wall-clock time */ static sqlite3_int64 realTime(void){ static sqlite3_vfs *clockVfs = 0; sqlite3_int64 t; if( clockVfs==0 ) clockVfs = sqlite3_vfs_find(0); if( clockVfs->iVersion>=1 && clockVfs->xCurrentTimeInt64!=0 ){ clockVfs->xCurrentTimeInt64(clockVfs, &t); }else{ double r; clockVfs->xCurrentTime(clockVfs, &r); t = (sqlite3_int64)(r*86400000.0); } return t; } /* Print an error message and exit */ static void fatal_error(const char *zMsg, ...){ va_list ap; va_start(ap, zMsg); vfprintf(stderr, zMsg, ap); va_end(ap); exit(1); } /* Print a usage message and quit */ static void usage(void){ printf("%s",zHelp); exit(0); } /* The sqlite3_trace() callback function */ static void traceCallback(void *NotUsed, const char *zSql){ printf("%s;\n", zSql); } 
/* An sqlite3_exec() callback that prints results on standard output, ** each column separated by a single space. */ static int printResult(void *NotUsed, int nArg, char **azArg, char **azNm){ int i; printf("%s", zTag); for(i=0; i=MODE_COUNT*2 ) return 0; i = (*piLoopCnt)++; *pUseWithoutRowid = i&1; *piMode2 = i>>1; return 1; } int main(int argc, char **argv){ const char *zFileToRead = 0; /* Input file. NULL for stdin */ const char *zDbName = 0; /* Name of the database file to create */ int useWithoutRowid = 0; /* True for --without-rowid */ int iMode = MODE_INSERT; /* One of MODE_xxxxx */ int iMode2; /* Mode to use for current --all iteration */ int iLoopCnt = 0; /* Which iteration when running --all */ int useNocase = 0; /* True for --nocase */ int doTrace = 0; /* True for --trace */ int showStats = 0; /* True for --stats */ int showSummary = 0; /* True for --summary */ int showTimer = 0; /* True for --timer */ int cacheSize = 0; /* Desired cache size. 0 means default */ int pageSize = 0; /* Desired page size. 0 means default */ int commitInterval = 0; /* How often to commit. 
0 means never */ int noSync = 0; /* True for --nosync */ const char *zJMode = 0; /* Journal mode */ int nOp = 0; /* Operation counter */ int i, j; /* Loop counters */ sqlite3 *db; /* The SQLite database connection */ char *zSql; /* Constructed SQL statement */ sqlite3_stmt *pInsert = 0; /* The INSERT statement */ sqlite3_stmt *pUpdate = 0; /* The UPDATE statement */ sqlite3_stmt *pSelect = 0; /* The SELECT statement */ sqlite3_stmt *pDelete = 0; /* The DELETE statement */ FILE *in; /* The open input file */ int rc; /* Return code from an SQLite interface */ int iCur, iHiwtr; /* Statistics values, current and "highwater" */ FILE *pTimer = stderr; /* Output channel for the timer */ sqlite3_int64 sumCnt = 0; /* Sum in QUERY mode */ sqlite3_int64 startTime; /* Time of start */ sqlite3_int64 totalTime = 0; /* Total time */ char zInput[2000]; /* A single line of input */ /* Process command-line arguments */ for(i=1; i0 && (nOp%commitInterval)==0 ){ sqlite3_exec(db, "COMMIT; BEGIN IMMEDIATE", 0, 0, 0); } } } sqlite3_exec(db, "COMMIT", 0, 0, 0); sqlite3_finalize(pInsert); pInsert = 0; sqlite3_finalize(pUpdate); pUpdate = 0; sqlite3_finalize(pSelect); pSelect = 0; sqlite3_finalize(pDelete); pDelete = 0; if( iMode2==MODE_QUERY && iMode!=MODE_ALL ){ printf("%s sum of cnt: %lld\n", zTag, sumCnt); rc = sqlite3_prepare_v2(db,"SELECT sum(cnt*cnt) FROM wordcount", -1, &pSelect, 0); if( rc==SQLITE_OK && sqlite3_step(pSelect)==SQLITE_ROW ){ printf("%s double-check: %lld\n", zTag,sqlite3_column_int64(pSelect,0)); } sqlite3_finalize(pSelect); } if( showTimer ){ sqlite3_int64 elapseTime = realTime() - startTime; totalTime += elapseTime; fprintf(pTimer, "%3d.%03d wordcount", (int)(elapseTime/1000), (int)(elapseTime%1000)); if( iMode==MODE_ALL ){ fprintf(pTimer, " %s%s\n", azMode[iMode2], useWithoutRowid? " --without-rowid" : ""); }else{ for(i=1; i