/* ** This C program extracts all "words" from an input document and adds them ** to an SQLite database. A "word" is any contiguous sequence of alphabetic ** characters. All digits, punctuation, and whitespace characters are ** word separators. The database stores a single entry for each distinct ** word together with a count of the number of occurrences of that word. ** A fresh database is created automatically on each run. ** ** wordcount DATABASE INPUTFILE ** ** The INPUTFILE name can be omitted, in which case input it taken from ** standard input. ** ** Option: ** ** --without-rowid Use a WITHOUT ROWID table to store the words. ** --insert Use INSERT mode (the default) ** --replace Use REPLACE mode ** --select Use SELECT mode ** --update Use UPDATE mode ** --delete Use DELETE mode ** --query Use QUERY mode ** --nocase Add the NOCASE collating sequence to the words. ** --trace Enable sqlite3_trace() output. ** --summary Show summary information on the collected data. ** --stats Show sqlite3_status() results at the end. ** --pagesize NNN Use a page size of NNN ** --cachesize NNN Use a cache size of NNN ** --commit NNN Commit after every NNN operations ** --nosync Use PRAGMA synchronous=OFF ** --journal MMMM Use PRAGMA journal_mode=MMMM ** --timer Time the operation of this program ** ** Modes: ** ** Insert mode means: ** (1) INSERT OR IGNORE INTO wordcount VALUES($new,1) ** (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new -- if (1) is a noop ** ** Update mode means: ** (1) INSERT OR IGNORE INTO wordcount VALUES($new,0) ** (2) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new ** ** Replace mode means: ** (1) REPLACE INTO wordcount ** VALUES($new,ifnull((SELECT cnt FROM wordcount WHERE word=$new),0)+1); ** ** Select mode means: ** (1) SELECT 1 FROM wordcount WHERE word=$new ** (2) INSERT INTO wordcount VALUES($new,1) -- if (1) returns nothing ** (3) UPDATE wordcount SET cnt=cnt+1 WHERE word=$new --if (1) return TRUE ** ** Delete mode means: ** (1) DELETE FROM wordcount WHERE word=$new ** ** Query mode means: ** (1) SELECT cnt FROM wordcount WHERE word=$new ** ** Note that delete mode and query mode are only useful for preexisting ** databases. The wordcount table is created using IF NOT EXISTS so this ** utility can be run multiple times on the same database file. The ** --without-rowid, --nocase, and --pagesize parameters are only effective ** when creating a new database and are harmless no-ops on preexisting ** databases. ** ****************************************************************************** ** ** Compile as follows: ** ** gcc -I. wordcount.c sqlite3.c -ldl -lpthreads ** ** Or: ** ** gcc -I. -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \ ** wordcount.c sqlite3.c */ #include #include #include #include #include #include "sqlite3.h" #define ISALPHA(X) isalpha((unsigned char)(X)) /* Return the current wall-clock time */ static sqlite3_int64 realTime(void){ static sqlite3_vfs *clockVfs = 0; sqlite3_int64 t; if( clockVfs==0 ) clockVfs = sqlite3_vfs_find(0); if( clockVfs->iVersion>=1 && clockVfs->xCurrentTimeInt64!=0 ){ clockVfs->xCurrentTimeInt64(clockVfs, &t); }else{ double r; clockVfs->xCurrentTime(clockVfs, &r); t = (sqlite3_int64)(r*86400000.0); } return t; } /* Print an error message and exit */ static void fatal_error(const char *zMsg, ...){ va_list ap; va_start(ap, zMsg); vfprintf(stderr, zMsg, ap); va_end(ap); exit(1); } /* The sqlite3_trace() callback function */ static void traceCallback(void *NotUsed, const char *zSql){ printf("%s;\n", zSql); } /* An sqlite3_exec() callback that prints results on standard output, ** each column separated by a single space. */ static int printResult(void *NotUsed, int nArg, char **azArg, char **azNm){ int i; printf("--"); for(i=0; i0 && (nOp%commitInterval)==0 ){ sqlite3_exec(db, "COMMIT; BEGIN IMMEDIATE", 0, 0, 0); } } } sqlite3_exec(db, "COMMIT", 0, 0, 0); if( zFileToRead ) fclose(in); sqlite3_finalize(pInsert); sqlite3_finalize(pUpdate); sqlite3_finalize(pSelect); sqlite3_finalize(pDelete); if( iMode==MODE_QUERY ){ printf("sum of cnt: %lld\n", sumCnt); rc = sqlite3_prepare_v2(db,"SELECT sum(cnt*cnt) FROM wordcount", -1, &pSelect, 0); if( rc==SQLITE_OK && sqlite3_step(pSelect)==SQLITE_ROW ){ printf("double-check: %lld\n", sqlite3_column_int64(pSelect, 0)); } sqlite3_finalize(pSelect); } if( showTimer ){ sqlite3_int64 elapseTime = realTime() - startTime; fprintf(stderr, "%3d.%03d wordcount", (int)(elapseTime/1000), (int)(elapseTime%1000)); for(i=1; i