/ Check-in [bf1dc790]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Updates to the instructions in the header comment of the fuzzer implementation. New test cases for the fuzzer.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: bf1dc7907cf1a5c7e19b04fa1278b2089316c30a
User & Date: drh 2012-02-20 22:44:12
Context
2012-02-21
10:36
Add further test cases and minor fixes for the fuzzer. check-in: 583dde93 user: dan tags: trunk
2012-02-20
22:44
Updates to the instructions in the header comment of the fuzzer implementation. New test cases for the fuzzer. check-in: bf1dc790 user: drh tags: trunk
20:03
Change the way the fuzzer (test_fuzzer.c) works so that it loads its configuration from a database table. check-in: 90b7b957 user: dan tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/test_fuzzer.c.

24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60


61
62
63
64
65
66
67
..
69
70
71
72
73
74
75



76
77
78
79
80
81
82
...
125
126
127
128
129
130
131






132
133
134
135
136
137
138
...
773
774
775
776
777
778
779

780
781
782
783
784
785
786
787
788
789
790
791

792
793
794
795
796
797
798
** variations.
**
** The fuzzer data table must contain exactly four columns (more precisely,
** the statement "SELECT * FROM <fuzzer_data_table>" must return records
** that consist of four columns). It does not matter what the columns are
** named. 
**
** Each row in the fuzzer table represents a single character transformation. 
** The left most column of the row (column 0) contains an integer value -
** the identifier of the ruleset to which the transformation rule belongs
** (see "MULTIPLE RULE SETS" below). The second column of the row (column 0)
** contains the input character or characters. The third column contains the
** output character or characters. And the fourth column contains the integer 
** cost of making the transformation. For example:
**
**    CREATE TABLE f_data(ruleset, cFrom, cTo, Cost);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, '', 'a', 100);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, 'b', '', 87);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, 'o', 'oe', 38);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, 'oe', 'o', 40);
**
** The first row inserted into the fuzzer data table by the SQL script
** above indicates that the cost of inserting a letter 'a' is 100.  (All 
** costs are integers.  We recommend that costs be scaled so that the 
** average cost is around 100.) The second INSERT statement creates a rule
** that the cost of that the cost of deleting a single letter 'b' is 87.
** The third and fourth INSERT statements mean that the cost of transforming 
** a single letter "o" into the two-letter sequence "oe" is 38 and that the
** cost of transforming "oe" back into "o" is 40.
**
** The contents of the fuzzer data table are loaded into main memory when
** a fuzzer table is first created, and may be internally reloaded by the
** system at any subsequent time. Therefore, the fuzzer data table should be 
** populated before the fuzzer table is created and not modified thereafter.
** If you do need to modify the contents of the fuzzer data table, it is
** recommended that the associated fuzzer table be dropped, the fuzzer data
** table edited, and the fuzzer table recreated within a single transaction.


**
** Once it has been created, the fuzzer table can be queried as follows:
**
**    SELECT word, distance FROM f
**     WHERE word MATCH 'abcdefg'
**       AND distance<200;
**
................................................................................
** can be derived from that string by appling the specified transformations.
** The strings are output together with their total transformation cost
** (called "distance") and appear in order of increasing cost.  No string
** is output more than once.  If there are multiple ways to transform the
** target string into the output string then the lowest cost transform is
** the one that is returned.  In the example, the search is limited to 
** strings with a total distance of less than 200.



**
** It is important to put some kind of a limit on the fuzzer output.  This
** can be either in the form of a LIMIT clause at the end of the query,
** or better, a "distance<NNN" constraint where NNN is some number.  The
** running time and memory requirement is exponential in the value of NNN 
** so you want to make sure that NNN is not too big.  A value of NNN that
** is about twice the average transformation cost seems to give good results.
................................................................................
**      AND f.distance<=200
**      AND f.word=vocabulary.w
**      AND f.ruleset=1  -- Specify the ruleset to use here
**    LIMIT 20
**
** If no "ruleset = ?" constraint is specified in the WHERE clause, ruleset 
** 0 is used.






*/
#include "sqlite3.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>

................................................................................
*/
static fuzzer_stem *fuzzerNewStem(
  fuzzer_cursor *pCur,
  const char *zWord,
  fuzzer_cost rBaseCost
){
  fuzzer_stem *pNew;

  unsigned int h;

  pNew = sqlite3_malloc( sizeof(*pNew) + strlen(zWord) + 1 );
  if( pNew==0 ) return 0;
  memset(pNew, 0, sizeof(*pNew));
  pNew->zBasis = (char*)&pNew[1];
  pNew->nBasis = strlen(zWord);
  memcpy(pNew->zBasis, zWord, pNew->nBasis+1);
  pNew->pRule = pCur->pVtab->pRule;
  while( pNew->pRule && pNew->pRule->iRuleset!=pCur->iRuleset ){
    pNew->pRule = pNew->pRule->pNext;
  }

  pNew->n = -1;
  pNew->rBaseCost = pNew->rCostX = rBaseCost;
  h = fuzzerHash(pNew->zBasis);
  pNew->pHash = pCur->apHash[h];
  pCur->apHash[h] = pNew;
  pCur->nStem++;
  return pNew;







|
|
|
|
|
|
|











|
|
|









>
>







 







>
>
>







 







>
>
>
>
>
>







 







>








|
|
|

>







24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
..
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
...
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
...
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
** variations.
**
** The fuzzer data table must contain exactly four columns (more precisely,
** the statement "SELECT * FROM <fuzzer_data_table>" must return records
** that consist of four columns). It does not matter what the columns are
** named. 
**
** Each row in the fuzzer data table represents a single character
** transformation. The left most column of the row (column 0) contains an
** integer value - the identifier of the ruleset to which the transformation
** rule belongs (see "MULTIPLE RULE SETS" below). The second column of the
** row (column 0) contains the input character or characters. The third 
** column contains the output character or characters. And the fourth column
** contains the integer cost of making the transformation. For example:
**
**    CREATE TABLE f_data(ruleset, cFrom, cTo, Cost);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, '', 'a', 100);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, 'b', '', 87);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, 'o', 'oe', 38);
**    INSERT INTO f_data(ruleset, cFrom, cTo, Cost) VALUES(0, 'oe', 'o', 40);
**
** The first row inserted into the fuzzer data table by the SQL script
** above indicates that the cost of inserting a letter 'a' is 100.  (All 
** costs are integers.  We recommend that costs be scaled so that the 
** average cost is around 100.) The second INSERT statement creates a rule
** saying that the cost of deleting a single letter 'b' is 87.  The third
** and fourth INSERT statements mean that the cost of transforming a
** single letter "o" into the two-letter sequence "oe" is 38 and that the
** cost of transforming "oe" back into "o" is 40.
**
** The contents of the fuzzer data table are loaded into main memory when
** a fuzzer table is first created, and may be internally reloaded by the
** system at any subsequent time. Therefore, the fuzzer data table should be 
** populated before the fuzzer table is created and not modified thereafter.
** If you do need to modify the contents of the fuzzer data table, it is
** recommended that the associated fuzzer table be dropped, the fuzzer data
** table edited, and the fuzzer table recreated within a single transaction.
** Alternatively, the fuzzer data table can be edited then the database
** connection can be closed and reopened.
**
** Once it has been created, the fuzzer table can be queried as follows:
**
**    SELECT word, distance FROM f
**     WHERE word MATCH 'abcdefg'
**       AND distance<200;
**
................................................................................
** can be derived from that string by appling the specified transformations.
** The strings are output together with their total transformation cost
** (called "distance") and appear in order of increasing cost.  No string
** is output more than once.  If there are multiple ways to transform the
** target string into the output string then the lowest cost transform is
** the one that is returned.  In the example, the search is limited to 
** strings with a total distance of less than 200.
**
** The fuzzer is a read-only table.  Any attempt to DELETE, INSERT, or
** UPDATE on a fuzzer table will throw an error.
**
** It is important to put some kind of a limit on the fuzzer output.  This
** can be either in the form of a LIMIT clause at the end of the query,
** or better, a "distance<NNN" constraint where NNN is some number.  The
** running time and memory requirement is exponential in the value of NNN 
** so you want to make sure that NNN is not too big.  A value of NNN that
** is about twice the average transformation cost seems to give good results.
................................................................................
**      AND f.distance<=200
**      AND f.word=vocabulary.w
**      AND f.ruleset=1  -- Specify the ruleset to use here
**    LIMIT 20
**
** If no "ruleset = ?" constraint is specified in the WHERE clause, ruleset 
** 0 is used.
**
** LIMITS
**
** The maximum ruleset number is 2147483647.  The maximum length of either
** of the strings in the second or third column of the fuzzer data table
** is 50 bytes.  The maximum cost on a rule is 1000.
*/
#include "sqlite3.h"
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <stdio.h>

................................................................................
*/
static fuzzer_stem *fuzzerNewStem(
  fuzzer_cursor *pCur,
  const char *zWord,
  fuzzer_cost rBaseCost
){
  fuzzer_stem *pNew;
  fuzzer_rule *pRule;
  unsigned int h;

  pNew = sqlite3_malloc( sizeof(*pNew) + strlen(zWord) + 1 );
  if( pNew==0 ) return 0;
  memset(pNew, 0, sizeof(*pNew));
  pNew->zBasis = (char*)&pNew[1];
  pNew->nBasis = strlen(zWord);
  memcpy(pNew->zBasis, zWord, pNew->nBasis+1);
  pRule = pCur->pVtab->pRule;
  while( pRule && pRule->iRuleset!=pCur->iRuleset ){
    pRule = pRule->pNext;
  }
  pNew->pRule = pRule;
  pNew->n = -1;
  pNew->rBaseCost = pNew->rCostX = rBaseCost;
  h = fuzzerHash(pNew->zBasis);
  pNew->pHash = pCur->apHash[h];
  pCur->apHash[h] = pNew;
  pCur->nStem++;
  return pNew;

Changes to test/fuzzer1.test.

143
144
145
146
147
148
149










150
151
152
153
154
155
156
157
158
159
160
161
...
216
217
218
219
220
221
222












223
224
225
226
227
228
229
....
1479
1480
1481
1482
1483
1484
1485






















1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
} {abcde 0 axcde 1 abcye 10}
do_test fuzzer1-1.13 {
  db eval {
    SELECT word, distance FROM f1
    WHERE word MATCH 'abcde' AND distance<=11 AND ruleset=1
  }
} {abcde 0 axcde 1 abcye 10 axcye 11}











do_test fuzzer1-2.0 {
  execsql {
    -- costs based on English letter frequencies
    CREATE TEMP TABLE f2_rules(ruleset, cFrom, cTo, cost);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('a','e',24);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('a','o',47);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('a','u',50);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('e','a',23);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('e','i',33);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('e','o',37);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('i','e',33);
................................................................................
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('w','',96);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('','x',120);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('x','',120);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('','y',100);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('y','',100);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('','z',120);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('z','',120);













    CREATE VIRTUAL TABLE temp.f2 USING fuzzer(f2_rules);

    -- Street names for the 28269 ZIPCODE.
    --
    CREATE TEMP TABLE streetname(n TEXT UNIQUE);
    INSERT INTO streetname VALUES('abbotsinch');
................................................................................
  execsql {
    SELECT DISTINCT streetname.n FROM f2, streetname
     WHERE f2.word MATCH 'tayle'
       AND f2.distance<=200
       AND streetname.n>=f2.word AND streetname.n<=(f2.word || x'F7BFBFBF')
  }
} {{tyler finley} trailer taymouth steelewood tallia tallu talwyn thelema}























forcedelete test.db2
do_execsql_test fuzzer1-4.1 {
  ATTACH 'test.db2' AS aux;
  CREATE TABLE aux.f3_rules(ruleset, cfrom, cto, cost);
  INSERT INTO f3_rules(ruleset, cfrom, cto, cost) VALUES(0, 'x','y', 10);
  INSERT INTO f3_rules(ruleset, cfrom, cto, cost) VALUES(1, 'a','b', 10);
  CREATE VIRTUAL TABLE aux.f3 USING fuzzer(f3_rules);
  SELECT word FROM f3 WHERE word MATCH 'ax'
} {ax ay}

finish_test







>
>
>
>
>
>
>
>
>
>




|







 







>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>












143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
...
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
....
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
} {abcde 0 axcde 1 abcye 10}
do_test fuzzer1-1.13 {
  db eval {
    SELECT word, distance FROM f1
    WHERE word MATCH 'abcde' AND distance<=11 AND ruleset=1
  }
} {abcde 0 axcde 1 abcye 10 axcye 11}
do_test fuzzer1-1.14 {
  catchsql {INSERT INTO f1 VALUES(1)}
} {1 {table f1 may not be modified}}
do_test fuzzer1-1.15 {
  catchsql {DELETE FROM f1}
} {1 {table f1 may not be modified}}
do_test fuzzer1-1.16 {
  catchsql {UPDATE f1 SET rowid=rowid+10000}
} {1 {table f1 may not be modified}}


do_test fuzzer1-2.0 {
  execsql {
    -- costs based on English letter frequencies
    CREATE TEMP TABLE f2_rules(ruleset DEFAULT 0, cFrom, cTo, cost);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('a','e',24);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('a','o',47);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('a','u',50);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('e','a',23);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('e','i',33);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('e','o',37);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('i','e',33);
................................................................................
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('w','',96);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('','x',120);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('x','',120);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('','y',100);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('y','',100);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('','z',120);
    INSERT INTO f2_rules(cFrom,cTo,cost) VALUES('z','',120);
    INSERT INTO f2_rules(ruleset,cFrom,cTo,cost)
      SELECT 1, cFrom, cTo, 100 FROM f2_rules WHERE ruleset=0;
    INSERT INTO f2_rules(ruleset,cFrom,cTo,cost)
      SELECT 2, cFrom, cTo, 200-cost FROM f2_rules WHERE ruleset=0;
    INSERT INTO f2_rules(ruleset,cFrom,cTo,cost)
      SELECT 3, cFrom, cTo, cost FROM f2_rules WHERE ruleset=0;
    INSERT INTO f2_rules(ruleset,cFrom,cTo,cost)
      VALUES(3, 'mallard','duck',50),
            (3, 'duck', 'mallard', 50),
            (3, 'rock', 'stone', 50),
            (3, 'stone', 'rock', 50);


    CREATE VIRTUAL TABLE temp.f2 USING fuzzer(f2_rules);

    -- Street names for the 28269 ZIPCODE.
    --
    CREATE TEMP TABLE streetname(n TEXT UNIQUE);
    INSERT INTO streetname VALUES('abbotsinch');
................................................................................
  execsql {
    SELECT DISTINCT streetname.n FROM f2, streetname
     WHERE f2.word MATCH 'tayle'
       AND f2.distance<=200
       AND streetname.n>=f2.word AND streetname.n<=(f2.word || x'F7BFBFBF')
  }
} {{tyler finley} trailer taymouth steelewood tallia tallu talwyn thelema}
do_test fuzzer1-2.4 {
  execsql {
    SELECT DISTINCT streetname.n
      FROM f2 JOIN streetname
        ON (streetname.n>=f2.word AND streetname.n<=(f2.word || 'zzzzzz'))
     WHERE f2.word MATCH 'duck'
       AND f2.distance<150
       AND f2.ruleset=3
     ORDER BY 1
  }
} {mallard {mallard cove} {mallard forest} {mallard grove} {mallard hill} {mallard park} {mallard ridge} {mallard view}}
do_test fuzzer1-2.5 {
  execsql {
    SELECT DISTINCT streetname.n
      FROM f2 JOIN streetname
        ON (streetname.n>=f2.word AND streetname.n<=(f2.word || 'zzzzzz'))
     WHERE f2.word MATCH 'duck'
       AND f2.distance<150
       AND f2.ruleset=2
     ORDER BY 1
  }
} {}

forcedelete test.db2
do_execsql_test fuzzer1-4.1 {
  ATTACH 'test.db2' AS aux;
  CREATE TABLE aux.f3_rules(ruleset, cfrom, cto, cost);
  INSERT INTO f3_rules(ruleset, cfrom, cto, cost) VALUES(0, 'x','y', 10);
  INSERT INTO f3_rules(ruleset, cfrom, cto, cost) VALUES(1, 'a','b', 10);
  CREATE VIRTUAL TABLE aux.f3 USING fuzzer(f3_rules);
  SELECT word FROM f3 WHERE word MATCH 'ax'
} {ax ay}

finish_test