/ Check-in [68121676]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Tighter compression of the keyword hash table. (CVS 3920)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1:681216767d7fabfccb0b12f6a81b18b6d1c252bf
User & Date: drh 2007-05-04 17:07:53
Context
2007-05-04
18:30
Change incremental vacuum to be triggered by a pragma rather than a command. We have a lot to learn about this yet and we do not want to paint ourselves into a corner by commiting to specific syntax too early. (CVS 3921) check-in: b13e497a user: drh tags: trunk
17:07
Tighter compression of the keyword hash table. (CVS 3920) check-in: 68121676 user: drh tags: trunk
16:14
Optional parameter in the INCREMENTAL VACUUM statement specifies how many pages to vacuum from the database. (CVS 3919) check-in: ed713f9c user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to tool/mkkeywordhash.c.

    11     11   ** A header comment placed at the beginning of generated code.
    12     12   */
    13     13   static const char zHdr[] = 
    14     14     "/***** This file contains automatically generated code ******\n"
    15     15     "**\n"
    16     16     "** The code in this file has been automatically generated by\n"
    17     17     "**\n"
    18         -  "**     $Header: /home/drh/sqlite/trans/cvs/sqlite/sqlite/tool/mkkeywordhash.c,v 1.28 2007/04/26 14:42:36 danielk1977 Exp $\n"
           18  +  "**     $Header: /home/drh/sqlite/trans/cvs/sqlite/sqlite/tool/mkkeywordhash.c,v 1.29 2007/05/04 17:07:53 drh Exp $\n"
    19     19     "**\n"
    20     20     "** The code in this file implements a function that determines whether\n"
    21     21     "** or not a given identifier is really an SQL keyword.  The same thing\n"
    22     22     "** might be implemented more directly using a hand-written hash table.\n"
    23     23     "** But by using this automatically generated code, the size of the code\n"
    24     24     "** is substantially reduced.  This is important for embedded applications\n"
    25     25     "** on platforms with limited memory.\n"
................................................................................
    36     36     char *zTokenType;    /* Token value for this keyword */
    37     37     int mask;            /* Code this keyword if non-zero */
    38     38     int id;              /* Unique ID for this record */
    39     39     int hash;            /* Hash on the keyword */
    40     40     int offset;          /* Offset to start of name string */
    41     41     int len;             /* Length of this keyword, not counting final \000 */
    42     42     int prefix;          /* Number of characters in prefix */
           43  +  int longestSuffix;   /* Longest suffix that is a prefix on another word */
    43     44     int iNext;           /* Index in aKeywordTable[] of next with same hash */
    44     45     int substrId;        /* Id to another keyword this keyword is embedded in */
    45     46     int substrOffset;    /* Offset into substrId for start of this keyword */
    46     47   };
    47     48   
    48     49   /*
    49     50   ** Define masks used to determine which keywords are allowed
................................................................................
   256    257     { "VIEW",             "TK_VIEW",         VIEW                   },
   257    258     { "VIRTUAL",          "TK_VIRTUAL",      VTAB                   },
   258    259     { "WHEN",             "TK_WHEN",         ALWAYS                 },
   259    260     { "WHERE",            "TK_WHERE",        ALWAYS                 },
   260    261   };
   261    262   
   262    263   /* Number of keywords */
   263         -static int NKEYWORD = (sizeof(aKeywordTable)/sizeof(aKeywordTable[0]));
          264  +static int nKeyword = (sizeof(aKeywordTable)/sizeof(aKeywordTable[0]));
   264    265   
   265    266   /* An array to map all upper-case characters into their corresponding
   266    267   ** lower-case character. 
   267    268   */
   268    269   const unsigned char sqlite3UpperToLower[] = {
   269    270         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
   270    271        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
................................................................................
   295    296       n = strcmp(pA->zName, pB->zName);
   296    297     }
   297    298     return n;
   298    299   }
   299    300   static int keywordCompare2(const void *a, const void *b){
   300    301     const Keyword *pA = (Keyword*)a;
   301    302     const Keyword *pB = (Keyword*)b;
   302         -  int n = strcmp(pA->zName, pB->zName);
          303  +  int n = pB->longestSuffix - pA->longestSuffix;
          304  +  if( n==0 ){
          305  +    n = strcmp(pA->zName, pB->zName);
          306  +  }
   303    307     return n;
   304    308   }
   305    309   static int keywordCompare3(const void *a, const void *b){
   306    310     const Keyword *pA = (Keyword*)a;
   307    311     const Keyword *pB = (Keyword*)b;
   308    312     int n = pA->offset - pB->offset;
   309    313     return n;
................................................................................
   310    314   }
   311    315   
   312    316   /*
   313    317   ** Return a KeywordTable entry with the given id
   314    318   */
   315    319   static Keyword *findById(int id){
   316    320     int i;
   317         -  for(i=0; i<NKEYWORD; i++){
          321  +  for(i=0; i<nKeyword; i++){
   318    322       if( aKeywordTable[i].id==id ) break;
   319    323     }
   320    324     return &aKeywordTable[i];
   321    325   }
   322    326   
   323    327   /*
   324    328   ** This routine does the work.  The generated code is printed on standard
................................................................................
   325    329   ** output.
   326    330   */
   327    331   int main(int argc, char **argv){
   328    332     int i, j, k, h;
   329    333     int bestSize, bestCount;
   330    334     int count;
   331    335     int nChar;
   332         -  int aHash[1000];  /* 1000 is much bigger than NKEYWORD */
          336  +  int totalLen = 0;
          337  +  int aHash[1000];  /* 1000 is much bigger than nKeyword */
   333    338   
   334    339     /* Remove entries from the list of keywords that have mask==0 */
   335         -  for(i=j=0; i<NKEYWORD; i++){
          340  +  for(i=j=0; i<nKeyword; i++){
   336    341       if( aKeywordTable[i].mask==0 ) continue;
   337    342       if( j<i ){
   338    343         aKeywordTable[j] = aKeywordTable[i];
   339    344       }
   340    345       j++;
   341    346     }
   342         -  NKEYWORD = j;
          347  +  nKeyword = j;
   343    348   
   344    349     /* Fill in the lengths of strings and hashes for all entries. */
   345         -  for(i=0; i<NKEYWORD; i++){
          350  +  for(i=0; i<nKeyword; i++){
   346    351       Keyword *p = &aKeywordTable[i];
   347    352       p->len = strlen(p->zName);
          353  +    totalLen += p->len;
   348    354       p->hash = (UpperToLower[p->zName[0]]*4) ^
   349    355                 (UpperToLower[p->zName[p->len-1]]*3) ^ p->len;
   350    356       p->id = i+1;
   351    357     }
   352    358   
   353    359     /* Sort the table from shortest to longest keyword */
   354         -  qsort(aKeywordTable, NKEYWORD, sizeof(aKeywordTable[0]), keywordCompare1);
          360  +  qsort(aKeywordTable, nKeyword, sizeof(aKeywordTable[0]), keywordCompare1);
   355    361   
   356    362     /* Look for short keywords embedded in longer keywords */
   357         -  for(i=NKEYWORD-2; i>=0; i--){
          363  +  for(i=nKeyword-2; i>=0; i--){
   358    364       Keyword *p = &aKeywordTable[i];
   359         -    for(j=NKEYWORD-1; j>i && p->substrId==0; j--){
          365  +    for(j=nKeyword-1; j>i && p->substrId==0; j--){
   360    366         Keyword *pOther = &aKeywordTable[j];
   361    367         if( pOther->substrId ) continue;
   362    368         if( pOther->len<=p->len ) continue;
   363    369         for(k=0; k<=pOther->len-p->len; k++){
   364    370           if( memcmp(p->zName, &pOther->zName[k], p->len)==0 ){
   365    371             p->substrId = pOther->id;
   366    372             p->substrOffset = k;
   367    373             break;
   368    374           }
   369    375         }
   370    376       }
   371    377     }
   372    378   
   373         -  /* Sort the table into alphabetical order */
   374         -  qsort(aKeywordTable, NKEYWORD, sizeof(aKeywordTable[0]), keywordCompare2);
          379  +  /* Compute the longestSuffix value for every word */
          380  +  for(i=0; i<nKeyword; i++){
          381  +    Keyword *p = &aKeywordTable[i];
          382  +    if( p->substrId ) continue;
          383  +    for(j=0; j<nKeyword; j++){
          384  +      Keyword *pOther;
          385  +      if( j==i ) continue;
          386  +      pOther = &aKeywordTable[j];
          387  +      if( pOther->substrId ) continue;
          388  +      for(k=p->longestSuffix+1; k<p->len && k<pOther->len; k++){
          389  +        if( memcmp(&p->zName[p->len-k], pOther->zName, k)==0 ){
          390  +          p->longestSuffix = k;
          391  +        }
          392  +      }
          393  +    }
          394  +  }
          395  +
          396  +  /* Sort the table into reverse order by length */
          397  +  qsort(aKeywordTable, nKeyword, sizeof(aKeywordTable[0]), keywordCompare2);
   375    398   
   376    399     /* Fill in the offset for all entries */
   377    400     nChar = 0;
   378         -  for(i=0; i<NKEYWORD; i++){
          401  +  for(i=0; i<nKeyword; i++){
   379    402       Keyword *p = &aKeywordTable[i];
   380    403       if( p->offset>0 || p->substrId ) continue;
   381    404       p->offset = nChar;
   382    405       nChar += p->len;
   383    406       for(k=p->len-1; k>=1; k--){
   384         -      for(j=i+1; j<NKEYWORD; j++){
          407  +      for(j=i+1; j<nKeyword; j++){
   385    408           Keyword *pOther = &aKeywordTable[j];
   386    409           if( pOther->offset>0 || pOther->substrId ) continue;
   387    410           if( pOther->len<=k ) continue;
   388    411           if( memcmp(&p->zName[p->len-k], pOther->zName, k)==0 ){
   389    412             p = pOther;
   390    413             p->offset = nChar - k;
   391    414             nChar = p->offset + p->len;
................................................................................
   394    417             p->prefix = k;
   395    418             j = i;
   396    419             k = p->len;
   397    420           }
   398    421         }
   399    422       }
   400    423     }
   401         -  for(i=0; i<NKEYWORD; i++){
          424  +  for(i=0; i<nKeyword; i++){
   402    425       Keyword *p = &aKeywordTable[i];
   403    426       if( p->substrId ){
   404    427         p->offset = findById(p->substrId)->offset + p->substrOffset;
   405    428       }
   406    429     }
   407    430   
   408    431     /* Sort the table by offset */
   409         -  qsort(aKeywordTable, NKEYWORD, sizeof(aKeywordTable[0]), keywordCompare3);
          432  +  qsort(aKeywordTable, nKeyword, sizeof(aKeywordTable[0]), keywordCompare3);
   410    433   
   411    434     /* Figure out how big to make the hash table in order to minimize the
   412    435     ** number of collisions */
   413         -  bestSize = NKEYWORD;
   414         -  bestCount = NKEYWORD*NKEYWORD;
   415         -  for(i=NKEYWORD/2; i<=2*NKEYWORD; i++){
          436  +  bestSize = nKeyword;
          437  +  bestCount = nKeyword*nKeyword;
          438  +  for(i=nKeyword/2; i<=2*nKeyword; i++){
   416    439       for(j=0; j<i; j++) aHash[j] = 0;
   417         -    for(j=0; j<NKEYWORD; j++){
          440  +    for(j=0; j<nKeyword; j++){
   418    441         h = aKeywordTable[j].hash % i;
   419    442         aHash[h] *= 2;
   420    443         aHash[h]++;
   421    444       }
   422    445       for(j=count=0; j<i; j++) count += aHash[j];
   423    446       if( count<bestCount ){
   424    447         bestCount = count;
   425    448         bestSize = i;
   426    449       }
   427    450     }
   428    451   
   429    452     /* Compute the hash */
   430    453     for(i=0; i<bestSize; i++) aHash[i] = 0;
   431         -  for(i=0; i<NKEYWORD; i++){
          454  +  for(i=0; i<nKeyword; i++){
   432    455       h = aKeywordTable[i].hash % bestSize;
   433    456       aKeywordTable[i].iNext = aHash[h];
   434    457       aHash[h] = i+1;
   435    458     }
   436    459   
   437    460     /* Begin generating code */
   438    461     printf("%s", zHdr);
   439    462     printf("/* Hash score: %d */\n", bestCount);
   440    463     printf("static int keywordCode(const char *z, int n){\n");
          464  +  printf("  /* zText[] encodes %d bytes of keywords in %d bytes */\n",
          465  +          totalLen + nKeyword, nChar+1 );
   441    466   
   442    467     printf("  static const char zText[%d] =\n", nChar+1);
   443         -  for(i=j=0; i<NKEYWORD; i++){
          468  +  for(i=j=0; i<nKeyword; i++){
   444    469       Keyword *p = &aKeywordTable[i];
   445    470       if( p->substrId ) continue;
   446    471       if( j==0 ) printf("    \"");
   447    472       printf("%s", p->zName);
   448    473       j += p->len;
   449    474       if( j>60 ){
   450    475         printf("\"\n");
................................................................................
   461    486       if( j>12 ){
   462    487         printf("\n");
   463    488         j = 0;
   464    489       }
   465    490     }
   466    491     printf("%s  };\n", j==0 ? "" : "\n");    
   467    492   
   468         -  printf("  static const unsigned char aNext[%d] = {\n", NKEYWORD);
   469         -  for(i=j=0; i<NKEYWORD; i++){
          493  +  printf("  static const unsigned char aNext[%d] = {\n", nKeyword);
          494  +  for(i=j=0; i<nKeyword; i++){
   470    495       if( j==0 ) printf("    ");
   471    496       printf(" %3d,", aKeywordTable[i].iNext);
   472    497       j++;
   473    498       if( j>12 ){
   474    499         printf("\n");
   475    500         j = 0;
   476    501       }
   477    502     }
   478    503     printf("%s  };\n", j==0 ? "" : "\n");    
   479    504   
   480         -  printf("  static const unsigned char aLen[%d] = {\n", NKEYWORD);
   481         -  for(i=j=0; i<NKEYWORD; i++){
          505  +  printf("  static const unsigned char aLen[%d] = {\n", nKeyword);
          506  +  for(i=j=0; i<nKeyword; i++){
   482    507       if( j==0 ) printf("    ");
   483    508       printf(" %3d,", aKeywordTable[i].len+aKeywordTable[i].prefix);
   484    509       j++;
   485    510       if( j>12 ){
   486    511         printf("\n");
   487    512         j = 0;
   488    513       }
   489    514     }
   490    515     printf("%s  };\n", j==0 ? "" : "\n");    
   491    516   
   492         -  printf("  static const unsigned short int aOffset[%d] = {\n", NKEYWORD);
   493         -  for(i=j=0; i<NKEYWORD; i++){
          517  +  printf("  static const unsigned short int aOffset[%d] = {\n", nKeyword);
          518  +  for(i=j=0; i<nKeyword; i++){
   494    519       if( j==0 ) printf("    ");
   495    520       printf(" %3d,", aKeywordTable[i].offset);
   496    521       j++;
   497    522       if( j>12 ){
   498    523         printf("\n");
   499    524         j = 0;
   500    525       }
   501    526     }
   502    527     printf("%s  };\n", j==0 ? "" : "\n");
   503    528   
   504         -  printf("  static const unsigned char aCode[%d] = {\n", NKEYWORD);
   505         -  for(i=j=0; i<NKEYWORD; i++){
          529  +  printf("  static const unsigned char aCode[%d] = {\n", nKeyword);
          530  +  for(i=j=0; i<nKeyword; i++){
   506    531       char *zToken = aKeywordTable[i].zTokenType;
   507    532       if( j==0 ) printf("    ");
   508    533       printf("%s,%*s", zToken, (int)(14-strlen(zToken)), "");
   509    534       j++;
   510    535       if( j>=5 ){
   511    536         printf("\n");
   512    537         j = 0;