/ Check-in [a1e2b6ce]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Faster version of patternCompare() that uses new return values rather than an extra parameter to communicate wildcard information back up to parent searches.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | pattern-compare-optimization
Files: files | file ages | folders
SHA1:a1e2b6ce3af690ae91bda3d056357205c4018da7
User & Date: drh 2016-12-01 18:49:40
Context
2016-12-01
18:57
Performance improvement for GLOB and LIKE matching for patterns with two or more multi-character wildcards ("*" or "%"). check-in: 2df0ebf9 user: drh tags: trunk
18:49
Faster version of patternCompare() that uses new return values rather than an extra parameter to communicate wildcard information back up to parent searches. Closed-Leaf check-in: a1e2b6ce user: drh tags: pattern-compare-optimization
17:34
Modify the patternCompare() function (used for GLOB, LIKE) to better handle patterns containing multiple wildcard characters ("*", "%"). check-in: c5e5614d user: dan tags: pattern-compare-optimization
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/func.c.

594
595
596
597
598
599
600







601
602
603





604
605
606
607
608
609
610
...
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676

677
678
679
680
681
682
683
...
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710

711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
...
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768

769
770
771
772
773
774
775
776

777
778
779
780
781
782
783
784
785
786
787
...
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
/* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };

/*







** Compare two UTF-8 strings for equality where the first string can
** potentially be a "glob" or "like" expression.  Return true (1) if they
** are the same and false (0) if they are different.





**
** Globbing rules:
**
**      '*'       Matches any sequence of zero or more characters.
**
**      '?'       Matches exactly one character.
**
................................................................................
**
** This routine is usually quick, but can be N**2 in the worst case.
*/
static int patternCompare(
  const u8 *zPattern,              /* The glob pattern */
  const u8 *zString,               /* The string to compare against the glob */
  const struct compareInfo *pInfo, /* Information about how to do the compare */
  u32 matchOther,                  /* The escape char (LIKE) or '[' (GLOB) */
  int *pbSeenMatchAll              /* OUT: True if have seen matchAll */
){
  u32 c, c2;                       /* Next pattern and input string chars */
  u32 matchOne = pInfo->matchOne;  /* "?" or "_" */
  u32 matchAll = pInfo->matchAll;  /* "*" or "%" */
  u8 noCase = pInfo->noCase;       /* True if uppercase==lowercase */
  const u8 *zEscaped = 0;          /* One past the last escaped input char */
  
  while( (c = Utf8Read(zPattern))!=0 ){
    if( c==matchAll ){  /* Match "*" */
      /* Skip over multiple "*" characters in the pattern.  If there
      ** are also "?" characters, skip those as well, but consume a
      ** single character of the input string for each "?" skipped */
      *pbSeenMatchAll = 1;
      while( (c=Utf8Read(zPattern)) == matchAll || c == matchOne ){
        if( c==matchOne && sqlite3Utf8Read(&zString)==0 ){
          return 0;
        }
      }
      if( c==0 ){
        return 1;   /* "*" at the end of the pattern matches */
      }else if( c==matchOther ){
        if( pInfo->matchSet==0 ){
          c = sqlite3Utf8Read(&zPattern);
          if( c==0 ) return 0;
        }else{
          int bMA = 0;            /* True if patternCompare sees matchAll */
          /* "[...]" immediately follows the "*".  We have to do a slow
          ** recursive search in this case, but it is an unusual case. */
          assert( matchOther<0x80 );  /* '[' is a single-byte character */
          while( *zString
            && patternCompare(&zPattern[-1],zString,pInfo,matchOther,&bMA)==0
          ){
            if( bMA ) return 0;
            SQLITE_SKIP_UTF8(zString);
          }
          return *zString!=0;

        }
      }

      /* At this point variable c contains the first character of the
      ** pattern string past the "*".  Search in the input string for the
      ** first matching character and recursively continue the match from
      ** that point.
................................................................................
      **
      ** For a case-insensitive search, set variable cx to be the same as
      ** c but in the other case and search the input string for either
      ** c or cx.
      */
      if( c<=0x80 ){
        u32 cx;
        int bMatchAll = 0;
        if( noCase ){
          cx = sqlite3Toupper(c);
          c = sqlite3Tolower(c);
        }else{
          cx = c;
        }
        while( (c2 = *(zString++))!=0 ){
          if( c2!=c && c2!=cx ) continue;
          if( patternCompare(zPattern,zString,pInfo,matchOther, &bMatchAll) ){
            return 1;
          }
          if( bMatchAll ) break;
        }
      }else{
        int bMatchAll = 0;
        while( (c2 = Utf8Read(zString))!=0 ){
          if( c2!=c ) continue;
          if( patternCompare(zPattern,zString,pInfo,matchOther, &bMatchAll) ){
            return 1;

          }
          if( bMatchAll ) break;
        }
      }
      return 0;
    }
    if( c==matchOther ){
      if( pInfo->matchSet==0 ){
        c = sqlite3Utf8Read(&zPattern);
        if( c==0 ) return 0;
        zEscaped = zPattern;
      }else{
        u32 prior_c = 0;
        int seen = 0;
        int invert = 0;
        c = sqlite3Utf8Read(&zString);
        if( c==0 ) return 0;
        c2 = sqlite3Utf8Read(&zPattern);
        if( c2=='^' ){
          invert = 1;
          c2 = sqlite3Utf8Read(&zPattern);
        }
        if( c2==']' ){
          if( c==']' ) seen = 1;
................................................................................
              seen = 1;
            }
            prior_c = c2;
          }
          c2 = sqlite3Utf8Read(&zPattern);
        }
        if( c2==0 || (seen ^ invert)==0 ){
          return 0;
        }
        continue;
      }
    }
    c2 = Utf8Read(zString);
    if( c==c2 ) continue;
    if( noCase  && sqlite3Tolower(c)==sqlite3Tolower(c2) && c<0x80 && c2<0x80 ){
      continue;
    }
    if( c==matchOne && zPattern!=zEscaped && c2!=0 ) continue;
    return 0;
  }
  return *zString==0;
}

/*
** The sqlite3_strglob() interface.

*/
int sqlite3_strglob(const char *zGlobPattern, const char *zString){
  int dummy = 0;
  return patternCompare((u8*)zGlobPattern, (u8*)zString, &globInfo, '[', &dummy)==0;
}

/*
** The sqlite3_strlike() interface.

*/
int sqlite3_strlike(const char *zPattern, const char *zStr, unsigned int esc){
  int dummy = 0;
  return patternCompare((u8*)zPattern, (u8*)zStr, &likeInfoNorm, esc, &dummy)==0;
}

/*
** Count the number of times that the LIKE operator (or GLOB which is
** just a variation of LIKE) gets called.  This is used for testing
** only.
*/
................................................................................
      return;
    }
    escape = sqlite3Utf8Read(&zEsc);
  }else{
    escape = pInfo->matchSet;
  }
  if( zA && zB ){
    int dummy = 0;
#ifdef SQLITE_TEST
    sqlite3_like_count++;
#endif
    sqlite3_result_int(context, patternCompare(zB, zA, pInfo, escape, &dummy));
  }
}

/*
** Implementation of the NULLIF(x,y) function.  The result is the first
** argument if the arguments are different.  The result is NULL if the
** arguments are equal to each other.







>
>
>
>
>
>
>
|
<
<
>
>
>
>
>







 







|
<












<


|



|



|

<



|
|
|
<


<
>







 







|








|
|
|
<
<

|


|
<
>
|
<
|
<
|




|






|







 







|










|

|



|
>


<
|



|
>


<
|







 







<



|







594
595
596
597
598
599
600
601
602
603
604
605
606
607
608


609
610
611
612
613
614
615
616
617
618
619
620
...
642
643
644
645
646
647
648
649

650
651
652
653
654
655
656
657
658
659
660
661

662
663
664
665
666
667
668
669
670
671
672
673

674
675
676
677
678
679

680
681

682
683
684
685
686
687
688
689
...
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708


709
710
711
712
713

714
715

716

717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
...
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773

774
775
776
777
778
779
780
781

782
783
784
785
786
787
788
789
...
853
854
855
856
857
858
859

860
861
862
863
864
865
866
867
868
869
870
** case.  Thus  'a' LIKE 'A' would be true. */
static const struct compareInfo likeInfoNorm = { '%', '_',   0, 1 };
/* If SQLITE_CASE_SENSITIVE_LIKE is defined, then the LIKE operator
** is case sensitive causing 'a' LIKE 'A' to be false */
static const struct compareInfo likeInfoAlt = { '%', '_',   0, 0 };

/*
** Possible error returns from patternMatch()
*/
#define SQLITE_MATCH             0
#define SQLITE_NOMATCH           1
#define SQLITE_NOWILDCARDMATCH   2

/*
** Compare two UTF-8 strings for equality where the first string is


** a GLOB or LIKE expression.  Return values:
**
**    SQLITE_MATCH:            Match
**    SQLITE_NOMATCH:          No match
**    SQLITE_NOWILDCARDMATCH:  No match in spite of having * or % wildcards.
**
** Globbing rules:
**
**      '*'       Matches any sequence of zero or more characters.
**
**      '?'       Matches exactly one character.
**
................................................................................
**
** This routine is usually quick, but can be N**2 in the worst case.
*/
static int patternCompare(
  const u8 *zPattern,              /* The glob pattern */
  const u8 *zString,               /* The string to compare against the glob */
  const struct compareInfo *pInfo, /* Information about how to do the compare */
  u32 matchOther                   /* The escape char (LIKE) or '[' (GLOB) */

){
  u32 c, c2;                       /* Next pattern and input string chars */
  u32 matchOne = pInfo->matchOne;  /* "?" or "_" */
  u32 matchAll = pInfo->matchAll;  /* "*" or "%" */
  u8 noCase = pInfo->noCase;       /* True if uppercase==lowercase */
  const u8 *zEscaped = 0;          /* One past the last escaped input char */
  
  while( (c = Utf8Read(zPattern))!=0 ){
    if( c==matchAll ){  /* Match "*" */
      /* Skip over multiple "*" characters in the pattern.  If there
      ** are also "?" characters, skip those as well, but consume a
      ** single character of the input string for each "?" skipped */

      while( (c=Utf8Read(zPattern)) == matchAll || c == matchOne ){
        if( c==matchOne && sqlite3Utf8Read(&zString)==0 ){
          return SQLITE_NOWILDCARDMATCH;
        }
      }
      if( c==0 ){
        return SQLITE_MATCH;   /* "*" at the end of the pattern matches */
      }else if( c==matchOther ){
        if( pInfo->matchSet==0 ){
          c = sqlite3Utf8Read(&zPattern);
          if( c==0 ) return SQLITE_NOWILDCARDMATCH;
        }else{

          /* "[...]" immediately follows the "*".  We have to do a slow
          ** recursive search in this case, but it is an unusual case. */
          assert( matchOther<0x80 );  /* '[' is a single-byte character */
          while( *zString ){
            int bMatch = patternCompare(&zPattern[-1],zString,pInfo,matchOther);
            if( bMatch!=SQLITE_NOMATCH ) return bMatch;

            SQLITE_SKIP_UTF8(zString);
          }

          return SQLITE_NOWILDCARDMATCH;
        }
      }

      /* At this point variable c contains the first character of the
      ** pattern string past the "*".  Search in the input string for the
      ** first matching character and recursively continue the match from
      ** that point.
................................................................................
      **
      ** For a case-insensitive search, set variable cx to be the same as
      ** c but in the other case and search the input string for either
      ** c or cx.
      */
      if( c<=0x80 ){
        u32 cx;
        int bMatch;
        if( noCase ){
          cx = sqlite3Toupper(c);
          c = sqlite3Tolower(c);
        }else{
          cx = c;
        }
        while( (c2 = *(zString++))!=0 ){
          if( c2!=c && c2!=cx ) continue;
          bMatch = patternCompare(zPattern,zString,pInfo,matchOther);
          if( bMatch!=SQLITE_NOMATCH ) return bMatch;
        }


      }else{
        int bMatch;
        while( (c2 = Utf8Read(zString))!=0 ){
          if( c2!=c ) continue;
          bMatch = patternCompare(zPattern,zString,pInfo,matchOther);

          if( bMatch!=SQLITE_NOMATCH ) return bMatch;
        }

      }

      return SQLITE_NOWILDCARDMATCH;
    }
    if( c==matchOther ){
      if( pInfo->matchSet==0 ){
        c = sqlite3Utf8Read(&zPattern);
        if( c==0 ) return SQLITE_NOMATCH;
        zEscaped = zPattern;
      }else{
        u32 prior_c = 0;
        int seen = 0;
        int invert = 0;
        c = sqlite3Utf8Read(&zString);
        if( c==0 ) return SQLITE_NOMATCH;
        c2 = sqlite3Utf8Read(&zPattern);
        if( c2=='^' ){
          invert = 1;
          c2 = sqlite3Utf8Read(&zPattern);
        }
        if( c2==']' ){
          if( c==']' ) seen = 1;
................................................................................
              seen = 1;
            }
            prior_c = c2;
          }
          c2 = sqlite3Utf8Read(&zPattern);
        }
        if( c2==0 || (seen ^ invert)==0 ){
          return SQLITE_NOMATCH;
        }
        continue;
      }
    }
    c2 = Utf8Read(zString);
    if( c==c2 ) continue;
    if( noCase  && sqlite3Tolower(c)==sqlite3Tolower(c2) && c<0x80 && c2<0x80 ){
      continue;
    }
    if( c==matchOne && zPattern!=zEscaped && c2!=0 ) continue;
    return SQLITE_NOMATCH;
  }
  return *zString==0 ? SQLITE_MATCH : SQLITE_NOMATCH;
}

/*
** The sqlite3_strglob() interface.  Return 0 on a match (like strcmp()) and
** non-zero if there is no match.
*/
int sqlite3_strglob(const char *zGlobPattern, const char *zString){

  return patternCompare((u8*)zGlobPattern, (u8*)zString, &globInfo, '[');
}

/*
** The sqlite3_strlike() interface.  Return 0 on a match and non-zero for
** a miss - like strcmp().
*/
int sqlite3_strlike(const char *zPattern, const char *zStr, unsigned int esc){

  return patternCompare((u8*)zPattern, (u8*)zStr, &likeInfoNorm, esc);
}

/*
** Count the number of times that the LIKE operator (or GLOB which is
** just a variation of LIKE) gets called.  This is used for testing
** only.
*/
................................................................................
      return;
    }
    escape = sqlite3Utf8Read(&zEsc);
  }else{
    escape = pInfo->matchSet;
  }
  if( zA && zB ){

#ifdef SQLITE_TEST
    sqlite3_like_count++;
#endif
    sqlite3_result_int(context, patternCompare(zB, zA, pInfo, escape)==SQLITE_MATCH);
  }
}

/*
** Implementation of the NULLIF(x,y) function.  The result is the first
** argument if the arguments are different.  The result is NULL if the
** arguments are equal to each other.