# # query.yp # # Parse::Yapp input grammar # ####################################################################### # Yapp is a perl version of the Bison parser. For documentation on # Bison, refer to . # # This code is based on the example Calc.yp, which is # /usr/doc/libparse-yapp-perl/examples/Calc.yp on Debian systems. ####################################################################### # This is in effect two separate parsers. This is noted more in the code, but # to explain, it parser a string that follows the form... # var operator condition [ [AND|OR] var operator condition ](0..n times) # The operators are = <> ? == <==> < > and the special cases ~ <~> # condition is a quoted string. In the special cases, the quoted string is also # parsed (after the quotes are stripped). # This special case is recognised by the ~ and <~> occuring. At that point the # Lexer reads ahead to fetch the quoted string and set it up for parsing. # This is how the quotes seem to mysteriously vanish if you are just looking at # the parser lines. # technically it'd be nice to split this second parsing layer out into a separate # .yp file, but it's more efficient to do it here. ####################################################################### # Declarations - define precedence ####################################################################### %left OR %left AND # Expect 2 shift/reduce conflicts. Only report an error if more or less occur. # This is to supress an warning reported by the yapp parser. %expect 2 ####################################################################### # Production Rules # # Assumes &intersect, &query and &union are defined ####################################################################### %% query: cond { "$_[1]" } | query AND query { "\$this->_and($_[1],$_[3])" } | query OR query { "\$this->_or($_[1],$_[3])" } | '(' query ')' { "$_[2]" } ; cond: varlist '=' qval { "\$this->_like('$_[1]', '$_[3]')" } | varlist '<>' qval { "\$this->_notlike('$_[1]', '$_[3]')" } | varlist '?' qval { "\$this->_match('$_[1]', '$_[3]')" } | varlist '' qval { "\$this->_notmatch('$_[1]', '$_[3]')" } | varlist '==' qval { "\$this->_exactly('$_[1]', '$_[3]')" } | varlist '<==>' qval { "\$this->_exactlynot('$_[1]', '$_[3]')" } | varlist '>' qval { "\$this->_gt('$_[1]', '$_[3]')" } | varlist '<' qval { "\$this->_lt('$_[1]', '$_[3]')" } # Remember, hidden absorbtion of quotes around the bconds. (see comments above) | varlist '~' bcond { "\$this->_bool('$_[1]', $_[3])" } | varlist '<~>' bcond { "\$this->_bneg('$_[1]', $_[3])" } ; # qval no longer accepts VAR, only VALUEs (quoted strings) qval: VALUE { "$_[1]" } ; # Generates a shift/reduce conflict that resolves correctly. varlist: VAR ',' varlist { "$_[1],$_[3]" } | VAR { "$_[1]" } ; # bcond is provided to handle the case where there is nothing in the quoted string. # For example # dc.title,dc.desc ~ '' # as compared to... # dc.title,dc.desc ~ 'fred "long string" wild* +required' # Remember that the quotes are absorbed invisibly by the lexer. bcond: blist { "$_[1]" } | {} ; # blist is a list of two or more bkeys # Generates a shift/reduce conflict that resolves correctly. blist: bkey blist { "\$this->_band($_[1], $_[2])" } | bkey { "$_[1]" } ; # Note: a '-' prefix is only allowed for strings and quoted values. # We allow it for wildcards, but in a lot of cases (eg. mysql) it won't do anything). # it only applies to a single term, not to a set. bkey: '-' anyval { "\$this->_bnot(\$this->_token($_[2]))" } | '-' wild { "\$this->_bnot(\$this->_wild($_[2]))" } | '+' anyval { "\$this->_breq(\$this->_token($_[2]))" } | '+' wild { "\$this->_breq(\$this->_wild($_[2]))" } | wild { "\$this->_wild($_[1])" } | anyval { "\$this->_token($_[1])" } ; # Anyval is the set of values generally able to be searched for anyval: STRING { "'$_[1]'" } | VALUE { "'$_[1]'" } ; # Wild doesn't include VALUE as it can't take quoted strings wild: WILDSTR { "'$_[1]'" } ; # VALUE : a quoted string # VAR : a string made up of only a-z, A-Z, 0-9 and _ # STRING : any sequence of symbols other than whitespace, and that doesn't start with a ' ####################################################################### # Additional Code - Tokenizer and Error Handling ####################################################################### %% sub _Error { exists $_[0]->YYData->{ERRMSG} and do { delete $_[0]->YYData->{ERRMSG}; return; }; die "MI::Search Query Syntax error. (" . $_[0]->YYData->{INPUT} . ")"; } sub _Lexer { my($parser)=shift; my $res = ['',undef]; if ($parser->YYData->{boolToParse}) { my $src = $parser->YYData->{boolToParse}; $src =~ s/^\s*//; #strip leading spaces if ($src =~ s/^(\+|\-)([\+\-]*)//x) { # Special cases: +,- symbols $res = [$1, $1]; if ($2 ne '') { set_warning($parser->YYData,"multiple +-"); } } elsif ($src =~ s/^ # Starting at the first character \' # A leading quote or doublequote ( # catch the next bit to use as the result (\\. # a \ followed by any char |[^\']) # or any char other than the leading quote repeated. * # 0 to n times ) # end the catch section \' # that leading quote repeated as the closing quote. //x) { # A quoted string (with all internal quotes escaped) $res = ['VALUE', $1]; } elsif ($src =~ s/^ # Starting at the first character \" # A leading quote or doublequote ( # catch the next bit to use as the result (\\. # a \ followed by any char |[^\"]) # or any char other than the leading quote repeated. * # 0 to n times ) # end the catch section \" # that leading quote repeated as the closing quote. //x) { # A quoted string (with all internal quotes escaped) $res = ['VALUE', $1]; } elsif ($src =~ s/^( # Catch the whole matched section as the result. (\\.|[^\s])* # Match any number of escaped or non-space characters ) # End the catch //x) { # a Collection of characters ending in an unescaped space. # Note: This considers two types of string, the token and the * terminated wildcard. # in the future it may be relevant to consider other types of wildcard, such as an # any-position * wildcard. If that happens, the easiest solution will be to add a # new token type (TRUEWILD?). Othewise many of the shortcuts used by different # searches currently would be broken. my $str=$1; if ($str =~ /(^|[^\\])(\\\\)*\*.*./) { # If there's an asterisk anywhere but the last char. set_warning($parser->YYData,"invalid *"); } # Handle differentiating wildcards from normal strings # check for an unescaped final asterisk if ($str =~ /^( # from the start of the string. (.*[^\\])? # any number of characters as long as # the last one isn't a slash (\\\\)* # then an even number of slashes i.e 0,2,4, etc ) # catch everything to here. \*$ # finish with an asterisk at the end of the string /x) { $res = ['WILDSTR', $1]; } else { $res = ['STRING', $str]; } } if (!defined($src) || $src =~ m/^\s*$/) { $parser->YYData->{boolToParse} = undef; } else { $parser->YYData->{boolToParse} = $src; } } else { my $src = $parser->YYData->{INPUT}; $src =~ s/^\s*//; #strip leading spaces if ($src =~ s/^(AND|OR)(?![a-zA-Z\.0-9_])//i) { $res = [uc( $1 ), uc( $1 )]; } elsif ($src =~ s/^(~|<~>)//) { # Boolean comparison rule my $token = $1; # This is very ugly. \1 doesn't appear to be working in perl, # therefore we need to have two separate regexps for single quotes, # and double quotes. # fetch string to be parsed as the boolean string. # This is a special case, where leading spaces need to be stripped. ( $src =~ s/^ # Starting at the first character \s* # And dropping any white space \' # A leading quote or doublequote ( # catch the next bit to use as the result (\\. # a \ followed by any char |[^\']) # or any char other than the leading quote repeated. * # 0 to n times ) # end the catch section \' # that leading quote repeated as the closing quote. //x ) # A quoted string (with all internal quotes escaped) || ( $src =~ s/^ # Starting at the first character \s* # And dropping any white space \" # A leading quote or doublequote ( # catch the next bit to use as the result (\\. # a \ followed by any char |[^\"]) # or any char other than the leading quote repeated. * # 0 to n times ) # end the catch section \" # that leading quote repeated as the closing quote. //x ); # A quoted string (with all internal quotes escaped) $parser->YYData->{boolToParse} = $1; $res = [$token,$token]; } elsif ($src =~ s/^(<>|<\?>|==|\<==\>|\=|\?|\<|\>|,|\)|\()// ) { # Various tokens used $res = [$1,$1]; } elsif ($src =~ s/^ # Starting at the first character (\"|\') # A leading quote or doublequote ( # catch the next bit to use as the result (\\. # a \ followed by any char |[^\1]) # or any char other than the leading quote repeated. *? # 0 to n times ) # end the catch section \1 # that leading quote repeated as the closing quote. //x) { # A quoted string (with all internal quotes escaped) $res = ['VALUE', $2]; } elsif ($src =~ s/^([a-zA-Z\.0-9_]+)//) { # Traditional format var (letters, numbers, underscore and point) $res = ['VAR',$1]; } if (!defined($src) || $src =~ m/^\s*$/) { $parser->YYData->{INPUT} = ''; } else { $parser->YYData->{INPUT} = $src; } } return ($res->[0],$res->[1]); } # Todo: Use MI::Language, and add a counter here for the number of times it has occurred sub set_warning { my ($obj, $warning) = @_; if ($warning eq "invalid *") { $obj->{WARNINGS}{$warning}="Wildcards can only be trailing asterisks. All other asterisks are treated as text"; return; } elsif ($warning eq "multiple +-") { $obj->{WARNINGS}{$warning}="you cannot use multiple + or - symbols before a search word. +-,--,-+ and ++ are all invalid."; return; } } sub PARSE { my($self,$criteria)=@_; $self->YYData->{INPUT} = $criteria; return undef if (!defined($criteria)); return $self->YYParse( yylex => \&_Lexer, yyerror => \&_Error , yydebug => 0); } 1;