#
# query.yp
#
# Parse::Yapp input grammar
#
#######################################################################
# Yapp is a perl version of the Bison parser.  For documentation on
# Bison, refer to <http://www.gnu.org/manual/bison>.
#
# This code is based on the example Calc.yp, which is
# /usr/doc/libparse-yapp-perl/examples/Calc.yp on Debian systems.
#######################################################################

# This is in effect two separate parsers.  This is noted more in the code, but
# to explain, it parser a string that follows the form...
#	var operator condition [ [AND|OR] var operator condition ](0..n times)
# The operators are = <> ? <?> == <==> < > and the special cases ~ <~>
# condition is a quoted string.  In the special cases, the quoted string is also
# parsed (after the quotes are stripped).
# This special case is recognised by the ~ and <~> occuring.  At that point the
# Lexer reads ahead to fetch the quoted string and set it up for parsing.
# This is how the quotes seem to mysteriously vanish if you are just looking at
# the parser lines.

# technically it'd be nice to split this second parsing layer out into a separate
# .yp file, but it's more efficient to do it here.

#######################################################################
# Declarations - define precedence
#######################################################################
%left   OR
%left   AND

# Expect 2 shift/reduce conflicts.  Only report an error if more or less occur.
# This is to supress an warning reported by the yapp parser.
%expect 2

#######################################################################
# Production Rules
#
# Assumes &intersect, &query and &union are defined
#######################################################################
%%
query:		cond				{ "$_[1]" }
	|	query AND query			{ "\$this->_and($_[1],$_[3])" }
	|	query OR query			{ "\$this->_or($_[1],$_[3])" }
	|	'(' query ')'			{ "$_[2]" }
;

cond:		varlist '=' qval		{ "\$this->_like('$_[1]', '$_[3]')" }
	|	varlist '<>' qval		{ "\$this->_notlike('$_[1]', '$_[3]')" }
	|	varlist '?' qval		{ "\$this->_match('$_[1]', '$_[3]')" }
	|	varlist '<?>' qval		{ "\$this->_notmatch('$_[1]', '$_[3]')" }
	|	varlist '==' qval		{ "\$this->_exactly('$_[1]', '$_[3]')" }
	|	varlist '<==>' qval		{ "\$this->_exactlynot('$_[1]', '$_[3]')" }
	|	varlist '>' qval		{ "\$this->_gt('$_[1]', '$_[3]')" }
	|	varlist '<' qval		{ "\$this->_lt('$_[1]', '$_[3]')" }
# Remember, hidden absorbtion of quotes around the bconds. (see comments above)
	|	varlist '~' bcond 		{ "\$this->_bool('$_[1]', $_[3])" }
	|	varlist '<~>' bcond		{ "\$this->_bneg('$_[1]', $_[3])" }
;

# qval no longer accepts VAR, only VALUEs (quoted strings)
qval:		VALUE				{ "$_[1]" }
;


# Generates a shift/reduce conflict that resolves correctly.
varlist:	VAR ',' varlist			{ "$_[1],$_[3]" }
	|	VAR				{ "$_[1]" }
;

# bcond is provided to handle the case where there is nothing in the quoted string.
# For example
# 	dc.title,dc.desc ~ ''
# as compared to...
# 	dc.title,dc.desc ~ 'fred "long string" wild* +required'
# Remember that the quotes are absorbed invisibly by the lexer.

bcond:		blist				{ "$_[1]" }
	|					{}
;

# blist is a list of two or more bkeys
# Generates a shift/reduce conflict that resolves correctly.
blist:		bkey blist			{ "\$this->_band($_[1], $_[2])" }
	|	bkey				{ "$_[1]" }
;

# Note: a '-' prefix is only allowed for strings and quoted values.
# We allow it for wildcards, but in a lot of cases (eg. mysql) it won't do anything).
# it only applies to a single term, not to a set.

bkey:		'-' anyval 			{ "\$this->_bnot(\$this->_token($_[2]))" }
	|	'-' wild			{ "\$this->_bnot(\$this->_wild($_[2]))" }
	|	'+' anyval			{ "\$this->_breq(\$this->_token($_[2]))" }
	|	'+' wild			{ "\$this->_breq(\$this->_wild($_[2]))" }
	|	wild				{ "\$this->_wild($_[1])" }
	|	anyval				{ "\$this->_token($_[1])" }
;

# Anyval is the set of values generally able to be searched for
anyval:		STRING				{ "'$_[1]'" }
	|	VALUE				{ "'$_[1]'" }
;

# Wild doesn't include VALUE as it can't take quoted strings
wild:		WILDSTR				{ "'$_[1]'" }
;

# VALUE : a quoted string
# VAR : a string made up of only a-z, A-Z, 0-9 and _
# STRING : any sequence of symbols other than whitespace, and that doesn't start with a '

#######################################################################
# Additional Code - Tokenizer and Error Handling
#######################################################################
%%

sub _Error {
	exists $_[0]->YYData->{ERRMSG}
	and do {
	delete $_[0]->YYData->{ERRMSG};
	return;
	};
	die "MI::Search Query Syntax error. (" . $_[0]->YYData->{INPUT} . ")";
}

sub _Lexer {
	my($parser)=shift;
	my $res = ['',undef];
	if ($parser->YYData->{boolToParse}) {
		my $src = $parser->YYData->{boolToParse};
		$src =~	s/^\s*//; #strip leading spaces
		
		if ($src =~ s/^(\+|\-)([\+\-]*)//x) { # Special cases: +,- symbols
			$res = [$1, $1];
			if ($2 ne '') {
				set_warning($parser->YYData,"multiple +-");
			}
		} elsif ($src =~ s/^		# Starting at the first character
				   \'		# A leading quote or doublequote
				   (		# catch the next bit to use as the result
				    (\\.	# a \ followed by any char
				     |[^\'])	# or any char other than the leading quote repeated. 
				      *		# 0 to n times
				   )		# end the catch section
				   \'		# that leading quote repeated as the closing quote.
				  //x) { 	# A quoted string (with all internal quotes escaped)
			$res = ['VALUE', $1];
		} elsif ($src =~ s/^            # Starting at the first character
				   \"		# A leading quote or doublequote
				   (            # catch the next bit to use as the result
				    (\\.        # a \ followed by any char
				     |[^\"])    # or any char other than the leading quote repeated.
				      *         # 0 to n times
				   )            # end the catch section
				   \"           # that leading quote repeated as the closing quote.
				  //x) { 	# A quoted string (with all internal quotes escaped)
			$res = ['VALUE', $1];
		} elsif ($src =~ s/^(			# Catch the whole matched section as the result.
				     (\\.|[^\s])*	# Match any number of escaped or non-space characters
				    )			# End the catch
				  //x) { 	# a Collection of characters ending in an unescaped space.
			# Note: This considers two types of string, the token and the * terminated wildcard.
			# in the future it may be relevant to consider other types of wildcard, such as an
			# any-position * wildcard.  If that happens, the easiest solution will be to add a
			# new token type (TRUEWILD?).  Othewise many of the shortcuts used by different
			# searches currently would be broken.
			my $str=$1;
			if ($str =~ /(^|[^\\])(\\\\)*\*.*./) {
				# If there's an asterisk anywhere but the last char.
				set_warning($parser->YYData,"invalid *");
			}
			# Handle differentiating wildcards from normal strings
			# check for an unescaped final asterisk
			if ($str =~ /^(			# from the start of the string.
					(.*[^\\])?	# any number of characters as long as 
							# the last one isn't a slash 
					(\\\\)*		# then an even number of slashes i.e 0,2,4, etc
				      )			# catch everything to here.
				      \*$	# finish with an asterisk at the end of the string
				    /x) { 
				$res = ['WILDSTR', $1];
			} else {
				$res = ['STRING', $str];
			}
		}

		if (!defined($src) || $src =~ m/^\s*$/) {
			$parser->YYData->{boolToParse} = undef;
		} else {
			$parser->YYData->{boolToParse} = $src;
		}
	} else {
		my $src = $parser->YYData->{INPUT};
		$src =~	s/^\s*//; #strip leading spaces
		if ($src =~ s/^(AND|OR)(?![a-zA-Z\.0-9_])//i) {
			$res = [uc( $1 ), uc( $1 )];
		} elsif ($src =~ s/^(~|<~>)//) { # Boolean comparison rule
			my $token = $1;
			# This is very ugly. \1 doesn't appear to be working in perl,
			# therefore we need to have two separate regexps for single quotes,
			# and double quotes.
			# fetch string to be parsed as the boolean string.
			# This is a special case, where leading spaces need to be stripped.
			( $src =~ s/^		# Starting at the first character
				   \s*		# And dropping any white space
				   \'		# A leading quote or doublequote
				   (		# catch the next bit to use as the result
				    (\\.	# a \ followed by any char
				     |[^\'])	# or any char other than the leading quote repeated. 
				      *		# 0 to n times
				   )		# end the catch section
				   \'		# that leading quote repeated as the closing quote.
				  //x )		# A quoted string (with all internal quotes escaped)
			||
			( $src =~ s/^  		# Starting at the first character
				   \s*          # And dropping any white space
				   \" 		# A leading quote or doublequote
				   (            # catch the next bit to use as the result
				    (\\.        # a \ followed by any char
				     |[^\"])    # or any char other than the leading quote repeated.
				      *  	# 0 to n times
				   )            # end the catch section
				   \"           # that leading quote repeated as the closing quote.
				  //x );        # A quoted string (with all internal quotes escaped)
			
			$parser->YYData->{boolToParse} = $1;
			$res = [$token,$token];
		} elsif ($src =~ s/^(<>|<\?>|==|\<==\>|\=|\?|\<|\>|,|\)|\()// ) { # Various tokens used
			$res = [$1,$1];
		} elsif ($src =~ s/^		# Starting at the first character
				   (\"|\')	# A leading quote or doublequote
				   (		# catch the next bit to use as the result
				    (\\.	# a \ followed by any char
				     |[^\1])	# or any char other than the leading quote repeated. 
				      *?	# 0 to n times
				   )		# end the catch section
				   \1		# that leading quote repeated as the closing quote.
				  //x) { # A quoted string (with all internal quotes escaped)
			$res = ['VALUE', $2];
		} elsif ($src =~ s/^([a-zA-Z\.0-9_]+)//) { # Traditional format var (letters, numbers, underscore and point)
			$res = ['VAR',$1];
		}

		if (!defined($src) || $src =~ m/^\s*$/) {
			$parser->YYData->{INPUT} = '';
		} else {
			$parser->YYData->{INPUT} = $src;
		}
	}

	return ($res->[0],$res->[1]);
}

# Todo: Use MI::Language, and add a counter here for the number of times it has occurred
sub set_warning {
	my ($obj, $warning) = @_;
	if ($warning eq "invalid *") {
		$obj->{WARNINGS}{$warning}="Wildcards can only be trailing asterisks.  All other asterisks are treated as text";
		return;
	} elsif ($warning eq "multiple +-") {
		$obj->{WARNINGS}{$warning}="you cannot use multiple + or - symbols before a search word.  +-,--,-+ and ++ are all invalid.";
		return;
	}
}

sub PARSE {
	my($self,$criteria)=@_;
	$self->YYData->{INPUT} = $criteria;
	return undef if (!defined($criteria));
	return $self->YYParse( yylex => \&_Lexer, yyerror => \&_Error , yydebug => 0);
}

1;