CSS-DOM

 view release on metacpan or  search on metacpan

lib/CSS/DOM/Parser.pm  view on Meta::CPAN

package CSS::DOM::Parser;

$VERSION = '0.17';

use strict; use warnings; no warnings qw 'utf8 parenthesis';
use re 'taint';

use Carp 1.01 qw 'shortmess croak';
use CSS::DOM;
use CSS::DOM::Rule::Style;
use CSS::DOM::Style;
use CSS::DOM::Util 'unescape';

our @CARP_NOT = qw "CSS::DOM CSS::DOM::Rule::Media";


# Tokeniser regexps

my $token_re;

# This one has to be outside the scope, because we need it in tokenise.
my $_optspace   = qr/[ \t\r\n\f]*/;
{

# Vars beginning with _ here are not token regexps, but are used to
# build them.
my $_escape =qr/\\(?:[0-9a-f]{1,6}(?:\r\n|[ \n\r\t\f])?|[^\n\r\f0-9a-f])/i;
my $_id_start = qr/[_a-zA-Z]|[^\0-\177]|$_escape/;
my $_id_cont   = qr/[_a-zA-Z0-9-]|[^\0-\177]|$_escape/;
my $_nl        = qr/\r\n?|[\n\f]/;
my $_invalid_qq = qr/"[^\n\r\f\\"]*(?:(?:\\$_nl|$_escape)[^\n\r\f\\"]*)*/;
my $_invalid_q  = qr/'[^\n\r\f\\']*(?:(?:\\$_nl|$_escape)[^\n\r\f\\']*)*/;

my $ident = qr/-?$_id_start$_id_cont*/;
my $at    = qr/\@$ident/;
my $str   = qr/$_invalid_qq(?:"|\z)|$_invalid_q(?:'|\z)/;
my $invalid = qr/$_invalid_qq|$_invalid_q/;
my $hash     = qr/#$_id_cont+/;
my $num      = qr/(?=\.?[0-9])[0-9]*(?:\.[0-9]*)?/;
my $percent  = qr/$num%/;
my $dim      = qr/$num$ident/;
my $url      = qr/url\($_optspace(?:
		$str
	  	  |
		[^\0- "'()\\\x7f]*(?:$_escape[^\0- "'()\\\x7f]*)*
	)$_optspace(?:\)|\z)/x;
my $uni_range = qr/U\+[0-9A-F?]{1,6}(?:-[0-9a-f]{1,6})?/i;
my $space     = qr/(?:[ \t\r\n\f]+|\/\*.*?(?:\*\/|\z))[ \t\r\n\f]*
                   (?:\/\*.*?(?:\*\/|\z)[ \t\r\n\f]*)*/xs;
my $function  = qr/$ident\(/;

# Literal tokens are as follows:
#  <!-- --> ; { } ( ) [ ] ~= |= , :

# The order of some tokens is important. $url, $uni_range and $function
# have to come before $ident. $url has to come before $function. $percent
# and $dim have to come before $num. 
$token_re = qr/\G(?:
        ($url)|($uni_range)|($function)|($ident)|($at)|($str)|($invalid)|
        ($hash)|($percent)|($dim)|($num)|(<!--|-->)|(;)|(\{)|(})|(\()|(\))
       |(\[)|(])|($space)|(~=)|(\|=)|(,)|(:)|(.)
)/xs;

} # end of tokeniser regexps

# tokenise returns a string of token types in addition to the array of
# tokens so that we can apply grammar rules using regexps. The types are
# as follows:
#  u  url
#  U  unicode range
#  f  function
#  i  identifier
#  @  at keyword
#  '  string
#  "  invalid string (unterminated)
#  #  hash
#  %  percentage
#  D  dimension
#  1  number  (not 0, because we want it true)
#  <  html comment delimiter
#  s  space/comments
#  ~  ~=
#  |  |=
#  d  delimiter (miscellaneous character)
# The characters ;{}()[],: represent themselves. The comma and colon are
# actually delimiters according to the CSS 2.1 spec, but it’s more conveni-
# ent to have them as their own tokens.
# ~~~ It might actually make the code cleaner if we make them all their own
# tokens, in which case we can provide a $delim_re for matching against a
# token type string.

sub tokenise { warn caller unless defined $_[0];for (''.shift) {
	my($tokens,@tokens)='';
	while(/$token_re/gc){
		my $which = (grep defined $+[$_], 1..$#+)[0];
		no strict 'refs';
		push @tokens, $$which;
		no warnings qw]qw];
		$tokens .=
		  qw/u U f i @ ' " # % D 1 < ; { } ( ) [ ] s ~ | , : d/
				[$which-1];

		# We need to close unterminated tokens for the sake of
		# serialisation. If we don’t, then too many other parts of
		# the code base have to deal with it.
		if($tokens =~ /'\z/) {
			$tokens[-1] =~ /^(')[^'\\]*(?:\\.[^'\\]*)*\z
			                 |
			                ^(")[^"\\]*(?:\\.[^"\\]*)*\z/xs
			and $tokens[-1] .= $1 || $2;
		}
		elsif($tokens =~ /u\z/) {
			(my $copy = $tokens[-1]) =~ s/^url\($_optspace(?:
				(')[^'\\]*(?:\\.[^'\\]*)*
			 	  |
				(")[^"\\]*(?:\\.[^"\\]*)*
				  |
				[^)\\]*(?:\\.[^)\\]*)*
			)//sox;
			my $str_delim = $1||$2;
			$str_delim and $copy!~s/^['"]$_optspace//o
				and $tokens[-1] .= $str_delim;
			$copy or $tokens[-1] .= ')';
		}
	}
	# This can’t ever happen:
	pos and pos() < length
	 and die "CSS::DOM::Parser internal error (please report this):"
		." Can't tokenise " .substr $_,pos;

	# close bracketed constructs: again, we do this here so that other
	# pieces of code scattered all over the place (including the reg-
	# exps below,  which  would  need  things  like  ‘(?:\)|\z)’)
	# don’t have to.
	my $brack_count = (()=$tokens=~/[(f]/g)-(()=$tokens=~/\)/g)
	                + (()=$tokens=~/\[/g)-(()=$tokens=~/]/g)
	                + (()=$tokens=~/{/g)-(()=$tokens=~/}/g);



( run in 0.461 second using v1.01-cache-2.11-cpan-39bf76dae61 )