HTML-Microformats

 view release on metacpan or  search on metacpan

lib/HTML/Microformats/Mixin/Parser.pm  view on Meta::CPAN

package HTML::Microformats::Mixin::Parser;

use strict qw(subs vars); no warnings;
use 5.010;

use HTML::Microformats::Utilities qw(/^search/);
use HTML::Microformats::Format::adr;
use HTML::Microformats::Datatype;
use HTML::Microformats::Format::geo;
use HTML::Microformats::Format::hAtom;
use HTML::Microformats::Format::hCalendar;
use HTML::Microformats::Format::hCard;
use HTML::Microformats::Format::hMeasure;
use HTML::Microformats::Format::RelEnclosure;
use HTML::Microformats::Format::RelLicense;
use HTML::Microformats::Format::RelTag;
use HTML::Microformats::Format::species;
use URI::URL;
use XML::LibXML qw(:all);

use Object::AUTHORITY;

BEGIN {
	$HTML::Microformats::Mixin::Parser::AUTHORITY = 'cpan:TOBYINK';
	$HTML::Microformats::Mixin::Parser::VERSION   = '0.105';
}

# Cleans away nested compound microformats. Any intentionally
# nested microformats (e.g. vcard class="agent vcard") should be
# dealt with BEFORE calling the destroyer! Because of the
# destructive nature of this function, make sure that you only
# use it on a clone of the real node.
sub _destroyer
{
	my $self = shift;
	my $element = shift;
	
	# Classes to be destroyed
	my @containers = qw(mfo vcard adr geo vcalendar vevent vtodo valarm
		vfreebusy hfeed hentry hslice hreview hresume xfolkentry biota haudio
		hmeasure hangle hmoney hlisting vtodo-list figure hproduct hnews);
	my %C;
	foreach my $c (@containers) { $C{$c}=1; }
	
	# Some classes may be retained, optionally.
	foreach my $c (@_)          { $C{$c}=0; }

	# Assemble them all into the regular expression of death.
	@containers = ();
	foreach my $c (keys %C) { push @containers, $c if $C{$c}; }
	my $regexp = join '|', @containers;
	$regexp = "\\b($regexp)\\b";
	$regexp =~ s/\-/\\\-/g;
	
	# Destroy child elements matching the regular expression.
	foreach my $e ($element->getElementsByTagName('*'))
	{	
		next if $e == $element;
		
		if ($e->getAttribute('class') =~ /$regexp/)
		{
			$self->_destroy_element($e);
			my $newclass = $e->getAttribute('class');
			$newclass =~ s/$regexp//g;
			$e->setAttribute('class', $newclass);
			$e->removeAttribute('class') unless length $newclass;
		}
	}
}

sub _destroy_element
{
	my $self    = shift;
	my $element = shift;
	
	foreach my $c ($element->getElementsByTagName('*'))
	{
		$c->removeAttribute('class');
		$c->removeAttribute('rel');
		$c->removeAttribute('rev');
	}
}

sub _expand_patterns
{
	my $self = shift;
	my $root = shift || $self->element;
	my $max_include_loops = shift || 2;

	# Expand microformat include pattern.
	my $incl_iterations = 0;
	my $replacements = 1;
	while (($incl_iterations < $max_include_loops) && $replacements)
	{
		$replacements = $self->_expand_include_pattern($root) + $self->_expand_include_pattern_2($root);
		$incl_iterations++;
	}
	
	# Table cell headers pattern.
	$self->_expand_table_header_pattern($root);	

lib/HTML/Microformats/Mixin/Parser.pm  view on Meta::CPAN

		{ @matching_nodes = $root->getElementsByTagName($class); }
	
	if ($type !~ /[rt]/)
	{
		my @mn2 = searchClass($class, $root);
		push @matching_nodes, @mn2;
	}
	
	return @matching_nodes;
}

sub _simple_parse_found_error
{
	my $self = shift;
	push @{ $self->{ERRORS} }, \@_;
}

#  1  = singular, required
#  ?  = singular, optional
#  +  = plural, required
#  *  = plural, optional
#  ** = plural, optional, and funny behaviour with embedded microformats
#  d  = date
#  D  = duration
#  e  = exrule/rrule
#  i  = interval
#  h  = HTML
#  H  = HTML and Text (HTML value is prefixed 'html_')
#  m  = embedded composite microformat
#  M  = embedded composite microformat or text
#  MM = embedded composite microformat or text, if url use pseudo-microformat
#  n  = numeric
#  r  = rel, not class
#  R  = rel *or* class
#  t  = tag name, not class
#  T  = tag name *or* class
#  u  = URI
#  U  = URI or fragment or text
#  &  = concatenate strings
#  <  = Also store node (in $self->{'DATA_'})
#  #  = _simple_parse should ignore this property
#  v  = don't do 'value' excerption

sub _simple_parse
# This was not simple to implement, but should be simple to use.
# This function takes on too much responsibility.
# It should delegate stuff.
{
	my $self    = shift;
	my $root    = shift || $self->element;
	my $classes = $self->format_signature->{'classes'};
	my $options = $self->format_signature->{'options'} || {};
	my $page    = $self->context;
	
	# So far haven't needed any more than this.
	my $uf_roots = {
		'hCard'     => 'vcard',
		'hEvent'    => 'vevent',
		'hAlarm'    => 'valarm',
		'hTodo'     => 'vtodo',
		'hFreebusy' => 'vfreebusy',
		'hCalendar' => 'vcalendar',
		'hMeasure'  => 'hmeasure|hangle|hmoney',
		'species'   => 'biota',
		'hAtom'     => 'hfeed'
	};
	
	# Derived from HTML::Tagset, but some modifications to the order of attrs.
	my $link_elements = {
		'a'       => ['href'],
		'applet'  => ['codebase', 'archive', 'code'],
		'area'    => ['href'],
		'base'    => ['href'],
		'bgsound' => ['src'],
		'blockquote' => ['cite'],
#		'body'    => ['background'],
		'del'     => ['cite'],
		'embed'   => ['src', 'pluginspage'],
		'form'    => ['action'],
		'frame'   => ['src', 'longdesc'],
		'iframe'  => ['src', 'longdesc'],
#		'ilayer'  => ['background'],
		'img'     => ['src', 'lowsrc', 'longdesc', 'usemap'],
		'input'   => ['src', 'usemap'],
		'ins'     => ['cite'],
		'isindex' => ['action'],
		'head'    => ['profile'],
		'layer'   => ['src'], # 'background'
		'link'    => ['href'],
		'object'  => ['data', 'classid', 'codebase', 'archive', 'usemap'],
		'q'       => ['cite'],
		'script'  => ['src', 'for'],
#		'table'   => ['background'],
#		'td'      => ['background'],
#		'th'      => ['background'],
#		'tr'      => ['background'],
		'xmp'     => ['href'],
	};
	
	foreach my $c (@$classes)
	{
		my $class         = $c->[0];
		my $type          = $c->[1];
		my $class_options = $c->[2] || {};
		my @try_ufs       = split / /, $class_options->{'embedded'};
		
		next if $type =~ /#/;
		
		next unless $type =~ /m/i && defined $try_ufs[0];
		
		my @parsed_objects;
		my @matching_nodes = $self->_matching_nodes($class, $type, $root);
		my @ok_matching_nodes;
		
		if ($class_options->{'nesting-ok'})
		{
			@ok_matching_nodes = @matching_nodes;
		}
		else
		{
			# This is a little bit of extra code that checks for interleaving uF
			# root class elements and excludes them. For example, in the following,
			# the outer hCard should not have an agent:
			# <div class="vcard">
			#  <p class="birth vcard">
			#   <span class="agent vcard"></span>
			#  </p>
			# </div>
			my @mfos = qw(mfo vcard adr geo vcalendar vevent vtodo valarm
				vfreebusy hfeed hentry hslice hreview hresume xfolkentry biota haudio
				hmeasure hangle hmoney hlisting vtodo-list figure hproduct hnews);
			my $mfos = '\b('.(join '|', @mfos).')\b';
			foreach my $u (@{$class_options->{'allow-interleaved'}})
				{ $mfos =~ s/\|$u//; }

			foreach my $mn (@matching_nodes)
			{
				my $is_ok = 1;
				my $ancestor = $mn->parentNode;
				while (length $ancestor->getAttribute('data-cpan-html-microformats-nodepath') > length $root->getAttribute('data-cpan-html-microformats-nodepath'))
				{
					if ($ancestor->getAttribute('class')=~$mfos)
					{
						$is_ok = 0;
						last;
					}
					$ancestor = $ancestor->parentNode;
				}
				push @ok_matching_nodes, $mn if ($is_ok);
			}
		}
		
		# For each matching node
		foreach my $node (@ok_matching_nodes)
		{
			my @node_parsed_objects;
			
			# Try each microformat until we find something
			no strict 'refs';
			foreach my $uf (@try_ufs)
			{				
				my $uf_class = (defined $uf_roots->{$uf}) ? $uf_roots->{$uf} : lc($uf);
				last if defined $node_parsed_objects[0];
				
				if ($uf eq '!person')
				{
					# This is used as a last-ditch attempt to parse a person.
					my $obj = HTML::Microformats::Format::hCard->new_fallback($node, $self->context);
					push @node_parsed_objects, $obj;
				}
				elsif ($node->getAttribute('class') =~ /\b($uf_class)\b/)
				{
					my $pkg = 'HTML::Microformats::Format::'.$uf;
					my $obj = eval "${pkg}->new(\$node, \$self->context, in_hcalendar => \$class_options->{'is-in-cal'});";
					push @node_parsed_objects, $obj;
				}
				else
				{
					my $pkg = 'HTML::Microformats::Format::'.$uf;
					my @all = eval "${pkg}->extract_all(\$node, \$self->context, in_hcalendar => \$class_options->{'is-in-cal'});";
					push @node_parsed_objects, @all if @all;
				}
				
				$self->_simple_parse_found_error('W', "Multiple embedded $uf objects found in a single $class property. This is weird.")
					if defined $node_parsed_objects[1];
			}
			use strict 'refs';
			
			# If we've found something
			if (defined $node_parsed_objects[0] && ref $node_parsed_objects[0])



( run in 0.994 second using v1.01-cache-2.11-cpan-39bf76dae61 )