HTML-Microformats
view release on metacpan or search on metacpan
lib/HTML/Microformats/Mixin/Parser.pm view on Meta::CPAN
package HTML::Microformats::Mixin::Parser;
use strict qw(subs vars); no warnings;
use 5.010;
use HTML::Microformats::Utilities qw(/^search/);
use HTML::Microformats::Format::adr;
use HTML::Microformats::Datatype;
use HTML::Microformats::Format::geo;
use HTML::Microformats::Format::hAtom;
use HTML::Microformats::Format::hCalendar;
use HTML::Microformats::Format::hCard;
use HTML::Microformats::Format::hMeasure;
use HTML::Microformats::Format::RelEnclosure;
use HTML::Microformats::Format::RelLicense;
use HTML::Microformats::Format::RelTag;
use HTML::Microformats::Format::species;
use URI::URL;
use XML::LibXML qw(:all);
use Object::AUTHORITY;
BEGIN {
$HTML::Microformats::Mixin::Parser::AUTHORITY = 'cpan:TOBYINK';
$HTML::Microformats::Mixin::Parser::VERSION = '0.105';
}
# Cleans away nested compound microformats. Any intentionally
# nested microformats (e.g. vcard class="agent vcard") should be
# dealt with BEFORE calling the destroyer! Because of the
# destructive nature of this function, make sure that you only
# use it on a clone of the real node.
sub _destroyer
{
my $self = shift;
my $element = shift;
# Classes to be destroyed
my @containers = qw(mfo vcard adr geo vcalendar vevent vtodo valarm
vfreebusy hfeed hentry hslice hreview hresume xfolkentry biota haudio
hmeasure hangle hmoney hlisting vtodo-list figure hproduct hnews);
my %C;
foreach my $c (@containers) { $C{$c}=1; }
# Some classes may be retained, optionally.
foreach my $c (@_) { $C{$c}=0; }
# Assemble them all into the regular expression of death.
@containers = ();
foreach my $c (keys %C) { push @containers, $c if $C{$c}; }
my $regexp = join '|', @containers;
$regexp = "\\b($regexp)\\b";
$regexp =~ s/\-/\\\-/g;
# Destroy child elements matching the regular expression.
foreach my $e ($element->getElementsByTagName('*'))
{
next if $e == $element;
if ($e->getAttribute('class') =~ /$regexp/)
{
$self->_destroy_element($e);
my $newclass = $e->getAttribute('class');
$newclass =~ s/$regexp//g;
$e->setAttribute('class', $newclass);
$e->removeAttribute('class') unless length $newclass;
}
}
}
sub _destroy_element
{
my $self = shift;
my $element = shift;
foreach my $c ($element->getElementsByTagName('*'))
{
$c->removeAttribute('class');
$c->removeAttribute('rel');
$c->removeAttribute('rev');
}
}
sub _expand_patterns
{
my $self = shift;
my $root = shift || $self->element;
my $max_include_loops = shift || 2;
# Expand microformat include pattern.
my $incl_iterations = 0;
my $replacements = 1;
while (($incl_iterations < $max_include_loops) && $replacements)
{
$replacements = $self->_expand_include_pattern($root) + $self->_expand_include_pattern_2($root);
$incl_iterations++;
}
# Table cell headers pattern.
$self->_expand_table_header_pattern($root);
lib/HTML/Microformats/Mixin/Parser.pm view on Meta::CPAN
{ @matching_nodes = $root->getElementsByTagName($class); }
if ($type !~ /[rt]/)
{
my @mn2 = searchClass($class, $root);
push @matching_nodes, @mn2;
}
return @matching_nodes;
}
sub _simple_parse_found_error
{
my $self = shift;
push @{ $self->{ERRORS} }, \@_;
}
# 1 = singular, required
# ? = singular, optional
# + = plural, required
# * = plural, optional
# ** = plural, optional, and funny behaviour with embedded microformats
# d = date
# D = duration
# e = exrule/rrule
# i = interval
# h = HTML
# H = HTML and Text (HTML value is prefixed 'html_')
# m = embedded composite microformat
# M = embedded composite microformat or text
# MM = embedded composite microformat or text, if url use pseudo-microformat
# n = numeric
# r = rel, not class
# R = rel *or* class
# t = tag name, not class
# T = tag name *or* class
# u = URI
# U = URI or fragment or text
# & = concatenate strings
# < = Also store node (in $self->{'DATA_'})
# # = _simple_parse should ignore this property
# v = don't do 'value' excerption
sub _simple_parse
# This was not simple to implement, but should be simple to use.
# This function takes on too much responsibility.
# It should delegate stuff.
{
my $self = shift;
my $root = shift || $self->element;
my $classes = $self->format_signature->{'classes'};
my $options = $self->format_signature->{'options'} || {};
my $page = $self->context;
# So far haven't needed any more than this.
my $uf_roots = {
'hCard' => 'vcard',
'hEvent' => 'vevent',
'hAlarm' => 'valarm',
'hTodo' => 'vtodo',
'hFreebusy' => 'vfreebusy',
'hCalendar' => 'vcalendar',
'hMeasure' => 'hmeasure|hangle|hmoney',
'species' => 'biota',
'hAtom' => 'hfeed'
};
# Derived from HTML::Tagset, but some modifications to the order of attrs.
my $link_elements = {
'a' => ['href'],
'applet' => ['codebase', 'archive', 'code'],
'area' => ['href'],
'base' => ['href'],
'bgsound' => ['src'],
'blockquote' => ['cite'],
# 'body' => ['background'],
'del' => ['cite'],
'embed' => ['src', 'pluginspage'],
'form' => ['action'],
'frame' => ['src', 'longdesc'],
'iframe' => ['src', 'longdesc'],
# 'ilayer' => ['background'],
'img' => ['src', 'lowsrc', 'longdesc', 'usemap'],
'input' => ['src', 'usemap'],
'ins' => ['cite'],
'isindex' => ['action'],
'head' => ['profile'],
'layer' => ['src'], # 'background'
'link' => ['href'],
'object' => ['data', 'classid', 'codebase', 'archive', 'usemap'],
'q' => ['cite'],
'script' => ['src', 'for'],
# 'table' => ['background'],
# 'td' => ['background'],
# 'th' => ['background'],
# 'tr' => ['background'],
'xmp' => ['href'],
};
foreach my $c (@$classes)
{
my $class = $c->[0];
my $type = $c->[1];
my $class_options = $c->[2] || {};
my @try_ufs = split / /, $class_options->{'embedded'};
next if $type =~ /#/;
next unless $type =~ /m/i && defined $try_ufs[0];
my @parsed_objects;
my @matching_nodes = $self->_matching_nodes($class, $type, $root);
my @ok_matching_nodes;
if ($class_options->{'nesting-ok'})
{
@ok_matching_nodes = @matching_nodes;
}
else
{
# This is a little bit of extra code that checks for interleaving uF
# root class elements and excludes them. For example, in the following,
# the outer hCard should not have an agent:
# <div class="vcard">
# <p class="birth vcard">
# <span class="agent vcard"></span>
# </p>
# </div>
my @mfos = qw(mfo vcard adr geo vcalendar vevent vtodo valarm
vfreebusy hfeed hentry hslice hreview hresume xfolkentry biota haudio
hmeasure hangle hmoney hlisting vtodo-list figure hproduct hnews);
my $mfos = '\b('.(join '|', @mfos).')\b';
foreach my $u (@{$class_options->{'allow-interleaved'}})
{ $mfos =~ s/\|$u//; }
foreach my $mn (@matching_nodes)
{
my $is_ok = 1;
my $ancestor = $mn->parentNode;
while (length $ancestor->getAttribute('data-cpan-html-microformats-nodepath') > length $root->getAttribute('data-cpan-html-microformats-nodepath'))
{
if ($ancestor->getAttribute('class')=~$mfos)
{
$is_ok = 0;
last;
}
$ancestor = $ancestor->parentNode;
}
push @ok_matching_nodes, $mn if ($is_ok);
}
}
# For each matching node
foreach my $node (@ok_matching_nodes)
{
my @node_parsed_objects;
# Try each microformat until we find something
no strict 'refs';
foreach my $uf (@try_ufs)
{
my $uf_class = (defined $uf_roots->{$uf}) ? $uf_roots->{$uf} : lc($uf);
last if defined $node_parsed_objects[0];
if ($uf eq '!person')
{
# This is used as a last-ditch attempt to parse a person.
my $obj = HTML::Microformats::Format::hCard->new_fallback($node, $self->context);
push @node_parsed_objects, $obj;
}
elsif ($node->getAttribute('class') =~ /\b($uf_class)\b/)
{
my $pkg = 'HTML::Microformats::Format::'.$uf;
my $obj = eval "${pkg}->new(\$node, \$self->context, in_hcalendar => \$class_options->{'is-in-cal'});";
push @node_parsed_objects, $obj;
}
else
{
my $pkg = 'HTML::Microformats::Format::'.$uf;
my @all = eval "${pkg}->extract_all(\$node, \$self->context, in_hcalendar => \$class_options->{'is-in-cal'});";
push @node_parsed_objects, @all if @all;
}
$self->_simple_parse_found_error('W', "Multiple embedded $uf objects found in a single $class property. This is weird.")
if defined $node_parsed_objects[1];
}
use strict 'refs';
# If we've found something
if (defined $node_parsed_objects[0] && ref $node_parsed_objects[0])
( run in 0.994 second using v1.01-cache-2.11-cpan-39bf76dae61 )