Apache2-HTML-Detergent
view release on metacpan or search on metacpan
lib/Apache2/HTML/Detergent.pm view on Meta::CPAN
=head1 VERSION
Version 0.07
=cut
our $VERSION = '0.07';
=head1 SYNOPSIS
# httpd.conf or .htaccess
# The + prefix forces the module to preload
PerlOutputFilterHandler +Apache2::HTML::Detergent
# These default matching content types can be overridden
DetergentTypes text/html application/xhtml+xml
# This invocation just pulls the matching element into a new document
DetergentMatch /xpath/statement
# An optional second argument can specify an XSLT stylesheet
DetergentMatch /other/xpath/statement /path/to/transform.xsl
# Configure <link> and <meta> tags
DetergentLink relvalue http://href
DetergentMeta namevalue "Content"
# that's it!
=head1 DESCRIPTION
=cut
sub handler : FilterRequestHandler {
#my $f = shift;
my ($f, $bb) = @_;
my $r = $f->r;
my $c = $r->connection;
my $class = __PACKAGE__ . '::Config';
my $config = Apache2::Module::get_config
($class, $r->server, $r->per_dir_config) ||
Apache2::Module::get_config($class, $r->server);
unless ($config) {
$r->log->crit("Cannot find config from $class!");
return Apache2::Const::DECLINED;
}
# store the context; initial content type, payload
my $ctx;
unless ($ctx = $f->ctx) {
# turns out some things don't have a type!
my $x = $r->content_type || '';
my ($t, $c) =
($x =~ /^\s*([^;]*)(?:;.*?charset\s*=\s*['"]*([^'"]+)['"]*?)?/i);
$ctx = [$t || 'application/octet-stream', ''];
$f->ctx($ctx);
}
# get this before changing it
my $type = $ctx->[0];
unless ($config->type_matches($type)) {
$r->log->debug("$type doesn't match");
return Apache2::Const::DECLINED;
}
#$r->headers_out->set('Transfer-Encoding', 'chunked');
# application/xml is the most reliable content type to
# deliver to browsers that use XSLT.
if ($config->xslt) {
$r->log->debug("forcing $type -> application/xml");
$r->content_type('application/xml; charset=utf-8');
}
# XXX will we need to restore $r->status == 200 to this condition?
if ($r->is_initial_req) {
# BEGIN BUCKET
until ($bb->is_empty) {
my $b = $bb->first;
if ($b->is_eos) {
# no further processing if the brigade only contains EOS
return Apache2::Const::DECLINED if $ctx->[1] eq '';
# nuke the brigade
$bb->destroy;
# this is where that xml code goes
return _filter_content($f, $config, $ctx);
}
if ($b->read(my $data)) {
if ($ctx->[1] eq '') {
# XXX here is where we would double-check the mime type
}
$ctx->[1] .= $data;
}
# remove this bucket only if it isn't EOS
$b->remove;
}
# destroy the brigade only after exiting the loop
$bb->destroy;
# END BUCKET
return Apache2::Const::OK;
}
Apache2::Const::DECLINED;
}
sub _filter_content {
my ($f, $config, $ctx) = @_;
my $r = $f->r;
my $c = $r->connection;
my ($type, $content) = @$ctx;
# this is where we hack the content
# set up the input callbacks with subreq voodoo
my $icb = $config->callback;
$icb->register_callbacks([
sub {
# MATCH
return $_[0] =~ m!^/!;
},
sub {
# OPEN
my $uri = shift;
$r->log->debug("opening XML at $uri");
my $subr = $r->lookup_uri($uri);
lib/Apache2/HTML/Detergent.pm view on Meta::CPAN
my $port = $r->get_server_port;
my $uri = URI->new
(sprintf '%s://%s:%d%s', $scheme,
$host, $port, $r->unparsed_uri)->canonical;
$r->log->debug($uri);
my $utf8 = Encode::decode(Detect => $content);
$content = $utf8 if defined $utf8 and ($content ne '' and $utf8 ne '');
undef $utf8;
if ($type =~ m!/.*xml!i) {
$r->log->debug("Attempting to use XML parser for $uri");
$content = eval {
XML::LibXML->load_xml
(string => $content, recover => 1, no_network => 1) };
if ($@) {
$r->log->error("Loading $uri failed: $@");
return Apache2::Const::HTTP_BAD_GATEWAY;
}
}
# note $content might be an XML::LibXML::Document
my $doc = $scrubber->process($content, $uri);
$doc->setEncoding('utf-8');
my $root = $doc->documentElement;
if ($root and lc $root->localName eq 'html') {
# XML_DTD_NODE
# $and not grep { $_->nodeType == 14 } $doc->childNodes) {
$doc->removeInternalSubset;
$doc->removeExternalSubset;
$doc->createExternalSubset('html', undef, undef);
$doc->createInternalSubset('html', undef, undef);
}
if (defined $config->xslt) {
# check for existing xslt
my $found;
for my $child ($doc->childNodes) {
if ($child->nodeType == 7
&& lc($child->nodeName) eq 'xml-stylesheet'
&& lc($child->getData) =~ /xsl/) {
$found = $child;
last;
}
}
# TODO: config directive to override existing XSLT PI?
unless ($found) {
my $pi = $doc->createProcessingInstruction
('xml-stylesheet', sprintf 'type="text/xsl" href="%s"',
$config->xslt);
if ($root) {
$doc->insertBefore($pi, $root);
}
}
}
else {
$r->content_type(sprintf '%s; charset=utf-8', $type);
}
#$r->log->debug($r->content_encoding || 'identity');
#$r->log->debug($r->headers_in->get('Content-Encoding'));
# reuse content
$content = $doc->toString(1);
# explicitly get rid of these big objects
undef $scrubber;
undef $doc;
undef $config;
# now deal with the rest
use bytes;
# $r->log->debug(bytes::length($buf));
$r->set_content_length(bytes::length($content));
my $new_bb = APR::Brigade->new($c->pool, $c->bucket_alloc);
my $b = APR::Bucket->new($new_bb->bucket_alloc, $content);
$new_bb->insert_tail($b);
$new_bb->insert_tail
(APR::Bucket::eos_create($new_bb->bucket_alloc));
my $rv = $f->next->pass_brigade($new_bb);
return $rv unless $rv == APR::Const::SUCCESS;
Apache2::Const::OK;
}
=head1 AUTHOR
Dorian Taylor, C<< <dorian at cpan.org> >>
=head1 BUGS
Please report any bugs or feature requests to
C<bug-apache2-html-detergent at rt.cpan.org>, or through the web
interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Apache2-HTML-Detergent>.
I will be notified, and then you'll automatically be notified of
progress on your bug as I make changes.
=head1 SUPPORT
You can find documentation for this module with the perldoc command.
perldoc Apache2::HTML::Detergent
You can also look for information at:
=over 4
=item * RT: CPAN's request tracker (report bugs here)
L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Apache2-HTML-Detergent>
=item * AnnoCPAN: Annotated CPAN documentation
( run in 1.936 second using v1.01-cache-2.11-cpan-39bf76dae61 )