Apache2-HTML-Detergent

 view release on metacpan or  search on metacpan

lib/Apache2/HTML/Detergent.pm  view on Meta::CPAN

        return Apache2::Const::OK;
    }

    Apache2::Const::DECLINED;
}

sub _filter_content {
    my ($f, $config, $ctx) = @_;
    my $r = $f->r;
    my $c = $r->connection;

    my ($type, $content) = @$ctx;

    # this is where we hack the content

    # set up the input callbacks with subreq voodoo
    my $icb = $config->callback;
    $icb->register_callbacks([
        sub {
            # MATCH
            return $_[0] =~ m!^/!;
        },
        sub {
            # OPEN
            my $uri  = shift;
            $r->log->debug("opening XML at $uri");
            my $subr = $r->lookup_uri($uri);
            my $data = '';
            $subr->run_trapped(\$data);
            my $io = IO::Scalar->new(\$data);
            # HACK: the callback infrastructure doesn't like the globref
            \$io;
        },
        sub {
            # READ
            my ($io, $len) = @_;
            # HACK once again
            my $fh = $$io;
            my $buf;
            $fh->read($buf, $len);
            $buf;
        },
        sub {
            # CLOSE
            1;
        },
    ]);

    my $scrubber = HTML::Detergent->new($config);

    # $r->headers_in->get('Host') || $r->get_server_name;
    my $host   = $r->hostname || $r->get_server_name;
    my $scheme = $c->is_https ? 'https' :  'http';
    my $port   = $r->get_server_port;

    my $uri = URI->new
        (sprintf '%s://%s:%d%s', $scheme,
         $host, $port, $r->unparsed_uri)->canonical;
    $r->log->debug($uri);

    my $utf8 = Encode::decode(Detect => $content);
    $content = $utf8 if defined $utf8 and ($content ne '' and $utf8 ne '');
    undef $utf8;

    if ($type =~ m!/.*xml!i) {
        $r->log->debug("Attempting to use XML parser for $uri");
        $content = eval {
            XML::LibXML->load_xml
                  (string => $content, recover => 1, no_network => 1) };
        if ($@) {
            $r->log->error("Loading $uri failed: $@");
            return Apache2::Const::HTTP_BAD_GATEWAY;
        }
    }

    # note $content might be an XML::LibXML::Document
    my $doc = $scrubber->process($content, $uri);
    $doc->setEncoding('utf-8');

    my $root = $doc->documentElement;
    if ($root and lc $root->localName eq 'html') {
        # XML_DTD_NODE
        # $and not grep { $_->nodeType == 14 } $doc->childNodes) {
        $doc->removeInternalSubset;
        $doc->removeExternalSubset;
        $doc->createExternalSubset('html', undef, undef);
        $doc->createInternalSubset('html', undef, undef);
    }

    if (defined $config->xslt) {
        # check for existing xslt
        my $found;
        for my $child ($doc->childNodes) {
            if ($child->nodeType == 7
                    && lc($child->nodeName) eq 'xml-stylesheet'
                        && lc($child->getData) =~ /xsl/) {
                $found = $child;
                last;
            }
        }

        # TODO: config directive to override existing XSLT PI?
        unless ($found) {
            my $pi = $doc->createProcessingInstruction
                ('xml-stylesheet', sprintf 'type="text/xsl" href="%s"',
                 $config->xslt);

            if ($root) {
                $doc->insertBefore($pi, $root);
            }
        }
    }
    else {
        $r->content_type(sprintf '%s; charset=utf-8', $type);
    }
    #$r->log->debug($r->content_encoding || 'identity');
    #$r->log->debug($r->headers_in->get('Content-Encoding'));

    # reuse content
    $content = $doc->toString(1);

    # explicitly get rid of these big objects
    undef $scrubber;



( run in 1.423 second using v1.01-cache-2.11-cpan-39bf76dae61 )