Apache2-HTML-Detergent

 view release on metacpan or  search on metacpan

lib/Apache2/HTML/Detergent.pm  view on Meta::CPAN

    unless ($config->type_matches($type)) {
        $r->log->debug("$type doesn't match");
        return Apache2::Const::DECLINED;
    }

    #$r->headers_out->set('Transfer-Encoding', 'chunked');

    # application/xml is the most reliable content type to
    # deliver to browsers that use XSLT.
    if ($config->xslt) {
        $r->log->debug("forcing $type -> application/xml");
        $r->content_type('application/xml; charset=utf-8');
    }

    # XXX will we need to restore $r->status == 200 to this condition?
    if ($r->is_initial_req) {
        # BEGIN BUCKET
        until ($bb->is_empty) {
            my $b = $bb->first;

            if ($b->is_eos) {
                # no further processing if the brigade only contains EOS
                return Apache2::Const::DECLINED if $ctx->[1] eq '';

                # nuke the brigade
                $bb->destroy;
                # this is where that xml code goes
                return _filter_content($f, $config, $ctx);
            }

            if ($b->read(my $data)) {
                if ($ctx->[1] eq '') {
                    # XXX here is where we would double-check the mime type
                }
                $ctx->[1] .= $data;
            }

            # remove this bucket only if it isn't EOS
            $b->remove;
        }

        # destroy the brigade only after exiting the loop
        $bb->destroy;
        # END BUCKET

        return Apache2::Const::OK;
    }

    Apache2::Const::DECLINED;
}

sub _filter_content {
    my ($f, $config, $ctx) = @_;
    my $r = $f->r;
    my $c = $r->connection;

    my ($type, $content) = @$ctx;

    # this is where we hack the content

    # set up the input callbacks with subreq voodoo
    my $icb = $config->callback;
    $icb->register_callbacks([
        sub {
            # MATCH
            return $_[0] =~ m!^/!;
        },
        sub {
            # OPEN
            my $uri  = shift;
            $r->log->debug("opening XML at $uri");
            my $subr = $r->lookup_uri($uri);
            my $data = '';
            $subr->run_trapped(\$data);
            my $io = IO::Scalar->new(\$data);
            # HACK: the callback infrastructure doesn't like the globref
            \$io;
        },
        sub {
            # READ
            my ($io, $len) = @_;
            # HACK once again
            my $fh = $$io;
            my $buf;
            $fh->read($buf, $len);
            $buf;
        },
        sub {
            # CLOSE
            1;
        },
    ]);

    my $scrubber = HTML::Detergent->new($config);

    # $r->headers_in->get('Host') || $r->get_server_name;
    my $host   = $r->hostname || $r->get_server_name;
    my $scheme = $c->is_https ? 'https' :  'http';
    my $port   = $r->get_server_port;

    my $uri = URI->new
        (sprintf '%s://%s:%d%s', $scheme,
         $host, $port, $r->unparsed_uri)->canonical;
    $r->log->debug($uri);

    my $utf8 = Encode::decode(Detect => $content);
    $content = $utf8 if defined $utf8 and ($content ne '' and $utf8 ne '');
    undef $utf8;

    if ($type =~ m!/.*xml!i) {
        $r->log->debug("Attempting to use XML parser for $uri");
        $content = eval {
            XML::LibXML->load_xml
                  (string => $content, recover => 1, no_network => 1) };
        if ($@) {
            $r->log->error("Loading $uri failed: $@");
            return Apache2::Const::HTTP_BAD_GATEWAY;
        }
    }

    # note $content might be an XML::LibXML::Document
    my $doc = $scrubber->process($content, $uri);
    $doc->setEncoding('utf-8');



( run in 0.644 second using v1.01-cache-2.11-cpan-cdf2f3d4e48 )