Apache2-ModProxyPerlHtml

 view release on metacpan or  search on metacpan

ModProxyPerlHtml.pm  view on Meta::CPAN

#------------------------------------------------------------------------------
# Project  : Reverse Proxy HTML link rewriter
# Name     : ModProxyPerlHtml.pm
# Language : perl 5
# Authors  : Gilles Darold, gilles at darold dot net
# Copyright: Copyright (c) 2005-2022, Gilles Darold - All rights reserved -
# Description : This mod_perl module is a replacement for mod_proxy_html.c
#		with far better URL HTML rewriting.
# Usage    : See documentation in this file with perldoc.
#------------------------------------------------------------------------------
# This program is free software; you can redistribute it and/or modify it under
# the same terms as Perl itself.
#------------------------------------------------------------------------------
package Apache2::ModProxyPerlHtml;
use strict qw(vars);
use warnings;

require mod_perl2;

use Apache2::Connection ();
use Apache2::RequestRec;
use Apache2::RequestUtil;
use APR::Table;
use APR::URI;
use base qw(Apache2::Filter);
use Apache2::Const -compile => qw(OK DECLINED :conn_keepalive);
use constant BUFF_LEN => 8000;
use Apache2::ServerRec;
use Apache2::URI;


$Apache2::ModProxyPerlHtml::VERSION = '4.1';


%Apache2::ModProxyPerlHtml::linkElements = (
	'a'       => ['href'],
	'applet'  => ['archive', 'codebase', 'code'],
	'area'    => ['href'],
	'bgsound' => ['src'],
	'blockquote' => ['cite'],
	'body'    => ['background'],
	'del'     => ['cite'],
	'embed'   => ['pluginspage', 'src'],
	'form'    => ['action'],
	'frame'   => ['src', 'longdesc'],
	'iframe'  => ['src', 'longdesc'],
	'ilayer'  => ['background'],
	'img'     => ['src', 'lowsrc', 'longdesc', 'usemap'],
	'input'   => ['src', 'usemap','formaction'],
	'ins'     => ['cite'],
	'isindex' => ['action'],
	'head'    => ['profile'],
	'layer'   => ['background', 'src'],
	'link'    => ['href'],
	'object'  => ['classid', 'codebase', 'data', 'archive', 'usemap'],
	'q'       => ['cite'],
	'script'  => ['src', 'for'],
	'table'   => ['background'],
	'td'      => ['background'],
	'th'      => ['background'],
	'tr'      => ['background'],
	'xmp'     => ['href'],
	'button'  => ['formaction'],
);

sub handler
{
	my $f = shift;

	my $debug = $f->r->dir_config->get('ProxyHTMLVerbose');
	if ($debug && $debug =~ /(on|1)/i) {
		$debug = 1;
	} else {
		$debug = 0;
	}

	# Thing we do at the first chunk
	my $content_type = $f->r->content_type() || '';
	unless ($f->ctx) {
		$f->r->headers_out->unset('Content-Length');
		my @pattern = $f->r->dir_config->get('ProxyHTMLURLMap');
		my @rewrite = $f->r->dir_config->get('ProxyHTMLRewrite');
		my $contenttype = $f->r->dir_config->get('ProxyHTMLContentType');
		$contenttype ||= '(text\/javascript|text\/html|text\/css|text\/xml|application\/.*javascript|application\/.*xml)';
		my $badcontenttype = $f->r->dir_config->get('ProxyHTMLExcludeContentType');
		$badcontenttype ||= '(application\/vnd\.openxml)';
		my @exclude = $f->r->dir_config->get('ProxyHTMLExcludeUri');
		my @obfuscation = $f->r->dir_config->get('ProxyHTMLRot13Links');

		my $ct = $f->ctx;
		$ct->{data} = '';
		foreach my $p (@pattern) {
			push(@{$ct->{pattern}}, $p);
		}
		foreach my $p (@rewrite) {
			push(@{$ct->{rewrite}}, $p);
		}
		$ct->{contenttype} = $contenttype;
		$ct->{badcontenttype} = $badcontenttype;
		foreach my $u (@exclude) {
			push(@{$ct->{excluded}}, $u);
		}
		foreach my $o (@obfuscation) {
			my ($elt, $attr) = split(/:/, $o);
			if (uc($elt) eq 'ALL') {
				$ct->{rot13elements} = 'All';
				last;
			} else {
				$ct->{rot13elements}->{$elt} = $attr;
			}
		}
		$f->ctx($ct);
	}
	# Thing we do on all invocations
	my $ctx = $f->ctx;
	while ($f->read(my $buffer, BUFF_LEN)) {
		$ctx->{data} .= $buffer;
		$ctx->{keepalives} = $f->c->keepalives;
		$f->ctx($ctx);
	}
	# Thing we do at end
	if ($f->seen_eos) { 
		my $parsed_uri = $f->r->construct_url();
		my $a_encoding = $f->r->headers_in->{'Accept-Encoding'} || '';
		my $c_encoding = $f->r->headers_out->{'Content-Encoding'} || '';
		my $ct = $f->r->headers_out->{'Content-type'} || '';

		# Only proceed URLs that are not excluded from rewritter
		if ( ($#{$ctx->{excluded}} == -1) || !grep($parsed_uri =~ /$_/i, @{$ctx->{excluded}}) ) {

			# if Accept-Encoding: gzip,deflate try to uncompress
			if ( ($c_encoding =~ /gzip|deflate/) && ($ct =~ /$ctx->{contenttype}/is) && ($ct !~ /$ctx->{badcontenttype}/is) ) {
				if ($debug) {
					Apache2::ServerRec::warn("[ModProxyPerlHtml] Uncompressing $ct, Content-Encoding: $c_encoding");
				}
				use IO::Uncompress::AnyInflate qw(anyinflate $AnyInflateError) ;
				my $output = '';
				anyinflate  \$ctx->{data} => \$output or print STDERR "anyinflate failed: $AnyInflateError\n";
				if ($ctx->{data} ne $output) {
					$ctx->{data} = $output;
				} else {
					$c_encoding = '';
				}
			} else {
				$c_encoding = '';
			}

			# Rewrite refresh command in header
			my $refresh = $f->r->headers_out->{'Refresh'};
			if ($refresh) {
				foreach my $p (@{$ctx->{pattern}}) {
					my ($match, $substitute) = split(/[\s\t]+/, $p, 2);
					if ($refresh =~ s#([^\/:])$match#$1$substitute#) {
						if ($debug) {
							Apache2::ServerRec::warn("[ModProxyPerlHtml] Refresh header match '$match', substituted by: /$substitute/");
						}
					}
				}
				$f->r->headers_out->set('Refresh' => $refresh);
			}

			# Rewrite referer in header
			my $referer = $f->r->headers_out->{'Referer'};
			if ($referer) {
				foreach my $p (@{$ctx->{pattern}}) {
					my ($match, $substitute) = split(/[\s\t]+/, $p, 2);
					if ($referer =~ s#([^\/:])$match#$1$substitute#) {
						if ($debug) {
							Apache2::ServerRec::warn("[ModProxyPerlHtml] Referer header match '$match', substituted by: /$substitute/");
						}
					}
				}
				$f->r->headers_out->set('Referer' => $referer);
			}
			
			# Only parse content that should have hyperlinks to rewrite
			if ( ($content_type =~ /$ctx->{contenttype}/is) && ($content_type !~ /$ctx->{badcontenttype}/is) ) {
				if ($debug) {
					Apache2::ServerRec::warn("[ModProxyPerlHtml] Content-type '$content_type' match: /$ctx->{contenttype}/is");
				}
				# Replace links if pattern match
				foreach my $p (@{$ctx->{pattern}}) {
					my ($match, $substitute) = split(/[\s\t]+/, $p, 2);
					&link_replacement(\$ctx->{data}, $match, $substitute, $parsed_uri, $ctx->{rot13elements});
				}
				# Rewrite code if rewrite pattern match
				foreach my $p (@{$ctx->{rewrite}}) {
					my ($match, $substitute) = split(/[\s\t]+/, $p, 2);
					&rewrite_content(\$ctx->{data}, $match, $substitute, $parsed_uri);
				}
			}

			# Compress again data if require
			if (($a_encoding =~ /gzip|deflate/) && ($c_encoding =~ /gzip|deflate/)) {
				if ($debug) {
					Apache2::ServerRec::warn("[ModProxyPerlHtml] Compressing output as Content-Encoding: $c_encoding");
				}
				if ($c_encoding =~ /gzip/) {
					use IO::Compress::Gzip qw(gzip $GzipError) ;
					my $output = '';
					my $status = gzip \$ctx->{data} => \$output or die "gzip failed: $GzipError\n";
					$ctx->{data} = $output;
				} elsif ($c_encoding =~ /deflate/) {
					use IO::Compress::Deflate qw(deflate $DeflateError) ;
					my $output = '';
					my $status = deflate \$ctx->{data} => \$output or die "deflate failed: $DeflateError\n";
					$ctx->{data} = $output;
				}
			}
		}

		# Apply any change
		$f->ctx($ctx);

		# Dump datas out
		$f->print($f->ctx->{data});
		my $c = $f->c;
		if ($c->keepalive == Apache2::Const::CONN_KEEPALIVE && $ctx->{data} && $c->keepalives > $ctx->{keepalives}) {
			if ($debug) {
				Apache2::ServerRec::warn("[ModProxyPerlHtml] Cleaning context for keep alive request");
			}
			$ctx->{data} = '';
			$ctx->{pattern} = ();
			$ctx->{rewrite} = ();
			$ctx->{excluded} = ();
			$ctx->{rot13elements} = ();
			$ctx->{contenttype} = '';
			$ctx->{badcontenttype} = '';
			$ctx->{keepalives} = $c->keepalives;
		}
			
	}

	return Apache2::Const::OK;
}

sub link_replacement
{
	my ($data, $pattern, $replacement, $uri, $rot13elements) = @_;

	return if (!$$data);

	my $old_terminator = $/;
	$/ = '';
	my %TODOS = ();
	my %ROT13TODOS = ();
	my $i = 0;

	# Detect parts that need to be deobfuscated before replacement
	if (defined $rot13elements)
	{
		if ($rot13elements ne 'All') {
			foreach my $tag (keys %{$rot13elements}) {
				while ($$data =~ s/(<$tag\s+[^>]*\b$rot13elements->{$tag}=['"\s]*)([^'"\s>]+)([^>]*>)/ROT13REPLACE_$i\$\$/i) {
					$ROT13TODOS{$i} = "$1ROT13$2ROT13$3";
					$i++;
				}
			}
		} elsif ($rot13elements eq 'All') {
			foreach my $tag (keys %Apache2::ModProxyPerlHtml::linkElements) {
				next if ($$data !~ /<$tag/i);
				foreach my $attr (@{$Apache2::ModProxyPerlHtml::linkElements{$tag}}) {
					while ($$data =~ s/(<$tag\s+[^>]*\b$attr=['"\s]*)([^'"\s>]+)([^>]*>)/ROT13REPLACE_$i\$\$/i) {
						$ROT13TODOS{$i} = "$1ROT13$2ROT13$3";
						$i++;
					}
				}
			}
		}
	}

	# Decode ROT13 links now
	foreach my $k (keys %ROT13TODOS) {
		my $repl = rot13_decode($ROT13TODOS{$k});
		$$data =~ s/ROT13REPLACE_$k\$\$/$repl/;
	}

	# Replace standard link into attributes of any element
	foreach my $tag (keys %Apache2::ModProxyPerlHtml::linkElements) {
		next if ($$data !~ /<$tag/i);
		foreach my $attr (@{$Apache2::ModProxyPerlHtml::linkElements{$tag}}) {
			while ($$data =~ s/(<$tag[\t\s]+[^>]*\b$attr=['"]*)($replacement|$pattern)([^'"\s>]+)/\$\$NEEDREPLACE$i\$\$/i) {
				$TODOS{$i} = "$1$replacement$3";
				$i++;
			}
		}
	}
	# Replace all links in javascript code after hiding javascript replacement pattern
	my %replace_fct = ();



( run in 0.952 second using v1.01-cache-2.11-cpan-df04353d9ac )