Apache-RewritingProxy

 view release on metacpan or  search on metacpan

RewritingProxy.pm  view on Meta::CPAN

package Apache::RewritingProxy;

use strict;
use Apache::Constants qw(:common);
use vars '$req';
use vars '$res';
use vars '$proxiedCookieJar';
use vars '$replayCookies';
use vars '$serverCookies';
use vars '$jar';
use vars '$textHandlerSub';
use vars qw($VERSION @ISA @EXPORT);
$|=1;

$VERSION = '0.7';

# use DynaLoader ();
# @ISA = qw(DynaLoader Exporter);

use Exporter();
@ISA = qw(Exporter);
@EXPORT = qw(new handler fixLink fetchURL);

# This is the directory in which cacheing will eventually take
# place.  More importantly, a subdirectory of this named cookies
# MUST exist and be writeable by the web server.  This is where users'
# cookie jars are stored.

$Apache::RewritingProxy::cacheRoot = '/web/httpd/RewritingProxy';


sub new 
  {
  my $class = shift;
  my $self = {};
  bless $self,$class;
  return $self;
  }

sub handler
  {
  my $r = shift;
  $textHandlerSub = shift;

  # Find the URL we are to fetch...
  my $urlToFetch = substr($r->path_info,1);

  # I only know one protocol thus far.
  return DECLINED if ($urlToFetch !~ /^http:/);

  # Get the resource and shovel it out to the client.
  &fetchURL($r, $urlToFetch);

  # If nothing has happened thus far, we'll assume that's good.
  return OK;
  }

##################################
# sub fetchURL
#
# Parameters:
#
# $r - an Apache request object
# $url - the URL to fetch and process.
#
# Returns:
#
# OK if it's happy.
# an HTTP response code if one other than 200 
# is received.

sub fetchURL
  {
  # This is the guy who actually grabs the page and then parses it.
  # My goal is to find all of the links made in the urlToFetch
  # and rewrite them to be absolute links passing through this module
  # again.
  use Apache::Util qw(:all);
  use LWP::UserAgent;
  use HTML::TokeParser;
  use HTTP::Cookies;
  use CGI;
  my $r = shift;
  my $url = shift;
  my $ua = new LWP::UserAgent;
  # As we form the request to go to the remote server,
  # We should stuff any cookies that might be relavant
  # into the request.  We need to use the Table class
  # here to fetch the cookies and see what cookies 
  # apply to $url.  We then sent those cookies 
  # in the request after yanking out our own URL from 
  # the cookies.
  
  # Fetch a cookie named RewritingProxy from the client.
  my $cookieKey;
  my $clientCookies = $r->header_in('Cookie');
  my @clientCookiePairs = split (/; /, $clientCookies);
  my $thisClientCookiePair;
  foreach $thisClientCookiePair (@clientCookiePairs)
    {
    my ($name,$value) = split (/=/, $thisClientCookiePair);
    $cookieKey = $value if ($name eq "RewritingProxyCookieJar");
    }

  # Set the cookie to be the client's current IP (doesn' really matter).
  # Set the cookie to expire in 6 months.
  # TODO: Make this thing refresh if a client keeps using the proxy.
  if (!$cookieKey)
    {
    $cookieKey = $r->get_remote_host();
    my $cookieString = "RewritingProxyCookieJar=$cookieKey; expires=".
      ht_time(time+518400). "; path=/; domain=".$r->get_server_name;	
    $r->header_out('Set-Cookie'=>$cookieString);
    }
    
  # We now need to open the User's cookie jar and see if any cookies 
  # need to be sent to this particular server.
  $jar = "$Apache::RewritingProxy::cacheRoot/cookies/$cookieKey";
  $serverCookies = HTTP::Cookies->new(
	File => "$jar",
        ignore_discard=>1,
        AutoSave=>1);
  # Load the cookies into memory...
  $serverCookies->load() if (-e $jar);

  # Let's take care of Referer also.
  my $referer = $r->header_in('Referer');
  my $script_name = $r->location;
  $referer =~ s/(.*$script_name\/)//i;

  # Let's carry the User Agent to the server also.
  # TODO: We need to include the proxied via header here.
  my $browser = $r->header_in('User-Agent');
  $ua->agent($browser);

  # We have to append the query string since it got munged by
  # apache when this was first requested.
  my $rurl = $url;
  $rurl .= "?". $r->args if ($r->args && $url !~ /\?/);

  if ($r->method eq 'GET')
    {
    $req = new HTTP::Request 'GET' => "$rurl";
    $req->header('Referer'=>"$referer");
    $serverCookies->add_cookie_header($req);
    # This needs to be a simple request or else the redirects will 
    # not work very nicely.  LWP is too smart sometimes.
    $res = $ua->simple_request($req);
    }
  elsif ($r->method eq 'POST')
    {
    # This is a little bit of tricky ju ju here.
    # We will use another PERLy package to 
    # prepare the URL and pack in the encoded form data.
    use URI::URL;
    my %FORM;
    $req = new HTTP::Request 'POST' => "$rurl";
    $req->header('Referer'=>"$referer");
    $req->content_type('application/x-www-form-urlencoded');
    $serverCookies->add_cookie_header($req);
    # $req->content('$buffer');
    my $pair;
    my @pairs = split (/&/, $r->content);
    # TODO: This next bit more efficiently.
    # It works for the occasional cgi, but not for constant
    # hammering away at this code.  There has to be a better
    # and more OOP way.
    foreach $pair (@pairs)
      {
      my ($name, $value) = split (/=/, $pair);
      # Un-Webify plus signs and %-encoding
      $value =~ tr/+/ /;
      $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;
      $FORM{$name} = $value;
      }
    # Now we build the new URL structure with the form data
    # to be sent to the server on the client's behalf.
    my $curl = url("http:");
    $curl->query_form(%FORM);
    $req->content($curl->equery);
    $res = $ua->simple_request($req);
    }

  # We need to store any cookies the server sent to us for future use.

  #Old cookie jar...

  # TODO: Make this much much better.  We still need to lock the cookie
  # jar to keep simultaneous requests from killing each others' changes.
  while (-e "$Apache::RewritingProxy::cacheRoot/cookies/$cookieKey.lock")
	{sleep(1);}
  open (LOCK, ">$Apache::RewritingProxy::cacheRoot/cookies/$cookieKey.lock");
  print LOCK " ";
  close LOCK;
  # New cookies sent by the server...
  my $responseCookies = HTTP::Cookies->new;
  $responseCookies->extract_cookies($res);
  $responseCookies->scan(\&storeCookies);
  # Store the old plus the new...
  # $proxiedCookieJar->save();
  unlink ("$Apache::RewritingProxy::cacheRoot/cookies/$cookieKey.lock");


  sub storeCookies
    {
    my $version = shift;
    my $key = shift;
    my $val = shift;
    my $path = shift;
    my $domain = shift;
    my $port = shift;
    my $path_spec = shift;
    my $secure = shift;
    my $expires = shift;
    my $discard = shift;
    my $hash = shift;
     

    # if (!$expires )
      # {
      # $expires = ht_time(time+3600, '%Y-%m-%d %H:%M:%S',0);
      # }
    my $proxiedCookieJar = HTTP::Cookies->new(
	File => "$jar",
	ignore_discard=>1,
	AutoSave=>1);
    $proxiedCookieJar->load();
    $proxiedCookieJar->set_cookie($version,$key,$val,$path,
	$domain,$port,$path_spec,$secure,$expires);
    $proxiedCookieJar->save();
    }
    
  if ($res->code =~ /^3/)
    {
    # This means it was a server redirect.
    # We should process the headers and insert 
    # ourselves into the headers everywhere we need to.
    my $textHeaders = $res->headers_as_string;


    # We need the address of the current script.
    my ($tmpUri,$junk) = split (/\/http/i, $r->uri);
    my $script_home = $r->get_server_name .":".
	$r->server->port . $tmpUri;

    # Replace any redirect links with a link pointing through us.
    if ($textHeaders =~ /Location: http:/i)
      {
      $textHeaders =~ s#http:#http://$script_home/http:#i;
      }
    else
      {
      $textHeaders =~ s#Location: (.*)
#Location: http://$script_home/$url$1#i;
      }
    # Dump out the headers as though we had created them.
    # Nothing like a little bit of http-plagiarism.
    $r->send_cgi_header($textHeaders); 
    }


  # We only process html documents.  Maybe someday we will
  # work on other types, but there is no need right now since 
  # this program only wants to look at web pages anyhow.
  if ($res->content_type =~ /html/i)
    {
    my $content = $res->content;                 # The actual text
    my $outString = "";				 # The content the user sees
    my $baseHref = "";				 # storage space for <base
    my $p = HTML::TokeParser->new(\$content)
	|| $r->log_error("No Content: $!"); 
		# TODO: This needs to be changed from warn!
    while (my $tolkens = $p->get_token )
      {
      my $text = "";
      # We process all of the possible token types.  
      # text and comments are printed unmolested to the browser.
      # Javascript would have to be parsed out by editing the text
      # between script tags.
      if ($tolkens->[0] eq 'T')
	{
  	if ($textHandlerSub)
	  {
          $outString .= &{$textHandlerSub}($r,$tolkens->[1]);
	  }
	else
	  {
          $outString .= mainTextHandler($r,$tolkens->[1]);
	  }
      	# $outString .= $tolkens->[1];



( run in 0.479 second using v1.01-cache-2.11-cpan-22024b96cdf )