w3mir
view release on metacpan or search on metacpan
$URLSUB = 5; # Do regular expression substitution
# on urls. Arg: RE, substitute.
$NODOC = 6; # Do not return a rebuilt document.
# Saves memory, and time
# NOTE: I SUSPECT URLPROC IS BROKEN IF A BASE TAG APPEARS IN THE TEXT.
# THE URL PROCESSOR NEEDS TO BE PASSED htmlop::process' IDEA OF WHAT THE
# BASE URL IS.
$URLPROC = 7; # Apply function on urls (process
# urls). Arg: Pointer to function to
# apply, userdata. The function will
# be passed the url, modified by any
# previous operations and the
# userdata. The function must return
# the new url.
$NREL = 8; # New relativisation function, works
# much better than the old one. Arg:
# Origin, Top.
$SAVEURL = 9; # Save urls in tag with modified name.
# Arg: attribute prefix.
# Example: <a href=foo> becomes
# <a href=foo w3mir-href=foo> if no other
# processing of the url is done.
$USESAVED = 10; # Use saved urls. Arg: attribute prefix
$TAGCALLBACK = 11; # Procedure to call for each Tag.
# Args: procedure, userdata (one item)
# Args to procedure: userdata, Base
# URL, tag name, reference to array of
# URL attributes, reference to hash of
# all attributes. The base url is
# derived from the one used in ABS or
# the BASE tag.
$debug=0; # Debugging level in this package
# process_html returns a array. The first component of the array is
# the new html document. The rest of the array is the urls. If a
# document is not to be returned a empty string is returned. If a url
# list is not to be returned a empty array is returned.
# HERE BE DRAGONS:
# Where to find URLs in various tags. The second compoent is a array
# reference.
my(%urls) = (
HEAD => [ 'PROFILE' ],
BLOCKQUOTE => [ 'CITE' ],
Q => [ 'CITE' ],
INS => [ 'CITE' ],
DEL => [ 'CITE' ],
A => [ 'HREF' ] ,
IMG => [ 'SRC' ,'LOWSRC' ,'USEMAP', 'LONGDESC' ] ,
EMBED => [ 'SRC' ],
FRAME => [ 'SRC', 'LONGDESC' ],
IFRAME => [ 'SRC', 'LONGDESC' ],
BODY => [ 'BACKGROUND' ],
AREA => [ 'HREF' ],
LINK => [ 'HREF' ],
# The APPLET and OBJECT tags do not fit into my model for URL
# manipulation. Just looking at CODEBASE might work, if the
# URL it names is a browseable directory...
APPLET => [ 'CODEBASE' ], # If the codebase dir is browseable
OBJECT => [ 'CODEBASE' ], # Ditto. Can't handle DATA attribute now
INPUT => [ 'SRC', 'USEMAP' ],
MAP => [ 'HREF' ],
SCRIPT => [ 'SRC', 'FOR' ],# 'FOR's semantics is not defined, the
# attribute is just reserved for possible
# future use...
BGSOUND => [ 'SRC' ],
FORM => [ 'ACTION' ], # Is this asking for trouble?
# Maybe it should just be absolutized...
# On the other hand: It's CGI...
);
my(%relative) = (
# Identify URL attributes containing urls that are relative to
# the named URL attribute. When processing these they should
# be absolitized and then relativized relative to the BASE attribute.
# This is just window dressing for now; it is not used for anything.
# ARCHIVE is really a URI _list_.
CODEBASE => [ 'CLASSID', 'DATA', 'CODE', 'ARCHIVE' ],
);
%isdir = (
# These tags refer to directories:
CODEBASE => 1
);
# Tags that enclose bits we want to leave absolutely alone because they
# are not very like HTML, or some such.
# The material between the start and end tags is copied with no
# processing at all. The end tag is left to be processed.
# The endtag match is case insensitive.
my(%verbatim) = (
SCRIPT => quotemeta('</SCRIPT>'), # Embeded scripts
STYLE => quotemeta('</STYLE>'), # Embeded stylesheet
);
# These are the functions that pick the HTML to pieces. It will not
# work esp. good on a random SGML document since the HTML application
# of SGML has simpler quoting than it might.
sub gettoken {
# Get one token from the argument, removing it from the argument.
# BUG: There should be whitespace at the end of the examined string.
my($c,$token,$i);
# Skip whitespace and newlines
return '' unless defined(@_) && defined($_[0]);
( run in 1.244 second using v1.01-cache-2.11-cpan-71847e10f99 )