w3mir

 view release on metacpan or  search on metacpan

htmlop.pm  view on Meta::CPAN

$URLSUB = 5;			# Do regular expression substitution
                                # on urls.  Arg: RE, substitute.

$NODOC = 6;			# Do not return a rebuilt document.
                                # Saves memory, and time

# NOTE: I SUSPECT URLPROC IS BROKEN IF A BASE TAG APPEARS IN THE TEXT.
# THE URL PROCESSOR NEEDS TO BE PASSED htmlop::process' IDEA OF WHAT THE
# BASE URL IS.

$URLPROC = 7;			# Apply function on urls (process
                                # urls).  Arg: Pointer to function to
				# apply, userdata.  The function will
				# be passed the url, modified by any
				# previous operations and the
				# userdata.  The function must return
				# the new url.

$NREL = 8;			# New relativisation function, works
                                # much better than the old one. Arg:
                                # Origin, Top.

$SAVEURL = 9;			# Save urls in tag with modified name.
				# Arg: attribute prefix.
				# Example: <a href=foo> becomes
				# <a href=foo w3mir-href=foo> if no other
				# processing of the url is done.

$USESAVED = 10;			# Use saved urls. Arg: attribute prefix

$TAGCALLBACK = 11;		# Procedure to call for each Tag.
				# Args: procedure, userdata (one item)

				# Args to procedure: userdata, Base
				# URL, tag name, reference to array of
				# URL attributes, reference to hash of
				# all attributes.  The base url is
				# derived from the one used in ABS or
				# the BASE tag.

$debug=0;			# Debugging level in this package

# process_html returns a array.  The first component of the array is
# the new html document. The rest of the array is the urls.  If a
# document is not to be returned a empty string is returned. If a url
# list is not to be returned a empty array is returned.

# HERE BE DRAGONS:

# Where to find URLs in various tags.  The second compoent is a array
# reference.

my(%urls) = (
	HEAD    => [ 'PROFILE' ],
        BLOCKQUOTE => [ 'CITE' ],
        Q 	=> [ 'CITE' ],
        INS	=> [ 'CITE' ],
	DEL	=> [ 'CITE' ],
	A	=> [ 'HREF' ] ,
	IMG	=> [ 'SRC' ,'LOWSRC' ,'USEMAP', 'LONGDESC' ] ,
	EMBED	=> [ 'SRC' ],
	FRAME	=> [ 'SRC', 'LONGDESC' ],
        IFRAME  => [ 'SRC', 'LONGDESC' ],
	BODY	=> [ 'BACKGROUND' ],
	AREA	=> [ 'HREF' ],
	LINK	=> [ 'HREF' ],

	# The APPLET and OBJECT tags do not fit into my model for URL
	# manipulation.  Just looking at CODEBASE might work, if the
	# URL it names is a browseable directory...
	APPLET	=> [ 'CODEBASE' ],  # If the codebase dir is browseable
	OBJECT  => [ 'CODEBASE' ],  # Ditto.  Can't handle DATA attribute now

	INPUT	=> [ 'SRC', 'USEMAP' ],
	MAP	=> [ 'HREF' ],
	SCRIPT	=> [ 'SRC', 'FOR' ],# 'FOR's semantics is not defined, the
				    # attribute is just reserved for possible
				    # future use...
	BGSOUND => [ 'SRC' ],
	FORM	=> [ 'ACTION' ],    # Is this asking for trouble?
				    # Maybe it should just be absolutized...
	     			    # On the other hand: It's CGI...
	);

my(%relative) = (
	# Identify URL attributes containing urls that are relative to
	# the named URL attribute.   When processing these they should
        # be absolitized and then relativized relative to the BASE attribute.
        # This is just window dressing for now; it is not used for anything.

	# ARCHIVE is really a URI _list_.
	CODEBASE => [ 'CLASSID', 'DATA', 'CODE', 'ARCHIVE' ],
        );

%isdir = (
	# These tags refer to directories:
	CODEBASE => 1
	);

# Tags that enclose bits we want to leave absolutely alone because they
# are not very like HTML, or some such.

# The material between the start and end tags is copied with no
# processing at all.  The end tag is left to be processed.
# The endtag match is case insensitive.
my(%verbatim) = (
        SCRIPT	=> quotemeta('</SCRIPT>'),	# Embeded scripts
	STYLE	=> quotemeta('</STYLE>'),	# Embeded stylesheet
	);

# These are the functions that pick the HTML to pieces.  It will not
# work esp. good on a random SGML document since the HTML application
# of SGML has simpler quoting than it might.

sub gettoken {
  # Get one token from the argument, removing it from the argument.
  # BUG: There should be whitespace at the end of the examined string.
  my($c,$token,$i);
  
  # Skip whitespace and newlines
  return '' unless defined(@_) && defined($_[0]);



( run in 1.244 second using v1.01-cache-2.11-cpan-71847e10f99 )