File-ANVL

 view release on metacpan or  search on metacpan

lib/File/OM.pm  view on Meta::CPAN

		$self->{recnum} = $recnum
	or
		$self->{recnum}++;

	defined($lineno)	or $lineno = '1:';
	# xxxx really? will someone pass that in?

	$self->{verbose} and
		$s .= "# from record $self->{recnum}, line $lineno\n";
	$self->{outhandle} and
		return (print { $self->{outhandle} } $s)
	or
		return $s;
}

# Close the current record: clear the record_is_open flag and emit a
# single newline.  When the 'outhandle' option is set, the newline is
# printed to that handle and the print status is returned; otherwise
# the newline string itself is returned.
sub crec {	# OM::ANVL
	my $self = shift;
	my $recnum = shift;		# accepted, but not used here

	$self->{record_is_open} = 0;
	my $terminator = "\n";

	if ($self->{outhandle}) {
		return print { $self->{outhandle} } $terminator;
	}
	return $terminator;
}

# xxx anvl -m anvln? n=normalized?
# Open the output stream: reset the record counter to 0 and set the
# stream_is_open flag.  The opening text is the empty string; it is
# printed to 'outhandle' when that option is set (returning the print
# status), and returned as a string otherwise.
sub ostream {	# OM::ANVL
	my ($self) = @_;

	@{$self}{qw(recnum stream_is_open)} = (0, 1);

	my $opening = '';
	return $opening		unless $self->{outhandle};
	return print { $self->{outhandle} } $opening;
}

# Close the output stream.  If a record is still open it is closed first
# via crec(); the text crec() returns is kept only when no 'outhandle'
# is set (with a handle, crec() returns a print status, not text).
# Clears stream_is_open, then prints to 'outhandle' (returning the print
# status) or returns the accumulated string.
sub cstream {	# OM::ANVL
	my ($self) = @_;
	my $out = '';

	if ($self->{record_is_open}) {		# wrap up any loose ends
		my $catchup = $self->crec();
		$out .= $catchup	unless $self->{outhandle};
	}
	$self->{stream_is_open} = 0;

	if ($self->{outhandle}) {
		return print { $self->{outhandle} } $out;
	}
	return $out;
}

# Encode an element name for ANVL output.
#
# Returns '' when the name is undefined.  Otherwise:
#   - every literal '%' is doubled (the receiver must undo this);
#   - '=', ':', '<' and every whitespace character other than a plain
#     space are replaced by a '%' followed by the two-digit lowercase
#     hex code of the character;
#   - in each run of plain spaces the first space is kept as is and the
#     remaining ones become %20, except that a space at the very start
#     or very end of the name is always encoded.
#
# Spaces are encoded directly rather than by encoding all of them and
# then turning the first "%20" of each run back into a space: that
# second pass could not tell an encoded space from a literal "%20" in
# the name (which is "%%20" after doubling) and corrupted it to "% ".
#
# xxx keep doubling %?
# xxx what about granvl?
# XXXX must convert XML namespaces to make safe for ANVL
# foo:bar ->? bar.foo (sort friendly, and puts namespace into
#     proper subordinate position similar to dictionaries)?
#     or if not namespace, foo:bar ->? foo%xxbar
sub name_encode {	# OM::ANVL
	my ($self, $s) = (shift, shift);
	defined($s)		or return '';

	$s =~ s/%/%%/g;		# to preserve literal %, double it
				# yyy must be decoded by receiver

	$s =~ s{		# URL-encode specials and non-space whitespace
		([=:<]|[^\S\x20])	# = and < anticipate ANVL extensions
	}{
		sprintf("%%%02x", ord($1))	# replacement hex code
	}xeg;

	# Mainstream case of names that contain spaces: for every run of
	# spaces, leave the first one alone and encode only the rest.
	#
	$s =~ s{\x20(\x20+)}{ ' ' . ('%20' x length($1)) }eg;
	$s =~ s/^ /%20/;	# but make sure any initial space is encoded
	$s =~ s/ $/%20/;	# and make sure any final space is encoded

	return $s;
}

# Encoding of names and values is done upon output in ANVL.
# Default is to wrap long lines.

# Encode an element value for ANVL output.
#
# Arguments: the value string and an optional mode name, which defaults
# to 'ANVL'.  Returns '' for an undefined value.  Otherwise the result
# is the encoded value preceded by a single space, unless the original
# value is the empty string (as opposed to 0), in which case nothing
# precedes it.
#
# Encoding doubles every literal '%' (the receiver must undo this) and
# replaces each newline by its %xx hex code.  In 'ANVLS' mode, vertical
# bars and semi-colons are additionally replaced by %7c and %3b.
#
# xxxx is this the right place to enforce the space after ':'?
# XXX what about others, such as (:...) (=...)
sub value_encode {	# OM::ANVL
	my ($self, $s, $anvl_mode) = @_;
	return ''		unless defined $s;
	$anvl_mode = 'ANVL'	unless $anvl_mode;

	# Decide on the lead-in from the value as given, before encoding.
	my $lead = ($s eq '') ? '' : ' ';

	my $newline_code = sprintf("%%%02x", ord("\n"));
	$s =~ s/%/%%/g;			# double literal % first
	$s =~ s/\n/$newline_code/g;	# then hex-encode newlines

	if ($anvl_mode eq 'ANVLS') {
		my %code_for = ('|' => '%7c', ';' => '%3b');
		$s =~ s/([|;])/$code_for{$1}/g;
	}

	return $lead . $s;
}

# Encode a comment for ANVL output: every newline is replaced by the
# two characters backslash and 'n' (yyy??).  Returns '' when the
# comment is undefined.
sub comment_encode {	# OM::ANVL
	my ($self, $s) = @_;
	return ''	unless defined $s;

	# Limit -1 keeps trailing empty fields, so trailing newlines are
	# escaped as well.
	return join('\n', split(/\n/, $s, -1));
}

# CSV output class.  Methods not defined in this package are looked up
# in File::OM through @ISA.
package File::OM::CSV;

our @ISA = ('File::OM');

sub elem {	# OM::CSV
	my $self = shift;
	my ($name, $value, $lineno, $elemnum) = (shift, shift, shift, shift);
	my ($s, $z) = ('', '');		# built string and catchup string

	$self->{record_is_open} or	# call orec() to open record first
		($z =  $self->orec(undef, $lineno),	# may call ostream()
		$self->{record_is_open} = 1);
	$self->{outhandle}	or $s .= $z;	# don't retain print status

	defined($elemnum) and
		$self->{elemnum} = $elemnum
	or
		$self->{elemnum}++;

	# Parse $lineno, which is empty or has form LinenumType, where
	# Type is either ':' (real element) or '#' (comment).
	defined($lineno)	or $lineno = '1:';
	my ($num, $type) =
		$lineno =~ /^(\d*)\s*(.)/;

	local ($Text::Wrap::columns, $Text::Wrap::huge);
	my $wrapper;
	$self->{wrap} and
		($wrapper, $Text::Wrap::columns, $Text::Wrap::huge) =
			(\&Text::Wrap::wrap, $self->{wrap}, 'overflow')
	or
		$wrapper = \&File::OM::text_nowrap;
	;


	$self->{elemnum} > 1 and	# we've output an element already,
		$s .= ",";		# so output a separator character

	if ($type eq '#') {
		$self->{element_name} = undef;	# indicates comment

lib/File/OM.pm  view on Meta::CPAN

		# We're at record 1 in a CVS file, so output a header.
		#
		$s .= join("|", map(name_encode($self, $_), @$r_elem_order))
			. "\n";
	}

	$self->{verbose} and
		$s .= "# from record $self->{recnum}, line $lineno\n";
	$self->{outhandle} and
		return (print { $self->{outhandle} } $s)
	or
		return $s;
}

# Close the current record: clear the record_is_open flag and emit a
# newline.  With an 'outhandle' set the newline is printed there and the
# print status is returned; without one the newline string is returned.
sub crec {	# OM::PSV
	my ($self, $recnum) = @_;	# $recnum is accepted, but not used

	$self->{record_is_open} = 0;

	my $handle = $self->{outhandle};
	return "\n"		unless $handle;
	return print { $handle } "\n";
}

# Open the output stream: set the record counter back to 0 and flag the
# stream as open.  Nothing needs to be written, so the output is the
# empty string -- printed to 'outhandle' when set (the print status is
# returned), returned directly otherwise.
sub ostream {	# OM::PSV
	my ($self) = @_;

	$self->{recnum} = 0;
	$self->{stream_is_open} = 1;

	my $handle = $self->{outhandle};
	if ($handle) {
		return print { $handle } '';
	}
	return '';
}

# Close the output stream, first closing any record that is still open
# by calling crec().  The value crec() returns is appended to the output
# only when there is no 'outhandle', since with a handle it is a print
# status rather than text.  Clears stream_is_open and then either prints
# the output to 'outhandle' (returning the print status) or returns it.
sub cstream {	# OM::PSV
	my ($self) = @_;

	my $pending = $self->{record_is_open}	# wrap up any loose ends
		? $self->crec()
		: '';
	my $out = '';
	$out .= $pending	unless $self->{outhandle};

	$self->{stream_is_open} = 0;

	return $out		unless $self->{outhandle};
	return print { $self->{outhandle} } $out;
}

# Encode a name for PSV output (PSV names are used only in the header
# line).  Returns '' for an undefined name.  The name is not trimmed;
# a literal '%' is doubled (the receiver must undo this), each vertical
# bar becomes %7c and each newline becomes %0a.
#
# xxx document how we don't trim, but encode spaces
sub name_encode {	# OM::PSV
	my ($self, $s) = @_;
	return ''	unless defined $s;

	# One pass suffices: no replacement text contains a character
	# that a later rule would have touched.
	my %code_for = ('%' => '%%', '|' => '%7c', "\n" => '%0a');
	$s =~ s/([%|\n])/$code_for{$1}/g;

	return $s;
}

# Encode a value for PSV output.  Returns '' for an undefined value.
# The value is not trimmed; a literal '%' is doubled (the receiver must
# undo this), each vertical bar becomes %7c and each newline becomes
# %0a.
#
# xxx document how we don't trim, but encode spaces
sub value_encode {	# OM::PSV
	my ($self, $s) = @_;
	return ''	unless defined $s;

	# Rules are applied in order; '%' must be doubled before any
	# %xx codes are introduced.
	for my $rule (['%', '%%'], ['|', '%7c'], ["\n", '%0a']) {
		my ($from, $to) = @$rule;
		$s =~ s/\Q$from\E/$to/g;
	}

	return $s;
}

# Encode a comment for PSV output (in PSV this would be a
# pseudo-comment).  Returns '' for an undefined comment.  A literal '%'
# is doubled (the receiver must undo this), each vertical bar becomes
# %7c and each newline becomes %0a.
sub comment_encode {	# OM::PSV
	my ($self, $s) = @_;
	return ''	unless defined $s;

	my %code_for = ('%' => '%%', '|' => '%7c', "\n" => '%0a');

	# Map character by character; anything without a code passes
	# through unchanged.
	return join('',
		map { exists $code_for{$_} ? $code_for{$_} : $_ }
			split(//, $s));
}

# Turtle output class.  Methods not defined in this package are looked
# up in File::OM through @ISA.
package File::OM::Turtle;

our @ISA = ('File::OM');

# Output one element (or comment) in Turtle form.
#
# Arguments:
#   $name     element name; when undefined (and the element is not a
#             comment) nothing is output for the element itself
#   $value    element value, or the comment text for a comment
#   $lineno   line number and type, in the form LinenumType, where Type
#             is ':' (real element) or '#' (comment); defaults to '1:'
#   $elemnum  element number to record; when undefined, the object's
#             current element number is incremented instead
#
# If no record is open, orec() is called first and the record is marked
# open.  Returns the print status when the 'outhandle' option is set
# (the text is printed to that handle), otherwise the built string.
sub elem {	# OM::Turtle

	my $self = shift;
	my ($name, $value, $lineno, $elemnum) = (shift, shift, shift, shift);
	my ($s, $z) = ('', '');		# built string and catchup string

	if (! $self->{record_is_open}) {	# open record first
		$z = $self->orec(undef, $lineno);	# may call ostream()
		$self->{record_is_open} = 1;
	}
	$self->{outhandle}	or $s .= $z;	# don't retain print status

	# An explicit if/else is needed here: the "and ... or" form would
	# also run the increment whenever the assigned value is false,
	# turning an explicit element number of 0 into 1.
	if (defined $elemnum) {
		$self->{elemnum} = $elemnum;
	}
	else {
		$self->{elemnum}++;
	}

	# Parse $lineno, which is empty or has form LinenumType, where
	# Type is either ':' (real element) or '#' (comment).
	defined($lineno)	or $lineno = '1:';
	my ($type) = $lineno =~ /^\d*\s*(.)/;
	defined($type)		or $type = ':';	# no type given (e.g. empty
						# $lineno): real element

	if ($type eq '#') {
		$self->{element_name} = undef;	# indicates comment
		$self->{elemnum}--;		# doesn't count as an element
		$s .= "\n#" . $self->comment_encode($value) . "\n";
		#
		# To create syntactically correct Turtle, we need
		# to end a comment with a newline at the end; this
		# can, however, result in ugly Turtle, since the
		# ';' or '.' that ends an element will have to
		# follow on the next line after that, and the only
		# remedy is to peek ahead at the next element.
	}
	elsif (defined $name) {			# no element if no name
		$self->{element_name} = $self->name_encode($name);
		$self->{elemnum} > 1		and $s .= ' ;';
		$s .= "\n" . $self->{turtle_indent};
		$s .= $self->{turtle_stream_prefix}
			. ":$self->{element_name} "
			. '"""'
			. $self->value_encode($value)
			. '"""';
	}

	if ($self->{outhandle}) {
		return print { $self->{outhandle} } $s;
	}
	return $s;
}

lib/File/OM.pm  view on Meta::CPAN

   outhandle => *STDOUT,    # (opt) print string instead of returning it
   verbose => 1 });         # (opt) also output record and line numbers

 $om->ostream();            # open stream

 $om->cstream();            # close stream

 $om->orec(                 # open record
       $recnum);            # record number (normally tracked from 1)

 $om->crec();               # close record

 $om->elem(                 # output entire element, unless $name undefined
       $name,               # string representing element name
       $value,              # string representing element value
       $lineno,             # input line number/type (default '1:')
       $elemnum);           # element number (normally tracked from 1)

 $om->elems(                # output elements; wrap ANVL/Plain/XML lines
       $name,               # string representing first element name
       $value,              # string representing first element value
       ...);                # other element names and values

 $om->name_encode($s);      # encode a name
 $om->value_encode($s);     # encode a value
 $om->comment_encode($s);   # encode a comment or pseudo-comment

 om_opt_defaults();         # get hash reference with factory defaults

=head1 DESCRIPTION

The B<OM> (Output Multiplexer) Perl module provides a general output
formatting framework for data that can be represented as a stream of
records consisting of element names, values, and comments.  Specific
conversions are possible to XML, Turtle, JSON, CSV, PSV (Pipe Separated
Value) and "Plain" unlabeled text.

The internal element structure is currently identical to the structure
returned by L<File::ANVL::anvl_recarray>.  The C<n>-th element
corresponds to three Perl array elements as follows:

     INDEX   CONTENT
     3n + 0  input file line number
     3n + 1  n-th ANVL element name
     3n + 2  n-th ANVL element value

This means, for example, that the first two ANVL element names would be
found at Perl array indices 4 and 7.  The first triple is special; array
elements 0 and 2 are undefined unless the record begins with an unlabeled
value, such as (in a quasi-ANVL record),

     Smith, Jo
     home: 555-1234
     work: 555-9876

in which case they contain the line number and value, respectively. Array
element 1 always contains a string naming the format of the input, such
as, "ANVL", "JSON", "XML", etc.

The remaining triples are free form except that the values will have been
drawn from the original format and possibly decoded.  The first item
("lineno") in each remaining triple is a number followed by a type
character, such as "34:" or "6#".  The number indicates the line number
(or octet offset, depending on the origin format) of the start of the
element.  The type character is either ':' to indicate a real element or
'#' to indicate a comment; if the latter, the element name has no defined
meaning and the comment is contained in the value.  To output an element
as a comment without regard to line number, give $lineno as "#".

B<OM> presents an object oriented interface.  The object constructor
takes a format argument and returns C<undef> if the format is unknown.
The returned object has methods for creating format-appropriate output
corresponding (currently) to seven output modes; for a complete
application of these methods, see L<File::ANVL::anvl_om>.  Nonetheless,
an application can easily call no method but C<elem()>, as the
necessary open (C<orec()> and C<ostream()>) and close (C<crec()> and
C<cstream()>) methods will be invoked automatically before the first
element is output and before the object is destroyed, respectively.
Passing an undefined first argument ($name) to C<elem()> is useful for
skipping an element in a position-based format such as CSV or PSV, which
indicate a missing element by outputting a separator character; when the
format is not position-based, the method usually outputs nothing.

Constructor options include 'verbose', which causes the methods to insert
record and line numbers as comments or pseudo-comments (e.g., for JSON,
an extra element called "#" since JSON doesn't support comments).
Normally output is returned as a string, but if the 'outhandle' option
(defaults to '') contains a file handle, for example,

     { outhandle => *STDOUT }

the string will be printed to the file handle and the method will return
the status of the print call.  Constructor options and defaults:

 {
 outhandle        => '',        # return string instead of printing it
 indent_start     => '',        # overall starting indent
 indent_step      => '  ',      # how much to increment/decrement indent

 # Format specific options.
 turtle_indent    => '    ',    # turtle has one indent width
 turtle_predns    =>            # turtle predicate namespaces
        'http://purl.org/kernel/elements/1.1/',
 turtle_nosubject => 'default', # a default subject (change this)
 turtle_subjelpat => '',        # pattern for matching subject element
 turtle_stream_prefix => 'erc', # symbol we use for turtle
 wrap             => 72,        # wrap text to 72 cols (ANVL, Plain, XML)
 wrap_indent      => '',        # Text::Wrap will insert; "\t" for ANVL
 xml_stream_name  => 'recs',    # for XML output, stream tag
 xml_record_name  => 'rec',     # for XML output, record tag

 # Used to maintain object state.
 elemnum          => 0,         # current element number
 elemsref         => [],        # one array to store record elements
 indent           => '',        # current indent
 recnum           => 0,         # current record number
 }

In this release of the B<OM> package, objects carry limited state
information.  Maintained are the current indention level, element number,
and record number, but there is no stack of "open elements".  Right now



( run in 0.593 second using v1.01-cache-2.11-cpan-39bf76dae61 )