unicode results from the CPAN

unicode

Data-Pond

view release on metacpan or search on metacpan

indentation.  The number given must be the number of leading spaces on
the line on which the resulting element will be placed.  If whitespace
is added, the element will be arranged to end on a line of the same
indentation, and all intermediate lines will have greater indentation.

=item B<undef_is_empty>

If false (the default), C<undef> will be treated as invalid data.
If true, C<undef> will be serialised as an empty string.

=item B<unicode>

If false (the default), the datum will be expressed using only ASCII
characters.  If true, non-ASCII characters may be used in string literals.

=back

=cut

my %str_encode = (
	"\t" => "\\t",

lib/Data/Pond.pm view on Meta::CPAN

	$str_encode{$c} = sprintf("\\x%02x", $_) unless exists $str_encode{$c};
}

sub _strdatum_to_string($$) {
	my($str, $options) = @_;
	return $str if $str =~ /\A(?:0|[1-9][0-9]{0,8})\z/;
	die "Pond data error: invalid character\n"
		unless $str =~ /\A[\x{0}-\x{7fffffff}]*\z/;
	$str =~ s/([\x00-\x1f\"\$\@\\\x7f-\xa0])/$str_encode{$1}/eg;
	$str =~ s/([^\x00-\x7f])/sprintf("\\x{%02x}", ord($1))/eg
		unless $options->{unicode};
	return "\"$str\"";
}

sub _strdatum_to_bareword($$) {
	return $_[0] =~ /\A[A-Za-z_][0-9A-Za-z_]*\z/ ? $_[0] :
		&_strdatum_to_string;
}

sub pond_write_datum($;$);
sub pond_write_datum($;$) {

lib/Data/Pond.xs view on Meta::CPAN

 * octets are treated as U8 type.
 *
 * Characters that are known to be in the ASCII range are in some places
 * processed as U8.  General Unicode characters are processed as U32, with
 * the intent that the entire ISO-10646 31-bit range be handleable.  Any
 * codepoint is accepted for processing, even the surrogates (which are
 * not legal in true UTF-8 encoding).  Perl's extended UTF-8 extends to
 * 72-bit codepoints; encodings beyond the 31-bit range are translated to
 * codepoint U+80000000, whereby they are all treated as invalid.
 *
 * char_unicode() returns the codepoint represented by the character being
 * pointed at, or throws an exception if the encoding is malformed.
 *
 * To move on to the character following the one pointed at, use the core
 * macro UTF8SKIP(), as in (p + UTF8SKIP(p)).  It assumes that the character
 * is properly encoded, so it is essential that char_unicode() has been
 * called on it first.
 *
 * Given an input SV (that is meant to be a string), pass it through
 * upgrade_sv() to return an SV that contains the string in UTF-8.  This
 * could be either the same SV (if it is already UTF-8-encoded or contains
 * no non-ASCII characters) or a mortal upgraded copy.
 */

#define char_unicode(p) THX_char_unicode(aTHX_ p)
static U32 THX_char_unicode(pTHX_ U8 *p)
{
	U32 val = *p;
	U8 req_c1;
	int ncont;
	int i;
	if(!(val & 0x80)) return val;
	if(!(val & 0x40)) throw_utf8_error();
	if(!(val & 0x20)) {
		if(!(val & 0x1e)) throw_utf8_error();
		val &= 0x1f;

lib/Data/Pond.xs view on Meta::CPAN

{
	U8 *p = *pp;
	SV *datum = sv_2mortal(newSVpvs(""));
	SvUTF8_on(datum);
	while(1) {
		U8 c = *p, e;
		if(p == end || char_is_control(c)) throw_syntax_error(p);
		if(!char_is_dqspecial(c)) {
			U8 *q = p;
			do {
				U32 val = char_unicode(q);
				if(unichar_is_control(val))
					throw_syntax_error(q);
				q += UTF8SKIP(q);
				c = *q;
			} while(q != end && !char_is_dqspecial(c));
			sv_catpvn_nomg(datum, (char*)p, q-p);
			p = q;
			continue;
		}
		if(c == '"') break;
		if(c != '\\') throw_syntax_error(p);
		c = *++p;
		if(p == end) throw_syntax_error(p);
		if(c & 0x80) {
			U32 val = char_unicode(p);
			if(unichar_is_control(val)) throw_syntax_error(q);
			/* character will be treated as literal anyway */
			continue;
		}
		e = asciichar_backslash[c];
		if(e == 0xff) {
			U32 val = c & 7;
			c = *++p;
			if(char_is_octdigit(c)) {
				p++;

lib/Data/Pond.xs view on Meta::CPAN

	U8 *p = *pp;
	SV *datum = sv_2mortal(newSVpvs(""));
	SvUTF8_on(datum);
	while(1) {
		U8 c = *p;
		if(p == end || char_is_control(c)) throw_syntax_error(p);
		if(c == '\'') break;
		if(c != '\\') {
			U8 *q = p;
			do {
				U32 val = char_unicode(q);
				if(unichar_is_control(val))
					throw_syntax_error(q);
				q += UTF8SKIP(q);
				c = *q;
			} while(q != end && c != '\'' && c != '\\');
			sv_catpvn_nomg(datum, (char*)p, q-p);
			p = q;
		} else {
			c = p[1];
			if(c == '\\' || c == '\'')

lib/Data/Pond.xs view on Meta::CPAN

	*pp = p;
	return datum;
}

/*
 * Pond writing
 */

struct writer_options {
	int indent;
	int undef_is_empty, unicode;
};

static int pvn_is_integer(U8 *p, STRLEN len)
{
	U8 *e = p + len;
	if(len == 0 || len > 9) return 0;
	if(*p == '0') return len == 1;
	for(; p != e; p++) {
		if(!char_is_decdigit(*p)) return 0;
	}

lib/Data/Pond.xs view on Meta::CPAN

	p = (U8*)SvPV(datum, len);
	if(pvn_is_integer(p, len)) {
		sv_catpvn_nomg(out, (char *)p, len);
	} else {
		U8 *e = p + len;
		U8 *lstart = p;
		sv_catpvs_nomg(out, "\"");
		while(p != e) {
			U8 c = *p;
			if(c & 0x80) {
				U32 val = char_unicode(p);
				if(val == 0x80000000)
					throw_data_error("invalid character");
				if(val <= 0xa0 || !wo->unicode) {
					if(lstart != p)
						sv_catpvn_nomg(out,
							(char*)lstart,
							p-lstart);
				}
				p += UTF8SKIP(p);
				if(val <= 0xa0) {
					c = val;
					p--;
					goto hexpair;
				}
				if(!wo->unicode) {
					char hexbuf[12];
					sprintf(hexbuf, "\\x{%02x}",
						(unsigned)val);
					sv_catpvn_nomg(out, hexbuf,
						strlen(hexbuf));
					lstart = p;
				}
			} else {
				U8 quote = asciichar_quote[c];
				if(quote == ASCIICHAR_QUOTE_LITERAL) {

lib/Data/Pond.xs view on Meta::CPAN

				wo.indent = SvIV(item);
				if(wo.indent < 0)
					throw_data_error(
						"indent option is negative");
			}
		}
		if((item_ptr = hv_fetchs(opthash, "undef_is_empty", 0))) {
			SV *item = *item_ptr;
			wo.undef_is_empty = cBOOL(SvTRUE(item));
		}
		if((item_ptr = hv_fetchs(opthash, "unicode", 0))) {
			SV *item = *item_ptr;
			wo.unicode = cBOOL(SvTRUE(item));
		}
	}
	RETVAL = sv_2mortal(newSVpvs(""));
	SvUTF8_on(RETVAL);
	serialise_datum(&wo, RETVAL, datum);
	SvREFCNT_inc(RETVAL);
OUTPUT:
	RETVAL

t/expr.t view on Meta::CPAN

	'{}',
	'{1,2}',
	'{1,2,}',
	' { 1 , 2 , } ',
	'{a=>b=>}',
	' { a => b => } ',
	'{a=>[],b=>123}',
	'{a=>[],b=>123,}',
	'{" foo",123}';

is pond_write_datum(pond_read_datum($_), {unicode=>1}), $_ foreach
	'""',
	'"abc"',
	'"a b"',
	"\"a'b\"",
	'"a\tb"',
	'"a\nb"',
	"\"a\\\\b\"",
	"\"a\\\"b\"",
	"\"a\\\$b\"",
	"\"a*b\"",

t/expr.t view on Meta::CPAN

	'{}',
	'{1=>2}',
	'{a=>"b"}',
	'{a=>[],b=>123}',
	'{a=>[],b=>"0123"}',
	'{a=>[],b=>"00"}',
	'{a=>[],b=>"1234567890"}',
	'{" foo"=>123}',
	"{\"z\x{123}Z\"=>[\"a\x{123}A\"]}";

is pond_write_datum(pond_read_datum($_), {indent=>0, unicode=>1}), $_ foreach
	'""',
	'"abc"',
	'"a b"',
	"\"a'b\"",
	'"a\tb"',
	'"a\nb"',
	"\"a\\\\b\"",
	"\"a\\\"b\"",
	"\"a\\\$b\"",
	"\"a*b\"",

( run in 0.410 second using v1.01-cache-2.11-cpan-88abd93f124 )