unicode results from the CPAN

Bundle-PBib
#!/usr/bin/perl

require "bptest.pl";

# Development switches for this test script.
$skiptests = 0;   # when true, the charset tests guarded by "if (!$skiptests)" are skipped
$timing = 0;      # when true, the seed is fixed at 42 and the random string is made 5x longer

#
# This test covers the 8859-1, apple, TeX, troff, and HTML
# character sets.  It also tests the canon<->unicode routines
# from bp-p-utils.pl.
#
# It does not test the 'none' and 'auto' character sets.  The
# auto character set doesn't exist, so it tests all the charsets
# that are currently in bp.
#
# For each charset, we run through an iso string, which contains
# the 8-bit characters 0 through 255.  Before processing, certain
# characters known to not be supported are removed (generally
# 200-237 octal, but sometimes others).  This is to prevent
# excessive warnings and make the tests easier to perform.  We
# run the fromcanon code, then the tocanon code, and check to
# see that the outputs match.
#
# In addition, we do the same thing for a long string of random
# 8-bit characters, with the hope that this will test combinations
# of characters we may not have thought of.
#
# For some formats we also test a few special strings.  These are
# usually chosen by reading the source code of the converter and
# targeting any seemingly weak points.
#
# Lastly, we run these strings through a loop:
#   88591->can->troff->can->html->can->88591
# and check that they match.  After removing some characters, we try
#   88591->can->troff->can->apple->can->html->can->88591
# 


# Build the 256-byte string "\000" .. "\377": one occurrence of every
# 8-bit character, in ascending order.
$isostring = pack("C*", 0..255);

# Generate a random string of 8bit characters.  This should
#   be a long string so we test lots of things.
# We do this because although testing the 8bit characters in order
#   should find any problems, certain character combinations can
#   cause problems if the routine isn't written properly.  This may
#   stumble across a combination I didn't think of.
# We must print out the seed if there is an error, so that we can
#   duplicate the string that caused the error.
#
# Argh!  Perl4 coredumps with srand, and Perl5 coredumps with MSRNG.
# I don't know why they crash.  Perl5.002 seems to work with MSRNG.
#
# Build $randstring: $randstringlength random 8-bit characters.
# $seed ends up holding the seed that was actually used, so that the
# banner printed later lets a failing run be reproduced.
$randstring = '';
$randstringlength = 16385;  # should be at least 8192
$seed = undef;
$seed = 42 if $timing;      # fixed seed keeps timing runs comparable
if ( ($] >= 5) && ($] < 5.002) ) {
  # Perl 5.000 and 5.001 use the builtin generator instead of MSRNG.
  # Pick a seed ourselves when none was forced, and always seed with it.
  # (The old guard was inverted: it skipped srand when a seed had been
  # chosen, and called srand(undef) -- leaving $seed undefined -- when
  # none had.)
  $seed = (time ^ $$) unless defined $seed;
  srand($seed);
  $n = 0;
  while ($n++ < $randstringlength) {
    $randstring .= pack("C", int(rand(256)) );
  }
} else {
  # Everything else uses the msrand/mrand generator.  msrand is called
  # with the forced seed if there is one, and its return value is kept
  # as the seed to report.
  if (defined $seed) {
    $seed = &msrand($seed);
  } else {
    $seed = &msrand;
  }
  $n = 0;
  while ($n++ < $randstringlength) {
    $randstring .= pack("C", int(&mrand(256)) );
  }
}
# Timing runs want more data to chew on.
$randstring = $randstring x 5 if $timing;

# Canonical-form test string: the cs_ext marker followed by the hex
# digits 0026 (the code of '&', which the fromcanon checks expect back).
$cantest = $bib'cs_ext . '0026';


# The conversion routines expect their input to be escaped already,
# and they never unescape anything themselves (cs_ext and cs_meta are
# the exceptions they are supposed to handle).  So replace every
# cs_escape in both test strings with cs_char_escape, in place.
for ($isostring, $randstring) {
  s/$bib'cs_escape/$bib'cs_char_escape/go;
}


print "Testing character set routines, seed=$seed.\n";

if (!$skiptests) {

# Test the unicode helper routines.
# The 12 passed to begintest matches the checks below: 11 direct checks
# plus 1 summary check for the round-trip loop.
# NOTE(review): from the calls here, &check appears to take
# (options, routine name, expected result, argument) -- confirm
# against bptest.pl.
&begintest("bib'unicode", 12);

# canon -> unicode: a plain character, an 8-bit character, a cs_ext
# sequence (given with a lower-case hex digit, expected back in upper
# case), and the escape character itself.
&check('', "bib'canon_to_unicode", '006C', 'l');
&check('', "bib'canon_to_unicode", '00C4', "\304");
&check('', "bib'canon_to_unicode", 'A1C6', "${bib'cs_ext}A1c6");
&check('', "bib'canon_to_unicode", '001C', $bib'cs_char_escape);

# unicode -> canon: the reverse direction for the same kinds of input.
&check('', "bib'unicode_to_canon", '5', '0035');
&check('', "bib'unicode_to_canon", "\xE9", '00E9');
&check('', "bib'unicode_to_canon", "${bib'cs_ext}CF8A", 'CF8A');

# decimal <-> 4-digit hex; the last check feeds in lower-case hex.
&check('', "bib'decimal_to_unicode", '0023', 35);
&check('', "bib'decimal_to_unicode", '359B', 13723);
&check('', "bib'unicode_to_decimal", 35, '0023');
&check('', "bib'unicode_to_decimal", 13723, '359b');

# Round trip decimal -> unicode -> canon -> unicode -> decimal for every
# value 0..512 and compare the result with the starting value.
# $failed is remembered beforehand and compared afterwards, so the whole
# loop is reported through the single 'partial' check below.
# NOTE(review): 'nostatus,norun' presumably makes &check compare the two
# values silently without calling a routine -- confirm in bptest.pl.
$f = $failed;
for (0..512) {
  $can = &bib'unicode_to_canon(&bib'decimal_to_unicode($_));
  $val = &bib'unicode_to_decimal(&bib'canon_to_unicode($can));
  &check('nostatus,norun',"unicode loop", $_, $val);
}
$can = $val = undef;
&check('partial', "unicode loop", $f, $failed);

&endtest;

# Test ISO-8859-1
# Five checks, matching the count passed to testcharset.

&testcharset("8859-1", 5);

# The expected canonical form is a plain copy of the (already escaped)
# input, so both directions are expected to return their argument
# unchanged for the ordered string...
$caniso = $isostring;
&check('', "bp_cs_88591'tocanon", $caniso, $isostring);
&check('', "bp_cs_88591'fromcanon", $isostring, $caniso);
# ...and for the random string.  $caniso and $canran are kept, since the
# later charset sections start from them.
$canran = $randstring;
&check('', "bp_cs_88591'tocanon", $canran, $randstring);
&check('', "bp_cs_88591'fromcanon", $randstring, $canran);

# The cs_ext form of 0026 is expected to come back as a literal '&'.
&check('', "bp_cs_88591'fromcanon", '&', $cantest);

&endtest;

# Test Apple
# Five checks, matching the count passed to testcharset.

# NOTE(review): presumably switches bp's error reporting to printing
# from here on -- confirm what bib'errors('print') does.
&bib'errors('print');
&testcharset("apple", 5);
# Get rid of characters we don't handle: octal 200-237 plus a list of
# individual 8-bit characters.
$c = $caniso;
$c =~ s/[\200-\237\246\255\262\263\271\274\275\276\320\327\335\336\360\375\376]//g;
# Convert the filtered canonical string to apple, then expect tocanon to
# give the filtered string back.
$appiso = &bp_cs_apple'fromcanon($c);
# This check compares 1 with 1.
# NOTE(review): presumably it only records that fromcanon returned --
# confirm the meaning of 'partial' in bptest.pl.
&check('partial', "apple'fromcanon", 1, 1);
&check('', "bp_cs_apple'tocanon", $c, $appiso);
$c = $appiso = undef;

# Same round trip for the random string, with the same characters removed.
$c = $canran;
$c =~ s/[\200-\237\246\255\262\263\271\274\275\276\320\327\335\336\360\375\376]//g;
$appran = &bp_cs_apple'fromcanon($c);
&check('partial', "apple'fromcanon", 1, 1);
&check('', "bp_cs_apple'tocanon", $c, $appran);
$c = $appran = undef;

# The cs_ext form of 0026 is expected to come back as a literal '&'.
&check('', "bp_cs_apple'fromcanon", '&', $cantest);

&endtest;

# Test troff
# Five checks, matching the count passed to testcharset.

&testcharset("troff", 5);

# Copy the canonical ordered string and strip octal 200-237 from the copy.
($c = $caniso) =~ s/[\200-\237]//g;  # troff can't handle these
# Convert to troff, then expect tocanon to give the filtered string back.
$troiso = &bp_cs_troff'fromcanon($c);
# This check compares 1 with 1.
# NOTE(review): presumably it only records that fromcanon returned --
# confirm the meaning of 'partial' in bptest.pl.
&check('partial', "troff'fromcanon", 1, 1);
&check('', "bp_cs_troff'tocanon", $c, $troiso);
$c = $troiso = undef;

# Same round trip for the random string.
($c = $canran) =~ s/[\200-\237]//g;  # troff can't handle these
$troran = &bp_cs_troff'fromcanon($c);
&check('partial', "troff'fromcanon", 1, 1);
&check('', "bp_cs_troff'tocanon", $c, $troran);
$c = $troran = undef;

# The cs_ext form of 0026 is expected to come back as a literal '&'.
&check('', "bp_cs_troff'fromcanon", '&', $cantest);

&endtest;
( run in 0.999 second using v1.01-cache-2.11-cpan-9581c071862 )