unicode results from the CPAN

Bundle-PBib
#
# bibliography package for Perl
#
# Character set common variables and routines
#
# Dana Jacobsen (dana@acm.org)
# 18 November 1995 (last modified 17 March 1996)

# for bib'nocharset which calls fromcanon:
require "bp-cs-none.pl";

######
#
# Return canonical character for a unicode hex string.
#
sub unicode_to_canon {
  local($hex) = @_;

  $hex =~ tr/a-f/A-F/;

  # XXXXX Should we prepend '0' characters if we don't have 4 digits?
  if ($hex !~ /^[\dA-F]{4}$/) {
    &bib'gotwarn("Invalid Unicode character: $hex");
    return '';
  }
  if ($hex =~ /00(..)/) {
    return pack("C", hex($1));
  }
  return $bib'cs_ext . $hex;
}

sub canon_to_unicode {
  local($can) = @_;
  local($hex);

  if (length($can) == 1) {
    $hex = sprintf("%2lx", ord($can));
    $hex =~ tr/a-f /A-F0/;
    return( '00' . $hex );
  }
  if ($can =~ /$bib'cs_ext(....)/) {
    $hex = $1;
    $hex =~ tr/a-f/A-F/;
    return $hex;
  }
  if ($can eq $bib'cs_char_escape) {
    return &bib'canon_to_unicode($bib'cs_escape);
  }
  return &bib'gotwarn("Can't convert $can to Unicode");
}

sub decimal_to_unicode {
  local($num) = @_;
  local($hex);

  if ($num < 256) {
    $hex = sprintf("00%2lx", $num);
  } elsif ($num < 65536) {
    local($div) = $num / 256;
    local($high) = int($div);
    local($low) = 256 * ($div - $high);
    $hex = sprintf("%2lx%2lx", $high, $low);
  } else {
    return &bib'gotwarn("Illegal number $num given to decimal_to_unicode");
  }
  $hex =~ tr/a-f /A-F0/;
  $hex;
}

sub unicode_to_decimal {
  local($uni) = @_;

  return &bib'gotwarn("Illegal unicode length: $uni") unless length($uni) == 4;
  return &bib'gotwarn("Illegal unicode string: $uni") if $uni =~ /[^\da-fA-F]/;

  hex($uni);
}

sub unicode_name {
  local($hex) = @_;
  local($name);

  # For now, just print hex value
  $name = "Unicode '$hex'";
  $name;
}

sub meta_name {
  local($hex) = @_;
  local($name);

  # For now, just print hex value
  $name = "Meta '$hex'";
  $name;
}

# Oh boy, this is getting really complicated.
#
# We have an approx table set up, which says that one can approximate XXXX
# by YYYY, where presumably YYYY is easier.  There shouldn't be any loops,
# so programs can recurse through the table.
#
# That's for the meta codes.  For the unicode approx, we just have a
# string.  This allows multiple character approximations.
#
# XXXXX Think about C3's idea of multiple approximations.
#
# A map of 0000 means that it maps to the null string -- our "approximation"
# is to get rid of it.  This is what we can do if it isn't terribly harmful
# to remove it.

sub meta_approx {
  local($orig) = @_;

  require "${glb_bpprefix}p-cstab.pl" unless defined %bib'mapprox_tab;

  if (defined $mapprox_tab{$orig}) {
    return '' if $mapprox_tab{$orig} eq '0000';
    return "${bib'cs_meta}$mapprox_tab{$orig}";
  }
  undef;
}

sub unicode_approx {
  local($orig) = @_;

  # XXXXX Should we warn them that they're getting an approx?

  require "${glb_bpprefix}p-cstab.pl" unless defined %bib'uapprox_tab;

  $uapprox_tab{$orig};
}

######
#
# Font change checker.  Verifies and tries to correct font changes.
#
# After fonts are converted in your tocanon routine, call this.  In your
# fromcanon routine, you can assume this has been called.
#
# XXXXX Should we call this in bp.pl's conversion routines?  That would
#       guarantee that it has been run.  Right now, we let each module
#       decide when and if it needs to be run.
#
# It takes a string that has font changes in it and makes sure they always
# match up and that there isn't an odd number (more starts than ends, more
# ends than starts).
#

sub font_check {
  local($_) = @_;

  # XXXXX Ought to read meta information from 00 or as input.
  return $_ unless /${bib'cs_meta}01[01]/;

  local(@fontstack) = ();
  local($fontsmatch, $font, $pfont);

  # Check for this special occurance:  They don't have end fonts (or don't
  # use them).  They just make everything a begin font (troff often does this).
  # Solution: Try to fix it up by replacing each begin after the first with
  #           an endprevious / begin pair.  Then remove the last begin.
  if (!/${bib'cs_meta}011/) {
    local($pos) = $[;
    local($lpos) = 0;
    local($distance) = length($bib'cs_meta) + 3;
    local($n) = 0;
    while (($pos = index($_, "${bib'cs_meta}010", $pos)) >= $[) {
      $n++;
      if ($n == 1) {
        $lpos = $pos;
        $pfont = substr($_, $pos + $distance, 1);
        $pos++;
        next;
      }
      $lpos = $pos;
      $font = substr($_, $pos + $distance, 1);
      substr($_, $pos, 0) = "${bib'cs_meta}0110";
      $pos += ($distance*2); # need to skip over the one we just put in.
    }
    if ($n > 1) {
      # now remove the last begin
      substr($_, $lpos + $distance + 1, $distance + 1) = '';
    } elsif ($n == 1) {
( run in 0.571 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )