unicode results from the CPAN

Bundle-PBib

#
# bibliography package for Perl
#
# troff character set.
#
# Dana Jacobsen (dana@acm.org)
# 13 January 1995, rewrote 20 November 1995 for new canon.
# Last modified on 23 March 1996.

package bp_cs_troff;

######

$bib'charsets{'troff', 'i_name'} = 'troff';

$bib'charsets{'troff', 'tocanon'}  = "bp_cs_troff'tocanon";
$bib'charsets{'troff', 'fromcanon'} = "bp_cs_troff'fromcanon";

$bib'charsets{'troff', 'toesc'}   = '[\\\\]';
$bib'charsets{'troff', 'fromesc'} = "[\\\\\200-\377]|${bib'cs_ext}|${bib'cs_meta}";

######

$opt_doublebs = 1;

# variables used throughout the package
$unicode = '';
$mine = '';
$can = '';

# Rather than defining all our maps and running code for reverse maps at
# load time, we're going to embed them in functions.  When tocanon or
# fromcanon get called, we do the init if we haven't already.  This should
# save startup time -- especially if they never actually call our function!
# In the troff code in particular, the tocanon code needs a lot of reverse
# maps and eval code.  If we're doing xyz->troff, we don't need to load
# all of that.
$cs_init    = 0;
$cs_to_init = 0;
$cs_fr_init = 0;

######

sub init_cs {

# This uses the Roman-8 mapping for the rarer Latin-1 characters.
# XXXXX We need to think about which particular mapping we think is
#	best for the characters that have multiple mappings.  We might
#	even want options for groff, Roman-8, -ms, etc.
%charmap = (
'00A0',	'\0',
'00A1',	'\(r!',
'00A2',	'\(ct',
'00A3',	'\(Po',
'00A4',	'\(Cs',
'00A5',	'\(Ye',
'00A6',	'\(bb',
'00A7',	'\(sc',
'00A8',	'\(ad',
'00A9',	'\(co',
'00AA',	'\(Of',
'00AB',	'\(Fo',
'00AC',	'\(no',
'00AD',	'\(hy',
'00AE',	'\(rg',
'00AF',	'\(a-',
'00B0',	'\(de',
'00B1',	'\(+-',
'00B2',	'\(S2',
'00B3',	'\(S3',
'00B4',	'\(aa',
'00B5',	'\(*m',
'00B6',	'\(ps',
'00B7',	'\(md',
'00B8',	'\(ac',
'00B9',	'\(S1',
'00BA',	'\(Om',
'00BB',	'\(Fc',
'00BC',	'\(14',
'00BD',	'\(12',
'00BE',	'\(34',
'00BF',	'\(r?',
'00C0',	'A\*`',
'00C1',	'A\*\'',
'00C2',	'A\*^',
'00C3',	'A\*~',
'00C4',	'A\*:',

lib/Biblio/bp/lib/bp-cs-troff.pl view on Meta::CPAN

'00FB',	'u\*^',
'00FC',	'u\*:',
'00FD',	'y\*\'',
'00FE',	'\(Tp',
'00FF',	'y\*:',
'010D', 'c\*v',
'015F', 's\*,',
'017E', 'z\*v',
'2002',	'\ ',
'2003',	'\ \ ',
'2007',	'\|',  # I think this is wrong.  It's a "numsp" Number Space.
'2009',	'\^',
'2014',	'\-',
'201C',	'\*Q',
'201D',	'\*U',
);

%metamap = (
'0302',	'\u\s-3',  # superscript
'0312',	'\d\s-3',  # subscript
'030F',	'\s3\d',
'031F',	'\s3\u',
);

%fontmap = (
'0101', '\fR',
'0102', '\fI',
'0103', '\fB',
'0104', '\fC',
'0110', '\fP',	# simplistic, but it should work.
'0111', '\f1',	# we just go to font 1 when they want the previous font.
'0112', '\f1',	# or they end fonts.
'0113', '\f1',
'0114', '\f1',
);

  $cs_init = 1;
}

sub init_cs_fr {

  &init_cs unless $cs_init;

  # We're not using any eval strings at the moment, so there isn't anything
  # to do here.

  $cs_fr_init = 1;
}

sub init_cs_to {

  &init_cs unless $cs_init;

  # Build up a search string to do the reverse map.
  $cmap_eval = '';
  #$cmap_from_eval = '';
  %rmap = ();
  $mineE = '';

  # Step 1: Build a reverse map
  while (($unicode, $mine) = each %charmap) {
    $rmap{$mine} = &bib'unicode_to_canon( $unicode );
  }
  # Step 2: walk through the keys in sorted order
  #         (sigh, without a tree, this is still as slow as a dog)
  foreach $mine (sort keys %rmap) {
    $can = $rmap{$mine};
    $mineE = $mine;
    $mineE =~ s/(\W)/\\$1/g;
    if ( $mine !~ /\\\(../  &&  $mine !~ /.\\\*./ ) {
      $cmap_eval    .= "s/$mineE/$can/g;\n";
    }
    # This isn't being used right now.
    #$cmap_from_eval .= "s/$can/$mineE/g;\n";
  }
  # Leave rmap

  # These are characters that need to be mapped only in the to mapping.
  # There are a zillion different ways to write each symbol in troff, one for
  # each macro package and each implementation.
  # Nightmare 1: one troff character maps to different entities.  Example:
  #                  (groff)   \(Cs --> '00A4 CURRENCY SIGN
  #                  (roman8)  \(Cs --> '2660 BLACK SPADE SUIT
  # Nightmare 2: one entity maps to multiple characters, but no one of them
  #              is supported by a large group of implementations.
  %chartos = (
  'A\*a',		'00C5',
  'a\*a',		'00E5',
  'A\*o',		'00C5',
  'a\*o',		'00E5',
  'O\*/',		'00D8',
  'o\*/',		'00F8',
  '\*CC',		'010C',
  '\*Cc',		'010D',
  '\*CE',		'011A',
  '\*Ce',		'011B',
  '\*CL',		'013D',
  '\*Cl',		'013E',
  '\*CN',		'0147',
  '\*Cn',		'0148',
  '\*?',		'00BF',
  '\*!',		'00A1',
  '\(n~',		'00F1',
  );
  
  $cmap_to_eval = '';
  foreach $mine (sort keys %chartos) {
    $can = &bib'unicode_to_canon( $chartos{$mine} );
    $mineE = $mine;
    $mineE =~ s/(\W)/\\$1/g;
    if ( $mine !~ /\\\(../  &&  $mine !~ /.\\\*./ ) {
      $cmap_to_eval  .= "s/$mineE/$can/g;\n";
    } else {
      # Mapped up front with the rest.
      if (defined $rmap{$mine}) {
        &bib'goterror("Error in troff tables -- duplicate entry for $mine.");
      }
      $rmap{$mine} = $can;
    }
  }

  $cs_to_init = 1;
}


#####################


sub tocanon {
  local($_, $protect) = @_;

  &bib'panic("cs-troff tocanon called with no arguments!") unless defined $_;

  # always check to see if we have any characters to change
  # (with our toesc search string, is this first check necessary?)
  return $_  unless /\\/;

  # do this even if we don't have opt_doublebs on
  s/\\\\/\\/g;  

  &init_cs_to unless $cs_to_init;

  study;

  # Check for accents of the form \(xx and x\*x
  if (/\\[(*]/) {
    while (/(\\\(..)/) {
      $repl = $1;
      if (!defined $rmap{$repl}) {
        &bib'gotwarn("Unknown troff special $repl");
        $can = '';
      } else {
        $can = $rmap{$repl};
      }
      $repl =~ s/(\W)/\\$1/g;
      s/$repl/$can/g;
    }
    # Next check for all characters of the form x\*x
    while (/(.\\\*.)/) {
      $repl = $1;
      if (!defined $rmap{$repl}) {
        next if s/\\\*Q/${bib'cs_ext}201C/go;
        next if s/\\\*U/${bib'cs_ext}201D/go;
        &bib'gotwarn("Unknown troff special $repl");
        $can = '';
      } else {
        $can = $rmap{$repl};
      }
      $repl =~ s/(\W)/\\$1/g;
      s/$repl/$can/g;
    }

    return $_  unless /\\/;
  }

  eval $cmap_eval;

  return $_  unless /\\/;

  # OK, they've got something fairly weird.

  # Handle the different ways of specifying characters
  eval $cmap_to_eval;

  return $_  unless /\\/;

  if (/\\f[123RIBP]/) {
    # font changes
    while (/\\f([123RIBP])/) {
      $repl = $1;
      $repl eq 'P'    && ($mine = $bib'cs_meta . '0110');
      $repl =~ /[1R]/ && ($mine = $bib'cs_meta . '0101');
      $repl =~ /[2I]/ && ($mine = $bib'cs_meta . '0102');
      $repl =~ /[3B]/ && ($mine = $bib'cs_meta . '0103');
      s/\\f$repl/$mine/g;
    }
    $_ = &bib'font_check($_);
  }

  while (($unicode, $mine) = each %metamap) {
    $mine =~ s/(\W)/\\$1/g;
    s/$mine/${bib'cs_meta}$can/g;
  }

  return $_  unless /\\/;

  # Last of all, the escape character.  First we check to see if there is
  # anything else.  We can't delete it because of the way troff does it's
  # coding.
  if (/\\[^e]/) {
    &bib'gotwarn("Unknown troff characters in '$_'");
  }
  # Then convert the escape character
  s/\\e/\\/g;

  $_;
}

######

sub fromcanon {
  local($_, $protect) = @_;
  local($repl);

  &bib'panic("cs-troff fromcanon called with no arguments!") unless defined $_;

  s/\\/\\e/g;

  # tr/\200-\237//d && &bib'gotwarn("Zapped chars.");
  if (/[\200-\237]/) {
    while (/([\200-\237])/) {
      $repl = $1;
      $unicode = &bib'canon_to_unicode($repl);
      &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to troff");
      s/$repl//g;
    }
  }

  &init_cs_fr unless $cs_fr_init;

  # Which one of these to use probably depends on the frequency of
  # special characters.  The first method will be best with only one
  # or two, but the second is better if there are a lot.
  while (/([\240-\377])/g) {
    $repl = $1;
    $unicode = &bib'canon_to_unicode($repl);
    s/$repl/$charmap{$unicode}/g;
  }
  # Note that the definition of cmap_from_eval is now commented out above.
  #if (/[\240-\377]/) {
  #  eval $cmap_from_eval;
  #}

  # should we make the output have double backslashes?
  $opt_doublebs  &&  s/\\/\\\\/g;

  # Maybe we can go now?
  return $_ unless /$bib'cs_escape/o;

  while (/${bib'cs_ext}(....)/) {
    $unicode = $1;
    if ($unicode =~ /^00[0-7]/) {
      1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
      next;
    }
    defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
                               && next;

    $can = &bib'unicode_approx($repl);
    defined $can  &&  s/$bib'cs_ext$repl/$can/g  &&  next;

    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to troff");
    s/${bib'cs_ext}$unicode//g;
  }

  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    defined $fontmap{$repl} && s/${bib'cs_meta}$repl/$fontmap{$repl}/g
                            && next;
    defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
                            && next;

    $can = &bib'meta_approx($repl);
    defined $can  &&  s/$bib'cs_meta$repl/$can/g  &&  next;

    &bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to troff");
    s/${bib'cs_meta}$repl//g;
  }

  $_;
}

#######################
# end of package
#######################

1;

( run in 0.575 second using v1.01-cache-2.11-cpan-99c4e6809bf )