Bundle-PBib


lib/Biblio/BP.pm  view on Meta::CPAN

#
#    load_converter($converter_name);
#
#    find_bp_files();
#    find_bp_files($rehash);
#
#    reg_format($long_name, $short_name, $pkg_name, $charset_name, @info);
#
#           [ file bp-p-cs ]
#
#    unicode_to_canon($unicode);
#
#    canon_to_unicode($character);
#
#    decimal_to_unicode($number);
#
#    unicode_to_decimal($unicode);
#
#    unicode_name($unicode);
#
#    meta_name($metacode);
#
#    meta_approx($metacode);
#
#    unicode_approx($unicode);
#
#    nocharset($string);
#
#           [ file bp-p-util ]
#
#    bp_util'mname_to_canon($names_string);
#    bp_util'mname_to_canon($names_string, $flag_reverse_author);
#
#    bp_util'name_to_canon($name_string);
#    bp_util'name_to_canon($name_string, $flag_reverse_author);
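#
#           [ illustrative usage -- a sketch for clarity; the return
#             values follow the checks in tests/newcset.pl, and the
#             name string below is made up ]
#
#    $can = &bib'unicode_to_canon('00E9');    # canonical char for U+00E9
#    $uni = &bib'canon_to_unicode($can);      # '00E9' again
#    $uni = &bib'decimal_to_unicode(233);     # '00E9'
#    $num = &bib'unicode_to_decimal('00E9');  # 233
#
#    $names = &bp_util'mname_to_canon('John Smith and Jane Doe');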

lib/Biblio/bp/CHANGELOG  view on Meta::CPAN

		Month and Year were reversed, and were short
		output_date is now in bp-p-utils
  bp-p-utils	Added output_date
		Fixed multiple von prefix parsing in mname_to_canon
  bp-s-generic	School name in report
		s/,"$/."/ at ending
  bp-output	Added style name to trailer
  bp-canon	New format, used for testing mainly
  bp-cs-canon	New charset, used for testing
  bp-refer	Changes to EndNote %E parsing
  bp-cs-dead	Forgot \\ in front of $charmap{$unicode} in fromcanon
  bp-procite	Wrote fromcanon routine
  bp-cs-html	Added support for &U+xxxx; characters
		Added call to font_noprev
		Changed some // calls to //o and s/// to s///o.
  bp-p-cs	Added font_noprev routine
		changed fontcheck to font_check

--------- version 0.2.2 released ---------

14 Mar 96:

lib/Biblio/bp/CHANGELOG  view on Meta::CPAN

  bp-medline.pl:  Added support for Entrez Medlars to Medline format.
                  Added support for Entrez HTML output (notitles).

15 Jan 96:
  bp-inspec4.pl:  Wrote INSPEC format, style 4.
  bp-inspec.pl:   Wrote INSPEC general format.
  bp-p-dload.pl:  Added comment about unreg_format function needed.

2 Dec 95:
  bp.pl:          Bumped version number to 0.2.1.
  bp-cs-tab.pl:   Added 4 greek characters to unicode approx table.

30 Nov 95:
  bp-cs-html.pl:  Newline before <BLOCKQUOTE>.

--------- version 0.2.0 released ---------

--------- Changelog started ---------

lib/Biblio/bp/lib/bp-cs-8859-1.pl  view on Meta::CPAN

  # We're eight bit ISO-8859-1, so there isn't anything for us to do.
  # We assume here that the escape character is already done.

  $_[0];
}

######

sub fromcanon {
  local($_, $protect) = @_;
  local($repl, $unicode, $can);

  return $_ unless /$bib'cs_escape/o;

  1 while s/${bib'cs_ext}00(..)/&bib'unicode_to_canon('00'.$1)/ge;

  while (/${bib'cs_ext}(....)/) {
    $unicode = $1;
    $can = &bib'unicode_approx($unicode);
    defined $can  &&  s/$bib'cs_ext$unicode/$can/g  &&  next;
    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to ISO-8859-1");
    s/${bib'cs_ext}$unicode//g;
  }
  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    $can = &bib'meta_approx($repl);
    defined $can  &&  s/$bib'cs_meta$repl/$can/g  &&  next;
    &bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to ISO-8859-1");
    s/${bib'cs_meta}$repl//g;
  }

  $_;

lib/Biblio/bp/lib/bp-cs-apple.pl  view on Meta::CPAN

'02D8', 249,  # BREVE
'02D9', 250,  # DOT ABOVE (Mandarin Chinese light tone)
'02DA', 251,  # RING ABOVE
'00B8', 252,  # CEDILLA
'02DD', 253,  # DOUBLE ACUTE ACCENT
'02DB', 254,  # OGONEK
'02C7', 255,  # CARON (Mandarin Chinese third tone)
);
# Table done.

$unicode = '';
$repl = '';
$can = '';

$eb_eval_fromcanon = '';
$eb_eval_tocanon = '';
$eb_nomapC = '';
$eb_nomapA = '';
$eb_mapC = '';
$eb_mapA = '';

#
# Build the eval string for the fromcanon code.
#
# For each 8bit code, we either:
#   1) don't have a character for this code.  So we zap and complain.
#   2) we do know it, so we translate them all at once after we're done
#      with all the ones we don't know.
#
foreach $can (128..255) {
  $unicode = &bib'decimal_to_unicode($can);
  $repl =  pack("C", $can);
  if (defined $umap{$unicode}) {
    $eb_mapC .= $repl;
    $eb_mapA .= pack("C", $umap{$unicode});
  } else {
    $eb_nomapC .= $repl;
    $eb_eval_fromcanon .= "tr/$repl//d && \&bib'gotwarn(\"Can't convert "
                       . &bib'unicode_name($unicode) . " to Apple\");\n";
  }
}
substr($eb_eval_fromcanon,0,0) = "if (/[$eb_nomapC]/) {\n";
$eb_eval_fromcanon .= "}\ntr/$eb_mapC/$eb_mapA/;\n";
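
# For illustration (made-up byte values, not the actual generated string):
# $eb_eval_fromcanon ends up holding Perl source along the lines of
#
#   if (/[\x81\x8D]/) {
#   tr/\x81//d && &bib'gotwarn("Can't convert Unicode '0081' to Apple");
#   tr/\x8D//d && &bib'gotwarn("Can't convert Unicode '008D' to Apple");
#   }
#   tr/\xC4\xE9/\x80\x8E/;
#
# which fromcanon later runs with eval against $_.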

#
# Build the eval string for the tocanon code.
#
# nomapA just means there isn't a direct 8bit replacement.  We just insert
# the extended character.
#
foreach $unicode (keys %umap) {
  next if $unicode =~ /^00/;
  $repl = pack("C", $umap{$unicode});
  $eb_nomapA .= $repl;
  $eb_eval_tocanon .= "s/$repl/$bib'cs_ext$unicode/g;\n";
}
substr($eb_eval_tocanon,0,0) = "if (/[$eb_nomapA]/) {\n";
$eb_eval_tocanon .= "}\ntr/$eb_mapA/$eb_mapC/;\n";



#####################

sub tocanon {
  local($_, $protect) = @_;

lib/Biblio/bp/lib/bp-cs-apple.pl  view on Meta::CPAN

  #  step 2: Use tr/<canons>/<apples>/ to translate all the two-way
  #          mapped characters right across.
  eval $eb_eval_fromcanon;

  return $_ unless /$bib'cs_escape/o;

  # The standard 7bit map.
  1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;

  while (/${bib'cs_ext}(....)/) {
    $unicode = $1;
    defined $umap{$unicode}
             && s/${bib'cs_ext}$unicode/pack("C", $umap{$unicode})/ge
             && next;
    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to Apple");
    s/${bib'cs_ext}$unicode//g;
  }
  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    &bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to Apple");
    s/${bib'cs_meta}$repl//g;
  }

  $_;
}

lib/Biblio/bp/lib/bp-cs-dead.pl  view on Meta::CPAN

$bib'charsets{'dead', 'tocanon'}   = "bp_cs_dead'tocanon";
$bib'charsets{'dead', 'fromcanon'} = "bp_cs_dead'fromcanon";

$bib'charsets{'dead', 'toesc'}   = "[\\\\]";
$bib'charsets{'dead', 'fromesc'} = "[\\\200-\377]|${bib'cs_ext}|${bib'cs_meta}";

######

# package variables for anyone to use
$repl = '';
$unicode = '';
$can = '';

######

# XXXXX Should charmap include the \ character also?

%charmap = (
#00A0
#00..
#00BE

lib/Biblio/bp/lib/bp-cs-dead.pl  view on Meta::CPAN

'0268',	'-i',
);

# Secondary mappings
%charmap2 = (
'sz',	'00DF',
);

# Build a reverse map and eval string
$reval = '';
while (($unicode, $repl) = each %charmap) {
  $can = &bib'unicode_to_canon( $unicode );
  if ($repl =~ /^[-`'^~",v]/) {
    $rmap{$repl} = $can;
  } else {
    $repl =~ s/(\W)/\\$1/g;
    $reval .= "s/\\\\$repl/$can/g;\n";
  }
}

# continue the same reverse map for the secondary mappings.
while (($repl, $unicode) = each %charmap2) {
  $can = &bib'unicode_to_canon( $unicode );
  if ($repl =~ /^[-`'^~",v]/) {
    $rmap{$repl} = $can;
  } else {
    $repl =~ s/(\W)/\\$1/g;
    $reval .= "s/\\\\$repl/$can/g;\n";
  }
}


######

lib/Biblio/bp/lib/bp-cs-dead.pl  view on Meta::CPAN

  local($_, $protect) = @_;

  if ($protect) {
    s/\\/\\\\/go;
  }

  s/\240/ /g;
  s/\255/-/g;
  while (/([\200-\377])/) {
    $repl = $1;
    $unicode = &bib'canon_to_unicode($repl);
    if (defined $charmap{$unicode}) {
      s/$repl/\\$charmap{$unicode}/g;
    } else {
      &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to dead-key");
      s/$repl//g;
    }
  }

  return $_ unless /$bib'cs_escape/o;

  while (/${bib'cs_ext}(....)/) {
    $unicode = $1;
    if ($unicode =~ /^00[0-7]/) {   # 7-bit characters
      1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
      next;
    }
    defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/\\$charmap{$unicode}/g
                               && next;
    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to dead-key");
    s/${bib'cs_ext}$unicode//g;
  }

  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    &bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to dead-key");
    s/${bib'cs_meta}$repl//g;
  }

  $_;
}

lib/Biblio/bp/lib/bp-cs-html.pl  view on Meta::CPAN

# to this one.  References for those characters may be found in
#   http://www.w3.org/hypertext/WWW/MarkUp/html3/mathsym.html
# Also see
#   http://www.to.icl.fi/~aj/html3_test/entities.html
#
# Big additions in HTML3 include the greek characters and proper typesetting
# spaces.
#
# As referenced in:
#   http://www.acl.lanl.gov/HTML_WG/html-wg-96q1.messages/0102.html
# I am going to allow the &U+xxxx; form for unicode characters for HTML 3.
# I'm not sure if this is going to stay in the standard however.
#
# Specifically, I use the named references as opposed to the numeric, as that
# is recommended in the html3 draft spec, and it is much easier to read.
# Reading through the working notes of the HTML_WG, it looks like they
# specifically recommend translating Numerical Character Entities of Latin-1
# into the real things.  See reference:
#   http://www.acl.lanl.gov/HTML_WG/html-wg-95q1.messages/0920.html
# about the can of worms this opens.  Specifically, &#215; refers to _different_
# characters depending on which ISO 8859-x group you're working with!  The

lib/Biblio/bp/lib/bp-cs-html.pl  view on Meta::CPAN

'021F', '/SMALL',
);

  $cs_init = 1;
}

sub init_cs_fr {

  &init_cs unless $cs_init;

  # XXXXX We should just use unicode_approx.
  # Map various unicode entities to our stuff.
  %charmap_from = (
  '2212', '-',
  '2013', '--',
  '2014', '---',
  '2002', ' ',    # These two are probably wrong.
  '2003', '  ',
  );

  # HTML 2.0 Secondary meta mappings for fromcanon
  # Note that these do _not_ get wrapped in <> like metamap_2 does.

lib/Biblio/bp/lib/bp-cs-html.pl  view on Meta::CPAN


#####################


# Essentially we're ISO-8859-1, but with a few protected characters
# and some extra ones.
# XXX protect lonely & characters

sub tocanon {
  local($_, $protect) = @_;
  local($repl, $can, $mine, $unicode);

  &bib'panic("cs-html tocanon called with no arguments!") unless defined $_;

  # Check to see if we have anything to do.
  return $_ unless /<|\&/;

  # XXXXX Ignore named Latin-1 for now.

  # Handle links first.
  local($link, $text);

lib/Biblio/bp/lib/bp-cs-html.pl  view on Meta::CPAN

  # To prevent matching this, we leave it in extended format.
  s/&amp;/${bib'cs_ext}0026/go;

  # always check to see if we have any characters left to change
  return $_  unless /\&/;

  # This should handle 8 bit chars correctly as well as the new UCS-2, cf.
  #   <http://www.ics.uci.edu/pub/ietf/html/draft-ietf-html-i18n-03.txt>
  while (/\&#(\d+);/) {
    $repl = $1;
    $can = &bib'unicode_to_canon(  &bib'decimal_to_unicode($repl)  );
    s/\&#$repl;/$can/g;
  }
  # This handles the new &U+xxxx; form.  I need to find a good reference
  # for this.
  while (/\&U\+([\dA-Fa-f]{4});/) {
    $repl = $1;
    $can = &bib'unicode_to_canon( $repl );
    s/\&U\+$repl;/$can/g;
  }

  # Now for the named entities.

  while (/\&(\S*);/) {
    $repl = $1;
    $repl =~ s/(\W)/\\$1/g;
    if (defined $entitynames{$repl}) {
      $can = pack("C", $entitynames{$repl});
      s/\&$repl;/$can/g;
      next;
    }
    # We now search through the charmaps to see if they contain our character.
    # Terribly inefficient, but we shouldn't be doing this often.
    if (defined $opt_html3) {
      $can = undef;
      foreach $unicode (keys %charmap_3) {
        next unless $charmap_3{$unicode} eq $repl;
        # only here if we found the map.
        $can = &bib'unicode_to_canon($unicode);
        s/\&$repl;/$can/g;
        last;
      }
      next if defined $can;
    }
    $can = undef;
    foreach $unicode (keys %charmap_2) {
      next unless $charmap_2{$unicode} eq $repl;
      # only here if we found the map.
      $can = &bib'unicode_to_canon($unicode);
      s/\&$repl;/$can/g;
      last;
    }
    next if defined $can;

    &bib'gotwarn("Unknown HTML entity: \&$repl; : in $_");
    s/\&$repl;//g;
  }
  s/&(\s)/${bib'cs_ext}0026$1/g;

lib/Biblio/bp/lib/bp-cs-html.pl  view on Meta::CPAN

######

sub fromcanon {
  local($_, $protect) = @_;
  local($repl, $can);

  &bib'panic("cs-html fromcanon called with no arguments!") unless defined $_;

  # Leave 8bit characters alone since we're assuming ISO-8859-1 HTML.

  #1 while s/${bib'cs_ext}(00..)/&bib'unicode_to_canon($1)/ge;

  # XXXXX  I think these should go here, as they create a mess if they go
  #        after the ext and meta maps.

  s/\&/\&amp;/g;
  s/</\&lt;/g;
  s/>/\&gt;/g;
  s/${bib'cs_ext}0026/\&amp;/go;

  return $_ unless /$bib'cs_escape/o;

lib/Biblio/bp/lib/bp-cs-html.pl  view on Meta::CPAN

               && next;
    defined $charmap_2{$repl}
               && s/$bib'cs_ext$repl/\&$charmap_2{$repl};/g
               && next;
    defined $charmap_from{$repl}
               && s/$bib'cs_ext$repl/$charmap_from{$repl}/g
               && next;

    $opt_html3  &&  s/$bib'cs_ext$repl/\&U\+$repl;/g  &&  next;

    $can = &bib'unicode_approx($repl);
    defined $can  &&  s/$bib'cs_ext$repl/$can/g  &&  next;

    &bib'gotwarn("Can't convert ".&bib'unicode_name($repl)." to HTML");
    s/${bib'cs_ext}$repl//g;
  }

  # XXXXX We need to deal with font changes.
  $_ = &bib'font_noprev($_) if /${bib'cs_meta}0110/o;

  while (/${bib'cs_meta}(....)/o) {
    $repl = $1;
    # We need to do these in order of most to least inclusive.
    # That way we can get, say ThinSpace to map to a thin space in HTML3

lib/Biblio/bp/lib/bp-cs-none.pl  view on Meta::CPAN

      s/\xE6/ae/g;
      s/[\200-\377]//g;
    }
  }

  if (/$bib'cs_escape/o) {
    local($repl, $can);
    s/${bib'cs_meta}....//g;
    while (/${bib'cs_ext}(....)/) {
      $repl = $1;
      $can = &bib'unicode_approx($repl);
      defined $can  &&  s/$bib'cs_ext$repl/$can/g  &&  next;
      s/${bib'cs_ext}$repl//g;
    }
  }
  tr/\x00-\x1F//d;

  $_;
}

#######################

lib/Biblio/bp/lib/bp-cs-tex.pl  view on Meta::CPAN

$bib'charsets{'tex', 'toesc'}   = "[\$\\\\]";
# XXXXX We have so many characters to protect, should we even bother?
$bib'charsets{'tex', 'fromesc'} = "[\\#\$\%\&{}_\|><\^~\200-\377]|${bib'cs_ext}|${bib'cs_meta}";

######

$cs_init = 0;

# package variables for anyone to use
$mine = '';
$unicode = '';
$can = '';

######

sub init_cs {

# Thorn and eth are really nasty since they don't exist in the standard TeX
# fonts.  This is what I came up with in r2b to fake it.  Fortunately they
# aren't used often.  Get the cmoer fonts if you want to do them right.
# My eth is pretty nice, but the thorn leaves a little to be desired.

lib/Biblio/bp/lib/bp-cs-tex.pl  view on Meta::CPAN


# Build up a search string to do the reverse map.
$cmap_to_eval = '';
$cmap_from8_eval = '';
$cmap_to_eval_1 = '';
$cmap_to_eval_2 = '';
%rmap = ();
%accent = ();

# Step 1: Build a reverse map
while (($unicode, $mine) = each %charmap) {
  $rmap{$mine} = $unicode;
}
# Step 2: walk through the keys in sorted order
local($mineE);
foreach $mine (sort keys %rmap) {
  $can = &bib'unicode_to_canon( $rmap{$mine} );
  $mineE = $mine;
  $mineE =~ s/(\W)/\\$1/g;
  # The various maps for tocanon
  if ($mine =~ /^{\\([`'^"~])([\w])}$/) {
    $accent{$1 . $2} = $can;
  } elsif ($mine =~ /^{\\([vc])(\w)}$/) {
    $accent{$1 . $2} = $can;
  } elsif ($mine =~ /^{\\([vc]){(\w)}}$/) {
    $accent{$1 . $2} = $can;
  } elsif ($mine =~ /leavevmode/) {

lib/Biblio/bp/lib/bp-cs-tex.pl  view on Meta::CPAN

    s/\|/\$\|\$/g;
    s/>/\$>\$/g;
    s/</\$<\$/g;
    s/\^/\\^{}/g;
    s/~/\\~{}/g;
    s/$bib'cs_temp/\$\\backslash\$/go;
  }

  while (/([\200-\237])/) {
    $repl = $1;
    $unicode = &bib'canon_to_unicode($repl);
    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
    s/$repl//g;
  }

  &init_cs unless $cs_init;

  #if (/[\240-\377]/) {
  #  eval $cmap_from8_eval;
  #}
  s/\240/~/g;
  s/\255/-/g;
  while (/([\240-\377])/) {
    $repl = $1;
    $unicode = &bib'canon_to_unicode($repl);
    s/$repl/$charmap{$unicode}/g;
  }

  # Maybe we can go now?
  return $_ unless /$bib'cs_escape/o;

  while (/${bib'cs_ext}(....)/) {
    $unicode = $1;
    if ($unicode =~ /^00[0-7]/) {
      1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
      next;
    }
    defined $charmap{$unicode}  && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
                                && next;
    defined $charmap2{$unicode} && s/${bib'cs_ext}$unicode/$charmap2{$unicode}/g
                                && next;

    $can = &bib'unicode_approx($unicode);
    defined $can  &&  s/$bib'cs_ext$unicode/$can/g  &&  next;

    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
    s/${bib'cs_ext}$unicode//g;
  }

  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
                            && next;

    $can = &bib'meta_approx($repl);
    defined $can  &&  s/$bib'cs_meta$repl/$can/g  &&  next;

lib/Biblio/bp/lib/bp-cs-troff.pl  view on Meta::CPAN

$bib'charsets{'troff', 'fromcanon'} = "bp_cs_troff'fromcanon";

$bib'charsets{'troff', 'toesc'}   = '[\\\\]';
$bib'charsets{'troff', 'fromesc'} = "[\\\\\200-\377]|${bib'cs_ext}|${bib'cs_meta}";

######

$opt_doublebs = 1;

# variables used throughout the package
$unicode = '';
$mine = '';
$can = '';

# Rather than defining all our maps and running code for reverse maps at
# load time, we're going to embed them in functions.  When tocanon or
# fromcanon get called, we do the init if we haven't already.  This should
# save startup time -- especially if they never actually call our function!
# In the troff code in particular, the tocanon code needs a lot of reverse
# maps and eval code.  If we're doing xyz->troff, we don't need to load
# all of that.
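#
# In outline, that pattern looks like this (a sketch of the idea above,
# not the real routines; the table contents are elided):
#
#   $cs_init = 0;
#   sub init_cs {
#     %charmap = ( ... );            # forward map, built only on demand
#     $cs_init = 1;
#   }
#   sub tocanon {
#     local($_, $protect) = @_;
#     &init_cs unless $cs_init;      # pay the table-building cost lazily
#     ...
#   }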

lib/Biblio/bp/lib/bp-cs-troff.pl  view on Meta::CPAN


  &init_cs unless $cs_init;

  # Build up a search string to do the reverse map.
  $cmap_eval = '';
  #$cmap_from_eval = '';
  %rmap = ();
  $mineE = '';

  # Step 1: Build a reverse map
  while (($unicode, $mine) = each %charmap) {
    $rmap{$mine} = &bib'unicode_to_canon( $unicode );
  }
  # Step 2: walk through the keys in sorted order
  #         (sigh, without a tree, this is still as slow as a dog)
  foreach $mine (sort keys %rmap) {
    $can = $rmap{$mine};
    $mineE = $mine;
    $mineE =~ s/(\W)/\\$1/g;
    if ( $mine !~ /\\\(../  &&  $mine !~ /.\\\*./ ) {
      $cmap_eval    .= "s/$mineE/$can/g;\n";
    }

lib/Biblio/bp/lib/bp-cs-troff.pl  view on Meta::CPAN

  '\*Cl',		'013E',
  '\*CN',		'0147',
  '\*Cn',		'0148',
  '\*?',		'00BF',
  '\*!',		'00A1',
  '\(n~',		'00F1',
  );
  
  $cmap_to_eval = '';
  foreach $mine (sort keys %chartos) {
    $can = &bib'unicode_to_canon( $chartos{$mine} );
    $mineE = $mine;
    $mineE =~ s/(\W)/\\$1/g;
    if ( $mine !~ /\\\(../  &&  $mine !~ /.\\\*./ ) {
      $cmap_to_eval  .= "s/$mineE/$can/g;\n";
    } else {
      # Mapped up front with the rest.
      if (defined $rmap{$mine}) {
        &bib'goterror("Error in troff tables -- duplicate entry for $mine.");
      }
      $rmap{$mine} = $can;

lib/Biblio/bp/lib/bp-cs-troff.pl  view on Meta::CPAN

      $repl = $1;
      $repl eq 'P'    && ($mine = $bib'cs_meta . '0110');
      $repl =~ /[1R]/ && ($mine = $bib'cs_meta . '0101');
      $repl =~ /[2I]/ && ($mine = $bib'cs_meta . '0102');
      $repl =~ /[3B]/ && ($mine = $bib'cs_meta . '0103');
      s/\\f$repl/$mine/g;
    }
    $_ = &bib'font_check($_);
  }

  # Map each troff sequence back to its meta code (the key of %metamap,
  # held in $unicode here, is the four-digit meta code).
  while (($unicode, $mine) = each %metamap) {
    $mine =~ s/(\W)/\\$1/g;
    s/$mine/${bib'cs_meta}$unicode/g;
  }

  return $_  unless /\\/;

  # Last of all, the escape character.  First we check to see if there is
  # anything else.  We can't delete it because of the way troff does its
  # coding.
  if (/\\[^e]/) {

lib/Biblio/bp/lib/bp-cs-troff.pl  view on Meta::CPAN

  local($repl);

  &bib'panic("cs-troff fromcanon called with no arguments!") unless defined $_;

  s/\\/\\e/g;

  # tr/\200-\237//d && &bib'gotwarn("Zapped chars.");
  if (/[\200-\237]/) {
    while (/([\200-\237])/) {
      $repl = $1;
      $unicode = &bib'canon_to_unicode($repl);
      &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to troff");
      s/$repl//g;
    }
  }

  &init_cs_fr unless $cs_fr_init;

  # Which one of these to use probably depends on the frequency of
  # special characters.  The first method will be best with only one
  # or two, but the second is better if there are a lot.
  while (/([\240-\377])/g) {
    $repl = $1;
    $unicode = &bib'canon_to_unicode($repl);
    s/$repl/$charmap{$unicode}/g;
  }
  # Note that the definition of cmap_from_eval is now commented out above.
  #if (/[\240-\377]/) {
  #  eval $cmap_from_eval;
  #}

  # should we make the output have double backslashes?
  $opt_doublebs  &&  s/\\/\\\\/g;

  # Maybe we can go now?
  return $_ unless /$bib'cs_escape/o;

  while (/${bib'cs_ext}(....)/) {
    $unicode = $1;
    if ($unicode =~ /^00[0-7]/) {
      1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
      next;
    }
    defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
                               && next;

    $can = &bib'unicode_approx($unicode);
    defined $can  &&  s/$bib'cs_ext$unicode/$can/g  &&  next;

    &bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to troff");
    s/${bib'cs_ext}$unicode//g;
  }

  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    defined $fontmap{$repl} && s/${bib'cs_meta}$repl/$fontmap{$repl}/g
                            && next;
    defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
                            && next;

    $can = &bib'meta_approx($repl);

lib/Biblio/bp/lib/bp-cs-utf8.pl  view on Meta::CPAN



sub tocanon {
  $_[0];
}

######

sub fromcanon {
  local($_, $protect) = @_;
  local($repl, $unicode, $can);

  return $_ unless /$bib'cs_escape/o;

  1 while s/${bib'cs_ext}(....)/\X{$1}/g;

  while (/${bib'cs_meta}(....)/) {
    $repl = $1;
    $can = &bib'meta_approx($repl);
    defined $can  &&  s/$bib'cs_meta$repl/$can/g  &&  next;
    &bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to UTF8");

lib/Biblio/bp/lib/bp-p-cs.pl  view on Meta::CPAN

# Character set common variables and routines
#
# Dana Jacobsen (dana@acm.org)
# 18 November 1995 (last modified 17 March 1996)

# for bib'nocharset which calls fromcanon:
require "bp-cs-none.pl";

######
#
# Return canonical character for a unicode hex string.
#
sub unicode_to_canon {
  local($hex) = @_;

  $hex =~ tr/a-f/A-F/;

  # XXXXX Should we prepend '0' characters if we don't have 4 digits?
  if ($hex !~ /^[\dA-F]{4}$/) {
    &bib'gotwarn("Invalid Unicode character: $hex");
    return '';
  }
  if ($hex =~ /00(..)/) {
    return pack("C", hex($1));
  }
  return $bib'cs_ext . $hex;
}

sub canon_to_unicode {
  local($can) = @_;
  local($hex);

  if (length($can) == 1) {
    $hex = sprintf("%2lx", ord($can));
    $hex =~ tr/a-f /A-F0/;
    return( '00' . $hex );
  }
  if ($can =~ /$bib'cs_ext(....)/) {
    $hex = $1;
    $hex =~ tr/a-f/A-F/;
    return $hex;
  }
  if ($can eq $bib'cs_char_escape) {
    return &bib'canon_to_unicode($bib'cs_escape);
  }
  return &bib'gotwarn("Can't convert $can to Unicode");
}

sub decimal_to_unicode {
  local($num) = @_;
  local($hex);

  if ($num < 256) {
    $hex = sprintf("00%2lx", $num);
  } elsif ($num < 65536) {
    local($div) = $num / 256;
    local($high) = int($div);
    local($low) = 256 * ($div - $high);
    $hex = sprintf("%2lx%2lx", $high, $low);
  } else {
    return &bib'gotwarn("Illegal number $num given to decimal_to_unicode");
  }
  $hex =~ tr/a-f /A-F0/;
  $hex;
}

sub unicode_to_decimal {
  local($uni) = @_;

  return &bib'gotwarn("Illegal unicode length: $uni") unless length($uni) == 4;
  return &bib'gotwarn("Illegal unicode string: $uni") if $uni =~ /[^\da-fA-F]/;

  hex($uni);
}

sub unicode_name {
  local($hex) = @_;
  local($name);

  # For now, just print hex value
  $name = "Unicode '$hex'";
  $name;
}

sub meta_name {
  local($hex) = @_;

lib/Biblio/bp/lib/bp-p-cs.pl  view on Meta::CPAN

  $name = "Meta '$hex'";
  $name;
}

# Oh boy, this is getting really complicated.
#
# We have an approx table set up, which says that one can approximate XXXX
# by YYYY, where presumably YYYY is easier.  There shouldn't be any loops,
# so programs can recurse through the table.
#
# That's for the meta codes.  For the unicode approx, we just have a
# string.  This allows multiple character approximations.
#
# XXXXX Think about C3's idea of multiple approximations.
#
# A map of 0000 means that it maps to the null string -- our "approximation"
# is to get rid of it.  This is what we can do if it isn't terribly harmful
# to remove it.
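#
# For illustration only (these entries are made up; the real tables live
# in bp-p-cstab.pl, loaded below on demand):
#
#   %uapprox_tab = (
#     '2014', '--',       # EM DASH approximated by two hyphens
#     '00A0', ' ',        # NO-BREAK SPACE approximated by a plain space
#   );
#   %mapprox_tab = (
#     '0102', '0101',     # one meta code approximated by another
#     '0135', '0000',     # 0000: the approximation is to drop it
#   );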

sub meta_approx {
  local($orig) = @_;

  require "${glb_bpprefix}p-cstab.pl" unless defined %bib'mapprox_tab;

  if (defined $mapprox_tab{$orig}) {
    return '' if $mapprox_tab{$orig} eq '0000';
    return "${bib'cs_meta}$mapprox_tab{$orig}";
  }
  undef;
}

sub unicode_approx {
  local($orig) = @_;

  # XXXXX Should we warn them that they're getting an approx?

  require "${glb_bpprefix}p-cstab.pl" unless defined %bib'uapprox_tab;

  $uapprox_tab{$orig};
}

######

lib/Biblio/bp/lib/bp.pl  view on Meta::CPAN

#
#    load_converter($converter_name);
#
#    find_bp_files();
#    find_bp_files($rehash);
#
#    reg_format($long_name, $short_name, $pkg_name, $charset_name, @info);
#
#           [ file bp-p-cs ]
#
#    unicode_to_canon($unicode);
#
#    canon_to_unicode($character);
#
#    decimal_to_unicode($number);
#
#    unicode_to_decimal($unicode);
#
#    unicode_name($unicode);
#
#    meta_name($metacode);
#
#    meta_approx($metacode);
#
#    unicode_approx($unicode);
#
#    nocharset($string);
#
#           [ file bp-p-util ]
#
#    bp_util'mname_to_canon($names_string);
#    bp_util'mname_to_canon($names_string, $flag_reverse_author);
#
#    bp_util'name_to_canon($name_string);
#    bp_util'name_to_canon($name_string, $flag_reverse_author);

lib/Biblio/bp/lib/bp.pl  view on Meta::CPAN

# bib'load_charset
# bib'find_bp_files
# bib'reg_format

######

require "${glb_bpprefix}p-cs.pl";
# loads:
# variables used by the cs routines
# bib'nocharset
# bib'unicode_to_canon

######

require "${glb_bpprefix}p-option.pl";
# loads:
# bib'stdargs
# bib'options
# bib'parse_num_option
# bib'parse_option
# bib'doc

lib/Biblio/bp/tests/newcset.pl  view on Meta::CPAN

#!/usr/bin/perl

require "bptest.pl";

$skiptests = 0;   # to skip early tests during development
$timing = 0;      # to use the same seed for timing tests

#
# This test covers the 8859-1, apple, TeX, troff, and HTML
# character sets.  It also tests the canon<->unicode routines
# from bp-p-cs.pl.
#
# It does not test the 'none' and 'auto' character sets; 'auto' is not
# a charset implementation of its own, so with the sets above the suite
# covers all the charsets that are currently in bp.
#
# For each charset, we run through an iso string, which contains
# the 8-bit characters 0 through 255.  Before processing, certain
# characters known to not be supported are removed (generally
# 200-237 octal, but sometimes others).  This is to prevent

lib/Biblio/bp/tests/newcset.pl  view on Meta::CPAN

# already be escaped.  They also do not do any unescaping (except
# for cs_ext and cs_meta, which they are supposed to handle).
$isostring  =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
$randstring =~ s/$bib'cs_escape/$bib'cs_char_escape/go;


print "Testing character set routines, seed=$seed.\n";

if (!$skiptests) {

&begintest("bib'unicode", 12);

&check('', "bib'canon_to_unicode", '006C', 'l');
&check('', "bib'canon_to_unicode", '00C4', "\304");
&check('', "bib'canon_to_unicode", 'A1C6', "${bib'cs_ext}A1c6");
&check('', "bib'canon_to_unicode", '001C', $bib'cs_char_escape);

&check('', "bib'unicode_to_canon", '5', '0035');
&check('', "bib'unicode_to_canon", "\xE9", '00E9');
&check('', "bib'unicode_to_canon", "${bib'cs_ext}CF8A", 'CF8A');

&check('', "bib'decimal_to_unicode", '0023', 35);
&check('', "bib'decimal_to_unicode", '359B', 13723);
&check('', "bib'unicode_to_decimal", 35, '0023');
&check('', "bib'unicode_to_decimal", 13723, '359b');

$f = $failed;
for (0..512) {
  $can = &bib'unicode_to_canon(&bib'decimal_to_unicode($_));
  $val = &bib'unicode_to_decimal(&bib'canon_to_unicode($can));
  &check('nostatus,norun',"unicode loop", $_, $val);
}
$can = $val = undef;
&check('partial', "unicode loop", $f, $failed);

&endtest;

# Test ISO-8859-1

&testcharset("8859-1", 5);

$caniso = $isostring;
&check('', "bp_cs_88591'tocanon", $caniso, $isostring);
&check('', "bp_cs_88591'fromcanon", $isostring, $caniso);

lib/PBib/BibliographyStyle.pm  view on Meta::CPAN

# --*-Perl-*--
# $Id: BibliographyStyle.pm 11 2004-11-22 23:56:20Z tandler $
#

package PBib::BibliographyStyle;
use strict;
use English;
use charnames ':full';	# enable \N{unicode char name} in strings

# for debug:
use Data::Dumper;

BEGIN {
    use vars qw($Revision $VERSION);
	my $major = 1; q$Revision: 11 $ =~ /: (\d+)/; my ($minor) = ($1); $VERSION = "$major." . ($minor<10 ? '0' : '') . $minor;
}

# superclass

lib/PBib/Document/OpenOffice.pm  view on Meta::CPAN

Base class for OpenOffice documents.

All OpenOffice documents share the same basic structure: each is a zip archive that contains a content.xml file holding the document content.
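
How the archive is handled is not shown in this excerpt; as a minimal
sketch (using Archive::Zip here, which is an assumption, and a made-up
file name), the content can be pulled out like this:

	use Archive::Zip qw(:ERROR_CODES);

	my $zip = Archive::Zip->new();
	$zip->read('paper.sxw') == AZ_OK or die "cannot read document";
	my $xml = $zip->contents('content.xml');   # the document body as XML
	# ... process $xml, then write it back:
	$zip->contents('content.xml', $xml);
	$zip->overwrite();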

=cut

package PBib::Document::OpenOffice;
use 5.008; # for Unicode / utf-8 support
use strict;
use warnings;
use charnames ':full';	# enable \N{unicode char name} in strings
#  use English;

# for debug:
use Data::Dumper;

BEGIN {
    use vars qw($Revision $VERSION);
	my $major = 1; q$Revision: 13 $ =~ /: (\d+)/; my ($minor) = ($1); $VERSION = "$major." . ($minor<10 ? '0' : '') . $minor;
}

lib/PBib/Document/OpenOffice.pm  view on Meta::CPAN

# used own modules


# module variables
#use vars qw(mmmm);

use constant {
	CONTENTNAME => 'content.xml',
	};
	
# Unicode chars, see http://www.unicode.org/charts/
my $EM_DASH = "\N{EM DASH}";				# = \x{2014}
my $EN_DASH = "\N{EN DASH}";				# = \x{2013}
my $FIG_DASH = "\N{FIGURE DASH}";			# 20 12
my $NB_HYPHEN = "\N{NON-BREAKING HYPHEN}";	# 20 11
my $HYPHEN = "\N{HYPHEN}";					# 20 10 (difference from "hyphen-minus"?)
my $OPT_HYPHEN = "\N{SOFT HYPHEN}";			# 00 AD

my $LDBLQUOTE_EN = "\N{LEFT DOUBLE QUOTATION MARK}";	# 20 1C
my $RDBLQUOTE_EN = "\N{RIGHT DOUBLE QUOTATION MARK}";	# 20 1D
my $LQUOTE_EN = "\N{LEFT SINGLE QUOTATION MARK}";		# 20 18

lib/PBib/ReferenceConverter.pm  view on Meta::CPAN

}

sub warn {
    my $self = shift;
    $self->logMessage("WARNING: @_");
}


sub utf8_to_ascii {
	# On my system (win), STDERR does not support utf8 by default.
	# This function maps unicode to plain ascii to avoid warnings
	# about unprintable wide characters.
	return join("",
		map { $_ > 255 ?                   # if wide character...
			sprintf("&#x%04X;", $_) :      # ... emit an &#x....; reference
			chr($_)                        # else keep the character as is
		} unpack("U*", $_[0]));            # unpack Unicode code points
}
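
# For example (the input string here is made up): a call such as
#   print STDERR utf8_to_ascii("r\x{E9}sum\x{E9} \x{2013} notes"), "\n";
# leaves the \x{E9} characters as themselves (they are <= 255) and turns
# the en dash U+2013 into "&#x2013;", so no wide-character warning is raised.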


#


