lib/Biblio/BP.pm
#
# load_converter($converter_name);
#
# find_bp_files();
# find_bp_files($rehash);
#
# reg_format($long_name, $short_name, $pkg_name, $charset_name, @info);
#
# [ file bp-p-cs ]
#
# unicode_to_canon($unicode);
#
# canon_to_unicode($character);
#
# decimal_to_unicode($number);
#
# unicode_to_decimal($unicode);
#
# unicode_name($unicode);
#
# meta_name($metacode);
#
# meta_approx($metacode);
#
# unicode_approx($unicode);
#
# nocharset($string);
#
# [ file bp-p-util ]
#
# bp_util'mname_to_canon($names_string);
# bp_util'mname_to_canon($names_string, $flag_reverse_author);
#
# bp_util'name_to_canon($name_string);
# bp_util'name_to_canon($name_string, $flag_reverse_author);
lib/Biblio/bp/CHANGELOG
Month and Year were reversed, and were short
output_date is now in bp-p-utils
bp-p-utils Added output_date
Fixed multiple von prefix parsing in mname_to_canon
bp-s-generic School name in report
s/,"$/."/ at ending
bp-output Added style name to trailer
bp-canon New format, used for testing mainly
bp-cs-canon New charset, used for testing
bp-refer Changes to EndNote %E parsing
bp-cs-dead Forgot \\ in front of $charmap{$unicode} in fromcanon
bp-procite Wrote fromcanon routine
bp-cs-html Added support for &U+xxxx; characters
Added call to font_noprev
Changed some // calls to //o and s/// to s///o.
bp-p-cs Added font_noprev routine
changed fontcheck to font_check
--------- version 0.2.2 released ---------
14 Mar 96:
lib/Biblio/bp/CHANGELOG
bp-medline.pl: Added support for Entrez Medlars to Medline format.
Added support for Entrez HTML output (notitles).
15 Jan 96:
bp-inspec4.pl: Wrote INSPEC format, style 4.
bp-inspec.pl: Wrote INSPEC general format.
bp-p-dload.pl: Added comment about unreg_format function needed.
2 Dec 95:
bp.pl: Bumped version number to 0.2.1.
bp-cs-tab.pl: Added 4 greek characters to unicode approx table.
30 Nov 95:
bp-cs-html.pl: Newline before <BLOCKQUOTE>.
--------- version 0.2.0 released ---------
--------- Changelog started ---------
lib/Biblio/bp/lib/bp-cs-8859-1.pl
# We're eight bit ISO-8859-1, so there isn't anything for us to do.
# We assume here that the escape character is already done.
$_[0];
}
######
sub fromcanon {
local($_, $protect) = @_;
local($repl, $unicode, $can);
return $_ unless /$bib'cs_escape/o;
1 while s/${bib'cs_ext}00(..)/&bib'unicode_to_canon('00'.$1)/ge;
while (/${bib'cs_ext}(....)/) {
$unicode = $1;
$can = &bib'unicode_approx($unicode);
defined $can && s/$bib'cs_ext$unicode/$can/g && next;
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to ISO-8859-1");
s/${bib'cs_ext}$unicode//g;
}
while (/${bib'cs_meta}(....)/) {
$repl = $1;
$can = &bib'meta_approx($repl);
defined $can && s/$bib'cs_meta$repl/$can/g && next;
&bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to ISO-8859-1");
s/${bib'cs_meta}$repl//g;
}
$_;
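# A short usage sketch (illustrative only; the package name bp_cs_88591
# follows the naming used in tests/newcset.pl):
#   &bp_cs_88591'fromcanon($bib'cs_ext . '00E9', 0);   # yields the Latin-1 byte "\351"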
lib/Biblio/bp/lib/bp-cs-apple.pl
'02D8', 249, # BREVE
'02D9', 250, # DOT ABOVE (Mandarin Chinese light tone)
'02DA', 251, # RING ABOVE
'00B8', 252, # CEDILLA
'02DD', 253, # DOUBLE ACUTE ACCENT
'02DB', 254, # OGONEK
'02C7', 255, # CARON (Mandarin Chinese third tone)
);
# Table done.
$unicode = '';
$repl = '';
$can = '';
$eb_eval_fromcanon = '';
$eb_eval_tocanon = '';
$eb_nomapC = '';
$eb_nomapA = '';
$eb_mapC = '';
$eb_mapA = '';
#
# Build the eval string for the fromcanon code.
#
# For each 8bit code, we either:
# 1) don't have a character for this code. So we zap and complain.
# 2) we do know it, so we translate them all at once after we're done
# with all the ones we don't know.
#
foreach $can (128..255) {
$unicode = &bib'decimal_to_unicode($can);
$repl = pack("C", $can);
if (defined $umap{$unicode}) {
$eb_mapC .= $repl;
$eb_mapA .= pack("C", $umap{$unicode});
} else {
$eb_nomapC .= $repl;
$eb_eval_fromcanon .= "tr/$repl//d && \&bib'gotwarn(\"Can't convert "
. &bib'unicode_name($unicode) . " to Apple\");\n";
}
}
substr($eb_eval_fromcanon,0,0) = "if (/[$eb_nomapC]/) {\n";
$eb_eval_fromcanon .= "}\ntr/$eb_mapC/$eb_mapA/;\n";
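# For reference, the string assembled above evals to code shaped roughly like:
#   if (/[<all 8-bit bytes with no Apple mapping>]/) {
#     tr/<byte>//d && &bib'gotwarn("Can't convert Unicode '....' to Apple");
#     ...                                # one such line per unmapped byte
#   }
#   tr/<canon bytes>/<their Apple equivalents>/;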
#
# Build the eval string for the tocanon code.
#
# nomapA just means there isn't a direct 8bit replacement. We just insert
# the extended character.
#
foreach $unicode (keys %umap) {
next if $unicode =~ /^00/;
$repl = pack("C", $umap{$unicode});
$eb_nomapA .= $repl;
$eb_eval_tocanon .= "s/$repl/$bib'cs_ext$unicode/g;\n";
}
substr($eb_eval_tocanon,0,0) = "if (/[$eb_nomapA]/) {\n";
$eb_eval_tocanon .= "}\ntr/$eb_mapA/$eb_mapC/;\n";
#####################
sub tocanon {
local($_, $protect) = @_;
lib/Biblio/bp/lib/bp-cs-apple.pl
# step 2: Use tr/<canons>/<apples>/ to translate all the two-way
# mapped characters right across.
eval $eb_eval_fromcanon;
return $_ unless /$bib'cs_escape/o;
# The standard 7bit map.
1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
while (/${bib'cs_ext}(....)/) {
$unicode = $1;
defined $umap{$unicode}
&& s/${bib'cs_ext}$unicode/pack("C", $umap{$unicode})/ge
&& next;
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to Apple");
s/${bib'cs_ext}$unicode//g;
}
while (/${bib'cs_meta}(....)/) {
$repl = $1;
&bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to Apple");
s/${bib'cs_meta}$repl//g;
}
$_;
}
lib/Biblio/bp/lib/bp-cs-dead.pl
$bib'charsets{'dead', 'tocanon'} = "bp_cs_dead'tocanon";
$bib'charsets{'dead', 'fromcanon'} = "bp_cs_dead'fromcanon";
$bib'charsets{'dead', 'toesc'} = "[\\\\]";
$bib'charsets{'dead', 'fromesc'} = "[\\\200-\377]|${bib'cs_ext}|${bib'cs_meta}";
######
# package variables for anyone to use
$repl = '';
$unicode = '';
$can = '';
######
# XXXXX Should charmap include the \ character also?
%charmap = (
#00A0
#00..
#00BE
lib/Biblio/bp/lib/bp-cs-dead.pl
'0268', '-i',
);
# Secondary mappings
%charmap2 = (
'sz', '00DF',
);
# Build a reverse map and eval string
$reval = '';
while (($unicode, $repl) = each %charmap) {
$can = &bib'unicode_to_canon( $unicode );
if ($repl =~ /^[-`'^~",v]/) {
$rmap{$repl} = $can;
} else {
$repl =~ s/(\W)/\\$1/g;
$reval .= "s/\\\\$repl/$can/g;\n";
}
}
# continue the same reverse map for the secondary mappings.
while (($repl, $unicode) = each %charmap2) {
$can = &bib'unicode_to_canon( $unicode );
if ($repl =~ /^[-`'^~",v]/) {
$rmap{$repl} = $can;
} else {
$repl =~ s/(\W)/\\$1/g;
$reval .= "s/\\\\$repl/$can/g;\n";
}
}
######
lib/Biblio/bp/lib/bp-cs-dead.pl
local($_, $protect) = @_;
if ($protect) {
s/\\/\\\\/go;
}
s/\240/ /g;
s/\255/-/g;
while (/([\200-\377])/) {
$repl = $1;
$unicode = &bib'canon_to_unicode($repl);
if (defined $charmap{$unicode}) {
s/$repl/\\$charmap{$unicode}/g;
} else {
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to dead-key");
s/$repl//g;
}
}
return $_ unless /$bib'cs_escape/o;
while (/${bib'cs_ext}(....)/) {
$unicode = $1;
if ($unicode =~ /^00[0-7]/) { # 7-bit characters
1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
next;
}
defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/\\$charmap{$unicode}/g
&& next;
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to dead-key");
s/${bib'cs_ext}$unicode//g;
}
while (/${bib'cs_meta}(....)/) {
$repl = $1;
&bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to dead-key");
s/${bib'cs_meta}$repl//g;
}
$_;
}
lib/Biblio/bp/lib/bp-cs-html.pl
# to this one. References for those characters may be found in
# http://www.w3.org/hypertext/WWW/MarkUp/html3/mathsym.html
# Also see
# http://www.to.icl.fi/~aj/html3_test/entities.html
#
# Big additions in HTML3 include the greek characters and proper typesetting
# spaces.
#
# As referenced in:
# http://www.acl.lanl.gov/HTML_WG/html-wg-96q1.messages/0102.html
# I am going to allow the &U+xxxx; form for unicode characters for HTML 3.
# I'm not sure if this is going to stay in the standard however.
#
# Specifically, I use the named references as opposed to the numeric, as that
# is recommended in the html3 draft spec, and it is much easier to read.
# Reading through the working notes of the HTML_WG, it looks like they
# specifically recommend translating Numerical Character Entities of Latin-1
# into the real things. See reference:
# http://www.acl.lanl.gov/HTML_WG/html-wg-95q1.messages/0920.html
# about the can of worms this opens. Specifically, &#215; refers to _different_
# characters depending on which ISO 8859-x group you're working with! The
lib/Biblio/bp/lib/bp-cs-html.pl
'021F', '/SMALL',
);
$cs_init = 1;
}
sub init_cs_fr {
&init_cs unless $cs_init;
# XXXXX We should just use unicode_approx.
# Map various unicode entities to our stuff.
%charmap_from = (
'2212', '-',
'2013', '--',
'2014', '---',
'2002', ' ', # These two are probably wrong.
'2003', ' ',
);
# HTML 2.0 Secondary meta mappings for fromcanon
# Note that these do _not_ get wrapped in <> like metamap_2 does.
lib/Biblio/bp/lib/bp-cs-html.pl
#####################
# Essentially we're ISO-8859-1, but with a few protected characters
# and some extra ones.
# XXX protect lonely & characters
sub tocanon {
local($_, $protect) = @_;
local($repl, $can, $mine, $unicode);
&bib'panic("cs-html tocanon called with no arguments!") unless defined $_;
# Check to see if we have anything to do.
return $_ unless /<|\&/;
# XXXXX Ignore named Latin-1 for now.
# Handle links first.
local($link, $text);
lib/Biblio/bp/lib/bp-cs-html.pl
# To prevent matching this, we leave it in extended format.
s/&amp;/${bib'cs_ext}0026/go;
# always check to see if we have any characters left to change
return $_ unless /\&/;
# This should handle 8 bit chars correctly as well as the new UCS-2, cf.
# <http://www.ics.uci.edu/pub/ietf/html/draft-ietf-html-i18n-03.txt>
while (/\&#(\d+);/) {
$repl = $1;
$can = &bib'unicode_to_canon( &bib'decimal_to_unicode($repl) );
s/\&#$repl;/$can/g;
}
# This handles the new &U+xxxx; form. I need to find a good reference
# for this.
while (/\&U\+([\dA-Fa-f]{4});/) {
$repl = $1;
$can = &bib'unicode_to_canon( $repl );
s/\&U\+$repl;/$can/g;
}
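# For example, "&#233;" and "&U+00E9;" both collapse to the canon byte "\351",
# while "&U+2014;" stays in cs_ext form since it has no 8-bit equivalent.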
# Now for the named entities.
while (/\&(\S*);/) {
$repl = $1;
$repl =~ s/(\W)/\\$1/g;
if (defined $entitynames{$repl}) {
$can = pack("C", $entitynames{$repl});
s/\&$repl;/$can/g;
next;
}
# We now search through the charmaps to see if they contain our character.
# Terribly inefficient, but we shouldn't be doing this often.
if (defined $opt_html3) {
$can = undef;
foreach $unicode (keys %charmap_3) {
next unless $charmap_3{$unicode} eq $repl;
# only here if we found the map.
$can = &bib'unicode_to_canon($unicode);
s/\&$repl;/$can/g;
last;
}
next if defined $can;
}
$can = undef;
foreach $unicode (keys %charmap_2) {
next unless $charmap_2{$unicode} eq $repl;
# only here if we found the map.
$can = &bib'unicode_to_canon($unicode);
s/\&$repl;/$can/g;
last;
}
next if defined $can;
&bib'gotwarn("Unknown HTML entity: \&$repl; : in $_");
s/\&$repl;//g;
}
s/&(\s)/${bib'cs_ext}0026$1/g;
lib/Biblio/bp/lib/bp-cs-html.pl
######
sub fromcanon {
local($_, $protect) = @_;
local($repl, $can);
&bib'panic("cs-html fromcanon called with no arguments!") unless defined $_;
# Leave 8bit characters alone since we're assuming ISO-8859-1 HTML.
#1 while s/${bib'cs_ext}(00..)/&bib'unicode_to_canon($1)/ge;
# XXXXX I think these should go here, as they create a mess if they go
# after the ext and meta maps.
s/\&/\&amp;/g;
s/</\&lt;/g;
s/>/\&gt;/g;
s/${bib'cs_ext}0026/\&amp;/go;
return $_ unless /$bib'cs_escape/o;
lib/Biblio/bp/lib/bp-cs-html.pl
&& next;
defined $charmap_2{$repl}
&& s/$bib'cs_ext$repl/\&$charmap_2{$repl};/g
&& next;
defined $charmap_from{$repl}
&& s/$bib'cs_ext$repl/$charmap_from{$repl}/g
&& next;
$opt_html3 && s/$bib'cs_ext$repl/\&U\+$repl;/g && next;
$can = &bib'unicode_approx($repl);
defined $can && s/$bib'cs_ext$repl/$can/g && next;
&bib'gotwarn("Can't convert ".&bib'unicode_name($repl)." to HTML");
s/${bib'cs_ext}$repl//g;
}
# XXXXX We need to deal with font changes.
$_ = &bib'font_noprev($_) if /${bib'cs_meta}0110/o;
while (/${bib'cs_meta}(....)/o) {
$repl = $1;
# We need to do these in order of most to least inclusive.
# That way we can get, say ThinSpace to map to a thin space in HTML3
lib/Biblio/bp/lib/bp-cs-none.pl
s/\xE6/ae/g;
s/[\200-\377]//g;
}
}
if (/$bib'cs_escape/o) {
local($repl, $can);
s/${bib'cs_meta}....//g;
while (/${bib'cs_ext}(....)/) {
$repl = $1;
$can = &bib'unicode_approx($repl);
defined $can && s/$bib'cs_ext$repl/$can/g && next;
s/${bib'cs_ext}$repl//g;
}
}
tr/\x00-\x1F//d;
$_;
}
#######################
lib/Biblio/bp/lib/bp-cs-tex.pl
$bib'charsets{'tex', 'toesc'} = "[\$\\\\]";
# XXXXX We have so many characters to protect, should we even bother?
$bib'charsets{'tex', 'fromesc'} = "[\\#\$\%\&{}_\|><\^~\200-\377]|${bib'cs_ext}|${bib'cs_meta}";
######
$cs_init = 0;
# package variables for anyone to use
$mine = '';
$unicode = '';
$can = '';
######
sub init_cs {
# Thorn and eth are really nasty since they don't exist in the standard TeX
# fonts. This is what I came up with in r2b to fake it. Fortunately they
# aren't used often. Get the cmoer fonts if you want to do them right.
# My eth is pretty nice, but the thorn leaves a little to be desired.
lib/Biblio/bp/lib/bp-cs-tex.pl
# Build up a search string to do the reverse map.
$cmap_to_eval = '';
$cmap_from8_eval = '';
$cmap_to_eval_1 = '';
$cmap_to_eval_2 = '';
%rmap = ();
%accent = ();
# Step 1: Build a reverse map
while (($unicode, $mine) = each %charmap) {
$rmap{$mine} = $unicode;
}
# Step 2: walk through the keys in sorted order
local($mineE);
foreach $mine (sort keys %rmap) {
$can = &bib'unicode_to_canon( $rmap{$mine} );
$mineE = $mine;
$mineE =~ s/(\W)/\\$1/g;
# The various maps for tocanon
if ($mine =~ /^{\\([`'^"~])([\w])}$/) {
$accent{$1 . $2} = $can;
} elsif ($mine =~ /^{\\([vc])(\w)}$/) {
$accent{$1 . $2} = $can;
} elsif ($mine =~ /^{\\([vc]){(\w)}}$/) {
$accent{$1 . $2} = $can;
} elsif ($mine =~ /leavevmode/) {
lib/Biblio/bp/lib/bp-cs-tex.pl
s/\|/\$\|\$/g;
s/>/\$>\$/g;
s/</\$<\$/g;
s/\^/\\^{}/g;
s/~/\\~{}/g;
s/$bib'cs_temp/\$\\backslash\$/go;
}
while (/([\200-\237])/) {
$repl = $1;
$unicode = &bib'canon_to_unicode($repl);
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
s/$repl//g;
}
&init_cs unless $cs_init;
#if (/[\240-\377]/) {
# eval $cmap_from8_eval;
#}
s/\240/~/g;
s/\255/-/g;
while (/([\240-\377])/) {
$repl = $1;
$unicode = &bib'canon_to_unicode($repl);
s/$repl/$charmap{$unicode}/g;
}
# Maybe we can go now?
return $_ unless /$bib'cs_escape/o;
while (/${bib'cs_ext}(....)/) {
$unicode = $1;
if ($unicode =~ /^00[0-7]/) {
1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
next;
}
defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
&& next;
defined $charmap2{$unicode} && s/${bib'cs_ext}$unicode/$charmap2{$unicode}/g
&& next;
$can = &bib'unicode_approx($unicode);
defined $can && s/$bib'cs_ext$unicode/$can/g && next;
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to TeX");
s/${bib'cs_ext}$unicode//g;
}
while (/${bib'cs_meta}(....)/) {
$repl = $1;
defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
&& next;
$can = &bib'meta_approx($repl);
defined $can && s/$bib'cs_meta$repl/$can/g && next;
lib/Biblio/bp/lib/bp-cs-troff.pl
$bib'charsets{'troff', 'fromcanon'} = "bp_cs_troff'fromcanon";
$bib'charsets{'troff', 'toesc'} = '[\\\\]';
$bib'charsets{'troff', 'fromesc'} = "[\\\\\200-\377]|${bib'cs_ext}|${bib'cs_meta}";
######
$opt_doublebs = 1;
# variables used throughout the package
$unicode = '';
$mine = '';
$can = '';
# Rather than defining all our maps and running code for reverse maps at
# load time, we're going to embed them in functions. When tocanon or
# fromcanon get called, we do the init if we haven't already. This should
# save startup time -- especially if they never actually call our function!
# In the troff code in particular, the tocanon code needs a lot of reverse
# maps and eval code. If we're doing xyz->troff, we don't need to load
# all of that.
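# In practice each entry point just starts with:
#   sub tocanon {
#     &init_cs unless $cs_init;    # build the maps on first use only
#     ...
#   }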
lib/Biblio/bp/lib/bp-cs-troff.pl
&init_cs unless $cs_init;
# Build up a search string to do the reverse map.
$cmap_eval = '';
#$cmap_from_eval = '';
%rmap = ();
$mineE = '';
# Step 1: Build a reverse map
while (($unicode, $mine) = each %charmap) {
$rmap{$mine} = &bib'unicode_to_canon( $unicode );
}
# Step 2: walk through the keys in sorted order
# (sigh, without a tree, this is still as slow as a dog)
foreach $mine (sort keys %rmap) {
$can = $rmap{$mine};
$mineE = $mine;
$mineE =~ s/(\W)/\\$1/g;
if ( $mine !~ /\\\(../ && $mine !~ /.\\\*./ ) {
$cmap_eval .= "s/$mineE/$can/g;\n";
}
lib/Biblio/bp/lib/bp-cs-troff.pl
'\*Cl', '013E',
'\*CN', '0147',
'\*Cn', '0148',
'\*?', '00BF',
'\*!', '00A1',
'\(n~', '00F1',
);
$cmap_to_eval = '';
foreach $mine (sort keys %chartos) {
$can = &bib'unicode_to_canon( $chartos{$mine} );
$mineE = $mine;
$mineE =~ s/(\W)/\\$1/g;
if ( $mine !~ /\\\(../ && $mine !~ /.\\\*./ ) {
$cmap_to_eval .= "s/$mineE/$can/g;\n";
} else {
# Mapped up front with the rest.
if (defined $rmap{$mine}) {
&bib'goterror("Error in troff tables -- duplicate entry for $mine.");
}
$rmap{$mine} = $can;
lib/Biblio/bp/lib/bp-cs-troff.pl
$repl = $1;
$repl eq 'P' && ($mine = $bib'cs_meta . '0110');
$repl =~ /[1R]/ && ($mine = $bib'cs_meta . '0101');
$repl =~ /[2I]/ && ($mine = $bib'cs_meta . '0102');
$repl =~ /[3B]/ && ($mine = $bib'cs_meta . '0103');
s/\\f$repl/$mine/g;
}
$_ = &bib'font_check($_);
}
while (($unicode, $mine) = each %metamap) {
$mine =~ s/(\W)/\\$1/g;
s/$mine/${bib'cs_meta}$unicode/g;
}
return $_ unless /\\/;
# Last of all, the escape character. First we check to see if there is
# anything else. We can't delete it because of the way troff does its
# coding.
if (/\\[^e]/) {
lib/Biblio/bp/lib/bp-cs-troff.pl
local($repl);
&bib'panic("cs-troff fromcanon called with no arguments!") unless defined $_;
s/\\/\\e/g;
# tr/\200-\237//d && &bib'gotwarn("Zapped chars.");
if (/[\200-\237]/) {
while (/([\200-\237])/) {
$repl = $1;
$unicode = &bib'canon_to_unicode($repl);
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to troff");
s/$repl//g;
}
}
&init_cs_fr unless $cs_fr_init;
# Which one of these to use probably depends on the frequency of
# special characters. The first method will be best with only one
# or two, but the second is better if there are a lot.
while (/([\240-\377])/g) {
$repl = $1;
$unicode = &bib'canon_to_unicode($repl);
s/$repl/$charmap{$unicode}/g;
}
# Note that the definition of cmap_from_eval is now commented out above.
#if (/[\240-\377]/) {
# eval $cmap_from_eval;
#}
# should we make the output have double backslashes?
$opt_doublebs && s/\\/\\\\/g;
# Maybe we can go now?
return $_ unless /$bib'cs_escape/o;
while (/${bib'cs_ext}(....)/) {
$unicode = $1;
if ($unicode =~ /^00[0-7]/) {
1 while s/${bib'cs_ext}00([0-7].)/pack("C", hex($1))/ge;
next;
}
defined $charmap{$unicode} && s/${bib'cs_ext}$unicode/$charmap{$unicode}/g
&& next;
$can = &bib'unicode_approx($unicode);
defined $can && s/$bib'cs_ext$unicode/$can/g && next;
&bib'gotwarn("Can't convert ".&bib'unicode_name($unicode)." to troff");
s/${bib'cs_ext}$unicode//g;
}
while (/${bib'cs_meta}(....)/) {
$repl = $1;
defined $fontmap{$repl} && s/${bib'cs_meta}$repl/$fontmap{$repl}/g
&& next;
defined $metamap{$repl} && s/${bib'cs_meta}$repl/$metamap{$repl}/g
&& next;
$can = &bib'meta_approx($repl);
lib/Biblio/bp/lib/bp-cs-utf8.pl
sub tocanon {
$_[0];
}
######
sub fromcanon {
local($_, $protect) = @_;
local($repl, $unicode, $can);
return $_ unless /$bib'cs_escape/o;
1 while s/${bib'cs_ext}(....)/\X{$1}/g;
while (/${bib'cs_meta}(....)/) {
$repl = $1;
$can = &bib'meta_approx($repl);
defined $can && s/$bib'cs_meta$repl/$can/g && next;
&bib'gotwarn("Can't convert ".&bib'meta_name($repl)." to UTF8");
lib/Biblio/bp/lib/bp-p-cs.pl
# Character set common variables and routines
#
# Dana Jacobsen (dana@acm.org)
# 18 November 1995 (last modified 17 March 1996)
# for bib'nocharset which calls fromcanon:
require "bp-cs-none.pl";
######
#
# Return canonical character for a unicode hex string.
#
sub unicode_to_canon {
local($hex) = @_;
$hex =~ tr/a-f/A-F/;
# XXXXX Should we prepend '0' characters if we don't have 4 digits?
if ($hex !~ /^[\dA-F]{4}$/) {
&bib'gotwarn("Invalid Unicode character: $hex");
return '';
}
if ($hex =~ /00(..)/) {
return pack("C", hex($1));
}
return $bib'cs_ext . $hex;
}
sub canon_to_unicode {
local($can) = @_;
local($hex);
if (length($can) == 1) {
$hex = sprintf("%2lx", ord($can));
$hex =~ tr/a-f /A-F0/;
return( '00' . $hex );
}
if ($can =~ /$bib'cs_ext(....)/) {
$hex = $1;
$hex =~ tr/a-f/A-F/;
return $hex;
}
if ($can eq $bib'cs_char_escape) {
return &bib'canon_to_unicode($bib'cs_escape);
}
return &bib'gotwarn("Can't convert $can to Unicode");
}
sub decimal_to_unicode {
local($num) = @_;
local($hex);
if ($num < 256) {
$hex = sprintf("00%2lx", $num);
} elsif ($num < 65536) {
local($div) = $num / 256;
local($high) = int($div);
local($low) = 256 * ($div - $high);
$hex = sprintf("%2lx%2lx", $high, $low);
} else {
return &bib'gotwarn("Illegal number $num given to decimal_to_unicode");
}
$hex =~ tr/a-f /A-F0/;
$hex;
}
sub unicode_to_decimal {
local($uni) = @_;
return &bib'gotwarn("Illegal unicode length: $uni") unless length($uni) == 4;
return &bib'gotwarn("Illegal unicode string: $uni") if $uni =~ /[^\da-fA-F]/;
hex($uni);
}
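# A few concrete values (the same ones exercised in tests/newcset.pl):
#   &canon_to_unicode('l')        returns '006C'
#   &unicode_to_canon('00E9')     returns "\351"
#   &decimal_to_unicode(13723)    returns '359B'
#   &unicode_to_decimal('359B')   returns 13723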
sub unicode_name {
local($hex) = @_;
local($name);
# For now, just print hex value
$name = "Unicode '$hex'";
$name;
}
sub meta_name {
local($hex) = @_;
lib/Biblio/bp/lib/bp-p-cs.pl
$name = "Meta '$hex'";
$name;
}
# Oh boy, this is getting really complicated.
#
# We have an approx table set up, which says that one can approximate XXXX
# by YYYY, where presumably YYYY is easier. There shouldn't be any loops,
# so programs can recurse through the table.
#
# That's for the meta codes. For the unicode approx, we just have a
# string. This allows multiple character approximations.
#
# XXXXX Think about C3's idea of multiple approximations.
#
# A map of 0000 means that it maps to the null string -- our "approximation"
# is to get rid of it. This is what we can do if it isn't terribly harmful
# to remove it.
sub meta_approx {
local($orig) = @_;
require "${glb_bpprefix}p-cstab.pl" unless defined %bib'mapprox_tab;
if (defined $mapprox_tab{$orig}) {
return '' if $mapprox_tab{$orig} eq '0000';
return "${bib'cs_meta}$mapprox_tab{$orig}";
}
undef;
}
sub unicode_approx {
local($orig) = @_;
# XXXXX Should we warn them that they're getting an approx?
require "${glb_bpprefix}p-cstab.pl" unless defined %bib'uapprox_tab;
$uapprox_tab{$orig};
}
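# Callers (e.g. bp-cs-tex.pl) typically try an approximation as a last resort
# before warning and stripping the character:
#   $can = &bib'unicode_approx($unicode);
#   defined $can && s/$bib'cs_ext$unicode/$can/g && next;
#   $can = &bib'meta_approx($repl);
#   defined $can && s/$bib'cs_meta$repl/$can/g && next;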
######
lib/Biblio/bp/lib/bp.pl
#
# load_converter($converter_name);
#
# find_bp_files();
# find_bp_files($rehash);
#
# reg_format($long_name, $short_name, $pkg_name, $charset_name, @info);
#
# [ file bp-p-cs ]
#
# unicode_to_canon($unicode);
#
# canon_to_unicode($character);
#
# decimal_to_unicode($number);
#
# unicode_to_decimal($unicode);
#
# unicode_name($unicode);
#
# meta_name($metacode);
#
# meta_approx($metacode);
#
# unicode_approx($unicode);
#
# nocharset($string);
#
# [ file bp-p-util ]
#
# bp_util'mname_to_canon($names_string);
# bp_util'mname_to_canon($names_string, $flag_reverse_author);
#
# bp_util'name_to_canon($name_string);
# bp_util'name_to_canon($name_string, $flag_reverse_author);
lib/Biblio/bp/lib/bp.pl
# bib'load_charset
# bib'find_bp_files
# bib'reg_format
######
require "${glb_bpprefix}p-cs.pl";
# loads:
# variables used by the cs routines
# bib'nocharset
# bib'unicode_to_canon
######
require "${glb_bpprefix}p-option.pl";
# loads:
# bib'stdargs
# bib'options
# bib'parse_num_option
# bib'parse_option
# bib'doc
lib/Biblio/bp/tests/newcset.pl
#!/usr/bin/perl
require "bptest.pl";
$skiptests = 0; # to skip early tests during development
$timing = 0; # to use the same seed for timing tests
#
# This test covers the 8859-1, apple, TeX, troff, and HTML
# character sets. It also tests the canon<->unicode routines
# from bp-p-utils.pl.
#
# It does not test the 'none' and 'auto' character sets. The 'auto'
# character set doesn't exist as a charset of its own, so the test
# covers all the charsets that are currently in bp.
#
# For each charset, we run through an iso string, which contains
# the 8-bit characters 0 through 255. Before processing, certain
# characters known to not be supported are removed (generally
# 200-237 octal, but sometimes others). This is to prevent
lib/Biblio/bp/tests/newcset.pl
# already be escaped. They also do not do any unescaping (except
# for cs_ext and cs_meta, which they are supposed to handle).
$isostring =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
$randstring =~ s/$bib'cs_escape/$bib'cs_char_escape/go;
print "Testing character set routines, seed=$seed.\n";
if (!$skiptests) {
&begintest("bib'unicode", 12);
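# (check() comes from bptest.pl; judging from the calls below, its arguments
#  are: flags, function name, expected result, then the function's argument(s).)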
&check('', "bib'canon_to_unicode", '006C', 'l');
&check('', "bib'canon_to_unicode", '00C4', "\304");
&check('', "bib'canon_to_unicode", 'A1C6', "${bib'cs_ext}A1c6");
&check('', "bib'canon_to_unicode", '001C', $bib'cs_char_escape);
&check('', "bib'unicode_to_canon", '5', '0035');
&check('', "bib'unicode_to_canon", "\xE9", '00E9');
&check('', "bib'unicode_to_canon", "${bib'cs_ext}CF8A", 'CF8A');
&check('', "bib'decimal_to_unicode", '0023', 35);
&check('', "bib'decimal_to_unicode", '359B', 13723);
&check('', "bib'unicode_to_decimal", 35, '0023');
&check('', "bib'unicode_to_decimal", 13723, '359b');
$f = $failed;
for (0..512) {
$can = &bib'unicode_to_canon(&bib'decimal_to_unicode($_));
$val = &bib'unicode_to_decimal(&bib'canon_to_unicode($can));
&check('nostatus,norun',"unicode loop", $_, $val);
}
$can = $val = undef;
&check('partial', "unicode loop", $f, $failed);
&endtest;
# Test ISO-8859-1
&testcharset("8859-1", 5);
$caniso = $isostring;
&check('', "bp_cs_88591'tocanon", $caniso, $isostring);
&check('', "bp_cs_88591'fromcanon", $isostring, $caniso);
lib/PBib/BibliographyStyle.pm
# --*-Perl-*--
# $Id: BibliographyStyle.pm 11 2004-11-22 23:56:20Z tandler $
#
package PBib::BibliographyStyle;
use strict;
use English;
use charnames ':full'; # enable \N{unicode char name} in strings
# for debug:
use Data::Dumper;
BEGIN {
use vars qw($Revision $VERSION);
my $major = 1; q$Revision: 11 $ =~ /: (\d+)/; my ($minor) = ($1); $VERSION = "$major." . ($minor<10 ? '0' : '') . $minor;
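# e.g. "$Revision: 11 $" yields $VERSION "1.11"; "$Revision: 7 $" would yield "1.07".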
}
# superclass
lib/PBib/Document/OpenOffice.pm
Base class for OpenOffice documents.
All OpenOffice documents have a similar structure: They are a zip archive that contains a content.xml file with the content.
=cut
package PBib::Document::OpenOffice;
use 5.008; # for Unicode / utf-8 support
use strict;
use warnings;
use charnames ':full'; # enable \N{unicode char name} in strings
# use English;
# for debug:
use Data::Dumper;
BEGIN {
use vars qw($Revision $VERSION);
my $major = 1; q$Revision: 13 $ =~ /: (\d+)/; my ($minor) = ($1); $VERSION = "$major." . ($minor<10 ? '0' : '') . $minor;
}
lib/PBib/Document/OpenOffice.pm
# used own modules
# module variables
#use vars qw(mmmm);
use constant {
CONTENTNAME => 'content.xml',
};
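# A minimal sketch of pulling the content stream out of the archive
# (assumes Archive::Zip; how this class actually reads it may differ):
#   use Archive::Zip;
#   my $zip = Archive::Zip->new($filename);
#   my $xml = $zip->contents(CONTENTNAME);   # raw content.xml as a string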
# Unicode chars, see http://www.unicode.org/charts/
my $EM_DASH = "\N{EM DASH}"; # = \x{2014}
my $EN_DASH = "\N{EN DASH}"; # = \x{2013}
my $FIG_DASH = "\N{FIGURE DASH}"; # 20 12
my $NB_HYPHEN = "\N{NON-BREAKING HYPHEN}"; # 20 11
my $HYPHEN = "\N{HYPHEN}"; # 20 10 (difference from "hyphen-minus"?)
my $OPT_HYPHEN = "\N{SOFT HYPHEN}"; # 00 AD
my $LDBLQUOTE_EN = "\N{LEFT DOUBLE QUOTATION MARK}"; # 20 1C
my $RDBLQUOTE_EN = "\N{RIGHT DOUBLE QUOTATION MARK}"; # 20 1D
my $LQUOTE_EN = "\N{LEFT SINGLE QUOTATION MARK}"; # 20 18
lib/PBib/ReferenceConverter.pm
}
sub warn {
my $self = shift;
$self->logMessage("WARNING: @_");
}
sub utf8_to_ascii {
# on my system (win), STDERR does not support utf8 (by default)
# this function maps unicode to plain ascii to avoid warnings
# about unprintable wide characters.
return join("",
map { $_ > 255 ? # if wide character...
sprintf("&#x%04X;", $_) : # \x{...}
chr($_) # else as themselves
} unpack("U*", $_[0])); # unpack Unicode characters
}
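# For example:
#   utf8_to_ascii("A\x{2014}B")   # returns "A&#x2014;B"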
#