perl
view release on metacpan or search on metacpan
lib/unicore/mktables view on Meta::CPAN
begin with a word in angle brackets, like <super>, which denotes the
compatible decomposition type. If the map does not begin with the <angle
brackets>, the decomposition is canonical.
END
));
my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
Default_Map => "",
Perl_Extension => 1,
Directory => $map_directory,
Type => $STRING,
To_Output_Map => $OUTPUT_ADJUSTED,
);
$Decimal_Digit->add_comment(join_lines(<<END
This file gives the mapping of all code points which represent a single
decimal digit [0-9] to their respective digits, but it has ranges of 10 code
points, and the mapping of each non-initial element of each range is actually
not to "0", but to the offset that element has from its corresponding DIGIT 0.
These code points are those that have Numeric_Type=Decimal; not special
things, like subscripts nor Roman numerals.
END
));
# These properties are not used for generating anything else, and are
# usually not output. By making them last in the list, we can just
# change the high end of the loop downwards to avoid the work of
# generating a table(s) that is/are just going to get thrown away.
if (! property_ref('Decomposition_Mapping')->to_output_map
&& ! property_ref('Name')->to_output_map)
{
$last_field = min($NAME, $DECOMP_MAP) - 1;
} elsif (property_ref('Decomposition_Mapping')->to_output_map) {
$last_field = $DECOMP_MAP;
} elsif (property_ref('Name')->to_output_map) {
$last_field = $NAME;
}
return;
}
# The following variables are declared outside filter_UnicodeData_line() so
# that they keep their values from one call (i.e., one input line) to the
# next.
my $first_time = 1; # ? Is this the first line of the file
my $in_range = 0; # ? Are we in one of the file's ranges
my $previous_cp; # hex code point of previous line
my $decimal_previous_cp = -1; # And its decimal equivalent. Starting at -1
# means that code point 0 is considered
# to be adjacent to its (non-existent)
# predecessor, so doesn't look like a gap.
my @start; # For each field, the current starting
# code point in hex for the range
# being accumulated.
my @fields; # The input fields; these are what is left of
# the ';'-separated input line after the
# leading code point has been removed.
my @previous_fields; # And those from the previous call
sub filter_UnicodeData_line($file) {
# Handle a single input line from UnicodeData.txt; see comments above.
# Conceptually this takes a single line from the file containing N
# properties, and converts it into N lines with one property per line,
# which is what the final handler expects. But there are
# complications due to the quirkiness of the input file, and to save
# time, it accumulates ranges where the property values don't change
# and only emits lines when necessary. This is about an order of
# magnitude fewer lines emitted.
# $_ contains the input line.
# -1 in split means retain trailing null fields
(my $cp, @fields) = split /\s*;\s*/, $_, -1;
#local $to_trace = 1 if main::DEBUG;
trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
if (@fields > $input_field_count) {
$file->carp_bad_line('Extra fields');
$_ = "";
return;
}
my $decimal_cp = hex $cp;
# We have to output all the buffered ranges when the next code point
# is not exactly one after the previous one, which means there is a
# gap in the ranges.
my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
# The decomposition mapping field requires special handling. It looks
# like either:
#
# <compat> 0032 0020
# 0041 0300
#
# The decomposition type is enclosed in <brackets>; if missing, it
# means the type is canonical. There are two decomposition mapping
# tables: the one for use by Perl's normalize.pm has a special format
# which is this field intact; the other, for general use is of
# standard format. In either case we have to find the decomposition
# type. Empty fields have None as their type, and map to the code
# point itself
if ($fields[$PERL_DECOMPOSITION] eq "") {
$fields[$DECOMP_TYPE] = 'None';
$fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
}
else {
($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
=~ / < ( .+? ) > \s* ( .+ ) /x;
if (! defined $fields[$DECOMP_TYPE]) {
$fields[$DECOMP_TYPE] = 'Canonical';
$fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
}
else {
$fields[$DECOMP_MAP] = $map;
}
}
# The 3 numeric fields also require special handling. The 2 digit
# fields must be either empty or match the number field. This means
# that if it is empty, they must be as well, and the numeric type is
# None, and the numeric value is 'Nan'.
# The decimal digit field must be empty or match the other digit
# field. If the decimal digit field is non-empty, the code point is
# a decimal digit, and the other two fields will have the same value.
# If it is empty, but the other digit field is non-empty, the code
# point is an 'other digit', and the number field will have the same
# value as the other digit field. If the other digit field is empty,
# but the number field is non-empty, the code point is a generic
# numeric type.
if ($fields[$NUMERIC] eq "") {
if ($fields[$PERL_DECIMAL_DIGIT] ne ""
lib/unicore/mktables view on Meta::CPAN
# nothing in it
if ($table->is_empty) {
if ($property->type == $BINARY) {
push @tables_that_may_be_empty, $table->complete_name;
}
else {
$table->set_fate($SUPPRESSED, $after_first_version);
}
}
# Now we add the removed code points to the property's
# map, as they should now map to the grab-bag default
# property (which they did in the first comparison
# version). But we don't have to do this if the map is
# only for internal use.
if (defined $default_map && $property->to_output_map) {
# The gc property has pseudo property values whose names
# have length 1. These are the union of all the
# property values whose name is longer than 1 and
# whose first letter is all the same. The replacement
# is done once for the longer-named tables.
next if $property == $gc && length $table->name == 1;
foreach my $range ($deltas->ranges) {
$property->add_map($range->start,
$range->end,
$default_map,
Replace => $UNCONDITIONALLY);
}
}
}
}
}
}
# The above code doesn't work on 'gc=C', as it is a superset of the default
# ('Cn') table. It's easiest to just special case it here.
my $C = $gc->table('C');
$C += $gc->table('Cn');
return;
}
# NOTE(review): presumably these hold the hashes returned by split_property()
# (below) for the Grapheme_Cluster_Break, Line_Break, and Word_Break
# properties respectively -- confirm against the code that populates them,
# which is not adjacent to these declarations.
my %gcb_components;
my %lb_components;
my %wb_components;
sub split_property($base_property, $property, $splits)
{
# Given $property that divides the possible code points into equivalence
# classes, this changes it to also encapsulate the criteria given by
# $splits. It does this by creating more equivalence classes based on
# intersecting $property with $splits. It returns a hash showing the new
# classes, as well as showing how the original components have been split.
#
# This function should be called just once on $property, so that the
# returned hash can be properly built up. $splits is therefore an array
# reference in case $property needs to be divided multiple times.
# Each element of $splits is independent, so that a split can be further
# split in the next iteration, but each element of $splits must be
# consistent with itself, with ranges that are disjoint.
#
# Each element of $splits is a hash with two keys
# name => is the name to be applied to the split
# ranges => is a RangeList of the ranges it has, or something that a
# RangeList can be automatically extracted from using the
# overloaded operators.
#
# An example should clarify. Unicode publishes the Line Break (LB)
# property, where each possible code point is given a type, like
# Alphabetic, or Opening Parenthesis. They also publish rules for whether
# it is permissible to break a line between each type. You wouldn't break
# a line between two alphabetics or between an opening parenthesis and an
# alphabetic, but you could between a Space and almost any other type.
# The types are essentially equivalence classes that divide up the
# possible code points.
#
# In recent releases, Unicode has mostly stopped creating new equivalence
# classes as it has refined the rules. The line breaking rules for East
# Asian languages are very different from other scripts, for example, but
# instead of dividing Alphabetics into East Asian ones, and non-, Unicode
# has published rules that involve criteria that aren't encapsulated in
# the LB property. Perl's algorithm for determining if there is a break
# would have to start including these foreign criteria by changing code
# instead of being table driven. Instead, we use this function to create
# a modified LB property that incorporates these criteria, by adding new
# equivalence classes that do distinguish between the East Asian
# alphabetics vs non-East Asian alphabetics. mk_invlists uses these to
# create tables that then do incorporate these criteria. This allows the
# code that implements the actual algorithm to not have to have ad-hoc
# cases that Unicode's failure to create new equivalence classes
# encourages. (Ad-hoc dfas do have to be created when context must be
# considered for determining the breaking or not, but there are relatively
# few of these.)
#
# The original property had certain maps, furnished by Unicode. After
# this function is done, it will have more maps, with the new ones being
# each original map that has code points that are shared with the ones
# $split_ranges defines.
#
# LB is not only split into subclasses by East Asian or not, but by other
# independent criteria as well. That is why $splits is an array ref, to
# allow one splitting by East Asian, and others by the other criteria. By
# the time the function is done, all the splits will be applied, creating
# perhaps sub-sub-sub-... classes to whatever level is necessary,
# returning a hash that gives these and how the original classes were
# sub-divided.
#
# $property is going to be $base_property modified to contain the split.
# $base_property is passed to us because it has match tables in it that
# have been processed to know about all the aliases the property's
# equivalence classes can have. $property is raw, and lacks that
# information. And there is otherwise no need to spend the time
# generating them.
#
# It may well be that some classes have no code points in common with what
# is being split. Or that some classes match exactly with the split ones.
# This makes sure not to disturb either of those, using a second pass
# to restore the ones it temporarily changed. An example of the latter
# is the Line Break Korean ones. All of them are also East Asian, so no
( run in 0.616 second using v1.01-cache-2.11-cpan-71847e10f99 )