perl

 view release on metacpan or  search on metacpan

lib/unicore/mktables  view on Meta::CPAN

begin with a word in angle brackets, like <super>, which denotes the
compatible decomposition type.  If the map does not begin with the <angle
brackets>, the decomposition is canonical.
END
        ));

        my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
                                        Default_Map => "",
                                        Perl_Extension => 1,
                                        Directory => $map_directory,
                                        Type => $STRING,
                                        To_Output_Map => $OUTPUT_ADJUSTED,
                                        );
        $Decimal_Digit->add_comment(join_lines(<<END
This file gives the mapping of all code points which represent a single
decimal digit [0-9] to their respective digits, but it has ranges of 10 code
points, and the mapping of each non-initial element of each range is actually
not to "0", but to the offset that element has from its corresponding DIGIT 0.
These code points are those that have Numeric_Type=Decimal; not special
things, like subscripts nor Roman numerals.
END
        ));

        # These properties are not used for generating anything else, and are
        # usually not output.  By making them last in the list, we can just
        # change the high end of the loop downwards to avoid the work of
        # generating a table(s) that is/are just going to get thrown away.
        if (! property_ref('Decomposition_Mapping')->to_output_map
            && ! property_ref('Name')->to_output_map)
        {
            $last_field = min($NAME, $DECOMP_MAP) - 1;
        } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
            $last_field = $DECOMP_MAP;
        } elsif (property_ref('Name')->to_output_map) {
            $last_field = $NAME;
        }
        return;
    }

    # State used by filter_UnicodeData_line().  These lexicals are declared
    # outside the sub, so their values persist from one call (i.e., one input
    # line) to the next.
    my $first_time = 1;                 # ? Is this the first line of the file
    my $in_range = 0;                   # ? Are we in one of the file's ranges
    my $previous_cp;                    # hex code point of previous line
    my $decimal_previous_cp = -1;       # And its decimal equivalent.  It
                                        # starts at -1 so that code point 0
                                        # compares as being exactly one after
                                        # it in the contiguity test the
                                        # filter makes.
    my @start;                          # For each field, the current starting
                                        # code point in hex for the range
                                        # being accumulated.
    my @fields;                         # The input fields of the current
                                        # line (everything after the code
                                        # point).
    my @previous_fields;                # And those from the previous call

    sub filter_UnicodeData_line($file) {
        # Handle a single input line from UnicodeData.txt; see comments above
        # Conceptually this takes a single line from the file containing N
        # properties, and converts it into N lines with one property per line,
        # which is what the final handler expects.  But there are
        # complications due to the quirkiness of the input file, and to save
        # time, it accumulates ranges where the property values don't change
        # and only emits lines when necessary.  This is about an order of
        # magnitude fewer lines emitted.

        # $_ contains the input line.
        # -1 in split means retain trailing null fields
        (my $cp, @fields) = split /\s*;\s*/, $_, -1;

        #local $to_trace = 1 if main::DEBUG;
        trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
        if (@fields > $input_field_count) {
            $file->carp_bad_line('Extra fields');
            $_ = "";
            return;
        }

        my $decimal_cp = hex $cp;

        # We have to output all the buffered ranges when the next code point
        # is not exactly one after the previous one, which means there is a
        # gap in the ranges.
        my $force_output = ($decimal_cp != $decimal_previous_cp + 1);

        # The decomposition mapping field requires special handling.  It looks
        # like either:
        #
        # <compat> 0032 0020
        # 0041 0300
        #
        # The decomposition type is enclosed in <brackets>; if missing, it
        # means the type is canonical.  There are two decomposition mapping
        # tables: the one for use by Perl's normalize.pm has a special format
        # which is this field intact; the other, for general use is of
        # standard format.  In either case we have to find the decomposition
        # type.  Empty fields have None as their type, and map to the code
        # point itself
        if ($fields[$PERL_DECOMPOSITION] eq "") {
            $fields[$DECOMP_TYPE] = 'None';
            $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
        }
        else {
            ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
                                            =~ / < ( .+? ) > \s* ( .+ ) /x;
            if (! defined $fields[$DECOMP_TYPE]) {
                $fields[$DECOMP_TYPE] = 'Canonical';
                $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
            }
            else {
                $fields[$DECOMP_MAP] = $map;
            }
        }

        # The 3 numeric fields also require special handling.  The 2 digit
        # fields must be either empty or match the number field.  This means
        # that if the number field is empty, they must be as well, and the
        # numeric type is None, and the numeric value is 'Nan'.
        # The decimal digit field must be empty or match the other digit
        # field.  If the decimal digit field is non-empty, the code point is
        # a decimal digit, and the other two fields will have the same value.
        # If it is empty, but the other digit field is non-empty, the code
        # point is an 'other digit', and the number field will have the same
        # value as the other digit field.  If the other digit field is empty,
        # but the number field is non-empty, the code point is a generic
        # numeric type.
        if ($fields[$NUMERIC] eq "") {
            if ($fields[$PERL_DECIMAL_DIGIT] ne ""

lib/unicore/mktables  view on Meta::CPAN

                    # nothing in it
                    if ($table->is_empty) {
                        if ($property->type == $BINARY) {
                            push @tables_that_may_be_empty, $table->complete_name;
                        }
                        else {
                            $table->set_fate($SUPPRESSED, $after_first_version);
                        }
                    }

                    # Now we add the removed code points to the property's
                    # map, as they should now map to the grab-bag default
                    # property (which they did in the first comparison
                    # version).  But we don't have to do this if the map is
                    # only for internal use.
                    if (defined $default_map && $property->to_output_map) {

                        # The gc property has pseudo property values whose names
                        # have length 1.  These are the union of all the
                        # property values whose name is longer than 1 and
                        # whose first letter is all the same.  The replacement
                        # is done once for the longer-named tables.
                        next if $property == $gc && length $table->name == 1;

                        foreach my $range ($deltas->ranges) {
                            $property->add_map($range->start,
                                            $range->end,
                                            $default_map,
                                            Replace => $UNCONDITIONALLY);
                        }
                    }
                }
            }
        }
    }

    # The above code doesn't work on 'gc=C', as it is a superset of the default
    # ('Cn') table.  It's easiest to just special case it here.
    my $C = $gc->table('C');
    $C += $gc->table('Cn');

    return;
}

# File-scoped hashes declared just ahead of split_property().
# NOTE(review): presumably these hold the split results for the
# Grapheme_Cluster_Break, Line_Break, and Word_Break properties respectively;
# the code that fills them is not in view here -- confirm.
my %gcb_components;
my %lb_components;
my %wb_components;

sub split_property($base_property, $property, $splits)
{
    # Given $property that divides the possible code points into equivalence
    # classes, this changes it to also encapsulate the criteria given by
    # $splits.  It does this by creating more equivalence classes based on
    # intersecting $property with $splits.  It returns a hash showing the new
    # classes, as well as showing how the original components have been split.
    #
    # This function should be called just once on $property, so that the
    # returned hash can be properly built up.  $splits is therefore an array
    # reference in case $property needs to be divided multiple times.
    # Each element of $splits is independent, so that a split can be further
    # split in the next iteration, but each element of $splits must be
    # consistent with itself, with ranges that are disjoint.
    #
    # Each element of $splits is a hash with two keys
    #   name => is the name to be applied to the split
    #   ranges => is a RangeList of the ranges it has, or something that a
    #             RangeList can be automatically extracted from using the
    #             overloaded operators.
    #
    # An example should clarify.  Unicode publishes the Line Break (LB)
    # property, where each possible code point is given a type, like
    # Alphabetic, or Opening Parenthesis.  They also publish rules for whether
    # it is permissible to break a line between each type.  You wouldn't break
    # a line between two alphabetics or between an opening parenthesis and an
    # alphabetic, but you could between a Space and almost any other type.
    # The types are essentially equivalence classes that divide up the
    # possible code points.
    #
    # In recent releases, Unicode has mostly stopped creating new equivalence
    # classes as it has refined the rules.  The line breaking rules for East
    # Asian languages are very different from other scripts, for example, but
    # instead of dividing Alphabetics into East Asian ones, and non-, Unicode
    # has published rules that involve criteria that aren't encapsulated in
    # the LB property.  Perl's algorithm for determining if there is a break
    # would have to start including these foreign criteria by changing code
    # instead of being table driven.  Instead, we use this function to create
    # a modified LB property that incorporates these criteria, by adding new
    # equivalence classes that do distinguish between the East Asian
    # alphabetics vs non-East Asian alphabetics.  mk_invlists uses these to
    # create tables that then do incorporate these criteria.  This allows the
    # code that implements the actual algorithm to not have to have ad-hoc
    # cases that Unicode's failure to create new equivalence classes
    # encourages.  (Ad-hoc dfas do have to be created when context must be
    # considered for determining the breaking or not, but there are relatively
    # few of these.)
    #
    # The original property had certain maps, furnished by Unicode.  After
    # this function is done, it will have more maps, with the new ones being
    # each original map that has code points that are shared with the ones
    # $split_ranges defines.
    #
    # LB is not only split into subclasses by East Asian or not, but by other
    # independent criteria as well.  That is why $splits is an array ref, to
    # allow one splitting by East Asian, and others by the other criteria.  By
    # the time the function is done, all the splits will be applied, creating
    # perhaps sub-sub-sub-... classes to whatever level is necessary,
    # returning a hash that gives these and how the original classes were
    # sub-divided.
    #
    # $property is going to be $base_property modified to contain the split.
    # $base_property is passed to us because it has match tables in it that
    # have been processed to know about all the aliases the property's
    # equivalence classes can have.  $property is raw, and lacks that
    # information.  And there is otherwise no need to spend the time
    # generating them.
    #
    # It may well be that some classes have no code points in common with what
    # is being split.  Or that some classes match exactly with the split ones.
    # This makes sure not to disturb either of those, using a second pass
    # to restore the ones it temporarily changed.  An example of the latter
    # is the Line Break Korean ones.  All of them are also East Asian, so no



( run in 0.616 second using v1.01-cache-2.11-cpan-71847e10f99 )