unicode results from the CPAN

unicode
Benchmark-Perl-Formance-Cargo
view release on metacpan or search on metacpan
share/P6STD/CursorBase.pmc view on Meta::CPAN
        $self->deb("Commit $$xact[0] to $P") if DEBUG & DEBUG::matchers;
        return $self->cursor_xact("CL") if $xact->[0] =~ /^ALTLTM/;
        $xact = $xact->[-1];
    }
    die "Not in a longest token matcher, so can't commit to a longest token";
}

sub _COMMITRULE { my $self = shift;
    my $xact = $self->xact;
#    $self->{LAST} = shift() if @_;
    local $CTX = $self->callm if DEBUG & DEBUG::trace_call;
    my $P = $self->{_pos};
    while ($xact) {
        $xact->[-2] = 1;
        $self->deb("Commit $$xact[0] to $P") if DEBUG & DEBUG::matchers;
        return $self->cursor_xact("CR") if $xact->[0] =~ /^RULE/;
        $xact = $xact->[-1];
    }
    die "Not in a rule, so can't commit to rule";
}

sub commit { my $self = shift;
    my $xact = $self->xact;
#    $self->{LAST} = shift() if @_;
    local $CTX = $self->callm if DEBUG & DEBUG::trace_call;
    my $P = $self->{_pos};
    while ($xact) {
        $xact->[-2] = 1;
        $self->deb("Commit $$xact[0] to $P") if DEBUG & DEBUG::matchers;
        return $self->cursor_xact("CM") if $xact->[0] =~ /^MATCH/;
        $xact = $xact->[-1];
    }
    die "Not in a match, so can't commit to match";
}

sub fail { my $self = shift;
    my $m = shift;
    return ();
}

sub bless { CORE::bless $_[1], $_[0]->WHAT }

#############################################################
# JIT lexer generator
#############################################################

## NFA structure:
##   array of (NFA node) ->
##     0: non extensible (imperative) flag
##     0: array of fate (array of fate element)
##     array of (label) at odd index, new index at even
## DFA structure:
##   each DFA node is array:
##     0: array of object fates
##     1: hash of specific cases (char => DFAnode)
##        also carries some debug data
##    2n: reference to a uniprop hash
##  2n+1: DFAnode is that hash existed
## Labels: undef is epsilon link.
##   otherwise list - 1 positive, 0+ negative
##   each is: 1 character, else unicode prop in "Gc/L" form
## "DFA" lexer structure:
##   {DFA} -> array of refs to all DFA nodes
##   {DBA}, {FILE}, {NAME} same as "RE" lexer structure
##   {S} -> ref to DFA root
##   {NFA} -> NFA structure
## individual fates in the NFA end with a hook which can be 1 to stop adding
## fates on the end; it's not always possible to associate a unique fate with
## each NFA node, consider (a|b)*
## A NFA or DFA node is accepting if it has a nonempty list of fates

#cycle breaker
{
    package CursorBase::dfa;
    sub DESTROY {
        my $self = shift;
        for (@$self) { @$_ = (); }
    }
}

# Steal data from Perl5's Unicode maps
my %unicode_map_cache;
BEGIN {
    $unicode_map_cache{ALL} = [scalar("\377" x 128) x 1088, "ALL"] ;
    my $name = File::Spec->catfile($data_dir, "uniprops");
    open MAP, "<", "$name" or
        die "cannot open unicode maps from $name : $!\n";

    binmode MAP;
    while (defined (my $c = getc MAP)) {
        my $name = "";
        my $used;
        my $tile;
        read MAP, $name, ord($c);
        read MAP, $used, 136;

        $unicode_map_cache{$name} = [ (("") x 1088), $name ];

        for (my $i = 0; $i < 1088; $i++) {
            if (vec($used, $i, 1)) {
                read MAP, $tile, 128;
                $unicode_map_cache{$name}[$i] = $tile;
            }
        }
    }
    close MAP or die "cannot close unicode maps: $!";
}
sub _get_unicode_map {
    my $propname = shift;
    $unicode_map_cache{$propname} //
        die "Map $propname not found.  Edit gen-unicode-table.pl and rerun.";
}

# This is the fast path handling for JIT DFA lexer generation (although it gets
# short-circuited if the DFALEXERS entry exists, later).  The lexer generation
# process sometimes recurses to this, which is tracked using %::AUTOLEXED; if
# the value is already set, we need to suppress recursion.

our $fakepos = 1;

sub _dump_nfa { my ($name, $nfa) = @_;
    print ::LOG "--- BEGIN NFA DUMP ($name) ---\n";
    for my $ix (0 .. @$nfa-1) {
        my @go;
        for (my $j = 2; $j < @{ $nfa->[$ix] }; $j += 2) {
            push @go, "[" . join("-",@{$nfa->[$ix][$j] // []}) . "] => " . $nfa->[$ix][$j+1];
        }
        my $l = sprintf "%4d: %-30s ", $ix, join(", ", @go);
        if ($nfa->[$ix][1]) {
            my @x = @{ $nfa->[$ix][1] };
            push @x, "..." if shift(@x);
            $l .= join(" ", "-->", @x);
        }
        print ::LOG $l, "\n";
    }
    print ::LOG "---- END NFA DUMP ----\n";
}

sub _dtree_dump { my ($ord, $dt) = @_;
    print ::LOG ("    " x (2 + $ord));
    if (!defined $dt) {
        print ::LOG "END\n";
    } elsif (ref $dt ne 'ARRAY') {
        print ::LOG ($$dt)->[1]{ID}, "\n";
    } else {
        print ::LOG $dt->[2][-1], "?\n";
        _dtree_dump($ord+1, $dt->[1]);
        _dtree_dump($ord+1, $dt->[0]);
    }
}

sub _dfa_dump_node { my ($dfan) = @_;
    my @go;
    my @gor = %{ $dfan->[1] };
    while (my ($a, $b) = splice @gor, 0, 2) {
        next if $a eq 'DESC';
        next if $a eq 'ID';
        push @go, "'" . ::qm(chr $a) . "' => " . $b->[1]{ID};
    }
    printf ::LOG "%-30s %-30s\n", $dfan->[1]{DESC} . ":", join(", ", @go);
    _dtree_dump(0, $dfan->[2]);
    for (@{ $dfan->[0] }) {
        my @arr;
        for (my $fate = $_; $fate; $fate = $fate->[0]) {
            push @arr, $fate->[1], $fate->[2];
        }
        print ::LOG "    --> ", join(" ", @arr), "\n";
    }
}

sub _elem_matches { # my ($char, $element) = @_;
    # Optimize for the common path
    return $_[0] eq $_[1] if length $_[1] == 1;

    my $i = ord $_[0];
    return vec(_get_unicode_map($_[1])->[$i >> 10], $i & 1023, 1);
}

my %boolean_tables = map { $_, 1 } qw/AHex Alpha BidiC BidiM CE CI CWCF CWCM
    CWKCF CWL CWT CWU Cased CompEx DI Dash Dep Dia Ext GrBase GrExt Hex Hyphen
    IDC IDS IDSB IDST Ideo JoinC Lower Math NChar NFDQC OAlpha ODI OGrExt OIDC
    OIDS OLower OMath OUpper PatSyn PatWS QMark Radical SD STerm Space Term
    UIdeo Upper VS XIDC XIDS/;
sub _elem_excludes { my ($up1, $up2) = @_;
    my ($t1, $v1) = split "/", $up1;
    my ($t2, $v2) = split "/", $up2;

    return 0 if $t1 ne $t2;
    return 0 if $v1 eq $v2;

    return 1 if $boolean_tables{$t1};
    return 1 if $t1 eq 'Gc' && (length($v1) == length($v2)
        || substr($v1, 0, 1) ne substr($v2, 0, 1));

    return 0;
}

sub _elem_implies { my ($up1, $up2) = @_;
    my ($t1, $v1) = split "/", $up1;
    my ($t2, $v2) = split "/", $up2;

    return 0 if $t1 ne $t2;
    return 1 if $v1 eq $v2;

    return 1 if $t1 eq 'Gc' && substr($v1, 0, 1) eq $v2;

    return 0;
}

sub _elem_dich { my ($up1, $up2) = @_;
    my ($t1, $v1) = split "/", $up1;
    my ($t2, $v2) = split "/", $up2;

    return 0 if $t1 ne $t2;
    return 0 if $v1 eq $v2;

    return 1 if $boolean_tables{$t1};
    return 0;
}

sub _decision_tree { my ($thunk, @edges) = @_;
    my $branch;

    TERM: for (my $i = 0; $i < @edges; $i += 2) {
        for my $c (@{ $edges[$i] }) {
            next if $c eq 'ALL';
            $branch = $c;
            last TERM;
        }
    }

    if (defined $branch) {
        my @true;
        my @false;

        for (my $i = 0; $i < @edges; $i += 2) {
            my ($p, @n) = @{ $edges[$i] };

            if (!_elem_excludes($branch, $p) &&
                    !(grep { _elem_implies($branch, $_) } @n)) {
                my $pp = _elem_implies($branch, $p) ? 'ALL' : $p;
                my @nn = grep { !_elem_excludes($branch, $_) } @n;
                push @true, [ $pp, @nn ], $edges[$i+1];
            }

            if (!_elem_implies($p, $branch) &&
                    !(grep { _elem_dich($branch, $_) } @n)) {
                my $pp = _elem_dich($branch, $p) ? 'ALL' : $p;
                my @nn = grep { !_elem_implies($_, $branch) } @n;
                push @false, [ $pp, @nn ], $edges[$i+1];
            }
        }

        return [ _decision_tree($thunk, @false),
                 _decision_tree($thunk, @true),
                 _get_unicode_map($branch) ];
    } else {
        # all edges are labelled [ALL]
        my $bm = "";
        for (my $i = 1; $i < @edges; $i += 2) {
            vec($bm, $edges[$i], 1) = 1;
        }
        return ($bm ne '') ? (\ $thunk->($bm)) : undef;
    }
}

sub _tangle_edges { my ($our_edges, $thunk) = @_;
    my %used_chars;
    my %used_cats;

    for (my $i = 0; $i < @$our_edges; $i += 2) {
        next unless $our_edges->[$i];
        for (@{ $our_edges->[$i] }) {
            if (length($_) == 1) {
                $used_chars{$_} = 1;
            } else {
                $used_cats{$_} = 1;
            }
        }
    }

    # First, all specifically mentioned characters are floated to the initial
    # case
    my %next_1;
    my $edgelistref;
    for my $ch (keys %used_chars) {
        my $bm = "";
        EDGE: for (my $i = 0; $i < @$our_edges; $i += 2) {
            $edgelistref = $our_edges->[$i];
            if (length $edgelistref->[0] != 1) {
              my $o = ord $ch; # inlined from _elem_matches
              next unless vec(_get_unicode_map($edgelistref->[0])->[$o >> 10], $o & 1023, 1);
            } elsif ($edgelistref->[0] ne $ch) {
              next;
            }
            my @edgelist = @$edgelistref;
            for (my $j = 0; ++$j < @edgelist; ) {
                next EDGE if _elem_matches($ch, $edgelistref->[$j]);
            }
            vec($bm, $our_edges->[$i+1], 1) = 1;
        }
        $next_1{ord $ch} = $thunk->($bm);
    }

    # Now clean them out so the decision tree engine doesn't have to deal with
    # single characters
    $our_edges = [ @$our_edges ];
    for (my $i = 0; $i < @$our_edges; ) {
        if (!$our_edges->[$i] || length($our_edges->[$i][0]) == 1) {
            splice @$our_edges, $i, 2;
        } else {
            $our_edges->[$i] = [grep { length($_) > 1 } @{ $our_edges->[$i] }];
            $i += 2;
        }
    }

    \%next_1, _decision_tree($thunk, @$our_edges);
}

sub _jit_dfa_node { my ($lexer, $node) = @_;
    my $nfa2dfa = sub { my $nbm = shift;
        $lexer->{NFA2DFA}->{$nbm} //= do {
            my @node;
            $node[1] = { ID => scalar(@{ $lexer->{DFA} }), BITS => $nbm };
            push @{ $lexer->{DFA} }, \@node;
            \@node;
        }
    };

    my $bf = $node->[1]{BITS};
    my $id = $node->[1]{ID};
    my $nfa = $lexer->{NFA};

    my %black;
    my @nfixes = grep { vec($bf, $_, 1) } (0 .. length($bf)*8 - 1);
    my @grey = @nfixes;
    my @ouredges;

    while (@grey) {
        my $nix = pop @grey;
        next if $black{$nix};
        $black{$nix} = 1;
        my $nfn = $nfa->[$nix];

        push @{ $node->[0] }, $nfn->[1] if $nfn->[1];
        for (my $i = 2; $i < @$nfn; $i += 2) {
            if (!$nfn->[$i]) {
                push @grey, $nfn->[$i+1];
            } else {
                push @ouredges, $nfn->[$i], $nfn->[$i+1];
            }
        }
( run in 0.395 second using v1.01-cache-2.11-cpan-71847e10f99 )