Lingua-ZH-CEDICT

 view release on metacpan or  search on metacpan

lib/Lingua/ZH/CEDICT.pm  view on Meta::CPAN


    $self->{_findPos} = 0;
    $self->{_findTerm} = $term;
}

# Search iterator: resumes scanning at the position kept in
# $self->{_findPos} and returns the next entry that matches
# $self->{_findTerm}, or undef once all entries have been examined.
#
# An entry matches when field 0 or field 1 equals the term exactly, or
# when field 2, 3 or 4 matches the term case-insensitively.  The term is
# interpolated into those patterns unescaped, so any regex metacharacters
# it contains are interpreted as such.
sub find {
    my ($self) = @_;
    my $term = $self->{_findTerm};

    until ($self->{_findPos} >= $self->numEntries) {
        # Move the cursor past this entry before testing it, so the next
        # call continues with the following one either way.
        my $entry = $self->{entry}->[ $self->{_findPos}++ ];

        next unless $entry->[0] eq $term
            || $entry->[1] eq $term
            || $entry->[2] =~ /^$term$/i
            || $entry->[3] =~ /^$term$/i
            || $entry->[4] =~ /^$term$/i;

        return $entry;
    }

    # nothing found
    return undef;
}

# Formatting ****************************************************************

# Maps "<vowel><tone number>" to the vowel carrying the matching tone mark.
# 'v' stands for u-umlaut; tone 5 maps to the unmarked vowel.
my %xlat =
    (a1    => "ā", e1    => "ē", i1    => "ī",
     o1    => "ō", u1    => "ū", 'v1'  => "ǖ",
     a2    => "á", e2    => "é", i2    => "í",
     o2    => "ó", u2    => "ú", 'v2'  => "ǘ",
     a3    => "ǎ", e3    => "ě", i3    => "ǐ",
     o3    => "ǒ", u3    => "ǔ", 'v3'  => "ǚ",
     a4    => "à", e4    => "è", i4    => "ì",
     o4    => "ò", u4    => "ù", 'v4'  => "ǜ",
     a5    => 'a',  e5    => 'e',  i5    => 'i',
     o5    => 'o',  u5    => 'u',  'v5'  => 'ü');

# Converts pinyin written with tone numbers ("ni3 hao3", "lu:4") into
# pinyin with tone marks.
#
# Works both as an object method ($obj->utf8Pinyin($p)) and as a plain
# function (utf8Pinyin($p)): when the first argument is not a reference it
# is taken to be the pinyin string itself.  A class-method call is
# therefore not supported, since the class name would be used as input.
#
# Only lowercase syllables are converted; the pattern has no /i flag.
# Returns the converted string.
sub utf8Pinyin {
    my ($self, $p) = @_;
    $p = $self unless ref($self);

    # normalize "u:" to "v" so that u-umlaut is a single letter below
    $p =~ s/u:/v/g;

    # $1: optional medial i/u/v, left unmarked
    # $2: the vowel that receives the tone mark
    # $3: remaining letters of the syllable
    # $4: tone number, consumed by the lookup
    $p =~ s/([iuv]?)([aeiouv])([a-z]*)([1-5])/$1$xlat{"$2$4"}$3/g;
    return $p;
}

# Turns a slash-separated list of English definitions into a single
# display string: the definitions are joined with a middle dot and a
# parenthesised remark at the very end of a definition is wrapped in
# <i>...</i> (the parentheses themselves are dropped).
#
# Usable as an object method or as a plain function; when the first
# argument is not a reference it is taken to be the text to format.
sub formatEnglish {
    my ($self, $en) = @_;
    $en = $self if !ref($self);

    my $separator = " · ";

    my @terms = map {
        # work on a copy so the substitution never touches the split list
        (my $term = $_) =~ s|\(([^(]+)\)$|<i>$1</i>|;
        $term;
    } split m|/|, $en;

    return join($separator, @terms);
}

# Strips tone information from numbered pinyin: the digits 1-5 are
# removed and both spellings of u-umlaut ("u:" and "v") are reduced to a
# plain "u".  Returns the simplified string.
sub removePinyinTones {
    my ($self, $pinyin) = @_;

    for ($pinyin) {
        s/[12345]//g;    # drop the tone numbers
        s/(u:|v)/u/g;    # u-umlaut in either notation becomes u
    }

    return $pinyin;
}

# Derives upper-case search keywords from a slash-separated list of
# English definitions.
#
# For every definition:
#   * a definition that is nothing but a parenthesised remark is skipped;
#   * a trailing parenthesised explanation is removed;
#   * a definition with a leading parenthesised part, such as
#     "(a sense of) uncertainty", yields two keywords: the head word alone
#     ("UNCERTAINTY") and the phrase with the parentheses dropped;
#   * a leading "to", "the", "a" or "an" is stripped, characters other
#     than letters, digits, space, "-", "/" and "." are removed, and
#     whitespace is trimmed and collapsed.
#
# Returns the keywords that contain at least one word character.
sub englishToKeywords {
    my ($self, $en) = @_;
    my @kw;

    foreach my $def (split(m|/|, $en)) {
        next if $def =~ /^\([^()]+\)$/;

        # remove trailing explanation in brackets
        $def =~ s/\s+\([^(]+\)$//;

        # definitions like "(a sense of) uncertainty".  This has to be
        # detected before the cleanup below, which removes every
        # parenthesis and would make the pattern impossible to match.
        my @candidates = ($def =~ /^\((.+?)\)\s+(.+)$/)
            ? ($2, "$1 $2")
            : ($def);

        my %seen;
        foreach my $cand (@candidates) {
            # NOTE(review): "to be" can never win here because "to" is
            # tried first; left as is to keep existing keywords stable.
            $cand =~ s/^\(?(to|the|a|an|to be)\)?\s+//i;

            # remove characters we don't like in keywords
            $cand =~ s|[^-a-zA-Z0-9 /.]||g;
            $cand =~ s|^\.+||;

            # remove leading and trailing and multiple whitespace
            $cand =~ s/^\s+//;
            $cand =~ s/\s+$//;
            $cand =~ s/\s\s+/ /g;

            # both candidates can collapse to the same keyword,
            # e.g. "(a) b" gives "B" twice
            my $keyword = uc($cand);
            push @kw, $keyword unless $seen{$keyword}++;
        }
    }

    # return non-empty keywords
    return grep /\w/, @kw;
}

1;
__END__

=head1 NAME

Lingua::ZH::CEDICT - Interface for CEDICT, a Chinese-English dictionary

=head1 SYNOPSIS



( run in 0.593 second using v1.01-cache-2.11-cpan-71847e10f99 )