Business-CompanyDesignator
view release on metacpan or search on metacpan
lib/Business/CompanyDesignator.pm view on Meta::CPAN
my @pattern = map {
# Periods are treated as optional literals, with optional trailing commas and/or whitespace
/\./ ? $optional1 :
# Embedded spaces can be multiple, and include leading commas
/ / ? ',?\s+' :
# Escape other regex metacharacters
/[()]/ ? "\\$_" : $_
} split //, $string;
$assembler->insert(@pattern);
# Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
my $pattern_string = join '', @pattern;
# Special case - optional match characters can cause clashes between
# distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/
# We need to handle such cases as ambiguous with extra checks
my $optional1e = "\Q$optional1\E";
my $alt_pattern_string1;
if ($pattern_string =~ /^(\w)(\w)$/) {
$alt_pattern_string1 = "$1$optional1$2$optional1";
} elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
$alt_pattern_string1 = "$1$2";
}
# If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
# across entries, and we can't unambiguously map back to a standard designator
if (exists $self->pattern_string_map->{ $pattern_string }) {
my $current = $self->pattern_string_map->{ $pattern_string };
if ($current && $current ne $reference_string) {
# Reset to undef to mark ambiguity
$self->pattern_string_map->{ $pattern_string } = undef;
}
}
# Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) {
my $current = $self->pattern_string_map->{ $alt_pattern_string1 };
if ($current && $current ne $reference_string) {
# Reset both pairs to undef to mark ambiguity
$self->pattern_string_map->{ $pattern_string } = undef;
$self->pattern_string_map->{ $alt_pattern_string1 } = undef;
}
}
else {
$self->pattern_string_map->{ $pattern_string } = $reference_string;
}
if ($lang) {
for my $l (@$lang) {
if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) {
my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string };
if ($current && $current ne $reference_string) {
# Reset to undef to mark ambiguity
$self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef;
}
}
else {
$self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string;
}
}
}
# If $string contains unicode diacritics, also add a version without them for misspellings
if ($string =~ m/\pM/) {
my $stripped = $string;
$stripped =~ s/\pM//g;
$self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
}
}
# Assemble designator regexes
sub _build_regex {
my $self = shift;
my ($type, $lang) = @_;
state $types = { map { $_ => 1 } qw(end end_cont begin) };
if (! $types->{$type}) {
croak "invalid regex type '$type'";
}
# RA constructor - case insensitive, with match tracking
my $assembler = Regexp::Assemble->new->flags('i')->track(1);
# Construct language regex if $lang is set
my $lang_re;
if ($lang) {
$lang = [ $lang ] if ! ref $lang;
my $lang_str = join '|', sort @$lang;
$lang_re = qr/^($lang_str)$/;
}
my $count = 0;
while (my ($long, $entry) = each %{ $self->data }) {
# If $lang is set, restrict to entries that include $lang
next if $lang_re && $entry->{lang} !~ $lang_re;
# If $type is 'begin', restrict to 'lead' entries
next if $type eq 'begin' && ! $entry->{lead};
# if $type is 'end_cont', restrict to languages in %LANG_CONTINUA
next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}};
$count++;
my $long_nfd = NFD($long);
$self->_add_to_assembler($assembler, $lang, $long_nfd);
# Add all abbreviations
if (my $abbr_list = $entry->{abbr}) {
$abbr_list = [ $abbr_list ] if ! ref $abbr_list;
for my $abbr (@$abbr_list) {
# Only treat non-ascii abbreviations as continuous
next if $type eq 'end_cont' && $abbr =~ /^\p{ASCII}+$/;
my $abbr_nfd = NFD($abbr);
my $abbr_std = NFD($entry->{abbr_std} || $abbr);
$self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
}
}
}
# If no entries found (a strange/bogus language?), return undef
return if $count == 0;
return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
}
( run in 2.154 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )