Business-CompanyDesignator

 view release on metacpan or  search on metacpan

lib/Business/CompanyDesignator.pm  view on Meta::CPAN

  my @pattern = map {
    # Periods are treated as optional literals, with optional trailing commas and/or whitespace
    /\./   ? $optional1 :
    # Embedded spaces can be multiple, and include leading commas
    / /    ? ',?\s+' :
    # Escape other regex metacharacters
    /[()]/ ? "\\$_" : $_
  } split //, $string;
  $assembler->insert(@pattern);

  # Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
  my $pattern_string = join '', @pattern;

  # Special case - optional match characters can cause clashes between
  # distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/
  # We need to handle such cases as ambiguous with extra checks
  my $optional1e = "\Q$optional1\E";
  my $alt_pattern_string1;
  if ($pattern_string =~ /^(\w)(\w)$/) {
    $alt_pattern_string1 = "$1$optional1$2$optional1";
  } elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
    $alt_pattern_string1 = "$1$2";
  }

  # If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
  # across entries, and we can't unambiguously map back to a standard designator
  if (exists $self->pattern_string_map->{ $pattern_string }) {
    my $current = $self->pattern_string_map->{ $pattern_string };
    if ($current && $current ne $reference_string) {
      # Reset to undef to mark ambiguity
      $self->pattern_string_map->{ $pattern_string } = undef;
    }
  }
  # Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
  elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) {
    my $current = $self->pattern_string_map->{ $alt_pattern_string1 };
    if ($current && $current ne $reference_string) {
      # Reset both pairs to undef to mark ambiguity
      $self->pattern_string_map->{ $pattern_string } = undef;
      $self->pattern_string_map->{ $alt_pattern_string1 } = undef;
    }
  }
  else {
    $self->pattern_string_map->{ $pattern_string } = $reference_string;
  }
  if ($lang) {
    for my $l (@$lang) {
      if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) {
        my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string };
        if ($current && $current ne $reference_string) {
          # Reset to undef to mark ambiguity
          $self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef;
        }
      }
      else {
        $self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string;
      }
    }
  }

  # If $string contains unicode diacritics, also add a version without them for misspellings
  if ($string =~ m/\pM/) {
    my $stripped = $string;
    $stripped =~ s/\pM//g;
    $self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
  }
}

# Assemble designator regexes
sub _build_regex {
  my $self = shift;
  my ($type, $lang) = @_;

  state $types = { map { $_ => 1 } qw(end end_cont begin) };
  if (! $types->{$type}) {
    croak "invalid regex type '$type'";
  }

  # RA constructor - case insensitive, with match tracking
  my $assembler = Regexp::Assemble->new->flags('i')->track(1);

  # Construct language regex if $lang is set
  my $lang_re;
  if ($lang) {
    $lang = [ $lang ] if ! ref $lang;
    my $lang_str = join '|', sort @$lang;
    $lang_re = qr/^($lang_str)$/;
  }

  my $count = 0;
  while (my ($long, $entry) = each %{ $self->data }) {
    # If $lang is set, restrict to entries that include $lang
    next if $lang_re && $entry->{lang} !~ $lang_re;
    # If $type is 'begin', restrict to 'lead' entries
    next if $type eq 'begin' && ! $entry->{lead};
    # if $type is 'end_cont', restrict to languages in %LANG_CONTINUA
    next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}};

    $count++;
    my $long_nfd = NFD($long);
    $self->_add_to_assembler($assembler, $lang, $long_nfd);

    # Add all abbreviations
    if (my $abbr_list = $entry->{abbr}) {
      $abbr_list = [ $abbr_list ] if ! ref $abbr_list;
      for my $abbr (@$abbr_list) {
        # Only treat non-ascii abbreviations as continuous
        next if $type eq 'end_cont' && $abbr =~ /^\p{ASCII}+$/;
        my $abbr_nfd = NFD($abbr);
        my $abbr_std = NFD($entry->{abbr_std} || $abbr);
        $self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
      }
    }
  }

  # If no entries found (a strange/bogus language?), return undef
  return if $count == 0;

  return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
}



( run in 2.154 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )