Lingua-Norms-SUBTLEX

 view release on metacpan or  search on metacpan

lib/Lingua/Norms/SUBTLEX.pm  view on Meta::CPAN

      hascontent( $args{'scale'} )
      ? $self->{'_FIELDS'}
      ->datum( $self->{'_LANG'}, 'frq_' . $args{'scale'} . '_idx' )
      : $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' );
    my @vals = ();
    for my $str ( @{$strs} ) {
        push @vals,
          _get_val_for_str( _get_usr_str($str), $col_i,
            map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
    }
    return @vals;
}

=head2 Retrieving letter-strings/words

=head3 select_strings

 $aref = $subtlex->select_strings(frq_opm => [1, 20], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
 $aref = $subtlex->select_strings(frq_zipf => [0, 2], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');

I<Alias>: select_words

Returns a list of strings (presumably words) from the SUBTLEX corpus that satisfies certain criteria, as per the following arguments:

=over 2

=item length

minimum and/or maximum length of the string (or "letter-length")

=item frq_opm, frq_log, cd_count, etc.

minimum and/or maximum frequency (as given in whatever unit offered by the datafile for the set language)

=item cv_pattern

a consonant-vowel pattern, given as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant. The tested strings are strip...

=item regex

a regular expression (L<perlretut|perlretut>). In the examples above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified for retrieval. Alternatively, for example, the regex value '[^aeiouy]$' speci...

=back

For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [...

The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).

Calling this method as "list_strings" or "list_words" is deprecated; to avoid confusion with L<all_strings|Lingua::Norms::SUBTLEX/all_strings>, which also returns a list of strings. A deprecation warning and wrap to the method is in place as of versi...

=cut

sub select_strings {
    my ( $self, %args ) = @_;
    my %patterns = ();
    if ( hascontent( $args{'regex'} ) ) {
        $patterns{'regex'} = qr/$args{'regex'}/xms;
    }
    if ( hascontent( $args{'cv_pattern'} ) ) {
        my $tmp = q{};
        my @c = split m//ms, uc( $args{'cv_pattern'} );
        foreach (@c) {
            $tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
        }
        $patterns{'cv_pattern'} = qr/^$tmp$/ixms;
    }

    my @list = ();
    open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
  LINES:
    while (<$fh>) {
        next if $INPUT_LINE_NUMBER == 1;    # skip column heading line
        my @line = split m/\Q$self->{'_DELIM'}\E/xms;
        next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
        for ( keys %patterns ) {
            next LINES if unidecode( $line[0] ) !~ $patterns{$_};
        }
        for (qw/frq_count frq_opm frq_log frq_zipf cd_count cd_pct cd_log/) {
            if (
                ref $args{$_}
                and hascontent(
                    $self->{'_FIELDS'}->datum( $self->{'_LANG'}, $_ . '_idx' )
                )
              )
            {
                next LINES
                  if !_in_range(
                    _clean_value(
                        $line[
                          $self->{'_FIELDS'}
                          ->datum( $self->{'_LANG'}, $_ . '_idx' )
                        ]
                    ),
                    @{ $args{$_} }
                  );
            }
        }
        if ( ref $args{'pos'} ) {
            next LINES
              if none {
                $_ eq $line[ $self->{'_FIELDS'}
                  ->datum( $self->{'_LANG'}, 'pos_dom_idx' ) ]
            }
            @{ $args{'pos'} };
        }
        push @list, $line[0];
    }
    close $fh or croak $OS_ERROR;

    return \@list;
}
*select_words = \&select_strings;

=head3 all_strings

 $aref = $subtlex->all_strings();

I<Alias>: all_words

Returns a reference to an array of all letter-strings in the corpus. These are culled of empty and duplicate strings, and then alphabetically sorted.

=cut

sub all_strings {
    my ( $self, %args ) = @_;
    my @list = ();
    open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
    while (<$fh>) {
        next if $INPUT_LINE_NUMBER == 1;    # skip column heading line
        push @list, _get_file_str( $_, $self->{'_DELIM'} );
    }
    close $fh or croak $OS_ERROR;
    return [ sort { lc($a) cmp lc($b) } uniq( grep { hascontent($_) } @list ) ];
}
*all_words = \&all_strings;

=head3 random_string

 $string = $subtlex->random_string();
 @data = $subtlex->random_string();

I<Alias>: random_word

Picks a random line from the corpus, using L<File::RandomLine|File::RandomLine> (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifyin...

=cut

sub random_string {
    my ( $self, %args ) = @_;
    eval { require File::RandomLine; };
    croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
    my $rl =
      File::RandomLine->new( $self->{'_PATH'}, { algorithm => 'uniform' } );
    my @ari = ();
    while ( not scalar @ari or $ari[0] eq 'Word' ) {
        @ari = split m/\Q$self->{'_DELIM'}\E/xms, $rl->next;
    }
    return wantarray ? @ari : $ari[0];
}
*random_word = \&random_string;

=head2 Miscellaneous

=head3 n_lines

 $num = $subtlex->n_lines();

Returns the number of lines, less the column headings and any lines with no content, in the installed language file. Expects/accepts no arguments.

=cut

sub n_lines {
    my $self = shift;
    my $z    = 0;
    open( my $fh, q{<}, $self->{'_PATH'} ) or croak $OS_ERROR;
    while (<$fh>) {
        next if $INPUT_LINE_NUMBER == 1;    # skip column heading line
        next if nocontent($_);
        $z++;
    }
    close $fh or croak $OS_ERROR;
    return $z;
}
*nlines = \&n_lines;                        # legacy alias

=head3 pct_alpha

Returns the percentage of strings in the subtitles file that satisfy "look like words" relative to the number of lines (as per L<n_lines|Lingua::Norms::SUBTLEX/n_lines>). Specifically, after ASCII transliteration of the string (per L<Text::Unidecode|...

=cut

sub pct_alpha {
    my ( $self, %args ) = @_;
    my $all_strs_aref = $self->all_strings();
    my $count_all     = count( @{$all_strs_aref} );
    my $pct           = q{};
    if ( $count_all > 0 ) {
        my $count_alpha_strings = count( grep { m/[\p{XPosixAlpha}\-']/xsm }
              map { unidecode($_) } @{$all_strs_aref} );
        $pct = 100 * $count_alpha_strings / $count_all;
    }
    return $pct;
}

=head3 set_lang

 $lang = $subtlex->set_lang(lang => STR); # DE, FR, NL_all, NL_min, PT, UK or US
 $lang = $subtlex->set_lang(lang => STR, path => 'this/is/the/file.csv');
 $lang = $subtlex->set_lang(lang => STR, dir => 'file/is/in/here');

Set or guess location of datafile; see L<new|Lingua::Norms::SUBTLEX/new>. Naturally, the given value of B<lang> (required)--which is used as a database ID--should correspond with any given B<path> to the SUBTLEX datafile (optional but recommended). I...

=cut

sub set_lang {
    my ( $self, %args ) = @_;

lib/Lingua/Norms/SUBTLEX.pm  view on Meta::CPAN

      if nocontent($str);
    croak "The requested value is not defined for the current SUBTLEX corpus"
      if nocontent($col_i);

    my $val = q{};    # default value returned is empty string
    open( my $fh, q{<}, $path ) or croak $OS_ERROR;
    while (<$fh>) {
        next if $INPUT_LINE_NUMBER == 1;    # skip column heading line
        my $file_str =
          _get_file_str( $_, $delim );    # have to declare as can be empty (!)
        next if nocontent($file_str);
        if ( $eq_fn->( $str, $file_str ) ) {
            $val = _get_val( $_, $delim, $col_i );
            last;
        }
    }
    close $fh or croak $OS_ERROR;
    return $val;
}

sub _get_val {
    my ( $line, $delim, $col_i ) = @_;

# if the line has quoted fields, and uses the delimiter within the quotes,
# as in SUBTLEX-PT, need to firstly clean the line up:
# this "fix" assumes the quotes are either double- or single quotes and nothing else,
# and there is no trailing delimiter.
# It strips the quotes, and replaces the comma with a vertical bar:
    $line =~ s/["']([^"'\Q$delim\E]+)\Q$delim\E([^"'\Q$delim\E]+)["']/$1|$2/gxsm;

    $line =~ m/^(
            [^\Q$delim\E]* # any character from the start not including the delimiter (which might be \t)
            \Q$delim\E # now ending with the delimiter, perhaps as a quoted string 
            )
            {$col_i,}? # as many times as necessary to get to the required field value
            ([^\Q$delim\E]*) # which should be here
            /msx;
    return _clean_value($2);    # now format the number, strip space ...
}

sub _get_val_for_strs {
    my ( $str, $col_i_aref, $path, $delim, $eq_fn ) = @_;

    # Check we have a string, and valid filed indices:
    croak
      'No word to test; pass a letter-string named \'string\' to the function'
      if nocontent($str);
    croak "The requested value is not defined for the SUBTLEX corpus"
      if any { nocontent($_) } @{$col_i_aref};

    my $val = [];

    # Search for the string, and isolate the requested values:
    open( my $fh, q{<}, $path ) or croak $OS_ERROR;
    while (<$fh>) {
        next if $INPUT_LINE_NUMBER == 1;    # skip column heading line
        my $file_str =
          _get_file_str( $_, $delim );    # have to declare as can be empty (!)
        next if nocontent($file_str);
        if ( $eq_fn->( $str, $file_str ) ) {
            my @line = split m/\Q$delim\E/xms;
            for my $col_i ( @{$col_i_aref} ) {
                push @{$val}, _clean_value( $line[$col_i] );
            }
            last;
        }
    }
    close $fh or croak;

# return the reference to array if there is more than 1 value, otherwise just the single value itself
    ## but if the string itself was not found, return the empty string for the number of requested fields:
    my $n_vals = scalar grep { hascontent($_) } @{$val};
    return
        $n_vals
      ? $n_vals > 1
          ? $val
          : $val->[0]
      : scalar @{$col_i_aref} > 1 ? [ q{} x scalar @{$col_i_aref} ]
      :                             q{};
}

sub _get_any_vals_for_string_list {
    my ( $str_aref, $col_i_href, $path, $delim, $eq_fn ) = @_;
    my %string_vals = ();
    my @usr_strings = sort { $a cmp $b } @{$str_aref};

    # Search for the string, and isolate the requested values:
    open( my $fh, q{<}, $path ) or croak $OS_ERROR;
    while (<$fh>) {
        next if $INPUT_LINE_NUMBER == 1;    # skip column heading line
        my $file_str =
          _get_file_str( $_, $delim );    # have to declare as can be empty (!)
        next if nocontent($file_str);
        if ( my $found = first { $eq_fn->( $_, $file_str ) } @usr_strings ) {
            my @line = split m/\Q$delim\E/xms;    # split the line
            for my $col_i ( keys %{$col_i_href} ) {
                $string_vals{$file_str}->{ $col_i_href->{$col_i} } =
                  _clean_value( $line[$col_i] );
            }
            last if scalar keys %string_vals == scalar @{$str_aref};
            splice @usr_strings, ( firstidx { $_ eq $found } @usr_strings ), 1;

            #print STDERR "checking ",join(q{,}, @usr_strings),"\n";
        }
    }
    close $fh or croak;
    return \%string_vals;

# return the reference to array if there is more than 1 value, otherwise just the single value itself
    ## but if the string itself was not found, return the empty string for the number of requested fields:
#my $n_vals = scalar grep { hascontent($_) } @{$val};
#return $n_vals ? $n_vals > 1 ? $val : $val->[0] : scalar @{$col_i_aref} > 1 ? [q{} x scalar @{$col_i_aref}] : q{};
}

# Loads a hash-ref of the "specs" for each language file, including the field indices in each file for the measures they contain:
## Called only by new() after setting the MODULE_DIR
sub _set_spec_hash {
    my ( $self, $fieldpath ) = @_;
    $fieldpath ||= File::Spec->catfile( $self->{'_MODULE_DIR'}, 'specs.csv' );
    $self->{'_FIELDS'} = Text::CSV::Hashify->new(
        { file => $fieldpath, format => 'hoh', key => 'Lang_stub' } );
    return;
}

sub _in_range {
    my ( $n, $min, $max ) = @_;
    my $res = 1;
    if ( !is_numeric($n) ) {
        $res = 0;
    }
    else {
        if ( hascontent($min) and $n < $min ) {    # fails min
            $res = 0;
        }
        if ( $res && ( hascontent($max) and $n > $max ) ) {  # fails max and min
            $res = 0;
        }
    }
    return $res;
}

sub _clean_value {
    my $val = shift;
    return q{} if nocontent($val);
    $val =~ s/,([^,]+)$/.$1/xsm;    # replace ultimate , with .
    return trim( unquote($val) );
}

sub _pos_is {
    my ( $pos_aref, $fields, $lang ) = @_;
    $pos_aref = [$pos_aref] if !ref $pos_aref;
    my @test_str = map { split /[\W\.]+/xsm } @{$pos_aref};
    return [qw/UK/] if !scalar @test_str;
    my @pos_ari = ();
    for my $pos_str (@test_str) {



( run in 1.130 second using v1.01-cache-2.11-cpan-71847e10f99 )