Lingua-Norms-SUBTLEX
view release on metacpan or search on metacpan
lib/Lingua/Norms/SUBTLEX.pm view on Meta::CPAN
hascontent( $args{'scale'} )
? $self->{'_FIELDS'}
->datum( $self->{'_LANG'}, 'frq_' . $args{'scale'} . '_idx' )
: $self->{'_FIELDS'}->datum( $self->{'_LANG'}, 'frq_opm_idx' );
my @vals = ();
for my $str ( @{$strs} ) {
push @vals,
_get_val_for_str( _get_usr_str($str), $col_i,
map { $self->{$_} } (qw/_PATH _DELIM _EQ/) );
}
return @vals;
}
=head2 Retrieving letter-strings/words
=head3 select_strings
$aref = $subtlex->select_strings(frq_opm => [1, 20], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
$aref = $subtlex->select_strings(frq_zipf => [0, 2], length => [4, 4], cv_pattern => 'CVCV', regex => '^f');
I<Alias>: select_words
Returns a list of strings (presumably words) from the SUBTLEX corpus that satisfies certain criteria, as per the following arguments:
=over 2
=item length
minimum and/or maximum length of the string (or "letter-length")
=item frq_opm, frq_log, cd_count, etc.
minimum and/or maximum frequency (as given in whatever unit offered by the datafile for the set language)
=item cv_pattern
a consonant-vowel pattern, given as a string by the usual convention, e.g., 'CCVCC' defines a 5-letter word starting and ending with pairs of consonants, the pairs separated by a vowel. 'Y' is defined here as a consonant. The tested strings are strip...
=item regex
a regular expression (L<perlretut|perlretut>). In the examples above, only letter-strings starting with the letter 'f', followed by one of more other letters, are specified for retrieval. Alternatively, for example, the regex value '[^aeiouy]$' speci...
=back
For the minimum/maximum constrained criteria, the two limits are given as a referenced array where the first element is the minimum and the second element is the maximum. For example, [3, 7] would specify letter-strings of 3 to 7 letters in length; [...
The value returned is always a reference to the list of words retrieved (or to an empty list if none was retrieved).
Calling this method as "list_strings" or "list_words" is deprecated; to avoid confusion with L<all_strings|Lingua::Norms::SUBTLEX/all_strings>, which also returns a list of strings. A deprecation warning and wrap to the method is in place as of versi...
=cut
sub select_strings {
my ( $self, %args ) = @_;
my %patterns = ();
if ( hascontent( $args{'regex'} ) ) {
$patterns{'regex'} = qr/$args{'regex'}/xms;
}
if ( hascontent( $args{'cv_pattern'} ) ) {
my $tmp = q{};
my @c = split m//ms, uc( $args{'cv_pattern'} );
foreach (@c) {
$tmp .= $_ eq 'C' ? '[BCDFGHJKLMNPQRSTVWXYZ]' : '[AEIOU]';
}
$patterns{'cv_pattern'} = qr/^$tmp$/ixms;
}
my @list = ();
open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
LINES:
while (<$fh>) {
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
my @line = split m/\Q$self->{'_DELIM'}\E/xms;
next if !_in_range( length( $line[0] ), @{ $args{'length'} } );
for ( keys %patterns ) {
next LINES if unidecode( $line[0] ) !~ $patterns{$_};
}
for (qw/frq_count frq_opm frq_log frq_zipf cd_count cd_pct cd_log/) {
if (
ref $args{$_}
and hascontent(
$self->{'_FIELDS'}->datum( $self->{'_LANG'}, $_ . '_idx' )
)
)
{
next LINES
if !_in_range(
_clean_value(
$line[
$self->{'_FIELDS'}
->datum( $self->{'_LANG'}, $_ . '_idx' )
]
),
@{ $args{$_} }
);
}
}
if ( ref $args{'pos'} ) {
next LINES
if none {
$_ eq $line[ $self->{'_FIELDS'}
->datum( $self->{'_LANG'}, 'pos_dom_idx' ) ]
}
@{ $args{'pos'} };
}
push @list, $line[0];
}
close $fh or croak $OS_ERROR;
return \@list;
}
*select_words = \&select_strings;
=head3 all_strings
$aref = $subtlex->all_strings();
I<Alias>: all_words
Returns a reference to an array of all letter-strings in the corpus. These are culled of empty and duplicate strings, and then alphabetically sorted.
=cut
sub all_strings {
my ( $self, %args ) = @_;
my @list = ();
open my $fh, q{<}, $self->{'_PATH'} or croak $OS_ERROR;
while (<$fh>) {
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
push @list, _get_file_str( $_, $self->{'_DELIM'} );
}
close $fh or croak $OS_ERROR;
return [ sort { lc($a) cmp lc($b) } uniq( grep { hascontent($_) } @list ) ];
}
*all_words = \&all_strings;
=head3 random_string
$string = $subtlex->random_string();
@data = $subtlex->random_string();
I<Alias>: random_word
Picks a random line from the corpus, using L<File::RandomLine|File::RandomLine> (except the top header line). Returns the word in that line if called in scalar context; otherwise, the array of data for that line. (A future version might let specifyin...
=cut
sub random_string {
my ( $self, %args ) = @_;
eval { require File::RandomLine; };
croak 'Need to install and access module File::RandomLine' if $EVAL_ERROR;
my $rl =
File::RandomLine->new( $self->{'_PATH'}, { algorithm => 'uniform' } );
my @ari = ();
while ( not scalar @ari or $ari[0] eq 'Word' ) {
@ari = split m/\Q$self->{'_DELIM'}\E/xms, $rl->next;
}
return wantarray ? @ari : $ari[0];
}
*random_word = \&random_string;
=head2 Miscellaneous
=head3 n_lines
$num = $subtlex->n_lines();
Returns the number of lines, less the column headings and any lines with no content, in the installed language file. Expects/accepts no arguments.
=cut
sub n_lines {
my $self = shift;
my $z = 0;
open( my $fh, q{<}, $self->{'_PATH'} ) or croak $OS_ERROR;
while (<$fh>) {
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
next if nocontent($_);
$z++;
}
close $fh or croak $OS_ERROR;
return $z;
}
*nlines = \&n_lines; # legacy alias
=head3 pct_alpha
Returns the percentage of strings in the subtitles file that satisfy "look like words" relative to the number of lines (as per L<n_lines|Lingua::Norms::SUBTLEX/n_lines>). Specifically, after ASCII transliteration of the string (per L<Text::Unidecode|...
=cut
sub pct_alpha {
my ( $self, %args ) = @_;
my $all_strs_aref = $self->all_strings();
my $count_all = count( @{$all_strs_aref} );
my $pct = q{};
if ( $count_all > 0 ) {
my $count_alpha_strings = count( grep { m/[\p{XPosixAlpha}\-']/xsm }
map { unidecode($_) } @{$all_strs_aref} );
$pct = 100 * $count_alpha_strings / $count_all;
}
return $pct;
}
=head3 set_lang
$lang = $subtlex->set_lang(lang => STR); # DE, FR, NL_all, NL_min, PT, UK or US
$lang = $subtlex->set_lang(lang => STR, path => 'this/is/the/file.csv');
$lang = $subtlex->set_lang(lang => STR, dir => 'file/is/in/here');
Set or guess location of datafile; see L<new|Lingua::Norms::SUBTLEX/new>. Naturally, the given value of B<lang> (required)--which is used as a database ID--should correspond with any given B<path> to the SUBTLEX datafile (optional but recommended). I...
=cut
sub set_lang {
my ( $self, %args ) = @_;
lib/Lingua/Norms/SUBTLEX.pm view on Meta::CPAN
if nocontent($str);
croak "The requested value is not defined for the current SUBTLEX corpus"
if nocontent($col_i);
my $val = q{}; # default value returned is empty string
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
while (<$fh>) {
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
my $file_str =
_get_file_str( $_, $delim ); # have to declare as can be empty (!)
next if nocontent($file_str);
if ( $eq_fn->( $str, $file_str ) ) {
$val = _get_val( $_, $delim, $col_i );
last;
}
}
close $fh or croak $OS_ERROR;
return $val;
}
sub _get_val {
my ( $line, $delim, $col_i ) = @_;
# if the line has quoted fields, and uses the delimiter within the quotes,
# as in SUBTLEX-PT, need to firstly clean the line up:
# this "fix" assumes the quotes are either double- or single quotes and nothing else,
# and there is no trailing delimiter.
# It strips the quotes, and replaces the comma with a vertical bar:
$line =~ s/["']([^"'\Q$delim\E]+)\Q$delim\E([^"'\Q$delim\E]+)["']/$1|$2/gxsm;
$line =~ m/^(
[^\Q$delim\E]* # any character from the start not including the delimiter (which might be \t)
\Q$delim\E # now ending with the delimiter, perhaps as a quoted string
)
{$col_i,}? # as many times as necessary to get to the required field value
([^\Q$delim\E]*) # which should be here
/msx;
return _clean_value($2); # now format the number, strip space ...
}
sub _get_val_for_strs {
my ( $str, $col_i_aref, $path, $delim, $eq_fn ) = @_;
# Check we have a string, and valid filed indices:
croak
'No word to test; pass a letter-string named \'string\' to the function'
if nocontent($str);
croak "The requested value is not defined for the SUBTLEX corpus"
if any { nocontent($_) } @{$col_i_aref};
my $val = [];
# Search for the string, and isolate the requested values:
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
while (<$fh>) {
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
my $file_str =
_get_file_str( $_, $delim ); # have to declare as can be empty (!)
next if nocontent($file_str);
if ( $eq_fn->( $str, $file_str ) ) {
my @line = split m/\Q$delim\E/xms;
for my $col_i ( @{$col_i_aref} ) {
push @{$val}, _clean_value( $line[$col_i] );
}
last;
}
}
close $fh or croak;
# return the reference to array if there is more than 1 value, otherwise just the single value itself
## but if the string itself was not found, return the empty string for the number of requested fields:
my $n_vals = scalar grep { hascontent($_) } @{$val};
return
$n_vals
? $n_vals > 1
? $val
: $val->[0]
: scalar @{$col_i_aref} > 1 ? [ q{} x scalar @{$col_i_aref} ]
: q{};
}
sub _get_any_vals_for_string_list {
my ( $str_aref, $col_i_href, $path, $delim, $eq_fn ) = @_;
my %string_vals = ();
my @usr_strings = sort { $a cmp $b } @{$str_aref};
# Search for the string, and isolate the requested values:
open( my $fh, q{<}, $path ) or croak $OS_ERROR;
while (<$fh>) {
next if $INPUT_LINE_NUMBER == 1; # skip column heading line
my $file_str =
_get_file_str( $_, $delim ); # have to declare as can be empty (!)
next if nocontent($file_str);
if ( my $found = first { $eq_fn->( $_, $file_str ) } @usr_strings ) {
my @line = split m/\Q$delim\E/xms; # split the line
for my $col_i ( keys %{$col_i_href} ) {
$string_vals{$file_str}->{ $col_i_href->{$col_i} } =
_clean_value( $line[$col_i] );
}
last if scalar keys %string_vals == scalar @{$str_aref};
splice @usr_strings, ( firstidx { $_ eq $found } @usr_strings ), 1;
#print STDERR "checking ",join(q{,}, @usr_strings),"\n";
}
}
close $fh or croak;
return \%string_vals;
# return the reference to array if there is more than 1 value, otherwise just the single value itself
## but if the string itself was not found, return the empty string for the number of requested fields:
#my $n_vals = scalar grep { hascontent($_) } @{$val};
#return $n_vals ? $n_vals > 1 ? $val : $val->[0] : scalar @{$col_i_aref} > 1 ? [q{} x scalar @{$col_i_aref}] : q{};
}
# Loads a hash-ref of the "specs" for each language file, including the field indices in each file for the measures they contain:
## Called only by new() after setting the MODULE_DIR
sub _set_spec_hash {
my ( $self, $fieldpath ) = @_;
$fieldpath ||= File::Spec->catfile( $self->{'_MODULE_DIR'}, 'specs.csv' );
$self->{'_FIELDS'} = Text::CSV::Hashify->new(
{ file => $fieldpath, format => 'hoh', key => 'Lang_stub' } );
return;
}
sub _in_range {
my ( $n, $min, $max ) = @_;
my $res = 1;
if ( !is_numeric($n) ) {
$res = 0;
}
else {
if ( hascontent($min) and $n < $min ) { # fails min
$res = 0;
}
if ( $res && ( hascontent($max) and $n > $max ) ) { # fails max and min
$res = 0;
}
}
return $res;
}
sub _clean_value {
my $val = shift;
return q{} if nocontent($val);
$val =~ s/,([^,]+)$/.$1/xsm; # replace ultimate , with .
return trim( unquote($val) );
}
sub _pos_is {
my ( $pos_aref, $fields, $lang ) = @_;
$pos_aref = [$pos_aref] if !ref $pos_aref;
my @test_str = map { split /[\W\.]+/xsm } @{$pos_aref};
return [qw/UK/] if !scalar @test_str;
my @pos_ari = ();
for my $pos_str (@test_str) {
( run in 1.130 second using v1.01-cache-2.11-cpan-71847e10f99 )