App-Test-Generator
view release on metacpan or search on metacpan
lib/App/Test/Generator/SchemaExtractor.pm view on Meta::CPAN
package App::Test::Generator::SchemaExtractor;
use strict;
use warnings;
use autodie qw(:all);
use App::Test::Generator::Model::Method;
use App::Test::Generator::Analyzer::Complexity;
use App::Test::Generator::Analyzer::Return;
use App::Test::Generator::Analyzer::ReturnMeta;
use App::Test::Generator::Analyzer::SideEffect;
use Carp qw(carp croak);
use PPI;
use Pod::Simple::Text;
use File::Basename;
use File::Path qw(make_path);
use Params::Get;
use Safe;
use Scalar::Util qw(looks_like_number);
use YAML::XS;
use IPC::Open3;
use JSON::MaybeXS qw(encode_json decode_json);
use Readonly;
use Symbol qw(gensym);
# --------------------------------------------------
# Confidence score thresholds for input and output analysis
# --------------------------------------------------
Readonly my $CONFIDENCE_HIGH_THRESHOLD => 60;
Readonly my $CONFIDENCE_MEDIUM_THRESHOLD => 35;
Readonly my $CONFIDENCE_LOW_THRESHOLD => 15;
# --------------------------------------------------
# Confidence level label strings
# --------------------------------------------------
Readonly my $LEVEL_HIGH => 'high';
Readonly my $LEVEL_MEDIUM => 'medium';
Readonly my $LEVEL_LOW => 'low';
Readonly my $LEVEL_VERY_LOW => 'very_low';
Readonly my $LEVEL_NONE => 'none';
# --------------------------------------------------
# Analysis limits
# --------------------------------------------------
Readonly my $DEFAULT_MAX_PARAMETERS => 20;
Readonly my $DEFAULT_CONFIDENCE_THRESH => 0.5;
Readonly my $POD_WALK_LIMIT => 200;
Readonly my $SIGNATURE_TIMEOUT_SECS => 3;
Readonly my $MEMORY_LIMIT_BYTES => 50_000_000;
# --------------------------------------------------
# Numeric boundary values for test hint generation
# --------------------------------------------------
Readonly my $INT32_MAX => 2_147_483_647;
# --------------------------------------------------
# Boolean return score thresholds
# --------------------------------------------------
Readonly my $BOOLEAN_SCORE_THRESHOLD => 30;
=head1 NAME
App::Test::Generator::SchemaExtractor - Extract test schemas from Perl modules
=head1 VERSION
Version 0.33
=cut
our $VERSION = '0.33';
=head1 SYNOPSIS
use App::Test::Generator::SchemaExtractor;
my $extractor = App::Test::Generator::SchemaExtractor->new(
input_file => 'lib/MyModule.pm',
output_dir => 'schemas/',
lib/App/Test/Generator/SchemaExtractor.pm view on Meta::CPAN
#
# Entry: $hash - a PPI node with a children()
# method, typically a
# PPI::Structure::Block from
# a validate_strict call.
#
# Exit: Returns a hashref with keys:
# input - hashref of param specs
# input_style - 'hash'
# _confidence - confidence hashref
# or undef if parsing fails.
#
# Side effects: None.
# --------------------------------------------------
sub _parse_schema_hash {
my ($self, $hash) = @_;
my %result;
for my $child ($hash->children) {
# skip whitespace and operators
if ($child->isa('PPI::Statement') || $child->isa('PPI::Statement::Expression')) {
my ($key, $val);
my @tokens = grep {
!$_->isa('PPI::Token::Whitespace') &&
!$_->isa('PPI::Token::Operator')
} $child->children;
for (my $i = 0; $i < @tokens - 1; $i++) {
if(($tokens[$i]->isa('PPI::Token::Word') || $tokens[$i]->isa('PPI::Token::Quote')) &&
$tokens[$i+1]->isa('PPI::Structure::Constructor')) {
$key = $tokens[$i]->content;
$key =~ s/^['"]|['"]$//g;
$val = $tokens[$i+1];
last;
}
}
next unless $key && $val;
my %param;
for my $inner ($val->children) {
next unless $inner->isa('PPI::Statement') || $inner->isa('PPI::Statement::Expression');
my ($k, undef, $v) = grep {
!$_->isa('PPI::Token::Whitespace') &&
!$_->isa('PPI::Token::Operator')
} $inner->children;
next unless $k && $v;
my $keyname = $k->content;
my $value = $v->can('content') ? $v->content : undef;
$value =~ s/^['"]|['"]$//g if defined $value;
if ($keyname eq 'type') {
$param{type} = lc($value);
} elsif ($keyname eq 'optional') {
$param{optional} = $value ? 1 : 0;
} elsif ($keyname =~ /^(min|max)$/ && looks_like_number($value)) {
$param{$keyname} = 0 + $value;
} elsif ($keyname eq 'matches') {
$param{matches} = qr/$value/;
}
}
$param{type} //= 'string';
$param{optional} //= 0;
$result{$key} = \%param;
}
}
return {
input => \%result,
input_style => 'hash',
_confidence => {
input => {
level => 'high',
factors => ['Input schema extracted from validator'],
},
},
};
}
# --------------------------------------------------
# _ppi
#
# Purpose: Return a PPI::Document for a code
# string, using a per-instance cache
# to avoid re-parsing the same string
# multiple times during a single
# analysis pass.
#
# Entry: $code - either a string of Perl source
# code, or an object that
# already has a find() method
# (returned as-is).
#
# Exit: Returns a PPI::Document, or the
# original object if it already
# supports find().
#
# Side effects: Populates $self->{_ppi_cache}.
# --------------------------------------------------
sub _ppi {
my ($self, $code) = @_;
return $code if ref($code) && $code->can('find');
$self->{_ppi_cache} ||= {};
return $self->{_ppi_cache}{$code} //= PPI::Document->new(\$code);
}
# --------------------------------------------------
# _extract_pvs_schema
#
# Purpose: Detect and extract a parameter schema
# from a Params::Validate::Strict
# validate_strict() call in the method
lib/App/Test/Generator/SchemaExtractor.pm view on Meta::CPAN
if(exists $output->{type}) {
if(!$valid_types{$output->{type}}) {
$self->_log(" WARNING Output value type is unknown: '$output->{type}', setting to string");
$output->{type} = 'string';
}
}
}
# --------------------------------------------------
# _parse_constraints
#
# Purpose: Parse a constraint string extracted
# from POD documentation and populate
# min, max, or other constraint fields
# in a parameter hashref.
#
# Entry: $param - hashref for the parameter
# being annotated (modified
# in place).
# $constraint - the constraint string,
# e.g. '3-50', 'positive',
# '>= 0', 'min 3'.
#
# Exit: Returns nothing. Modifies $param in
# place by setting min and/or max keys.
#
# Side effects: Logs min/max values to stdout when
# verbose is set.
# --------------------------------------------------
sub _parse_constraints {
my ($self, $param, $constraint) = @_;
# Range: "3-50" or "1-100 chars"
if ($constraint =~ /(\d+)\s*-\s*(\d+)/) {
$param->{min} = $1;
$param->{max} = $2;
}
elsif ($constraint =~ /(\d+)\s*\.\.\s*(\d+)/) {
# Range: 0..19
$param->{min} = $1;
$param->{max} = $2;
}
# Minimum: "min 3" or "at least 5"
elsif ($constraint =~ /(?:min|minimum|at least)\s*(\d+)/i) {
$param->{min} = $1;
}
# Maximum: "max 50" or "up to 100"
elsif ($constraint =~ /(?:max|maximum|up to)\s*(\d+)/i) {
$param->{max} = $1;
}
# Positive
elsif ($constraint =~ /positive/i) {
$param->{min} = 1 if $param->{type} && $param->{type} eq 'integer';
$param->{min} = 0.01 if $param->{type} && $param->{type} eq 'number';
}
# Non-negative
elsif ($constraint =~ /non-negative/i) {
$param->{min} = 0;
} elsif($constraint =~ /(.+)?\s(.+)/) {
my ($op, $val) = ($1, $2);
if(looks_like_number($val)) {
if ($op eq '<') {
$param->{max} = $val - 1;
} elsif ($op eq '<=') {
$param->{max} = $val;
} elsif ($op eq '>') {
$param->{min} = $val + 1;
} elsif ($op eq '>=') {
$param->{min} = $val;
}
}
}
if(defined($param->{max})) {
$self->_log(" Set max to $param->{max}");
}
if(defined($param->{min})) {
$self->_log(" Set min to $param->{min}");
}
}
# --------------------------------------------------
# _analyze_code
#
# Purpose: Analyse a method's source code using
# pattern matching to infer parameter
# names, types, constraints, defaults,
# and optionality. Orchestrates all
# per-parameter code analysis helpers.
#
# Entry: $code - method body source string.
# $method - method hashref (used for
# constructor-specific logic
# when extracting parameters
# from @_ patterns).
#
# Exit: Returns a hashref of parameter name
# to parameter spec hashref, with as
# much type and constraint information
# as could be inferred from the code.
#
# Side effects: Logs progress and warnings to stdout
# when verbose is set.
#
# Notes: Analysis is capped at max_parameters
# to prevent runaway processing on
# pathological methods. Falls back to
# classic @_ extraction if signature
# extraction found no parameters.
# --------------------------------------------------
sub _analyze_code {
my ($self, $code, $method) = @_;
my %params;
# Safety check - limit parameter analysis to prevent runaway processing
my $param_count = 0;
# Extract parameter names from various signature styles
$self->_extract_parameters_from_signature(\%params, $code);
lib/App/Test/Generator/SchemaExtractor.pm view on Meta::CPAN
# operators, and path manipulation
# patterns involving the parameter.
#
# Entry: $p - parameter hashref (modified
# in place).
# $param - parameter name string.
# $code - method body source string.
#
# Exit: Returns nothing. Modifies $p in place,
# setting type, isa, and semantic keys.
# Returns immediately on first match.
#
# Side effects: Logs detections to stdout when
# verbose is set.
# --------------------------------------------------
sub _detect_filehandle_type {
my ($self, $p, $param, $code) = @_;
return unless defined $param && $param =~ /^\w+$/;
# File handle operations
if ($code =~ /(?:open|close|read|print|say|sysread|syswrite)\s*\(?\s*\$$param/) {
$p->{type} = 'object';
$p->{isa} = 'IO::Handle';
$p->{semantic} = 'filehandle';
$self->_log(" ADVANCED: $param is a file handle");
return;
}
# Filehandle-specific operations
if ($code =~ /\$$param\s*->\s*(readline|getline|print|say|close|flush|autoflush)/) {
$p->{type} = 'object';
$p->{isa} = 'IO::Handle';
$p->{semantic} = 'filehandle';
$self->_log(" ADVANCED: $param uses filehandle methods");
return;
}
# File test operators
if ($code =~ /(?:-[frwxoOeszlpSbctugkTBMAC])\s+\$$param/) {
$p->{type} = 'string';
$p->{semantic} = 'filepath';
$self->_log(" ADVANCED: $param is tested as file path");
return;
}
# File::Spec operations or path manipulation
if ($code =~ /File::(?:Spec|Basename)::\w+\s*\(\s*\$$param/ ||
$code =~ /(?:basename|dirname|fileparse)\s*\(\s*\$$param/) {
$p->{type} = 'string';
$p->{semantic} = 'filepath';
$self->_log(" ADVANCED: $param manipulated as file path");
return;
}
# Path validation patterns
# Only match a literal path assigned or defaulted to this variable
if(defined $p->{_default} && $p->{_default} =~ m{^([A-Za-z]:\\|/|\./|\.\./)}) {
$p->{type} = 'string';
$p->{semantic} = 'filepath';
$self->_log(" ADVANCED: $param default looks like a path");
return;
}
# IO::File detection
if ($code =~ /\$$param\s*->\s*isa\s*\(\s*['"]IO::File['"]\s*\)/ ||
$code =~ /IO::File\s*->\s*new\s*\(\s*\$$param/) {
$p->{type} = 'object';
$p->{isa} = 'IO::File';
$p->{semantic} = 'filehandle';
$self->_log(" ADVANCED: $param is IO::File object");
return;
}
}
# --------------------------------------------------
# _detect_coderef_type
#
# Purpose: Detect coderef and callback parameters
# by analysing ref() checks, invocation
# patterns, and parameter naming
# conventions.
#
# Entry: $p - parameter hashref (modified
# in place).
# $param - parameter name string.
# $code - method body source string.
#
# Exit: Returns nothing. Modifies $p in place,
# setting type and semantic keys.
# Returns immediately on first match.
#
# Side effects: Logs detections to stdout when
# verbose is set.
# --------------------------------------------------
sub _detect_coderef_type {
my ($self, $p, $param, $code) = @_;
return unless defined $param && $param =~ /^\w+$/;
# ref() check for CODE
if ($code =~ /ref\s*\(\s*\$$param\s*\)\s*eq\s*['"]CODE['"]/i) {
$p->{type} = 'coderef';
$p->{semantic} = 'callback';
$self->_log(" ADVANCED: $param is coderef (ref check)");
return;
}
# Invocation as coderef - note the escaped @ in \@_
if ($code =~ /\$$param\s*->\s*\(/ ||
$code =~ /\$$param\s*->\s*\(\s*\@_\s*\)/ ||
$code =~ /&\s*\{\s*\$$param\s*\}/) {
$p->{type} = 'coderef';
$p->{semantic} = 'callback';
$self->_log(" ADVANCED: $param invoked as coderef");
return;
}
# Parameter name suggests callback
if ($param =~ /^(?:callback|cb|handler|sub|code|fn|func|on_\w+)$/i) {
$p->{type} = 'coderef';
lib/App/Test/Generator/SchemaExtractor.pm view on Meta::CPAN
# Mark required
$$p->{optional} = 0;
if ($op eq '<=') {
$$p->{min} = $num + 1;
} elsif ($op eq '<') {
$$p->{min} = $num;
} elsif ($op eq '>=') {
$$p->{max} = $num - 1;
} elsif ($op eq '>') {
$$p->{max} = $num;
}
$self->_log(" ERROR: $param normalized constraint from '$op $num'");
}
}
# --------------------------------------------------
# _extract_parameters_from_signature
#
# Purpose: Extract parameter names and positions
# from a method's signature, trying
# modern Perl subroutine signatures
# first and falling back to traditional
# @_ extraction styles.
#
# Entry: $params - hashref to populate with
# parameter specs (modified
# in place).
# $code - method body source string.
#
# Exit: Returns nothing. Populates $params.
#
# Side effects: Logs detections to stdout when
# verbose is set.
#
# Notes: Three traditional styles are
# supported: (1) my ($self, ...) = @_,
# (2) my $self = shift; my $x = shift,
# (3) my $x = $_[N]. $self and $class
# are always excluded from the returned
# parameters.
# --------------------------------------------------
sub _extract_parameters_from_signature {
my ($self, $params, $code) = @_;
# Modern Style: Subroutine signatures with attributes
# Handle multi-line signatures
# sub foo :attr1 :attr2(val) (
# $self,
# $x :Type,
# $y = default
# ) { }
# Try to match signature after attributes
# Look for the parameter list - it's the last (...) before the opening brace
# that contains sigils ($, %, @)
if ($code =~ /sub\s+\w+\s*(?::\w+(?:\([^)]*\))?\s*)*\(((?:[^()]|\([^)]*\))*)\)\s*\{/s) {
my $potential_sig = $1;
# Check if this looks like parameters (has sigils)
if ($potential_sig =~ /[\$\%\@]/) {
$self->_log(" SIG: Found modern signature: ($potential_sig)");
$self->_parse_modern_signature($params, $potential_sig);
return;
}
}
# Traditional Style 1: my ($self, $arg1, $arg2) = @_;
if ($code =~ /my\s*\(\s*([^)]+)\)\s*=\s*\@_/s) {
my $sig = $1;
my $pos = 0;
while ($sig =~ /\$(\w+)/g) {
my $name = $1;
next if $name =~ /^(self|class)$/i;
$params->{$name} //= {
_source => 'code',
optional => 1,
};
$params->{$name}{position} = $pos unless exists $params->{$name}{position};
$pos++;
}
return;
} elsif ($code =~ /my\s+\$self\s*=\s*shift/) {
# Traditional Style 2: my $self = shift; my $arg1 = shift;
my @shifts;
while ($code =~ /my\s+\$(\w+)\s*=\s*shift/g) {
push @shifts, $1;
}
shift @shifts if @shifts && $shifts[0] =~ /^(self|class)$/i;
my $pos = 0;
foreach my $param (@shifts) {
$params->{$param} ||= { _source => 'code', optional => 1, position => $pos++ };
}
return;
}
# Traditional Style 3: Function parameters (no $self)
if ($code =~ /my\s*\(\s*([^)]+)\)\s*=\s*\@_/s) {
my $sig = $1;
my @param_names = $sig =~ /\$(\w+)/g;
my $pos = 0;
foreach my $param (@param_names) {
next if $param =~ /^(self|class)$/i;
$params->{$param} ||= { _source => 'code', optional => 1, position => $pos++ };
}
}
# De-duplicate
my %seen;
foreach my $param (keys %$params) {
if ($seen{$param}++) {
$self->_log(" WARNING: Duplicate parameter '$param' found");
}
}
}
lib/App/Test/Generator/SchemaExtractor.pm view on Meta::CPAN
}
# --------------------------------------------------
# _analyze_parameter_constraints
#
# Purpose: Infer min, max, and regex match
# constraints for a single parameter
# from length checks, numeric
# comparisons, and regex match
# patterns in the method body.
#
# Entry: $p_ref - reference to the parameter
# hashref (modified in place).
# $param - parameter name string.
# $code - method body source string.
#
# Exit: Returns nothing. Modifies the
# referenced parameter hashref.
#
# Side effects: Logs detections to stdout when
# verbose is set.
#
# Notes: Numeric comparisons that appear
# inside die/croak guard conditions
# are excluded to avoid inferring
# invalid-input ranges as valid
# constraints.
# --------------------------------------------------
sub _analyze_parameter_constraints {
my ($self, $p_ref, $param, $code) = @_;
my $p = $$p_ref;
# Do not treat comparisons inside die/croak/confess as valid constraints
my $guarded = 0;
if ($code =~ /(die|croak|confess)\b[^{;]*\bif\b[^{;]*\$$param\b/s) {
$guarded = 1;
}
# Length checks for strings
if ($code =~ /length\s*\(\s*\$$param\s*\)\s*([<>]=?)\s*(\d+)/) {
my ($op, $val) = ($1, $2);
$p->{type} ||= 'string';
if ($op eq '<') {
$p->{max} = $val - 1;
} elsif ($op eq '<=') {
$p->{max} = $val;
} elsif ($op eq '>') {
$p->{min} = $val + 1;
} elsif ($op eq '>=') {
$p->{min} = $val;
}
$self->_log(" CODE: $param length constraint $op $val");
}
# Numeric range checks (only if NOT part of error guard)
if (
!$guarded
&& $code =~ /\$$param\s*([<>]=?)\s*([+-]?(?:\d+\.?\d*|\.\d+))/
) {
my ($op, $val) = ($1, $2);
$p->{type} ||= looks_like_number($val) ? 'number' : 'integer';
if ($op eq '<' || $op eq '<=') {
# Only set max if it tightens the range
my $max = ($op eq '<') ? $val - 1 : $val;
$p->{max} = $max if !defined($p->{max}) || $max < $p->{max};
} elsif ($op eq '>' || $op eq '>=') {
my $min = ($op eq '>') ? $val + 1 : $val;
$p->{min} = $min if !defined($p->{min}) || $min > $p->{min};
}
}
# Regex pattern matching with better capture
if ($code =~ /\$$param\s*=~\s*((?:qr?\/[^\/]+\/|\$[\w:]+|\$\{\w+\}))/) {
my $pattern = $1;
$p->{type} ||= 'string';
# Clean up the pattern if it's a straightforward regex
if ($pattern =~ /^qr?\/([^\/]+)\/$/) {
$p->{matches} = "/$1/";
} else {
$p->{matches} = $pattern;
}
$self->_log(" CODE: $param matches pattern: $p->{matches}");
}
}
# --------------------------------------------------
# _analyze_parameter_validation
#
# Purpose: Determine optionality and extract
# default values for a single parameter
# by analysing explicit required checks
# (die/croak unless defined) and default
# assignment patterns in the method body.
#
# Entry: $p_ref - reference to the parameter
# hashref (modified in place).
# $param - parameter name string.
# $code - method body source string.
#
# Exit: Returns nothing. Modifies the
# referenced parameter hashref.
#
# Side effects: Logs detections to stdout when
# verbose is set.
#
# Notes: Explicit required checks take highest
# priority and override any default
# value detected earlier.
# --------------------------------------------------
sub _analyze_parameter_validation {
my ($self, $p_ref, $param, $code) = @_;
my $p = $$p_ref;
# Required/optional checks
my $is_required = 0;
# Die/croak if not defined
if ($code =~ /(?:die|croak|confess)\s+[^;]*unless\s+(?:defined\s+)?\$$param/s) {
$is_required = 1;
}
# Extract default values with the new method
my $default_value = $self->_extract_default_value($param, $code);
if (defined $default_value && !exists $p->{_default}) {
$p->{optional} = 1;
$p->{_default} = $default_value;
# Try to infer type from default value if not already set
unless ($p->{type}) {
if (looks_like_number($default_value)) {
$p->{type} = $default_value =~ /\./ ? 'number' : 'integer';
} elsif (ref($default_value) eq 'ARRAY') {
$p->{type} = 'arrayref';
} elsif (ref($default_value) eq 'HASH') {
$p->{type} = 'hashref';
} elsif ($default_value eq 'undef') {
$p->{type} = 'scalar'; # undef can be any scalar
} elsif (defined $default_value && !ref($default_value)) {
$p->{type} = 'string';
}
}
$self->_log(" CODE: $param has default value: " . (ref($default_value) ? ref($default_value) . ' ref' : $default_value));
}
# Also check for simple default assignment without condition
# Pattern: $param = 'value';
if (!$default_value && !exists $p->{_default} && $code =~ /\$$param\s*=\s*([^;{}]+?)(?:\s*[;}])/s) {
my $assignment = $1;
# Make sure it's not part of a larger expression
if ($assignment !~ /\$$param/ && $assignment !~ /^shift/) {
my $possible_default = $assignment;
$possible_default =~ s/\s*;\s*$//;
$possible_default = $self->_clean_default_value($possible_default);
if (defined $possible_default) {
$p->{_default} = $possible_default;
$p->{optional} = 1;
$self->_log(" CODE: $param has unconditional default: $possible_default");
}
}
}
# Explicit required check overrides default detection
if ($is_required) {
$p->{optional} = 0;
delete $p->{_default} if exists $p->{_default};
$self->_log(" CODE: $param is required (validation check)");
}
}
# --------------------------------------------------
# _merge_parameter_analyses
#
# Purpose: Merge parameter information from POD,
# code, and signature analysis into a
# single authoritative parameter hashref
# for each parameter.
#
# Entry: $pod - hashref of parameters from POD
# analysis.
# $code - hashref of parameters from
# code analysis.
# $sig - hashref of parameters from
# signature analysis (optional,
# defaults to empty hashref).
#
# Exit: Returns a merged hashref of parameter
# name to spec hashref. Each spec has
# all available information combined,
# with POD taking highest priority,
( run in 0.962 second using v1.01-cache-2.11-cpan-39bf76dae61 )