App-Test-Generator

 view release on metacpan or  search on metacpan

lib/App/Test/Generator/SchemaExtractor.pm  view on Meta::CPAN

package App::Test::Generator::SchemaExtractor;

use strict;
use warnings;
use autodie qw(:all);

use App::Test::Generator::Model::Method;
use App::Test::Generator::Analyzer::Complexity;
use App::Test::Generator::Analyzer::Return;
use App::Test::Generator::Analyzer::ReturnMeta;
use App::Test::Generator::Analyzer::SideEffect;

use Carp qw(carp croak);
use PPI;
use Pod::Simple::Text;
use File::Basename;
use File::Path qw(make_path);
use Params::Get;
use Safe;
use Scalar::Util qw(looks_like_number);
use YAML::XS;
use IPC::Open3;
use JSON::MaybeXS qw(encode_json decode_json);
use Readonly;
use Symbol qw(gensym);

# --------------------------------------------------
# Confidence score thresholds for input and output analysis
# --------------------------------------------------
Readonly my $CONFIDENCE_HIGH_THRESHOLD   => 60;
Readonly my $CONFIDENCE_MEDIUM_THRESHOLD => 35;
Readonly my $CONFIDENCE_LOW_THRESHOLD    => 15;

# --------------------------------------------------
# Confidence level label strings
# --------------------------------------------------
Readonly my $LEVEL_HIGH     => 'high';
Readonly my $LEVEL_MEDIUM   => 'medium';
Readonly my $LEVEL_LOW      => 'low';
Readonly my $LEVEL_VERY_LOW => 'very_low';
Readonly my $LEVEL_NONE     => 'none';

# --------------------------------------------------
# Analysis limits
# --------------------------------------------------
Readonly my $DEFAULT_MAX_PARAMETERS     => 20;
Readonly my $DEFAULT_CONFIDENCE_THRESH  => 0.5;
Readonly my $POD_WALK_LIMIT             => 200;
Readonly my $SIGNATURE_TIMEOUT_SECS     => 3;
Readonly my $MEMORY_LIMIT_BYTES         => 50_000_000;

# --------------------------------------------------
# Numeric boundary values for test hint generation
# --------------------------------------------------
Readonly my $INT32_MAX => 2_147_483_647;

# --------------------------------------------------
# Boolean return score thresholds
# --------------------------------------------------
Readonly my $BOOLEAN_SCORE_THRESHOLD => 30;

=head1 NAME

App::Test::Generator::SchemaExtractor - Extract test schemas from Perl modules

=head1 VERSION

Version 0.33

=cut

our $VERSION = '0.33';

=head1 SYNOPSIS

	use App::Test::Generator::SchemaExtractor;

	my $extractor = App::Test::Generator::SchemaExtractor->new(
		input_file => 'lib/MyModule.pm',
		output_dir => 'schemas/',

lib/App/Test/Generator/SchemaExtractor.pm  view on Meta::CPAN

#
# Entry:      $hash - a PPI node with a children()
#                     method, typically a
#                     PPI::Structure::Block from
#                     a validate_strict call.
#
# Exit:       Returns a hashref with keys:
#               input       - hashref of param specs
#               input_style - 'hash'
#               _confidence - confidence hashref
#             or undef if parsing fails.
#
# Side effects: None.
# --------------------------------------------------
sub _parse_schema_hash {
	my ($self, $hash) = @_;

	my %result;

	for my $child ($hash->children) {
		# skip whitespace and operators
		if ($child->isa('PPI::Statement') || $child->isa('PPI::Statement::Expression')) {
			my ($key, $val);

			my @tokens = grep {
				!$_->isa('PPI::Token::Whitespace') &&
				!$_->isa('PPI::Token::Operator')
			} $child->children;

			for (my $i = 0; $i < @tokens - 1; $i++) {
				if(($tokens[$i]->isa('PPI::Token::Word') || $tokens[$i]->isa('PPI::Token::Quote')) &&
				   $tokens[$i+1]->isa('PPI::Structure::Constructor')) {
					$key = $tokens[$i]->content;
					$key =~ s/^['"]|['"]$//g;
					$val = $tokens[$i+1];
					last;
				}
			}

			next unless $key && $val;

			my %param;
			for my $inner ($val->children) {
				next unless $inner->isa('PPI::Statement') || $inner->isa('PPI::Statement::Expression');

				my ($k, undef, $v) = grep {
					!$_->isa('PPI::Token::Whitespace') &&
					!$_->isa('PPI::Token::Operator')
				} $inner->children;

				next unless $k && $v;

				my $keyname = $k->content;
				my $value = $v->can('content') ? $v->content : undef;
				$value =~ s/^['"]|['"]$//g if defined $value;

				if ($keyname eq 'type') {
					$param{type} = lc($value);
				} elsif ($keyname eq 'optional') {
					$param{optional} = $value ? 1 : 0;
				} elsif ($keyname =~ /^(min|max)$/ && looks_like_number($value)) {
					$param{$keyname} = 0 + $value;
				} elsif ($keyname eq 'matches') {
					$param{matches} = qr/$value/;
				}
			}

			$param{type} //= 'string';
			$param{optional} //= 0;

			$result{$key} = \%param;
		}
	}

	return {
		input => \%result,
		input_style => 'hash',
		_confidence => {
			input => {
				level => 'high',
				factors => ['Input schema extracted from validator'],
			},
		},
	};
}

# --------------------------------------------------
# _ppi
#
# Purpose:    Return a PPI::Document for a code
#             string, using a per-instance cache
#             to avoid re-parsing the same string
#             multiple times during a single
#             analysis pass.
#
# Entry:      $code - either a string of Perl source
#                     code, or an object that
#                     already has a find() method
#                     (returned as-is).
#
# Exit:       Returns a PPI::Document, or the
#             original object if it already
#             supports find().
#
# Side effects: Populates $self->{_ppi_cache}.
# --------------------------------------------------
sub _ppi {
	my ($self, $code) = @_;

	return $code if ref($code) && $code->can('find');

	$self->{_ppi_cache} ||= {};
	return $self->{_ppi_cache}{$code} //= PPI::Document->new(\$code);
}

# --------------------------------------------------
# _extract_pvs_schema
#
# Purpose:    Detect and extract a parameter schema
#             from a Params::Validate::Strict
#             validate_strict() call in the method

lib/App/Test/Generator/SchemaExtractor.pm  view on Meta::CPAN

	if(exists $output->{type}) {
		if(!$valid_types{$output->{type}}) {
			$self->_log("  WARNING Output value type is unknown: '$output->{type}', setting to string");
			$output->{type} = 'string';
		}
	}
}

# --------------------------------------------------
# _parse_constraints
#
# Purpose:    Parse a constraint string extracted
#             from POD documentation and populate
#             min, max, or other constraint fields
#             in a parameter hashref.
#
# Entry:      $param      - hashref for the parameter
#                           being annotated (modified
#                           in place).
#             $constraint - the constraint string,
#                           e.g. '3-50', 'positive',
#                           '>= 0', 'min 3'.
#
# Exit:       Returns nothing. Modifies $param in
#             place by setting min and/or max keys.
#
# Side effects: Logs min/max values to stdout when
#               verbose is set.
# --------------------------------------------------
sub _parse_constraints {
	my ($self, $param, $constraint) = @_;

	# Range: "3-50" or "1-100 chars"
	if ($constraint =~ /(\d+)\s*-\s*(\d+)/) {
		$param->{min} = $1;
		$param->{max} = $2;
	}
	elsif ($constraint =~ /(\d+)\s*\.\.\s*(\d+)/) {
		# Range: 0..19
		$param->{min} = $1;
		$param->{max} = $2;
	}
	# Minimum: "min 3" or "at least 5"
	elsif ($constraint =~ /(?:min|minimum|at least)\s*(\d+)/i) {
		$param->{min} = $1;
	}
	# Maximum: "max 50" or "up to 100"
	elsif ($constraint =~ /(?:max|maximum|up to)\s*(\d+)/i) {
		$param->{max} = $1;
	}
	# Positive
	elsif ($constraint =~ /positive/i) {
		$param->{min} = 1 if $param->{type} && $param->{type} eq 'integer';
		$param->{min} = 0.01 if $param->{type} && $param->{type} eq 'number';
	}
	# Non-negative
	elsif ($constraint =~ /non-negative/i) {
		$param->{min} = 0;
	} elsif($constraint =~ /(.+)?\s(.+)/) {
		my ($op, $val) = ($1, $2);
		if(looks_like_number($val)) {
			if ($op eq '<') {
				$param->{max} = $val - 1;
			} elsif ($op eq '<=') {
				$param->{max} = $val;
			} elsif ($op eq '>') {
				$param->{min} = $val + 1;
			} elsif ($op eq '>=') {
				$param->{min} = $val;
			}
		}
	}

	if(defined($param->{max})) {
		$self->_log("  Set max to $param->{max}");
	}
	if(defined($param->{min})) {
		$self->_log("  Set min to $param->{min}");
	}
}

# --------------------------------------------------
# _analyze_code
#
# Purpose:    Analyse a method's source code using
#             pattern matching to infer parameter
#             names, types, constraints, defaults,
#             and optionality. Orchestrates all
#             per-parameter code analysis helpers.
#
# Entry:      $code   - method body source string.
#             $method - method hashref (used for
#                       constructor-specific logic
#                       when extracting parameters
#                       from @_ patterns).
#
# Exit:       Returns a hashref of parameter name
#             to parameter spec hashref, with as
#             much type and constraint information
#             as could be inferred from the code.
#
# Side effects: Logs progress and warnings to stdout
#               when verbose is set.
#
# Notes:      Analysis is capped at max_parameters
#             to prevent runaway processing on
#             pathological methods. Falls back to
#             classic @_ extraction if signature
#             extraction found no parameters.
# --------------------------------------------------
sub _analyze_code {
	my ($self, $code, $method) = @_;

	my %params;

	# Safety check - limit parameter analysis to prevent runaway processing
	my $param_count = 0;

	# Extract parameter names from various signature styles
	$self->_extract_parameters_from_signature(\%params, $code);

lib/App/Test/Generator/SchemaExtractor.pm  view on Meta::CPAN

#             operators, and path manipulation
#             patterns involving the parameter.
#
# Entry:      $p     - parameter hashref (modified
#                      in place).
#             $param - parameter name string.
#             $code  - method body source string.
#
# Exit:       Returns nothing. Modifies $p in place,
#             setting type, isa, and semantic keys.
#             Returns immediately on first match.
#
# Side effects: Logs detections to stdout when
#               verbose is set.
# --------------------------------------------------
sub _detect_filehandle_type {
	my ($self, $p, $param, $code) = @_;

	return unless defined $param && $param =~ /^\w+$/;

	# File handle operations
	if ($code =~ /(?:open|close|read|print|say|sysread|syswrite)\s*\(?\s*\$$param/) {
		$p->{type} = 'object';
		$p->{isa} = 'IO::Handle';
		$p->{semantic} = 'filehandle';
		$self->_log("  ADVANCED: $param is a file handle");
		return;
	}

	# Filehandle-specific operations
	if ($code =~ /\$$param\s*->\s*(readline|getline|print|say|close|flush|autoflush)/) {
		$p->{type} = 'object';
		$p->{isa} = 'IO::Handle';
		$p->{semantic} = 'filehandle';
		$self->_log("  ADVANCED: $param uses filehandle methods");
		return;
	}

	# File test operators
	if ($code =~ /(?:-[frwxoOeszlpSbctugkTBMAC])\s+\$$param/) {
		$p->{type} = 'string';
		$p->{semantic} = 'filepath';
		$self->_log("  ADVANCED: $param is tested as file path");
		return;
	}

	# File::Spec operations or path manipulation
	if ($code =~ /File::(?:Spec|Basename)::\w+\s*\(\s*\$$param/ ||
	    $code =~ /(?:basename|dirname|fileparse)\s*\(\s*\$$param/) {
		$p->{type} = 'string';
		$p->{semantic} = 'filepath';
		$self->_log("  ADVANCED: $param manipulated as file path");
		return;
	}

	# Path validation patterns
	# Only match a literal path assigned or defaulted to this variable
	if(defined $p->{_default} && $p->{_default} =~ m{^([A-Za-z]:\\|/|\./|\.\./)}) {
		$p->{type} = 'string';
		$p->{semantic} = 'filepath';
		$self->_log("  ADVANCED: $param default looks like a path");
		return;
	}

	# IO::File detection
	if ($code =~ /\$$param\s*->\s*isa\s*\(\s*['"]IO::File['"]\s*\)/ ||
	    $code =~ /IO::File\s*->\s*new\s*\(\s*\$$param/) {
		$p->{type} = 'object';
		$p->{isa} = 'IO::File';
		$p->{semantic} = 'filehandle';
		$self->_log("  ADVANCED: $param is IO::File object");
		return;
	}
}

# --------------------------------------------------
# _detect_coderef_type
#
# Purpose:    Detect coderef and callback parameters
#             by analysing ref() checks, invocation
#             patterns, and parameter naming
#             conventions.
#
# Entry:      $p     - parameter hashref (modified
#                      in place).
#             $param - parameter name string.
#             $code  - method body source string.
#
# Exit:       Returns nothing. Modifies $p in place,
#             setting type and semantic keys.
#             Returns immediately on first match.
#
# Side effects: Logs detections to stdout when
#               verbose is set.
# --------------------------------------------------
sub _detect_coderef_type {
	my ($self, $p, $param, $code) = @_;

	return unless defined $param && $param =~ /^\w+$/;

	# ref() check for CODE
	if ($code =~ /ref\s*\(\s*\$$param\s*\)\s*eq\s*['"]CODE['"]/i) {
		$p->{type} = 'coderef';
		$p->{semantic} = 'callback';
		$self->_log("  ADVANCED: $param is coderef (ref check)");
		return;
	}

	# Invocation as coderef - note the escaped @ in \@_
	if ($code =~ /\$$param\s*->\s*\(/ ||
	    $code =~ /\$$param\s*->\s*\(\s*\@_\s*\)/ ||
	    $code =~ /&\s*\{\s*\$$param\s*\}/) {
		$p->{type} = 'coderef';
		$p->{semantic} = 'callback';
		$self->_log("  ADVANCED: $param invoked as coderef");
		return;
	}

	# Parameter name suggests callback
	if ($param =~ /^(?:callback|cb|handler|sub|code|fn|func|on_\w+)$/i) {
		$p->{type} = 'coderef';

lib/App/Test/Generator/SchemaExtractor.pm  view on Meta::CPAN

		# Mark required
		$$p->{optional} = 0;

		if ($op eq '<=') {
			$$p->{min} = $num + 1;
		} elsif ($op eq '<') {
			$$p->{min} = $num;
		} elsif ($op eq '>=') {
			$$p->{max} = $num - 1;
		} elsif ($op eq '>') {
			$$p->{max} = $num;
		}

		$self->_log("  ERROR: $param normalized constraint from '$op $num'");
	}
}

# --------------------------------------------------
# _extract_parameters_from_signature
#
# Purpose:    Extract parameter names and positions
#             from a method's signature, trying
#             modern Perl subroutine signatures
#             first and falling back to traditional
#             @_ extraction styles.
#
# Entry:      $params - hashref to populate with
#                       parameter specs (modified
#                       in place).
#             $code   - method body source string.
#
# Exit:       Returns nothing. Populates $params.
#
# Side effects: Logs detections to stdout when
#               verbose is set.
#
# Notes:      Three traditional styles are
#             supported: (1) my ($self, ...) = @_,
#             (2) my $self = shift; my $x = shift,
#             (3) my $x = $_[N]. $self and $class
#             are always excluded from the returned
#             parameters.
# --------------------------------------------------
sub _extract_parameters_from_signature {
	my ($self, $params, $code) = @_;

	# Modern Style: Subroutine signatures with attributes
	# Handle multi-line signatures
	# sub foo :attr1 :attr2(val) (
	#     $self,
	#     $x :Type,
	#     $y = default
	# ) { }

	# Try to match signature after attributes
	# Look for the parameter list - it's the last (...) before the opening brace
	# that contains sigils ($, %, @)
	if ($code =~ /sub\s+\w+\s*(?::\w+(?:\([^)]*\))?\s*)*\(((?:[^()]|\([^)]*\))*)\)\s*\{/s) {
		my $potential_sig = $1;

		# Check if this looks like parameters (has sigils)
		if ($potential_sig =~ /[\$\%\@]/) {
			$self->_log("  SIG: Found modern signature: ($potential_sig)");
			$self->_parse_modern_signature($params, $potential_sig);
			return;
		}
	}

	# Traditional Style 1: my ($self, $arg1, $arg2) = @_;
	if ($code =~ /my\s*\(\s*([^)]+)\)\s*=\s*\@_/s) {
		my $sig = $1;
		my $pos = 0;

		while ($sig =~ /\$(\w+)/g) {
			my $name = $1;

			next if $name =~ /^(self|class)$/i;

			$params->{$name} //= {
				_source => 'code',
				optional => 1,
			};

			$params->{$name}{position} = $pos unless exists $params->{$name}{position};

			$pos++;
		}
		return;
	} elsif ($code =~ /my\s+\$self\s*=\s*shift/) {
		# Traditional Style 2: my $self = shift; my $arg1 = shift;
		my @shifts;
		while ($code =~ /my\s+\$(\w+)\s*=\s*shift/g) {
			push @shifts, $1;
		}
		shift @shifts if @shifts && $shifts[0] =~ /^(self|class)$/i;
		my $pos = 0;
		foreach my $param (@shifts) {
			$params->{$param} ||= { _source => 'code', optional => 1, position => $pos++ };
		}
		return;
	}

	# Traditional Style 3: Function parameters (no $self)
	if ($code =~ /my\s*\(\s*([^)]+)\)\s*=\s*\@_/s) {
		my $sig = $1;
		my @param_names = $sig =~ /\$(\w+)/g;
		my $pos = 0;
		foreach my $param (@param_names) {
			next if $param =~ /^(self|class)$/i;
			$params->{$param} ||= { _source => 'code', optional => 1, position => $pos++ };
		}
	}

	# De-duplicate
	my %seen;
	foreach my $param (keys %$params) {
		if ($seen{$param}++) {
			$self->_log("  WARNING: Duplicate parameter '$param' found");
		}
	}
}

lib/App/Test/Generator/SchemaExtractor.pm  view on Meta::CPAN

}

# --------------------------------------------------
# _analyze_parameter_constraints
#
# Purpose:    Infer min, max, and regex match
#             constraints for a single parameter
#             from length checks, numeric
#             comparisons, and regex match
#             patterns in the method body.
#
# Entry:      $p_ref - reference to the parameter
#                      hashref (modified in place).
#             $param - parameter name string.
#             $code  - method body source string.
#
# Exit:       Returns nothing. Modifies the
#             referenced parameter hashref.
#
# Side effects: Logs detections to stdout when
#               verbose is set.
#
# Notes:      Numeric comparisons that appear
#             inside die/croak guard conditions
#             are excluded to avoid inferring
#             invalid-input ranges as valid
#             constraints.
# --------------------------------------------------
sub _analyze_parameter_constraints {
	my ($self, $p_ref, $param, $code) = @_;
	my $p = $$p_ref;

	# Do not treat comparisons inside die/croak/confess as valid constraints
	my $guarded = 0;
		if ($code =~ /(die|croak|confess)\b[^{;]*\bif\b[^{;]*\$$param\b/s) {
		$guarded = 1;
	}

	# Length checks for strings
	if ($code =~ /length\s*\(\s*\$$param\s*\)\s*([<>]=?)\s*(\d+)/) {
		my ($op, $val) = ($1, $2);
		$p->{type} ||= 'string';
		if ($op eq '<') {
			$p->{max} = $val - 1;
		} elsif ($op eq '<=') {
			$p->{max} = $val;
		} elsif ($op eq '>') {
			$p->{min} = $val + 1;
		} elsif ($op eq '>=') {
			$p->{min} = $val;
		}
		$self->_log("  CODE: $param length constraint $op $val");
	}

	# Numeric range checks (only if NOT part of error guard)
	if (
		!$guarded
		&& $code =~ /\$$param\s*([<>]=?)\s*([+-]?(?:\d+\.?\d*|\.\d+))/
	) {
		my ($op, $val) = ($1, $2);
		$p->{type} ||= looks_like_number($val) ? 'number' : 'integer';

		if ($op eq '<' || $op eq '<=') {
			# Only set max if it tightens the range
			my $max = ($op eq '<') ? $val - 1 : $val;
			$p->{max} = $max if !defined($p->{max}) || $max < $p->{max};
		} elsif ($op eq '>' || $op eq '>=') {
			my $min = ($op eq '>') ? $val + 1 : $val;
			$p->{min} = $min if !defined($p->{min}) || $min > $p->{min};
		}
	}

	# Regex pattern matching with better capture
	if ($code =~ /\$$param\s*=~\s*((?:qr?\/[^\/]+\/|\$[\w:]+|\$\{\w+\}))/) {
		my $pattern = $1;
		$p->{type} ||= 'string';

		# Clean up the pattern if it's a straightforward regex
		if ($pattern =~ /^qr?\/([^\/]+)\/$/) {
			$p->{matches} = "/$1/";
		} else {
			$p->{matches} = $pattern;
		}
		$self->_log("  CODE: $param matches pattern: $p->{matches}");
	}
}

# --------------------------------------------------
# _analyze_parameter_validation
#
# Purpose:    Determine optionality and extract
#             default values for a single parameter
#             by analysing explicit required checks
#             (die/croak unless defined) and default
#             assignment patterns in the method body.
#
# Entry:      $p_ref - reference to the parameter
#                      hashref (modified in place).
#             $param - parameter name string.
#             $code  - method body source string.
#
# Exit:       Returns nothing. Modifies the
#             referenced parameter hashref.
#
# Side effects: Logs detections to stdout when
#               verbose is set.
#
# Notes:      Explicit required checks take highest
#             priority and override any default
#             value detected earlier.
# --------------------------------------------------
sub _analyze_parameter_validation {
	my ($self, $p_ref, $param, $code) = @_;
	my $p = $$p_ref;

	# Required/optional checks
	my $is_required = 0;

	# Die/croak if not defined
	if ($code =~ /(?:die|croak|confess)\s+[^;]*unless\s+(?:defined\s+)?\$$param/s) {
		$is_required = 1;
	}

	# Extract default values with the new method
	my $default_value = $self->_extract_default_value($param, $code);
	if (defined $default_value && !exists $p->{_default}) {
		$p->{optional} = 1;
		$p->{_default} = $default_value;

		# Try to infer type from default value if not already set
		unless ($p->{type}) {
			if (looks_like_number($default_value)) {
				$p->{type} = $default_value =~ /\./ ? 'number' : 'integer';
			} elsif (ref($default_value) eq 'ARRAY') {
				$p->{type} = 'arrayref';
			} elsif (ref($default_value) eq 'HASH') {
				$p->{type} = 'hashref';
			} elsif ($default_value eq 'undef') {
				$p->{type} = 'scalar';	# undef can be any scalar
			} elsif (defined $default_value && !ref($default_value)) {
				$p->{type} = 'string';
			}
		}

		$self->_log("  CODE: $param has default value: " . (ref($default_value) ? ref($default_value) . ' ref' : $default_value));
	}

	# Also check for simple default assignment without condition
	# Pattern: $param = 'value';
	if (!$default_value && !exists $p->{_default} && $code =~ /\$$param\s*=\s*([^;{}]+?)(?:\s*[;}])/s) {
		my $assignment = $1;
		# Make sure it's not part of a larger expression
		if ($assignment !~ /\$$param/ && $assignment !~ /^shift/) {
			my $possible_default = $assignment;
			$possible_default =~ s/\s*;\s*$//;
			$possible_default = $self->_clean_default_value($possible_default);
			if (defined $possible_default) {
				$p->{_default} = $possible_default;
				$p->{optional} = 1;
				$self->_log("  CODE: $param has unconditional default: $possible_default");
			}
		}
	}

	# Explicit required check overrides default detection
	if ($is_required) {
		$p->{optional} = 0;
		delete $p->{_default} if exists $p->{_default};
		$self->_log("  CODE: $param is required (validation check)");
	}
}

# --------------------------------------------------
# _merge_parameter_analyses
#
# Purpose:    Merge parameter information from POD,
#             code, and signature analysis into a
#             single authoritative parameter hashref
#             for each parameter.
#
# Entry:      $pod - hashref of parameters from POD
#                    analysis.
#             $code - hashref of parameters from
#                     code analysis.
#             $sig  - hashref of parameters from
#                     signature analysis (optional,
#                     defaults to empty hashref).
#
# Exit:       Returns a merged hashref of parameter
#             name to spec hashref. Each spec has
#             all available information combined,
#             with POD taking highest priority,



( run in 0.962 second using v1.01-cache-2.11-cpan-39bf76dae61 )