Algorithm-DecisionTree

 view release on metacpan or  search on metacpan

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

}

sub minmax {
    my $arr = shift;
    my ($min, $max);
    foreach my $i (0..@{$arr}-1) {
        if ( (!defined $min) || ($arr->[$i] < $min) ) {
            $min = $arr->[$i];
        }
        if ( (!defined $max) || ($arr->[$i] > $max) ) {
            $max = $arr->[$i];
        }
    }
    return ($min, $max);
}

sub sample_index {
    my $arg = shift;
    $arg =~ /_(.+)$/;
    return $1;
}    

sub check_for_illegal_params {
    my @params = @_;
    my @legal_params = qw / training_datafile
                            max_depth_desired
                            dependent_variable_column
                            predictor_columns
                            mse_threshold
                            need_data_normalization
                            jacobian_choice
                            csv_cleanup_needed
                            debug1_r
                            debug2_r
                            debug3_r
                          /;
    my $found_match_flag;
    foreach my $param (@params) {
        foreach my $legal (@legal_params) {
            $found_match_flag = 0;
            if ($param eq $legal) {
                $found_match_flag = 1;
                last;
            }
        }
        last if $found_match_flag == 0;
    }
    return $found_match_flag;
}

sub cleanup_csv {
    my $line = shift;
    $line =~ tr/\/:?()[]{}'/          /;
#    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]+\"/g;
    my @double_quoted = substr($line, index($line,',')) =~ /\"[^\"]*\"/g;
    for (@double_quoted) {
        my $item = $_;
        $item = substr($item, 1, -1);
        $item =~ s/^\s+|,|\s+$//g;
        $item = join '_',  split /\s+/, $item;
        substr($line, index($line, $_), length($_)) = $item;
    }
    my @white_spaced = $line =~ /,(\s*[^,]+)(?=,|$)/g;
    for (@white_spaced) {
        my $item = $_;
        $item =~ s/\s+/_/g;
        $item =~ s/^\s*_|_\s*$//g;
        substr($line, index($line, $_), length($_)) = $item;
    }
    $line =~ s/,\s*(?=,|$)/,NA/g;
    return $line;
}

sub transpose {
    my $matrix = shift;
    my $num_rows = $matrix->rows();
    my $num_cols = $matrix->cols();
    my $transpose = Math::GSL::Matrix->new($num_cols, $num_rows);
    foreach my $i (0..$num_rows-1) {
        my @row = $matrix->row($i)->as_list;
        $transpose->set_col($i, \@row );
    }
    return $transpose;
}

sub vector_subtract {
    my $vec1 = shift;
    my $vec2 = shift;
    die "wrong data types for vector subtract calculation\n" if @$vec1 != @$vec2;
    my @result;
    foreach my $i (0..@$vec1-1){
        push @result, $vec1->[$i] - $vec2->[$i];
    }
    return @result;
}

sub vector_norm {
    my $vec = shift;       # assume it to be a column vector
    my ($rows, $cols) = $vec->dim;
    die "vector_norm() can only be called for a single column matrix" if $cols > 1;
    my @norm = (transpose($vec) * $vec)->as_list;
    return sqrt($norm[0]);
}

sub display_matrix {
    my $matrix = shift;
    my $nrows = $matrix->rows();
    my $ncols = $matrix->cols();
    print "\nDisplaying a matrix of size $nrows rows and $ncols columns:\n";
    foreach my $i (0..$nrows-1) {
        my $row = $matrix->row($i);
        my @row_as_list = $row->as_list;
        map { printf("%.4f ", $_) } @row_as_list;
        print "\n";
    }
    print "\n\n";
}

# Meant only for an array of strings (no nesting):
sub deep_copy_array {
    my $ref_in = shift;
    my $ref_out;
    return [] if scalar @$ref_in == 0;
    foreach my $i (0..@{$ref_in}-1) {
        $ref_out->[$i] = $ref_in->[$i];
    }
    return $ref_out;
}

lib/Algorithm/RegressionTree.pm  view on Meta::CPAN

    $self->{_rt}->{nodes_created} += 1;
    return $self->{_rt}->{nodes_created};
}

sub get_serial_num {
    my $self = shift;
    $self->{_serial_number};
}

# this returns the feature test at the current node
sub get_feature {                                  
    my $self = shift;                              
    return $self->{ _feature };                    
}

sub set_feature {
    my $self = shift;
    my $feature = shift;
    $self->{_feature} = $feature;
}

sub get_branch_features_and_values_or_thresholds {
    my $self = shift; 
    return $self->{_branch_features_and_values_or_thresholds};
}

sub get_children {       
    my $self = shift;                   
    return $self->{_linked_to};
}

sub add_child_link {         
    my ($self, $new_node, ) = @_;                            
    push @{$self->{_linked_to}}, $new_node;                  
}

sub delete_all_links {                  
    my $self = shift;                   
    $self->{_linked_to} = undef;        
}

sub display_node {
    my $self = shift; 
    my $feature_at_node = $self->get_feature() || " ";
    my $serial_num = $self->get_serial_num();
    my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
    print "\n\nNODE $serial_num" .
          ":\n   Branch features and values to this node: @branch_features_and_values_or_thresholds" .
          "\n   Best feature test at current node: $feature_at_node\n\n";
    $self->{_rt}->estimate_regression_coefficients($self->get_node_XMatrix(), $self->get_node_YVector(), 1);
}

sub display_regression_tree {
    my $self = shift;
    my $offset = shift;
    my $serial_num = $self->get_serial_num();
    if (@{$self->get_children()} > 0) {
        my $feature_at_node = $self->get_feature() || " ";
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        print "NODE $serial_num: $offset BRANCH TESTS TO NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
        print "$second_line_offset" . "Decision Feature: $feature_at_node\n\n";
        $offset .= "   ";
        foreach my $child (@{$self->get_children()}) {
            $child->display_regression_tree($offset);
        }
    } else {
        my @branch_features_and_values_or_thresholds = @{$self->get_branch_features_and_values_or_thresholds()};
        print "NODE $serial_num: $offset BRANCH TESTS TO LEAF NODE: @branch_features_and_values_or_thresholds\n";
        my $second_line_offset = "$offset" . " " x (8 + length("$serial_num"));
    }
}

1;



( run in 0.603 second using v1.01-cache-2.11-cpan-8f98c5d2c55 )