AI-NaiveBayes1

 view release on metacpan or  search on metacpan

Changes  view on Meta::CPAN

Revision history for Perl extension AI::NaiveBayes1.

2.012  Sat May 29 11:00:47 ADT 2021
 - Removing non-ASCII character in "Bohmer" last name to avoid testing
   issues, "Daniel B\"ohmer" name changed to "Daniel Bohmer" for
   documentation purposes

2.011  Fri May 28 07:55:50 ADT 2021
  - merged corrections from Daniel Bohmer

2.010  Tue May 25 18:30:47 ADT 2021
  - fixing bugs reported by perltesters on some platforms.
    It seems to be due to running tests in a parallel way, and
    that using the same name of the temporary file is the problem.

2.009  Sun May 23 09:23:06 ADT 2021
  - fixed bug with t/auxfunctions.pl not being located during
    testing (hopefully fixed because it requires a different
    environment to reproduce)
  - made Changes reverse chronological

2.008  Fri May 21 10:56:33 ADT 2021
  - GitHub release
  - documentation improvements

2.007  Wed Jan 29 05:58:37 AST 2020
  - documentation update
  - fixed bug: import_from_YAML and import_from_YAML_file now
    return bless-ed references (they did not return before)

2.006  Tue Mar 15 06:17:04 ADT 2011
  - reduced precision in \d{4}.\d+ to \d{4}.\d{10} avoid testing
    problems on different platforms

2.005  Tue Mar  8 06:35:43 AST 2011
  - reduced precision in comparison to avoid testing problems on
    different platforms (from 14 to 12 digits after decimal point)

2.004  Sat Mar  5 12:39:35 AST 2011
  - added missing files to MANIFEST (t/7-3.out and t/7-5.out)

2.003  Fri Mar  4 06:43:31 AST 2011
  - fixed more testing problems due to differences in precision

2.002  Wed Mar  2 06:29:06 AST 2011
  - fixed some testing problems due to differences in precision

2.001  Mon Feb 28 11:18:22 AST 2011
  - fixed some testing problems due to differences in precision

2.000  Tue Feb 22 09:58:04 AST 2011
  - fixing lexical sorting of version numbers

1.10  Tue Feb 22 09:31:12 AST 2011
  - fixed testing problems due to differences in precision
  - fixed podchecker warning (some space)
  - better test error reporting in 2.t
  - added t/pod.t, thanks to Michael Stevens

1.9   Tue Aug 31 09:27:51 ADT 2010
  - fixed testing problems due to differences in precision in t/2.t

1.8   Fri Aug 21 06:36:34 ADT 2009
  - fixed a pod documentation error

1.7   Thu Aug 20 14:20:15 ADT 2009
  - improvements in documentation
  - added method add_csv_file
  - added method drop_attributes
  - removed real_attr and added attribute_type field

1.6   Wed Aug 19 09:09:57 ADT 2009
  - improved an error message
  - fixed some testing problems due to whitespace
  - small improvement in generating documentation

1.5   Wed Jan 30 08:06:22 AST 2008
  - fixed testing problems due to differences in the lowest
    significant digit on different platforms

1.4   Fri Dec 14 11:42:16 AST 2007
  - added test skipping if YAML module is not available
  - removing deprecated "our" reserved word

1.3   Fri Dec  7 07:26:47 AST 2007
  - added reading from a table format (add_table)
  - fixed some warnings reported during testing
  - added option "with counts" to print_model
  - added smoothing option for attributes

1.2   Mon Mar 14 08:03:16 AST 2005
  - addition to documentation
  - fixing a minor bug and a warning

1.1   Tue Apr 20 08:07:33 ADT 2004
  - added several more test
  - implemented optional Gaussian distribution for numerical
    attributes

1.0   Thu Sep  4 08:01:19 ADT 2003
  - bug fix in testing (attributes and labels are sorted now in
    print_model)

0.03  Wed Sep  3 08:25:43 ADT 2003
  - bug fix
  - import and export to/from string and file (using YAML)

0.02  Fri May  9 15:53:53 ADT 2003
  - bug fix

0.01  Fri May  9 14:48:53 ADT 2003
  - original version;

META.json  view on Meta::CPAN

{
   "abstract" : "Naive Bayes Classification",
   "author" : [
      "Vlado Keselj https://web.cs.dal.ca/~vlado"
   ],
   "dynamic_config" : 1,
   "generated_by" : "ExtUtils::MakeMaker version 7.04, CPAN::Meta::Converter version 2.150001",
   "license" : [
      "perl_5"
   ],
   "meta-spec" : {
      "url" : "http://search.cpan.org/perldoc?CPAN::Meta::Spec",
      "version" : "2"
   },
   "name" : "AI-NaiveBayes1",
   "no_index" : {
      "directory" : [
         "t",
         "inc"
      ]
   },
   "prereqs" : {
      "build" : {
         "requires" : {
            "ExtUtils::MakeMaker" : "0"
         }
      },
      "configure" : {
         "requires" : {
            "ExtUtils::MakeMaker" : "0"
         }
      },
      "runtime" : {
         "requires" : {
            "YAML" : "0.0"
         }
      }
   },
   "release_status" : "stable",
   "version" : "2.012"
}

META.yml  view on Meta::CPAN

---
abstract: 'Naive Bayes Classification'
author:
  - 'Vlado Keselj https://web.cs.dal.ca/~vlado'
build_requires:
  ExtUtils::MakeMaker: '0'
configure_requires:
  ExtUtils::MakeMaker: '0'
dynamic_config: 1
generated_by: 'ExtUtils::MakeMaker version 7.04, CPAN::Meta::Converter version 2.150001'
license: perl
meta-spec:
  url: http://module-build.sourceforge.net/META-spec-v1.4.html
  version: '1.4'
name: AI-NaiveBayes1
no_index:
  directory:
    - t
    - inc
requires:
  YAML: '0.0'
version: '2.012'

Makefile.PL  view on Meta::CPAN

use ExtUtils::MakeMaker;
#<? read_starfish_conf() !>

my $module = 'AI::NaiveBayes1';
my $name   = 'NaiveBayes1.pm';
(my $dir = $module) =~ s/::/-/g;

WriteMakefile
( 'NAME'       => $module,
  'VERSION_FROM' => $name, # finds $VERSION
  'dist' => { COMPRESS=>"gzip",
	      SUFFIX=>"gz",
	      #PREOP=>('starfish README; '.
	      #        "cp -f README $dir-\$(VERSION); "
	      #       ),
	  },
  'clean' => {FILES => "tmp* testfiles/tmp1 AI *~ t/tmp1 t/*~ t/tmp6 t/tmp6-2"},
  'PREREQ_PM' => { YAML => '0.0' },
  'PL_FILES' => {},
  ($] >= 5.005 ?    ## Add these new keywords supported since 5.005
   (ABSTRACT_FROM => $name, # retrieve abstract from module
   # <? echo "   AUTHOR => 'Vlado Keselj $VladoURL')" !>#+
      AUTHOR => 'Vlado Keselj https://web.cs.dal.ca/~vlado')
   #-
   : ()
   ),
  ($ExtUtils::MakeMaker::VERSION ge '6.30_00'? 
   ('LICENSE'         => 'perl', ) : ()),
  );

open(M, ">>Makefile") or die;

if ( -f 'priv.make' ) { print M getfile('priv.make') }

close(M);

sub getfile($) {
    my $f = shift;
    local *F;
    open(F, "<$f") or die "getfile:cannot open $f:$!";
    my @r = <F>;
    close(F);
    return wantarray ? @r : join ('', @r);
}

NaiveBayes1.pm  view on Meta::CPAN

@EXPORT = qw(new);
use vars qw($Version);
$Version = $VERSION = '2.012';

use vars @EXPORT_OK;

# non-exported package globals go here
use vars qw();

sub new {
  my $package = shift;
  return bless {
                attributes => [ ],
		labels     => [ ],
		attvals    => {},
		real_stat  => {},
		numof_instances => 0,
		stat_labels => {},
		stat_attributes => {},
		smoothing => {},
		attribute_type => {},
	       }, $package;
}

sub set_real {
    my ($self, @attr) = @_;
    foreach my $a (@attr) { $self->{attribute_type}{$a} = 'real' }
}

sub import_from_YAML {
    my $package = shift;
    my $yaml = shift;
    my $self = YAML::Load($yaml);
    return bless $self, $package;
}

sub import_from_YAML_file {
    my $package = shift;
    my $yamlf = shift;
    my $self = YAML::LoadFile($yamlf);
    return bless $self, $package;
}

# assume that the last header count means counts
# after optionally removing counts, the last header is label
sub add_table {
    my $self = shift;
    my @atts = (); my $lbl=''; my $cnt = '';
    while (@_) {
	my $table = shift;
	if ($table =~ /^(.*)\n[ \t]*-+\n/) {
	    my $a = $1; $table = $';
	    $a =~ s/^\s+//; $a =~ s/\s+$//;
	    if ($a =~ /\s*\bcount\s*$/) {
		$a=$`; $cnt=1; } else { $cnt='' }
	    @atts = split(/\s+/, $a);
	    $lbl = pop @atts;
	}
	while ($table ne '') {
	    $table =~ /^(.*)\n?/ or die;
	    my $r=$1; $table = $';
	    $r =~ s/^\s+//; $r=~ s/\s+$//;
	    if ($r =~ /^-+$/) { next }
	    my @v = split(/\s+/, $r);
	    die "values (#=$#v): {@v}\natts (#=$#atts): @atts, lbl=$lbl,\n".
                 "count: $cnt\n" unless $#v-($cnt?2:1) == $#atts;
	    my %av=(); my @a = @atts;
	    while (@a) { $av{shift @a} = shift(@v) }
	    $self->add_instances(attributes=>\%av,
				 label=>"$lbl=$v[0]",
				 cases=>($cnt?$v[1]:1) );
	}
    }
} # end of add_table

# Simplified; not generally compatible.
# Assume that the last header is label.  The first row contains
# attribute names.
sub add_csv_file {
    my $self = shift; my $fn = shift; local *F;
    open(F,$fn) or die "Cannot open CSV file `$fn': $!";
    local $_ = <F>; my @atts = (); my $lbl=''; my $cnt = '';
    chomp; @atts = split(/\s*,\s*/, $_); $lbl = pop @atts;
    while (<F>) {
	chomp; my @v = split(/\s*,\s*/, $_);
	die "values (#=$#v): {@v}\natts (#=$#atts): @atts, lbl=$lbl,\n".
	    "count: $cnt\n" unless $#v-($cnt?2:1) == $#atts;
	my %av=(); my @a = @atts;
	while (@a) { $av{shift @a} = shift(@v) }
	$self->add_instances(attributes=>\%av,
			     label=>"$lbl=$v[0]",
			     cases=>($cnt?$v[1]:1) );
    }
    close(F);
} # end of add_csv_file

sub drop_attributes {
    my $self = shift;
    foreach my $a (@_) {
	my @tmp = grep { $a ne $_ } @{ $self->{attributes} };
	$self->{attributes} = \@tmp;
	delete($self->{attvals}{$a});
	delete($self->{stat_attributes}{$a});
	delete($self->{attribute_type}{$a});
	delete($self->{real_stat}{$a});
	delete($self->{smoothing}{$a});
    }
} # end of drop_attributes

sub add_instances {
  my ($self, %params) = @_;
  for ('attributes', 'label', 'cases') {
      die "Missing required '$_' parameter" unless exists $params{$_};
  }

  if (scalar(keys(%{ $self->{stat_attributes} })) == 0) {
      foreach my $a (keys(%{$params{attributes}})) {
	  $self->{stat_attributes}{$a} = {};
	  push @{ $self->{attributes} }, $a;
	  $self->{attvals}{$a} = [ ];
	  $self->{attribute_type}{$a} = 'nominal' unless defined($self->{attribute_type}{$a});
      }
  } else {
      foreach my $a (keys(%{$self->{stat_attributes}}))
      { die "attribute not given in instance: $a"
	    unless exists($params{attributes}{$a}) }
  }

  $self->{numof_instances} += $params{cases};

  push @{ $self->{labels} }, $params{label} unless
      exists $self->{stat_labels}->{$params{label}};

  $self->{stat_labels}{$params{label}} += $params{cases};

  foreach my $a (keys(%{$self->{stat_attributes}})) {
      if ( not exists($params{attributes}{$a}) )
      { die "attribute $a not given" }
      my $attval = $params{attributes}{$a};
      if (not exists($self->{stat_attributes}{$a}{$attval})) {
	  push @{ $self->{attvals}{$a} }, $attval;
	  $self->{stat_attributes}{$a}{$attval} = {};
      }
      $self->{stat_attributes}{$a}{$attval}{$params{label}} += $params{cases};
  }
}

sub add_instance {
    my ($self, %params) = @_; $params{cases} = 1;
    $self->add_instances(%params);
}

sub train {
    my $self = shift;
    my $m = $self->{model} = {};
    
    $m->{labelprob} = {};
    foreach my $label (keys(%{$self->{stat_labels}}))
    { $m->{labelprob}{$label} = $self->{stat_labels}{$label} /
                                $self->{numof_instances} } 

    $m->{condprob} = {};
    $m->{condprobe} = {};
    foreach my $att (keys(%{$self->{stat_attributes}})) {
        next if $self->{attribute_type}{$att} eq 'real';
	$m->{condprob}{$att} = {};
	$m->{condprobe}{$att} = {};
	foreach my $label (keys(%{$self->{stat_labels}})) {
	    my $total = 0; my @attvals = ();
	    foreach my $attval (keys(%{$self->{stat_attributes}{$att}})) {
		next unless
		    exists($self->{stat_attributes}{$att}{$attval}{$label}) and
		    $self->{stat_attributes}{$att}{$attval}{$label} > 0;
		push @attvals, $attval;
		$m->{condprob}{$att}{$attval} = {} unless
		    exists( $m->{condprob}{$att}{$attval} );
		$m->{condprob}{$att}{$attval}{$label} = 
		    $self->{stat_attributes}{$att}{$attval}{$label};
		$m->{condprobe}{$att}{$attval} = {} unless
		    exists( $m->{condprob}{$att}{$attval} );
		$m->{condprobe}{$att}{$attval}{$label} = 
		    $self->{stat_attributes}{$att}{$attval}{$label};
		$total += $m->{condprob}{$att}{$attval}{$label};
	    }
	    if (exists($self->{smoothing}{$att}) and
		$self->{smoothing}{$att} =~ /^unseen count=/) {
		my $uc = $'; $uc = 0.5 if $uc <= 0;
		if(! exists($m->{condprob}{$att}{'*'}) ) {
		    $m->{condprob}{$att}{'*'} = {};
		    $m->{condprobe}{$att}{'*'} = {};
		}
		$m->{condprob}{$att}{'*'}{$label} = $uc;
		$total += $uc;
		if (grep {$_ eq '*'} @attvals) { die }
		push @attvals, '*';
	    }
	    foreach my $attval (@attvals) {
		$m->{condprobe}{$att}{$attval}{$label} =
		    "(= $m->{condprob}{$att}{$attval}{$label} / $total)";
		$m->{condprob}{$att}{$attval}{$label} /= $total;
	    }
	}
    }

    # For real-valued attributes, we use Gaussian distribution
    # let us collect statistics
    foreach my $att (keys(%{$self->{stat_attributes}})) {
        next unless $self->{attribute_type}{$att} eq 'real';
	print STDERR "Smoothing ignored for real attribute $att!\n" if
	    defined($self->{smoothing}{att}) and $self->{smoothing}{att};
        $m->{real_stat}->{$att} = {};
        foreach my $attval (keys %{$self->{stat_attributes}{$att}}){
            foreach my $label (keys %{$self->{stat_attributes}{$att}{$attval}}){
                $m->{real_stat}{$att}{$label}{sum}
                += $attval * $self->{stat_attributes}{$att}{$attval}{$label};

                $m->{real_stat}{$att}{$label}{count}
                += $self->{stat_attributes}{$att}{$attval}{$label};
            }
            foreach my $label (keys %{$self->{stat_attributes}{$att}{$attval}}){
		next if
                !defined($m->{real_stat}{$att}{$label}{count}) ||
		$m->{real_stat}{$att}{$label}{count} == 0;

                $m->{real_stat}{$att}{$label}{mean} =
                    $m->{real_stat}{$att}{$label}{sum} /
                        $m->{real_stat}{$att}{$label}{count};
            }
        }

        # calculate stddev
        foreach my $attval (keys %{$self->{stat_attributes}{$att}}) {
            foreach my $label (keys %{$self->{stat_attributes}{$att}{$attval}}){
                $m->{real_stat}{$att}{$label}{stddev} +=
		    ($attval - $m->{real_stat}{$att}{$label}{mean})**2 *
		    $self->{stat_attributes}{$att}{$attval}{$label};
            }
        }
	foreach my $label (keys %{$m->{real_stat}{$att}}) {
	    $m->{real_stat}{$att}{$label}{stddev} =
		sqrt($m->{real_stat}{$att}{$label}{stddev} /
		     ($m->{real_stat}{$att}{$label}{count}-1)
		     );
	}
    }				# foreach real attribute
}				# end of sub train

sub predict {
  my ($self, %params) = @_;
  my $newattrs = $params{attributes} or die "Missing 'attributes' parameter for predict()";
  my $m = $self->{model};  # For convenience
  
  my %scores;
  my @labels = @{ $self->{labels} };
  $scores{$_} = $m->{labelprob}{$_} foreach (@labels);
  foreach my $att (keys(%{ $newattrs })) {
      if (!defined($self->{attribute_type}{$att})) { die "Unknown attribute: `$att'" }
      next if $self->{attribute_type}{$att} eq 'real';
      die unless exists($self->{stat_attributes}{$att});
      my $attval = $newattrs->{$att};
      die "Unknown value `$attval' for attribute `$att'."
      unless exists($self->{stat_attributes}{$att}{$attval}) or
	  exists($self->{smoothing}{$att});
      foreach my $label (@labels) {
	  if (exists($m->{condprob}{$att}{$attval}) and
	      exists($m->{condprob}{$att}{$attval}{$label}) and
	      $m->{condprob}{$att}{$attval}{$label} > 0 ) {
	      $scores{$label} *=
		  $m->{condprob}{$att}{$attval}{$label};
	  } elsif (exists($self->{smoothing}{$att})) {
	      $scores{$label} *=
                  $m->{condprob}{$att}{'*'}{$label};
	  } else { $scores{$label} = 0 }

      }
  }

  foreach my $att (keys %{$newattrs}){
      next unless $self->{attribute_type}{$att} eq 'real';
      my $sum=0; my %nscores;
      foreach my $label (@labels) {
	  die unless exists $m->{real_stat}{$att}{$label}{mean};
	  $nscores{$label} =
              0.398942280401433 / $m->{real_stat}{$att}{$label}{stddev}*
              exp( -0.5 *
                  ( ( $newattrs->{$att} -
                      $m->{real_stat}{$att}{$label}{mean})
                    / $m->{real_stat}{$att}{$label}{stddev}
                  ) ** 2
		 );
	  $sum += $nscores{$label};
      }
      if ($sum==0) { print STDERR "Ignoring all Gaussian probabilities: all=0!\n" }
      else {
	  foreach my $label (@labels) { $scores{$label} *= $nscores{$label} }
      }
  }

  my $sumPx = 0.0;
  $sumPx += $scores{$_} foreach (keys(%scores));
  $scores{$_} /= $sumPx foreach (keys(%scores));
  return \%scores;
}

sub print_model {
    my $self = shift;
    my $withcounts = '';
    if ($#_>-1 && $_[0] eq 'with counts')
    { shift @_; $withcounts = 1; }
    my $m = $self->{model};
    my @labels = $self->labels;
    my $r;

    # prepare table category P(category)
    my @lines;
    push @lines, 'category ', '-';
    push @lines, "$_ " foreach @labels;
    @lines = _append_lines(@lines);
    @lines = map { $_.='| ' } @lines;
    $lines[1] = substr($lines[1],0,length($lines[1])-2).'+-';
    $lines[0] .= "P(category) ";
    foreach my $i (2..$#lines) {
	my $label = $labels[$i-2];
	$lines[$i] .= $m->{labelprob}{$label} .' ';
	if ($withcounts) {
	    $lines[$i] .= "(= $self->{stat_labels}{$label} / ".
		"$self->{numof_instances} ) ";
	}
    }
    @lines = _append_lines(@lines);

    $r .= join("\n", @lines) . "\n". $lines[1]. "\n\n";

    # prepare conditional tables
    my @attributes = sort $self->attributes;
    foreach my $att (@attributes) {
	@lines = ( "category ", '-' );
	my @lines1 = ( "$att ", '-' );
	my @lines2 = ( "P( $att | category ) ", '-' );
	my @attvals = sort keys(%{ $m->{condprob}{$att} });
	foreach my $label (@labels) {
	    if ( $self->{attribute_type}{$att} ne 'real' ) {
		foreach my $attval (@attvals) {
		    next unless exists($m->{condprob}{$att}{$attval}{$label});
		    push @lines, "$label ";
		    push @lines1, "$attval ";

		    my $line = $m->{condprob}{$att}{$attval}{$label};
		    if ($withcounts)
		    { $line.= ' '.$m->{condprobe}{$att}{$attval}{$label} }
		    $line .= ' ';
		    push @lines2, $line;
		}
	    } else {
		push @lines, "$label ";
		push @lines1, "real ";
		push @lines2, "Gaussian(mean=".
		    $m->{real_stat}{$att}{$label}{mean}.",stddev=".
		    $m->{real_stat}{$att}{$label}{stddev}.") ";
	    }
	    push @lines, '-'; push @lines1, '-'; push @lines2, '-';
	}
	@lines = _append_lines(@lines);
	foreach my $i (0 .. $#lines)
	{ $lines[$i] .= ($lines[$i]=~/-$/?'+-':'| ') . $lines1[$i] }
	@lines = _append_lines(@lines);
	foreach my $i (0 .. $#lines)
	{ $lines[$i] .= ($lines[$i]=~/-$/?'+-':'| ') . $lines2[$i] }
	@lines = _append_lines(@lines);

	$r .= join("\n", @lines). "\n\n";
    }

    return $r;
}

sub _append_lines {
    my @l = @_;
    my $m = 0;
    foreach (@l) { $m = length($_) if length($_) > $m }
    @l = map 
    { while (length($_) < $m) { $_.=substr($_,length($_)-1) }; $_ }
    @l;
    return @l;
}

sub labels {
  my $self = shift;
  return @{ $self->{labels} };
}

sub attributes {
  my $self = shift;
  return keys %{ $self->{stat_attributes} };
}

sub export_to_YAML {
    my $self = shift;
    require YAML;
    return YAML::Dump($self);
}

sub export_to_YAML_file {
    my $self = shift;
    my $file = shift;
    require YAML;
    YAML::DumpFile($file, $self);
}

1;
__END__

=head1 NAME

AI::NaiveBayes1 - Naive Bayes Classification

=head1 SYNOPSIS

  use AI::NaiveBayes1;
  my $nb = AI::NaiveBayes1->new;
  $nb->add_table(
  "Html  Caps  Free  Spam  count
  -------------------------------
     Y     Y     Y     Y    42   
     Y     Y     Y     N    32   
     Y     Y     N     Y    17   
     Y     Y     N     N     7   
     Y     N     Y     Y    32   
     Y     N     Y     N    12   
     Y     N     N     Y    20   
     Y     N     N     N    16   
     N     Y     Y     Y    38   
     N     Y     Y     N    18   
     N     Y     N     Y    16   
     N     Y     N     N    16   
     N     N     Y     Y     2   
     N     N     Y     N     9   
     N     N     N     Y    11   
     N     N     N     N    91   
  -------------------------------
  ");
  $nb->train;
  print "Model:\n" . $nb->print_model;
  print "Model (with counts):\n" . $nb->print_model('with counts');

  $nb = AI::NaiveBayes1->new;
  $nb->add_instances(attributes=>{model=>'H',place=>'B'},
		     label=>'repairs=Y',cases=>30);
  $nb->add_instances(attributes=>{model=>'H',place=>'B'},
		     label=>'repairs=N',cases=>10);
  $nb->add_instances(attributes=>{model=>'H',place=>'N'},
		     label=>'repairs=Y',cases=>18);
  $nb->add_instances(attributes=>{model=>'H',place=>'N'},
		     label=>'repairs=N',cases=>16);
  $nb->add_instances(attributes=>{model=>'T',place=>'B'},
		     label=>'repairs=Y',cases=>22);
  $nb->add_instances(attributes=>{model=>'T',place=>'B'},
		     label=>'repairs=N',cases=>14);
  $nb->add_instances(attributes=>{model=>'T',place=>'N'},
		     label=>'repairs=Y',cases=> 6);
  $nb->add_instances(attributes=>{model=>'T',place=>'N'},
		     label=>'repairs=N',cases=>84);

  $nb->train;

  print "Model:\n" . $nb->print_model;
  
  # Find results for unseen instances
  my $result = $nb->predict
     (attributes => {model=>'T', place=>'N'});

  foreach my $k (keys(%{ $result })) {
      print "for label $k P = " . $result->{$k} . "\n";
  }

  # export the model into a string
  my $string = $nb->export_to_YAML();

  # create the same model from the string
  my $nb1 = AI::NaiveBayes1->import_from_YAML($string);

  # write the model to a file (shorter than model->string->file)
  $nb->export_to_YAML_file('t/tmp1');

  # read the model from a file (shorter than file->string->model)
  my $nb2 = AI::NaiveBayes1->import_from_YAML_file('t/tmp1');

See Examples for more examples.

=head1 DESCRIPTION

This module implements the classic "Naive Bayes" machine learning
algorithm.

=head2 Data Structure

NaiveBayes1.pm  view on Meta::CPAN


=item C<{stat_attributes}{$a}>

Statistics for an attribute: C<...{$value}{$label}> = count of
instances.

=item C<{smoothing}{$attribute}>

Attribute smoothing.  No smoothing if does not exist.  Implemented smoothing:

      - /^unseen count=/ followed by number, e.g., 0.5

=back

=head2 Attribute Smoothing

For an attribute A one can specify:

    $nb->{smoothing}{A} = 'unseen count=0.5';

to provide a count for unseen data.  The count is taken into
consideration in training and prediction, when any unseen attribute
values are observed.  Zero probabilities can be prevented in this way.
A count other than 0.5 can be provided, but if it is <=0 it will be
set to 0.5.  The method is similar to add-one smoothing.  A special
attribute value '*' is used for all unseen data. 

=head1 METHODS

NaiveBayes1.pm  view on Meta::CPAN

particular order), or the number of labels if called in a scalar
context.

=back

=head1 THEORY

Bayes' Theorem is a way of inverting a conditional probability. It
states:

                P(y|x) P(x)
      P(x|y) = -------------
                   P(y)

and so on...

This is a pretty standard algorithm explained in many machine learning
textbooks (e.g., "Data Mining" by Witten and Eibe).

The algorithm relies on estimating P(A|C), where A is an arbitrary
attribute, and C is the class attribute.  If A is not real-valued,
then this conditional probability is estimated using a table of all
possible values for A and C.

If A is real-valued, then the distribution P(A|C) is modeled as a
Gaussian (normal) distribution for each possible value of C=c,  Hence,
for each C=c we collect the mean value (m) and standard deviation (s)
for A during training.  During classification, P(A=a|C=c) is estimated
using Gaussian distribution, i.e., in the following way:

                    1               (a-m)^2
 P(A=a|C=c) = ------------ * exp( - ------- )
              sqrt(2*Pi)*s           2*s^2

this boils down to the following lines of code:

    $scores{$label} *=
    0.398942280401433 / $m->{real_stat}{$att}{$label}{stddev}*
      exp( -0.5 *
           ( ( $newattrs->{$att} -
               $m->{real_stat}{$att}{$label}{mean})
             / $m->{real_stat}{$att}{$label}{stddev}
           ) ** 2
	   );

i.e.,

  P(A=a|C=c) = 0.398942280401433 / s *
    exp( -0.5 * ( ( a-m ) / s ) ** 2 );


=head1 EXAMPLES

Example with a real-valued attribute modeled by a Gaussian
distribution (from Witten I. and Frank E. book "Data Mining" (the WEKA
book), page 86):

 # @relation weather
 # 
 # @attribute outlook {sunny, overcast, rainy}
 # @attribute temperature real
 # @attribute humidity real
 # @attribute windy {TRUE, FALSE}
 # @attribute play {yes, no}
 # 
 # @data
 # sunny,85,85,FALSE,no
 # sunny,80,90,TRUE,no
 # overcast,83,86,FALSE,yes
 # rainy,70,96,FALSE,yes
 # rainy,68,80,FALSE,yes
 # rainy,65,70,TRUE,no
 # overcast,64,65,TRUE,yes
 # sunny,72,95,FALSE,no
 # sunny,69,70,FALSE,yes
 # rainy,75,80,FALSE,yes
 # sunny,75,70,TRUE,yes
 # overcast,72,90,TRUE,yes
 # overcast,81,75,FALSE,yes
 # rainy,71,91,TRUE,no
 
 $nb->set_real('temperature', 'humidity');
 
 $nb->add_instance(attributes=>{outlook=>'sunny',temperature=>85,humidity=>85,windy=>'FALSE'},label=>'play=no');
 $nb->add_instance(attributes=>{outlook=>'sunny',temperature=>80,humidity=>90,windy=>'TRUE'},label=>'play=no');
 $nb->add_instance(attributes=>{outlook=>'overcast',temperature=>83,humidity=>86,windy=>'FALSE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'rainy',temperature=>70,humidity=>96,windy=>'FALSE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'rainy',temperature=>68,humidity=>80,windy=>'FALSE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'rainy',temperature=>65,humidity=>70,windy=>'TRUE'},label=>'play=no');
 $nb->add_instance(attributes=>{outlook=>'overcast',temperature=>64,humidity=>65,windy=>'TRUE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'sunny',temperature=>72,humidity=>95,windy=>'FALSE'},label=>'play=no');
 $nb->add_instance(attributes=>{outlook=>'sunny',temperature=>69,humidity=>70,windy=>'FALSE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'rainy',temperature=>75,humidity=>80,windy=>'FALSE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'sunny',temperature=>75,humidity=>70,windy=>'TRUE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'overcast',temperature=>72,humidity=>90,windy=>'TRUE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'overcast',temperature=>81,humidity=>75,windy=>'FALSE'},label=>'play=yes');
 $nb->add_instance(attributes=>{outlook=>'rainy',temperature=>71,humidity=>91,windy=>'TRUE'},label=>'play=no');
 
 $nb->train;
 
 my $printedmodel =  "Model:\n" . $nb->print_model;
 my $p = $nb->predict(attributes=>{outlook=>'sunny',temperature=>66,humidity=>90,windy=>'TRUE'});

 YAML::DumpFile('file', $p);
 die unless (abs($p->{'play=no'}  - 0.792) < 0.001);
 die unless(abs($p->{'play=yes'} - 0.208) < 0.001);

=head1 HISTORY

L<Algorithm::NaiveBayes> by Ken Williams was not what I needed so I
wrote this one.  L<Algorithm::NaiveBayes> is oriented towards text
categorization, it includes smoothing, and log probabilities.  This
module is a generic, basic Naive Bayes algorithm.

=head1 THANKS

README  view on Meta::CPAN

AI/NaiveBayes1 version 2.011
============================

This is an implementation of the basic Naive Bayes classification
algorithm.  Documentation is a part of the file NaiveBayes1.pm in the
pod format.  To convert it into manual page format, type something
like the following:

	  pod2html NaiveBayes1.pm

INSTALLATION

To install this module type the following:

   perl Makefile.PL
   make
   make test
   make install

DEPENDENCIES

The methods for importing to and exporting from YAML format require
module YAML.

COPYRIGHT AND LICENCE

Copyright 2003-21 Vlado Keselj https://web.cs.dal.ca/~vlado

t/1-2.out  view on Meta::CPAN

--- #YAML:1.0 !perl/AI::NaiveBayes1
attributes:
  - place
  - model
attvals:
  model:
    - H
    - T
  place:
    - B
    - N
labels:
  - repairs=Y
  - repairs=N
model:
  condprob:
    model:
      H:
        repairs=N: 0.209677419354839
        repairs=Y: 0.631578947368421
      T:
        repairs=N: 0.790322580645161
        repairs=Y: 0.368421052631579
    place:
      B:
        repairs=N: 0.193548387096774
        repairs=Y: 0.684210526315789
      N:
        repairs=N: 0.806451612903226
        repairs=Y: 0.315789473684211
  labelprob:
    repairs=N: 0.62
    repairs=Y: 0.38
  numerical_attr: {}
numof_instances: 200
stat_attributes:
  model:
    H:
      repairs=N: 26
      repairs=Y: 48
    T:
      repairs=N: 98
      repairs=Y: 28
  place:
    B:
      repairs=N: 24
      repairs=Y: 52
    N:
      repairs=N: 100
      repairs=Y: 24
stat_labels:
  repairs=N: 124
  repairs=Y: 76

t/2-2.out  view on Meta::CPAN

--- #YAML:1.0 !perl/AI::NaiveBayes1
attributes:
  - size1
  - morning
  - html
attvals:
  html:
    - N
    - Y
  morning:
    - Y
    - N
  size1:
    - S
    - L
labels:
  - spam=N
  - spam=Y
model:
  condprob:
    html:
      N:
        spam=N: 0.666666666666667
        spam=Y: 0.0588235294117647
      Y:
        spam=N: 0.333333333333333
        spam=Y: 0.941176470588235
    morning:
      N:
        spam=N: 0.333333333333333
        spam=Y: 0.647058823529412
      Y:
        spam=N: 0.666666666666667
        spam=Y: 0.352941176470588
    size1:
      L:
        spam=N: 0.333333333333333
        spam=Y: 0.588235294117647
      S:
        spam=N: 0.666666666666667
        spam=Y: 0.411764705882353
  labelprob:
    spam=N: 0.15
    spam=Y: 0.85
  numerical_attr: {}
numof_instances: 20
stat_attributes:
  html:
    N:
      spam=N: 2
      spam=Y: 1
    Y:
      spam=N: 1
      spam=Y: 16
  morning:
    N:
      spam=N: 1
      spam=Y: 11
    Y:
      spam=N: 2
      spam=Y: 6
  size1:
    L:
      spam=N: 1
      spam=Y: 10
    S:
      spam=N: 2
      spam=Y: 7
stat_labels:
  spam=N: 3
  spam=Y: 17

t/3-2.out  view on Meta::CPAN

--- #YAML:1.0 !perl/AI::NaiveBayes1
attributes:
  - outlook
  - windy
  - humidity
  - temperature
attvals:
  humidity:
    - high
    - normal
  outlook:
    - sunny
    - overcast
    - rainy
  temperature:
    - hot
    - mild
    - cool
  windy:
    - FALSE
    - TRUE
labels:
  - play=no
  - play=yes
model:
  condprob:
    humidity:
      high:
        play=no: 0.8
        play=yes: 0.333333333333333
      normal:
        play=no: 0.2
        play=yes: 0.666666666666667
    outlook:
      overcast:
        play=no: 0
        play=yes: 0.444444444444444
      rainy:
        play=no: 0.4
        play=yes: 0.333333333333333
      sunny:
        play=no: 0.6
        play=yes: 0.222222222222222
    temperature:
      cool:
        play=no: 0.2
        play=yes: 0.333333333333333
      hot:
        play=no: 0.4
        play=yes: 0.222222222222222
      mild:
        play=no: 0.4
        play=yes: 0.444444444444444
    windy:
      FALSE:
        play=no: 0.4
        play=yes: 0.666666666666667
      TRUE:
        play=no: 0.6
        play=yes: 0.333333333333333
  labelprob:
    play=no: 0.357142857142857
    play=yes: 0.642857142857143
  numerical_attr: {}
numof_instances: 14
stat_attributes:
  humidity:
    high:
      play=no: 4
      play=yes: 3
    normal:
      play=no: 1
      play=yes: 6
  outlook:
    overcast:
      play=yes: 4
    rainy:
      play=no: 2
      play=yes: 3
    sunny:
      play=no: 3
      play=yes: 2
  temperature:
    cool:
      play=no: 1
      play=yes: 3
    hot:
      play=no: 2
      play=yes: 2
    mild:
      play=no: 2
      play=yes: 4
  windy:
    FALSE:
      play=no: 2
      play=yes: 6
    TRUE:
      play=no: 3
      play=yes: 3
stat_labels:
  play=no: 5
  play=yes: 9

t/3.t  view on Meta::CPAN


#putfile('t/3-2.out', $nb->export_to_YAML());
#is($nb->export_to_YAML(), getfile('t/3-2.out'));

eval "require YAML;";
plan skip_all => "YAML module required for the remaining tests in 3.t" if $@;

$nb->export_to_YAML_file('t/tmp3');
my $nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp3');
&compare_by_line("Model:\n" . &shorterdecimals($nb1->print_model),
		 't/3-1.out', __FILE__, __LINE__);

my $tmp = $nb->export_to_YAML();
my $nb2 = AI::NaiveBayes1->import_from_YAML($tmp);
&compare_by_line("Model:\n" . &shorterdecimals($nb2->print_model),
		 't/3-1.out', __FILE__, __LINE__);

my $p = $nb->predict(attributes=>{outlook=>'sunny',temperature=>'cool',humidity=>'high',windy=>'TRUE'});

#putfile('t/3-3.out', YAML::Dump($p));
ok(abs($p->{'play=no'}  - 0.795) < 0.001);
ok(abs($p->{'play=yes'} - 0.205) < 0.001);

t/4-2.out  view on Meta::CPAN

--- #YAML:1.0 !perl/AI::NaiveBayes1
attributes:
  - outlook
  - windy
  - humidity
  - temperature
attvals:
  humidity:
    - 85
    - 90
    - 86
    - 96
    - 80
    - 70
    - 65
    - 95
    - 75
    - 91
  outlook:
    - sunny
    - overcast
    - rainy
  temperature:
    - 85
    - 80
    - 83
    - 70
    - 68
    - 65
    - 64
    - 72
    - 69
    - 75
    - 81
    - 71
  windy:
    - FALSE
    - TRUE
labels:
  - play=>no
  - play=>yes
model:
  condprob:
    outlook:
      overcast:
        play=>no: 0
        play=>yes: 0.444444444444444
      rainy:
        play=>no: 0.4
        play=>yes: 0.333333333333333
      sunny:
        play=>no: 0.6
        play=>yes: 0.222222222222222
    windy:
      FALSE:
        play=>no: 0.4
        play=>yes: 0.666666666666667
      TRUE:
        play=>no: 0.6
        play=>yes: 0.333333333333333
  labelprob:
    play=>no: 0.357142857142857
    play=>yes: 0.642857142857143
  real_attr: &1
    humidity: 1
    temperature: 1
  real_stat:
    humidity:
      play=>no:
        count: 5
        mean: 86.2
        stddev: 9.73139250056229
        sum: 431
      play=>yes:
        count: 9
        mean: 79.1111111111111
        stddev: 10.2157286138146
        sum: 712
    temperature:
      play=>no:
        count: 5
        mean: 74.6
        stddev: 7.89303490426845
        sum: 373
      play=>yes:
        count: 9
        mean: 73
        stddev: 6.16441400296898
        sum: 657
numof_instances: 14
real_attr: *1
real_stat: {}
stat_attributes:
  humidity:
    65:
      play=>yes: 1
    70:
      play=>no: 1
      play=>yes: 2
    75:
      play=>yes: 1
    80:
      play=>yes: 2
    85:
      play=>no: 1
    86:
      play=>yes: 1
    90:
      play=>no: 1
      play=>yes: 1
    91:
      play=>no: 1
    95:
      play=>no: 1
    96:
      play=>yes: 1
  outlook:
    overcast:
      play=>yes: 4
    rainy:
      play=>no: 2
      play=>yes: 3
    sunny:
      play=>no: 3
      play=>yes: 2
  temperature:
    64:
      play=>yes: 1
    65:
      play=>no: 1
    68:
      play=>yes: 1
    69:
      play=>yes: 1
    70:
      play=>yes: 1
    71:
      play=>no: 1
    72:
      play=>no: 1
      play=>yes: 1
    75:
      play=>yes: 2
    80:
      play=>no: 1
    81:
      play=>yes: 1
    83:
      play=>yes: 1
    85:
      play=>no: 1
  windy:
    FALSE:
      play=>no: 2
      play=>yes: 6
    TRUE:
      play=>no: 3
      play=>yes: 3
stat_labels:
  play=>no: 5
  play=>yes: 9

t/4.t  view on Meta::CPAN


#putfile('t/4-2.out', $nb->export_to_YAML());
#is($nb->export_to_YAML(), getfile('t/4-2.out'));

eval "require YAML;";
plan skip_all => "YAML module required for the remaining tests in 4.t" if $@;

$nb->export_to_YAML_file('t/tmp4');
my $nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp4');
&compare_by_line("Model:\n" . &shorterdecimals($nb1->print_model),
		 't/4-1.out', __FILE__, __LINE__);

my $tmp = $nb->export_to_YAML();
my $nb2 = AI::NaiveBayes1->import_from_YAML($tmp);
&compare_by_line("Model:\n" . &shorterdecimals($nb2->print_model),
		 't/4-1.out', __FILE__, __LINE__);

my $p = $nb->predict(attributes=>{outlook=>'sunny',temperature=>66,humidity=>90,windy=>'TRUE'});

#putfile('t/4-3.out', YAML::Dump($p));
ok(abs($p->{'play=no'}  - 0.792) < 0.001);
ok(abs($p->{'play=yes'} - 0.208) < 0.001);

t/5-2.out  view on Meta::CPAN

--- !!perl/hash:AI::NaiveBayes1
attribute_type:
  html: nominal
  morning: nominal
  size: real
attributes:
  - morning
  - html
  - size
attvals:
  html:
    - N
    - Y
  morning:
    - Y
    - N
  size:
    - 408
    - 3353
    - 4995
    - 1853
    - 732
    - 4017
    - 3190
    - 2345
    - 3569
    - 559
    - 1732
    - 2042
    - 3893
    - 3601
    - 2176
    - 877
    - 272
    - 2740
    - 514
    - 1321
labels:
  - spam=N
  - spam=Y
model:
  condprob:
    html:
      N:
        spam=N: 0.666666666666667
        spam=Y: 0.0588235294117647
      Y:
        spam=N: 0.333333333333333
        spam=Y: 0.941176470588235
    morning:
      N:
        spam=N: 0.333333333333333
        spam=Y: 0.647058823529412
      Y:
        spam=N: 0.666666666666667
        spam=Y: 0.352941176470588
    size: {}
  condprobe:
    html:
      N:
        spam=N: (= 2 / 3)
        spam=Y: (= 1 / 17)
      Y:
        spam=N: (= 1 / 3)
        spam=Y: (= 16 / 17)
    morning:
      N:
        spam=N: (= 1 / 3)
        spam=Y: (= 11 / 17)
      Y:
        spam=N: (= 2 / 3)
        spam=Y: (= 6 / 17)
  labelprob:
    spam=N: 0.15
    spam=Y: 0.85
  real_stat:
    size:
      spam=N:
        count: 3
        mean: 1443.33333333333
        stddev: 1521.30777074638
        sum: 4330
      spam=Y:
        count: 17
        mean: 2344.64705882353
        stddev: 1397.40106721265
        sum: 39859
numof_instances: 20
real_stat: {}
smoothing: {}
stat_attributes:
  html:
    N:
      spam=N: 2
      spam=Y: 1
    Y:
      spam=N: 1
      spam=Y: 16
  morning:
    N:
      spam=N: 1
      spam=Y: 11
    Y:
      spam=N: 2
      spam=Y: 6
  size:
    1321:
      spam=Y: 1
    1732:
      spam=Y: 1
    1853:
      spam=Y: 1
    2042:
      spam=Y: 1
    2176:
      spam=Y: 1
    2345:
      spam=Y: 1
    272:
      spam=Y: 1
    2740:
      spam=Y: 1
    3190:
      spam=N: 1
    3353:
      spam=Y: 1
    3569:
      spam=Y: 1
    3601:
      spam=Y: 1
    3893:
      spam=Y: 1
    4017:
      spam=Y: 1
    408:
      spam=N: 1
    4995:
      spam=Y: 1
    514:
      spam=Y: 1
    559:
      spam=Y: 1
    732:
      spam=N: 1
    877:
      spam=Y: 1
stat_labels:
  spam=N: 3
  spam=Y: 17

t/5.t  view on Meta::CPAN


#putfile('t/5-2.out', $nb->export_to_YAML());
#is($nb->export_to_YAML(), getfile('t/5-2.out'));

eval "require YAML;";
plan skip_all => "YAML module required for the remaining tests in 5.t" if $@;

$nb->export_to_YAML_file('t/tmp5');
my $nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp5');
&compare_by_line("Model:\n" . &shorterdecimals($nb1->print_model),
		 't/5-1.out', __FILE__, __LINE__);

my $tmp = $nb->export_to_YAML();
my $nb2 = AI::NaiveBayes1->import_from_YAML($tmp);
&compare_by_line("Model:\n" . &shorterdecimals($nb2->print_model),
		 't/5-1.out', __FILE__, __LINE__);

my $p = $nb->predict(attributes=>{morning=>'Y',html=>'Y',size=>4749});

#putfile('t/5-3.out', YAML::Dump($p));
ok(abs($p->{'spam=N'} - 0.043) < 0.001);
ok(abs($p->{'spam=Y'} - 0.957) < 0.001);

t/6.t  view on Meta::CPAN


#putfile('t/6-1.out', $printedmodel);
&compare_by_line($printedmodel, 't/6-1.out', __FILE__, __LINE__);

eval "require YAML;";
plan skip_all => "YAML module required for the remaining tests in 6.t" if $@;

$nb->export_to_YAML_file('t/tmp6');
my $nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp6');
&compare_by_line("Model:\n" . &shorterdecimals($nb1->print_model),
		 't/6-1.out', __FILE__, __LINE__);

my $p = $nb->predict(attributes=>{C=>'Y',F=>0});

#putfile('t/6-2.out', YAML::Dump($p));
ok(abs($p->{'S=N'} - 0.580) < 0.001);
ok(abs($p->{'S=Y'} - 0.420) < 0.001);

# Continual

$nb = AI::NaiveBayes1->new;

t/6.t  view on Meta::CPAN

$printedmodel =  "Model:\n" . $nb->print_model;
$printedmodel = &shorterdecimals($printedmodel);

#putfile('t/6-3.out', $printedmodel);
&compare_by_line($printedmodel, 't/6-3.out', __FILE__, __LINE__);

$nb->export_to_YAML_file('t/tmp6-2');
$nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp6-2');

&compare_by_line("Model:\n" . &shorterdecimals($nb1->print_model),
		 't/6-3.out', __FILE__, __LINE__);

$p = $nb->predict(attributes=>{C=>'Y',F=>1});

#putfile('t/6-4.out', YAML::Dump($p));
ok(abs($p->{'S=N'} - 0.339) < 0.001);
ok(abs($p->{'S=Y'} - 0.661) < 0.001);

t/7.t  view on Meta::CPAN


#putfile('t/7-1.out', $printedmodel);
&compare_by_line($printedmodel, 't/7-1.out', __FILE__, __LINE__);

eval "require YAML;";
plan skip_all => "YAML module required for the remaining tests in 7.t" if $@;

$nb->export_to_YAML_file('t/tmp7');
my $nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp7');
&compare_by_line("Model:\n" . &shorterdecimals($nb1->print_model),
		 't/7-1.out', __FILE__, __LINE__);

my $p = $nb->predict(attributes=>{C=>'Y',F=>0});

#putfile('t/7-2.out', YAML::Dump($p));
ok(abs($p->{'S=N'} - 0.580) < 0.001);
ok(abs($p->{'S=Y'} - 0.420) < 0.001);

# Continual

$nb = AI::NaiveBayes1->new;

$nb->add_instances(attributes=>{C=>'Y',F=>2},label=>'S=Y',cases=>30);
$nb->add_instances(attributes=>{C=>'Y',F=>2},label=>'S=N',cases=>10);
$nb->add_instances(attributes=>{C=>'Y',F=>0},label=>'S=Y',cases=>18);
$nb->add_instances(attributes=>{C=>'Y',F=>0},label=>'S=N',cases=>16);
$nb->add_instances(attributes=>{C=>'N',F=>2},label=>'S=Y',cases=>22);
$nb->add_instances(attributes=>{C=>'N',F=>2},label=>'S=N',cases=>14);
#$nb->add_instances(attributes=>{C=>'N',F=>0},label=>'S=Y',cases=> 6);
#$nb->add_instances(attributes=>{C=>'N',F=>0},label=>'S=N',cases=>84);
$nb->add_table("  C   F   S  count\n".
               "------------------\n".
	       "  N   0   Y    6  \n".
	       "  N   0   N   84  \n".
	       '');

$nb->set_real('F');
$nb->train;

$printedmodel =  &shorterdecimals("Model:\n" . $nb->print_model);

#putfile('t/7-3.out', $printedmodel);
&compare_by_line($printedmodel, 't/7-3.out', __FILE__, __LINE__);

$nb->export_to_YAML_file('t/tmp7-2');
$nb1 = AI::NaiveBayes1->import_from_YAML_file('t/tmp7-2');

&compare_by_line(&shorterdecimals("Model:\n" . $nb1->print_model),
		 't/7-3.out', __FILE__, __LINE__);

$p = $nb->predict(attributes=>{C=>'Y',F=>1});

#putfile('t/7-4.out', YAML::Dump($p));
ok(abs($p->{'S=N'} - 0.339) < 0.001);
ok(abs($p->{'S=Y'} - 0.661) < 0.001);

$nb = AI::NaiveBayes1->new;
$nb->add_table(
"Html  Caps  Free  Spam  count
-------------------------------
   Y     Y     Y     Y    42   
   Y     Y     Y     N    32   
   Y     Y     N     Y    17   
   Y     Y     N     N     7   
   Y     N     Y     Y    32   
   Y     N     Y     N    12   
   Y     N     N     Y    20   
   Y     N     N     N    16   
   N     Y     Y     Y    38   
   N     Y     Y     N    18   
   N     Y     N     Y    16   
   N     Y     N     N    16   
   N     N     Y     Y     2   
   N     N     Y     N     9   
   N     N     N     Y    11   
   N     N     N     N    91   
-------------------------------
");
$nb->train;
$printedmodel =  "Model:\n" . $nb->print_model;
$printedmodel = &shorterdecimals($printedmodel);
#putfile('t/7-5.out', $printedmodel);
&compare_by_line($printedmodel, 't/7-5.out', __FILE__, __LINE__);

$p = $nb->predict(attributes=>{Html=>'N',Caps=>'N',Free=>'Y'});
# putfile('t/7-2.out', YAML::Dump($p));

t/8.t  view on Meta::CPAN

use Test::More tests => 2;
use_ok("AI::NaiveBayes1");

use lib '.';
require 't/auxfunctions.pl';

my $nb;
$nb = AI::NaiveBayes1->new;
$nb->add_table(
"  Tp   Wp    W     Wf    T count
  -------------------------------
   PREV PREV  duck  ducks N   4
   N    duck  ducks END   V   4
   PREV PREV  duck  flies N   8
   N    duck  flies END   V   8
   PREV PREV  duck  fly   N   4
   N    duck  fly   fly   N   4
   N    fly   fly   END   V   4
   PREV PREV  duck  ducks V   4
   V    duck  ducks END   N   4
   PREV PREV  duck  END   V   1
   PREV PREV  ducks duck  N   1
   N    ducks duck  END   V   1
   PREV PREV  ducks fly   N   4
   N    ducks fly   END   V   4
   PREV PREV  flies fly   N   4
   N    flies fly   END   V   4
   PREV PREV  fly   flies N   1
   N    fly   flies END   V   1
   PREV PREV  fly   fly   N   1
   N    fly   fly   fly   N   1
   N    fly   fly   END   V   1
   PREV PREV  fly   duck  V   2
   V    fly   duck  END   N   2
  -------------------------------
");

$nb->{smoothing}{W}  = 'unseen count=0.5';
$nb->{smoothing}{Wp} = 'unseen count=0.5';
$nb->{smoothing}{Wf} = 'unseen count=0.5';
$nb->train;
my $printedmodel =  "Model:\n" . $nb->print_model('with counts');

putfile('t/8-1.out', $printedmodel);
is($printedmodel, getfile('t/8-1.out'));

t/auxfunctions.pl  view on Meta::CPAN

#!/usr/bin/perl

sub compare_by_line {
    my $got = shift;
    my $file = shift;
    my $testfile = @_ ? shift @_ : '';
    my $testline = @_ ? shift @_ : '';
    my $expected = getfile($file);
    if ($got eq $expected) { pass; return }
    my $flag = '';
    while ($got ne '' or $expected ne '') {
	my $a=$got;      if ($a =~ /\s*\n/) { $a = $`; $got = $'; }
	my $b=$expected; if ($b =~ /\s*\n/) { $b = $`; $expected = $'; }
	if ($a ne $b) {
	    if ($flag eq '')
	    { print STDERR "\n$testfile:$testline: Failed comparison with $file!\n"; $flag = 1; }
	    print STDERR "     Got: $a\n".
                 	 "Expected: $b\n";
	}
    }
    if ($flag eq '') { pass } else { fail }
}

sub shorterdecimals {
    local $_ = shift;
    s/(\d{4}\.\d{10})\d+/$1/g;
    s/(\.\d{12})\d+/$1/g;
    s/---+/---/g;
    return $_;
}

sub getfile($) {
    my $f = shift;
    local *F;
    open(F, "<$f") or die "getfile:cannot open $f:$!";
    my @r = <F>;
    close(F);
    return wantarray ? @r : join ('', @r);
}

sub putfile($@) {
    my $f = shift;
    local *F;
    open(F, ">$f") or die "putfile:cannot open $f:$!";
    print F '' unless @_;
    while (@_) { print F shift(@_) }
    close(F);
}

1;



( run in 0.449 second using v1.01-cache-2.11-cpan-4d50c553e7e )