Algorithm-BaumWelch

 view release on metacpan or  search on metacpan

lib/Algorithm/BaumWelch.pm  view on Meta::CPAN


    # The observation series see http://www.cs.jhu.edu/~jason/.
    my $obs_series = [qw/ obs2 obs3 obs3 obs2 obs3 obs2 obs3 obs2 obs2 
                          obs3 obs1 obs3 obs3 obs1 obs1 obs1 obs2 obs1 
                          obs1 obs1 obs3 obs1 obs2 obs1 obs1 obs1 obs2 
                          obs3 obs3 obs2 obs3 obs2 obs2
                     /];

    # The emission matrix - each nested array corresponds to the probabilities of a single observation type.
    my $emis = { 
        obs1 =>  [0.3, 0.3], 
        obs2 =>  [0.3, 0.4], 
        obs3 =>  [0.4, 0.3], 
               };

    # The transition matrixi - each row and column correspond to a particular state e.g. P(state1_x|state1_x-1) = 0.9...
    my $trans = [ 
                    [0.9, 0.1], 
                    [0.1, 0.9], 
                ];

    # The probabilities of each state at the start of the series.
    my $start = [0.5, 0.5];

    # Create an Algorithm::BaumWelch object.
    my $ba = Algorithm::BaumWelch->new;

    # Feed in the observation series.
    $ba->feed_obs($obs_series);

    # Feed in the transition and emission matrices and the starting probabilities.
    $ba->feed_values($trans, $emis, $start);

    # Alternatively you can randomly initialise the values - pass it the number of hidden states - 
    # i.e. to determine the parameters we need to make a first guess).
    # $ba->random_initialise(2);
     
    # Perform the algorithm.
    $ba->baum_welch;

    # Use results to pass data. 
    # In VOID-context prints formated results to STDOUT. 
    # In LIST-context returns references to the predicted transition & emission matrices and the starting parameters.
    $ba->results;

=cut

=head1 DESCRIPTION

The Baum-Welch algorithm is used to compute the parameters (transition and emission probabilities) of an Hidden Markov
Model (HMM). The algorithm calculates the forward and backwards probabilities for each HMM state in a series and then re-estimates the parameters of
the model. 

=cut

use version; our $VERSION = qv('0.0.2');

#r/ matrices de BW sao 1xN_states matrices - quer dizer quasi arrays - entao nao usa matrices reais. arrays são bastante
sub new {
    my $class = shift;
    my $self = [undef, undef, []]; bless $self, $class;
    return $self;
}

sub feed_obs {
    my ($self, $series) = @_;
    my %uniq;
    @uniq{@{$series}} = 1;
    my @obs = (keys %uniq);
    @obs = sort { $a cmp $b } @obs;
    $self->[0][0] = $series;
    $self->[0][1] = [@obs];
    $self->[0][2] = scalar @obs;
    return;
}

sub feed_values {
    croak qq{\nThis method expects 3 arguments.} if @_ != 4;
    my ($self, $trans, $emis, $start) = @_;
    croak qq{\nThis method expects 3 arguments.} if (ref $trans ne q{ARRAY} || ref $emis ne q{HASH} || ref $start ne q{ARRAY});
    my $obs_tipos = $self->[0][1];
    my $obs_numero = $self->[0][2];
    my $t_length = &_check_trans($trans);
    &_check_emis($emis, $obs_tipos, $obs_numero, $t_length);
    &_check_start($start, $t_length);
    $self->[1][0] = $trans;
    $self->[1][1] = $emis;
    $self->[1][2] = $start;
    my @stop; # 0.1/1 nao faz diferenca e para|comeca (stop|start) sempre iguala = 0
    for (0..$#{$trans}) { push @stop, 1 };
    $self->[1][3] = [@stop];
    return;
}

sub _check_start {
    my ($start, $t_length) = @_;
    croak qq{\nThere must be an initial probablity for each state in the start ARRAY.} if scalar @{$start} != $t_length;
    for (@{$start}) { croak qq{\nThe start ARRAY values must be numeric.} if !(/^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/) };
    my $sum =0;
    for (@{$start}) { $sum += $_ }
    croak qq{\nThe starting probabilities must sum to 1.} if ($sum <= 0.95 || $sum >= 1.05);
    return;
}

sub _check_emis {
    my ($emis, $obs_tipos, $obs_numero, $t_length) = @_;
    my @emis_keys = (keys %{$emis});
    @emis_keys = sort {$a cmp $b} @emis_keys;
    croak qq{\nThere must be an entry in the emission matrix for each type of observation in the observation series.} if $obs_numero != scalar @emis_keys;
    for (0..$#emis_keys) { croak qq{\nThe observations in the emission matrix do not match those in the observation series.} if $emis_keys[$_] ne $obs_tipos->[$_]; }
    for (values %{$emis}) { 
        croak qq{\nThere must be a probability value for each state in the emission matrix.} if scalar @{$_} != $t_length;
        for my $cell (@{$_}) { croak qq{\nThe emission matrix values must be numeric.} if $cell !~ /^[+-]?\ *(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?$/; }
    }
    for my $i (0..$t_length-1) { # só fazendo 2-estado agora
        my $sum = 0;
        for my $o (@{$obs_tipos}) { $sum += $emis->{$o}[$i] }
        croak qq{\nThe emission matrix column must sum to 1.} if ($sum <= 0.95 || $sum >= 1.05);
    }
    return;
}



( run in 1.537 second using v1.01-cache-2.11-cpan-140bd7fdf52 )