Algorithm-RandomPointGenerator

 view release on metacpan or  search on metacpan

lib/Algorithm/RandomPointGenerator.pm  view on Meta::CPAN

        _output_hist_bins_along_x  =>   $args{output_hist_bins_along_x} || 40,
        _command_line_mode         =>   $args{command_line_mode}        || 0,
        _x_delta             =>   undef,
        _y_delta             =>   undef,
        _input_histogram     =>   undef,
        _output_histogram    =>   undef,
        _bounding_box        =>   undef,
        _generated_points    =>   undef,
        _normalized_input_hist => undef,
        _sigmax_for_proposal_density => undef,
        _sigmay_for_proposal_density => undef,
        _bin_width_for_output_hist   => undef,
        _bin_height_for_output_hist  => undef,
    }, $class;
}

# The file that contains a 2D histogram of the desired probability distribution for
# the random points must either be in the CSV or the white-space-separated
# format. Since we are working with 2D densities, each line of this text file should
# show one row of the histogram.  The subroutine creates an array of arrays from the
# information read from the file, which each inner row holding one row of the
# histogram.
sub read_histogram_file_for_desired_density {
    my $self = shift;
    my $filename = $self->{_hist_file};
    open FILEIN, $filename
        or die "unable to open file: $!";
    my @hist;
    while (<FILEIN>) {
        next if /^#/;
        next if /^\s*$/;
        chomp;
        my @line;
        if ($filename =~ /.csv$/) {
            @line =  map {$_ =~ s/^\s*|\s*$//; $_} split /,/, $_;
        } else {
            @line = split;
        }
        push @hist, \@line;
    }
    close FILEIN;
    if ($self->{_y_axis_pos_direction} eq "up") {
        my @newhist;
        foreach my $i (0..@hist-1) {
            push @newhist, $hist[scalar(@hist)-$i-1];
        }
        $self->{_input_histogram} = \@newhist; 
    } elsif ($self->{_y_axis_pos_direction} eq "down") {
        $self->{_input_histogram} = \@hist; 
    } else {
        die "Something is wrong with your value for the construction option 'y_xis_pos_direction'";
    }
}


# This subroutine is for reading from a text file the horizontal and the vertical
# limits of the bounding box in which the randomly generated points must reside.  The
# file can either be in the CSV format or the white-space-separated format.  Apart
# from any comment lines that begin with the hash mark, this must must contain
# exactly two lines, the first indicated the x-axis points that define the horizontal
# span of the bounding box and the second indicating the y-axis points that define
# the vertical span of the same.
sub read_file_for_bounding_box {
    my $self = shift;
    my $histref = $self->{_input_histogram};
    my $filename = $self->{_bbox_file};
    open FILEIN, $filename
        or die "unable to open file: $!";
    my @bounding_box;
    while (<FILEIN>) {
        next if /^#/;
        next if /^\s*$/;
        chomp;
        my @line;
        if ($filename =~ /.csv$/) {
            @line =  map {$_ =~ s/^\s*|\s*$//; $_} split /,/, $_;
        } else {
            @line = split;
        }
        die "There must exist exactly two entries in each line of the bounding box file"
            if @line != 2;
        push @bounding_box, \@line;
    }
    die "There must exist only two lines in the bounding box file" if @bounding_box != 2;
    close FILEIN;
    $self->{_x_delta} = ($bounding_box[0][1] - $bounding_box[0][0]) / @{$histref->[0]};
    $self->{_y_delta} = ($bounding_box[1][1] - $bounding_box[1][0]) / @$histref;
    $self->{_bounding_box} = \@bounding_box;
}
    
# This is the heart of the module --- in the sense that this method implements the
# Metropolis-Hastings algorithm for generating the random points.  This algorithm is
# the most popular algorithm today for what is known as the MCMC (Markov Chain Monte
# Carlo) sampling from a desired probability distribution.
sub metropolis_hastings {    
    $|++;
    my $self = shift;
    die "\nyou must first call read_file_for_bounding_box() before you can call\n" .   
        "metropolis_hastings()$!\n" unless $self->{_bounding_box};
    die "\nyou must first call normalize_input_histogram() before you can call\n" .
        "metropolis_hastings()$!\n" unless $self->{_normalized_input_hist};
    die "\nyou must first call set_sigmas_for_proposal_density() before you can call\n" .
        "metropolis_hastings()$!\n" unless $self->{_sigmax_for_proposal_density};
    my $box = $self->{_bounding_box};
    my $N_discard = $self->{_how_many_to_discard};
    my $N_iterations = $self->{_N} + $N_discard;
    my $sample = [$box->[0][0] + ($box->[0][1] - $box->[0][0]) / 2.0,
                  $box->[1][0] + ($box->[1][1] - $box->[1][0]) / 2.0];
    while ($self->desired_density($sample) == 0) {
        $sample = [$self->{_bounding_box}->[0][0] +
             random_uniform() * ($self->{_bounding_box}->[0][1] - $self->{_bounding_box}->[0][0]),
                   $self->{_bounding_box}->[1][0] +
             random_uniform() * ($self->{_bounding_box}->[1][1] - $self->{_bounding_box}->[1][0])]
    }
    print "\nstarting sample: @$sample\n" unless $self->{_command_line_mode};
    if ($self->{_command_line_mode}) {
        print "\nThe Metropolis-Hastings algorithm will be run over 2500 iterations.\n" .
              "Of the 2500 points generated, the first 500 will be discarded.\n" .
              "Each dot shown below stands for 50 iterations of the algorithm.\n\n";
    } else {
        print "\nThe Metropolis-Hastings algorithm will be run for $N_discard more iterations\n" .

lib/Algorithm/RandomPointGenerator.pm  view on Meta::CPAN



=head1 CHANGES

Version 1.01 downshifts the version of Perl that is required for this module.  The
implementation code for the module is unchanged from Version 1.0.


=head1 DESCRIPTION

Several testing protocols for "big data" research projects involving large geographic
areas require a random set of points that are distributed according to a
user-specified probability density function that exists in the form of a 2D
histogram.  This module is an implementation of the Metropolis-Hastings algorithm for
generating such a set of points.

=head1 METHODS

The module provides the following methods:

=over 4

=item B<new():>

A call to C<new()> constructs a new instance of the
C<Algorithm::RandomPointGenerator> class.  If you wanted to set all the constructor
options, this call would look like:

  my $generator = Algorithm::RandomPointGenerator->new(
                            input_histogram_file    => $input_histogram_file,
                            bounding_box_file       => $bounding_box_file,
                            number_of_points        => 2000,
                            how_many_to_discard     => 500,
                            proposal_density_width  => 0.1,
                            y_axis_pos_direction    => 'down', 
                  );

where C<input_histogram_file> is the name of the file that contains a 2D histogram as
an approximation to the desired probability density function for the random points to
be generated.  The data in the histogram file is expected to be in CSV format.  Here
is a display of a very small portion of the contents of such a file for an actual
geographic region:

    0,211407,216387,211410,205621,199122,192870, ........
    0,408221,427716,427716,427716,427716,427716,427716, ......
    0,408221,427716,427716,427716,427716,427716,427716, ......
    ....
    ....
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,165,9282,11967,15143, .....
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,....

The C<bounding_box_file> parameter of the constructor should delineate the portion of
the plane to which the input histogram corresponds.  Here is an example of the
contents of an actual file supplied for this option:

     -71.772016, -70.431923
     -34.254251,  -33.203240

Apart from any comment lines, there must exist exactly two lines in the bounding-box
file, with the first line indicating the left and the right limits of the horizontal
coordinates and the second line indicating the lower and the upper limits of the
vertical coordinates.  (The values shown above are the longitude and the latitude
limits for a region in Chile, in case you are curious.)

=back 

=head2 Constructor Parameters:

=over 8

=item C<input_histogram_file>:

This required parameter supplies the name of the file that contains a 2D histogram as
the desired density function for the random points that the module will generate.
Each line record in this file must correspond to one row of the 2D histogram.  The
left-to-right direction in the file will be considered to be positive direction for
the x-axis.  As for the positive direction for the y-axis, it is common for that to
be from top to bottom when the histogram is written out to a text file as an array of
integers.  It is important to bear in mind this orientation of the histogram in light
of the fact that a bounding box is specified typically with its y-axis going upwards
(whereas the x-axis continues to be positive from left to right).  This inconsistency
between how a 2D histogram is typically stored in a text file and how a bounding box
is likely to be specified means that if the events occur more frequently in the upper
right hand corner of the bounding box, those high counts would show up in the lower
right hand corner of the histogram in a text file (or in a terminal display of the
contents of such a file). B<You can use the constructor option
C<y_axis_pos_direction> to reverse the positive sense of the y direction for the
histogram.  If you set C<y_axis_pos_direction> to the string C<up>, then the
orientation of the y axis in both the histogram and the bounding box will be the
same.>

=item C<bounding_box_file>:

This required parameter supplies the name of the file that contains the bounding box
information.  By bounding box is meant the part of the XY-plane that corresponds to
the histogram supplied through the C<input_histogram_file> option.  Apart from any
comment lines, this file must contain exactly two lines and each line must contain
exactly two numeric entries.  Additionally, the first entry in each of the two lines
must be smaller than the second entry in the same line.  The two entries in the first
line define the lower and the upper bounds on the x-axis and the two entries in the
second line do the same for the y-axis.

=item C<number_of_points>:

This parameter specifies the number of random points that you want the module to
generate.

=item C<how_many_to_discard>:

The Metropolis-Hastings algorithm belongs to a category of algorithms known as
random-walk algorithms. Since the random walk carried out by such algorithms must be
initialized with user input, it is necessary to discard the points produced until the
effects the initial state have died out.  This parameter specifies how many of the
generated points will be discarded.  This parameter is optional in the sense that it
has a default value of 500.

=item C<proposal_density_width>:

Given the current point, the Metropolis-Hastings randomly chooses a candidate for the
next point.  This random selection of a candidate point is carried out using what is
known as the "proposal density".  The module uses a bivariate Gaussian for the
proposal density.  The value supplied through this parameter controls the standard
deviations of the proposal Gaussian in the x and the y directions.  The default value
for this parameter is 0.1.  With that value for the parameter, the standard deviation
of the proposal density along the x direction will be set to 0.1 times the width of
the bounding box, and the standard deviation of the same along the y-direction to 0.1
times the height of the bounding box.

=item C<y_axis_pos_direction>:

See the explanation above for the constructor parameter C<input_histogram_file> for
why you may need to use the C<y_axis_pos_direction> parameter.  The parameter takes
one of two values, C<up> and C<down>.  The usual interpretation of a 2D histogram in
a text file --- with the positive direction of the y-axis pointing downwards ---
corresponds to the case when this parameter takes the default value of C<down>.

=item C<output_hist_bins_along_x>:

This parameter controls the resolution with which the histogram of the generated
random points will be displayed.  The value you supply is for the number of bins
along the x-direction.  The number of bins along the y-direction is set according to
the aspect ratio of the bounding box.

=back  

=over 

=item B<read_histogram_file_for_desired_density():>

    $generator->read_histogram_file_for_desired_density();

This is a required call after the constructor is invoked. As you would expect, this
call reads in the histogram data for the desired probability density function for
random point generation.

=item B<read_file_for_bounding_box():>

    $generator->read_file_for_bounding_box();

This is also a required call.  This call reads in the left and the right limits on
the x-axis and the lower and the upper limits on the y-axis for the portion of the



( run in 0.586 second using v1.01-cache-2.11-cpan-39bf76dae61 )