Algorithm-CurveFit-Simple
view release on metacpan or search on metacpan
META.json
META.yml
Makefile.PL
README
bin/curvefit
cpanfile
data/hra-bhn.tsv
dist.ini
lib/Algorithm/CurveFit/Simple.pm
t/00-fit.t
t/01-init-formula.t
t/02-init-data.t
t/03-init-parameters.t
t/04-implement-formula.t
bin/curvefit view on Meta::CPAN
--bounds-check Implementation will check for out-of-bounds input
--round-result Implementation will round output to nearest integer
--suppress-includes (C only) Do not put #include directives in output implementation
--quiet Do not write supplementary information to stderr
--profile Dump %STATS_H to STDERR as json
=head1 DESCRIPTION
This is a thin wrapper around L<Algorithm::CurveFit::Simple>, which is in turn a convenience wrapper around L<Algorithm::CurveFit>.
Given a set of x,y data pairs on STDIN, it will generate a polynomial formula f(x) = y which fits that data, and write a source code implementation of that formula to STDOUT.
Additionally it will write a maximum deviation and average deviation to STDERR. Closer to 1.0 is better. Play with --terms=# until these deviations are as close to 1.0 as possible, and beware overfitting. Use --quiet to suppress this information.
=head1 SEE ALSO
L<Algorithm::CurveFit::Simple>
=cut
lib/Algorithm/CurveFit/Simple.pm view on Meta::CPAN
require Exporter;
our $VERSION = '1.03';
our @ISA = qw(Exporter);
our @EXPORT_OK = qw(fit %STATS_H);
}
# fit() - only public function for this distribution
# Given at least parameter "xy", generate a best-fit curve within a time limit.
# Output: max deviation, avg deviation, implementation source string (perl or C, for now).
# Optional parameters and their defaults:
# terms => 3 # number of terms in formula, max is 10
# time_limit => 3 # number of seconds to try for better fit
# inv => 1 # invert sense of curve-fit, from x->y to y->x
# impl_lang => 'perl' # programming language used for output implementation: perl, c
# impl_name => 'x2y' # name given to output implementation function
sub fit {
my %p = @_;
my $formula = _init_formula(%p);
my ($xdata, $ydata) = _init_data(%p);
my $parameters = _init_parameters($xdata, $ydata, %p);
my $iter_mode = 'time';
my $time_limit = 3; # sane default?
$time_limit = 0.01 if ($time_limit < 0.01);
my $n_iter;
if (defined($p{iterations})) {
$iter_mode = 'iter';
$n_iter = $p{iterations} || 10000;
} else {
$time_limit = $p{time_limit} // $time_limit;
$n_iter = 10000 * $time_limit; # will use this to figure out how long it -really- takes.
}
my ($n_sec, $params_ar_ar);
if ($iter_mode eq 'time') {
($n_sec, $params_ar_ar) = _try_fit($formula, $parameters, $xdata, $ydata, $n_iter, $p{fitter_class});
$STATS_H{iter_mode} = $iter_mode;
$STATS_H{fit_calib_iter} = $n_iter;
$STATS_H{fit_calib_time} = $n_sec;
$STATS_H{fit_calib_parar} = $params_ar_ar;
$n_iter = int(($time_limit / $n_sec) * $n_iter + 1);
}
($n_sec, $params_ar_ar) = _try_fit($formula, $parameters, $xdata, $ydata, $n_iter, $p{fitter_class});
$STATS_H{fit_iter} = $n_iter;
$STATS_H{fit_time} = $n_sec;
$STATS_H{fit_parar} = $params_ar_ar;
my $coderef = _implement_formula($params_ar_ar, "coderef", "", $xdata, \%p);
my ($max_dev, $avg_dev) = _calculate_deviation($coderef, $xdata, $ydata);
my $impl_lang = $p{impl_lang} // 'perl';
$impl_lang = lc($impl_lang);
my $impl_name = $p{inv} ? "y2x" : "x2y";
$impl_name = $p{impl_name} // $impl_name;
my $impl = $coderef;
$impl = _implement_formula($params_ar_ar, $impl_lang, $impl_name, $xdata, \%p) unless($impl_lang eq 'coderef');
return ($max_dev, $avg_dev, $impl);
}
# ($n_sec, $params_ar_ar) = _try_fit($formula, $parameters, $xdata, $ydata, $n_iter, $p{fitter_class});
sub _try_fit {
my ($formula, $parameters, $xdata, $ydata, $n_iter, $fitter_class) = @_;
$fitter_class //= "Algorithm::CurveFit";
my $params_ar_ar = [map {[@$_]} @$parameters]; # making a copy because curve_fit() is destructive
my $tm0 = Time::HiRes::time();
my $res = $fitter_class->curve_fit(
formula => $formula,
params => $params_ar_ar,
variable => 'x',
xdata => $xdata,
ydata => $ydata,
maximum_iterations => $n_iter
);
my $tm_elapsed = Time::HiRes::time() - $tm0;
return ($tm_elapsed, $params_ar_ar);
}
sub _init_formula {
my %p = @_;
my $formula = 'k + a*x + b*x^2 + c*x^3'; # sane'ish default
my $terms = $p{terms} // 3;
die "maximum of 10 terms\n" if ($terms > 10);
if ($terms != 3) {
$formula = 'k';
for (my $i = 1; $i <= $terms; $i++) {
my $fact = chr(ord('a') + $i - 1);
$formula .= " + $fact * x^$i";
}
}
return $formula;
}
# ($xdata, $ydata) = _init_data(%p);
sub _init_data {
my %p = @_;
my ($xdata, $ydata);
if (defined($p{xydata})) {
my $xy = $p{xydata};
unless (
ref($xy) eq 'ARRAY'
lib/Algorithm/CurveFit/Simple.pm view on Meta::CPAN
my $n_points = @$xdata;
foreach my $v (@$ydata) { $k += $v; }
$k /= $n_points;
# zzapp -- implement any precision hints here.
my @params = (['k', $k, 0.0000001]);
my $terms = $p{terms} // 3;
push @params, map {[chr(ord('a')+$_-1), 0.5, 0.0000001]} (1..$terms);
return \@params;
}
# $impl = _implement_formula($params_ar_ar, $impl_lang, $impl_name, $xdata, \%p) unless($impl_lang eq 'coderef');
sub _implement_formula {
my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
return _implement_formula_as_coderef(@_) if ($impl_lang eq 'coderef');
# return _implement_formula_as_python(@_) if ($impl_lang eq 'python'); # zzapp
return _implement_formula_as_C(@_) if ($impl_lang eq 'c');
# return _implement_formula_as_R(@_) if ($impl_lang eq 'r'); # zzapp
# return _implement_formula_as_MATLAB(@_) if ($impl_lang eq 'matlab'); # zzapp
return _implement_formula_as_perl(@_);
}
sub _implement_formula_as_coderef {
my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
my $k_ar = $params_ar_ar->[0];
my $formula = sprintf("%f", $k_ar->[1]);
for (my $i = 1; defined($params_ar_ar->[$i]); $i++) {
my $fact = $params_ar_ar->[$i]->[1];
my $pow = ($i == 1) ? "" : "**$i";
$formula .= sprintf(' + %f * $x%s', $fact, $pow);
}
$STATS_H{impl_formula} = $formula;
my $bounder = '';
if ($opt_hr->{bounds_check}) {
my ($high_x, $low_x) = ($xdata->[0], $xdata->[0]);
foreach my $x (@$xdata) {
$high_x = $x if ($high_x < $x);
$low_x = $x if ($low_x > $x);
}
$bounder = 'die "x out of bounds (high)" if ($x > '.$high_x.'); die "x out of bounds (low)" if ($x < '.$low_x.');';
}
my $rounder = '';
$rounder = '$y = int($y + 0.5);' if ($opt_hr->{round_result});
my $src = 'sub { my($x) = @_; '.$bounder.' my $y = '.$formula.'; '.$rounder.' return $y; }';
$STATS_H{impl_source} = $src;
$STATS_H{impl_exception} = '';
my $coderef = eval($src);
$STATS_H{impl_exception} = $@ unless(defined($coderef));
return $coderef;
}
sub _implement_formula_as_perl {
my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
my $k_ar = $params_ar_ar->[0];
my $formula = sprintf("%.11f", $k_ar->[1]);
for (my $i = 1; defined($params_ar_ar->[$i]); $i++) {
my $fact = $params_ar_ar->[$i]->[1];
my $pow = ($i == 1) ? "" : "**$i";
$formula .= sprintf(' + %.11f * $x%s', $fact, $pow);
}
$STATS_H{impl_formula} = $formula;
my $bounder = '';
if ($opt_hr->{bounds_check}) {
my ($high_x, $low_x) = ($xdata->[0], $xdata->[0]);
foreach my $x (@$xdata) {
$high_x = $x if ($high_x < $x);
$low_x = $x if ($low_x > $x);
}
$bounder = sprintf(' die "x out of bounds (high)" if ($x > %.11f);'."\n", $high_x) .
sprintf(' die "x out of bounds (low)" if ($x < %.11f);'."\n", $low_x);
}
my $rounder = '';
$rounder = ' $y = int($y + 0.5);'."\n" if ($opt_hr->{round_result});
my $src = join("\n",(
"sub $impl_name {",
' my($x) = @_;',
$bounder,
' my $y = '.$formula.';',
$rounder,
' return $y;',
'}'
));
$STATS_H{impl_source} = $src;
$STATS_H{impl_exception} = '';
return $src;
}
sub _implement_formula_as_C {
my ($params_ar_ar, $impl_lang, $impl_name, $xdata, $opt_hr) = @_;
my $k_ar = $params_ar_ar->[0];
my $src = "";
$src .= "#include <math.h>\n" if ($opt_hr->{round_result} && !$opt_hr->{suppress_includes});
$src .= "double $impl_name(double x) {\n";
$src .= sprintf(" double y = %.11f;\n", $k_ar->[1]);
$src .= " double xx = x;\n"; # eliminating pow() calls, which gcc doesn't seem willing to optimize completely away
if ($opt_hr->{bounds_check}) {
my ($high_x, $low_x) = ($xdata->[0], $xdata->[0]);
foreach my $x (@$xdata) {
$high_x = $x if ($high_x < $x);
$low_x = $x if ($low_x > $x);
}
# zzapp -- this is kludgy. better way to signal bounds violation?
$src .= sprintf(" if (x > %.11f) return -1.0;\n", $high_x) .
sprintf(" if (x < %.11f) return -1.0;\n", $low_x);
}
my $formula = "";
for (my $i = 1; defined($params_ar_ar->[$i]); $i++) {
my $fact = $params_ar_ar->[$i]->[1];
$formula .= sprintf(" y += %.11f * xx;\n", $fact);
$formula .= " xx *= x;\n" if(defined($params_ar_ar->[$i+1]));
}
$STATS_H{impl_formula} = $formula; # zzapp -- not clean!
$src .= $formula;
$src .= " y = round(y);\n" if ($opt_hr->{round_result});
$src .= " return y;\n}\n";
$STATS_H{impl_source} = $src;
$STATS_H{impl_exception} = '';
return $src;
}
# ($max_dev, $avg_dev) = _calculate_deviation($coderef, $xdata, $ydata);
sub _calculate_deviation {
my ($coderef, $xdata, $ydata) = @_;
lib/Algorithm/CurveFit/Simple.pm view on Meta::CPAN
my ($max_dev, $avg_dev, $src) = fit(xdata => \@xdata, ydata => \@ydata, ..options..);
# Alternatively pass xdata and ydata together:
my ($max_dev, $avg_dev, $src) = fit(xydata => [\@xdata, \@ydata], ..options..);
# Alternatively pass data as array of [x,y] pairs:
my ($max_dev, $avg_dev, $src) = fit(xydata => [[1, 2], [2, 5], [3, 10]], ..options..);
=head1 DESCRIPTION
This is a convenience wrapper around L<Algorithm::CurveFit>. Given a body of (x, y) data points, it will generate a polynomial formula f(x) = y which fits that data.
Its main differences from L<Algorithm::CurveFit> are:
=over 4
=item * It synthesizes the initial formula for you,
=item * It allows for a time limit on the curve-fit instead of an iteration count,
=item * It implements the formula as source code (or as a perl coderef, if you want to use the formula immediately in your program).
=back
Additionally it returns a maximum deviation and average deviation of the formula vs the xydata, which is more useful (to me, at least) than L<Algorithm::CurveFit>'s square residual output. Closer to 1.0 indicates a better fit. Play with C<terms =E<...
=head1 SUBROUTINES
There is only one public subroutine, C<fit()>. It B<must> be given either C<xydata> or C<xdata> and C<ydata> parameters. All other paramters are optional.
It returns three values: A maximum deviation, the average deviation and the formula implementation.
=head2 Options
=over 4
=item C<fit(xdata =E<gt> \@xdata, ydata =E<gt> \@ydata)>
The data points the formula will fit. Same as L<Algorithm::CurveFit> parameters of the same name.
=item C<fit(xydata =E<gt> [[1, 2, 3, 4], [10, 17, 26, 37]])>
=item C<fit(xydata =E<gt> [[1, 10], [2, 17], [3, 26], [4, 37]])>
A more convenient way to provide data points. C<fit()> will try to detect how the data points are organized -- list of x and list of y, or list of [x,y].
=item C<fit(terms =E<gt> 3)>
Sets the order of the polynomial, which will be of the form C<k + a*x + b*x**2 + c*x**3 ...>. The default is 3 and the limit is 10.
lib/Algorithm/CurveFit/Simple.pm view on Meta::CPAN
=item C<fit(time_limit =E<gt> 3)>
If a time limit is given (in seconds), C<fit()> will spend no more than that long trying to fit the data. It may return in much less time. The default is 3.
=item C<fit(iterations =E<gt> 10000)>
If an iteration count is given, C<fit()> will ignore any time limit and iterate up to C<iterations> times trying to fit the curve. Same as L<Algorithm::CurveFit> parameter of the same name.
=item C<fit(inv =E<gt> 1)>
Setting C<inv> inverts the sense of the fit. Instead of C<f(x) = y> the formula will fit C<f(y) = x>.
=item C<fit(impl_lang =E<gt> "perl")>
Sets the programming language in which the formula will be implemented. Currently supported languages are C<"C">, C<"coderef"> and the default, C<"perl">.
When C<impl_lang =E<gt> "coderef"> is specified, a code reference is returned instead which may be used immediately by your perl script:
my($max_dev, $avg_dev, $x2y) = fit(xydata => \@xy, impl_lang => "coderef");
my $y = $x2y->(42);
More implementation languages will be supported in the future.
=item C<fit(impl_name =E<gt> "x2y")>
Sets the name of the function implementing the formula. The default is C<"x2y">. Has no effect when used with C<impl_lang =E<gt> "coderef")>.
my($max_dev, $avg_dev, $src) = fit(xydata => \@xy, impl_name => "converto");
print "$src\n";
sub converto {
my($x) = @_;
my $y = -5340.93059104837 + 249.23009968947 * $x + -3.87745746448 * $x**2 + 0.02114780993 * $x**3;
return $y;
}
=item C<fit(bounds_check =E<gt> 1)>
When set, the implementation will include logic for checking whether the input is out-of-bounds, per the highest and lowest x points in the data used to fit the formula. For implementation languages which support exceptions, an exception will be thr...
For instance, if the highest x in C<$xydata> is 83.0 and the lowest x is 60.0:
my($max_dev, $avg_dev, $src) = fit(xydata => \@xy, bounds_check => 1);
print "$src\n";
sub x2y {
my($x) = @_;
die "x out of bounds (high)" if ($x > 83.80000000000);
lib/Algorithm/CurveFit/Simple.pm view on Meta::CPAN
=head1 VARIABLES
The class variable C<%STATS_H> contains various intermediate values which might be helpful. For instance, C<$STATS_H{deviation_max_offset_datum}> contains the x data point which corresponds to the maximum deviation returned.
The contents of C<%STATS_H> is subject to change and might not be fully documented in future versions. The current fields are:
=over 4
=item C<deviation_max_offset_datum>: The x data point corresponding with returned maximum deviation.
=item C<fit_calib_parar>: Arrayref of formula parameters as returned by L<Algorithm::CurveFit> after a short fitting attempt used for timing calibration.
=item C<fit_calib_time>: The number of seconds L<Algorithm::CurveFit> spent in the calibration run.
=item C<fit_iter>: The iterations parameter passed to L<Algorithm::CurveFit>.
=item C<fit_parar>: Arrayref of formula parameters as returned by L<Algorithm::CurveFit>.
=item C<fit_time>: The number of seconds L<Algorithm::CurveFit> actually spent fitting the formula.
=item C<impl_exception>: The exception thrown when the implementation was used to calculate the deviations, or the empty string if none.
=item C<impl_formula>: The formula part of the implementation.
=item C<impl_source>: The implementation source string.
=item C<iter_mode>: One of C<"time"> or C<"iter">, indicating whether a time limit was used or an iteration count.
=item C<xdata>: Arrayref of x data points as passed to L<Algorithm::CurveFit>.
=item C<ydata>: Arrayref of y data points as passed to L<Algorithm::CurveFit>.
=back
lib/Algorithm/CurveFit/Simple.pm view on Meta::CPAN
=head1 CAVEATS
=over 4
=item * Only simple polynomial functions are supported. Sometimes you need something else. Use L<Algorithm::CurveFit> for such cases.
=item * If C<xydata> is very large, iterating over it to calculate deviances can take more time than permitted by C<time_limit>.
=item * The dangers of overfitting are real! L<https://en.wikipedia.org/wiki/Overfitting>
=item * Using too many terms can dramatically reduce the accuracy of the fitted formula.
=item * Sometimes calling L<Algorithm::CurveFit> with a ten-term polynomial causes it to hang.
=back
=head1 TO DO
=over 4
=item * Support more programming languages for formula implementation: R, MATLAB, python
=item * Calculate the actual term sigfigs and set precision appropriately in the formula implementation instead of just "%.11f".
=item * Support trying a range of terms and returning whatever gives the best fit.
=item * Support piecewise output formulas.
=item * Work around L<Algorithm::CurveFit>'s occasional hang problem when using ten-term polynomials.
=back
=head1 SEE ALSO
L<Algorithm::CurveFit>
L<curvefit>
t/01-init-formula.t view on Meta::CPAN
#!/bin/env perl
use strict;
use warnings;
use Test::Most;
use JSON::PP;
use lib "./lib";
use Algorithm::CurveFit::Simple qw(%STATS_H);
my $f = eval { Algorithm::CurveFit::Simple::_init_formula(); };
print "# exception: $@\n" if ($@);
ok !$@, "default - no thrown exceptions";
ok defined($f), "default - returned a defined value";
$f //= '';
is ref($f), '', "default - returned string";
ok $f =~ /^k (\+ \w\s?\*\s?x\^?\d*\s*)+$/, "default - formula is well-formed" if($f);
my $formula;
for (my $i = 1; $i < 10; $i++) {
$f = eval { Algorithm::CurveFit::Simple::_init_formula(terms => $i); };
$formula = $f if ($i == 3);
ok !$@, "with $i terms - no thrown exceptions";
ok defined($f), "with $i terms - returned a defined value";
$f //= '';
is ref($f), '', "with $i terms - returned string";
ok $f =~ /^k (\+ \w\s?\*\s?x\^?\d*\s*){$i}$/, "with $i terms - formula is well-formed" if($f);
}
if ($ARGV[0]) {
print "formula=$formula\n";
print JSON::PP::encode_json(\%STATS_H)."\n";
}
done_testing();
exit(0);
t/04-implement-formula.t view on Meta::CPAN
use warnings;
use Test::Most;
use JSON::PP;
use lib "./lib";
use Algorithm::CurveFit::Simple;
my $parar = [["k", 10, 1], ["a", 2, 1], ["b", 3, 1], ["c", 4, 1]];
my $xdata = [8, 3, 5, 11, 9, 7, 4];
my $s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "", "x2y", $xdata, {}); };
ok defined($s), "implementation default throws no exceptions";
ok $s =~ /^sub x2y /, "implementation default is perl";
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "coderef", "x2y", $xdata, {}); };
ok defined($s), "implementation coderef throws no exceptions";
is ref($s), "CODE", "implementation coderef is coderef";
# perl implementation:
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "perl", "x2y", $xdata, {}); };
ok defined($s), "implementation perl throws no exceptions";
ok $s =~ /^sub x2y /, "implementation perl is perl";
ok $s !~ /x out of bounds/, "implementation perl without bounds check is permissive";
ok $s !~ /y = int/, "implementation perl without rounding check returns precision result";
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "perl", "x2y", $xdata, {bounds_check => 1}); };
ok $s =~ /x out of bounds/, "implementation perl with bounds check is limited";
ok $s !~ /y = int/, "implementation perl with bounds check without round result returns precision result";
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "perl", "x2y", $xdata, {round_result => 1}); };
ok $s !~ /x out of bounds/, "implementation perl without bounds check with round result is permissive";
ok $s =~ /y = int/, "implementation perl without bounds check with round result returns rounded result";
# C implementation:
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "c", "x2y", $xdata, {}); };
ok defined($s), "implementation C throws no exceptions";
ok $s =~ /double x2y\(double x\) \{/, "implementation C is C";
ok $s !~ /return -1.0/, "implementation C without bounds check is permissive";
ok $s !~ /y = round/, "implementation C without rounding check returns precision result";
print $Algorithm::CurveFit::Simple::STATS_H{impl_source} . "\n" if ($ARGV[0]);
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "c", "x2y", $xdata, {bounds_check => 1}); };
ok $s =~ /return -1.0/, "implementation C with bounds check is limited";
ok $s !~ /y = round/, "implementation C with bounds check without round result returns precision result";
print $Algorithm::CurveFit::Simple::STATS_H{impl_source} . "\n" if ($ARGV[0]);
$s = eval { Algorithm::CurveFit::Simple::_implement_formula($parar, "c", "x2y", $xdata, {round_result => 1}); };
ok $s !~ /return -1.0/, "implementation C without bounds check with round result is permissive";
ok $s =~ /y = round/, "implementation C without bounds check with round result returns rounded result";
print $Algorithm::CurveFit::Simple::STATS_H{impl_source} . "\n" if ($ARGV[0]);
if ($ARGV[0]) {
print JSON::PP::encode_json(\%Algorithm::CurveFit::Simple::STATS_H) . "\n";
}
done_testing();
exit(0);
( run in 0.421 second using v1.01-cache-2.11-cpan-3cd7ad12f66 )