lib/AI/MXNet/Optimizer.pm
method create_state(Index $index, AI::MXNet::NDArray $weight)
{
    return undef;
}

method update(
    Index $index,
    AI::MXNet::NDArray $weight,
    AI::MXNet::NDArray $grad,
    AI::MXNet::NDArray|Undef $state
)
{
    my $lr = $self->_get_lr($index);
    my $wd = $self->_get_wd($index);
    $self->_update_count($index);
    # rescale and optionally clip the gradient
    $grad *= $self->rescale_grad;
    if($self->clip_gradient)
    {
        $grad = AI::MXNet::NDArray->clip(
            $grad,
            -$self->clip_gradient,
            $self->clip_gradient
        );
    }
    # Langevin-style step: half a weight-decayed gradient step plus
    # Gaussian noise with standard deviation sqrt(lr)
    $weight += -$lr/2 * ($grad + $wd * $weight)
               +
               AI::MXNet::Random->normal(
                    0, sqrt($lr),
                    $weight->shape,
                    $weight->context
               );
}

__PACKAGE__->register;
=head1 NAME

    AI::MXNet::Adam - Adam optimizer as described in [King2014]_.

=cut

=head1 DESCRIPTION

    Adam optimizer as described in [King2014]_.

    .. [King2014] Diederik Kingma, Jimmy Ba,
       *Adam: A Method for Stochastic Optimization*,
       http://arxiv.org/abs/1412.6980

    The code in this class was adapted from
    https://github.com/mila-udem/blocks/blob/master/blocks/algorithms/__init__.py#L765

    Parameters
    ----------
    learning_rate : float, optional
        Step size.
        Default value is set to 0.001.
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
        Default value is set to 0.9.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
        Default value is set to 0.999.
    epsilon : float, optional
        Default value is set to 1e-8.
    decay_factor : float, optional
        Default value is set to 1 - 1e-8.
    wd : float, optional
        L2 regularization coefficient added to all the weights.
    rescale_grad : float, optional
        Rescaling factor for the gradient. Normally should be 1/batch_size.
    clip_gradient : float, optional
        Clip the gradient to the range [-clip_gradient, clip_gradient].

=cut
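# A minimal usage sketch (illustration only; the constructor call is an
# assumption based on the attributes declared in this class, and
# create_optimizer() is assumed to be the by-name registry lookup provided by
# AI::MXNet::Optimizer):
#
#     use AI::MXNet qw(mx);
#     my $adam = AI::MXNet::Adam->new(learning_rate => 0.001, wd => 1e-4);
#     # or by registered name:
#     # my $adam = AI::MXNet::Optimizer->create_optimizer('adam', learning_rate => 0.001);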
package AI::MXNet::Adam;
use Mouse;

extends 'AI::MXNet::Optimizer';

has 'kwargs' => (is => "rw", isa => "HashRef[Num]");
has '+learning_rate' => (default => 0.001);
has 'beta1'        => (is => "rw", isa => "Num", default => 0.9);
has 'beta2'        => (is => "rw", isa => "Num", default => 0.999);
has 'epsilon'      => (is => "rw", isa => "Num", default => 1e-8);
has 'decay_factor' => (is => "rw", isa => "Num", default => (1 - 1e-8));

sub BUILD
{
    my $self = shift;
    # gather the optimizer hyperparameters into one hash
    $self->kwargs({
        rescale_grad => $self->rescale_grad,
        beta1        => $self->beta1,
        beta2        => $self->beta2,
        epsilon      => $self->epsilon
    });
    if($self->clip_gradient)
    {
        $self->kwargs->{clip_gradient} = $self->clip_gradient;
    }
}

method create_state(Index $index, AI::MXNet::NDArray $weight)
{
    return [
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx   => $weight->context,
            dtype => $weight->dtype
        ),  # mean
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx   => $weight->context,
            dtype => $weight->dtype
        )   # variance
    ];
}
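# The body of update() is cut off at the snippet boundary below. For
# orientation only, a generic Adam step over the [mean, variance] state created
# above looks roughly like this (a sketch, not necessarily the exact code of
# this module):
#
#     my ($mean, $var) = @{ $state };
#     $mean .= $self->beta1 * $mean + (1 - $self->beta1) * $grad;
#     $var  .= $self->beta2 * $var  + (1 - $self->beta2) * $grad * $grad;
#     my $coef1 = 1 - $self->beta1 ** $t;   # bias corrections
#     my $coef2 = 1 - $self->beta2 ** $t;
#     $weight  -= $lr * sqrt($coef2) / $coef1 * $mean / ($var->sqrt + $self->epsilon);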
method update(
    Index $index,
    AI::MXNet::NDArray $weight,
    AI::MXNet::NDArray $grad,
lib/AI/MXNet/Optimizer.pm
            $weight->shape,
            ctx => $weight->context
        ),  # dn
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx => $weight->context
        )   # n
    ];
}

method update(
    Index $index,
    AI::MXNet::NDArray $weight,
    AI::MXNet::NDArray $grad,
    ArrayRef[AI::MXNet::NDArray] $state
)
{
    $self->_update_count($index);
    my $wd = $self->_get_wd($index);
    my $lr = $self->_get_lr($index);
    $grad *= $self->rescale_grad;
    if($self->clip_gradient)
    {
        $grad = AI::MXNet::NDArray->clip(
            $grad,
            -$self->clip_gradient,
            $self->clip_gradient
        );
    }
    # FTRL-style accumulators: $n holds the running sum of squared gradients,
    # $dn the shifted gradient sum used for the proximal step
    my ($dn, $n) = @{ $state };
    $dn += $grad - (($n + $grad * $grad)->sqrt - $n->sqrt) * $weight / $lr;
    $n  += $grad * $grad;
    # soft-threshold by lamda1: entries with |dn| <= lamda1 are zeroed
    $weight .= ($dn->sign * $self->lamda1 - $dn)
               /
               (($self->beta + $n->sqrt) / $lr + $wd) * ($dn->abs > $self->lamda1);
}
__PACKAGE__->register;
package AI::MXNet::Adamax;

=head1 NAME

    AI::MXNet::Adamax

=cut

=head1 DESCRIPTION

    A variant of Adam based on the infinity norm,
    described in http://arxiv.org/abs/1412.6980 Section 7.

    This optimizer accepts the following parameters in addition to those accepted
    by AI::MXNet::Optimizer.

    Parameters
    ----------
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.

=cut
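# The update() below implements, in NDArray operations, the AdaMax step
# (a restatement of the code that follows, shown here as plain equations):
#
#     m_t = beta1 * m_{t-1} + (1 - beta1) * grad       # first moment
#     u_t = max(beta2 * u_{t-1}, |grad|)                # infinity-norm accumulator
#     w   = w - (lr / (1 - beta1**t)) * m_t / u_t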
use Mouse;

extends 'AI::MXNet::Optimizer';

has '+learning_rate' => (default => 0.002);
has 'beta1' => (is => "ro", isa => "Num", default => 0.9);
has 'beta2' => (is => "ro", isa => "Num", default => 0.999);

method create_state(Index $index, AI::MXNet::NDArray $weight)
{
    return [
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx   => $weight->context,
            dtype => $weight->dtype
        ),  # mean
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx   => $weight->context,
            dtype => $weight->dtype
        )   # variance
    ];
}

method update(
    Index $index,
    AI::MXNet::NDArray $weight,
    AI::MXNet::NDArray $grad,
    ArrayRef[AI::MXNet::NDArray] $state
)
{
    my $wd = $self->_get_wd($index);
    my $lr = $self->_get_lr($index);
    $self->_update_count($index);
    my $t = $self->_index_update_count->{$index};
    $lr /= (1 - $self->beta1**$t);
    $grad = $grad * $self->rescale_grad + $wd * $weight;
    if($self->clip_gradient)
    {
        $grad = AI::MXNet::NDArray->clip(
            $grad,
            -$self->clip_gradient,
            $self->clip_gradient
        );
    }
    # update m_t and u_t
    my ($m_t, $u_t) = @{ $state };
    $m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad;
    $u_t .= AI::MXNet::NDArray->maximum($self->beta2 * $u_t, $grad->abs);
    # update weight
    $weight -= $lr * $m_t / $u_t;
}
__PACKAGE__->register;
package AI::MXNet::Nadam;

=head1 NAME

    AI::MXNet::Nadam

=cut

=head1 DESCRIPTION

    The Nesterov Adam optimizer.
    Much like Adam is essentially RMSprop with momentum,
    Nadam is Adam with Nesterov momentum; see
    http://cs229.stanford.edu/proj2015/054_report.pdf.

    This optimizer accepts the following parameters in addition to those accepted
    by AI::MXNet::Optimizer.

    Parameters
    ----------
    beta1 : float, optional
        Exponential decay rate for the first moment estimates.
    beta2 : float, optional
        Exponential decay rate for the second moment estimates.
    epsilon : float, optional
        Small value to avoid division by 0.
    schedule_decay : float, optional
        Exponential decay rate for the momentum schedule.

=cut
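# The "warming momentum schedule" used in update() below increases the
# effective momentum coefficient over time. With the defaults beta1 = 0.9 and
# schedule_decay = 0.004 (worked numbers, for illustration only):
#
#     t = 1:     momentum_t = 0.9 * (1 - 0.5 * 0.96**0.004)  ~ 0.45
#     t -> inf:  0.96**(t * 0.004) -> 0, so momentum_t -> 0.9
#
# m_schedule accumulates the product of these coefficients across steps; the
# part of update() that uses it for bias correction is truncated in this listing.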
use Mouse;

extends 'AI::MXNet::Optimizer';

has '+learning_rate' => (default => 0.001);
has 'beta1'          => (is => "ro", isa => "Num", default => 0.9);
has 'beta2'          => (is => "ro", isa => "Num", default => 0.999);
has 'epsilon'        => (is => "ro", isa => "Num", default => 1e-8);
has 'schedule_decay' => (is => "ro", isa => "Num", default => 0.004);
has 'm_schedule'     => (is => "rw", default => 1, init_arg => undef);

method create_state(Index $index, AI::MXNet::NDArray $weight)
{
    return [
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx   => $weight->context,
            dtype => $weight->dtype
        ),  # mean
        AI::MXNet::NDArray->zeros(
            $weight->shape,
            ctx   => $weight->context,
            dtype => $weight->dtype
        )   # variance
    ];
}

method update(
    Index $index,
    AI::MXNet::NDArray $weight,
    AI::MXNet::NDArray $grad,
    ArrayRef[AI::MXNet::NDArray] $state
)
{
    my $wd = $self->_get_wd($index);
    my $lr = $self->_get_lr($index);
    $self->_update_count($index);
    my $t = $self->_index_update_count->{$index};
    $grad = $grad * $self->rescale_grad + $wd * $weight;
    if($self->clip_gradient)
    {
        $grad = AI::MXNet::NDArray->clip(
            $grad,
            -$self->clip_gradient,
            $self->clip_gradient
        );
    }
    # warming momentum schedule
    my $momentum_t   = $self->beta1 * (1 - 0.5 * (0.96**($t * $self->schedule_decay)));
    my $momentum_t_1 = $self->beta1 * (1 - 0.5 * (0.96**(($t + 1) * $self->schedule_decay)));
    $self->m_schedule($self->m_schedule * $momentum_t);
    my $m_schedule_next = $self->m_schedule * $momentum_t_1;
    # update m_t and v_t
    my ($m_t, $v_t) = @{ $state };
    $m_t .= $self->beta1 * $m_t + (1 - $self->beta1) * $grad;