App-Licensecheck
view release on metacpan or search on metacpan
lib/App/Licensecheck.pm view on Meta::CPAN
elsif ( $position < 0 ) {
$log->error('header end could not be resolved');
$offset = 0;
}
else {
$log->error('header end oddly at beginning of file');
$offset = 0;
}
}
}
elsif ($offset) {
# TODO: distinguish tail from full content
return $content
if defined($tail_content);
$tail_content = '';
$fh->seek( $offset, SEEK_SET ); # see IO::Seekable
$tail_content .= join( '', $fh->getlines );
$log->trace("----- tail -----\n$tail_content----- end tail -----")
if $log->is_trace;
$content = $tail_content;
}
else {
$log->errorf(
'tail offset not usable: %s',
$offset
);
return '';
}
# TODO: distinguish comment-mangled content from pristine content
local $_ = $content or return '';
# Remove generic comments: look for 4 or more lines beginning with
# regular comment pattern and trim it. Fall back to old algorithm
# if no such pattern found.
my @matches = m/^[ \t]*([^a-zA-Z0-9\s]{1,3})[ \t]+\S/mg;
if ( @matches >= 4 ) {
my $comment_re = qr/^[ \t]*[\Q$matches[0]\E]{1,3}[ \t]*/m;
s/$comment_re//g;
}
my @wordmatches = m/^[ \t]*(dnl|REM|COMMENT)[ \t]+\S/mg;
if ( @wordmatches >= 4 ) {
my $comment_re = qr/^[ \t]*\Q$wordmatches[0]\E[ \t]*/m;
s/$comment_re//g;
}
# Remove other side of "boxed" comments
s/[ \t]*[*#][ \t]*$//gm;
# Remove Fortran comments
s/^[cC]$//gm;
s/^[cC] //gm;
# Remove C / C++ comments
s#(\*/|/\*|(?<!:)//)##g;
# Strip escaped newline
s/\s*\\n\s*/ /g;
$content = $_;
return $content;
}
my $html_xml_tags_re = qr/<\/?(?:p|br|ref)(?:\s[^>]*)?>/i;
# clean cruft
method content_cleaned
{
local $_ = $self->content or return '';
# strip common html and xml tags
s/$html_xml_tags_re//g;
# TODO: decode latin1/UTF-8/HTML data instead
s/\xcb\x97|\xe2\x80[\x90-\x95|\xe2\x81\x83|\xe2\x88\x92|\xef\x89\xa3|\xef\xbc\x8d]|[&](?:ndash|mdash|horbar|minus|[#](?:727|820[8-9]|821[0-3]|8259|8722|65123|65293|x727|z201[0-5]|x2043|x2212|xFE63|xFF0D))[;]/-/gm;
s/\x58\xa9|\xc2\xa9|\xe2\x92\x9e|\xe2\x92\xb8|\xe2\x93\x92|\xf0\x9f\x84\x92|\xf0\x9f\x84\xab|\xf0\x9f\x85\x92|[&](?:copy|[#](?:169|9374|9400|9426|127250|127275|127314|x0A9|x249E|x24b8|x24D2|x0F112|x0F12B|x0F152))[;]/©/gm;
# TODO: decode nroff files specifically instead
s/\\//gm; # de-cruft nroff files
return $_;
}
# clean cruft and whitespace
method content_extracleaned
{
local $_ = $self->content or return '';
# strip trailing dash, assuming it is soft-wrap
# (example: disclaimers in GNU autotools file "install-sh")
s/-\r?\n//g;
# strip common html and xml tags
s/$html_xml_tags_re//g;
tr/\t\r\n/ /;
# this also removes quotes
tr% A-Za-z.,:@;0-9\(\)/-%%cd;
tr/ //s;
return $_;
}
=encoding UTF-8
=head1 AUTHOR
Jonas Smedegaard C<< <dr@jones.dk> >>
=head1 COPYRIGHT AND LICENSE
This program is based on the script "licensecheck" from the KDE SDK,
originally introduced by Stefan Westerfeld C<< <stefan@space.twc.de> >>.
Copyright © 2007, 2008 Adam D. Barratt
( run in 1.849 second using v1.01-cache-2.11-cpan-39bf76dae61 )