Excel-ValueReader-XLSX
view release on metacpan or search on metacpan
lib/Excel/ValueReader/XLSX/Backend/Regex.pm view on Meta::CPAN
package Excel::ValueReader::XLSX::Backend::Regex;
use utf8;
use 5.12.1;
use Moose;
use Scalar::Util qw/looks_like_number/;
use Iterator::Simple qw/iter/;
extends 'Excel::ValueReader::XLSX::Backend';
#======================================================================
# LAZY ATTRIBUTE CONSTRUCTORS
#======================================================================
# Builds the table of shared strings.
# Reads the 'xl/sharedStrings.xml' zip member and returns an arrayref holding
# one string per <si> node, in document order, with XML entities decoded.
sub _strings {
  my $self = shift;
  my @strings;

  # read from the sharedStrings zip member
  my $contents = $self->_zip_member_contents('xl/sharedStrings.xml');

  # iterate on <si> nodes; a self-closing <si/> must still yield an (empty)
  # entry, otherwise all following entries would shift by one position
  while ($contents =~ m[<si\s*/>|<si>(.*?)</si>]sg) {
    my $innerXML = $1 // '';

    # phonetic runs (<rPh>) carry pronunciation hints, not cell text
    $innerXML =~ s[<rPh\b.*?</rPh>][]sg;

    # concatenate contents from all <t> nodes (usually there is only 1).
    # The content may be empty, and a self-closing <t/> is not an opening
    # tag : in both cases the match must not run up to a later </t>.
    my $string = join "", ($innerXML =~ m[<t(?:\s[^>]*)?(?<!/)>(.*?)</t>]sg);

    # decode XML entities (in-place modification of $string)
    _decode_xml_entities($string);

    push @strings, $string;
  }

  return \@strings;
}
# Extracts general information from the 'xl/workbook.xml' zip member.
# Returns a hashref with entries :
#   sheets       => hashref (sheet name => 1-based position of the <sheet> node)
#   base_year    => 1904 if the workbook declares date1904 as "1" or "true",
#                   1900 otherwise
#   active_sheet => 1-based position of the active tab; this entry is only
#                   present when the workbook has an activeTab attribute
sub _workbook_data {
  my $self = shift;
  my %workbook_data;

  # read from the workbook.xml zip member
  my $workbook = $self->_zip_member_contents('xl/workbook.xml');

  # extract sheet names, in document order. The 'name' attribute is not
  # required to be the first one in the <sheet> tag.
  my @sheet_names = ($workbook =~ m[<sheet\b[^>]*?\sname="([^"]+)"]g);

  # names are XML attribute values, so a sheet called 'R&D' is stored as
  # 'R&amp;D' : decode in-place so that sheets can be found by their real name
  _decode_xml_entities($_) for @sheet_names;

  $workbook_data{sheets} = {map {$sheet_names[$_] => $_+1} 0 .. $#sheet_names};

  # does this workbook use the 1904 calendar ?
  my ($date1904) = $workbook =~ m[date1904="(.+?)"];
  $workbook_data{base_year} = $date1904 && $date1904 =~ /^(1|true)$/ ? 1904 : 1900;

  # active sheet (activeTab is 0-based in the XML)
  my ($active_tab) = $workbook =~ m[<workbookView[^>]+activeTab="(\d+)"];
  $workbook_data{active_sheet} = $active_tab + 1 if defined $active_tab;

  return \%workbook_data;
}
# Computes which cell formats display dates or times.
# Reads the 'xl/styles.xml' zip member and returns an arrayref of shape
# (xf_index => numFmt_code) : for each <xf> node found within <cellXfs>, the
# entry is the number format code if that format is a date/time format, and
# undef otherwise.
sub _date_styles {
  my $self = shift;

  # a format code is considered a date format if it has a 'd' or 'y', or 'mm'
  state $date_style_regex = qr{[dy]|\bmm\b};

  # Parts of a format code that never denote a date component and therefore
  # must be ignored when looking for date letters :
  #   - quoted literal text (quotes are XML-encoded within the attribute value)
  #   - backslash-escaped literal characters
  #   - bracketed sections like colors, conditions or locales; brackets that
  #     only contain h, m or s letters are elapsed-time codes and are kept
  state $non_date_parts_regex = qr{
      &quot; .*? &quot;
    | \\ .
    | \[ (?! [hmsHMS]+ \] ) [^\]]* \]
  }x;

  # read from the styles.xml zip member
  my $styles = $self->_zip_member_contents('xl/styles.xml');

  # start with Excel builtin number formats for dates and times
  my @numFmt = $self->Excel_builtin_date_formats;

  # add other date formats explicitly specified in this workbook
  while ($styles =~ m[<numFmt numFmtId="(\d+)" formatCode="([^"]+)"/>]g) {
    my ($id, $code) = ($1, $2);

    # test a stripped copy, but remember the original code
    (my $significant = $code) =~ s/$non_date_parts_regex//g;
    $numFmt[$id] = $code if $significant =~ $date_style_regex;
  }

  # read all cell formats, just remember those that involve a date number
  # format. The /s modifier is needed because the <xf> nodes may be spread
  # over several lines; the 'count' attribute is not required.
  my ($cellXfs)    = ($styles =~ m[<cellXfs\b[^>]*>(.+?)</cellXfs>]s);
  my @cell_formats = defined $cellXfs ? $self->_extract_xf($cellXfs) : ();

  # one entry per cell format, so that positions match xf indices
  my @date_styles;
  for my $xf (@cell_formats) {
    my $id = $xf->{numFmtId};
    push @date_styles, defined $id ? $numFmt[$id] : undef;
  }

  return \@date_styles; # array of shape (xf_index => numFmt_code)
}
# Parses a fragment of styles XML and returns the list of its <xf> nodes,
# in document order. Each node is represented by whatever _xml_attrs()
# returns for the attribute string of the opening tag.
# An undefined fragment yields an empty list.
sub _extract_xf {
  my ($self, $xml) = @_;

  # The /s modifier matters : an <xf> node with child content may be spread
  # over several lines; without /s such a node would be skipped and all
  # following nodes would get a wrong index.
  state $xf_node_regex = qr{
     <xf                  # initial format tag
     \s
     ([^>/]*+)            # attributes (first capture group)
     (?:                  # non-capturing group for an alternation :
        />                # .. either an xml closing without content
       |                  # or
        >                 # .. closing for the xf tag
        .*?               # .. then some formatting content
        </xf>             # .. then the ending tag for the xf node
     )
  }xs;

  my @xf_nodes;
  return @xf_nodes if !defined $xml;     # nothing to parse

  while ($xml =~ /$xf_node_regex/g) {
    push @xf_nodes, _xml_attrs($1);
  }

  return @xf_nodes;
}
( run in 0.801 second using v1.01-cache-2.11-cpan-39bf76dae61 )