Catalyst-Controller-SimpleCAS
view release on metacpan or search on metacpan
lib/Catalyst/Controller/SimpleCAS/Role/TextTranscode.pm view on Meta::CPAN
my $MIME = try{
# This will frequently produce uninitialized value warnings from Email::Simple::Header,
# and I haven't been able to figure out how to stop it
Email::MIME->new($src_octets)
};
if($MIME && $MIME->subparts) {
$content = $self->convert_from_mhtml($c,$MIME);
}
# If HTML or binary:
else {
if(!$upload || $upload->type =~ /^text/){
my $src_encoding= encoding_from_html_document($src_octets) || 'utf-8';
my $in_codec= Encode::find_encoding($src_encoding) or die "Unsupported encoding: $src_encoding";
$content = (utf8::is_utf8($src_octets)) ? $src_octets : $in_codec->decode($src_octets);
}
# Binary
else {
my $checksum = $self->Store->add_content_file_mv($upload->tempname) or die "Failed to add content";
my $Content = $self->Content($checksum,$upload->filename);
return $Content->imglink if ($Content->imglink);
return $Content->filelink;
}
}
# TODO: Detect other content types and add fallback logic
$content = $self->parse_html_get_style_body(\$content);
$self->convert_data_uri_scheme_links($c,\$content);
# Use style tags just as a safe place to store the original filename
# (switched to this after having issues with html comments)
$content = '<style>/*----ORIGINAL_FILENAME:' .
$filename .
'----*/</style>' . "\n" . $content if ($filename);
return $content;
}
sub convert_from_mhtml {
my $self = shift;
my $c = shift;
my $MIME = shift;
my ($SubPart) = $MIME->subparts or return;
## -- Check for and remove extra outer MIME wrapper (exists in actual MIME EMails):
$MIME = $SubPart if (
$SubPart->content_type &&
$SubPart->content_type =~ /multipart\/related/
);
## --
my ($MainPart) = $MIME->subparts or return;
## ------
## New: throw the kitchen sink at trying to figure out the charset/encoding
##
## This solves the long-standing problem where MHT files saved by Word 2010
## would load garbled. These files are encoded as 'UTF-16LE', and the system
## is not able to realize this out of the box (I think because it lists the
## the charset ambiguously as ' charset="unicode" ' in the Content-Type
## MIME header, but I'm no expert on Unicode). Below we're basically trying
## all of the functions of HTML::Encoding until we find one that gives us
## an answer, and if we do get an answer, we apply it to the MIME object before
## calling ->body_str() which will then use it to decode to text.
##
my $decoded = $MainPart->body; # <-- decodes from base64 (or whatever) to *bytes*
my $char_set =
HTML::Encoding::encoding_from_html_document ($decoded) ||
HTML::Encoding::encoding_from_byte_order_mark ($decoded) ||
HTML::Encoding::encoding_from_meta_element ($decoded) ||
HTML::Encoding::xml_declaration_from_octets ($decoded) ||
HTML::Encoding::encoding_from_first_chars ($decoded) ||
HTML::Encoding::encoding_from_xml_declaration ($decoded) ||
HTML::Encoding::encoding_from_content_type ($decoded) ||
HTML::Encoding::encoding_from_xml_document ($decoded);
$MainPart->charset_set( $char_set ) if ($char_set);
## ------
my $html = $MainPart->body_str; # <-- decodes to text using the character_set
my $base_path = $self->parse_html_base_href(\$html) || $self->get_mime_part_base_path($MainPart);
my %ndx = ();
$MIME->walk_parts(sub{
my $Part = shift;
return if ($Part == $MIME || $Part == $MainPart); #<-- ignore the outer and main/body parts
my $content_id = $Part->header('Content-ID');
if ($content_id) {
$ndx{'cid:' . $content_id} = $Part;
$content_id =~ s/^\<//;
$content_id =~ s/\>$//;
$ndx{'cid:' . $content_id} = $Part;
}
my $content_location = $Part->header('Content-Location');
if($content_location) {
$ndx{$content_location} = $Part;
if($base_path) {
$content_location =~ s/^\Q$base_path\E//;
$ndx{$content_location} = $Part;
}
}
});
$self->convert_mhtml_links_parts($c,\$html,\%ndx);
return $html;
}
# Try to extract the 'body' from html to prevent causing DOM/parsing issues on the client side
sub parse_html_get_style_body {
my $self = shift;
my $htmlref = shift;
my $body = $self->parse_html_get_body($htmlref) or return $$htmlref;
my $style = $self->parse_html_get_styles($htmlref);
my $auto_css_pre = 'cas-selector-wrap-';
( run in 1.455 second using v1.01-cache-2.11-cpan-99c4e6809bf )