Catalyst-Controller-SimpleCAS

 view release on metacpan or  search on metacpan

lib/Catalyst/Controller/SimpleCAS/Role/TextTranscode.pm  view on Meta::CPAN

  my $MIME = try{
    # This will frequently produce uninitialized value warnings from Email::Simple::Header,
    # and I haven't been able to figure out how to stop it
    Email::MIME->new($src_octets)
  };
  if($MIME && $MIME->subparts) {
    $content = $self->convert_from_mhtml($c,$MIME);
  }
  # If HTML or binary:
  else {
    if(!$upload || $upload->type =~ /^text/){
      my $src_encoding= encoding_from_html_document($src_octets) || 'utf-8';
      my $in_codec= Encode::find_encoding($src_encoding) or die "Unsupported encoding: $src_encoding";
      $content = (utf8::is_utf8($src_octets)) ? $src_octets : $in_codec->decode($src_octets);
    }
    # Binary
    else {
      my $checksum = $self->Store->add_content_file_mv($upload->tempname) or die "Failed to add content";
      my $Content = $self->Content($checksum,$upload->filename);
      return $Content->imglink if ($Content->imglink);
      return $Content->filelink;
    }
  }
  # TODO: Detect other content types and add fallback logic
  
  $content = $self->parse_html_get_style_body(\$content);
  $self->convert_data_uri_scheme_links($c,\$content);
  
  # Use style tags just as a safe place to store the original filename
  # (switched to this after having issues with html comments)
  $content = '<style>/*----ORIGINAL_FILENAME:' .
    $filename .
  '----*/</style>' . "\n" . $content if ($filename);

  return $content;
}


sub convert_from_mhtml {
  my $self = shift;
  my $c = shift;
  my $MIME = shift;

  my ($SubPart) = $MIME->subparts or return;
  
  ## -- Check for and remove extra outer MIME wrapper (exists in actual MIME EMails):
  $MIME = $SubPart if (
    $SubPart->content_type &&
    $SubPart->content_type =~ /multipart\/related/
  );
  ## --
  
  my ($MainPart) = $MIME->subparts or return;

  ## ------
  ## New: throw the kitchen sink at trying to figure out the charset/encoding
  ##
  ## This solves the long-standing problem where MHT files saved by Word 2010
  ## would load garbled. These files are encoded as 'UTF-16LE', and the system
  ## is not able to realize this out of the box (I think because it lists the
  ## the charset ambiguously as ' charset="unicode" ' in the Content-Type
  ## MIME header, but I'm no expert on Unicode). Below we're basically trying 
  ## all of the functions of HTML::Encoding until we find one that gives us
  ## an answer, and if we do get an answer, we apply it to the MIME object before
  ## calling ->body_str() which will then use it to decode to text.
  ##
  my $decoded = $MainPart->body; # <-- decodes from base64 (or whatever) to *bytes*

  my $char_set =
    HTML::Encoding::encoding_from_html_document   ($decoded) ||
    HTML::Encoding::encoding_from_byte_order_mark ($decoded) ||
    HTML::Encoding::encoding_from_meta_element    ($decoded) ||
    HTML::Encoding::xml_declaration_from_octets   ($decoded) ||
    HTML::Encoding::encoding_from_first_chars     ($decoded) ||
    HTML::Encoding::encoding_from_xml_declaration ($decoded) ||
    HTML::Encoding::encoding_from_content_type    ($decoded) ||
    HTML::Encoding::encoding_from_xml_document    ($decoded);

  $MainPart->charset_set( $char_set ) if ($char_set);
  ## ------

  my $html = $MainPart->body_str; # <-- decodes to text using the character_set

  my $base_path = $self->parse_html_base_href(\$html) || $self->get_mime_part_base_path($MainPart);
  
  my %ndx = ();
  $MIME->walk_parts(sub{ 
    my $Part = shift;
    return if ($Part == $MIME || $Part == $MainPart); #<-- ignore the outer and main/body parts
    
    my $content_id = $Part->header('Content-ID');
    if ($content_id) {
      $ndx{'cid:' . $content_id} = $Part;
      $content_id =~ s/^\<//;
      $content_id =~ s/\>$//;
      $ndx{'cid:' . $content_id} = $Part;
    }
    
    my $content_location = $Part->header('Content-Location');
    if($content_location) {
      $ndx{$content_location} = $Part;
      if($base_path) {
        $content_location =~ s/^\Q$base_path\E//;
        $ndx{$content_location} = $Part;
      }
    }
  });
  
  $self->convert_mhtml_links_parts($c,\$html,\%ndx);
  return $html;
}

# Try to extract the 'body' from html to prevent causing DOM/parsing issues on the client side
sub parse_html_get_style_body {
  my $self = shift;
  my $htmlref = shift;
  
  my $body = $self->parse_html_get_body($htmlref) or return $$htmlref;
  my $style = $self->parse_html_get_styles($htmlref);
  
  my $auto_css_pre = 'cas-selector-wrap-';



( run in 1.455 second using v1.01-cache-2.11-cpan-99c4e6809bf )