Text-Distill

 view release on metacpan or  search on metacpan

lib/Text/Distill.pm  view on Meta::CPAN

      my $ContentPath = $ContainerNode->getAttributeNode('full-path')->string_value;
      if (my $ContentMember = $arch->memberNamed( $ContentPath )) {
        my $XMLContent = $ContentMember->contents();

        $xpc->unregisterNs('opf');
        $xpc->registerNs('opf', 'http://www.idpf.org/2007/opf');

        my $Content;
        eval { $Content = $xml->parse_string($XMLContent); };
        if ($@) {
          $! = 11;
          Carp::confess("[libxml2 error ". $@->code() ."] ". $@->message());
        }
        my @ContentNodes = $xpc->findnodes('//opf:package/opf:manifest/opf:item[
            @media-type="application/xhtml+xml"
          and
            starts-with(@id, "content")
          ]',
          $Content
        );
        my $HTMLTree = HTML::TreeBuilder->new();
        foreach my $ContentNode (@ContentNodes) {
          my $HTMLContentPath = $ContentNode->getAttributeNode('href')->string_value;

          if (my $HTMLContentMember = $arch->memberNamed( $HTMLContentPath )) {
            my $HTMLContent = $HTMLContentMember->contents();

            $HTMLTree->parse_content($HTMLContent);
          } else {
            Carp::confess("[Archive::Zip error] $HTMLContentPath not found in ePub ZIP container");
          }
        }
        $Result = DecodeUtf8($HTMLTree->as_text);
      } else {
        Carp::confess("[Archive::Zip error] $ContentPath not found in ePub ZIP container");
      }
    } else {
      Carp::confess("[Archive::Zip error] $requiredMember not found in ePub ZIP container");
    }
  } else {
    Carp::confess("[Archive::Zip error] $!");
  }

  return $Result;
}

sub OPCPartAbsoluteNameFromRelative {
  my $Name = shift;
  my $Dir = shift;
  $Dir =~ s:/$::; # remove trailing slash

  my $FullName = ( $Name =~ m:^/: ) ? $Name :       # $Name has absolute path
                                      "$Dir/$Name"; # $Name has relative path
  $FullName = do{
    use bytes; # A-Za-z are case insensitive
    lc $FullName;
  };

  # parse all . and .. in name
  my @CleanedSegments;
  my @OriginalSegments = split m:/:, $FullName;
  for my $Part ( @OriginalSegments ) {
    if( $Part eq '.' ) {
      # just skip
    } elsif( $Part eq '..' ) {
      pop @CleanedSegments;
    } else {
      push @CleanedSegments, $Part;
    }
  }

  return join '/', @CleanedSegments;
}


sub ExtractSingleZipFile {
  my $FN = shift;
  my $Ext = shift;
  my $Zip = Archive::Zip->new();

  return unless ( $Zip->read( $FN ) == Archive::Zip::AZ_OK );

  my @Files = $Zip->members();
  return unless (scalar @Files == 1 && $Files[0]->{fileName} =~ /(\.$Ext)$/);

  my $TmpDir = File::Temp::tempdir(cleanup=>1);

  my $OutFile = $TmpDir.'/check_' . $$ . '_' . $Files[0]->{fileName};

  return $Zip->extractMember( $Files[0], $OutFile ) == Archive::Zip::AZ_OK ? $OutFile : undef;
}

=head2 DetectBookFormat($FilePath, $Format)

Function detects format of an e-book and returns it. You
may suggest the format to start with, this wiil speed up the process a bit
(not required).

$Format can be 'fb2.zip', 'fb2', 'doc.zip', 'doc', 'docx.zip',
'docx', 'epub.zip', 'epub', 'txt.zip', 'txt', 'fb3', 'fb3'

=cut

sub DetectBookFormat {
  my $File = shift;
  my $Format = shift;
  if (defined $Format && $Format =~/^($rxFormats)$/) {
    $Format = $1;
  } else {
    $Format = '';
  }

  #$Format первым пойдет
  my @Formats = ($Format || (),  grep{ $_ ne $Format } @DetectionOrder);

  foreach( @Formats ) {
    return $_ if $Detectors->{$_}->($File);
  }
  return;
}



( run in 0.882 second using v1.01-cache-2.11-cpan-71847e10f99 )