Data-Edit-Xml-To-Dita
view release on metacpan or search on metacpan
lib/Data/Edit/Xml/To/DitaVb.pm view on Meta::CPAN
#!/usr/bin/perl -I/home/phil/perl/cpan/DataTableText/lib/ -I/home/phil/perl/cpan/GitHubCrud/lib/ -I/home/phil/perl/cpan/DataEditXmlXref/lib/ -I/home/phil/perl/cpan/DitaGBStandard/lib/ -I/home/phil/perl/cpan/FlipFlop/lib/
#-------------------------------------------------------------------------------
# Data::Edit::Xml::To::Dita - Convert multiple Xml documents in parallel to Dita
# Philip R Brenan at gmail dot com, Appa Apps Ltd Inc., 2019
#-------------------------------------------------------------------------------
# podDocumentation
package Data::Edit::Xml::To::DitaVb;
our $VERSION = 20190708;
use warnings FATAL => qw(all);
use strict;
use Carp qw(confess cluck);
use Data::Dump qw(dump);
use Data::Edit::Xml;
use Data::Edit::Xml::Lint;
use Data::Edit::Xml::Xref;
use Data::Table::Text qw(:all);
use Dita::GB::Standard qw(:all);
use Flip::Flop;
use GitHub::Crud;
use Scalar::Util qw(blessed);
use Time::HiRes qw(time);
use utf8;
#D1 Convert Xml to the Dita standard. # Convert Xml to the Dita standard.
# Conversion options: each sub below returns the default setting of one attribute. A sub of the same name coded in the calling package replaces the default supplied here.
sub changeBadXrefToPh{0} #I Change xrefs being placed in M3 by L<Data::Edit::Xml::Xref> to B<ph>.
sub clearCount {&develop ? 1e4 : 1e6} # Limit on number of files to clear from each output folder.
sub client {q()} # The name of the client
sub conversion {&conversionName} # Conversion name
sub convert {1} # Convert documents to dita if true.
sub debug {0} # Debug if true.
sub deguidize {0} # 0 - normal processing, 1 - replace guids in hrefs with their target files to deguidize dita references. Given href g1#g2/id convert g1 to a file name by locating the topic with topicId g2.
sub ditaXrefs {0} # Convert xref hrefs expressed as just ids to dita format - useful in non Dita to Dita conversions for example: docBook
sub docSet {1} # Select set of documents to convert.
sub download {&develop ? 0 : 1} # Download from S3 if true.
# NOTE(review): both branches of the exchange default below yield 2 so the development test currently has no effect - confirm this is intended.
sub exchange {&develop ? 2 : 2} # 1 - upload to S3 Exchange if at 100% lint, 2 - upload to S3 Exchange regardless, 0 - no upload to S3 Exchange.
sub exchangeItems {q()} # The items to be uploaded to the exchange folder: d - downloads, i - in, p - perl, o - out, t - topic trees. Reports are uploaded by default
# NOTE(review): the description of extendedNames below talks about a topic count which does not obviously match the attribute name or its default of 0 - confirm what this attribute controls.
sub extendedNames {0} # Expected number of output topics or B<undef> if unknown
sub fixBadRefs {0} # Mask bad references using M3: the Monroe Masking Method if true
sub fixDitaRefs {0} # Fix references in a corpus of L<Dita> documents that have been converted to the L<GBStandard>.
sub fixFailingFiles {0} # Fix failing files in the L<testFails|/testFails> folder if this attribute is true
sub fixRelocatedRefs {1} # Fix references to (re|un)located files that adhere to the GB standard.
sub fixXrefsByTitle {0} # Fix failing xrefs by looking for the unique topic with a title that matches the text of the xref.
sub hits {Flip::Flop::hits(0)} # 1 - track hits so we can see which transformations are actually being used - normally off to avoid the overhead
sub lint {1} # Lint output xml
sub mimajen {0} # 1 - copy files to web, 0 - suppress
sub notify {!&develop and &upload ? &upload : 0} # 1 - Broadcast results of conversion if at 100% lint, 2 - broadcast regardless of error count. Defaults to the upload setting in production and to false in development.
sub numberOfFiles {undef} # Expected number of output files
sub printTopicTrees {1} # 1 - print the parse tree before cutting out the topics
sub publish {0} # 1 - convert Dita to Html and publish via DITA-OT if at 100% lint, 2 - publish regardless
sub restructure {0} # 1 - Restructure results of conversion if at 100% lint, 2 - restructure regardless of error count.
sub restructurePhases{1} # Number of restructuring phases to run
sub testMode {&develop ? 1 : 0} # 1 - run development tests, 2 - run standalone tests, 0 - run production documents
sub titleOnly {0} # Use only the title of topics to create GB Standard file names otherwise use the following text as well if the title is too short
sub unicode {download} # Convert to utf8 if true - defaults to the download setting.
sub upload {&develop ? 0 : 1} # Upload to S3 Bucket if true and the conversion is at 100%, 2 - upload to S3 Bucket regardless, 0 - no upload to S3 Bucket.
sub version {q()} # Description of this run as printed in notification message and title
sub xref {1} # Xref output xml.
sub xrefAddNavTitles {1} # Add navtitles to bookmap entries if true
sub xrefAllowUniquePartialMatches{1} # Allow partial matching - i.e ignore the stuff to the right of the # in a reference if doing so produces a unique result
sub xrefMatchTopics {0} # Either 0 for no topic matching or the percentage confidence level for topic matching
#ub relinkDitaRefs {0} # Relink dita references that are valid in the input corpus so that they are valid in the output corpus as well.
#ub singleTopicBM {fixDitaRefs} # 1 - allow single topic book maps when cutting out topics which is required if using L<fixDitaRefs>, 0 - multiple topics required for a bookmap
# Files and folders used by the conversion: most are built from the home folder so replacing home relocates them together.
sub catalog {q(/home/phil/r/dita/dita-ot-3.1/catalog-dita.xml)} # Dita catalog to be used for linting.
sub develop {-e q(/home/ubuntu/) ? 0 : 1} # Production run if folder /home/ubuntu/ exists otherwise development.
sub ditaBin {fpf(qw(/home phil r dita dita-ot-3.1 bin dita))} # Location of Dita tool
sub downloads {fpd(&home, qw(download))} # Downloads folder.
sub errorLogFile {fpe(&perl, qw(eee txt))} # Error log file.
sub exchangeHome {fpd(qw(/home phil x aws))} # Home of exchange folder
sub fails {fpd(&reports, qw(fails))} # Copies of failing documents in a separate folder to speed up downloading.
sub gathered {fpd(&home, qw(gathered))} # Folder containing saved parse trees after initial parse and information gathering - pretty well obsolete
sub hitsFolder {fpd(&home, qw(hits))} # Folder containing at method hits by process id
sub home {&getHome} # Home folder containing all the other folders.
sub imageCache {fpd(home, qw(imageCache))} # Converted images are cached here to speed things up
sub in {fpd(&home, qw(in))} # Input documents folder.
sub inputExt {qw(.xml .dita .ditamap)} # Extensions of input files.
sub out {fpd(&home, qw(out))} # Converted documents output folder.
sub outExtTopic {q(dita)} # Preferred output extension for a topic
sub outExtMap {q(ditamap)} # Preferred output extension for a map
sub parseCache {fpd(&home, qw(parseCache))} # Cached parse trees.
sub parseFailed {fpd(&home, qw(parseFailed))} # Folder for details of xml parse failures
sub perl {fpd(&home, qw(perl))} # Perl folder.
sub process {fpd(&home, qw(process))} # Process data folder used to communicate results between processes.
sub publications {fpd(&www, qw(publications), client)} # Publications folder on web server for client
sub reports {fpd(&home, qw(reports))} # Reports folder.
sub s3Bucket {q(s3Bucket)} # Bucket on S3 holding documents to convert and the converted results.
sub s3FolderIn {q(originals).docSet} # Folder on S3 containing original documents.
sub s3FolderUp {q(results).docSet} # Folder on S3 containing results of conversion.
sub s3Exchange {fpd(qw(exchange.ryffine users aws), client)} # Exchange folder on S3
sub s3Profile {undef} # Aws cli profile keyword value if any.
sub s3Parms {q(--quiet --delete)} # Additional S3 parameters for uploads and downloads.
sub summaryFile {fpe(reports, qw(summary txt))} # Summary report file.
sub targets {fpd(&home, qw(targets))} # Duplicates the in file structure - each file there-in shows us where the original file went
sub tests {fpd(&home, qw(tests/in))} # Folder containing test input files received from test developer at L<testExchangeIn|/testExchangeIn>
sub testExpected {fpd(&home, qw(tests/expected))} # Folder containing test results expected.
sub testExchangeIn {undef} # Exchange folder in which to receive tests so that test writers can disarrange their exchange folders as they please without disrupting testing at this end.
sub testExchangeOut {undef} # Exchange folder to publish tests results in so test writers can see the results at L<testResults|/testResults>
sub testResults {fpd(&home, qw(tests/results))} # Folder containing actual test results locally, copied to: L<testExchangeOut|/testExchangeOut>
sub testStandAlone {fpd(&home, qw(tests/standalone/active))} # Folder containing standalone tests which is used instead of regression tests if content is present
sub testFails {fpd(&home, qw(fails))} # Folder containing failing files to be fixed by reprocessing them but only if fixFailingFiles is true
sub testFails2 {fpd(&home, qw(fails2))} # Folder containing files still unfixed by the current set of fixes
sub topicTrees {fpd(&home, qw(topicTrees))} # Folder to contain printed topic trees if requested by printTopicTrees
sub user {q(phil)} # Aws userid
sub www {fpd(qw(/var www html))} # Web server folder
# Timing of the run: the start time is captured when this file is loaded, the other two values are only declared here.
my $startTime = time; # Start time.
my $endTime; # End time value.
my $runTime; # Run time value.
sub startTime {$startTime} # Start time of run in seconds since the epoch.
sub endTime {$endTime} # End time of run in seconds since the epoch.
sub runTime {$runTime} # Elapsed run time in seconds.
sub maximumNumberOfProcesses {develop ? 2 : 256} # Maximum number of conversion processes to run in parallel.
lib/Data/Edit/Xml/To/DitaVb.pm view on Meta::CPAN
sub convertOneFileToUTF8 #P Convert one file to utf8 and return undef if successful else the name of the document in error
 {my ($source) = @_; # File to convert
  ddd "Convert one file to utf8: $source"; # Title

  my $target = swapFilePrefix($source, downloads, in); # Target file from source file
  makePath($target);

  # NOTE(review): $source and $target are interpolated inside double quotes into the shell command strings used below - a file name containing ", $ or ` would break these commands - confirm that input file names are always safe.
  my $fileType = sub # File type decoded from the output of B<file>
   {my $c = qx(file "$source") // q(unknown file type); # Decode contents using B<file>

    return q(htmlAscii) if $c =~ m(HTML document, ASCII text)s; # Html
    return q(htmlUtf8) if $c =~ m(HTML document, UTF-8 Unicode text)s;
    return q(htmlIso8859) if $c =~ m(HTML document, ISO-8859 text)s;
    return q(htmlNonIso) if $c =~ m(HTML document, Non-ISO extended-ASCII)s;

    return q(xmlUtf8) if $c =~ m(XML 1.0 document text)s; # Xml

    return q(ASCII) if $c =~ m(ASCII text)s; # Something to be converted to Xml
    return q(ISO_8859-16) if $c =~ m(ISO-8859 text);
    return q(UTF8) if $c =~ m(UTF-8 Unicode .*text)s;
    return q(UTF16) if $c =~ m(UTF-16 Unicode text)s;

    my $t = readBinaryFile($source); # Search unknown file for clues as to its content
    # The declaration can occur anywhere in the file: anchoring this pattern with \A and \Z could only ever match a file consisting of nothing but the declaration.
    # NOTE(review): presumably the file is read as raw bytes, so a declaration that is itself stored in UTF-16 would have interleaved zero bytes and still not match - confirm whether that case matters.
    return q(UTF16) if $t =~ m(encoding="UTF-16");
    confess "\nUnknown file type $c\n\n";
   }->();

  if (isFileUtf8($source)) # Copy file directly if already in utf8
   {if ($source =~ m(\.html?\Z)s and $fileType =~ m(\Ahtml)s) # Dita xml gets reported as html by file so further restrict the definition of what might be html
     {xxx qq(hxnormalize -x < "$source" > "$target"); # Normalize html to xml
     }
    else
     {copyFile($source, $target); # Direct copy
     }
   }
  else # Convert file to utf8
   {if ($fileType =~ m(\Ahtml(Ascii|Utf8)))
     {xxx qq(hxnormalize -x < "$source" > "$target"); # Normalize html to xml
     }
    elsif ($fileType =~ m(\Ahtml(Iso8859|NonIso))) # Normalize ISO8859 html to xml
     {xxx qq(hxnormalize -x < "$source" | iconv -c -f ISO_8859-16 -t UTF8 -o "$target" -);
     }
    else # The file type is used as the source encoding name for iconv
     {xxx qq(iconv -c -f $fileType -t UTF8 -o "$target" "$source"); # Silently discard any unconvertible characters with -c !
     }
   }

  if (-e $target) # Preprocess source file
   {my $Text = readFile($target);
    my $text = $Text =~ s(encoding="[^"]+") (encoding="UTF-8")r; # Write encoding if necessary
    $text = spelling $text, $target; # Check/fix spelling
    owf($target, $text) unless $text eq $Text; # Only rewrite the target if preprocessing changed it
    return undef; # Success: the callers treat any true result as the name of a file that failed
   }

  $source; # No target file was produced so report the source file as failing
 }
sub convertToUTF8 #P Convert the encoding of documents in L<downloads|/downloads> to utf8 equivalents in folder L<in|/in>.
 {return ddd "Unicode conversion not requested" unless unicode; # Nothing to do unless conversion was requested

  clearFolder(in, clearCount); # Start with an empty input folder

  my @sources = searchDirectoryTreesForMatchingFiles(downloads, inputExt); # Files downloaded
  my $n = @sources; # Number of files to convert
  $n or confess "No documents to convert"; # Stop right here if there is nothing to convert

  lll "Unicode conversion $n ",
      "xml documents to convert from folder: ", downloads;

  my @results = runInSquareRootParallel(maximumNumberOfProcesses, # Convert in square parallel because we have a lot of small fast conversions
    sub{convertOneFileToUTF8(@_)},
    sub{@_},
    @sources);

  my @failed = grep {$_} @results; # Each true result names a file that failed to convert

  if (!@failed) # Every file converted
   {lll "Unicode conversion - converted all $n documents";
    Flip::Flop::unicode();
   }
  else # Report the files that failed
   {my $t = formatTableBasic([[qw(File)], map {[$_]} @failed]);
    eee "The following source files failed to convert:\n", $t;
   }
 }
sub convertToUTF822 #P Convert the encoding of documents in L<downloads|/downloads> to utf8 equivalents in folder L<in|/in>.
 {return ddd "Unicode conversion not requested" unless unicode; # Nothing to do unless conversion was requested

  clearFolder(in, clearCount); # Start with an empty input folder

  my @sources = searchDirectoryTreesForMatchingFiles(downloads, inputExt); # Files downloaded
  my $n = @sources; # Number of files to convert
  $n or confess "No documents to convert"; # Stop right here if there is nothing to convert

  lll "Unicode conversion $n ",
      "xml documents to convert from folder: ", downloads;

  my $starter = newProcessStarter(maximumNumberOfProcesses); # Process starter
# $starter->processingTitle = q(Convert documents to utf8);
# $starter->totalToBeStarted = $n;
# $starter->processingLogFile = fpe(reports, qw(log convertUtf8 txt));

  for my $file(@sources) # Convert each file via the process starter
   {$starter->start(sub
     {[convertOneFileToUTF8($file)] # Wrap the result in an array reference
     });
   }

  if (my @results = $starter->finish) # Consolidate results
   {my @failed = grep {$_} map {(@$_)[0]} @results; # The first element of each result is true only for a failing file

    if (@failed) # Confess to files that failed to convert
     {my $t = formatTableBasic([[qw(File)], map {[$_]} @failed]);
      eee "The following source files failed to convert:\n", $t;
     }
    else
     {lll "Unicode conversion - converted all $n documents";
      Flip::Flop::unicode();
     }
   }
 }
sub projectCount() #P Number of projects.
 {my $count = keys %$projects; # keys in scalar context yields the number of projects currently defined
  $count
 }
sub chooseIDGroup($) #r Return the id group for a project - files with the same id group share the same set of id attributes.
 {my ($project) = @_; # Project
  return q(all); # The default implementation ignores the project and places every project in one group
 }
# 2019.06.19 00:01:06
#sub chooseNameFromString($) #r Choose a name from a string
# {my ($string) = @_; # String
# nameFromStringRestrictedToTitle($string);
# }
sub newProject($) #P Project details including at a minimum the name of the project and its source file.
 {my ($source) = @_; # Source file
  -e $source or confess "Source file does not exist:\n$source\n";

  my $name = fileMd5Sum(qq($source\n)); # The new line forces fileMd5Sum to get the md5 sum of the name not the content - which might well be identical to other files

  my $existing = $projects->{$name}; # A project already recorded under this name means the source file is not unique
  confess "Duplicate source files:\n", $source, "\n".$existing->source if $existing;

  my $project = genHash(q(Project), # Project definition
    idGroup => undef, # Projects with the same id group share id attributes.
    name => $name, # Name of project
    number => projectCount + 1, # Number of project
    parseFailed => undef, # Parse of source file failed
    source => $source, # Input file
    sourceSize => fileSize($source), # Size of input file
    targets => undef, # Where the items cut out of this topic wind up
    test => undef, # Test projects write their results unlinted to testResults
   );

  $project->idGroup = chooseIDGroup($project); # Choose the id group for the project
  return $projects->{$name} = $project; # Save project definition and return it
 }
# 2019.06.19 00:01:06
#sub chooseProjectName($) #r Create a project name for each file to convert
# {my ($file) = @_; # Full file name
# chooseNameFromString($file);
# }
sub findProjectFromSource($) #P Locate a project by its source file
{my ($source) = @_; # Full file name
my @p;
my $file = swapFilePrefix($source, in);
for my $p(values %$projects)
{push @p, $p if swapFilePrefix($p->source, in) eq $file;
lib/Data/Edit/Xml/To/DitaVb.pm view on Meta::CPAN
endTime
errorLogFile
exchange
exchangeHome
exchangeItems
extendedNames
fails
fixBadRefs
fixDitaRefs
fixFailingFiles
fixXrefsByTitle
fixRelocatedRefs
gathered
hits
hitsFolder
home
imageCache
in
inputExt
lint
maximumFileFromStringLength
maximumNumberOfProcesses
mimajen
notify
numberOfFiles
out
outExtMap
outExtTopic
parseCache
parseFailed
perl
printTopicTrees
process
publications
publish
relinkDitaRefs
reports
restructure
restructurePhases
runTime
s3Bucket
s3Exchange
s3FolderIn
s3FolderUp
s3Parms
s3Profile
startTime
summaryFile
targets
testExchangeIn
testExchangeOut
testExpected
testFails
testFails2
testMode
testResults
testStandAlone
tests
titleOnly
topicTrees
unicode
upload
user
version
www
xref
xrefAllowUniquePartialMatches
xrefMatchTopics
)
}
if (0) # Format replaceable attributes - disabled: change the condition to true to list the attributes and stop
 {my @attributes = sort keys %{reportAttributes($0)}; # Sorted attribute names reported for the running program file
  lll "Replaceable attributes in $0\n", join("\n", @attributes, '');
  exit;
 }
my $overrideMethods; # Counts calls so that packages are merged only once
sub overrideMethods(;$) #P Merge packages
 {my ($package) = @_; # Name of package to be merged defaulting to that of the caller.
  return if $overrideMethods++; # Merge packages only once

  $package //= caller(); # In scalar context caller supplies the name of the calling package as the default

  Data::Table::Text::overrideMethods($package, __PACKAGE__,
    replaceableMethods, attributeMethods);
 }
sub saveCode #r Save code if developing
 {saveCodeToS3(1200, &perl, client, q(ryffine/code/perl/), # The arguments are only evaluated on a development run
    q(--only-show-errors --profile fmc --region eu-west-1))
    if develop;
 }
sub checkParameters #P Check parameters for obvious failures
 {my $h = home; # Home folder as currently configured
  $h =~ m(\A/.*/\z)s or confess "home must start and end with / but got: $h"; # \z rather than \Z so that a home ending in / followed by a new line is rejected too
  $h =~ m(//)s and confess "home contains // see: $h"; # An empty path component indicates a badly constructed folder name
 }
sub convertXmlToDita #P Perform all the conversion projects.
{my ($package) = caller;
lll conversion; # Title of run
unlink errorLogFile; # Clear log
for my $phase(q(saveCode), # Execute conversion phases
q(reportProgramAttributeSettings),
q(checkParameters),
q(setAtHits),
q(downloadFromS3),
q(convertToUTF8),
q(convertProjects),
# q(relinkDitaRefsInOutputCorpus), # Now done by Xref using fixDitaRefs=>targets/
q(lintResults),
q(restructureOutputFiles), # Deprecated in favor of: restructureResults
q(restructureResults),
q(editOutputFiles),
q(runTests),
q(copyLogFiles),
lib/Data/Edit/Xml/To/DitaVb.pm view on Meta::CPAN
B<Example:>
sub saveCode
{if (develop)
{saveCodeToS3(1200, &perl, client, q(ryffine/code/perl/),
q(--only-show-errors --profile fmc --region eu-west-1));
}
}
You can provide your own implementation of this method in your calling package
via:
sub saveCode {...}
if you wish to override the default processing supplied by this method.
=head2 Project Definition
Project definition
=head3 Output fields
B<idGroup> - Projects with the same id group share id attributes.
B<name> - Name of project
B<number> - Number of project
B<parseFailed> - Parse of source file failed
B<source> - Input file
B<sourceSize> - Size of input file
B<targets> - Where the items cut out of this topic wind up
B<test> - Test projects write their results unlinted to testResults
=head1 Attributes
The following is a list of all the attributes in this package. A method coded
with the same name in your package will override the method of the same name
in this package and thus provide your value for the attribute in place of the
default value supplied for this attribute by this package.
=head2 Replaceable Attribute List
catalog clearCount client conversion convert debug deguidize develop ditaBin ditaXrefs docSet download downloads endTime errorLogFile exchange exchangeHome exchangeItems extendedNames fails fixBadRefs fixDitaRefs fixFailingFiles fixXrefsByTitle gathe...
=head2 catalog
Dita catalog to be used for linting.
=head2 clearCount
Limit on number of files to clear from each output folder.
=head2 client
The name of the client
=head2 conversion
Conversion name
=head2 convert
Convert documents to dita if true.
=head2 debug
Debug if true.
=head2 deguidize
0 - normal processing, 1 - replace guids in hrefs with their target files to deguidize dita references. Given href g1#g2/id convert g1 to a file name by locating the topic with topicId g2.
=head2 develop
Production run if this file folder is detected otherwise development.
=head2 ditaBin
Location of Dita tool
=head2 ditaXrefs
Convert xref hrefs expressed as just ids to dita format - useful in non Dita to Dita conversions for example: docBook
=head2 docSet
Select set of documents to convert.
=head2 download
Download from S3 if true.
lib/Data/Edit/Xml/To/DitaVb.pm view on Meta::CPAN
=head2 targets
Duplicates the in file structure - each file there-in shows us where the original file went
=head2 testExchangeIn
Exchange folder in which to receive tests so that test writers can disarrange their exchange folders as they please without disrupting testing at this end.
=head2 testExchangeOut
Exchange folder to publish tests results in so test writers can see the results at L<testResults|/testResults>
=head2 testExpected
Folder containing test results expected.
=head2 testFails
Folder containing failing files to be fixed by reprocessing them but only if fixFailingFiles is true
=head2 testFails2
Folder containing files still unfixed by the current set of fixes
=head2 testMode
1 - run development tests, 2 - run standalone tests, 0 - run production documents
=head2 testResults
Folder containing actual test results locally, copied to: L<testExchangeOut|/testExchangeOut>
=head2 testStandAlone
Folder containing standalone tests which is used instead of regression tests if content is present
=head2 tests
Folder containing test input files received from test developer at L<testExchangeIn|/testExchangeIn>
=head2 titleOnly
Use only the title of topics to create GB Standard file names otherwise use the following text as well if the title is too short
=head2 topicTrees
Folder to contain printed topic trees if requested by printTopicTrees
=head2 unicode
Convert to utf8 if true.
=head2 upload
Upload to S3 Bucket if true and the conversion is at 100%, 2 - upload to S3 Bucket regardless, 0 - no upload to S3 Bucket.
=head2 user
Aws userid
=head2 version
Description of this run as printed in notification message and title
=head2 www
Web server folder
=head2 xref
Xref output xml.
=head2 xrefAllowUniquePartialMatches
Allow partial matching - i.e ignore the stuff to the right of the # in a reference if doing so produces a unique result
=head1 Optional Replace Methods
The following is a list of all the optionally replaceable methods in this
package. A method coded with the same name in your package will override the
method of the same name in this package providing your preferred processing for
the replaced method in place of the default processing supplied by this
package. If you do not supply such an overriding method, the existing method
in this package will be used instead.
=head2 Replaceable Method List
afterConvertProjects beforeConvertProjects beforeUploadToS3 chooseIDGroup cleanUpBookMap cleanUpCutOutTopic convertDocument formatXml lintResults restructureCleanUp restructureOneDocument restructureOutputFiles saveCode selectFileForProcessing spelli...
=head1 Private Methods
=head2 getHome()
Compute home directory once.
( run in 0.867 second using v1.01-cache-2.11-cpan-f56aa216473 )