Data-Edit-Xml-Xref

 view release on metacpan or  search on metacpan

lib/Data/Edit/Xml/Xref.pm  view on Meta::CPAN

#!/usr/bin/perl -I/home/phil/perl/cpan/DataEditXml/lib/ -I/home/phil/perl/cpan/DataTableText/lib/ -I/home/phil/perl/cpan/DitaGBStandard/lib/
#-------------------------------------------------------------------------------
# Cross reference Dita XML, match topics and ameliorate missing references.
# Philip R Brenan at gmail dot com, Appa Apps Ltd Inc, 2016-2019
# Improvements and maxzoomin
#-------------------------------------------------------------------------------
# Check for image formats that will not display in a browser
# Do not consider companion files!
# Images that are referenced by topics which are not referenced by bookmaps show up as referenced
# It should be possible to remove reportImages by using generic references instead
# Conref processing in reportReferencesFromBookmaps
# Fix xref external/scope and eliminate error count if fixbadrefs in operation.
# Add labels to ditaRefs processing so that references to labels are also fixed
# Add xref expansion from id in file as it is a pain to code up the full details by hand
# Find topics that have no text in them per: PS2-617
# Need test for changeBadXrefToPh
# Unique target needs tests
# Create list of images found in input folder
# Conrefs report should use targets/ to update the conref file so conrefs fixed by fixDitaRefs are considered

package Data::Edit::Xml::Xref;
our $VERSION = 20200424;
use v5.26;
use warnings FATAL => qw(all);
use strict;
use Carp qw(confess cluck);
use Data::Dump qw(dump);
use Data::Edit::Xml;
use Data::Table::Text qw(:all);
use Dita::GB::Standard;
use Storable qw(store retrieve);
use Time::HiRes qw(time);
use utf8;

#sub improvementLength      {80}                                                 #P Maximum length of the test of an improvement suggestion
sub classificationMapSuffix {return '_classification.ditamap'}                  #P Constant: the suffix appended to a map file name to form the name of its corresponding classification map file

#D1 Cross reference                                                             # Check the cross references in a set of Dita files and report the results.

sub newXref(%)                                                                  #P Create a new cross referencer
 {my (%attributes) = @_;                                                        # Attributes

  my $xref = genHash(__PACKAGE__,                                               # Attributes used by the Xref cross referencer.
    addNavTitles                        => undef,                               #I If true, add navtitle to outgoing bookmap references to show the title of the target topic.
    allowUniquePartialMatches           => undef,                               # Allow unique partial matches - i.e ignore the stuff to the right of the # in a reference if doing so produces a unique result. This feature has been explicitly disabled...
    attributeCount                      => {},                                  # {file}{attribute name} == count of the different xml attributes found in the xml files.
    attributeNamesAndValuesCount        => {},                                  # {file}{attribute name}{value} = count
    author                              => {},                                  # {file} = author of this file.
    badGuidHrefs                        => {},                                  # Bad conrefs - all.
    badNavTitles                        => {},                                  # Details of nav titles that were not resolved
    badReferencesCount                  => 0,                                   # The number of bad references at the start of the run - however depending on what options were chosen Xref might ameliorate these bad references and thereby reduce this ...
    badTables                           => [],                                  # Array of tables that need fixing.
    badXml1                             => {},                                  # [Files] with a bad xml encoding header on the first line.
    badXml2                             => {},                                  # [Files] with a bad xml doc type on the second line.
    baseFiles                           => {},                                  # {base of file name}{full file name}++ Current location of the file via uniqueness guaranteed by the GB standard
    baseTag                             => {},                                  # Base Tag for each file
    bookMapRefs                         => {},                                  # {bookmap full file name}{href}{navTitle}++ References from bookmaps to topics via appendix, chapter, bookmapref.
    changeBadXrefToPh                   => undef,                               #I Change xrefs being placed in B<M3> by L<fixBadRefs> to B<ph>.
    classificationMaps                  => undef,                               #I Create classification maps if true
    conRefs                             => {},                                  # {file}{href}{tag}++ : conref source detail
    createReports1                      => [],                                  # Reports requested before references fixed
    createReports2                      => [],                                  # Reports requested after references fixed
    currentFolder                       => currentDirectory,                    # The current working folder used to make absolute file names from relative ones
    deleteUnusedIds                     => 0,                                   #I Delete ids (except on topics) that are not referenced in any reference in the corpus regardless of the file component of any such reference.
    deguidize                           => undef,                               #I Set true to replace guids in dita references with file name. Given reference B<g1#g2/id> convert B<g1> to a file name by locating the topic with topicId B<g2>.  This r...
    docType                             => {},                                  # {file} == docType:  the docType for each xml file.
    duplicateIds                        => {},                                  # [file, id]     Duplicate id definitions within each file.
    duplicateTopicIds                   => {},                                  # Duplicate topic ids
    duplicateTopicIds                   => {},                                  # [topicId, [files]] Files with duplicate topic ids - the id on the outermost tag.
    emptyTopics                         => {},                                  # {file} : topics where the *body is empty.
    errors                              => 0,                                   # Number of significant errors as reported in L<statusLine> or 0 if no such errors found
    exteriorMaps                        => {},                                  # {exterior map} : maps that are not referenced by another map
    fileExtensions                      => [qw(.dita .ditamap .xml .fodt)],     # Default file extensions to load
    fixBadRefs                          => undef,                               #I Fix any remaining bad references after any all allowed attempts have been made to fix failing references by moving the failing reference to the B<xtrf> attribute i.e. ...
    fixDitaRefs                         => undef,                               #I Fix references in a corpus of L<Dita> documents that have been converted to the L<GBStandard> and whose target structure has been written to the named folder.
    fixedFolder                         => undef,                               #I Fixed files are placed in this folder.
    fixedFolderTemp                     => undef,                               #I Fixed files are placed in this folder if we are on aws but not the session leader - this folder is then copied back to L<fixedFolder> on the session leader.
    fixedRefsBad                        => [],                                  # [] hrefs and conrefs from L<fixRefs|/fixRefs> which were moved to the "xtrf" attribute as requested by the L<fixBadHrefs|/fixBadHrefs> attribute because the reference w...
    fixedRefsGB                         => [],                                  # [] files fixed to the Gearhart-Brenan file naming standard
    fixedRefsGood                       => [],                                  # [] hrefs and conrefs from L<fixRefs|/fixRefs> which were invalid but have been fixed by L<deguidizing|/deguidize> them to a valid file name.
    fixedRefsNoAction                   => [],                                  # [] hrefs and conrefs from L<fixRefs|/fixRefs> for which no action was taken.
    fixRefs                             => {},                                  # {file}{ref} where the href or conref target is not valid.
    fixRelocatedRefs                    => undef,                               #I Fix references to topics that have been moved around in the out folder structure assuming that all file names are unique which they will be if they have been renamed t...
    fixXrefsByTitle                     => undef,                               #I Try to fix invalid xrefs by the Gearhart Title Method enhanced by the Monroe map method if true
    flattenFiles                        => {},                                  # {old full file name} = file renamed to Gearhart-Brenan file naming standard
    flattenFolder                       => undef,                               #I Files are renamed to the Gearhart standard and placed in this folder if set.  References to the unflattened files are updated to references to the flattened files.  Th...
    getFileUrl => qq(/cgi-bin/uiSelfServiceXref/client.pl?getFile=),            #I A url to retrieve a specified file from the server running xref used in generating html reports. The complete url is obtained by appending the fully qualified file nam...
    goodImageFiles                      => {},                                  # {file}++ : number of references to each good image
    goodNavTitles                       => {},                                  # Details of nav titles that were resolved.
    guidHrefs                           => {},                                  # {file}{href} = location where href starts with GUID- and is thus probably a guid.
    guidToFile                          => {},                                  # {topic id which is a guid} = file defining topic id.
    hrefUrlEncoding                     => {},                                  # Hrefs that need url encoding because they contain white space.
    html                                => undef,                               #I Generate html version of reports in this folder if supplied
    idNotReferenced                     => {},                                  # {file}{id}++ - id in a file that is not referenced
    idReferencedCount                   => {},                                  # {file}{id}++ - the number of times this id in this file is referenced from the rest of the corpus
    ids                                 => {},                                  # {file}{id}   - id definitions across all files.
    idsRemoved                          => {},                                  # {id}++ : Ids removed from all files
    idTags                              => {},                                  # {file}{id}[tag] The tags associated with each id in a file - there might be more than one if the id is duplicated
    images                              => {},                                  # {file}{href}   Count of image references in each file.
    imagesReferencedFromBookMaps        => {},                                  # {bookmap full file name}{full name of image referenced from topic referenced from bookmap}++
    imagesReferencedFromTopics          => {},                                  # {topic full file name}{full name of image referenced from topic}++
    imagesToRefferingBookMaps           => {},                                  # {image full file name}{bookmap full file name}++ : images to referring bookmaps
    indexWords                          => undef,                               #I Index words to topics and topics to words if true.
    indexWordsFolder                    => undef,                               #I Folder into which to save words to topic and topics to word indexes if L<indexWords> is true.
    indexedWords                        => {},                                  # {word}{full file name of topic the words occurs in}.
    inputFiles                          => [],                                  # Input files from L<inputFolder|/inputFolder>.
    inputFileToTargetTopics             => {},                                  # {input file}{target file}++ : Tells us the topics an input file was split into
    inputFolderImages                   => {},                                  # {full image file name} for all files in input folder thus including any images present
    inputFolder                         => undef,                               #I A folder containing the dita and ditamap files to be cross referenced.
    ltgt                                => {},                                  # {text between &lt; and &gt}{filename} = count giving the count of text items found between &lt; and &gt;
    matchTopics                         => undef,                               #I Match topics by title and by vocabulary to the specified confidence level between 0 and 1.  This operation might take some time to complete on a large corpus.
    maximumNumberOfProcesses            => numberOfCpus(8),                     #I Maximum number of processes to run in parallel at any one time with a sensible default.
    maxZoomIn                           => undef,                               #I Optional hash of names to regular expressions to look for in each file
    maxZoomOut                          => {},                                  # Results from L<maxZoomIn|/maxZoomIn>  where {file name}{regular expression key name in L<maxZoomIn|/maxZoomIn>}++
    md5Sum                              => {},                                  # MD5 sum for each input file.
    md5SumDuplicates                    => {},                                  # {md5sum}{file}++ : md5 sums with more than one file
    missingImageFiles                   => {},                                  # [file, href] == Missing images in each file.
    missingTopicIds                     => {},                                  # Missing topic ids.
    noHref                              => {},                                  # Tags that should have an href but do not have one.
    notReferenced                       => {},                                  # {file name} Files in input area that are not referenced by a conref, image, bookmapref or xref tag and are not a bookmap.
    olBody                              => {},                                  # The number of ol under body by file
    originalSourceFileAndIdToNewFile    => {},                                  # {original file}{id} = new file: Record mapping from original source file and id to the new file containing the id
    otherMeta                           => {},                                  # {original file}{othermeta name}{othermeta content}++ : the contents of the other meta tags
    otherMetaDuplicatesSeparately       => [],                                  # Duplicate othermeta in bookmaps and topics considered separately
    otherMetaDuplicatesCombined         => [],                                  # Duplicate othermeta in bookmaps with called topics othermeta included
    otherMetaRemainWithTopic            => [],                                  # Othermeta that must stay in the topic
    otherMetaPushToBookMap              => [],                                  # Othermeta that can be pushed to the calling book map
    otherMetaBookMapsBeforeTopicIncludes=> [],                                  # Bookmap othermeta before topic othermeta has been included
    otherMetaBookMapsAfterTopicIncludes => [],                                  # Bookmap othermeta after  topic othermeta has been included
    otherMetaConsolidated               => {},                                  # {Name}{Content}++ : consolidated other meta data across entire corpus
    oxygenProjects                      => undef,                               #I Create oxygen project files for each map - the project file will have an extension of .xpr and the same name and path as the map file or the name return by your implem...
    parseFailed                         => {},                                  # {file} files that failed to parse.
    publicId                            => {},                                  # {file} = Public id on Doctype
    references                          => {},                                  # {file}{reference}++ - the various references encountered
    relocatedReferencesFailed           => [],                                  # Failing references that were not fixed by relocation
    relocatedReferencesFixed            => [],                                  # Relocated references fixed
    requestAttributeNameAndValueCounts  => undef,                               #I Report attribute name and value counts
    requiredCleanUp                     => undef,                               # {full file name}{cleanup} = number of required-cleanups
    reports                             => undef,                               #I Reports folder: Xref will write text versions of the generated reports to files in this folder.
    results                             => [],                                  # Summary of results table.
#   sourceFile                          => undef,                               # The source file from whic#h this structure was generated.
    sourceTopicToTargetBookMap          => {},                                  # {input topic cut into multiple pieces} = output bookmap representing pieces
    statusLine                          => undef,                               # Status line summarizing the cross reference.
    statusTable                         => undef,                               # Status table summarizing the cross reference.
    subjectSchemeMap                    => undef,                               #I Create a subject scheme map in the named file
    suppressReferenceChecks             => undef,                               #I Suppress reference checking - which normally happens by default - but which takes time and might be irrelevant if an earlier xref has already checked all the reference...

 view all matches for this distribution
 view release on metacpan -  search on metacpan

( run in 2.294 seconds using v1.00-cache-2.02-grep-82fe00e-cpan-f5108d614456 )