XML-Twig
view release on metacpan or search on metacpan
tools/xml_spellcheck/xml_spellcheck view on Meta::CPAN
# option processing

# fall back on the defaults for the options that were not set
$spellchecker ||= $DEFAULT_SC;
$ext          ||= $DEFAULT_EXT;

# the 2 element filters are mutually exclusive
if( $exclude_elements && $include_elements)
  { die "cannot use both --exclude-elements and --include-elements\n"; }

# a pretty_print value that is defined but false is replaced by the default
if( defined $pretty_print and !$pretty_print)
  { $pretty_print= $DEFAULT_PP; }

my %twig_options;
my( %include_elements); # NOTE(review): not used in the visible code, kept in case it is used elsewhere

# the element lists are whitespace-separated; split ' ' (unlike split /\s+/)
# skips leading whitespace, so a list like " para title" does not yield an
# empty element name, which would be registered as a handler for the tag ''
if( $exclude_elements)
  { my @exclude_elts = split ' ', $exclude_elements;
    # every excluded element gets marked as soon as its start tag is parsed
    my %start_tag_handlers= map { $_ => \&exclude_elt } @exclude_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

if( $include_elements)
  { my @include_elts = split ' ', $include_elements;
    # every included element gets marked as soon as its start tag is parsed
    my %start_tag_handlers= map { $_ => \&include_elt } @include_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

# only pass pretty_print to the twig when a style was requested
$twig_options{pretty_print}= $pretty_print if( $pretty_print);
# main loop: spell check, in place, each file given on the command line
#
# for each file:
#   - the file is parsed and each selected text node is wrapped and written,
#     one per line and prefixed by an id, to a temp file (see process_text)
#   - the spell checker command is run on the temp file
#   - the (possibly corrected) lines are read back and used to update the
#     text nodes, looked up by id
#   - the original file is renamed to "$file$ext" and the updated document
#     is written to $file
# dies on any failure; in that case the temp file is left in place
foreach my $file (@ARGV)
  {
    my $id=0;
    my $id2elt={}; # id => wrapper element created by process_text
    my( $tmp_fh, $tmp_file) = tempfile( "xml_spellcheck_XXXX",
                                        SUFFIX => '.txt'
                                      );

    my $t= XML::Twig->new( keep_encoding =>1, %twig_options,);
    $t->parsefile( $file);

    # select the text nodes to check: all of them when no filter is used,
    # otherwise only those that inherit (or do not inherit) the marker
    # attribute set by the start_tag handlers
    foreach my $elt ($t->descendants( '#TEXT'))
      {
        if(    (!$include_elements and !$exclude_elements)
            or ($include_elements and $elt->inherit_att( '#include'))
            or ($exclude_elements and !$elt->inherit_att( '#exclude'))
          )
          { $id++;
            process_text( $t, $elt, $id, $id2elt, $tmp_fh)
          }
      }

    # buffered write errors only show up when the handle is closed
    close $tmp_fh or die "cannot write temp file $tmp_file: $!";

    # the command goes through the shell on purpose: $spellchecker may
    # include options
    system( "$spellchecker $tmp_file") ==0
      or die "$spellchecker $tmp_file failed: $?";

    open( my $checked_fh, '<', $tmp_file)
      or die "cannot open temp file $tmp_file: $!";
    while( my $line= <$checked_fh>)
      { chomp $line;
        my( $line_id, $text)= split /:/, $line, 2;

        # the spell checker might have damaged the "id:" prefix of a line
        my $wrap= defined $text ? $id2elt->{$line_id} : undef;
        $wrap or die "unexpected line $. in spell checked file $tmp_file: '$line'\n";

        # restore the newlines encoded by process_text
        $text=~ s{<\\n>}{\n}g;

        my $text_elt= $wrap->first_child or die "internal error 100\n";
        if( $text_elt->gi eq '#PCDATA')
          { $text_elt->set_pcdata( $text); }
        elsif( $text_elt->gi eq '#CDATA')
          { $text_elt->set_cdata( $text); }
        else
          { die "internal error 101\n"; }

        # the wrapper is no longer needed once the text is updated
        $wrap->erase;
      }
    close $checked_fh;

    # tempfile() does not remove the file by itself
    unlink $tmp_file or warn "cannot remove temp file $tmp_file: $!\n";

    rename( $file, "$file$ext") or die "cannot save backup file $file$ext: $!";

    # 3-arg open: the file name is never interpreted as a mode
    open( my $out_fh, '>', $file)
      or die "cannot save spell checked file $file: $!";
    $t->print( $out_fh);
    close $out_fh or die "cannot save spell checked file $file: $!";
  }
# start_tag handler registered for the elements listed in $include_elements:
# sets the '#include' attribute to 1 on the element found in $_
# returns whatever set_att returns
sub include_elt
  { my $elt= $_;
    return $elt->set_att( '#include', 1);
  }
# start_tag handler registered for the elements listed in $exclude_elements:
# sets the '#exclude' attribute to 1 on the element found in $_
# returns whatever set_att returns
sub exclude_elt
  { my $elt= $_;
    return $elt->set_att( '#exclude', 1);
  }
# process_text( $t, $elt, $id, $id2elt, $tmp_fh)
#   $t       the twig (not used here)
#   $elt     the text element to spell check
#   $id      the id under which the text is written
#   $id2elt  hash ref, id => wrapper element, updated here
#   $tmp_fh  filehandle the text is written to
#
# wraps $elt in a '#SC' element, stores that wrapper in $id2elt under $id,
# and prints the text of $elt to $tmp_fh as a single "id:text" line
# returns the result of the print
sub process_text
  { my( $t, $elt, $id, $id2elt, $tmp_fh)= @_;

    # the wrapper is what the caller later looks up by id
    $id2elt->{$id}= $elt->wrap_in( '#SC');

    # 1 line per text: each newline is encoded as the 4 characters <\n>
    ( my $line= $elt->text)=~ s{\n}{<\\n>}g;

    print {$tmp_fh} $id . ':' . $line . "\n";
  }
__END__
=head1 NAME
xml_spellcheck - spellcheck XML files
=head1 SYNOPSIS
xml_spellcheck [options] <files>
=head1 DESCRIPTION
xml_spellcheck lets you spell check the content of an XML file.
It extracts the text (the content of elements and optionally of
attributes), calls a spell checker on it and then recreates the
XML document.
=head1 OPTIONS
Note that all options can be abbreviated to their first letter.
=over 4
=item --conf <configuration_file>
Gets the options from a configuration file. NOT IMPLEMENTED YET.
=item --spellchecker <spellchecker>
The command to use for spell checking, including any option
( run in 0.885 second using v1.01-cache-2.11-cpan-39bf76dae61 )