XML-Twig

 view release on metacpan or  search on metacpan

tools/xml_spellcheck/xml_spellcheck  view on Meta::CPAN

# option processing
# options that were not given (or are false) fall back to the defaults
$spellchecker ||= $DEFAULT_SC;
$ext          ||= $DEFAULT_EXT;

# the two element filters are mutually exclusive
if( $exclude_elements && $include_elements)
  { die "cannot use both --exclude-elements and --include-elements\n"; }
# pretty printing was requested (defined) but with a false value:
# fall back to $DEFAULT_PP
if( defined $pretty_print and !$pretty_print)
  { $pretty_print= $DEFAULT_PP; }

# options passed to every XML::Twig->new call in the main loop
my %twig_options;

my( %include_elements);
if( $exclude_elements)
  { # split ' ' (unlike split /\s+/) skips leading whitespace, so a list
    # such as " para title" does not register a handler for an empty name
    my @exclude_elts = split ' ', $exclude_elements;
    # each listed element gets flagged by exclude_elt when its start tag is parsed
    my %start_tag_handlers= map { $_ => \&exclude_elt } @exclude_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }
if( $include_elements)
  { # same whitespace handling as for the excluded elements
    my @include_elts = split ' ', $include_elements;
    # each listed element gets flagged by include_elt when its start tag is parsed
    my %start_tag_handlers= map { $_ => \&include_elt } @include_elts;
    $twig_options{start_tag_handlers}= \%start_tag_handlers;
  }

$twig_options{pretty_print}= $pretty_print if( $pretty_print);

# Spell check each file given on the command line, in place:
#   1. parse the file and hand every selected text node to process_text,
#      which writes one "<id>:<text>" line per node to a temp file,
#   2. run the spell checker on the temp file,
#   3. read the lines back and store each text in the node registered
#      under its id,
#   4. rename the original to "$file$ext" and write the updated document
#      to $file.
# Dies on the first failure; the file being processed is only renamed and
# rewritten once the spell checked text has been read back successfully.
foreach my $file (@ARGV)
  { 
    my $id=0;
    my $id2elt={};           # id => wrapper element registered by process_text

    my( $tmp_fh, $tmp_file) = tempfile( "xml_spellcheck_XXXX", 
                                        SUFFIX => '.txt'
                                      );
    my $t= XML::Twig->new( keep_encoding =>1, %twig_options,);
    $t->parsefile( $file);

    foreach my $elt ($t->descendants( '#TEXT'))
      {
        # a text node is selected when no filter is in use, or according to
        # the '#include' / '#exclude' markers set by include_elt / exclude_elt
        # and looked up through inherit_att
        if(    (!$include_elements and !$exclude_elements)
            or ($include_elements and  $elt->inherit_att( '#include'))
            or ($exclude_elements and !$elt->inherit_att( '#exclude'))
          )
          { $id++;
            process_text( $t, $elt, $id, $id2elt, $tmp_fh)
          }
      }
    # buffered write errors only show up when the handle is closed
    close $tmp_fh or die "cannot write temp file $tmp_file: $!";

    # $spellchecker may include its own options, so the command is passed
    # as a single string
    system( "$spellchecker $tmp_file") ==0
      or die "$spellchecker $tmp_file failed: $?";

   
    open( my $checked_fh, '<', $tmp_file) or die "cannot open temp file $tmp_file: $!";
    while( my $line= <$checked_fh>)
      { chomp $line;
        my( $line_id, $text)= split /:/, $line, 2;
        # a line without the "<id>:" prefix, or with an id that was never
        # written, cannot be matched to a text node
        my $wrap= ( defined $line_id and defined $text) ? $id2elt->{$line_id} : undef;
        defined $wrap
          or die "unexpected line in spell checked file $tmp_file: $line\n";
        # undo the newline escaping done by process_text
        $text=~ s{<\\n>}{\n}g;
        my $text_elt= $wrap->first_child or die "internal error 100\n";
        if( $text_elt->gi eq '#PCDATA')
          { $text_elt->set_pcdata( $text); }
        elsif( $text_elt->gi eq '#CDATA')
          { $text_elt->set_cdata( $text); }
        else 
          { die "internal error 101\n"; }
        # the temporary wrapper is no longer needed once the text is updated
        $wrap->erase;
      }
    close $checked_fh;
    # the temp file has served its purpose; failing to remove it is not fatal
    unlink $tmp_file or warn "cannot remove temp file $tmp_file: $!\n";

    rename( $file, "$file$ext") or die "cannot save backup file $file$ext: $!";
    open( my $out_fh, '>', $file) or die "cannot save spell checked file $file: $!";
    $t->print( $out_fh);
    close $out_fh                 or die "cannot save spell checked file $file: $!";
  }     


# start tag handler: flags the current element (found in $_) with a
# '#include' attribute, which the main loop later looks up with inherit_att
sub include_elt
  { my $current= $_;
    return $current->set_att( '#include' => 1);
  }

# start tag handler: flags the current element (found in $_) with a
# '#exclude' attribute, which the main loop later looks up with inherit_att
sub exclude_elt
  { my $current= $_;
    return $current->set_att( '#exclude' => 1);
  }

# Prepares one text element for spell checking.
#   $t       the twig (not used here)
#   $elt     the text element
#   $id      id under which the element is registered
#   $id2elt  hash ref, id => wrapper element; updated here
#   $tmp_fh  handle the "<id>:<text>" line is written to
# The element is wrapped in a '#SC' element, which is what gets stored in
# $id2elt (the id itself is not stored on the wrapper). Newlines in the
# text are written as <\n> so that each element takes exactly one line.
sub process_text
  { my( $t, $elt, $id, $id2elt, $tmp_fh)= @_;

    $id2elt->{$id}= $elt->wrap_in( '#SC');

    ( my $one_line= $elt->text)=~ s{\n}{<\\n>}g;
    print {$tmp_fh} "$id:$one_line\n";
  }

__END__

=head1 NAME

xml_spellcheck - spellcheck XML files

=head1 SYNOPSIS

  xml_spellcheck [options] <files>

=head1 DESCRIPTION

xml_spellcheck lets you spell check the content of an XML file.
It extracts the text (the content of elements and optionally of
attributes), calls a spell checker on it and then recreates the
XML document.

=head1 OPTIONS

Note that all options can be abbreviated to their first letter.

=over 4

=item --conf <configuration_file>

Gets the options from a configuration file. NOT IMPLEMENTED YET.

=item --spellchecker <spellchecker>

The command to use for spell checking, including any options.



( run in 0.885 second using v1.01-cache-2.11-cpan-39bf76dae61 )