XML-Comma

 view release on metacpan or  search on metacpan

lib/XML/Comma/Parsing/PurePerl.pm  view on Meta::CPAN

  $self->{_in_include} = 1;
  return sub {
    my %arg = @_;
    if ( $arg{block} ) {
      $self->{_string} = $arg{block};
      $self->{_from_file} = '';
    } else {
      ( $self->{_string}, $self->{_from_file} ) =
        XML::Comma::DefManager->include_string ( $arg{name},$arg{args_string} );
    }
    # dbg 'str', $self->{_string};
    eval {
      $self->handle_element ( @{$handle_element_args} );
    }; if ( $@ ) {
      my $context = join '/', map { $_->tag() } $self->down_tree_branch();
      $context = ($self->{_from_file}.':'.$context) if $self->{_from_file};
      $self->{_el_stack} = undef;
      XML::Comma::Log->err
          ( 'PARSE_ERR', $@, undef,
            "(in '$context' at " . $self->pos_line_and_column() . ")\n" );
    }
    $self->{_el_stack} = undef;
  }
}



sub parse {
  my ( $class, %arg ) = @_; my $self = {}; bless ( $self, $class );
  $self->{_pos} = $self->{_wpos} = 0;
  $self->{_el_stack} = [];
  $self->{_string} = $arg{block} || die "need a block to PurePerl::parse";
  eval {
    # prolog
    my ( $type, $string, $tag ) = $self->skip_prolog();
    # root element
    $self->handle_element ( $self, $tag, 0, 0 );
    # nothing else
    $self->eat_whitespace();
    ( $type, $string, $tag ) = $self->next_token();
    while ( $type != $DONE ) {
      if ( $string            and
           $type != $COMMENT  and  $type != $PROCESSING_INSTRUCTION ) {
        die "more content found after root element: '$string'\n";
      }
      $self->eat_whitespace();
    ( $type, $string, $tag ) = $self->next_token();
    }
  }; if ( $@ ) {
    my $context = join '/', map { $_->tag() } $self->down_tree_branch();
    $self->{_el_stack} = undef;
    XML::Comma::Log->err 
        ( 'PARSE_ERR', $@, undef,
          "(in '$context' at " . $self->pos_line_and_column() . ")\n" );
  }
  $self->{_el_stack} = undef;
}


sub raw_append {}
sub cdata_wrap {}

####
# document parsing
####

sub handle_document {
  my ( $self, $read_args ) = @_;
  my $doc;
  eval {
    # prolog and outermost envelope
    my ( $type, $string, $tag ) = $self->skip_prolog();
    # create document
    $doc = $self->{_top_level_class}
      ->new ( type => $tag,
              from_file => $self->{_from_file},
              last_mod_time => 
                $self->{_from_file} ? (stat($self->{_from_file}))[9] : 0,
              read_args => $read_args );
    push @{$self->{_el_stack}}, $doc;
    # recursively handle elements
    $self->handle_element ( $doc, $tag, 1, 1 );
    # nothing else except comments and whitespace
    $self->eat_whitespace();
    ( $type, $string, $tag ) = $self->next_token();
    while ( $type != $DONE ) {
      if ( $type != $COMMENT and $type != $PROCESSING_INSTRUCTION ) {
        die "more content found after root element: '$string'\n";
      }
      $self->eat_whitespace();
    ( $type, $string, $tag ) = $self->next_token();
    }
  }; if ( $@ ) {
    my $context = join '/', map { $_->tag() } $self->down_tree_branch();
    $context = ($self->{_from_file}.':'.$context) if $self->{_from_file};
    $self->{_el_stack} = undef;
    XML::Comma::Log->err 
        ( 'PARSE_ERR', $@, undef,
          "(in '$context' at " . $self->pos_line_and_column() . ")\n" );
  }
  $self->{_el_stack} = undef;
  return $doc;
}

sub down_tree_branch {
  my $self = shift();
  return @{$self->{_el_stack}};
}

sub skip_prolog {
  my $self = shift();
  # let's be overly forgiving and accept docs with leading whitespace
  $self->eat_whitespace();
  my ( $type, $string, $special ) = $self->next_token();
  while ( $type != $OPEN_TAG ) {
    if ( $type == $CDATA ) {
      die "unexpected CDATA\n";
    } elsif ( $type == $TEXT ) {
      die "text outside of root element\n";
    } elsif ( $type == $DONE ) {
      die "no document content\n";
    }
    $self->eat_whitespace();
    ( $type, $string, $special ) = $self->next_token();
  }
  return ( $type, $string, $special );
}

sub handle_element {
  my ( $self, $el, $tag, $nested, $comma_level ) = @_;
  while ( 1 ) {
    my ( $type, $string, $special ) = $self->next_token();
    if ( $type == $TEXT ) {
      # text -- append (let el do its own checking)
      $el->raw_append ( $string );
    } elsif ( $type == $OPEN_TAG ) {
      # open tag -- recurse
      if ( $nested ) {
        my $new = $el->add_element ( $special, $string );
        push @{$self->{_el_stack}}, $new;
        $self->handle_element
          ( $new,
            $special,
            ($new->def() ? $new->def()->is_nested() : 1),
            1 );
      } else {
        $el->raw_append ( $string );
        $self->handle_element ( $el, $special, 0, 0 );
      }
    } elsif ( $type == $EMPTY_ELEMENT ) {
      if ( $nested ) {
        $el->add_element ( $special );
      } else {
        $el->raw_append ( $string );
      }
    } elsif ( $type == $CLOSE_TAG ) {
      # close tag -- check for match and return
      if ( $special eq $tag ) {
        if ( $comma_level ) {
          $el->finish_initial_read ( $self );
          pop @{$self->{_el_stack}};
        } else {
          $el->raw_append ( $string );
        }
        return; # ok
      } else {
        die "mismatched tag: '$tag', '$special'\n";
      }
    } elsif ( $type == $CDATA ) {
      # cdata -- extract and append
      if ( $nested ) {
        die "cdata content '$string' found for nested element '$tag'\n";
      } else {
        $el->cdata_wrap();
        $el->raw_append ( $special );
      }
    } elsif ( $type == $DOCTYPE ) {
      # doctype -- throw an error
      die "doctype after prolog\n";
    } elsif ( $type == $DONE ) {
      unless ( $self->{_in_include} ) {
        # finished prematurely
        die "reached end of document unexpectedly\n";
      }
      return; # putatively ok
    } elsif ( $type == $PROCESSING_INSTRUCTION ) {
      my $content = trim ( substr $string, 2, length($string) - 4 );
      my ( $directive, $first_word, $rest ) = split ( /\s+/, $content, 3 );
      if ( $directive  and  $directive eq '#include' ) {
        die "#include directive with no include name given\n"
          unless $first_word;
        $self->includes_parser ( [ $el, $tag, $nested, $comma_level ] )
            ->( name        => $first_word,
                args_string => $rest );
      }
      ## ignore other processing instructions
    }
    ## ignore comments
  }
}

sub pos_line_and_column {
  my $self = shift();
  my $line = 1;
  my $pos = 0;
  my $col = 0;
  pos ( $self->{_string} ) = 0;
  while ( $self->{_string} =~ /(\r\n)|(\r)|(\n)/g ) {
    last  if  pos($self->{_string}) > $self->{_pos};
    $line++;
    $pos = pos ( $self->{_string} );
  }
  $col = $self->{_pos} - $pos;
  return "line $line, column $col";
}


####
# token-level routines
####

sub eat_whitespace {
  my $self = shift();
  my $c;
  while ( defined ($c = $self->get_c()) ) {
    last  if  ! ( $c eq ' ' or
                  $c eq "\n" or
                  $c eq "\r" or
                  $c eq "\t" );
  }
  $self->pushback_c()  if  defined $c;
  $self->{_pos} = $self->{_wpos};
}

lib/XML/Comma/Parsing/PurePerl.pm  view on Meta::CPAN

        }
      }
      my $token_string = substr $self->{_string}, $self->{_pos},
        $self->{_wpos} - $self->{_pos};
      $self->{_pos} = $self->{_wpos};
      if ( $token_string =~ m:/>$: ) {
        return ( $EMPTY_ELEMENT, $token_string, $tag_name );
      } else {
        return ( $OPEN_TAG, $token_string, $tag_name );
      }
    }
  }
  # if we get here, we've exited the while loop by overrunning the
  # end of our string
  die "reached end of document while inside open tag...\n";
}

sub close_tag {
  my $self = shift();
  my $tag_name;
  my $c;
  while ( defined ($c = $self->get_c()) ) {
    if ( $c eq '>' ) {
      $tag_name = substr $self->{_string}, $self->{_pos}+2,
        $self->{_wpos} - $self->{_pos} - 3;
      my $token_string = substr $self->{_string}, $self->{_pos},
        $self->{_wpos} - $self->{_pos};
      $self->{_pos} = $self->{_wpos};
      return ( $CLOSE_TAG, $token_string, $tag_name );
    }
  }
  # if we get here, we've exited the while loop by overrunning the
  # end of our string
  die "reached end of document while inside close tag\n";
}

sub processing_instruction {
  my $self = shift();
  my $c;
  while ( defined ($c = $self->get_c()) ) {
    if ( $c eq '?' and $self->get_c() eq '>') {
      my $token_string = substr $self->{_string}, $self->{_pos},
        $self->{_wpos} - $self->{_pos};
      $self->{_pos} = $self->{_wpos};
      return ( $PROCESSING_INSTRUCTION, $token_string, undef );
    }
  }
  # if we get here, we've exited the while loop by overrunning the
  # end of our string
  die "reached end of document while inside <?...\n";
}

sub bang_instruction {
  my $self = shift();
  my $next = $self->get_chars(2);
  if ( $next eq '--' ) {
    return $self->comment()
  } elsif ( $next eq 'DO' and $self->get_chars(5) eq 'CTYPE' ) {
    return $self->doctype();
  } elsif ( $next eq '[C' and $self->get_chars(5) eq 'DATA[' ) {
    return $self->cdata();
  } else {
    die "unrecognized tag, '<!$next'";
  }
}

sub doctype {
  my $self = shift();
  my $c;
  while ( defined ($c = $self->get_c()) ) {
    if ( $c eq '>' ) {
      my $token_string = substr $self->{_string}, $self->{_pos},
        $self->{_wpos} - $self->{_pos};
      $self->{_pos} = $self->{_wpos};
      return ( $DOCTYPE, $token_string, undef );
    } elsif ( $c eq '[' ) {
      die "parser doesn't handle in-line doctype declarations\n";
    }
  }
  # if we get here, we've exited the while loop by overrunning the
  # end of our string
  die "reached end of document while inside <!DOCTYPE...\n";
}

sub comment {
  my $self = shift();
  my $c;
  while ( defined ($c = $self->get_c(1)) ) {
    if ( $c eq '-' and $self->get_c(1) eq '-' ) {
      if ( $self->get_c(1) eq '>' ) {
        my $token_string = substr $self->{_string}, $self->{_pos},
          $self->{_wpos} - $self->{_pos};
        $self->{_pos} = $self->{_wpos};
        return ( $COMMENT, $token_string, undef );
      } else {
        die "string '--' not allowed inside comments\n";
      }
    }
  }
  # if we get here, we've exited the while loop by overrunning the
  # end of our string
  die "reached end of document while inside a comment\n";
}

sub cdata {
  my $self = shift();
  my $c;
  while ( defined ($c = $self->get_c(1)) ) {
    if ( $c eq ']' ) {
      my $point = $self->{_wpos};
      if ( $self->get_c(1) eq ']' and $self->get_c(1) eq '>' ) {
        my $token_string = substr $self->{_string}, $self->{_pos},
          $self->{_wpos} - $self->{_pos};
        my $contents_string = substr $token_string, 9, length($token_string)-12;
        $self->{_pos} = $self->{_wpos};
        return ( $CDATA, $token_string, $contents_string );
      } else {
        $self->{_wpos} = $point;
      }
    }
  }
  # if we get here, we've exited the while loop by overrunning the
  # end of our string
  die "reached end of document while inside <![CDATA...\n";
}

sub text {
  my $self = shift();
  my $c;
  while ( defined ($c = $self->get_c()) ) {
    if ( $c eq '<' ) {
      $self->pushback_c();
      my $token_string = substr $self->{_string}, $self->{_pos},
        $self->{_wpos} - $self->{_pos};
      $self->{_pos} = $self->{_wpos};
      return ( $TEXT, $token_string, undef );
    }
  }
  # if we get here, we've exited the while loop by overrunning the end
  # of our string. but we need to let someone higher up handle this
  # problem, so just return what we've gotten up to this point...
  return ( $TEXT, substr($self->{_string}, $self->{_pos},
                         $self->{_wpos} - $self->{_pos}), undef );
}

# gets the next character. unless $ignore_amps is set, skips over
# entities (returns the ';'), and dies if a non-entitieizing & is
# found
sub get_c {
  my ( $self, $ignore_amps ) = @_;
  if ( $self->{_wpos} >= length $self->{_string} ) {
    return undef;
  }
  my $c = substr $self->{_string}, $self->{_wpos}++, 1;
  if ( $c eq '&' and ! $ignore_amps ) {
    while ( defined ($c = $self->get_c()) ) {
      if ( $c eq ';' ) {
        return $c;
      } elsif ( $c !~ /[a-zA-Z_0-9#]/ ) {
        $self->{_pos} = $self->{_wpos};
        die "& found that isn't part of an entity reference\n";
      }
    }
    # if we get here, we've exited the while loop by overrunning the
    # end of our string



( run in 1.412 second using v1.01-cache-2.11-cpan-13bb782fe5a )