LaTeXML

 view release on metacpan or  search on metacpan

lib/LaTeXML/MathParser.pm  view on Meta::CPAN

# |  United States Government & not subject to copyright in the US.     | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov>                        #_#     | #
# | http://dlmf.nist.gov/LaTeXML/                              (o o)    | #
# \=========================================================ooo==U==ooo=/ #
# ================================================================================
# LaTeXML::MathParser  Math Parser for LaTeXML using Parse::RecDescent.
# Parse the intermediate representation generated by the TeX processor.
# ================================================================================
package LaTeXML::MathParser;
use strict;
use warnings;
use Parse::RecDescent;
use LaTeXML::Global;
use LaTeXML::Common::Object;
use LaTeXML::Common::Error;
use LaTeXML::Core::Token;
use LaTeXML::Common::Font;
use LaTeXML::Common::XML;
use List::Util qw(min max);
use base (qw(Exporter));

our @EXPORT_OK = (qw(&Lookup &New &Absent &Apply &ApplyNary &recApply &CatSymbols
    &Annotate &InvisibleTimes &InvisibleComma &MorphVertbar
    &TwoPartRelop &NewFormulae &NewFormula &NewList
    &ApplyDelimited &NewScript &DecorateOperator &InterpretDelimited &NewEvalAt
    &LeftRec
    &Arg &MaybeFunction
    &SawNotation &IsNotationAllowed
    &isMatchingClose &Fence));
our %EXPORT_TAGS = (constructors
    => [qw(&Lookup &New &Absent &Apply &ApplyNary &recApply &CatSymbols
      &Annotate &InvisibleTimes &InvisibleComma &MorphVertbar
      &TwoPartRelop &NewFormulae &NewFormula &NewList
      &ApplyDelimited &NewScript &DecorateOperator &InterpretDelimited &NewEvalAt
      &LeftRec
      &Arg &MaybeFunction
      &SawNotation &IsNotationAllowed
      &isMatchingClose &Fence)]);

DebuggableFeature('recdescent', "Trace Parse::RecDescent");
our $MATHPARSE_PROGRESS_QUANTUM = 100;

# ================================================================================
sub new {
  my ($class, %options) = @_;
  require LaTeXML::MathGrammar;

  my $internalparser = LaTeXML::MathGrammar->new();
  Fatal("expected", "MathGrammar", undef,
    "Compilation of Math Parser grammar failed") unless $internalparser;
  my $self = bless { internalparser => $internalparser, lexematize => $options{lexematize} }, $class;
  return $self; }

sub parseMath {
  my ($self, $document, %options) = @_;
  local $LaTeXML::MathParser::DOCUMENT = $document;
  $self->clear;    # Not reentrant!
  $self->cleanupScripts($document);
  if (my @math = $document->findnodes('descendant-or-self::ltx:XMath[not(ancestor::ltx:XMath)]')) {
    my $proc = "Math Parsing " . scalar(@math) . " formulae ...";
    ProgressSpinup($proc);
#### SEGFAULT TEST
####    $document->doctest("before parse",1);
    foreach my $math (@math) {
      $self->parse($math, $document); }
    ProgressSpindown($proc);

    NoteLog("Math parsing succeeded:"
        . join('', map { "\n   $_: "
            . colorizeString($$self{passed}{$_} . "/" . ($$self{passed}{$_} + $$self{failed}{$_}), ($$self{failed}{$_} == 0 ? 'success' : 'warning')) }
          grep { $$self{passed}{$_} + $$self{failed}{$_} }
          keys %{ $$self{passed} }));
    if (my @unk = keys %{ $$self{unknowns} }) {
      NoteLog("Symbols assumed as simple identifiers (with # of occurences):\n   "
          . join(', ', map { "'" . colorizeString("$_", 'warning') . "' ($$self{unknowns}{$_})" } sort @unk));
      if (!$STATE->lookupValue('MATHPARSER_SPECULATE')) {
        NoteLog("Set MATHPARSER_SPECULATE to speculate on possible notations."); } }
    if (my @funcs = keys %{ $$self{maybe_functions} }) {
      NoteLog("Possibly used as functions?\n  "
          . join(', ', map { "'$_' ($$self{maybe_functions}{$_}/$$self{unknowns}{$_} usages)" }
            sort @funcs)); }
  }
#### SEGFAULT TEST
####    $document->doctest("IN scope",1);
#### SEGFAULT TEST
####    $document->doctest("OUT of scope",1);
  return $document; }

# This is a rather peculiar cleanup that needs to be done to manage ids & idrefs
# Before parsing, sub/superscripts are represented by an operator-less XMApp with the script
# itself as the only child. Ideally, upon parsing these get merged, combined and disappear into
# proper XMApp of an appropriate operator on the base and scripts.  Unless there is a parse
# failure, in which case they remain.
# The problem comes from various XMDual constructs where an XMRef refers to the script XMApp.
# It can occur that one branch parses and the other fails: This can leave a reference to
# the script XMApp which no longer exists!
# To solve this, we find & replace all references to such script XMApps by an explicit XMApp
# with the XMRef refering to the script itself, not the XMApp. (make sense?)
sub cleanupScripts {
  my ($self, $document) = @_;
  foreach my $app ($document->findnodes(
      'descendant-or-self::ltx:XMApp[@xml:id and contains(@role,"SCRIPT")]')) {
    my $role  = $app->getAttribute('role');
    my $appid = $app->getAttribute('xml:id');
    if ($role =~ /^(?:PRE|POST|FLOAT)(:?SUB|SUPER)SCRIPT$/) {
      my @refs = $document->findnodes("descendant-or-self::ltx:XMRef[\@idref = '$appid']");
      if (scalar(@refs)) {
        my $script = $app->firstChild;
        my ($scriptref) = LaTeXML::Package::createXMRefs($document, $script);
        $document->unRecordID($appid);    # no longer refers to the app
        $app->removeAttribute('xml:id');
        # Copy all attributes, EXCEPT xml:id
        my %attr = map { (getQName($_) => $_->getValue) }
          grep { $_->nodeType == XML_ATTRIBUTE_NODE } $app->attributes;
        # Now, replace each ref to the script application by an application to a ref to the script.
        foreach my $ref (@refs) {
          $document->replaceTree(['ltx:XMApp', {%attr}, $scriptref], $ref); }
  } } }
  return; }

lib/LaTeXML/MathParser.pm  view on Meta::CPAN

  elsif ($name = $node->textContent) {
    my $font = $LaTeXML::MathParser::DOCUMENT->getNodeFont($node);
    my %attr = $font->relativeTo(LaTeXML::Common::Font->textDefault);
    my $desc = join(' ', map { ToString($attr{$_}{value}) }
        (grep { !$EXCLUDED_PRETTYNAME_ATTRIBUTES{$_} } (sort keys %attr)));
    $name .= "{$desc}" if $desc; }
  else {
    $name = Stringify($node); }    # what else ????
  return $name; }

sub note_unknown {
  my ($self, $node) = @_;
  my $name = token_prettyname($node);
  $$self{unknowns}{$name}++;
  return; }

# debugging utility, should be somewhere handy.
sub printNode {
  my ($node) = @_;
  if (ref $node eq 'ARRAY') {
    my ($tag, $attr, @children) = @$node;
    my @keys = sort keys %$attr;
    return "<$tag"
      . (@keys ? ' ' . join(' ', map { "$_='" . ($$attr{$_} || '') . "'" } @keys) : '')
      . (@children
      ? ">\n" . join('', map { printNode($_) } @children) . "</$tag>"
      : '/>')
      . "\n"; }
  else {
    return ToString($node); } }

# ================================================================================
# Some more XML utilities, but math specific (?)

# Get the Token's  meaning, else name, else content, else role
sub getTokenMeaning {
  my ($node) = @_;
  my $x;
  $node = realizeXMNode($node);
  return (defined($x = p_getAttribute($node, 'meaning')) ? $x
    : (defined($x = p_getAttribute($node, 'name')) ? $x
      : (($x = (ref $node eq 'ARRAY' ? '' : $node->textContent)) ne '' ? $x
        : (defined($x = p_getAttribute($node, 'role')) ? $x
          : undef)))); }

sub node_location {
  my ($node) = @_;
  my $n = $node;
  while ($n && (ref $n !~ /^XML::LibXML::Document/)    # Sometimes DocuementFragment ???
    && !$n->getAttribute('refnum') && !$n->getAttribute('labels')) {
    $n = $n->parentNode; }
  if ($n && (ref $n !~ /^XML::LibXML::Document/)) {
    my ($r, $l) = ($n->getAttribute('refnum'), $n->getAttribute('labels'));
    return ($r && $l ? "$r ($l)" : $r || $l); }
  else {
    return 'Unknown'; } }

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Parser
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Top-level per-formula parse.
# We do a depth-first traversal of the content of the XMath element,
# since various sub-elements (XMArg & XMWrap) act as containers of
# nominally complete subexpressions.
# We do these first for two reasons.
# Firstly, since after parsing, the parent will be rebuilt from the result,
# we lose the node "identity"; ie. we can't find the child to replace it!
# Secondly, in principle (although this isn't used yet), parsing the
# child could reveal something interesting about it; say, it's effective role.
# Then, this information could be used when parsing the parent.
# In fact, this could work the other way too; parsing the parent could tell
# us something about what the child must be....
sub parse {
  my ($self, $xnode, $document) = @_;
  local $LaTeXML::MathParser::STRICT      = 1;
  local $LaTeXML::MathParser::WARNED      = 0;
  local $LaTeXML::MathParser::UNPARSED    = 0;
  local $LaTeXML::MathParser::XNODE       = $xnode;
  local $LaTeXML::MathParser::PUNCTUATION = {};
  local $LaTeXML::MathParser::LOSTNODES   = {};
  local %LaTeXML::MathParser::IDREFS      = ();
  # This bit for debugging....
  foreach my $n ($document->findnodes("descendant-or-self::*[\@xml:id]", $xnode)) {
    my $id = $n->getAttribute('xml:id');
    $LaTeXML::MathParser::IDREFS{$id} = $n; }
  if ($$self{lexematize}) {
    my $lexeme_form = $self->node_to_lexeme_full($xnode);
    $lexeme_form =~ s/^\s+//;
    $lexeme_form =~ s/\s+$//;
    $lexeme_form =~ s/\s+/ /g;    # normalize internal whitespaces
    $xnode->parentNode->setAttribute('lexemes', $lexeme_form);
  }
  if (my $result = $self->parse_rec($xnode, 'Anything,', $document)) {
    # Add text representation to the containing Math element.
    my $p = $xnode->parentNode;
    # This is a VERY screwy situation? How can the parent be a document fragment??
    # This has got to be a LibXML bug???
    if ($p->nodeType == XML_DOCUMENT_FRAG_NODE) {
      my @n = $p->childNodes;
      if (scalar(@n) == 1) {
        $p = $n[0]; }
      else {
        Fatal('malformed', '<XMath>', $xnode, "XMath node has DOCUMENT_FRAGMENT for parent!"); } }
    # HACK: replace XMRef's to stray trailing punctution
    foreach my $id (keys %$LaTeXML::MathParser::PUNCTUATION) {
      my $r = $$LaTeXML::MathParser::PUNCTUATION{$id}->cloneNode;
      $r->removeAttribute('xml:id');
      foreach my $n ($document->findnodes("descendant-or-self::ltx:XMRef[\@idref='$id']", $p)) {
        $document->replaceTree($r, $n); } }
    foreach my $id (keys %$LaTeXML::MathParser::LOSTNODES) {
      my $repid = $$LaTeXML::MathParser::LOSTNODES{$id};
      # but the replacement my have been replaced as well!
      while (my $reprepid = $$LaTeXML::MathParser::LOSTNODES{$repid}) {
        $repid = $reprepid; }
      if ($document->findnodes("descendant-or-self::*[\@xml:id='$id']")
        && !$document->findnodes("descendant-or-self::*[\@xml:id='$repid']")) {
        # Do nothing if the node never actually got replaced (parse ultimately failed?)
      }
      else {
        foreach my $n ($document->findnodes("descendant-or-self::ltx:XMRef[\@idref='$id']", $p)) {
          $document->setAttribute($n, idref => $repid); } } }

lib/LaTeXML/MathParser.pm  view on Meta::CPAN

#   ($item,@more); }

# sub parse_kludge_reca {
#   my($self,$next,@more)=@_;
#   my $role = $self->getGrammaticalRole($next);
#   if($role =~ /^FLOAT(SUB|SUPER)SCRIPT$/){
#     my($base,@rest) = $self->parse_kludge_rec(@more);
#     (NewScript($base,$next),@rest); }
#   elsif($role eq 'OPEN'){
#     my($open,$close,$seps,$list,@more)=$self->parse_kludge_fence($next,@more);
#     (Apply(Annotate(New(undef,undef,role=>'FENCED'),
#     argopen=>$open, argclose=>$close, separators=>$seps),@$list), @more); }
#   else {
#     ($next,@more); }}

# sub parse_kludge_fence {
#   my($self,$next,@more)=@_;
#   my($open,$close,$punct,$r,$item,@list)=($next,undef,'',undef);
#   while(@more){
#     my @i=();
#     while(@more && (($r=($self->getGrammaticalRole($more[0])||'')) !~ /^(CLOSE|PUNCT)$/)){
#       ($item,@more)=$self->parse_kludge_rec(@more);
#       push(@i,$item); }
#     push(@list,(scalar(@i > 1) ? ['ltx:XMWrap',{},@i] : $i[0]));
#     if($r eq 'CLOSE'){
#       $close=shift(@more); last; }
#     else {
#       $punct .= ($punct ? ' ':''). p_getValue(shift(@more)); }} # Delimited by SINGLE SPACE!
#   ($open,$close,$punct,[@list],@more); }

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Low-level Parser: parse a single expression
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Convert to textual form for processing by MathGrammar
sub parse_single {
  my ($self, $mathnode, $document, $rule) = @_;
  my @nodes = $self->filter_hints($document, $mathnode->childNodes);

  my ($punct, $result, $unparsed);
  my @punct = ();
  # Extract trailing punctuation, if rule allows it.
  if ($rule =~ s/,$//) {
    # Collect ALL trailing PUNCT|PERIOD's...
    my ($x, $r);
    while (($x = $nodes[-1]) && ($x = realizeXMNode($x))
      && ($r = $x->getAttribute('role')) && (($r eq 'PUNCT') || ($r eq 'PERIOD'))) {
      my $p = pop(@nodes);
      # Special case hackery, in case this thing is XMRef'd!!!
      # We could just stick it on the end of the presentation,
      # but it doesn't belong in the content at all!?!?
      if (my $id = $p->getAttribute('xml:id')) {
        $$LaTeXML::MathParser::PUNCTUATION{$id} = $p; }
      unshift(@punct, $p); }
  }

  if ($LaTeXML::DEBUG{recdescent}) {
    $::RD_TRACE = 1;    # Turn on MathGrammar tracing
                        #    my $box = $document->getNodeBox($LaTeXML::MathParser::XNODE);
    my $box = $document->getNodeBox($mathnode);
    Debug(('=' x 60) .
        "\nParsing formula \"" . ToString($box) . "\""
        . "\n from " . ToString($box->getLocator)
        . "\n == \"" . join(' ', map { node_string($_, $document) } @nodes) . "\""
        . "\n == " . ToString($mathnode)); }

  if (scalar(@nodes) < 2) {    # Too few nodes? What's to parse?
    $result = $nodes[0] || Absent(); }
  else {
    # Now do the actual parse.
    ($result, $unparsed) = $self->parse_internal($rule, @nodes);
  }
  # Failure? No result or uparsed lexemes remain.
  # NOTE: Should do script hack??
  if ((!defined $result) || $unparsed) {
    $LaTeXML::MathParser::UNPARSED = 1;
    $self->failureReport($document, $mathnode, $rule, $unparsed, @nodes);
    return; }
  # Success!
  else {
    if (@punct) {    # create a trivial XMDual to treat the punctuation as presentation
      $result = ['ltx:XMDual', {},
        LaTeXML::Package::createXMRefs($document, $result),
        ['ltx:XMWrap', {}, $result, @punct]]; }    # or perhaps: Apply, punctuated???
    if ($LaTeXML::DEBUG{recdescent}) {
      Debug("=>" . printNode($result) . "\n" . ('=' x 60)); }
    return $result; } }

sub node_to_lexeme {
  my ($self, $node) = @_;
  my $qname    = getQName($node);
  my $is_inked = 1;
  my $lexeme   = ($qname eq 'ltx:XMTok' or $qname eq 'ltx:text') && $node->textContent;
  if (!defined($lexeme) || length($lexeme) == 0) {
    $is_inked = 0;
    $lexeme   = getTokenMeaning($node); }
  my $document = $LaTeXML::MathParser::DOCUMENT;
  return unless defined $lexeme;
  if ($is_inked) {
    if (my $font = $node->getAttribute('_font')) {
      my $font_spec = $document->decodeFont($font);
      if (my %declarations = $font_spec && $font_spec->relativeTo(LaTeXML::Common::Font->textDefault)) {
        my %to_add             = ();
        my $font_pending       = $declarations{font}        || {};
        my $properties_pending = $$font_pending{properties} || {};
        foreach my $attr (qw(family series shape)) {
          if (my $value = $$properties_pending{$attr}) {
            $to_add{$value} = 1; } }
        if (%to_add) {
          my $prefix = join("_", sort(keys(%to_add))) . "_";
          $lexeme = join(" ", map { $prefix . $_ } split(' ', $lexeme)); }
    } }
    if ($lexeme =~ /^\p{L}+$/) {
      $lexeme = join(" ", map { 'roman_' . $_ } split(' ', $lexeme)); } }
  $lexeme =~ s/\s+$//;
  return $lexeme; }

sub node_to_lexeme_full {
  my ($self, $unrealized_node) = @_;
  my $node = realizeXMNode($unrealized_node);
  my $tag  = getQName($node);
  if ($tag eq 'ltx:XMHint') { return ""; }    # just skip XMHints, they don't contain lexemes
  my $role = p_getAttribute($node, 'role');
  if ($tag eq 'ltx:XMTok' || ($role && ($tag !~ 'ltx:XM(Dual|App|Arg|Array|Wrap|ath)'))) {
# Elements that directly represent a lexeme, or intended operation with a syntactic role (such as a postscript),
# can proceed to building the lexeme from the leaf node.
    my $lexeme = $self->node_to_lexeme($node);
    if ($role && $role =~ /^(UNDER|OVER)ACCENT$/) {
      # over¯ and under¯ are the lexeme names of choice for \overline and \underline
      $lexeme = lc($1) . $lexeme; }
    return $lexeme; }
# Elements that do not have a role and are intermediate "may" need an argument wrapper, so that arguments
# remain unambiguous. For instance a `\frac{a}{b}` has clear argument structure to be preserved.
  my ($mark_start, $mark_end) = ('', '');
  if ($tag ne 'ltx:XMath') {
    if ($role) {
      $mark_start = "start_$role ";
      $mark_end   = " end_$role";
    } elsif ($tag =~ '^ltx:XM(Arg|Row|Cell)') {
      my $tag_role = uc($1);
      $mark_start = "start_$tag_role ";
      $mark_end   = " end_$tag_role";
    }
  }
  my $lexemes = $mark_start;
  if ($tag eq 'ltx:XMDual') {
    $lexemes .= $self->node_to_lexeme_full($LaTeXML::MathParser::DOCUMENT->getSecondChildElement($node)); }
  elsif ($tag eq 'ltx:XMText') {
    # can be either a single node XMText, we're looking at a leaf text node
    # or \text{}-like construct, with multiple math formulas and interleaved text
    foreach my $child ($node->childNodes) {
      if (ref($child) eq 'XML::LibXML::Text') {
        $lexemes .= $child->textContent() . ' '; }
      else {
        my $child_lexeme = $self->node_to_lexeme_full($child);
        if (defined $child_lexeme && length($child_lexeme) > 0) {
          $lexemes .= $child_lexeme . ' '; } } } }
  elsif ($tag eq 'ltx:text') {    # could recurse in from ltx:XMText
    return $self->node_to_lexeme($node); }
  else {
    my @child_elements = element_nodes($node);
    # skip through single child wrappers (don't serialize)
    while (scalar(@child_elements) == 1 && getQName($child_elements[0]) =~ /^ltx:XM(Arg|Wrap)$/) {
      @child_elements = element_nodes($child_elements[0]);
    }
    foreach my $child (@child_elements) {
      my $child_lexeme = $self->node_to_lexeme_full($child);
      if (defined $child_lexeme && length($child_lexeme) > 0) {
        $lexemes .= $child_lexeme . ' ';
  } } }
  $lexemes .= $mark_end;
  return $lexemes; }

sub parse_internal {
  my ($self, $rule, @nodes) = @_;
  #------------
  # Generate a textual token for each node; The parser operates on this encoded string.
  local $LaTeXML::MathParser::LEXEMES = {};
  my $i       = 0;
  my $lexemes = '';

  foreach my $node (@nodes) {
    # This is a parser-specific lexeme, but it is not (yet) identical to the serialized lexeme by
    # ->node_to_lexeme, which is currently experimental
    my $role = $self->getGrammaticalRole($node);
    my $text = getTokenMeaning($node);
    $text = 'Unknown' unless defined $text;
    my $lexeme = $role . ":" . $text . ":" . ++$i;
    $lexeme =~ s/\s//g;

    $$LaTeXML::MathParser::LEXEMES{$lexeme} = $node;
    $lexemes .= ' ' . $lexeme; }

  #------------
  # apply the parser to the textified sequence.
  local $LaTeXML::MathParser::PARSER               = $self;
  local %LaTeXML::MathParser::SEEN_NOTATIONS       = ();
  local %LaTeXML::MathParser::DISALLOWED_NOTATIONS = ();
  local $LaTeXML::MathParser::MAX_ABS_DEPTH        = 1;
  # "Speculative" mode is disabled by default (set MATHPARSER_SPECULATE).
  # It causes the parser to explore possible undeclared notatations,
  # like possible functions (the only one, so far)
  # These extra grammar rules can be costly in corner cases.
  # This should be disabled when speed (but feedback) is essential
  if ($STATE->lookupValue('MATHPARSER_SPECULATE')) {
    $LaTeXML::MathParser::DISALLOWED_NOTATIONS{MaybeFunctions} = 0; }
  else {
    $LaTeXML::MathParser::DISALLOWED_NOTATIONS{MaybeFunctions} = 1; }
  my $unparsed = $lexemes;
  my $result   = $$self{internalparser}->$rule(\$unparsed);
  if (((!defined $result) || $unparsed)    # If parsing Failed
    && $LaTeXML::MathParser::SEEN_NOTATIONS{QM}) {               # & Saw some QM stuff.
    $LaTeXML::MathParser::DISALLOWED_NOTATIONS{QM} = 1;          # Retry w/o QM notations
    $unparsed = $lexemes;
    $result = $$self{internalparser}->$rule(\$unparsed); }
  while (((!defined $result) || $unparsed)                       # If parsing Failed
    && ($LaTeXML::MathParser::SEEN_NOTATIONS{AbsFail})           # & Attempted deeper abs nesting?
    && ($LaTeXML::MathParser::MAX_ABS_DEPTH < 3)) {              # & Not ridiculously deep
    delete $LaTeXML::MathParser::SEEN_NOTATIONS{AbsFail};
    ++$LaTeXML::MathParser::MAX_ABS_DEPTH;                       # Try deeper.
    $unparsed = $lexemes;
    $result   = $$self{internalparser}->$rule(\$unparsed); }

  # If still failed, try other strategies?

  return ($result, $unparsed); }

sub getGrammaticalRole {
  my ($self, $node) = @_;
  $node = realizeXMNode($node);
  my $role = p_getAttribute($node, 'role');
  if (!defined $role) {
    my $tag = getQName($node);
    if ($tag eq 'ltx:XMTok') {
      $role = 'UNKNOWN'; }
    elsif ($tag eq 'ltx:XMDual') {
      my ($content, $presentation) = element_nodes($node);
      $role = p_getAttribute($content, 'role') || p_getAttribute($presentation, 'role'); }
    $role = 'ATOM' unless defined $role; }
  $self->note_unknown($node) if ($role eq 'UNKNOWN') && $LaTeXML::MathParser::STRICT;
  return $role; }

# How many tokens before & after the failure point to report in the Warning message.
my $FAILURE_PRETOKENS  = 3;    # [CONSTANT]
my $FAILURE_POSTTOKENS = 1;    # [CONSTANT]

sub failureReport {
  my ($self, $document, $mathnode, $rule, $unparsed, @nodes) = @_;
  if ($LaTeXML::MathParser::STRICT || (($LaTeXML::Common::Error::VERBOSITY || 0) > 1)) {
    my $loc = "";
    # If we haven't already done it for this formula, show the original TeX.
    if (!$LaTeXML::MathParser::WARNED) {
      $LaTeXML::MathParser::WARNED = 1;
      my $box = $document->getNodeBox($LaTeXML::MathParser::XNODE);
      $loc = "In \"" . UnTeX($box) . "\""; }
    $unparsed =~ s/^\s*//;
    my @rest = split(/ /, $unparsed);
    my $pos  = scalar(@nodes) - scalar(@rest);
    # Break up the input at the point where the parse failed.
    my $max    = 50;
    my $parsed = join(' ', ($pos > $max ? ('...') : ()),
      (map { node_string($_, $document) } @nodes[max(0, $pos - 50) .. $pos - 1]));
    my $toparse = join(' ',
      (map { node_string($_, $document) } @nodes[$pos .. min($pos + 50, $#nodes)]),
      ($#nodes > $pos + 50 ? ('...') : ()));
    my $parsefail
      = join('.', map { $self->getGrammaticalRole($_) }
        @nodes[($pos - $FAILURE_PRETOKENS >= 0
          ? $pos - $FAILURE_PRETOKENS : 0) .. $pos - 1])
      . ">"
      . join('.', map { $self->getGrammaticalRole($_) }
        @nodes[$pos .. ($pos + $FAILURE_POSTTOKENS - 1 < $#nodes
          ? $pos + $FAILURE_POSTTOKENS - 1 : $#nodes)]);
    my $lexeme = node_location($nodes[$pos] || $nodes[$pos - 1] || $mathnode);
    my $indent = length($parsed) - 2; $indent = 8 if $indent > 8;
    Warn('not_parsed', $parsefail, $mathnode,
      "MathParser failed to match rule '$rule'",
      ($loc ? ($loc) : ()),
      ($parsed
        ? ($parsed, (' ' x $indent) . "> " . $toparse)
        : ("> " . $toparse)));
  }
  return; }

# used for debugging & failure reporting.
sub node_string {
  my ($node, $document) = @_;
  my $role = $node->getAttribute('role') || 'UNKNOWN';
  my $box  = $document->getNodeBox($node);
  return ($box ? ToString($box) : text_form($node)) . "[[$role]]"; }

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Conversion to a less ambiguous, mostly-prefix form.
# Mostly for debugging information?
# Note that the nodes are true libXML nodes, already absorbed into the document
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
sub text_form {
  my ($node) = @_;
  return textrec($node, undef); }

my %PREFIX_ALIAS = (    # [CONSTANT]
  SUPERSCRIPTOP         => '^',  SUBSCRIPTOP              => '_', times          => '*',
  'equals'              => '=',  'less-than'              => '<', 'greater-than' => '>',
  'less-than-or-equals' => '<=', 'greater-than-or-equals' => '>=',
  'much-less-than'      => '<<', 'much-greater-than'      => '>>',
  'plus'                => '+',  'minus'                  => '-', 'divide' => '/');
# Put infix, along with `binding power'
my %IS_INFIX = (METARELOP => 1,    # [CONSTANT]
  RELOP         => 2,    ARROW       => 2,
  ADDOP         => 10,   MULOP       => 100, FRACOP => 100,
  SUPERSCRIPTOP => 1000, SUBSCRIPTOP => 1000);

lib/LaTeXML/MathParser.pm  view on Meta::CPAN

    if ($app_role && $app_role =~ /^FLOAT(SUB|SUPER)SCRIPT$/) {
      return ($1 eq 'SUPER' ? '^' : '_') . textrec($op); }
    else {
      my $name = ((getQName($op) eq 'ltx:XMTok') && getTokenMeaning($op)) || 'unknown';
      my ($bp, $string) = textrec_apply($name, $op, @args);
      return (($bp < $outer_bp) || (($bp == $outer_bp) && ($name ne $outer_name))
        ? '(' . $string . ')' : $string); } }
  elsif ($tag eq 'ltx:XMDual') {
    my ($content, $presentation) = element_nodes($node);
    my $text = textrec($content, $outer_bp, $outer_name);    # Just send out the semantic form.
        # Fall back to presentation, if content has poor semantics (eg. from replacement patterns)
    return ($text =~ /^\(*Unknown/ ? textrec($presentation, $outer_bp, $outer_name) : $text); }
  elsif ($tag eq 'ltx:XMTok') {
    my $name = getTokenMeaning($node);
    $name = 'Unknown' unless defined $name;
    return $PREFIX_ALIAS{$name} || $name; }
  elsif (($tag eq 'ltx:XMWrap') || ($tag eq 'ltx:XMCell')) {
    # ??
    return join('@', map { textrec($_) } element_nodes($node)); }
  elsif ($tag eq 'ltx:XMArray') {
    return textrec_array($node); }
  else {
    return '[' . (p_getValue($node) || '') . ']'; } }

sub textrec_apply {
  no warnings 'recursion';
  my ($name, $op, @args) = @_;
  my $role = ((ref $op ne 'ARRAY') && $op->getAttribute('role')) || 'Unknown';
  if (($role =~ /^(SUB|SUPER)SCRIPTOP$/) && (($op->getAttribute('scriptpos') || '') =~ /^pre\d+$/)) {
    # Note that this will likely get parenthesized due to high bp
    return (5000, textrec($op) . " " . textrec($args[1]) . " " . textrec($args[0])); }
  elsif (my $bp = $IS_INFIX{$role}) {
    # A sub/superscript with a meaning probably should be prefix
    if (($role =~ /^(SUB|SUPER)SCRIPTOP$/) && $op->getAttribute('meaning')) {
      return (500, textrec($op, 10000, $name) . '@(' . join(', ', map { textrec($_) } @args) . ')') }
    else {    # Format as infix.
      return ($bp, (scalar(@args) == 1    # unless a single arg; then prefix.
          ? textrec($op) . ' ' . textrec($args[0], $bp, $name)
          : join(' ' . textrec($op) . ' ', map { textrec($_, $bp, $name) } @args))); } }
  elsif ($role eq 'POSTFIX') {
    return (10000, textrec($args[0], 10000, $name) . textrec($op)); }
  elsif ($name eq 'multirelation') {
    return (2, join(' ', map { textrec($_, 2, $name) } @args)); }
  else {
    return (500, textrec($op, 10000, $name) . '@(' . join(', ', map { textrec($_) } @args) . ')'); } }

sub textrec_array {
  my ($node) = @_;
  my $name   = $node->getAttribute('meaning') || $node->getAttribute('name') || 'Array';
  my @rows   = ();
  foreach my $row (element_nodes($node)) {
    push(@rows, '[' . join(', ', map { ($_->firstChild ? textrec($_->firstChild) : '') } element_nodes($row)) . ']'); }
  return $name . '[' . join(', ', @rows) . ']'; }

sub is_genuinely_unparsed {
  my ($node, $document) = @_;
  # any unparsed fragment should be considered legitimate with one exception
  # author-provided ungrammatical snippets in the presentation branches of XMDual
  # are allowed to fail the parse process.
  #
  # For now a reliable way of if we are in that case is to descend the formula through
  # the content branch of XMDual and check if any node has an "unparsed" mark.
  # Then only genuine parse failures will be detected.
  my $tag = $document->getNodeQName($node);
  if (($tag eq 'ltx:XMWrap') || ($tag eq 'ltx:XMArg') || $node->hasAttribute('_unparsed')) {
    return 1; }
  elsif (($tag eq 'ltx:XMTok') || ($tag eq 'ltx:XMText') || ($tag eq 'ltx:XMHint')) {
    return 0; }
  elsif ($tag eq 'ltx:XMRef') {
    # avoid infinite loops on malformed XMRefs that don't point anywhere
    if (!$node->getAttribute('idref')) {
      return 1; }
    else {
      my $real_node = realizeXMNode($node);
      # Note that some parses fail with an ARRAY ref [ltx:Error,...]
      # so realizeXMNode may fail!
      return 1 if (ref $real_node) !~ /^XML::LibXML/;
      return is_genuinely_unparsed($real_node, $document); } }
  elsif ($tag eq 'ltx:XMDual') {
    my ($content, $presentation) = element_nodes($node);
    return is_genuinely_unparsed($content, $document); }
  else {
    foreach my $child (element_nodes($node)) {
      return 1 if is_genuinely_unparsed($child, $document); }
    return 0; } }

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Cute! Were it NOT for Sub/Superscripts, the whole parsing process only
# builds a new superstructure around the sequence of token nodes in the input.
# Thus, any internal structure is unchanged.
#  They get re-parented, but if the parse fails, we've only got to put them
# BACK into the original node, to recover the original arrangment!!!
# Thus, we don't have to clone, and deal with namespace duplication.
# ...
# EXCEPT, as I said, for sub/superscripts!!!!
#

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Constructors used in grammar
# All the tree construction in the grammar should come through these operations.
# We avoid mucking with the actual XML nodes (both to avoid modifying the original
# tree until we have a successful parse, and to avoid XML::LibXML cloning nightmares)
# We are converting XML nodes to array representation: [$tag, {%attr},@children]
# This means any inspection of nodes has to recognize that
#  * node may be in XML vs ARRAY representation
#  * node may be an XMRef to another node whose properties are the ones we should use.
#
# Also, when we are examining a node's properties (roles, fences, script positioning, etc)
# we should be careful to check for XMRef indirection and examine the properties
# of the node that was referred to.
# HOWEVER, we should construct our parse tree using (a clone of) the XMRef node,
# rather than (a clone of) the referred to node, so as to preserve identity.
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# We're currently keeping the id's on the nodes as they get cloned,
# since they'll (maybe) replace the unparsed nodes.
# However, if we consider multiple parses or preserving both parsed & unparsed,
# we may have to do some adaptation and id shifting.
# ================================================================================

# ================================================================================
# Low-level accessors

lib/LaTeXML/MathParser.pm  view on Meta::CPAN

  return (defined $expect) && (defined $cname) && ($expect eq $cname); }

# Given a delimited sequence: open expr (punct expr)* close
# (OR, an empty sequence open close)
# Convert it into the appropriate thing, depending on the specific open & close used.
# Generate an XMDual to preserve any styling of delimiters and punctuation.
sub Fence {
  my (@stuff) = @_;
  # Peak at delimiters to guess what kind of construct this is.
  my $nargs = scalar(@stuff);
  Error("expected", "arguments", undef,
    "Even number of arguments to Fence(); should be of form open,expr,(punct,expr)*,close",
    "got " . join(' ', map { ToString($_) } @stuff))
    if ($nargs != 2) && (($nargs % 2) == 0);    # either empty or odd number
  my ($open, $close) = (realizeXMNode($stuff[0]), realizeXMNode($stuff[-1]));
  my $o  = p_getValue($open);
  my $c  = p_getValue($close);
  my $n  = int(($nargs - 2 + 1) / 2);
  my @p  = map { p_getValue(realizeXMNode(@stuff[2 * $_])) } 1 .. $n - 1;
  my $op = ($n == 0
    ? 'list'    # ?
    : ($n == 1
      ? $enclose1{ $o . '@' . $c }
      : ($n == 2
        ? ($enclose2{ $o . '@' . $p[0] . '@' . $c } || 'list')
        : ($encloseN{ $o . '@' . $p[0] . '@' . $c } || 'list'))));
  $op = 'delimited-' . $o . $c unless defined $op;
  my $decl_id = p_getAttribute($open, 'decl_id');
  if (($n == 1) && ($op eq 'delimited-()')) {    # Hopefully, can just ignore the parens?
    return ['ltx:XMDual', {},
      LaTeXML::Package::createXMRefs($LaTeXML::MathParser::DOCUMENT, $stuff[1]),
      ['ltx:XMWrap', {}, @stuff]]; }
  else {
    return InterpretDelimited(New($op, undef, ($decl_id ? (decl_id => $decl_id) : ())), @stuff); } }

# Compose a complex relational operator from two tokens, such as >=, >>
# (similar to CatSymbols, but specialized to relops)
sub TwoPartRelop {
  my ($op1, $op2) = @_;
  $op1 = Lookup($op1);
  $op2 = Lookup($op2);
  my $m1   = p_getTokenMeaning($op1);
  my $m2   = p_getTokenMeaning($op2);
  my $font = p_getAttribute($op1, '_font');
  my $meaning;
  if ($m1 eq $m2) {
    $meaning = "much-$m1"; }
  else {
    $meaning = "$m1-or-$m2"; }
  my $content = $op1->textContent . $op2->textContent;
  return ['ltx:XMTok', { role => "RELOP", meaning => $meaning, ($font ? (_font => $font) : ()) },
    $content]; }

# NOTE: It might be best to separate the multiple Formulae into separate XMath's???
# but only at the top level!
sub NewFormulae {
  my (@stuff) = @_;
  if (scalar(@stuff) == 1) {
    return $stuff[0]; }
  else {
    my ($seps, @formula) = extract_separators(@stuff);
    return ['ltx:XMDual', {},
      Apply(New('formulae'),
        LaTeXML::Package::createXMRefs($LaTeXML::MathParser::DOCUMENT, @formula)),
      ['ltx:XMWrap', {}, @stuff]]; } }

# A Formula is an alternation of expr (relationalop expr)*
# It presumably would be equivalent to (expr1 relop1 expr2) AND (expr2 relop2 expr3) ...
# But, I haven't figured out the ideal prefix form that can easily be converted to presentation.
sub NewFormula {
  my (@args) = @_;
  my $n = scalar(@args);
  if ($n == 1) {
    return $args[0]; }
  elsif ($n == 3) {
    return Apply($args[1], $args[0], $args[2]); }
  else {
    return Apply(New('multirelation'), @args); } }

sub NewList {
  my (@stuff) = @_;
  # drop placeholder token for missing trailing punct, if any
  if (scalar(@stuff) > 1 && ((p_getTokenMeaning($stuff[-1]) || '') eq 'absent')) {
    pop(@stuff); }
  if (@stuff == 1) {
    return $stuff[0]; }
  else {
    my ($seps, @items) = extract_separators(@stuff);
    return ['ltx:XMDual', {},
      Apply(New('list'),
        LaTeXML::Package::createXMRefs($LaTeXML::MathParser::DOCUMENT, @items)),
      ['ltx:XMWrap', {}, @stuff]]; } }

# Given alternation of expr (addop expr)*, compose the tree (left recursive),
# flattenning portions that have the same operator
# ie. a + b + c - d  =>  (- (+ a b c) d)
sub LeftRec {
  no warnings 'recursion';
  my ($arg1, @more) = @_;
  if (@more) {
    my $op     = shift(@more);
    my $opname = p_getTokenMeaning(realizeXMNode($op));
    my @args   = ($arg1, shift(@more));
    while (@more && isSameExpr($op, $more[0])) {
      ReplacedBy($more[0], $op, 1);
      shift(@more);
      push(@args, shift(@more)); }
    return LeftRec(Apply($op, @args), @more); }
  else {
    return $arg1; } }

# Like apply($op,$arg1,$arg2), but if $op is 'same' as the operator in $arg1,
# then combine as an nary apply of $op to $arg1's arguments and $arg2.
sub ApplyNary {
  my ($op, $arg1, $arg2) = @_;
  my $rop       = realizeXMNode($op);
  my $opname    = p_getTokenMeaning($rop) || '__undef_meaning__';
  my $opcontent = p_getValue($rop)        || '__undef_content__';
  my @args      = ();
  if (p_getQName($arg1) eq 'ltx:XMApp') {
    my ($op1, @args1) = p_element_nodes($arg1);
    my $rop1 = realizeXMNode($op1);
    if (isSameExpr($rop, $rop1)
      # Check that arg1 isn't wrapped, fenced or enclosed in some restrictive way
      # Especially an ID! (but really only important if the id is referenced somewhere?)
      && !(grep { p_getAttribute(realizeXMNode($arg1), $_) } qw(enclose xml:id))) {
      # Note that $op1 GOES AWAY!!!
      ReplacedBy($op1, $rop, 1);
      push(@args, @args1); }
    else {
      push(@args, $arg1); } }
  else {
    push(@args, $arg1); }
  return Apply($op, @args, $arg2); }

# Usually we just expect to compare a token + to another.
# but want (to some extent) to deal with embellished operators (eg. sub, sup...)
# Rather involved if we want to try to do it "Right".
sub isSameExpr {
  my ($op1, $op2) = @_;
  $op1 = realizeXMNode($op1);
  $op2 = realizeXMNode($op2);
  my $tag1 = getQName($op1);
  my $tag2 = getQName($op2);
  # Either both are tokens,
  # OR both have structure, but then we need to compare the children!!!!
  # First check, same top-level and critical attributes
  return unless
    ($tag1 eq $tag2)
    && ((p_getTokenMeaning($op1) || '__undef_meaning__')
    eq (p_getTokenMeaning($op2) || '__undef_meaning__'))
    && ((p_getValue($op1) || '__undef_content__')
    eq (p_getValue($op2) || '__undef_content__'))
    # Check that ops are used in same way.
    && ((p_getAttribute($op1, 'mathstyle') || '<none>')
    eq (p_getAttribute($op2, 'mathstyle') || '<none>'));
  if ($tag1 eq 'ltx:XMTok') { return 1; }    # If tokens, they match
  else {
    my @ch1 = p_element_nodes($op1);
    my @ch2 = p_element_nodes($op2);
    my $n   = scalar(@ch1);
    return unless $n == scalar(@ch2);
    foreach my $i (0 .. $n - 1) {
      return unless isSameExpr($ch1[$i], $ch1[$i]); }
    return 1; } }

# There are several cases where parsing a formula will rearrange nodes
# such that some nodes will no-longer be used.  For example, when
# converting a nested set of infix + into a single n-ary sum.
# In effect, all those excess +'s are subsumed by the single first one.
# It may be, however, that those lost nodes are referenced (XMRef) from the
# other branch of an XMDual, and those references should be updated to refer
# to the single node replacing the lost ones.
# This function records that replacement, and the top-level parser fixes up the tree.
# NOTE: There may be cases (in the Grammar, eg) where punctuation & ApplyOp's
# get lost completely? Watch out for this!
# If $isdup, assume the two trees are equivalent structure,
# and so mark corresponding internal nodes as "replaced" as well.
sub ReplacedBy {
  my ($lostnode, $keepnode, $isdup) = @_;
  return unless ref $lostnode && ref $keepnode;
  if (my $lostid = p_getAttribute($lostnode, 'xml:id')) {
    # Could be we want to generate an id for $keepnode, here?
    if (my $keepid = p_getAttribute($keepnode, 'xml:id')) {
      $$LaTeXML::MathParser::LOSTNODES{$lostid} = $keepid; }
    else {
      Warn('expected', 'id', undef, "LOST $lostid but no replacement!\n"); } }

  # The following recurses into the two trees,
  # This is on the assumption that they are "equivalent" trees
  if ($isdup) {
    my @lostkids = p_element_nodes($lostnode);
    my @keepkids = p_element_nodes($keepnode);
    my $n        = scalar(@lostkids);
    my $m        = scalar(@keepkids);
    if ($n != $m) {
      Error("unexpected", "nodes", undef, "Nodes aren't the same structure ($n vs $m)",
        "Old: " . printNode($lostnode),
        "New: " . printNode($keepnode)
      );
      $n = $m if $m < $n; }
    foreach my $i (0 .. $n - 1) {
      ReplacedBy($lostkids[$i], $keepkids[$i], $isdup); } }
  return; }

# ================================================================================
# Construct an appropriate application of sub/superscripts
# This accounts for script positioning:
#   Whether it precedes (float), is over/under (if base requests),
# or follows (normal case), along with whether sub/super.
#   the alignment of multiple sub/superscripts derived from the binding level when created.
# scriptpos = (pre|mod|post) number; where number is the binding-level.
# If $pos is given (pre|mid|post), it overrides the position implied by the script
sub NewScript {
  my ($base, $script, $pos) = @_;
  my $role;
  my $rbase   = realizeXMNode($base);
  my $rscript = realizeXMNode($script);
  my $ibase   = $rbase;
  # Get "inner" (content) base, if the base is a dual it may be more relevant
  if (p_getQName($rbase) eq 'ltx:XMDual') {
    ($ibase) = p_element_nodes($rbase); }
  elsif (p_getQName($rbase) eq 'ltx:XMApp') {
    my ($op, $a1, @rest) = p_element_nodes($rbase);
    if ((p_getQName($op) eq 'ltx:XMTok')
      && ((p_getAttribute($op, 'role') || '') =~ /^(FLOAT|POST)?(SUB|SUPER)SCRIPT(OP)?$/)) {
      $ibase = $op; } }

lib/LaTeXML/MathParser.pm  view on Meta::CPAN


=over 4

=item C<< $node = New($name,$content,%attributes); >>

Creates a new C<XMTok> node with given C<$name> (a string or undef),
and C<$content> (a string or undef) (but at least one of name or content should be provided),
and attributes.

=item C<< $node = Arg($node,$n); >>

Returns the C<$n>-th argument of an C<XMApp> node;
0 is the operator node.

=item C<< Annotate($node,%attributes); >>

Add attributes to C<$node>.

=item C<< $node = Apply($op,@args); >>

Create a new C<XMApp> node representing the application of the node
C<$op> to the nodes C<@args>.

=item C<< $node = ApplyDelimited($op,@stuff); >>

Create a new C<XMApp> node representing the application of the node
C<$op> to the arguments found in C<@stuff>.  C<@stuff> are
delimited arguments in the sense that the leading and trailing nodes
should represent open and close delimiters and the arguments are
separated by punctuation nodes.

=item C<< $node = InterpretDelimited($op,@stuff); >>

Similar to C<ApplyDelimited>, this interprets sequence of
delimited, punctuated items as being the application of C<$op> to those items.

=item C<< $node = recApply(@ops,$arg); >>

Given a sequence of operators and an argument, forms the nested
application C<op(op(...(arg)))>>.

=item C<< $node = InvisibleTimes; >>

Creates an invisible times operator.

=item C<< $boole = isMatchingClose($open,$close); >>

Checks whether C<$open> and C<$close> form a `normal' pair of
delimiters, or if either is ".".

=item C<< $node = Fence(@stuff); >>

Given a delimited sequence of nodes, starting and ending with open/close delimiters,
and with intermediate nodes separated by punctuation or such, attempt to guess what
type of thing is represented such as a set, absolute value, interval, and so on.

This would be a good candidate for customization!

=item C<< $node = NewFormulae(@stuff); >>

Given a set of formulas, construct a C<Formulae> application, if there are more than one,
else just return the first.

=item C<< $node = NewList(@stuff); >>

Given a set of expressions, construct a C<list> application, if there are more than one,
else just return the first.

=item C<< $node = LeftRec($arg1,@more); >>

Given an expr followed by repeated (op expr), compose the left recursive tree.
For example C<a + b + c - d> would give C<(- (+ a b c) d)>>

=item C<< MaybeFunction($token); >>

Note the possible use of C<$token> as a function, which may cause incorrect parsing.
This is used to generate warning messages.

=back

=head1 AUTHOR

Bruce Miller <bruce.miller@nist.gov>

=head1 COPYRIGHT

Public domain software, produced as part of work done by the
United States Government & not subject to copyright in the US.

=cut



( run in 0.827 second using v1.01-cache-2.11-cpan-437f7b0c052 )