LaTeXML
view release on metacpan or search on metacpan
lib/LaTeXML/MathParser.pm view on Meta::CPAN
# | United States Government & not subject to copyright in the US. | #
# |---------------------------------------------------------------------| #
# | Bruce Miller <bruce.miller@nist.gov> #_# | #
# | http://dlmf.nist.gov/LaTeXML/ (o o) | #
# \=========================================================ooo==U==ooo=/ #
# ================================================================================
# LaTeXML::MathParser Math Parser for LaTeXML using Parse::RecDescent.
# Parse the intermediate representation generated by the TeX processor.
# ================================================================================
package LaTeXML::MathParser;
use strict;
use warnings;
use Parse::RecDescent;
use LaTeXML::Global;
use LaTeXML::Common::Object;
use LaTeXML::Common::Error;
use LaTeXML::Core::Token;
use LaTeXML::Common::Font;
use LaTeXML::Common::XML;
use List::Util qw(min max);
use base (qw(Exporter));
our @EXPORT_OK = (qw(&Lookup &New &Absent &Apply &ApplyNary &recApply &CatSymbols
&Annotate &InvisibleTimes &InvisibleComma &MorphVertbar
&TwoPartRelop &NewFormulae &NewFormula &NewList
&ApplyDelimited &NewScript &DecorateOperator &InterpretDelimited &NewEvalAt
&LeftRec
&Arg &MaybeFunction
&SawNotation &IsNotationAllowed
&isMatchingClose &Fence));
our %EXPORT_TAGS = (constructors
=> [qw(&Lookup &New &Absent &Apply &ApplyNary &recApply &CatSymbols
&Annotate &InvisibleTimes &InvisibleComma &MorphVertbar
&TwoPartRelop &NewFormulae &NewFormula &NewList
&ApplyDelimited &NewScript &DecorateOperator &InterpretDelimited &NewEvalAt
&LeftRec
&Arg &MaybeFunction
&SawNotation &IsNotationAllowed
&isMatchingClose &Fence)]);
DebuggableFeature('recdescent', "Trace Parse::RecDescent");
our $MATHPARSE_PROGRESS_QUANTUM = 100;
# ================================================================================
sub new {
my ($class, %options) = @_;
require LaTeXML::MathGrammar;
my $internalparser = LaTeXML::MathGrammar->new();
Fatal("expected", "MathGrammar", undef,
"Compilation of Math Parser grammar failed") unless $internalparser;
my $self = bless { internalparser => $internalparser, lexematize => $options{lexematize} }, $class;
return $self; }
sub parseMath {
my ($self, $document, %options) = @_;
local $LaTeXML::MathParser::DOCUMENT = $document;
$self->clear; # Not reentrant!
$self->cleanupScripts($document);
if (my @math = $document->findnodes('descendant-or-self::ltx:XMath[not(ancestor::ltx:XMath)]')) {
my $proc = "Math Parsing " . scalar(@math) . " formulae ...";
ProgressSpinup($proc);
#### SEGFAULT TEST
#### $document->doctest("before parse",1);
foreach my $math (@math) {
$self->parse($math, $document); }
ProgressSpindown($proc);
NoteLog("Math parsing succeeded:"
. join('', map { "\n $_: "
. colorizeString($$self{passed}{$_} . "/" . ($$self{passed}{$_} + $$self{failed}{$_}), ($$self{failed}{$_} == 0 ? 'success' : 'warning')) }
grep { $$self{passed}{$_} + $$self{failed}{$_} }
keys %{ $$self{passed} }));
if (my @unk = keys %{ $$self{unknowns} }) {
NoteLog("Symbols assumed as simple identifiers (with # of occurences):\n "
. join(', ', map { "'" . colorizeString("$_", 'warning') . "' ($$self{unknowns}{$_})" } sort @unk));
if (!$STATE->lookupValue('MATHPARSER_SPECULATE')) {
NoteLog("Set MATHPARSER_SPECULATE to speculate on possible notations."); } }
if (my @funcs = keys %{ $$self{maybe_functions} }) {
NoteLog("Possibly used as functions?\n "
. join(', ', map { "'$_' ($$self{maybe_functions}{$_}/$$self{unknowns}{$_} usages)" }
sort @funcs)); }
}
#### SEGFAULT TEST
#### $document->doctest("IN scope",1);
#### SEGFAULT TEST
#### $document->doctest("OUT of scope",1);
return $document; }
# This is a rather peculiar cleanup that needs to be done to manage ids & idrefs
# Before parsing, sub/superscripts are represented by an operator-less XMApp with the script
# itself as the only child. Ideally, upon parsing these get merged, combined and disappear into
# proper XMApp of an appropriate operator on the base and scripts. Unless there is a parse
# failure, in which case they remain.
# The problem comes from various XMDual constructs where an XMRef refers to the script XMApp.
# It can occur that one branch parses and the other fails: This can leave a reference to
# the script XMApp which no longer exists!
# To solve this, we find & replace all references to such script XMApps by an explicit XMApp
# with the XMRef refering to the script itself, not the XMApp. (make sense?)
sub cleanupScripts {
my ($self, $document) = @_;
foreach my $app ($document->findnodes(
'descendant-or-self::ltx:XMApp[@xml:id and contains(@role,"SCRIPT")]')) {
my $role = $app->getAttribute('role');
my $appid = $app->getAttribute('xml:id');
if ($role =~ /^(?:PRE|POST|FLOAT)(:?SUB|SUPER)SCRIPT$/) {
my @refs = $document->findnodes("descendant-or-self::ltx:XMRef[\@idref = '$appid']");
if (scalar(@refs)) {
my $script = $app->firstChild;
my ($scriptref) = LaTeXML::Package::createXMRefs($document, $script);
$document->unRecordID($appid); # no longer refers to the app
$app->removeAttribute('xml:id');
# Copy all attributes, EXCEPT xml:id
my %attr = map { (getQName($_) => $_->getValue) }
grep { $_->nodeType == XML_ATTRIBUTE_NODE } $app->attributes;
# Now, replace each ref to the script application by an application to a ref to the script.
foreach my $ref (@refs) {
$document->replaceTree(['ltx:XMApp', {%attr}, $scriptref], $ref); }
} } }
return; }
lib/LaTeXML/MathParser.pm view on Meta::CPAN
elsif ($name = $node->textContent) {
my $font = $LaTeXML::MathParser::DOCUMENT->getNodeFont($node);
my %attr = $font->relativeTo(LaTeXML::Common::Font->textDefault);
my $desc = join(' ', map { ToString($attr{$_}{value}) }
(grep { !$EXCLUDED_PRETTYNAME_ATTRIBUTES{$_} } (sort keys %attr)));
$name .= "{$desc}" if $desc; }
else {
$name = Stringify($node); } # what else ????
return $name; }
sub note_unknown {
my ($self, $node) = @_;
my $name = token_prettyname($node);
$$self{unknowns}{$name}++;
return; }
# debugging utility, should be somewhere handy.
sub printNode {
my ($node) = @_;
if (ref $node eq 'ARRAY') {
my ($tag, $attr, @children) = @$node;
my @keys = sort keys %$attr;
return "<$tag"
. (@keys ? ' ' . join(' ', map { "$_='" . ($$attr{$_} || '') . "'" } @keys) : '')
. (@children
? ">\n" . join('', map { printNode($_) } @children) . "</$tag>"
: '/>')
. "\n"; }
else {
return ToString($node); } }
# ================================================================================
# Some more XML utilities, but math specific (?)
# Get the Token's meaning, else name, else content, else role
sub getTokenMeaning {
my ($node) = @_;
my $x;
$node = realizeXMNode($node);
return (defined($x = p_getAttribute($node, 'meaning')) ? $x
: (defined($x = p_getAttribute($node, 'name')) ? $x
: (($x = (ref $node eq 'ARRAY' ? '' : $node->textContent)) ne '' ? $x
: (defined($x = p_getAttribute($node, 'role')) ? $x
: undef)))); }
sub node_location {
my ($node) = @_;
my $n = $node;
while ($n && (ref $n !~ /^XML::LibXML::Document/) # Sometimes DocuementFragment ???
&& !$n->getAttribute('refnum') && !$n->getAttribute('labels')) {
$n = $n->parentNode; }
if ($n && (ref $n !~ /^XML::LibXML::Document/)) {
my ($r, $l) = ($n->getAttribute('refnum'), $n->getAttribute('labels'));
return ($r && $l ? "$r ($l)" : $r || $l); }
else {
return 'Unknown'; } }
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Parser
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Top-level per-formula parse.
# We do a depth-first traversal of the content of the XMath element,
# since various sub-elements (XMArg & XMWrap) act as containers of
# nominally complete subexpressions.
# We do these first for two reasons.
# Firstly, since after parsing, the parent will be rebuilt from the result,
# we lose the node "identity"; ie. we can't find the child to replace it!
# Secondly, in principle (although this isn't used yet), parsing the
# child could reveal something interesting about it; say, it's effective role.
# Then, this information could be used when parsing the parent.
# In fact, this could work the other way too; parsing the parent could tell
# us something about what the child must be....
sub parse {
my ($self, $xnode, $document) = @_;
local $LaTeXML::MathParser::STRICT = 1;
local $LaTeXML::MathParser::WARNED = 0;
local $LaTeXML::MathParser::UNPARSED = 0;
local $LaTeXML::MathParser::XNODE = $xnode;
local $LaTeXML::MathParser::PUNCTUATION = {};
local $LaTeXML::MathParser::LOSTNODES = {};
local %LaTeXML::MathParser::IDREFS = ();
# This bit for debugging....
foreach my $n ($document->findnodes("descendant-or-self::*[\@xml:id]", $xnode)) {
my $id = $n->getAttribute('xml:id');
$LaTeXML::MathParser::IDREFS{$id} = $n; }
if ($$self{lexematize}) {
my $lexeme_form = $self->node_to_lexeme_full($xnode);
$lexeme_form =~ s/^\s+//;
$lexeme_form =~ s/\s+$//;
$lexeme_form =~ s/\s+/ /g; # normalize internal whitespaces
$xnode->parentNode->setAttribute('lexemes', $lexeme_form);
}
if (my $result = $self->parse_rec($xnode, 'Anything,', $document)) {
# Add text representation to the containing Math element.
my $p = $xnode->parentNode;
# This is a VERY screwy situation? How can the parent be a document fragment??
# This has got to be a LibXML bug???
if ($p->nodeType == XML_DOCUMENT_FRAG_NODE) {
my @n = $p->childNodes;
if (scalar(@n) == 1) {
$p = $n[0]; }
else {
Fatal('malformed', '<XMath>', $xnode, "XMath node has DOCUMENT_FRAGMENT for parent!"); } }
# HACK: replace XMRef's to stray trailing punctution
foreach my $id (keys %$LaTeXML::MathParser::PUNCTUATION) {
my $r = $$LaTeXML::MathParser::PUNCTUATION{$id}->cloneNode;
$r->removeAttribute('xml:id');
foreach my $n ($document->findnodes("descendant-or-self::ltx:XMRef[\@idref='$id']", $p)) {
$document->replaceTree($r, $n); } }
foreach my $id (keys %$LaTeXML::MathParser::LOSTNODES) {
my $repid = $$LaTeXML::MathParser::LOSTNODES{$id};
# but the replacement my have been replaced as well!
while (my $reprepid = $$LaTeXML::MathParser::LOSTNODES{$repid}) {
$repid = $reprepid; }
if ($document->findnodes("descendant-or-self::*[\@xml:id='$id']")
&& !$document->findnodes("descendant-or-self::*[\@xml:id='$repid']")) {
# Do nothing if the node never actually got replaced (parse ultimately failed?)
}
else {
foreach my $n ($document->findnodes("descendant-or-self::ltx:XMRef[\@idref='$id']", $p)) {
$document->setAttribute($n, idref => $repid); } } }
lib/LaTeXML/MathParser.pm view on Meta::CPAN
# ($item,@more); }
# sub parse_kludge_reca {
# my($self,$next,@more)=@_;
# my $role = $self->getGrammaticalRole($next);
# if($role =~ /^FLOAT(SUB|SUPER)SCRIPT$/){
# my($base,@rest) = $self->parse_kludge_rec(@more);
# (NewScript($base,$next),@rest); }
# elsif($role eq 'OPEN'){
# my($open,$close,$seps,$list,@more)=$self->parse_kludge_fence($next,@more);
# (Apply(Annotate(New(undef,undef,role=>'FENCED'),
# argopen=>$open, argclose=>$close, separators=>$seps),@$list), @more); }
# else {
# ($next,@more); }}
# sub parse_kludge_fence {
# my($self,$next,@more)=@_;
# my($open,$close,$punct,$r,$item,@list)=($next,undef,'',undef);
# while(@more){
# my @i=();
# while(@more && (($r=($self->getGrammaticalRole($more[0])||'')) !~ /^(CLOSE|PUNCT)$/)){
# ($item,@more)=$self->parse_kludge_rec(@more);
# push(@i,$item); }
# push(@list,(scalar(@i > 1) ? ['ltx:XMWrap',{},@i] : $i[0]));
# if($r eq 'CLOSE'){
# $close=shift(@more); last; }
# else {
# $punct .= ($punct ? ' ':''). p_getValue(shift(@more)); }} # Delimited by SINGLE SPACE!
# ($open,$close,$punct,[@list],@more); }
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Low-level Parser: parse a single expression
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Convert to textual form for processing by MathGrammar
sub parse_single {
my ($self, $mathnode, $document, $rule) = @_;
my @nodes = $self->filter_hints($document, $mathnode->childNodes);
my ($punct, $result, $unparsed);
my @punct = ();
# Extract trailing punctuation, if rule allows it.
if ($rule =~ s/,$//) {
# Collect ALL trailing PUNCT|PERIOD's...
my ($x, $r);
while (($x = $nodes[-1]) && ($x = realizeXMNode($x))
&& ($r = $x->getAttribute('role')) && (($r eq 'PUNCT') || ($r eq 'PERIOD'))) {
my $p = pop(@nodes);
# Special case hackery, in case this thing is XMRef'd!!!
# We could just stick it on the end of the presentation,
# but it doesn't belong in the content at all!?!?
if (my $id = $p->getAttribute('xml:id')) {
$$LaTeXML::MathParser::PUNCTUATION{$id} = $p; }
unshift(@punct, $p); }
}
if ($LaTeXML::DEBUG{recdescent}) {
$::RD_TRACE = 1; # Turn on MathGrammar tracing
# my $box = $document->getNodeBox($LaTeXML::MathParser::XNODE);
my $box = $document->getNodeBox($mathnode);
Debug(('=' x 60) .
"\nParsing formula \"" . ToString($box) . "\""
. "\n from " . ToString($box->getLocator)
. "\n == \"" . join(' ', map { node_string($_, $document) } @nodes) . "\""
. "\n == " . ToString($mathnode)); }
if (scalar(@nodes) < 2) { # Too few nodes? What's to parse?
$result = $nodes[0] || Absent(); }
else {
# Now do the actual parse.
($result, $unparsed) = $self->parse_internal($rule, @nodes);
}
# Failure? No result or uparsed lexemes remain.
# NOTE: Should do script hack??
if ((!defined $result) || $unparsed) {
$LaTeXML::MathParser::UNPARSED = 1;
$self->failureReport($document, $mathnode, $rule, $unparsed, @nodes);
return; }
# Success!
else {
if (@punct) { # create a trivial XMDual to treat the punctuation as presentation
$result = ['ltx:XMDual', {},
LaTeXML::Package::createXMRefs($document, $result),
['ltx:XMWrap', {}, $result, @punct]]; } # or perhaps: Apply, punctuated???
if ($LaTeXML::DEBUG{recdescent}) {
Debug("=>" . printNode($result) . "\n" . ('=' x 60)); }
return $result; } }
sub node_to_lexeme {
my ($self, $node) = @_;
my $qname = getQName($node);
my $is_inked = 1;
my $lexeme = ($qname eq 'ltx:XMTok' or $qname eq 'ltx:text') && $node->textContent;
if (!defined($lexeme) || length($lexeme) == 0) {
$is_inked = 0;
$lexeme = getTokenMeaning($node); }
my $document = $LaTeXML::MathParser::DOCUMENT;
return unless defined $lexeme;
if ($is_inked) {
if (my $font = $node->getAttribute('_font')) {
my $font_spec = $document->decodeFont($font);
if (my %declarations = $font_spec && $font_spec->relativeTo(LaTeXML::Common::Font->textDefault)) {
my %to_add = ();
my $font_pending = $declarations{font} || {};
my $properties_pending = $$font_pending{properties} || {};
foreach my $attr (qw(family series shape)) {
if (my $value = $$properties_pending{$attr}) {
$to_add{$value} = 1; } }
if (%to_add) {
my $prefix = join("_", sort(keys(%to_add))) . "_";
$lexeme = join(" ", map { $prefix . $_ } split(' ', $lexeme)); }
} }
if ($lexeme =~ /^\p{L}+$/) {
$lexeme = join(" ", map { 'roman_' . $_ } split(' ', $lexeme)); } }
$lexeme =~ s/\s+$//;
return $lexeme; }
sub node_to_lexeme_full {
my ($self, $unrealized_node) = @_;
my $node = realizeXMNode($unrealized_node);
my $tag = getQName($node);
if ($tag eq 'ltx:XMHint') { return ""; } # just skip XMHints, they don't contain lexemes
my $role = p_getAttribute($node, 'role');
if ($tag eq 'ltx:XMTok' || ($role && ($tag !~ 'ltx:XM(Dual|App|Arg|Array|Wrap|ath)'))) {
# Elements that directly represent a lexeme, or intended operation with a syntactic role (such as a postscript),
# can proceed to building the lexeme from the leaf node.
my $lexeme = $self->node_to_lexeme($node);
if ($role && $role =~ /^(UNDER|OVER)ACCENT$/) {
# over¯ and under¯ are the lexeme names of choice for \overline and \underline
$lexeme = lc($1) . $lexeme; }
return $lexeme; }
# Elements that do not have a role and are intermediate "may" need an argument wrapper, so that arguments
# remain unambiguous. For instance a `\frac{a}{b}` has clear argument structure to be preserved.
my ($mark_start, $mark_end) = ('', '');
if ($tag ne 'ltx:XMath') {
if ($role) {
$mark_start = "start_$role ";
$mark_end = " end_$role";
} elsif ($tag =~ '^ltx:XM(Arg|Row|Cell)') {
my $tag_role = uc($1);
$mark_start = "start_$tag_role ";
$mark_end = " end_$tag_role";
}
}
my $lexemes = $mark_start;
if ($tag eq 'ltx:XMDual') {
$lexemes .= $self->node_to_lexeme_full($LaTeXML::MathParser::DOCUMENT->getSecondChildElement($node)); }
elsif ($tag eq 'ltx:XMText') {
# can be either a single node XMText, we're looking at a leaf text node
# or \text{}-like construct, with multiple math formulas and interleaved text
foreach my $child ($node->childNodes) {
if (ref($child) eq 'XML::LibXML::Text') {
$lexemes .= $child->textContent() . ' '; }
else {
my $child_lexeme = $self->node_to_lexeme_full($child);
if (defined $child_lexeme && length($child_lexeme) > 0) {
$lexemes .= $child_lexeme . ' '; } } } }
elsif ($tag eq 'ltx:text') { # could recurse in from ltx:XMText
return $self->node_to_lexeme($node); }
else {
my @child_elements = element_nodes($node);
# skip through single child wrappers (don't serialize)
while (scalar(@child_elements) == 1 && getQName($child_elements[0]) =~ /^ltx:XM(Arg|Wrap)$/) {
@child_elements = element_nodes($child_elements[0]);
}
foreach my $child (@child_elements) {
my $child_lexeme = $self->node_to_lexeme_full($child);
if (defined $child_lexeme && length($child_lexeme) > 0) {
$lexemes .= $child_lexeme . ' ';
} } }
$lexemes .= $mark_end;
return $lexemes; }
sub parse_internal {
my ($self, $rule, @nodes) = @_;
#------------
# Generate a textual token for each node; The parser operates on this encoded string.
local $LaTeXML::MathParser::LEXEMES = {};
my $i = 0;
my $lexemes = '';
foreach my $node (@nodes) {
# This is a parser-specific lexeme, but it is not (yet) identical to the serialized lexeme by
# ->node_to_lexeme, which is currently experimental
my $role = $self->getGrammaticalRole($node);
my $text = getTokenMeaning($node);
$text = 'Unknown' unless defined $text;
my $lexeme = $role . ":" . $text . ":" . ++$i;
$lexeme =~ s/\s//g;
$$LaTeXML::MathParser::LEXEMES{$lexeme} = $node;
$lexemes .= ' ' . $lexeme; }
#------------
# apply the parser to the textified sequence.
local $LaTeXML::MathParser::PARSER = $self;
local %LaTeXML::MathParser::SEEN_NOTATIONS = ();
local %LaTeXML::MathParser::DISALLOWED_NOTATIONS = ();
local $LaTeXML::MathParser::MAX_ABS_DEPTH = 1;
# "Speculative" mode is disabled by default (set MATHPARSER_SPECULATE).
# It causes the parser to explore possible undeclared notatations,
# like possible functions (the only one, so far)
# These extra grammar rules can be costly in corner cases.
# This should be disabled when speed (but feedback) is essential
if ($STATE->lookupValue('MATHPARSER_SPECULATE')) {
$LaTeXML::MathParser::DISALLOWED_NOTATIONS{MaybeFunctions} = 0; }
else {
$LaTeXML::MathParser::DISALLOWED_NOTATIONS{MaybeFunctions} = 1; }
my $unparsed = $lexemes;
my $result = $$self{internalparser}->$rule(\$unparsed);
if (((!defined $result) || $unparsed) # If parsing Failed
&& $LaTeXML::MathParser::SEEN_NOTATIONS{QM}) { # & Saw some QM stuff.
$LaTeXML::MathParser::DISALLOWED_NOTATIONS{QM} = 1; # Retry w/o QM notations
$unparsed = $lexemes;
$result = $$self{internalparser}->$rule(\$unparsed); }
while (((!defined $result) || $unparsed) # If parsing Failed
&& ($LaTeXML::MathParser::SEEN_NOTATIONS{AbsFail}) # & Attempted deeper abs nesting?
&& ($LaTeXML::MathParser::MAX_ABS_DEPTH < 3)) { # & Not ridiculously deep
delete $LaTeXML::MathParser::SEEN_NOTATIONS{AbsFail};
++$LaTeXML::MathParser::MAX_ABS_DEPTH; # Try deeper.
$unparsed = $lexemes;
$result = $$self{internalparser}->$rule(\$unparsed); }
# If still failed, try other strategies?
return ($result, $unparsed); }
sub getGrammaticalRole {
my ($self, $node) = @_;
$node = realizeXMNode($node);
my $role = p_getAttribute($node, 'role');
if (!defined $role) {
my $tag = getQName($node);
if ($tag eq 'ltx:XMTok') {
$role = 'UNKNOWN'; }
elsif ($tag eq 'ltx:XMDual') {
my ($content, $presentation) = element_nodes($node);
$role = p_getAttribute($content, 'role') || p_getAttribute($presentation, 'role'); }
$role = 'ATOM' unless defined $role; }
$self->note_unknown($node) if ($role eq 'UNKNOWN') && $LaTeXML::MathParser::STRICT;
return $role; }
# How many tokens before & after the failure point to report in the Warning message.
my $FAILURE_PRETOKENS = 3; # [CONSTANT]
my $FAILURE_POSTTOKENS = 1; # [CONSTANT]
sub failureReport {
my ($self, $document, $mathnode, $rule, $unparsed, @nodes) = @_;
if ($LaTeXML::MathParser::STRICT || (($LaTeXML::Common::Error::VERBOSITY || 0) > 1)) {
my $loc = "";
# If we haven't already done it for this formula, show the original TeX.
if (!$LaTeXML::MathParser::WARNED) {
$LaTeXML::MathParser::WARNED = 1;
my $box = $document->getNodeBox($LaTeXML::MathParser::XNODE);
$loc = "In \"" . UnTeX($box) . "\""; }
$unparsed =~ s/^\s*//;
my @rest = split(/ /, $unparsed);
my $pos = scalar(@nodes) - scalar(@rest);
# Break up the input at the point where the parse failed.
my $max = 50;
my $parsed = join(' ', ($pos > $max ? ('...') : ()),
(map { node_string($_, $document) } @nodes[max(0, $pos - 50) .. $pos - 1]));
my $toparse = join(' ',
(map { node_string($_, $document) } @nodes[$pos .. min($pos + 50, $#nodes)]),
($#nodes > $pos + 50 ? ('...') : ()));
my $parsefail
= join('.', map { $self->getGrammaticalRole($_) }
@nodes[($pos - $FAILURE_PRETOKENS >= 0
? $pos - $FAILURE_PRETOKENS : 0) .. $pos - 1])
. ">"
. join('.', map { $self->getGrammaticalRole($_) }
@nodes[$pos .. ($pos + $FAILURE_POSTTOKENS - 1 < $#nodes
? $pos + $FAILURE_POSTTOKENS - 1 : $#nodes)]);
my $lexeme = node_location($nodes[$pos] || $nodes[$pos - 1] || $mathnode);
my $indent = length($parsed) - 2; $indent = 8 if $indent > 8;
Warn('not_parsed', $parsefail, $mathnode,
"MathParser failed to match rule '$rule'",
($loc ? ($loc) : ()),
($parsed
? ($parsed, (' ' x $indent) . "> " . $toparse)
: ("> " . $toparse)));
}
return; }
# used for debugging & failure reporting.
sub node_string {
my ($node, $document) = @_;
my $role = $node->getAttribute('role') || 'UNKNOWN';
my $box = $document->getNodeBox($node);
return ($box ? ToString($box) : text_form($node)) . "[[$role]]"; }
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Conversion to a less ambiguous, mostly-prefix form.
# Mostly for debugging information?
# Note that the nodes are true libXML nodes, already absorbed into the document
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
sub text_form {
my ($node) = @_;
return textrec($node, undef); }
my %PREFIX_ALIAS = ( # [CONSTANT]
SUPERSCRIPTOP => '^', SUBSCRIPTOP => '_', times => '*',
'equals' => '=', 'less-than' => '<', 'greater-than' => '>',
'less-than-or-equals' => '<=', 'greater-than-or-equals' => '>=',
'much-less-than' => '<<', 'much-greater-than' => '>>',
'plus' => '+', 'minus' => '-', 'divide' => '/');
# Put infix, along with `binding power'
my %IS_INFIX = (METARELOP => 1, # [CONSTANT]
RELOP => 2, ARROW => 2,
ADDOP => 10, MULOP => 100, FRACOP => 100,
SUPERSCRIPTOP => 1000, SUBSCRIPTOP => 1000);
lib/LaTeXML/MathParser.pm view on Meta::CPAN
if ($app_role && $app_role =~ /^FLOAT(SUB|SUPER)SCRIPT$/) {
return ($1 eq 'SUPER' ? '^' : '_') . textrec($op); }
else {
my $name = ((getQName($op) eq 'ltx:XMTok') && getTokenMeaning($op)) || 'unknown';
my ($bp, $string) = textrec_apply($name, $op, @args);
return (($bp < $outer_bp) || (($bp == $outer_bp) && ($name ne $outer_name))
? '(' . $string . ')' : $string); } }
elsif ($tag eq 'ltx:XMDual') {
my ($content, $presentation) = element_nodes($node);
my $text = textrec($content, $outer_bp, $outer_name); # Just send out the semantic form.
# Fall back to presentation, if content has poor semantics (eg. from replacement patterns)
return ($text =~ /^\(*Unknown/ ? textrec($presentation, $outer_bp, $outer_name) : $text); }
elsif ($tag eq 'ltx:XMTok') {
my $name = getTokenMeaning($node);
$name = 'Unknown' unless defined $name;
return $PREFIX_ALIAS{$name} || $name; }
elsif (($tag eq 'ltx:XMWrap') || ($tag eq 'ltx:XMCell')) {
# ??
return join('@', map { textrec($_) } element_nodes($node)); }
elsif ($tag eq 'ltx:XMArray') {
return textrec_array($node); }
else {
return '[' . (p_getValue($node) || '') . ']'; } }
sub textrec_apply {
no warnings 'recursion';
my ($name, $op, @args) = @_;
my $role = ((ref $op ne 'ARRAY') && $op->getAttribute('role')) || 'Unknown';
if (($role =~ /^(SUB|SUPER)SCRIPTOP$/) && (($op->getAttribute('scriptpos') || '') =~ /^pre\d+$/)) {
# Note that this will likely get parenthesized due to high bp
return (5000, textrec($op) . " " . textrec($args[1]) . " " . textrec($args[0])); }
elsif (my $bp = $IS_INFIX{$role}) {
# A sub/superscript with a meaning probably should be prefix
if (($role =~ /^(SUB|SUPER)SCRIPTOP$/) && $op->getAttribute('meaning')) {
return (500, textrec($op, 10000, $name) . '@(' . join(', ', map { textrec($_) } @args) . ')') }
else { # Format as infix.
return ($bp, (scalar(@args) == 1 # unless a single arg; then prefix.
? textrec($op) . ' ' . textrec($args[0], $bp, $name)
: join(' ' . textrec($op) . ' ', map { textrec($_, $bp, $name) } @args))); } }
elsif ($role eq 'POSTFIX') {
return (10000, textrec($args[0], 10000, $name) . textrec($op)); }
elsif ($name eq 'multirelation') {
return (2, join(' ', map { textrec($_, 2, $name) } @args)); }
else {
return (500, textrec($op, 10000, $name) . '@(' . join(', ', map { textrec($_) } @args) . ')'); } }
sub textrec_array {
my ($node) = @_;
my $name = $node->getAttribute('meaning') || $node->getAttribute('name') || 'Array';
my @rows = ();
foreach my $row (element_nodes($node)) {
push(@rows, '[' . join(', ', map { ($_->firstChild ? textrec($_->firstChild) : '') } element_nodes($row)) . ']'); }
return $name . '[' . join(', ', @rows) . ']'; }
sub is_genuinely_unparsed {
my ($node, $document) = @_;
# any unparsed fragment should be considered legitimate with one exception
# author-provided ungrammatical snippets in the presentation branches of XMDual
# are allowed to fail the parse process.
#
# For now a reliable way of if we are in that case is to descend the formula through
# the content branch of XMDual and check if any node has an "unparsed" mark.
# Then only genuine parse failures will be detected.
my $tag = $document->getNodeQName($node);
if (($tag eq 'ltx:XMWrap') || ($tag eq 'ltx:XMArg') || $node->hasAttribute('_unparsed')) {
return 1; }
elsif (($tag eq 'ltx:XMTok') || ($tag eq 'ltx:XMText') || ($tag eq 'ltx:XMHint')) {
return 0; }
elsif ($tag eq 'ltx:XMRef') {
# avoid infinite loops on malformed XMRefs that don't point anywhere
if (!$node->getAttribute('idref')) {
return 1; }
else {
my $real_node = realizeXMNode($node);
# Note that some parses fail with an ARRAY ref [ltx:Error,...]
# so realizeXMNode may fail!
return 1 if (ref $real_node) !~ /^XML::LibXML/;
return is_genuinely_unparsed($real_node, $document); } }
elsif ($tag eq 'ltx:XMDual') {
my ($content, $presentation) = element_nodes($node);
return is_genuinely_unparsed($content, $document); }
else {
foreach my $child (element_nodes($node)) {
return 1 if is_genuinely_unparsed($child, $document); }
return 0; } }
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Cute! Were it NOT for Sub/Superscripts, the whole parsing process only
# builds a new superstructure around the sequence of token nodes in the input.
# Thus, any internal structure is unchanged.
# They get re-parented, but if the parse fails, we've only got to put them
# BACK into the original node, to recover the original arrangment!!!
# Thus, we don't have to clone, and deal with namespace duplication.
# ...
# EXCEPT, as I said, for sub/superscripts!!!!
#
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Constructors used in grammar
# All the tree construction in the grammar should come through these operations.
# We avoid mucking with the actual XML nodes (both to avoid modifying the original
# tree until we have a successful parse, and to avoid XML::LibXML cloning nightmares)
# We are converting XML nodes to array representation: [$tag, {%attr},@children]
# This means any inspection of nodes has to recognize that
# * node may be in XML vs ARRAY representation
# * node may be an XMRef to another node whose properties are the ones we should use.
#
# Also, when we are examining a node's properties (roles, fences, script positioning, etc)
# we should be careful to check for XMRef indirection and examine the properties
# of the node that was referred to.
# HOWEVER, we should construct our parse tree using (a clone of) the XMRef node,
# rather than (a clone of) the referred to node, so as to preserve identity.
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# We're currently keeping the id's on the nodes as they get cloned,
# since they'll (maybe) replace the unparsed nodes.
# However, if we consider multiple parses or preserving both parsed & unparsed,
# we may have to do some adaptation and id shifting.
# ================================================================================
# ================================================================================
# Low-level accessors
lib/LaTeXML/MathParser.pm view on Meta::CPAN
return (defined $expect) && (defined $cname) && ($expect eq $cname); }
# Given a delimited sequence: open expr (punct expr)* close
# (OR, an empty sequence open close)
# Convert it into the appropriate thing, depending on the specific open & close used.
# Generate an XMDual to preserve any styling of delimiters and punctuation.
sub Fence {
my (@stuff) = @_;
# Peak at delimiters to guess what kind of construct this is.
my $nargs = scalar(@stuff);
Error("expected", "arguments", undef,
"Even number of arguments to Fence(); should be of form open,expr,(punct,expr)*,close",
"got " . join(' ', map { ToString($_) } @stuff))
if ($nargs != 2) && (($nargs % 2) == 0); # either empty or odd number
my ($open, $close) = (realizeXMNode($stuff[0]), realizeXMNode($stuff[-1]));
my $o = p_getValue($open);
my $c = p_getValue($close);
my $n = int(($nargs - 2 + 1) / 2);
my @p = map { p_getValue(realizeXMNode(@stuff[2 * $_])) } 1 .. $n - 1;
my $op = ($n == 0
? 'list' # ?
: ($n == 1
? $enclose1{ $o . '@' . $c }
: ($n == 2
? ($enclose2{ $o . '@' . $p[0] . '@' . $c } || 'list')
: ($encloseN{ $o . '@' . $p[0] . '@' . $c } || 'list'))));
$op = 'delimited-' . $o . $c unless defined $op;
my $decl_id = p_getAttribute($open, 'decl_id');
if (($n == 1) && ($op eq 'delimited-()')) { # Hopefully, can just ignore the parens?
return ['ltx:XMDual', {},
LaTeXML::Package::createXMRefs($LaTeXML::MathParser::DOCUMENT, $stuff[1]),
['ltx:XMWrap', {}, @stuff]]; }
else {
return InterpretDelimited(New($op, undef, ($decl_id ? (decl_id => $decl_id) : ())), @stuff); } }
# Compose a complex relational operator from two tokens, such as >=, >>
# (similar to CatSymbols, but specialized to relops)
sub TwoPartRelop {
my ($op1, $op2) = @_;
$op1 = Lookup($op1);
$op2 = Lookup($op2);
my $m1 = p_getTokenMeaning($op1);
my $m2 = p_getTokenMeaning($op2);
my $font = p_getAttribute($op1, '_font');
my $meaning;
if ($m1 eq $m2) {
$meaning = "much-$m1"; }
else {
$meaning = "$m1-or-$m2"; }
my $content = $op1->textContent . $op2->textContent;
return ['ltx:XMTok', { role => "RELOP", meaning => $meaning, ($font ? (_font => $font) : ()) },
$content]; }
# NOTE: It might be best to separate the multiple Formulae into separate XMath's???
# but only at the top level!
sub NewFormulae {
my (@stuff) = @_;
if (scalar(@stuff) == 1) {
return $stuff[0]; }
else {
my ($seps, @formula) = extract_separators(@stuff);
return ['ltx:XMDual', {},
Apply(New('formulae'),
LaTeXML::Package::createXMRefs($LaTeXML::MathParser::DOCUMENT, @formula)),
['ltx:XMWrap', {}, @stuff]]; } }
# A Formula is an alternation of expr (relationalop expr)*
# It presumably would be equivalent to (expr1 relop1 expr2) AND (expr2 relop2 expr3) ...
# But, I haven't figured out the ideal prefix form that can easily be converted to presentation.
sub NewFormula {
my (@args) = @_;
my $n = scalar(@args);
if ($n == 1) {
return $args[0]; }
elsif ($n == 3) {
return Apply($args[1], $args[0], $args[2]); }
else {
return Apply(New('multirelation'), @args); } }
sub NewList {
my (@stuff) = @_;
# drop placeholder token for missing trailing punct, if any
if (scalar(@stuff) > 1 && ((p_getTokenMeaning($stuff[-1]) || '') eq 'absent')) {
pop(@stuff); }
if (@stuff == 1) {
return $stuff[0]; }
else {
my ($seps, @items) = extract_separators(@stuff);
return ['ltx:XMDual', {},
Apply(New('list'),
LaTeXML::Package::createXMRefs($LaTeXML::MathParser::DOCUMENT, @items)),
['ltx:XMWrap', {}, @stuff]]; } }
# Given alternation of expr (addop expr)*, compose the tree (left recursive),
# flattenning portions that have the same operator
# ie. a + b + c - d => (- (+ a b c) d)
sub LeftRec {
no warnings 'recursion';
my ($arg1, @more) = @_;
if (@more) {
my $op = shift(@more);
my $opname = p_getTokenMeaning(realizeXMNode($op));
my @args = ($arg1, shift(@more));
while (@more && isSameExpr($op, $more[0])) {
ReplacedBy($more[0], $op, 1);
shift(@more);
push(@args, shift(@more)); }
return LeftRec(Apply($op, @args), @more); }
else {
return $arg1; } }
# Like apply($op,$arg1,$arg2), but if $op is 'same' as the operator in $arg1,
# then combine as an nary apply of $op to $arg1's arguments and $arg2.
sub ApplyNary {
my ($op, $arg1, $arg2) = @_;
my $rop = realizeXMNode($op);
my $opname = p_getTokenMeaning($rop) || '__undef_meaning__';
my $opcontent = p_getValue($rop) || '__undef_content__';
my @args = ();
if (p_getQName($arg1) eq 'ltx:XMApp') {
my ($op1, @args1) = p_element_nodes($arg1);
my $rop1 = realizeXMNode($op1);
if (isSameExpr($rop, $rop1)
# Check that arg1 isn't wrapped, fenced or enclosed in some restrictive way
# Especially an ID! (but really only important if the id is referenced somewhere?)
&& !(grep { p_getAttribute(realizeXMNode($arg1), $_) } qw(enclose xml:id))) {
# Note that $op1 GOES AWAY!!!
ReplacedBy($op1, $rop, 1);
push(@args, @args1); }
else {
push(@args, $arg1); } }
else {
push(@args, $arg1); }
return Apply($op, @args, $arg2); }
# Usually we just expect to compare a token + to another.
# but want (to some extent) to deal with embellished operators (eg. sub, sup...)
# Rather involved if we want to try to do it "Right".
sub isSameExpr {
my ($op1, $op2) = @_;
$op1 = realizeXMNode($op1);
$op2 = realizeXMNode($op2);
my $tag1 = getQName($op1);
my $tag2 = getQName($op2);
# Either both are tokens,
# OR both have structure, but then we need to compare the children!!!!
# First check, same top-level and critical attributes
return unless
($tag1 eq $tag2)
&& ((p_getTokenMeaning($op1) || '__undef_meaning__')
eq (p_getTokenMeaning($op2) || '__undef_meaning__'))
&& ((p_getValue($op1) || '__undef_content__')
eq (p_getValue($op2) || '__undef_content__'))
# Check that ops are used in same way.
&& ((p_getAttribute($op1, 'mathstyle') || '<none>')
eq (p_getAttribute($op2, 'mathstyle') || '<none>'));
if ($tag1 eq 'ltx:XMTok') { return 1; } # If tokens, they match
else {
my @ch1 = p_element_nodes($op1);
my @ch2 = p_element_nodes($op2);
my $n = scalar(@ch1);
return unless $n == scalar(@ch2);
foreach my $i (0 .. $n - 1) {
return unless isSameExpr($ch1[$i], $ch1[$i]); }
return 1; } }
# There are several cases where parsing a formula will rearrange nodes
# such that some nodes will no-longer be used. For example, when
# converting a nested set of infix + into a single n-ary sum.
# In effect, all those excess +'s are subsumed by the single first one.
# It may be, however, that those lost nodes are referenced (XMRef) from the
# other branch of an XMDual, and those references should be updated to refer
# to the single node replacing the lost ones.
# This function records that replacement, and the top-level parser fixes up the tree.
# NOTE: There may be cases (in the Grammar, eg) where punctuation & ApplyOp's
# get lost completely? Watch out for this!
# If $isdup, assume the two trees are equivalent structure,
# and so mark corresponding internal nodes as "replaced" as well.
sub ReplacedBy {
my ($lostnode, $keepnode, $isdup) = @_;
return unless ref $lostnode && ref $keepnode;
if (my $lostid = p_getAttribute($lostnode, 'xml:id')) {
# Could be we want to generate an id for $keepnode, here?
if (my $keepid = p_getAttribute($keepnode, 'xml:id')) {
$$LaTeXML::MathParser::LOSTNODES{$lostid} = $keepid; }
else {
Warn('expected', 'id', undef, "LOST $lostid but no replacement!\n"); } }
# The following recurses into the two trees,
# This is on the assumption that they are "equivalent" trees
if ($isdup) {
my @lostkids = p_element_nodes($lostnode);
my @keepkids = p_element_nodes($keepnode);
my $n = scalar(@lostkids);
my $m = scalar(@keepkids);
if ($n != $m) {
Error("unexpected", "nodes", undef, "Nodes aren't the same structure ($n vs $m)",
"Old: " . printNode($lostnode),
"New: " . printNode($keepnode)
);
$n = $m if $m < $n; }
foreach my $i (0 .. $n - 1) {
ReplacedBy($lostkids[$i], $keepkids[$i], $isdup); } }
return; }
# ================================================================================
# Construct an appropriate application of sub/superscripts
# This accounts for script positioning:
# Whether it precedes (float), is over/under (if base requests),
# or follows (normal case), along with whether sub/super.
# the alignment of multiple sub/superscripts derived from the binding level when created.
# scriptpos = (pre|mod|post) number; where number is the binding-level.
# If $pos is given (pre|mid|post), it overrides the position implied by the script
sub NewScript {
my ($base, $script, $pos) = @_;
my $role;
my $rbase = realizeXMNode($base);
my $rscript = realizeXMNode($script);
my $ibase = $rbase;
# Get "inner" (content) base, if the base is a dual it may be more relevant
if (p_getQName($rbase) eq 'ltx:XMDual') {
($ibase) = p_element_nodes($rbase); }
elsif (p_getQName($rbase) eq 'ltx:XMApp') {
my ($op, $a1, @rest) = p_element_nodes($rbase);
if ((p_getQName($op) eq 'ltx:XMTok')
&& ((p_getAttribute($op, 'role') || '') =~ /^(FLOAT|POST)?(SUB|SUPER)SCRIPT(OP)?$/)) {
$ibase = $op; } }
lib/LaTeXML/MathParser.pm view on Meta::CPAN
=over 4
=item C<< $node = New($name,$content,%attributes); >>
Creates a new C<XMTok> node with given C<$name> (a string or undef),
and C<$content> (a string or undef) (but at least one of name or content should be provided),
and attributes.
=item C<< $node = Arg($node,$n); >>
Returns the C<$n>-th argument of an C<XMApp> node;
0 is the operator node.
=item C<< Annotate($node,%attributes); >>
Add attributes to C<$node>.
=item C<< $node = Apply($op,@args); >>
Create a new C<XMApp> node representing the application of the node
C<$op> to the nodes C<@args>.
=item C<< $node = ApplyDelimited($op,@stuff); >>
Create a new C<XMApp> node representing the application of the node
C<$op> to the arguments found in C<@stuff>. C<@stuff> are
delimited arguments in the sense that the leading and trailing nodes
should represent open and close delimiters and the arguments are
separated by punctuation nodes.
=item C<< $node = InterpretDelimited($op,@stuff); >>
Similar to C<ApplyDelimited>, this interprets sequence of
delimited, punctuated items as being the application of C<$op> to those items.
=item C<< $node = recApply(@ops,$arg); >>
Given a sequence of operators and an argument, forms the nested
application C<op(op(...(arg)))>>.
=item C<< $node = InvisibleTimes; >>
Creates an invisible times operator.
=item C<< $boole = isMatchingClose($open,$close); >>
Checks whether C<$open> and C<$close> form a `normal' pair of
delimiters, or if either is ".".
=item C<< $node = Fence(@stuff); >>
Given a delimited sequence of nodes, starting and ending with open/close delimiters,
and with intermediate nodes separated by punctuation or such, attempt to guess what
type of thing is represented such as a set, absolute value, interval, and so on.
This would be a good candidate for customization!
=item C<< $node = NewFormulae(@stuff); >>
Given a set of formulas, construct a C<Formulae> application, if there are more than one,
else just return the first.
=item C<< $node = NewList(@stuff); >>
Given a set of expressions, construct a C<list> application, if there are more than one,
else just return the first.
=item C<< $node = LeftRec($arg1,@more); >>
Given an expr followed by repeated (op expr), compose the left recursive tree.
For example C<a + b + c - d> would give C<(- (+ a b c) d)>>
=item C<< MaybeFunction($token); >>
Note the possible use of C<$token> as a function, which may cause incorrect parsing.
This is used to generate warning messages.
=back
=head1 AUTHOR
Bruce Miller <bruce.miller@nist.gov>
=head1 COPYRIGHT
Public domain software, produced as part of work done by the
United States Government & not subject to copyright in the US.
=cut
( run in 0.827 second using v1.01-cache-2.11-cpan-437f7b0c052 )