Chemistry-File-InChI

 view release on metacpan or  search on metacpan

lib/Chemistry/File/InChI/Parser.yp  view on Meta::CPAN

# Header section

%{

use strict;
use warnings;

use Chemistry::Atom;
use Chemistry::Mol;

use List::Util qw( first );

my @LAYER_ORDER = qw( PREFIX FORMULA CONNECTIONS H_ATOMS CHARGE TETRAHEDRAL STEREOCHEMISTRY );

%}

%%

# Rules section

# The grammar is taken from https://github.com/metamolecular/inchi-grammar/blob/master/grammar.ebnf, commit 74a8858, MIT license

string: prefix
            { 'PREFIX' }
      | string layer
            {
                my $layer1_id = first { $LAYER_ORDER[$_] eq $_[1] } 0..$#LAYER_ORDER;
                my $layer2_id = first { $LAYER_ORDER[$_] eq $_[2] } 0..$#LAYER_ORDER;
                die "unknown layer $_[2]\n" unless defined $layer2_id;
                die "duplicated layer $_[1]\n" if $layer1_id == $layer2_id;
                die "incorrect layer order, $_[2] must appear before $_[1]\n" if $layer1_id > $layer2_id;
                $_[2];
            }
      ;

layer: formula
            { 'FORMULA' }
     | connections
            { 'CONNECTIONS' }
     | h_atoms
            { 'H_ATOMS' }
     | charge
            { 'CHARGE' }
     | tetrahedral
            { 'TETRAHEDRAL' }
     | stereochemistry
            { 'STEREOCHEMISTRY' }
     ;

formula: formula_first
            { [ $_[1] ] }
       | formula formula_continuation
            { push @{$_[1]}, $_[2] }
       ;

connections: '/' 'c'
                { $_[0]->{USER}{CURSOR}++ }
           | '/' 'c' graph
                { $_[0]->{USER}{CURSOR}++ }
           | connections ';'
                { $_[0]->{USER}{CURSOR}++ }
           | connections ';' graph
                { $_[0]->{USER}{CURSOR}++ }
           ;

h_atoms: '/' 'h'
            { $_[0]->{USER}{CURSOR}++ }
       | '/' 'h' hydrogens
            { $_[0]->{USER}{CURSOR}++ }
       | h_atoms ';'
            { $_[0]->{USER}{CURSOR}++ }
       | h_atoms ';' hydrogens
            { $_[0]->{USER}{CURSOR}++ }
       ;

tetrahedral: '/' 't'
           | '/' 't' tetrahedral_centers
           | tetrahedral ';'
           | tetrahedral ';' tetrahedral_centers
           ;

stereochemistry: '/' 's' '1'
                    { $_[0]->{USER}{MOL}->attr( 'inchi/stereochemistry', $_[3] ) }
               | '/' 's' '2'
                    { $_[0]->{USER}{MOL}->attr( 'inchi/stereochemistry', $_[3] ) }
               ;

# Production 'tail' is merged to 'graph' and 'body' for simplicity.

graph: chain
     | count '*' chain
     ;

# Return: The last atom in a chain
chain: index '-' index
        { $_[0]->_add_bonds( $_[1], $_[3] ); return $_[3] }
     | index branches index
        { $_[0]->_add_bonds( $_[1], [ @{$_[2]}, $_[3] ] ); return $_[3] }
     | chain '-' index
        { $_[0]->_add_bonds( $_[1], $_[3] ); return $_[3] }
     | chain branches index
        { $_[0]->_add_bonds( $_[1], [ @{$_[2]}, $_[3] ] ); return $_[3] }
     ;

branch: '(' branch_body ')' { $_[2]->{bonds} } ;

# Return: Array with all atoms connected to the parent one
branch_body: index
                { { bonds => [ $_[1] ], last => $_[1] } }
           | branch_body ',' index
                {
                    push @{$_[1]->{bonds}}, $_[3];

lib/Chemistry/File/InChI/Parser.yp  view on Meta::CPAN

        { $_[0]->{USER}{MOL}->attr( 'inchi/charges' )->[$_[0]->{USER}{CURSOR}] = int( -$_[3] ) }
      | charge ';'
        { $_[0]->{USER}{CURSOR}++ }
      ;

index: '1' | count ;

count: '1' digit
        { join '', @_[1..2] }
     | count digit
        { join '', @_[1..2] }
     | twoplus
     ;

natural: '1'
       | twoplus
       | natural digit
            { join '', @_[1..2] }
       ;

digit: '0' | '1' | twoplus ;

twoplus: '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' ;

%%

# Footer section

sub parse
{
    my( $self, $string ) = @_;

    $self->YYData->{INPUT} = $string;
    $self->{USER}{MOL}     = Chemistry::Mol->new;
    $self->{USER}{CURSOR}  = 0;
    $self->YYParse( yylex => \&_Lexer, yyerror => \&_Error, yydebug => 0 );

    return $self->{USER}{MOL};
}

sub _Lexer
{
    my( $self ) = @_;

    # If the line is empty and the input is originating from the file,
    # another line is read.
    if( !$self->YYData->{INPUT} && $self->{USER}{FILEIN} ) {
        my $filein = $self->{USER}{FILEIN};
        $self->YYData->{INPUT} = <$filein>;
        $self->{USER}{CHARNO} = 0;
    }

    # Prefix
    if( $self->YYData->{INPUT} =~ s/^(InChI=1S?)// ) {
        return ( 'prefix', $1 );
    }

    # Formula parts
    # TODO: Check Hill order, require counts > 1
    if( $self->YYData->{INPUT} =~ s/^([\/\.])([2-9]|[1-9][0-9]+)?(([A-Z][a-z]?\d*)+)// ) {
        my( $sep, $count, $formula ) = ( $1, $2, $3 );
        $count = 1 unless $count;
        my %atom_map;

        while( $formula =~ /([A-Z][a-z]?)(\d*)/g ) {
            my( $element, $count ) = ( $1, $2 );
            next if $element eq 'H'; # H atoms will be added later
            $count = 1 unless $count;
            for (1..$count) {
                my $atom = Chemistry::Atom->new( symbol => $element );
                $self->{USER}{MOL}->add_atom( $atom );
                $atom_map{scalar( keys %atom_map ) + 1} = $atom;
            }
        }

        if( $sep eq '/') {
            $self->{USER}{ATOM_MAPS} = [ \%atom_map ];
            $self->{USER}{MOL}->attr( 'inchi/counts', [ $count ] );
            return ( 'formula_first', $formula );
        } else {
            push @{$self->{USER}{ATOM_MAPS}}, \%atom_map;
            push @{$self->{USER}{MOL}->attr( 'inchi/counts' )}, $count;
            return ( 'formula_continuation', $formula );
        }
    }

    # Reset cursor on 'h', 'q' or 't'
    if( $self->YYData->{INPUT} =~ s/^([hqt])// ) {
        $self->{USER}{CURSOR} = 0;
        return ( $1, $1 );
    }

    # Remove unsupported layers
    $self->YYData->{INPUT} =~ s/^(\/[pbmsifo][^\/]*)+//;

    # Any other character
    if( $self->YYData->{INPUT} =~ s/^(.)// ) {
        return ( $1, $1 );
    }

    return ( '', '' );
}

sub _Error
{
    my( $self ) = @_;
    die 'ERROR: ', $self->YYData->{INPUT}, "\n";
}

sub _add_bonds
{
    my( $self, $a, $b ) = @_;

    my @bonds = ref $b eq 'ARRAY' ? @$b : $b;
    for (@bonds) {
        $self->{USER}{MOL}->new_bond( atoms => [ $self->_get_atom( $a ),
                                                 $self->_get_atom( $_ ) ] );
    }
}

sub _add_hydrogens
{
    my( $self, $atoms, $count ) = @_;

    my $atom_map = $self->{USER}{ATOM_MAPS}[$self->{USER}{CURSOR}];
    my @atoms = map { $atom_map->{$_} }
                    ref $atoms ? @$atoms : ( $atoms );
    $count = 1 unless $count;
    for my $atom (@atoms) {
        for (1..$count) {
            my $H = Chemistry::Atom->new( symbol => 'H' );
            $self->{USER}{MOL}->add_atom( $H );
            $self->{USER}{MOL}->new_bond( atoms => [ $atom, $H ] );
        }
    }
}

sub _get_atom
{
    my( $self, $atom ) = @_;

    my $atom_map = $self->{USER}{ATOM_MAPS}[$self->{USER}{CURSOR}];



( run in 0.471 second using v1.01-cache-2.11-cpan-d7a12ab2c7f )