MeSH-Parser-ASCII
view release on metacpan or search on metacpan
lib/MeSH/Parser/ASCII.pm view on Meta::CPAN
}
}
=head1 DESCRIPTION
Parser for the MeSH ASCII format.
=over
=item meshfile
Path to the MeSH file in ASCII format (required).
=back
=head2 METHODS
=over
=item parse()
Parses the MeSH file and loads it into a hash ref.
=item heading
Returns a hash ref collection of all the parsed headings. Each is keyed by its id
and consists of a label, and synonyms if any were available.
Label is extracted from I<Mesh Heading> field in Descriptor Data Elements,
or I<Name of substance> in Supplementary Concept Records,
or I<Subheading> in Qualifier Data Elements.
Synonyms are only parsed for Descriptor Data Elements (I<PRINT ENTRY> and I<ENTRY> entries).
=back
=head1 AUTHOR
Tomasz Adamusiak <tomasz@cpan.org>
=head1 COPYRIGHT AND LICENSE
Copyright (c) 2010 European Bioinformatics Institute. All Rights Reserved.
This module is free software; you can redistribute it and/or modify it
under GPLv3.
This software is provided "as is" without warranty of any kind.
=cut
package MeSH::Parser::ASCII;
use Moose 0.89;
use Log::Log4perl qw(:easy);

# Install a default INFO-level logger only when Log::Log4perl has not been
# initialised yet. An unconditional easy_init() at load time would replace
# whatever logging configuration the host application had already set up.
Log::Log4perl->easy_init( { level => $INFO, layout => '%-5p - %m%n' } )
  unless Log::Log4perl->initialized();

our $VERSION = 0.03;

# Path to the MeSH ASCII file; must be supplied to the constructor.
has 'meshfile' => ( is => 'rw', isa => 'Str', required => 1 );

# Parsed headings keyed by id; every object starts with its own empty hash ref.
has 'heading' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
sub parse() {
my $self = shift;
INFO 'Parsing file ' . $self->meshfile . ' ...';
# open file
open my $fh, '<', $self->meshfile;
my ( $label, $id, $synonyms, $treeNos, $count );
$count->{syns} = 0;
while (<$fh>) {
# multiplatform chomp
# this will also rtrim the line
s/\s+$//;
# initialise
if (/^\*NEWRECORD/) {
$synonyms = undef;
$label = undef;
$id = undef;
$treeNos = undef;
}
DEBUG '<' . $_ . '>';
# save on new line
if (/^$/) {
LOGDIE 'Could not parse heading\'s label.'
unless defined $label;
$count->{headings}++;
WARN "Duplicate heading found for $id"
if defined $self->heading->{$id};
$self->heading->{$id}->{label} = $label;
$self->heading->{$id}->{synonyms} = $synonyms
if defined $synonyms;
DEBUG $label . ' ' . $id . "\n";
for my $syn (@$synonyms) {
DEBUG "\t" . $syn;
$count->{syns}++;
}
$self->heading->{$id}->{treeNos} = $treeNos
if defined $treeNos;
}
# Mesh Heading in Descriptor Data Elements
$label = ( split(/ = /) )[1] if /^MH = /;
# Name of substance in Supplementary Concept Records
$label = ( split(/ = /) )[1] if /^NM = /;
# Subheading in Qualifier Data Elements
$label = ( split(/ = /) )[1] if /^SH = /;
$id = ( split(/ = /) )[1] if /^UI = /;
# PRINT ENTRY and ENTRY are synonyms in Descriptor Data Elements
# splits on ENTRY = , and then disregards anything after pipe |
( run in 2.160 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )