OWL-Simple
view release on metacpan or search on metacpan
scripts/similarity_match.pl view on Meta::CPAN
=cut
# Load a MeSH ASCII file and return a hash reference mapping each key of
# the parser's heading table to a record holding that heading's label
# and synonyms.
sub parseMeSH($) {
    my ($file) = @_;

    INFO "Parsing MeSH file $file ...";

    # Build the parser and let it read the file before touching the headings.
    my $parser = MeSH::Parser::ASCII->new( meshfile => $file );
    $parser->parse();

    # Stays undefined when the parser reports no headings at all.
    my $term;

    # Copy only the two fields we care about out of every heading.
    for my $id ( keys %{ $parser->heading } ) {
        my $heading = $parser->heading->{$id};
        $term->{$id} = {
            label    => $heading->{label},
            synonyms => $heading->{synonyms},
        };
    }

    return $term;
}
=item parseOMIM()
Custom OMIM parser.
=cut
# Parse an OMIM text file into a term hash.
#
# Takes the path of the OMIM file. Returns a hash reference keyed by
# 'OMIM:' followed by the MIM number; every value carries the cleaned
# title under {label} and an array reference of the remaining title
# lines under {synonyms}. A summary of the loaded terms is logged via INFO.
sub parseOMIM($) {
    my ($file) = @_;

    my $term = {};

    # Start at zero so the summary line below never concatenates undef
    # when the file yields no records.
    my $synonym_count = 0;

    INFO "Parsing OMIM file $file ...";

    # FIXME: The external parser is suboptimal in many ways
    # if this becomes more often used consider creating
    # a custom one from scratch
    my $parser = Bio::Phenotype::OMIM::OMIMparser->new( -omimtext => $file );

    # loop through all the records
    while ( my $omim_entry = $parser->next_phenotype() ) {

        # *FIELD* NO
        my $id = 'OMIM:' . $omim_entry->MIM_number();

        # *FIELD* TI - first line
        my $title = $omim_entry->title();
        $title =~ s/^.\d+ //;       # remove id from title
        $title =~ s/INCLUDED//g;    # remove INCLUDED as it screws up scoring

        # *FIELD* TI - additional lines
        # Read the raw value once; it is needed both for cleaning and for
        # the overspill check further down.
        my $raw_alt = $omim_entry->alternative_titles_and_symbols();
        $raw_alt = '' unless defined $raw_alt;

        # OMIM uses this weird delimiter ;;
        # to signal sections irrespective of actual line endings
        # this is a major headache to resolve, the parser doesn't
        # do this and we're not going to bother with it either
        my $alt = $raw_alt;
        $alt =~ s/;;//g;
        $alt =~ s/INCLUDED//g;      # remove INCLUDED as it screws up scoring
        my @synonyms = split m!\n!, $alt;

        # if alt doesn't start with ;; its first line is an overspill
        # from the title (go figure!)
        if ( $alt ne '' && $raw_alt !~ /^;;/ ) {
            my $overspill = shift @synonyms;

            # split() can return nothing (e.g. only newlines), so guard
            # against appending undef or an empty fragment.
            if ( defined $overspill && $overspill ne '' ) {

                # Join with a single space unless one is already there,
                # so the two fragments are not fused into one word.
                if (   $title ne ''
                    && $title !~ /\s\z/
                    && $overspill !~ /\A\s/ )
                {
                    $title .= ' ';
                }
                $title .= $overspill;
            }
        }

        $term->{$id}->{label}    = $title;
        $term->{$id}->{synonyms} = \@synonyms;
        $synonym_count += scalar @synonyms;
    }

    INFO "Loaded "
      . scalar( keys %{$term} )
      . " OMIM terms and "
      . $synonym_count
      . " synonyms";

    return $term;
}
=item parseFlat()
Custom flat file parser.
=cut
sub parseFlat($) {
my $file = shift;
my $term;
INFO "Parsing flat file $file ...";
open my $fh_in, '<', $file;
# parse header
my $header = <$fh_in>;
chomp $header;
( $flat_header[0], $flat_header[1] ) = parseFlatColumns($header);
INFO "Using first line as header <$header>";
INFO "Using first column <$flat_header[0]> to match terms";
# load input
while (<$fh_in>) {
chomp;
next if /^$/; #skip empty line
# preserve existing columns in the file
my ( $label, $ragged_end ) = parseFlatColumns($_);
# trim
$label =~ s/^\s+//;
$label =~ s/\s+$//;
# drop trailing quotation marks (excel artefact?)
$label =~ s/^"+//;
$label =~ s/"+$//;
( run in 1.034 second using v1.01-cache-2.11-cpan-71847e10f99 )