OWL-Simple

 view release on metacpan or  search on metacpan

scripts/similarity_match.pl  view on Meta::CPAN


=cut

# Parse an ASCII MeSH file and collect label and synonyms per heading.
#
# Argument: path of the MeSH file, handed to MeSH::Parser::ASCII.
# Returns:  hash reference keyed by heading id; each value is a hash
#           reference with the keys 'label' and 'synonyms', copied
#           from the parser's heading records. Stays undef when the
#           parser reports no headings.
sub parseMeSH($) {
	my $file = $_[0];
	my $term;
	INFO("Parsing MeSH file $file ...");

	# build the parser and let it read the whole file
	my $mesh = MeSH::Parser::ASCII->new( meshfile => $file );
	$mesh->parse();

	# take a shallow copy of the heading table and walk it by id
	my %headings = %{ $mesh->heading };
	for my $id ( keys %headings ) {
		my $heading = $headings{$id};
		$term->{$id} = {
			label    => $heading->{label},
			synonyms => $heading->{synonyms},
		};
	}

	return $term;
}

=item parseOMIM()

Custom OMIM parser.

=cut

# Parse an OMIM text file into a term table.
#
# Argument: path of the OMIM text file, handed to
#           Bio::Phenotype::OMIM::OMIMparser as -omimtext.
# Returns:  hash reference keyed by 'OMIM:<MIM number>'; each value is a
#           hash reference with
#             label    - the cleaned entry title
#             synonyms - array reference of alternative titles, one per
#                        line of the alternative-titles field
#           An input without records yields an empty hash reference.
sub parseOMIM($) {
	my ($file) = @_;
	my $term = {};
	INFO "Parsing OMIM file $file ...";

	# Start at zero so the summary below prints a number (and does not
	# warn about an uninitialized value) when no record was parsed.
	my $synonym_count = 0;

	# FIXME: The external parser is suboptimal in many ways
	# if this becomes more often used consider creating
	# a custom one from scratch
	my $parser = Bio::Phenotype::OMIM::OMIMparser->new( -omimtext => $file );

	# loop through all the records
	while ( my $omim_entry = $parser->next_phenotype() ) {

		# *FIELD* NO
		my $id = 'OMIM:' . $omim_entry->MIM_number();

		# *FIELD* TI - first line
		my $title = $omim_entry->title();
		$title =~ s/^.\d+ //;       # remove id from title
		$title =~ s/INCLUDED//g;    # remove INCLUDED as it screws up scoring

		# *FIELD* TI - additional lines
		# Keep the untouched text: its leading ';;' is inspected below,
		# after the delimiters have been stripped from the working copy.
		my $raw_alt = $omim_entry->alternative_titles_and_symbols();
		$raw_alt = '' unless defined $raw_alt;

		# OMIM uses this weird delimiter ;;
		# to signal sections irrespective of actual line endings
		# this is a major headache to resolve, the parser doesn't
		# do this and we're not going to bother with it either
		my $alt = $raw_alt;
		$alt =~ s/;;//g;
		$alt =~ s/INCLUDED//g;      # remove INCLUDED as it screws up scoring
		my @synonyms = split m!\n!, $alt;

		# if alt doesn't start with ;; it's an overspill from the
		# title (go figure!); only append when split actually produced
		# a line, so nothing undefined is concatenated to the title
		if ( $alt ne '' && $raw_alt !~ /^;;/ && @synonyms ) {
			$title .= shift @synonyms;
		}

		$term->{$id}->{label}    = $title;
		$term->{$id}->{synonyms} = \@synonyms;

		$synonym_count += scalar @synonyms;
	}

	INFO "Loaded "
	  . scalar( keys %{$term} )
	  . " OMIM terms and "
	  . $synonym_count
	  . " synonyms";

	return $term;
}

=item parseFlat()

Custom flat file parser.

=cut

sub parseFlat($) {
	my $file = shift;
	my $term;
	INFO "Parsing flat file $file ...";

	open my $fh_in, '<', $file;

	# parse header
	my $header = <$fh_in>;
	chomp $header;
	( $flat_header[0], $flat_header[1] ) = parseFlatColumns($header);

	INFO "Using first line as header <$header>";
	INFO "Using first column <$flat_header[0]> to match terms";

	# load input
	while (<$fh_in>) {
		chomp;
		next if /^$/;    #skip empty line

		# preserve existing columns in the file
		my ( $label, $ragged_end ) = parseFlatColumns($_);

		# trim
		$label =~ s/^\s+//;
		$label =~ s/\s+$//;

		# drop trailing quotation marks (excel artefact?)
		$label =~ s/^"+//;
		$label =~ s/"+$//;



( run in 1.034 second using v1.01-cache-2.11-cpan-71847e10f99 )