Finnigan

 view release on metacpan or  search on metacpan

bin/mzxml-scan  view on Meta::CPAN

# Sniff the start of the input: append up to five lines read from <> to
# $head, which the code below inspects to recognise the file format.
$head //= '';
for my $i (1 .. 5) {
  my $line = <>;
  # Stop at end of input instead of appending undef; calling <> again after
  # it has returned EOF would make Perl fall back to reading STDIN.
  last unless defined $line;
  $head .= $line;
}

# Work out the input format from the header text collected in $head.
# $type ends up as 'mzML', 'mzXML', or '' when neither root tag was seen.
# $binaryTag is declared here but is not referenced in the visible code.
my ($type, $binaryTag);

if ($head =~ /<mzML/) {
  $type = 'mzML';
}
elsif ($head =~ /<mzXML/) {
  $type = 'mzXML';
}
else {
  # Unrecognised input is reported but deliberately not fatal: the dispatch
  # that follows treats everything that is not 'mzXML' as mzML.  Keep $type
  # defined so that the string comparison there never sees undef.
  say STDERR "can't recognise file format:";
  say STDERR $head . "...";
  $type = '';
}


if ( $type eq 'mzXML' ) {
  # mzXML: read the input in records terminated by '</peaks>', so each record
  # holds the text preceding a <peaks> element plus its base64 payload.
  # Print the first scan whose number is not below the requested <n>, one
  # tab-separated pair of values per line, then stop.
  local $/ = '</peaks>';
  while ( my $record = <> ) {
    # Cut the <peaks ...> tag and everything after it out of the record,
    # leaving the scan header in $record.  /s lets the payload span lines
    # (decode_base64 ignores embedded whitespace).
    next unless $record =~ s/(<peaks[^>]+>)(.*)$//s;
    my ($tag, $data) = ($1, $2);

    my ($scan_no) = $record =~ /scan num="(\d+)"/
      or die "cannot determine scan number";
    next if $scan_no < $args->{'-n'}{'<n>'};

    $data =~ s{</peaks>$}{};

    # The payload is a run of big-endian (network order) floats.  Decode it
    # with an explicit byte order so the result does not depend on the host;
    # the precision attribute of the <peaks> tag selects 64-bit values,
    # anything else is read as 32-bit.
    my $template = $tag =~ /precision\s*=\s*"64"/ ? 'd>*' : 'f>*';
    my @spec = unpack $template, decode_base64($data);

    # Consecutive values form one output row; an unpaired trailing value
    # is ignored rather than printed with an undefined partner.
    for ( my $i = 0; $i + 1 < @spec; $i += 2 ) {
      say join("\t", @spec[$i, $i + 1]);
    }
    last;
  }
} # mzXML

else { # mzML
  # mzML: read the input in records terminated by '</binaryDataArrayList>'.
  # For the first <spectrum> whose scan number is not below the requested
  # <n>, decode its two binary arrays and print them side by side, one
  # tab-separated row per element, then stop.
  local $/ = '</binaryDataArrayList>';
  while ( my $record = <> ) {
    chomp $record;  # strips the record terminator set above

    # Records without a <spectrum ... scan=N> are skipped.
    my ($scan_no) = $record =~ /<spectrum[^>]+scan=(\d+)/s
      or next;
    next if $scan_no < $args->{'-n'}{'<n>'};

    if ( $record =~ s/^(.*<binaryDataArrayList\s+count\s*=\s*"(\d+)">)(.+)$//s ) {
      my ($n, $arrays) = ($2, $3);
      die "don't know what to do with <binaryDataArrayList> of size != 2 (read: $n)" unless $n == 2;

      # NOTE(review): the arrays are labelled purely by position (first is
      # taken as m/z, second as intensity); the cvParam names that identify
      # each array are not consulted.
      my @key = qw/mz intensity/;
      my %table;
      my $chunk_no = 0;
      foreach my $chunk ( split m{</binaryDataArray>\s*}, $arrays ) {
        unless ( $chunk =~ m{^(.*<binaryDataArray.+<binary>)(.*)(</binary>.*)$}s ) {
          say STDERR "<binaryDataArray>...<binary> not matched in: " . substr($chunk, 0, 500) . "...\n";
          next;
        }
        my ($header, $data) = ($1, $2);

        # The element width is taken from a cvParam whose name starts with
        # "32-bit" or "64-bit"; the values are unpacked as little-endian.
        my ($size) = ( $header =~ /<cvParam.+name="(\d\d-bit)/ );
        $size //= '';  # keep the comparisons and the die message free of undef

        my $template;
        if ( $size eq '32-bit' ) {
          $template = 'f<*';
        }
        elsif ( $size eq '64-bit' ) {
          $template = 'd<*';
        }
        else {
          die "unknown number size: $size";
        }
        $table{ $key[$chunk_no] } = [ unpack $template, decode_base64($data) ];
        $chunk_no++;
      }

      # Fail with a clear message when an array could not be decoded,
      # instead of dereferencing an undefined value below.
      foreach my $name (@key) {
        die "no $name array decoded from <binaryDataArrayList>" unless $table{$name};
      }

      my ($n1, $n2) = (scalar @{ $table{mz} }, scalar @{ $table{intensity} });
      die "unequal sizes of the M/z and intensity arrays ($n1 and $n2)" unless $n1 == $n2;
      foreach my $i ( 0 .. $n1 - 1 ) {
        say join "\t", map { $table{$_}[$i] } qw/mz intensity/;
      }
    }
    last;
  }
}

__END__
=head1 NAME

mzxml-scan - decode and print the base64-encoded peak data of a single scan in an mzXML or mzML file

=head1 SYNOPSIS

mzxml-scan [options] <file>

 Options:

  -r[ange] <from:0+n> .. <to:0+n>  write only scans with numbers between <from> and <to>
  -hex                             add the hex encoding of decimals
  <file>                           input file

=head1 OPTIONS

=over 4

=item B<-r[ange] E<lt>from:0+nE<gt> .. E<lt>to:0+nE<gt>>

extract only scans with numbers between E<lt>fromE<gt> and E<lt>toE<gt>

B<Note:> this option breaks the structure of the output file (the parts preceding and following the selected range of scans are not written). It is mainly useful in checking the XML syntax and the contents of a small number of scans. For extracting t...

=item B<-hex>

add the hex encoding of decimals



( run in 0.621 second using v1.01-cache-2.11-cpan-71847e10f99 )