Convert-Pheno
view release on metacpan - search on metacpan
view release on metacpan or search on metacpan
lib/Convert/Pheno/IO/CSVHandler.pm view on Meta::CPAN
# Encode data
my $encoded_data =
encode_omop_stream( $table_name, $hash_slice, $person, $count,
$self );
# Only after encoding are we able to discard 'null'
say $fh_out $encoded_data if $encoded_data ne 'null';
# Print if verbose
say "Rows processed: $count"
if ( $self->{verbose} && $count % 10_000 == 0 );
}
}
say "==============\nRows total: $count\n" if $self->{verbose};
#say $fh_out "]"; # not needed
# Closing filehandles
close $fh_in;
close $fh_out;
return 1;
}
sub encode_omop_stream {

    # Encode one OMOP row (plus PERSON context) as a canonical JSON string.
    my ( $table_name, $hash_slice, $person, $count, $self ) = @_;

    # *** IMPORTANT ***
    # We only print person_id ONCE!!!
    my $person_id = $hash_slice->{person_id};

    # On the first row we carry the full PERSON record; afterwards only a
    # minimal subset, so person-level data is not duplicated on every row.
    my $person_info =
      $count == 1
      ? $person->{$person_id}
      : {
        map { $_ => $person->{$person_id}{$_} }
          qw(person_id gender_concept_id birth_datetime)
      };

    my $data = {
        $table_name => [$hash_slice],
        PERSON      => $person_info
    };

    # Run the BFF stream conversion on this single-row payload
    my $stream = Convert::Pheno::omop2bff_stream_processing( $self, $data );

    # Serialize to a JSON string
    # - canonical adds some overhead but is needed for reproducible t/ output
    # - the output filehandle is already utf-8, so no extra encoding here
    return JSON::XS->new->canonical->encode($stream);
}
sub read_sqldump {
my $arg = shift;
my $filepath = $arg->{in};
my $self = $arg->{self};
# Before resorting to writing this subroutine I performed an exhaustive search on CPAN:
# - Tested MySQL::Dump::Parser::XS but I could not make it work...
# - App-MysqlUtils-0.022 has a CLI utility (mysql-sql-dump-extract-tables)
# - Of course one can always use *nix tools (sed, grep, awk, etc) or other programming languages....
# Anyway, I ended up writing the parser myself...
# The parser is based on reading COPY paragraphs from a PostgreSQL dump by using Perl's paragraph mode $/ = "";
# NB: Each paragraph (TABLE) is loaded into memory. Not great for large files.
# Define variables that modify what we load
my $max_lines_sql = $self->{max_lines_sql};
my @omop_tables = @{ $self->{omop_tables} };
# Set record separator to paragraph
local $/ = "";
#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, attribute_description, attribute_type_concept_id, attribute_syntax) FROM stdin;
# ......
# \.
# Start reading the SQL dump
my $fh = open_filehandle( $filepath, 'r' );
# We'll store the data in the hashref $data
my $data = {};
# Process paragraphs
while ( my $paragraph = <$fh> ) {
# Discarding paragraphs not having m/^COPY/
next unless $paragraph =~ m/^COPY/;
# Load all lines into an array (via "\n")
my @lines = split /\n/, $paragraph;
next unless scalar @lines > 2;
pop @lines; # last line eq '\.'
# First line contains the headers
#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, ..., attribute_syntax) FROM stdin;
$lines[0] =~ s/[\(\),]//g; # getting rid of (),
my @headers = split /\s+/, $lines[0];
my $table_name =
uc( ( split /\./, $headers[1] )[1] ); # ATTRIBUTE_DEFINITION
# Discarding non @$omop_tables:
# This step improves RAM consumption
next unless any { $_ eq $table_name } @omop_tables;
# Say if verbose
say "Processing table ... <$table_name>" if $self->{verbose};
# Discarding first line
shift @lines;
# Discarding headers which are not terms/variables
@headers = @headers[ 2 .. $#headers - 2 ];
# Initializing $data->{$table_name} as an empty arrayref
$data->{$table_name} = [];
# Ad hoc counter for dev
my $count = 0;
# Processing line by line
for my $line (@lines) {
$count++;
# Columns are separated by \t
# NB: 'split' and 'Text::CSV' split to strings
# We go with 'split'. Coercing a posteriori
my @fields = split /\t/, $line;
# Loading the fields like this:
#
# $VAR1 = {
view all matches for this distributionview release on metacpan - search on metacpan
( run in 3.059 seconds using v1.00-cache-2.02-grep-82fe00e-cpan-d29e8ade9f55 )