Convert-Pheno

lib/Convert/Pheno/IO/CSVHandler.pm
            # Encode data
            my $encoded_data =
              encode_omop_stream( $table_name, $hash_slice, $person, $count,
                $self );

            # Only after encoding are we able to discard 'null'
            say $fh_out $encoded_data if $encoded_data ne 'null';

            # Print if verbose
            say "Rows processed: $count"
              if ( $self->{verbose} && $count % 10_000 == 0 );
        }
    }
    say "==============\nRows total:     $count\n" if $self->{verbose};

    #say $fh_out "]"; # not needed

    # Closing filehandles
    close $fh_in;
    close $fh_out;
    return 1;
}
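# NB: each encoded row is printed on its own line (NDJSON-style), which is
# why no closing ']' is emitted above: the output is a stream of JSON
# objects, not a single JSON array.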

sub encode_omop_stream {

    my ( $table_name, $hash_slice, $person, $count, $self ) = @_;

    # *** IMPORTANT ***
    # We only print the full PERSON record ONCE (on the first row)!!!
    my $person_id = $hash_slice->{person_id};
    my $data      = {
        $table_name => [$hash_slice],
        PERSON      => $count == 1
        ? $person->{$person_id}
        : {
            map { $_ => $person->{$person_id}{$_} }
              qw(person_id gender_concept_id birth_datetime)
        }
    };
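    # i.e., the first row embeds the full PERSON record; subsequent rows
    # carry only the three keys needed to link back to the individual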

    # Obtain the converted (BFF) data structure for this chunk
    my $stream = Convert::Pheno::omop2bff_stream_processing( $self, $data );

    # Return JSON string
    #  - canonical adds some overhead but is needed for t/
    #  - $fh is already utf-8, no need to encode again here
    return JSON::XS->new->canonical->encode($stream);
}
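# A minimal sketch (illustrative only, not part of the module) of why
# 'canonical' matters: it sorts hash keys on encode, so the emitted JSON
# is byte-identical across runs and can be compared verbatim in t/:
#
#   use JSON::XS;
#   my $json = JSON::XS->new->canonical;
#   print $json->encode( { b => 2, a => 1 } ), "\n";   # always {"a":1,"b":2}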

sub read_sqldump {

    my $arg      = shift;
    my $filepath = $arg->{in};
    my $self     = $arg->{self};

# Before resorting to writing this subroutine I performed an exhaustive search on CPAN:
# - Tested MySQL::Dump::Parser::XS but I could not make it work...
# - App-MysqlUtils-0.022 has a CLI utility (mysql-sql-dump-extract-tables)
# - Of course one can always use *nix tools (sed, grep, awk, etc.) or other programming languages...
# Anyway, I ended up writing the parser myself...
# The parser is based on reading COPY paragraphs from a PostgreSQL dump by using Perl's paragraph mode ($/ = "")
# NB: Each paragraph (TABLE) is loaded into memory. Not great for large files.

    # Define variables that modify what we load
    my $max_lines_sql = $self->{max_lines_sql};
    my @omop_tables   = @{ $self->{omop_tables} };

    # Set record separator to paragraph
    local $/ = "";

#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, attribute_description, attribute_type_concept_id, attribute_syntax) FROM stdin;
# ......
# \.
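# A self-contained sketch (hypothetical file name) of what paragraph mode
# buys us: with $/ set to "", each read from <$fh> returns one
# blank-line-delimited block, i.e., one COPY paragraph (table) at a time:
#
#   {
#       local $/ = "";                              # paragraph mode
#       open my $fh, '<', 'dump.sql' or die $!;
#       while ( my $paragraph = <$fh> ) {
#           my @lines = split /\n/, $paragraph;
#           printf "paragraph of %d lines\n", scalar @lines;
#       }
#       close $fh;
#   }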

    # Start reading the SQL dump
    my $fh = open_filehandle( $filepath, 'r' );

    # We'll store the data in the hashref $data
    my $data = {};

    # Process paragraphs
    while ( my $paragraph = <$fh> ) {

        # Discarding paragraphs not matching m/^COPY/
        next unless $paragraph =~ m/^COPY/;

        # Load all lines into an array (via "\n")
        my @lines = split /\n/, $paragraph;
        next unless scalar @lines > 2;
        pop @lines;    # last line eq '\.'

# First line contains the headers
#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, ..., attribute_syntax) FROM stdin;
        $lines[0] =~ s/[\(\),]//g;    # getting rid of (),
        my @headers = split /\s+/, $lines[0];
        my $table_name =
          uc( ( split /\./, $headers[1] )[1] );    # ATTRIBUTE_DEFINITION
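        # Worked example with the COPY line above (after s/[\(\),]//g):
        # split /\s+/ yields
        #   ('COPY', '"OMOP_cdm_eunomia".attribute_definition',
        #    'attribute_definition_id', 'attribute_name', ...,
        #    'attribute_syntax', 'FROM', 'stdin;')
        # so $headers[1] holds the schema-qualified table name and
        # $table_name becomes 'ATTRIBUTE_DEFINITION'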

        # Discarding non @$omop_tables:
        # This step improves RAM consumption
        next unless any { $_ eq $table_name } @omop_tables;

        # Say if verbose
        say "Processing table ... <$table_name>" if $self->{verbose};

        # Discarding first line
        shift @lines;

        # Discarding tokens that are not column names
        # (leading 'COPY <schema>.<table>' and trailing 'FROM stdin;')
        @headers = @headers[ 2 .. $#headers - 2 ];

        # Initializing $data->{$table_name} as an empty arrayref
        $data->{$table_name} = [];

        # Ad hoc counter for dev
        my $count = 0;

        # Processing line by line
        for my $line (@lines) {
            $count++;

            # Columns are separated by \t
            # NB: both 'split' and 'Text::CSV' yield plain strings
            # We go with 'split' and coerce types a posteriori
            my @fields = split /\t/, $line;
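            # A hypothetical sketch (not the module's actual code, which
            # continues below) of pairing @headers with @fields via a
            # hash slice and coercing integer-looking values afterwards:
            #
            #   my %row;
            #   @row{@headers} = @fields;
            #   $row{$_} += 0 for grep { $row{$_} =~ /\A-?\d+\z/ } keys %row;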

            # Loading the fields like this:
            #
            #  $VAR1 = {
