Lingua-Ogmios

 view release on metacpan or  search on metacpan

lib/Lingua/Ogmios/Annotations.pm  view on Meta::CPAN

    my @section_infos;
    my $section_string;

    while($temp_canonicalDocumentString =~ /<section[^>]*>|<list>|<item>/o) {
	push @canonicalDocument_sections, $`; #` }
	$temp_canonicalDocumentString = $';


        $section_string = $&; # '
	 #    warn "section string: $section_string\n";
         # warn $temp_canonicalDocumentString . "\n\n";
        my %tmp;
	$tmp{'type'} = "empty";
	$tmp{'title'} = "empty";
	if ($section_string =~ /<item>/o) {
	    $tmp{'type'} = "item";
	    $tmp{'title'} = undef;
	} else {
	    if ($section_string =~ /<list>/o) {
		$tmp{'type'} = "list";
		$tmp{'title'} = undef;
	    } else {
		$tmp{'type'} = "narrative";
		if ($section_string =~ /<section(\s+sectionType=\"(?<st>[^"]+)\")?(\s+title=\"(?<t>[^"]+)\")?>/o) { #"
		    $tmp{'type'} = $+{st};
		    $tmp{'title'} = $+{t};
		} else {
		    $tmp{'title'} = undef;
	        } 
	    }
        }
	push @section_infos, \%tmp;
    }
    push @canonicalDocument_sections, $temp_canonicalDocumentString;

    shift @canonicalDocument_sections; # the text preceding the first section/list/item tag belongs to no section, so drop it
    
#     shift @section_infos;

    $start_position = 0;
    $end_position = 0;

    warn "[LOG] Identifying start position of the sections\n";

    my $j;
    for($j=0;$j < scalar(@canonicalDocument_sections);$j++) {
	$section = $canonicalDocument_sections[$j];
	# warn "-> $section\n";
# 	push @section_starts, $start_position;
	my @tmp = ($start_position, $section_infos[$j], $section);
	push @section_starts, \@tmp;
	$section =~ s/<[^>]+>//go;
	# warn "$section\n";
	# warn "\t" . length(Lingua::Ogmios::Annotations::Element->_xmldecode($section)) . "\n";
	$start_position += length(Lingua::Ogmios::Annotations::Element->_xmldecode($section));
	# warn "\t" . $start_position . "\n";
    }

    warn "\n[LOG] Identifying end position of the sections\n";

    @canonicalDocument_sections = split m!</section>|</list>|</item>!, $canonicalDocumentString;

    foreach $section (@canonicalDocument_sections) {
	if ($section eq "") {
	    # warn "empty section content ($end_position)\n";
	    if ($end_position != 0) {
		push @section_ends, $end_position-1;
	    } else {
		push @section_ends, $end_position;
	    }
	} else {
	    # warn "-> $section\n";
	    # warn "\t" . $end_position . "(a)\n";
	    # if ($section =~ /([^<]*)<[^>]+>/os) {
	    #     $end_position += length($1);
	    # }
	    while($section =~ s/([^<]*)<[^>]+>//os) {
		$end_position += length($1);
	    }
	    # warn "\t" . $end_position . "(b)\n";
	    # $section =~ s/<[^>]+>//go;
	    # warn "$section\n";
	    # warn "\t" . length(Lingua::Ogmios::Annotations::Element->_xmldecode($section)) . "\n";
	    $end_position += length(Lingua::Ogmios::Annotations::Element->_xmldecode($section));
	    $end_position--;
	    # warn "\t" . $end_position . "(c)\n";
	    push @section_ends, $end_position;
	    $end_position++;
#	push @section_ends, $section_starts[$#section_ends + 1] + length(Lingua::Ogmios::Annotations::Element->_xmldecode($section)) - 1;
	}
    }

    # split() drops empty trailing fields, so @section_ends may hold
    # fewer entries than @section_starts; pad it by repeating the last
    # end position until both arrays have the same number of entries

    for($i = scalar(@section_ends); $i < scalar @section_starts; $i++) {
	$section_ends[$i] = $section_ends[$#section_ends];
    }



    $canonicalDocumentString =~ s/<[^>]+>//go;

    $canonicalDocumentString = Lingua::Ogmios::Annotations::Element->_xmldecode($canonicalDocumentString);

    warn "[LOG] Merging identification of the end and start position\n";
    $start_position = 0;
    $end_position = 0;
    &_merge_sections(\@section_starts, \@section_ends, \$start_position, \$end_position, \@sections, \@section_infos, 0, undef);

    # if ($debug_devel_level == 1) {
	# warn "[LOG/$debug_devel_level] Check merging identification of the end and start position\n";
	
	# foreach $section (@sections) {
	#     ($start_position, $end_position) = ($section->getFrom, $section->getTo);
 	#     warn "[LOG/$debug_devel_level] Section from $start_position to $end_position\n";
	#     print STDERR "\t" . substr($canonicalDocumentString, $start_position, $end_position - $start_position + 1) . "\n";
	# }
    # }



( run in 0.670 second using v1.01-cache-2.11-cpan-5511b514fd6 )