MARC-Convert-Wikidata

 view release on metacpan or  search on metacpan

Wikidata/Utils.pm  view on Meta::CPAN


	if (! defined $isbn) {
		return;
	}

	my $ret_isbn = $isbn;
	$ret_isbn =~ s/\s+:?$//ms;

	if ($ret_isbn !~ m/^[\d\-]+$/ms) {
		if ($DEBUG) {
			warn "ISBN '$ret_isbn' couldn't clean.";
		}
		$ret_isbn = undef;
	}

	return $ret_isbn;
}

# Clean an ISSN value.
#
# Arguments:
#   $issn - Raw ISSN string, possibly followed by whitespace and a ';'.
#
# Returns:
#   - nothing (bare return) if $issn is undefined,
#   - the cleaned ISSN in 'NNNN-NNNC' form, where C is a digit or 'X',
#   - undef if the cleaned value doesn't have this form. When $DEBUG is true,
#     a warning with the rejected value is emitted.
sub clean_issn {
	my $issn = shift;

	if (! defined $issn) {
		return;
	}

	my $ret_issn = $issn;

	# Strip trailing whitespace, optionally followed by ';', at the very end
	# of the string (not at the end of each line).
	$ret_issn =~ s/\s+;?\s*\z//ms;

	# The last character is a check digit, which is 0-9 or 'X'.
	# \A and \z are used instead of ^ and $ because under /m the latter match
	# at any line boundary and would accept multi-line garbage.
	if ($ret_issn !~ m/\A\d{4}-\d{3}[\dX]\z/ms) {
		if ($DEBUG) {
			warn "ISSN '$ret_issn' couldn't clean.";
		}
		$ret_issn = undef;
	}

	return $ret_issn;
}

# Clean a number of pages statement and convert it to a single number.
#
# Arguments:
#   $number_of_pages - Raw string like '[8], 250 s. :'.
#
# Returns:
#   - nothing (bare return) if $number_of_pages is undefined,
#   - sum of all comma separated page counts found in the string,
#   - undef if the string can't be reduced to comma separated numbers. When
#     $DEBUG is true, a warning with the original value is emitted.
sub clean_number_of_pages {
	my $number_of_pages = shift;

	if (! defined $number_of_pages) {
		return;
	}

	my $ret_number_of_pages = $number_of_pages;

	# Remove [] around numbers.
	$ret_number_of_pages =~ s/\[(\d+)\]/$1/msg;

	# Remove text informations.
	# Text is replaced by a space, not by an empty string, so neighbouring
	# numbers can't be glued together ('255 s. 16 s.' must not become 25516).
	my $pril = decode_utf8('příl');
	my $neci = decode_utf8('nečíslovaných');
	$ret_number_of_pages =~ s/\s*(?:stran|s|$pril|$neci|l|barev|obr)\.*\s*/ /msg;

	# Remove other characters like ':' and ';'.
	$ret_number_of_pages =~ s/[:;]//msg;

	# Trim whitespace left after the removals above.
	$ret_number_of_pages =~ s/\A\s+//ms;
	$ret_number_of_pages =~ s/\s+\z//ms;

	# Split to comma separated parts, empty parts (e.g. after a trailing
	# comma) are ignored.
	my @numbers = grep { $_ ne '' }
		split m/\s*,\s*/ms, $ret_number_of_pages;

	# TODO Support roman numbers.
	# Sum only if there is at least one part and every part is a plain
	# number. sum() of an empty list is undef and non-numeric parts would be
	# silently coerced to a number.
	my $sum_of_pages;
	if (@numbers && ! grep { $_ !~ m/\A\d+\z/ms } @numbers) {
		$sum_of_pages = sum(@numbers);
	}

	if (! defined $sum_of_pages || $sum_of_pages !~ m/\A\d+\z/ms) {
		if ($DEBUG) {
			warn "Number of pages '$number_of_pages' couldn't clean.";
		}
		$sum_of_pages = undef;
	}

	return $sum_of_pages;
}

# Clean an OCLC control number.
#
# Arguments:
#   $oclc - Raw value, e.g. '(OCoLC)12345'.
#
# Returns nothing (bare return) for undefined input, otherwise a copy of the
# value with the first '(OCoLC)' found at the start of a line removed. Values
# without this prefix are returned unchanged.
sub clean_oclc {
	my ($oclc) = @_;

	return if ! defined $oclc;

	# Work on a copy, the caller's value stays untouched.
	(my $cleaned = $oclc) =~ s/^\(OCoLC\)//ms;

	return $cleaned;
}

sub clean_publication_date {
	my $publication_date = shift;

	my $ret_publication_date = $publication_date;

	my ($start_date, $end_date, $dash);
	if ($ret_publication_date =~ m/^([^-]+)(\-?)(.*)$/ms) {
		$start_date = $1;
		$dash = $2;
		$end_date = $3;
	}

	# Remove [] on begin and end.
	# XXX [] is circa
	$start_date = _remove_square_brackets($start_date);
	if (defined $end_date) {
		$end_date = _remove_square_brackets($end_date);
	}

	# Detect circa.
	my $option;
	foreach my $date ($start_date, $end_date) {
		if (defined $date && ($date =~ s/^c(.*)$/$1/ms
			|| $date =~ s/^(.*)\?$/$1/ms)) {

			# XXX Circa of start and end
			$option = 'circa';
		}
	}

	# Combine back.
	$ret_publication_date = $start_date;
	if ($dash) {
		$ret_publication_date .= $dash;



( run in 0.685 second using v1.01-cache-2.11-cpan-71847e10f99 )