App-PDFLibrarian
view release on metacpan or search on metacpan
bin/pdf-lbr-output-bib view on Meta::CPAN
#!/usr/bin/env perl
# Copyright (C) 2016--2026 Karl Wette
#
# This file is part of App::PDFLibrarian.
#
# App::PDFLibrarian is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# App::PDFLibrarian is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with App::PDFLibrarian. If not, see <http://www.gnu.org/licenses/>.
use strict;
use warnings;
use Carp;
use Clipboard;
use FindBin qw($Script);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;
FindBin::again();
use App::PDFLibrarian qw(%default_filter %default_output_text_format);
use App::PDFLibrarian::BibTeX qw(read_bib_from_pdf find_dup_bib_keys format_bib format_bib_authors write_bib_to_fh);
use App::PDFLibrarian::TitleAbbr qw(get_aas_macros abbr_iso4_title);
use App::PDFLibrarian::Util qw(get_file_list find_pdf_files remove_tex_markup);
=pod
=head1 NAME
B<pdf-lbr-output-bib> - Output BibTeX bibliographic metadata from PDF files.
=head1 SYNOPSIS
B<pdf-lbr-output-bib> B<--version>
B<pdf-lbr-output-bib> B<--help>|B<-h>
B<pdf-lbr-output-bib> [ B<--clipboard>|B<-c> ] [ B<--max-authors>|B<-m> I<count> [ B<--only-first-author>|B<-f> ] ] [ B<--filter>|B<-F> [I<type>B<:>]I<field>[B<?>I<iffield>|B<!>I<ifnotfield>...]B<=>I<spec> ... ] [ B<--no-default-filter>|B<-N> ] [ B<-...
... I<files>|I<directories> ... B<|> B<pdf-lbr-output-bib> ...
=head1 DESCRIPTION
B<pdf-lbr-output-bib> reads BibTeX bibliographic metadata embedded in PDF I<files> and/or any PDF files in I<directories>. If I<files>|I<directories> are not given on the command line, they are read from standard input, one per line.
The BibTeX metadata is then printed to standard output; if B<--clipboard> is given, it is instead copied to the clipboard.
=head1 OPTIONS
=over 4
=item B<--max-authors>|B<-m> I<count> [ B<--only-first-author>|B<-f> ]
If the number of authors is greater than I<count>, and
=over 4
=item * If B<--only-first-author> is given, output only the first author, followed by "and others".
=item * Otherwise, output the first I<count> authors, followed by "and others".
=back
=item B<--filter>|B<-F> [I<type>B<:>]I<field>[B<?>I<iffield>|B<!>I<ifnotfield>...]B<=>I<spec> ... [ B<--no-default-filter>|B<-N> ]
Apply the filter I<spec> to the BibTeX I<field>. If given, I<type> applies filter only to BibTeX entries of that type, I<iffield> applies filter only to BibTeX entries where <iffield> is defined, and I<ifnotfield> applies filter only to BibTeX entrie...
=over 4
=item B<d>
Exclude I<field> from output.
=item B<=>I<value>
Set I<field> to I<value> in output.
=item B<s>B</>I<pattern>B</>I<replacement>[B</>I<pattern>B</>I<replacement>...]B</>
Replace each regular expression I<pattern> with I<replacement> in output.
=back
If no B<--filter> arguments are given, default filters given in the configuration file are applied (unless B<--no-default-filter> is given).
=item B<--abbreviate>|B<-a> I<scheme> ...
Abbreviate journal/series titles according to the given I<scheme>, applied in the order given on the command line. Available I<scheme>s:
=over 4
=item I<aas>
AAS macros for astronomy journals, used by the NASA Astrophysics Data System.
=item I<iso4>
ISO4 abbreviations using the ISSN List of Title Word Abbreviations.
=item I<iso4~>
Same as I<iso4> but separate words with tildes instead of spaces.
=back
=item B<--pdf-file-comment>|B<-P>
If true, output the PDF filename as a comment before each BibTeX entry. Default is false. (The PDF filename is never included in the BibTeX entry itself.)
=item B<--output-text-format>|B<-o> I<type>=<format>
Instead of outputting a BibTeX entry, output plain text, formatting entries of type I<type> with format I<format>. BibTeX I<field>s may be substituted into I<format> with the syntax I<%field>.
The I<author> and I<editor> fields must include one of the suffixes I<:fvlj> or I<:vljf> to indicate the citation style: I<author:fvlj> cites authors with initials then last name; I<author:vljf> cites authors with last name then initials.
Format text surrounded by curly braces is removed if it contains a I<%> from an unexpanded I<field>. Curly braces may be nested to define alternatives for missing fields, e.g. I<{DOI:%doi{URL:%url}}> provides a URL only if the DOI field is missing.
=item B<--output-text>|B<-O>
Instead of outputting a BibTeX entry, output plain text, formatting entries with formats given in the configuration file.
=back
=head1 PART OF
App::PDFLibrarian
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2016--2026 Karl Wette. Licensed under the GNU General Public License, version 3 or later.
=cut
# handle help options
my ($version, $help, $clipboard, $max_authors, $only_first_author, %filter, $no_default_filter, @abbreviate_schemes, $pdf_file_comment, %output_text_format, $output_text);
$max_authors = 0;
$pdf_file_comment = 0;
GetOptions(
"version" => \$version,
"help|h" => \$help,
"clipboard|c" => \$clipboard,
"max-authors|m=i" => \$max_authors,
"only-first-author|f" => \$only_first_author,
"filter|F=s" => \%filter,
"no-default-filter|N" => \$no_default_filter,
"abbreviate|a=s" => \@abbreviate_schemes,
"pdf-file-comment|P" => \$pdf_file_comment,
"output-text-format|o=s" => \%output_text_format,
"output-text|O" => \$output_text,
) or croak "$Script: could not parse options";
if ($version) { print "App::PDFLibrarian version $App::PDFLibrarian::VERSION\n"; exit 1; }
pod2usage(-verbose => 2, -exitval => 1) if ($help);
croak "$Script: --max-authors must be positive" if $max_authors < 0;
# use default field filter if --no-default-filter is not given
if ($no_default_filter) {
printf STDERR "$Script: using no default field filters\n";
} else {
printf STDERR "$Script: using default field filters from configuration file\n";
foreach my $field (keys %default_filter) {
if (!defined($filter{$field})) {
$filter{$field} = $default_filter{$field};
}
}
}
# parse field filters
my %filterbibtype;
my %filterbibfield;
my %filteriffields;
my %filterifnotfields;
foreach my $field (keys %filter) {
my @tokens = split(/([:?!])/, $field);
# parse [type:]field
if (@tokens == 1) {
$filterbibtype{$field} = ".";
$filterbibfield{$field} = shift @tokens;
} elsif ($tokens[1] eq ":") {
$filterbibtype{$field} = shift @tokens;
shift @tokens;
$filterbibfield{$field} = shift @tokens;
} else {
$filterbibtype{$field} = ".";
$filterbibfield{$field} = shift @tokens;
}
# parse [?iffield|!ifnotfield]
$filteriffields{$field} = [];
$filterifnotfields{$field} = [];
while (@tokens > 0) {
my $cond = shift @tokens;
my $condfield = shift @tokens;
if ($cond eq "?") {
push @{$filteriffields{$field}}, $condfield;
} elsif ($cond eq "!") {
push @{$filterifnotfields{$field}}, $condfield;
} else {
croak "$Script: unrecognised field condition '$cond'";
}
}
bin/pdf-lbr-output-bib view on Meta::CPAN
# get BibTeX fields
my %bibfields;
foreach my $bibfield ($bibentry->fieldlist()) {
if ($bibfield eq "author" or $bibfield eq "editor") {
# format authors/editors
foreach my $authorformat (qw(fvlj vljf)) {
my @names = format_bib_authors($authorformat, $max_authors, "et al.", $bibentry->names($bibfield));
my $joined_names = join(", ", @names);
$joined_names =~ s/, et al/ et al/;
$bibfields{"${bibfield}:${authorformat}"} = $joined_names;
}
} elsif ($bibfield eq "collaboration") {
# format collaborations
my @names = format_bib_authors("l", $max_authors, "et al.", $bibentry->names($bibfield));
my $joined_names = join(", ", @names);
$joined_names =~ s/, et al/ et al/;
$bibfields{$bibfield} = $joined_names;
} else {
$bibfields{$bibfield} = remove_tex_markup($bibentry->get($bibfield));
}
}
# replace fields in format
# - ensure that field replacements do not introduce duplicate periods
foreach my $bibfield (keys %bibfields) {
my $bibfieldvalue = $bibfields{$bibfield};
my $bibfieldvalue_with_period = $bibfieldvalue;
$bibfieldvalue_with_period =~ s/\.*$/./;
$bibstr =~ s/%${bibfield}\./${bibfieldvalue_with_period}/g;
$bibstr =~ s/%${bibfield}/${bibfieldvalue}/g;
}
# remove unused fields within curly braces
$bibstr =~ s/{[^{}]*%[^{}]*((?:{[^{%}]*})?)}/$1/g;
$bibstr =~ s/[{}]//g;
# add to output string
$bibstring .= "\n$bibstr\n";
}
} else {
# write BibTeX entries
open(my $fh, "+<", \$bibstring);
write_bib_to_fh({
fh => $fh,
pdf_file => $pdf_file_comment ? "comment" : "none"
},
@bibentries);
close($fh);
}
# output BibTeX entries
if ($clipboard) {
Clipboard->copy_to_all_selections($bibstring);
printf STDERR "$Script: BibTeX metadata has been copied to the clipboard\n";
} else {
print "$bibstring";
}
exit 0;
( run in 1.406 second using v1.01-cache-2.11-cpan-2398b32b56e )