App-PDFLibrarian

 view release on metacpan or  search on metacpan

bin/pdf-lbr-output-bib  view on Meta::CPAN

#!/usr/bin/env perl

# Copyright (C) 2016--2026 Karl Wette
#
# This file is part of App::PDFLibrarian.
#
# App::PDFLibrarian is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or (at
# your option) any later version.
#
# App::PDFLibrarian is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with App::PDFLibrarian. If not, see <http://www.gnu.org/licenses/>.

use strict;
use warnings;

use Carp;
use Clipboard;
use FindBin qw($Script);
use Getopt::Long qw(:config no_ignore_case);
use Pod::Usage;

FindBin::again();

use App::PDFLibrarian qw(%default_filter %default_output_text_format);
use App::PDFLibrarian::BibTeX qw(read_bib_from_pdf find_dup_bib_keys format_bib format_bib_authors write_bib_to_fh);
use App::PDFLibrarian::TitleAbbr qw(get_aas_macros abbr_iso4_title);
use App::PDFLibrarian::Util qw(get_file_list find_pdf_files remove_tex_markup);

=pod

=head1 NAME

B<pdf-lbr-output-bib> - Output BibTeX bibliographic metadata from PDF files.

=head1 SYNOPSIS

B<pdf-lbr-output-bib> B<--version>
B<pdf-lbr-output-bib> B<--help>|B<-h>

B<pdf-lbr-output-bib> [ B<--clipboard>|B<-c> ] [ B<--max-authors>|B<-m> I<count> [ B<--only-first-author>|B<-f> ] ] [ B<--filter>|B<-F> [I<type>B<:>]I<field>[B<?>I<iffield>|B<!>I<ifnotfield>...]B<=>I<spec> ... ] [ B<--no-default-filter>|B<-N> ] [ B<-...

... I<files>|I<directories> ... B<|> B<pdf-lbr-output-bib> ...

=head1 DESCRIPTION

B<pdf-lbr-output-bib> reads BibTeX bibliographic metadata embedded in PDF I<files> and/or any PDF files in I<directories>. If I<files>|I<directories> are not given on the command line, they are read from standard input, one per line.

The BibTeX metadata is then printed to standard output; if B<--clipboard> is given, it is instead copied to the clipboard.

=head1 OPTIONS

=over 4

=item B<--max-authors>|B<-m> I<count> [ B<--only-first-author>|B<-f> ]

If the number of authors is greater than I<count>, then:

=over 4

=item * If B<--only-first-author> is given, output only the first author, followed by "and others".

=item * Otherwise, output the first I<count> authors, followed by "and others".

=back

=item B<--filter>|B<-F> [I<type>B<:>]I<field>[B<?>I<iffield>|B<!>I<ifnotfield>...]B<=>I<spec> ... [ B<--no-default-filter>|B<-N> ]

Apply the filter I<spec> to the BibTeX I<field>. If given, I<type> applies filter only to BibTeX entries of that type, I<iffield> applies filter only to BibTeX entries where <iffield> is defined, and I<ifnotfield> applies filter only to BibTeX entrie...

=over 4

=item B<d>

Exclude I<field> from output.

=item B<=>I<value>

Set I<field> to I<value> in output.

=item B<s>B</>I<pattern>B</>I<replacement>[B</>I<pattern>B</>I<replacement>...]B</>

Replace each regular expression I<pattern> with I<replacement> in output.

=back

If no B<--filter> arguments are given, default filters given in the configuration file are applied (unless B<--no-default-filter> is given).

=item B<--abbreviate>|B<-a> I<scheme> ...

Abbreviate journal/series titles according to the given I<scheme>, applied in the order given on the command line. Available I<scheme>s:

=over 4

=item I<aas>

AAS macros for astronomy journals, used by the NASA Astrophysics Data System.

=item I<iso4>

ISO4 abbreviations using the ISSN List of Title Word Abbreviations.

=item I<iso4~>

Same as I<iso4> but separate words with tildes instead of spaces.

=back

=item B<--pdf-file-comment>|B<-P>

If given, output the PDF filename as a comment before each BibTeX entry; by default no such comment is output. (The PDF filename is never included in the BibTeX entry itself.)

=item B<--output-text-format>|B<-o> I<type>B<=>I<format>

Instead of outputting a BibTeX entry, output plain text, formatting entries of type I<type> with format I<format>. BibTeX I<field>s may be substituted into I<format> with the syntax I<%field>.

The I<author> and I<editor> fields must include one of the suffixes I<:fvlj> or I<:vljf> to indicate the citation style: I<author:fvlj> cites authors with initials then last name; I<author:vljf> cites authors with last name then initials.

Format text surrounded by curly braces is removed if it contains a I<%> from an unexpanded I<field>. Curly braces may be nested to define alternatives for missing fields, e.g. I<{DOI:%doi{URL:%url}}> provides a URL only if the DOI field is missing.

=item B<--output-text>|B<-O>

Instead of outputting a BibTeX entry, output plain text, formatting entries with formats given in the configuration file.

=back

=head1 PART OF

App::PDFLibrarian

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2016--2026 Karl Wette. Licensed under the GNU General Public License, version 3 or later.

=cut

# parse command-line options
# - $max_authors and $pdf_file_comment are given explicit defaults so that
#   they are always defined, even if the corresponding options are not given
my ($version, $help, $clipboard, $max_authors, $only_first_author, %filter, $no_default_filter, @abbreviate_schemes, $pdf_file_comment, %output_text_format, $output_text);
$max_authors = 0;
$pdf_file_comment = 0;
GetOptions(
           "version" => \$version,
           "help|h" => \$help,
           "clipboard|c" => \$clipboard,
           "max-authors|m=i" => \$max_authors,
           "only-first-author|f" => \$only_first_author,
           "filter|F=s" => \%filter,
           "no-default-filter|N" => \$no_default_filter,
           "abbreviate|a=s" => \@abbreviate_schemes,
           "pdf-file-comment|P" => \$pdf_file_comment,
           "output-text-format|o=s" => \%output_text_format,
           "output-text|O" => \$output_text,
          ) or croak "$Script: could not parse options";

# handle --version and --help
# - both print their message and exit with status 1 without reading any files
if ($version) { print "App::PDFLibrarian version $App::PDFLibrarian::VERSION\n"; exit 1; }
pod2usage(-verbose => 2, -exitval => 1) if ($help);

# validate --max-authors
# - zero is accepted (it is the default set above), so only negative counts
#   are rejected; the message therefore says "non-negative", not "positive"
# - NOTE(review): zero presumably means "no limit on authors" -- confirm
#   against format_bib_authors()
croak "$Script: --max-authors must be non-negative" if $max_authors < 0;

# merge in the default field filters, unless --no-default-filter is given
# - a filter given with --filter takes precedence over a default filter
#   with the same key
# - messages are written with print rather than printf, so that a '%' in the
#   script name can never be interpreted as a format directive
if ($no_default_filter) {
  print STDERR "$Script: using no default field filters\n";
} else {
  print STDERR "$Script: using default field filters from configuration file\n";
  foreach my $field (keys %default_filter) {
    next if defined($filter{$field});
    $filter{$field} = $default_filter{$field};
  }
}

# parse field filters
my %filterbibtype;
my %filterbibfield;
my %filteriffields;
my %filterifnotfields;
foreach my $field (keys %filter) {
  # split the filter key at ':', '?' and '!', keeping the separators as tokens
  my @tokens = split(/([:?!])/, $field);

  # parse [type:]field
  # - the key carries a type only when its first separator is a colon;
  #   in every other case the type is recorded as "."
  my $has_type = (@tokens != 1 and $tokens[1] eq ":");
  $filterbibtype{$field} = $has_type ? shift(@tokens) : ".";
  shift @tokens if $has_type;
  $filterbibfield{$field} = shift @tokens;

  # parse [?iffield|!ifnotfield]
  # - the remaining tokens are consumed in (separator, field name) pairs
  $filteriffields{$field} = [];
  $filterifnotfields{$field} = [];
  while (@tokens > 0) {
    my ($cond, $name) = splice(@tokens, 0, 2);
    my $conditions = $cond eq "?" ? $filteriffields{$field}
                   : $cond eq "!" ? $filterifnotfields{$field}
                   :                croak "$Script: unrecognised field condition '$cond'";
    push @{$conditions}, $name;
  }

bin/pdf-lbr-output-bib  view on Meta::CPAN


    # get BibTeX fields
    # - author/editor names are stored once per citation style, under the
    #   keys "<field>:fvlj" and "<field>:vljf"
    # - collaborations are stored under the plain field name
    # - any other field is stored with its TeX markup removed
    my $format_names = sub {
      my ($nameformat, $namefield) = @_;
      my @names = format_bib_authors($nameformat, $max_authors, "et al.", $bibentry->names($namefield));
      my $joined_names = join(", ", @names);
      # do not put a comma in front of "et al."
      $joined_names =~ s/, et al/ et al/;
      return $joined_names;
    };
    my %bibfields;
    foreach my $bibfield ($bibentry->fieldlist()) {
      if ($bibfield eq "author" or $bibfield eq "editor") {

        # format authors/editors
        foreach my $authorformat (qw(fvlj vljf)) {
          $bibfields{"${bibfield}:${authorformat}"} = $format_names->($authorformat, $bibfield);
        }

      } elsif ($bibfield eq "collaboration") {

        # format collaborations
        $bibfields{$bibfield} = $format_names->("l", $bibfield);

      } else {
        $bibfields{$bibfield} = remove_tex_markup($bibentry->get($bibfield));
      }
    }

    # replace fields in format
    # - all fields are replaced in a single pass, so text inserted for one
    #   field is never scanned again for the names of other fields
    # - longer field names are tried first, so a field whose name is a prefix
    #   of another field's name cannot clobber the longer placeholder
    # - field names are quoted, so they are always matched literally
    # - ensure that field replacements do not introduce duplicate periods
    # - a literal "%" in a field value is swapped for a NUL character until
    #   the brace processing below is done, so it is not mistaken for an
    #   unexpanded field
    my @fieldnames = sort { length($b) <=> length($a) or $a cmp $b } keys %bibfields;
    if (@fieldnames) {
      my $fieldnames_re = join("|", map { quotemeta($_) } @fieldnames);
      $bibstr =~ s{%($fieldnames_re)(\.?)}{
        my ($name, $period) = ($1, $2);
        my $value = $bibfields{$name};
        $value =~ s/\.*$/./ if $period ne "";
        $value =~ tr/%/\x00/;
        $value;
      }ge;
    }

    # remove unused fields within curly braces
    # - a brace group which still contains a "%" is dropped; a nested group
    #   at its end is kept if that nested group contains no "%"
    # - repeat until nothing changes, so that a group is also dropped when
    #   its nested alternative was itself dropped
    # - literal braces in the pattern are escaped, as recommended by perlre
    1 while $bibstr =~ s/\{[^{}]*%[^{}]*((?:\{[^{%}]*\})?)\}/$1/g;
    $bibstr =~ s/[{}]//g;

    # restore literal "%" characters from field values
    $bibstr =~ tr/\x00/%/;

    # add to output string
    $bibstring .= "\n$bibstr\n";

  }

} else {

  # write BibTeX entries
  # - the entries are written into $bibstring through an in-memory filehandle
  # - failures to open or close the filehandle are reported instead of being
  #   silently ignored
  # - the PDF filename is written as a comment only if --pdf-file-comment
  #   was given
  open(my $fh, "+<", \$bibstring) or croak "$Script: could not open in-memory filehandle: $!";
  write_bib_to_fh({
                   fh => $fh,
                   pdf_file => $pdf_file_comment ? "comment" : "none"
                  },
                  @bibentries);
  close($fh) or croak "$Script: could not close in-memory filehandle: $!";

}

# output BibTeX entries
# - with --clipboard the output string is copied to the clipboard and a
#   notice is written to standard error; otherwise the string is printed to
#   standard output
# - the notice is written with print rather than printf, so that a '%' in
#   the script name can never be interpreted as a format directive
if ($clipboard) {
  Clipboard->copy_to_all_selections($bibstring);
  print STDERR "$Script: BibTeX metadata has been copied to the clipboard\n";
} else {
  print $bibstring;
}

exit 0;



( run in 1.406 second using v1.01-cache-2.11-cpan-2398b32b56e )