CPAN-InGit

 view release on metacpan or  search on metacpan

lib/CPAN/InGit/MirrorTree.pm  view on Meta::CPAN

package CPAN::InGit::MirrorTree;
our $VERSION = '0.003'; # VERSION
# ABSTRACT: Subclass of ArchiveTree which automatically mirrors files from upstream


use Carp;
use Scalar::Util 'refaddr', 'blessed';
use POSIX 'strftime';
use IO::Uncompress::Gunzip qw( gunzip $GunzipError );
use JSON::PP;
use Time::Piece;
use Log::Any '$log';
use Moo;
use v5.36;

extends 'CPAN::InGit::ArchiveTree';


has upstream_url            => ( is => 'rw', coerce => \&_add_trailing_slash );
has upstream_backup_url     => ( is => 'rw', lazy => 1, builder => 1, coerce => \&_add_trailing_slash );
has autofetch               => ( is => 'rw', default => 1 );
has package_details_max_age => ( is => 'rw', default => 86400 );

sub _build_upstream_backup_url($self) {
   ($self->upstream_url||'') =~ m{^(https?)://www\.cpan\.org}
      ? "$1://backpan.perl.org/"
      : undef;
}

sub _add_trailing_slash {
   my $x= shift;
   defined $x? $x =~ s{/?\z}{/}r : $x
}

sub _pack_config($self, $config) {
   $config->{upstream_url}= $self->upstream_url;
   $config->{upstream_backup_url}= $self->upstream_backup_url;
   $config->{autofetch}= $self->autofetch;
   $config->{package_details_max_age}= $self->package_details_max_age;
   $self->next::method($config);
}
sub _unpack_config($self, $config) {
   $self->next::method($config);
   $self->upstream_url($config->{upstream_url});
   $self->upstream_backup_url($config->{upstream_backup_url})
      if exists $config->{upstream_backup_url};
   $self->autofetch($config->{autofetch});
   $self->package_details_max_age($config->{package_details_max_age});
}

sub get_path($self, $path) {
   my $ent= $self->next::method($path);
   if ($self->autofetch) {
      # Special case for 02packages.details.txt, load it if missing or if cache is stale
      if ($path eq 'modules/02packages.details.txt') {
         if ($ent) {
            my $blob_last_update= $self->{_blob_last_update}{$ent->[0]->id} // do {
               # parse it out of the file
               my $head= substr($ent->[0]->content, 0, 10000);
               $head =~ /^Last-Updated:\s*(.*)$/m or die "Can't parse 02packages.details.txt";
               (my $date= $1) =~ s/\s+\z//;
               $log->debug("Date in modules/02packages.details.txt is '$date'");
               Time::Piece->strptime($date, "%a, %d %b %Y %H:%M:%S GMT")->epoch
            };
            if ($blob_last_update >= time - $self->package_details_max_age) {
               $log->trace(' 02package.details.txt cache is current');
            } else {
               $log->trace(' 02package.details.txt cache expired');
               $ent= undef;
            }
         }
         unless ($ent) {
            $log->debug(" mirror autofetch $path");
            my $blob= $self->add_upstream_package_details;
            $self->clear_package_details; # will lazily rebuild
            $ent= [ $blob, 0100644 ];
         }
      }
      elsif ($path =~ m{^authors/id/(.*)} and !$ent) {
         $log->debug(" mirror autofetch $path");
         my $author_path= $1;
         my $blob= $self->add_upstream_author_file($author_path, undef_if_404 => 1);
         $ent= [ $blob, 0100644 ] if $blob;
      }
   }
   return $ent;
}


sub fetch_upstream_file($self, $path, %options) {
   croak "No upstream URL for this tree"
      unless defined $self->upstream_url;
   my $url= $self->upstream_url . $path;
   my $tx= $self->parent->useragent->get($url);
   $log->debugf(" GET %s -> %s %s", $url, $tx->result->code, $tx->result->message);
   unless ($tx->result->is_success) {
      if ($self->upstream_backup_url && $path =~ m{^authors/id/}) {
         my $url2= $self->upstream_backup_url . $path;
         my $tx2= $self->parent->useragent->get($url2);
         $log->debugf(" GET %s -> %s %s", $url2, $tx2->result->code, $tx2->result->message);
         return \$tx2->result->body
            if $tx2->result->is_success;
      }
      return undef if $options{undef_if_404} && $tx->result->code == 404;
      croak "Failed to find file upstream: ".$tx->result->message;
   }
   return \$tx->result->body;
}


sub add_upstream_package_details($self, %options) {
   my $content_ref= $self->fetch_upstream_file('modules/02packages.details.txt.gz', %options)
      or return undef;
   # Unzip the file and store uncompressed, so that 'git diff' works nicely on it.
   my $txt;
   gunzip $content_ref => \$txt
      or croak "gunzip failed: $GunzipError";
   my $blob= Git::Raw::Blob->create($self->git_repo, $txt);
   $self->set_path('modules/02packages.details.txt', $blob);
   $self->{_blob_last_update}{$blob->id}= time;
   return $blob;
}


sub add_upstream_author_file($self, $author_path, %options) {
   my $path= "authors/id/$author_path";
   my $content_ref= $self->fetch_upstream_file($path, %options)
      or return undef;
   my $blob= Git::Raw::Blob->create($self->git_repo, $$content_ref);
   $self->set_path($path, $blob);
   return $blob;
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

CPAN::InGit::MirrorTree - Subclass of ArchiveTree which automatically mirrors files from upstream

=head1 DESCRIPTION

This is a subclass of L<CPAN::InGit::ArchiveTree> which behaves as a pure mirror of an
upstream CPAN or DarkPAN.  The attribute L</autofetch> allows it to import files from the public
CPAN on demand.

=head1 ATTRIBUTES

=head2 upstream_url

This is the base URL from which files will be fetched.

=head2 upstream_backup_url

This is a fallback URL for if the primary URL lacks a distribution file.  The backup url is
presumed to have the exact same distribution files as the primary URL, but a longer history of
them.  The package index of the backup URL is never used.

If the primary URL is C<< http://www.cpan.org >> then this will default to
C<< https://backpan.perl.org >>.

=head2 autofetch

If enabled, attempts to access author files which exist on the L</upstream_url> and not locally
will immediately go download the file and return it as if it had existed all along.  These
changes are not automatically committed.  Use C<has_changes> to see if anything needs committed.

=head2 package_details_max_age

Number of seconds to cache the package_details file before attempting to re-fetch it.
Defaults to one day (86400).  This only has an effect when C<autofetch> is enabled.

=head1 METHODS

=head2 fetch_upstream_file

  $content= $mirror->fetch_upstream_file($path, %options);
  
  # %options:
  #   undef_if_404 - boolean, return undef instead of croaking on a 404 error

=head2 add_upstream_package_details

  $blob= $mirror->add_upstream_package_details;

Fetches C<modules/02packages.details.txt.gz> from upstream, unzips it, adds it to the tree,
and returns the C<Git::Raw::BLOB>.

=head2 add_upstream_author_file

  $blob= $mirror->add_upstream_author_file($author_path, %options);

Fetch the file (relative to C<authors/id/>) from upstream and add it to this tree.
Also return the C<Git::Raw::BLOB>.

=head1 VERSION

version 0.003

=head1 AUTHOR

Michael Conrad <mike@nrdvana.net>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2026 by Michael Conrad, and IntelliTree Solutions.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut



( run in 2.144 seconds using v1.01-cache-2.11-cpan-39bf76dae61 )