App-FeedDeduplicator

 view release on metacpan or  search on metacpan

lib/App/FeedDeduplicator/Deduplicator.pm  view on Meta::CPAN

It is designed to be used in conjunction with the Aggregator and Publisher
classes to provide a complete feed deduplication and publishing solution.

=head2 find_canonical

Finds the canonical link for a given entry. It fetches the entry's link using
LWP::UserAgent and parses the HTML content using HTML::TreeBuilder::XPath.

It looks for the <link rel="canonical"> tag in the HTML content and returns
the canonical URL if found. If the canonical link is not found, it returns
undef.

It is used during the deduplication process to determine the unique
identifier for each entry.

=cut

package App::FeedDeduplicator::Deduplicator; # For MetaCPAN

use v5.40;
use feature 'class';
no warnings 'experimental::class';

class App::FeedDeduplicator::Deduplicator {
  use HTML::TreeBuilder::XPath;
  use LWP::UserAgent;
  use URI;

  field $entries :param;
  field $deduplicated :reader;
  field $ua :param;

  method deduplicate {
    my %seen;
    my @result;

    for my $entry (@$entries) {
      # warn ref($entry) . "\n" . ref($entry->{entry}) . "\n";
      my $canonical = $self->find_canonical($entry->{entry}) // '';
      my $title = $entry->{entry}->title // '';

      push @result, $entry
        unless ($canonical and $seen{$canonical})
            or ($title     and $seen{$title});

      ++$seen{$canonical} if $canonical;
      ++$seen{$title}     if $title;
    }

    $deduplicated = \@result;
  }

  method find_canonical ($entry) {
    my $link = $entry->link;
    return unless $link;

    my $response = $ua->get($link);
    return unless $response->is_success;

    my $tree = HTML::TreeBuilder::XPath->new_from_content(
      $response->decoded_content
    );
    my $node = $tree->findnodes('//link[@rel="canonical"]')->[0];

    return unless $node;
    return URI->new($node->attr('href'))->as_string;
  }
}

=head1 AUTHOR

Dave Cross <dave@perlhacks.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2025 Magnum Solutions Ltd.
All rights reserved.

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

See L<http://dev.perl.org/licenses/artistic.html> for more details.

=cut

1;



( run in 0.759 second using v1.01-cache-2.11-cpan-39bf76dae61 )