App-DrivePlayer
view release on metacpan or search on metacpan
lib/App/DrivePlayer/MetadataFetcher.pm view on Meta::CPAN
package App::DrivePlayer::MetadataFetcher;
use App::DrivePlayer::Setup;
use File::Temp qw( tempfile );
use HTTP::Tiny;
use JSON::PP qw( decode_json );
use URI::Escape qw( uri_escape_utf8 );
use Time::HiRes qw( sleep time usleep );
my $log = do { eval { require Log::Log4perl; Log::Log4perl->get_logger(__PACKAGE__) } };
Readonly my $USER_AGENT => 'DrivePlayer/1.0 (https://github.com/mvsjes2/drive_player)';
Readonly my $ITUNES_BASE => 'https://itunes.apple.com/search';
Readonly my $MB_BASE => 'https://musicbrainz.org/ws/2';
Readonly my $AID_BASE => 'https://api.acoustid.org/v2/lookup';
Readonly my $DRIVE_URL => 'https://www.googleapis.com/drive/v3/files/%s?alt=media';
Readonly my $MB_MIN_GAP => 1.1;
Readonly my $DOWNLOAD_MB => 5; # MB to download for fingerprinting
my $last_mb_req = 0;
has yield => (
is => 'ro',
isa => Maybe[CodeRef],
default => sub { undef },
);
has acoustid_key => (
is => 'ro',
isa => Str,
default => sub { '' },
);
has token_fn => (
is => 'ro',
isa => Maybe[CodeRef],
default => sub { undef },
);
has _fp_stage => (
is => 'rw',
default => sub { undef },
);
# Returns a short description of where the last fingerprint lookup stopped.
sub last_fp_stage { $_[0]->_fp_stage }
# ------------------------------------------------------------------
# Public: text-based lookup (iTunes -> MusicBrainz, with title cleaning)
# ------------------------------------------------------------------
sub fetch {
my ($self, %args) = @_;
my $title = $args{title} or return;
my $artist = _clean_field($args{artist} // '');
my $album = _clean_field($args{album} // '');
$log->debug("Text search: title='$title' artist='$artist'") if $log;
# 1. Try with original values
my $meta = $self->_fetch_itunes($title, $artist, $album)
// $self->_fetch_musicbrainz($title, $artist, $album);
if ($meta) {
$log->debug("Text search hit for '$title'") if $log;
return $meta;
}
lib/App/DrivePlayer/MetadataFetcher.pm view on Meta::CPAN
$self->_fp_stage('fetching MusicBrainz metadata');
$log->info("Fingerprint: AcoustID matched MusicBrainz recording $aid") if $log;
my $meta = $self->_fetch_musicbrainz_by_id($aid);
$self->_fp_stage($meta ? undef : 'MusicBrainz returned no data');
$log->info($meta ? "Fingerprint: metadata found for drive_id=$drive_id" : "Fingerprint: MusicBrainz returned no data for $aid") if $log;
return $meta;
}
# ------------------------------------------------------------------
# Title cleaning
# ------------------------------------------------------------------
sub _clean_field {
my ($s) = @_;
$s =~ s/_/ /g;
$s =~ s/\s+/ /g;
$s =~ s/^\s+|\s+$//g;
return $s;
}
sub _clean_title {
my ($t) = @_;
# Normalise underscores and whitespace before any pattern matching
$t =~ s/_/ /g;
$t =~ s/\s+/ /g;
$t =~ s/^\s+|\s+$//g;
# Strip leading track number: "01.", "01 -", "(01)", "[01]", "01 "
$t =~ s/^\s*[\(\[]?\d{1,3}[\)\]]?[\s.\-]+//;
# Strip feat/ft/featuring/with credits
$t =~ s/\s*[\(\[](feat|ft|featuring|with)\.?\s+[^\)\]]+[\)\]]//ig;
$t =~ s/\s+(feat|ft|featuring)\s+.+$//ig;
# Strip common parenthetical junk (order matters: specific before generic)
my @strip = (
qr/\s*[\(\[]\s*(?:\d{4}\s+)?(?:digital\s+)?remaster(?:ed)?(?:\s+\d{4})?\s*[\)\]]/i,
qr/\s*[\(\[]\s*live(?:\s+at\s+[^\)\]]+)?\s*[\)\]]/i,
qr/\s*[\(\[]\s*(?:official\s+)?(?:audio|video|lyric\s+video|music\s+video)\s*[\)\]]/i,
qr/\s*[\(\[]\s*(?:radio|single|album|extended|original|instrumental|acoustic|demo|mono|stereo)\s+(?:edit|version|mix|take)?\s*[\)\]]/i,
qr/\s*[\(\[]\s*(?:explicit|clean|censored)\s*[\)\]]/i,
qr/\s*[\(\[]\s*(?:hd|hq|\d+hz|\d+\s*hz|4k)\s*[\)\]]/i,
qr/\s*-\s*(?:single|ep|soundtrack)\s*$/i,
qr/\s*[\(\[]\s*(?:19|20)\d{2}\s*[\)\]]/,
);
for my $re (@strip) { $t =~ s/$re//g }
return _clean_field($t);
}
# ------------------------------------------------------------------
# iTunes
# ------------------------------------------------------------------
sub _fetch_itunes {
my ($self, $title, $artist, $album) = @_;
my $term = join(' ', grep { length } $artist, $title);
my $url = $ITUNES_BASE . '?term=' . uri_escape_utf8($term)
. '&entity=song&media=music&limit=5';
my $data = $self->_get_plain($url) or return;
my $results = $data->{results} or return;
return unless @$results;
my $best = _best_itunes_match($results, $title, $artist, $album);
return unless $best;
my %meta;
$meta{title} = $best->{trackName} if $best->{trackName};
$meta{artist} = $best->{artistName} if $best->{artistName};
$meta{album} = $best->{collectionName} if $best->{collectionName};
$meta{genre} = $best->{primaryGenreName} if $best->{primaryGenreName};
$meta{track_number} = $best->{trackNumber} if $best->{trackNumber};
($meta{year}) = ($best->{releaseDate} // '') =~ /^(\d{4})/;
return \%meta;
}
sub _best_itunes_match {
my ($results, $want_title, $want_artist, $want_album) = @_;
my $score = sub {
my ($r) = @_;
my $s = 0;
$s += 3 if $want_title && _fuzzy($r->{trackName}, $want_title);
$s += 2 if $want_artist && _fuzzy($r->{artistName}, $want_artist);
$s += 1 if $want_album && _fuzzy($r->{collectionName}, $want_album);
return $s;
};
my ($best) = sort { $score->($b) <=> $score->($a) } @$results;
return unless $want_title && _fuzzy($best->{trackName}, $want_title);
return $best;
}
sub _fuzzy {
my ($hay, $needle) = @_;
return unless defined $hay && defined $needle && length $needle;
return index(lc($hay), lc($needle)) >= 0
|| index(lc($needle), lc($hay)) >= 0;
}
# ------------------------------------------------------------------
# MusicBrainz (text search, fuzzy, progressive relaxation)
# ------------------------------------------------------------------
sub _fetch_musicbrainz {
my ($self, $title, $artist, $album) = @_;
my @attempts;
push @attempts, 'recording:' . _mb_q($title) . '~ AND artist:' . _mb_q($artist)
. '~ AND release:' . _mb_q($album) . '~'
if $artist && $album;
push @attempts, 'recording:' . _mb_q($title) . '~ AND artist:' . _mb_q($artist) . '~'
if $artist;
push @attempts, 'recording:' . _mb_q($title) . '~';
for my $query (@attempts) {
my $url = "$MB_BASE/recording?query=" . uri_escape_utf8($query)
. '&fmt=json&limit=5&inc=releases+artist-credits+tags+genres';
my $data = $self->_get_mb($url) or next;
my $recs = $data->{recordings} or next;
next unless @$recs;
my $meta = $self->_parse_mb_with_release($recs->[0]);
return $meta if $meta && %$meta;
}
return;
}
sub _fetch_musicbrainz_by_id {
my ($self, $mb_id) = @_;
my $url = "$MB_BASE/recording/$mb_id?fmt=json&inc=releases+artist-credits+tags+genres";
my $rec = $self->_get_mb($url) or return;
return $self->_parse_mb_with_release($rec);
}
sub _parse_mb_with_release {
my ($self, $rec) = @_;
my %meta = %{ _parse_mb($rec) };
# If we didn't get a genre from the recording, try the release-group level
if (!$meta{genre} && (my $rel = _best_mb_release($rec->{releases} // []))) {
my $rg_url = "$MB_BASE/release/$rel->{id}?fmt=json&inc=release-groups+genres+tags";
my $rd = $self->_get_mb($rg_url);
if ($rd) {
$meta{genre} //= _best_mb_genre($rd->{genres}, $rd->{tags});
if (!$meta{genre} && (my $rg = $rd->{'release-group'})) {
my $rg_full_url = "$MB_BASE/release-group/$rg->{id}?fmt=json&inc=genres+tags";
my $rgd = $self->_get_mb($rg_full_url);
$meta{genre} //= _best_mb_genre($rgd->{genres}, $rgd->{tags}) if $rgd;
}
}
}
return \%meta;
}
sub _parse_mb {
my ($rec) = @_;
my %meta;
$meta{title} = $rec->{title} if $rec->{title};
if (my $credits = $rec->{'artist-credit'}) {
$meta{artist} = join(', ',
map { $_->{name} // $_->{artist}{name} // () }
grep { ref $_ eq 'HASH' } @$credits
);
}
if (my $rel = _best_mb_release($rec->{releases} // [])) {
$meta{album} = $rel->{title};
($meta{year}) = ($rel->{date} // '') =~ /^(\d{4})/;
}
$meta{genre} = _best_mb_genre($rec->{genres}, $rec->{tags});
return \%meta;
}
# Prefer MB's curated genres over folksonomy tags; pick the highest-count entry.
sub _best_mb_genre {
my ($genres, $tags) = @_;
if ($genres && @$genres) {
my ($top) = sort { $b->{count} <=> $a->{count} } @$genres;
lib/App/DrivePlayer/MetadataFetcher.pm view on Meta::CPAN
$meta{title} = $tags{title} if $tags{title};
$meta{artist} = $tags{artist} if $tags{artist};
$meta{album} = $tags{album} if $tags{album};
$meta{genre} = $tags{genre} if $tags{genre};
$meta{comment} = $tags{comment} if $tags{comment};
if ($tags{tracknumber} && $tags{tracknumber} =~ /^(\d+)/) {
$meta{track_number} = $1 + 0;
}
if ($tags{date} && $tags{date} =~ /^(\d{4})/) {
$meta{year} = $1 + 0;
}
# Duration from FLAC stream info block
my $info = $flac->info();
if ($info && $info->{TOTALSAMPLES} && $info->{SAMPLERATE}) {
$meta{duration_ms} = int($info->{TOTALSAMPLES} / $info->{SAMPLERATE} * 1000);
} elsif ($flac->{trackTotalLengthSeconds}) {
$meta{duration_ms} = int($flac->{trackTotalLengthSeconds} * 1000);
}
return %meta ? \%meta : undef;
}
# Probe a Drive file for duration (milliseconds) using ffprobe.
# Returns undef if ffprobe is unavailable or the probe fails.
sub probe_duration_ms {
my ($class, $drive_id, $token) = @_;
return unless ffprobe_available();
my $ffprobe = -x '/usr/bin/ffprobe' ? '/usr/bin/ffprobe' : 'ffprobe';
my $url = sprintf($DRIVE_URL, $drive_id);
my $header = "Authorization: $token\r\n";
my $out = q{};
my $pid = open(my $fh, '-|', $ffprobe,
'-v', 'error',
'-headers', $header,
'-show_entries', 'format=duration',
'-of', 'default=noprint_wrappers=1:nokey=1',
$url,
);
if ($pid) {
$out = do { local $/; <$fh> };
close $fh;
}
return unless $out =~ /^(\d+(?:\.\d+)?)/m;
return int($1 * 1000);
}
# Keep private alias for internal calls
sub _fpcalc_available { fpcalc_available() }
sub _download_partial {
my ($self, $drive_id, $max_bytes) = @_;
my $token = $self->token_fn->();
unless ($token) {
$log->warn("Fingerprint: no bearer token available") if $log;
return;
}
my $url = sprintf $DRIVE_URL, uri_escape_utf8($drive_id);
my $max = ($max_bytes // ($DOWNLOAD_MB * 1024 * 1024)) - 1;
my $ua = HTTP::Tiny->new(agent => $USER_AGENT, timeout => 30);
my ($fh, $tmpfile) = tempfile(SUFFIX => '.audio', UNLINK => 0);
binmode $fh;
my $res = $ua->request('GET', $url, {
headers => {
Authorization => $token,
Range => "bytes=0-$max",
},
data_callback => sub { print {$fh} $_[0] },
});
close $fh;
if ($res->{success} || $res->{status} == 206) {
$log->debug("Fingerprint: downloaded " . (-s $tmpfile) . " bytes for drive_id=$drive_id") if $log;
return $tmpfile;
}
$log->warn("Fingerprint: HTTP $res->{status} downloading drive_id=$drive_id: $res->{reason}") if $log;
unlink $tmpfile;
return;
}
sub _fingerprint {
my ($tmpfile) = @_;
my $fpcalc = -x '/usr/bin/fpcalc' ? '/usr/bin/fpcalc' : 'fpcalc';
my $json = qx($fpcalc -json -length 120 \Q$tmpfile\E 2>/dev/null);
return unless $json;
my $data = eval { decode_json($json) } or return;
return unless $data->{fingerprint} && $data->{duration};
return { fingerprint => $data->{fingerprint}, duration => int($data->{duration}) };
}
sub _query_acoustid {
my ($self, $fp) = @_;
my $url = $AID_BASE
. '?client=' . uri_escape_utf8($self->acoustid_key)
. '&meta=recordings+compress'
. '&duration=' . $fp->{duration}
. '&fingerprint=' . uri_escape_utf8($fp->{fingerprint});
my $data = $self->_get_plain($url) or return;
my $results = $data->{results} or return;
return unless @$results;
# Pick the result with the highest score
my ($best) = sort { $b->{score} <=> $a->{score} } @$results;
return unless $best->{score} && $best->{score} > 0.5;
my $recordings = $best->{recordings} or return;
return unless @$recordings;
return $recordings->[0]{id};
}
# ------------------------------------------------------------------
# HTTP helpers
# ------------------------------------------------------------------
sub _get_plain {
my ($self, $url) = @_;
my $ua = HTTP::Tiny->new(agent => $USER_AGENT, timeout => 5);
my $res = $ua->get($url);
return unless $res->{success};
return eval { decode_json($res->{content}) };
}
sub _get_mb {
my ($self, $url) = @_;
my $gap = $MB_MIN_GAP - (time() - $last_mb_req);
$self->_yield_sleep($gap) if $gap > 0;
$last_mb_req = time();
return $self->_get_plain($url);
}
sub _yield_sleep {
my ($self, $secs) = @_;
my $yield = $self->yield;
if ($yield) {
my $end = time() + $secs;
while (time() < $end) {
$yield->();
usleep(50_000);
}
} else {
sleep($secs);
}
}
1;
__END__
=head1 NAME
App::DrivePlayer::MetadataFetcher - Fetch track metadata from iTunes, MusicBrainz, and AcoustID
=head1 SYNOPSIS
use App::DrivePlayer::MetadataFetcher;
( run in 1.783 second using v1.01-cache-2.11-cpan-2398b32b56e )