BOM results from the CPAN

MP3-Info

view release on metacpan or search on metacpan

				$pic_len = length($format) + 2;

				if ($pic_len < length($pic)) {

					my ($picture_type, $description) = unpack "x$pic_len C Z*", $pic;

					$pic_len += 1 + length($description) + 1;

					# skip extra terminating null if UTF-16 (encoding 1 or 2)
					if ( $encoding == 1 || $encoding == 2 ) { $pic_len++; }

					$valid_pic  = 1;
					$pic_format = $format;
				}
			}

			# Proceed if we have a valid picture.
			if ($valid_pic && $pic_format) {

				my ($data) = unpack("x$pic_len A*", $pic);

				if (length($data) && $pic_format) {

					$info->{$hash->{$id}} = {
						'DATA'   => $data,
						'FORMAT' => $pic_format,
					}
				}
			}

		} else {
			my $data1 = $v2->{$id};

			$data1 = [ $data1 ] if ref($data1) ne 'ARRAY';

			for my $data (@$data1) {
				# TODO : this should only be done for certain frames;
				# using RAW still gives you access, but we should be smarter
				# about how individual frame types are handled.  it's not
				# like the list is infinitely long.
				$data =~ s/^(.)//; # strip first char (text encoding)
				my $encoding = $1;
				my $desc;

				# Comments & Unsyncronized Lyrics have the same format.
				if ($id =~ /^(COM[M ]?|US?LT)$/) { # space for iTunes brokenness

					$data =~ s/^(?:...)//;		# strip language
				}

				# JRF: I believe this should probably only be applied to the text frames
				#      and not every single frame.
				if ($UNICODE) {

					if ($encoding eq "\001" || $encoding eq "\002") {  # UTF-16, UTF-16BE
						# text fields can be null-separated lists;
						# UTF-16 therefore needs special care
						#
						# foobar2000 encodes tags in UTF-16LE
						# (which is apparently illegal)
						# Encode dies on a bad BOM, so it is
						# probably wise to wrap it in an eval
						# anyway
						$data = eval { Encode::decode('utf16', $data) } || Encode::decode('utf16le', $data);

					} elsif ($encoding eq "\003") { # UTF-8

						# make sure string is UTF8, and set flag appropriately
						$data = Encode::decode('utf8', $data);

					} elsif ($encoding eq "\000") {

						# Only guess if it's not ascii.
						if ($data && $data !~ /^[\x00-\x7F]+$/) {

							if ($unicode_detect_module) {

								my $charset = Encode::Detect::Detector::detect($data) || 'iso-8859-1';
								my $enc     = Encode::find_encoding($charset);

								if ($enc) {
									$data = $enc->decode($data, 0);
								}

							} else {

								# Try and guess the encoding, otherwise just use latin1
								my $dec = Encode::Guess->guess($data);

								if (ref $dec) {
									$data = $dec->decode($data);
								} else {
									# Best try
									$data = Encode::decode('iso-8859-1', $data);
								}
							}
						}
					}

				} else {

					# If the string starts with an
					# UTF-16 little endian BOM, use a hack to
					# convert to ASCII per best-effort
					my $pat;
					if ($data =~ s/^\xFF\xFE//) {
						# strip additional BOMs as seen in COM(M?) and TXX(X?)
						$data = join ("",map { ( /^(..)$/ && ! /(\xFF\xFE)/ )? $_: "" } (split /(..)/, $data));
						$pat = 'v';
					} elsif ($data =~ s/^\xFE\xFF//) {
						# strip additional BOMs as seen in COM(M?) and TXX(X?)
						$data = join ("",map { ( /^(..)$/ && ! /(\xFF\xFE)/ )? $_: "" } (split /(..)/, $data));
						$pat = 'n';
					}

					if ($pat) {
						# strip additional 0s
						$data = join ("",map { ( /^(..)$/ && ! /(\x00\x00)/ )? $_: "" } (split /(..)/, $data));
						$data = pack 'C*', map {
							(chr =~ /[[:ascii:]]/ && chr =~ /[[:print:]]/)
								? $_
								: ord('?')
						} unpack "$pat*", $data;
					}
				}

				# We do this after decoding so we could be certain we're dealing
				# with 8-bit text.
				if ($id =~ /^(COM[M ]?|US?LT)$/) { # space for iTunes brokenness

					$data =~ s/^(.*?)\000//;	# strip up to first NULL(s),
									# for sub-comments (TODO:
									# handle all comment data)
					$desc = $1;

					if ($encoding eq "\001" || $encoding eq "\002") {

						$data =~ s/^\x{feff}//;
					}

				} elsif ($id =~ /^TCON?$/) {

					my ($index, $name);

					# Turn multiple nulls into a single.
					$data =~ s/\000+/\000/g;

					# Handle the ID3v2.x spec - 
					#
					# just an index number, possibly
					# paren enclosed - referer to the v1 genres.
					if ($data =~ /^ \(? (\d+) \)?\000?$/sx) {

						$index = $1;

					# Paren enclosed index with refinement.
					# (4)Eurodisco
					} elsif ($data =~ /^ \( (\d+) \)\000? ([^\(].+)$/x) {

						($index, $name) = ($1, $2);

					# List of indexes: (37)(38)
					} elsif ($data =~ /^ \( (\d+) \)\000?/x) {

						my @genres = ();

						while ($data =~ s/^ \( (\d+) \)//x) {

							# The indexes might have a refinement
							# not sure why one wouldn't just use
							# the proper genre in the first place..

Info.pm view on Meta::CPAN


						$data = $data->[0];
					}

				} elsif ($id =~ /^T...?$/ && $id ne 'TXXX') {
					
					# In ID3v2.4 there's a slight content change for text fields.
					#      They can contain multiple values which are nul terminated
					#      within the frame. We ONLY want to split these into multiple
					#      array values if they didn't request raw values (1).
					#        raw_v2 = 0 => parse simply
					#        raw_v2 = 1 => don't parse
					#        raw_v2 = 2 => do split into arrayrefs
					
					# Strip off any trailing NULs, which would indicate an empty
					# field and cause an array with no elements to be created.
					$data =~ s/\x00+$//;

					
					if ($data =~ /\x00/ && ($raw_v2 == 2 || $raw_v2 == 0))
					{
						# There are embedded nuls in the string, which means an ID3v2.4
						# multi-value frame. And they wanted arrays rather than simple
						# values.
						# Strings are already UTF-8, so any double nuls from 16 bit
						# characters will have already been reduced to single nuls.
						$data = [ split /\000/, $data ];
					}
				}

				if ($desc)
				{
					# It's a frame with a description, so we may need to construct a hash
					# for the data, rather than an array.
					if ($raw_v2 == 2) {

						$data = { $desc => $data };

					} elsif ($desc =~ /^iTun/) {

						# leave iTunes tags alone.
						$data = join(' ', $desc, $data);
					}
				}

				if ($raw_v2 == 2 && exists $info->{$hash->{$id}}) {

					if (ref $info->{$hash->{$id}} eq 'ARRAY') {
						push @{$info->{$hash->{$id}}}, $data;
					} else {
						$info->{$hash->{$id}} = [ $info->{$hash->{$id}}, $data ];
					}

				} else {

					# User defined frame
					if ($id eq 'TXXX') {

						my ($key, $val) = split(/\0/, $data);

						# Some programs - such as FB2K leave a UTF-16 BOM on the value
						if ($encoding eq "\001" || $encoding eq "\002") {

							$val =~ s/^\x{feff}//;
						}

						$info->{uc($key)} = $val;

					} elsif ($id eq 'PRIV') {

						my ($key, $val) = split(/\0/, $data);
						$info->{uc($key)} = unpack('v', $val);

					} else {

						my $key = $hash->{$id};

						# If we have multiple values
						# for the same key - turn them
						# into an array ref.
						if ($ver == 2 && $info->{$key} && !ref($info->{$key})) {

							if (ref($data) eq "ARRAY") {
							
								$info->{$key} = [ $info->{$key}, @$data ];
							} else {
							
								my $old = delete $info->{$key};
							
								@{$info->{$key}} = ($old, $data);
							}

						} elsif ($ver == 2 && ref($info->{$key}) eq 'ARRAY') {
							
							if (ref($data) eq "ARRAY") {

								push @{$info->{$key}}, @$data;

							} else {

								push @{$info->{$key}}, $data;
							}

						} else {

							$info->{$key} = $data;
						}
					}
				}
			}
		}
	}
}

sub _get_v2tag {
	my ($fh, $ver, $raw, $info, $start) = @_;
	my $eof;
	my $gotanyv2 = 0;

	# First we need to check the end of the file for any footer

( run in 1.965 second using v1.01-cache-2.11-cpan-600a1bdf6e4 )