Module-Generic
view release on metacpan or search on metacpan
scripts/gen_magic_json.pl view on Meta::CPAN
#!/usr/bin/env perl
##----------------------------------------------------------------------------
## Module::Generic::File::Magic - scripts/gen_magic_json.pl
## Version v0.1.0
## Copyright(c) 2026 DEGUEST Pte. Ltd.
## Author: Jacques Deguest <jack@deguest.jp>
## Created 2026/03/08
## Modified 2026/03/08
##
## Converts a freedesktop.org shared-mime-info XML file into the compact JSON
## magic database used by Module::Generic::File::Magic's pure-Perl backend.
##
## Usage:
## perl scripts/gen_magic_json.pl [XML_FILE] [OUT_FILE]
##
## Defaults:
## XML_FILE : /usr/share/mime/packages/freedesktop.org.xml
## OUT_FILE : lib/Module/Generic/File/magic.json
##
## Requirements:
## XML::LibXML (preferred) OR XML::Parser OR XML::Twig
## JSON
##
## Output format:
## A JSON array of objects, sorted by descending priority then MIME type:
## [
## {
## "mime" : "application/gzip",
## "priority" : 80,
## "matches" : [
## {
## "offset" : 0, int â start offset in the file
## "range" : 0, int â scan [offset .. offset+range-1]; 0 = exact
## "type" : "string", string|byte|big16|big32|little16|little32|host16|host32
## "bytes" : "1f8b", hex-encoded bytes to match
## "mask" : null, hex-encoded AND mask or null
## "and" : [...] optional sub-matches (all must pass)
## }
## ]
## },
## ...
## ]
##----------------------------------------------------------------------------
use strict;
use warnings;
use File::Spec ();
my $XML_FILE = $ARGV[0] // '/usr/share/mime/packages/freedesktop.org.xml';
my $OUT_FILE = $ARGV[1] // File::Spec->catfile(
File::Spec->curdir, 'lib', 'Module', 'Generic', 'File', 'magic.json'
);
die( "Cannot read XML file: $XML_FILE\n" ) unless( -r $XML_FILE );
my $NS = 'http://www.freedesktop.org/standards/shared-mime-info';
# NOTE: Load an XML parser â try in order of preference
my $parser_type = _detect_xml_parser();
die(
"No XML parser found. Please install one of:\n"
. " cpanm XML::LibXML\n"
. " cpanm XML::Twig\n"
. " cpanm XML::Parser\n"
) unless( defined( $parser_type )) ;
printf( "Using XML parser: %s\n", $parser_type );
printf( "Parsing: %s\n", $XML_FILE );
my @entries = _parse_xml( $XML_FILE, $parser_type );
# Sort: highest priority first, then MIME type for determinism
@entries = sort{
$b->{priority} <=> $a->{priority} || $a->{mime} cmp $b->{mime}
} @entries;
printf( "Generated %d magic entries.\n", scalar( @entries ) );
# NOTE: Write JSON
require JSON;
my $json = JSON->new->utf8->canonical(1)->pretty(1)->encode( \@entries );
open( my $fh, '>:raw', $OUT_FILE ) or
die( "Cannot write $OUT_FILE: $!\n" );
print( $fh $json );
close( $fh );
printf( "Written to %s (%.1f KB)\n", $OUT_FILE, ( -s $OUT_FILE ) / 1024 );
exit(0);
# NOTE: Detect available XML parser
sub _detect_xml_parser
{
for my $mod ( qw( XML::LibXML XML::Twig XML::Parser ) )
{
local $@;
eval{ require $mod; 1 } and return( $mod );
}
return( undef );
}
# NOTE: Parse XML using the available parser
sub _parse_xml
{
my( $file, $parser_type ) = @_;
if( $parser_type eq 'XML::LibXML' )
{
return( _parse_with_libxml( $file ) );
}
else
{
# XML::Twig and XML::Parser: use a SAX-style approach via XML::Twig
# or fall back to slurp + regex for XML::Parser
return( _parse_with_twig_or_parser( $file, $parser_type ) );
}
( run in 0.787 second using v1.01-cache-2.11-cpan-13bb782fe5a )