Module-Generic

 view release on metacpan or  search on metacpan

scripts/gen_magic_json.pl  view on Meta::CPAN

#!/usr/bin/env perl
##----------------------------------------------------------------------------
## Module::Generic::File::Magic - scripts/gen_magic_json.pl
## Version v0.1.0
## Copyright(c) 2026 DEGUEST Pte. Ltd.
## Author: Jacques Deguest <jack@deguest.jp>
## Created  2026/03/08
## Modified 2026/03/08
##
## Converts a freedesktop.org shared-mime-info XML file into the compact JSON
## magic database used by Module::Generic::File::Magic's pure-Perl backend.
##
## Usage:
##   perl scripts/gen_magic_json.pl [XML_FILE] [OUT_FILE]
##
## Defaults:
##   XML_FILE : /usr/share/mime/packages/freedesktop.org.xml
##   OUT_FILE : lib/Module/Generic/File/magic.json
##
## Requirements:
##   XML::LibXML  (preferred)  OR  XML::Parser  OR  XML::Twig
##   JSON
##
## Output format:
##   A JSON array of objects, sorted by descending priority then MIME type:
##   [
##     {
##       "mime"     : "application/gzip",
##       "priority" : 80,
##       "matches"  : [
##         {
##           "offset" : 0,        int — start offset in the file
##           "range"  : 0,        int — scan [offset .. offset+range-1]; 0 = exact
##           "type"   : "string", string|byte|big16|big32|little16|little32|host16|host32
##           "bytes"  : "1f8b",   hex-encoded bytes to match
##           "mask"   : null,     hex-encoded AND mask or null
##           "and"    : [...]     optional sub-matches (all must pass)
##         }
##       ]
##     },
##     ...
##   ]
##----------------------------------------------------------------------------
use strict;
use warnings;
use File::Spec ();

my $XML_FILE = $ARGV[0] // '/usr/share/mime/packages/freedesktop.org.xml';
my $OUT_FILE = $ARGV[1] // File::Spec->catfile(
    File::Spec->curdir, 'lib', 'Module', 'Generic', 'File', 'magic.json'
);

die( "Cannot read XML file: $XML_FILE\n" ) unless( -r $XML_FILE );

my $NS = 'http://www.freedesktop.org/standards/shared-mime-info';

# NOTE: Load an XML parser — try in order of preference
my $parser_type = _detect_xml_parser();
die(
    "No XML parser found. Please install one of:\n"
    . "  cpanm XML::LibXML\n"
    . "  cpanm XML::Twig\n"
    . "  cpanm XML::Parser\n"
) unless( defined( $parser_type )) ;

printf( "Using XML parser: %s\n", $parser_type );
printf( "Parsing: %s\n", $XML_FILE );

my @entries = _parse_xml( $XML_FILE, $parser_type );

# Sort: highest priority first, then MIME type for determinism
@entries = sort{
    $b->{priority} <=> $a->{priority} || $a->{mime} cmp $b->{mime}
} @entries;

printf( "Generated %d magic entries.\n", scalar( @entries ) );

# NOTE: Write JSON
require JSON;
my $json = JSON->new->utf8->canonical(1)->pretty(1)->encode( \@entries );

open( my $fh, '>:raw', $OUT_FILE ) or
    die( "Cannot write $OUT_FILE: $!\n" );
print( $fh $json );
close( $fh );

printf( "Written to %s (%.1f KB)\n", $OUT_FILE, ( -s $OUT_FILE ) / 1024 );
exit(0);

# NOTE: Detect available XML parser
sub _detect_xml_parser
{
    for my $mod ( qw( XML::LibXML XML::Twig XML::Parser ) )
    {
        local $@;
        eval{ require $mod; 1 } and return( $mod );
    }
    return( undef );
}

# NOTE: Parse XML using the available parser
sub _parse_xml
{
    my( $file, $parser_type ) = @_;

    if( $parser_type eq 'XML::LibXML' )
    {
        return( _parse_with_libxml( $file ) );
    }
    else
    {
        # XML::Twig and XML::Parser: use a SAX-style approach via XML::Twig
        # or fall back to slurp + regex for XML::Parser
        return( _parse_with_twig_or_parser( $file, $parser_type ) );
    }



( run in 0.787 second using v1.01-cache-2.11-cpan-13bb782fe5a )