Syndication-NITF
view release on metacpan or search on metacpan
examples/NITFParser.pl view on Meta::CPAN
#!/usr/bin/perl
package NITFParser;
# not finished. It's still a good example of what you can do with the module, though.
use strict;
use NITF;
MAIN:
{
my $filename = $ARGV[0] || die "Usage: $0 <filename to parse>\n";
my $nitf = new Syndication::NITF($filename);
my $head = $nitf->gethead;
print "title is ".$head->gettitle->getText."\n";
&parseTobject($head->gettobject);
&parseDocData($head->getdocdata);
my $body = $nitf->getbody;
&parseBodyHead($body->getbodyhead);
foreach my $bodycontent ($body->getbodycontentList) {
&parseBodyContent($bodycontent);
}
&parseBodyEnd($body->getbodyend);
}
# "tobject" generally stores subject information about this document.
sub parseTobject {
my ($tobject) = @_;
print "Parsing tobject:\n";
print " tobject type: ".$tobject->gettobjecttype."\n";
my $i = 1;
foreach my $subject ($tobject->gettobjectsubjectList) {
print " subject ".$i.": ";
# 1st tier description
print $subject->gettobjectsubjecttype;
print "/";
# 2nd tier description
print $subject->gettobjectsubjectmatter;
print "/";
# 3rd tier description
print $subject->gettobjectsubjectdetail;
# refnum conforms to IPTC subject codes, and gives more information than the type/matter/detail
# on their own (ie the whole path rather than the node name)
# so we will soon embed a copy of the IPTC subject codes and add some routines to handle them.
print " (".$subject->gettobjectsubjectrefnum.")\n";
# Is the IPR (info provider) the person who provides the subject codes or the
# person who categorises the story? Defaults to "IPTC" so I guess it's the former
my $ipr = $subject->gettobjectsubjectipr;
print " info provider is ".$ipr."\n" if $ipr;
$i++;
}
}
# "docdata" contains general metadata about this document.
sub parseDocData {
my ($docdata) = @_;
print "Parsing docdata:\n";
# docdata could contain one or more of:
# fixture --
# date.issue --
# date.release --
# date.expire --
# doc-scope --
# series --
# ed-msg --
# du-key --
# doc.copyright --
# doc.rights --
# key-list --
# identified-content (zero or more) --
# correction -- this story is a correction of another
foreach my $correction ($docdata->getcorrectionList) {
print " Document is a correction\n";
print " This document's ID string is ".$correction->getidstring."\n";
print " It corrects document with ID ".$correction->getregsrc."\n";
print " Message: ".$correction->getinfo."\n";
}
# evloc -- event location data
foreach my $evloc ($docdata->getevlocList) {
print " Document event location data:\n";
print " City: ".$evloc->getcity."\n" if $evloc->getcity;
print " County/district: ".$evloc->getcountydist."\n" if $evloc->getcountydist;
print " ISO country code: ".$evloc->getisocc."\n" if $evloc->getisocc;
print " State/province: ".$evloc->getstateprov."\n" if $evloc->getstateprov;
}
# doc-id -- document identification data
foreach my $docid ($docdata->getdocidList) {
print " Document ID info:\n";
print " This document's ID string is ".$docid->getidstring."\n";
print " Source document ID is ".$docid->getregsrc."\n";
}
# del-list -- delivery trail (like Path: in SMTP mail, I guess)
foreach my $delitem ($docdata->getdellistList) {
print " Delivery list item:\n";
print " Level number is ".$delitem->getlevelnumber."\n" if $delitem->getlevelnumber;
print " Source name is ".$delitem->getsrcname."\n" if $delitem->getsrcname;
}
# urgency -- importance of news item (1=most, 5=normal, 8=least)
foreach my $urgency ($docdata->geturgencyList) {
print " Document urgency is ".$urgency->getedurg."\n";
}
# fixture -- a named document which is refreshed periodically (eg a columnist)
foreach my $fixture ($docdata->getfixtureList) {
print " Fixture ID is ".$fixture->getfixid."\n";
}
# date.issue -- the date of issue of the document (default is date of receipt).
foreach my $dateissue ($docdata->getdateissueList) {
print " Date of issue is ".$dateissue->getnorm."\n";
}
# date.release -- the date/time that the document can be released (default is date of receipt).
foreach my $daterelease ($docdata->getdatereleaseList) {
print " Release (embargo) date is ".$daterelease->getnorm."\n";
}
# date.expire -- the date/time at which the story expires (default is infinity).
foreach my $dateexpire ($docdata->getdateexpireList) {
print " Expiry date is ".$dateexpire->getnorm."\n";
}
# doc-scope -- area that the document covers (usually geographic region)
foreach my $docscope ($docdata->getdocscopeList) {
print " Document scope is ".$docscope->getscope."\n";
}
# series -- Identifies article as part of a series
foreach my $series ($docdata->getseriesList) {
print " Document is part of a series:\n";
print " Series name: ".$series->getseriesname."\n";
print " Part ".$series->getseriespart." of ".$series->getseriestotalpart."\n";
}
# ed-msg -- non-publishable editorial message about the news item
foreach my $edmsg ($docdata->getedmsgList) {
print " Editor's message: ";
print " Type: ".$edmsg->getmsgtype."\n";
print " Message: ".$edmsg->getinfo."\n";
}
# du-key -- Dynamic Use key: semi-unique ID generated by provider, attached to a story for all
# instances. Presumably used to update a story over time.
foreach my $dukey ($docdata->getdukeyList) {
print " Dynamic Use (du) key: ";
print " Key = ".$dukey->getkey."\n";
print " Generation = ".$dukey->getgeneration."\n";
print " Part = ".$dukey->getpart."\n";
print " Version = ".$dukey->getversion."\n";
}
# doc-copyright -- Copyright info: "should be consistent with information in the copyrite tag"
foreach my $doccopyright ($docdata->getdoccopyrightList) {
print " Document copyright info: ";
print " Holder = ".$doccopyright->getholder."\n";
print " Year = ".$doccopyright->getyear."\n";
}
# doc-rights -- Rights holder info:
# "should be consistent with information in the series of rights tags"
foreach my $docrights ($docdata->getdocrightsList) {
print " Document rights holder info: ";
print " Agent = ".$docrights->getagent."\n";
print " Code-source = ".$docrights->getcodesource."\n";
print " enddate = ".$docrights->getenddate."\n";
print " geography = ".$docrights->getgeography."\n";
print " limitations = ".$docrights->getlimitations."\n";
print " location-code = ".$docrights->getlocationcode."\n"; # "from standard list"
print " owner = ".$docrights->getowner."\n";
print " startdate = ".$docrights->getstartdate."\n";
print " type = ".$docrights->gettype."\n";
}
# key-list -- List of keywords
foreach my $keylist ($docdata->getkeylistList) {
print " Keyword list:\n";
foreach my $keyword ($keylist->getkeywordList) {
print " Keyword = ".$keyword->getkey."\n";
}
}
# identified-content -- Content identifiers for this document
foreach my $contentid ($docdata->getidentifiedcontentList) {
print " Content identifiers:\n";
foreach my $person ($contentid->getpersonList) {
&parsePerson($person);
}
foreach my $org ($contentid->getorgList) {
&parseOrg($org);
}
foreach my $location ($contentid->getlocationList) {
&parseLocation($location);
}
}
}
# the difference between <head> (above) and <body.head> (this stuff) is that the <body.head>
# metadata is intended to be seen by the reader in some form.
sub parseBodyHead {
my ($bodyhead) = @_;
print "body head:\n";
# parse Hedline [sic]
my $hedline = $bodyhead->gethedline;
if ($hedline) {
print " Headline:\n";
&parseEnrichedText($hedline->gethl1);
foreach my $hl2 ($hedline->gethl2List) {
print " subheadline:\n";
&parseEnrichedText($hl2);
}
}
# parse advisory notes ("potentially publishable")
print " Notes:\n";
foreach my $note ($bodyhead->getnoteList) {
print " type: ".$note->gettype."\n"; # defaulut "std" (standard)
print " class: ".$note->getnoteclass."\n"; # no default
# the content of this is actually body.content, so could be anything...
# as with hl1 & 2, we ignore any markup for now.
print " content:".$note->getAllText."\n"; # the whole content
}
# parse rights holder information
print " Rights:\n";
my $rights = $bodyhead->getrights;
if ($rights) {
print " rights: ".$rights->getText."\n"; # has PCDATA for display to the reader I guess
print " rights owner: ".$rights->getrightsowner->getText."\n";
print " rights owner contact: ".$rights->getrightsowner->getcontact."\n";
print " rights startdate: ".$rights->getrightsstartdate->getText."\n";
print " rights startdate normalised: ".$rights->getrightsstartdate->getnorm."\n";
print " rights enddate: ".$rights->getrightsenddate->getText."\n";
print " rights enddate normalised: ".$rights->getrightsenddate->getnorm."\n";
print " rights agent: ".$rights->getrightsagent->getText."\n";
print " rights agent contact: ".$rights->getrightsagent->getcontact."\n";
print " rights geography: ".$rights->getrightsgeography->getText."\n";
print " rights geography code: ".$rights->getrightsgeography->getlocationcode."\n";
print " rights geography code source: ".$rights->getrightsgeography->getcodesource."\n";
# no controlled vocabulary for this, a bit silly methinks...
print " rights type: ".$rights->getrightstype->getText."\n";
# no controlled vocabulary for this, a bit silly methinks...
print " rights limitations: ".$rights->getrightslimitations->getText."\n";
}
# parse byline/s
( run in 1.324 second using v1.01-cache-2.11-cpan-39bf76dae61 )