RDFStore

 view release on metacpan or  search on metacpan

lib/RDFStore/Parser/SiRPAC.pm  view on Meta::CPAN

						push @rdf_attlist, [$namespace,$suffix];
						push @rdf_attlist, $attlist[$n+1];
					} else {
						die rdfcroak($expat,"Unresolved namespace prefix '$prefix' for '$suffix'");
						};
				} else {
					if(	($attname eq 'resource') 	|| 
						($attname eq 'ID') 		|| 
						($attname eq 'about') 		|| 
						($attname eq 'aboutEach') 	|| 
						($attname eq 'bagID')		||
						($attname eq 'nodeID')		||
						($attname eq 'datatype')	||
						($attname eq 'parseType')	||
						($attname eq 'type') ) {

						my $msg = rdfwarn($expat,"Unqualified use of 'rdf:$attname' attribute has been deprecated - see http://www.w3.org/2000/03/rdf-tracking/#rdf-ns-prefix-confusion");
						push @{ $expat->{warnings} },$msg;
						warn $msg;

						#default to RDFMS
						$namespace = $RDFStore::Parser::SiRPAC::RDF_SYNTAX_NS;
					} else {
						die rdfcroak($expat,"Using property attribute '$attname' without a namespace is forbidden.")
							unless($parseLiteral);
                                        	};
					push @rdf_attlist, [$namespace,$attname];
					push @rdf_attlist, $attlist[$n+1];
					};
			} else {
				push @rdf_attlist, [$namespace,$attname];
				push @rdf_attlist, $attlist[$n+1];
				};

			$rdf_atts++
				if(	($namespace eq $RDFStore::Parser::SiRPAC::RDF_SYNTAX_NS) &&
					($attname ne 'nodeID') ); # see http://www.w3.org/TR/2003/PR-rdf-syntax-grammar-20031215/#section-Syntax-parsetype-resource
  			};
		};

	# If we have parseType="Literal" set earlier, this element
        # needs some additional attributes to make it stand-alone
        # piece of XML
	if($parseLiteral) {
		#ignored for the moment
		$newElement =  RDFStore::Parser::SiRPAC::Element->new($sNamespace,$tag,\@rdf_attlist, $expat->{SiRPAC}->{'xml:lang'}, $expat->{SiRPAC}->{'rdf:datatype'}, $expat->{SiRPAC}->{'rdfstore:context'});
	} else {
		#....and probably Expat has already something like this.....
		$newElement =  RDFStore::Parser::SiRPAC::Element->new($sNamespace,$tag,\@rdf_attlist, $expat->{SiRPAC}->{'xml:lang'}, $expat->{SiRPAC}->{'rdf:datatype'}, $expat->{SiRPAC}->{'rdfstore:context'});
	};

	$expat->{SiRPAC}->{EXPECT_Element} = $newElement
		if($setScanModeElement);

	my $sLiteralValue;
	if($expat->{SiRPAC}->{scanMode} ne 'SKIPPING') {

		# goes through the attributes of newElement to see
	 	# 1. if there are symbolic references to other nodes in the data model.
		# in which case they must be stored for later resolving with
		# resolveLater method (fix aboutEach on streaming!!!)
		# 2. if there is an identity attribute, it is registered using
		# registerResource or registerID method. 
	
       		my $sResource;
       		$sResource = getAttributeValue($expat,$newElement->{attlist}, $RDFStore::Parser::SiRPAC::RDFMS_resource);
		if (defined $sResource) {
       	 		$newElement->{sResource} = normalizeResourceIdentifier($expat,$sResource);
		} else {
       			$sResource = getAttributeValue($expat,$newElement->{attlist}, $RDFStore::Parser::SiRPAC::RDFMS_nodeID);
			if (defined $sResource) {
       	 			$sResource = 'rdf:nodeID:'.$sResource;
       	 			$newElement->{sResource} = $sResource;
				};
			};

		my $sAboutEach = getAttributeValue($expat,$newElement->{attlist},
                                $RDFStore::Parser::SiRPAC::RDFMS_aboutEach);
                $newElement->{sAboutEach} = $sAboutEach
                        if(defined $sAboutEach);

        	my $sAbout = getAttributeValue($expat,$newElement->{attlist}, $RDFStore::Parser::SiRPAC::RDFMS_about);
		my $bnode=0;
        	if(defined $sAbout) {
        		$newElement->{sAbout} = normalizeResourceIdentifier($expat,$sAbout);
		} else {
        		$sAbout = getAttributeValue($expat,$newElement->{attlist}, $RDFStore::Parser::SiRPAC::RDFMS_nodeID);
			if ( defined $sAbout ) {
        			$sAbout = 'rdf:nodeID:'.$sAbout;
        			$newElement->{sAbout} = $sAbout;
				$bnode=1;
				};
        		};

        	my $sBagID = getAttributeValue($expat,$newElement->{attlist},
				$RDFStore::Parser::SiRPAC::RDFMS_bagID);

        	if (defined $sBagID) {
        		$newElement->{sBagID} = normalizeResourceIdentifier($expat,$sBagID);
			$sBagID = $newElement->{sBagID};
        		};

        	my $sID = getAttributeValue($expat,$newElement->{attlist},
				$RDFStore::Parser::SiRPAC::RDFMS_ID);
        	if (defined $sID) {
        		$newElement->{sID} = normalizeResourceIdentifier($expat,'#'.$sID);
			$sID = $newElement->{sID};
        		};
		if(defined $sAboutEach) {
			#any idea how to support it? caching and backrefs??
			die rdfcroak($expat,"aboutEach is not supported on stream parsing ");
			};

		if(	(defined $sID) && 
			(defined $sAbout) &&
			(! $bnode) ) {
			die rdfcroak($expat,"A description block cannot use both 'ID' and 'about' attributes - see <a href=\"http://www.w3.org/TR/REC-rdf-syntax/#idAboutAttr\">[6.5]</a>");
			};

		# Check parseType
		$sLiteralValue = getAttributeValue($expat,$newElement->{attlist},

lib/RDFStore/Parser/SiRPAC.pm  view on Meta::CPAN

				attlist		=>	$attlist,
				children	=>	[],
				vTargets	=>	[],
				bDone		=>	0,
				isCollection	=>	0,
				#at this level is just because SiRPAC parsing struct is broken (wrong to propagate XML attribute on elements)
				'lang'		=>      $lang, #xml:lang
				'rdf:datatype'  =>	$datatype, #rdf:datatype
				'context'	=> 	$context #rdfstore:context
			};
		bless $self,$pkg;
	};

	# Returns the element's name: the namespace concatenated with the
	# tag when a namespace is defined, otherwise the bare tag.
	sub name {
		my ($self) = @_;

		my $ns = $self->{sNamespace};
		return $self->{tag}
			unless(defined $ns);

		return $ns.$self->{tag};
	};

	# Returns the element's tag without any namespace part.
	sub localName {
		my ($self) = @_;
		return $self->{tag};
	};

	# Returns the namespace stored on the element (undef when none is set).
	sub namespace {
		my ($self) = @_;
		return $self->{sNamespace};
	};
};

# RDFStore::Parser::SiRPAC::DataElement
#
# Subclass of RDFStore::Parser::SiRPAC::Element that carries a piece of
# character data instead of a named XML element. It has no namespace and
# no attribute list, and its name accessors return nothing.
package RDFStore::Parser::SiRPAC::DataElement;
{
	@RDFStore::Parser::SiRPAC::DataElement::ISA = qw( RDFStore::Parser::SiRPAC::Element );

	# new( $text, $parsetype, $lang, $datatype, $context )
	#
	#   $text      - the character data; stored verbatim in {sContent}
	#   $parsetype - true value forces {parse_type} to 1
	#   $lang, $datatype, $context - passed through to the parent constructor
	#
	# {parse_type} is 1 when $parsetype is true or when $datatype is the
	# rdf:XMLLiteral URI, otherwise 0. $datatype and $text are optional;
	# undefined values are handled without "uninitialized value" warnings.
	# Returns the new object blessed into $pkg.
	sub new {
		my ($pkg, $text, $parsetype, $lang, $datatype, $context) = @_;

#print STDERR "RDFStore::Parser::SiRPAC::DataElement::new( @_ ): ".(caller)[2]."\n";

		my $self = $pkg->SUPER::new(undef,$text,undef,$lang, $datatype, $context);

		delete $self->{sNamespace}; # we do not need it
		delete $self->{attlist}; # we do not need it

		# only compare the datatype when one was actually given
		my $is_xml_literal = (	defined $datatype and
					$datatype eq 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral' );

		$self->{'parse_type'} = ( $parsetype or $is_xml_literal ) ? 1 : 0; #Literal or Resource

		# an undefined text yields the same "[DATA: ]" tag as before, minus the warning
		$self->{tag} = "[DATA: " . ( defined $text ? $text : '' ) . "]";
		$self->{sContent} = $text; #instanceOf Data :-)
		bless $self,$pkg;
	};

	# A data element has no name, local name or namespace: these override
	# the parent accessors and return nothing.
	sub name { };
	sub localName { };
	sub namespace { };
};

1;
};

__END__

=head1 NAME

RDFStore::Parser::SiRPAC - This module implements a streaming RDF Parser as a direct implementation of XML::Parser::Expat(3)

=head1 SYNOPSIS

	use RDFStore::Parser::SiRPAC;
        use RDFStore::NodeFactory;
        my $p=new RDFStore::Parser::SiRPAC(
		ErrorContext => 2,
                Handlers        => {
                        Init    => sub { print "INIT\n"; },
                        Final   => sub { print "FINAL\n"; },
                        Assert  => sub { print "STATEMENT - @_\n"; }
                },
                NodeFactory     => new RDFStore::NodeFactory() );

	$p->parsefile('http://www.gils.net/bsr-gils.rdfs');
        $p->parsefile('http://www.gils.net/rdf/bsr-gils.rdfs');
        $p->parsefile('/some/where/my.rdf');
        $p->parsefile('file:/some/where/my.rdf');
	$p->parse(*STDIN); #parse stream but with *blocking* Expat (see the example below for non-blocking parsing using XML::Parser::ExpatNB)

	use RDFStore::Parser::SiRPAC;
        use RDFStore::NodeFactory;
	my $pstore=new RDFStore::Parser::SiRPAC(
                ErrorContext 	=> 2,
		Style           => 'RDFStore::Parser::Styles::RDFStore::Model',
                NodeFactory     => new RDFStore::NodeFactory(),
                style_options   =>      {
                                        persistent      =>      1,
                                        seevalues       =>      1,
                                        store_options         =>      { Name => '/tmp/test' }
                                }
        );
	my $rdfstore_model = $pstore->parsefile('http://www.gils.net/bsr-gils.rdfs');

	#using the expat non-blocking feature (generally for large XML streams) - see XML::Parser::Expat(3)
	my $rdfstore_stream_model = $pstore->parsestream(*STDIN);
	

=head1 DESCRIPTION

This module implements a Resource Description Framework (RDF) I<streaming> parser completely in 
Perl using the XML::Parser::Expat(3) module. The actual RDF parsing happens using an instance of XML::Parser::Expat with Namespaces option enabled and start/stop and char handlers set.
The RDF specific code is based on the modified version of SiRPAC of Sergey Melnik in Java; a lot of
changes and adaptations have been done to actually run it under Perl.
Expat options may be provided when the RDFStore::Parser::SiRPAC object is created. These options are then passed on to the Expat object on each parse call.

Exactly like XML::Parser(3) the behavior of the parser is controlled either by the Style entry elsewhere in this document and/or the Handlers entry elsewhere in this document options, or by the setHandlers entry elsewhere in this document method. The...

To see some examples about how to use it look at the sections below and in the samples and utils directory coming with this software distribution.

E.g.
	With RDFStore::Parser::SiRPAC you can easily write an rdfingest.pl script to do something like this:

	fetch -o - -q http://dmoz.org/rdf/content.rdf.u8.gz | \
		gunzip - | \
		sed -f dmoz.content.sed | rdfingest.pl - 

=head1 METHODS

=over 4

=item new

This is a class method, the constructor for RDFStore::Parser::SiRPAC. B<Options> are passed as keyword value
pairs. Recognized options are:

=over 4

=item * NodeFactory

This option is B<mandatory> to run the RDFStore::Parser::SiRPAC parser correctly and must contain a reference to an object of type RDFStore::NodeFactory(3). Such a reference is used during the RDF parsing to create resources, literal and statements t...
with the RDFStore package.

=item * Source

This option can be specified by the user to set a base URI to use for the generation of resource URIs during parsing. If this option is omitted the parser will try to generate a prefix for generated resources using the input filename or URL actually ...

=item * GenidNumber

Seed the counter for bNodes with the given value

=item * bCreateBags

Flag to generate a Bag for each Description element

=item * Style

This option provides an easy way to set a given style of parser. There is one sample Style module provided with the RDFStore::Parser::SiRPAC distribution called RDFStore::Parser::Styles::RDFStore::Model. Such a module uses the RDFStore::Model(3) to i...
Custom styles can be provided by giving a full package name containing
at least one '::'. This package should then have subs defined for each
handler it wishes to have installed. See L<"WRITE YOUR OWN PARSER"> below
for a discussion on how to build one.

=item * Handlers

When provided, this option should be an anonymous hash containing as
keys the type of handler and as values a sub reference to handle that
type of event. All the handlers get passed as their 1st parameter the
instance of Expat that is parsing the document. Further details on
handlers can be found in L<"HANDLERS">. Any handler set here
overrides the corresponding handler set with the Style option.

lib/RDFStore/Parser/SiRPAC.pm  view on Meta::CPAN


A more sophisticated solution is to write a complete Perl5 Style module for RDFStore::Parser::SiRPAC that
can be easily reused in your code. E.g. a perl script could use this piece of code:

	use RDFStore::Parser::SiRPAC;
	use RDFStore::Parser::SiRPAC::MyStyle;
	use RDFStore::NodeFactory;

	my $p=new RDFStore::Parser::SiRPAC(	Style => 'RDFStore::Parser::SiRPAC::MyStyle',
                			NodeFactory     => new RDFStore::NodeFactory() );
	$p->parsefile('http://www.gils.net/bsr-gils.rdfs');

The Style module itself could be stored in a file such as MyStyle.pm, like this:

	package RDFStore::Parser::SiRPAC::MyStyle;

	sub Init { print "INIT\n"; };
	sub Final { print "FINAL\n"; };
	sub Assert {
                print "ASSERT: ",
                                $_[1]->subject()->toString(),
                                $_[1]->predicate()->toString(),
                                $_[1]->object()->toString(), "\n";
	};
	sub Start_XML_Literal { print "STARTAG: ",$_[1],"\n"; };
	sub Stop_XML_Literal { print "ENDTAG: ",$_[1],"\n"; };
	sub Char_XML_Literal { print "UTF8 chrs: ",$_[1],"\n"; };

	1;

For a more complete and useful example see RDFStore::Parser::SiRPAC::RDFStore(3).


=head1 BUGS

This module implements most of the W3C RDF Recommendation, like its Java counterpart SiRPAC from the Stanford University Database Group by Sergey Melnik (see http://www-db.stanford.edu/~melnik/rdf/api.html).
This version conforms to the latest RDF API Draft of 2000-11-13. It does not yet support:

	* aboutEach

=head1 SEE ALSO

 RDFStore::Parser::SiRPAC(3), DBMS(3) and XML::Parser(3) XML::Parser::Expat(3)

 RDFStore::Model(3) RDFStore::NodeFactory(3)

 RDF Model and Syntax Specification - http://www.w3.org/TR/rdf-syntax-grammar/

 RDF Schema Specification 1.0 - http://www.w3.org/TR/rdf-schema/

 Benchmarking XML Parsers by Clark Cooper - http://www.xml.com/pub/Benchmark/article.html

 See also http://www.w3.org/RDF/Implementations/SiRPAC/SiRPAC-defects.html

 RDF::Parser(3) from http://www.pro-solutions.com

=head1 AUTHOR

	Alberto Reggiori <areggiori@webweaving.org>

	Sergey Melnik <melnik@db.stanford.edu> is the original author of the streaming version of SiRPAC in Java
	Clark Cooper is the author of the XML::Parser(3) module together with Larry Wall



( run in 2.085 seconds using v1.01-cache-2.11-cpan-437f7b0c052 )