Redland
view release on metacpan or search on metacpan
redland/raptor/src/raptor_rss.c view on Meta::CPAN
/* -*- Mode: c; c-basic-offset: 2 -*-
*
* raptor_rss.c - Raptor RSS tag soup parser
*
* Copyright (C) 2003-2006, David Beckett http://purl.org/net/dajobe/
* Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/
*
* Contributions:
* Copyright (C) 2004-2005, Suzan Foster <su@islief.nl>
*
* This package is Free Software and part of Redland http://librdf.org/
*
* It is licensed under the following three licenses as alternatives:
* 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
* 2. GNU General Public License (GPL) V2 or any newer version
* 3. Apache License, V2.0 or any newer version
*
* You may not use this file except in compliance with at least one of
* the above three licenses.
*
* See LICENSE.html or LICENSE.txt at the top of this package for the
* complete terms and further detail along with the license texts for
* the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
*
*
*/
#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif
#ifdef WIN32
#include <win32_raptor_config.h>
#endif
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"
#include "raptor_rss.h"
/* local prototypes */
static void raptor_rss_insert_identifiers(raptor_parser* rdf_parser);
static void raptor_rss_uplift_items(raptor_parser* rdf_parser);
static int raptor_rss_emit(raptor_parser* rdf_parser);
static void raptor_rss_start_element_handler(void *user_data, raptor_xml_element* xml_element);
static void raptor_rss_end_element_handler(void *user_data, raptor_xml_element* xml_element);
static void raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s, int len);
static void raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s);
/*
 * RSS parser object
 *
 * Parser-private state.  The functions in this file obtain it by casting
 * rdf_parser->context to raptor_rss_parser*.
 */
struct raptor_rss_parser_s {
/* static model; set up by raptor_rss_model_init() in the init function
 * and released by raptor_rss_model_clear() in the terminate function */
raptor_rss_model model;
/* current line */
char *line;
/* current line length */
int line_length;
/* current char in line buffer */
int offset;
/* static statement for use in passing to user code */
raptor_statement statement;
/* SAX2 object that the element, cdata and comment handlers of this file
 * are registered on; created in the init function, freed in terminate */
raptor_sax2 *sax2;
/* rss node type of current item */
raptor_rss_type current_type;
/* one place stack: when an element ends with no current field and this
 * is not RAPTOR_RSS_NONE, it is moved back into current_type */
raptor_rss_type prev_type;
/* field of the element being read; reset to RAPTOR_RSS_FIELD_NONE by the
 * end element handler */
raptor_rss_fields_type current_field;
/* emptiness of current element */
int element_is_empty;
/* stack of namespaces; created with raptor_new_namespaces() in the init
 * function and freed in terminate */
raptor_namespace_stack *nstack;
/* non-0 if this is an atom 1.0 parser
 * NOTE(review): the start element handler sets this to 1 for a "feed"
 * element whose version attribute is "0.3" - confirm the intended
 * meaning of "1.0" here */
int is_atom;
};
/* Short name used throughout this file for the parser context */
typedef struct raptor_rss_parser_s raptor_rss_parser;
/*
 * Kind of content being collected for one XML element.
 *
 * The values are spelled out; they are the same ones the compiler would
 * assign implicitly (0, 1, 2).
 */
typedef enum {
  /* no content type chosen */
  RAPTOR_RSS_CONTENT_TYPE_NONE = 0,
  /* content is serialised through an XML writer */
  RAPTOR_RSS_CONTENT_TYPE_XML  = 1,
  /* content is text */
  RAPTOR_RSS_CONTENT_TYPE_TEXT = 2
} raptor_rss_content_type;
/*
 * Per-XML-element state.
 *
 * The handlers in this file read it from xml_element->user_data and the
 * end element handler releases it with raptor_free_rss_element().
 */
struct raptor_rss_element_s
{
/* URI attached to the element; released with raptor_free_uri() unless the
 * end element handler has moved it into a field (it then sets this to
 * NULL) */
raptor_uri* uri;
/* value compared against "alternate" by the end element handler;
 * released with raptor_free_memory() */
const unsigned char *rel;
/* Two types of content */
raptor_rss_content_type type;
/* 1) XML */
/* Writer that receives cdata, comments and end tags while set.  It is
 * only freed by raptor_free_rss_element() when type is
 * RAPTOR_RSS_CONTENT_TYPE_XML. */
raptor_xml_writer* xml_writer;
/* XML written to this iostream to the xml_content string */
raptor_iostream* iostream;
/* ends up here */
void *xml_content;
/* length that goes with xml_content */
size_t xml_content_length;
/* 2) cdata */
/* buffer the cdata handler appends to when there is no xml_writer */
raptor_stringbuffer* sb;
};
/* Short name used throughout this file for the per-element state */
typedef struct raptor_rss_element_s raptor_rss_element;
/*
 * raptor_free_rss_element - release one per-element state object
 * @rss_element: object to destroy; must not be NULL
 *
 * Frees every member that is set and then the object itself.  The XML
 * writer, its iostream and the generated XML string are released only
 * when the element's content type is RAPTOR_RSS_CONTENT_TYPE_XML; for
 * any other type those three members are left untouched.
 */
static void
raptor_free_rss_element(raptor_rss_element *rss_element)
{
  const int is_xml_content = (rss_element->type == RAPTOR_RSS_CONTENT_TYPE_XML);

  if(rss_element->uri) {
    raptor_free_uri(rss_element->uri);
  }

  if(rss_element->rel) {
    /* rel is declared const in the struct, hence the cast */
    raptor_free_memory((void*)rss_element->rel);
  }

  /* same order as before: writer, then iostream, then the XML string */
  if(is_xml_content && rss_element->xml_writer) {
    raptor_free_xml_writer(rss_element->xml_writer);
  }
  if(is_xml_content && rss_element->iostream) {
    raptor_free_iostream(rss_element->iostream);
  }
  if(is_xml_content && rss_element->xml_content) {
    raptor_free_memory(rss_element->xml_content);
  }

  if(rss_element->sb) {
    raptor_free_stringbuffer(rss_element->sb);
  }

  RAPTOR_FREE(raptor_rss_element, rss_element);
}
/*
 * raptor_rss_parse_init - set up the RSS parser context
 * @rdf_parser: parser whose context is a raptor_rss_parser
 * @name: not used by this function
 *
 * Initialises the shared RSS tables and the model, resets the type/field
 * state, builds the namespace stack plus one namespace object per entry
 * of raptor_rss_namespaces_info[] that has both a prefix and a URI, and
 * creates the SAX2 object with this file's handlers registered on it.
 *
 * NOTE(review): the results of raptor_new_namespaces() and
 * raptor_new_sax2() are used without a NULL check.
 *
 * Always returns 0.
 */
static int
raptor_rss_parse_init(raptor_parser* rdf_parser, const char *name)
{
  raptor_rss_parser* ctx;
  raptor_uri_handler *handler;
  void *handler_context;
  raptor_sax2* xml_parser;
  int idx;

  ctx=(raptor_rss_parser*)rdf_parser->context;

  raptor_rss_common_init();
  raptor_rss_model_init(&ctx->model);

  /* nothing is being parsed yet */
  ctx->current_type=RAPTOR_RSS_NONE;
  ctx->prev_type=RAPTOR_RSS_NONE;
  ctx->current_field=RAPTOR_RSS_FIELD_NONE;

  /* drop any SAX2 object that is already attached to this context */
  if(ctx->sax2) {
    raptor_free_sax2(ctx->sax2);
    ctx->sax2=NULL;
  }

  raptor_uri_get_handler(&handler, &handler_context);
  ctx->nstack=raptor_new_namespaces(handler, handler_context,
                                    NULL, NULL, /* errors */
                                    1);

  /* Initialise the namespaces: a slot without both prefix and URI gets
   * NULL */
  for(idx=0; idx < RAPTOR_RSS_NAMESPACES_SIZE; idx++) {
    const unsigned char* ns_prefix=(const unsigned char*)raptor_rss_namespaces_info[idx].prefix;
    raptor_uri* ns_uri=raptor_rss_namespaces_info[idx].uri;

    raptor_rss_namespaces_info[idx].nspace=
      (ns_prefix && ns_uri)
        ? raptor_new_namespace_from_uri(ctx->nstack, ns_prefix, ns_uri, 0)
        : NULL;
  }

  xml_parser=raptor_new_sax2(rdf_parser,
                             rdf_parser, raptor_parser_error_message_handler,
                             rdf_parser, raptor_parser_fatal_error_message_handler,
                             rdf_parser, raptor_parser_warning_message_handler);
  ctx->sax2=xml_parser;

  raptor_sax2_set_start_element_handler(xml_parser, raptor_rss_start_element_handler);
  raptor_sax2_set_end_element_handler(xml_parser, raptor_rss_end_element_handler);
  /* the same function handles both character data and CDATA sections */
  raptor_sax2_set_characters_handler(xml_parser, raptor_rss_cdata_handler);
  raptor_sax2_set_cdata_handler(xml_parser, raptor_rss_cdata_handler);
  raptor_sax2_set_comment_handler(xml_parser, raptor_rss_comment_handler);
  raptor_sax2_set_locator(xml_parser, &rdf_parser->locator);

  return 0;
}
/*
 * raptor_rss_parse_terminate - release everything the init function set up
 * @rdf_parser: parser whose context is a raptor_rss_parser
 *
 * Frees the SAX2 object, clears the model, frees the namespace objects
 * stored in raptor_rss_namespaces_info[] and the namespace stack, then
 * shuts down the shared RSS tables.
 *
 * Every pointer is set to NULL after it is freed.  The init function
 * frees a non-NULL rss_parser->sax2, and raptor_rss_namespaces_info[] is
 * not part of the parser context, so stale pointers left behind here
 * could be freed a second time by a later init or terminate.
 */
static void
raptor_rss_parse_terminate(raptor_parser *rdf_parser)
{
  raptor_rss_parser *rss_parser=(raptor_rss_parser*)rdf_parser->context;
  int n;

  if(rss_parser->sax2) {
    raptor_free_sax2(rss_parser->sax2);
    rss_parser->sax2=NULL;
  }

  raptor_rss_model_clear(&rss_parser->model);

  /* Free the namespaces and empty their slots in the table */
  for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) {
    if(raptor_rss_namespaces_info[n].nspace) {
      raptor_free_namespace(raptor_rss_namespaces_info[n].nspace);
      raptor_rss_namespaces_info[n].nspace=NULL;
    }
  }

  if(rss_parser->nstack) {
    raptor_free_namespaces(rss_parser->nstack);
    rss_parser->nstack=NULL;
  }

  raptor_rss_common_terminate();
}
/*
 * raptor_rss_parse_start - begin parsing a document
 * @rdf_parser: parser whose context is a raptor_rss_parser
 *
 * Passes the parser's RAPTOR_FEATURE_NO_NET setting on to the SAX2
 * object and starts it with the parser's base URI.
 *
 * Returns 1 when the parser has no base URI (nothing is started in that
 * case), otherwise 0.
 */
static int
raptor_rss_parse_start(raptor_parser *rdf_parser)
{
  raptor_rss_parser* ctx;
  raptor_uri *base;

  /* base URI required for RSS */
  base=rdf_parser->base_uri;
  if(base == NULL) {
    return 1;
  }

  ctx=(raptor_rss_parser*)rdf_parser->context;

  /* Optionally forbid network requests in the XML parser */
  raptor_sax2_set_feature(ctx->sax2,
                          RAPTOR_FEATURE_NO_NET,
                          rdf_parser->features[RAPTOR_FEATURE_NO_NET]);

  raptor_sax2_parse_start(ctx->sax2, base);

  return 0;
}
static void
raptor_rss_start_element_handler(void *user_data,
raptor_xml_element* xml_element)
{
raptor_parser *rdf_parser;
raptor_rss_parser *rss_parser;
redland/raptor/src/raptor_rss.c view on Meta::CPAN
RAPTOR_DEBUG2(" setting enclosure length %s\n", attrValue);
enclosure->length=(char*)RAPTOR_MALLOC(cstring, len+1);
strncpy(enclosure->length, (char*)attrValue, len+1);
}
} else if (!strcmp((const char*)attrName, "type")) {
if (!strcmp((const char*)name, "enclosure") && enclosure) {
size_t len=strlen((const char*)attrValue);
RAPTOR_DEBUG2(" setting enclosure type %s\n", attrValue);
enclosure->type=(char*)RAPTOR_MALLOC(cstring, len+1);
strncpy(enclosure->type, (char*)attrValue, len+1);
} else if(rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) {
/* do nothing with atom link attribute type */
} else if(rss_parser->is_atom) {
/* Atom only typing */
if (!strcmp((const char*)attrValue, "xhtml") ||
!strcmp((const char*)attrValue, "xml") ||
strstr((const char*)attrValue, "+xml")) {
raptor_uri_handler *uri_handler;
void *uri_context;
RAPTOR_DEBUG2(" found type '%s', making an XML writer\n",
attrValue);
raptor_uri_get_handler(&uri_handler, &uri_context);
rss_element->type=RAPTOR_RSS_CONTENT_TYPE_XML;
rss_element->iostream=raptor_new_iostream_to_string(&rss_element->xml_content, &rss_element->xml_content_length, raptor_alloc_memory);
rss_element->xml_writer=raptor_new_xml_writer(NULL,
uri_handler, uri_context,
rss_element->iostream,
(raptor_simple_message_handler)raptor_parser_simple_error, rdf_parser,
1);
raptor_xml_writer_set_feature(rss_element->xml_writer,
RAPTOR_FEATURE_WRITER_XML_DECLARATION, 0);
raptor_free_stringbuffer(rss_element->sb);
rss_element->sb=NULL;
}
}
} else if (!strcmp((const char*)attrName, "version")) {
if(!raptor_strcasecmp((const char*)name, "feed")) {
if(!strcmp((const char*)attrValue, "0.3"))
rss_parser->is_atom=1;
}
}
}
} /* if have attributes */
}
/*
 * raptor_rss_end_element_handler - SAX2 callback for the end of an element
 * @user_data: the raptor_parser (cast from void*)
 * @xml_element: element that is ending; its user_data is the
 *   raptor_rss_element for it, which this function frees before returning
 *
 * Steps:
 *  1. If the element has an XML writer but its content type is not XML,
 *     only the end tag is written to that writer.
 *  2. Otherwise the collected content (generated XML string or the cdata
 *     string buffer) is stored as a field on the current item: as a URI
 *     resolved against the in-scope base URI when the field is flagged
 *     RAPTOR_RSS_INFO_FLAG_URI_VALUE, else as a copied string.  Content
 *     met while the type or field is NONE/UNKNOWN is ignored.
 *  3. For an empty element that carries a URI, the URI is moved into a new
 *     field.
 *  4. The current field is reset, or if there was none the current type is
 *     popped back to prev_type (or to RAPTOR_RSS_NONE).
 *
 * Allocation failures cause the affected field to be dropped instead of
 * dereferencing NULL.
 */
static void
raptor_rss_end_element_handler(void *user_data,
                               raptor_xml_element* xml_element)
{
  raptor_parser* rdf_parser;
  raptor_rss_parser* rss_parser;
#ifdef RAPTOR_DEBUG
  const unsigned char* name=raptor_xml_element_get_name(xml_element)->local_name;
#endif
  raptor_rss_element* rss_element;
  size_t cdata_len=0;
  unsigned char* cdata=NULL;

  rss_element=(raptor_rss_element*)xml_element->user_data;
  rdf_parser=(raptor_parser*)user_data;
  rss_parser=(raptor_rss_parser*)rdf_parser->context;

  /* No per-element state: nothing to record and nothing to free.  This
   * test used to sit after the pointer had already been dereferenced. */
  if(!rss_element)
    return;

  if(rss_element->xml_writer) {
    if(rss_element->type != RAPTOR_RSS_CONTENT_TYPE_XML) {
      raptor_xml_writer_end_element(rss_element->xml_writer, xml_element);
      goto tidy_end_element;
    }

    /* otherwise we are done making XML */
    /* NOTE(review): presumably freeing the iostream is what completes
     * xml_content / xml_content_length - confirm */
    raptor_free_iostream(rss_element->iostream);
    rss_element->iostream=NULL;
    cdata=(unsigned char*)rss_element->xml_content;
    cdata_len=rss_element->xml_content_length;
  }

  if(rss_element->sb) {
    cdata_len=raptor_stringbuffer_length(rss_element->sb);
    cdata=raptor_stringbuffer_as_string(rss_element->sb);
  }

  if(cdata) {
    raptor_uri* base_uri=NULL;

    base_uri=raptor_sax2_inscope_base_uri(rss_parser->sax2);

    if((rss_parser->current_type==RAPTOR_RSS_NONE ||
        rss_parser->current_type==RAPTOR_RSS_UNKNOWN) ||
       (rss_parser->current_field==RAPTOR_RSS_FIELD_NONE ||
        rss_parser->current_field==RAPTOR_RSS_FIELD_UNKNOWN)) {
      unsigned char *p=cdata;
      size_t remaining;

      /* skip leading whitespace; the result only feeds the debug message */
      for(remaining=cdata_len; remaining>0 && *p; remaining--) {
        if(!isspace(*p))
          break;
        p++;
      }
      if(remaining>0 && *p) {
        RAPTOR_DEBUG4("IGNORING non-whitespace text '%s' inside type %s, field %s\n", cdata,
                      raptor_rss_types_info[rss_parser->current_type].name,
                      raptor_rss_fields_info[rss_parser->current_field].name);
      }
      goto tidy_end_element;
    }

    if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) {
      /* skipHours, skipDays common but IGNORED */
      RAPTOR_DEBUG2("Ignoring fields for type %s\n", raptor_rss_types_info[rss_parser->current_type].name);
    } else {
      raptor_rss_item* update_item;
      raptor_rss_field* field=raptor_rss_new_field();

      /* a failed field allocation just skips recording this value */
      if(field) {
        if(rss_parser->current_type == RAPTOR_RSS_ITEM)
          update_item=rss_parser->model.last;
        else
          update_item=raptor_rss_model_get_common(&rss_parser->model,
                                                  rss_parser->current_type);

        /* if value is always an uri, make it so */
        if(raptor_rss_fields_info[rss_parser->current_field].flags &
           RAPTOR_RSS_INFO_FLAG_URI_VALUE) {
          RAPTOR_DEBUG4("Added URI %s to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
          field->uri=raptor_new_uri_relative_to_base(base_uri, cdata);
        } else {
          RAPTOR_DEBUG4("Added text '%s' to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
          field->uri=NULL;
          field->value=(unsigned char*)RAPTOR_MALLOC(cstring, cdata_len+1);
          if(field->value) {
            /* length is known, so copy exactly that and terminate */
            memcpy(field->value, cdata, cdata_len);
            field->value[cdata_len]='\0';
          } else {
            /* out of memory: do not add a half-built field */
            raptor_rss_field_free(field);
            field=NULL;
          }
        }

        if(field) {
          RAPTOR_DEBUG1("fa3 - ");
          raptor_rss_item_add_field(update_item, rss_parser->current_field, field);
        }
      }
    }
  } /* end if contained cdata */

  if(raptor_xml_element_is_empty(xml_element)) {
    /* Empty element, so consider adding one of the attributes as
     * literal or URI content
     */
    if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) {
      /* skipHours, skipDays common but IGNORED */
      RAPTOR_DEBUG3("Ignoring empty element %s for type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
    } else if(rss_element->uri) {
      raptor_rss_item* update_item;
      raptor_rss_field* field=raptor_rss_new_field();

      if(field) {
        if(rss_parser->current_type == RAPTOR_RSS_ITEM)
          update_item=rss_parser->model.last;
        else
          update_item=raptor_rss_model_get_common(&rss_parser->model,
                                                  rss_parser->current_type);

        if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK &&
           rss_element->rel &&
           !strcmp((const char*)rss_element->rel, "alternate")) {
          /* RSS with rel != alternate ignored FIXME */
          /* NOTE(review): the test above matches rel == "alternate" while
           * the comment talks about rel != alternate - confirm which one
           * is intended.  The condition is left as it was. */
          /* the field is not added to any item, so release it here; it
           * used to be leaked */
          raptor_rss_field_free(field);
        } else if(rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN) {
          RAPTOR_DEBUG2("Cannot add URI from alternate attribute to type %s unknown field\n", raptor_rss_types_info[rss_parser->current_type].name);
          raptor_rss_field_free(field);
        } else {
          RAPTOR_DEBUG3("Added URI to field %s of type %s\n", raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
          /* the field takes over the URI; clear it on the element so the
           * tidy-up below does not free it */
          field->uri=rss_element->uri;
          rss_element->uri=NULL;
          RAPTOR_DEBUG1("fa2 - ");
          raptor_rss_item_add_field(update_item, rss_parser->current_field, field);
        }
      }
    }
  }

  if(rss_parser->current_type != RAPTOR_RSS_NONE) {
    if(rss_parser->current_field != RAPTOR_RSS_FIELD_NONE) {
      RAPTOR_DEBUG3("Ending element %s field %s\n", name, raptor_rss_fields_info[rss_parser->current_field].name);
      rss_parser->current_field= RAPTOR_RSS_FIELD_NONE;
    } else {
      RAPTOR_DEBUG3("Ending element %s type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
      if(rss_parser->prev_type != RAPTOR_RSS_NONE) {
        /* pop the one place stack */
        rss_parser->current_type=rss_parser->prev_type;
        rss_parser->prev_type=RAPTOR_RSS_NONE;
        RAPTOR_DEBUG3("Returning to type %d - %s\n", rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name);
      } else
        rss_parser->current_type= RAPTOR_RSS_NONE;
    }
  }

 tidy_end_element:
  /* rss_element is known to be non-NULL here */
  raptor_free_rss_element(rss_element);
}
/*
 * raptor_rss_cdata_handler - SAX2 callback for character data
 * @user_data: not used by this function
 * @xml_element: element the text belongs to; may be NULL
 * @s: the characters
 * @len: number of bytes at @s
 *
 * When the element has an XML writer the text is written to it;
 * otherwise it is appended to the element's string buffer.
 *
 * Calls without an element, without per-element state or without a
 * string buffer are ignored, in the same way the comment handler of this
 * file ignores a NULL element.
 */
static void
raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element,
                         const unsigned char *s, int len)
{
  raptor_rss_element* rss_element;

  if(!xml_element)
    return;

  rss_element=(raptor_rss_element*)xml_element->user_data;
  if(!rss_element)
    return;

  if(rss_element->xml_writer) {
    raptor_xml_writer_cdata_counted(rss_element->xml_writer, s, len);
    return;
  }

  /* the string buffer is released when the element switches to XML
   * content, so it is not guaranteed to be present */
  if(rss_element->sb)
    raptor_stringbuffer_append_counted_string(rss_element->sb, s, len, 1);
}
/*
 * raptor_rss_comment_handler - SAX2 callback for an XML comment
 * @user_data: not used by this function
 * @xml_element: element the comment appears in; may be NULL
 * @s: comment text
 *
 * The comment is passed to the element's XML writer when it has one.
 * Without an element, or without a writer, nothing happens.
 */
static void
raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element,
                           const unsigned char *s)
{
  raptor_rss_element* elem;

  if(xml_element) {
    elem=(raptor_rss_element*)xml_element->user_data;

    if(elem->xml_writer) {
      raptor_xml_writer_comment(elem->xml_writer, s);
    }
  }
}
/*
 * raptor_rss_insert_enclosure_identifiers - give an enclosure its identifier
 * @rdf_parser: parser, passed to raptor_generate_id() when an ID is needed
 * @enclosure: enclosure whose identifier and node_type are filled in
 *
 * An enclosure with a URL is identified by a copy of that URL; one
 * without gets a generated ID and is marked anonymous.  In both cases
 * node_type is set to the URI recorded for RAPTOR_RSS_ENCLOSURE in
 * raptor_rss_types_info[].
 */
static void
raptor_rss_insert_enclosure_identifiers(raptor_parser* rdf_parser,
                                        raptor_rss_enclosure *enclosure)
{
  raptor_identifier* ident;

  ident=&enclosure->identifier;

  if(!enclosure->url) {
    /* emit as blank node */
    ident->id=raptor_generate_id(rdf_parser, 0, NULL);
    ident->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
    ident->uri_source=RAPTOR_URI_SOURCE_GENERATED;
  } else {
    /* emit as URI resource */
    ident->uri=raptor_uri_copy(enclosure->url);
    ident->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
    ident->uri_source=RAPTOR_URI_SOURCE_URI;
  }

  enclosure->node_type=raptor_rss_types_info[RAPTOR_RSS_ENCLOSURE].uri;
}
static void
raptor_rss_insert_identifiers(raptor_parser* rdf_parser)
{
raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
int i;
raptor_rss_item* item;
for(i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) {
for(item=rss_parser->model.common[i]; item; item=item->next) {
raptor_identifier* identifier;
identifier=&(item->identifier);
if(!item->fields_count)
continue;
( run in 0.640 second using v1.01-cache-2.11-cpan-39bf76dae61 )