Redland

 view release on metacpan or  search on metacpan

redland/raptor/src/raptor_rss.c  view on Meta::CPAN

/* -*- Mode: c; c-basic-offset: 2 -*-
 *
 * raptor_rss.c - Raptor RSS tag soup parser
 *
 * Copyright (C) 2003-2006, David Beckett http://purl.org/net/dajobe/
 * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/
 * 
 * Contributions:
 *   Copyright (C) 2004-2005, Suzan Foster <su@islief.nl>
 *
 * This package is Free Software and part of Redland http://librdf.org/
 * 
 * It is licensed under the following three licenses as alternatives:
 *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
 *   2. GNU General Public License (GPL) V2 or any newer version
 *   3. Apache License, V2.0 or any newer version
 * 
 * You may not use this file except in compliance with at least one of
 * the above three licenses.
 * 
 * See LICENSE.html or LICENSE.txt at the top of this package for the
 * complete terms and further detail along with the license texts for
 * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
 * 
 * 
 */

#ifdef HAVE_CONFIG_H
#include <raptor_config.h>
#endif

#ifdef WIN32
#include <win32_raptor_config.h>
#endif

#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdarg.h>
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif


/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"
#include "raptor_rss.h"


/* local prototypes */
static void raptor_rss_insert_identifiers(raptor_parser* rdf_parser);
static void raptor_rss_uplift_items(raptor_parser* rdf_parser);
static int raptor_rss_emit(raptor_parser* rdf_parser);

static void raptor_rss_start_element_handler(void *user_data, raptor_xml_element* xml_element);
static void raptor_rss_end_element_handler(void *user_data, raptor_xml_element* xml_element);
static void raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s, int len);
static void raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element, const unsigned char *s);

/*
 * RSS parser object
 */
struct raptor_rss_parser_s {
  /* static model */
  raptor_rss_model model;
  
  /* current line */
  char *line;
  /* current line length */
  int line_length;
  /* current char in line buffer */
  int offset;
  
  /* static statement for use in passing to user code */
  raptor_statement statement;

  raptor_sax2 *sax2;

  /* rss node type of current item */
  raptor_rss_type current_type;

  /* one place stack */
  raptor_rss_type prev_type;
  raptor_rss_fields_type current_field;

  /* emptyness of current element */
  int element_is_empty;

  /* stack of namespaces */
  raptor_namespace_stack *nstack;

  /* non-0 if this is an atom 1.0 parser */
  int is_atom;
};

typedef struct raptor_rss_parser_s raptor_rss_parser;


typedef enum {
  RAPTOR_RSS_CONTENT_TYPE_NONE,
  RAPTOR_RSS_CONTENT_TYPE_XML,
  RAPTOR_RSS_CONTENT_TYPE_TEXT
} raptor_rss_content_type;


struct raptor_rss_element_s
{
  raptor_uri* uri;
  const unsigned char *rel;

  /* Two types of content */
  raptor_rss_content_type type;

  /* 1) XML */
  raptor_xml_writer* xml_writer;
  /* XML written to this iostream to the xml_content string */
  raptor_iostream* iostream;
  /* ends up here */
  void *xml_content;
  size_t xml_content_length;

  /* 2) cdata */
  raptor_stringbuffer* sb;
};

typedef struct raptor_rss_element_s raptor_rss_element;


static void
raptor_free_rss_element(raptor_rss_element *rss_element)
{
  if(rss_element->uri)
    raptor_free_uri(rss_element->uri);
  if(rss_element->rel)
    raptor_free_memory((void*)rss_element->rel);
  if(rss_element->type == RAPTOR_RSS_CONTENT_TYPE_XML) {
    if(rss_element->xml_writer)
      raptor_free_xml_writer(rss_element->xml_writer);
    if(rss_element->iostream)
      raptor_free_iostream(rss_element->iostream);
    if(rss_element->xml_content)
      raptor_free_memory(rss_element->xml_content);
  }
  if(rss_element->sb)
    raptor_free_stringbuffer(rss_element->sb);

  RAPTOR_FREE(raptor_rss_element, rss_element);
}


static int
raptor_rss_parse_init(raptor_parser* rdf_parser, const char *name)
{
  raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
  raptor_sax2* sax2;
  raptor_uri_handler *uri_handler;
  void *uri_context;
  int n;

  raptor_rss_common_init();

  raptor_rss_model_init(&rss_parser->model);

  rss_parser->prev_type=RAPTOR_RSS_NONE;
  rss_parser->current_field=RAPTOR_RSS_FIELD_NONE;
  rss_parser->current_type=RAPTOR_RSS_NONE;

  if(rss_parser->sax2) {
    raptor_free_sax2(rss_parser->sax2);
    rss_parser->sax2=NULL;
  }

  raptor_uri_get_handler(&uri_handler, &uri_context);

  rss_parser->nstack=raptor_new_namespaces(uri_handler, uri_context,
                                           NULL, NULL, /* errors */
                                           1);

  /* Initialise the namespaces */
  for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) {
    unsigned const char* prefix=(unsigned const char*)raptor_rss_namespaces_info[n].prefix;
    raptor_uri* uri=raptor_rss_namespaces_info[n].uri;
    raptor_namespace* nspace=NULL;

    if(prefix && uri)
      nspace=raptor_new_namespace_from_uri(rss_parser->nstack,
                                           prefix, uri, 0);
    raptor_rss_namespaces_info[n].nspace=nspace;
  }

  sax2=raptor_new_sax2(rdf_parser, 
                       rdf_parser, raptor_parser_error_message_handler,
                       rdf_parser, raptor_parser_fatal_error_message_handler,
                       rdf_parser, raptor_parser_warning_message_handler);
  rss_parser->sax2=sax2;

  raptor_sax2_set_start_element_handler(sax2, raptor_rss_start_element_handler);
  raptor_sax2_set_end_element_handler(sax2, raptor_rss_end_element_handler);
  raptor_sax2_set_characters_handler(sax2, raptor_rss_cdata_handler);
  raptor_sax2_set_cdata_handler(sax2, raptor_rss_cdata_handler);
  raptor_sax2_set_comment_handler(sax2, raptor_rss_comment_handler);

  raptor_sax2_set_locator(sax2, &rdf_parser->locator);

  return 0;
}


static void
raptor_rss_parse_terminate(raptor_parser *rdf_parser)
{
  raptor_rss_parser *rss_parser=(raptor_rss_parser*)rdf_parser->context;
  int n;
  
  if(rss_parser->sax2)
    raptor_free_sax2(rss_parser->sax2);

  raptor_rss_model_clear(&rss_parser->model);

  /* Initialise the namespaces */
  for(n=0; n < RAPTOR_RSS_NAMESPACES_SIZE; n++) {
    if(raptor_rss_namespaces_info[n].nspace)
      raptor_free_namespace(raptor_rss_namespaces_info[n].nspace);
  }

  if(rss_parser->nstack)
    raptor_free_namespaces(rss_parser->nstack);

  raptor_rss_common_terminate();
}


static int
raptor_rss_parse_start(raptor_parser *rdf_parser) 
{
  raptor_uri *uri=rdf_parser->base_uri;
  raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
  
  /* base URI required for RSS */
  if(!uri)
    return 1;

  /* Optionally forbid network requests in the XML parser */
  raptor_sax2_set_feature(rss_parser->sax2, 
                          RAPTOR_FEATURE_NO_NET,
                          rdf_parser->features[RAPTOR_FEATURE_NO_NET]);
  
  raptor_sax2_parse_start(rss_parser->sax2, uri);

  return 0;
}



static void
raptor_rss_start_element_handler(void *user_data,
                                 raptor_xml_element* xml_element)
{
  raptor_parser *rdf_parser;
  raptor_rss_parser *rss_parser;

redland/raptor/src/raptor_rss.c  view on Meta::CPAN

          RAPTOR_DEBUG2("  setting enclosure length %s\n", attrValue);
          enclosure->length=(char*)RAPTOR_MALLOC(cstring, len+1);
          strncpy(enclosure->length, (char*)attrValue, len+1);
        }
      } else if (!strcmp((const char*)attrName, "type")) {
        if (!strcmp((const char*)name, "enclosure") && enclosure) {
          size_t len=strlen((const char*)attrValue);
          RAPTOR_DEBUG2("  setting enclosure type %s\n", attrValue);
          enclosure->type=(char*)RAPTOR_MALLOC(cstring, len+1);
          strncpy(enclosure->type, (char*)attrValue, len+1);
        } else if(rss_parser->current_field == RAPTOR_RSS_FIELD_ATOM_LINK) {
          /* do nothing with atom link attribute type */
        } else if(rss_parser->is_atom) {
          /* Atom only typing */
          if (!strcmp((const char*)attrValue, "xhtml") ||
              !strcmp((const char*)attrValue, "xml") ||
              strstr((const char*)attrValue, "+xml")) {
            raptor_uri_handler *uri_handler;
            void *uri_context;

            RAPTOR_DEBUG2("  found type '%s', making an XML writer\n", 
                          attrValue);
            
            raptor_uri_get_handler(&uri_handler, &uri_context);
            rss_element->type=RAPTOR_RSS_CONTENT_TYPE_XML;
            rss_element->iostream=raptor_new_iostream_to_string(&rss_element->xml_content, &rss_element->xml_content_length, raptor_alloc_memory);
            rss_element->xml_writer=raptor_new_xml_writer(NULL,
                                                          uri_handler, uri_context,
                                                          rss_element->iostream,
                                                          (raptor_simple_message_handler)raptor_parser_simple_error, rdf_parser,
                                                          1);
            raptor_xml_writer_set_feature(rss_element->xml_writer, 
                                          RAPTOR_FEATURE_WRITER_XML_DECLARATION, 0);

            raptor_free_stringbuffer(rss_element->sb);
            rss_element->sb=NULL;

          }
        }
      } else if (!strcmp((const char*)attrName, "version")) {
        if(!raptor_strcasecmp((const char*)name, "feed")) {
          if(!strcmp((const char*)attrValue, "0.3"))
            rss_parser->is_atom=1;
        }
      }
    }
  } /* if have attributes */
}


static void
raptor_rss_end_element_handler(void *user_data, 
                               raptor_xml_element* xml_element)
{
  raptor_parser* rdf_parser;
  raptor_rss_parser* rss_parser;
#ifdef RAPTOR_DEBUG
  const unsigned char* name=raptor_xml_element_get_name(xml_element)->local_name;
#endif
  raptor_rss_element* rss_element;
  size_t cdata_len=0;
  unsigned char* cdata=NULL;

  rss_element=(raptor_rss_element*)xml_element->user_data;

  rdf_parser=(raptor_parser*)user_data;
  rss_parser=(raptor_rss_parser*)rdf_parser->context;

  if(rss_element->xml_writer) {
    if(rss_element->type != RAPTOR_RSS_CONTENT_TYPE_XML) {
      raptor_xml_writer_end_element(rss_element->xml_writer, xml_element);
      goto tidy_end_element;
    }

    /* otherwise we are done making XML */
    raptor_free_iostream(rss_element->iostream);
    rss_element->iostream=NULL;
    cdata=(unsigned char*)rss_element->xml_content;
    cdata_len=rss_element->xml_content_length;
  }

  if(rss_element->sb) {
    cdata_len=raptor_stringbuffer_length(rss_element->sb);
    cdata=raptor_stringbuffer_as_string(rss_element->sb);
  }

  if(cdata) {
    raptor_uri* base_uri=NULL;
    
    base_uri=raptor_sax2_inscope_base_uri(rss_parser->sax2);

    if((rss_parser->current_type==RAPTOR_RSS_NONE ||
        rss_parser->current_type==RAPTOR_RSS_UNKNOWN) ||
       (rss_parser->current_field==RAPTOR_RSS_FIELD_NONE ||
        rss_parser->current_field==RAPTOR_RSS_FIELD_UNKNOWN)) {
      unsigned char *p=cdata;
      int i;
      for(i=cdata_len; i>0 && *p; i--) {
        if(!isspace(*p))
          break;
        p++;
      }
      if(i>0 && *p) {
        RAPTOR_DEBUG4("IGNORING non-whitespace text '%s' inside type %s, field %s\n", cdata,
                      raptor_rss_types_info[rss_parser->current_type].name,
                      raptor_rss_fields_info[rss_parser->current_field].name);
      }

      goto tidy_end_element;
    }

    if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) {
      /* skipHours, skipDays common but IGNORED */ 
      RAPTOR_DEBUG2("Ignoring fields for type %s\n", raptor_rss_types_info[rss_parser->current_type].name);
    } else {
      raptor_rss_item* update_item;
      raptor_rss_field* field=raptor_rss_new_field();

      if(rss_parser->current_type == RAPTOR_RSS_ITEM)
        update_item=rss_parser->model.last;
      else
        update_item=raptor_rss_model_get_common(&rss_parser->model,
                                                rss_parser->current_type);

      /* if value is always an uri, make it so */
      if(raptor_rss_fields_info[rss_parser->current_field].flags & 
         RAPTOR_RSS_INFO_FLAG_URI_VALUE) {
        RAPTOR_DEBUG4("Added URI %s to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
        field->uri=raptor_new_uri_relative_to_base(base_uri, cdata);
      } else {
        RAPTOR_DEBUG4("Added text '%s' to field %s of type %s\n", cdata, raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
        field->uri=NULL;
        field->value=(unsigned char*)RAPTOR_MALLOC(cstring, cdata_len+1);
        strncpy((char*)field->value, (const char*)cdata, cdata_len);
        field->value[cdata_len]='\0';
      }

      RAPTOR_DEBUG1("fa3 - ");
      raptor_rss_item_add_field(update_item, rss_parser->current_field, field);
    }
  } /* end if contained cdata */
  

  if(raptor_xml_element_is_empty(xml_element)) {
    /* Empty element, so consider adding one of the attributes as
     * literal or URI content
     */
    if(rss_parser->current_type >= RAPTOR_RSS_COMMON_IGNORED) {
      /* skipHours, skipDays common but IGNORED */ 
      RAPTOR_DEBUG3("Ignoring empty element %s for type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
    } else if(rss_element->uri) {
      raptor_rss_item* update_item;
      raptor_rss_field* field=raptor_rss_new_field();

      if(rss_parser->current_type == RAPTOR_RSS_ITEM)
        update_item=rss_parser->model.last;
      else
        update_item=raptor_rss_model_get_common(&rss_parser->model,
                                                rss_parser->current_type);

      if(rss_parser->current_field == RAPTOR_RSS_FIELD_LINK &&
         rss_element->rel && 
         !strcmp((const char*)rss_element->rel, "alternate")) {
        /* RSS with rel != alternate ignored FIXME */
      } else if(rss_parser->current_field == RAPTOR_RSS_FIELD_UNKNOWN) {
        RAPTOR_DEBUG2("Cannot add URI from alternate attribute to type %s unknown field\n", raptor_rss_types_info[rss_parser->current_type].name);
        raptor_rss_field_free(field);
      } else {
        RAPTOR_DEBUG3("Added URI to field %s of type %s\n", raptor_rss_fields_info[rss_parser->current_field].name, raptor_rss_types_info[rss_parser->current_type].name);
        field->uri=rss_element->uri;
        rss_element->uri=NULL;
        RAPTOR_DEBUG1("fa2 - ");
        raptor_rss_item_add_field(update_item, rss_parser->current_field, field);
      }
    }

  }
  if(rss_parser->current_type != RAPTOR_RSS_NONE) {
    if(rss_parser->current_field != RAPTOR_RSS_FIELD_NONE) {
      RAPTOR_DEBUG3("Ending element %s field %s\n", name, raptor_rss_fields_info[rss_parser->current_field].name);
      rss_parser->current_field= RAPTOR_RSS_FIELD_NONE;
    } else {
      RAPTOR_DEBUG3("Ending element %s type %s\n", name, raptor_rss_types_info[rss_parser->current_type].name);
      if(rss_parser->prev_type != RAPTOR_RSS_NONE) {
        rss_parser->current_type=rss_parser->prev_type;
        rss_parser->prev_type=RAPTOR_RSS_NONE;
        RAPTOR_DEBUG3("Returning to type %d - %s\n", rss_parser->current_type, raptor_rss_types_info[rss_parser->current_type].name);
      } else
        rss_parser->current_type= RAPTOR_RSS_NONE;
    }
  }

 tidy_end_element:

  if(rss_element)
    raptor_free_rss_element(rss_element);

}



static void
raptor_rss_cdata_handler(void *user_data, raptor_xml_element* xml_element,
                         const unsigned char *s, int len)
{      
  raptor_rss_element* rss_element;

  rss_element=(raptor_rss_element*)xml_element->user_data;

  if(rss_element->xml_writer) {
    raptor_xml_writer_cdata_counted(rss_element->xml_writer, s, len);
    return;
  }

  raptor_stringbuffer_append_counted_string(rss_element->sb, s, len, 1);
}      
      

static void
raptor_rss_comment_handler(void *user_data, raptor_xml_element* xml_element,
                           const unsigned char *s)
{
  raptor_rss_element* rss_element;

  if(!xml_element)
    return;
  
  rss_element=(raptor_rss_element*)xml_element->user_data;

  if(rss_element->xml_writer) {
    raptor_xml_writer_comment(rss_element->xml_writer, s);
    return;
  }
}


static void
raptor_rss_insert_enclosure_identifiers(raptor_parser* rdf_parser, 
                                        raptor_rss_enclosure *enclosure)
{
  raptor_identifier* identifier=&enclosure->identifier;
  if (enclosure->url) { 
    /* emit as URI resource */
    identifier->uri=raptor_uri_copy(enclosure->url);
    identifier->type=RAPTOR_IDENTIFIER_TYPE_RESOURCE;
    identifier->uri_source=RAPTOR_URI_SOURCE_URI;
  } else { 
    /* emit as blank node */
    identifier->id=raptor_generate_id(rdf_parser, 0, NULL);
    identifier->type=RAPTOR_IDENTIFIER_TYPE_ANONYMOUS;
    identifier->uri_source=RAPTOR_URI_SOURCE_GENERATED;
  }
  enclosure->node_type=raptor_rss_types_info[RAPTOR_RSS_ENCLOSURE].uri;
}


static void
raptor_rss_insert_identifiers(raptor_parser* rdf_parser) 
{
  raptor_rss_parser* rss_parser=(raptor_rss_parser*)rdf_parser->context;
  int i;
  raptor_rss_item* item;
  
  for(i=0; i< RAPTOR_RSS_COMMON_SIZE; i++) {
    for(item=rss_parser->model.common[i]; item; item=item->next) {
      raptor_identifier* identifier;
      identifier=&(item->identifier);
	
      if(!item->fields_count)
        continue;
      



( run in 0.640 second using v1.01-cache-2.11-cpan-39bf76dae61 )