HTML-TagReader

 view release on metacpan or  search on metacpan

TagReader.xs  view on Meta::CPAN

tr_new_from_iofh(class, fh)
	SV *class
	PerlIO *fh
CODE:
	STRLEN i; // int
	char str[]="iofh";
	char c;
	if (fh == NULL){
		croak("ERROR: invalid PerlIO fh");
	}
	// let's do some test to see if we will be able to read on this io filehandle:
	c=PerlIO_getc(fh);
	// c is EOF in case of error or end of file
	if (c==EOF){
		if (PerlIO_error(fh)){
			croak("ERROR: can not read from IO filehandle");
		}
		// no ungetc in case of EOF 
	}else{
		if (PerlIO_ungetc(fh,c)==EOF){
			croak("ERROR: ungetc on filehandle failed");
		}
	}
	i=strlen(str);
	// malloc and zero the struct 
        Newz(0, RETVAL, 1, struct trstuct );
	// malloc filename, we need it for some error printouts
        New(0, RETVAL->filename, i+1, char );
	strncpy(RETVAL->filename,str,i);
	// put a zero at the end of the string, perl might not do it 
	*(RETVAL->filename + i )=(char)0;
	// malloc initial buffer 
        New(0, RETVAL->buffer, BUFFLEN+1, char );
	RETVAL->currbuflen=BUFFLEN;
	RETVAL->fd=fh;
	RETVAL->charpos=0;
	RETVAL->tagcharpos=0;
	RETVAL->fileline=1;
	RETVAL->tagline=0;
OUTPUT:
	RETVAL

void
DESTROY(self)
	HTML::TagReader self
CODE:
	Safefree(self->filename);
	Safefree(self->buffer);
	PerlIO_close(self->fd);
	Safefree(self);

void
tr_gettag(self,showerrors)
	HTML::TagReader self
	SV *showerrors
PREINIT:
	int bufpos;
	char ch;
	char chn;
	int state;
PPCODE:
        if (! self->fileline){
		croak("Object not initialized");
	}
	/* initialize */
	state=0;
	bufpos=0;
	ch=(char)0;
	chn=(char)0;
	self->tagline=self->fileline;
	/* find the next tag */
	while(state != 3 && (chn=PerlIO_getc(self->fd))!=EOF ){
		self->charpos++;
		if (ch==0){ /* read one more character ahead so we have always 2 */
			ch=chn;
			continue;
		}
		/* we can not run out of mem because TAGREADER_MAX_TAGLEN
		* is much smaller than BUFFLEN */
		if (bufpos > TAGREADER_MAX_TAGLEN){
			if (SvTRUE(showerrors)){
				PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, tag not terminated or too long.\n",self->filename,self->tagline,self->charpos);
			}
			self->buffer[bufpos]=ch;bufpos++;
			self->buffer[bufpos]=(char)0;bufpos++;
			state=3;
			continue; /* jump out of while */
		}
		if (ch=='\n') {
			self->fileline++;
			self->charpos=0;
		}
		if (ch=='\n'|| ch=='\r' || ch=='\t' || ch==' ') {
			ch=' ';
			if (chn=='\n'|| chn=='\r' || chn=='\t' || chn==' '){
				/* delete mupltiple spaces */
				ch=chn; /* shift next char */
				continue;
			}
		}
		switch (state) {
		/*---*/
			case 0:
			/* outside of tag and we start tag here*/
			if (ch=='<') {
				if (is_start_of_tag(chn)) {
					self->buffer[0]=(char)0;
					bufpos=0;
					self->tagcharpos=self->charpos;
					/*line where tag starts*/
					self->tagline=self->fileline;
					self->buffer[bufpos]=ch;bufpos++;
					state=1;
				}else{
					if (SvTRUE(showerrors)){
						PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, single \'<\' should be written as &lt;\n",self->filename,self->fileline,self->charpos);
					}
				}
			}
			break;
		/*---*/

TagReader.xs  view on Meta::CPAN

		/*---*/
			case 31: 
				/* done reading this comment tag */
				state=0;
				self->buffer[0]=(char)0; /* zero buffer*/
				bufpos=0;
			break;
		/*---*/
			default:
				PerlIO_printf(PerlIO_stderr(),"%s:%d: Programm Error, state = %d\n",self->filename,self->fileline,state);
				exit(1);
		}
		/* shift this and next char */
		ch=chn;
	}
	/* put back chn for the next round */
	self->charpos--;
	if (chn!=EOF && PerlIO_ungetc(self->fd,chn)==EOF){
		PerlIO_printf(PerlIO_stderr(),"%s:%d: ERROR, TagReader library can not ungetc \"%c\" before returning\n",self->filename,self->fileline,chn);
		exit(1);
	}
	/* we have a strange html file not ending in newline and there is a closing tag at the very end */
	if (chn==EOF && state==1 && ch=='>'){
		state=3; 
		self->buffer[bufpos]=ch;bufpos++;
		self->buffer[bufpos]=(char)0;bufpos++;
	}
	/* buffer was already terminated above */
	if (state == 3){
		/* we have found a tag */
		if(GIMME == G_ARRAY){
			EXTEND(SP,3);
			XST_mPV(0,self->buffer);
			XST_mIV(1,self->tagline);
			XST_mIV(2,self->tagcharpos);
			XSRETURN(3);
		}else{
			EXTEND(SP,1);
			XST_mPV(0,self->buffer);
			XSRETURN(1);
		}
	}else{
		/* we are at the end of the file and no tag was found 
		 * return an empty list or string such that the user 
		 * will probably call destroy.
		 */
		 XSRETURN_EMPTY;
	}

void
tr_getbytoken(self,showerrors)
	HTML::TagReader self
	SV *showerrors
PREINIT:
	int bufpos;
	char ch;
	char chn; /* next character */
	int typepos;
	int typeposdone;
	int state;
PPCODE:
        if (! self->fileline){
		croak("Object not initialized");
	}
	/* initialize */
	state=0;
	bufpos=0;
	typeposdone=0;
	typepos=0;
	self->buffer[bufpos]=(char)0;
	self->tagline=self->fileline;
	self->tagtype[typepos]=(char)0;
	ch=(char)0;chn=(char)0;
	/* find the next tag */
	while(state != 3 && (chn=PerlIO_getc(self->fd))!=EOF ){
		self->charpos++;
		if (ch==0){ /* read one more character ahead so we have always 2 */
			ch=chn;
			continue;
		}
		if (ch=='\n') {
			self->fileline++;
			self->charpos=0;
		}
		//printf("DBG ch%c chn%c state%d\n",ch ,chn,state);
		self->buffer[bufpos]=ch;bufpos++;
		switch (state) {
		/*---*/
			case 0:
			self->tagcharpos=self->charpos;
			if (ch=='<'){
				if ( is_start_of_tag(chn)) { 
					state=1; /* we will be reading a tag */
				}else{
					state=2; /* we will be reading a text/paragraph */
					if (SvTRUE(showerrors)){
						PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, single \'<\' should be written as &lt;\n",self->filename,self->fileline,self->charpos);
					}
				}
			}else{
				state=2; /* we will be reading a text/paragraph */
			}
			break;
		/*---*/
			case 1:
			/* inside a tag. Wait for '>' */
			if (typeposdone==0 && typepos < TAGREADER_TAGTYPELEN -1 ){ 
				if (is_start_of_tag(ch)){
					self->tagtype[typepos]=tolower(ch);typepos++;
				}else{
					/* end of tag type e.g "<a " -> save only "a" in 
					*  tagtype array */
					self->tagtype[typepos]=(char)0;
					typeposdone=1; /* mark end */
				}
			}
			if (ch=='<' && SvTRUE(showerrors)) {
				PerlIO_printf(PerlIO_stderr(),"%s:%d: Warning, single \'<\' or tag starting at line %d not terminated\n",self->filename,self->fileline,self->tagline);
			}
			if (SvTRUE(showerrors) && bufpos > TAGREADER_MAX_TAGLEN){
				PerlIO_printf(PerlIO_stderr(),"%s:%d: Warning, tag not terminated or too long.\n",self->filename,self->tagline);



( run in 1.686 second using v1.01-cache-2.11-cpan-5511b514fd6 )