HTML-TagReader
view release on metacpan or search on metacpan
TagReader.xs view on Meta::CPAN
tr_new_from_iofh(class, fh)
SV *class
PerlIO *fh
CODE:
STRLEN i; // int
char str[]="iofh";
char c;
if (fh == NULL){
croak("ERROR: invalid PerlIO fh");
}
// let's do some test to see if we will be able to read on this io filehandle:
c=PerlIO_getc(fh);
// c is EOF in case of error or end of file
if (c==EOF){
if (PerlIO_error(fh)){
croak("ERROR: can not read from IO filehandle");
}
// no ungetc in case of EOF
}else{
if (PerlIO_ungetc(fh,c)==EOF){
croak("ERROR: ungetc on filehandle failed");
}
}
i=strlen(str);
// malloc and zero the struct
Newz(0, RETVAL, 1, struct trstuct );
// malloc filename, we need it for some error printouts
New(0, RETVAL->filename, i+1, char );
strncpy(RETVAL->filename,str,i);
// put a zero at the end of the string, perl might not do it
*(RETVAL->filename + i )=(char)0;
// malloc initial buffer
New(0, RETVAL->buffer, BUFFLEN+1, char );
RETVAL->currbuflen=BUFFLEN;
RETVAL->fd=fh;
RETVAL->charpos=0;
RETVAL->tagcharpos=0;
RETVAL->fileline=1;
RETVAL->tagline=0;
OUTPUT:
RETVAL
void
DESTROY(self)
HTML::TagReader self
CODE:
Safefree(self->filename);
Safefree(self->buffer);
PerlIO_close(self->fd);
Safefree(self);
void
tr_gettag(self,showerrors)
HTML::TagReader self
SV *showerrors
PREINIT:
int bufpos;
char ch;
char chn;
int state;
PPCODE:
if (! self->fileline){
croak("Object not initialized");
}
/* initialize */
state=0;
bufpos=0;
ch=(char)0;
chn=(char)0;
self->tagline=self->fileline;
/* find the next tag */
while(state != 3 && (chn=PerlIO_getc(self->fd))!=EOF ){
self->charpos++;
if (ch==0){ /* read one more character ahead so we have always 2 */
ch=chn;
continue;
}
/* we can not run out of mem because TAGREADER_MAX_TAGLEN
* is much smaller than BUFFLEN */
if (bufpos > TAGREADER_MAX_TAGLEN){
if (SvTRUE(showerrors)){
PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, tag not terminated or too long.\n",self->filename,self->tagline,self->charpos);
}
self->buffer[bufpos]=ch;bufpos++;
self->buffer[bufpos]=(char)0;bufpos++;
state=3;
continue; /* jump out of while */
}
if (ch=='\n') {
self->fileline++;
self->charpos=0;
}
if (ch=='\n'|| ch=='\r' || ch=='\t' || ch==' ') {
ch=' ';
if (chn=='\n'|| chn=='\r' || chn=='\t' || chn==' '){
/* delete mupltiple spaces */
ch=chn; /* shift next char */
continue;
}
}
switch (state) {
/*---*/
case 0:
/* outside of tag and we start tag here*/
if (ch=='<') {
if (is_start_of_tag(chn)) {
self->buffer[0]=(char)0;
bufpos=0;
self->tagcharpos=self->charpos;
/*line where tag starts*/
self->tagline=self->fileline;
self->buffer[bufpos]=ch;bufpos++;
state=1;
}else{
if (SvTRUE(showerrors)){
PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, single \'<\' should be written as <\n",self->filename,self->fileline,self->charpos);
}
}
}
break;
/*---*/
TagReader.xs view on Meta::CPAN
/*---*/
case 31:
/* done reading this comment tag */
state=0;
self->buffer[0]=(char)0; /* zero buffer*/
bufpos=0;
break;
/*---*/
default:
PerlIO_printf(PerlIO_stderr(),"%s:%d: Programm Error, state = %d\n",self->filename,self->fileline,state);
exit(1);
}
/* shift this and next char */
ch=chn;
}
/* put back chn for the next round */
self->charpos--;
if (chn!=EOF && PerlIO_ungetc(self->fd,chn)==EOF){
PerlIO_printf(PerlIO_stderr(),"%s:%d: ERROR, TagReader library can not ungetc \"%c\" before returning\n",self->filename,self->fileline,chn);
exit(1);
}
/* we have a strange html file not ending in newline and there is a closing tag at the very end */
if (chn==EOF && state==1 && ch=='>'){
state=3;
self->buffer[bufpos]=ch;bufpos++;
self->buffer[bufpos]=(char)0;bufpos++;
}
/* buffer was already terminated above */
if (state == 3){
/* we have found a tag */
if(GIMME == G_ARRAY){
EXTEND(SP,3);
XST_mPV(0,self->buffer);
XST_mIV(1,self->tagline);
XST_mIV(2,self->tagcharpos);
XSRETURN(3);
}else{
EXTEND(SP,1);
XST_mPV(0,self->buffer);
XSRETURN(1);
}
}else{
/* we are at the end of the file and no tag was found
* return an empty list or string such that the user
* will probably call destroy.
*/
XSRETURN_EMPTY;
}
void
tr_getbytoken(self,showerrors)
HTML::TagReader self
SV *showerrors
PREINIT:
int bufpos;
char ch;
char chn; /* next character */
int typepos;
int typeposdone;
int state;
PPCODE:
if (! self->fileline){
croak("Object not initialized");
}
/* initialize */
state=0;
bufpos=0;
typeposdone=0;
typepos=0;
self->buffer[bufpos]=(char)0;
self->tagline=self->fileline;
self->tagtype[typepos]=(char)0;
ch=(char)0;chn=(char)0;
/* find the next tag */
while(state != 3 && (chn=PerlIO_getc(self->fd))!=EOF ){
self->charpos++;
if (ch==0){ /* read one more character ahead so we have always 2 */
ch=chn;
continue;
}
if (ch=='\n') {
self->fileline++;
self->charpos=0;
}
//printf("DBG ch%c chn%c state%d\n",ch ,chn,state);
self->buffer[bufpos]=ch;bufpos++;
switch (state) {
/*---*/
case 0:
self->tagcharpos=self->charpos;
if (ch=='<'){
if ( is_start_of_tag(chn)) {
state=1; /* we will be reading a tag */
}else{
state=2; /* we will be reading a text/paragraph */
if (SvTRUE(showerrors)){
PerlIO_printf(PerlIO_stderr(),"%s:%d:%d: Warning, single \'<\' should be written as <\n",self->filename,self->fileline,self->charpos);
}
}
}else{
state=2; /* we will be reading a text/paragraph */
}
break;
/*---*/
case 1:
/* inside a tag. Wait for '>' */
if (typeposdone==0 && typepos < TAGREADER_TAGTYPELEN -1 ){
if (is_start_of_tag(ch)){
self->tagtype[typepos]=tolower(ch);typepos++;
}else{
/* end of tag type e.g "<a " -> save only "a" in
* tagtype array */
self->tagtype[typepos]=(char)0;
typeposdone=1; /* mark end */
}
}
if (ch=='<' && SvTRUE(showerrors)) {
PerlIO_printf(PerlIO_stderr(),"%s:%d: Warning, single \'<\' or tag starting at line %d not terminated\n",self->filename,self->fileline,self->tagline);
}
if (SvTRUE(showerrors) && bufpos > TAGREADER_MAX_TAGLEN){
PerlIO_printf(PerlIO_stderr(),"%s:%d: Warning, tag not terminated or too long.\n",self->filename,self->tagline);
( run in 1.686 second using v1.01-cache-2.11-cpan-5511b514fd6 )