Tripletail

 view release on metacpan or  search on metacpan

ext/Tripletail-HtmlFilter/HtmlFilter.pm  view on Meta::CPAN

    $XS_LOADERROR = 'disabled';
  }
  
  do
  {
    no strict 'refs';
    #$err and chomp $err;
    #warn "warning: $err";
    foreach my $name (@XSUBS)
    {
      my $xsub = __PACKAGE__.'::'.$name;
      if( !defined(&$xsub) )
      {
        (my $ppsub = $xsub) =~ s/(\w+)$/_$1_pp/;
        *$xsub = \&$ppsub;
      }
    }
  }
}

1;

sub _new {
    my $class = shift;
    my $opts = { @_ };

	my $this = bless [] => $class;

	$this->[INTEREST]       = $opts->{interest};
	$this->[TRACK]          = $opts->{track};
	$this->[FILTER_TEXT]    = $opts->{filter_text};
	$this->[FILTER_COMMENT] = $opts->{filter_comment};
	
    $this->[CONTEXT]        = Tripletail::HtmlFilter::Context->_new;
    $this->[HTML]           = undef; # 文字列
    $this->[OUTPUT]         = []; # Tripletail::HtmlFilter::{Element,Text,Comment}

    # interest, trackに渡された正規表現はこの時点で CODE にコンパイルしておく。
    if ($this->[INTEREST]) {
		$this->[INTEREST] = $this->_compile_matcher($this->[INTEREST]);
    }

    if ($this->[TRACK]) {
		$this->[TRACK] = $this->_compile_matcher($this->[TRACK]);
    }

    $this;
}

sub set {
    my $this = shift;
    my $html = shift;

    if (not defined $html) {
		die __PACKAGE__."#set: ARG[1] is not defined.\n";
    }
    elsif (ref $html) {
		die __PACKAGE__."#set: ARG[1] is a Ref.\n";
    }

    #@{$this->[HTML]} = split m/(<.+?>)/s, $html;
    # ↑では、<!-- <hoge> -->を正しく解析できない。真面目にパーズする必要がある
    # しかし、perlで真面目にパーザを書くのは非常に面倒なので正規表現で誤魔化す
    # NB: 他にも、正しく解析できないパターンが存在するかも
    @{$this->[HTML]} = split m/((?:<!--.*?-->)|(?:<.+?>))/s, $html;
    @{$this->[OUTPUT]} = ();
    $this;
}

sub toStr {
    my $this = shift;

    $this->[CONTEXT]->_flush($this); # 未確定の部分を確定する

    join('', map {ref($_)?$_->toStr:$_} @{$this->[OUTPUT]});
}

sub _compile_matcher {
	my $this = shift;
	my $regexes = shift;

	my $joined = join('', @$regexes);
	if (my $cached = $_MATCHER_CACHE{$joined}) {
		return $cached;
	}

	my $ret = [];
	foreach my $reg (@$regexes) {
		if (ref($reg) eq 'Regexp') {
			# コンパイル済み正規表現だった。
			push @$ret, sub {
				return 1 if $_[0] =~ $reg;
			};
		}
		else {
			# 単純な文字列だった。
			push @$ret, lc $reg;
		}
	}

	$_MATCHER_CACHE{$joined} = $ret;
	$ret;
}

sub _next_pp {
    my $this = shift;
    $this->[CONTEXT]->_flush($this); # 未確定の部分を確定する
    
    while (@{$this->[HTML]}) {
		my $str = shift @{$this->[HTML]};
		my $parsed;
		my $interested;
	
		if ($str =~ m/^<!--\s*(.+?)\s*-->$/) {
			# コメント
			if ($this->[FILTER_COMMENT]) {
				$interested = $this->[CONTEXT]->newComment($1);
			}
		} elsif ($str =~ m/^</) {
			# 要素
			if ($this->[TRACK] or $this->[INTEREST]) {
				($interested,$parsed) = $this->_next_elem($str);
			}
		} else {
			# テキスト



( run in 0.939 second using v1.01-cache-2.11-cpan-71847e10f99 )