Attean

 view release on metacpan or  search on metacpan

lib/AtteanX/Parser/Turtle/Lexer.pm  view on Meta::CPAN

			my ($ns, $local)	= ($ln =~ /^([^:]*:)(.*)$/);
			no warnings 'uninitialized';
			$local	=~ s{\\([-~.!&'()*+,;=:/?#@%_\$])}{$1}g;
			return $self->new_token(PREFIXNAME, $self->start_line, $self->start_column, $ns, $local);
		} else {
			$self->buffer =~ $r_PNAME_NS;
			my $ns	= $self->read_length($+[0]);
			return $self->new_token(PREFIXNAME, $self->start_line, $self->start_column, $ns);
		}
	}

	sub _get_gtgt {
		my $self	= shift;
		$self->read_word('>>');
		return $self->new_token(GTGT, $self->start_line, $self->start_column, '>>');
	}
	
	sub _get_lbrace_or_lannot {
		my $self	= shift;
		$self->get_char_safe(q[{]);
		if ($self->buffer =~ /^\|/o) {
			$self->get_char_safe(q[|]);
			return $self->new_token(LANNOT, $self->start_line, $self->start_column, '{|');
		}
		return $self->new_token(LBRACE, $self->start_line, $self->start_column, '{');
	}
	
	sub _get_rannot {
		my $self	= shift;
		$self->read_word('|}');
		return $self->new_token(RANNOT, $self->start_line, $self->start_column, '|}');
	}
	
	sub _get_iriref_or_ltlt {
		my $self	= shift;
		$self->get_char_safe(q[<]);
		if ($self->buffer =~ /^</o) {
			$self->get_char_safe(q[<]);
			return $self->new_token(LTLT, $self->start_line, $self->start_column, '<<');
		}
		
		if ($self->buffer =~ m/^[\x23-\x3d\x3f-\x5a\x5d-\x7e]*>/o) {
			my $iri	.= $self->read_length($+[0]);
			chop($iri);
			return $self->new_token(IRI, $self->start_line, $self->start_column, $iri);
		}
	
		my $iri	= '';
		while (1) {
			if (length($self->buffer) == 0) {
				my $c	= $self->peek_char;
				last unless defined($c);
			}
			if (substr($self->buffer, 0, 1) eq '\\') {
				$self->get_char_safe('\\');
				my $esc	= $self->get_char;
				if ($esc eq '\\') {
					$iri	.= "\\";
				} elsif ($esc eq 'U') {
					my $codepoint	= $self->read_length(8);
					$self->_throw_error("Bad unicode escape codepoint '$codepoint'") unless ($codepoint =~ /^[0-9A-Fa-f]+$/o);
					$iri .= chr(hex($codepoint));
				} elsif ($esc eq 'u') {
					my $codepoint	= $self->read_length(4);
					$self->_throw_error("Bad unicode escape codepoint '$codepoint'") unless ($codepoint =~ /^[0-9A-Fa-f]+$/o);
					my $char	= chr(hex($codepoint));
					if ($char =~ /[<>" {}|\\^`]/o) {
						$self->_throw_error(sprintf("Bad IRI character: '%s' (0x%x)", $char, ord($char)));
					}
					$iri .= $char;
				} else {
					$self->_throw_error("Unrecognized iri escape '$esc'");
				}
			} elsif ($self->buffer =~ /^[^<>\x00-\x20\\"{}|^`]+/o) {
				$iri	.= $self->read_length($+[0]);
			} elsif (substr($self->buffer, 0, 1) eq '>') {
				last;
			} else {
				my $c	= $self->peek_char;
				$self->_throw_error("Got '$c' while expecting IRI character");
			}
		}
		$self->get_char_safe(q[>]);
		return $self->new_token(IRI, $self->start_line, $self->start_column, $iri);
	}

	sub _get_bnode {
		my $self	= shift;
		$self->read_word('_:');
		$self->_throw_error("Expected: name") unless ($self->buffer =~ /^${r_bnode_id}/o);
		my $name	= substr($self->buffer, 0, $+[0]);
		$self->read_word($name);
		return $self->new_token(BNODE, $self->start_line, $self->start_column, $name);
	}

	sub _get_number {
		my $self	= shift;
		if ($self->buffer =~ /^${r_double}/o) {
			return $self->new_token(DOUBLE, $self->start_line, $self->start_column, $self->read_length($+[0]));
		} elsif ($self->buffer =~ /^${r_decimal}/o) {
			return $self->new_token(DECIMAL, $self->start_line, $self->start_column, $self->read_length($+[0]));
		} elsif ($self->buffer =~ /^${r_integer}/o) {
			return $self->new_token(INTEGER, $self->start_line, $self->start_column, $self->read_length($+[0]));
		}
		$self->_throw_error("Expected number");
	}

	sub _get_comment {
		my $self	= shift;
		$self->get_char_safe('#');
		my $comment	= '';
		my $c		= $self->peek_char;
		while (length($c) and $c !~ /[\r\n]/o) {
			$comment	.= $self->get_char;
			$c			= $self->peek_char;
		}
		if (length($c) and $c =~ /[\r\n]/o) {
			$self->get_char;
		}
		return $self->new_token(COMMENT, $self->start_line, $self->start_column, $comment);
	}

	sub _get_double_literal {
		my $self	= shift;
	# 	my $c		= $self->peek_char();

lib/AtteanX/Parser/Turtle/Lexer.pm  view on Meta::CPAN

			my $quote_count	= 0;
			my $string	= '';
			while (1) {
				if (length($self->buffer) == 0) {
					$self->fill_buffer;
					$self->_throw_error("Found EOF in string literal") if (length($self->buffer) == 0);
				}
				if (substr($self->buffer, 0, 1) eq "'") {
					my $c	= $self->get_char;
					$quote_count++;
					last if ($quote_count == 3);
				} else {
					if ($quote_count) {
						$string	.= "'" foreach (1..$quote_count);
						$quote_count	= 0;
					}
					if (substr($self->buffer, 0, 1) eq '\\') {
						$string	.= $self->_get_escaped_char();
					} else {
						$self->buffer	=~ /^[^'\\]+/;
						$string	.= $self->read_length($+[0]);
					}
				}
			}
			return $self->new_token(STRING3S, $self->start_line, $self->start_column, $string);
		} else {
			### #x22 scharacter* #x22
			my $string	= '';
			while (1) {
				$self->fill_buffer unless (length($self->buffer));
				if (substr($self->buffer, 0, 1) eq '\\') {
					$string	.= $self->_get_escaped_char();
				} elsif ($self->buffer =~ /^[^'\\]+/o) {
					$string	.= $self->read_length($+[0]);
				} elsif (substr($self->buffer, 0, 1) eq "'") {
					last;
				} else {
					my $c		= $self->peek_char();
					$self->_throw_error("Got '$c' while expecting string character");
				}
			}
			$self->get_char_safe(q[']);
			return $self->new_token(STRING1S, $self->start_line, $self->start_column, $string);
		}
	}

	sub _get_escaped_char {
		my $self	= shift;
		my $c	= $self->peek_char;
		$self->get_char_safe('\\');
		my $esc	= $self->get_char;
		if ($esc eq '\\') { return "\\" }
		elsif ($esc =~ /^['">]$/) { return $esc }
		elsif ($esc eq 'r') { return "\r" }
		elsif ($esc eq 't') { return "\t" }
		elsif ($esc eq 'n') { return "\n" }
		elsif ($esc eq 'b') { return "\b" }
		elsif ($esc eq 'f') { return "\f" }
		elsif ($esc eq 'U') {
			my $codepoint	= $self->read_length(8);
			$self->_throw_error("Bad unicode escape codepoint '$codepoint'") unless ($codepoint =~ /^[0-9A-Fa-f]+$/o);
			return chr(hex($codepoint));
		} elsif ($esc eq 'u'){
			my $codepoint	= $self->read_length(4);
			$self->_throw_error("Bad unicode escape codepoint '$codepoint'") unless ($codepoint =~ /^[0-9A-Fa-f]+$/o);
			return chr(hex($codepoint));
		}
		$self->_throw_error("Unrecognized string escape '$esc'");
	}
	
	sub _get_keyword {
		my $self	= shift;
		$self->get_char_safe('@');
		if ($self->buffer =~ /^base/o) {
			$self->read_word('base');
			return $self->new_token(TURTLEBASE, $self->start_line, $self->start_column);
		} elsif ($self->buffer =~ /^prefix/o) {
			$self->read_word('prefix');
			return $self->new_token(TURTLEPREFIX, $self->start_line, $self->start_column);
		} else {
			if ($self->buffer =~ /^[a-zA-Z]+(-[a-zA-Z0-9]+)*\b/o) {
				my $lang	= $self->read_length($+[0]);
				return $self->new_token(LANG, $self->start_line, $self->start_column, $lang);
			}
			$self->_throw_error("Expected keyword or language tag");
		}
	}

	sub _throw_error {
		my $self	= shift;
		my $error	= shift;
		my $line	= $self->line;
		my $col		= $self->column;
		Carp::confess "$error at $line:$col with buffer: " . Dumper($self->buffer);
	}
}

1;

__END__

=end private

=back

=head1 BUGS

Please report any bugs or feature requests to through the GitHub web interface
at L<https://github.com/kasei/perlrdf/issues>.

=head1 AUTHOR

Gregory Todd Williams  C<< <gwilliams@cpan.org> >>

=head1 COPYRIGHT

Copyright (c) 2014--2022 Gregory Todd Williams. This
program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

=cut



( run in 0.661 second using v1.01-cache-2.11-cpan-39bf76dae61 )