view release on metacpan or search on metacpan
simdjson.cpp view on Meta::CPAN
namespace simdjson {
namespace dom {
class document;
} // namespace dom
/**
* This enum is used with the dom_parser_implementation::stage1 function.
* 1) The regular mode expects a fully formed JSON document.
* 2) The streaming_partial mode expects a possibly truncated
* input within a stream on JSON documents.
* 3) The stream_final mode allows us to truncate final
* unterminated strings. It is useful in conjunction with streaming_partial.
*/
enum class stage1_mode { regular, streaming_partial, streaming_final};
/**
* Returns true if mode == streaming_partial or mode == streaming_final
*/
inline bool is_streaming(stage1_mode mode) {
// performance note: it is probably faster to check that mode is different
// from regular than checking that it is either streaming_partial or streaming_final.
return (mode != stage1_mode::regular);
// return (mode == stage1_mode::streaming_partial || mode == stage1_mode::streaming_final);
}
namespace internal {
/**
* An implementation of simdjson's DOM parser for a particular CPU architecture.
*
* This class is expected to be accessed only by pointer, and never move in memory (though the
simdjson.cpp view on Meta::CPAN
* @private For internal implementation use
*
* Stage 1 of the document parser.
*
* Guaranteed only to be called when capacity > document length.
*
* Overridden by each implementation.
*
* @param buf The json document to parse.
* @param len The length of the json document.
* @param streaming Whether this is being called by parser::parse_many.
* @return The error code, or SUCCESS if there was no error.
*/
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;
/**
* @private For internal implementation use
*
* Stage 2 of the document parser.
*
* Called after stage1().
*
* Overridden by each implementation.
*
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// Implementation-specific overrides
//
namespace simdjson {
namespace arm64 {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return arm64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return arm64::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// Implementation-specific overrides
//
namespace simdjson {
namespace haswell {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return haswell::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// Implementation-specific overrides
//
namespace simdjson {
namespace icelake {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return icelake::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// Implementation-specific overrides
//
namespace simdjson {
namespace ppc64 {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return ppc64::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return ppc64::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
// Implementation-specific overrides
//
namespace simdjson {
namespace westmere {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return westmere::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// Implementation-specific overrides
//
namespace simdjson {
namespace lsx {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return lsx::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return lsx::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return lsx::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// Implementation-specific overrides
//
namespace simdjson {
namespace lasx {
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return lasx::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return lasx::stage1::json_structural_indexer::index<64>(buf, len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return lasx::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
// extra continuation
error = UTF8_ERROR;
idx++;
return;
}
// 2-byte
if ((buf[idx] & 0x20) == 0) {
// missing continuation
if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 1100000_ 10______
if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
idx += 2;
return;
}
// 3-byte
if ((buf[idx] & 0x10) == 0) {
// missing continuation
if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11100000 100_____ ________
if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
// surrogates: U+D800-U+DFFF 11101101 101_____
if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
idx += 3;
return;
}
// 4-byte
// missing continuation
if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11110000 1000____ ________ ________
if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
// too large: > U+10FFFF:
// 11110100 (1001|101_)____
// 1111(1___|011_|0101) 10______
// also includes 5, 6, 7 and 8 byte characters:
simdjson.cpp view on Meta::CPAN
add_structural();
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
idx++;
};
break;
}
}
// We pad beyond.
// https://github.com/simdjson/simdjson/issues/906
// See json_structural_indexer.h for an explanation.
*next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
next_structural_index[1] = len;
next_structural_index[2] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
parser.next_structural_index = 0;
if (partial == stage1_mode::streaming_partial) {
if(unclosed_string) {
parser.n_structural_indexes--;
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if(partial == stage1_mode::streaming_final) {
if(unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,