view release on metacpan or search on metacpan
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the arm64 stage-1 minifier with a
// 64-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the arm64 structural indexer (64-byte block step), which
// fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
// Pass the parameters directly (rather than re-reading the members we just
// assigned) for consistency with the other architecture back-ends.
return arm64::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the arm64
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return arm64::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
// extra continuation
error = UTF8_ERROR;
idx++;
return;
}
// 2-byte
if ((buf[idx] & 0x20) == 0) {
// missing continuation
if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 1100000_ 10______
if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
idx += 2;
return;
}
// 3-byte
if ((buf[idx] & 0x10) == 0) {
// missing continuation
if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11100000 100_____ ________
if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
// surrogates: U+D800-U+DFFF 11101101 101_____
if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
idx += 3;
return;
}
// 4-byte
// missing continuation
if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11110000 1000____ ________ ________
if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
// too large: > U+10FFFF:
// 11110100 (1001|101_)____
// 1111(1___|011_|0101) 10______
// also includes 5, 6, 7 and 8 byte characters:
simdjson.cpp view on Meta::CPAN
add_structural();
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
idx++;
};
break;
}
}
// We pad beyond.
// https://github.com/simdjson/simdjson/issues/906
// See json_structural_indexer.h for an explanation.
*next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
next_structural_index[1] = len;
next_structural_index[2] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
parser.next_structural_index = 0;
if (partial == stage1_mode::streaming_partial) {
if(unclosed_string) {
parser.n_structural_indexes--;
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if(partial == stage1_mode::streaming_final) {
if(unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the icelake stage-1 minifier with a
// 128-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the icelake structural indexer (128-byte block step),
// which fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the icelake
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return icelake::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the haswell stage-1 minifier with a
// 128-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the haswell structural indexer (128-byte block step),
// which fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the haswell
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return haswell::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the ppc64 stage-1 minifier with a
// 64-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the ppc64 structural indexer (64-byte block step), which
// fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
// Pass the parameters directly (rather than re-reading the members we just
// assigned) for consistency with the other architecture back-ends.
return ppc64::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the ppc64
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return ppc64::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
// Delegate minification to the westmere-kernel SIMD minifier, which consumes
// the input in 64-byte steps. On success, dst_len receives the number of
// bytes written to dst.
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  // Record the input window before indexing: later stages (and
  // document_stream, which re-anchors its iterators after each stage 1 run)
  // read the buffer and length back from the parser.
  this->len = _len;
  this->buf = _buf;
  // Build the structural indexes over the input, 64 bytes at a time.
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
// Returns true if buf[0..len) is valid UTF-8, using the westmere-kernel
// SIMD validator.
return westmere::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
// Stage 2 walks the structural indexes produced by stage1() and materializes
// the document tape into _doc. NOTE(review): the <false> template argument
// presumably selects the non-streaming code path — confirm against
// tape_builder::parse_document.
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
namespace simdjson {
namespace dom {
class document;
} // namespace dom
/**
 * This enum is used with the dom_parser_implementation::stage1 function.
 * 1) The regular mode expects a fully formed JSON document.
 * 2) The streaming_partial mode expects a possibly truncated
 * input within a stream of JSON documents.
 * 3) The streaming_final mode allows us to truncate final
 * unterminated strings. It is useful in conjunction with streaming_partial.
 */
enum class stage1_mode { regular, streaming_partial, streaming_final};
/**
 * Returns true if mode == streaming_partial or mode == streaming_final
 */
inline bool is_streaming(stage1_mode mode) {
  // performance note: comparing against the single non-streaming mode is
  // probably cheaper than testing each of the two streaming modes in turn.
  const bool regular_mode = (mode == stage1_mode::regular);
  return !regular_mode;
}
namespace internal {
/**
* An implementation of simdjson's DOM parser for a particular CPU architecture.
*
* This class is expected to be accessed only by pointer, and never move in memory (though the
* @private For internal implementation use
*
* Stage 1 of the document parser.
*
* Guaranteed only to be called when capacity > document length.
*
* Overridden by each implementation.
*
* @param buf The json document to parse.
* @param len The length of the json document.
* @param streaming Whether this is being called by parser::parse_many.
* @return The error code, or SUCCESS if there was no error.
*/
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;
/**
* @private For internal implementation use
*
* Stage 2 of the document parser.
*
* Called after stage1().
*
* Overridden by each implementation.
*
*/
inline size_t size_in_bytes() const noexcept;
/**
* After iterating through the stream, this method
* returns the number of bytes that were not parsed at the end
* of the stream. If truncated_bytes() differs from zero,
* then the input was truncated maybe because incomplete JSON
* documents were found at the end of the stream. You
* may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()).
*
* You should only call truncated_bytes() after streaming through all
* documents, like so:
*
* document_stream stream = parser.parse_many(json,window);
* for(auto doc : stream) {
* // do something with doc
* }
* size_t truncated = stream.truncated_bytes();
*
*/
inline size_t truncated_bytes() const noexcept;
return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1];
}
inline size_t document_stream::next_batch_start() const noexcept {
  // Stage 1 stores, one slot past the last structural, the byte offset within
  // the current batch at which indexing stopped; resume the next batch there.
  const auto &impl = *parser->implementation;
  return batch_start + impl.structural_indexes[impl.n_structural_indexes];
}
inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept {
  const size_t bytes_left = len - _batch_start;
  // The last window is indexed in streaming_final mode, so unterminated
  // trailing content can be tolerated; every earlier window uses
  // streaming_partial, since a document may continue past the window's end.
  const bool last_window = (bytes_left <= batch_size);
  if (last_window) {
    return p.implementation->stage1(&buf[_batch_start], bytes_left, stage1_mode::streaming_final);
  }
  return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial);
}
#ifdef SIMDJSON_THREADS_ENABLED
inline void document_stream::load_from_stage1_thread() noexcept {
worker->finish();
// Swap to the parser that was loaded up in the thread. Make sure the parser has
// enough memory to swap to, as well.
std::swap(*parser, stage1_thread_parser);
*
* - 0 = finished with document
* - 1 = document root value (could be [ or {, not yet known)
* - 2 = , or } inside root array/object
* - 3 = key or value inside root array/object.
*/
depth_t _depth{};
/**
* Beginning of the document indexes.
* Normally we have root == parser->implementation->structural_indexes.get()
* but this may differ, especially in streaming mode (where we have several
* documents);
*/
token_position _root{};
/**
* Normally, a json_iterator operates over a single document, but in
* some cases, we may have a stream of documents. This attribute is meant
* as meta-data: the json_iterator works the same irrespective of the
* value of this attribute.
*/
bool _streaming{false};
public:
simdjson_inline json_iterator() noexcept = default;
simdjson_inline json_iterator(json_iterator &&other) noexcept;
simdjson_inline json_iterator &operator=(json_iterator &&other) noexcept;
simdjson_inline explicit json_iterator(const json_iterator &other) noexcept = default;
simdjson_inline json_iterator &operator=(const json_iterator &other) noexcept = default;
/**
* Skips a JSON value, whether it is a scalar, array or object.
*/
simdjson_warn_unused simdjson_inline error_code skip_child(depth_t parent_depth) noexcept;
/**
* Tell whether the iterator is still at the start
*/
simdjson_inline bool at_root() const noexcept;
/**
* Tell whether we should be expected to run in streaming
* mode (iterating over many documents). It is pure metadata
* that does not affect how the iterator works. It is used by
* start_root_array() and start_root_object().
*/
simdjson_inline bool streaming() const noexcept;
/**
* Get the root value iterator
*/
simdjson_inline token_position root_position() const noexcept;
/**
* Assert that we are at the document depth (== 1)
*/
simdjson_inline void assert_at_document_depth() const noexcept;
/**
inline size_t size_in_bytes() const noexcept;
/**
* After iterating through the stream, this method
* returns the number of bytes that were not parsed at the end
* of the stream. If truncated_bytes() differs from zero,
* then the input was truncated maybe because incomplete JSON
* documents were found at the end of the stream. You
* may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()).
*
* You should only call truncated_bytes() after streaming through all
* documents, like so:
*
* document_stream stream = parser.iterate_many(json,window);
* for(auto & doc : stream) {
* // do something with doc
* }
* size_t truncated = stream.truncated_bytes();
*
*/
inline size_t truncated_bytes() const noexcept;
namespace SIMDJSON_BUILTIN_IMPLEMENTATION {
namespace ondemand {
simdjson_inline json_iterator::json_iterator(json_iterator &&other) noexcept
  : token(std::move(other.token)),
    parser{other.parser},
    _string_buf_loc{other._string_buf_loc},
    error{other.error},
    _depth{other._depth},
    _root{other._root},
    _streaming{other._streaming}
{
  // Mark the source as moved-from: a null parser leaves it safely
  // destructible.
  other.parser = nullptr;
}
simdjson_inline json_iterator &json_iterator::operator=(json_iterator &&other) noexcept {
// Guard against self-move: without the check, `other.parser = nullptr` below
// would null out our own parser and leave this iterator unusable.
if (this != &other) {
token = other.token;
parser = other.parser;
_string_buf_loc = other._string_buf_loc;
error = other.error;
_depth = other._depth;
_root = other._root;
_streaming = other._streaming;
// Mark the source as moved-from (null parser => safely destructible).
other.parser = nullptr;
}
return *this;
}
simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser) noexcept
// Anchor the token cursor at the first structural index of the document.
: token(buf, &_parser->implementation->structural_indexes[0]),
parser{_parser},
_string_buf_loc{parser->string_buf.get()},
// Depth 1 corresponds to the document root value.
_depth{1},
_root{parser->implementation->structural_indexes.get()},
// Not streaming by default; document_stream sets this flag to true when
// iterating over many documents.
_streaming{false}
{
logger::log_headers();
#if SIMDJSON_CHECK_EOF
// When end-of-file checking is compiled in, presumably asserts up front that
// at least one token is available — TODO confirm against assert_more_tokens().
assert_more_tokens();
#endif
}
inline void json_iterator::rewind() noexcept {
token.set_position( root_position() );
SIMDJSON_POP_DISABLE_WARNINGS
simdjson_inline bool json_iterator::at_root() const noexcept {
  // We are at the root exactly when the cursor has not advanced past the
  // document's first structural token.
  const auto current = position();
  return current == root_position();
}
simdjson_inline bool json_iterator::is_single_token() const noexcept {
  // True when stage 1 found exactly one structural token in the document.
  const auto structural_count = parser->implementation->n_structural_indexes;
  return structural_count == 1;
}
simdjson_inline bool json_iterator::streaming() const noexcept {
// Pure metadata: set by document_stream when iterating over many documents;
// it does not change how the iterator advances.
return _streaming;
}
simdjson_inline token_position json_iterator::root_position() const noexcept {
// Position of the current document's first structural token; in streaming
// mode this may differ from the parser's first structural index.
return _root;
}
simdjson_inline void json_iterator::assert_at_document_depth() const noexcept {
// Depth 1 corresponds to the document root value.
SIMDJSON_ASSUME( _depth == 1 );
}
if (*_json_iter->peek() == '}') {
logger::log_value(*_json_iter, "empty object");
_json_iter->return_current_and_advance();
end_container();
return false;
}
return true;
}
simdjson_warn_unused simdjson_inline error_code value_iterator::check_root_object() noexcept {
// When in streaming mode, we cannot expect peek_last() to be the last structural element of the
// current document. It only works in the normal mode where we have indexed a single document.
// Note that adding a check for 'streaming' is not expensive since we only have at most
// one root element.
if ( ! _json_iter->streaming() ) {
// The following lines do not fully protect against garbage content within the
// object: e.g., `{"a":2} foo }`. Users concerned with garbage content should
// call `at_end()` on the document instance at the end of the processing to
// ensure that the processing has finished at the end.
//
if (*_json_iter->peek_last() != '}') {
_json_iter->abandon();
return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing } at end");
}
// If the last character is } *and* the first gibberish character is also '}'
return false;
}
_json_iter->descend_to(depth()+1);
#if SIMDJSON_DEVELOPMENT_CHECKS
_json_iter->set_start_position(_depth, start_position());
#endif
return true;
}
simdjson_warn_unused simdjson_inline error_code value_iterator::check_root_array() noexcept {
// When in streaming mode, we cannot expect peek_last() to be the last structural element of the
// current document. It only works in the normal mode where we have indexed a single document.
// Note that adding a check for 'streaming' is not expensive since we only have at most
// one root element.
if ( ! _json_iter->streaming() ) {
// The following lines do not fully protect against garbage content within the
// array: e.g., `[1, 2] foo]`. Users concerned with garbage content should
// also call `at_end()` on the document instance at the end of the processing to
// ensure that the processing has finished at the end.
//
if (*_json_iter->peek_last() != ']') {
_json_iter->abandon();
return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing ] at end");
}
// If the last character is ] *and* the first gibberish character is also ']'
error = run_stage1(*parser, batch_start);
while(error == EMPTY) {
// In exceptional cases, we may start with an empty block
batch_start = next_batch_start();
if (batch_start >= len) { return; }
error = run_stage1(*parser, batch_start);
}
if (error) { return; }
doc_index = batch_start;
doc = document(json_iterator(&buf[batch_start], parser));
doc.iter._streaming = true;
#ifdef SIMDJSON_THREADS_ENABLED
if (use_thread && next_batch_start() < len) {
// Kick off the first thread on next batch if needed
error = stage1_thread_parser.allocate(batch_size);
if (error) { return; }
worker->start_thread();
start_stage1_thread();
if (error) { return; }
}
*
* That is, stage1 calls "this->buf = _buf" so the parser remembers the buffer that
* we used. But json_iterator has no callback when stage1 is called on the parser.
* In fact, I think that the parser is unaware of json_iterator.
*
*
* So we need to re-anchor the json_iterator after each call to stage 1 so that
* all of the pointers are in sync.
*/
doc.iter = json_iterator(&buf[batch_start], parser);
doc.iter._streaming = true;
/**
* End of resync.
*/
if (error) { continue; } // If the error was EMPTY, we may want to load another batch.
doc_index = batch_start;
}
}
}
inline size_t document_stream::next_batch_start() const noexcept {
  // The slot one past the last structural index records where indexing of the
  // current batch stopped; the next batch resumes from that offset.
  const auto &impl = *parser->implementation;
  const size_t consumed = impl.structural_indexes[impl.n_structural_indexes];
  return batch_start + consumed;
}
inline error_code document_stream::run_stage1(ondemand::parser &p, size_t _batch_start) noexcept {
  // This only refreshes the structural indexes held by the parser; any live
  // json_iterator must be re-anchored by the caller afterwards.
  const size_t remaining = len - _batch_start;
  if (remaining > batch_size) {
    // Interior window: a document may straddle the window boundary, so index
    // in streaming_partial mode (truncation at the end is expected).
    return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial);
  }
  // Final window: streaming_final allows unterminated trailing content to be
  // dropped.
  return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final);
}
simdjson_inline size_t document_stream::iterator::current_index() const noexcept {
// Byte offset of the current document within the original input
// (doc_index is set from batch_start when a batch is loaded).
return stream->doc_index;
}
simdjson_inline std::string_view document_stream::iterator::source() const noexcept {
auto depth = stream->doc.iter.depth();
auto cur_struct_index = stream->doc.iter._root - stream->parser->implementation->structural_indexes.get();